[PATCH v3 1/4] vfio/pci: detect the support of dynamic MSI-X allocation

2023-09-25 Thread Jing Liu
Kernel provides the guidance of dynamic MSI-X allocation support of
passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
guide user space.

Fetch the flags from host to determine if dynamic MSI-X allocation is
supported.

Originally-by: Reinette Chatre 
Signed-off-by: Jing Liu 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Alex Williamson 
---
Changes since v2:
- Apply Alex's Reviewed-by.

Changes since v1:
- Free msix when failed to get MSI-X irq info. (Cédric)
- Apply Cédric's Reviewed-by.

Changes since RFC v1:
- Filter the dynamic MSI-X allocation flag and store as a bool type.
  (Alex)
- Move the detection to vfio_msix_early_setup(). (Alex)
- Report error of getting irq info and remove the trace of failure
  case. (Alex, Cédric)
---
 hw/vfio/pci.c| 16 ++--
 hw/vfio/pci.h|  1 +
 hw/vfio/trace-events |  2 +-
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 3b2ca3c24ca2..a94eef50e41e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1493,7 +1493,9 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 uint8_t pos;
 uint16_t ctrl;
 uint32_t table, pba;
-int fd = vdev->vbasedev.fd;
+int ret, fd = vdev->vbasedev.fd;
+struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
+  .index = VFIO_PCI_MSIX_IRQ_INDEX };
 VFIOMSIXInfo *msix;
 
 pos = pci_find_capability(>pdev, PCI_CAP_ID_MSIX);
@@ -1530,6 +1532,15 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
 msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
 
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, _info);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
+g_free(msix);
+return;
+}
+
+msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
+
 /*
  * Test the size of the pba_offset variable and catch if it extends outside
  * of the specified BAR. If it is the case, we need to apply a hardware
@@ -1562,7 +1573,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 }
 
 trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
-msix->table_offset, msix->entries);
+msix->table_offset, msix->entries,
+msix->noresize);
 vdev->msix = msix;
 
 vfio_pci_fixup_msix_region(vdev);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index 2d836093a83d..0d89eb761ece 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
 uint32_t table_offset;
 uint32_t pba_offset;
 unsigned long *pending;
+bool noresize;
 } VFIOMSIXInfo;
 
 #define TYPE_VFIO_PCI "vfio-pci"
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index e64ca4a01961..0ba3c5a0e26b 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -27,7 +27,7 @@ vfio_vga_read(uint64_t addr, int size, uint64_t data) " 
(0x%"PRIx64", %d) = 0x%"
 vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, 
@0x%x, len=0x%x) 0x%x"
 vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, 
@0x%x, 0x%x, len=0x%x)"
 vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
-vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, 
int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
+vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, 
int entries, bool noresize) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, 
entries %d, noresize %d"
 vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
 vfio_check_pm_reset(const char *name) "%s Supports PM reset"
 vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
-- 
2.27.0




[PATCH v3 2/4] vfio/pci: enable vector on dynamic MSI-X allocation

2023-09-25 Thread Jing Liu
The vector_use callback is used to enable vector that is unmasked in
guest. The kernel used to only support static MSI-X allocation. When
allocating a new interrupt using "static MSI-X allocation" kernels,
QEMU first disables all previously allocated vectors and then
re-allocates all including the new one. The nr_vectors of VFIOPCIDevice
indicates that all vectors from 0 to nr_vectors are allocated (and may
be enabled), which is used to loop all the possibly used vectors when
e.g., disabling MSI-X interrupts.

Extend the vector_use function to support dynamic MSI-X allocation when
host supports the capability. QEMU therefore can individually allocate
and enable a new interrupt without affecting others or causing interrupts
lost during runtime.

Utilize nr_vectors to calculate the upper bound of enabled vectors in
dynamic MSI-X allocation mode since looping all msix_entries_nr is not
efficient and unnecessary.

Signed-off-by: Jing Liu 
Tested-by: Reinette Chatre 
Reviewed-by: Alex Williamson 
---
Changes since v2:
- Use a bool type to test (vdev->nr_vectors < nr + 1). (Alex)
- Revise the comments. (Alex)
- Apply Alex's Reviewed-by.

Changes since v1:
- Revise Qemu to QEMU.

Changes since RFC v1:
- Test vdev->msix->noresize to identify the allocation mode. (Alex)
- Move defer_kvm_irq_routing test out and update nr_vectors in a
  common place before vfio_enable_vectors(). (Alex)
- Revise the comments. (Alex)
---
 hw/vfio/pci.c | 46 --
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a94eef50e41e..27a65302ea69 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -470,6 +470,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
unsigned int nr,
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
 VFIOMSIVector *vector;
 int ret;
+bool resizing = !!(vdev->nr_vectors < nr + 1);
 
 trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
 
@@ -512,33 +513,42 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
unsigned int nr,
 }
 
 /*
- * We don't want to have the host allocate all possible MSI vectors
- * for a device if they're not in use, so we shutdown and incrementally
- * increase them as needed.
+ * When dynamic allocation is not supported, we don't want to have the
+ * host allocate all possible MSI vectors for a device if they're not
+ * in use, so we shutdown and incrementally increase them as needed.
+ * nr_vectors represents the total number of vectors allocated.
+ *
+ * When dynamic allocation is supported, let the host only allocate
+ * and enable a vector when it is in use in guest. nr_vectors represents
+ * the upper bound of vectors being enabled (but not all of the ranges
+ * is allocated or enabled).
  */
-if (vdev->nr_vectors < nr + 1) {
+if (resizing) {
 vdev->nr_vectors = nr + 1;
-if (!vdev->defer_kvm_irq_routing) {
+}
+
+if (!vdev->defer_kvm_irq_routing) {
+if (vdev->msix->noresize && resizing) {
 vfio_disable_irqindex(>vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 ret = vfio_enable_vectors(vdev, true);
 if (ret) {
 error_report("vfio: failed to enable vectors, %d", ret);
 }
-}
-} else {
-Error *err = NULL;
-int32_t fd;
-
-if (vector->virq >= 0) {
-fd = event_notifier_get_fd(>kvm_interrupt);
 } else {
-fd = event_notifier_get_fd(>interrupt);
-}
+Error *err = NULL;
+int32_t fd;
 
-if (vfio_set_irq_signaling(>vbasedev,
- VFIO_PCI_MSIX_IRQ_INDEX, nr,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, )) {
-error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+if (vector->virq >= 0) {
+fd = event_notifier_get_fd(>kvm_interrupt);
+} else {
+fd = event_notifier_get_fd(>interrupt);
+}
+
+if (vfio_set_irq_signaling(>vbasedev,
+   VFIO_PCI_MSIX_IRQ_INDEX, nr,
+   VFIO_IRQ_SET_ACTION_TRIGGER, fd, )) 
{
+error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+}
 }
 }
 
-- 
2.27.0




[PATCH v3 3/4] vfio/pci: use an invalid fd to enable MSI-X

2023-09-25 Thread Jing Liu
Guests typically enable MSI-X with all of the vectors masked in the MSI-X
vector table. To match the guest state of device, QEMU enables MSI-X by
enabling vector 0 with userspace triggering and immediately release.
However the release function actually does not release it due to already
using userspace mode.

It is no need to enable triggering on host and rely on the mask bit to
avoid spurious interrupts. Use an invalid fd (i.e. fd = -1) is enough
to get MSI-X enabled.

After dynamic MSI-X allocation is supported, the interrupt restoring
also need use such way to enable MSI-X, therefore, create a function
for that.

Suggested-by: Alex Williamson 
Signed-off-by: Jing Liu 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Alex Williamson 
---
Changes since v2:
- Apply Cédric's Reviewed-by.
- Apply Alex's Reviewed-by.

Changes since v1:
- Revise Qemu to QEMU. (Cédric)
- Use g_autofree to automatically release. (Cédric)
- Just return 'ret' and let the caller of vfio_enable_msix_no_vec()
  report the error. (Cédric)

Changes since RFC v1:
- A new patch. Use an invalid fd to get MSI-X enabled instead of using
  userspace triggering. (Alex)
---
 hw/vfio/pci.c | 44 
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 27a65302ea69..bf676a49ae77 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -369,6 +369,33 @@ static void vfio_msi_interrupt(void *opaque)
 notify(>pdev, nr);
 }
 
+/*
+ * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an 
invalid
+ * fd to kernel.
+ */
+static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
+{
+g_autofree struct vfio_irq_set *irq_set = NULL;
+int ret = 0, argsz;
+int32_t *fd;
+
+argsz = sizeof(*irq_set) + sizeof(*fd);
+
+irq_set = g_malloc0(argsz);
+irq_set->argsz = argsz;
+irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+irq_set->start = 0;
+irq_set->count = 1;
+fd = (int32_t *)_set->data;
+*fd = -1;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+return ret;
+}
+
 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
 {
 struct vfio_irq_set *irq_set;
@@ -618,6 +645,8 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice 
*vdev)
 
 static void vfio_msix_enable(VFIOPCIDevice *vdev)
 {
+int ret;
+
 vfio_disable_interrupts(vdev);
 
 vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
@@ -640,8 +669,6 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
 vfio_commit_kvm_msi_virq_batch(vdev);
 
 if (vdev->nr_vectors) {
-int ret;
-
 ret = vfio_enable_vectors(vdev, true);
 if (ret) {
 error_report("vfio: failed to enable vectors, %d", ret);
@@ -655,13 +682,14 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
  * MSI-X capability, but leaves the vector table masked.  We therefore
  * can't rely on a vector_use callback (from request_irq() in the 
guest)
  * to switch the physical device into MSI-X mode because that may come 
a
- * long time after pci_enable_msix().  This code enables vector 0 with
- * triggering to userspace, then immediately release the vector, 
leaving
- * the physical device with no vectors enabled, but MSI-X enabled, just
- * like the guest view.
+ * long time after pci_enable_msix().  This code sets vector 0 with an
+ * invalid fd to make the physical device MSI-X enabled, but with no
+ * vectors enabled, just like the guest view.
  */
-vfio_msix_vector_do_use(>pdev, 0, NULL, NULL);
-vfio_msix_vector_release(>pdev, 0);
+ret = vfio_enable_msix_no_vec(vdev);
+if (ret) {
+error_report("vfio: failed to enable MSI-X, %d", ret);
+}
 }
 
 trace_vfio_msix_enable(vdev->vbasedev.name);
-- 
2.27.0




[PATCH v3 0/4] Support dynamic MSI-X allocation

2023-09-25 Thread Jing Liu
Changes since v2:
- v2: https://www.mail-archive.com/qemu-devel@nongnu.org/msg989852.html
- Use a bool type to test (vdev->nr_vectors < nr + 1). (Alex)
- Revise comments. (Alex)
- Apply Cédric's and Alex's Reviewed-by.

Changes since v1:
- v1: https://www.mail-archive.com/qemu-devel@nongnu.org/msg982842.html
- Revise Qemu to QEMU. (Cédric)
- Add g_free when failure of getting MSI-X irq info. (Cédric)
- Apply Cédric's Reviewed-by. (Cédric)
- Use g_autofree to automatically release. (Cédric)
- Remove the failure message in vfio_enable_msix_no_vec(). (Cédric)

Changes since RFC v1:
- RFC v1: https://www.mail-archive.com/qemu-devel@nongnu.org/msg978637.html
- Revise the comments. (Alex)
- Report error of getting irq info and remove the trace of failure
  case. (Alex, Cédric)
- Only store dynamic allocation flag as a bool type and test
  accordingly. (Alex)
- Move dynamic allocation detection to vfio_msix_early_setup(). (Alex)
- Change the condition logic in vfio_msix_vector_do_use() that moving
  the defer_kvm_irq_routing test out and create a common place to update
  nr_vectors. (Alex)
- Consolidate the way of MSI-X enabling during device initialization and
  interrupt restoring that uses fd = -1 trick. Create a function doing
  that. (Alex)

Before kernel v6.5, dynamic allocation of MSI-X interrupts was not
supported. QEMU therefore when allocating a new interrupt, should first
release all previously allocated interrupts (including disable of MSI-X)
and re-allocate all interrupts that includes the new one.

The kernel series [1] adds the support of dynamic MSI-X allocation to
vfio-pci and uses the existing flag VFIO_IRQ_INFO_NORESIZE to guide user
space, that when dynamic MSI-X is supported the flag is cleared.

This series makes the behavior for VFIO PCI devices when dynamic MSI-X
allocation is supported. When guest unmasks an interrupt, QEMU can
directly allocate an interrupt on host for this and has nothing to do
with the previously allocated ones. Therefore, host only allocates
interrupts for those unmasked (enabled) interrupts inside guest when
dynamic MSI-X allocation is supported by device.

When guests enable MSI-X with all of the vectors masked, QEMU need match
the state to enable MSI-X with no vector enabled. During migration
restore, QEMU also need enable MSI-X first in dynamic allocation mode,
to avoid the guest unused vectors being allocated on host. To
consolidate them, we use vector 0 with an invalid fd to get MSI-X
enabled and create a common function for this. This is cleaner than
setting userspace triggering and immediately release.

Any feedback is appreciated.

Jing

[1] https://lwn.net/Articles/931679/

Jing Liu (4):
  vfio/pci: detect the support of dynamic MSI-X allocation
  vfio/pci: enable vector on dynamic MSI-X allocation
  vfio/pci: use an invalid fd to enable MSI-X
  vfio/pci: enable MSI-X in interrupt restoring on dynamic allocation

 hw/vfio/pci.c| 123 +--
 hw/vfio/pci.h|   1 +
 hw/vfio/trace-events |   2 +-
 3 files changed, 97 insertions(+), 29 deletions(-)

-- 
2.27.0




[PATCH v3 4/4] vfio/pci: enable MSI-X in interrupt restoring on dynamic allocation

2023-09-25 Thread Jing Liu
During migration restoring, vfio_enable_vectors() is called to restore
enabling MSI-X interrupts for assigned devices. It sets the range from
0 to nr_vectors to kernel to enable MSI-X and the vectors unmasked in
guest. During the MSI-X enabling, all the vectors within the range are
allocated according to the VFIO_DEVICE_SET_IRQS ioctl.

When dynamic MSI-X allocation is supported, we only want the guest
unmasked vectors being allocated and enabled. Use vector 0 with an
invalid fd to get MSI-X enabled, after that, all the vectors can be
allocated in need.

Signed-off-by: Jing Liu 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Alex Williamson 
---
Changes since v2:
- Apply Cédric's Reviewed-by.
- Apply Alex's Reviewed-by.

Changes since v1:
- No change.

Changes since RFC v1:
- Revise the comments. (Alex)
- Call the new helper function in previous patch to enable MSI-X. (Alex)
---
 hw/vfio/pci.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index bf676a49ae77..8a082af39e77 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -402,6 +402,23 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool 
msix)
 int ret = 0, i, argsz;
 int32_t *fds;
 
+/*
+ * If dynamic MSI-X allocation is supported, the vectors to be allocated
+ * and enabled can be scattered. Before kernel enabling MSI-X, setting
+ * nr_vectors causes all these vectors to be allocated on host.
+ *
+ * To keep allocation as needed, use vector 0 with an invalid fd to get
+ * MSI-X enabled first, then set vectors with a potentially sparse set of
+ * eventfds to enable interrupts only when enabled in guest.
+ */
+if (msix && !vdev->msix->noresize) {
+ret = vfio_enable_msix_no_vec(vdev);
+
+if (ret) {
+return ret;
+}
+}
+
 argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 
 irq_set = g_malloc0(argsz);
-- 
2.27.0




[PATCH v2 0/4] Support dynamic MSI-X allocation

2023-09-18 Thread Jing Liu
Changes since v1:
- v1: https://www.mail-archive.com/qemu-devel@nongnu.org/msg982842.html
- Revise Qemu to QEMU. (Cédric)
- Add g_free when failure of getting MSI-X irq info. (Cédric)
- Apply Cédric's Reviewed-by. (Cédric)
- Use g_autofree to automatically release. (Cédric)
- Remove the failure message in vfio_enable_msix_no_vec(). (Cédric)

Changes since RFC v1:
- RFC v1: https://www.mail-archive.com/qemu-devel@nongnu.org/msg978637.html
- Revise the comments. (Alex)
- Report error of getting irq info and remove the trace of failure
  case. (Alex, Cédric)
- Only store dynamic allocation flag as a bool type and test
  accordingly. (Alex)
- Move dynamic allocation detection to vfio_msix_early_setup(). (Alex)
- Change the condition logic in vfio_msix_vector_do_use() that moving
  the defer_kvm_irq_routing test out and create a common place to update
  nr_vectors. (Alex)
- Consolidate the way of MSI-X enabling during device initialization and
  interrupt restoring that uses fd = -1 trick. Create a function doing
  that. (Alex)

Before kernel v6.5, dynamic allocation of MSI-X interrupts was not
supported. QEMU therefore when allocating a new interrupt, should first
release all previously allocated interrupts (including disable of MSI-X)
and re-allocate all interrupts that includes the new one.

The kernel series [1] adds the support of dynamic MSI-X allocation to
vfio-pci and uses the existing flag VFIO_IRQ_INFO_NORESIZE to guide user
space, that when dynamic MSI-X is supported the flag is cleared.

This series makes the behavior for VFIO PCI devices when dynamic MSI-X
allocation is supported. When guest unmasks an interrupt, QEMU can
directly allocate an interrupt on host for this and has nothing to do
with the previously allocated ones. Therefore, host only allocates
interrupts for those unmasked (enabled) interrupts inside guest when
dynamic MSI-X allocation is supported by device.

When guests enable MSI-X with all of the vectors masked, QEMU need match
the state to enable MSI-X with no vector enabled. During migration
restore, QEMU also need enable MSI-X first in dynamic allocation mode,
to avoid the guest unused vectors being allocated on host. To
consolidate them, we use vector 0 with an invalid fd to get MSI-X
enabled and create a common function for this. This is cleaner than
setting userspace triggering and immediately release.

Any feedback is appreciated.

Jing

[1] https://lwn.net/Articles/931679/

Jing Liu (4):
  vfio/pci: detect the support of dynamic MSI-X allocation
  vfio/pci: enable vector on dynamic MSI-X allocation
  vfio/pci: use an invalid fd to enable MSI-X
  vfio/pci: enable MSI-X in interrupt restoring on dynamic allocation

 hw/vfio/pci.c| 121 +--
 hw/vfio/pci.h|   1 +
 hw/vfio/trace-events |   2 +-
 3 files changed, 96 insertions(+), 28 deletions(-)

-- 
2.27.0




[PATCH v2 1/4] vfio/pci: detect the support of dynamic MSI-X allocation

2023-09-18 Thread Jing Liu
Kernel provides the guidance of dynamic MSI-X allocation support of
passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
guide user space.

Fetch the flags from host to determine if dynamic MSI-X allocation is
supported.

Originally-by: Reinette Chatre 
Signed-off-by: Jing Liu 
Reviewed-by: Cédric Le Goater 
---
Changes since v1:
- Free msix when failed to get MSI-X irq info. (Cédric)
- Apply Cédric's Reviewed-by.

Changes since RFC v1:
- Filter the dynamic MSI-X allocation flag and store as a bool type.
  (Alex)
- Move the detection to vfio_msix_early_setup(). (Alex)
- Report error of getting irq info and remove the trace of failure
  case. (Alex, Cédric)
---
 hw/vfio/pci.c| 16 ++--
 hw/vfio/pci.h|  1 +
 hw/vfio/trace-events |  2 +-
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a205c6b1130f..60654ca28ab8 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1493,7 +1493,9 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 uint8_t pos;
 uint16_t ctrl;
 uint32_t table, pba;
-int fd = vdev->vbasedev.fd;
+int ret, fd = vdev->vbasedev.fd;
+struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
+  .index = VFIO_PCI_MSIX_IRQ_INDEX };
 VFIOMSIXInfo *msix;
 
 pos = pci_find_capability(>pdev, PCI_CAP_ID_MSIX);
@@ -1530,6 +1532,15 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
 msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
 
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, _info);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
+g_free(msix);
+return;
+}
+
+msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
+
 /*
  * Test the size of the pba_offset variable and catch if it extends outside
  * of the specified BAR. If it is the case, we need to apply a hardware
@@ -1562,7 +1573,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 }
 
 trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
-msix->table_offset, msix->entries);
+msix->table_offset, msix->entries,
+msix->noresize);
 vdev->msix = msix;
 
 vfio_pci_fixup_msix_region(vdev);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a2771b9ff3cc..0717574d79e9 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
 uint32_t table_offset;
 uint32_t pba_offset;
 unsigned long *pending;
+bool noresize;
 } VFIOMSIXInfo;
 
 #define TYPE_VFIO_PCI "vfio-pci"
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index 81ec7c7a958b..cc7c21365c92 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -27,7 +27,7 @@ vfio_vga_read(uint64_t addr, int size, uint64_t data) " 
(0x%"PRIx64", %d) = 0x%"
 vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, 
@0x%x, len=0x%x) 0x%x"
 vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, 
@0x%x, 0x%x, len=0x%x)"
 vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
-vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, 
int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
+vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, 
int entries, bool noresize) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, 
entries %d, noresize %d"
 vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
 vfio_check_pm_reset(const char *name) "%s Supports PM reset"
 vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
-- 
2.27.0




[PATCH v2 2/4] vfio/pci: enable vector on dynamic MSI-X allocation

2023-09-18 Thread Jing Liu
The vector_use callback is used to enable vector that is unmasked in
guest. The kernel used to only support static MSI-X allocation. When
allocating a new interrupt using "static MSI-X allocation" kernels,
QEMU first disables all previously allocated vectors and then
re-allocates all including the new one. The nr_vectors of VFIOPCIDevice
indicates that all vectors from 0 to nr_vectors are allocated (and may
be enabled), which is used to to loop all the possibly used vectors
When, e.g., disabling MSI-X interrupts.

Extend the vector_use function to support dynamic MSI-X allocation when
host supports the capability. QEMU therefore can individually allocate
and enable a new interrupt without affecting others or causing interrupts
lost during runtime.

Utilize nr_vectors to calculate the upper bound of enabled vectors in
dynamic MSI-X allocation mode since looping all msix_entries_nr is not
efficient and unnecessary.

Signed-off-by: Jing Liu 
Tested-by: Reinette Chatre 
---
Changes since v1:
- Revise Qemu to QEMU.

Changes since RFC v1:
- Test vdev->msix->noresize to identify the allocation mode. (Alex)
- Move defer_kvm_irq_routing test out and update nr_vectors in a
  common place before vfio_enable_vectors(). (Alex)
- Revise the comments. (Alex)
---
 hw/vfio/pci.c | 44 +++-
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 60654ca28ab8..84987e46fd7a 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -470,6 +470,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
unsigned int nr,
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
 VFIOMSIVector *vector;
 int ret;
+int old_nr_vecs = vdev->nr_vectors;
 
 trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
 
@@ -512,33 +513,42 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
unsigned int nr,
 }
 
 /*
- * We don't want to have the host allocate all possible MSI vectors
- * for a device if they're not in use, so we shutdown and incrementally
- * increase them as needed.
+ * When dynamic allocation is not supported, we don't want to have the
+ * host allocate all possible MSI vectors for a device if they're not
+ * in use, so we shutdown and incrementally increase them as needed.
+ * nr_vectors represents the total number of vectors allocated.
+ *
+ * When dynamic allocation is supported, let the host only allocate
+ * and enable a vector when it is in use in guest. nr_vectors represents
+ * the upper bound of vectors being enabled (but not all of the ranges
+ * is allocated or enabled).
  */
 if (vdev->nr_vectors < nr + 1) {
 vdev->nr_vectors = nr + 1;
-if (!vdev->defer_kvm_irq_routing) {
+}
+
+if (!vdev->defer_kvm_irq_routing) {
+if (vdev->msix->noresize && (old_nr_vecs < nr + 1)) {
 vfio_disable_irqindex(>vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 ret = vfio_enable_vectors(vdev, true);
 if (ret) {
 error_report("vfio: failed to enable vectors, %d", ret);
 }
-}
-} else {
-Error *err = NULL;
-int32_t fd;
-
-if (vector->virq >= 0) {
-fd = event_notifier_get_fd(>kvm_interrupt);
 } else {
-fd = event_notifier_get_fd(>interrupt);
-}
+Error *err = NULL;
+int32_t fd;
 
-if (vfio_set_irq_signaling(>vbasedev,
- VFIO_PCI_MSIX_IRQ_INDEX, nr,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, )) {
-error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+if (vector->virq >= 0) {
+fd = event_notifier_get_fd(>kvm_interrupt);
+} else {
+fd = event_notifier_get_fd(>interrupt);
+}
+
+if (vfio_set_irq_signaling(>vbasedev,
+   VFIO_PCI_MSIX_IRQ_INDEX, nr,
+   VFIO_IRQ_SET_ACTION_TRIGGER, fd, )) 
{
+error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+}
 }
 }
 
-- 
2.27.0




[PATCH v2 3/4] vfio/pci: use an invalid fd to enable MSI-X

2023-09-18 Thread Jing Liu
Guests typically enable MSI-X with all of the vectors masked in the MSI-X
vector table. To match the guest state of device, QEMU enables MSI-X by
enabling vector 0 with userspace triggering and immediately release.
However the release function actually does not release it due to already
using userspace mode.

It is no need to enable triggering on host and rely on the mask bit to
avoid spurious interrupts. Use an invalid fd (i.e. fd = -1) is enough
to get MSI-X enabled.

After dynamic MSI-X allocation is supported, the interrupt restoring
also need use such way to enable MSI-X, therefore, create a function
for that.

Suggested-by: Alex Williamson 
Signed-off-by: Jing Liu 
---
Changes since v1:
- Revise Qemu to QEMU. (Cédric)
- Use g_autofree to automatically release. (Cédric)
- Just return 'ret' and let the caller of vfio_enable_msix_no_vec()
  report the error. (Cédric)

Changes since RFC v1:
- A new patch. Use an invalid fd to get MSI-X enabled instead of using
  userspace triggering. (Alex)
---
 hw/vfio/pci.c | 44 
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 84987e46fd7a..0117f230e934 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -369,6 +369,33 @@ static void vfio_msi_interrupt(void *opaque)
 notify(>pdev, nr);
 }
 
+/*
+ * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an 
invalid
+ * fd to kernel.
+ */
+static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
+{
+g_autofree struct vfio_irq_set *irq_set = NULL;
+int ret = 0, argsz;
+int32_t *fd;
+
+argsz = sizeof(*irq_set) + sizeof(*fd);
+
+irq_set = g_malloc0(argsz);
+irq_set->argsz = argsz;
+irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+irq_set->start = 0;
+irq_set->count = 1;
+fd = (int32_t *)_set->data;
+*fd = -1;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+return ret;
+}
+
 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
 {
 struct vfio_irq_set *irq_set;
@@ -618,6 +645,8 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice 
*vdev)
 
 static void vfio_msix_enable(VFIOPCIDevice *vdev)
 {
+int ret;
+
 vfio_disable_interrupts(vdev);
 
 vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
@@ -640,8 +669,6 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
 vfio_commit_kvm_msi_virq_batch(vdev);
 
 if (vdev->nr_vectors) {
-int ret;
-
 ret = vfio_enable_vectors(vdev, true);
 if (ret) {
 error_report("vfio: failed to enable vectors, %d", ret);
@@ -655,13 +682,14 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
  * MSI-X capability, but leaves the vector table masked.  We therefore
  * can't rely on a vector_use callback (from request_irq() in the 
guest)
  * to switch the physical device into MSI-X mode because that may come 
a
- * long time after pci_enable_msix().  This code enables vector 0 with
- * triggering to userspace, then immediately release the vector, 
leaving
- * the physical device with no vectors enabled, but MSI-X enabled, just
- * like the guest view.
+ * long time after pci_enable_msix().  This code sets vector 0 with an
+ * invalid fd to make the physical device MSI-X enabled, but with no
+ * vectors enabled, just like the guest view.
  */
-vfio_msix_vector_do_use(>pdev, 0, NULL, NULL);
-vfio_msix_vector_release(>pdev, 0);
+ret = vfio_enable_msix_no_vec(vdev);
+if (ret) {
+error_report("vfio: failed to enable MSI-X, %d", ret);
+}
 }
 
 trace_vfio_msix_enable(vdev->vbasedev.name);
-- 
2.27.0




[PATCH v2 4/4] vfio/pci: enable MSI-X in interrupt restoring on dynamic allocation

2023-09-18 Thread Jing Liu
During migration restoring, vfio_enable_vectors() is called to restore
enabling MSI-X interrupts for assigned devices. It sets the range from
0 to nr_vectors to kernel to enable MSI-X and the vectors unmasked in
guest. During the MSI-X enabling, all the vectors within the range are
allocated according to the VFIO_DEVICE_SET_IRQS ioctl.

When dynamic MSI-X allocation is supported, we only want the guest
unmasked vectors being allocated and enabled. Use vector 0 with an
invalid fd to get MSI-X enabled, after that, all the vectors can be
allocated in need.

Signed-off-by: Jing Liu 
---
Changes since v1:
- No change.

Changes since RFC v1:
- Revise the comments. (Alex)
- Call the new helper function in previous patch to enable MSI-X. (Alex)
---
 hw/vfio/pci.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0117f230e934..f5f891dc0792 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -402,6 +402,23 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool 
msix)
 int ret = 0, i, argsz;
 int32_t *fds;
 
+/*
+ * If dynamic MSI-X allocation is supported, the vectors to be allocated
+ * and enabled can be scattered. Before kernel enabling MSI-X, setting
+ * nr_vectors causes all these vectors to be allocated on host.
+ *
+ * To keep allocation as needed, use vector 0 with an invalid fd to get
+ * MSI-X enabled first, then set vectors with a potentially sparse set of
+ * eventfds to enable interrupts only when enabled in guest.
+ */
+if (msix && !vdev->msix->noresize) {
+ret = vfio_enable_msix_no_vec(vdev);
+
+if (ret) {
+return ret;
+}
+}
+
 argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 
 irq_set = g_malloc0(argsz);
-- 
2.27.0




[PATCH v1 3/4] vfio/pci: use an invalid fd to enable MSI-X

2023-08-22 Thread Jing Liu
Guests typically enable MSI-X with all of the vectors masked in the MSI-X
vector table. To match the guest state of device, Qemu enables MSI-X by
enabling vector 0 with userspace triggering and immediately release.
However the release function actually does not release it due to already
using userspace mode.

It is no need to enable triggering on host and rely on the mask bit to
avoid spurious interrupts. Use an invalid fd (i.e. fd = -1) is enough
to get MSI-X enabled.

After dynamic MSI-X allocation is supported, the interrupt restoring
also need use such way to enable MSI-X, therefore, create a function
for that.

Suggested-by: Alex Williamson 
Signed-off-by: Jing Liu 
---
Changes since RFC v1:
- A new patch. Use an invalid fd to get MSI-X enabled instead of using
  userspace triggering. (Alex)
---
 hw/vfio/pci.c | 50 ++
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 31f36d68bb19..e24c21241a0c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -369,6 +369,39 @@ static void vfio_msi_interrupt(void *opaque)
 notify(>pdev, nr);
 }
 
+/*
+ * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an 
invalid
+ * fd to kernel.
+ */
+static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
+{
+struct vfio_irq_set *irq_set;
+int ret = 0, argsz;
+int32_t *fd;
+
+argsz = sizeof(*irq_set) + sizeof(*fd);
+
+irq_set = g_malloc0(argsz);
+irq_set->argsz = argsz;
+irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+irq_set->start = 0;
+irq_set->count = 1;
+fd = (int32_t *)_set->data;
+*fd = -1;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+if (ret) {
+error_report("vfio: failed to enable MSI-X with vector 0 trick, %d",
+ ret);
+}
+
+g_free(irq_set);
+
+return ret;
+}
+
 static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
 {
 struct vfio_irq_set *irq_set;
@@ -618,6 +651,8 @@ static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice 
*vdev)
 
 static void vfio_msix_enable(VFIOPCIDevice *vdev)
 {
+int ret;
+
 vfio_disable_interrupts(vdev);
 
 vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
@@ -640,8 +675,6 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
 vfio_commit_kvm_msi_virq_batch(vdev);
 
 if (vdev->nr_vectors) {
-int ret;
-
 ret = vfio_enable_vectors(vdev, true);
 if (ret) {
 error_report("vfio: failed to enable vectors, %d", ret);
@@ -655,13 +688,14 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
  * MSI-X capability, but leaves the vector table masked.  We therefore
  * can't rely on a vector_use callback (from request_irq() in the 
guest)
  * to switch the physical device into MSI-X mode because that may come 
a
- * long time after pci_enable_msix().  This code enables vector 0 with
- * triggering to userspace, then immediately release the vector, 
leaving
- * the physical device with no vectors enabled, but MSI-X enabled, just
- * like the guest view.
+ * long time after pci_enable_msix().  This code sets vector 0 with an
+ * invalid fd to make the physical device MSI-X enabled, but with no
+ * vectors enabled, just like the guest view.
  */
-vfio_msix_vector_do_use(>pdev, 0, NULL, NULL);
-vfio_msix_vector_release(>pdev, 0);
+ret = vfio_enable_msix_no_vec(vdev);
+if (ret) {
+error_report("vfio: failed to enable MSI-X, %d", ret);
+}
 }
 
 trace_vfio_msix_enable(vdev->vbasedev.name);
-- 
2.27.0




[PATCH v1 0/4] Support dynamic MSI-X allocation

2023-08-22 Thread Jing Liu
Changes since RFC v1:
- RFC v1: https://www.mail-archive.com/qemu-devel@nongnu.org/msg978637.html
- Revise the comments. (Alex)
- Report error of getting irq info and remove the trace of failure
  case. (Alex, Cédric)
- Only store dynamic allocation flag as a bool type and test
  accordingly. (Alex)
- Move dynamic allocation detection to vfio_msix_early_setup(). (Alex)
- Change the condition logic in vfio_msix_vector_do_use() that moving
  the defer_kvm_irq_routing test out and create a common place to update
  nr_vectors. (Alex)
- Consolidate the way of MSI-X enabling during device initialization and
  interrupt restoring that uses fd = -1 trick. Create a function doing
  that. (Alex)

Before kernel v6.5, dynamic allocation of MSI-X interrupts was not
supported. Qemu therefore when allocating a new interrupt, should first
release all previously allocated interrupts (including disable of MSI-X)
and re-allocate all interrupts that includes the new one.

The kernel series [1] adds the support of dynamic MSI-X allocation to
vfio-pci and uses the existing flag VFIO_IRQ_INFO_NORESIZE to guide user
space, that when dynamic MSI-X is supported the flag is cleared.

This series makes the behavior for VFIO PCI devices when dynamic MSI-X
allocation is supported. When guest unmasks an interrupt, Qemu can
directly allocate an interrupt on host for this and has nothing to do
with the previously allocated ones. Therefore, host only allocates
interrupts for those unmasked (enabled) interrupts inside guest when
dynamic MSI-X allocation is supported by device.

When guests enable MSI-X with all of the vectors masked, Qemu need match
the state to enable MSI-X with no vector enabled. During migration
restore, Qemu also need enable MSI-X first in dynamic allocation mode,
to avoid the guest unused vectors being allocated on host. To
consolidate them, we use vector 0 with an invalid fd to get MSI-X
enabled and create a common function for this. This is cleaner than
setting userspace triggering and immediately release.

Any feedback is appreciated.

Jing

[1] https://lwn.net/Articles/931679/

Jing Liu (4):
  vfio/pci: detect the support of dynamic MSI-X allocation
  vfio/pci: enable vector on dynamic MSI-X allocation
  vfio/pci: use an invalid fd to enable MSI-X
  vfio/pci: enable MSI-X in interrupt restoring on dynamic allocation

 hw/vfio/pci.c| 126 +--
 hw/vfio/pci.h|   1 +
 hw/vfio/trace-events |   2 +-
 3 files changed, 101 insertions(+), 28 deletions(-)

-- 
2.27.0




[PATCH v1 2/4] vfio/pci: enable vector on dynamic MSI-X allocation

2023-08-22 Thread Jing Liu
The vector_use callback is used to enable vector that is unmasked in
guest. The kernel used to only support static MSI-X allocation. When
allocating a new interrupt using "static MSI-X allocation" kernels,
Qemu first disables all previously allocated vectors and then
re-allocates all including the new one. The nr_vectors of VFIOPCIDevice
indicates that all vectors from 0 to nr_vectors are allocated (and may
be enabled), which is used to to loop all the possibly used vectors
When, e.g., disabling MSI-X interrupts.

Extend the vector_use function to support dynamic MSI-X allocation when
host supports the capability. Qemu therefore can individually allocate
and enable a new interrupt without affecting others or causing interrupts
lost during runtime.

Utilize nr_vectors to calculate the upper bound of enabled vectors in
dynamic MSI-X allocation mode since looping all msix_entries_nr is not
efficient and unnecessary.

Signed-off-by: Jing Liu 
Tested-by: Reinette Chatre 
---
Changes since RFC v1:
- Test vdev->msix->noresize to identify the allocation mode. (Alex)
- Move defer_kvm_irq_routing test out and update nr_vectors in a
  common place before vfio_enable_vectors(). (Alex)
- Revise the comments. (Alex)
---
 hw/vfio/pci.c | 44 +++-
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 8a3b34f3c196..31f36d68bb19 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -470,6 +470,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
unsigned int nr,
 VFIOPCIDevice *vdev = VFIO_PCI(pdev);
 VFIOMSIVector *vector;
 int ret;
+int old_nr_vecs = vdev->nr_vectors;
 
 trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
 
@@ -512,33 +513,42 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
unsigned int nr,
 }
 
 /*
- * We don't want to have the host allocate all possible MSI vectors
- * for a device if they're not in use, so we shutdown and incrementally
- * increase them as needed.
+ * When dynamic allocation is not supported, we don't want to have the
+ * host allocate all possible MSI vectors for a device if they're not
+ * in use, so we shutdown and incrementally increase them as needed.
+ * nr_vectors represents the total number of vectors allocated.
+ *
+ * When dynamic allocation is supported, let the host only allocate
+ * and enable a vector when it is in use in guest. nr_vectors represents
+ * the upper bound of vectors being enabled (but not all of the ranges
+ * is allocated or enabled).
  */
 if (vdev->nr_vectors < nr + 1) {
 vdev->nr_vectors = nr + 1;
-if (!vdev->defer_kvm_irq_routing) {
+}
+
+if (!vdev->defer_kvm_irq_routing) {
+if (vdev->msix->noresize && (old_nr_vecs < nr + 1)) {
 vfio_disable_irqindex(>vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 ret = vfio_enable_vectors(vdev, true);
 if (ret) {
 error_report("vfio: failed to enable vectors, %d", ret);
 }
-}
-} else {
-Error *err = NULL;
-int32_t fd;
-
-if (vector->virq >= 0) {
-fd = event_notifier_get_fd(>kvm_interrupt);
 } else {
-fd = event_notifier_get_fd(>interrupt);
-}
+Error *err = NULL;
+int32_t fd;
 
-if (vfio_set_irq_signaling(>vbasedev,
- VFIO_PCI_MSIX_IRQ_INDEX, nr,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, )) {
-error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+if (vector->virq >= 0) {
+fd = event_notifier_get_fd(>kvm_interrupt);
+} else {
+fd = event_notifier_get_fd(>interrupt);
+}
+
+if (vfio_set_irq_signaling(>vbasedev,
+   VFIO_PCI_MSIX_IRQ_INDEX, nr,
+   VFIO_IRQ_SET_ACTION_TRIGGER, fd, )) 
{
+error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+}
 }
 }
 
-- 
2.27.0




[PATCH v1 4/4] vfio/pci: enable MSI-X in interrupt restoring on dynamic allocation

2023-08-22 Thread Jing Liu
During migration restoring, vfio_enable_vectors() is called to restore
enabling MSI-X interrupts for assigned devices. It sets the range from
0 to nr_vectors to kernel to enable MSI-X and the vectors unmasked in
guest. During the MSI-X enabling, all the vectors within the range are
allocated according to the VFIO_DEVICE_SET_IRQS ioctl.

When dynamic MSI-X allocation is supported, we only want the guest
unmasked vectors being allocated and enabled. Use vector 0 with an
invalid fd to get MSI-X enabled, after that, all the vectors can be
allocated in need.

Signed-off-by: Jing Liu 
---
Changes since RFC v1:
- Revise the comments. (Alex)
- Call the new helper function in previous patch to enable MSI-X. (Alex)
---
 hw/vfio/pci.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e24c21241a0c..3c37d036e727 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -408,6 +408,23 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool 
msix)
 int ret = 0, i, argsz;
 int32_t *fds;
 
+/*
+ * If dynamic MSI-X allocation is supported, the vectors to be allocated
+ * and enabled can be scattered. Before kernel enabling MSI-X, setting
+ * nr_vectors causes all these vectors to be allocated on host.
+ *
+ * To keep allocation as needed, use vector 0 with an invalid fd to get
+ * MSI-X enabled first, then set vectors with a potentially sparse set of
+ * eventfds to enable interrupts only when enabled in guest.
+ */
+if (msix && !vdev->msix->noresize) {
+ret = vfio_enable_msix_no_vec(vdev);
+
+if (ret) {
+return ret;
+}
+}
+
 argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 
 irq_set = g_malloc0(argsz);
-- 
2.27.0




[PATCH v1 1/4] vfio/pci: detect the support of dynamic MSI-X allocation

2023-08-22 Thread Jing Liu
Kernel provides the guidance of dynamic MSI-X allocation support of
passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
guide user space.

Fetch the flags from host to determine if dynamic MSI-X allocation is
supported.

Originally-by: Reinette Chatre 
Signed-off-by: Jing Liu 
---
Changes since RFC v1:
- Filter the dynamic MSI-X allocation flag and store as a bool type.
  (Alex)
- Move the detection to vfio_msix_early_setup(). (Alex)
- Report error of getting irq info and remove the trace of failure
  case. (Alex, Cédric)
---
 hw/vfio/pci.c| 15 +--
 hw/vfio/pci.h|  1 +
 hw/vfio/trace-events |  2 +-
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a205c6b1130f..8a3b34f3c196 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1493,7 +1493,9 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 uint8_t pos;
 uint16_t ctrl;
 uint32_t table, pba;
-int fd = vdev->vbasedev.fd;
+int ret, fd = vdev->vbasedev.fd;
+struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
+  .index = VFIO_PCI_MSIX_IRQ_INDEX };
 VFIOMSIXInfo *msix;
 
 pos = pci_find_capability(>pdev, PCI_CAP_ID_MSIX);
@@ -1530,6 +1532,14 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
 msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
 
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, _info);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
+return;
+}
+
+msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
+
 /*
  * Test the size of the pba_offset variable and catch if it extends outside
  * of the specified BAR. If it is the case, we need to apply a hardware
@@ -1562,7 +1572,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 }
 
 trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
-msix->table_offset, msix->entries);
+msix->table_offset, msix->entries,
+msix->noresize);
 vdev->msix = msix;
 
 vfio_pci_fixup_msix_region(vdev);
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a2771b9ff3cc..0717574d79e9 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
 uint32_t table_offset;
 uint32_t pba_offset;
 unsigned long *pending;
+bool noresize;
 } VFIOMSIXInfo;
 
 #define TYPE_VFIO_PCI "vfio-pci"
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index ee7509e68e4f..6de5d9ba8e46 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -27,7 +27,7 @@ vfio_vga_read(uint64_t addr, int size, uint64_t data) " 
(0x%"PRIx64", %d) = 0x%"
 vfio_pci_read_config(const char *name, int addr, int len, int val) " (%s, 
@0x%x, len=0x%x) 0x%x"
 vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, 
@0x%x, 0x%x, len=0x%x)"
 vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
-vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, 
int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
+vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, 
int entries, bool noresize) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, 
entries %d, noresize %d"
 vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
 vfio_check_pm_reset(const char *name) "%s Supports PM reset"
 vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
-- 
2.27.0




[PATCH RFC v1 1/3] vfio/pci: detect the support of dynamic MSI-X allocation

2023-07-27 Thread Jing Liu
From: Reinette Chatre 

Kernel provides the guidance of dynamic MSI-X allocation support of
passthrough device, by clearing the VFIO_IRQ_INFO_NORESIZE flag to
guide user space.

Fetch and store the flags from host for later use to determine if
specific flags are set.

Signed-off-by: Reinette Chatre 
Signed-off-by: Jing Liu 
---
 hw/vfio/pci.c| 12 
 hw/vfio/pci.h|  1 +
 hw/vfio/trace-events |  2 ++
 3 files changed, 15 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index a205c6b1130f..0c4ac0873d40 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1572,6 +1572,7 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, 
Error **errp)
 
 static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
 {
+struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
 int ret;
 Error *err = NULL;
 
@@ -1624,6 +1625,17 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, 
Error **errp)
 memory_region_set_enabled(>pdev.msix_table_mmio, false);
 }
 
+irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, _info);
+if (ret) {
+/* This can fail for an old kernel or legacy PCI dev */
+trace_vfio_msix_setup_get_irq_info_failure(strerror(errno));
+} else {
+vdev->msix->irq_info_flags = irq_info.flags;
+}
+trace_vfio_msix_setup_irq_info_flags(vdev->vbasedev.name,
+ vdev->msix->irq_info_flags);
+
 return 0;
 }
 
diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
index a2771b9ff3cc..ad34ec56d0ae 100644
--- a/hw/vfio/pci.h
+++ b/hw/vfio/pci.h
@@ -113,6 +113,7 @@ typedef struct VFIOMSIXInfo {
 uint32_t table_offset;
 uint32_t pba_offset;
 unsigned long *pending;
+uint32_t irq_info_flags;
 } VFIOMSIXInfo;
 
 #define TYPE_VFIO_PCI "vfio-pci"
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index ee7509e68e4f..7d4a398f044d 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -28,6 +28,8 @@ vfio_pci_read_config(const char *name, int addr, int len, int 
val) " (%s, @0x%x,
 vfio_pci_write_config(const char *name, int addr, int val, int len) " (%s, 
@0x%x, 0x%x, len=0x%x)"
 vfio_msi_setup(const char *name, int pos) "%s PCI MSI CAP @0x%x"
 vfio_msix_early_setup(const char *name, int pos, int table_bar, int offset, 
int entries) "%s PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d"
+vfio_msix_setup_get_irq_info_failure(const char *errstr) 
"VFIO_DEVICE_GET_IRQ_INFO failure: %s"
+vfio_msix_setup_irq_info_flags(const char *name, uint32_t flags) " (%s) MSI-X 
irq info flags 0x%x"
 vfio_check_pcie_flr(const char *name) "%s Supports FLR via PCIe cap"
 vfio_check_pm_reset(const char *name) "%s Supports PM reset"
 vfio_check_af_flr(const char *name) "%s Supports FLR via AF cap"
-- 
2.27.0




[PATCH RFC v1 3/3] vfio/pci: dynamic MSI-X allocation in interrupt restoring

2023-07-27 Thread Jing Liu
During migration restoring, vfio_enable_vectors() is called to restore
enabling MSI-X interrupts for assigned devices. It sets the range from 0
to nr_vectors to kernel to enable MSI-X and the vectors unmasked in
guest. During the MSI-X enabling, all the vectors within the range are
allocated according to the ioctl().

When dynamic MSI-X allocation is supported, we only want the guest
unmasked vectors being allocated and enabled. Therefore, Qemu can first
set vector 0 to enable MSI-X and after that, all the vectors can be
allocated in need.

Signed-off-by: Jing Liu 
---
 hw/vfio/pci.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 8c485636445c..43ffacd5b36a 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -375,6 +375,38 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool 
msix)
 int ret = 0, i, argsz;
 int32_t *fds;
 
+/*
+ * If dynamic MSI-X allocation is supported, the vectors to be allocated
+ * and enabled can be scattered. Before kernel enabling MSI-X, setting
+ * nr_vectors causes all these vectors being allocated on host.
+ *
+ * To keep allocation as needed, first setup vector 0 with an invalid
+ * fd to make MSI-X enabled, then enable vectors by setting all so that
+ * kernel allocates and enables interrupts only when enabled in guest.
+ */
+if (msix && !(vdev->msix->irq_info_flags & VFIO_IRQ_INFO_NORESIZE)) {
+argsz = sizeof(*irq_set) + sizeof(*fds);
+
+irq_set = g_malloc0(argsz);
+irq_set->argsz = argsz;
+irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
+ VFIO_PCI_MSI_IRQ_INDEX;
+irq_set->start = 0;
+irq_set->count = 1;
+fds = (int32_t *)_set->data;
+fds[0] = -1;
+
+ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+g_free(irq_set);
+
+if (ret) {
+return ret;
+}
+}
+
 argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
 
 irq_set = g_malloc0(argsz);
-- 
2.27.0




[PATCH RFC v1 2/3] vfio/pci: enable vector on dynamic MSI-X allocation

2023-07-27 Thread Jing Liu
The vector_use callback is used to enable vector that is unmasked in
guest. The kernel used to only support static MSI-X allocation. When
allocating a new interrupt using "static MSI-X allocation" kernels,
Qemu first disables all previously allocated vectors and then
re-allocates all including the new one. The nr_vectors of VFIOPCIDevice
indicates that all vectors from 0 to nr_vectors are allocated (and may
be enabled), which is used to to loop all the possibly used vectors
When, e.g., disabling MSI-X interrupts.

Extend the vector_use function to support dynamic MSI-X allocation when
host supports the capability. Qemu therefore can individually allocate
and enable a new interrupt without affecting others or causing interrupts
lost during runtime.

Utilize nr_vectors to calculate the upper bound of enabled vectors in
dynamic MSI-X allocation mode since looping all msix_entries_nr is not
efficient and unnecessary.

Signed-off-by: Jing Liu 
Tested-by: Reinette Chatre 
---
 hw/vfio/pci.c | 40 +++-
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 0c4ac0873d40..8c485636445c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -512,12 +512,20 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
unsigned int nr,
 }
 
 /*
- * We don't want to have the host allocate all possible MSI vectors
- * for a device if they're not in use, so we shutdown and incrementally
- * increase them as needed.
+ * When dynamic allocation is not supported, we don't want to have the
+ * host allocate all possible MSI vectors for a device if they're not
+ * in use, so we shutdown and incrementally increase them as needed.
+ * And nr_vectors stands for the number of vectors being allocated.
+ *
+ * When dynamic allocation is supported, let the host only allocate
+ * and enable a vector when it is in use in guest. nr_vectors stands
+ * for the upper bound of vectors being enabled (but not all of the
+ * ranges is allocated or enabled).
  */
-if (vdev->nr_vectors < nr + 1) {
+if ((vdev->msix->irq_info_flags & VFIO_IRQ_INFO_NORESIZE) &&
+(vdev->nr_vectors < nr + 1)) {
 vdev->nr_vectors = nr + 1;
+
 if (!vdev->defer_kvm_irq_routing) {
 vfio_disable_irqindex(>vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
 ret = vfio_enable_vectors(vdev, true);
@@ -529,16 +537,22 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, 
unsigned int nr,
 Error *err = NULL;
 int32_t fd;
 
-if (vector->virq >= 0) {
-fd = event_notifier_get_fd(>kvm_interrupt);
-} else {
-fd = event_notifier_get_fd(>interrupt);
-}
+if (!vdev->defer_kvm_irq_routing) {
+if (vector->virq >= 0) {
+fd = event_notifier_get_fd(>kvm_interrupt);
+} else {
+fd = event_notifier_get_fd(>interrupt);
+}
 
-if (vfio_set_irq_signaling(>vbasedev,
- VFIO_PCI_MSIX_IRQ_INDEX, nr,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, )) {
-error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+if (vfio_set_irq_signaling(>vbasedev,
+   VFIO_PCI_MSIX_IRQ_INDEX, nr,
+   VFIO_IRQ_SET_ACTION_TRIGGER, fd, )) 
{
+error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+}
+}
+/* Increase for dynamic allocation case. */
+if (vdev->nr_vectors < nr + 1) {
+vdev->nr_vectors = nr + 1;
 }
 }
 
-- 
2.27.0




[PATCH RFC v1 0/3] Support dynamic MSI-X allocation

2023-07-27 Thread Jing Liu
Before kernel v6.5, dynamic allocation of MSI-X interrupts was not
supported. Qemu therefore when allocating a new interrupt, should first
release all previously allocated interrupts (including disable of MSI-X)
and re-allocate all interrupts that includes the new one.

The kernel series [1] adds the support of dynamic MSI-X allocation to
vfio-pci and uses the existing flag VFIO_IRQ_INFO_NORESIZE to guide user
space, that when dynamic MSI-X is supported the flag is cleared.

This series makes the behavior for VFIO PCI devices when dynamic MSI-X
allocation is supported. When guest unmasks an interrupt, Qemu can
directly allocate an interrupt on host for this and has nothing to do
with the previously allocated ones. Therefore, host only allocates
interrupts for those unmasked (enabled) interrupts inside guest when
dynamic MSI-X allocation is supported by device.

During migration restore, Qemu calls vfio_enable_vectors() to enable
MSI-X and interrupts. Since the API causes that a number of irqs set to
host kernel are all allocated when enabling MSI-X, to avoid this, one
possible way is that Qemu first sets vector 0 to host kernel to enable
MSI-X with an invalid fd. After MSI-X enabling, the API can decide which
should be allocated via the event fd value. In this way, the interrupts
allocation on target would be the same as migration source.

Jing Liu (2):
  vfio/pci: enable vector on dynamic MSI-X allocation
  vfio/pci: dynamic MSI-X allocation in interrupt restoring

Reinette Chatre (1):
  vfio/pci: detect the support of dynamic MSI-X allocation

 hw/vfio/pci.c| 84 +---
 hw/vfio/pci.h|  1 +
 hw/vfio/trace-events |  2 ++
 3 files changed, 74 insertions(+), 13 deletions(-)

-- 
2.27.0




[virtio-dev] [PATCH v2 4/5] virtio-mmio: Introduce MSI details

2020-01-20 Thread Jing Liu
With VIRTIO_F_MMIO_MSI feature bit offered, the Message Signal
Interrupts (MSI) is supported as first priority. For any reason it
fails to use MSI, it need use the single dedicated interrupt as before.

For MSI vectors and events mapping relationship, introduce in next patch.

Co-developed-by: Chao Peng 
Signed-off-by: Chao Peng 
Co-developed-by: Liu Jiang 
Signed-off-by: Liu Jiang 
Co-developed-by: Zha Bin 
Signed-off-by: Zha Bin 
Signed-off-by: Jing Liu 
---
 content.tex | 171 ++--
 msi-state.c |   4 ++
 2 files changed, 159 insertions(+), 16 deletions(-)
 create mode 100644 msi-state.c

diff --git a/content.tex b/content.tex
index ff151ba..dcf6c71 100644
--- a/content.tex
+++ b/content.tex
@@ -1687,7 +1687,8 @@ \subsection{MMIO Device Register Layout}\label{sec:Virtio 
Transport Options / Vi
   \hline 
   \mmioreg{InterruptStatus}{Interrupt status}{0x60}{R}{%
 Reading from this register returns a bit mask of events that
-caused the device interrupt to be asserted.
+caused the device interrupt to be asserted. This is only used
+when MSI is not enabled.
 The following events are possible:
 \begin{description}
   \item[Used Buffer Notification] - bit 0 - the interrupt was asserted
@@ -1701,7 +1702,7 @@ \subsection{MMIO Device Register Layout}\label{sec:Virtio 
Transport Options / Vi
   \mmioreg{InterruptACK}{Interrupt acknowledge}{0x064}{W}{%
 Writing a value with bits set as defined in \field{InterruptStatus}
 to this register notifies the device that events causing
-the interrupt have been handled.
+the interrupt have been handled. This is only used when MSI is not enabled.
   }
   \hline 
   \mmioreg{Status}{Device status}{0x070}{RW}{%
@@ -1760,6 +1761,47 @@ \subsection{MMIO Device Register 
Layout}\label{sec:Virtio Transport Options / Vi
 \field{SHMSel} is unused) results in a base address of
 0x.
   }
+  \hline
+  \mmioreg{MsiVecNum}{MSI max vector number}{0x0c0}{R}{%
+When VIRTIO_F_MMIO_MSI has been negotiated, reading
+from this register returns the maximum MSI vector number
+that device supports.
+  }
+  \hline
+  \mmioreg{MsiState}{MSI state}{0x0c4}{R}{%
+When VIRTIO_F_MMIO_MSI has been negotiated, reading
+from this register returns the global MSI enable/disable status.
+\lstinputlisting{msi-state.c}
+  }
+  \hline
+  \mmioreg{MsiCmd}{MSI command}{0x0c8}{W}{%
+When VIRTIO_F_MMIO_MSI has been negotiated, writing
+to this register executes the corresponding command to device.
+Part of this applies to the MSI vector selected by writing to 
\field{MsiVecSel}.
+See \ref{sec:Virtio Transport Options / Virtio Over MMIO / MMIO-specific 
Initialization And Device Operation / Device Initialization / MSI Vector 
Configuration}
+for using details.
+  }
+  \hline
+  \mmioreg{MsiVecSel}{MSI vector index}{0x0d0}{W}{%
+When VIRTIO_F_MMIO_MSI has been negotiated, writing
+to this register selects the MSI vector index that the following operations
+on \field{MsiAddrLow}, \field{MsiAddrHigh}, \field{MsiData} and part of
+\field{MsiCmd} commands specified in \ref{sec:Virtio Transport Options / 
Virtio Over MMIO / MMIO-specific Initialization And Device Operation / Device 
Initialization / MSI Vector Configuration}
+apply to. The index number of the first vector is zero (0x0).
+  }
+  \hline
+  \mmiodreg{MsiAddrLow}{MsiAddrHigh}{MSI 64 bit address}{0x0d4}{0x0d8}{W}{%
+When VIRTIO_F_MMIO_MSI has been negotiated, writing
+to these two registers (lower 32 bits of the address to \field{MsiAddrLow},
+higher 32 bits to \field{MsiAddrHigh}) notifies the device about the
+MSI address. This applies to the MSI vector selected by writing to 
\field{MsiVecSel}.
+  }
+  \hline
+  \mmioreg{MsiData}{MSI 32 bit data}{0x0dc}{W}{%
+When VIRTIO_F_MMIO_MSI has been negotiated, writing
+to this register notifies the device about the MSI data.
+This applies to the MSI vector selected by writing to \field{MsiVecSel}.
+  }
   \hline 
   \mmioreg{ConfigGeneration}{Configuration atomicity value}{0x0fc}{R}{
 Reading from this register returns a value describing a version of the 
device-specific configuration space (see \field{Config}).
@@ -1783,10 +1825,16 @@ \subsection{MMIO Device Register 
Layout}\label{sec:Virtio Transport Options / Vi
 
 The device MUST return value 0x2 in \field{Version}.
 
-The device MUST present each event by setting the corresponding bit in 
\field{InterruptStatus} from the
+When MSI is disabled, the device MUST present each event by setting the
+corresponding bit in \field{InterruptStatus} from the
 moment it takes place, until the driver acknowledges the interrupt
-by writing a corresponding bit mask to the \field{InterruptACK} register.  
Bits which
-do not represent events which took place MUST be zero.
+by writing a corresponding bit mask to the \field{InterruptACK} register.
+Bits which do

[virtio-dev] [PATCH v2 5/5] virtio-mmio: MSI vector and event mapping

2020-01-20 Thread Jing Liu
Bit 1 msi_sharing reported in the MsiState register indicates the mapping mode
device uses.

Bit 1 is 0 - device uses MSI non-sharing mode. This indicates vector per event 
and
fixed static vectors and events relationship. This fits for devices with a high 
interrupt
rate and best performance;
Bit 1 is 1 - device uses MSI sharing mode. This indicates vectors and events
dynamic mapping and fits for devices not requiring a high interrupt rate.

Co-developed-by: Chao Peng 
Signed-off-by: Chao Peng 
Co-developed-by: Liu Jiang 
Signed-off-by: Liu Jiang 
Co-developed-by: Zha Bin 
Signed-off-by: Zha Bin 
Signed-off-by: Jing Liu 
---
 content.tex | 48 +++-
 msi-state.c |  3 ++-
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/content.tex b/content.tex
index dcf6c71..2fd1686 100644
--- a/content.tex
+++ b/content.tex
@@ -1770,7 +1770,8 @@ \subsection{MMIO Device Register Layout}\label{sec:Virtio 
Transport Options / Vi
   \hline
   \mmioreg{MsiState}{MSI state}{0x0c4}{R}{%
 When VIRTIO_F_MMIO_MSI has been negotiated, reading
-from this register returns the global MSI enable/disable status.
+from this register returns the global MSI enable/disable status
+and whether device uses MSI sharing mode.
 \lstinputlisting{msi-state.c}
   }
   \hline
@@ -1926,12 +1927,18 @@ \subsubsection{Device Initialization}\label{sec:Virtio 
Transport Options / Virti
 mask and unmask the MSI vector applying to the one selected by writing
 to \field{MsiVecSel}.
 
+VIRTIO_MMIO_MSI_CMD_MAP_CONFIG command is to set the configuration event and 
MSI vector
+mapping. VIRTIO_MMIO_MSI_CMD_MAP_QUEUE is to set the queue event and MSI vector
+mapping. They SHOULD only be used in MSI sharing mode.
+
 \begin{lstlisting}
 #define  VIRTIO_MMIO_MSI_CMD_ENABLE   0x1
 #define  VIRTIO_MMIO_MSI_CMD_DISABLE  0x2
 #define  VIRTIO_MMIO_MSI_CMD_CONFIGURE0x3
 #define  VIRTIO_MMIO_MSI_CMD_MASK 0x4
 #define  VIRTIO_MMIO_MSI_CMD_UNMASK   0x5
+#define  VIRTIO_MMIO_MSI_CMD_MAP_CONFIG   0x6
+#define  VIRTIO_MMIO_MSI_CMD_MAP_QUEUE0x7
 \end{lstlisting}
 
 Setting a special NO_VECTOR value means disabling an interrupt for an event 
type.
@@ -1941,10 +1948,49 @@ \subsubsection{Device Initialization}\label{sec:Virtio 
Transport Options / Virti
 #define VIRTIO_MMIO_MSI_NO_VECTOR 0x
 \end{lstlisting}
 
+\subparagraph{MSI Vector and Event Mapping}\label{sec:Virtio Transport Options 
/ Virtio Over MMIO / MMIO-specific Initialization And Device Operation / Device 
Initialization / MSI Vector Configuration}
+The reported \field{msi_sharing} bit in the \field{MsiState} return value shows
+the MSI sharing mode that device uses.
+
+When \field{msi_sharing} bit is 0, it indicates the device uses non-sharing 
mode
+and vector per event fixed static relationship is used. The first vector is 
for device
+configuraiton change event, the second vector is for virtqueue 1, the third 
vector
+is for virtqueue 2 and so on.
+
+When \field{msi_sharing} bit is 1, it indicates the device uses MSI sharing 
mode,
+and the vector and event mapping is dynamic. Writing \field{MsiVecSel}
+followed by writing 
VIRTIO_MMIO_MSI_CMD_MAP_CONFIG/VIRTIO_MMIO_MSI_CMD_MAP_QUEUE command
+maps interrupts triggered by the configuration change/selected queue events 
respectively
+to the corresponding MSI vector.
+
+\devicenormative{\subparagraph}{MSI Vector Configuration}{Virtio Transport 
Options / Virtio Over MMIO / MMIO-specific Initialization And Device Operation 
/ MSI Vector Configuration}
+
+When the device reports \field{msi_sharing} bit as 0, it SHOULD support a 
number of
+vectors that greater than the maximum number of virtqueues.
+Device MUST report the number of vectors supported in \field{MsiVecNum}.
+
+When the device reports \field{msi_sharing} bit as 1, it SHOULD support at 
least
+2 MSI vectors and MUST report in \field{MsiVecNum}. Device SHOULD support 
mapping any
+event type to any vector under \field{MsiVecNum}.
+
+Device MUST support unmapping any event type (NO_VECTOR).
+
+The device SHOULD restrict the reported \field{msi_sharing} and 
\field{MsiVecNum}
+to a value that might benefit system performance.
+
+\begin{note}
+For example, a device which does not expect to send interrupts at a high rate 
might
+return \field{msi_sharing} bit as 1.
+\end{note}
+
 \drivernormative{\subparagraph}{MSI Vector Configuration}{Virtio Transport 
Options / Virtio Over MMIO / MMIO-specific Initialization And Device Operation 
/ MSI Vector Configuration}
 When VIRTIO_F_MMIO_MSI has been negotiated, driver should try to configure
 and enable MSI.
 
+To set up the event and vector mapping for MSI sharing mode, driver SHOULD
+write a valid \field{MsiVecSel} followed by 
VIRTIO_MMIO_MSI_CMD_MAP_CONFIG/VIRTIO_MMIO_MSI_CMD_MAP_QUEUE
+command to map the configuration change/selected queue events respectively.
+
 To configure MSI vector, driver SHOULD firstly

[virtio-dev] [PATCH v2 2/5] virtio-mmio: Enhance queue notification support

2020-01-20 Thread Jing Liu
With VIRTIO_F_MMIO_NOTIFICATION feature bit offered, the notification
mechanism is enhanced. Driver reads QueueNotify register to get
notification structure and calculate notification addresses of
each virtqueue.

Co-developed-by: Chao Peng 
Signed-off-by: Chao Peng 
Co-developed-by: Liu Jiang 
Signed-off-by: Liu Jiang 
Co-developed-by: Zha Bin 
Signed-off-by: Zha Bin 
Signed-off-by: Jing Liu 
---
 content.tex | 53 -
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/content.tex b/content.tex
index 826bc7d..5881253 100644
--- a/content.tex
+++ b/content.tex
@@ -1671,20 +1671,18 @@ \subsection{MMIO Device Register 
Layout}\label{sec:Virtio Transport Options / Vi
 accesses apply to the queue selected by writing to \field{QueueSel}.
   }
   \hline 
-  \mmioreg{QueueNotify}{Queue notifier}{0x050}{W}{%
-Writing a value to this register notifies the device that
-there are new buffers to process in a queue.
+  \mmioreg{QueueNotify}{Queue notifier}{0x050}{RW}{%
+When VIRTIO_F_MMIO_NOTIFICATION has not been negotiated, writing to this
+register notifies the device that there are new buffers to process in a 
queue.
 
-When VIRTIO_F_NOTIFICATION_DATA has not been negotiated,
-the value written is the queue index.
+When VIRTIO_F_MMIO_NOTIFICATION has been negotiated, reading this register
+returns the virtqueue notification structure for calculating notification 
location.
 
-When VIRTIO_F_NOTIFICATION_DATA has been negotiated,
-the \field{Notification data} value has the following format:
+See \ref{sec:Virtio Transport Options / Virtio Over MMIO / MMIO-specific 
Initialization And Device Operation / Notification Structure Layout}
+for the notification structure format.
 
-\lstinputlisting{notifications-le.c}
-
-See \ref{sec:Virtqueues / Driver notifications}~\nameref{sec:Virtqueues / 
Driver notifications}
-for the definition of the components.
+See \ref{sec:Virtio Transport Options / Virtio Over MMIO / MMIO-specific 
Initialization And Device Operation / Available Buffer Notifications}
+for the notification data format.
   }
   \hline 
   \mmioreg{InterruptStatus}{Interrupt status}{0x60}{R}{%
@@ -1858,6 +1856,31 @@ \subsubsection{Device Initialization}\label{sec:Virtio 
Transport Options / Virti
 Further initialization MUST follow the procedure described in
 \ref{sec:General Initialization And Device Operation / Device 
Initialization}~\nameref{sec:General Initialization And Device Operation / 
Device Initialization}.
 
+\subsubsection{Notification Structure Layout}\label{sec:Virtio Transport 
Options / Virtio Over MMIO / MMIO-specific Initialization And Device Operation 
/ Notification Structure Layout}
+
+When VIRTIO_F_MMIO_NOTIFICATION has been negotiated, the notification location 
is calculated
+by notification structure. Driver reads \field{QueueNotify} to get this 
structure formatted
+as follows.
+
+\begin{lstlisting}
+le32 {
+notify_base : 16;
+notify_multiplier : 16;
+};
+\end{lstlisting}
+
+\field{notify_multiplier} is combined with virtqueue index to derive the Queue 
Notify address
+within a memory mapped control registers for a virtqueue:
+
+\begin{lstlisting}
+notify_base + queue_index * notify_multiplier
+\end{lstlisting}
+
+\begin{note}
+For example, if notify_multiplier is 0, the device uses the same Queue Notify 
address for all
+queues.
+\end{note}
+
 \subsubsection{Virtqueue Configuration}\label{sec:Virtio Transport Options / 
Virtio Over MMIO / MMIO-specific Initialization And Device Operation / 
Virtqueue Configuration}
 
 The driver will typically initialize the virtual queue in the following way:
@@ -1893,16 +1916,20 @@ \subsubsection{Available Buffer 
Notifications}\label{sec:Virtio Transport Option
 When VIRTIO_F_NOTIFICATION_DATA has not been negotiated,
 the driver sends an available buffer notification to the device by writing
 the 16-bit virtqueue index
-of the queue to be notified to \field{QueueNotify}.
+of the queue to be notified to Queue Notify address.
 
 When VIRTIO_F_NOTIFICATION_DATA has been negotiated,
 the driver sends an available buffer notification to the device by writing
-the following 32-bit value to \field{QueueNotify}:
+the following 32-bit value to Queue Notify address:
 \lstinputlisting{notifications-le.c}
 
 See \ref{sec:Virtqueues / Driver notifications}~\nameref{sec:Virtqueues / 
Driver notifications}
 for the definition of the components.
 
+For device not offering VIRTIO_F_MMIO_NOTIFICATION, the Queue Notify address 
is \field{QueueNotify}.
+For device offering VIRTIO_F_MMIO_NOTIFICATION, see \ref{sec:Virtio Transport 
Options / Virtio Over MMIO / MMIO-specific Initialization And Device Operation 
/ Notification Structure Layout}
+for how to calculate the Queue Notify address.
+
 \subsubsection{Notifications From The Device}\label{sec:Virtio Transport 
Options / Virtio Over MMIO / MMIO-specific

[virtio-dev] [PATCH v2 3/5] virtio-mmio: Add feature bit for MMIO MSI

2020-01-20 Thread Jing Liu
The current MMIO transport layer uses a single, dedicated interrupt
signal, which brings performance penalty. Add a feature bit (40)
for introducing MSI capability.

Co-developed-by: Chao Peng 
Signed-off-by: Chao Peng 
Co-developed-by: Liu Jiang 
Signed-off-by: Liu Jiang 
Co-developed-by: Zha Bin 
Signed-off-by: Zha Bin 
Signed-off-by: Jing Liu 
---
 content.tex | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/content.tex b/content.tex
index 5881253..ff151ba 100644
--- a/content.tex
+++ b/content.tex
@@ -5840,6 +5840,9 @@ \chapter{Reserved Feature Bits}\label{sec:Reserved 
Feature Bits}
   \item[VIRTIO_F_MMIO_NOTIFICATION(39)] This feature indicates
   that the device supports enhanced notification mechanism on
   MMIO transport layer.
+  \item[VIRTIO_F_MMIO_MSI(40)] This feature indicates that the
+  device supports Message Signal Interrupts (MSI) mechanism on
+  MMIO transport layer.
 
 \drivernormative{\section}{Reserved Feature Bits}{Reserved Feature Bits}
 
@@ -5875,6 +5878,10 @@ \chapter{Reserved Feature Bits}\label{sec:Reserved 
Feature Bits}
 
 A driver SHOULD accept VIRTIO_F_MMIO_NOTIFICATION if it is offered.
 
+A driver SHOULD accept VIRTIO_F_MMIO_MSI if it is offered.
+If VIRTIO_F_MMIO_MSI has been negotiated, a driver MUST try to
+set up MSI at first priority.
+
 \devicenormative{\section}{Reserved Feature Bits}{Reserved Feature Bits}
 
 A device MUST offer VIRTIO_F_VERSION_1.  A device MAY fail to operate further
-- 
2.7.4




[virtio-dev] [PATCH v2 0/5] virtio-mmio enhancement

2020-01-20 Thread Jing Liu
The current virtio over MMIO has some limitations that impact the performance.
It only supports single legacy, dedicated interrupt and one virtqueue
notification register for all virtqueues which cause performance penalties.

To address such limitations, we proposed to update virtio-mmio spec with
two new feature bits to support MSI interrupt and enhancing notification 
mechanism.

For keeping virtio-mmio simple as it was, and considering practical usages, this
provides two kinds of mapping mode for device: MSI non-sharing mode and MSI 
sharing mode.
MSI non-sharng mode indicates a fixed static vector and event relationship 
specified
in spec, which can simplify the setup process and reduce vmexit, fitting for
a high interrupt rate request.
MSI sharing mode indicates a dynamic mapping, which is more like PCI does, 
fitting
for a non-high interrupt rate request.

Change Log:
v1->v2:
* Change version update to feature bit
* Add mask/unmask support
* Add two MSI sharing/non-sharing modes
* Change MSI registers layout and bits

Jing Liu (5):
  virtio-mmio: Add feature bit for MMIO notification
  virtio-mmio: Enhance queue notification support
  virtio-mmio: Add feature bit for MMIO MSI
  virtio-mmio: Introduce MSI details
  virtio-mmio: MSI vector and event mapping

 content.tex | 286 ++--
 msi-state.c |   5 ++
 2 files changed, 262 insertions(+), 29 deletions(-)
 create mode 100644 msi-state.c

-- 
2.7.4




[virtio-dev] [PATCH v2 1/5] virtio-mmio: Add feature bit for MMIO notification

2020-01-20 Thread Jing Liu
All the queues notifications use the same register on MMIO transport
layer. Add a feature bit (39) for enhancing the notification capability.
The detailed mechanism would be in next patch.

Co-developed-by: Chao Peng 
Signed-off-by: Chao Peng 
Co-developed-by: Liu Jiang 
Signed-off-by: Liu Jiang 
Co-developed-by: Zha Bin 
Signed-off-by: Zha Bin 
Signed-off-by: Jing Liu 
---
 content.tex | 9 +
 1 file changed, 9 insertions(+)

diff --git a/content.tex b/content.tex
index d68cfaf..826bc7d 100644
--- a/content.tex
+++ b/content.tex
@@ -5810,6 +5810,9 @@ \chapter{Reserved Feature Bits}\label{sec:Reserved 
Feature Bits}
   in its device notifications.
   See \ref{sec:Virtqueues / Driver notifications}~\nameref{sec:Virtqueues / 
Driver notifications}.
 \end{description}
+  \item[VIRTIO_F_MMIO_NOTIFICATION(39)] This feature indicates
+  that the device supports enhanced notification mechanism on
+  MMIO transport layer.
 
 \drivernormative{\section}{Reserved Feature Bits}{Reserved Feature Bits}
 
@@ -5843,6 +5846,8 @@ \chapter{Reserved Feature Bits}\label{sec:Reserved 
Feature Bits}
 or partially reset, and even without re-negotiating
 VIRTIO_F_SR_IOV after the reset.
 
+A driver SHOULD accept VIRTIO_F_MMIO_NOTIFICATION if it is offered.
+
 \devicenormative{\section}{Reserved Feature Bits}{Reserved Feature Bits}
 
 A device MUST offer VIRTIO_F_VERSION_1.  A device MAY fail to operate further
@@ -5872,6 +5877,10 @@ \chapter{Reserved Feature Bits}\label{sec:Reserved 
Feature Bits}
 and presents a PCI SR-IOV capability structure, otherwise
 it MUST NOT offer VIRTIO_F_SR_IOV.
 
+If VIRTIO_F_MMIO_NOTIFICATION has been negotiated, a device
+MUST support handling the notification from driver at the
+calculated location.
+
 \section{Legacy Interface: Reserved Feature Bits}\label{sec:Reserved Feature 
Bits / Legacy Interface: Reserved Feature Bits}
 
 Transitional devices MAY offer the following:
-- 
2.7.4




Re: [Qemu-devel] [PATCH v3 0/4] Introduce the microvm machine type

2019-09-01 Thread Jing Liu




On 8/30/2019 10:27 PM, Sergio Lopez wrote:


Jing Liu  writes:


Hi Sergio,

On 8/29/2019 11:46 PM, Sergio Lopez wrote:


Jing Liu  writes:


Hi Sergio,

The idea is interesting and I tried to launch a guest by your
guide but seems failed to me. I tried both legacy and normal modes,
but the vncviewer connected and told me that:
The vm has no graphic display device.
All the screen in vnc is just black.


The microvm machine type doesn't support any graphics device, so you
need to rely on the serial console.

Got it.




kernel config:
CONFIG_KVM_MMIO=y
CONFIG_VIRTIO_MMIO=y

I don't know if any specified kernel version/patch/config
is needed or anything I missed.
Could you kindly give some tips?


I'm testing it with upstream vanilla Linux. In addition to MMIO, you
need to add support for PVH (the next version of this patchset, v4, will
support booting from FW, so it'll be possible to use non-PVH ELF kernels
and bzImages too).

I've just uploaded a working kernel config here:

https://gist.github.com/slp/1060ba3aaf708584572ad4109f28c8f9


Thanks very much and this config is helpful to me.


As for the QEMU command line, something like this should do the trick:

./x86_64-softmmu/qemu-system-x86_64 -smp 1 -m 1g -enable-kvm -M microvm,legacy -kernel 
vmlinux -append "earlyprintk=ttyS0 console=ttyS0 reboot=k panic=1" -nodefaults 
-no-user-config -nographic -serial stdio

If this works, you can move to non-legacy mode with a virtio-console:

./x86_64-softmmu/qemu-system-x86_64 -smp 1 -m 1g -enable-kvm -M microvm -kernel vmlinux 
-append "console=hvc0 reboot=k panic=1" -nodefaults -no-user-config -nographic 
-serial pty -chardev stdio,id=virtiocon0,server -device virtio-serial-device -device 
virtconsole,chardev=virtiocon0


I tried the above two ways and it works now. Thanks!


If is still working, you can try adding some devices too:

./x86_64-softmmu/qemu-system-x86_64 -smp 1 -m 1g -enable-kvm -M microvm -kernel vmlinux 
-append "console=hvc0 reboot=k panic=1 root=/dev/vda" -nodefaults 
-no-user-config -nographic -serial pty -chardev stdio,id=virtiocon0,server -device 
virtio-serial-device -device virtconsole,chardev=virtiocon0 -netdev user,id=testnet 
-device virtio-net-device,netdev=testnet -drive 
id=test,file=alpine-rootfs-x86_64.raw,format=raw,if=none -device 
virtio-blk-device,drive=test


But I'm wondering why the image I used can not be found.
root=/dev/vda3 and the same image worked well on normal qemu/guest-
config bootup, but didn't work here. The details are,

-append "console=hvc0 reboot=k panic=1 root=/dev/vda3 rw rootfstype=ext4" \

[0.022784] Key type encrypted registered
[0.022988] VFS: Cannot open root device "vda3" or
unknown-block(254,3): error -6
[0.023041] Please append a correct "root=" boot option; here are
the available partitions:
[0.023089] fe00 8946688 vda
[0.023090]  driver: virtio_blk
[0.023143] Kernel panic - not syncing: VFS: Unable to mount root
fs on unknown-block(254,3)
[0.023201] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc3 #23


BTW, root=/dev/vda is also tried and didn't work. The dmesg is a
little different:

[0.028050] Key type encrypted registered
[0.028484] List of all partitions:
[0.028529] fe00 8946688 vda
[0.028529]  driver: virtio_blk
[0.028615] No filesystem could mount root, tried:
[0.028616]  ext4
[0.028670]
[0.028712] Kernel panic - not syncing: VFS: Unable to mount root
fs on unknown-block(254,0)

I tried another ext4 img but still doesn't work.
Is there any limitation of blk image? Could I copy your image for simple
test?


The kernel config I posted lacks support for DOS partitions. Adding
CONFIG_MSDOS_PARTITION=y should allow you to boot from /dev/vda3.

Anyway, in case you also want to try booting from /dev/vda (without
partitions), this is the recipe I use to quickly create a minimal rootfs
image:

# wget 
http://dl-cdn.alpinelinux.org/alpine/v3.10/releases/x86_64/alpine-minirootfs-3.10.2-x86_64.tar.gz
# qemu-img create -f raw alpine-rootfs-x86_64.raw 1G
# sudo losetup /dev/loop0 alpine-rootfs-x86_64.raw
# sudo mkfs.ext4 /dev/loop0
# sudo mount /dev/loop0 /mnt
# sudo tar xpf alpine-minirootfs-3.10.2-x86_64.tar.gz -C /mnt
# sudo umount /mnt
# sudo losetup -d /dev/loop0

The rootfs will be missing openrc, so you'll need to add "init=/bin/sh"
to the command line.



Thank you Sergio. I'll try that.

Jing

Sergio.


Thanks in advance,
Jing


Sergio.




Re: [Qemu-devel] [PATCH v3 0/4] Introduce the microvm machine type

2019-08-29 Thread Jing Liu

Hi Sergio,

On 8/29/2019 11:46 PM, Sergio Lopez wrote:


Jing Liu  writes:


Hi Sergio,

The idea is interesting and I tried to launch a guest by your
guide but seems failed to me. I tried both legacy and normal modes,
but the vncviewer connected and told me that:
The vm has no graphic display device.
All the screen in vnc is just black.


The microvm machine type doesn't support any graphics device, so you
need to rely on the serial console.

Got it.




kernel config:
CONFIG_KVM_MMIO=y
CONFIG_VIRTIO_MMIO=y

I don't know if any specified kernel version/patch/config
is needed or anything I missed.
Could you kindly give some tips?


I'm testing it with upstream vanilla Linux. In addition to MMIO, you
need to add support for PVH (the next version of this patchset, v4, will
support booting from FW, so it'll be possible to use non-PVH ELF kernels
and bzImages too).

I've just uploaded a working kernel config here:

https://gist.github.com/slp/1060ba3aaf708584572ad4109f28c8f9


Thanks very much and this config is helpful to me.


As for the QEMU command line, something like this should do the trick:

./x86_64-softmmu/qemu-system-x86_64 -smp 1 -m 1g -enable-kvm -M microvm,legacy -kernel 
vmlinux -append "earlyprintk=ttyS0 console=ttyS0 reboot=k panic=1" -nodefaults 
-no-user-config -nographic -serial stdio

If this works, you can move to non-legacy mode with a virtio-console:

./x86_64-softmmu/qemu-system-x86_64 -smp 1 -m 1g -enable-kvm -M microvm -kernel vmlinux 
-append "console=hvc0 reboot=k panic=1" -nodefaults -no-user-config -nographic 
-serial pty -chardev stdio,id=virtiocon0,server -device virtio-serial-device -device 
virtconsole,chardev=virtiocon0


I tried the above two ways and it works now. Thanks!


If is still working, you can try adding some devices too:

./x86_64-softmmu/qemu-system-x86_64 -smp 1 -m 1g -enable-kvm -M microvm -kernel vmlinux 
-append "console=hvc0 reboot=k panic=1 root=/dev/vda" -nodefaults 
-no-user-config -nographic -serial pty -chardev stdio,id=virtiocon0,server -device 
virtio-serial-device -device virtconsole,chardev=virtiocon0 -netdev user,id=testnet 
-device virtio-net-device,netdev=testnet -drive 
id=test,file=alpine-rootfs-x86_64.raw,format=raw,if=none -device 
virtio-blk-device,drive=test


But I'm wondering why the image I used can not be found.
root=/dev/vda3 and the same image worked well on normal qemu/guest-
config bootup, but didn't work here. The details are,

-append "console=hvc0 reboot=k panic=1 root=/dev/vda3 rw rootfstype=ext4" \

[0.022784] Key type encrypted registered
[0.022988] VFS: Cannot open root device "vda3" or 
unknown-block(254,3): error -6
[0.023041] Please append a correct "root=" boot option; here are the 
available partitions:

[0.023089] fe00 8946688 vda
[0.023090]  driver: virtio_blk
[0.023143] Kernel panic - not syncing: VFS: Unable to mount root fs 
on unknown-block(254,3)

[0.023201] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc3 #23


BTW, root=/dev/vda is also tried and didn't work. The dmesg is a little 
different:


[0.028050] Key type encrypted registered
[0.028484] List of all partitions:
[0.028529] fe00 8946688 vda
[0.028529]  driver: virtio_blk
[0.028615] No filesystem could mount root, tried:
[0.028616]  ext4
[0.028670]
[0.028712] Kernel panic - not syncing: VFS: Unable to mount root fs 
on unknown-block(254,0)


I tried another ext4 img but still doesn't work.
Is there any limitation of blk image? Could I copy your image for simple
test?

Thanks in advance,
Jing


Sergio.


Thanks very much.
Jing




A QEMU instance with the microvm machine type can be invoked this way:

   - Normal mode:

qemu-system-x86_64 -M microvm -m 512m -smp 2 \
   -kernel vmlinux -append "console=hvc0 root=/dev/vda" \
   -nodefaults -no-user-config \
   -chardev pty,id=virtiocon0,server \
   -device virtio-serial-device \
   -device virtconsole,chardev=virtiocon0 \
   -drive id=test,file=test.img,format=raw,if=none \
   -device virtio-blk-device,drive=test \
   -netdev tap,id=tap0,script=no,downscript=no \
   -device virtio-net-device,netdev=tap0

   - Legacy mode:

qemu-system-x86_64 -M microvm,legacy -m 512m -smp 2 \
   -kernel vmlinux -append "console=ttyS0 root=/dev/vda" \
   -nodefaults -no-user-config \
   -drive id=test,file=test.img,format=raw,if=none \
   -device virtio-blk-device,drive=test \
   -netdev tap,id=tap0,script=no,downscript=no \
   -device virtio-net-device,netdev=tap0 \
   -serial stdio







Re: [Qemu-devel] [PATCH v3 0/4] Introduce the microvm machine type

2019-08-29 Thread Jing Liu

Hi Sergio,

The idea is interesting and I tried to launch a guest by your
guide but seems failed to me. I tried both legacy and normal modes,
but the vncviewer connected and told me that:
The vm has no graphic display device.
All the screen in vnc is just black.

kernel config:
CONFIG_KVM_MMIO=y
CONFIG_VIRTIO_MMIO=y

I don't know if any specified kernel version/patch/config
is needed or anything I missed.
Could you kindly give some tips?

Thanks very much.
Jing




A QEMU instance with the microvm machine type can be invoked this way:

  - Normal mode:

qemu-system-x86_64 -M microvm -m 512m -smp 2 \
  -kernel vmlinux -append "console=hvc0 root=/dev/vda" \
  -nodefaults -no-user-config \
  -chardev pty,id=virtiocon0,server \
  -device virtio-serial-device \
  -device virtconsole,chardev=virtiocon0 \
  -drive id=test,file=test.img,format=raw,if=none \
  -device virtio-blk-device,drive=test \
  -netdev tap,id=tap0,script=no,downscript=no \
  -device virtio-net-device,netdev=tap0

  - Legacy mode:

qemu-system-x86_64 -M microvm,legacy -m 512m -smp 2 \
  -kernel vmlinux -append "console=ttyS0 root=/dev/vda" \
  -nodefaults -no-user-config \
  -drive id=test,file=test.img,format=raw,if=none \
  -device virtio-blk-device,drive=test \
  -netdev tap,id=tap0,script=no,downscript=no \
  -device virtio-net-device,netdev=tap0 \
  -serial stdio





Re: [Qemu-devel] [PATCH v2] x86: Intel AVX512_BF16 feature enabling

2019-08-19 Thread Jing Liu

Ping~ :)

Thanks,
Jing

On 7/25/2019 2:14 PM, Jing Liu wrote:

Intel CooperLake cpu adds AVX512_BF16 instruction, defining as
CPUID.(EAX=7,ECX=1):EAX[bit 05].

The patch adds a property for setting the subleaf of CPUID leaf 7 in
case that people would like to specify it.

The release spec link as follows,
https://software.intel.com/sites/default/files/managed/c5/15/\
architecture-instruction-set-extensions-programming-reference.pdf

Signed-off-by: Jing Liu 
---
  target/i386/cpu.c | 39 ++-
  target/i386/cpu.h |  7 +++
  target/i386/kvm.c |  3 ++-
  3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 805ce95..517dedb 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -770,6 +770,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t 
vendor1,
/* CPUID_7_0_ECX_OSPKE is dynamic */ \
CPUID_7_0_ECX_LA57)
  #define TCG_7_0_EDX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES 0
  #define TCG_APM_FEATURES 0
  #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
  #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -1095,6 +1096,25 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
  },
  .tcg_features = TCG_7_0_EDX_FEATURES,
  },
+[FEAT_7_1_EAX] = {
+.type = CPUID_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, NULL, NULL,
+NULL, "avx512-bf16", NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.cpuid = {
+.eax = 7,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EAX,
+},
+.tcg_features = TCG_7_1_EAX_FEATURES,
+},
  [FEAT_8000_0007_EDX] = {
  .type = CPUID_FEATURE_WORD,
  .feat_names = {
@@ -4293,13 +4313,19 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
  case 7:
  /* Structured Extended Feature Flags Enumeration Leaf */
  if (count == 0) {
-*eax = 0; /* Maximum ECX value for sub-leaves */
+/* Maximum ECX value for sub-leaves */
+*eax = env->cpuid_level_func7;
  *ebx = env->features[FEAT_7_0_EBX]; /* Feature flags */
  *ecx = env->features[FEAT_7_0_ECX]; /* Feature flags */
  if ((*ecx & CPUID_7_0_ECX_PKU) && env->cr[4] & CR4_PKE_MASK) {
  *ecx |= CPUID_7_0_ECX_OSPKE;
  }
  *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */
+} else if (count == 1) {
+*eax = env->features[FEAT_7_1_EAX];
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
  } else {
  *eax = 0;
  *ebx = 0;
@@ -4949,6 +4975,11 @@ static void x86_cpu_adjust_feat_level(X86CPU *cpu, 
FeatureWord w)
  x86_cpu_adjust_level(cpu, >cpuid_min_xlevel2, eax);
  break;
  }
+
+if (eax == 7) {
+x86_cpu_adjust_level(cpu, >cpuid_min_level_func7,
+ fi->cpuid.ecx);
+}
  }
  
  /* Calculate XSAVE components based on the configured CPU feature flags */

@@ -5067,6 +5098,7 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
  x86_cpu_adjust_feat_level(cpu, FEAT_1_ECX);
  x86_cpu_adjust_feat_level(cpu, FEAT_6_EAX);
  x86_cpu_adjust_feat_level(cpu, FEAT_7_0_ECX);
+x86_cpu_adjust_feat_level(cpu, FEAT_7_1_EAX);
  x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_EDX);
  x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_ECX);
  x86_cpu_adjust_feat_level(cpu, FEAT_8000_0007_EDX);
@@ -5098,6 +5130,9 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
  }
  
  /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly set */

+if (env->cpuid_level_func7 == UINT32_MAX) {
+env->cpuid_level_func7 = env->cpuid_min_level_func7;
+}
  if (env->cpuid_level == UINT32_MAX) {
  env->cpuid_level = env->cpuid_min_level;
  }
@@ -5869,6 +5904,8 @@ static Property x86_cpu_properties[] = {
  DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false),
  DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, host_phys_bits_limit, 
0),
  DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),
+DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7,
+   UINT32_MAX),
  DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX),
  DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, UINT32_MAX),
  DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, UINT32_MAX),
diff --git a/target/i386/cpu.h b

Re: [Qemu-devel] [PATCH v2] x86: Intel AVX512_BF16 feature enabling

2019-07-31 Thread Jing Liu

Hi,

Looking forward to your comments. :)

Thanks!
Jing

On 7/25/2019 2:14 PM, Jing Liu wrote:

Intel CooperLake cpu adds AVX512_BF16 instruction, defining as
CPUID.(EAX=7,ECX=1):EAX[bit 05].

The patch adds a property for setting the subleaf of CPUID leaf 7 in
case that people would like to specify it.

The release spec link as follows,
https://software.intel.com/sites/default/files/managed/c5/15/\
architecture-instruction-set-extensions-programming-reference.pdf

Signed-off-by: Jing Liu 
---
  target/i386/cpu.c | 39 ++-
  target/i386/cpu.h |  7 +++
  target/i386/kvm.c |  3 ++-
  3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 805ce95..517dedb 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -770,6 +770,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t 
vendor1,
/* CPUID_7_0_ECX_OSPKE is dynamic */ \
CPUID_7_0_ECX_LA57)
  #define TCG_7_0_EDX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES 0
  #define TCG_APM_FEATURES 0
  #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
  #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -1095,6 +1096,25 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
  },
  .tcg_features = TCG_7_0_EDX_FEATURES,
  },
+[FEAT_7_1_EAX] = {
+.type = CPUID_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, NULL, NULL,
+NULL, "avx512-bf16", NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.cpuid = {
+.eax = 7,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EAX,
+},
+.tcg_features = TCG_7_1_EAX_FEATURES,
+},
  [FEAT_8000_0007_EDX] = {
  .type = CPUID_FEATURE_WORD,
  .feat_names = {
@@ -4293,13 +4313,19 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
  case 7:
  /* Structured Extended Feature Flags Enumeration Leaf */
  if (count == 0) {
-*eax = 0; /* Maximum ECX value for sub-leaves */
+/* Maximum ECX value for sub-leaves */
+*eax = env->cpuid_level_func7;
  *ebx = env->features[FEAT_7_0_EBX]; /* Feature flags */
  *ecx = env->features[FEAT_7_0_ECX]; /* Feature flags */
  if ((*ecx & CPUID_7_0_ECX_PKU) && env->cr[4] & CR4_PKE_MASK) {
  *ecx |= CPUID_7_0_ECX_OSPKE;
  }
  *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */
+} else if (count == 1) {
+*eax = env->features[FEAT_7_1_EAX];
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
  } else {
  *eax = 0;
  *ebx = 0;
@@ -4949,6 +4975,11 @@ static void x86_cpu_adjust_feat_level(X86CPU *cpu, 
FeatureWord w)
  x86_cpu_adjust_level(cpu, >cpuid_min_xlevel2, eax);
  break;
  }
+
+if (eax == 7) {
+x86_cpu_adjust_level(cpu, >cpuid_min_level_func7,
+ fi->cpuid.ecx);
+}
  }
  
  /* Calculate XSAVE components based on the configured CPU feature flags */

@@ -5067,6 +5098,7 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
  x86_cpu_adjust_feat_level(cpu, FEAT_1_ECX);
  x86_cpu_adjust_feat_level(cpu, FEAT_6_EAX);
  x86_cpu_adjust_feat_level(cpu, FEAT_7_0_ECX);
+x86_cpu_adjust_feat_level(cpu, FEAT_7_1_EAX);
  x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_EDX);
  x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_ECX);
  x86_cpu_adjust_feat_level(cpu, FEAT_8000_0007_EDX);
@@ -5098,6 +5130,9 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
  }
  
  /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly set */

+if (env->cpuid_level_func7 == UINT32_MAX) {
+env->cpuid_level_func7 = env->cpuid_min_level_func7;
+}
  if (env->cpuid_level == UINT32_MAX) {
  env->cpuid_level = env->cpuid_min_level;
  }
@@ -5869,6 +5904,8 @@ static Property x86_cpu_properties[] = {
  DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false),
  DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, host_phys_bits_limit, 
0),
  DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),
+DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7,
+   UINT32_MAX),
  DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX),
  DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, UINT32_MAX),
  DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, UINT32_MAX

[Qemu-devel] [PATCH v2] x86: Intel AVX512_BF16 feature enabling

2019-07-25 Thread Jing Liu
Intel CooperLake cpu adds AVX512_BF16 instruction, defining as
CPUID.(EAX=7,ECX=1):EAX[bit 05].

The patch adds a property for setting the subleaf of CPUID leaf 7 in
case that people would like to specify it.

The release spec link as follows,
https://software.intel.com/sites/default/files/managed/c5/15/\
architecture-instruction-set-extensions-programming-reference.pdf

Signed-off-by: Jing Liu 
---
 target/i386/cpu.c | 39 ++-
 target/i386/cpu.h |  7 +++
 target/i386/kvm.c |  3 ++-
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 805ce95..517dedb 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -770,6 +770,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t 
vendor1,
   /* CPUID_7_0_ECX_OSPKE is dynamic */ \
   CPUID_7_0_ECX_LA57)
 #define TCG_7_0_EDX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES 0
 #define TCG_APM_FEATURES 0
 #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
 #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -1095,6 +1096,25 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
 },
 .tcg_features = TCG_7_0_EDX_FEATURES,
 },
+[FEAT_7_1_EAX] = {
+.type = CPUID_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, NULL, NULL,
+NULL, "avx512-bf16", NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.cpuid = {
+.eax = 7,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EAX,
+},
+.tcg_features = TCG_7_1_EAX_FEATURES,
+},
 [FEAT_8000_0007_EDX] = {
 .type = CPUID_FEATURE_WORD,
 .feat_names = {
@@ -4293,13 +4313,19 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 case 7:
 /* Structured Extended Feature Flags Enumeration Leaf */
 if (count == 0) {
-*eax = 0; /* Maximum ECX value for sub-leaves */
+/* Maximum ECX value for sub-leaves */
+*eax = env->cpuid_level_func7;
 *ebx = env->features[FEAT_7_0_EBX]; /* Feature flags */
 *ecx = env->features[FEAT_7_0_ECX]; /* Feature flags */
 if ((*ecx & CPUID_7_0_ECX_PKU) && env->cr[4] & CR4_PKE_MASK) {
 *ecx |= CPUID_7_0_ECX_OSPKE;
 }
 *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */
+} else if (count == 1) {
+*eax = env->features[FEAT_7_1_EAX];
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
 } else {
 *eax = 0;
 *ebx = 0;
@@ -4949,6 +4975,11 @@ static void x86_cpu_adjust_feat_level(X86CPU *cpu, 
FeatureWord w)
 x86_cpu_adjust_level(cpu, >cpuid_min_xlevel2, eax);
 break;
 }
+
+if (eax == 7) {
+x86_cpu_adjust_level(cpu, >cpuid_min_level_func7,
+ fi->cpuid.ecx);
+}
 }
 
 /* Calculate XSAVE components based on the configured CPU feature flags */
@@ -5067,6 +5098,7 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
 x86_cpu_adjust_feat_level(cpu, FEAT_1_ECX);
 x86_cpu_adjust_feat_level(cpu, FEAT_6_EAX);
 x86_cpu_adjust_feat_level(cpu, FEAT_7_0_ECX);
+x86_cpu_adjust_feat_level(cpu, FEAT_7_1_EAX);
 x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_EDX);
 x86_cpu_adjust_feat_level(cpu, FEAT_8000_0001_ECX);
 x86_cpu_adjust_feat_level(cpu, FEAT_8000_0007_EDX);
@@ -5098,6 +5130,9 @@ static void x86_cpu_expand_features(X86CPU *cpu, Error 
**errp)
 }
 
 /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly set */
+if (env->cpuid_level_func7 == UINT32_MAX) {
+env->cpuid_level_func7 = env->cpuid_min_level_func7;
+}
 if (env->cpuid_level == UINT32_MAX) {
 env->cpuid_level = env->cpuid_min_level;
 }
@@ -5869,6 +5904,8 @@ static Property x86_cpu_properties[] = {
 DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false),
 DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, host_phys_bits_limit, 0),
 DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),
+DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7,
+   UINT32_MAX),
 DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX),
 DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, UINT32_MAX),
 DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, UINT32_MAX),
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 05393cf..df9106f 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -479,6 

Re: [Qemu-devel] [PATCH v1] x86: Intel AVX512_BF16 feature enabling

2019-07-24 Thread Jing Liu




On 7/22/2019 7:50 PM, Paolo Bonzini wrote:

On 22/07/19 04:59, Jing Liu wrote:



On 7/19/2019 4:10 PM, Paolo Bonzini wrote:

On 19/07/19 09:20, Jing Liu wrote:

Then CPUID[7,0].EAX is set automatically to 0 or 1 depending on whether
BF16 is enabled or not.


Could I ask why don't we directly check BF16 enabling when
cpu_x86_cpuid(env, 7, 0, ...) during kvm_arch_init_vcpu ?


Because the code for setting CPUID is common for all accelerators (there
are five supported: KVM, HAX, HVF, TCG, WHPX).


What is the use of the two new properties? Are they used for users
setting parameters when boot up guest, and why we need users setting
func7 level?


For example to test guests with CPUID[7,0].EAX==1, even if the host does
not have BF16.


Thanks. :)




@@ -5075,6 +5101,10 @@ static void x86_cpu_expand_features(X86CPU *cpu,
Error **errp)
   x86_cpu_adjust_feat_level(cpu, FEAT_SVM);
   x86_cpu_adjust_feat_level(cpu, FEAT_XSAVE);

+   if ((env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_AVX512_BF16) &&
+    kvm_enabled()) {


No need to check KVM.  You could also do just
x86_cpu_adjust_feat_level(cpu, FEAT_7_1_EAX) and set
cpu->min_level_func7 in x86_cpu_adjust_feat_level with something like

  if (eax == 7) {
  x86_cpu_adjust_level(cpu, >cpu_min_level_func7,
   fi->cpuid.ecx);
  }



Got it. One question I'm wondering is, is it possible for users setting
an invalid property like level-func7=2? Do we need some protection?


No, it's still not found in Intel silicon, but in principle you could
have higher indices than 1.  So it's okay, if something breaks it's the
fault of whoever set the option!



Thanks very much. So would you like me to update the patch with v2 now?

Jing




Re: [Qemu-devel] [PATCH v1] x86: Intel AVX512_BF16 feature enabling

2019-07-21 Thread Jing Liu




On 7/19/2019 4:10 PM, Paolo Bonzini wrote:

On 19/07/19 09:20, Jing Liu wrote:

Then CPUID[7,0].EAX is set automatically to 0 or 1 depending on whether
BF16 is enabled or not.


Could I ask why don't we directly check BF16 enabling when
cpu_x86_cpuid(env, 7, 0, ...) during kvm_arch_init_vcpu ?


Because the code for setting CPUID is common for all accelerators (there
are five supported: KVM, HAX, HVF, TCG, WHPX).


What is the use of the two new properties? Are they used for users
setting parameters when boot up guest, and why we need users setting
func7 level?


For example to test guests with CPUID[7,0].EAX==1, even if the host does
not have BF16.


Thanks. :)




@@ -5075,6 +5101,10 @@ static void x86_cpu_expand_features(X86CPU *cpu,
Error **errp)
  x86_cpu_adjust_feat_level(cpu, FEAT_SVM);
  x86_cpu_adjust_feat_level(cpu, FEAT_XSAVE);

+   if ((env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_AVX512_BF16) &&
+    kvm_enabled()) {


No need to check KVM.  You could also do just
x86_cpu_adjust_feat_level(cpu, FEAT_7_1_EAX) and set
cpu->min_level_func7 in x86_cpu_adjust_feat_level with something like

 if (eax == 7) {
 x86_cpu_adjust_level(cpu, >cpu_min_level_func7,
  fi->cpuid.ecx);
 }



Got it. One question I'm wondering is, is it possible for users setting
an invalid property like level-func7=2? Do we need some protection?


@@ -5098,6 +5128,9 @@ static void x86_cpu_expand_features(X86CPU *cpu,
Error **errp)
  }

  /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly
set */
+    if (env->cpuid_level_func7 == UINT32_MAX) {
+    env->cpuid_level_func7 = env->cpuid_min_level_func7;
+    }


Looks good.


  if (env->cpuid_level == UINT32_MAX) {
  env->cpuid_level = env->cpuid_min_level;
  }
@@ -5869,9 +5902,11 @@ static Property x86_cpu_properties[] = {
  DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false),
  DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU,
host_phys_bits_limit, 0),
  DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),
+    DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7,
UINT32_MAX),


Looks good.


  DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX),
  DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, UINT32_MAX),
  DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, UINT32_MAX),
+    DEFINE_PROP_UINT32("min-level-func7", X86CPU,
env.cpuid_min_level_func7, 0),


No need for this property, just like there is no min-level property.


Would remove it.

Thanks,
Jing





Re: [Qemu-devel] [PATCH v1] x86: Intel AVX512_BF16 feature enabling

2019-07-19 Thread Jing Liu




On 7/18/2019 4:15 PM, Paolo Bonzini wrote:

On 18/07/19 06:55, Jing Liu wrote:


+    *eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x7,
+    count, R_EAX);

This needs to be firstly checked as follows, otherwise some
architectures would fail to compile.

What about hvf and tcg CPUID 07 EAX value?

+    /* Maximum ECX value for sub-leaves */
+    if (kvm_enabled()) {
+    *eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x7,
+    count, R_EAX);
+    } else if (hvf_enabled()) {
+    *eax = hvf_get_supported_cpuid(0x7, count, R_EAX);
+    } else {
+    *eax = 0;
+    }



Good question.  You need to add a new property, for example
cpuid_level_func7, whose code would be modeled around cpuid_level (and a
field cpuid_min_level_func7 whose code would be modeled around
cpuid_min_level).

Then CPUID[7,0].EAX is set automatically to 0 or 1 depending on whether
BF16 is enabled or not.


Could I ask why don't we directly check BF16 enabling when
cpu_x86_cpuid(env, 7, 0, ...) during kvm_arch_init_vcpu ?

What is the use of the two new properties? Are they used for users
setting parameters when boot up guest, and why we need users setting 
func7 level?


I tried to implement the code as follows.

@@ -4293,13 +4313,19 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t 
index, uint32_t count,

 case 7:
 /* Structured Extended Feature Flags Enumeration Leaf */
 if (count == 0) {
-*eax = 0; /* Maximum ECX value for sub-leaves */
+/* Maximum ECX value for sub-leaves */
+*eax = env->cpuid_level_func7;
[...]
+} else if (count == 1) {
+*eax = env->features[FEAT_7_1_EAX];
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
[...]
@@ -5075,6 +5101,10 @@ static void x86_cpu_expand_features(X86CPU *cpu, 
Error **errp)

 x86_cpu_adjust_feat_level(cpu, FEAT_SVM);
 x86_cpu_adjust_feat_level(cpu, FEAT_XSAVE);

+   if ((env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_AVX512_BF16) &&
+kvm_enabled()) {
+x86_cpu_adjust_level(cpu, >cpuid_min_level_func7, 1);
+}
 /* Intel Processor Trace requires CPUID[0x14] */
 if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) &&
  kvm_enabled() && cpu->intel_pt_auto_level) {
@@ -5098,6 +5128,9 @@ static void x86_cpu_expand_features(X86CPU *cpu, 
Error **errp)

 }

 /* Set cpuid_*level* based on cpuid_min_*level, if not explicitly 
set */

+if (env->cpuid_level_func7 == UINT32_MAX) {
+env->cpuid_level_func7 = env->cpuid_min_level_func7;
+}
 if (env->cpuid_level == UINT32_MAX) {
 env->cpuid_level = env->cpuid_min_level;
 }
@@ -5869,9 +5902,11 @@ static Property x86_cpu_properties[] = {
 DEFINE_PROP_BOOL("host-phys-bits", X86CPU, host_phys_bits, false),
 DEFINE_PROP_UINT8("host-phys-bits-limit", X86CPU, 
host_phys_bits_limit, 0),

 DEFINE_PROP_BOOL("fill-mtrr-mask", X86CPU, fill_mtrr_mask, true),
+DEFINE_PROP_UINT32("level-func7", X86CPU, env.cpuid_level_func7, 
UINT32_MAX),

 DEFINE_PROP_UINT32("level", X86CPU, env.cpuid_level, UINT32_MAX),
 DEFINE_PROP_UINT32("xlevel", X86CPU, env.cpuid_xlevel, UINT32_MAX),
 DEFINE_PROP_UINT32("xlevel2", X86CPU, env.cpuid_xlevel2, UINT32_MAX),
+DEFINE_PROP_UINT32("min-level-func7", X86CPU, 
env.cpuid_min_level_func7, 0),

[...]

Thanks,
Jing




Paolo





Re: [Qemu-devel] [PATCH v1] x86: Intel AVX512_BF16 feature enabling

2019-07-17 Thread Jing Liu




On 7/11/2019 1:38 PM, Jing Liu wrote:

Intel CooperLake cpu adds AVX512_BF16 instruction, defining as
CPUID.(EAX=7,ECX=1):EAX[bit 05].

The release spec link as follows,
https://software.intel.com/sites/default/files/managed/c5/15/\
architecture-instruction-set-extensions-programming-reference.pdf

Signed-off-by: Jing Liu 
---
  target/i386/cpu.c | 29 -
  target/i386/cpu.h |  3 +++
  target/i386/kvm.c |  3 ++-
  3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index c1ab86d..de3daf5 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -767,6 +767,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t 
vendor1,
/* CPUID_7_0_ECX_OSPKE is dynamic */ \
CPUID_7_0_ECX_LA57)
  #define TCG_7_0_EDX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES 0
  #define TCG_APM_FEATURES 0
  #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
  #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -1092,6 +1093,25 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
  },
  .tcg_features = TCG_7_0_EDX_FEATURES,
  },
+[FEAT_7_1_EAX] = {
+.type = CPUID_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, NULL, NULL,
+NULL, "avx512-bf16", NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.cpuid = {
+.eax = 7,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EAX,
+},
+.tcg_features = TCG_7_1_EAX_FEATURES,
+},
  [FEAT_8000_0007_EDX] = {
  .type = CPUID_FEATURE_WORD,
  .feat_names = {
@@ -4344,13 +4364,20 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
  case 7:
  /* Structured Extended Feature Flags Enumeration Leaf */
  if (count == 0) {
-*eax = 0; /* Maximum ECX value for sub-leaves */
+/* Maximum ECX value for sub-leaves */
+*eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x7,
+count, R_EAX);
This needs to be firstly checked as follows, otherwise some 
architectures would fail to compile.


What about hvf and tcg CPUID 07 EAX value?

+/* Maximum ECX value for sub-leaves */
+if (kvm_enabled()) {
+*eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x7,
+count, R_EAX);
+} else if (hvf_enabled()) {
+*eax = hvf_get_supported_cpuid(0x7, count, R_EAX);
+} else {
+*eax = 0;
+}


Thanks,
Jing


  *ebx = env->features[FEAT_7_0_EBX]; /* Feature flags */
  *ecx = env->features[FEAT_7_0_ECX]; /* Feature flags */
  if ((*ecx & CPUID_7_0_ECX_PKU) && env->cr[4] & CR4_PKE_MASK) {
  *ecx |= CPUID_7_0_ECX_OSPKE;
  }
  *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */
+} else if (count == 1) {
+*eax = env->features[FEAT_7_1_EAX];
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
  } else {
  *eax = 0;
  *ebx = 0;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index bd06523..40594a1 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -488,6 +488,7 @@ typedef enum FeatureWord {
  FEAT_7_0_EBX,   /* CPUID[EAX=7,ECX=0].EBX */
  FEAT_7_0_ECX,   /* CPUID[EAX=7,ECX=0].ECX */
  FEAT_7_0_EDX,   /* CPUID[EAX=7,ECX=0].EDX */
+FEAT_7_1_EAX,   /* CPUID[EAX=7,ECX=1].EAX */
  FEAT_8000_0001_EDX, /* CPUID[8000_0001].EDX */
  FEAT_8000_0001_ECX, /* CPUID[8000_0001].ECX */
  FEAT_8000_0007_EDX, /* CPUID[8000_0007].EDX */
@@ -699,6 +700,8 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
  #define CPUID_7_0_EDX_ARCH_CAPABILITIES (1U << 29)  /*Arch Capabilities*/
  #define CPUID_7_0_EDX_SPEC_CTRL_SSBD  (1U << 31) /* Speculative Store Bypass 
Disable */
  
+#define CPUID_7_1_EAX_AVX512_BF16 (1U << 5) /* AVX512 BFloat16 Instruction */

+
  #define CPUID_8000_0008_EBX_WBNOINVD  (1U << 9)  /* Write back and
   
do not invalidate cache */
  #define CPUID_8000_0008_EBX_IBPB(1U << 12) /* Indirect Branch Prediction 
Barrier */
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 3b29ce5..977aaa5 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1110,6 +1110,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
  c = _data.entries[cpuid_i++];
  }
  break;
+case 0x7:
  case 0x14: {
  uint32_t times;
  
@@ -112

[Qemu-devel] [PATCH v1] x86: Intel AVX512_BF16 feature enabling

2019-07-10 Thread Jing Liu
Intel CooperLake cpu adds AVX512_BF16 instruction, defining as
CPUID.(EAX=7,ECX=1):EAX[bit 05].

The release spec link as follows,
https://software.intel.com/sites/default/files/managed/c5/15/\
architecture-instruction-set-extensions-programming-reference.pdf

Signed-off-by: Jing Liu 
---
 target/i386/cpu.c | 29 -
 target/i386/cpu.h |  3 +++
 target/i386/kvm.c |  3 ++-
 3 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index c1ab86d..de3daf5 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -767,6 +767,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t 
vendor1,
   /* CPUID_7_0_ECX_OSPKE is dynamic */ \
   CPUID_7_0_ECX_LA57)
 #define TCG_7_0_EDX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES 0
 #define TCG_APM_FEATURES 0
 #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
 #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -1092,6 +1093,25 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
 },
 .tcg_features = TCG_7_0_EDX_FEATURES,
 },
+[FEAT_7_1_EAX] = {
+.type = CPUID_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, NULL, NULL,
+NULL, "avx512-bf16", NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.cpuid = {
+.eax = 7,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EAX,
+},
+.tcg_features = TCG_7_1_EAX_FEATURES,
+},
 [FEAT_8000_0007_EDX] = {
 .type = CPUID_FEATURE_WORD,
 .feat_names = {
@@ -4344,13 +4364,20 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 case 7:
 /* Structured Extended Feature Flags Enumeration Leaf */
 if (count == 0) {
-*eax = 0; /* Maximum ECX value for sub-leaves */
+/* Maximum ECX value for sub-leaves */
+*eax = kvm_arch_get_supported_cpuid(cs->kvm_state, 0x7,
+count, R_EAX);
 *ebx = env->features[FEAT_7_0_EBX]; /* Feature flags */
 *ecx = env->features[FEAT_7_0_ECX]; /* Feature flags */
 if ((*ecx & CPUID_7_0_ECX_PKU) && env->cr[4] & CR4_PKE_MASK) {
 *ecx |= CPUID_7_0_ECX_OSPKE;
 }
 *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */
+} else if (count == 1) {
+*eax = env->features[FEAT_7_1_EAX];
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
 } else {
 *eax = 0;
 *ebx = 0;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index bd06523..40594a1 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -488,6 +488,7 @@ typedef enum FeatureWord {
 FEAT_7_0_EBX,   /* CPUID[EAX=7,ECX=0].EBX */
 FEAT_7_0_ECX,   /* CPUID[EAX=7,ECX=0].ECX */
 FEAT_7_0_EDX,   /* CPUID[EAX=7,ECX=0].EDX */
+FEAT_7_1_EAX,   /* CPUID[EAX=7,ECX=1].EAX */
 FEAT_8000_0001_EDX, /* CPUID[8000_0001].EDX */
 FEAT_8000_0001_ECX, /* CPUID[8000_0001].ECX */
 FEAT_8000_0007_EDX, /* CPUID[8000_0007].EDX */
@@ -699,6 +700,8 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EDX_ARCH_CAPABILITIES (1U << 29)  /*Arch Capabilities*/
 #define CPUID_7_0_EDX_SPEC_CTRL_SSBD  (1U << 31) /* Speculative Store Bypass 
Disable */
 
+#define CPUID_7_1_EAX_AVX512_BF16 (1U << 5) /* AVX512 BFloat16 Instruction */
+
 #define CPUID_8000_0008_EBX_WBNOINVD  (1U << 9)  /* Write back and
  
do not invalidate cache */
 #define CPUID_8000_0008_EBX_IBPB(1U << 12) /* Indirect Branch Prediction 
Barrier */
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 3b29ce5..977aaa5 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1110,6 +1110,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 c = _data.entries[cpuid_i++];
 }
 break;
+case 0x7:
 case 0x14: {
 uint32_t times;
 
@@ -1122,7 +1123,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 for (j = 1; j <= times; ++j) {
 if (cpuid_i == KVM_MAX_CPUID_ENTRIES) {
 fprintf(stderr, "cpuid_data is full, no space for "
-"cpuid(eax:0x14,ecx:0x%x)\n", j);
+"cpuid(eax:0x%x,ecx:0x%x)\n", i, j);
 abort();
 }
 c = _data.entries[cpuid_i++];
-- 
1.8.3.1




[Qemu-devel] [PATCH RFC] x86: BFloat16 feature enabling on Cooper Lake

2019-06-20 Thread Jing Liu
Intel CooperLake cpu adds AVX512_BF16 instruction, defining as
CPUID.(EAX=7,EXC=1):EAX[bit 05].

The release spec link as follows,
https://software.intel.com/sites/default/files/managed/c5/15/\
architecture-instruction-set-extensions-programming-reference.pdf

Signed-off-by: Jing Liu 
---
 target/i386/cpu.c | 46 +++---
 target/i386/cpu.h |  3 +++
 target/i386/kvm.c |  3 ++-
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index c1ab86d..249f7f9 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -664,6 +664,9 @@ static CPUCacheInfo legacy_l3_cache = {
 #define L2_ITLB_4K_ASSOC   4
 #define L2_ITLB_4K_ENTRIES   512
 
+/* CPUID Leaf 0x07 constants: */
+#define INTEL_LEAF_7_MAX_SUBLEAF 0x1
+
 /* CPUID Leaf 0x14 constants: */
 #define INTEL_PT_MAX_SUBLEAF 0x1
 /*
@@ -767,6 +770,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t 
vendor1,
   /* CPUID_7_0_ECX_OSPKE is dynamic */ \
   CPUID_7_0_ECX_LA57)
 #define TCG_7_0_EDX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES 0
 #define TCG_APM_FEATURES 0
 #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
 #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -1092,6 +1096,25 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] 
= {
 },
 .tcg_features = TCG_7_0_EDX_FEATURES,
 },
+[FEAT_7_1_EAX] = {
+.type = CPUID_FEATURE_WORD,
+.feat_names = {
+NULL, NULL, NULL, NULL,
+NULL, "avx512-bf16", NULL, NULL,
+   NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+NULL, NULL, NULL, NULL,
+},
+.cpuid = {
+.eax = 7,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EAX,
+},
+.tcg_features = TCG_7_1_EAX_FEATURES,
+},
 [FEAT_8000_0007_EDX] = {
 .type = CPUID_FEATURE_WORD,
 .feat_names = {
@@ -4343,19 +4366,28 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 break;
 case 7:
 /* Structured Extended Feature Flags Enumeration Leaf */
-if (count == 0) {
-*eax = 0; /* Maximum ECX value for sub-leaves */
+*eax = 0;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+
+switch (count) {
+case 0:
+if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_AVX512_BF16) {
+*eax = INTEL_LEAF_7_MAX_SUBLEAF; /* Maximum ECX value for 
sub-leaves */
+}
 *ebx = env->features[FEAT_7_0_EBX]; /* Feature flags */
 *ecx = env->features[FEAT_7_0_ECX]; /* Feature flags */
 if ((*ecx & CPUID_7_0_ECX_PKU) && env->cr[4] & CR4_PKE_MASK) {
 *ecx |= CPUID_7_0_ECX_OSPKE;
 }
 *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */
-} else {
-*eax = 0;
-*ebx = 0;
-*ecx = 0;
-*edx = 0;
+break;
+case 1:
+*eax = env->features[FEAT_7_1_EAX];
+break;
+default:
+break;
 }
 break;
 case 9:
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index bd06523..40594a1 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -488,6 +488,7 @@ typedef enum FeatureWord {
 FEAT_7_0_EBX,   /* CPUID[EAX=7,ECX=0].EBX */
 FEAT_7_0_ECX,   /* CPUID[EAX=7,ECX=0].ECX */
 FEAT_7_0_EDX,   /* CPUID[EAX=7,ECX=0].EDX */
+FEAT_7_1_EAX,   /* CPUID[EAX=7,ECX=1].EAX */
 FEAT_8000_0001_EDX, /* CPUID[8000_0001].EDX */
 FEAT_8000_0001_ECX, /* CPUID[8000_0001].ECX */
 FEAT_8000_0007_EDX, /* CPUID[8000_0007].EDX */
@@ -699,6 +700,8 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EDX_ARCH_CAPABILITIES (1U << 29)  /*Arch Capabilities*/
 #define CPUID_7_0_EDX_SPEC_CTRL_SSBD  (1U << 31) /* Speculative Store Bypass 
Disable */
 
+#define CPUID_7_1_EAX_AVX512_BF16 (1U << 5) /* AVX512 BFloat16 Instruction */
+
 #define CPUID_8000_0008_EBX_WBNOINVD  (1U << 9)  /* Write back and
  
do not invalidate cache */
 #define CPUID_8000_0008_EBX_IBPB(1U << 12) /* Indirect Branch Prediction 
Barrier */
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 3b29ce5..977aaa5 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1110,6 +1110,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 c = _data.entries[cpuid_i++];
 }
 break;
+case 0x7:
 case 0x14: {
 uint32_t times;
 
@@ -1122,7 +1123,7 @@ int kvm_arch_init_vcpu(CPUState *cs)
 for (j = 1; j <= times; ++j) {
 if (cpuid_i == 

[Qemu-devel] [PATCH v3 2/2] hw/pci: add PCI resource reserve capability to legacy PCI bridge

2018-08-20 Thread Jing Liu
Add hint to firmware (e.g. SeaBIOS) to reserve addtional
BUS/IO/MEM/PREF resource for legacy pci-pci bridge. Add the
resource reserve capability deleting in pci_bridge_dev_exitfn.

Signed-off-by: Jing Liu 
---
 hw/pci-bridge/pci_bridge_dev.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/hw/pci-bridge/pci_bridge_dev.c b/hw/pci-bridge/pci_bridge_dev.c
index b2d861d..97a8e8b 100644
--- a/hw/pci-bridge/pci_bridge_dev.c
+++ b/hw/pci-bridge/pci_bridge_dev.c
@@ -46,6 +46,9 @@ struct PCIBridgeDev {
 uint32_t flags;
 
 OnOffAuto msi;
+
+/* additional resources to reserve */
+PCIResReserve res_reserve;
 };
 typedef struct PCIBridgeDev PCIBridgeDev;
 
@@ -95,6 +98,12 @@ static void pci_bridge_dev_realize(PCIDevice *dev, Error 
**errp)
 error_free(local_err);
 }
 
+err = pci_bridge_qemu_reserve_cap_init(dev, 0,
+ bridge_dev->res_reserve, errp);
+if (err) {
+goto cap_error;
+}
+
 if (shpc_present(dev)) {
 /* TODO: spec recommends using 64 bit prefetcheable BAR.
  * Check whether that works well. */
@@ -103,6 +112,8 @@ static void pci_bridge_dev_realize(PCIDevice *dev, Error 
**errp)
 }
 return;
 
+cap_error:
+msi_uninit(dev);
 msi_error:
 slotid_cap_cleanup(dev);
 slotid_error:
@@ -116,6 +127,8 @@ shpc_error:
 static void pci_bridge_dev_exitfn(PCIDevice *dev)
 {
 PCIBridgeDev *bridge_dev = PCI_BRIDGE_DEV(dev);
+
+pci_del_capability(dev, PCI_CAP_ID_VNDR, sizeof(PCIBridgeQemuCap));
 if (msi_present(dev)) {
 msi_uninit(dev);
 }
@@ -162,6 +175,17 @@ static Property pci_bridge_dev_properties[] = {
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BIT(PCI_BRIDGE_DEV_PROP_SHPC, PCIBridgeDev, flags,
 PCI_BRIDGE_DEV_F_SHPC_REQ, true),
+DEFINE_PROP_UINT32("bus-reserve", PCIBridgeDev,
+   res_reserve.bus, -1),
+DEFINE_PROP_SIZE("io-reserve", PCIBridgeDev,
+ res_reserve.io, -1),
+DEFINE_PROP_SIZE("mem-reserve", PCIBridgeDev,
+ res_reserve.mem_non_pref, -1),
+DEFINE_PROP_SIZE("pref32-reserve", PCIBridgeDev,
+ res_reserve.mem_pref_32, -1),
+DEFINE_PROP_SIZE("pref64-reserve", PCIBridgeDev,
+ res_reserve.mem_pref_64, -1),
+
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
1.8.3.1




[Qemu-devel] [PATCH v3 0/2] hw/pci: PCI resource reserve capability

2018-08-20 Thread Jing Liu
This patch serial is about PCI resource reserve capability.

First patch refactors the resource reserve fields in GenPCIERoorPort
structure out to another new structure, called "PCIResReserve". Modify
the parameter list of pci_bridge_qemu_reserve_cap_init().

Second patch enables the resource reserve capability for legacy PCI bridge
so that firmware can reserve additional resources for this bridge.

Change Logs:
v3 -> v2
* remove the teardown patch because only need pci_del_capability
* keep the names to be consistent with firmware counterpart
* some minor fixes

v2 -> v1
* add refactoring patch
* add teardown function
* some other fixes

Jing Liu (2):
  hw/pci: factor PCI reserve resources to a separate structure
  hw/pci: add PCI resource reserve capability to legacy PCI bridge

 hw/pci-bridge/gen_pcie_root_port.c | 33 +
 hw/pci-bridge/pci_bridge_dev.c | 24 
 hw/pci/pci_bridge.c| 38 +-
 include/hw/pci/pci_bridge.h| 18 +-
 4 files changed, 71 insertions(+), 42 deletions(-)

-- 
1.8.3.1




[Qemu-devel] [PATCH v3 1/2] hw/pci: factor PCI reserve resources to a separate structure

2018-08-20 Thread Jing Liu
Factor "bus_reserve", "io_reserve", "mem_reserve", "pref32_reserve"
and "pref64_reserve" fields of the "GenPCIERootPort" structure out
to "PCIResReserve" structure, so that other PCI bridges can
reuse it to add resource reserve capability.

Signed-off-by: Jing Liu 
---
 hw/pci-bridge/gen_pcie_root_port.c | 33 +
 hw/pci/pci_bridge.c| 38 +-
 include/hw/pci/pci_bridge.h| 18 +-
 3 files changed, 47 insertions(+), 42 deletions(-)

diff --git a/hw/pci-bridge/gen_pcie_root_port.c 
b/hw/pci-bridge/gen_pcie_root_port.c
index d117e20..299de42 100644
--- a/hw/pci-bridge/gen_pcie_root_port.c
+++ b/hw/pci-bridge/gen_pcie_root_port.c
@@ -29,12 +29,8 @@ typedef struct GenPCIERootPort {
 
 bool migrate_msix;
 
-/* additional resources to reserve on firmware init */
-uint32_t bus_reserve;
-uint64_t io_reserve;
-uint64_t mem_reserve;
-uint64_t pref32_reserve;
-uint64_t pref64_reserve;
+/* additional resources to reserve */
+PCIResReserve res_reserve;
 } GenPCIERootPort;
 
 static uint8_t gen_rp_aer_vector(const PCIDevice *d)
@@ -82,16 +78,15 @@ static void gen_rp_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-int rc = pci_bridge_qemu_reserve_cap_init(d, 0, grp->bus_reserve,
-grp->io_reserve, grp->mem_reserve, grp->pref32_reserve,
-grp->pref64_reserve, errp);
+int rc = pci_bridge_qemu_reserve_cap_init(d, 0,
+  grp->res_reserve, errp);
 
 if (rc < 0) {
 rpc->parent_class.exit(d);
 return;
 }
 
-if (!grp->io_reserve) {
+if (!grp->res_reserve.io) {
 pci_word_test_and_clear_mask(d->wmask + PCI_COMMAND,
  PCI_COMMAND_IO);
 d->wmask[PCI_IO_BASE] = 0;
@@ -117,12 +112,18 @@ static const VMStateDescription vmstate_rp_dev = {
 };
 
 static Property gen_rp_props[] = {
-DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort, migrate_msix, true),
-DEFINE_PROP_UINT32("bus-reserve", GenPCIERootPort, bus_reserve, -1),
-DEFINE_PROP_SIZE("io-reserve", GenPCIERootPort, io_reserve, -1),
-DEFINE_PROP_SIZE("mem-reserve", GenPCIERootPort, mem_reserve, -1),
-DEFINE_PROP_SIZE("pref32-reserve", GenPCIERootPort, pref32_reserve, -1),
-DEFINE_PROP_SIZE("pref64-reserve", GenPCIERootPort, pref64_reserve, -1),
+DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort,
+ migrate_msix, true),
+DEFINE_PROP_UINT32("bus-reserve", GenPCIERootPort,
+   res_reserve.bus, -1),
+DEFINE_PROP_SIZE("io-reserve", GenPCIERootPort,
+ res_reserve.io, -1),
+DEFINE_PROP_SIZE("mem-reserve", GenPCIERootPort,
+ res_reserve.mem_non_pref, -1),
+DEFINE_PROP_SIZE("pref32-reserve", GenPCIERootPort,
+ res_reserve.mem_pref_32, -1),
+DEFINE_PROP_SIZE("pref64-reserve", GenPCIERootPort,
+ res_reserve.mem_pref_64, -1),
 DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index 40a39f5..08b7e44 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -411,38 +411,34 @@ void pci_bridge_map_irq(PCIBridge *br, const char* 
bus_name,
 
 
 int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int cap_offset,
- uint32_t bus_reserve, uint64_t io_reserve,
- uint64_t mem_non_pref_reserve,
- uint64_t mem_pref_32_reserve,
- uint64_t mem_pref_64_reserve,
- Error **errp)
+ PCIResReserve res_reserve, Error **errp)
 {
-if (mem_pref_32_reserve != (uint64_t)-1 &&
-mem_pref_64_reserve != (uint64_t)-1) {
+if (res_reserve.mem_pref_32 != (uint64_t)-1 &&
+res_reserve.mem_pref_64 != (uint64_t)-1) {
 error_setg(errp,
"PCI resource reserve cap: PREF32 and PREF64 conflict");
 return -EINVAL;
 }
 
-if (mem_non_pref_reserve != (uint64_t)-1 &&
-mem_non_pref_reserve >= (1ULL << 32)) {
+if (res_reserve.mem_non_pref != (uint64_t)-1 &&
+res_reserve.mem_non_pref >= (1ULL << 32)) {
 error_setg(errp,
"PCI resource reserve cap: mem-reserve must be less than 
4G");
 return -EINVAL;
 }
 
-if (mem_pref_32_reserve != (uint64_t)-1 &&
-mem_pref_32_reserve >= (1ULL << 32)) {
+if (res_reserve.mem_pref_32 != (uint64_t)-1 &&
+  

[Qemu-devel] [PATCH v2 3/3] hw/pci: add PCI resource reserve capability to legacy PCI bridge

2018-08-16 Thread Jing Liu
Add hint to firmware (e.g. SeaBIOS) to reserve addtional
BUS/IO/MEM/PREF resource for legacy pci-pci bridge.

Signed-off-by: Jing Liu 
---
 hw/pci-bridge/pci_bridge_dev.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/hw/pci-bridge/pci_bridge_dev.c b/hw/pci-bridge/pci_bridge_dev.c
index b2d861d..874e618 100644
--- a/hw/pci-bridge/pci_bridge_dev.c
+++ b/hw/pci-bridge/pci_bridge_dev.c
@@ -46,6 +46,8 @@ struct PCIBridgeDev {
 uint32_t flags;
 
 OnOffAuto msi;
+
+PCIResReserve res_reserve;
 };
 typedef struct PCIBridgeDev PCIBridgeDev;
 
@@ -95,6 +97,12 @@ static void pci_bridge_dev_realize(PCIDevice *dev, Error 
**errp)
 error_free(local_err);
 }
 
+err = pci_bridge_qemu_reserve_cap_init(dev, 0,
+ bridge_dev->res_reserve, errp);
+if (err) {
+goto cap_error;
+}
+
 if (shpc_present(dev)) {
 /* TODO: spec recommends using 64 bit prefetcheable BAR.
  * Check whether that works well. */
@@ -103,6 +111,10 @@ static void pci_bridge_dev_realize(PCIDevice *dev, Error 
**errp)
 }
 return;
 
+cap_error:
+if (msi_present(dev)) {
+msi_uninit(dev);
+}
 msi_error:
 slotid_cap_cleanup(dev);
 slotid_error:
@@ -116,6 +128,8 @@ shpc_error:
 static void pci_bridge_dev_exitfn(PCIDevice *dev)
 {
 PCIBridgeDev *bridge_dev = PCI_BRIDGE_DEV(dev);
+
+pci_bridge_qemu_reserve_cap_uninit(dev);
 if (msi_present(dev)) {
 msi_uninit(dev);
 }
@@ -162,6 +176,17 @@ static Property pci_bridge_dev_properties[] = {
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BIT(PCI_BRIDGE_DEV_PROP_SHPC, PCIBridgeDev, flags,
 PCI_BRIDGE_DEV_F_SHPC_REQ, true),
+DEFINE_PROP_UINT32("bus-reserve", PCIBridgeDev,
+   res_reserve.bus_reserve, -1),
+DEFINE_PROP_SIZE("io-reserve", PCIBridgeDev,
+ res_reserve.io_reserve, -1),
+DEFINE_PROP_SIZE("mem-reserve", PCIBridgeDev,
+ res_reserve.mem_reserve, -1),
+DEFINE_PROP_SIZE("pref32-reserve", PCIBridgeDev,
+ res_reserve.pref32_reserve, -1),
+DEFINE_PROP_SIZE("pref64-reserve", PCIBridgeDev,
+ res_reserve.pref64_reserve, -1),
+
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
1.8.3.1




[Qemu-devel] [PATCH v2 1/3] hw/pci: factor PCI reserve resources to a separate structure

2018-08-16 Thread Jing Liu
Factor "bus_reserve", "io_reserve", "mem_reserve", "pref32_reserve"
and "pref64_reserve" fields of the "GenPCIERootPort" structure out
to "PCIResReserve" structure, so that other PCI bridges can
reuse it to add resource reserve capability.

Signed-off-by: Jing Liu 
---
 hw/pci-bridge/gen_pcie_root_port.c | 32 
 hw/pci/pci_bridge.c| 38 +-
 include/hw/pci/pci_bridge.h| 17 -
 3 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/hw/pci-bridge/gen_pcie_root_port.c 
b/hw/pci-bridge/gen_pcie_root_port.c
index d117e20..babce3c 100644
--- a/hw/pci-bridge/gen_pcie_root_port.c
+++ b/hw/pci-bridge/gen_pcie_root_port.c
@@ -29,12 +29,7 @@ typedef struct GenPCIERootPort {
 
 bool migrate_msix;
 
-/* additional resources to reserve on firmware init */
-uint32_t bus_reserve;
-uint64_t io_reserve;
-uint64_t mem_reserve;
-uint64_t pref32_reserve;
-uint64_t pref64_reserve;
+PCIResReserve res_reserve;
 } GenPCIERootPort;
 
 static uint8_t gen_rp_aer_vector(const PCIDevice *d)
@@ -82,16 +77,15 @@ static void gen_rp_realize(DeviceState *dev, Error **errp)
 return;
 }
 
-int rc = pci_bridge_qemu_reserve_cap_init(d, 0, grp->bus_reserve,
-grp->io_reserve, grp->mem_reserve, grp->pref32_reserve,
-grp->pref64_reserve, errp);
+int rc = pci_bridge_qemu_reserve_cap_init(d, 0,
+  grp->res_reserve, errp);
 
 if (rc < 0) {
 rpc->parent_class.exit(d);
 return;
 }
 
-if (!grp->io_reserve) {
+if (!grp->res_reserve.io_reserve) {
 pci_word_test_and_clear_mask(d->wmask + PCI_COMMAND,
  PCI_COMMAND_IO);
 d->wmask[PCI_IO_BASE] = 0;
@@ -117,12 +111,18 @@ static const VMStateDescription vmstate_rp_dev = {
 };
 
 static Property gen_rp_props[] = {
-DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort, migrate_msix, true),
-DEFINE_PROP_UINT32("bus-reserve", GenPCIERootPort, bus_reserve, -1),
-DEFINE_PROP_SIZE("io-reserve", GenPCIERootPort, io_reserve, -1),
-DEFINE_PROP_SIZE("mem-reserve", GenPCIERootPort, mem_reserve, -1),
-DEFINE_PROP_SIZE("pref32-reserve", GenPCIERootPort, pref32_reserve, -1),
-DEFINE_PROP_SIZE("pref64-reserve", GenPCIERootPort, pref64_reserve, -1),
+DEFINE_PROP_BOOL("x-migrate-msix", GenPCIERootPort,
+ migrate_msix, true),
+DEFINE_PROP_UINT32("bus-reserve", GenPCIERootPort,
+   res_reserve.bus_reserve, -1),
+DEFINE_PROP_SIZE("io-reserve", GenPCIERootPort,
+ res_reserve.io_reserve, -1),
+DEFINE_PROP_SIZE("mem-reserve", GenPCIERootPort,
+ res_reserve.mem_reserve, -1),
+DEFINE_PROP_SIZE("pref32-reserve", GenPCIERootPort,
+ res_reserve.pref32_reserve, -1),
+DEFINE_PROP_SIZE("pref64-reserve", GenPCIERootPort,
+ res_reserve.pref64_reserve, -1),
 DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index 40a39f5..15b055e 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -411,38 +411,34 @@ void pci_bridge_map_irq(PCIBridge *br, const char* 
bus_name,
 
 
 int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int cap_offset,
- uint32_t bus_reserve, uint64_t io_reserve,
- uint64_t mem_non_pref_reserve,
- uint64_t mem_pref_32_reserve,
- uint64_t mem_pref_64_reserve,
- Error **errp)
+ PCIResReserve res, Error **errp)
 {
-if (mem_pref_32_reserve != (uint64_t)-1 &&
-mem_pref_64_reserve != (uint64_t)-1) {
+if (res.pref32_reserve != (uint64_t)-1 &&
+res.pref64_reserve != (uint64_t)-1) {
 error_setg(errp,
"PCI resource reserve cap: PREF32 and PREF64 conflict");
 return -EINVAL;
 }
 
-if (mem_non_pref_reserve != (uint64_t)-1 &&
-mem_non_pref_reserve >= (1ULL << 32)) {
+if (res.mem_reserve != (uint64_t)-1 &&
+res.mem_reserve >= (1ULL << 32)) {
 error_setg(errp,
"PCI resource reserve cap: mem-reserve must be less than 
4G");
 return -EINVAL;
 }
 
-if (mem_pref_32_reserve != (uint64_t)-1 &&
-mem_pref_32_reserve >= (1ULL << 32)) {
+if (res.pref32_reserve != (uint64_t)-1 &&
+res.pref32_reserve >= (1ULL << 32)) {
 error_setg(er

[Qemu-devel] [PATCH v2 0/3] hw/pci: PCI resource reserve capability

2018-08-16 Thread Jing Liu
This patch serial is about PCI resource reserve capability.

First patch refactors the resource reserve fields in GenPCIERoorPort
structure out to another new structure, called "PCIResReserve". Modify
the parameter list of pci_bridge_qemu_reserve_cap_init().

Then we add the teardown function called pci_bridge_qemu_reserve_cap_uninit().

Last we enable the resource reserve capability for legacy PCI bridge
so that firmware can reserve additional resources for the bridge.

Change Log:
v2 -> v1
* add refactoring patch
* add teardown function
* some other fixes

Jing Liu (3):
  hw/pci: factor PCI reserve resources to a separate structure
  hw/pci: add teardown function for PCI resource reserve capability
  hw/pci: add PCI resource reserve capability to legacy PCI bridge

 hw/pci-bridge/gen_pcie_root_port.c | 32 +-
 hw/pci-bridge/pci_bridge_dev.c | 25 
 hw/pci/pci_bridge.c| 47 +-
 include/hw/pci/pci_bridge.h| 18 +++
 4 files changed, 80 insertions(+), 42 deletions(-)

-- 
1.8.3.1




[Qemu-devel] [PATCH v2 2/3] hw/pci: add teardown function for PCI resource reserve capability

2018-08-16 Thread Jing Liu
Clean up the PCI config space of resource reserve capability.

Signed-off-by: Jing Liu 
---
 hw/pci/pci_bridge.c | 9 +
 include/hw/pci/pci_bridge.h | 1 +
 2 files changed, 10 insertions(+)

diff --git a/hw/pci/pci_bridge.c b/hw/pci/pci_bridge.c
index 15b055e..dbcee90 100644
--- a/hw/pci/pci_bridge.c
+++ b/hw/pci/pci_bridge.c
@@ -465,6 +465,15 @@ int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int 
cap_offset,
 return 0;
 }
 
+void pci_bridge_qemu_reserve_cap_uninit(PCIDevice *dev)
+{
+uint8_t pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+
+pci_del_capability(dev, PCI_CAP_ID_VNDR, sizeof(PCIBridgeQemuCap));
+memset(dev->config + pos + PCI_CAP_FLAGS, 0,
+   sizeof(PCIBridgeQemuCap) - PCI_CAP_FLAGS);
+}
+
 static const TypeInfo pci_bridge_type_info = {
 .name = TYPE_PCI_BRIDGE,
 .parent = TYPE_PCI_DEVICE,
diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h
index 6186a32..b1e25ad 100644
--- a/include/hw/pci/pci_bridge.h
+++ b/include/hw/pci/pci_bridge.h
@@ -147,4 +147,5 @@ typedef struct PCIResReserve {
 int pci_bridge_qemu_reserve_cap_init(PCIDevice *dev, int cap_offset,
PCIResReserve res_reserve, Error **errp);
 
+void pci_bridge_qemu_reserve_cap_uninit(PCIDevice *dev);
 #endif /* QEMU_PCI_BRIDGE_H */
-- 
1.8.3.1




[Qemu-devel] [PATCH] hw/pci: add pci capability to pci-pci bridge

2018-08-07 Thread Jing Liu
Add hint to firmware (e.g. SeaBIOS) to reserve addtional
IO/MEM/PREF spaces for legacy pci-pci bridge, to enable
some pci devices hotplugging whose IO/MEM/PREF spaces
requests are larger than the ones in pci-pci bridge set
by firmware.

Signed-off-by: Jing Liu 
---
 hw/pci-bridge/pci_bridge_dev.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/hw/pci-bridge/pci_bridge_dev.c b/hw/pci-bridge/pci_bridge_dev.c
index b2d861d..8e9afbd 100644
--- a/hw/pci-bridge/pci_bridge_dev.c
+++ b/hw/pci-bridge/pci_bridge_dev.c
@@ -46,6 +46,12 @@ struct PCIBridgeDev {
 uint32_t flags;
 
 OnOffAuto msi;
+
+/* additional resources to reserve on firmware init */
+uint64_t io_reserve;
+uint64_t mem_reserve;
+uint64_t pref32_reserve;
+uint64_t pref64_reserve;
 };
 typedef struct PCIBridgeDev PCIBridgeDev;
 
@@ -95,6 +101,13 @@ static void pci_bridge_dev_realize(PCIDevice *dev, Error 
**errp)
 error_free(local_err);
 }
 
+err = pci_bridge_qemu_reserve_cap_init(dev, 0, 0,
+  bridge_dev->io_reserve, bridge_dev->mem_reserve,
+  bridge_dev->pref32_reserve, bridge_dev->pref64_reserve, errp);
+if (err) {
+goto cap_error;
+}
+
 if (shpc_present(dev)) {
 /* TODO: spec recommends using 64 bit prefetcheable BAR.
  * Check whether that works well. */
@@ -103,6 +116,8 @@ static void pci_bridge_dev_realize(PCIDevice *dev, Error 
**errp)
 }
 return;
 
+cap_error:
+msi_uninit(dev);
 msi_error:
 slotid_cap_cleanup(dev);
 slotid_error:
@@ -162,6 +177,11 @@ static Property pci_bridge_dev_properties[] = {
 ON_OFF_AUTO_AUTO),
 DEFINE_PROP_BIT(PCI_BRIDGE_DEV_PROP_SHPC, PCIBridgeDev, flags,
 PCI_BRIDGE_DEV_F_SHPC_REQ, true),
+DEFINE_PROP_SIZE("io-reserve", PCIBridgeDev, io_reserve, -1),
+DEFINE_PROP_SIZE("mem-reserve", PCIBridgeDev, mem_reserve, -1),
+DEFINE_PROP_SIZE("pref32-reserve", PCIBridgeDev, pref32_reserve, -1),
+DEFINE_PROP_SIZE("pref64-reserve", PCIBridgeDev, pref64_reserve, -1),
+
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
1.8.3.1




Re: [Qemu-devel] [Qemu-block] [PATCH 3/3] qemu-iotests: require CONFIG_LINUX_AIO for test 087

2017-07-26 Thread Jing Liu



On 2017/7/25 下午11:48, Daniel P. Berrange wrote:

On Tue, Jul 25, 2017 at 04:45:46PM +0100, Stefan Hajnoczi wrote:

On Mon, Jul 24, 2017 at 02:44:13PM +0800, Jing Liu wrote:

On 2017/7/21 上午11:47, Cleber Rosa wrote:

One of the "sub-"tests of test 087 requires CONFIG_LINUX_AIO.

As a PoC/RFC, this goes the easy route and skips the test as a whole
when that feature is missing.  Other approaches include splitting
the test and adding extra filtering.

Signed-off-by: Cleber Rosa <cr...@redhat.com>
---
   tests/qemu-iotests/087 | 1 +
   1 file changed, 1 insertion(+)

diff --git a/tests/qemu-iotests/087 b/tests/qemu-iotests/087
index f8e4903..a2fb7de 100755
--- a/tests/qemu-iotests/087
+++ b/tests/qemu-iotests/087
@@ -34,6 +34,7 @@ status=1  # failure is the default!
   _supported_fmt qcow2
   _supported_proto file
   _supported_os Linux
+_require_feature CONFIG_LINUX_AIO

I tested that CONFIG_NETTLE_KDF is also a necessary for 087.

+_require_feature CONFIG_NETTLE_KDF

Are you sure?  Looks like either nettle or gcrypt is needed:

Correct, it works with either.

Ah, because I just found out nettle which related to KDF.
why can not find out gcrypt.h in qemu?
How to compile config_gcrypt_kdf into qemu?



crypto/Makefile.objs:crypto-obj-$(CONFIG_NETTLE_KDF) += pbkdf-nettle.o
crypto/Makefile.objs:crypto-obj-$(if 
$(CONFIG_NETTLE_KDF),n,$(CONFIG_GCRYPT_KDF)) += pbkdf-gcrypt.o

But this shows why the compile-time testing of features is ugly:

1. It duplicates build dependency logic into the test cases.

2. The test cases don't care about nettle vs gcrypt and they shouldn't
have to know about it.  They just care whether LUKS is available or
not.

Yeah that knowledge is messy and fragile wrt future changes.


Regards,
Daniel





Re: [Qemu-devel] [PATCH 3/3] qemu-iotests: require CONFIG_LINUX_AIO for test 087

2017-07-24 Thread Jing Liu

Hi Cleber,


On 2017/7/21 上午11:47, Cleber Rosa wrote:

One of the "sub-"tests of test 087 requires CONFIG_LINUX_AIO.

As a PoC/RFC, this goes the easy route and skips the test as a whole
when that feature is missing.  Other approaches include splitting
the test and adding extra filtering.

Signed-off-by: Cleber Rosa 
---
  tests/qemu-iotests/087 | 1 +
  1 file changed, 1 insertion(+)

diff --git a/tests/qemu-iotests/087 b/tests/qemu-iotests/087
index f8e4903..a2fb7de 100755
--- a/tests/qemu-iotests/087
+++ b/tests/qemu-iotests/087
@@ -34,6 +34,7 @@ status=1  # failure is the default!
  _supported_fmt qcow2
  _supported_proto file
  _supported_os Linux
+_require_feature CONFIG_LINUX_AIO

I tested that CONFIG_NETTLE_KDF is also a necessary for 087.

+_require_feature CONFIG_NETTLE_KDF




  function do_run_qemu()
  {





Re: [Qemu-devel] Question for iotests 188, 189 and 087

2017-07-20 Thread Jing Liu

Hi Cleber Rosa,


On 2017/7/19 上午5:22, Cleber Rosa wrote:


On 07/18/2017 02:07 PM, John Snow wrote:


On 07/17/2017 11:01 PM, Jing Liu wrote:


[...]

I see issues here:

1) The qemu-iotests "runner", that is, "./check", understands that a
file number is a test.  No further granularity is (currently?)
achievable here.

The easy solution would be to split tests which depend on optional
features to separate test files.  In the 087 test given as example,
the "=== aio=native without O_DIRECT ===" block would become, say, 190.

2) Skip tests based on features more easily.  There's already support
for skipping tests on various situations.  From 087:

...
_supported_fmt qcow2
_supported_proto file
_supported_os Linux
...

It's trivial to also have access to other compile time settings.  I did
a quick experiment that would add a "_required_feature" function to
"common.rc".

This seems a good way to solve this problem.
I failed for 087 just because there is no CONFIG_NETTLE_KDF.

I wonder how can common.rc detect if there's CONFIG_XXX in config-host.mak ?

Thanks.
Jing

  For 087 (or 190?) it would look like:

...
_supported_fmt qcow2
_supported_proto file
_supported_os Linux
_required_feature CONFIG_LINUX_AIO
...

Does it make sense?  Would a patch series along those lines would be
welcomed by reviews?

Thanks!






Re: [Qemu-devel] Question for iotests 188, 189 and 087

2017-07-19 Thread Jing Liu



On 2017/7/19 上午2:07, John Snow wrote:


On 07/17/2017 11:01 PM, Jing Liu wrote:

Hi all,

Do you anybody have iotests failure: 188, 189 and 087 of the latest qemu
upsteam?

I just wondered if it has something wrong with my test machines because
I have different results with two machines.


Thanks.

Jing



Presumably you are missing aio=native support for 087,

and 188/189 I believe require compression and/or LUKS libraries.

Hi John,

Yes, I indeed miss some crypto libraries.
Thank you for your support. Now all of them are fixed.

Jing

These are false positives caused by missing libraries. We should
probably work out a proper solution to meaningfully skipping tests we
didn't compile support for.

--js






Re: [Qemu-devel] Question for iotests 188, 189 and 087

2017-07-18 Thread Jing Liu

Hi Eric,


On 2017/7/18 下午10:57, Eric Blake wrote:

On 07/17/2017 10:01 PM, Jing Liu wrote:

Hi all,

Do you anybody have iotests failure: 188, 189 and 087 of the latest qemu
upsteam?

I just wondered if it has something wrong with my test machines because
I have different results with two machines.

It might help to show the ./check command line you are using, as well as
the errors you are getting.  Right now, master is failing on 51, 140,
and 143 when I test across './check -raw', './check -qcow2', and
'./check nbd',

I used ./check -qcow2 to do it.
I tested on different machines and got different results. One is failed 
on 140 and 143. The other one told

me 087, 140, 143, 188, 189.
I guess it is because some configuration or libraries, and will also 
refer to other replies from the mail.

I'll check the details.

Thank you so much.

Jing



but not the three you mention.  But the iotests framework
allows a lot more combinations of testing (and we probably have some
lurking testsuite bugs on some of the more exotic combinations).






[Qemu-devel] Question for iotests 188, 189 and 087

2017-07-17 Thread Jing Liu

Hi all,

Do you anybody have iotests failure: 188, 189 and 087 of the latest qemu 
upsteam?


I just wondered if it has something wrong with my test machines because 
I have different results with two machines.



Thanks.

Jing




Re: [Qemu-devel] [PATCH for-2.7 7/8] s390x/css: Factor out virtual css bridge and bus

2016-07-05 Thread Jing Liu

Dear Conny,

On 07/05/2016 03:56 PM, Cornelia Huck wrote:


+
+static const TypeInfo virtual_css_bridge_info = {
+.name  = TYPE_VIRTUAL_CSS_BRIDGE,
+.parent= TYPE_SYS_BUS_DEVICE,
+.instance_size = sizeof(SysBusDevice),


So we do not use VirtualCssBridge macro which includes max_queue for 2.7?
Just curious that in this way, how to deal with the compat for 2.6, 2.5 ...?

Jing


+.class_init= virtual_css_bridge_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_HOTPLUG_HANDLER },
+{ }
+}
+};




-
-static const TypeInfo virtual_css_bridge_info = {
-.name  = TYPE_VIRTUAL_CSS_BRIDGE,
-.parent= TYPE_SYS_BUS_DEVICE,
-.instance_size = sizeof(SysBusDevice),
-.class_init= virtual_css_bridge_class_init,
-.interfaces = (InterfaceInfo[]) {
-{ TYPE_HOTPLUG_HANDLER },
-{ }
-}
-};
-
  /* virtio-ccw-bus */





-/* virtual css bridge type */
-#define TYPE_VIRTUAL_CSS_BRIDGE "virtual-css-bridge"
-
-/* virtual css bus type */
-typedef struct VirtualCssBus {
-BusState parent_obj;
-} VirtualCssBus;
-
-#define TYPE_VIRTUAL_CSS_BUS "virtual-css-bus"
-#define VIRTUAL_CSS_BUS(obj) \
- OBJECT_CHECK(VirtualCssBus, (obj), TYPE_VIRTUAL_CSS_BUS)
+void virtio_ccw_busdev_unplug(HotplugHandler *hotplug_dev,
+  DeviceState *dev, Error **errp);

  /* virtio-scsi-ccw */

@@ -192,7 +183,6 @@ typedef struct VirtIORNGCcw {
  VirtIORNG vdev;
  } VirtIORNGCcw;

-VirtualCssBus *virtual_css_bus_init(void);
  void virtio_ccw_device_update_status(SubchDev *sch);
  VirtIODevice *virtio_ccw_get_vdev(SubchDev *sch);

diff --git a/include/hw/s390x/css-bridge.h b/include/hw/s390x/css-bridge.h
new file mode 100644
index 000..ad73c1f
--- /dev/null
+++ b/include/hw/s390x/css-bridge.h
@@ -0,0 +1,31 @@
+/*
+ * virtual css bridge definition
+ *
+ * Copyright 2012,2016 IBM Corp.
+ * Author(s): Cornelia Huck 
+ *Pierre Morel 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#ifndef HW_S390X_CSS_BRIDGE_H
+#define HW_S390X_CSS_BRIDGE_H
+#include "qom/object.h"
+#include "hw/qdev-core.h"
+
+/* virtual css bridge */
+#define TYPE_VIRTUAL_CSS_BRIDGE "virtual-css-bridge"
+
+/* virtual css bus type */
+typedef struct VirtualCssBus {
+BusState parent_obj;
+} VirtualCssBus;
+
+#define TYPE_VIRTUAL_CSS_BUS "virtual-css-bus"
+#define VIRTUAL_CSS_BUS(obj) \
+ OBJECT_CHECK(VirtualCssBus, (obj), TYPE_VIRTUAL_CSS_BUS)
+VirtualCssBus *virtual_css_bus_init(void);
+
+#endif