[PATCH 2/5] MAINTAINERS: Add RISC-V IOMMU maintainers

2023-07-19 Thread Tomasz Jeznach
Signed-off-by: Tomasz Jeznach 
---
 MAINTAINERS | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 43bd9afc19..ed8f65d879 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1971,6 +1971,12 @@ F: include/hw/i2c/ppc4xx_i2c.h
 F: hw/intc/ppc-uic.c
 F: include/hw/intc/ppc-uic.h
 
+RISC-V IOMMU
+M: Tomasz Jeznach 
+L: qemu-ri...@nongnu.org
+S: Maintained
+F: hw/riscv/riscv-iommu*
+
 Character devices
 M: Marc-André Lureau 
 R: Paolo Bonzini 
-- 
2.34.1




[PATCH 3/5] exec/memtxattr: add process identifier to the transaction attributes

2023-07-19 Thread Tomasz Jeznach
Extend memory transaction attributes with process identifier to allow
per-request address translation logic to use requester_id / process_id
to identify memory mapping (e.g. enabling IOMMU w/ PASID translations).

Signed-off-by: Tomasz Jeznach 
---
 include/exec/memattrs.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/exec/memattrs.h b/include/exec/memattrs.h
index d04170aa27..61c2dfac14 100644
--- a/include/exec/memattrs.h
+++ b/include/exec/memattrs.h
@@ -64,6 +64,12 @@ typedef struct MemTxAttrs {
 unsigned int target_tlb_bit0 : 1;
 unsigned int target_tlb_bit1 : 1;
 unsigned int target_tlb_bit2 : 1;
+
+/*
+ * PCI PASID support: Limited to 8 bits process identifier.
+ */
+unsigned int pasid:8;
+
 } MemTxAttrs;
 
 /* Bus masters which don't specify any attributes will get this,
-- 
2.34.1




[PATCH 5/5] hw/riscv: virt: support for RISC-V IOMMU platform device.

2023-07-19 Thread Tomasz Jeznach
Adding virt machine property 'iommu' to enable/disable IOMMU
support, with platform RISC-V IOMMU device implementation.

Generate device tree entry for riscv-iommu device, along with
mapping all PCI device identifiers to the single IOMMU device
instance.

Signed-off-by: Tomasz Jeznach 
---
 hw/riscv/Kconfig|   1 +
 hw/riscv/virt.c | 100 +++-
 include/hw/riscv/virt.h |   3 ++
 3 files changed, 103 insertions(+), 1 deletion(-)

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index 617a509f1b..b1a3a9994f 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -41,6 +41,7 @@ config RISCV_VIRT
 select SERIAL
 select RISCV_ACLINT
 select RISCV_APLIC
+select RISCV_IOMMU
 select RISCV_IMSIC
 select SIFIVE_PLIC
 select SIFIVE_TEST
diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
index d90286dc46..49cc7105af 100644
--- a/hw/riscv/virt.c
+++ b/hw/riscv/virt.c
@@ -32,6 +32,7 @@
 #include "hw/core/sysbus-fdt.h"
 #include "target/riscv/pmu.h"
 #include "hw/riscv/riscv_hart.h"
+#include "hw/riscv/iommu.h"
 #include "hw/riscv/virt.h"
 #include "hw/riscv/boot.h"
 #include "hw/riscv/numa.h"
@@ -88,7 +89,8 @@ static const MemMapEntry virt_memmap[] = {
 [VIRT_APLIC_M] =  {  0xc00, APLIC_SIZE(VIRT_CPUS_MAX) },
 [VIRT_APLIC_S] =  {  0xd00, APLIC_SIZE(VIRT_CPUS_MAX) },
 [VIRT_UART0] ={ 0x1000, 0x100 },
-[VIRT_VIRTIO] =   { 0x10001000,0x1000 },
+[VIRT_IOMMU] ={ 0x10001000,0x1000 },
+[VIRT_VIRTIO] =   { 0x10008000,0x1000 }, /* VIRTIO_COUNT */
 [VIRT_FW_CFG] =   { 0x1010,  0x18 },
 [VIRT_FLASH] ={ 0x2000, 0x400 },
 [VIRT_IMSIC_M] =  { 0x2400, VIRT_IMSIC_MAX_SIZE },
@@ -1019,6 +1021,44 @@ static void create_fdt_fw_cfg(RISCVVirtState *s, const 
MemMapEntry *memmap)
 g_free(nodename);
 }
 
+static void create_fdt_iommu(RISCVVirtState *s, const MemMapEntry *memmap,
+uint32_t irq_mmio_phandle)
+{
+MachineState *ms = MACHINE(s);
+uint32_t iommu_phandle;
+const char *irq_names[] = { "cmdq", "fltq", "pm", "priq" };
+char *iommu_node;
+char *pci_node;
+
+pci_node = g_strdup_printf("/soc/pci@%" PRIx64, 
memmap[VIRT_PCIE_ECAM].base);
+iommu_node = g_strdup_printf("/soc/iommu@%" PRIx64, 
memmap[VIRT_IOMMU].base);
+
+iommu_phandle = qemu_fdt_alloc_phandle(ms->fdt);
+qemu_fdt_add_subnode(ms->fdt, iommu_node);
+qemu_fdt_setprop_string(ms->fdt, iommu_node, "compatible", "riscv,iommu");
+qemu_fdt_setprop_cell(ms->fdt, iommu_node, "#iommu-cells", 1);
+qemu_fdt_setprop_cell(ms->fdt, iommu_node, "phandle", iommu_phandle);
+qemu_fdt_setprop_cells(ms->fdt, iommu_node, "reg",
+0x0, memmap[VIRT_IOMMU].base, 0x0, memmap[VIRT_IOMMU].size);
+qemu_fdt_setprop_cell(ms->fdt, iommu_node, "interrupt-parent", 
irq_mmio_phandle);
+qemu_fdt_setprop_string_array(ms->fdt, iommu_node, "interrupt-names",
+(char **) _names, ARRAY_SIZE(irq_names));
+qemu_fdt_setprop_cells(ms->fdt, iommu_node, "interrupts",
+IOMMU_IRQ + 0, 0x4,
+IOMMU_IRQ + 1, 0x4,
+IOMMU_IRQ + 2, 0x4,
+IOMMU_IRQ + 3, 0x4);
+qemu_fdt_setprop_cells(ms->fdt, pci_node, "iommu-map",
+0x0, iommu_phandle, 0x0, 0x);
+g_free(iommu_node);
+g_free(pci_node);
+}
+
+static bool virt_is_iommu_enabled(RISCVVirtState *s)
+{
+return s->iommu != ON_OFF_AUTO_OFF;
+}
+
 static void create_fdt(RISCVVirtState *s, const MemMapEntry *memmap)
 {
 MachineState *ms = MACHINE(s);
@@ -1051,6 +1091,10 @@ static void create_fdt(RISCVVirtState *s, const 
MemMapEntry *memmap)
 
 create_fdt_pcie(s, memmap, irq_pcie_phandle, msi_pcie_phandle);
 
+if (virt_is_iommu_enabled(s)) {
+create_fdt_iommu(s, memmap, irq_mmio_phandle);
+}
+
 create_fdt_reset(s, memmap, );
 
 create_fdt_uart(s, memmap, irq_mmio_phandle);
@@ -1210,6 +1254,31 @@ static DeviceState *virt_create_aia(RISCVVirtAIAType 
aia_type, int aia_guests,
 return aplic_m;
 }
 
+static DeviceState *virt_create_iommu(RISCVVirtState *s, DeviceState *irqchip)
+{
+DeviceState *iommu;
+int i;
+
+iommu = qdev_new(TYPE_RISCV_IOMMU_SYS);
+
+if (s->aia_type != VIRT_AIA_TYPE_APLIC_IMSIC) {
+/* Disable MSI_FLAT [22], MSI_MRIF [23] if IMSIC is not enabled. */
+qdev_prop_set_uint64(iommu, "capabilities", ~(BIT_ULL(22) | 
BIT_ULL(23)));
+}
+
+/* Fixed base register address */
+qdev_prop_set_uint64(iommu, "addr", virt_memmap[VIRT_IOMMU].base);
+
+sysbus_realize_and_unref(SYS_BUS_DEVICE(iommu), _fatal);
+
+for (i = 0; i < 4; i++) {
+sysbus_connect_irq(SYS_BUS_DEVICE(iommu), i,
+qdev_get_gpio_in(irqchip, IOMMU_IRQ + i));
+}
+
+return iommu;
+}
+
 static void create_platform_bus(RISCVVirtState *s, DeviceState *irqchip)
 {
 DeviceState *dev;
@@ -1506,6 +1575,10 @@ static void virt_machine_init(MachineState 

[PATCH 4/5] hw/riscv: IOMMU: use process identifier from transaction attributes.

2023-07-19 Thread Tomasz Jeznach
Use iommu index as process identifier, linking transaction
memory attributes with translation request.

Signed-off-by: Tomasz Jeznach 
---
 hw/riscv/riscv-iommu.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/hw/riscv/riscv-iommu.c b/hw/riscv/riscv-iommu.c
index fd271b2988..62525df2e2 100644
--- a/hw/riscv/riscv-iommu.c
+++ b/hw/riscv/riscv-iommu.c
@@ -2236,6 +2236,12 @@ static void riscv_iommu_realize(DeviceState *dev, Error 
**errp)
 /* Report QEMU target physical address space limits */
 s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS, 
TARGET_PHYS_ADDR_SPACE_BITS);
 
+/* Restricted to the size of MemTxAttrs.pasid field. */
+if (s->cap & RISCV_IOMMU_CAP_PD8) {
+MemTxAttrs attrs = { .pasid = ~0 };
+s->pasid_bits = ctz32(~((unsigned)attrs.pasid));
+}
+
 /* Adjust reported PD capabilities */
 if (s->pasid_bits < 20) {
 s->cap &= ~RISCV_IOMMU_CAP_PD20;
@@ -2506,12 +2512,13 @@ void riscv_iommu_pci_setup_iommu(RISCVIOMMUState 
*iommu, PCIBus *bus,
 static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
 MemTxAttrs attrs)
 {
-return RISCV_IOMMU_NOPASID;
+return attrs.unspecified ? RISCV_IOMMU_NOPASID : (int)attrs.pasid;
 }
 
 static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
 {
-return 1;
+RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
+return 1 << as->iommu->pasid_bits;
 }
 
 static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
-- 
2.34.1




[PATCH 1/5] hw/riscv: Introduction of RISC-V IOMMU device

2023-07-19 Thread Tomasz Jeznach
The RISC-V IOMMU specification is now ratified as-per the RISC-V international
process [1]. The latest frozen specification can be found at:
https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf

The patch adds device emulation for RISC-V IOMMU which supports device and 
process
context lookups, command and fault queue interfaces, two stage address 
translation
logic with Sv32, Sv39, Sv48, Sv57 addressing modes, address translation cache,
MSI remapping with FLAT/MRIF modes, initial ATS and PRI interfaces, debug 
capabilities,
hardware performance counters. Platform and PCIe device instantiation is 
supported,
with wire-signaled and message-signaled interrupt capabilities.

Hardware interface definition file is shared with Linux kernel driver 
implementation,
available in the maintainer's branch riscv_iommu_v1 at 
https://github.com/tjeznach/linux.

Co-developed-by: Sebastien Boeuf 
Signed-off-by: Sebastien Boeuf 
Signed-off-by: Tomasz Jeznach 
---
 hw/riscv/Kconfig|3 +
 hw/riscv/meson.build|1 +
 hw/riscv/riscv-iommu-bits.h |  749 +++
 hw/riscv/riscv-iommu-pci.c  |  181 +++
 hw/riscv/riscv-iommu-sys.c  |  123 ++
 hw/riscv/riscv-iommu.c  | 2539 +++
 hw/riscv/riscv-iommu.h  |  152 +++
 hw/riscv/trace-events   |   14 +
 hw/riscv/trace.h|2 +
 include/hw/riscv/iommu.h|   40 +
 meson.build |1 +
 11 files changed, 3805 insertions(+)
 create mode 100644 hw/riscv/riscv-iommu-bits.h
 create mode 100644 hw/riscv/riscv-iommu-pci.c
 create mode 100644 hw/riscv/riscv-iommu-sys.c
 create mode 100644 hw/riscv/riscv-iommu.c
 create mode 100644 hw/riscv/riscv-iommu.h
 create mode 100644 hw/riscv/trace-events
 create mode 100644 hw/riscv/trace.h
 create mode 100644 include/hw/riscv/iommu.h

diff --git a/hw/riscv/Kconfig b/hw/riscv/Kconfig
index b6a5eb4452..617a509f1b 100644
--- a/hw/riscv/Kconfig
+++ b/hw/riscv/Kconfig
@@ -1,3 +1,6 @@
+config RISCV_IOMMU
+bool
+
 config RISCV_NUMA
 bool
 
diff --git a/hw/riscv/meson.build b/hw/riscv/meson.build
index 2f7ee81be3..e37c5d78e2 100644
--- a/hw/riscv/meson.build
+++ b/hw/riscv/meson.build
@@ -10,5 +10,6 @@ riscv_ss.add(when: 'CONFIG_SIFIVE_U', if_true: 
files('sifive_u.c'))
 riscv_ss.add(when: 'CONFIG_SPIKE', if_true: files('spike.c'))
 riscv_ss.add(when: 'CONFIG_MICROCHIP_PFSOC', if_true: 
files('microchip_pfsoc.c'))
 riscv_ss.add(when: 'CONFIG_ACPI', if_true: files('virt-acpi-build.c'))
+riscv_ss.add(when: 'CONFIG_RISCV_IOMMU', if_true: files('riscv-iommu.c', 
'riscv-iommu-pci.c', 'riscv-iommu-sys.c'))
 
 hw_arch += {'riscv': riscv_ss}
diff --git a/hw/riscv/riscv-iommu-bits.h b/hw/riscv/riscv-iommu-bits.h
new file mode 100644
index 00..9ce713361f
--- /dev/null
+++ b/hw/riscv/riscv-iommu-bits.h
@@ -0,0 +1,749 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2022-2023 Rivos Inc.
+ * Copyright © 2023 FORTH-ICS/CARV
+ * Copyright © 2023 RISC-V IOMMU Task Group
+ *
+ * RISC-V Ziommu - Register Layout and Data Structures.
+ *
+ * Based on the 'RISC-V IOMMU Architecture Specification', Version 1.0
+ * Published at  https://github.com/riscv-non-isa/riscv-iommu
+ *
+ */
+
+#ifndef HW_RISCV_IOMMU_BITS_H
+#define HW_RISCV_IOMMU_BITS_H
+
+/*
+ * This file is based on Linux RISC-V IOMMU file
+ * located at 'drivers/iommu/riscv/iommu-bits.h'
+ */
+
+#include "qemu/osdep.h"
+
+#define RISCV_IOMMU_SPEC_DOT_VER 0x010
+
+#ifndef GENMASK_ULL
+#define GENMASK_ULL(h, l) (((~0ULL) >> (63 - (h) + (l))) << (l))
+#endif
+
+/*
+ * Chapter 5: Memory Mapped register interface
+ */
+
+/* Common field positions */
+#define RISCV_IOMMU_PPN_FIELD   GENMASK_ULL(53, 10)
+#define RISCV_IOMMU_QUEUE_LOGSZ_FIELD   GENMASK_ULL(4, 0)
+#define RISCV_IOMMU_QUEUE_INDEX_FIELD   GENMASK_ULL(31, 0)
+#define RISCV_IOMMU_QUEUE_ENABLEBIT(0)
+#define RISCV_IOMMU_QUEUE_INTR_ENABLE   BIT(1)
+#define RISCV_IOMMU_QUEUE_MEM_FAULT BIT(8)
+#define RISCV_IOMMU_QUEUE_OVERFLOW  BIT(9)
+#define RISCV_IOMMU_QUEUE_ACTIVEBIT(16)
+#define RISCV_IOMMU_QUEUE_BUSY  BIT(17)
+#define RISCV_IOMMU_ATP_PPN_FIELD   GENMASK_ULL(43, 0)
+#define RISCV_IOMMU_ATP_MODE_FIELD  GENMASK_ULL(63, 60)
+
+/* 5.3 IOMMU Capabilities (64bits) */
+#define RISCV_IOMMU_REG_CAP 0x
+#define RISCV_IOMMU_CAP_VERSION GENMASK_ULL(7, 0)
+#define RISCV_IOMMU_CAP_S_SV32  BIT_ULL(8)
+#define RISCV_IOMMU_CAP_S_SV39  BIT_ULL(9)
+#define RISCV_IOMMU_CAP_S_SV48  BIT_ULL(10)
+#define RISCV_IOMMU_CAP_S_SV57  BIT_ULL(11)
+#define RISCV_IOMMU_CAP_SVPBMT  BIT_ULL(15)
+#define RISCV_IOMMU_CAP_G_SV32  BIT_ULL(16)
+#define RISCV_IOMMU_CAP_G_SV39  BIT_ULL(17)
+#define RISCV_IOMMU_CAP_G_SV48  BIT_ULL(18)
+#define RISCV_IOMMU_CAP_G_SV57  BIT_ULL(19)
+#define RISCV_IOMMU_CAP_MSI_FLATBIT_ULL(22)
+#define RISCV_IOMMU_CAP_MSI_MRIFBIT_ULL(23)
+#define 

[PATCH 0/5] QEMU RISC-V IOMMU Support

2023-07-19 Thread Tomasz Jeznach
The RISC-V IOMMU specification is now ratified as-per the RISC-V international
process [1]. The latest frozen specification can be found at:
https://github.com/riscv-non-isa/riscv-iommu/releases/download/v1.0/riscv-iommu.pdf

This series introduces a RISC-V IOMMU device emulation implementation with two 
stage
address translation logic, device and process translation context mapping and 
queue
interfaces, along with riscv/virt machine bindings (patch 5) and memory 
attributes
extensions for PASID support (patch 3,4).

This series is based on incremental patches created during RISC-V International 
IOMMU
Task Group discussions and specification development process, with original 
series
available in the maintainer's repository branch [2].

These patches can also be found in the riscv_iommu_v1 branch at:
https://github.com/tjeznach/qemu/tree/riscv_iommu_v1

To test this series, use Linux v6.5-rc2 with RISC-V IOMMU implementation 
available in
the riscv_iommu_v1 branch at:
https://github.com/tjeznach/linux/tree/riscv_iommu_v1

References:
[1] - https://wiki.riscv.org/display/HOME/Specification+Status
[2] - https://github.com/tjeznach/qemu/tree/tjeznach/riscv-iommu-20230719

Tomasz Jeznach (5):
  hw/riscv: Introduction of RISC-V IOMMU device
  MAINTAINERS: Add RISC-V IOMMU maintainers
  exec/memtxattr: add process identifier to the transaction attributes
  hw/riscv: IOMMU: use process identifier from transaction attributes.
  hw/riscv: virt: support for RISC-V IOMMU platform device.

 MAINTAINERS |6 +
 hw/riscv/Kconfig|4 +
 hw/riscv/meson.build|1 +
 hw/riscv/riscv-iommu-bits.h |  749 +++
 hw/riscv/riscv-iommu-pci.c  |  181 +++
 hw/riscv/riscv-iommu-sys.c  |  123 ++
 hw/riscv/riscv-iommu.c  | 2546 +++
 hw/riscv/riscv-iommu.h  |  152 +++
 hw/riscv/trace-events   |   14 +
 hw/riscv/trace.h|2 +
 hw/riscv/virt.c |  100 +-
 include/exec/memattrs.h |6 +
 include/hw/riscv/iommu.h|   40 +
 include/hw/riscv/virt.h |3 +
 meson.build |1 +
 15 files changed, 3927 insertions(+), 1 deletion(-)
 create mode 100644 hw/riscv/riscv-iommu-bits.h
 create mode 100644 hw/riscv/riscv-iommu-pci.c
 create mode 100644 hw/riscv/riscv-iommu-sys.c
 create mode 100644 hw/riscv/riscv-iommu.c
 create mode 100644 hw/riscv/riscv-iommu.h
 create mode 100644 hw/riscv/trace-events
 create mode 100644 hw/riscv/trace.h
 create mode 100644 include/hw/riscv/iommu.h

-- 
2.34.1




Re: [RFC 1/1] virtio-pci: add SR-IOV capability

2023-07-19 Thread Jason Wang
On Wed, Jul 19, 2023 at 9:59 AM Yui Washizu  wrote:
>
> This enables SR-IOV emulation on virtio-pci devices by adding SR-IOV 
> capability
> It also introduces a newly added property 'sriov_max_vfs'
> to enable or disable the SR-IOV feature on the virtio-pci device in guest,
> as well as to specify the maximum number of VFs that can be created in the 
> guest.
> Currently only virtio-net is supported.
> Also, the vendor ID and device ID remain the same for both the PF and VF,
> enabling existing guest PF drivers to be used in the VF without any 
> modifications.
>
> Signed-off-by: Yui Washizu 
> ---
>  hw/pci/msix.c  |  8 +++--
>  hw/pci/pci.c   |  4 +++
>  hw/virtio/virtio-pci.c | 62 ++
>  include/hw/virtio/virtio-pci.h |  1 +
>  4 files changed, 66 insertions(+), 9 deletions(-)
>
> diff --git a/hw/pci/msix.c b/hw/pci/msix.c
> index ab8869d9d0..3b94ce389f 100644
> --- a/hw/pci/msix.c
> +++ b/hw/pci/msix.c
> @@ -421,8 +421,12 @@ int msix_init_exclusive_bar(PCIDevice *dev, unsigned 
> short nentries,
>  return ret;
>  }
>
> -pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY,
> - >msix_exclusive_bar);
> +if (pci_is_vf(dev)) {
> +pcie_sriov_vf_register_bar(dev, bar_nr, >msix_exclusive_bar);
> +} else {
> +pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY,
> + >msix_exclusive_bar);
> +}
>
>  return 0;
>  }
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index e2eb4c3b4a..cbd50b38ea 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -2325,6 +2325,10 @@ static void pci_add_option_rom(PCIDevice *pdev, bool 
> is_default_rom,
>  return;
>  }
>
> +if (pci_is_vf(pdev)) {
> +return;
> +}
> +
>  if (!pdev->rom_bar) {
>  /*
>   * Load rom via fw_cfg instead of creating a rom bar,
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index edbc0daa18..2315c2647a 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -49,6 +49,8 @@
>   * configuration space */
>  #define VIRTIO_PCI_CONFIG_SIZE(dev) 
> VIRTIO_PCI_CONFIG_OFF(msix_enabled(dev))
>
> +#define VIRTIO_MAX_VFS 127
> +
>  static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
> VirtIOPCIProxy *dev);
>  static void virtio_pci_reset(DeviceState *qdev);
> @@ -1907,6 +1909,11 @@ static void virtio_pci_pre_plugged(DeviceState *d, 
> Error **errp)
>
>  if (virtio_pci_modern(proxy)) {
>  virtio_add_feature(>host_features, VIRTIO_F_VERSION_1);
> +if (proxy->sriov_max_vfs) {
> +virtio_add_feature(>host_features, VIRTIO_F_SR_IOV);
> +}
> +} else if (proxy->sriov_max_vfs) {
> +error_setg(errp, "VirtIO PCI modern is required for the use of 
> SR-IOV");
>  }
>
>  virtio_add_feature(>host_features, VIRTIO_F_BAD_FEATURE);
> @@ -2015,22 +2022,62 @@ static void virtio_pci_device_plugged(DeviceState *d, 
> Error **errp)
>  virtio_pci_modern_mem_region_map(proxy, >device, );
>  virtio_pci_modern_mem_region_map(proxy, >notify, );
>
> +if (!pci_is_vf(>pci_dev) && proxy->sriov_max_vfs) {
> +if (virtio_bus_get_vdev_id(bus) != VIRTIO_ID_NET) {
> +error_setg(errp, "sriov_max_vfs prop is not supported by %s",
> +   proxy->pci_dev.name);
> +return;
> +}
> +if (proxy->sriov_max_vfs > VIRTIO_MAX_VFS) {
> +error_setg(errp, "sriov_max_vfs must be between 0 and %d",
> +   VIRTIO_MAX_VFS);
> +return;
> +}
> +
> +pcie_sriov_pf_init(>pci_dev, PCI_CONFIG_SPACE_SIZE,
> +   proxy->pci_dev.name,
> +   PCI_DEVICE_ID_VIRTIO_10_BASE
> +   + virtio_bus_get_vdev_id(bus),
> +   proxy->sriov_max_vfs, proxy->sriov_max_vfs, 
> 1, 1);
> +if (proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY) {
> +pcie_sriov_pf_init_vf_bar(>pci_dev, 
> proxy->modern_io_bar_idx,
> +  PCI_BASE_ADDRESS_SPACE_IO, 4);
> +}
> +if (proxy->nvectors) {
> +pcie_sriov_pf_init_vf_bar(>pci_dev, 
> proxy->msix_bar_idx,
> +  PCI_BASE_ADDRESS_SPACE_MEMORY, 4 * 
> 1024);
> +}
> +pcie_sriov_pf_init_vf_bar(>pci_dev, 
> proxy->modern_mem_bar_idx,
> +  PCI_BASE_ADDRESS_SPACE_MEMORY |
> +  PCI_BASE_ADDRESS_MEM_PREFETCH |
> +  PCI_BASE_ADDRESS_MEM_TYPE_64,
> +  16 * 1024);
> +}
> +
>  if (modern_pio) {
>  memory_region_init(>io_bar, OBJECT(proxy),

Re: [RFC 0/1] virtio-net: add support for SR-IOV emulation

2023-07-19 Thread Jason Wang
On Wed, Jul 19, 2023 at 9:59 AM Yui Washizu  wrote:
>
> This patch series is the first step towards enabling
> hardware offloading of the L2 packet switching feature on virtio-net device 
> to host machine.
> We are considering that this hardware offloading enables
> the use of high-performance networks in virtual infrastructures,
> such as container infrastructures on VMs.
>
> To enable L2 packet switching by SR-IOV VFs, we are considering the following:
> - making the guest recognize virtio-net devices as SR-IOV PF devices
>   (achieved with this patch series)
> - allowing virtio-net devices to connect SR-IOV VFs to the backend networks,
>   leaving the L2 packet switching feature to the management layer like libvirt

Could you please show the qemu command line you want to propose here?

>   - This makes hardware offloading of L2 packet switching possible.
> For example, when using vDPA devices, it allows the guest
> to utilize SR-IOV NIC embedded switch of hosts.

This would be interesting.

Thanks

>
> This patch series aims to enable SR-IOV emulation on virtio-net devices.
> With this series, the guest can identify the virtio-net device as an SR-IOV 
> PF device.
> The newly added property 'sriov_max_vfs' allows us to enable the SR-IOV 
> feature
> on the virtio-net device.
> Currently, we are unable to specify the properties of a VF created from the 
> guest.
> The properties are set to their default values.
> In the future, we plan to allow users to set the properties.
>
> qemu-system-x86_64 --device virtio-net,sriov_max_vfs=
> # when 'sriov_max_vfs' is present, the SR-IOV feature will be automatically 
> enabled
> #  means the max number of VF on guest
>
> Example commands to create VFs in virtio-net device from the guest:
>
> guest% readlink -f /sys/class/net/eth1/device
>  /sys/devices/pci:00/:00:02.0/:01:00.0/virtio1
> guest% echo "2" > 
> /sys/devices/pci:00/:00:02.0/:01:00.0/sriov_numvfs
> guest% ip link show
>  eth0: 
>  eth1: 
>  eth2:  #virtual VF created
>  eth3:  #virtual VF created
>
> Please note that communication between VF and PF/VF is not possible by this 
> patch series itself.
>
> Yui Washizu (1):
>   virtio-pci: add SR-IOV capability
>
>  hw/pci/msix.c  |  8 +++--
>  hw/pci/pci.c   |  4 +++
>  hw/virtio/virtio-pci.c | 62 ++
>  include/hw/virtio/virtio-pci.h |  1 +
>  4 files changed, 66 insertions(+), 9 deletions(-)
>
> --
> 2.39.3
>




Re: Reducing vdpa migration downtime because of memory pin / maps

2023-07-19 Thread Si-Wei Liu




On 7/19/2023 3:40 AM, Eugenio Perez Martin wrote:

On Mon, Jul 17, 2023 at 9:57 PM Si-Wei Liu  wrote:

Hey,

I am now back from the break. Sorry for the delayed response, please see
in line.

On 7/9/2023 11:04 PM, Eugenio Perez Martin wrote:

On Sat, Jul 8, 2023 at 11:14 AM Si-Wei Liu  wrote:


On 7/5/2023 10:46 PM, Eugenio Perez Martin wrote:

On Thu, Jul 6, 2023 at 2:13 AM Si-Wei Liu  wrote:

On 7/5/2023 11:03 AM, Eugenio Perez Martin wrote:

On Tue, Jun 27, 2023 at 8:36 AM Si-Wei Liu  wrote:

On 6/9/2023 7:32 AM, Eugenio Perez Martin wrote:

On Fri, Jun 9, 2023 at 12:39 AM Si-Wei Liu  wrote:

On 6/7/23 01:08, Eugenio Perez Martin wrote:

On Wed, Jun 7, 2023 at 12:43 AM Si-Wei Liu  wrote:

Sorry for reviving this old thread, I lost the best timing to follow up
on this while I was on vacation. I have been working on this and found
out some discrepancy, please see below.

On 4/5/23 04:37, Eugenio Perez Martin wrote:

Hi!

As mentioned in the last upstream virtio-networking meeting, one of
the factors that adds more downtime to migration is the handling of
the guest memory (pin, map, etc). At this moment this handling is
bound to the virtio life cycle (DRIVER_OK, RESET). In that sense, the
destination device waits until all the guest memory / state is
migrated to start pinning all the memory.

The proposal is to bind it to the char device life cycle (open vs
close),

Hmmm, really? If it's the life cycle for char device, the next guest /
qemu launch on the same vhost-vdpa device node won't make it work.


Maybe my sentence was not accurate, but I think we're on the same page here.

Two qemu instances opening the same char device at the same time are
not allowed, and vhost_vdpa_release cleans all the maps. So the next
qemu that opens the char device should see a clean device anyway.

I mean the pin can't be done at the time of char device open, where the
user address space is not known/bound yet. The earliest point possible
for pinning would be until the vhost_attach_mm() call from SET_OWNER is
done.

Maybe we are deviating, let me start again.

Using QEMU code, what I'm proposing is to modify the lifecycle of the
.listener member of struct vhost_vdpa.

At this moment, the memory listener is registered at
vhost_vdpa_dev_start(dev, started=true) call for the last vhost_dev,
and is unregistered in both vhost_vdpa_reset_status and
vhost_vdpa_cleanup.

My original proposal was just to move the memory listener registration
to the last vhost_vdpa_init, and remove the unregister from
vhost_vdpa_reset_status. The calls to vhost_vdpa_dma_map/unmap would
be the same, the device should not realize this change.

This can address LM downtime latency for sure, but it won't help
downtime during dynamic SVQ switch - which still needs to go through the
full unmap/map cycle (that includes the slow part for pinning) from
passthrough to SVQ mode. Note that not every device could work with a
separate ASID for SVQ descriptors. The fix should expect to work on
normal vDPA vendor devices without a separate descriptor ASID, with
platform IOMMU underneath or with on-chip IOMMU.


At this moment the SVQ switch is very inefficient mapping-wise, as it
unmaps all the GPA->HVA maps and overrides them. In particular, SVQ is
allocated in low regions of the iova space, and then the guest memory
is allocated in this new IOVA region incrementally.

Yep. The key to build this fast path for SVQ switching I think is to
maintain the identity mapping for the passthrough queues so that QEMU
can reuse the old mappings for guest memory (e.g. GIOVA identity mapped
to GPA) while incrementally adding new mappings for SVQ vrings.


We can optimize that if we place SVQ in a free GPA area instead.

Here's a question though: it might not be hard to find a free GPA range
for the non-vIOMMU case (allocate iova from beyond the 48bit or 52bit
ranges), but I'm not sure if easy to find a free GIOVA range for the
vIOMMU case - particularly this has to work in the same entire 64bit
IOVA address ranges that (for now) QEMU won't be able to "reserve" a
specific IOVA ranges for SVQ from the vIOMMU. Do you foresee this can be
done for every QEMU emulated vIOMMU (intel-iommu amd-iommu, arm smmu and
virtio-iommu) so that we can call it out as a generic means for SVQ
switching optimization?


In the case vIOMMU allocates a new block we will use the same algorithm as now:
* Find a new free IOVA chunk of the same size
* Map this new SVQ IOVA, that may or may not be the same as SVQ

Since we must go through the translation phase to sanitize guest's
available descriptors anyway, it has zero added cost.

Not sure I followed, this can work but doesn't seem able to reuse the
old host kernel mappings for guest memory, hence still requires remap of
the entire host IOVA ranges when SVQ IOVA comes along. I think by
maintaining 1:1 identity map on guest memory, we don't have to bother
tearing down existing HVA->HPA mappings in kernel thus save the
expensive pinning calls at large. I 

Re: [RFC 1/1] virtio-pci: add SR-IOV capability

2023-07-19 Thread Akihiko Odaki

On 2023/07/19 10:56, Yui Washizu wrote:

This enables SR-IOV emulation on virtio-pci devices by adding SR-IOV capability
It also introduces a newly added property 'sriov_max_vfs'
to enable or disable the SR-IOV feature on the virtio-pci device in guest,
as well as to specify the maximum number of VFs that can be created in the 
guest.
Currently only virtio-net is supported.
Also, the vendor ID and device ID remain the same for both the PF and VF,
enabling existing guest PF drivers to be used in the VF without any 
modifications.

Signed-off-by: Yui Washizu 
---
  hw/pci/msix.c  |  8 +++--
  hw/pci/pci.c   |  4 +++
  hw/virtio/virtio-pci.c | 62 ++
  include/hw/virtio/virtio-pci.h |  1 +
  4 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index ab8869d9d0..3b94ce389f 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -421,8 +421,12 @@ int msix_init_exclusive_bar(PCIDevice *dev, unsigned short 
nentries,
  return ret;
  }
  
-pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY,

- >msix_exclusive_bar);
+if (pci_is_vf(dev)) {
+pcie_sriov_vf_register_bar(dev, bar_nr, >msix_exclusive_bar);
+} else {
+pci_register_bar(dev, bar_nr, PCI_BASE_ADDRESS_SPACE_MEMORY,
+ >msix_exclusive_bar);
+}
  
  return 0;

  }
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e2eb4c3b4a..cbd50b38ea 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2325,6 +2325,10 @@ static void pci_add_option_rom(PCIDevice *pdev, bool 
is_default_rom,
  return;
  }
  
+if (pci_is_vf(pdev)) {

+return;
+}
+
  if (!pdev->rom_bar) {
  /*
   * Load rom via fw_cfg instead of creating a rom bar,
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index edbc0daa18..2315c2647a 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -49,6 +49,8 @@
   * configuration space */
  #define VIRTIO_PCI_CONFIG_SIZE(dev) 
VIRTIO_PCI_CONFIG_OFF(msix_enabled(dev))
  
+#define VIRTIO_MAX_VFS 127

+
  static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
 VirtIOPCIProxy *dev);
  static void virtio_pci_reset(DeviceState *qdev);
@@ -1907,6 +1909,11 @@ static void virtio_pci_pre_plugged(DeviceState *d, Error 
**errp)
  
  if (virtio_pci_modern(proxy)) {

  virtio_add_feature(>host_features, VIRTIO_F_VERSION_1);
+if (proxy->sriov_max_vfs) {
+virtio_add_feature(>host_features, VIRTIO_F_SR_IOV);
+}
+} else if (proxy->sriov_max_vfs) {
+error_setg(errp, "VirtIO PCI modern is required for the use of 
SR-IOV");


Missing: return;


  }
  
  virtio_add_feature(>host_features, VIRTIO_F_BAD_FEATURE);

@@ -2015,22 +2022,62 @@ static void virtio_pci_device_plugged(DeviceState *d, 
Error **errp)
  virtio_pci_modern_mem_region_map(proxy, >device, );
  virtio_pci_modern_mem_region_map(proxy, >notify, );
  
+if (!pci_is_vf(>pci_dev) && proxy->sriov_max_vfs) {

+if (virtio_bus_get_vdev_id(bus) != VIRTIO_ID_NET) {
+error_setg(errp, "sriov_max_vfs prop is not supported by %s",
+   proxy->pci_dev.name);
+return;
+}
+if (proxy->sriov_max_vfs > VIRTIO_MAX_VFS) {
+error_setg(errp, "sriov_max_vfs must be between 0 and %d",
+   VIRTIO_MAX_VFS);
+return;
+}
+
+pcie_sriov_pf_init(>pci_dev, PCI_CONFIG_SPACE_SIZE,
+   proxy->pci_dev.name,
+   PCI_DEVICE_ID_VIRTIO_10_BASE
+   + virtio_bus_get_vdev_id(bus),
+   proxy->sriov_max_vfs, proxy->sriov_max_vfs, 1, 
1);
+if (proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY) {
+pcie_sriov_pf_init_vf_bar(>pci_dev, 
proxy->modern_io_bar_idx,
+  PCI_BASE_ADDRESS_SPACE_IO, 4);
+}
+if (proxy->nvectors) {
+pcie_sriov_pf_init_vf_bar(>pci_dev, proxy->msix_bar_idx,
+  PCI_BASE_ADDRESS_SPACE_MEMORY, 4 * 
1024);
+}
+pcie_sriov_pf_init_vf_bar(>pci_dev, 
proxy->modern_mem_bar_idx,
+  PCI_BASE_ADDRESS_SPACE_MEMORY |
+  PCI_BASE_ADDRESS_MEM_PREFETCH |
+  PCI_BASE_ADDRESS_MEM_TYPE_64,
+  16 * 1024);
+}
+
  if (modern_pio) {
  memory_region_init(>io_bar, OBJECT(proxy),
 "virtio-pci-io", 0x4);
  
-pci_register_bar(>pci_dev, proxy->modern_io_bar_idx,

- PCI_BASE_ADDRESS_SPACE_IO, 

Re: [PATCH 4/4] virtio-net: Added uso check

2023-07-19 Thread Akihiko Odaki
Placing this patch after "[PATCH 3/4] virtio-net: added USO support" may 
interfere with "git bisect" on a host that does not support USO as 
virtio-net can advertise USO support where it's not supported.


I suggest combining this patch with the aforementioned earlier patch. 
It will make the entire patch big so I think it's also better to extract 
the change for tap into another patch.


On 2023/07/20 0:21, Yuri Benditovich wrote:

From: Andrew Melnychenko 

Added tap uso check with stubs for non-Linux systems.

Signed-off-by: Yuri Benditovich 
Signed-off-by: Andrew Melnychenko 
---
  hw/net/virtio-net.c | 15 +++
  include/net/net.h   |  3 +++
  net/net.c   |  9 +
  net/tap-bsd.c   |  5 +
  net/tap-linux.c | 12 
  net/tap-solaris.c   |  5 +
  net/tap-stub.c  |  5 +
  net/tap.c   | 12 
  net/tap_int.h   |  1 +
  9 files changed, 67 insertions(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index e76cad923b..d950d3a77f 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
  return n->has_ufo;
  }
  
+static int peer_has_uso(VirtIONet *n)

+{
+if (!peer_has_vnet_hdr(n)) {
+return 0;
+}
+
+return qemu_has_uso(qemu_get_queue(n->nic)->peer);
+}
+
  static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
 int version_1, int hash_report)
  {
@@ -808,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice 
*vdev, uint64_t features,
  virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
  }
  
+if (!peer_has_uso(n)) {

+virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+}
+
  if (!get_vhost_net(nc->peer)) {
  return features;
  }
diff --git a/include/net/net.h b/include/net/net.h
index b5ccfbbffb..330d285930 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -54,6 +54,7 @@ typedef void (LinkStatusChanged)(NetClientState *);
  typedef void (NetClientDestructor)(NetClientState *);
  typedef RxFilterInfo *(QueryRxFilter)(NetClientState *);
  typedef bool (HasUfo)(NetClientState *);
+typedef bool (HasUso)(NetClientState *);
  typedef bool (HasVnetHdr)(NetClientState *);
  typedef bool (HasVnetHdrLen)(NetClientState *, int);
  typedef bool (GetUsingVnetHdr)(NetClientState *);
@@ -84,6 +85,7 @@ typedef struct NetClientInfo {
  QueryRxFilter *query_rx_filter;
  NetPoll *poll;
  HasUfo *has_ufo;
+HasUso *has_uso;
  HasVnetHdr *has_vnet_hdr;
  HasVnetHdrLen *has_vnet_hdr_len;
  GetUsingVnetHdr *get_using_vnet_hdr;
@@ -187,6 +189,7 @@ void qemu_set_info_str(NetClientState *nc,
 const char *fmt, ...) G_GNUC_PRINTF(2, 3);
  void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
  bool qemu_has_ufo(NetClientState *nc);
+bool qemu_has_uso(NetClientState *nc);
  bool qemu_has_vnet_hdr(NetClientState *nc);
  bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
  bool qemu_get_using_vnet_hdr(NetClientState *nc);
diff --git a/net/net.c b/net/net.c
index 543e6dec43..b110e61f66 100644
--- a/net/net.c
+++ b/net/net.c
@@ -495,6 +495,15 @@ bool qemu_has_ufo(NetClientState *nc)
  return nc->info->has_ufo(nc);
  }
  
+bool qemu_has_uso(NetClientState *nc)

+{
+if (!nc || !nc->info->has_uso) {
+return false;
+}
+
+return nc->info->has_uso(nc);
+}
+
  bool qemu_has_vnet_hdr(NetClientState *nc)
  {
  if (!nc || !nc->info->has_vnet_hdr) {
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index abd16a2ad2..274ea7bd2c 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -212,6 +212,11 @@ int tap_probe_has_ufo(int fd)
  return 0;
  }
  
+int tap_probe_has_uso(int fd)

+{
+return 0;
+}
+
  int tap_probe_vnet_hdr_len(int fd, int len)
  {
  return 0;
diff --git a/net/tap-linux.c b/net/tap-linux.c
index 30fcca1bc2..c7e514ecb0 100644
--- a/net/tap-linux.c
+++ b/net/tap-linux.c
@@ -173,6 +173,18 @@ int tap_probe_has_ufo(int fd)
  return 1;
  }
  
+int tap_probe_has_uso(int fd)

+{
+unsigned offload;
+
+offload = TUN_F_CSUM | TUN_F_USO4 | TUN_F_USO6;
+
+if (ioctl(fd, TUNSETOFFLOAD, offload) < 0) {
+return 0;
+}
+return 1;
+}
+
  /* Verify that we can assign given length */
  int tap_probe_vnet_hdr_len(int fd, int len)
  {
diff --git a/net/tap-solaris.c b/net/tap-solaris.c
index a617a10e5c..08b13af512 100644
--- a/net/tap-solaris.c
+++ b/net/tap-solaris.c
@@ -216,6 +216,11 @@ int tap_probe_has_ufo(int fd)
  return 0;
  }
  
+int tap_probe_has_uso(int fd)

+{
+return 0;
+}
+
  int tap_probe_vnet_hdr_len(int fd, int len)
  {
  return 0;
diff --git a/net/tap-stub.c b/net/tap-stub.c
index ac8dfc03b4..4b24f61e3a 100644
--- a/net/tap-stub.c
+++ b/net/tap-stub.c
@@ -47,6 +47,11 @@ int tap_probe_has_ufo(int 

Re: [PATCH 3/4] virtio-net: added USO support

2023-07-19 Thread Akihiko Odaki

On 2023/07/20 0:21, Yuri Benditovich wrote:

virtio-net can suggest USO features TX, RX v4 and RX v6,
depending on kernel TUN ability to support them. These
features require explicit enable in command-line.


Shouldn't we enable these by default as the other offload features are?



Signed-off-by: Yuri Benditovich 
---
  hw/net/virtio-net.c | 16 ++--
  1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index d2311e7d6e..e76cad923b 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -796,6 +796,10 @@ static uint64_t virtio_net_get_features(VirtIODevice 
*vdev, uint64_t features,
  virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
  virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
  
+virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);

+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+
  virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
  }
  
@@ -864,14 +868,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)

  !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
  }
  
-static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)

+static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
  {
  static const uint64_t guest_offloads_mask =
  (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
  (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
  (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
  (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
-(1ULL << VIRTIO_NET_F_GUEST_UFO);
+(1ULL << VIRTIO_NET_F_GUEST_UFO)  |
+(1ULL << VIRTIO_NET_F_GUEST_USO4) |
+(1ULL << VIRTIO_NET_F_GUEST_USO6);
  
  return guest_offloads_mask & features;

  }
@@ -3924,6 +3930,12 @@ static Property virtio_net_properties[] = {
  DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
  DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
  DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
+DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
+  VIRTIO_NET_F_GUEST_USO4, false),
+DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
+  VIRTIO_NET_F_GUEST_USO6, false),
+DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
+  VIRTIO_NET_F_HOST_USO, false),
  DEFINE_PROP_END_OF_LIST(),
  };
  




Re: [PATCH 1/4] tap: Added USO support to tap device.

2023-07-19 Thread Akihiko Odaki
Nitpicking: the subject of this patch is somewhat unconventional. What 
about: "tap: Add USO support to tap device"?


On 2023/07/20 0:21, Yuri Benditovich wrote:

From: Andrew Melnychenko 

Passing additional parameters (USOv4 and USOv6 offloads) when
setting TAP offloads

Signed-off-by: Yuri Benditovich 
Signed-off-by: Andrew Melnychenko 
---
  hw/net/e1000e_core.c |  2 +-
  hw/net/igb_core.c|  2 +-
  hw/net/virtio-net.c  |  4 +++-
  hw/net/vmxnet3.c |  2 ++
  include/net/net.h|  4 ++--
  net/net.c|  4 ++--
  net/tap-bsd.c|  2 +-
  net/tap-linux.c  | 15 ---
  net/tap-linux.h  |  2 ++
  net/tap-solaris.c|  2 +-
  net/tap-stub.c   |  2 +-
  net/tap-win32.c  |  2 +-
  net/tap.c|  6 +++---
  net/tap_int.h|  3 ++-
  14 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index f8aeafa16b..d4055956ad 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -2852,7 +2852,7 @@ e1000e_update_rx_offloads(E1000ECore *core)
  
  if (core->has_vnet) {

  qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
- cso_state, 0, 0, 0, 0);
+ cso_state, 0, 0, 0, 0, 0, 0);
  }
  }
  
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c

index 8b6b75c522..389eef1549 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -2753,7 +2753,7 @@ igb_update_rx_offloads(IGBCore *core)
  
  if (core->has_vnet) {

  qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
- cso_state, 0, 0, 0, 0);
+ cso_state, 0, 0, 0, 0, 0, 0);
  }
  }
  
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c

index 7102ec4817..d2311e7d6e 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -859,7 +859,9 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
  !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
  !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
  !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
-!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
+!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
+!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
+!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
  }
  
  static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)

diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 5dfacb1098..886adae42b 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -1341,6 +1341,8 @@ static void vmxnet3_update_features(VMXNET3State *s)
   s->lro_supported,
   s->lro_supported,
   0,
+ 0,
+ 0,
   0);
  }
  }
diff --git a/include/net/net.h b/include/net/net.h
index 1448d00afb..b5ccfbbffb 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -58,7 +58,7 @@ typedef bool (HasVnetHdr)(NetClientState *);
  typedef bool (HasVnetHdrLen)(NetClientState *, int);
  typedef bool (GetUsingVnetHdr)(NetClientState *);
  typedef void (UsingVnetHdr)(NetClientState *, bool);
-typedef void (SetOffload)(NetClientState *, int, int, int, int, int);
+typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int);
  typedef int (GetVnetHdrLen)(NetClientState *);
  typedef void (SetVnetHdrLen)(NetClientState *, int);
  typedef int (SetVnetLE)(NetClientState *, bool);
@@ -192,7 +192,7 @@ bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
  bool qemu_get_using_vnet_hdr(NetClientState *nc);
  void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
  void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
-  int ecn, int ufo);
+  int ecn, int ufo, int uso4, int uso6);
  int qemu_get_vnet_hdr_len(NetClientState *nc);
  void qemu_set_vnet_hdr_len(NetClientState *nc, int len);
  int qemu_set_vnet_le(NetClientState *nc, bool is_le);
diff --git a/net/net.c b/net/net.c
index 6492ad530e..543e6dec43 100644
--- a/net/net.c
+++ b/net/net.c
@@ -532,13 +532,13 @@ void qemu_using_vnet_hdr(NetClientState *nc, bool enable)
  }
  
  void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,

-  int ecn, int ufo)
+  int ecn, int ufo, int uso4, int uso6)
  {
  if (!nc || !nc->info->set_offload) {
  return;
  }
  
-nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo);

+nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo, uso4, uso6);
  }
  
  int qemu_get_vnet_hdr_len(NetClientState *nc)

diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index 4c98fdd337..abd16a2ad2 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -232,7 +232,7 @@ int tap_fd_set_vnet_be(int fd, int 

Re: [PATCH v3 0/8] vdpa: Send all CVQ state load commands in parallel

2023-07-19 Thread Lei Yang
On Wed, Jul 19, 2023 at 11:25 PM Hawkins Jiawei  wrote:
>
> 在 2023/7/19 20:44, Lei Yang 写道:
> > Hello Hawkins and Michael
> >
> > Looks like there are big changes about vp_vdpa, therefore, if needed,
> > QE can test this series in QE's environment before the patch is
>
> Hi Lei,
>
> This patch series does not modify the code of vp_vdpa. Instead, it only
> modifies how QEMU sends SVQ control commands to the vdpa device.
>
Hi Hawkins

> Considering that the behavior of the vp_vdpa device differs from that
> of real vdpa hardware, would it be possible for you to test this patch
> series on a real vdpa device?

Yes, there is a hardware device to test it on; I will update the test
results ASAP.

BR
Lei
>
> Thanks!
>
>
> > merged, and provide the result.
> >
> > BR
> > Lei
> >
> >
> > On Wed, Jul 19, 2023 at 8:37 PM Hawkins Jiawei  wrote:
> >>
> >> 在 2023/7/19 17:11, Michael S. Tsirkin 写道:
> >>> On Wed, Jul 19, 2023 at 03:53:45PM +0800, Hawkins Jiawei wrote:
>  This patchset allows QEMU to delay polling and checking the device
>  used buffer until either the SVQ is full or control commands shadow
>  buffers are full, instead of polling and checking immediately after
>  sending each SVQ control command, so that QEMU can send all the SVQ
>  control commands in parallel, which have better performance improvement.
> 
>  I use vp_vdpa device to simulate vdpa device, and create 4094 VLANS in
>  guest to build a test environment for sending multiple CVQ state load
>  commands. This patch series can improve latency from 10023 us to
>  8697 us for about 4099 CVQ state load commands, about 0.32 us per 
>  command.
> >>>
> >>> Looks like a tiny improvement.
> >>> At the same time we have O(n^2) behaviour with memory mappings.
> >>
> >> Hi Michael,
> >>
> >> Thanks for your review.
> >>
> >> I wonder why you say "we have O(n^2) behaviour on memory mappings" here?
> >>
> >>   From my understanding, QEMU maps two page-size buffers as control
> >> commands shadow buffers at device startup. These buffers then are used
> >> to cache SVQ control commands, where QEMU fills them with multiple SVQ 
> >> control
> >> commands bytes, flushes them when SVQ descriptors are full or these
> >> control commands shadow buffers reach their capacity.
> >>
> >> QEMU repeats this process until all CVQ state load commands have been
> >> sent in loading.
> >>
> >> In this loading process, only control commands shadow buffers
> >> translation should be relative to memory mappings, which should be
> >> O(log n) behaviour to my understanding(Please correct me if I am wrong).
> >>
> >>> Not saying we must not do this but I think it's worth
> >>> checking where the bottleneck is. My guess would be
> >>> vp_vdpa is not doing things in parallel. Want to try fixing that
> >>
> >> As for "vp_vdpa is not doing things in parallel.", do you mean
> >> the vp_vdpa device cannot process QEMU's SVQ control commands
> >> in parallel?
> >>
> >> In this situation, I will try to use real vdpa hardware to
> >> test the patch series performance.
> >>
> >>> to see how far it can be pushed?
> >>
> >> Currently, I am involved in the "Add virtio-net Control Virtqueue state
> >> restore support" project in Google Summer of Code now. Because I am
> >> uncertain about the time it will take to fix that problem in the vp_vdpa
> >> device, I prefer to complete the gsoc project first.
> >>
> >> Thanks!
> >>
> >>
> >>>
> >>>
>  Note that this patch should be based on
>  patch "Vhost-vdpa Shadow Virtqueue VLAN support" at [1].
> 
>  [1]. https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg03719.html
> 
>  TestStep
>  
>  1. regression testing using vp-vdpa device
>  - For L0 guest, boot QEMU with two virtio-net-pci net device with
>  `ctrl_vq`, `ctrl_rx`, `ctrl_rx_extra` features on, command line like:
>  -device virtio-net-pci,disable-legacy=on,disable-modern=off,
>  iommu_platform=on,mq=on,ctrl_vq=on,guest_announce=off,
>  indirect_desc=off,queue_reset=off,ctrl_rx=on,ctrl_rx_extra=on,...
> 
>  - For L1 guest, apply the patch series and compile the source code,
>  start QEMU with two vdpa device with svq mode on, enable the `ctrl_vq`,
>  `ctrl_rx`, `ctrl_rx_extra` features on, command line like:
>  -netdev type=vhost-vdpa,x-svq=true,...
>  -device virtio-net-pci,mq=on,guest_announce=off,ctrl_vq=on,
>  ctrl_rx=on,ctrl_rx_extra=on...
> 
>  - For L2 source guest, run the following bash command:
>  ```bash
>  #!/bin/sh
> 
>  for idx1 in {0..9}
>  do
>  for idx2 in {0..9}
>  do
>    for idx3 in {0..6}
>    do
>  ip link add macvlan$idx1$idx2$idx3 link eth0
>  address 4a:30:10:19:$idx1$idx2:1$idx3 type macvlan mode bridge
>  ip link set macvlan$idx1$idx2$idx3 up
>    done
>  done
>  done
>  ```
>  - 

[PATCH v3 09/14] tests/tcg/s390x: Test CLGEBR and CGEBRA

2023-07-19 Thread Ilya Leoshkevich
Add a small test to prevent regressions.

Tested-by: Thomas Huth 
Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.target |  5 +
 tests/tcg/s390x/cgebra.c| 32 
 tests/tcg/s390x/clgebr.c| 32 
 3 files changed, 69 insertions(+)
 create mode 100644 tests/tcg/s390x/cgebra.c
 create mode 100644 tests/tcg/s390x/clgebr.c

diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 19fbbc6e531..71bf39b78d3 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -39,12 +39,17 @@ TESTS+=mxdb
 TESTS+=epsw
 TESTS+=larl
 TESTS+=mdeb
+TESTS+=cgebra
+TESTS+=clgebr
 
 cdsg: CFLAGS+=-pthread
 cdsg: LDFLAGS+=-pthread
 
 rxsbg: CFLAGS+=-O2
 
+cgebra: LDFLAGS+=-lm
+clgebr: LDFLAGS+=-lm
+
 include $(S390X_SRC)/pgm-specification.mak
 $(PGM_SPECIFICATION_TESTS): pgm-specification-user.o
 $(PGM_SPECIFICATION_TESTS): LDFLAGS+=pgm-specification-user.o
diff --git a/tests/tcg/s390x/cgebra.c b/tests/tcg/s390x/cgebra.c
new file mode 100644
index 000..f91e10d2d3c
--- /dev/null
+++ b/tests/tcg/s390x/cgebra.c
@@ -0,0 +1,32 @@
+/*
+ * Test the CGEBRA instruction.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include <assert.h>
+#include <fenv.h>
+#include <stdlib.h>
+
+int main(void)
+{
+float r2 = 1E+300;
+long long r1;
+int cc;
+
+feclearexcept(FE_ALL_EXCEPT);
+asm("cgebra %[r1],%[m3],%[r2],%[m4]\n"
+"ipm %[cc]\n"
+: [r1] "=r" (r1)
+, [cc] "=r" (cc)
+: [m3] "i" (5) /* round toward 0 */
+, [r2] "f" (r2)
+, [m4] "i" (8) /* bit 0 is set, but must be ignored; XxC is not set */
+: "cc");
+cc >>= 28;
+
+assert(r1 == 0x7fffffffffffffffLL);
+assert(cc == 3);
+assert(fetestexcept(FE_ALL_EXCEPT) == (FE_INVALID | FE_INEXACT));
+
+return EXIT_SUCCESS;
+}
diff --git a/tests/tcg/s390x/clgebr.c b/tests/tcg/s390x/clgebr.c
new file mode 100644
index 000..d491899b56e
--- /dev/null
+++ b/tests/tcg/s390x/clgebr.c
@@ -0,0 +1,32 @@
+/*
+ * Test the CLGEBR instruction.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include <assert.h>
+#include <fenv.h>
+#include <stdlib.h>
+
+int main(void)
+{
+float r2 = -1;
+long long r1;
+int cc;
+
+feclearexcept(FE_ALL_EXCEPT);
+asm("clgebr %[r1],%[m3],%[r2],%[m4]\n"
+"ipm %[cc]\n"
+: [r1] "=r" (r1)
+, [cc] "=r" (cc)
+: [m3] "i" (5) /* round toward 0 */
+, [r2] "f" (r2)
+, [m4] "i" (8) /* bit 0 is set, but must be ignored; XxC is not set */
+: "cc");
+cc >>= 28;
+
+assert(r1 == 0);
+assert(cc == 3);
+assert(fetestexcept(FE_ALL_EXCEPT) == (FE_INVALID | FE_INEXACT));
+
+return EXIT_SUCCESS;
+}
-- 
2.41.0




[PATCH v3 07/14] target/s390x: Fix assertion failure in VFMIN/VFMAX with type 13

2023-07-19 Thread Ilya Leoshkevich
Type 13 is reserved, so using it should result in specification
exception. Due to an off-by-1 error the code triggers an assertion at a
later point in time instead.

Cc: qemu-sta...@nongnu.org
Fixes: da4807527f3b ("s390x/tcg: Implement VECTOR FP (MAXIMUM|MINIMUM)")
Reviewed-by: David Hildenbrand 
Signed-off-by: Ilya Leoshkevich 
---
 target/s390x/tcg/translate_vx.c.inc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/s390x/tcg/translate_vx.c.inc 
b/target/s390x/tcg/translate_vx.c.inc
index 43dfbfd03f6..f8df121d3d3 100644
--- a/target/s390x/tcg/translate_vx.c.inc
+++ b/target/s390x/tcg/translate_vx.c.inc
@@ -3047,7 +3047,7 @@ static DisasJumpType op_vfmax(DisasContext *s, DisasOps 
*o)
 const uint8_t m5 = get_field(s, m5);
 gen_helper_gvec_3_ptr *fn;
 
-if (m6 == 5 || m6 == 6 || m6 == 7 || m6 > 13) {
+if (m6 == 5 || m6 == 6 || m6 == 7 || m6 >= 13) {
 gen_program_exception(s, PGM_SPECIFICATION);
 return DISAS_NORETURN;
 }
-- 
2.41.0




[PATCH v3 12/14] tests/tcg/s390x: Test MC

2023-07-19 Thread Ilya Leoshkevich
Add a small test to prevent regressions.

Tested-by: Thomas Huth 
Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target |  1 +
 tests/tcg/s390x/mc.S| 56 +
 2 files changed, 57 insertions(+)
 create mode 100644 tests/tcg/s390x/mc.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index 58684d7da71..145e0bfde16 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -24,6 +24,7 @@ ASM_TESTS =   
 \
 lpsw   
\
 lpswe-early
\
 lra
\
+mc 
\
 ssm-early  
\
 stosm-early
\
 unaligned-lowcore
diff --git a/tests/tcg/s390x/mc.S b/tests/tcg/s390x/mc.S
new file mode 100644
index 000..e7466bb4b57
--- /dev/null
+++ b/tests/tcg/s390x/mc.S
@@ -0,0 +1,56 @@
+.org 0x8d
+ilc:
+.org 0x8e
+program_interruption_code:
+.org 0x94
+monitor_class:
+.org 0xb0
+monitor_code:
+.org 0x150
+program_old_psw:
+.org 0x1d0  /* program new PSW */
+.quad 0x18000,pgm   /* 64-bit mode */
+.org 0x200  /* lowcore padding */
+.globl _start
+_start:
+stctg %c8,%c8,c8/* enable only monitor class 1 */
+mvhhi c8+6,0x4000
+lctlg %c8,%c8,c8
+mc_nop:
+mc 123,0
+mc_monitor_event:
+mc 321,1
+j failure
+mc_specification:
+mc 333,16
+j failure
+pgm:
+lgrl %r0,program_old_psw+8  /* ilc adjustment */
+llgc %r1,ilc
+sgr %r0,%r1
+larl %r1,mc_monitor_event   /* dispatch based on old PSW */
+cgrje %r0,%r1,pgm_monitor_event
+larl %r1,mc_specification
+cgrje %r0,%r1,pgm_specification
+j failure
+pgm_monitor_event:
+chhsi program_interruption_code,0x40/* monitor event? */
+jne failure
+chhsi monitor_class,1   /* class from mc_monitor_event? */
+jne failure
+cghsi monitor_code,321  /* code from mc_monitor_event? */
+jne failure
+j mc_specification  /* next test */
+pgm_specification:
+chhsi program_interruption_code,6   /* specification exception? */
+jne failure
+lpswe success_psw
+failure:
+lpswe failure_psw
+.align 8
+c8:
+.quad 0
+success_psw:
+.quad 0x2000000000000,0xfff /* see is_special_wait_psw() */
+failure_psw:
+.quad 0x2000000000000,0 /* disabled wait */
-- 
2.41.0




[PATCH v3 00/14] target/s390x: Miscellaneous TCG fixes, part 2

2023-07-19 Thread Ilya Leoshkevich
v2: https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg03762.html
v2 -> v3: Document the new constraint set (Philippe).
  Fix clang build (Thomas).
  Add T-bs.

v1: https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg03648.html
v1 -> v2: Move the case in 04/14 (David).
  Simplify the reserved type checking in 07/14 (David).
  Add R-bs.

Hi,

Here is another set of fixes for issues found by randomized testing.

Most of them have to do with simple insufficient error handling or
corner cases, but 3/14 and 6/14 took a while to figure out, and
hopefully I got the fixes right. 13/14 is a test for an issue that
Richard has already fixed, but I thought it would be helpful to have it
anyway.

Best regards,
Ilya

Ilya Leoshkevich (14):
  target/s390x: Make CKSM raise an exception if R2 is odd
  target/s390x: Fix CLM with M3=0
  target/s390x: Fix CONVERT TO LOGICAL/FIXED with out-of-range inputs
  target/s390x: Fix ICM with M3=0
  target/s390x: Make MC raise specification exception when class >= 16
  tcg/{i386,s390x}: Add earlyclobber to the op_add2's first output
  target/s390x: Fix assertion failure in VFMIN/VFMAX with type 13
  tests/tcg/s390x: Test CKSM
  tests/tcg/s390x: Test CLGEBR and CGEBRA
  tests/tcg/s390x: Test CLM
  tests/tcg/s390x: Test ICM
  tests/tcg/s390x: Test MC
  tests/tcg/s390x: Test STPQ
  tests/tcg/s390x: Test VCKSM

 target/s390x/tcg/excp_helper.c  |  2 +-
 target/s390x/tcg/fpu_helper.c   |  3 +-
 target/s390x/tcg/mem_helper.c   |  5 +++
 target/s390x/tcg/translate.c| 21 --
 target/s390x/tcg/translate_vx.c.inc |  2 +-
 tcg/i386/tcg-target-con-set.h   |  5 ++-
 tcg/i386/tcg-target.c.inc   |  2 +-
 tcg/s390x/tcg-target-con-set.h  |  8 ++--
 tcg/s390x/tcg-target.c.inc  |  4 +-
 tcg/tcg.c   |  8 +++-
 tests/tcg/s390x/Makefile.softmmu-target |  5 +++
 tests/tcg/s390x/Makefile.target |  6 +++
 tests/tcg/s390x/cgebra.c| 32 ++
 tests/tcg/s390x/cksm.S  | 29 +
 tests/tcg/s390x/clgebr.c| 32 ++
 tests/tcg/s390x/clm.S   | 29 +
 tests/tcg/s390x/icm.S   | 32 ++
 tests/tcg/s390x/mc.S| 56 +
 tests/tcg/s390x/stpq.S  | 20 +
 tests/tcg/s390x/vcksm.c | 31 ++
 tests/tcg/s390x/vx.h|  2 +
 21 files changed, 319 insertions(+), 15 deletions(-)
 create mode 100644 tests/tcg/s390x/cgebra.c
 create mode 100644 tests/tcg/s390x/cksm.S
 create mode 100644 tests/tcg/s390x/clgebr.c
 create mode 100644 tests/tcg/s390x/clm.S
 create mode 100644 tests/tcg/s390x/icm.S
 create mode 100644 tests/tcg/s390x/mc.S
 create mode 100644 tests/tcg/s390x/stpq.S
 create mode 100644 tests/tcg/s390x/vcksm.c

-- 
2.41.0




[PATCH v3 06/14] tcg/{i386, s390x}: Add earlyclobber to the op_add2's first output

2023-07-19 Thread Ilya Leoshkevich
i386 and s390x implementations of op_add2 require an earlyclobber,
which is currently missing. This breaks VCKSM in s390x guests. E.g., on
x86_64 the following op:

add2_i32 tmp2,tmp3,tmp2,tmp3,tmp3,tmp2   dead: 0 2 3 4 5  pref=none,0x

is translated to:

addl %ebx, %r12d
adcl %r12d, %ebx

Introduce a new C_N1_O1_I4 constraint, and make sure that earlyclobber
of aliased outputs is honored.

Cc: qemu-sta...@nongnu.org
Fixes: 82790a870992 ("tcg: Add markup for output requires new register")
Signed-off-by: Ilya Leoshkevich 
---
 tcg/i386/tcg-target-con-set.h  | 5 -
 tcg/i386/tcg-target.c.inc  | 2 +-
 tcg/s390x/tcg-target-con-set.h | 8 +---
 tcg/s390x/tcg-target.c.inc | 4 ++--
 tcg/tcg.c  | 8 +++-
 5 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/tcg/i386/tcg-target-con-set.h b/tcg/i386/tcg-target-con-set.h
index 91ceb0e1da2..5ea3a292f0f 100644
--- a/tcg/i386/tcg-target-con-set.h
+++ b/tcg/i386/tcg-target-con-set.h
@@ -11,6 +11,9 @@
  *
  * C_N1_Im(...) defines a constraint set with 1 output and <m> inputs,
  * except that the output must use a new register.
+ *
+ * C_Nn_Om_Ik(...) defines a constraint set with <n + m> outputs and <k>
+ * inputs, except that the first <n> outputs must use new registers.
  */
 C_O0_I1(r)
 C_O0_I2(L, L)
@@ -53,4 +56,4 @@ C_O2_I1(r, r, L)
 C_O2_I2(a, d, a, r)
 C_O2_I2(r, r, L, L)
 C_O2_I3(a, d, 0, 1, r)
-C_O2_I4(r, r, 0, 1, re, re)
+C_N1_O1_I4(r, r, 0, 1, re, re)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index ab997b5fb39..77482da0709 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -3335,7 +3335,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode 
op)
 case INDEX_op_add2_i64:
 case INDEX_op_sub2_i32:
 case INDEX_op_sub2_i64:
-return C_O2_I4(r, r, 0, 1, re, re);
+return C_N1_O1_I4(r, r, 0, 1, re, re);
 
 case INDEX_op_ctz_i32:
 case INDEX_op_ctz_i64:
diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
index cbad91b2b56..9a420374999 100644
--- a/tcg/s390x/tcg-target-con-set.h
+++ b/tcg/s390x/tcg-target-con-set.h
@@ -8,6 +8,9 @@
  * C_On_Im(...) defines a constraint set with <n> outputs and <m> inputs.
  * Each operand should be a sequence of constraint letters as defined by
  * tcg-target-con-str.h; the constraint combination is inclusive or.
+ *
+ * C_Nn_Om_Ik(...) defines a constraint set with <n + m> outputs and <k>
+ * inputs, except that the first <n> outputs must use new registers.
  */
 C_O0_I1(r)
 C_O0_I2(r, r)
@@ -41,6 +44,5 @@ C_O2_I1(o, m, r)
 C_O2_I2(o, m, 0, r)
 C_O2_I2(o, m, r, r)
 C_O2_I3(o, m, 0, 1, r)
-C_O2_I4(r, r, 0, 1, rA, r)
-C_O2_I4(r, r, 0, 1, ri, r)
-C_O2_I4(r, r, 0, 1, r, r)
+C_N1_O1_I4(r, r, 0, 1, ri, r)
+C_N1_O1_I4(r, r, 0, 1, rA, r)
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index a878acd8ca6..a94f7908d64 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -3229,11 +3229,11 @@ static TCGConstraintSetIndex 
tcg_target_op_def(TCGOpcode op)
 
 case INDEX_op_add2_i32:
 case INDEX_op_sub2_i32:
-return C_O2_I4(r, r, 0, 1, ri, r);
+return C_N1_O1_I4(r, r, 0, 1, ri, r);
 
 case INDEX_op_add2_i64:
 case INDEX_op_sub2_i64:
-return C_O2_I4(r, r, 0, 1, rA, r);
+return C_N1_O1_I4(r, r, 0, 1, rA, r);
 
 case INDEX_op_st_vec:
 return C_O0_I2(v, r);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 652e8ea6b93..ddfe9a96cb7 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -648,6 +648,7 @@ static void tcg_out_movext3(TCGContext *s, const 
TCGMovExtend *i1,
 #define C_O2_I2(O1, O2, I1, I2) C_PFX4(c_o2_i2_, O1, O2, I1, I2),
 #define C_O2_I3(O1, O2, I1, I2, I3) C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3),
 #define C_O2_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, 
I4),
+#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_n1_o1_i4_, O1, O2, I1, I2, 
I3, I4),
 
 typedef enum {
 #include "tcg-target-con-set.h"
@@ -668,6 +669,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode);
 #undef C_O2_I2
 #undef C_O2_I3
 #undef C_O2_I4
+#undef C_N1_O1_I4
 
 /* Put all of the constraint sets into an array, indexed by the enum. */
 
@@ -687,6 +689,7 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode);
 #define C_O2_I2(O1, O2, I1, I2) { .args_ct_str = { #O1, #O2, #I1, #I2 
} },
 #define C_O2_I3(O1, O2, I1, I2, I3) { .args_ct_str = { #O1, #O2, #I1, #I2, 
#I3 } },
 #define C_O2_I4(O1, O2, I1, I2, I3, I4) { .args_ct_str = { #O1, #O2, #I1, #I2, 
#I3, #I4 } },
+#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) { .args_ct_str = { "&" #O1, #O2, 
#I1, #I2, #I3, #I4 } },
 
 static const TCGTargetOpDef constraint_sets[] = {
 #include "tcg-target-con-set.h"
@@ -706,6 +709,7 @@ static const TCGTargetOpDef constraint_sets[] = {
 #undef C_O2_I2
 #undef C_O2_I3
 #undef C_O2_I4
+#undef C_N1_O1_I4
 
 /* Expand the enumerator to be returned from tcg_target_op_def(). */
 
@@ -725,6 +729,7 @@ static const 

[PATCH v3 10/14] tests/tcg/s390x: Test CLM

2023-07-19 Thread Ilya Leoshkevich
Add a small test to prevent regressions.

Tested-by: Thomas Huth 
Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target |  1 +
 tests/tcg/s390x/clm.S   | 29 +
 2 files changed, 30 insertions(+)
 create mode 100644 tests/tcg/s390x/clm.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index e813e318db9..062d8e368aa 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -17,6 +17,7 @@ LDFLAGS=-nostdlib -static
 ASM_TESTS =
\
 bal
\
 cksm   
\
+clm
\
 exrl-ssm-early 
\
 sam
\
 lpsw   
\
diff --git a/tests/tcg/s390x/clm.S b/tests/tcg/s390x/clm.S
new file mode 100644
index 000..17156a81f2a
--- /dev/null
+++ b/tests/tcg/s390x/clm.S
@@ -0,0 +1,29 @@
+.org 0x8e
+program_interruption_code:
+.org 0x1d0 /* program new PSW */
+.quad 0,pgm
+.org 0x200 /* lowcore padding */
+.globl _start
+_start:
+lgrl %r0,op1
+clm %r0,6,op2
+jle failure
+lgrl %r1,bad_addr
+clm %r0,0,0(%r1)
+failure:
+lpswe failure_psw
+pgm:
+chhsi program_interruption_code,5  /* addressing exception? */
+jne failure
+lpswe success_psw
+.align 8
+op1:
+.quad 0x1234567887654321
+op2:
+.quad 0x3456789abcdef012
+bad_addr:
+.quad 0xffffffffffffffff
+success_psw:
+.quad 0x2000000000000,0xfff/* see is_special_wait_psw() */
+failure_psw:
+.quad 0x2000000000000,0/* disabled wait */
-- 
2.41.0




[PATCH v3 08/14] tests/tcg/s390x: Test CKSM

2023-07-19 Thread Ilya Leoshkevich
Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target |  1 +
 tests/tcg/s390x/cksm.S  | 29 +
 2 files changed, 30 insertions(+)
 create mode 100644 tests/tcg/s390x/cksm.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index 242c7b0f83c..e813e318db9 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -16,6 +16,7 @@ LDFLAGS=-nostdlib -static
 
 ASM_TESTS =
\
 bal
\
+cksm   
\
 exrl-ssm-early 
\
 sam
\
 lpsw   
\
diff --git a/tests/tcg/s390x/cksm.S b/tests/tcg/s390x/cksm.S
new file mode 100644
index 000..563fd3d233e
--- /dev/null
+++ b/tests/tcg/s390x/cksm.S
@@ -0,0 +1,29 @@
+.org 0x8e
+program_interruption_code:
+.org 0x1d0 /* program new PSW */
+.quad 0,pgm
+.org 0x200 /* lowcore padding */
+.globl _start
+_start:
+lmg %r0,%r1,cksm_args
+cksm %r2,%r0
+c %r2,cksm_exp
+jne failure
+.insn rre,0xb241,%r2,%r15  /* cksm %r2,%r15 */
+failure:
+lpswe failure_psw
+pgm:
+chhsi program_interruption_code,6  /* specification exception? */
+jne failure
+lpswe success_psw
+cksm_args:
+.quad cksm_buf, 16
+cksm_buf:
+.quad 0xaaaabbbbcccc0000, 0x12345678
+cksm_exp:
+.long 0x89ab1234
+.align 8
+success_psw:
+.quad 0x2000000000000,0xfff/* see is_special_wait_psw() */
+failure_psw:
+.quad 0x2000000000000,0/* disabled wait */
-- 
2.41.0




[PATCH v3 04/14] target/s390x: Fix ICM with M3=0

2023-07-19 Thread Ilya Leoshkevich
When the mask is zero, access exceptions should still be recognized for
1 byte at the second-operand address. CC should be set to 0.

Cc: qemu-sta...@nongnu.org
Fixes: e023e832d0ac ("s390x: translate engine for s390x CPU")
Reviewed-by: David Hildenbrand 
Signed-off-by: Ilya Leoshkevich 
---
 target/s390x/tcg/translate.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index 2f61e879878..2f193339709 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -2522,6 +2522,12 @@ static DisasJumpType op_icm(DisasContext *s, DisasOps *o)
 ccm = ((1ull << len) - 1) << pos;
 break;
 
+case 0:
+/* Recognize access exceptions for the first byte.  */
+tcg_gen_qemu_ld_i64(tmp, o->in2, get_mem_index(s), MO_UB);
+gen_op_movi_cc(s, 0);
+return DISAS_NEXT;
+
 default:
 /* This is going to be a sequence of loads and inserts.  */
 pos = base + 32 - 8;
-- 
2.41.0




[PATCH v3 14/14] tests/tcg/s390x: Test VCKSM

2023-07-19 Thread Ilya Leoshkevich
Add a small test to prevent regressions.

Tested-by: Thomas Huth 
Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.target |  1 +
 tests/tcg/s390x/vcksm.c | 31 +++
 tests/tcg/s390x/vx.h|  2 ++
 3 files changed, 34 insertions(+)
 create mode 100644 tests/tcg/s390x/vcksm.c

diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 71bf39b78d3..1fc98099070 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -58,6 +58,7 @@ TESTS += $(PGM_SPECIFICATION_TESTS)
 Z13_TESTS=vistr
 Z13_TESTS+=lcbb
 Z13_TESTS+=locfhr
+Z13_TESTS+=vcksm
 $(Z13_TESTS): CFLAGS+=-march=z13 -O2
 TESTS+=$(Z13_TESTS)
 
diff --git a/tests/tcg/s390x/vcksm.c b/tests/tcg/s390x/vcksm.c
new file mode 100644
index 000..452daaae6ce
--- /dev/null
+++ b/tests/tcg/s390x/vcksm.c
@@ -0,0 +1,31 @@
+/*
+ * Test the VCKSM instruction.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vx.h"
+
+int main(void)
+{
+S390Vector v1;
+S390Vector v2 = {
+.d[0] = 0xb2261c8140edce49ULL,
+.d[1] = 0x387bf5a433af39d1ULL,
+};
+S390Vector v3 = {
+.d[0] = 0x73b03d2c7f9e654eULL,
+.d[1] = 0x23d74e51fb479877ULL,
+};
+S390Vector exp = {.d[0] = 0xdedd7f8eULL, .d[1] = 0ULL};
+
+asm volatile("vcksm %[v1],%[v2],%[v3]"
+ : [v1] "=v" (v1.v)
+ : [v2] "v" (v2.v)
+ , [v3] "v" (v3.v));
+assert(memcmp(&v1, &exp, sizeof(v1)) == 0);
+
+return EXIT_SUCCESS;
+}
diff --git a/tests/tcg/s390x/vx.h b/tests/tcg/s390x/vx.h
index 02e7fd518a8..00701dbe35f 100644
--- a/tests/tcg/s390x/vx.h
+++ b/tests/tcg/s390x/vx.h
@@ -1,6 +1,8 @@
 #ifndef QEMU_TESTS_S390X_VX_H
 #define QEMU_TESTS_S390X_VX_H
 
+#include <stdint.h>
+
 typedef union S390Vector {
 uint64_t d[2];  /* doubleword */
 uint32_t w[4];  /* word */
-- 
2.41.0




[PATCH v3 11/14] tests/tcg/s390x: Test ICM

2023-07-19 Thread Ilya Leoshkevich
Add a small test to prevent regressions.

Tested-by: Thomas Huth 
Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target |  1 +
 tests/tcg/s390x/icm.S   | 32 +
 2 files changed, 33 insertions(+)
 create mode 100644 tests/tcg/s390x/icm.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index 062d8e368aa..58684d7da71 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -19,6 +19,7 @@ ASM_TESTS =   
 \
 cksm   
\
 clm
\
 exrl-ssm-early 
\
+icm
\
 sam
\
 lpsw   
\
 lpswe-early
\
diff --git a/tests/tcg/s390x/icm.S b/tests/tcg/s390x/icm.S
new file mode 100644
index 000..d24d1f52fb8
--- /dev/null
+++ b/tests/tcg/s390x/icm.S
@@ -0,0 +1,32 @@
+.org 0x8e
+program_interruption_code:
+.org 0x1d0 /* program new PSW */
+.quad 0,pgm
+.org 0x200 /* lowcore padding */
+.globl _start
+_start:
+lgrl %r0,op1
+icm %r0,10,op2
+cg %r0,exp
+jne failure
+lgrl %r1,bad_addr
+icm %r0,0,0(%r1)
+failure:
+lpswe failure_psw
+pgm:
+chhsi program_interruption_code,5  /* addressing exception? */
+jne failure
+lpswe success_psw
+.align 8
+op1:
+.quad 0x1234567887654321
+op2:
+.quad 0x0011223344556677
+exp:
+.quad 0x1234567800651121
+bad_addr:
+.quad 0x
+success_psw:
+.quad 0x2,0xfff/* see is_special_wait_psw() */
+failure_psw:
+.quad 0x2,0/* disabled wait */
-- 
2.41.0




[PATCH v3 13/14] tests/tcg/s390x: Test STPQ

2023-07-19 Thread Ilya Leoshkevich
Add a small test to prevent regressions.

Tested-by: Thomas Huth 
Signed-off-by: Ilya Leoshkevich 
---
 tests/tcg/s390x/Makefile.softmmu-target |  1 +
 tests/tcg/s390x/stpq.S  | 20 
 2 files changed, 21 insertions(+)
 create mode 100644 tests/tcg/s390x/stpq.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index 145e0bfde16..76345b6e643 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -27,6 +27,7 @@ ASM_TESTS =   
 \
 mc 
\
 ssm-early  
\
 stosm-early
\
+stpq   
\
 unaligned-lowcore
 
 include $(S390X_SRC)/pgm-specification.mak
diff --git a/tests/tcg/s390x/stpq.S b/tests/tcg/s390x/stpq.S
new file mode 100644
index 000..687a52eafa7
--- /dev/null
+++ b/tests/tcg/s390x/stpq.S
@@ -0,0 +1,20 @@
+.org 0x200 /* lowcore padding */
+.globl _start
+_start:
+lgrl %r0,value
+lgrl %r1,value+8
+stpq %r0,stored_value
+clc stored_value(16),value
+jne failure
+lpswe success_psw
+failure:
+lpswe failure_psw
+.align 16
+value:
+.quad 0x1234567887654321, 0x8765432112345678
+stored_value:
+.quad 0, 0
+success_psw:
+.quad 0x2,0xfff/* see is_special_wait_psw() */
+failure_psw:
+.quad 0x2,0/* disabled wait */
-- 
2.41.0




[PATCH v3 02/14] target/s390x: Fix CLM with M3=0

2023-07-19 Thread Ilya Leoshkevich
When the mask is zero, access exceptions should still be recognized for
1 byte at the second-operand address. CC should be set to 0.

Reviewed-by: David Hildenbrand 
Cc: qemu-sta...@nongnu.org
Fixes: defb0e3157af ("s390x: Implement opcode helpers")
Signed-off-by: Ilya Leoshkevich 
---
 target/s390x/tcg/mem_helper.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/target/s390x/tcg/mem_helper.c b/target/s390x/tcg/mem_helper.c
index f417fb1183c..d6dc8b32620 100644
--- a/target/s390x/tcg/mem_helper.c
+++ b/target/s390x/tcg/mem_helper.c
@@ -667,6 +667,11 @@ uint32_t HELPER(clm)(CPUS390XState *env, uint32_t r1, 
uint32_t mask,
 HELPER_LOG("%s: r1 0x%x mask 0x%x addr 0x%" PRIx64 "\n", __func__, r1,
mask, addr);
 
+if (!mask) {
+/* Recognize access exceptions for the first byte */
+cpu_ldub_data_ra(env, addr, ra);
+}
+
 while (mask) {
 if (mask & 8) {
 uint8_t d = cpu_ldub_data_ra(env, addr, ra);
-- 
2.41.0




[PATCH v3 05/14] target/s390x: Make MC raise specification exception when class >= 16

2023-07-19 Thread Ilya Leoshkevich
MC requires bit positions 8-11 (upper 4 bits of class) to be zeros,
otherwise it must raise a specification exception.

Cc: qemu-sta...@nongnu.org
Fixes: 20d143e2cab8 ("s390x/tcg: Implement MONITOR CALL")
Reviewed-by: David Hildenbrand 
Signed-off-by: Ilya Leoshkevich 
---
 target/s390x/tcg/excp_helper.c | 2 +-
 target/s390x/tcg/translate.c   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/target/s390x/tcg/excp_helper.c b/target/s390x/tcg/excp_helper.c
index 228aa9f2373..3da337f7c72 100644
--- a/target/s390x/tcg/excp_helper.c
+++ b/target/s390x/tcg/excp_helper.c
@@ -639,7 +639,7 @@ void monitor_event(CPUS390XState *env,
 void HELPER(monitor_call)(CPUS390XState *env, uint64_t monitor_code,
   uint32_t monitor_class)
 {
-g_assert(monitor_class <= 0xff);
+g_assert(monitor_class <= 0xf);
 
 if (env->cregs[8] & (0x8000 >> monitor_class)) {
 monitor_event(env, monitor_code, monitor_class, GETPC());
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index 2f193339709..9a4fd3d8911 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -3184,9 +3184,9 @@ static DisasJumpType op_lcbb(DisasContext *s, DisasOps *o)
 
 static DisasJumpType op_mc(DisasContext *s, DisasOps *o)
 {
-const uint16_t monitor_class = get_field(s, i2);
+const uint8_t monitor_class = get_field(s, i2);
 
-if (monitor_class & 0xff00) {
+if (monitor_class & 0xf0) {
 gen_program_exception(s, PGM_SPECIFICATION);
 return DISAS_NORETURN;
 }
-- 
2.41.0




[PATCH v3 03/14] target/s390x: Fix CONVERT TO LOGICAL/FIXED with out-of-range inputs

2023-07-19 Thread Ilya Leoshkevich
CONVERT TO LOGICAL/FIXED deviate from IEEE 754 in that they raise an
inexact exception on out-of-range inputs. float_flag_invalid_cvti
aligns nicely with that behavior, so convert it to
S390_IEEE_MASK_INEXACT.

Cc: qemu-sta...@nongnu.org
Fixes: defb0e3157af ("s390x: Implement opcode helpers")
Reviewed-by: David Hildenbrand 
Signed-off-by: Ilya Leoshkevich 
---
 target/s390x/tcg/fpu_helper.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/s390x/tcg/fpu_helper.c b/target/s390x/tcg/fpu_helper.c
index 4b7fa58af3e..3d941ed2d28 100644
--- a/target/s390x/tcg/fpu_helper.c
+++ b/target/s390x/tcg/fpu_helper.c
@@ -52,7 +52,8 @@ uint8_t s390_softfloat_exc_to_ieee(unsigned int exc)
 s390_exc |= (exc & float_flag_divbyzero) ? S390_IEEE_MASK_DIVBYZERO : 0;
 s390_exc |= (exc & float_flag_overflow) ? S390_IEEE_MASK_OVERFLOW : 0;
 s390_exc |= (exc & float_flag_underflow) ? S390_IEEE_MASK_UNDERFLOW : 0;
-s390_exc |= (exc & float_flag_inexact) ? S390_IEEE_MASK_INEXACT : 0;
+s390_exc |= (exc & (float_flag_inexact | float_flag_invalid_cvti)) ?
+S390_IEEE_MASK_INEXACT : 0;
 
 return s390_exc;
 }
-- 
2.41.0




[PATCH v3 01/14] target/s390x: Make CKSM raise an exception if R2 is odd

2023-07-19 Thread Ilya Leoshkevich
R2 designates an even-odd register pair; the instruction should raise
a specification exception when R2 is not even.

Cc: qemu-sta...@nongnu.org
Fixes: e023e832d0ac ("s390x: translate engine for s390x CPU")
Reviewed-by: David Hildenbrand 
Signed-off-by: Ilya Leoshkevich 
---
 target/s390x/tcg/translate.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index 6661b27efa4..2f61e879878 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -1991,11 +1991,18 @@ static DisasJumpType op_cxlgb(DisasContext *s, DisasOps 
*o)
 static DisasJumpType op_cksm(DisasContext *s, DisasOps *o)
 {
 int r2 = get_field(s, r2);
-TCGv_i128 pair = tcg_temp_new_i128();
-TCGv_i64 len = tcg_temp_new_i64();
+TCGv_i128 pair;
+TCGv_i64 len;
+
+if (r2 & 1) {
+gen_program_exception(s, PGM_SPECIFICATION);
+return DISAS_NORETURN;
+}
 
+pair = tcg_temp_new_i128();
 gen_helper_cksm(pair, cpu_env, o->in1, o->in2, regs[r2 + 1]);
 set_cc_static(s);
+len = tcg_temp_new_i64();
 tcg_gen_extr_i128_i64(o->out, len, pair);
 
 tcg_gen_add_i64(regs[r2], regs[r2], len);
-- 
2.41.0




Re: [PULL 10/66] tests/qtest: enable tests for virtio-scmi

2023-07-19 Thread Fabiano Rosas
Thomas Huth  writes:

> On 18/07/2023 14.55, Milan Zamazal wrote:
>> Thomas Huth  writes:
>> 
>>> On 11/07/2023 01.02, Michael S. Tsirkin wrote:
 From: Milan Zamazal 
 We don't have a virtio-scmi implementation in QEMU and only support
>>>
 a
 vhost-user backend.  This is very similar to virtio-gpio and we add the 
 same
 set of tests, just passing some vhost-user messages over the control 
 socket.
 Signed-off-by: Milan Zamazal 
 Acked-by: Thomas Huth 
 Message-Id: <20230628100524.342666-4-mzama...@redhat.com>
 Reviewed-by: Michael S. Tsirkin 
 Signed-off-by: Michael S. Tsirkin 
 ---
tests/qtest/libqos/virtio-scmi.h |  34 ++
tests/qtest/libqos/virtio-scmi.c | 174 +++
tests/qtest/vhost-user-test.c|  44 
MAINTAINERS  |   1 +
tests/qtest/libqos/meson.build   |   1 +
5 files changed, 254 insertions(+)
create mode 100644 tests/qtest/libqos/virtio-scmi.h
create mode 100644 tests/qtest/libqos/virtio-scmi.c
>>>
>>>   Hi!
>>>
>>> I'm seeing some random failures with this new scmi test, so far only
>>> on non-x86 systems, e.g.:
>>>
>>>   https://app.travis-ci.com/github/huth/qemu/jobs/606246131#L4774
>>>
>>> It also reproduces on a s390x host here, but only if I run "make check
>>> -j$(nproc)" - if I run the tests single-threaded, the qos-test passes
>>> there. Seems like there is a race somewhere in this test?
>> 
>> Hmm, it's basically the same as virtio-gpio.c test, so it should be OK.
>> Is it possible that the two tests (virtio-gpio.c & virtio-scmi.c)
>> interfere with each other in some way?  Is there possibly a way to
>> serialize them to check?
>
> I think within one qos-test, the sub-tests are already run serialized. But 
> there might be multiple qos-tests running in parallel, e.g. one for the 
> aarch64 target and one for the ppc64 target. And indeed, I can reproduce the 
> problem on my x86 laptop by running this in one terminal window:
>
> for ((x=0;x<1000;x++)); do \
>   QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon \
>   G_TEST_DBUS_DAEMON=.tests/dbus-vmstate-daemon.sh \
>   QTEST_QEMU_BINARY=./qemu-system-ppc64 \
>   MALLOC_PERTURB_=188 QTEST_QEMU_IMG=./qemu-img \
>   tests/qtest/qos-test -p \
>   
> /ppc64/pseries/spapr-pci-host-bridge/pci-bus-spapr/pci-bus/vhost-user-scmi-pci/vhost-user-scmi/vhost-user-scmi-tests/scmi/read-guest-mem/memfile
>  \
>   || break ; \
> done
>
> And this in another terminal window at the same time:
>
> for ((x=0;x<1000;x++)); do \
>   QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon \
>   G_TEST_DBUS_DAEMON=.tests/dbus-vmstate-daemon.sh \
>   QTEST_QEMU_BINARY=./qemu-system-aarch64 \
>   MALLOC_PERTURB_=188 QTEST_QEMU_IMG=./qemu-img \
>   tests/qtest/qos-test -p \
>   
> /aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/vhost-user-scmi-pci/vhost-user-scmi/vhost-user-scmi-tests/scmi/read-guest-mem/memfile
>  \
>   || break ; \
> done
>
> After a while, the aarch64 test broke with:
>
> /aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/vhost-user-scmi-pci/vhost-user-scmi/vhost-user-scmi-tests/scmi/read-guest-mem/memfile:
>  qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost VQ 0 ring restore failed: -22: Invalid argument 
> (22)
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost VQ 1 ring restore failed: -22: Invalid argument 
> (22)
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost_set_vring_call failed 22
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost_set_vring_call failed 22
> qemu-system-aarch64: Failed to write msg. Wrote -1 instead of 20.
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost VQ 0 ring restore failed: -22: Invalid argument 
> (22)
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost VQ 1 ring restore failed: -22: Invalid argument 
> (22)
> qemu-system-aarch64: ../../devel/qemu/hw/pci/msix.c:659: 
> msix_unset_vector_notifiers: Assertion `dev->msix_vector_use_notifier && 
> dev->msix_vector_release_notifier' failed.
> ../../devel/qemu/tests/qtest/libqtest.c:200: kill_qemu() detected QEMU death 
> from signal 6 (Aborted) (core dumped)

If it helps,

it looks like msix_unset_vector_notifiers is being called twice, once
from vu_scmi_set_status() and another from vu_scmi_disconnect():

msix_unset_vector_notifiers
virtio_pci_set_guest_notifiers
vu_scmi_stop
vu_scmi_disconnect   <-
vu_scmi_event
chr_be_event
qemu_chr_be_event
tcp_chr_disconnect_locked
tcp_chr_write
qemu_chr_write_buffer

msix_unset_vector_notifiers
virtio_pci_set_guest_notifiers
vu_scmi_stop
vu_scmi_set_status   <-
virtio_set_status
virtio_vmstate_change
vm_state_notify
do_vm_stop
vm_shutdown
qemu_cleanup



Re: [PATCH v5 6/6] tests/qtest: migration-test: Add tests for file-based migration

2023-07-19 Thread Peter Xu
On Wed, Jul 12, 2023 at 04:07:42PM -0300, Fabiano Rosas wrote:
> Add basic tests for file-based migration.
> 
> Note that we cannot use test_precopy_common because that routine
> expects it to be possible to run the migration live. With the file
> transport there is no live migration because we must wait for the
> source to finish writing the migration data to the file before the
> destination can start reading. Add a new migration function
> specifically to handle the file migration.
> 
> Signed-off-by: Fabiano Rosas 

Reviewed-by: Peter Xu 

-- 
Peter Xu




[PULL 10/14] nbd/server: Refactor to pass full request around

2023-07-19 Thread Eric Blake
Part of NBD's 64-bit headers extension involves passing the client's
requested offset back as part of the reply header (one reason it
stated for this change: converting absolute offsets stored in
NBD_REPLY_TYPE_OFFSET_DATA to relative offsets within the buffer is
easier if the absolute offset of the buffer is also available).  This
is a refactoring patch to pass the full request around the reply
stack, rather than just the handle, so that later patches can then
access request->from when extended headers are active.  Meanwhile,
this patch enables us to now assert that simple replies are only
attempted when appropriate, and otherwise has no semantic change.

Signed-off-by: Eric Blake 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Message-ID: <20230608135653.2918540-5-ebl...@redhat.com>
---
 nbd/server.c | 114 ++-
 1 file changed, 59 insertions(+), 55 deletions(-)

diff --git a/nbd/server.c b/nbd/server.c
index 6698ab46365..26b27d69202 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1893,7 +1893,7 @@ static inline void set_be_simple_reply(NBDSimpleReply 
*reply, uint64_t error,
 }

 static int coroutine_fn nbd_co_send_simple_reply(NBDClient *client,
- uint64_t handle,
+ NBDRequest *request,
  uint32_t error,
  void *data,
  size_t len,
@@ -1907,9 +1907,10 @@ static int coroutine_fn 
nbd_co_send_simple_reply(NBDClient *client,
 };

 assert(!len || !nbd_err);
-trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err),
-   len);
-set_be_simple_reply(, nbd_err, handle);
+assert(!client->structured_reply || request->type != NBD_CMD_READ);
+trace_nbd_co_send_simple_reply(request->handle, nbd_err,
+   nbd_err_lookup(nbd_err), len);
+set_be_simple_reply(, nbd_err, request->handle);

 return nbd_co_send_iov(client, iov, 2, errp);
 }
@@ -1924,7 +1925,7 @@ static int coroutine_fn 
nbd_co_send_simple_reply(NBDClient *client,
  */
 static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
 size_t niov, uint16_t flags, uint16_t type,
-uint64_t handle)
+NBDRequest *request)
 {
 /* TODO - handle structured vs. extended replies */
 NBDStructuredReplyChunk *chunk = iov->iov_base;
@@ -1939,12 +1940,12 @@ static inline void set_be_chunk(NBDClient *client, 
struct iovec *iov,
 stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
 stw_be_p(&chunk->flags, flags);
 stw_be_p(&chunk->type, type);
-stq_be_p(&chunk->handle, handle);
+stq_be_p(&chunk->handle, request->handle);
 stl_be_p(&chunk->length, length);
 }

 static int coroutine_fn nbd_co_send_chunk_done(NBDClient *client,
-   uint64_t handle,
+   NBDRequest *request,
Error **errp)
 {
 NBDReply hdr;
@@ -1952,15 +1953,15 @@ static int coroutine_fn 
nbd_co_send_chunk_done(NBDClient *client,
 {.iov_base = },
 };

-trace_nbd_co_send_chunk_done(handle);
+trace_nbd_co_send_chunk_done(request->handle);
 set_be_chunk(client, iov, 1, NBD_REPLY_FLAG_DONE,
- NBD_REPLY_TYPE_NONE, handle);
+ NBD_REPLY_TYPE_NONE, request);

 return nbd_co_send_iov(client, iov, 1, errp);
 }

 static int coroutine_fn nbd_co_send_chunk_read(NBDClient *client,
-   uint64_t handle,
+   NBDRequest *request,
uint64_t offset,
void *data,
size_t size,
@@ -1976,16 +1977,16 @@ static int coroutine_fn 
nbd_co_send_chunk_read(NBDClient *client,
 };

 assert(size);
-trace_nbd_co_send_chunk_read(handle, offset, data, size);
+trace_nbd_co_send_chunk_read(request->handle, offset, data, size);
 set_be_chunk(client, iov, 3, final ? NBD_REPLY_FLAG_DONE : 0,
- NBD_REPLY_TYPE_OFFSET_DATA, handle);
+ NBD_REPLY_TYPE_OFFSET_DATA, request);
 stq_be_p(, offset);

 return nbd_co_send_iov(client, iov, 3, errp);
 }
-
+/*ebb*/
 static int coroutine_fn nbd_co_send_chunk_error(NBDClient *client,
-uint64_t handle,
+NBDRequest *request,
 uint32_t error,
 const char *msg,
 Error **errp)
@@ -2000,10 +2001,10 @@ static int coroutine_fn 

[PULL 06/14] qemu-nbd: make verbose bool and local variable in main()

2023-07-19 Thread Eric Blake
From: "Denis V. Lunev" 

Pass 'verbose' to nbd_client_thread() inside NbdClientOpts, which looks
a little cleaner, and make it a bool since it is actually used as one.

Signed-off-by: Denis V. Lunev 
CC: Eric Blake 
CC: Vladimir Sementsov-Ogievskiy 
Message-ID: <20230717202520.236999-1-...@openvz.org>
Reviewed-by: Eric Blake 
Signed-off-by: Eric Blake 
---
 qemu-nbd.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index e30c9ac1793..5b2757920c1 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -73,7 +73,6 @@

 #define MBR_SIZE 512

-static int verbose;
 static char *srcpath;
 static SocketAddress *saddr;
 static int persistent = 0;
@@ -275,6 +274,7 @@ static void *show_parts(void *arg)
 struct NbdClientOpts {
 char *device;
 bool fork_process;
+bool verbose;
 };

 static void *nbd_client_thread(void *arg)
@@ -318,7 +318,7 @@ static void *nbd_client_thread(void *arg)
 /* update partition table */
 pthread_create(_parts_thread, NULL, show_parts, opts->device);

-if (verbose && !opts->fork_process) {
+if (opts->verbose && !opts->fork_process) {
 fprintf(stderr, "NBD device %s is now connected to %s\n",
 opts->device, srcpath);
 } else {
@@ -582,6 +582,7 @@ int main(int argc, char **argv)
 const char *tlshostname = NULL;
 bool imageOpts = false;
 bool writethrough = false; /* Client will flush as needed. */
+bool verbose = false;
 bool fork_process = false;
 bool list = false;
 unsigned socket_activation;
@@ -746,7 +747,7 @@ int main(int argc, char **argv)
 }
 break;
 case 'v':
-verbose = 1;
+verbose = true;
 break;
 case 'V':
 version(argv[0]);
@@ -1147,6 +1148,7 @@ int main(int argc, char **argv)
 struct NbdClientOpts opts = {
 .device = device,
 .fork_process = fork_process,
+.verbose = verbose,
 };

 ret = pthread_create(_thread, NULL, nbd_client_thread, );
-- 
2.41.0




[PULL 03/14] qemu-nbd: properly report error if qemu_daemon() is failed

2023-07-19 Thread Eric Blake
From: "Denis V. Lunev" 

errno is overwritten by the dup2() call just below qemu_daemon(), so the
wrong error was reported to the caller. Fix accordingly.

Signed-off-by: Denis V. Lunev 
CC: Eric Blake 
CC: Vladimir Sementsov-Ogievskiy 
Message-ID: <20230717145544.194786-5-...@openvz.org>
Reviewed-by: Eric Blake 
[eblake: reorder patch series]
Signed-off-by: Eric Blake 
---
 qemu-nbd.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 186ce9474c3..5a8ae1f7472 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -932,14 +932,17 @@ int main(int argc, char **argv)
 error_report("Failed to fork: %s", strerror(errno));
 exit(EXIT_FAILURE);
 } else if (pid == 0) {
+int saved_errno;
+
 close(stderr_fd[0]);

 ret = qemu_daemon(1, 0);
+saved_errno = errno;/* dup2 will overwrite error below */

 /* Temporarily redirect stderr to the parent's pipe...  */
 dup2(stderr_fd[1], STDERR_FILENO);
 if (ret < 0) {
-error_report("Failed to daemonize: %s", strerror(errno));
+error_report("Failed to daemonize: %s", strerror(saved_errno));
 exit(EXIT_FAILURE);
 }

-- 
2.41.0




Re: [Libguestfs] [PATCH v4 09/24] nbd: Replace bool structured_reply with mode enum

2023-07-19 Thread Eric Blake
On Mon, Jun 12, 2023 at 02:24:52PM -0500, Eric Blake wrote:
> On Mon, Jun 12, 2023 at 06:07:59PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> > On 08.06.23 16:56, Eric Blake wrote:
> > > The upcoming patches for 64-bit extensions require various points in
> > > the protocol to make decisions based on what was negotiated.  While we
> > > could easily add a 'bool extended_headers' alongside the existing
> > > 'bool structured_reply', this does not scale well if more modes are
> > > added in the future.  Better is to expose the mode enum added in the
> > > previous patch out to a wider use in the code base.
> > > 
> > > Where the code previously checked for structured_reply being set or
> > > clear, it now prefers checking for an inequality; this works because
> > > the modes are in a continuum of increasing abilities, and allows us to
> > > touch fewer places if we ever insert other modes in the middle of the
> > > enum.  There should be no semantic change in this patch.
> > > 
> > > Signed-off-by: Eric Blake 
> > > ---
> > > 
> > > v4: new patch, expanding enum idea from v3 4/14
> > > ---
> > 
> > [..]
> > 
> > > diff --git a/nbd/server.c b/nbd/server.c
> > > index 8486b64b15d..bade4f7990c 100644
> > > --- a/nbd/server.c
> > > +++ b/nbd/server.c
> 
> > > @@ -1261,13 +1262,13 @@ static int nbd_negotiate_options(NBDClient 
> > > *client, Error **errp)
> > >   case NBD_OPT_STRUCTURED_REPLY:
> > >   if (length) {
> > >   ret = nbd_reject_length(client, false, errp);
> > > -} else if (client->structured_reply) {
> > > +} else if (client->mode >= NBD_MODE_STRUCTURED) {
> > >   ret = nbd_negotiate_send_rep_err(
> > >   client, NBD_REP_ERR_INVALID, errp,
> > >   "structured reply already negotiated");
> > >   } else {
> > >   ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, 
> > > errp);
> > > -client->structured_reply = true;
> > > +client->mode = NBD_MODE_STRUCTURED;
> > 
> > Hmm. in all other cases in server code client.mode remains zero = OLDSTYLE, 
> > which is not quite correct.
> 
> Good catch.  Consider this squashed in (note that as a server we NEVER
> talk NBD_MODE_OLDSTYLE - we ripped that out back in commit 7f7dfe2a;
> but whether we end up on EXPORT_NAME or SIMPLE depends on the client's
> response to our initial flag advertisement.  The only reason I didn't
> spot it sooner is that in the server, all subsequent checks of
> client->mode grouped OLDSTYLE, EXPORT_NAME, and SIMPLE into the same
handling.)

To move things along, I have now staged 1-8 in my NBD queue for a pull
request, and will then repost this patch and the remainder of the
series as v5, to make it easier to pick up the final needed R-b.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org




[PULL 13/14] nbd/client: Add safety check on chunk payload length

2023-07-19 Thread Eric Blake
Our existing use of structured replies either reads into a qiov capped
at 32M (NBD_CMD_READ) or caps allocation to 1000 bytes (see
NBD_MAX_MALLOC_PAYLOAD in block/nbd.c).  But the existing length
checks are rather late; if we encounter a buggy (or malicious) server
that sends a super-large payload length, we should drop the connection
right then rather than assuming the layer on top will be careful.
This becomes more important when we permit 64-bit lengths which are
even more likely to have the potential for attempted denial of service
abuse.

Signed-off-by: Eric Blake 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Message-ID: <20230608135653.2918540-8-ebl...@redhat.com>
---
 nbd/client.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/nbd/client.c b/nbd/client.c
index ea3590ca3d0..1b5569556fe 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -1413,6 +1413,18 @@ static int nbd_receive_structured_reply_chunk(QIOChannel 
*ioc,
 chunk->cookie = be64_to_cpu(chunk->cookie);
 chunk->length = be32_to_cpu(chunk->length);

+/*
+ * Because we use BLOCK_STATUS with REQ_ONE, and cap READ requests
+ * at 32M, no valid server should send us payload larger than
+ * this.  Even if we stopped using REQ_ONE, sane servers will cap
+ * the number of extents they return for block status.
+ */
+if (chunk->length > NBD_MAX_BUFFER_SIZE + sizeof(NBDStructuredReadData)) {
+error_setg(errp, "server chunk %" PRIu32 " (%s) payload is too long",
+   chunk->type, nbd_rep_lookup(chunk->type));
+return -EINVAL;
+}
+
 return 0;
 }

-- 
2.41.0




Re: [PULL 1/1] hw/nvme: fix endianness issue for shadow doorbells

2023-07-19 Thread Michael Tokarev

19.07.2023 10:36, Klaus Jensen wrote:
pu(req->cmd.dptr.prp2);

+uint32_t v;



  if (sq) {
+v = cpu_to_le32(sq->tail);



-pci_dma_write(pci, sq->db_addr, &sq->tail, sizeof(sq->tail));
+pci_dma_write(pci, sq->db_addr, &v, sizeof(sq->tail));


This and similar cases hurt my eyes.

Why do we pass the address of v here, but use sizeof(sq->tail)?

Yes, I know both in theory should be of the same size, but heck,
this is puzzling at best, and confusing in a regular case.

Dunno how it slipped through the review; it instantly caught my eye
in a row of applied patches.

Also, why is v computed a few lines before it is used, with
some expressions between the assignment and usage?

How about the following patch:

From: Michael Tokarev 
Date: Wed, 19 Jul 2023 23:10:53 +0300
Subject: [PATCH trivial] hw/nvme: fix sizeof() misuse and move endianness 
conversions
 closer to users

Signed-off-by: Michael Tokarev 
Fixes: ea3c76f1494d0
---
 hw/nvme/ctrl.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index dadc2dc7da..e33b28cf66 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -6820,6 +6820,4 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)

 if (sq) {
-v = cpu_to_le32(sq->tail);
-
 /*
  * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
@@ -6829,5 +6827,6 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
 sq->db_addr = dbs_addr + (i << 3);
 sq->ei_addr = eis_addr + (i << 3);
-pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail));
+v = cpu_to_le32(sq->tail);
+pci_dma_write(pci, sq->db_addr, , sizeof(v));

 if (n->params.ioeventfd && sq->sqid != 0) {
@@ -6839,10 +6838,9 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)

 if (cq) {
-v = cpu_to_le32(cq->head);
-
 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
-pci_dma_write(pci, cq->db_addr, , sizeof(cq->head));
+v = cpu_to_le32(cq->head);
+pci_dma_write(pci, cq->db_addr, , sizeof(v));

 if (n->params.ioeventfd && cq->cqid != 0) {
@@ -7661,5 +7659,5 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
 if (!qid && n->dbbuf_enabled) {
 v = cpu_to_le32(cq->head);
-pci_dma_write(pci, cq->db_addr, , sizeof(cq->head));
+pci_dma_write(pci, cq->db_addr, , sizeof(v));
 }
 if (start_sqs) {
@@ -7721,6 +7719,4 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
 sq->tail = new_tail;
 if (!qid && n->dbbuf_enabled) {
-v = cpu_to_le32(sq->tail);
-
 /*
  * The spec states "the host shall also update the controller's
@@ -7736,5 +7732,6 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
  * so we can't trust reading it for an appropriate sq tail.
  */
-pci_dma_write(pci, sq->db_addr, , sizeof(sq->tail));
+v = cpu_to_le32(sq->tail);
+pci_dma_write(pci, sq->db_addr, , sizeof(v));
 }





[PULL 09/14] nbd/server: Prepare for alternate-size headers

2023-07-19 Thread Eric Blake
Upstream NBD now documents[1] an extension that supports 64-bit effect
lengths in requests.  As part of that extension, the size of the reply
headers will change in order to permit a 64-bit length in the reply
for symmetry[2].  Additionally, where the reply header is currently 16
bytes for a simple reply and 20 bytes for a structured reply, with the
extension enabled there will be only one extended reply header, of 32
bytes, with both structured and extended modes sending identical
payloads for chunked replies.

Since we are already wired up to use iovecs, it is easiest to allow
for this change in header size by splitting each structured reply
across multiple iovecs, one for the header (which will become wider in
a future patch according to client negotiation), and the other(s) for
the chunk payload, and removing the header from the payload struct
definitions.  Rename the affected functions with s/structured/chunk/
to make it obvious that the code will be reused in extended mode.

Interestingly, the client side code never utilized the packed types,
so only the server code needs to be updated.

[1] 
https://github.com/NetworkBlockDevice/nbd/blob/extension-ext-header/doc/proto.md
as of NBD commit e6f3b94a934

[2] Note that on the surface, this is because some future server might
permit a 4G+ NBD_CMD_READ and need to reply with that much data in one
transaction.  But even though the extended reply length is widened to
64 bits, for now the NBD spec is clear that servers will not reply
with more than a maximum payload bounded by the 32-bit
NBD_INFO_BLOCK_SIZE field; allowing a client and server to mutually
agree to transactions larger than 4G would require yet another
extension.

Signed-off-by: Eric Blake 
Message-ID: <20230608135653.2918540-4-ebl...@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/nbd.h |   8 +--
 nbd/server.c| 137 ++--
 nbd/trace-events|   8 +--
 3 files changed, 88 insertions(+), 65 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 9dcb5357d15..ee71af099a3 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -97,28 +97,28 @@ typedef union NBDReply {

 /* Header of chunk for NBD_REPLY_TYPE_OFFSET_DATA */
 typedef struct NBDStructuredReadData {
-NBDStructuredReplyChunk h; /* h.length >= 9 */
+/* header's .length >= 9 */
 uint64_t offset;
 /* At least one byte of data payload follows, calculated from h.length */
 } QEMU_PACKED NBDStructuredReadData;

 /* Complete chunk for NBD_REPLY_TYPE_OFFSET_HOLE */
 typedef struct NBDStructuredReadHole {
-NBDStructuredReplyChunk h; /* h.length == 12 */
+/* header's length == 12 */
 uint64_t offset;
 uint32_t length;
 } QEMU_PACKED NBDStructuredReadHole;

 /* Header of all NBD_REPLY_TYPE_ERROR* errors */
 typedef struct NBDStructuredError {
-NBDStructuredReplyChunk h; /* h.length >= 6 */
+/* header's length >= 6 */
 uint32_t error;
 uint16_t message_length;
 } QEMU_PACKED NBDStructuredError;

 /* Header of NBD_REPLY_TYPE_BLOCK_STATUS */
 typedef struct NBDStructuredMeta {
-NBDStructuredReplyChunk h; /* h.length >= 12 (at least one extent) */
+/* header's length >= 12 (at least one extent) */
 uint32_t context_id;
 /* extents follows */
 } QEMU_PACKED NBDStructuredMeta;
diff --git a/nbd/server.c b/nbd/server.c
index febe001a399..6698ab46365 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1,5 +1,5 @@
 /*
- *  Copyright (C) 2016-2022 Red Hat, Inc.
+ *  Copyright Red Hat
  *  Copyright (C) 2005  Anthony Liguori 
  *
  *  Network Block Device Server Side
@@ -1906,16 +1906,36 @@ static int coroutine_fn 
nbd_co_send_simple_reply(NBDClient *client,
 {.iov_base = data, .iov_len = len}
 };

+assert(!len || !nbd_err);
 trace_nbd_co_send_simple_reply(handle, nbd_err, nbd_err_lookup(nbd_err),
len);
 set_be_simple_reply(, nbd_err, handle);

-return nbd_co_send_iov(client, iov, len ? 2 : 1, errp);
+return nbd_co_send_iov(client, iov, 2, errp);
 }

-static inline void set_be_chunk(NBDStructuredReplyChunk *chunk, uint16_t flags,
-uint16_t type, uint64_t handle, uint32_t 
length)
+/*
+ * Prepare the header of a reply chunk for network transmission.
+ *
+ * On input, @iov is partially initialized: iov[0].iov_base must point
+ * to an uninitialized NBDReply, while the remaining @niov elements
+ * (if any) must be ready for transmission.  This function then
+ * populates iov[0] for transmission.
+ */
+static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
+size_t niov, uint16_t flags, uint16_t type,
+uint64_t handle)
 {
+/* TODO - handle structured vs. extended replies */
+NBDStructuredReplyChunk *chunk = iov->iov_base;
+size_t i, length = 0;
+
+for (i = 1; i < niov; i++) {
+length += iov[i].iov_len;
+}

[PULL 08/14] nbd: Consistent typedef usage in header

2023-07-19 Thread Eric Blake
We had a mix of struct declarations followed by typedefs, and direct
struct definitions as part of a typedef.  Pick a single style.  Also
float forward declarations of opaque types to the top of the file,
rather than interspersed with function declarations, which will help a
future patch that wants to expose yet another opaque type that will be
referenced in NBDRequest.  No semantic impact.

Signed-off-by: Eric Blake 
Message-ID: <20230608135653.2918540-3-ebl...@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy 
[eblake: alter patch per mailing list feedback]
Signed-off-by: Eric Blake 
---
 include/block/nbd.h | 31 +--
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index a4c98169c39..9dcb5357d15 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (C) 2016-2022 Red Hat, Inc.
+ *  Copyright Red Hat
  *  Copyright (C) 2005  Anthony Liguori 
  *
  *  Network Block Device
@@ -26,24 +26,26 @@
 #include "qapi/error.h"
 #include "qemu/bswap.h"

+typedef struct NBDExport NBDExport;
+typedef struct NBDClient NBDClient;
+typedef struct NBDClientConnection NBDClientConnection;
+
 extern const BlockExportDriver blk_exp_nbd;

 /* Handshake phase structs - this struct is passed on the wire */

-struct NBDOption {
+typedef struct NBDOption {
 uint64_t magic; /* NBD_OPTS_MAGIC */
 uint32_t option; /* NBD_OPT_* */
 uint32_t length;
-} QEMU_PACKED;
-typedef struct NBDOption NBDOption;
+} QEMU_PACKED NBDOption;

-struct NBDOptionReply {
+typedef struct NBDOptionReply {
 uint64_t magic; /* NBD_REP_MAGIC */
 uint32_t option; /* NBD_OPT_* */
 uint32_t type; /* NBD_REP_* */
 uint32_t length;
-} QEMU_PACKED;
-typedef struct NBDOptionReply NBDOptionReply;
+} QEMU_PACKED NBDOptionReply;

 typedef struct NBDOptionReplyMetaContext {
 NBDOptionReply h; /* h.type = NBD_REP_META_CONTEXT, h.length > 4 */
@@ -56,14 +58,13 @@ typedef struct NBDOptionReplyMetaContext {
  * Note: these are _NOT_ the same as the network representation of an NBD
  * request and reply!
  */
-struct NBDRequest {
+typedef struct NBDRequest {
 uint64_t handle;
 uint64_t from;
 uint32_t len;
 uint16_t flags; /* NBD_CMD_FLAG_* */
 uint16_t type; /* NBD_CMD_* */
-};
-typedef struct NBDRequest NBDRequest;
+} NBDRequest;

 typedef struct NBDSimpleReply {
 uint32_t magic;  /* NBD_SIMPLE_REPLY_MAGIC */
@@ -282,7 +283,7 @@ static inline bool nbd_reply_type_is_error(int type)
 #define NBD_ESHUTDOWN  108

 /* Details collected by NBD_OPT_EXPORT_NAME and NBD_OPT_GO */
-struct NBDExportInfo {
+typedef struct NBDExportInfo {
 /* Set by client before nbd_receive_negotiate() */
 bool request_sizes;
 char *x_dirty_bitmap;
@@ -310,8 +311,7 @@ struct NBDExportInfo {
 char *description;
 int n_contexts;
 char **contexts;
-};
-typedef struct NBDExportInfo NBDExportInfo;
+} NBDExportInfo;

 int nbd_receive_negotiate(AioContext *aio_context, QIOChannel *ioc,
   QCryptoTLSCreds *tlscreds,
@@ -330,9 +330,6 @@ int nbd_client(int fd);
 int nbd_disconnect(int fd);
 int nbd_errno_to_system_errno(int err);

-typedef struct NBDExport NBDExport;
-typedef struct NBDClient NBDClient;
-
 void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk);

 AioContext *nbd_export_aio_context(NBDExport *exp);
@@ -409,8 +406,6 @@ const char *nbd_cmd_lookup(uint16_t info);
 const char *nbd_err_lookup(int err);

 /* nbd/client-connection.c */
-typedef struct NBDClientConnection NBDClientConnection;
-
 void nbd_client_connection_enable_retry(NBDClientConnection *conn);

 NBDClientConnection *nbd_client_connection_new(const SocketAddress *saddr,
-- 
2.41.0




[PULL 05/14] qemu-nbd: handle dup2() error when qemu-nbd finished setup process

2023-07-19 Thread Eric Blake
From: "Denis V. Lunev" 

Fail on error, we are in trouble.

Signed-off-by: Denis V. Lunev 
CC: Eric Blake 
CC: Vladimir Sementsov-Ogievskiy 
Message-ID: <20230717145544.194786-6-...@openvz.org>
Reviewed-by: Eric Blake 
[eblake: avoid intermediate variable]
Signed-off-by: Eric Blake 
---
 qemu-nbd.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index f27613cb572..e30c9ac1793 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -323,7 +323,11 @@ static void *nbd_client_thread(void *arg)
 opts->device, srcpath);
 } else {
 /* Close stderr so that the qemu-nbd process exits.  */
-dup2(STDOUT_FILENO, STDERR_FILENO);
+if (dup2(STDOUT_FILENO, STDERR_FILENO) < 0) {
+error_report("Could not set stderr to /dev/null: %s",
+ strerror(errno));
+exit(EXIT_FAILURE);
+}
 }

 if (nbd_client(fd) < 0) {
@@ -1171,7 +1175,11 @@ int main(int argc, char **argv)
 }

 if (fork_process) {
-dup2(STDOUT_FILENO, STDERR_FILENO);
+if (dup2(STDOUT_FILENO, STDERR_FILENO) < 0) {
+error_report("Could not set stderr to /dev/null: %s",
+ strerror(errno));
+exit(EXIT_FAILURE);
+}
 }

 state = RUNNING;
-- 
2.41.0




[PULL 12/14] nbd/client: Simplify cookie vs. index computation

2023-07-19 Thread Eric Blake
Our code relies on a sentinel cookie value of zero for deciding when a
packet has been handled, as well as relying on array indices between 0
and MAX_NBD_REQUESTS-1 for dereferencing purposes.  As long as we can
symmetrically convert between two forms, there is no reason to go with
the odd choice of using XOR with a random pointer, when we can instead
simplify the mappings with a mere offset of 1.

Using ((uint64_t)-1) as the sentinel instead of NULL such that the two
macros could be entirely eliminated might also be possible, but would
require a more careful audit to find places where we currently rely on
zero-initialization to be interpreted as the sentinel value, so I did
not pursue that course.

Signed-off-by: Eric Blake 
Message-ID: <20230608135653.2918540-7-ebl...@redhat.com>
[eblake: enhance commit message]
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/nbd.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index be3c46c6fee..5322e66166c 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -50,8 +50,8 @@
 #define EN_OPTSTR ":exportname="
 #define MAX_NBD_REQUESTS16

-#define COOKIE_TO_INDEX(bs, cookie) ((cookie) ^ (uint64_t)(intptr_t)(bs))
-#define INDEX_TO_COOKIE(bs, index)  ((index)  ^ (uint64_t)(intptr_t)(bs))
+#define COOKIE_TO_INDEX(cookie) ((cookie) - 1)
+#define INDEX_TO_COOKIE(index)  ((index) + 1)

 typedef struct {
 Coroutine *coroutine;
@@ -420,7 +420,7 @@ static void coroutine_fn GRAPH_RDLOCK 
nbd_reconnect_attempt(BDRVNBDState *s)
 static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t cookie)
 {
 int ret;
-uint64_t ind = COOKIE_TO_INDEX(s, cookie), ind2;
+uint64_t ind = COOKIE_TO_INDEX(cookie), ind2;
 QEMU_LOCK_GUARD(>receive_mutex);

 while (true) {
@@ -435,7 +435,7 @@ static coroutine_fn int nbd_receive_replies(BDRVNBDState 
*s, uint64_t cookie)
  * woken by whoever set s->reply.cookie (or never wait in this
  * yield). So, we should not wake it here.
  */
-ind2 = COOKIE_TO_INDEX(s, s->reply.cookie);
+ind2 = COOKIE_TO_INDEX(s->reply.cookie);
 assert(!s->requests[ind2].receiving);

 s->requests[ind].receiving = true;
@@ -468,7 +468,7 @@ static coroutine_fn int nbd_receive_replies(BDRVNBDState 
*s, uint64_t cookie)
 nbd_channel_error(s, -EINVAL);
 return -EINVAL;
 }
-ind2 = COOKIE_TO_INDEX(s, s->reply.cookie);
+ind2 = COOKIE_TO_INDEX(s->reply.cookie);
 if (ind2 >= MAX_NBD_REQUESTS || !s->requests[ind2].coroutine) {
 nbd_channel_error(s, -EINVAL);
 return -EINVAL;
@@ -519,7 +519,7 @@ nbd_co_send_request(BlockDriverState *bs, NBDRequest 
*request,
 qemu_mutex_unlock(>requests_lock);

 qemu_co_mutex_lock(>send_mutex);
-request->cookie = INDEX_TO_COOKIE(s, i);
+request->cookie = INDEX_TO_COOKIE(i);

 assert(s->ioc);

@@ -832,7 +832,7 @@ static coroutine_fn int nbd_co_do_receive_one_chunk(
 int *request_ret, QEMUIOVector *qiov, void **payload, Error **errp)
 {
 int ret;
-int i = COOKIE_TO_INDEX(s, cookie);
+int i = COOKIE_TO_INDEX(cookie);
 void *local_payload = NULL;
 NBDStructuredReplyChunk *chunk;

@@ -1038,7 +1038,7 @@ static bool coroutine_fn 
nbd_reply_chunk_iter_receive(BDRVNBDState *s,

 break_loop:
 qemu_mutex_lock(>requests_lock);
-s->requests[COOKIE_TO_INDEX(s, cookie)].coroutine = NULL;
+s->requests[COOKIE_TO_INDEX(cookie)].coroutine = NULL;
 s->in_flight--;
 qemu_co_queue_next(>free_sema);
 qemu_mutex_unlock(>requests_lock);
-- 
2.41.0




[PULL 11/14] nbd: s/handle/cookie/ to match NBD spec

2023-07-19 Thread Eric Blake
Externally, libnbd exposed the 64-bit opaque marker for each client
NBD packet as the "cookie", because it was less confusing when
contrasted with 'struct nbd_handle *' holding all libnbd state.  It
also avoids confusion between the noun 'handle' as a way to identify a
packet and the verb 'handle' for reacting to things like signals.
Upstream NBD changed their spec to favor the name "cookie" based on
libnbd's recommendations[1], so we can do likewise.

[1] https://github.com/NetworkBlockDevice/nbd/commit/ca4392eb2b

Signed-off-by: Eric Blake 
Message-ID: <20230608135653.2918540-6-ebl...@redhat.com>
[eblake: typo fix]
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/nbd.h | 11 +++---
 block/nbd.c | 96 +++--
 nbd/client.c| 14 +++
 nbd/server.c| 29 +++---
 nbd/trace-events| 22 +--
 5 files changed, 87 insertions(+), 85 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index ee71af099a3..fb935d56e57 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -59,7 +59,7 @@ typedef struct NBDOptionReplyMetaContext {
  * request and reply!
  */
 typedef struct NBDRequest {
-uint64_t handle;
+uint64_t cookie;
 uint64_t from;
 uint32_t len;
 uint16_t flags; /* NBD_CMD_FLAG_* */
@@ -69,7 +69,7 @@ typedef struct NBDRequest {
 typedef struct NBDSimpleReply {
 uint32_t magic;  /* NBD_SIMPLE_REPLY_MAGIC */
 uint32_t error;
-uint64_t handle;
+uint64_t cookie;
 } QEMU_PACKED NBDSimpleReply;

 /* Header of all structured replies */
@@ -77,7 +77,7 @@ typedef struct NBDStructuredReplyChunk {
 uint32_t magic;  /* NBD_STRUCTURED_REPLY_MAGIC */
 uint16_t flags;  /* combination of NBD_REPLY_FLAG_* */
 uint16_t type;   /* NBD_REPLY_TYPE_* */
-uint64_t handle; /* request handle */
+uint64_t cookie; /* request handle */
 uint32_t length; /* length of payload */
 } QEMU_PACKED NBDStructuredReplyChunk;

@@ -85,13 +85,14 @@ typedef union NBDReply {
 NBDSimpleReply simple;
 NBDStructuredReplyChunk structured;
 struct {
-/* @magic and @handle fields have the same offset and size both in
+/*
+ * @magic and @cookie fields have the same offset and size both in
  * simple reply and structured reply chunk, so let them be accessible
  * without ".simple." or ".structured." specification
  */
 uint32_t magic;
 uint32_t _skip;
-uint64_t handle;
+uint64_t cookie;
 } QEMU_PACKED;
 } NBDReply;

diff --git a/block/nbd.c b/block/nbd.c
index 5aef5cb6bd5..be3c46c6fee 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -1,8 +1,8 @@
 /*
- * QEMU Block driver for  NBD
+ * QEMU Block driver for NBD
  *
  * Copyright (c) 2019 Virtuozzo International GmbH.
- * Copyright (C) 2016 Red Hat, Inc.
+ * Copyright Red Hat
  * Copyright (C) 2008 Bull S.A.S.
  * Author: Laurent Vivier 
  *
@@ -50,8 +50,8 @@
 #define EN_OPTSTR ":exportname="
 #define MAX_NBD_REQUESTS16

-#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ (uint64_t)(intptr_t)(bs))
-#define INDEX_TO_HANDLE(bs, index)  ((index)  ^ (uint64_t)(intptr_t)(bs))
+#define COOKIE_TO_INDEX(bs, cookie) ((cookie) ^ (uint64_t)(intptr_t)(bs))
+#define INDEX_TO_COOKIE(bs, index)  ((index)  ^ (uint64_t)(intptr_t)(bs))

 typedef struct {
 Coroutine *coroutine;
@@ -417,25 +417,25 @@ static void coroutine_fn GRAPH_RDLOCK 
nbd_reconnect_attempt(BDRVNBDState *s)
 reconnect_delay_timer_del(s);
 }

-static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t handle)
+static coroutine_fn int nbd_receive_replies(BDRVNBDState *s, uint64_t cookie)
 {
 int ret;
-uint64_t ind = HANDLE_TO_INDEX(s, handle), ind2;
+uint64_t ind = COOKIE_TO_INDEX(s, cookie), ind2;
 QEMU_LOCK_GUARD(>receive_mutex);

 while (true) {
-if (s->reply.handle == handle) {
+if (s->reply.cookie == cookie) {
 /* We are done */
 return 0;
 }

-if (s->reply.handle != 0) {
+if (s->reply.cookie != 0) {
 /*
  * Some other request is being handled now. It should already be
- * woken by whoever set s->reply.handle (or never wait in this
+ * woken by whoever set s->reply.cookie (or never wait in this
  * yield). So, we should not wake it here.
  */
-ind2 = HANDLE_TO_INDEX(s, s->reply.handle);
+ind2 = COOKIE_TO_INDEX(s, s->reply.cookie);
 assert(!s->requests[ind2].receiving);

 s->requests[ind].receiving = true;
@@ -445,9 +445,9 @@ static coroutine_fn int nbd_receive_replies(BDRVNBDState 
*s, uint64_t handle)
 /*
  * We may be woken for 2 reasons:
  * 1. From this function, executing in parallel coroutine, when our
- *handle is received.
+ *cookie is received.
  * 2. From nbd_co_receive_one_chunk(), 

[PULL 07/14] nbd/client: Use smarter assert

2023-07-19 Thread Eric Blake
Assigning strlen() to a uint32_t and then asserting that it isn't too
large doesn't catch the case of an input string 4G in length.
Thankfully, the incoming strings can never be that large: if the
export name or query is reflecting a string the client got from the
server, we already guarantee that we dropped the NBD connection if the
server sent more than 32M in a single reply to our NBD_OPT_* request;
if the export name is coming from qemu, nbd_receive_negotiate()
asserted that strlen(info->name) <= NBD_MAX_STRING_SIZE; and
similarly, a query string via x->dirty_bitmap coming from the user was
bounds-checked in either qemu-nbd or by the limitations of QMP.
Still, it doesn't hurt to be more explicit in how we write our
assertions to not have to analyze whether inadvertent wraparound is
possible.

Fixes: 93676c88 ("nbd: Don't send oversize strings", v4.2.0)
Reported-by: Dr. David Alan Gilbert 
Signed-off-by: Eric Blake 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Message-ID: <20230608135653.2918540-2-ebl...@redhat.com>
---
 nbd/client.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/nbd/client.c b/nbd/client.c
index 30d5383cb19..ff75722e487 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -650,19 +650,20 @@ static int nbd_send_meta_query(QIOChannel *ioc, uint32_t 
opt,
Error **errp)
 {
 int ret;
-uint32_t export_len = strlen(export);
+uint32_t export_len;
 uint32_t queries = !!query;
 uint32_t query_len = 0;
 uint32_t data_len;
 char *data;
 char *p;

+assert(strnlen(export, NBD_MAX_STRING_SIZE + 1) <= NBD_MAX_STRING_SIZE);
+export_len = strlen(export);
 data_len = sizeof(export_len) + export_len + sizeof(queries);
-assert(export_len <= NBD_MAX_STRING_SIZE);
 if (query) {
+assert(strnlen(query, NBD_MAX_STRING_SIZE + 1) <= NBD_MAX_STRING_SIZE);
 query_len = strlen(query);
 data_len += sizeof(query_len) + query_len;
-assert(query_len <= NBD_MAX_STRING_SIZE);
 } else {
 assert(opt == NBD_OPT_LIST_META_CONTEXT);
 }
-- 
2.41.0




[PULL 04/14] qemu-nbd: properly report error on error in dup2() after qemu_daemon()

2023-07-19 Thread Eric Blake
From: "Denis V. Lunev" 

We are trying to temporarily redirect stderr of the daemonized process to
a pipe to report an error, and that redirection can fail. In that case we
cannot use the error_report() helper, but should write the message directly
into the problematic pipe.

Signed-off-by: Denis V. Lunev 
CC: Eric Blake 
CC: Vladimir Sementsov-Ogievskiy 
Message-ID: <20230717145544.194786-4-...@openvz.org>
Reviewed-by: Eric Blake 
[eblake: rearrange patch series, fix typo]
Signed-off-by: Eric Blake 
---
 qemu-nbd.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 5a8ae1f7472..f27613cb572 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -940,7 +940,20 @@ int main(int argc, char **argv)
 saved_errno = errno;/* dup2 will overwrite error below */

 /* Temporarily redirect stderr to the parent's pipe...  */
-dup2(stderr_fd[1], STDERR_FILENO);
+if (dup2(stderr_fd[1], STDERR_FILENO) < 0) {
+char str[256];
+snprintf(str, sizeof(str),
+ "%s: Failed to link stderr to the pipe: %s\n",
+ g_get_prgname(), strerror(errno));
+/*
+ * We are unable to use error_report() here as we need to get
+ * stderr pointed to the parent's pipe. Write to that pipe
+ * manually.
+ */
+ret = write(stderr_fd[1], str, strlen(str));
+exit(EXIT_FAILURE);
+}
+
 if (ret < 0) {
 error_report("Failed to daemonize: %s", strerror(saved_errno));
 exit(EXIT_FAILURE);
-- 
2.41.0




[PULL 14/14] nbd: Use enum for various negotiation modes

2023-07-19 Thread Eric Blake
Deciphering the hard-coded list of integer return values from
nbd_start_negotiate() will only get more confusing when adding support
for 64-bit extended headers.  Better is to name things in an enum.
Although the function in question is private to client.c, putting the
enum in a public header and including an enum-to-string conversion
will allow its use in more places in upcoming patches.

The enum is intentionally laid out so that operators like <= can be
used to group multiple modes with similar characteristics, and where
the least powerful mode has value 0, even though this patch does not
exploit that.  No semantic change intended.

Signed-off-by: Eric Blake 
Message-ID: <20230608135653.2918540-9-ebl...@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/nbd.h | 11 +++
 nbd/client.c| 46 -
 nbd/common.c| 17 +
 3 files changed, 53 insertions(+), 21 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index fb935d56e57..4428bcffbb9 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -53,6 +53,16 @@ typedef struct NBDOptionReplyMetaContext {
 /* metadata context name follows */
 } QEMU_PACKED NBDOptionReplyMetaContext;

+/* Track results of negotiation */
+typedef enum NBDMode {
+/* Keep this list in a continuum of increasing features. */
+NBD_MODE_OLDSTYLE, /* server lacks newstyle negotiation */
+NBD_MODE_EXPORT_NAME,  /* newstyle but only OPT_EXPORT_NAME safe */
+NBD_MODE_SIMPLE,   /* newstyle but only simple replies */
+NBD_MODE_STRUCTURED,   /* newstyle, structured replies enabled */
+/* TODO add NBD_MODE_EXTENDED */
+} NBDMode;
+
 /* Transmission phase structs
  *
  * Note: these are _NOT_ the same as the network representation of an NBD
@@ -405,6 +415,7 @@ const char *nbd_rep_lookup(uint32_t rep);
 const char *nbd_info_lookup(uint16_t info);
 const char *nbd_cmd_lookup(uint16_t info);
 const char *nbd_err_lookup(int err);
+const char *nbd_mode_lookup(NBDMode mode);

 /* nbd/client-connection.c */
 void nbd_client_connection_enable_retry(NBDClientConnection *conn);
diff --git a/nbd/client.c b/nbd/client.c
index 1b5569556fe..479208d5d9d 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -875,10 +875,7 @@ static int nbd_list_meta_contexts(QIOChannel *ioc,
  * Start the handshake to the server.  After a positive return, the server
  * is ready to accept additional NBD_OPT requests.
  * Returns: negative errno: failure talking to server
- *  0: server is oldstyle, must call nbd_negotiate_finish_oldstyle
- *  1: server is newstyle, but can only accept EXPORT_NAME
- *  2: server is newstyle, but lacks structured replies
- *  3: server is newstyle and set up for structured replies
+ *  non-negative: enum NBDMode describing server abilities
  */
 static int nbd_start_negotiate(AioContext *aio_context, QIOChannel *ioc,
QCryptoTLSCreds *tlscreds,
@@ -969,16 +966,16 @@ static int nbd_start_negotiate(AioContext *aio_context, 
QIOChannel *ioc,
 return -EINVAL;
 }
 }
-return 2 + result;
+return result ? NBD_MODE_STRUCTURED : NBD_MODE_SIMPLE;
 } else {
-return 1;
+return NBD_MODE_EXPORT_NAME;
 }
 } else if (magic == NBD_CLIENT_MAGIC) {
 if (tlscreds) {
 error_setg(errp, "Server does not support STARTTLS");
 return -EINVAL;
 }
-return 0;
+return NBD_MODE_OLDSTYLE;
 } else {
 error_setg(errp, "Bad server magic received: 0x%" PRIx64, magic);
 return -EINVAL;
@@ -1032,6 +1029,9 @@ int nbd_receive_negotiate(AioContext *aio_context, 
QIOChannel *ioc,

 result = nbd_start_negotiate(aio_context, ioc, tlscreds, hostname, outioc,
  info->structured_reply, , errp);
+if (result < 0) {
+return result;
+}

 info->structured_reply = false;
 info->base_allocation = false;
@@ -1039,8 +1039,8 @@ int nbd_receive_negotiate(AioContext *aio_context, 
QIOChannel *ioc,
 ioc = *outioc;
 }

-switch (result) {
-case 3: /* newstyle, with structured replies */
+switch ((NBDMode)result) {
+case NBD_MODE_STRUCTURED:
 info->structured_reply = true;
 if (base_allocation) {
 result = nbd_negotiate_simple_meta_context(ioc, info, errp);
@@ -1050,7 +1050,7 @@ int nbd_receive_negotiate(AioContext *aio_context, 
QIOChannel *ioc,
 info->base_allocation = result == 1;
 }
 /* fall through */
-case 2: /* newstyle, try OPT_GO */
+case NBD_MODE_SIMPLE:
 /* Try NBD_OPT_GO first - if it works, we are done (it
  * also gives us a good message if the server requires
  * TLS).  If it is not available, fall back to
@@ -1073,7 +1073,7 @@ int 

[PULL 02/14] qemu-nbd: fix regression with qemu-nbd --fork run over ssh

2023-07-19 Thread Eric Blake
From: "Denis V. Lunev" 

Commit e6df58a5578fee7a50bbf36f4a50a2781cff855d
Author: Hanna Reitz 
Date:   Wed May 8 23:18:18 2019 +0200
qemu-nbd: Do not close stderr

has introduced an interesting regression. Original behavior of
ssh somehost qemu-nbd /home/den/tmp/file -f raw --fork
was the following:
 * qemu-nbd was started as a daemon
 * the command execution is done and ssh exited with success

The patch has changed this behavior and 'ssh' command now hangs forever.

According to the normal specification of the daemon() call, we should
end up with STDERR pointing to /dev/null. That should be done at the
very end of the successful startup sequence when the pipe to the
bootstrap process (used for diagnostics) is no longer needed.

This could be achieved in the same way as was done for the 'qemu-nbd -c' case.
That was commit 0eaf453e, also fixing up e6df58a5. STDOUT copying to
STDERR does the trick.

This also leads to proper 'ssh' connection closing which fixes my
original problem.

Signed-off-by: Denis V. Lunev 
CC: Eric Blake 
CC: Vladimir Sementsov-Ogievskiy 
CC: Hanna Reitz 
CC: 
Message-ID: <20230717145544.194786-3-...@openvz.org>
Reviewed-by: Eric Blake 
Signed-off-by: Eric Blake 
---
 qemu-nbd.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 77f98c736bb..186ce9474c3 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -274,6 +274,7 @@ static void *show_parts(void *arg)

 struct NbdClientOpts {
 char *device;
+bool fork_process;
 };

 static void *nbd_client_thread(void *arg)
@@ -317,7 +318,7 @@ static void *nbd_client_thread(void *arg)
 /* update partition table */
 pthread_create(_parts_thread, NULL, show_parts, opts->device);

-if (verbose) {
+if (verbose && !opts->fork_process) {
 fprintf(stderr, "NBD device %s is now connected to %s\n",
 opts->device, srcpath);
 } else {
@@ -579,7 +580,6 @@ int main(int argc, char **argv)
 bool writethrough = false; /* Client will flush as needed. */
 bool fork_process = false;
 bool list = false;
-int old_stderr = -1;
 unsigned socket_activation;
 const char *pid_file_name = NULL;
 const char *selinux_label = NULL;
@@ -934,11 +934,6 @@ int main(int argc, char **argv)
 } else if (pid == 0) {
 close(stderr_fd[0]);

-/* Remember parent's stderr if we will be restoring it. */
-if (fork_process) {
-old_stderr = dup(STDERR_FILENO);
-}
-
 ret = qemu_daemon(1, 0);

 /* Temporarily redirect stderr to the parent's pipe...  */
@@ -1131,6 +1126,7 @@ int main(int argc, char **argv)
 int ret;
 struct NbdClientOpts opts = {
 .device = device,
+.fork_process = fork_process,
 };

 ret = pthread_create(_thread, NULL, nbd_client_thread, );
@@ -1159,8 +1155,7 @@ int main(int argc, char **argv)
 }

 if (fork_process) {
-dup2(old_stderr, STDERR_FILENO);
-close(old_stderr);
+dup2(STDOUT_FILENO, STDERR_FILENO);
 }

 state = RUNNING;
-- 
2.41.0




[PULL 00/14] NBD patches for 2023-07-19

2023-07-19 Thread Eric Blake
The following changes since commit 2c27fdc7a626408ee2cf30d791aa0b63027c7404:

  Update version for v8.1.0-rc0 release (2023-07-19 20:31:43 +0100)

are available in the Git repository at:

  https://repo.or.cz/qemu/ericb.git tags/pull-nbd-2023-07-19

for you to fetch changes up to bfe04d0a7d5e8a4f4c9014ee7622af2056685974:

  nbd: Use enum for various negotiation modes (2023-07-19 15:26:13 -0500)


NBD patches through 2023-07-19

- Denis V. Lunev: fix hang with 'ssh ... "qemu-nbd -c"'
- Eric Blake: preliminary work towards NBD 64-bit extensions


Denis V. Lunev (6):
  qemu-nbd: pass structure into nbd_client_thread instead of plain char*
  qemu-nbd: fix regression with qemu-nbd --fork run over ssh
  qemu-nbd: properly report error if qemu_daemon() is failed
  qemu-nbd: properly report error on error in dup2() after qemu_daemon()
  qemu-nbd: handle dup2() error when qemu-nbd finished setup process
  qemu-nbd: make verbose bool and local variable in main()

Eric Blake (8):
  nbd/client: Use smarter assert
  nbd: Consistent typedef usage in header
  nbd/server: Prepare for alternate-size headers
  nbd/server: Refactor to pass full request around
  nbd: s/handle/cookie/ to match NBD spec
  nbd/client: Simplify cookie vs. index computation
  nbd/client: Add safety check on chunk payload length
  nbd: Use enum for various negotiation modes

 include/block/nbd.h |  61 +++---
 block/nbd.c |  96 +++---
 nbd/client.c|  79 ++
 nbd/common.c|  17 
 nbd/server.c| 224 +---
 qemu-nbd.c  |  68 +++-
 nbd/trace-events|  22 +++---
 7 files changed, 332 insertions(+), 235 deletions(-)

base-commit: 2c27fdc7a626408ee2cf30d791aa0b63027c7404
-- 
2.41.0




[PULL 01/14] qemu-nbd: pass structure into nbd_client_thread instead of plain char*

2023-07-19 Thread Eric Blake
From: "Denis V. Lunev" 

We are going to pass an additional flag in the next patch.

Signed-off-by: Denis V. Lunev 
CC: Eric Blake 
CC: Vladimir Sementsov-Ogievskiy 
CC: 
Message-ID: <20230717145544.194786-2-...@openvz.org>
Reviewed-by: Eric Blake 
Signed-off-by: Eric Blake 
---
 qemu-nbd.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 4276163564b..77f98c736bb 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -272,9 +272,13 @@ static void *show_parts(void *arg)
 return NULL;
 }

+struct NbdClientOpts {
+char *device;
+};
+
 static void *nbd_client_thread(void *arg)
 {
-char *device = arg;
+struct NbdClientOpts *opts = arg;
 NBDExportInfo info = { .request_sizes = false, .name = g_strdup("") };
 QIOChannelSocket *sioc;
 int fd = -1;
@@ -298,10 +302,10 @@ static void *nbd_client_thread(void *arg)
 goto out;
 }

-fd = open(device, O_RDWR);
+fd = open(opts->device, O_RDWR);
 if (fd < 0) {
 /* Linux-only, we can use %m in printf.  */
-error_report("Failed to open %s: %m", device);
+error_report("Failed to open %s: %m", opts->device);
 goto out;
 }

@@ -311,11 +315,11 @@ static void *nbd_client_thread(void *arg)
 }

 /* update partition table */
-pthread_create(_parts_thread, NULL, show_parts, device);
+pthread_create(_parts_thread, NULL, show_parts, opts->device);

 if (verbose) {
 fprintf(stderr, "NBD device %s is now connected to %s\n",
-device, srcpath);
+opts->device, srcpath);
 } else {
 /* Close stderr so that the qemu-nbd process exits.  */
 dup2(STDOUT_FILENO, STDERR_FILENO);
@@ -1125,8 +1129,11 @@ int main(int argc, char **argv)
 if (device) {
 #if HAVE_NBD_DEVICE
 int ret;
+struct NbdClientOpts opts = {
+.device = device,
+};

-ret = pthread_create(_thread, NULL, nbd_client_thread, device);
+ret = pthread_create(_thread, NULL, nbd_client_thread, );
 if (ret != 0) {
 error_report("Failed to create client thread: %s", strerror(ret));
 exit(EXIT_FAILURE);
-- 
2.41.0




Re: [PULL 10/66] tests/qtest: enable tests for virtio-scmi

2023-07-19 Thread Milan Zamazal
Thomas Huth  writes:

> On 18/07/2023 14.55, Milan Zamazal wrote:
>> Thomas Huth  writes:
>> 
>
>>> On 11/07/2023 01.02, Michael S. Tsirkin wrote:
 From: Milan Zamazal 
 We don't have a virtio-scmi implementation in QEMU and only support
>>>
 a
 vhost-user backend.  This is very similar to virtio-gpio and we add the 
 same
 set of tests, just passing some vhost-user messages over the control 
 socket.
 Signed-off-by: Milan Zamazal 
 Acked-by: Thomas Huth 
 Message-Id: <20230628100524.342666-4-mzama...@redhat.com>
 Reviewed-by: Michael S. Tsirkin 
 Signed-off-by: Michael S. Tsirkin 
 ---
tests/qtest/libqos/virtio-scmi.h |  34 ++
tests/qtest/libqos/virtio-scmi.c | 174 +++
tests/qtest/vhost-user-test.c|  44 
MAINTAINERS  |   1 +
tests/qtest/libqos/meson.build   |   1 +
5 files changed, 254 insertions(+)
create mode 100644 tests/qtest/libqos/virtio-scmi.h
create mode 100644 tests/qtest/libqos/virtio-scmi.c
>>>
>>>   Hi!
>>>
>>> I'm seeing some random failures with this new scmi test, so far only
>>> on non-x86 systems, e.g.:
>>>
>>>   https://app.travis-ci.com/github/huth/qemu/jobs/606246131#L4774
>>>
>>> It also reproduces on a s390x host here, but only if I run "make check
>>> -j$(nproc)" - if I run the tests single-threaded, the qos-test passes
>>> there. Seems like there is a race somewhere in this test?
>> Hmm, it's basically the same as virtio-gpio.c test, so it should be
>> OK.
>> Is it possible that the two tests (virtio-gpio.c & virtio-scmi.c)
>> interfere with each other in some way?  Is there possibly a way to
>> serialize them to check?
>
> I think within one qos-test, the sub-tests are already run
> serialized. 

I see, OK.

> But there might be multiple qos-tests running in parallel, e.g. one
> for the aarch64 target and one for the ppc64 target. And indeed, I can
> reproduce the problem on my x86 laptop by running this in one terminal
> window:
>
> for ((x=0;x<1000;x++)); do \
>  QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon \
>  G_TEST_DBUS_DAEMON=.tests/dbus-vmstate-daemon.sh \
>  QTEST_QEMU_BINARY=./qemu-system-ppc64 \
>  MALLOC_PERTURB_=188 QTEST_QEMU_IMG=./qemu-img \
>  tests/qtest/qos-test -p \
>  
> /ppc64/pseries/spapr-pci-host-bridge/pci-bus-spapr/pci-bus/vhost-user-scmi-pci/vhost-user-scmi/vhost-user-scmi-tests/scmi/read-guest-mem/memfile
> \
>  || break ; \
> done
>
> And this in another terminal window at the same time:
>
> for ((x=0;x<1000;x++)); do \
>  QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon \
>  G_TEST_DBUS_DAEMON=.tests/dbus-vmstate-daemon.sh \
>  QTEST_QEMU_BINARY=./qemu-system-aarch64 \
>  MALLOC_PERTURB_=188 QTEST_QEMU_IMG=./qemu-img \
>  tests/qtest/qos-test -p \
>  
> /aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/vhost-user-scmi-pci/vhost-user-scmi/vhost-user-scmi-tests/scmi/read-guest-mem/memfile
> \
>  || break ; \
> done
>
> After a while, the aarch64 test broke with:
>
> /aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/vhost-user-scmi-pci/vhost-user-scmi/vhost-user-scmi-tests/scmi/read-guest-mem/memfile:
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost VQ 0 ring restore failed: -22: Invalid argument 
> (22)
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost VQ 1 ring restore failed: -22: Invalid argument 
> (22)
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost_set_vring_call failed 22
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost_set_vring_call failed 22
> qemu-system-aarch64: Failed to write msg. Wrote -1 instead of 20.
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost VQ 0 ring restore failed: -22: Invalid argument 
> (22)
> qemu-system-aarch64: Failed to set msg fds.
> qemu-system-aarch64: vhost VQ 1 ring restore failed: -22: Invalid argument 
> (22)
> qemu-system-aarch64: ../../devel/qemu/hw/pci/msix.c:659:
> msix_unset_vector_notifiers: Assertion `dev->msix_vector_use_notifier
> && dev->msix_vector_release_notifier' failed.
> ../../devel/qemu/tests/qtest/libqtest.c:200: kill_qemu() detected QEMU
> death from signal 6 (Aborted) (core dumped)
> **
> ERROR:../../devel/qemu/tests/qtest/qos-test.c:191:subprocess_run_one_test:
> child process
> (/aarch64/virt/generic-pcihost/pci-bus-generic/pci-bus/vhost-user-scmi-pci/vhost-user-scmi/vhost-user-scmi-tests/scmi/read-guest-mem/memfile/subprocess
> [488457]) failed unexpectedly
> Aborted (core dumped)

Interesting, good discovery.

> Can you also reproduce it this way?

Unfortunately not.  I ran the loops several times and everything passed.
I tried to compile and run it in a different distro container and it
passed too.  I also haven't been successful in getting any idea how the
processes could influence 

Re: [PULL 0/1] hw/nvme fixes

2023-07-19 Thread Peter Maydell
On Wed, 19 Jul 2023 at 08:36, Klaus Jensen  wrote:
>
> From: Klaus Jensen 
>
> Hi,
>
> The following changes since commit 361d5397355276e3007825cc17217c1e4d4320f7:
>
>   Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into 
> staging (2023-07-17 15:49:27 +0100)
>
> are available in the Git repository at:
>
>   https://gitlab.com/birkelund/qemu.git tags/nvme-next-pull-request
>
> for you to fetch changes up to ea3c76f1494d0c75873c3b470e6e048202661ad8:
>
>   hw/nvme: fix endianness issue for shadow doorbells (2023-07-19 09:33:54 
> +0200)
>
> 
> hw/nvme fixes
>
> * fix shadow doorbell endian issue
> -BEGIN PGP SIGNATURE-


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.1
for any user-visible changes.

-- PMM



Re: [PULL 0/5] riscv-to-apply queue

2023-07-19 Thread Peter Maydell
On Wed, 19 Jul 2023 at 05:46, Alistair Francis  wrote:
>
> The following changes since commit 361d5397355276e3007825cc17217c1e4d4320f7:
>
>   Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into 
> staging (2023-07-17 15:49:27 +0100)
>
> are available in the Git repository at:
>
>   https://github.com/alistair23/qemu.git tags/pull-riscv-to-apply-20230719-1
>
> for you to fetch changes up to 32be32509987fbe42cf5c2fd3cea3c2ad6eae179:
>
>   target/riscv: Fix LMUL check to use VLEN (2023-07-19 14:37:26 +1000)
>
> 
> Fourth RISC-V PR for 8.1
>
> * Fix LMUL check to use VLEN
> * Fix typo field in NUMA error_report
> * check priv_ver before auto-enable zca/zcd/zcf
> * Fix disas output of upper immediates
> * tidy CPU firmware section
>


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.1
for any user-visible changes.

-- PMM



Re: [RFC PATCH 10/17] misc/i2c_mctp_cxl: Initial device emulation

2023-07-19 Thread Gregory Price
On Wed, Jul 19, 2023 at 09:19:47AM +0100, Jonathan Cameron wrote:
> On Tue, 18 Jul 2023 17:30:57 -0400
> Gregory Price  wrote:
> 
> > On Mon, Jul 17, 2023 at 06:16:39PM +0100, Jonathan Cameron wrote:
> > > @@ -397,8 +401,9 @@ struct CXLType3Dev {
> > >  AddressSpace hostpmem_as;
> > >  CXLComponentState cxl_cstate;
> > >  CXLDeviceState cxl_dstate;
> > > -CXLCCI cci;
> > > -
> > > +CXLCCI cci; /* Primary PCI mailbox CCI */
> > > +CXLCCI oob_mctp_cci; /* Initialized only if targetted */
> > > +  
> > 
> > I've been humming and hawing over this on the MHD stuff because I wanted
> > to figure out how to "add a CCI command" to a type-3 device without
> > either having a billion definitions for CCI command sets - or doing
> > something like this.
> > 
> > I don't hate this design pattern, I just want to ask whether your
> > intent is to end up with CXLType3Dev hosting many CXLCCI's based on what
> > wrapper types you have. 
> > 
> > Example: a type-3 device with mctp pass through and the MHD command set
> > 
> > CXLType3Dev {
> > ...
> > CXLCCI cci;
> > CXLCCI oob_mctp_cci;
> > CXLCCI mhd_cci;
> > ...
> > }
> 
> Yes - that's what I was thinking.  In some cases a CCI may be accessed by
> tunneling on a different CCI on the same device as well as the option
> of tunneling to different devices.
> 
> So far the set that we'll end up with isn't too large. And if some aren't
> used for a given instantiation that's fine if it keeps the code simple.
> We may end up with other MCTP buses and to keep things consistent each one
> will need it's own target CXLCCI. If we need to rethink and make it dynamic
> to some degree we can look at it later.
> 

Maybe a dangerous suggestion.  Right now the CCI's are static:

static const struct cxl_cmd cxl_cmd_set[256][256]

how difficult might it be to allow these tables to be dynamic instead?
Then we could add an interface like this:

void cxl_add_cmd_set(CXLCCI *cci, CXLCCI *cmd_set, payload_max) {
copy(cci, cmd_set);
}

This would enable not just adding sub-components piece-meal, but also if
someone wants to model a real device with custom CCI commands, they can
simply define a CCI set and pass it in via

cxl_add_cmd_set(>cci, my_cmd_set, payload_max);

Which lets the existing /dev/cxl/memN device dispatch those commands,
and makes modeling real devices an easier endeavor.

Only downside is that this may require changing the command structure to
include a callback type and pointer per cci function. The upside is this
would also allow commands to be written somewhat agnostic to the device
they're being inherited by and allow for device nesting like...

-device cxl-type3, id=ct3d
-device cxl-mhd, target=ct3d
-device my_vendor_cxl_type3, target=ct3d
etc etc

otherwise we're probably going to end up with a cxl-type3 -device line
300 characters long.

Maybe that's over-generalizing a bit much n.n;

~Gregory



[RFC] cxl/type3: minimum MHD cci support

2023-07-19 Thread Gregory Price
Implement the MHD GET_INFO cci command and add a shared memory
region to the type3 device to host the information.

Add a helper program to initialize this shared memory region.

For now, limit the number of LD's to the number of heads. Later,
this limitation will need to be lifted for MH-MLDs.

Intended use case:

1. Create the shared memory region
2. Format the shared memory region
3. Launch QEMU with `is_mhd=true,mhd_head=N,mhd_shmid=$shmid`

shmid=`ipcmk -M 4096 | grep -o -E '[0-9]+' | head -1`
cxl_mhd_init 4 $shmid
qemu-system-x86_64 \
  -nographic \
  -accel kvm \
  -drive file=./mhd.qcow2,format=qcow2,index=0,media=disk,id=hd \
  -m 4G,slots=4,maxmem=8G \
  -smp 4 \
  -machine type=q35,cxl=on,hmat=on \
  -device pxb-cxl,id=cxl.0,bus=pcie.0,bus_nr=52 \
  -device cxl-rp,id=rp0,bus=cxl.0,chassis=0,port=0,slot=0 \
  -object memory-backend-file,id=mem0,mem-path=/tmp/mem0,size=4G,share=true \
  -device cxl-type3,bus=rp0,volatile-memdev=mem0,id=cxl-mem0,sn=6,is_mhd=true,mhd_head=0,mhd_shmid=$shmid \
  -M cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.size=4G




Comments:

base repo: https://gitlab.com/jic23/qemu
base branch: cxl-2023-07-17

Originally I wanted to add this as a separate CCI, but I realized this
wouldn't work as intended because that would require a separate pci/cxl
device in /dev/ to tunnel messages through.  This is not how this will
present on real devices, so I went away from that.

Next I wanted to simply *dynamically* add the command to the existing
CCI in the type3 device, but these are statically defined in
cxl-mailbox.

I settled for simply adding the cci command to the type 3 device by
default, and checking for whether `is_mhd` is set in the command.

Ultimately, for MHD, they are likely to have a bunch of vendor specific
commands associated with them *and* a bunch of vendor specific state. It
would be nice to be able to have something like "cci_add_command_set()" to
the cci-mailbox, and an interface to override certain type3 functions
such as read/write (but this is an exercise for a later patch set).

---
 hw/cxl/cxl-mailbox-utils.c  | 53 +++
 hw/mem/cxl_type3.c  | 50 +
 include/hw/cxl/cxl_device.h | 12 +++
 tools/cxl/cxl_mhd_init.c| 63 +
 tools/cxl/meson.build   |  3 ++
 tools/meson.build   |  1 +
 6 files changed, 182 insertions(+)
 create mode 100644 tools/cxl/cxl_mhd_init.c
 create mode 100644 tools/cxl/meson.build

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 2819914e8d..9ef4d7f5e0 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -84,6 +84,8 @@ enum {
 #define GET_PHYSICAL_PORT_STATE 0x1
 TUNNEL = 0x53,
 #define MANAGEMENT_COMMAND 0x0
+MHD = 0x55,
+#define GET_MHD_INFO 0x0
 };
 
 /* CCI Message Format CXL r3.0 Figure 7-19 */
@@ -1155,6 +1157,56 @@ static CXLRetCode cmd_media_clear_poison(const struct 
cxl_cmd *cmd,
 return CXL_MBOX_SUCCESS;
 }
 
+static CXLRetCode cmd_mhd_get_info(const struct cxl_cmd *cmd,
+   uint8_t *payload_in,
+   size_t len_in,
+   uint8_t *payload_out,
+   size_t *len_out,
+   CXLCCI *cci)
+{
+CXLType3Dev *ct3d = CXL_TYPE3(cci->d);
+struct {
+uint8_t start_ld;
+uint8_t ldmap_len;
+} QEMU_PACKED *input = (void *)payload_in;
+
+struct {
+uint8_t nr_lds;
+uint8_t nr_heads;
+uint16_t resv1;
+uint8_t start_ld;
+uint8_t ldmap_len;
+uint16_t resv2;
+uint8_t ldmap[];
+} QEMU_PACKED *output = (void *)payload_out;
+
+uint8_t start_ld = input->start_ld;
+uint8_t ldmap_len = input->ldmap_len;
+uint8_t i;
+
+if (!ct3d->is_mhd) {
+return CXL_MBOX_UNSUPPORTED;
+}
+
+if (start_ld >= ct3d->mhd_state->nr_lds) {
+return CXL_MBOX_INVALID_INPUT;
+}
+
+output->nr_lds = ct3d->mhd_state->nr_lds;
+output->nr_heads = ct3d->mhd_state->nr_heads;
+output->resv1 = 0;
+output->start_ld = start_ld;
+output->resv2 = 0;
+
+for (i = 0; i < ldmap_len && (start_ld + i) < output->nr_lds; i++) {
+output->ldmap[i] = ct3d->mhd_state->ldmap[start_ld + i];
+}
+output->ldmap_len = i;
+
+*len_out = sizeof(*output) + output->ldmap_len;
+return CXL_MBOX_SUCCESS;
+}
+
 #define IMMEDIATE_CONFIG_CHANGE (1 << 1)
 #define IMMEDIATE_DATA_CHANGE (1 << 2)
 #define IMMEDIATE_POLICY_CHANGE (1 << 3)
@@ -1195,6 +1247,7 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = {
 cmd_media_inject_poison, 8, 0 },
 [MEDIA_AND_POISON][CLEAR_POISON] = { "MEDIA_AND_POISON_CLEAR_POISON",
 cmd_media_clear_poison, 72, 0 },
+[MHD][GET_MHD_INFO] = {"GET_MULTI_HEADED_INFO", cmd_mhd_get_info, 2, 0},
 };
 
 static const struct cxl_cmd 

[PATCH for-8.1] hw/nvme: fix compliance issue wrt. iosqes/iocqes

2023-07-19 Thread Klaus Jensen
From: Klaus Jensen 

Prior to this patch, the controller checks the value of CC.IOCQES
and CC.IOSQES prior to enabling the controller. As reported by Ben in
GitLab issue #1691, this is not spec compliant. The controller should
only check these values when queues are created.

This patch moves these checks to nvme_create_cq(). We do not need to
check it in nvme_create_sq() since that will error out if the completion
queue is not already created.

Also, since the controller exclusively supports SQEs of size 64 bytes and
CQEs of size 16 bytes, hard code that.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1691
Signed-off-by: Klaus Jensen 
---
 hw/nvme/ctrl.c   | 46 
 hw/nvme/nvme.h   |  9 +++--
 hw/nvme/trace-events |  1 +
 3 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 8e8e870b9a80..414e9ea60e05 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1511,7 +1511,7 @@ static void nvme_post_cqes(void *opaque)
 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
 req->cqe.sq_id = cpu_to_le16(sq->sqid);
 req->cqe.sq_head = cpu_to_le16(sq->head);
-addr = cq->dma_addr + cq->tail * n->cqe_size;
+addr = cq->dma_addr + (cq->tail << NVME_CQES);
 ret = pci_dma_write(PCI_DEVICE(n), addr, (void *)>cqe,
 sizeof(req->cqe));
 if (ret) {
@@ -5299,10 +5299,18 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
*req)
 uint16_t qsize = le16_to_cpu(c->qsize);
 uint16_t qflags = le16_to_cpu(c->cq_flags);
 uint64_t prp1 = le64_to_cpu(c->prp1);
+uint32_t cc = ldq_le_p(>bar.cc);
+uint8_t iocqes = NVME_CC_IOCQES(cc);
+uint8_t iosqes = NVME_CC_IOSQES(cc);
 
 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
  NVME_CQ_FLAGS_IEN(qflags) != 0);
 
+if (iosqes != NVME_SQES || iocqes != NVME_CQES) {
+trace_pci_nvme_err_invalid_create_cq_entry_size(iosqes, iocqes);
+return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
+}
+
 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
 return NVME_INVALID_QID | NVME_DNR;
@@ -7003,7 +7011,7 @@ static void nvme_process_sq(void *opaque)
 }
 
 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(>req_list))) {
-addr = sq->dma_addr + sq->head * n->sqe_size;
+addr = sq->dma_addr + (sq->head << NVME_SQES);
 if (nvme_addr_read(n, addr, (void *), sizeof(cmd))) {
 trace_pci_nvme_err_addr_read(addr);
 trace_pci_nvme_err_cfs();
@@ -7228,34 +7236,6 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 NVME_CAP_MPSMAX(cap));
 return -1;
 }
-if (unlikely(NVME_CC_IOCQES(cc) <
- NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
-trace_pci_nvme_err_startfail_cqent_too_small(
-NVME_CC_IOCQES(cc),
-NVME_CTRL_CQES_MIN(cap));
-return -1;
-}
-if (unlikely(NVME_CC_IOCQES(cc) >
- NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
-trace_pci_nvme_err_startfail_cqent_too_large(
-NVME_CC_IOCQES(cc),
-NVME_CTRL_CQES_MAX(cap));
-return -1;
-}
-if (unlikely(NVME_CC_IOSQES(cc) <
- NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
-trace_pci_nvme_err_startfail_sqent_too_small(
-NVME_CC_IOSQES(cc),
-NVME_CTRL_SQES_MIN(cap));
-return -1;
-}
-if (unlikely(NVME_CC_IOSQES(cc) >
- NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
-trace_pci_nvme_err_startfail_sqent_too_large(
-NVME_CC_IOSQES(cc),
-NVME_CTRL_SQES_MAX(cap));
-return -1;
-}
 if (unlikely(!NVME_AQA_ASQS(aqa))) {
 trace_pci_nvme_err_startfail_asqent_sz_zero();
 return -1;
@@ -7268,8 +7248,6 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 n->page_bits = page_bits;
 n->page_size = page_size;
 n->max_prp_ents = n->page_size / sizeof(uint64_t);
-n->cqe_size = 1 << NVME_CC_IOCQES(cc);
-n->sqe_size = 1 << NVME_CC_IOSQES(cc);
 nvme_init_cq(>admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
 nvme_init_sq(>admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
 
@@ -8238,8 +8216,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
 
-id->sqes = (0x6 << 4) | 0x6;
-id->cqes = (0x4 << 4) | 0x4;
+id->sqes = (NVME_SQES << 4) | NVME_SQES;
+id->cqes = (NVME_CQES << 4) | NVME_CQES;
 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
NVME_ONCS_FEATURES | NVME_ONCS_DSM |
diff --git 

[PATCH] roms/opensbi: Upgrade from v1.3 to v1.3.1

2023-07-19 Thread Bin Meng
Upgrade OpenSBI and the pre-built bios images from v1.3 to v1.3.1,
which fixes the boot failure seen when using QEMU to do a direct
kernel boot with the Microchip Icicle Kit board machine.

The v1.3.1 release includes the following commits:

0907de3 lib: sbi: fix comment indent
eb736a5 lib: sbi_pmu: Avoid out of bounds access
7828eeb gpio/desginware: add Synopsys DesignWare APB GPIO support
c6a3573 lib: utils: Fix sbi_hartid_to_scratch() usage in ACLINT drivers
057eb10 lib: utils/gpio: Fix RV32 compile error for designware GPIO driver

Signed-off-by: Bin Meng 

---
Please pull the complete patch from https://github.com/lbmeng/qemu
opensbi branch.

 .../opensbi-riscv32-generic-fw_dynamic.bin| Bin 135344 -> 135376 bytes
 .../opensbi-riscv64-generic-fw_dynamic.bin| Bin 138304 -> 138368 bytes
 roms/opensbi  |   2 +-
 3 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/pc-bios/opensbi-riscv32-generic-fw_dynamic.bin 
b/pc-bios/opensbi-riscv32-generic-fw_dynamic.bin
index 7b6c67e0ae..9a2ba3f2a4 100644
Binary files a/pc-bios/opensbi-riscv32-generic-fw_dynamic.bin and 
b/pc-bios/opensbi-riscv32-generic-fw_dynamic.bin differ
diff --git a/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin 
b/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin
index 1b831b412c..5d4e812819 100644
Binary files a/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin and 
b/pc-bios/opensbi-riscv64-generic-fw_dynamic.bin differ
diff --git a/roms/opensbi b/roms/opensbi
index 2552799a1d..057eb10b6d 16
--- a/roms/opensbi
+++ b/roms/opensbi
@@ -1 +1 @@
-Subproject commit 2552799a1df30a3dcd2321a8b75d61d06f5fb9fc
+Subproject commit 057eb10b6d523540012e6947d5c9f63e95244e94
-- 
2.34.1




Re: [PATCH v2 1/4] vhost-user.rst: Migrating back-end-internal state

2023-07-19 Thread Hanna Czenczek

On 18.07.23 17:57, Stefan Hajnoczi wrote:

On Wed, Jul 12, 2023 at 01:16:59PM +0200, Hanna Czenczek wrote:

For vhost-user devices, qemu can migrate the virtio state, but not the
back-end's internal state.  To do so, we need to be able to transfer
this internal state between front-end (qemu) and back-end.

At this point, this new feature is added for the purpose of virtio-fs
migration.  Because virtiofsd's internal state will not be too large, we
believe it is best to transfer it as a single binary blob after the
streaming phase.

These are the additions to the protocol:
- New vhost-user protocol feature VHOST_USER_PROTOCOL_F_MIGRATORY_STATE

It's not 100% clear whether "migratory" is related to live migration or
something else. I don't like the name :P.

The name "VHOST_USER_PROTOCOL_F_DEVICE_STATE" would be more obviously
associated with SET_DEVICE_STATE_FD and CHECK_DEVICE_STATE than
"MIGRATORY_STATE".


Sure, sure.  Naming things is hard. :)


- SET_DEVICE_STATE_FD function: Front-end and back-end negotiate a pipe
   over which to transfer the state.

Does it need to be a pipe or can it be another type of file (e.g. UNIX
domain socket)?


It’s difficult to say, honestly.  It can be anything, but I’m not sure 
how to describe that in this specification.


It must be any FD into which the state sender can write the state and 
signal end of state by closing its FD; and from which the state receiver 
can read the state, terminated by seeing an EOF.  As you say, that 
doesn’t mean that the sender has to write the state into the FD, nor 
that the receiver has to read it (into memory), it’s just that either 
side must ensure the other can do it.



In the future the fd may become bi-directional. Pipes are
uni-directional on Linux.

I suggest calling it a "file descriptor" and not mentioning "pipes"
explicitly.


Works here in the commit message, but in the document, we need to be 
explicit about the requirements for this FD, i.e. the way in which 
front-end and back-end can expect the FD to be usable.  Calling it a 
“pipe” was a simple way, but you’re right, it’s more general than that.



- CHECK_DEVICE_STATE: After the state has been transferred through the
   pipe, the front-end invokes this function to verify success.  There is
   no in-band way (through the pipe) to indicate failure, so we need to
   check explicitly.

Once the transfer pipe has been established via SET_DEVICE_STATE_FD
(which includes establishing the direction of transfer and migration
phase), the sending side writes its data into the pipe, and the reading
side reads it until it sees an EOF.  Then, the front-end will check for
success via CHECK_DEVICE_STATE, which on the destination side includes
checking for integrity (i.e. errors during deserialization).

Suggested-by: Stefan Hajnoczi 
Signed-off-by: Hanna Czenczek 
---
  docs/interop/vhost-user.rst | 87 +
  1 file changed, 87 insertions(+)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index ac6be34c4c..c98dfeca25 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -334,6 +334,7 @@ in the ancillary data:
  * ``VHOST_USER_SET_VRING_ERR``
  * ``VHOST_USER_SET_BACKEND_REQ_FD`` (previous name 
``VHOST_USER_SET_SLAVE_REQ_FD``)
  * ``VHOST_USER_SET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``)
+* ``VHOST_USER_SET_DEVICE_STATE_FD``
  
  If *front-end* is unable to send the full message or receives a wrong

  reply it will close the connection. An optional reconnection mechanism
@@ -497,6 +498,44 @@ it performs WAKE ioctl's on the userfaultfd to wake the 
stalled
  back-end.  The front-end indicates support for this via the
  ``VHOST_USER_PROTOCOL_F_PAGEFAULT`` feature.
  
+.. _migrating_backend_state:

+
+Migrating back-end state
+
+
+If the back-end has internal state that is to be sent from source to
+destination,

Migration and the terms "source" and "destination" have not been
defined. Here is a suggestion for an introductory paragraph:

   Migrating device state involves transferring the state from one
   back-end, called the source, to another back-end, called the
   destination. After migration, the destination transparently resumes
   operation without requiring the driver to re-initialize the device at
   the VIRTIO level. If the migration fails, then the source can
   transparently resume operation until another migration attempt is
   made.


You’re right, thanks!  Maybe I’ll try to be even more verbose here, and 
include what VM and guest do.



the front-end may be able to store and transfer it via an
+internal migration stream.  Support for this is negotiated with the
+``VHOST_USER_PROTOCOL_F_MIGRATORY_STATE`` feature.
+
+First, a channel over which the state is transferred is established on
+the source side using the ``VHOST_USER_SET_DEVICE_STATE_FD`` message.
+This message has two parameters:
+
+* Direction of transfer: On the source, the data is 

Re: Boot failure after QEMU's upgrade to OpenSBI v1.3 (was Re: [PATCH for-8.2 6/7] target/riscv: add 'max' CPU type)

2023-07-19 Thread Bin Meng
Hi Anup,

On Thu, Jul 20, 2023 at 12:10 AM Anup Patel  wrote:
>
> Hi Bin,
>
> On Wed, Jul 19, 2023 at 9:15 PM Bin Meng  wrote:
> >
> > On Wed, Jul 19, 2023 at 11:22 PM Anup Patel  wrote:
> > >
> > > On Wed, Jul 19, 2023 at 3:23 PM Alistair Francis  
> > > wrote:
> > > >
> > > > On Wed, Jul 19, 2023 at 3:39 PM Anup Patel  wrote:
> > > > >
> > > > > On Wed, Jul 19, 2023 at 7:03 AM Alistair Francis 
> > > > >  wrote:
> > > > > >
> > > > > > On Sat, Jul 15, 2023 at 7:14 PM Atish Patra  
> > > > > > wrote:
> > > > > > >
> > > > > > > On Fri, Jul 14, 2023 at 5:29 AM Conor Dooley  
> > > > > > > wrote:
> > > > > > > >
> > > > > > > > On Fri, Jul 14, 2023 at 11:19:34AM +0100, Conor Dooley wrote:
> > > > > > > > > On Fri, Jul 14, 2023 at 10:00:19AM +0530, Anup Patel wrote:
> > > > > > > > >
> > > > > > > > > > > > OpenSBI v1.3
> > > > > > > > > > > >_  _
> > > > > > > > > > > >   / __ \  / |  _ \_   _|
> > > > > > > > > > > >  | |  | |_ __   ___ _ __ | (___ | |_) || |
> > > > > > > > > > > >  | |  | | '_ \ / _ \ '_ \ \___ \|  _ < | |
> > > > > > > > > > > >  | |__| | |_) |  __/ | | |) | |_) || |_
> > > > > > > > > > > >   \/| .__/ \___|_| |_|_/|___/_|
> > > > > > > > > > > > | |
> > > > > > > > > > > > |_|
> > > > > > > > > > > >
> > > > > > > > > > > > init_coldboot: ipi init failed (error -1009)
> > > > > > > > > > > >
> > > > > > > > > > > > Just to note, because we use our own firmware that 
> > > > > > > > > > > > vendors in OpenSBI
> > > > > > > > > > > > and compiles only a significantly cut down number of 
> > > > > > > > > > > > files from it, we
> > > > > > > > > > > > do not use the fw_dynamic etc flow on our hardware. As 
> > > > > > > > > > > > a result, we have
> > > > > > > > > > > > not tested v1.3, nor do we have any immediate plans to 
> > > > > > > > > > > > change our
> > > > > > > > > > > > platform firmware to vendor v1.3 either.
> > > > > > > > > > > >
> > > > > > > > > > > > I unless there's something obvious to you, it sounds 
> > > > > > > > > > > > like I will need to
> > > > > > > > > > > > go and bisect OpenSBI. That's a job for another day 
> > > > > > > > > > > > though, given the
> > > > > > > > > > > > time.
> > > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > The real issue is some CPU/HART DT nodes marked as disabled 
> > > > > > > > > > in the
> > > > > > > > > > DT passed to OpenSBI 1.3.
> > > > > > > > > >
> > > > > > > > > > This issue does not exist in any of the DTs generated by 
> > > > > > > > > > QEMU but some
> > > > > > > > > > of the DTs in the kernel (such as microchip and SiFive 
> > > > > > > > > > board DTs) have
> > > > > > > > > > the E-core disabled.
> > > > > > > > > >
> > > > > > > > > > I had discovered this issue in a totally different context 
> > > > > > > > > > after the OpenSBI 1.3
> > > > > > > > > > release happened. This issue is already fixed in the latest 
> > > > > > > > > > OpenSBI by the
> > > > > > > > > > following commit c6a35733b74aeff612398f274ed19a74f81d1f37 
> > > > > > > > > > ("lib: utils:
> > > > > > > > > > Fix sbi_hartid_to_scratch() usage in ACLINT drivers").
> > > > > > > > >
> > > > > > > > > Great, thanks Anup! I thought I had tested tip-of-tree too, 
> > > > > > > > > but
> > > > > > > > > obviously not.
> > > > > > > > >
> > > > > > > > > > I always assumed that Microchip hss.bin is the preferred 
> > > > > > > > > > BIOS for the
> > > > > > > > > > QEMU microchip-icicle-kit machine but I guess that's not 
> > > > > > > > > > true.
> > > > > > > > >
> > > > > > > > > Unfortunately the HSS has not worked in QEMU for a long time, 
> > > > > > > > > and while
> > > > > > > > > I would love to fix it, but am pretty stretched for spare 
> > > > > > > > > time to begin
> > > > > > > > > with.
> > > > > > > > > I usually just do direct kernel boots, which use the OpenSBI 
> > > > > > > > > that comes
> > > > > > > > > with QEMU, as I am sure you already know :)
> > > > > > > > >
> > > > > > > > > > At this point, you can either:
> > > > > > > > > > 1) Use latest OpenSBI on QEMU microchip-icicle-kit machine
> > > > > > > >
> > > > > > > > I forgot to reply to this point, wondering what should be done 
> > > > > > > > with
> > > > > > > > QEMU. Bumping to v1.3 in QEMU introduces a regression here, 
> > > > > > > > regardless
> > > > > > > > of whether I can go and build a fixed version of OpenSBI.
> > > > > > > >
> > > > > > > FYI: The no-map fix went in OpenSBI v1.3. Without the upgrade, any
> > > > > > > user using the latest kernel (> v6.4)
> > > > > > > may hit those random linear map related issues (in hibernation or 
> > > > > > > EFI
> > > > > > > booting path).
> > > > > > >
> > > > > > > There are three possible scenarios:
> > > > > > >
> > > > > > > 1. Upgrade to OpenSBI v1.3: Any user of microchip-icicle-kit 
> > > > > > > machine
> > > > > > > or sifive fu540 machine users
> > > > > > > may hit this issue if the device tree 

Re: Boot failure after QEMU's upgrade to OpenSBI v1.3 (was Re: [PATCH for-8.2 6/7] target/riscv: add 'max' CPU type)

2023-07-19 Thread Andreas Schwab
On Jul 19 2023, Bin Meng wrote:

>> Please check
>> https://github.com/riscv-software-src/opensbi/releases/tag/v1.3.1
>>
>> I hope this helps.
>
> Hi Alistair,
>
> Do we need to update QEMU's opensbi binaries to v1.3.1?
>
> Hi Anup,
>
> Somehow I cannot see the 'tag' v1.3.1 being populated in the opensbi
> git repo. Am I missing anything?

You need to run git fetch --tags, because the tag is not part of any
branch, thus not fetched automatically.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."



Re: Boot failure after QEMU's upgrade to OpenSBI v1.3 (was Re: [PATCH for-8.2 6/7] target/riscv: add 'max' CPU type)

2023-07-19 Thread Anup Patel
Hi Bin,

On Wed, Jul 19, 2023 at 9:15 PM Bin Meng  wrote:
>
> On Wed, Jul 19, 2023 at 11:22 PM Anup Patel  wrote:
> >
> > On Wed, Jul 19, 2023 at 3:23 PM Alistair Francis  
> > wrote:
> > >
> > > On Wed, Jul 19, 2023 at 3:39 PM Anup Patel  wrote:
> > > >
> > > > On Wed, Jul 19, 2023 at 7:03 AM Alistair Francis  
> > > > wrote:
> > > > >
> > > > > On Sat, Jul 15, 2023 at 7:14 PM Atish Patra  
> > > > > wrote:
> > > > > >
> > > > > > On Fri, Jul 14, 2023 at 5:29 AM Conor Dooley  
> > > > > > wrote:
> > > > > > >
> > > > > > > On Fri, Jul 14, 2023 at 11:19:34AM +0100, Conor Dooley wrote:
> > > > > > > > On Fri, Jul 14, 2023 at 10:00:19AM +0530, Anup Patel wrote:
> > > > > > > >
> > > > > > > > > > > OpenSBI v1.3
> > > > > > > > > > >_  _
> > > > > > > > > > >   / __ \  / |  _ \_   _|
> > > > > > > > > > >  | |  | |_ __   ___ _ __ | (___ | |_) || |
> > > > > > > > > > >  | |  | | '_ \ / _ \ '_ \ \___ \|  _ < | |
> > > > > > > > > > >  | |__| | |_) |  __/ | | |) | |_) || |_
> > > > > > > > > > >   \/| .__/ \___|_| |_|_/|___/_|
> > > > > > > > > > > | |
> > > > > > > > > > > |_|
> > > > > > > > > > >
> > > > > > > > > > > init_coldboot: ipi init failed (error -1009)
> > > > > > > > > > >
> > > > > > > > > > > Just to note, because we use our own firmware that 
> > > > > > > > > > > vendors in OpenSBI
> > > > > > > > > > > and compiles only a significantly cut down number of 
> > > > > > > > > > > files from it, we
> > > > > > > > > > > do not use the fw_dynamic etc flow on our hardware. As a 
> > > > > > > > > > > result, we have
> > > > > > > > > > > not tested v1.3, nor do we have any immediate plans to 
> > > > > > > > > > > change our
> > > > > > > > > > > platform firmware to vendor v1.3 either.
> > > > > > > > > > >
> > > > > > > > > > > > Unless there's something obvious to you, it sounds like 
> > > > > > > > > > > I will need to
> > > > > > > > > > > go and bisect OpenSBI. That's a job for another day 
> > > > > > > > > > > though, given the
> > > > > > > > > > > time.
> > > > > > > > > > >
> > > > > > > > >
> > > > > > > > > The real issue is some CPU/HART DT nodes marked as disabled 
> > > > > > > > > in the
> > > > > > > > > DT passed to OpenSBI 1.3.
> > > > > > > > >
> > > > > > > > > This issue does not exist in any of the DTs generated by QEMU 
> > > > > > > > > but some
> > > > > > > > > of the DTs in the kernel (such as microchip and SiFive board 
> > > > > > > > > DTs) have
> > > > > > > > > the E-core disabled.
> > > > > > > > >
> > > > > > > > > I had discovered this issue in a totally different context 
> > > > > > > > > after the OpenSBI 1.3
> > > > > > > > > release happened. This issue is already fixed in the latest 
> > > > > > > > > OpenSBI by the
> > > > > > > > > following commit c6a35733b74aeff612398f274ed19a74f81d1f37 
> > > > > > > > > ("lib: utils:
> > > > > > > > > Fix sbi_hartid_to_scratch() usage in ACLINT drivers").
> > > > > > > >
> > > > > > > > Great, thanks Anup! I thought I had tested tip-of-tree too, but
> > > > > > > > obviously not.
> > > > > > > >
> > > > > > > > > I always assumed that Microchip hss.bin is the preferred BIOS 
> > > > > > > > > for the
> > > > > > > > > QEMU microchip-icicle-kit machine but I guess that's not true.
> > > > > > > >
> > > > > > > > Unfortunately the HSS has not worked in QEMU for a long time, 
> > > > > > > > and while
> > > > > > > > I would love to fix it, but am pretty stretched for spare time 
> > > > > > > > to begin
> > > > > > > > with.
> > > > > > > > I usually just do direct kernel boots, which use the OpenSBI 
> > > > > > > > that comes
> > > > > > > > with QEMU, as I am sure you already know :)
> > > > > > > >
> > > > > > > > > At this point, you can either:
> > > > > > > > > 1) Use latest OpenSBI on QEMU microchip-icicle-kit machine
> > > > > > >
> > > > > > > I forgot to reply to this point, wondering what should be done 
> > > > > > > with
> > > > > > > QEMU. Bumping to v1.3 in QEMU introduces a regression here, 
> > > > > > > regardless
> > > > > > > of whether I can go and build a fixed version of OpenSBI.
> > > > > > >
> > > > > > FYI: The no-map fix went in OpenSBI v1.3. Without the upgrade, any
> > > > > > user using the latest kernel (> v6.4)
> > > > > > may hit those random linear map related issues (in hibernation or 
> > > > > > EFI
> > > > > > booting path).
> > > > > >
> > > > > > There are three possible scenarios:
> > > > > >
> > > > > > 1. Upgrade to OpenSBI v1.3: Any user of microchip-icicle-kit machine
> > > > > > or sifive fu540 machine users
> > > > > > may hit this issue if the device tree has the disabled hart (e 
> > > > > > core).
> > > > > > 2. No upgrade to OpenSBI v1.2. Any user using hibernation or UEFI 
> > > > > > may
> > > > > > have issues [1]
> > > > > > 3. Include a non-release version OpenSBI in Qemu with the fix as an 
> > > > > > exception.
> > > > > >
> > > > > > #3 probably deviates from policy 

[PATCH 0/5] target/arm: Fixes for RME

2023-07-19 Thread Jean-Philippe Brucker
With these patches I'm able to boot a Realm guest under
"-cpu max,x-rme=on". They are based on Peter's series which fixes
handling of NSTable:
https://lore.kernel.org/qemu-devel/20230714154648.327466-1-peter.mayd...@linaro.org/


Running a Realm guest requires components at EL3 and R-EL2. Some rough
support for TF-A and RMM is available here:
https://jpbrucker.net/git/tf-a/log/?h=qemu-rme
https://jpbrucker.net/git/rmm/log/?h=qemu-rme
I'll clean this up before sending it out.

I also need to manually disable FEAT_SME in QEMU in order to boot this,
otherwise the Linux host fails to boot because hyp-stub accesses to SME
regs are trapped to EL3, which doesn't support RME+SME at the moment.
The right fix is probably in TF-A but I haven't investigated yet.

Jean-Philippe Brucker (5):
  target/arm/ptw: Load stage-2 tables from realm physical space
  target/arm/helper: Fix vae2_tlbmask()
  target/arm: Skip granule protection checks for AT instructions
  target/arm: Pass security space rather than flag for AT instructions
  target/arm/helper: Implement CNTHCTL_EL2.CNT[VP]MASK

 target/arm/internals.h | 25 --
 target/arm/helper.c| 78 --
 target/arm/ptw.c   | 19 ++
 3 files changed, 79 insertions(+), 43 deletions(-)

-- 
2.41.0




[PATCH 4/5] target/arm: Pass security space rather than flag for AT instructions

2023-07-19 Thread Jean-Philippe Brucker
At the moment we only handle Secure and Nonsecure security spaces for
the AT instructions. Add support for Realm and Root.

For AArch64, arm_security_space() gives the desired space. ARM DDI0487J
says (R_NYXTL):

  If EL3 is implemented, then when an address translation instruction
  that applies to an Exception level lower than EL3 is executed, the
  Effective value of SCR_EL3.{NSE, NS} determines the target Security
  state that the instruction applies to.

For AArch32, some instructions can access NonSecure space from Secure,
so we still need to pass the state explicitly to do_ats_write().

Signed-off-by: Jean-Philippe Brucker 
---
I haven't tested AT instructions in Realm/Root space yet, but it looks
like the patch is needed. RMM doesn't issue AT instructions like KVM
does in non-secure state (which triggered the bug in the previous
patch).
---
 target/arm/internals.h | 18 +-
 target/arm/helper.c| 27 ---
 target/arm/ptw.c   | 12 ++--
 3 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/target/arm/internals.h b/target/arm/internals.h
index fc90c364f7..cf13bb94f5 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1217,24 +1217,24 @@ bool get_phys_addr(CPUARMState *env, target_ulong 
address,
 __attribute__((nonnull));
 
 /**
- * get_phys_addr_with_secure_nogpc: get the physical address for a virtual
- *  address
+ * get_phys_addr_with_space_nogpc: get the physical address for a virtual
+ * address
  * @env: CPUARMState
  * @address: virtual address to get physical address for
  * @access_type: 0 for read, 1 for write, 2 for execute
  * @mmu_idx: MMU index indicating required translation regime
- * @is_secure: security state for the access
+ * @space: security space for the access
  * @result: set on translation success.
  * @fi: set to fault info if the translation fails
  *
- * Similar to get_phys_addr, but use the given security regime and don't 
perform
+ * Similar to get_phys_addr, but use the given security space and don't perform
  * a Granule Protection Check on the resulting address.
  */
-bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address,
- MMUAccessType access_type,
- ARMMMUIdx mmu_idx, bool is_secure,
- GetPhysAddrResult *result,
- ARMMMUFaultInfo *fi)
+bool get_phys_addr_with_space_nogpc(CPUARMState *env, target_ulong address,
+MMUAccessType access_type,
+ARMMMUIdx mmu_idx, ARMSecuritySpace space,
+GetPhysAddrResult *result,
+ARMMMUFaultInfo *fi)
 __attribute__((nonnull));
 
 bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address,
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 3ee2bb5fe1..2017b11795 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -3357,7 +3357,7 @@ static int par_el1_shareability(GetPhysAddrResult *res)
 
 static uint64_t do_ats_write(CPUARMState *env, uint64_t value,
  MMUAccessType access_type, ARMMMUIdx mmu_idx,
- bool is_secure)
+ ARMSecuritySpace ss)
 {
 bool ret;
 uint64_t par64;
@@ -3369,8 +3369,8 @@ static uint64_t do_ats_write(CPUARMState *env, uint64_t 
value,
  * I_MXTJT: Granule protection checks are not performed on the final 
address
  * of a successful translation.
  */
-ret = get_phys_addr_with_secure_nogpc(env, value, access_type, mmu_idx,
-  is_secure, &res, &fi);
+ret = get_phys_addr_with_space_nogpc(env, value, access_type, mmu_idx, ss,
+ &res, &fi);
 
 /*
  * ATS operations only do S1 or S1+S2 translations, so we never
@@ -3535,7 +3535,7 @@ static void ats_write(CPUARMState *env, const 
ARMCPRegInfo *ri, uint64_t value)
 uint64_t par64;
 ARMMMUIdx mmu_idx;
 int el = arm_current_el(env);
-bool secure = arm_is_secure_below_el3(env);
+ARMSecuritySpace ss = arm_security_space(env);
 
 switch (ri->opc2 & 6) {
 case 0:
@@ -3543,10 +3543,9 @@ static void ats_write(CPUARMState *env, const 
ARMCPRegInfo *ri, uint64_t value)
 switch (el) {
 case 3:
 mmu_idx = ARMMMUIdx_E3;
-secure = true;
 break;
 case 2:
-g_assert(!secure);  /* ARMv8.4-SecEL2 is 64-bit only */
+g_assert(ss != ARMSS_Secure);  /* ARMv8.4-SecEL2 is 64-bit only */
 /* fall through */
 case 1:
 if (ri->crm == 9 && (env->uncached_cpsr & CPSR_PAN)) {
@@ -3564,10 +3563,9 @@ static void ats_write(CPUARMState *env, const 
ARMCPRegInfo *ri, uint64_t value)
 switch (el) {
 

[PATCH 5/5] target/arm/helper: Implement CNTHCTL_EL2.CNT[VP]MASK

2023-07-19 Thread Jean-Philippe Brucker
When FEAT_RME is implemented, these bits override the value of
CNT[VP]_CTL_EL0.IMASK in Realm and Root state.

Signed-off-by: Jean-Philippe Brucker 
---
 target/arm/helper.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 2017b11795..5b173a827f 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -2608,6 +2608,23 @@ static uint64_t gt_get_countervalue(CPUARMState *env)
 return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / gt_cntfrq_period_ns(cpu);
 }
 
+static bool gt_is_masked(CPUARMState *env, int timeridx)
+{
+ARMSecuritySpace ss = arm_security_space(env);
+
+/*
+ * If bits CNTHCTL_EL2.CNT[VP]MASK are set, they override
+ * CNT[VP]_CTL_EL0.IMASK. They are RES0 in Secure and NonSecure state.
+ */
+if ((ss == ARMSS_Root || ss == ARMSS_Realm) &&
+((timeridx == GTIMER_VIRT && extract64(env->cp15.cnthctl_el2, 18, 1)) 
||
+ (timeridx == GTIMER_PHYS && extract64(env->cp15.cnthctl_el2, 19, 
1)))) {
+return true;
+}
+
+return env->cp15.c14_timer[timeridx].ctl & 2;
+}
+
 static void gt_recalc_timer(ARMCPU *cpu, int timeridx)
 {
 ARMGenericTimer *gt = &cpu->env.cp15.c14_timer[timeridx];
@@ -2627,7 +2644,7 @@ static void gt_recalc_timer(ARMCPU *cpu, int timeridx)
 
 gt->ctl = deposit32(gt->ctl, 2, 1, istatus);
 
-irqstate = (istatus && !(gt->ctl & 2));
+irqstate = (istatus && !gt_is_masked(&cpu->env, timeridx));
 qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate);
 
 if (istatus) {
@@ -2759,7 +2776,7 @@ static void gt_ctl_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
  * IMASK toggled: don't need to recalculate,
  * just set the interrupt line based on ISTATUS
  */
-int irqstate = (oldval & 4) && !(value & 2);
+int irqstate = (oldval & 4) && !gt_is_masked(env, timeridx);
 
 trace_arm_gt_imask_toggle(timeridx, irqstate);
 qemu_set_irq(cpu->gt_timer_outputs[timeridx], irqstate);
-- 
2.41.0




[PATCH 2/5] target/arm/helper: Fix vae2_tlbmask()

2023-07-19 Thread Jean-Philippe Brucker
When HCR_EL2.E2H is enabled, TLB entries are formed using the EL2&0
translation regime, instead of the EL2 translation regime. The TLB VAE2*
instructions invalidate the regime that corresponds to the current value
of HCR_EL2.E2H.

At the moment we only invalidate the EL2 translation regime. This causes
problems with RMM, which issues TLBI VAE2IS instructions with
HCR_EL2.E2H enabled. Update vae2_tlbmask() to take HCR_EL2.E2H into
account.

Signed-off-by: Jean-Philippe Brucker 
---
 target/arm/helper.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index e1b3db6f5f..07a9ac70f5 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -4663,6 +4663,21 @@ static int vae1_tlbmask(CPUARMState *env)
 return mask;
 }
 
+static int vae2_tlbmask(CPUARMState *env)
+{
+uint64_t hcr = arm_hcr_el2_eff(env);
+uint16_t mask;
+
+if (hcr & HCR_E2H) {
+mask = ARMMMUIdxBit_E20_2 |
+   ARMMMUIdxBit_E20_2_PAN |
+   ARMMMUIdxBit_E20_0;
+} else {
+mask = ARMMMUIdxBit_E2;
+}
+return mask;
+}
+
 /* Return 56 if TBI is enabled, 64 otherwise. */
 static int tlbbits_for_regime(CPUARMState *env, ARMMMUIdx mmu_idx,
   uint64_t addr)
@@ -4781,7 +4796,7 @@ static void tlbi_aa64_vae2_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
  * flush-last-level-only.
  */
 CPUState *cs = env_cpu(env);
-int mask = e2_tlbmask(env);
+int mask = vae2_tlbmask(env);
 uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
 tlb_flush_page_by_mmuidx(cs, pageaddr, mask);
@@ -4838,11 +4853,11 @@ static void tlbi_aa64_vae2is_write(CPUARMState *env, 
const ARMCPRegInfo *ri,
uint64_t value)
 {
 CPUState *cs = env_cpu(env);
+int mask = vae2_tlbmask(env);
 uint64_t pageaddr = sextract64(value << 12, 0, 56);
 int bits = tlbbits_for_regime(env, ARMMMUIdx_E2, pageaddr);
 
-tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr,
-  ARMMMUIdxBit_E2, bits);
+tlb_flush_page_bits_by_mmuidx_all_cpus_synced(cs, pageaddr, mask, bits);
 }
 
 static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -5014,11 +5029,6 @@ static void tlbi_aa64_rvae1is_write(CPUARMState *env,
 do_rvae_write(env, value, vae1_tlbmask(env), true);
 }
 
-static int vae2_tlbmask(CPUARMState *env)
-{
-return ARMMMUIdxBit_E2;
-}
-
 static void tlbi_aa64_rvae2_write(CPUARMState *env,
   const ARMCPRegInfo *ri,
   uint64_t value)
-- 
2.41.0




[PATCH 3/5] target/arm: Skip granule protection checks for AT instructions

2023-07-19 Thread Jean-Philippe Brucker
GPC checks are not performed on the output address for AT instructions,
as stated by ARM DDI 0487J in D8.12.2:

  When populating PAR_EL1 with the result of an address translation
  instruction, granule protection checks are not performed on the final
  output address of a successful translation.

Rename get_phys_addr_with_secure(), since it's only used to handle AT
instructions.

Signed-off-by: Jean-Philippe Brucker 
---
This incidentally fixes a problem with AT S1E1 instructions which can
output an IPA and should definitely not cause a GPC.
---
 target/arm/internals.h | 25 ++---
 target/arm/helper.c|  8 ++--
 target/arm/ptw.c   | 11 ++-
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/target/arm/internals.h b/target/arm/internals.h
index 0f01bc32a8..fc90c364f7 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1190,12 +1190,11 @@ typedef struct GetPhysAddrResult {
 } GetPhysAddrResult;
 
 /**
- * get_phys_addr_with_secure: get the physical address for a virtual address
+ * get_phys_addr: get the physical address for a virtual address
  * @env: CPUARMState
  * @address: virtual address to get physical address for
  * @access_type: 0 for read, 1 for write, 2 for execute
  * @mmu_idx: MMU index indicating required translation regime
- * @is_secure: security state for the access
  * @result: set on translation success.
  * @fi: set to fault info if the translation fails
  *
@@ -1212,26 +1211,30 @@ typedef struct GetPhysAddrResult {
  *  * for PSMAv5 based systems we don't bother to return a full FSR format
  *value.
  */
-bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address,
-   MMUAccessType access_type,
-   ARMMMUIdx mmu_idx, bool is_secure,
-   GetPhysAddrResult *result, ARMMMUFaultInfo *fi)
+bool get_phys_addr(CPUARMState *env, target_ulong address,
+   MMUAccessType access_type, ARMMMUIdx mmu_idx,
+   GetPhysAddrResult *result, ARMMMUFaultInfo *fi)
 __attribute__((nonnull));
 
 /**
- * get_phys_addr: get the physical address for a virtual address
+ * get_phys_addr_with_secure_nogpc: get the physical address for a virtual
+ *  address
  * @env: CPUARMState
  * @address: virtual address to get physical address for
  * @access_type: 0 for read, 1 for write, 2 for execute
  * @mmu_idx: MMU index indicating required translation regime
+ * @is_secure: security state for the access
  * @result: set on translation success.
  * @fi: set to fault info if the translation fails
  *
- * Similarly, but use the security regime of @mmu_idx.
+ * Similar to get_phys_addr, but use the given security regime and don't 
perform
+ * a Granule Protection Check on the resulting address.
  */
-bool get_phys_addr(CPUARMState *env, target_ulong address,
-   MMUAccessType access_type, ARMMMUIdx mmu_idx,
-   GetPhysAddrResult *result, ARMMMUFaultInfo *fi)
+bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address,
+ MMUAccessType access_type,
+ ARMMMUIdx mmu_idx, bool is_secure,
+ GetPhysAddrResult *result,
+ ARMMMUFaultInfo *fi)
 __attribute__((nonnull));
 
 bool pmsav8_mpu_lookup(CPUARMState *env, uint32_t address,
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 07a9ac70f5..3ee2bb5fe1 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -3365,8 +3365,12 @@ static uint64_t do_ats_write(CPUARMState *env, uint64_t 
value,
 ARMMMUFaultInfo fi = {};
 GetPhysAddrResult res = {};
 
-ret = get_phys_addr_with_secure(env, value, access_type, mmu_idx,
-is_secure, &res, &fi);
+/*
+ * I_MXTJT: Granule protection checks are not performed on the final 
address
+ * of a successful translation.
+ */
+ret = get_phys_addr_with_secure_nogpc(env, value, access_type, mmu_idx,
+  is_secure, &res, &fi);
 
 /*
  * ATS operations only do S1 or S1+S2 translations, so we never
diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 6318e13b98..1aef2b8cef 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -3412,16 +3412,17 @@ static bool get_phys_addr_gpc(CPUARMState *env, 
S1Translate *ptw,
 return false;
 }
 
-bool get_phys_addr_with_secure(CPUARMState *env, target_ulong address,
-   MMUAccessType access_type, ARMMMUIdx mmu_idx,
-   bool is_secure, GetPhysAddrResult *result,
-   ARMMMUFaultInfo *fi)
+bool get_phys_addr_with_secure_nogpc(CPUARMState *env, target_ulong address,
+ MMUAccessType access_type,
+ ARMMMUIdx mmu_idx, bool 

[PATCH 1/5] target/arm/ptw: Load stage-2 tables from realm physical space

2023-07-19 Thread Jean-Philippe Brucker
In realm state, stage-2 translation tables are fetched from the realm
physical address space (R_PGRQD).

Signed-off-by: Jean-Philippe Brucker 
---
 target/arm/ptw.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index d1de934702..6318e13b98 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -164,7 +164,11 @@ static ARMMMUIdx ptw_idx_for_stage_2(CPUARMState *env, 
ARMMMUIdx stage2idx)
  * an NS stage 1+2 lookup while the NS bit is 0.)
  */
 if (!arm_is_secure_below_el3(env) || !arm_el_is_aa64(env, 3)) {
-return ARMMMUIdx_Phys_NS;
+if (arm_security_space_below_el3(env) == ARMSS_Realm) {
+return ARMMMUIdx_Phys_Realm;
+} else {
+return ARMMMUIdx_Phys_NS;
+}
 }
 if (stage2idx == ARMMMUIdx_Stage2_S) {
 s2walk_secure = !(env->cp15.vstcr_el2 & VSTCR_SW);
-- 
2.41.0




[PULL 5/5] linux-user: Fix qemu-arm to run static armhf binaries

2023-07-19 Thread Helge Deller
qemu-user crashes immediately when running static binaries on the armhf
architecture. The problem is the memory layout where the executable is
loaded before the interpreter library, in which case the reserved brk
region clashes with the interpreter code and is released before qemu
tries to start the program.

At load time qemu calculates a brk value for interpreter and executable
each.  The fix is to choose the higher one of both.

Signed-off-by: Helge Deller 
Cc: Andreas Schwab 
Cc: qemu-sta...@nongnu.org
Reported-by:  venkata.p...@toshiba-tsip.com
Closes: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1040981
---
 linux-user/elfload.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index a26200d9f3..94951630b1 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -3615,6 +3615,13 @@ int load_elf_binary(struct linux_binprm *bprm, struct 
image_info *info)

 if (elf_interpreter) {
 load_elf_interp(elf_interpreter, &interp_info, bprm->buf);
+/*
+ * adjust brk address if the interpreter was loaded above the main
+ * executable, e.g. happens with static binaries on armhf
+ */
+if (interp_info.brk > info->brk) {
+info->brk = interp_info.brk;
+}

 /* If the program interpreter is one of these two, then assume
an iBCS2 image.  Otherwise assume a native linux image.  */
--
2.41.0




[PULL 2/5] linux-user: Prohibit brk() to shrink below initial heap address

2023-07-19 Thread Helge Deller
Since commit 86f04735ac ("linux-user: Fix brk() to release pages") it's
possible for userspace applications to reduce their memory footprint by
calling brk() with a lower address and free up memory. Before that commit
guest heap memory was never unmapped.

But the Linux kernel prohibits reducing brk() below the initial memory
address which is set at startup by the set_brk() function in binfmt_elf.c.
Such a range check was missed in commit 86f04735ac.

This patch adds the missing check by storing the initial brk value in
initial_target_brk and verifying any new brk addresses against that value.

Tested with the i386 upx binary from
https://github.com/upx/upx/releases/download/v4.0.2/upx-4.0.2-i386_linux.tar.xz

Signed-off-by: Helge Deller 
Tested-by: "Markus F.X.J. Oberhumer" 
Fixes: 86f04735ac ("linux-user: Fix brk() to release pages")
Cc: qemu-sta...@nongnu.org
Buglink: https://github.com/upx/upx/issues/683
---
 linux-user/syscall.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index ee54eed33b..125fcbe423 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -801,12 +801,13 @@ static inline int host_to_target_sock_type(int host_type)
 return target_type;
 }

-static abi_ulong target_brk;
+static abi_ulong target_brk, initial_target_brk;
 static abi_ulong brk_page;

 void target_set_brk(abi_ulong new_brk)
 {
 target_brk = TARGET_PAGE_ALIGN(new_brk);
+initial_target_brk = target_brk;
 brk_page = HOST_PAGE_ALIGN(target_brk);
 }

@@ -824,6 +825,11 @@ abi_long do_brk(abi_ulong brk_val)
 return target_brk;
 }

+/* do not allow to shrink below initial brk value */
+if (brk_val < initial_target_brk) {
+brk_val = initial_target_brk;
+}
+
 new_brk = TARGET_PAGE_ALIGN(brk_val);
 new_host_brk_page = HOST_PAGE_ALIGN(brk_val);

--
2.41.0




[PULL 0/5] Linux user brk fixes patches

2023-07-19 Thread Helge Deller
The following changes since commit 361d5397355276e3007825cc17217c1e4d4320f7:

  Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into 
staging (2023-07-17 15:49:27 +0100)

are available in the Git repository at:

  https://github.com/hdeller/qemu-hppa.git 
tags/linux-user-brk-fixes-pull-request

for you to fetch changes up to 518f32221af759a29500ac172c4c857bef142067:

  linux-user: Fix qemu-arm to run static armhf binaries (2023-07-18 20:42:05 
+0200)


linux-user: brk() syscall fixes and armhf static binary fix

Commit 86f04735ac ("linux-user: Fix brk() to release pages") introduced
the possibility for userspace applications to reduce memory footprint by
calling brk() with a lower address and as such free up memory, the same
way as the Linux kernel allows on physical machines.

This change introduced some failures for applications with errors like
- accessing bytes above the brk heap address on the same page,
- freeing memory below the initial brk address,
and introduced a behaviour which isn't done by the kernel (e.g. zeroing
memory above brk).

This patch series fixes those issues and has been tested with existing
programs (e.g. upx).

Additionally one patch fixes running static armhf executables (e.g. fstype)
which was broken since qemu-8.0.

Changes in v2:
- dropped patch to revert d28b3c90cfad ("linux-user: Make sure initial brk(0)
  is page-aligned")
- rephrased some commit messages
- fixed Cc email addresses, added new ones
- added R-b tags

Helge



Helge Deller (5):
  linux-user: Fix qemu brk() to not zero bytes on current page
  linux-user: Prohibit brk() to shrink below initial heap address
  linux-user: Fix signed math overflow in brk() syscall
  linux-user: Fix strace output for old_mmap
  linux-user: Fix qemu-arm to run static armhf binaries

 linux-user/elfload.c |  7 +++
 linux-user/strace.c  | 49 
 linux-user/syscall.c | 23 +
 3 files changed, 66 insertions(+), 13 deletions(-)

--
2.41.0




[PULL 4/5] linux-user: Fix strace output for old_mmap

2023-07-19 Thread Helge Deller
The old_mmap syscall (e.g. on i386) hands over the parameters in
a struct. Adjust the strace output to print the correct values.

Signed-off-by: Helge Deller 
Reported-by: John Reiser 
Closes: https://gitlab.com/qemu-project/qemu/-/issues/1760
---
 linux-user/strace.c | 49 +
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/linux-user/strace.c b/linux-user/strace.c
index bbd29148d4..e0ab8046ec 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -3767,10 +3767,24 @@ print_utimensat(CPUArchState *cpu_env, const struct 
syscallname *name,

 #if defined(TARGET_NR_mmap) || defined(TARGET_NR_mmap2)
 static void
-print_mmap(CPUArchState *cpu_env, const struct syscallname *name,
+print_mmap_both(CPUArchState *cpu_env, const struct syscallname *name,
abi_long arg0, abi_long arg1, abi_long arg2,
-   abi_long arg3, abi_long arg4, abi_long arg5)
-{
+   abi_long arg3, abi_long arg4, abi_long arg5,
+   bool is_old_mmap)
+{
+if (is_old_mmap) {
+abi_ulong *v;
+abi_ulong argp = arg0;
+if (!(v = lock_user(VERIFY_READ, argp, 6 * sizeof(abi_ulong), 1)))
+return;
+arg0 = tswapal(v[0]);
+arg1 = tswapal(v[1]);
+arg2 = tswapal(v[2]);
+arg3 = tswapal(v[3]);
+arg4 = tswapal(v[4]);
+arg5 = tswapal(v[5]);
+unlock_user(v, argp, 0);
+}
 print_syscall_prologue(name);
 print_pointer(arg0, 0);
 print_raw_param("%d", arg1, 0);
@@ -3780,7 +3794,34 @@ print_mmap(CPUArchState *cpu_env, const struct 
syscallname *name,
 print_raw_param("%#x", arg5, 1);
 print_syscall_epilogue(name);
 }
-#define print_mmap2 print_mmap
+#endif
+
+#if defined(TARGET_NR_mmap)
+static void
+print_mmap(CPUArchState *cpu_env, const struct syscallname *name,
+   abi_long arg0, abi_long arg1, abi_long arg2,
+   abi_long arg3, abi_long arg4, abi_long arg5)
+{
+return print_mmap_both(cpu_env, name, arg0, arg1, arg2, arg3,
+   arg4, arg5,
+#if defined(TARGET_NR_mmap2)
+true
+#else
+false
+#endif
+);
+}
+#endif
+
+#if defined(TARGET_NR_mmap2)
+static void
+print_mmap2(CPUArchState *cpu_env, const struct syscallname *name,
+   abi_long arg0, abi_long arg1, abi_long arg2,
+   abi_long arg3, abi_long arg4, abi_long arg5)
+{
+return print_mmap_both(cpu_env, name, arg0, arg1, arg2, arg3,
+   arg4, arg5, false);
+}
 #endif

 #ifdef TARGET_NR_mprotect
--
2.41.0




[PULL 3/5] linux-user: Fix signed math overflow in brk() syscall

2023-07-19 Thread Helge Deller
Fix the math overflow when calculating the new_malloc_size.

new_host_brk_page and brk_page are unsigned integers. If userspace
reduces the heap, new_host_brk_page is lower than brk_page which results
in a huge positive number (but should actually be negative).

Fix it by adding a proper check and as such make the code more readable.

Signed-off-by: Helge Deller 
Tested-by: "Markus F.X.J. Oberhumer" 
Reviewed-by: Philippe Mathieu-Daudé 
Fixes: 86f04735ac ("linux-user: Fix brk() to release pages")
Cc: qemu-sta...@nongnu.org
Buglink: https://github.com/upx/upx/issues/683
---
 linux-user/syscall.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 125fcbe423..95727a816a 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -860,12 +860,13 @@ abi_long do_brk(abi_ulong brk_val)
  * itself); instead we treat "mapped but at wrong address" as
  * a failure and unmap again.
  */
-new_alloc_size = new_host_brk_page - brk_page;
-if (new_alloc_size) {
+if (new_host_brk_page > brk_page) {
+new_alloc_size = new_host_brk_page - brk_page;
 mapped_addr = get_errno(target_mmap(brk_page, new_alloc_size,
 PROT_READ|PROT_WRITE,
 MAP_ANON|MAP_PRIVATE, 0, 0));
 } else {
+new_alloc_size = 0;
 mapped_addr = brk_page;
 }

--
2.41.0




[PULL 1/5] linux-user: Fix qemu brk() to not zero bytes on current page

2023-07-19 Thread Helge Deller
The qemu brk() implementation is too aggressive and cleans remaining bytes
on the current page above the last brk address.

But some existing applications are buggy and read/write bytes above their
current heap address. On a physical machine this does not trigger a
runtime error as long as the access happens on the same page. Additionally
the Linux kernel allocates only full pages and does no zeroing on already
allocated pages, even if the brk address is lowered.

Fix qemu to behave the same way as the kernel does. Do not touch already
allocated pages, and - when running with different page sizes of guest and
host - zero out only those memory areas where the host page size is bigger
than the guest page size.

Signed-off-by: Helge Deller 
Tested-by: "Markus F.X.J. Oberhumer" 
Fixes: 86f04735ac ("linux-user: Fix brk() to release pages")
Cc: qemu-sta...@nongnu.org
Buglink: https://github.com/upx/upx/issues/683
---
 linux-user/syscall.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index c99ef9c01e..ee54eed33b 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -829,10 +829,8 @@ abi_long do_brk(abi_ulong brk_val)

 /* brk_val and old target_brk might be on the same page */
 if (new_brk == TARGET_PAGE_ALIGN(target_brk)) {
-if (brk_val > target_brk) {
-/* empty remaining bytes in (possibly larger) host page */
-memset(g2h_untagged(target_brk), 0, new_host_brk_page - 
target_brk);
-}
+/* empty remaining bytes in (possibly larger) host page */
+memset(g2h_untagged(new_brk), 0, new_host_brk_page - new_brk);
 target_brk = brk_val;
 return target_brk;
 }
@@ -840,7 +838,7 @@ abi_long do_brk(abi_ulong brk_val)
 /* Release heap if necesary */
 if (new_brk < target_brk) {
 /* empty remaining bytes in (possibly larger) host page */
-memset(g2h_untagged(brk_val), 0, new_host_brk_page - brk_val);
+memset(g2h_untagged(new_brk), 0, new_host_brk_page - new_brk);

 /* free unused host pages and set new brk_page */
 target_munmap(new_host_brk_page, brk_page - new_host_brk_page);
@@ -873,7 +871,7 @@ abi_long do_brk(abi_ulong brk_val)
  * come from the remaining part of the previous page: it may
  * contains garbage data due to a previous heap usage (grown
  * then shrunken).  */
-memset(g2h_untagged(target_brk), 0, brk_page - target_brk);
+memset(g2h_untagged(brk_page), 0, HOST_PAGE_ALIGN(brk_page) - 
brk_page);

 target_brk = brk_val;
 brk_page = new_host_brk_page;
--
2.41.0




Re: [PATCH for-8.1] tests/test-util-filemonitor: Avoid pointless allocations

2023-07-19 Thread Philippe Mathieu-Daudé

On 19/7/23 17:39, Daniel P. Berrangé wrote:

On Wed, Jul 19, 2023 at 05:01:03PM +0200, Philippe Mathieu-Daudé wrote:

Coverity reports few resource leaks. While they are
harmless, fix them to avoid them showing on the reports.

Reported-by: Coverity (CID 1432615: RESOURCE_LEAK)
Fixes: 4f370b1098 ("test-util-filemonitor: Skip test on non-x86 Travis 
containers")
Signed-off-by: Philippe Mathieu-Daudé 
---
  tests/unit/test-util-filemonitor.c | 13 -
  1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/unit/test-util-filemonitor.c 
b/tests/unit/test-util-filemonitor.c
index b629e10857..3ca687860d 100644
--- a/tests/unit/test-util-filemonitor.c
+++ b/tests/unit/test-util-filemonitor.c
@@ -398,7 +398,7 @@ test_file_monitor_events(void)




  /*
   * This test does not work on Travis LXD containers since some
   * syscalls are blocked in that environment.


Right here is logic that checks the TRAVIS_ARCH env variable.

IMHO this should just be moved out into the main() method, so
we don't even start test when under Travis. Just make the whole
program exit with a skip status on travis.


Clever eh :)




Re: Boot failure after QEMU's upgrade to OpenSBI v1.3 (was Re: [PATCH for-8.2 6/7] target/riscv: add 'max' CPU type)

2023-07-19 Thread Bin Meng
On Wed, Jul 19, 2023 at 11:22 PM Anup Patel  wrote:
>
> On Wed, Jul 19, 2023 at 3:23 PM Alistair Francis  wrote:
> >
> > On Wed, Jul 19, 2023 at 3:39 PM Anup Patel  wrote:
> > >
> > > On Wed, Jul 19, 2023 at 7:03 AM Alistair Francis  
> > > wrote:
> > > >
> > > > On Sat, Jul 15, 2023 at 7:14 PM Atish Patra  
> > > > wrote:
> > > > >
> > > > > On Fri, Jul 14, 2023 at 5:29 AM Conor Dooley  wrote:
> > > > > >
> > > > > > On Fri, Jul 14, 2023 at 11:19:34AM +0100, Conor Dooley wrote:
> > > > > > > On Fri, Jul 14, 2023 at 10:00:19AM +0530, Anup Patel wrote:
> > > > > > >
> > > > > > > > > > OpenSBI v1.3
> > > > > > > > > >_  _
> > > > > > > > > >   / __ \  / |  _ \_   _|
> > > > > > > > > >  | |  | |_ __   ___ _ __ | (___ | |_) || |
> > > > > > > > > >  | |  | | '_ \ / _ \ '_ \ \___ \|  _ < | |
> > > > > > > > > >  | |__| | |_) |  __/ | | |) | |_) || |_
> > > > > > > > > >   \/| .__/ \___|_| |_|_/|___/_|
> > > > > > > > > > | |
> > > > > > > > > > |_|
> > > > > > > > > >
> > > > > > > > > > init_coldboot: ipi init failed (error -1009)
> > > > > > > > > >
> > > > > > > > > > Just to note, because we use our own firmware that vendors 
> > > > > > > > > > in OpenSBI
> > > > > > > > > > and compiles only a significantly cut down number of files 
> > > > > > > > > > from it, we
> > > > > > > > > > do not use the fw_dynamic etc flow on our hardware. As a 
> > > > > > > > > > result, we have
> > > > > > > > > > not tested v1.3, nor do we have any immediate plans to 
> > > > > > > > > > change our
> > > > > > > > > > platform firmware to vendor v1.3 either.
> > > > > > > > > >
> > > > > > > > > > I unless there's something obvious to you, it sounds like I 
> > > > > > > > > > will need to
> > > > > > > > > > go and bisect OpenSBI. That's a job for another day though, 
> > > > > > > > > > given the
> > > > > > > > > > time.
> > > > > > > > > >
> > > > > > > >
> > > > > > > > The real issue is some CPU/HART DT nodes marked as disabled in 
> > > > > > > > the
> > > > > > > > DT passed to OpenSBI 1.3.
> > > > > > > >
> > > > > > > > This issue does not exist in any of the DTs generated by QEMU 
> > > > > > > > but some
> > > > > > > > of the DTs in the kernel (such as microchip and SiFive board 
> > > > > > > > DTs) have
> > > > > > > > the E-core disabled.
> > > > > > > >
> > > > > > > > I had discovered this issue in a totally different context 
> > > > > > > > after the OpenSBI 1.3
> > > > > > > > release happened. This issue is already fixed in the latest 
> > > > > > > > OpenSBI by the
> > > > > > > > following commit c6a35733b74aeff612398f274ed19a74f81d1f37 
> > > > > > > > ("lib: utils:
> > > > > > > > Fix sbi_hartid_to_scratch() usage in ACLINT drivers").
> > > > > > >
> > > > > > > Great, thanks Anup! I thought I had tested tip-of-tree too, but
> > > > > > > obviously not.
> > > > > > >
> > > > > > > > I always assumed that Microchip hss.bin is the preferred BIOS 
> > > > > > > > for the
> > > > > > > > QEMU microchip-icicle-kit machine but I guess that's not true.
> > > > > > >
> > > > > > > Unfortunately the HSS has not worked in QEMU for a long time, and 
> > > > > > > while
> > > > > > > I would love to fix it, but am pretty stretched for spare time to 
> > > > > > > begin
> > > > > > > with.
> > > > > > > I usually just do direct kernel boots, which use the OpenSBI that 
> > > > > > > comes
> > > > > > > with QEMU, as I am sure you already know :)
> > > > > > >
> > > > > > > > At this point, you can either:
> > > > > > > > 1) Use latest OpenSBI on QEMU microchip-icicle-kit machine
> > > > > >
> > > > > > I forgot to reply to this point, wondering what should be done with
> > > > > > QEMU. Bumping to v1.3 in QEMU introduces a regression here, 
> > > > > > regardless
> > > > > > of whether I can go and build a fixed version of OpenSBI.
> > > > > >
> > > > > FYI: The no-map fix went in OpenSBI v1.3. Without the upgrade, any
> > > > > user using the latest kernel (> v6.4)
> > > > > may hit those random linear map related issues (in hibernation or EFI
> > > > > booting path).
> > > > >
> > > > > There are three possible scenarios:
> > > > >
> > > > > 1. Upgrade to OpenSBI v1.3: Any user of microchip-icicle-kit machine
> > > > > or sifive fu540 machine users
> > > > > may hit this issue if the device tree has the disabled hart (e core).
> > > > > 2. No upgrade to OpenSBI v1.2. Any user using hibernation or UEFI may
> > > > > have issues [1]
> > > > > 3. Include a non-release version OpenSBI in Qemu with the fix as an 
> > > > > exception.
> > > > >
> > > > > #3 probably deviates from policy and sets a bad precedent. So I am not
> > > > > advocating for it though ;)
> > > > > For both #1 & #2, the solution would be to use the latest OpenSBI in
> > > > > -bios argument instead of the stock one.
> > > > > I could be wrong but my guess is the number of users facing #2 would
> > > > > be higher than #1.
> > > >
> > > > Thanks for 

Re: [PATCH v21 16/20] tests/avocado: s390x cpu topology entitlement tests

2023-07-19 Thread Pierre Morel



On 7/19/23 16:13, Nina Schoetterl-Glausch wrote:

On Wed, 2023-07-19 at 16:08 +0200, Pierre Morel wrote:

On 7/14/23 18:30, Nina Schoetterl-Glausch wrote:

On Wed, 2023-07-12 at 22:11 +0200, Thomas Huth wrote:

On 12/07/2023 21.37, Nina Schoetterl-Glausch wrote:

On Wed, 2023-07-05 at 12:22 +0200, Thomas Huth wrote:

On 30/06/2023 11.17, Pierre Morel wrote:

This test takes care to check the changes on different
entitlements
when the guest requests a polarization change.

Signed-off-by: Pierre Morel 
---
     tests/avocado/s390_topology.py | 47
++
     1 file changed, 47 insertions(+)

diff --git a/tests/avocado/s390_topology.py
b/tests/avocado/s390_topology.py
index 2cf731cb1d..4855e5d7e4 100644
--- a/tests/avocado/s390_topology.py
+++ b/tests/avocado/s390_topology.py
@@ -240,3 +240,50 @@ def test_polarisation(self):
     res = self.vm.qmp('query-cpu-polarization')
     self.assertEqual(res['return']['polarization'],
'horizontal')
     self.check_topology(0, 0, 0, 0, 'medium',
False)
+
+    def test_entitlement(self):
+    """
+    This test verifies that QEMU modifies the
polarization
+    after a guest request.

...

+    self.check_topology(0, 0, 0, 0, 'low', False)
+    self.check_topology(1, 0, 0, 0, 'medium', False)
+    self.check_topology(2, 1, 0, 0, 'high', False)
+    self.check_topology(3, 1, 0, 0, 'high', False)
+
+    self.guest_set_dispatching('1');
+
+    self.check_topology(0, 0, 0, 0, 'low', False)
+    self.check_topology(1, 0, 0, 0, 'medium', False)
+    self.check_topology(2, 1, 0, 0, 'high', False)
+    self.check_topology(3, 1, 0, 0, 'high', False)
+
+    self.guest_set_dispatching('0');
+
+    self.check_topology(0, 0, 0, 0, 'low', False)
+    self.check_topology(1, 0, 0, 0, 'medium', False)
+    self.check_topology(2, 1, 0, 0, 'high', False)
+    self.check_topology(3, 1, 0, 0, 'high', False)

Sorry, I think I'm too blind to see it, but what has changed
after
the guest
changed the polarization?

Nothing, the values are retained, they're just not active.
The guest will see a horizontal polarization until it changes
back
to
vertical.

But then the comment in front of it ("This test verifies that
QEMU
*modifies* the polarization...") does not quite match, does it?

Yeah, it tests that QEMU reports its own state changed when using
set-cpu-topology.
I think it would be a good idea to get the guests view from the
sysfs,
also.


    Thomas



Yes, I think you are right, I rewrite this to check the guest view of
the changes.

As you said the values are retained when not used by horizontal
polarization so it is nonsense to check from host view.

I don't think it's bad to check the host view, you can do both.



OK, then I do it.

Thanks,

Pierre





Re: [PATCH for-8.1] tests/test-util-filemonitor: Avoid pointless allocations

2023-07-19 Thread Daniel P . Berrangé
On Wed, Jul 19, 2023 at 05:01:03PM +0200, Philippe Mathieu-Daudé wrote:
> Coverity reports few resource leaks. While they are
> harmless, fix them to avoid them showing on the reports.
> 
> Reported-by: Coverity (CID 1432615: RESOURCE_LEAK)
> Fixes: 4f370b1098 ("test-util-filemonitor: Skip test on non-x86 Travis 
> containers")
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  tests/unit/test-util-filemonitor.c | 13 -
>  1 file changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/tests/unit/test-util-filemonitor.c 
> b/tests/unit/test-util-filemonitor.c
> index b629e10857..3ca687860d 100644
> --- a/tests/unit/test-util-filemonitor.c
> +++ b/tests/unit/test-util-filemonitor.c
> @@ -398,7 +398,7 @@ test_file_monitor_events(void)
>  };
>  Error *local_err = NULL;
>  GError *gerr = NULL;
> -QFileMonitor *mon = qemu_file_monitor_new(&local_err);
> +QFileMonitor *mon;
>  QemuThread th;
>  GTimer *timer;
>  gchar *dir = NULL;
> @@ -407,12 +407,9 @@ test_file_monitor_events(void)
>  char *pathsrc = NULL;
>  char *pathdst = NULL;
>  QFileMonitorTestData data;
> -GHashTable *ids = g_hash_table_new(g_int64_hash, g_int64_equal);
> +GHashTable *ids;
>  char *travis_arch;
>  
> -qemu_mutex_init(&data.lock);
> -data.records = NULL;
> -
>  /*
>   * This test does not work on Travis LXD containers since some
>   * syscalls are blocked in that environment.

Right here is logic that checks the TRAVIS_ARCH env variable.

IMHO this should just be moved out into the main() method, so
we don't even start test when under Travis. Just make the whole
program exit with a skip status on travis.

> @@ -423,6 +420,12 @@ test_file_monitor_events(void)
>  return;
>  }
>  
> +mon = qemu_file_monitor_new(&local_err);
> +ids = g_hash_table_new(g_int64_hash, g_int64_equal);
> +
> +qemu_mutex_init(&data.lock);
> +data.records = NULL;
> +
>  /*
>   * The file monitor needs the main loop running in
>   * order to receive events from inotify. We must
> -- 
> 2.38.1
> 

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v21 00/20] s390x: CPU Topology

2023-07-19 Thread Pierre Morel



On 7/5/23 12:02, Thomas Huth wrote:

On 30/06/2023 11.17, Pierre Morel wrote:
...

Testing
===

To use the QEMU patches, you will need Linux V6-rc1 or newer,
or use the following Linux mainline patches:

f5ecfee94493 2022-07-20 KVM: s390: resetting the Topology-Change-Report
24fe0195bc19 2022-07-20 KVM: s390: guest support for topology function
0130337ec45b 2022-07-20 KVM: s390: Cleanup ipte lock access and SIIF 
fac..


Currently this code is for KVM only, I have no idea if it is interesting
to provide a TCG patch. If ever it will be done in another series.

This series provide 12 avocado tests using Fedora-35 kernel and initrd
image.


 Hi Pierre,

the new avocado tests currently fail if you run them on a x86 host. 
Could you please add a check that they are properly skipped instead if 
the environment does not match? I guess a


 self.require_accelerator('kvm')

should do the job...

 Thomas


Yes, thanks, I add this during initialization of the VM.

Regards,

Pierre




Re: [PATCH v3 0/8] vdpa: Send all CVQ state load commands in parallel

2023-07-19 Thread Hawkins Jiawei
在 2023/7/19 20:44, Lei Yang 写道:
> Hello Hawkins and Michael
>
> Looks like there are big changes about vp_vdpa, therefore, if needed,
> QE can test this series in QE's environment before the patch is

Hi Lei,

This patch series does not modify the code of vp_vdpa. Instead, it only
modifies how QEMU sends SVQ control commands to the vdpa device.

Considering that the behavior of the vp_vdpa device differs from that
of real vdpa hardware, would it be possible for you to test this patch
series on a real vdpa device?

Thanks!


> merged, and provide the result.
>
> BR
> Lei
>
>
> On Wed, Jul 19, 2023 at 8:37 PM Hawkins Jiawei  wrote:
>>
>> 在 2023/7/19 17:11, Michael S. Tsirkin 写道:
>>> On Wed, Jul 19, 2023 at 03:53:45PM +0800, Hawkins Jiawei wrote:
 This patchset allows QEMU to delay polling and checking the device
 used buffer until either the SVQ is full or control commands shadow
 buffers are full, instead of polling and checking immediately after
 sending each SVQ control command, so that QEMU can send all the SVQ
 control commands in parallel, which have better performance improvement.

 I use vp_vdpa device to simulate vdpa device, and create 4094 VLANS in
 guest to build a test environment for sending multiple CVQ state load
 commands. This patch series can improve latency from 10023 us to
 8697 us for about 4099 CVQ state load commands, about 0.32 us per command.
>>>
>>> Looks like a tiny improvement.
>>> At the same time we have O(n^2) behaviour with memory mappings.
>>
>> Hi Michael,
>>
>> Thanks for your review.
>>
>> I wonder why you say "we have O(n^2) behaviour on memory mappings" here?
>>
>>   From my understanding, QEMU maps two page-size buffers as control
>> commands shadow buffers at device startup. These buffers then are used
>> to cache SVQ control commands, where QEMU fills them with multiple SVQ 
>> control
>> commands bytes, flushes them when SVQ descriptors are full or these
>> control commands shadow buffers reach their capacity.
>>
>> QEMU repeats this process until all CVQ state load commands have been
>> sent in loading.
>>
>> In this loading process, only control commands shadow buffers
>> translation should be relative to memory mappings, which should be
>> O(log n) behaviour to my understanding(Please correct me if I am wrong).
>>
>>> Not saying we must not do this but I think it's worth
>>> checking where the bottleneck is. My guess would be
>>> vp_vdpa is not doing things in parallel. Want to try fixing that
>>
>> As for "vp_vdpa is not doing things in parallel.", do you mean
>> the vp_vdpa device cannot process QEMU's SVQ control commands
>> in parallel?
>>
>> In this situation, I will try to use real vdpa hardware to
>> test the patch series performance.
>>
>>> to see how far it can be pushed?
>>
>> Currently, I am involved in the "Add virtio-net Control Virtqueue state
>> restore support" project in Google Summer of Code now. Because I am
>> uncertain about the time it will take to fix that problem in the vp_vdpa
>> device, I prefer to complete the gsoc project first.
>>
>> Thanks!
>>
>>
>>>
>>>
 Note that this patch should be based on
 patch "Vhost-vdpa Shadow Virtqueue VLAN support" at [1].

 [1]. https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg03719.html

 TestStep
 
 1. regression testing using vp-vdpa device
 - For L0 guest, boot QEMU with two virtio-net-pci net device with
 `ctrl_vq`, `ctrl_rx`, `ctrl_rx_extra` features on, command line like:
 -device virtio-net-pci,disable-legacy=on,disable-modern=off,
 iommu_platform=on,mq=on,ctrl_vq=on,guest_announce=off,
 indirect_desc=off,queue_reset=off,ctrl_rx=on,ctrl_rx_extra=on,...

 - For L1 guest, apply the patch series and compile the source code,
 start QEMU with two vdpa device with svq mode on, enable the `ctrl_vq`,
 `ctrl_rx`, `ctrl_rx_extra` features on, command line like:
 -netdev type=vhost-vdpa,x-svq=true,...
 -device virtio-net-pci,mq=on,guest_announce=off,ctrl_vq=on,
 ctrl_rx=on,ctrl_rx_extra=on...

 - For L2 source guest, run the following bash command:
 ```bash
 #!/bin/sh

 for idx1 in {0..9}
 do
 for idx2 in {0..9}
 do
   for idx3 in {0..6}
   do
 ip link add macvlan$idx1$idx2$idx3 link eth0
 address 4a:30:10:19:$idx1$idx2:1$idx3 type macvlan mode bridge
 ip link set macvlan$idx1$idx2$idx3 up
   done
 done
 done
 ```
 - Execute the live migration in L2 source monitor

 - Result
   * with this series, QEMU should not trigger any error or warning.



 2. perf using vp-vdpa device
 - For L0 guest, boot QEMU with two virtio-net-pci net device with
 `ctrl_vq`, `ctrl_vlan` features on, command line like:
 -device virtio-net-pci,disable-legacy=on,disable-modern=off,
 

[PATCH 2/4] virtio-net: Added USO flags to vhost support.

2023-07-19 Thread Yuri Benditovich
From: Andrew Melnychenko 

New features are subject to check with vhost-user and vdpa.

Signed-off-by: Yuri Benditovich 
Signed-off-by: Andrew Melnychenko 
---
 hw/net/vhost_net.c | 3 +++
 net/vhost-vdpa.c   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 6b958d6363..57427a3997 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -78,6 +78,9 @@ static const int user_feature_bits[] = {
 VIRTIO_F_RING_RESET,
 VIRTIO_NET_F_RSS,
 VIRTIO_NET_F_HASH_REPORT,
+VIRTIO_NET_F_GUEST_USO4,
+VIRTIO_NET_F_GUEST_USO6,
+VIRTIO_NET_F_HOST_USO,
 
 /* This bit implies RARP isn't sent by QEMU out of band */
 VIRTIO_NET_F_GUEST_ANNOUNCE,
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 9795306742..1dca37aae2 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -75,11 +75,14 @@ const int vdpa_feature_bits[] = {
 VIRTIO_NET_F_GUEST_TSO4,
 VIRTIO_NET_F_GUEST_TSO6,
 VIRTIO_NET_F_GUEST_UFO,
+VIRTIO_NET_F_GUEST_USO4,
+VIRTIO_NET_F_GUEST_USO6,
 VIRTIO_NET_F_HASH_REPORT,
 VIRTIO_NET_F_HOST_ECN,
 VIRTIO_NET_F_HOST_TSO4,
 VIRTIO_NET_F_HOST_TSO6,
 VIRTIO_NET_F_HOST_UFO,
+VIRTIO_NET_F_HOST_USO,
 VIRTIO_NET_F_MQ,
 VIRTIO_NET_F_MRG_RXBUF,
 VIRTIO_NET_F_MTU,
-- 
2.34.3




[PATCH 1/4] tap: Added USO support to tap device.

2023-07-19 Thread Yuri Benditovich
From: Andrew Melnychenko 

Passing additional parameters (USOv4 and USOv6 offloads) when
setting TAP offloads

Signed-off-by: Yuri Benditovich 
Signed-off-by: Andrew Melnychenko 
---
 hw/net/e1000e_core.c |  2 +-
 hw/net/igb_core.c|  2 +-
 hw/net/virtio-net.c  |  4 +++-
 hw/net/vmxnet3.c |  2 ++
 include/net/net.h|  4 ++--
 net/net.c|  4 ++--
 net/tap-bsd.c|  2 +-
 net/tap-linux.c  | 15 ---
 net/tap-linux.h  |  2 ++
 net/tap-solaris.c|  2 +-
 net/tap-stub.c   |  2 +-
 net/tap-win32.c  |  2 +-
 net/tap.c|  6 +++---
 net/tap_int.h|  3 ++-
 14 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index f8aeafa16b..d4055956ad 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -2852,7 +2852,7 @@ e1000e_update_rx_offloads(E1000ECore *core)
 
 if (core->has_vnet) {
 qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
- cso_state, 0, 0, 0, 0);
+ cso_state, 0, 0, 0, 0, 0, 0);
 }
 }
 
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 8b6b75c522..389eef1549 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -2753,7 +2753,7 @@ igb_update_rx_offloads(IGBCore *core)
 
 if (core->has_vnet) {
 qemu_set_offload(qemu_get_queue(core->owner_nic)->peer,
- cso_state, 0, 0, 0, 0);
+ cso_state, 0, 0, 0, 0, 0, 0);
 }
 }
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 7102ec4817..d2311e7d6e 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -859,7 +859,9 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
 !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
 !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
 !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
-!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
+!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
+!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
+!!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
 }
 
 static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 5dfacb1098..886adae42b 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -1341,6 +1341,8 @@ static void vmxnet3_update_features(VMXNET3State *s)
  s->lro_supported,
  s->lro_supported,
  0,
+ 0,
+ 0,
  0);
 }
 }
diff --git a/include/net/net.h b/include/net/net.h
index 1448d00afb..b5ccfbbffb 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -58,7 +58,7 @@ typedef bool (HasVnetHdr)(NetClientState *);
 typedef bool (HasVnetHdrLen)(NetClientState *, int);
 typedef bool (GetUsingVnetHdr)(NetClientState *);
 typedef void (UsingVnetHdr)(NetClientState *, bool);
-typedef void (SetOffload)(NetClientState *, int, int, int, int, int);
+typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int);
 typedef int (GetVnetHdrLen)(NetClientState *);
 typedef void (SetVnetHdrLen)(NetClientState *, int);
 typedef int (SetVnetLE)(NetClientState *, bool);
@@ -192,7 +192,7 @@ bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
 bool qemu_get_using_vnet_hdr(NetClientState *nc);
 void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
 void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
-  int ecn, int ufo);
+  int ecn, int ufo, int uso4, int uso6);
 int qemu_get_vnet_hdr_len(NetClientState *nc);
 void qemu_set_vnet_hdr_len(NetClientState *nc, int len);
 int qemu_set_vnet_le(NetClientState *nc, bool is_le);
diff --git a/net/net.c b/net/net.c
index 6492ad530e..543e6dec43 100644
--- a/net/net.c
+++ b/net/net.c
@@ -532,13 +532,13 @@ void qemu_using_vnet_hdr(NetClientState *nc, bool enable)
 }
 
 void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
-  int ecn, int ufo)
+  int ecn, int ufo, int uso4, int uso6)
 {
 if (!nc || !nc->info->set_offload) {
 return;
 }
 
-nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo);
+nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo, uso4, uso6);
 }
 
 int qemu_get_vnet_hdr_len(NetClientState *nc)
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index 4c98fdd337..abd16a2ad2 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -232,7 +232,7 @@ int tap_fd_set_vnet_be(int fd, int is_be)
 }
 
 void tap_fd_set_offload(int fd, int csum, int tso4,
-int tso6, int ecn, int ufo)
+int tso6, int ecn, int ufo, int uso4, int uso6)
 {
 }
 
diff --git a/net/tap-linux.c 

[PATCH 0/4] virtio-net: add USO feature (UDP segmentation offload)

2023-07-19 Thread Yuri Benditovich
Starting from 6.2 the kernel supports UDP segmentation offload, the
kernel uses GSO_UDP_L4 to mark packets with USO segmentation request


Andrew Melnychenko (3):
  tap: Added USO support to tap device.
  virtio-net: Added USO flags to vhost support.
  virtio-net: Added uso check

Yuri Benditovich (1):
  virtio-net: added USO support

 hw/net/e1000e_core.c |  2 +-
 hw/net/igb_core.c|  2 +-
 hw/net/vhost_net.c   |  3 +++
 hw/net/virtio-net.c  | 35 ---
 hw/net/vmxnet3.c |  2 ++
 include/net/net.h|  7 +--
 net/net.c| 13 +++--
 net/tap-bsd.c|  7 ++-
 net/tap-linux.c  | 27 ---
 net/tap-linux.h  |  2 ++
 net/tap-solaris.c|  7 ++-
 net/tap-stub.c   |  7 ++-
 net/tap-win32.c  |  2 +-
 net/tap.c| 18 +++---
 net/tap_int.h|  4 +++-
 net/vhost-vdpa.c |  3 +++
 16 files changed, 121 insertions(+), 20 deletions(-)

-- 
2.34.3




[PATCH 4/4] virtio-net: Added uso check

2023-07-19 Thread Yuri Benditovich
From: Andrew Melnychenko 

Added tap uso check with stubs for non-Linux systems.

Signed-off-by: Yuri Benditovich 
Signed-off-by: Andrew Melnychenko 
---
 hw/net/virtio-net.c | 15 +++
 include/net/net.h   |  3 +++
 net/net.c   |  9 +
 net/tap-bsd.c   |  5 +
 net/tap-linux.c | 12 
 net/tap-solaris.c   |  5 +
 net/tap-stub.c  |  5 +
 net/tap.c   | 12 
 net/tap_int.h   |  1 +
 9 files changed, 67 insertions(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index e76cad923b..d950d3a77f 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -659,6 +659,15 @@ static int peer_has_ufo(VirtIONet *n)
 return n->has_ufo;
 }
 
+static int peer_has_uso(VirtIONet *n)
+{
+if (!peer_has_vnet_hdr(n)) {
+return 0;
+}
+
+return qemu_has_uso(qemu_get_queue(n->nic)->peer);
+}
+
 static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
int version_1, int hash_report)
 {
@@ -808,6 +817,12 @@ static uint64_t virtio_net_get_features(VirtIODevice 
*vdev, uint64_t features,
 virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
 }
 
+if (!peer_has_uso(n)) {
+virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
+virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
+}
+
 if (!get_vhost_net(nc->peer)) {
 return features;
 }
diff --git a/include/net/net.h b/include/net/net.h
index b5ccfbbffb..330d285930 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -54,6 +54,7 @@ typedef void (LinkStatusChanged)(NetClientState *);
 typedef void (NetClientDestructor)(NetClientState *);
 typedef RxFilterInfo *(QueryRxFilter)(NetClientState *);
 typedef bool (HasUfo)(NetClientState *);
+typedef bool (HasUso)(NetClientState *);
 typedef bool (HasVnetHdr)(NetClientState *);
 typedef bool (HasVnetHdrLen)(NetClientState *, int);
 typedef bool (GetUsingVnetHdr)(NetClientState *);
@@ -84,6 +85,7 @@ typedef struct NetClientInfo {
 QueryRxFilter *query_rx_filter;
 NetPoll *poll;
 HasUfo *has_ufo;
+HasUso *has_uso;
 HasVnetHdr *has_vnet_hdr;
 HasVnetHdrLen *has_vnet_hdr_len;
 GetUsingVnetHdr *get_using_vnet_hdr;
@@ -187,6 +189,7 @@ void qemu_set_info_str(NetClientState *nc,
const char *fmt, ...) G_GNUC_PRINTF(2, 3);
 void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
 bool qemu_has_ufo(NetClientState *nc);
+bool qemu_has_uso(NetClientState *nc);
 bool qemu_has_vnet_hdr(NetClientState *nc);
 bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
 bool qemu_get_using_vnet_hdr(NetClientState *nc);
diff --git a/net/net.c b/net/net.c
index 543e6dec43..b110e61f66 100644
--- a/net/net.c
+++ b/net/net.c
@@ -495,6 +495,15 @@ bool qemu_has_ufo(NetClientState *nc)
 return nc->info->has_ufo(nc);
 }
 
+bool qemu_has_uso(NetClientState *nc)
+{
+if (!nc || !nc->info->has_uso) {
+return false;
+}
+
+return nc->info->has_uso(nc);
+}
+
 bool qemu_has_vnet_hdr(NetClientState *nc)
 {
 if (!nc || !nc->info->has_vnet_hdr) {
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index abd16a2ad2..274ea7bd2c 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -212,6 +212,11 @@ int tap_probe_has_ufo(int fd)
 return 0;
 }
 
+int tap_probe_has_uso(int fd)
+{
+return 0;
+}
+
 int tap_probe_vnet_hdr_len(int fd, int len)
 {
 return 0;
diff --git a/net/tap-linux.c b/net/tap-linux.c
index 30fcca1bc2..c7e514ecb0 100644
--- a/net/tap-linux.c
+++ b/net/tap-linux.c
@@ -173,6 +173,18 @@ int tap_probe_has_ufo(int fd)
 return 1;
 }
 
+int tap_probe_has_uso(int fd)
+{
+unsigned offload;
+
+offload = TUN_F_CSUM | TUN_F_USO4 | TUN_F_USO6;
+
+if (ioctl(fd, TUNSETOFFLOAD, offload) < 0) {
+return 0;
+}
+return 1;
+}
+
 /* Verify that we can assign given length */
 int tap_probe_vnet_hdr_len(int fd, int len)
 {
diff --git a/net/tap-solaris.c b/net/tap-solaris.c
index a617a10e5c..08b13af512 100644
--- a/net/tap-solaris.c
+++ b/net/tap-solaris.c
@@ -216,6 +216,11 @@ int tap_probe_has_ufo(int fd)
 return 0;
 }
 
+int tap_probe_has_uso(int fd)
+{
+return 0;
+}
+
 int tap_probe_vnet_hdr_len(int fd, int len)
 {
 return 0;
diff --git a/net/tap-stub.c b/net/tap-stub.c
index ac8dfc03b4..4b24f61e3a 100644
--- a/net/tap-stub.c
+++ b/net/tap-stub.c
@@ -47,6 +47,11 @@ int tap_probe_has_ufo(int fd)
 return 0;
 }
 
+int tap_probe_has_uso(int fd)
+{
+return 0;
+}
+
 int tap_probe_vnet_hdr_len(int fd, int len)
 {
 return 0;
diff --git a/net/tap.c b/net/tap.c
index 14ea4ef26f..bcea8d03f9 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -57,6 +57,7 @@ typedef struct TAPState {
 bool write_poll;
 bool using_vnet_hdr;
 bool has_ufo;
+bool has_uso;
 bool enabled;
 VHostNetState *vhost_net;
 unsigned host_vnet_hdr_len;
@@ -237,6 +238,15 @@ static bool 

Re: Boot failure after QEMU's upgrade to OpenSBI v1.3 (was Re: [PATCH for-8.2 6/7] target/riscv: add 'max' CPU type)

2023-07-19 Thread Anup Patel
On Wed, Jul 19, 2023 at 3:23 PM Alistair Francis  wrote:
>
> On Wed, Jul 19, 2023 at 3:39 PM Anup Patel  wrote:
> >
> > On Wed, Jul 19, 2023 at 7:03 AM Alistair Francis  
> > wrote:
> > >
> > > On Sat, Jul 15, 2023 at 7:14 PM Atish Patra  wrote:
> > > >
> > > > On Fri, Jul 14, 2023 at 5:29 AM Conor Dooley  wrote:
> > > > >
> > > > > On Fri, Jul 14, 2023 at 11:19:34AM +0100, Conor Dooley wrote:
> > > > > > On Fri, Jul 14, 2023 at 10:00:19AM +0530, Anup Patel wrote:
> > > > > >
> > > > > > > > > OpenSBI v1.3
> > > > > > > > >_  _
> > > > > > > > >   / __ \  / |  _ \_   _|
> > > > > > > > >  | |  | |_ __   ___ _ __ | (___ | |_) || |
> > > > > > > > >  | |  | | '_ \ / _ \ '_ \ \___ \|  _ < | |
> > > > > > > > >  | |__| | |_) |  __/ | | |) | |_) || |_
> > > > > > > > >   \/| .__/ \___|_| |_|_/|___/_|
> > > > > > > > > | |
> > > > > > > > > |_|
> > > > > > > > >
> > > > > > > > > init_coldboot: ipi init failed (error -1009)
> > > > > > > > >
> > > > > > > > > Just to note, because we use our own firmware that vendors in 
> > > > > > > > > OpenSBI
> > > > > > > > > and compiles only a significantly cut down number of files 
> > > > > > > > > from it, we
> > > > > > > > > do not use the fw_dynamic etc flow on our hardware. As a 
> > > > > > > > > result, we have
> > > > > > > > > not tested v1.3, nor do we have any immediate plans to change 
> > > > > > > > > our
> > > > > > > > > platform firmware to vendor v1.3 either.
> > > > > > > > >
> > > > > > > > > I unless there's something obvious to you, it sounds like I 
> > > > > > > > > will need to
> > > > > > > > > go and bisect OpenSBI. That's a job for another day though, 
> > > > > > > > > given the
> > > > > > > > > time.
> > > > > > > > >
> > > > > > >
> > > > > > > The real issue is some CPU/HART DT nodes marked as disabled in the
> > > > > > > DT passed to OpenSBI 1.3.
> > > > > > >
> > > > > > > This issue does not exist in any of the DTs generated by QEMU but 
> > > > > > > some
> > > > > > > of the DTs in the kernel (such as microchip and SiFive board DTs) 
> > > > > > > have
> > > > > > > the E-core disabled.
> > > > > > >
> > > > > > > I had discovered this issue in a totally different context after 
> > > > > > > the OpenSBI 1.3
> > > > > > > release happened. This issue is already fixed in the latest 
> > > > > > > OpenSBI by the
> > > > > > > following commit c6a35733b74aeff612398f274ed19a74f81d1f37 ("lib: 
> > > > > > > utils:
> > > > > > > Fix sbi_hartid_to_scratch() usage in ACLINT drivers").
> > > > > >
> > > > > > Great, thanks Anup! I thought I had tested tip-of-tree too, but
> > > > > > obviously not.
> > > > > >
> > > > > > > I always assumed that Microchip hss.bin is the preferred BIOS for 
> > > > > > > the
> > > > > > > QEMU microchip-icicle-kit machine but I guess that's not true.
> > > > > >
> > > > > > Unfortunately the HSS has not worked in QEMU for a long time, and 
> > > > > > while
> > > > > > I would love to fix it, but am pretty stretched for spare time to 
> > > > > > begin
> > > > > > with.
> > > > > > I usually just do direct kernel boots, which use the OpenSBI that 
> > > > > > comes
> > > > > > with QEMU, as I am sure you already know :)
> > > > > >
> > > > > > > At this point, you can either:
> > > > > > > 1) Use latest OpenSBI on QEMU microchip-icicle-kit machine
> > > > >
> > > > > I forgot to reply to this point, wondering what should be done with
> > > > > QEMU. Bumping to v1.3 in QEMU introduces a regression here, regardless
> > > > > of whether I can go and build a fixed version of OpenSBI.
> > > > >
> > > > FYI: The no-map fix went in OpenSBI v1.3. Without the upgrade, any
> > > > user using the latest kernel (> v6.4)
> > > > may hit those random linear map related issues (in hibernation or EFI
> > > > booting path).
> > > >
> > > > There are three possible scenarios:
> > > >
> > > > 1. Upgrade to OpenSBI v1.3: Any user of microchip-icicle-kit machine
> > > > or sifive fu540 machine users
> > > > may hit this issue if the device tree has the disabled hart (e core).
> > > > 2. No upgrade to OpenSBI v1.2. Any user using hibernation or UEFI may
> > > > have issues [1]
> > > > 3. Include a non-release version OpenSBI in Qemu with the fix as an 
> > > > exception.
> > > >
> > > > #3 probably deviates from policy and sets a bad precedent. So I am not
> > > > advocating for it though ;)
> > > > For both #1 & #2, the solution would be to use the latest OpenSBI in
> > > > -bios argument instead of the stock one.
> > > > I could be wrong but my guess is the number of users facing #2 would
> > > > be higher than #1.
> > >
> > > Thanks for that info Atish!
> > >
> > > We are stuck in a bad situation.
> > >
> > > The best solution would be if OpenSBI can release a 1.3.1, @Anup Patel
> > > do you think you could do that?
> >
> > OpenSBI has a major number and minor number in the version but it does
> > not have release/patch number so best 

[PATCH 3/4] virtio-net: added USO support

2023-07-19 Thread Yuri Benditovich
virtio-net can suggest USO features TX, RX v4 and RX v6,
depending on kernel TUN ability to support them. These
features require explicit enable in command-line.

Signed-off-by: Yuri Benditovich 
---
 hw/net/virtio-net.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index d2311e7d6e..e76cad923b 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -796,6 +796,10 @@ static uint64_t virtio_net_get_features(VirtIODevice 
*vdev, uint64_t features,
 virtio_clear_feature(, VIRTIO_NET_F_GUEST_TSO6);
 virtio_clear_feature(, VIRTIO_NET_F_GUEST_ECN);
 
+virtio_clear_feature(, VIRTIO_NET_F_HOST_USO);
+virtio_clear_feature(, VIRTIO_NET_F_GUEST_USO4);
+virtio_clear_feature(, VIRTIO_NET_F_GUEST_USO6);
+
 virtio_clear_feature(, VIRTIO_NET_F_HASH_REPORT);
 }
 
@@ -864,14 +868,16 @@ static void virtio_net_apply_guest_offloads(VirtIONet *n)
 !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
 }
 
-static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
+static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
 {
 static const uint64_t guest_offloads_mask =
 (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
 (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
 (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
 (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
-(1ULL << VIRTIO_NET_F_GUEST_UFO);
+(1ULL << VIRTIO_NET_F_GUEST_UFO)  |
+(1ULL << VIRTIO_NET_F_GUEST_USO4) |
+(1ULL << VIRTIO_NET_F_GUEST_USO6);
 
 return guest_offloads_mask & features;
 }
@@ -3924,6 +3930,12 @@ static Property virtio_net_properties[] = {
 DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
 DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
 DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
+DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
+  VIRTIO_NET_F_GUEST_USO4, false),
+DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
+  VIRTIO_NET_F_GUEST_USO6, false),
+DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
+  VIRTIO_NET_F_HOST_USO, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
-- 
2.34.3




Re: [PATCH v8 3/9] migration: convert socket backend to accept MigrateAddress

2023-07-19 Thread Daniel P . Berrangé
On Thu, Jul 13, 2023 at 10:57:07AM +, Het Gala wrote:
> Socket transport backend for 'migrate'/'migrate-incoming' QAPIs accept
> new wire protocol of MigrateAddress struct.
> 
> It is achieved by parsing 'uri' string and storing migration parameters
> required for socket connection into well defined SocketAddress struct.
> 
> Suggested-by: Aravind Retnakaran 
> Signed-off-by: Het Gala 
> ---
>  migration/migration.c | 32 +++-
>  migration/socket.c| 34 +-
>  migration/socket.h|  7 ---
>  3 files changed, 28 insertions(+), 45 deletions(-)

> diff --git a/migration/socket.c b/migration/socket.c
> index 1b6f5baefb..8e7430b266 100644
> --- a/migration/socket.c
> +++ b/migration/socket.c
> @@ -108,10 +108,9 @@ out:
>  object_unref(OBJECT(sioc));
>  }
>  
> -static void
> -socket_start_outgoing_migration_internal(MigrationState *s,
> - SocketAddress *saddr,
> - Error **errp)
> +void socket_start_outgoing_migration(MigrationState *s,
> + SocketAddress *saddr,
> + Error **errp)
>  {
>  QIOChannelSocket *sioc = qio_channel_socket_new();
>  struct SocketConnectData *data = g_new0(struct SocketConnectData, 1);
> @@ -135,18 +134,6 @@ socket_start_outgoing_migration_internal(MigrationState 
> *s,
>   NULL);
>  }
>  
> -void socket_start_outgoing_migration(MigrationState *s,
> - const char *str,
> - Error **errp)
> -{
> -Error *err = NULL;
> -SocketAddress *saddr = socket_parse(str, );
> -if (!err) {
> -socket_start_outgoing_migration_internal(s, saddr, );
> -}
> -error_propagate(errp, err);
> -}

In this original code, socket_start_outgoing_migration would allocate
the SocketAddress, and then call socket_start_outgoing_migration_internal
which would take ownership of it. This is fine.

In the new code, the caller of socket_start_outgoing_migration
owns the SocketAddress. So socket_start_outgoing_migration must
create its own copy. IOW, this patch is where the QAPI_CLONE
additions from patch 8 must be put.


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




Re: [PATCH v3 0/8] vdpa: Send all CVQ state load commands in parallel

2023-07-19 Thread Hawkins Jiawei
在 2023/7/19 20:46, Michael S. Tsirkin 写道:
> On Wed, Jul 19, 2023 at 08:35:50PM +0800, Hawkins Jiawei wrote:
>> 在 2023/7/19 17:11, Michael S. Tsirkin 写道:
>>> On Wed, Jul 19, 2023 at 03:53:45PM +0800, Hawkins Jiawei wrote:
 This patchset allows QEMU to delay polling and checking the device
 used buffer until either the SVQ is full or control commands shadow
 buffers are full, instead of polling and checking immediately after
 sending each SVQ control command, so that QEMU can send all the SVQ
 control commands in parallel, which improves performance.

 I use vp_vdpa device to simulate vdpa device, and create 4094 VLANS in
 guest to build a test environment for sending multiple CVQ state load
 commands. This patch series can improve latency from 10023 us to
 8697 us for about 4099 CVQ state load commands, about 0.32 us per command.
>>>
>>> Looks like a tiny improvement.
>>> At the same time we have O(n^2) behaviour with memory mappings.
>>
>> Hi Michael,
>>
>> Thanks for your review.
>>
>> I wonder why you say "we have O(n^2) behaviour on memory mappings" here?
>
> it's not specific to virtio - it's related to device init.
> generally each device has some memory. during boot bios
> enables each individually O(n) where n is # of devices.
> memory maps has to be updated and in qemu this update
> is at least superlinear with n (more like O(n log n) I think).
> This gets up > O(n^2) with n number of devices.

Thanks for your explanation.


>
>>   From my understanding, QEMU maps two page-size buffers as control
>> commands shadow buffers at device startup. These buffers then are used
>> to cache SVQ control commands, where QEMU fills them with the bytes of
>> multiple SVQ control commands, and flushes them when the SVQ descriptors
>> are full or these control commands shadow buffers reach their capacity.
>>
>> QEMU repeats this process until all CVQ state load commands have been
>> sent in loading.
>>
>> In this loading process, only control commands shadow buffers
>> translation should be relative to memory mappings, which should be
>> O(log n) behaviour to my understanding (please correct me if I am wrong).
>>
>>> Not saying we must not do this but I think it's worth
>>> checking where the bottleneck is. My guess would be
>>> vp_vdpa is not doing things in parallel. Want to try fixing that
>>
>> As for "vp_vdpa is not doing things in parallel.", do you mean
>> the vp_vdpa device cannot process QEMU's SVQ control commands
>> in parallel?
>>
>> In this situation, I will try to use real vdpa hardware to
>> test the patch series performance.
>
> yea, pls do that.
>
>>> to see how far it can be pushed?
>>
>> Currently, I am involved in the "Add virtio-net Control Virtqueue state
>> restore support" project in Google Summer of Code now. Because I am
>> uncertain about the time it will take to fix that problem in the vp_vdpa
>> device, I prefer to complete the gsoc project first.
>>
>> Thanks!
>>
>>
>>>
>>>
 Note that this patch should be based on
 patch "Vhost-vdpa Shadow Virtqueue VLAN support" at [1].

 [1]. https://lists.gnu.org/archive/html/qemu-devel/2023-07/msg03719.html

 TestStep
 
 1. regression testing using vp-vdpa device
 - For L0 guest, boot QEMU with two virtio-net-pci net device with
 `ctrl_vq`, `ctrl_rx`, `ctrl_rx_extra` features on, command line like:
 -device virtio-net-pci,disable-legacy=on,disable-modern=off,
 iommu_platform=on,mq=on,ctrl_vq=on,guest_announce=off,
 indirect_desc=off,queue_reset=off,ctrl_rx=on,ctrl_rx_extra=on,...

 - For L1 guest, apply the patch series and compile the source code,
 start QEMU with two vdpa device with svq mode on, enable the `ctrl_vq`,
 `ctrl_rx`, `ctrl_rx_extra` features on, command line like:
 -netdev type=vhost-vdpa,x-svq=true,...
 -device virtio-net-pci,mq=on,guest_announce=off,ctrl_vq=on,
 ctrl_rx=on,ctrl_rx_extra=on...

 - For L2 source guest, run the following bash command:
 ```bash
 #!/bin/sh

 for idx1 in {0..9}
 do
 for idx2 in {0..9}
 do
   for idx3 in {0..6}
   do
 ip link add macvlan$idx1$idx2$idx3 link eth0
 address 4a:30:10:19:$idx1$idx2:1$idx3 type macvlan mode bridge
 ip link set macvlan$idx1$idx2$idx3 up
   done
 done
 done
 ```
 - Execute the live migration in L2 source monitor

 - Result
   * with this series, QEMU should not trigger any error or warning.



 2. perf using vp-vdpa device
 - For L0 guest, boot QEMU with two virtio-net-pci net device with
 `ctrl_vq`, `ctrl_vlan` features on, command line like:
 -device virtio-net-pci,disable-legacy=on,disable-modern=off,
 iommu_platform=on,mq=on,ctrl_vq=on,guest_announce=off,
 indirect_desc=off,queue_reset=off,ctrl_vlan=on,...

 - For 

Re: [PATCH 5/6] vhost-vdpa: Match vhost-user's status reset

2023-07-19 Thread Stefan Hajnoczi
On Wed, 19 Jul 2023 at 10:10, Hanna Czenczek  wrote:
>
> On 18.07.23 16:50, Stefan Hajnoczi wrote:
> > On Tue, Jul 11, 2023 at 05:52:27PM +0200, Hanna Czenczek wrote:
> >> vhost-vdpa and vhost-user differ in how they reset the status in their
> >> respective vhost_reset_status implementations: vhost-vdpa zeroes it,
> >> then re-adds the S_ACKNOWLEDGE and S_DRIVER config bits.  S_DRIVER_OK is
> >> then set in vhost_vdpa_dev_start().
> >>
> >> vhost-user in contrast just zeroes the status, and does not re-add any
> >> config bits until vhost_user_dev_start() (where it does re-add all of
> >> S_ACKNOWLEDGE, S_DRIVER, and S_DRIVER_OK).
> >>
> >> There is no documentation for vhost_reset_status, but its only caller is
> >> vhost_dev_stop().  So apparently, the device is to be stopped after
> >> vhost_reset_status, and therefore it makes more sense to keep the status
> >> field fully cleared until the back-end is re-started, which is how
> >> vhost-user does it.  Make vhost-vdpa do the same -- if nothing else it's
> >> confusing to have both vhost implementations handle this differently.
> >>
> >> Signed-off-by: Hanna Czenczek 
> >> ---
> >>   hw/virtio/vhost-vdpa.c | 6 +++---
> >>   1 file changed, 3 insertions(+), 3 deletions(-)
> > Hi Hanna,
> > The VIRTIO spec lists the Device Initialization sequence including the
> > bits set in the Device Status Register here:
> > https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-1070001
> >
> > ACKNOWLEDGE and DRIVER must be set before FEATURES_OK. DRIVER_OK is set
> > after FEATURES_OK.
> >
> > The driver may read the Device Configuration Space once ACKNOWLEDGE and
> > DRIVER are set.
> >
> > QEMU's vhost code should follow this sequence (especially for vDPA where
> > full VIRTIO devices are implemented).
> >
> > vhost-user is not faithful to the VIRTIO spec here. That's probably due
> > to the fact that vhost-user didn't have the concept of the Device Status
> > Register until recently and back-ends mostly ignore it.
> >
> > Please do the opposite of this patch: bring vhost-user in line with the
> > VIRTIO specification so that the Device Initialization sequence is
> > followed correctly. I think vhost-vdpa already does the right thing.
>
> Hm.  This sounds all very good, but what leaves me lost is the fact that
> we never actually expose the status field to the guest, as far as I can
> see.  We have no set_status callback, and as written in the commit
> message, the only caller of reset_status is vhost_dev_stop().  So the
> status field seems completely artificial in vhost right now.  That is
> why I’m wondering what the flags even really mean.

vhost (including vDPA and vhost-user) is not a 100% passthrough
solution. The VMM emulates a VIRTIO device (e.g. virtio-fs-pci) that
has some separate state from the vhost back-end, including the Device
Status Register. This is analogous to how passthrough PCI devices
still have emulated PCI registers that are not passed through to the
physical PCI device.

However, just because the vDPA, and now vhost-user with the SET_STATUS
message, back-end is not directly exposed to the guest does not mean
it should diverge from the VIRTIO specification for no reason.

> Another point I made in the commit message is that it is strange that we
> reset the status to 0, and then add the ACKNOWLEDGE and DRIVER while the
> VM is still stopped.  It doesn’t make sense to me to set these flags
> while the guest driver is not operative.

While there is no harm in setting those bits, I agree that leaving the
Device Status Register at 0 while the VM is stopped would be nicer.

> If what you’re saying is that we must set FEATURES_OK only after
> ACKNOWLEDGE and DRIVER, wouldn’t it be still better to set all of these
> flags only in vhost_*_dev_start(), but do it in two separate SET_STATUS
> calls?

The device initialization sequence could be put into vhost_dev_start():
1. ACKNOWLEDGE | DRIVER
2. FEATURES_OK via vhost_dev_set_features()
3. DRIVER_OK via ->vhost_dev_start()

But note that the ->vhost_dev_start() callback is too late to set
ACKNOWLEDGE | DRIVER because feature negotiation happens earlier.

> (You mentioned the configuration space – is that accessed while between
> vhost_dev_stop and vhost_dev_start?)

I don't think so.

>
> Hanna
>
> >> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >> index f7fd19a203..0cde8b40de 100644
> >> --- a/hw/virtio/vhost-vdpa.c
> >> +++ b/hw/virtio/vhost-vdpa.c
> >> @@ -1294,8 +1294,6 @@ static void vhost_vdpa_reset_status(struct vhost_dev 
> >> *dev)
> >>   }
> >>
> >>   vhost_vdpa_reset_device(dev);
> >> -vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
> >> -   VIRTIO_CONFIG_S_DRIVER);
> >>   memory_listener_unregister(>listener);
> >>   }
> >>
> >> @@ -1334,7 +1332,9 @@ static int vhost_vdpa_dev_start(struct vhost_dev 
> >> *dev, bool started)
> >>   }
> >>   memory_listener_register(>listener, 

Re: [PULL 0/8] s390x, qtest and misc patches for QEMU 8.1 rc1

2023-07-19 Thread Philippe Mathieu-Daudé

On 19/7/23 14:40, Peter Maydell wrote:

On Tue, 18 Jul 2023 at 10:31, Thomas Huth  wrote:


The following changes since commit 361d5397355276e3007825cc17217c1e4d4320f7:

   Merge tag 'block-pull-request' of https://gitlab.com/stefanha/qemu into 
staging (2023-07-17 15:49:27 +0100)

are available in the Git repository at:

   https://gitlab.com/thuth/qemu.git tags/pull-request-2023-07-18

for you to fetch changes up to a5754847e0fc2bc08a414dd381803009e8bca390:

   tests/avocado: Disable the test_sbsaref_edk2_firmware by default (2023-07-18 
11:22:51 +0200)


* Fix s390x KVM guests when compiling with --without-default-devices
* Fix /proc/cpuinfo features list in s390x linux-user emulation
* Generate FreeBSD VM package list via lcitool
* Disable the flaky test_sbsaref_edk2_firmware avocado test by default





Applied to target-arm.next, thanks.


s/target-arm.next/master/ ;)




[PATCH for-8.1] tests/test-util-filemonitor: Avoid pointless allocations

2023-07-19 Thread Philippe Mathieu-Daudé
Coverity reports a few resource leaks. While they are
harmless, fix them so that they no longer show up in the reports.

Reported-by: Coverity (CID 1432615: RESOURCE_LEAK)
Fixes: 4f370b1098 ("test-util-filemonitor: Skip test on non-x86 Travis 
containers")
Signed-off-by: Philippe Mathieu-Daudé 
---
 tests/unit/test-util-filemonitor.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/unit/test-util-filemonitor.c 
b/tests/unit/test-util-filemonitor.c
index b629e10857..3ca687860d 100644
--- a/tests/unit/test-util-filemonitor.c
+++ b/tests/unit/test-util-filemonitor.c
@@ -398,7 +398,7 @@ test_file_monitor_events(void)
 };
 Error *local_err = NULL;
 GError *gerr = NULL;
-QFileMonitor *mon = qemu_file_monitor_new(_err);
+QFileMonitor *mon;
 QemuThread th;
 GTimer *timer;
 gchar *dir = NULL;
@@ -407,12 +407,9 @@ test_file_monitor_events(void)
 char *pathsrc = NULL;
 char *pathdst = NULL;
 QFileMonitorTestData data;
-GHashTable *ids = g_hash_table_new(g_int64_hash, g_int64_equal);
+GHashTable *ids;
 char *travis_arch;
 
-qemu_mutex_init();
-data.records = NULL;
-
 /*
  * This test does not work on Travis LXD containers since some
  * syscalls are blocked in that environment.
@@ -423,6 +420,12 @@ test_file_monitor_events(void)
 return;
 }
 
+mon = qemu_file_monitor_new(_err);
+ids = g_hash_table_new(g_int64_hash, g_int64_equal);
+
+qemu_mutex_init();
+data.records = NULL;
+
 /*
  * The file monitor needs the main loop running in
  * order to receive events from inotify. We must
-- 
2.38.1




Re: [PATCH v2 14/14] tests/tcg/s390x: Test VCKSM

2023-07-19 Thread Thomas Huth

On 19/07/2023 11.44, Ilya Leoshkevich wrote:

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
  tests/tcg/s390x/Makefile.target |  1 +
  tests/tcg/s390x/vcksm.c | 31 +++
  tests/tcg/s390x/vx.h|  2 ++
  3 files changed, 34 insertions(+)
  create mode 100644 tests/tcg/s390x/vcksm.c


Tested-by: Thomas Huth 





Re: [PATCH v2 13/14] tests/tcg/s390x: Test STPQ

2023-07-19 Thread Thomas Huth

On 19/07/2023 11.44, Ilya Leoshkevich wrote:

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
  tests/tcg/s390x/Makefile.softmmu-target |  1 +
  tests/tcg/s390x/stpq.S  | 20 
  2 files changed, 21 insertions(+)
  create mode 100644 tests/tcg/s390x/stpq.S


Tested-by: Thomas Huth 





Re: [PATCH v2 12/14] tests/tcg/s390x: Test MC

2023-07-19 Thread Thomas Huth

On 19/07/2023 11.44, Ilya Leoshkevich wrote:

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
  tests/tcg/s390x/Makefile.softmmu-target |  1 +
  tests/tcg/s390x/mc.S| 56 +
  2 files changed, 57 insertions(+)
  create mode 100644 tests/tcg/s390x/mc.S


Tested-by: Thomas Huth 





Re: [PATCH v2 11/14] tests/tcg/s390x: Test ICM

2023-07-19 Thread Thomas Huth

On 19/07/2023 11.44, Ilya Leoshkevich wrote:

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
  tests/tcg/s390x/Makefile.softmmu-target |  1 +
  tests/tcg/s390x/icm.S   | 32 +
  2 files changed, 33 insertions(+)
  create mode 100644 tests/tcg/s390x/icm.S


Tested-by: Thomas Huth 




Re: [PATCH v2 10/14] tests/tcg/s390x: Test CLM

2023-07-19 Thread Thomas Huth

On 19/07/2023 11.44, Ilya Leoshkevich wrote:

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
  tests/tcg/s390x/Makefile.softmmu-target |  1 +
  tests/tcg/s390x/clm.S   | 29 +
  2 files changed, 30 insertions(+)
  create mode 100644 tests/tcg/s390x/clm.S


Tested-by: Thomas Huth 





Re: [PATCH v2 09/14] tests/tcg/s390x: Test CLGEBR and CGEBRA

2023-07-19 Thread Thomas Huth

On 19/07/2023 11.44, Ilya Leoshkevich wrote:

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
  tests/tcg/s390x/Makefile.target |  5 +
  tests/tcg/s390x/cgebra.c| 32 
  tests/tcg/s390x/clgebr.c| 32 
  3 files changed, 69 insertions(+)
  create mode 100644 tests/tcg/s390x/cgebra.c
  create mode 100644 tests/tcg/s390x/clgebr.c


Tested-by: Thomas Huth 





Re: [PATCH 6/6] vhost-user: Have reset_status fall back to reset

2023-07-19 Thread Hanna Czenczek

On 19.07.23 16:11, Hanna Czenczek wrote:

On 18.07.23 17:10, Stefan Hajnoczi wrote:

On Tue, Jul 11, 2023 at 05:52:28PM +0200, Hanna Czenczek wrote:

The only user of vhost_user_reset_status() is vhost_dev_stop(), which
only uses it as a fall-back to stop the back-end if it does not support
SUSPEND.  However, vhost-user's implementation is a no-op unless the
back-end supports SET_STATUS.

vhost-vdpa's implementation instead just calls
vhost_vdpa_reset_device(), implying that it's OK to fully reset the
device if SET_STATUS is not supported.

To be fair, vhost_vdpa_reset_device() does nothing but set the status
to zero.  However, that may well be because vhost-vdpa has no method
besides this to reset a device.  In contrast, vhost-user has
RESET_DEVICE and a RESET_OWNER, which can be used instead.

While it is not entirely clear from documentation or git logs, from
discussions and the order of vhost-user protocol features, it appears to
me as if RESET_OWNER originally had no real meaning for vhost-user, and
was thus used to signal a device reset to the back-end.  Then,
RESET_DEVICE was introduced, to have a well-defined dedicated reset
command.  Finally, vhost-user received full STATUS support, including
SET_STATUS, so setting the device status to 0 is now the preferred way
of resetting a device.  Still, RESET_DEVICE and RESET_OWNER should
remain valid as fall-backs.

Therefore, have vhost_user_reset_status() fall back to
vhost_user_reset_device() if the back-end has no STATUS support.

Signed-off-by: Hanna Czenczek 
---
  hw/virtio/vhost-user.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 4507de5a92..53a881ec2a 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -2833,6 +2833,8 @@ static void vhost_user_reset_status(struct 
vhost_dev *dev)

  if (virtio_has_feature(dev->protocol_features,
 VHOST_USER_PROTOCOL_F_STATUS)) {
  vhost_user_set_status(dev, 0);
+    } else {
+    vhost_user_reset_device(dev);
  }
  }

Did you check whether DPDK treats setting the status to 0 as equivalent
to RESET_DEVICE?


If it doesn’t, what’s even the point of using reset_status?


Sorry, I’m being unclear, and I think this may be important because it 
ties into the question from patch 1, what qemu is even trying to do by 
running SET_STATUS(0) in vhost_dev_stop(), so here’s what gave me the 
impression that SET_STATUS(0) and RESET_DEVICE should be equivalent:


vhost-vdpa.c runs SET_STATUS(0) in a function called 
vhost_vdpa_reset_device().  This is one thing that gave me the 
impression that this is about an actual full reset.


Another is the whole discussion that we’ve had.  vhost_dev_stop() does 
not call a `vhost_reset_device()` function, it calls 
`vhost_reset_status()`.  Still, we were always talking about resetting 
the device.


It doesn’t make sense to me that vDPA would provide no function to fully 
reset a device, while vhost-user does.  Being able to reset a device 
sounds vital to me.  This also gave me the impression that SET_STATUS(0) 
on vDPA at least is functionally equivalent to a full device reset.


Maybe SET_STATUS(0) does mean a full device reset on vDPA, but not on 
vhost-user.  That would be a real shame, so I assumed this would not be 
the case; that SET_STATUS(0) does the same thing on both protocols.


The virtio specification says “Writing 0 into this field resets the 
device.” about the device_status field.


This also makes sense, because the device_status field is basically used 
to tell the device that a driver has taken control.  If reset, this 
indicates the driver has given up control, and to me this is a point 
where a device should fully reset itself.


So all in all, I can’t see the rationale why any implementation that 
supports SET_STATUS would decide to treat SET_STATUS(0) not as 
equivalent or a superset of RESET_DEVICE.  I may be wrong, and this 
might explain a whole deal about what kind of background operations we 
hope to stop with SET_STATUS(0).


Hanna




Re: [PATCH v21 20/20] tests/avocado: s390x cpu topology bad move

2023-07-19 Thread Pierre Morel



On 7/5/23 12:32, Thomas Huth wrote:

On 30/06/2023 11.17, Pierre Morel wrote:

This test verifies that QEMU refuses to move a CPU to an
unexistant location.


s/unexistant/nonexistent/ ?


Signed-off-by: Pierre Morel 
---
  tests/avocado/s390_topology.py | 25 +
  1 file changed, 25 insertions(+)

diff --git a/tests/avocado/s390_topology.py 
b/tests/avocado/s390_topology.py

index 99d9508cef..ea39168b53 100644
--- a/tests/avocado/s390_topology.py
+++ b/tests/avocado/s390_topology.py
@@ -388,3 +388,28 @@ def test_dedicated_error(self):
  res = self.vm.qmp('set-cpu-topology',
    {'core-id': 0, 'entitlement': 'medium', 
'dedicated': False})

  self.assertEqual(res['return'], {})
+
+    def test_move_error(self):
+    """
+    This test verifies that QEMU refuses to move a CPU to an
+    unexistant location


s/unexistant/nonexistent/ ?

With the words fixed:
Reviewed-by: Thomas Huth 


I will fix it, thanks,

Regards,

Pierre




Re: [PATCH v2 08/14] tests/tcg/s390x: Test CKSM

2023-07-19 Thread Ilya Leoshkevich
On Wed, 2023-07-19 at 16:20 +0200, Thomas Huth wrote:
> On 19/07/2023 11.44, Ilya Leoshkevich wrote:
> > Add a small test to prevent regressions.
> > 
> > Signed-off-by: Ilya Leoshkevich 
> > ---
> >   tests/tcg/s390x/Makefile.softmmu-target |  1 +
> >   tests/tcg/s390x/cksm.S  | 29
> > +
> >   2 files changed, 30 insertions(+)
> >   create mode 100644 tests/tcg/s390x/cksm.S
> > 
> > diff --git a/tests/tcg/s390x/Makefile.softmmu-target
> > b/tests/tcg/s390x/Makefile.softmmu-target
> > index 242c7b0f83c..e813e318db9 100644
> > --- a/tests/tcg/s390x/Makefile.softmmu-target
> > +++ b/tests/tcg/s390x/Makefile.softmmu-target
> > @@ -16,6 +16,7 @@ LDFLAGS=-nostdlib -static
> >   
> >   ASM_TESTS
> > =  
> >   \
> >  
> > bal
> >     \
> > +   
> > cksm   
> >     \
> >   exrl-ssm-
> > early \
> >  
> > sam
> >     \
> >  
> > lpsw   
> >     \
> > diff --git a/tests/tcg/s390x/cksm.S b/tests/tcg/s390x/cksm.S
> > new file mode 100644
> > index 000..a45f3ef6bfd
> > --- /dev/null
> > +++ b/tests/tcg/s390x/cksm.S
> > @@ -0,0 +1,29 @@
> > +    .org 0x8e
> > +program_interruption_code:
> > +    .org 0x1d0 /* program new PSW */
> > +    .quad 0,pgm
> > +    .org 0x200 /* lowcore padding */
> > +    .globl _start
> > +_start:
> > +    lmg %r0,%r1,cksm_args
> > +    cksm %r2,%r0
> > +    c %r2,cksm_exp
> > +    jne failure
> > +    cksm %r2,%r15
> 
> Clang is smart enough to detect that this is a bad instruction:
> 
> $ make check-tcg
>    BUILD   s390x-softmmu guest-tests
> tests/tcg/s390x/cksm.S:12:14: error: invalid register pair
>  cksm %r2,%r15
>   ^
> 
> I guess you have to manually create the opcode here?
> 
>   Thomas

Argh, I really need to start testing my submissions with clang.
Thanks for noticing, I will fix this.


Re: [PATCH v21 19/20] tests/avocado: s390x cpu topology dedicated errors

2023-07-19 Thread Pierre Morel



On 7/5/23 12:28, Thomas Huth wrote:

On 30/06/2023 11.17, Pierre Morel wrote:

Let's test that QEMU refuses to set up a dedicated CPU with
low or medium entitlement.

Signed-off-by: Pierre Morel 
---
  tests/avocado/s390_topology.py | 48 ++
  1 file changed, 48 insertions(+)


Reviewed-by: Thomas Huth 


Thanks,

Pierre




Re: [PATCH v21 18/20] tests/avocado: s390x cpu topology test socket full

2023-07-19 Thread Pierre Morel



On 7/5/23 12:26, Thomas Huth wrote:

On 30/06/2023 11.17, Pierre Morel wrote:

This test verifies that QMP set-cpu-topology does not accept
to overload a socket.

Signed-off-by: Pierre Morel 
---
  tests/avocado/s390_topology.py | 25 +
  1 file changed, 25 insertions(+)

diff --git a/tests/avocado/s390_topology.py 
b/tests/avocado/s390_topology.py

index cba44bec91..0003b30702 100644
--- a/tests/avocado/s390_topology.py
+++ b/tests/avocado/s390_topology.py
@@ -315,3 +315,28 @@ def test_dedicated(self):
  self.guest_set_dispatching('0');
    self.check_topology(0, 0, 0, 0, 'high', True)
+
+    def test_socket_full(self):
+    """
+    This test verifies that QEMU does not accept to overload a 
socket.
+    The socket-id 0 on book-id 0 already contains CPUs 0 and 1 
and can

+    not accept any new CPU while socket-id 0 on book-id 1 is free.
+
+    :avocado: tags=arch:s390x
+    :avocado: tags=machine:s390-ccw-virtio
+    """
+    self.kernel_init()
+    self.vm.add_args('-smp',
+ '3,drawers=2,books=2,sockets=3,cores=2,maxcpus=24')
+    self.vm.launch()
+    self.wait_until_booted()
+
+    self.system_init()
+
+    res = self.vm.qmp('set-cpu-topology',
+  {'core-id': 2, 'socket-id': 0, 'book-id': 0})
+    self.assertEqual(res['error']['class'], 'GenericError')
+
+    res = self.vm.qmp('set-cpu-topology',
+  {'core-id': 2, 'socket-id': 0, 'book-id': 1})
+    self.assertEqual(res['return'], {})


Reviewed-by: Thomas Huth 


Thanks,

Pierre




Re: [PATCH v2 08/14] tests/tcg/s390x: Test CKSM

2023-07-19 Thread Thomas Huth

On 19/07/2023 11.44, Ilya Leoshkevich wrote:

Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich 
---
  tests/tcg/s390x/Makefile.softmmu-target |  1 +
  tests/tcg/s390x/cksm.S  | 29 +
  2 files changed, 30 insertions(+)
  create mode 100644 tests/tcg/s390x/cksm.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
index 242c7b0f83c..e813e318db9 100644
--- a/tests/tcg/s390x/Makefile.softmmu-target
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -16,6 +16,7 @@ LDFLAGS=-nostdlib -static
  
  ASM_TESTS =\

  bal   
 \
+cksm   
\
  exrl-ssm-early
 \
  sam   
 \
  lpsw  
 \
diff --git a/tests/tcg/s390x/cksm.S b/tests/tcg/s390x/cksm.S
new file mode 100644
index 000..a45f3ef6bfd
--- /dev/null
+++ b/tests/tcg/s390x/cksm.S
@@ -0,0 +1,29 @@
+.org 0x8e
+program_interruption_code:
+.org 0x1d0 /* program new PSW */
+.quad 0,pgm
+.org 0x200 /* lowcore padding */
+.globl _start
+_start:
+lmg %r0,%r1,cksm_args
+cksm %r2,%r0
+c %r2,cksm_exp
+jne failure
+cksm %r2,%r15


Clang is smart enough to detect that this is a bad instruction:

$ make check-tcg
  BUILD   s390x-softmmu guest-tests
tests/tcg/s390x/cksm.S:12:14: error: invalid register pair
cksm %r2,%r15
 ^

I guess you have to manually create the opcode here?

 Thomas




  1   2   3   >