Re: [PATCH v1 2/2] Sample mtty: Add migration capability to mtty module

2020-05-06 Thread Kirti Wankhede




On 5/7/2020 6:31 AM, Yan Zhao wrote:

On Tue, May 05, 2020 at 01:54:20AM +0800, Kirti Wankhede wrote:

This patch makes the mtty device migration capable. The purpose of this code
is to test the migration interface. Only the stop-and-copy phase is
implemented. Postcopy migration is not supported.

The actual data for mtty device migration is very small. Dummy data is
appended to the migration data stream, 100 Mbytes by default. Added sysfs file
'dummy_data_size_MB' to get the dummy data size from the user, which can be
used to check performance based on data size. During resume the dummy data is
read and discarded.

Signed-off-by: Kirti Wankhede 
---
  samples/vfio-mdev/mtty.c | 602 ---
  1 file changed, 574 insertions(+), 28 deletions(-)

diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index bf666cce5bb7..f9194234fc6a 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -44,9 +44,23 @@
  
  #define MTTY_STRING_LEN		16
  
-#define MTTY_CONFIG_SPACE_SIZE  0xff

-#define MTTY_IO_BAR_SIZE0x8
-#define MTTY_MMIO_BAR_SIZE  0x10
+#define MTTY_CONFIG_SPACE_SIZE 0xff
+#define MTTY_IO_BAR_SIZE   0x8
+#define MTTY_MMIO_BAR_SIZE 0x10
+#define MTTY_MIGRATION_REGION_SIZE 0x100   // 16M
+
+#define MTTY_MIGRATION_REGION_INDEXVFIO_PCI_NUM_REGIONS
+#define MTTY_REGIONS_MAX   (MTTY_MIGRATION_REGION_INDEX + 1)
+
+/* Data section start from page aligned offset */
+#define MTTY_MIGRATION_REGION_DATA_OFFSET  (0x1000)
+
+/* First page is used for struct vfio_device_migration_info */
+#define MTTY_MIGRATION_REGION_SIZE_MMAP \
+   (MTTY_MIGRATION_REGION_SIZE - MTTY_MIGRATION_REGION_DATA_OFFSET)
+
+#define MIGRATION_INFO_OFFSET(MEMBER)  \
+   offsetof(struct vfio_device_migration_info, MEMBER)
  
  #define STORE_LE16(addr, val)   (*(u16 *)addr = val)

  #define STORE_LE32(addr, val)   (*(u32 *)addr = val)
@@ -129,6 +143,28 @@ struct serial_port {
u8 intr_trigger_level;  /* interrupt trigger level */
  };
  
+/* Migration packet */

+#define PACKET_ID  (u16)(0xfeedbaba)
+
+#define PACKET_FLAGS_ACTUAL_DATA   (1 << 0)
+#define PACKET_FLAGS_DUMMY_DATA(1 << 1)
+
+#define PACKET_DATA_SIZE_MAX   (8 * 1024 * 1024)
+
+struct packet {
+   u16 id;
+   u16 flags;
+   u32 data_size;
+   u8 data[];
+};
+
+enum {
+   PACKET_STATE_NONE = 0,
+   PACKET_STATE_PREPARED,
+   PACKET_STATE_COPIED,
+   PACKET_STATE_LAST,
+};
+
  /* State of each mdev device */
  struct mdev_state {
int irq_fd;
@@ -138,22 +174,37 @@ struct mdev_state {
u8 *vconfig;
struct mutex ops_lock;
struct mdev_device *mdev;
-   struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];
-   u32 bar_mask[VFIO_PCI_NUM_REGIONS];
+   struct mdev_region_info region_info[MTTY_REGIONS_MAX];
+   u32 bar_mask[MTTY_REGIONS_MAX];
struct list_head next;
struct serial_port s[2];
struct mutex rxtx_lock;
struct vfio_device_info dev_info;
-   int nr_ports;
+   u32 nr_ports;
  
  	/* List of pinned gpfns, gpfn as index and content is translated hpfn */

unsigned long *gpfn_to_hpfn;
struct notifier_block nb;
+
+   u32 device_state;
+   u64 saved_size;
+   void *mig_region_base;
+   bool is_actual_data_sent;
+   struct packet *pkt;
+   u32 packet_state;
+   u64 dummy_data_size;
  };
  
  static struct mutex mdev_list_lock;

  static struct list_head mdev_devices_list;
  
+/*

+ * Default dummy data size set to 100 MB. To change value of dummy data size at
+ * runtime but before migration write size in MB to sysfs file
+ * dummy_data_size_MB
+ */
+static unsigned long user_dummy_data_size = (100 * 1024 * 1024);
+
  static const struct file_operations vd_fops = {
.owner  = THIS_MODULE,
  };
@@ -639,6 +690,288 @@ static void mdev_read_base(struct mdev_state *mdev_state)
}
  }
  
+static int save_setup(struct mdev_state *mdev_state)

+{
+   mdev_state->is_actual_data_sent = false;
+
+   memset(mdev_state->pkt, 0, sizeof(struct packet) +
+  PACKET_DATA_SIZE_MAX);
+
+   return 0;
+}
+
+static int set_device_state(struct mdev_state *mdev_state, u32 device_state)
+{
+   int ret = 0;
+
+   if (mdev_state->device_state == device_state)
+   return 0;
+
+   if (device_state & VFIO_DEVICE_STATE_RUNNING) {
+#if defined(DEBUG)
+   if (device_state & VFIO_DEVICE_STATE_SAVING) {
+   pr_info("%s: %s Pre-copy\n", __func__,
+   dev_name(mdev_dev(mdev_state->mdev)));
+   } else
+   pr_info("%s: %s Running\n", __func__,
+   dev_name(mdev_dev(mdev_state->mdev)));
+#endif
+   } else {
+   if (device_state & VFIO_DEVICE_STATE_SAVING) {
+#if 

Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices

2020-05-06 Thread Kirti Wankhede




On 5/7/2020 1:33 AM, Alex Williamson wrote:

On Thu, 7 May 2020 01:18:19 +0530
Kirti Wankhede  wrote:


On 5/6/2020 11:41 AM, Yan Zhao wrote:

On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:

On Tue, 5 May 2020 04:48:37 +0530
Kirti Wankhede  wrote:
  

On 3/26/2020 1:26 AM, Alex Williamson wrote:

On Wed, 25 Mar 2020 02:39:02 +0530
Kirti Wankhede  wrote:
  

These functions save and restore PCI device specific data - config
space of PCI device.
Tested save and restore with MSI and MSIX type.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
hw/vfio/pci.c | 163 
++
include/hw/vfio/vfio-common.h |   2 +
2 files changed, 165 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6c77c12e44b9..8deb11e87ef7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -41,6 +41,7 @@
#include "trace.h"
#include "qapi/error.h"
#include "migration/blocker.h"
+#include "migration/qemu-file.h"

#define TYPE_VFIO_PCI "vfio-pci"

#define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
@@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
}
}

+static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)

+{
+PCIDevice *pdev = &vdev->pdev;
+VFIOBAR *bar = &vdev->bars[nr];
+uint64_t addr;
+uint32_t addr_lo, addr_hi = 0;
+
+/* Skip unimplemented BARs and the upper half of 64bit BARS. */
+if (!bar->size) {
+return 0;
+}
+
+addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
+
+addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
+   PCI_BASE_ADDRESS_MEM_MASK);


Nit, &= or combine with previous set.
  

+if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+addr_hi = pci_default_read_config(pdev,
+ PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
+}
+
+addr = ((uint64_t)addr_hi << 32) | addr_lo;


Could we use a union?
  

+
+if (!QEMU_IS_ALIGNED(addr, bar->size)) {
+return -EINVAL;
+}


What specifically are we validating here?  This should be true no
matter what we wrote to the BAR or else BAR emulation is broken.  The
bits that could make this unaligned are not implemented in the BAR.
  

+
+return 0;
+}
+
+static int vfio_bars_validate(VFIOPCIDevice *vdev)
+{
+int i, ret;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+ret = vfio_bar_validate(vdev, i);
+if (ret) {
+error_report("vfio: BAR address %d validation failed", i);
+return ret;
+}
+}
+return 0;
+}
+
static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
{
VFIOBAR *bar = &vdev->bars[nr];
@@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice 
*vbasedev)
return OBJECT(vdev);
}

+static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)

+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = &vdev->pdev;
+uint16_t pci_cmd;
+int i;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+uint32_t bar;
+
+bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
+qemu_put_be32(f, bar);
+}
+
+qemu_put_be32(f, vdev->interrupt);
+if (vdev->interrupt == VFIO_INT_MSI) {
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+bool msi_64bit;
+
+msi_flags = pci_default_read_config(pdev, pdev->msi_cap + 
PCI_MSI_FLAGS,
+2);
+msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
+
+msi_addr_lo = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_LO, 
4);
+qemu_put_be32(f, msi_addr_lo);
+
+if (msi_64bit) {
+msi_addr_hi = pci_default_read_config(pdev,
+ pdev->msi_cap + 
PCI_MSI_ADDRESS_HI,
+ 4);
+}
+qemu_put_be32(f, msi_addr_hi);
+
+msi_data = pci_default_read_config(pdev,
+pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : 
PCI_MSI_DATA_32),
+2);
+qemu_put_be32(f, msi_data);


Isn't the data field only a u16?
  


Yes, fixing it.
  

+} else if (vdev->interrupt == VFIO_INT_MSIX) {
+uint16_t offset;
+
+/* save enable bit and maskall bit */
+offset = pci_default_read_config(pdev,
+   pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
+qemu_put_be16(f, offset);
+msix_save(pdev, f);
+}
+pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+qemu_put_be16(f, pci_cmd);
+}
+
+static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = &vdev->pdev;
+

Re: [PATCH Kernel v18 6/7] vfio iommu: Add migration capability to report supported features

2020-05-06 Thread Kirti Wankhede




On 5/7/2020 3:57 AM, Alex Williamson wrote:

On Mon, 4 May 2020 21:28:58 +0530
Kirti Wankhede  wrote:


Added migration capability in IOMMU info chain.
User application should check IOMMU info chain for migration capability
to use dirty page tracking feature provided by kernel module.

Signed-off-by: Kirti Wankhede 
---
  drivers/vfio/vfio_iommu_type1.c | 15 +++
  include/uapi/linux/vfio.h   | 14 ++
  2 files changed, 29 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 8b27faf1ec38..b38d278d7bff 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2378,6 +2378,17 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu 
*iommu,
return ret;
  }
  
+static int vfio_iommu_migration_build_caps(struct vfio_info_cap *caps)

+{
+   struct vfio_iommu_type1_info_cap_migration cap_mig;
+
+   cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
+   cap_mig.header.version = 1;
+   cap_mig.flags = VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK;
+
+   return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
+}
+
  static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
  {
@@ -2427,6 +2438,10 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
if (ret)
return ret;
  
+		ret = vfio_iommu_migration_build_caps(&caps);

+   if (ret)
+   return ret;
+
if (caps.size) {
info.flags |= VFIO_IOMMU_INFO_CAPS;
  
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h

index e3cbf8b78623..df9ce8aaafab 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -1013,6 +1013,20 @@ struct vfio_iommu_type1_info_cap_iova_range {
struct  vfio_iova_range iova_ranges[];
  };
  
+/*

+ * The migration capability allows to report supported features for migration.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
+
+struct vfio_iommu_type1_info_cap_migration {
+   struct  vfio_info_cap_header header;
+   __u32   flags;
+   /* supports dirty page tracking */
+#define VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK(1 << 0)
+};
+


What about exposing the maximum supported dirty bitmap size and the
supported page sizes?  Thanks,



How should user application use that?

Thanks,
Kirti


Alex


  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
  
  /**






[PULL 07/18] spapr: Drop CAS reboot flag

2020-05-06 Thread David Gibson
From: Greg Kurz 

The CAS reboot flag is false by default and all the locations that
could set it to true have been dropped. This means that all code
blocks depending on the flag being set is dead code and the other
code blocks should be executed always.

Just do that and drop the now unneeded CAS reboot flag. Fix a
comment on the way to make checkpatch happy.

Signed-off-by: Greg Kurz 
Message-Id: <158514994893.478799.11772512888322840990.st...@bahia.lan>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c | 18 --
 hw/ppc/spapr_hcall.c   | 33 ++---
 include/hw/ppc/spapr.h |  1 -
 3 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index f52488d397..841b5ec59b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1579,9 +1579,7 @@ void spapr_setup_hpt(SpaprMachineState *spapr)
 {
 int hpt_shift;
 
-if ((spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED)
-|| (spapr->cas_reboot
-&& !spapr_ovec_test(spapr->ov5_cas, OV5_HPT_RESIZE))) {
+if (spapr->resize_hpt == SPAPR_RESIZE_HPT_DISABLED) {
 hpt_shift = spapr_hpt_shift_for_ramsize(MACHINE(spapr)->maxram_size);
 } else {
 uint64_t current_ram_size;
@@ -1645,16 +1643,10 @@ static void spapr_machine_reset(MachineState *machine)
 
 qemu_devices_reset();
 
-/*
- * If this reset wasn't generated by CAS, we should reset our
- * negotiated options and start from scratch
- */
-if (!spapr->cas_reboot) {
-spapr_ovec_cleanup(spapr->ov5_cas);
-spapr->ov5_cas = spapr_ovec_new();
+spapr_ovec_cleanup(spapr->ov5_cas);
+spapr->ov5_cas = spapr_ovec_new();
 
-ppc_set_compat_all(spapr->max_compat_pvr, &error_fatal);
-}
+ppc_set_compat_all(spapr->max_compat_pvr, &error_fatal);
 
 /*
  * This is fixing some of the default configuration of the XIVE
@@ -1707,8 +1699,6 @@ static void spapr_machine_reset(MachineState *machine)
 spapr_cpu_set_entry_state(first_ppc_cpu, SPAPR_ENTRY_POINT, 0, fdt_addr, 
0);
 first_ppc_cpu->env.gpr[5] = 0;
 
-spapr->cas_reboot = false;
-
 spapr->fwnmi_system_reset_addr = -1;
 spapr->fwnmi_machine_check_addr = -1;
 spapr->fwnmi_machine_check_interlock = -1;
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 48a8745514..0f54988f2e 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1678,6 +1678,7 @@ target_ulong do_client_architecture_support(PowerPCCPU 
*cpu,
 bool raw_mode_supported = false;
 bool guest_xive;
 CPUState *cs;
+void *fdt;
 
 /* CAS is supposed to be called early when only the boot vCPU is active. */
 CPU_FOREACH(cs) {
@@ -1818,27 +1819,21 @@ target_ulong do_client_architecture_support(PowerPCCPU 
*cpu,
 
 spapr_handle_transient_dev_before_cas(spapr);
 
-if (!spapr->cas_reboot) {
-void *fdt;
-
-/* If spapr_machine_reset() did not set up a HPT but one is necessary
- * (because the guest isn't going to use radix) then set it up here. */
-if ((spapr->patb_entry & PATE1_GR) && !guest_radix) {
-/* legacy hash or new hash: */
-spapr_setup_hpt(spapr);
-}
-
-fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
-
-g_free(spapr->fdt_blob);
-spapr->fdt_size = fdt_totalsize(fdt);
-spapr->fdt_initial_size = spapr->fdt_size;
-spapr->fdt_blob = fdt;
+/*
+ * If spapr_machine_reset() did not set up a HPT but one is necessary
+ * (because the guest isn't going to use radix) then set it up here.
+ */
+if ((spapr->patb_entry & PATE1_GR) && !guest_radix) {
+/* legacy hash or new hash: */
+spapr_setup_hpt(spapr);
 }
 
-if (spapr->cas_reboot) {
-qemu_system_reset_request(SHUTDOWN_CAUSE_SUBSYSTEM_RESET);
-}
+fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
+
+g_free(spapr->fdt_blob);
+spapr->fdt_size = fdt_totalsize(fdt);
+spapr->fdt_initial_size = spapr->fdt_size;
+spapr->fdt_blob = fdt;
 
 return H_SUCCESS;
 }
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index b7e13e5aaf..e579eaf28c 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -178,7 +178,6 @@ struct SpaprMachineState {
 SpaprEventSource *event_sources;
 
 /* ibm,client-architecture-support option negotiation */
-bool cas_reboot;
 bool cas_pre_isa3_guest;
 SpaprOptionVector *ov5; /* QEMU-supported option vectors */
 SpaprOptionVector *ov5_cas; /* negotiated (via CAS) option vectors */
-- 
2.26.2




[PULL 15/18] target/ppc: Add support for Radix partition-scoped translation

2020-05-06 Thread David Gibson
From: Cédric Le Goater 

The Radix tree translation model currently supports process-scoped
translation for the PowerNV machine (Hypervisor mode) and for the
pSeries machine (Guest mode). Guests running under an emulated
Hypervisor (PowerNV machine) require a new type of Radix translation,
called partition-scoped, which is missing today.

The Radix tree translation is a 2 steps process. The first step,
process-scoped translation, converts an effective Address to a guest
real address, and the second step, partition-scoped translation,
converts a guest real address to a host real address.

There are different cases to cover:

* Hypervisor real mode access: no Radix translation.

* Hypervisor or host application access (quadrant 0 and 3) with
  relocation on: process-scoped translation.

* Guest OS real mode access: only partition-scoped translation.

* Guest OS real or guest application access (quadrant 0 and 3) with
  relocation on: both process-scoped translation and partition-scoped
  translations.

* Hypervisor access in quadrant 1 and 2 with relocation on: both
  process-scoped translation and partition-scoped translations.

The radix tree partition-scoped translation is performed using tables
pointed to by the first double-word of the Partition Table Entries and
process-scoped translation uses tables pointed to by the Process Table
Entries (second double-word of the Partition Table Entries).

Both partition-scoped and process-scoped translations process are
identical and thus the radix tree traversing code is largely reused.
However, errors in partition-scoped translations generate hypervisor
exceptions.

Signed-off-by: Suraj Jitindar Singh 
Signed-off-by: Greg Kurz 
Signed-off-by: Cédric Le Goater 
Message-Id: <20200403140056.59465-5-...@kaod.org>
[dwg: Fixup from Greg Kurz folded in]
Signed-off-by: David Gibson 
---
 target/ppc/cpu.h |   3 +
 target/ppc/excp_helper.c |   3 +-
 target/ppc/mmu-radix64.c | 194 +++
 3 files changed, 181 insertions(+), 19 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index f4a5304d43..6b6dd7e483 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -463,6 +463,9 @@ typedef struct ppc_v3_pate_t {
 #define DSISR_AMR0x0020
 /* Unsupported Radix Tree Configuration */
 #define DSISR_R_BADCONFIG0x0008
+#define DSISR_ATOMIC_RC  0x0004
+/* Unable to translate address of (guest) pde or process/page table entry */
+#define DSISR_PRTABLE_FAULT  0x0002
 
 /* SRR1 error code fields */
 
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 1acc3786de..f052979664 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -506,9 +506,10 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp)
 case POWERPC_EXCP_ISEG:  /* Instruction segment exception*/
 case POWERPC_EXCP_TRACE: /* Trace exception  */
 break;
+case POWERPC_EXCP_HISI:  /* Hypervisor instruction storage exception */
+msr |= env->error_code;
 case POWERPC_EXCP_HDECR: /* Hypervisor decrementer exception */
 case POWERPC_EXCP_HDSI:  /* Hypervisor data storage exception*/
-case POWERPC_EXCP_HISI:  /* Hypervisor instruction storage exception */
 case POWERPC_EXCP_HDSEG: /* Hypervisor data segment exception*/
 case POWERPC_EXCP_HISEG: /* Hypervisor instruction segment exception */
 case POWERPC_EXCP_SDOOR_HV:  /* Hypervisor Doorbell interrupt*/
diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 2400da41e0..1404e53dec 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -103,6 +103,27 @@ static void ppc_radix64_raise_si(PowerPCCPU *cpu, int rwx, 
vaddr eaddr,
 }
 }
 
+static void ppc_radix64_raise_hsi(PowerPCCPU *cpu, int rwx, vaddr eaddr,
+  hwaddr g_raddr, uint32_t cause)
+{
+CPUState *cs = CPU(cpu);
+CPUPPCState *env = &cpu->env;
+
+if (rwx == 2) { /* H Instruction Storage Interrupt */
+cs->exception_index = POWERPC_EXCP_HISI;
+env->spr[SPR_ASDR] = g_raddr;
+env->error_code = cause;
+} else { /* H Data Storage Interrupt */
+cs->exception_index = POWERPC_EXCP_HDSI;
+if (rwx == 1) { /* Write -> Store */
+cause |= DSISR_ISSTORE;
+}
+env->spr[SPR_HDSISR] = cause;
+env->spr[SPR_HDAR] = eaddr;
+env->spr[SPR_ASDR] = g_raddr;
+env->error_code = 0;
+}
+}
 
 static bool ppc_radix64_check_prot(PowerPCCPU *cpu, int rwx, uint64_t pte,
int *fault_cause, int *prot,
@@ -243,6 +264,37 @@ static bool validate_pate(PowerPCCPU *cpu, uint64_t lpid, 
ppc_v3_pate_t *pate)
 return true;
 }
 
+static int ppc_radix64_partition_scoped_xlate(PowerPCCPU *cpu, int rwx,
+  

[PULL 18/18] target-ppc: fix rlwimi, rlwinm, rlwnm for Clang-9

2020-05-06 Thread David Gibson
From: Daniele Buono 

Starting with Clang v9, -Wtype-limits is implemented and triggers a
few "result of comparison is always true" errors when compiling PPC32
targets.

The comparisons seem to be necessary only on PPC64, since the
else branch in PPC32 only has a "g_assert_not_reached();" in all cases.

This patch restructures the code so that the actual if/else is done on a
local flag variable, that is set accordingly for PPC64, and always
true for PPC32.

Signed-off-by: Daniele Buono 
Message-Id: <20200505183818.32688-2-dbu...@linux.vnet.ibm.com>
Signed-off-by: David Gibson 
---
 target/ppc/translate.c | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 807d14faaa..338529879f 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -1882,6 +1882,7 @@ static void gen_rlwimi(DisasContext *ctx)
 tcg_gen_deposit_tl(t_ra, t_ra, t_rs, sh, me - mb + 1);
 } else {
 target_ulong mask;
+bool mask_in_32b = true;
 TCGv t1;
 
 #if defined(TARGET_PPC64)
@@ -1890,8 +1891,13 @@ static void gen_rlwimi(DisasContext *ctx)
 #endif
 mask = MASK(mb, me);
 
+#if defined(TARGET_PPC64)
+if (mask > 0xu) {
+mask_in_32b = false;
+}
+#endif
 t1 = tcg_temp_new();
-if (mask <= 0xu) {
+if (mask_in_32b) {
 TCGv_i32 t0 = tcg_temp_new_i32();
 tcg_gen_trunc_tl_i32(t0, t_rs);
 tcg_gen_rotli_i32(t0, t0, sh);
@@ -1933,12 +1939,18 @@ static void gen_rlwinm(DisasContext *ctx)
 tcg_gen_extract_tl(t_ra, t_rs, rsh, len);
 } else {
 target_ulong mask;
+bool mask_in_32b = true;
 #if defined(TARGET_PPC64)
 mb += 32;
 me += 32;
 #endif
 mask = MASK(mb, me);
-if (mask <= 0xu) {
+#if defined(TARGET_PPC64)
+if (mask > 0xu) {
+mask_in_32b = false;
+}
+#endif
+if (mask_in_32b) {
 if (sh == 0) {
 tcg_gen_andi_tl(t_ra, t_rs, mask);
 } else {
@@ -1973,6 +1985,7 @@ static void gen_rlwnm(DisasContext *ctx)
 uint32_t mb = MB(ctx->opcode);
 uint32_t me = ME(ctx->opcode);
 target_ulong mask;
+bool mask_in_32b = true;
 
 #if defined(TARGET_PPC64)
 mb += 32;
@@ -1980,7 +1993,12 @@ static void gen_rlwnm(DisasContext *ctx)
 #endif
 mask = MASK(mb, me);
 
-if (mask <= 0xu) {
+#if defined(TARGET_PPC64)
+if (mask > 0xu) {
+mask_in_32b = false;
+}
+#endif
+if (mask_in_32b) {
 TCGv_i32 t0 = tcg_temp_new_i32();
 TCGv_i32 t1 = tcg_temp_new_i32();
 tcg_gen_trunc_tl_i32(t0, t_rb);
-- 
2.26.2




[PULL 13/18] target/ppc: Extend ppc_radix64_check_prot() with a 'partition_scoped' bool

2020-05-06 Thread David Gibson
From: Cédric Le Goater 

This prepares ground for partition-scoped Radix translation.

Signed-off-by: Suraj Jitindar Singh 
Signed-off-by: Cédric Le Goater 
Reviewed-by: Greg Kurz 
Message-Id: <20200403140056.59465-3-...@kaod.org>
Signed-off-by: David Gibson 
---
 target/ppc/mmu-radix64.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 4b0d0ff50a..11b3c6d48c 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -105,7 +105,8 @@ static void ppc_radix64_raise_si(PowerPCCPU *cpu, int rwx, 
vaddr eaddr,
 
 
 static bool ppc_radix64_check_prot(PowerPCCPU *cpu, int rwx, uint64_t pte,
-   int *fault_cause, int *prot)
+   int *fault_cause, int *prot,
+   bool partition_scoped)
 {
CPUPPCState *env = &cpu->env;
 const int need_prot[] = { PAGE_READ, PAGE_WRITE, PAGE_EXEC };
@@ -121,11 +122,11 @@ static bool ppc_radix64_check_prot(PowerPCCPU *cpu, int 
rwx, uint64_t pte,
 }
 
 /* Determine permissions allowed by Encoded Access Authority */
-if ((pte & R_PTE_EAA_PRIV) && msr_pr) { /* Insufficient Privilege */
+if (!partition_scoped && (pte & R_PTE_EAA_PRIV) && msr_pr) {
 *prot = 0;
-} else if (msr_pr || (pte & R_PTE_EAA_PRIV)) {
+} else if (msr_pr || (pte & R_PTE_EAA_PRIV) || partition_scoped) {
 *prot = ppc_radix64_get_prot_eaa(pte);
-} else { /* !msr_pr && !(pte & R_PTE_EAA_PRIV) */
+} else { /* !msr_pr && !(pte & R_PTE_EAA_PRIV) && !partition_scoped */
 *prot = ppc_radix64_get_prot_eaa(pte);
 *prot &= ppc_radix64_get_prot_amr(cpu); /* Least combined permissions 
*/
 }
@@ -250,7 +251,7 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU 
*cpu, int rwx,
g_raddr, g_page_size, &fault_cause, &pte_addr);
 
 if (!(pte & R_PTE_VALID) ||
-ppc_radix64_check_prot(cpu, rwx, pte, &fault_cause, g_prot)) {
+ppc_radix64_check_prot(cpu, rwx, pte, &fault_cause, g_prot, false)) {
 /* No valid pte or access denied due to protection */
 if (cause_excp) {
 ppc_radix64_raise_si(cpu, rwx, eaddr, fault_cause);
-- 
2.26.2




[PULL 14/18] target/ppc: Rework ppc_radix64_walk_tree() for partition-scoped translation

2020-05-06 Thread David Gibson
From: Cédric Le Goater 

The ppc_radix64_walk_tree() routine walks through the nested radix
tables to look for a PTE.

Split it in two and introduce a new routine ppc_radix64_next_level()
which we will use for partition-scoped Radix translation when
translating the process tree addresses. The prototypes are slightly
changed to use an 'AddressSpace *' parameter, instead of a 'PowerPCCPU *'
which is not required, and to return an error code instead of a PTE
value. It clarifies error handling in the callers.

Signed-off-by: Suraj Jitindar Singh 
Signed-off-by: Greg Kurz 
Signed-off-by: Cédric Le Goater 
Message-Id: <20200403140056.59465-4-...@kaod.org>
Signed-off-by: David Gibson 
---
 target/ppc/mmu-radix64.c | 79 ++--
 1 file changed, 52 insertions(+), 27 deletions(-)

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 11b3c6d48c..2400da41e0 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -163,44 +163,67 @@ static void ppc_radix64_set_rc(PowerPCCPU *cpu, int rwx, 
uint64_t pte,
 }
 }
 
-static uint64_t ppc_radix64_walk_tree(PowerPCCPU *cpu, vaddr eaddr,
-  uint64_t base_addr, uint64_t nls,
-  hwaddr *raddr, int *psize,
-  int *fault_cause, hwaddr *pte_addr)
+static int ppc_radix64_next_level(AddressSpace *as, vaddr eaddr,
+  uint64_t *pte_addr, uint64_t *nls,
+  int *psize, uint64_t *pte, int *fault_cause)
 {
-CPUState *cs = CPU(cpu);
 uint64_t index, pde;
 
-if (nls < 5) { /* Directory maps less than 2**5 entries */
+if (*nls < 5) { /* Directory maps less than 2**5 entries */
 *fault_cause |= DSISR_R_BADCONFIG;
-return 0;
+return 1;
 }
 
 /* Read page  entry from guest address space */
-index = eaddr >> (*psize - nls); /* Shift */
-index &= ((1UL << nls) - 1); /* Mask */
-pde = ldq_phys(cs->as, base_addr + (index * sizeof(pde)));
-if (!(pde & R_PTE_VALID)) { /* Invalid Entry */
+pde = ldq_phys(as, *pte_addr);
+if (!(pde & R_PTE_VALID)) { /* Invalid Entry */
 *fault_cause |= DSISR_NOPTE;
-return 0;
+return 1;
 }
 
-*psize -= nls;
+*pte = pde;
+*psize -= *nls;
+if (!(pde & R_PTE_LEAF)) { /* Prepare for next iteration */
+*nls = pde & R_PDE_NLS;
+index = eaddr >> (*psize - *nls);   /* Shift */
+index &= ((1UL << *nls) - 1);   /* Mask */
+*pte_addr = (pde & R_PDE_NLB) + (index * sizeof(pde));
+}
+return 0;
+}
 
-/* Check if Leaf Entry -> Page Table Entry -> Stop the Search */
-if (pde & R_PTE_LEAF) {
-uint64_t rpn = pde & R_PTE_RPN;
-uint64_t mask = (1UL << *psize) - 1;
+static int ppc_radix64_walk_tree(AddressSpace *as, vaddr eaddr,
+ uint64_t base_addr, uint64_t nls,
+ hwaddr *raddr, int *psize, uint64_t *pte,
+ int *fault_cause, hwaddr *pte_addr)
+{
+uint64_t index, pde, rpn , mask;
 
-/* Or high bits of rpn and low bits to ea to form whole real addr */
-*raddr = (rpn & ~mask) | (eaddr & mask);
-*pte_addr = base_addr + (index * sizeof(pde));
-return pde;
+if (nls < 5) { /* Directory maps less than 2**5 entries */
+*fault_cause |= DSISR_R_BADCONFIG;
+return 1;
 }
 
-/* Next Level of Radix Tree */
-return ppc_radix64_walk_tree(cpu, eaddr, pde & R_PDE_NLB, pde & R_PDE_NLS,
- raddr, psize, fault_cause, pte_addr);
+index = eaddr >> (*psize - nls);/* Shift */
+index &= ((1UL << nls) - 1);   /* Mask */
+*pte_addr = base_addr + (index * sizeof(pde));
+do {
+int ret;
+
+ret = ppc_radix64_next_level(as, eaddr, pte_addr, &nls, psize, &pde,
+ fault_cause);
+if (ret) {
+return ret;
+}
+} while (!(pde & R_PTE_LEAF));
+
+*pte = pde;
+rpn = pde & R_PTE_RPN;
+mask = (1UL << *psize) - 1;
+
+/* Or high bits of rpn and low bits to ea to form whole real addr */
+*raddr = (rpn & ~mask) | (eaddr & mask);
+return 0;
 }
 
 static bool validate_pate(PowerPCCPU *cpu, uint64_t lpid, ppc_v3_pate_t *pate)
@@ -230,6 +253,7 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU 
*cpu, int rwx,
 uint64_t offset, size, prtbe_addr, prtbe0, pte;
 int fault_cause = 0;
 hwaddr pte_addr;
+int ret;
 
 /* Index Process Table by PID to Find Corresponding Process Table Entry */
 offset = pid * sizeof(struct prtb_entry);
@@ -246,11 +270,12 @@ static int ppc_radix64_process_scoped_xlate(PowerPCCPU 
*cpu, int rwx,
 
 /* Walk Radix Tree from Process Table Entry to Convert EA to RA */
 *g_page_size = PRTBE_R_GET_RTS(prtbe0);
-pte = 

[PULL 08/18] target/ppc: Enforce that the root page directory size must be at least 5

2020-05-06 Thread David Gibson
From: Suraj Jitindar Singh 

According to the ISA the root page directory size of a radix tree for
either process- or partition-scoped translation must be >= 5.

Thus add this to the list of conditions checked when validating the
partition table entry in validate_pate();

Signed-off-by: Suraj Jitindar Singh 
Reviewed-by: David Gibson 
Signed-off-by: Cédric Le Goater 
Message-Id: <20200330094946.24678-2-...@kaod.org>
Reviewed-by: Greg Kurz 
Signed-off-by: David Gibson 
---
 target/ppc/mmu-radix64.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 224e646c50..9967857058 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -212,6 +212,9 @@ static bool validate_pate(PowerPCCPU *cpu, uint64_t lpid, 
ppc_v3_pate_t *pate)
 if (lpid == 0 && !msr_hv) {
 return false;
 }
+if ((pate->dw0 & PATE1_R_PRTS) < 5) {
+return false;
+}
 /* More checks ... */
 return true;
 }
-- 
2.26.2




[PULL 09/18] target/ppc: Introduce a relocation bool in ppc_radix64_handle_mmu_fault()

2020-05-06 Thread David Gibson
From: Cédric Le Goater 

It will ease the introduction of new routines for partition-scoped
Radix translation.

Signed-off-by: Suraj Jitindar Singh 
Signed-off-by: Cédric Le Goater 
Message-Id: <20200330094946.24678-3-...@kaod.org>
Reviewed-by: Greg Kurz 
Signed-off-by: David Gibson 
---
 target/ppc/mmu-radix64.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index 9967857058..f6007e9565 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -229,12 +229,13 @@ int ppc_radix64_handle_mmu_fault(PowerPCCPU *cpu, vaddr 
eaddr, int rwx,
 uint64_t lpid = 0, pid = 0, offset, size, prtbe0, pte;
 int page_size, prot, fault_cause = 0;
 ppc_v3_pate_t pate;
+bool relocation;
 
 assert((rwx == 0) || (rwx == 1) || (rwx == 2));
 
+relocation = ((rwx == 2) && (msr_ir == 1)) || ((rwx != 2) && (msr_dr == 
1));
 /* HV or virtual hypervisor Real Mode Access */
-if ((msr_hv || cpu->vhyp) &&
-(((rwx == 2) && (msr_ir == 0)) || ((rwx != 2) && (msr_dr == 0 {
+if (!relocation && (msr_hv || cpu->vhyp)) {
 /* In real mode top 4 effective addr bits (mostly) ignored */
 raddr = eaddr & 0x0FFFULL;
 
-- 
2.26.2




[PULL 17/18] spapr_nvdimm: Tweak error messages

2020-05-06 Thread David Gibson
The restrictions here (which are checked at pre-plug time) are PAPR
specific, rather than being inherent to the NVDIMM devices.  Adjust the
error messages to be clearer about this.

Signed-off-by: David Gibson 
---
 hw/ppc/spapr_nvdimm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 9abcdcc26b..81410aa63f 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -39,13 +39,13 @@ void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, 
uint64_t size,
 
 if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP,
&error_abort) == 0) {
-error_setg(errp, "NVDIMM device requires label-size to be set");
+error_setg(errp, "PAPR requires NVDIMM devices to have label-size 
set");
 return;
 }
 
 if (size % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
-error_setg(errp, "NVDIMM memory size excluding the label area"
-   " must be a multiple of %" PRIu64 "MB",
+error_setg(errp, "PAPR requires NVDIMM memory size (excluding label)"
+   " to be a multiple of %" PRIu64 "MB",
SPAPR_MINIMUM_SCM_BLOCK_SIZE / MiB);
 return;
 }
-- 
2.26.2




[PULL 16/18] spapr_nvdimm.c: make 'label-size' mandatory

2020-05-06 Thread David Gibson
From: Daniel Henrique Barboza 

The pseries machine does not support NVDIMM modules without label.
Attempting to do so, even if the overall block size is aligned with
256MB, will seg fault the guest kernel during NVDIMM probe. This
can be avoided by forcing 'label-size' to always be present for
sPAPR NVDIMMs.

The verification was put before the alignment check because the
presence of label-size affects the alignment calculation, so
it's not optimal to warn the user about an alignment error,
then about the lack of label-size, then about a new alignment
error when the user sets a label-size.

Signed-off-by: Daniel Henrique Barboza 
Message-Id: <20200413203628.31636-1-danielhb...@gmail.com>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr_nvdimm.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
index 25be8082d7..9abcdcc26b 100644
--- a/hw/ppc/spapr_nvdimm.c
+++ b/hw/ppc/spapr_nvdimm.c
@@ -37,6 +37,12 @@ void spapr_nvdimm_validate_opts(NVDIMMDevice *nvdimm, 
uint64_t size,
 QemuUUID uuid;
 int ret;
 
+if (object_property_get_int(OBJECT(nvdimm), NVDIMM_LABEL_SIZE_PROP,
+&error_abort) == 0) {
+error_setg(errp, "NVDIMM device requires label-size to be set");
+return;
+}
+
 if (size % SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
 error_setg(errp, "NVDIMM memory size excluding the label area"
" must be a multiple of %" PRIu64 "MB",
-- 
2.26.2




[PULL 10/18] target/ppc: Assert if HV mode is set when running under a pseries machine

2020-05-06 Thread David Gibson
From: Cédric Le Goater 

Signed-off-by: Suraj Jitindar Singh 
Signed-off-by: Cédric Le Goater 
Message-Id: <20200330094946.24678-4-...@kaod.org>
Reviewed-by: Greg Kurz 
Signed-off-by: David Gibson 
---
 target/ppc/mmu-radix64.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index f6007e9565..d2422d1c54 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -231,6 +231,7 @@ int ppc_radix64_handle_mmu_fault(PowerPCCPU *cpu, vaddr 
eaddr, int rwx,
 ppc_v3_pate_t pate;
 bool relocation;
 
+assert(!(msr_hv && cpu->vhyp));
 assert((rwx == 0) || (rwx == 1) || (rwx == 2));
 
 relocation = ((rwx == 2) && (msr_ir == 1)) || ((rwx != 2) && (msr_dr == 
1));
-- 
2.26.2




[PULL 11/18] spapr: Don't allow unplug of NVLink2 devices

2020-05-06 Thread David Gibson
Currently, we can't properly handle unplug of NVLink2 devices, because we
don't have code to tear down their special memory resources.  There's not
a lot of impetus to implement that: since hardware NVLink2 devices can't
be hot unplugged, the guest side drivers don't usually support unplug
anyway.

Therefore, simply prevent unplug of NVLink2 devices.

Signed-off-by: David Gibson 
Reviewed-by: Alexey Kardashevskiy 
---
 hw/ppc/spapr_pci.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 55ca9dee1e..61b84a392d 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -1665,6 +1665,10 @@ static void spapr_pci_unplug_request(HotplugHandler 
*plug_handler,
 error_setg(errp, "PCI: Hot unplug of PCI bridges not supported");
 return;
 }
+if (object_property_get_uint(OBJECT(pdev), "nvlink2-tgt", NULL)) {
+error_setg(errp, "PCI: Cannot unplug NVLink2 devices");
+return;
+}
 
 /* ensure any other present functions are pending unplug */
 if (PCI_FUNC(pdev->devfn) == 0) {
-- 
2.26.2




[PULL 12/18] target/ppc: Introduce ppc_radix64_xlate() for Radix tree translation

2020-05-06 Thread David Gibson
From: Cédric Le Goater 

This is moving code under a new ppc_radix64_xlate() routine shared by
the MMU Radix page fault handler and the 'get_phys_page_debug' PPC
callback. The difference being that 'get_phys_page_debug' does not
generate exceptions.

The specific part of process-scoped Radix translation is moved under
ppc_radix64_process_scoped_xlate() in preparation of the future support
for partition-scoped Radix translation. Routines raising the exceptions
now take a 'cause_excp' bool to cover the 'get_phys_page_debug' case.

It should be functionally equivalent.

Signed-off-by: Suraj Jitindar Singh 
Signed-off-by: Cédric Le Goater 
Message-Id: <20200403140056.59465-2-...@kaod.org>
Reviewed-by: Greg Kurz 
Signed-off-by: David Gibson 
---
 target/ppc/mmu-radix64.c | 219 ++-
 1 file changed, 123 insertions(+), 96 deletions(-)

diff --git a/target/ppc/mmu-radix64.c b/target/ppc/mmu-radix64.c
index d2422d1c54..4b0d0ff50a 100644
--- a/target/ppc/mmu-radix64.c
+++ b/target/ppc/mmu-radix64.c
@@ -219,17 +219,127 @@ static bool validate_pate(PowerPCCPU *cpu, uint64_t 
lpid, ppc_v3_pate_t *pate)
 return true;
 }
 
+static int ppc_radix64_process_scoped_xlate(PowerPCCPU *cpu, int rwx,
+vaddr eaddr, uint64_t pid,
+ppc_v3_pate_t pate, hwaddr 
*g_raddr,
+int *g_prot, int *g_page_size,
+bool cause_excp)
+{
+CPUState *cs = CPU(cpu);
+uint64_t offset, size, prtbe_addr, prtbe0, pte;
+int fault_cause = 0;
+hwaddr pte_addr;
+
+/* Index Process Table by PID to Find Corresponding Process Table Entry */
+offset = pid * sizeof(struct prtb_entry);
+size = 1ULL << ((pate.dw1 & PATE1_R_PRTS) + 12);
+if (offset >= size) {
+/* offset exceeds size of the process table */
+if (cause_excp) {
+ppc_radix64_raise_si(cpu, rwx, eaddr, DSISR_NOPTE);
+}
+return 1;
+}
+prtbe_addr = (pate.dw1 & PATE1_R_PRTB) + offset;
+prtbe0 = ldq_phys(cs->as, prtbe_addr);
+
+/* Walk Radix Tree from Process Table Entry to Convert EA to RA */
+*g_page_size = PRTBE_R_GET_RTS(prtbe0);
+pte = ppc_radix64_walk_tree(cpu, eaddr & R_EADDR_MASK,
+prtbe0 & PRTBE_R_RPDB, prtbe0 & PRTBE_R_RPDS,
+g_raddr, g_page_size, &fault_cause, &pte_addr);
+
+if (!(pte & R_PTE_VALID) ||
+ppc_radix64_check_prot(cpu, rwx, pte, &fault_cause, g_prot)) {
+/* No valid pte or access denied due to protection */
+if (cause_excp) {
+ppc_radix64_raise_si(cpu, rwx, eaddr, fault_cause);
+}
+return 1;
+}
+
+ppc_radix64_set_rc(cpu, rwx, pte, pte_addr, g_prot);
+
+return 0;
+}
+
+static int ppc_radix64_xlate(PowerPCCPU *cpu, vaddr eaddr, int rwx,
+ bool relocation,
+ hwaddr *raddr, int *psizep, int *protp,
+ bool cause_excp)
+{
+uint64_t lpid = 0, pid = 0;
+ppc_v3_pate_t pate;
+int psize, prot;
+hwaddr g_raddr;
+
+/* Virtual Mode Access - get the fully qualified address */
+if (!ppc_radix64_get_fully_qualified_addr(&cpu->env, eaddr, &lpid, &pid)) {
+if (cause_excp) {
+ppc_radix64_raise_segi(cpu, rwx, eaddr);
+}
+return 1;
+}
+
+/* Get Process Table */
+if (cpu->vhyp) {
+PPCVirtualHypervisorClass *vhc;
+vhc = PPC_VIRTUAL_HYPERVISOR_GET_CLASS(cpu->vhyp);
+vhc->get_pate(cpu->vhyp, &pate);
+} else {
+if (!ppc64_v3_get_pate(cpu, lpid, &pate)) {
+if (cause_excp) {
+ppc_radix64_raise_si(cpu, rwx, eaddr, DSISR_NOPTE);
+}
+return 1;
+}
+if (!validate_pate(cpu, lpid, &pate)) {
+if (cause_excp) {
+ppc_radix64_raise_si(cpu, rwx, eaddr, DSISR_R_BADCONFIG);
+}
+return 1;
+}
+/* We don't support guest mode yet */
+if (lpid != 0) {
+error_report("PowerNV guest support Unimplemented");
+exit(1);
+}
+}
+
+*psizep = INT_MAX;
+*protp = PAGE_READ | PAGE_WRITE | PAGE_EXEC;
+
+/*
+ * Perform process-scoped translation if relocation enabled.
+ *
+ * - Translates an effective address to a host real address in
+ *   quadrants 0 and 3 when HV=1.
+ */
+if (relocation) {
+int ret = ppc_radix64_process_scoped_xlate(cpu, rwx, eaddr, pid,
+   pate, &g_raddr, &prot,
+   &psize, cause_excp);
+if (ret) {
+return ret;
+}
+*psizep = MIN(*psizep, psize);
+*protp &= prot;
+} else {
+g_raddr = eaddr & R_EADDR_MASK;
+}
+
+*raddr = g_raddr;
+return 0;
+}
+
 int 

[PULL 01/18] target/ppc: Improve syscall exception logging

2020-05-06 Thread David Gibson
From: Nicholas Piggin 

system calls (at least in Linux) use registers r3-r8 for inputs, so
include those registers in the dump.

This also adds a mode for PAPR hcalls, which have a different calling
convention.

Signed-off-by: Nicholas Piggin 
Message-Id: <20200317054918.199161-1-npig...@gmail.com>
Signed-off-by: David Gibson 
---
 target/ppc/excp_helper.c | 30 ++
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 08bc885ca6..81ee19ebae 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -57,12 +57,29 @@ static void ppc_hw_interrupt(CPUPPCState *env)
 #else /* defined(CONFIG_USER_ONLY) */
 static inline void dump_syscall(CPUPPCState *env)
 {
-qemu_log_mask(CPU_LOG_INT, "syscall r0=%016" PRIx64 " r3=%016" PRIx64
-  " r4=%016" PRIx64 " r5=%016" PRIx64 " r6=%016" PRIx64
+qemu_log_mask(CPU_LOG_INT, "syscall r0=%016" PRIx64
+  " r3=%016" PRIx64 " r4=%016" PRIx64 " r5=%016" PRIx64
+  " r6=%016" PRIx64 " r7=%016" PRIx64 " r8=%016" PRIx64
   " nip=" TARGET_FMT_lx "\n",
   ppc_dump_gpr(env, 0), ppc_dump_gpr(env, 3),
   ppc_dump_gpr(env, 4), ppc_dump_gpr(env, 5),
-  ppc_dump_gpr(env, 6), env->nip);
+  ppc_dump_gpr(env, 6), ppc_dump_gpr(env, 7),
+  ppc_dump_gpr(env, 8), env->nip);
+}
+
+static inline void dump_hcall(CPUPPCState *env)
+{
+qemu_log_mask(CPU_LOG_INT, "hypercall r3=%016" PRIx64
+ " r4=%016" PRIx64 " r5=%016" PRIx64 " r6=%016" PRIx64
+ " r7=%016" PRIx64 " r8=%016" PRIx64 " r9=%016" PRIx64
+ " r10=%016" PRIx64 " r11=%016" PRIx64 " r12=%016" PRIx64
+  " nip=" TARGET_FMT_lx "\n",
+  ppc_dump_gpr(env, 3), ppc_dump_gpr(env, 4),
+ ppc_dump_gpr(env, 5), ppc_dump_gpr(env, 6),
+ ppc_dump_gpr(env, 7), ppc_dump_gpr(env, 8),
+ ppc_dump_gpr(env, 9), ppc_dump_gpr(env, 10),
+ ppc_dump_gpr(env, 11), ppc_dump_gpr(env, 12),
+ env->nip);
 }
 
 static int powerpc_reset_wakeup(CPUState *cs, CPUPPCState *env, int excp,
@@ -379,9 +396,14 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp)
 }
 break;
 case POWERPC_EXCP_SYSCALL:   /* System call exception*/
-dump_syscall(env);
 lev = env->error_code;
 
+if ((lev == 1) && cpu->vhyp) {
+dump_hcall(env);
+} else {
+dump_syscall(env);
+}
+
 /*
  * We need to correct the NIP which in this case is supposed
  * to point to the next instruction
-- 
2.26.2




[PULL 05/18] spapr: Simplify selection of radix/hash during CAS

2020-05-06 Thread David Gibson
From: Greg Kurz 

The guest can select the MMU mode by setting bits 0-1 of byte 24
in OV5 to to 0b00 for hash or 0b01 for radix. As required by the
architecture, we terminate the boot process if any other value
is found there.

The usual way to negotiate features in OV5 is basically ANDing
the bitfield provided by the guest and the bitfield of features
supported by QEMU, previously populated at machine init.

For some not documented reason, MMU is treated differently : bit 1
of byte 24 (the radix/hash bit) is cleared from the guest OV5 and
explicitely set in the final negotiated OV5 if radix was requested.

Since the only expected input from the guest is the radix/hash bit
being set or not, it seems more appropriate to handle this like we
do for XIVE.

Set the radix bit in spapr->ov5 at machine init if it has a chance
to work (ie. power9, either TCG or a radix capable KVM) and rely
exclusively on spapr_ovec_intersect() to set the radix bit in
spapr->ov5_cas.

Signed-off-by: Greg Kurz 
Message-Id: <158514993621.478799.4204740354545734293.st...@bahia.lan>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c   | 1 +
 hw/ppc/spapr_hcall.c | 6 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 785c41d205..167b1216ba 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -2837,6 +2837,7 @@ static void spapr_machine_init(MachineState *machine)
 if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
 ppc_type_check_compat(machine->cpu_type, CPU_POWERPC_LOGICAL_3_00, 0,
   spapr->max_compat_pvr)) {
+spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_300);
 /* KVM and TCG always allow GTSE with radix... */
 spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
 }
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index e8ee447537..fb4fdd4a0c 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1739,9 +1739,7 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 exit(EXIT_FAILURE);
 }
 
-/* The radix/hash bit in byte 24 requires special handling: */
 guest_radix = spapr_ovec_test(ov5_guest, OV5_MMU_RADIX_300);
-spapr_ovec_clear(ov5_guest, OV5_MMU_RADIX_300);
 
 guest_xive = spapr_ovec_test(ov5_guest, OV5_XIVE_EXPLOIT);
 
@@ -1786,14 +1784,12 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 /* full range of negotiated ov5 capabilities */
 spapr_ovec_intersect(spapr->ov5_cas, spapr->ov5, ov5_guest);
 spapr_ovec_cleanup(ov5_guest);
-/* Now that processing is finished, set the radix/hash bit for the
- * guest if it requested a valid mode; otherwise terminate the boot. */
+
 if (guest_radix) {
 if (kvm_enabled() && !kvmppc_has_cap_mmu_radix()) {
 error_report("Guest requested unavailable MMU mode (radix).");
 exit(EXIT_FAILURE);
 }
-spapr_ovec_set(spapr->ov5_cas, OV5_MMU_RADIX_300);
 } else {
 if (kvm_enabled() && kvmppc_has_cap_mmu_radix()
 && !kvmppc_has_cap_mmu_hash_v3()) {
-- 
2.26.2




[PULL 02/18] spapr: Don't check capabilities removed between CAS calls

2020-05-06 Thread David Gibson
From: Greg Kurz 

We currently check if some capability in OV5 was removed by the guest
since the previous CAS, and we trigger a CAS reboot in that case. This
was required because it could call for a device-tree property or node
removal, that we didn't support until recently (see commit 6787d27b04a7
"spapr: add option vector handling in CAS-generated resets" for details).

Now that we render a full FDT at CAS and that SLOF is able to handle
node removal, we don't need to do a CAS reset in this case anymore.
Also, this check can only return true if the guest has already called
CAS since the last full system reset (otherwise spapr->ov5_cas is
empty). Linux doesn't do that so this can be considered as dead code
for the vast majority of existing setups.

Drop the check. Since the only use of the ov5_cas_old variable is
precisely the check itself, drop the variable as well.

Signed-off-by: Greg Kurz 
Message-Id: <158514993021.478799.10928618293640651819.st...@bahia.lan>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr_hcall.c | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index 0d50fc9117..e8ee447537 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1676,7 +1676,7 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 target_ulong fdt_bufsize = args[2];
 target_ulong ov_table;
 uint32_t cas_pvr;
-SpaprOptionVector *ov1_guest, *ov5_guest, *ov5_cas_old;
+SpaprOptionVector *ov1_guest, *ov5_guest;
 bool guest_radix;
 Error *local_err = NULL;
 bool raw_mode_supported = false;
@@ -1782,22 +1782,10 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
  * by LoPAPR 1.1, 14.5.4.8, which QEMU doesn't implement, we don't need
  * to worry about this for now.
  */
-ov5_cas_old = spapr_ovec_clone(spapr->ov5_cas);
-
-/* also clear the radix/hash bit from the current ov5_cas bits to
- * be in sync with the newly ov5 bits. Else the radix bit will be
- * seen as being removed and this will generate a reset loop
- */
-spapr_ovec_clear(ov5_cas_old, OV5_MMU_RADIX_300);
 
 /* full range of negotiated ov5 capabilities */
 spapr_ovec_intersect(spapr->ov5_cas, spapr->ov5, ov5_guest);
 spapr_ovec_cleanup(ov5_guest);
-/* capabilities that have been added since CAS-generated guest reset.
- * if capabilities have since been removed, generate another reset
- */
-spapr->cas_reboot = !spapr_ovec_subset(ov5_cas_old, spapr->ov5_cas);
-spapr_ovec_cleanup(ov5_cas_old);
 /* Now that processing is finished, set the radix/hash bit for the
  * guest if it requested a valid mode; otherwise terminate the boot. */
 if (guest_radix) {
-- 
2.26.2




[PULL 04/18] ppc/pnv: Add support for NMI interface

2020-05-06 Thread David Gibson
From: Nicholas Piggin 

This implements the NMI interface for the PNV machine, similarly to
commit 3431648272d ("spapr: Add support for new NMI interface") for
SPAPR.

Signed-off-by: Nicholas Piggin 
Message-Id: <20200325144147.221875-3-npig...@gmail.com>
Reviewed-by: Cédric Le Goater 
Signed-off-by: David Gibson 
---
 hw/ppc/pnv.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/hw/ppc/pnv.c b/hw/ppc/pnv.c
index c9cb6fa357..a3b7a8d0ff 100644
--- a/hw/ppc/pnv.c
+++ b/hw/ppc/pnv.c
@@ -27,6 +27,7 @@
 #include "sysemu/runstate.h"
 #include "sysemu/cpus.h"
 #include "sysemu/device_tree.h"
+#include "sysemu/hw_accel.h"
 #include "target/ppc/cpu.h"
 #include "qemu/log.h"
 #include "hw/ppc/fdt.h"
@@ -34,6 +35,7 @@
 #include "hw/ppc/pnv.h"
 #include "hw/ppc/pnv_core.h"
 #include "hw/loader.h"
+#include "hw/nmi.h"
 #include "exec/address-spaces.h"
 #include "qapi/visitor.h"
 #include "monitor/monitor.h"
@@ -1977,10 +1979,35 @@ static void pnv_machine_set_hb(Object *obj, bool value, 
Error **errp)
 }
 }
 
+static void pnv_cpu_do_nmi_on_cpu(CPUState *cs, run_on_cpu_data arg)
+{
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+CPUPPCState *env = &cpu->env;
+
+cpu_synchronize_state(cs);
+ppc_cpu_do_system_reset(cs);
+/*
+ * SRR1[42:45] is set to 0100 which the ISA defines as implementation
+ * dependent. POWER processors use this for xscom triggered interrupts,
+ * which come from the BMC or NMI IPIs.
+ */
+env->spr[SPR_SRR1] |= PPC_BIT(43);
+}
+
+static void pnv_nmi(NMIState *n, int cpu_index, Error **errp)
+{
+CPUState *cs;
+
+CPU_FOREACH(cs) {
+async_run_on_cpu(cs, pnv_cpu_do_nmi_on_cpu, RUN_ON_CPU_NULL);
+}
+}
+
 static void pnv_machine_class_init(ObjectClass *oc, void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
 InterruptStatsProviderClass *ispc = INTERRUPT_STATS_PROVIDER_CLASS(oc);
+NMIClass *nc = NMI_CLASS(oc);
 
 mc->desc = "IBM PowerNV (Non-Virtualized)";
 mc->init = pnv_init;
@@ -1997,6 +2024,7 @@ static void pnv_machine_class_init(ObjectClass *oc, void 
*data)
 mc->default_ram_size = INITRD_LOAD_ADDR + INITRD_MAX_SIZE;
 mc->default_ram_id = "pnv.ram";
 ispc->print_info = pnv_pic_print_info;
+nc->nmi_monitor_handler = pnv_nmi;
 
 object_class_property_add_bool(oc, "hb-mode",
pnv_machine_get_hb, pnv_machine_set_hb,
@@ -2060,6 +2088,7 @@ static const TypeInfo types[] = {
 .class_size= sizeof(PnvMachineClass),
 .interfaces = (InterfaceInfo[]) {
 { TYPE_INTERRUPT_STATS_PROVIDER },
+{ TYPE_NMI },
 { },
 },
 },
-- 
2.26.2




[PULL 06/18] spapr/cas: Separate CAS handling from rebuilding the FDT

2020-05-06 Thread David Gibson
From: Alexey Kardashevskiy 

At the moment "ibm,client-architecture-support" ("CAS") is implemented
in SLOF and QEMU assists via the custom H_CAS hypercall which copies
an updated flatten device tree (FDT) blob to the SLOF memory which
it then uses to update its internal tree.

When we enable the OpenFirmware client interface in QEMU, we won't need
to copy the FDT to the guest as the client is expected to fetch
the device tree using the client interface.

This moves FDT rebuild out to a separate helper which is going to be
called from the "ibm,client-architecture-support" handler and leaves
writing FDT to the guest in the H_CAS handler.

This should not cause any behavioral change.

Signed-off-by: Alexey Kardashevskiy 
Message-Id: <20200310050733.29805-3-...@ozlabs.ru>
Signed-off-by: Greg Kurz 
Message-Id: <158514994229.478799.2178881312094922324.st...@bahia.lan>
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c |  1 -
 hw/ppc/spapr_hcall.c   | 67 ++
 include/hw/ppc/spapr.h |  7 +
 3 files changed, 48 insertions(+), 27 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 167b1216ba..f52488d397 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -96,7 +96,6 @@
  *
  * We load our kernel at 4M, leaving space for SLOF initial image
  */
-#define FDT_MAX_SIZE0x10
 #define RTAS_MAX_ADDR   0x8000 /* RTAS must stay below that */
 #define FW_MAX_SIZE 0x40
 #define FW_FILE_NAME"slof.bin"
diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
index fb4fdd4a0c..48a8745514 100644
--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@@ -1665,16 +1665,12 @@ static void 
spapr_handle_transient_dev_before_cas(SpaprMachineState *spapr)
 spapr_clear_pending_hotplug_events(spapr);
 }
 
-static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
-  SpaprMachineState *spapr,
-  target_ulong opcode,
-  target_ulong *args)
+target_ulong do_client_architecture_support(PowerPCCPU *cpu,
+SpaprMachineState *spapr,
+target_ulong vec,
+target_ulong fdt_bufsize)
 {
-/* Working address in data buffer */
-target_ulong addr = ppc64_phys_to_real(args[0]);
-target_ulong fdt_buf = args[1];
-target_ulong fdt_bufsize = args[2];
-target_ulong ov_table;
+target_ulong ov_table; /* Working address in data buffer */
 uint32_t cas_pvr;
 SpaprOptionVector *ov1_guest, *ov5_guest;
 bool guest_radix;
@@ -1694,7 +1690,7 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 }
 }
 
-cas_pvr = cas_check_pvr(spapr, cpu, &addr, &raw_mode_supported,
-&local_err);
+cas_pvr = cas_check_pvr(spapr, cpu, &vec, &raw_mode_supported, &local_err);
 if (local_err) {
 error_report_err(local_err);
 return H_HARDWARE;
@@ -1717,7 +1713,7 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 }
 
 /* For the future use: here @ov_table points to the first option vector */
-ov_table = addr;
+ov_table = vec;
 
 ov1_guest = spapr_ovec_parse_vector(ov_table, 1);
 if (!ov1_guest) {
@@ -1824,7 +1820,6 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 
 if (!spapr->cas_reboot) {
 void *fdt;
-SpaprDeviceTreeUpdateHeader hdr = { .version_id = 1 };
 
 /* If spapr_machine_reset() did not set up a HPT but one is necessary
  * (because the guest isn't going to use radix) then set it up here. */
@@ -1833,21 +1828,7 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 spapr_setup_hpt(spapr);
 }
 
-if (fdt_bufsize < sizeof(hdr)) {
-error_report("SLOF provided insufficient CAS buffer "
- TARGET_FMT_lu " (min: %zu)", fdt_bufsize, 
sizeof(hdr));
-exit(EXIT_FAILURE);
-}
-
-fdt_bufsize -= sizeof(hdr);
-
 fdt = spapr_build_fdt(spapr, false, fdt_bufsize);
-_FDT((fdt_pack(fdt)));
-
-cpu_physical_memory_write(fdt_buf, &hdr, sizeof(hdr));
-cpu_physical_memory_write(fdt_buf + sizeof(hdr), fdt,
-  fdt_totalsize(fdt));
-trace_spapr_cas_continue(fdt_totalsize(fdt) + sizeof(hdr));
 
 g_free(spapr->fdt_blob);
 spapr->fdt_size = fdt_totalsize(fdt);
@@ -1862,6 +1843,40 @@ static target_ulong 
h_client_architecture_support(PowerPCCPU *cpu,
 return H_SUCCESS;
 }
 
+static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
+  SpaprMachineState *spapr,
+  target_ulong opcode,
+  target_ulong *args)
+{
+  

[PULL 00/18] ppc-for-5.1 queue 20200507

2020-05-06 Thread David Gibson
The following changes since commit 570a9214827e3d42f7173c4d4c9f045b99834cf0:

  Merge remote-tracking branch 
'remotes/alistair/tags/pull-reg-to-apply-20200505' into staging (2020-05-06 
15:38:02 +0100)

are available in the Git repository at:

  git://github.com/dgibson/qemu.git tags/ppc-for-5.1-20200507

for you to fetch changes up to c4f6a4a3dd5f2aa15329b8158de25f50b5ba3252:

  target-ppc: fix rlwimi, rlwinm, rlwnm for Clang-9 (2020-05-07 11:10:50 +1000)


ppc patch queue for 2020-04-07

First pull request for qemu-5.1.  This includes:
 * Removal of all remaining cases where we had CAS triggered reboots
 * A number of improvements to NMI injection
 * Support for partition scoped radix translation in softmmu
 * Some fixes for NVDIMM handling
 * A handful of other minor fixes


Alexey Kardashevskiy (1):
  spapr/cas: Separate CAS handling from rebuilding the FDT

Cédric Le Goater (6):
  target/ppc: Introduce a relocation bool in ppc_radix64_handle_mmu_fault()
  target/ppc: Assert if HV mode is set when running under a pseries machine
  target/ppc: Introduce ppc_radix64_xlate() for Radix tree translation
  target/ppc: Extend ppc_radix64_check_prot() with a 'partition_scoped' bool
  target/ppc: Rework ppc_radix64_walk_tree() for partition-scoped 
translation
  target/ppc: Add support for Radix partition-scoped translation

Daniel Henrique Barboza (1):
  spapr_nvdimm.c: make 'label-size' mandatory

Daniele Buono (1):
  target-ppc: fix rlwimi, rlwinm, rlwnm for Clang-9

David Gibson (2):
  spapr: Don't allow unplug of NVLink2 devices
  spapr_nvdimm: Tweak error messages

Greg Kurz (3):
  spapr: Don't check capabilities removed between CAS calls
  spapr: Simplify selection of radix/hash during CAS
  spapr: Drop CAS reboot flag

Nicholas Piggin (3):
  target/ppc: Improve syscall exception logging
  ppc/spapr: tweak change system reset helper
  ppc/pnv: Add support for NMI interface

Suraj Jitindar Singh (1):
  target/ppc: Enforce that the root page directory size must be at least 5

 hw/ppc/pnv.c |  29 +++
 hw/ppc/spapr.c   |  29 ++-
 hw/ppc/spapr_hcall.c | 108 ++-
 hw/ppc/spapr_nvdimm.c|  10 +-
 hw/ppc/spapr_pci.c   |   4 +
 include/hw/ppc/spapr.h   |   8 +-
 target/ppc/cpu.h |   5 +-
 target/ppc/excp_helper.c |  38 +++-
 target/ppc/mmu-radix64.c | 468 ++-
 target/ppc/translate.c   |  24 ++-
 10 files changed, 506 insertions(+), 217 deletions(-)



[PULL 03/18] ppc/spapr: tweak change system reset helper

2020-05-06 Thread David Gibson
From: Nicholas Piggin 

Rather than have the helper take an optional vector address
override, instead have its caller modify env->nip itself.
This is more consistent when adding pnv nmi support, and also
with mce injection added later.

Signed-off-by: Nicholas Piggin 
Message-Id: <20200325144147.221875-2-npig...@gmail.com>
Reviewed-by: Cédric Le Goater 
Signed-off-by: David Gibson 
---
 hw/ppc/spapr.c   | 9 ++---
 target/ppc/cpu.h | 2 +-
 target/ppc/excp_helper.c | 5 +
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 9a2bd501aa..785c41d205 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -3385,13 +3385,13 @@ static void spapr_machine_finalizefn(Object *obj)
 void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg)
 {
 SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+CPUPPCState *env = &cpu->env;
 
 cpu_synchronize_state(cs);
 /* If FWNMI is inactive, addr will be -1, which will deliver to 0x100 */
 if (spapr->fwnmi_system_reset_addr != -1) {
 uint64_t rtas_addr, addr;
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = &cpu->env;
 
 /* get rtas addr from fdt */
 rtas_addr = spapr_get_rtas_addr();
@@ -3405,7 +3405,10 @@ void spapr_do_system_reset_on_cpu(CPUState *cs, 
run_on_cpu_data arg)
-stq_be_phys(&address_space_memory, addr + sizeof(uint64_t), 0);
 env->gpr[3] = addr;
 }
-ppc_cpu_do_system_reset(cs, spapr->fwnmi_system_reset_addr);
+ppc_cpu_do_system_reset(cs);
+if (spapr->fwnmi_system_reset_addr != -1) {
+env->nip = spapr->fwnmi_system_reset_addr;
+}
 }
 
 static void spapr_nmi(NMIState *n, int cpu_index, Error **errp)
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 88d9449555..f4a5304d43 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -1220,7 +1220,7 @@ int ppc64_cpu_write_elf64_note(WriteCoreDumpFunction f, 
CPUState *cs,
 int ppc32_cpu_write_elf32_note(WriteCoreDumpFunction f, CPUState *cs,
int cpuid, void *opaque);
 #ifndef CONFIG_USER_ONLY
-void ppc_cpu_do_system_reset(CPUState *cs, target_ulong vector);
+void ppc_cpu_do_system_reset(CPUState *cs);
 void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, target_ulong vector);
 extern const VMStateDescription vmstate_ppc_cpu;
 #endif
diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 81ee19ebae..1acc3786de 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -983,15 +983,12 @@ static void ppc_hw_interrupt(CPUPPCState *env)
 }
 }
 
-void ppc_cpu_do_system_reset(CPUState *cs, target_ulong vector)
+void ppc_cpu_do_system_reset(CPUState *cs)
 {
 PowerPCCPU *cpu = POWERPC_CPU(cs);
 CPUPPCState *env = &cpu->env;
 
 powerpc_excp(cpu, env->excp_model, POWERPC_EXCP_RESET);
-if (vector != -1) {
-env->nip = vector;
-}
 }
 
 void ppc_cpu_do_fwnmi_machine_check(CPUState *cs, target_ulong vector)
-- 
2.26.2




Re: [PATCH v3] tests/qht-bench: Fix Clang 'implicit-int-float-conversion' warning

2020-05-06 Thread Emilio G. Cota
On Mon, May 04, 2020 at 16:43:52 +0200, Philippe Mathieu-Daudé wrote:
> When building with Clang 10 on Fedora 32, we get:
> 
>   tests/qht-bench.c:287:29: error: implicit conversion from 'unsigned long' 
> to 'double' changes value from 18446744073709551615 to 18446744073709551616 
> [-Werror,-Wimplicit-int-float-conversion]
(snip)
> @@ -284,7 +285,7 @@ static void do_threshold(double rate, uint64_t *threshold)
>  if (rate == 1.0) {
>  *threshold = UINT64_MAX;
>  } else {
> -*threshold = rate * UINT64_MAX;
> +*threshold = rate * nextafter(0x1p64, 0.0);

Reviewed-by: Emilio G. Cota 

Please consider mentioning 25f74087c69 in the commit log -- it clearly
describes the problem.

Thanks,

Emilio



Re: [PATCH v25 00/10] Add ARMv8 RAS virtualization support in QEMU

2020-05-06 Thread gengdongjiu
On 2020/5/7 4:25, Michael S. Tsirkin wrote:
> On Wed, May 06, 2020 at 07:42:19PM +0800, gengdongjiu wrote:
>> On 2020/4/17 21:32, Peter Maydell wrote:
>>> On Fri, 10 Apr 2020 at 12:46, Dongjiu Geng  wrote:

 In the ARMv8 platform, the CPU error types includes synchronous external 
 abort(SEA)
 and SError Interrupt (SEI). If exception happens in guest, host does not 
 know the detailed
 information of guest, so it is expected that guest can do the recovery. 
 For example, if an
 exception happens in a guest user-space application, host does not know 
 which application
 encounters errors, only guest knows it.

 For the ARMv8 SEA/SEI, KVM or host kernel delivers SIGBUS to notify 
 userspace.
 After user space gets the notification, it will record the CPER into guest 
 GHES
 buffer and inject an exception or IRQ to guest.

 In the current implementation, if the type of SIGBUS is BUS_MCEERR_AR, we 
 will
 treat it as a synchronous exception, and notify guest with ARMv8 SEA
 notification type after recording CPER into guest.
>>>
>>> Hi. I left a comment on patch 1. The other 3 patches unreviewed
>>> are 5, 6 and 8, which are all ACPI core code, so that's for
>>> MST, Igor or Shannon to review.
>>>
>>> Once those have been reviewed, please ping me if you want this
>>> to go via target-arm.next.
>>
>> Hi Peter,
>>Igor has reviewed all the ACPI core code. Could you apply this series 
>> to target-arm.next? I can make further patches to address your comments on 
>> patch 1 and the other ACPI comment.
>> Thanks very much in advance.
> 
> Given it all starts with patch 1, it's probably easier to address the
> comment and repost.

Ok, I will do it. thanks.

> 
> 
>>>
>>> thanks
>>> -- PMM
>>>
>>> .
>>>
> 
> .
> 




[Bug 1805256] Re: qemu-img hangs on rcu_call_ready_event logic in Aarch64 when converting images

2020-05-06 Thread Launchpad Bug Tracker
** Merge proposal linked:
   
https://code.launchpad.net/~rafaeldtinoco/ubuntu/+source/qemu/+git/qemu/+merge/383566

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on rcu_call_ready_event logic in Aarch64 when
  converting images

Status in kunpeng920:
  Triaged
Status in kunpeng920 ubuntu-18.04 series:
  Triaged
Status in kunpeng920 ubuntu-18.04-hwe series:
  Triaged
Status in kunpeng920 ubuntu-19.10 series:
  Triaged
Status in kunpeng920 ubuntu-20.04 series:
  Triaged
Status in kunpeng920 upstream-kernel series:
  Fix Committed
Status in QEMU:
  Fix Released
Status in qemu package in Ubuntu:
  In Progress
Status in qemu source package in Bionic:
  In Progress
Status in qemu source package in Disco:
  In Progress
Status in qemu source package in Eoan:
  In Progress
Status in qemu source package in Focal:
  In Progress

Bug description:
  [Impact]

  * QEMU locking primitives might face a race condition in QEMU Async
  I/O bottom halves scheduling. This leads to a dead lock making either
  QEMU or one of its tools to hang indefinitely.

  [Test Case]

  * qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs in Aarch64.

  [Regression Potential]

  * This is a change to a core part of QEMU: The AIO scheduling. It
  works like a "kernel" scheduler, whereas kernel schedules OS tasks,
  the QEMU AIO code is responsible to schedule QEMU coroutines or event
  listeners callbacks.

  * There was a long discussion upstream about primitives and Aarch64.
  After quite sometime Paolo released this patch and it solves the
  issue. Tested platforms were: amd64 and aarch64 based on his commit
  log.

  * Christian suggests that this fix stay little longer in -proposed to
  make sure it won't cause any regressions.

  * dannf suggests we also check for performance regressions; e.g. how
  long it takes to convert a cloud image on high-core systems.

  [Other Info]

   * Original Description bellow:

  Command:

  qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs.

  

  Workaround:

  qemu-img convert -m 1 -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Run "qemu-img convert" with "a single coroutine" to avoid this issue.

  

  (gdb) thread 1
  ...
  (gdb) bt
  #0 0xbf1ad81c in __GI_ppoll
  #1 0xaabcf73c in ppoll
  #2 qemu_poll_ns
  #3 0xaabd0764 in os_host_main_loop_wait
  #4 main_loop_wait
  ...

  (gdb) thread 2
  ...
  (gdb) bt
  #0 syscall ()
  #1 0xaabd41cc in qemu_futex_wait
  #2 qemu_event_wait (ev=ev@entry=0xaac86ce8 )
  #3 0xaabed05c in call_rcu_thread
  #4 0xaabd34c8 in qemu_thread_start
  #5 0xbf25c880 in start_thread
  #6 0xbf1b6b9c in thread_start ()

  (gdb) thread 3
  ...
  (gdb) bt
  #0 0xbf11aa20 in __GI___sigtimedwait
  #1 0xbf2671b4 in __sigwait
  #2 0xaabd1ddc in sigwait_compat
  #3 0xaabd34c8 in qemu_thread_start
  #4 0xbf25c880 in start_thread
  #5 0xbf1b6b9c in thread_start

  

  (gdb) run
  Starting program: /usr/bin/qemu-img convert -f qcow2 -O qcow2
  ./disk01.ext4.qcow2 ./output.qcow2

  [New Thread 0xbec5ad90 (LWP 72839)]
  [New Thread 0xbe459d90 (LWP 72840)]
  [New Thread 0xbdb57d90 (LWP 72841)]
  [New Thread 0xacac9d90 (LWP 72859)]
  [New Thread 0xa7ffed90 (LWP 72860)]
  [New Thread 0xa77fdd90 (LWP 72861)]
  [New Thread 0xa6ffcd90 (LWP 72862)]
  [New Thread 0xa67fbd90 (LWP 72863)]
  [New Thread 0xa5ffad90 (LWP 72864)]

  [Thread 0xa5ffad90 (LWP 72864) exited]
  [Thread 0xa6ffcd90 (LWP 72862) exited]
  [Thread 0xa77fdd90 (LWP 72861) exited]
  [Thread 0xbdb57d90 (LWP 72841) exited]
  [Thread 0xa67fbd90 (LWP 72863) exited]
  [Thread 0xacac9d90 (LWP 72859) exited]
  [Thread 0xa7ffed90 (LWP 72860) exited]

  
  """

  All the tasks left are blocked in a system call, so no task left to call
  qemu_futex_wake() to unblock thread #2 (in futex()), which would unblock
  thread #1 (doing poll() in a pipe with thread #2).

  Those 7 threads exit before disk conversion is complete (sometimes in
  the beginning, sometimes at the end).

  

  On the HiSilicon D06 system - a 96 core NUMA arm64 box - qemu-img
  frequently hangs (~50% of the time) with this command:

  qemu-img convert -f qcow2 -O qcow2 /tmp/cloudimg /tmp/cloudimg2

  Where "cloudimg" is a standard qcow2 Ubuntu cloud image. This
  qcow2->qcow2 conversion happens to be something uvtool does every time
  it fetches images.

  Once hung, attaching gdb gives the following backtrace:

  (gdb) bt
  #0  0xae4f8154 in __GI_ppoll (fds=0xe8a67dc0, 
nfds=187650274213760,
  timeout=, timeout@entry=0x0, sigmask=0xc123b950)
  at ../sysdeps/unix/sysv/linux/ppoll.c:39
  #1  

Re: [PATCH v4 5/6] i386: Hyper-V VMBus ACPI DSDT entry

2020-05-06 Thread Jon Doron

Thank you Maciej :)

Igor it seems like the IRQ being used is 5 and not 7 & 13 like in the 
current patch. Seems like it needs to reside in the _CRS like you said.


Seems like it has all those _STA/_DIS/_PS0 just like the way it's 
currently in the patch (unless I'm missing something).


Notice _PS3 is not a Method.

So just to summarize the changes i need to do:
1. Change from 2 IRQs to single one (and use 5 as the default)
2. IRQs needs to be under _CRS.
3. You mentioned you want it under a different location than the ISA bus — 
where would you want it to be?


Please let me know if there is anything else.

Thanks,
-- Jon.

On 06/05/2020, Maciej S. Szmigiero wrote:

On 05.05.2020 17:38, Jon Doron wrote:

On 05/05/2020, Igor Mammedov wrote:

I dont know what were the original intentions of the original patch authors (at 
this point I simply rebased it, and to be honest I did not need this patch to 
get where I was going to, but it was part of the original patchset).

But I'm willing to do any changes so we can keep going forward with this.


On Fri, 24 Apr 2020 15:34:43 +0300
Jon Doron  wrote:


Guest OS uses ACPI to discover VMBus presence.  Add a corresponding
entry to DSDT in case VMBus has been enabled.

Experimentally Windows guests were found to require this entry to
include two IRQ resources. They seem to never be used but they still
have to be there.

Make IRQ numbers user-configurable via corresponding properties; use 7
and 13 by default.

well, it seems that at least linux guest driver uses one IRQ,
albeit not from the ACPI descriptor

perhaps it's what hyperv host puts into _CRS.
Could you dump ACPI tables and check how hyperv describes vmbus in acpi?




I can no longer get to the HyperV computer I had (in the office so hopefully if 
someone else has access to HyperV machine and willing to reply here with the 
dumped ACPI tables that would be great).



Here is a VMBus ACPI device description from Hyper-V in Windows Server 2019:

Device (\_SB.VMOD.VMBS)
{
   Name (STA, 0x0F)
   Name (_ADR, Zero)  // _ADR: Address
   Name (_DDN, "VMBUS")  // _DDN: DOS Device Name
   Name (_HID, "VMBus")  // _HID: Hardware ID
   Name (_UID, Zero)  // _UID: Unique ID
   Method (_DIS, 0, NotSerialized)  // _DIS: Disable Device
   {
STA &= 0x0D
   }

   Method (_PS0, 0, NotSerialized)  // _PS0: Power State 0
   {
STA |= 0x0F
   }

   Method (_STA, 0, NotSerialized)  // _STA: Status
   {
Return (STA) /* \_SB_.VMOD.VMBS.STA_ */
   }

   Name (_PS3, Zero)  // _PS3: Power State 3
   Name (_CRS, ResourceTemplate ()  // _CRS: Current Resource Settings
   {
IRQ (Edge, ActiveHigh, Exclusive, )
{5}
   })
}

It seems to use just IRQ 5.

Maciej




Re: [PATCH 0/5] target/i386: fxtract, fscale fixes

2020-05-06 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/alpine.deb.2.21.2005070038550.18...@digraph.polyomino.org.uk/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Message-id: alpine.deb.2.21.2005070038550.18...@digraph.polyomino.org.uk
Subject: [PATCH 0/5] target/i386: fxtract, fscale fixes
Type: series

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Switched to a new branch 'test'
ef3dfb7 target/i386: fix fscale handling of rounding precision
0ef4ac9 target/i386: fix fscale handling of infinite exponents
9c12341 target/i386: fix fscale handling of invalid exponent encodings
aac0b0b target/i386: fix fscale handling of signaling NaN
69eed0b target/i386: implement special cases for fxtract

=== OUTPUT BEGIN ===
1/5 Checking commit 69eed0bcaaaf (target/i386: implement special cases for 
fxtract)
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#55: 
new file mode 100644

ERROR: Use of volatile is usually wrong, please add a comment
#70: FILE: tests/tcg/i386/test-i386-fxtract.c:11:
+volatile union u ld_pseudo_m16382 = { .s = { UINT64_C(1) << 63, 0 } };

ERROR: Use of volatile is usually wrong, please add a comment
#71: FILE: tests/tcg/i386/test-i386-fxtract.c:12:
+volatile union u ld_invalid_1 = { .s = { 1, 1234 } };

ERROR: Use of volatile is usually wrong, please add a comment
#72: FILE: tests/tcg/i386/test-i386-fxtract.c:13:
+volatile union u ld_invalid_2 = { .s = { 0, 1234 } };

ERROR: Use of volatile is usually wrong, please add a comment
#73: FILE: tests/tcg/i386/test-i386-fxtract.c:14:
+volatile union u ld_invalid_3 = { .s = { 0, 0x7fff } };

ERROR: Use of volatile is usually wrong, please add a comment
#74: FILE: tests/tcg/i386/test-i386-fxtract.c:15:
+volatile union u ld_invalid_4 = { .s = { (UINT64_C(1) << 63) - 1, 0x7fff } };

ERROR: Use of volatile is usually wrong, please add a comment
#76: FILE: tests/tcg/i386/test-i386-fxtract.c:17:
+volatile long double ld_sig, ld_exp;

ERROR: spaces required around that '-' (ctx:VxV)
#139: FILE: tests/tcg/i386/test-i386-fxtract.c:80:
+  "0" (0x1p-16445L));
^

total: 7 errors, 1 warnings, 154 lines checked

Patch 1/5 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

2/5 Checking commit aac0b0b6881b (target/i386: fix fscale handling of signaling 
NaN)
WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
#30: 
new file mode 100644

ERROR: Use of volatile is usually wrong, please add a comment
#45: FILE: tests/tcg/i386/test-i386-fscale.c:11:
+volatile long double ld_res;

total: 1 errors, 1 warnings, 47 lines checked

Patch 2/5 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

3/5 Checking commit 9c123418e935 (target/i386: fix fscale handling of invalid 
exponent encodings)
ERROR: Use of volatile is usually wrong, please add a comment
#40: FILE: tests/tcg/i386/test-i386-fscale.c:11:
+volatile union u ld_invalid_1 = { .s = { 1, 1234 } };

ERROR: Use of volatile is usually wrong, please add a comment
#41: FILE: tests/tcg/i386/test-i386-fscale.c:12:
+volatile union u ld_invalid_2 = { .s = { 0, 1234 } };

ERROR: Use of volatile is usually wrong, please add a comment
#42: FILE: tests/tcg/i386/test-i386-fscale.c:13:
+volatile union u ld_invalid_3 = { .s = { 0, 0x7fff } };

ERROR: Use of volatile is usually wrong, please add a comment
#43: FILE: tests/tcg/i386/test-i386-fscale.c:14:
+volatile union u ld_invalid_4 = { .s = { (UINT64_C(1) << 63) - 1, 0x7fff } };

total: 4 errors, 0 warnings, 51 lines checked

Patch 3/5 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

4/5 Checking commit 0ef4ac9a50a5 (target/i386: fix fscale handling of infinite 
exponents)
5/5 Checking commit ef3dfb7e7c89 (target/i386: fix fscale handling of rounding 
precision)
ERROR: Use of volatile is usually wrong, please add a comment
#41: FILE: tests/tcg/i386/test-i386-fscale.c:11:
+volatile long double ld_third = 1.0L / 3.0L;

ERROR: Use of volatile is usually wrong, please add a comment
#42: FILE: tests/tcg/i386/test-i386-fscale.c:12:
+volatile long double ld_four_thirds = 4.0L / 3.0L;

total: 2 errors, 0 warnings, 34 lines checked

Patch 5/5 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


The full log is available at

Re: [PATCH] hw/net: Added basic IPv6 fragmentation. Fixed IPv6 payload length. Fixed CSO for IPv6.

2020-05-06 Thread no-reply
Patchew URL: https://patchew.org/QEMU/20200507005234.959590-1-and...@daynix.com/



Hi,

This series failed the asan build test. Please find the testing commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
export ARCH=x86_64
make docker-image-fedora V=1 NETWORK=1
time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1
=== TEST SCRIPT END ===

  CC  hw/pci/pci_host.o
  CC  hw/pci/pcie_aer.o
  CC  hw/pci/pcie.o
/tmp/qemu-test/src/hw/net/net_tx_pkt.c:486:16: error: variable 'cso' is used 
uninitialized whenever 'if' condition is false 
[-Werror,-Wsometimes-uninitialized]
} else if (l3_proto == ETH_P_IPV6) {
   ^~
/tmp/qemu-test/src/hw/net/net_tx_pkt.c:494:75: note: uninitialized use occurs 
here
---
^
 = 0
1 error generated.
make: *** [/tmp/qemu-test/src/rules.mak:69: hw/net/net_tx_pkt.o] Error 1
make: *** Waiting for unfinished jobs
Traceback (most recent call last):
  File "./tests/docker/docker.py", line 664, in 
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=e154d651c56545a88917b911771f2f22', '-u', 
'1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 
'TARGET_LIST=x86_64-softmmu', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 
'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=', '-e', 'CCACHE_DIR=/var/tmp/ccache', 
'-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-g2q7crd4/src/docker-src.2020-05-06-21.14.48.1453:/var/tmp/qemu:z,ro',
 'qemu:fedora', '/var/tmp/qemu/run', 'test-debug']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=e154d651c56545a88917b911771f2f22
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-g2q7crd4/src'
make: *** [docker-run-test-debug@fedora] Error 2

real4m5.832s
user0m8.624s


The full log is available at
http://patchew.org/logs/20200507005234.959590-1-and...@daynix.com/testing.asan/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v1 2/2] Sample mtty: Add migration capability to mtty module

2020-05-06 Thread Yan Zhao
On Tue, May 05, 2020 at 01:54:20AM +0800, Kirti Wankhede wrote:
> This patch makes mtty device migration capable. Purpose of this code is
> to test migration interface. Only stop-and-copy phase is implemented.
> Postcopy migration is not supported.
> 
> Actual data for mtty device migration is very less. Appended dummy data to
> migration data stream, default 100 Mbytes. Added sysfs file
> 'dummy_data_size_MB' to get dummy data size from user which can be used
> to check performance of based of data size. During resuming dummy data is
> read and discarded.
> 
> Signed-off-by: Kirti Wankhede 
> ---
>  samples/vfio-mdev/mtty.c | 602 
> ---
>  1 file changed, 574 insertions(+), 28 deletions(-)
> 
> diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
> index bf666cce5bb7..f9194234fc6a 100644
> --- a/samples/vfio-mdev/mtty.c
> +++ b/samples/vfio-mdev/mtty.c
> @@ -44,9 +44,23 @@
>  
>  #define MTTY_STRING_LEN  16
>  
> -#define MTTY_CONFIG_SPACE_SIZE  0xff
> -#define MTTY_IO_BAR_SIZE0x8
> -#define MTTY_MMIO_BAR_SIZE  0x10
> +#define MTTY_CONFIG_SPACE_SIZE   0xff
> +#define MTTY_IO_BAR_SIZE 0x8
> +#define MTTY_MMIO_BAR_SIZE   0x10
> +#define MTTY_MIGRATION_REGION_SIZE   0x100   // 16M
> +
> +#define MTTY_MIGRATION_REGION_INDEX  VFIO_PCI_NUM_REGIONS
> +#define MTTY_REGIONS_MAX (MTTY_MIGRATION_REGION_INDEX + 1)
> +
> +/* Data section start from page aligned offset */
> +#define MTTY_MIGRATION_REGION_DATA_OFFSET(0x1000)
> +
> +/* First page is used for struct vfio_device_migration_info */
> +#define MTTY_MIGRATION_REGION_SIZE_MMAP \
> + (MTTY_MIGRATION_REGION_SIZE - MTTY_MIGRATION_REGION_DATA_OFFSET)
> +
> +#define MIGRATION_INFO_OFFSET(MEMBER)\
> + offsetof(struct vfio_device_migration_info, MEMBER)
>  
>  #define STORE_LE16(addr, val)   (*(u16 *)addr = val)
>  #define STORE_LE32(addr, val)   (*(u32 *)addr = val)
> @@ -129,6 +143,28 @@ struct serial_port {
>   u8 intr_trigger_level;  /* interrupt trigger level */
>  };
>  
> +/* Migration packet */
> +#define PACKET_ID(u16)(0xfeedbaba)
> +
> +#define PACKET_FLAGS_ACTUAL_DATA (1 << 0)
> +#define PACKET_FLAGS_DUMMY_DATA  (1 << 1)
> +
> +#define PACKET_DATA_SIZE_MAX (8 * 1024 * 1024)
> +
> +struct packet {
> + u16 id;
> + u16 flags;
> + u32 data_size;
> + u8 data[];
> +};
> +
> +enum {
> + PACKET_STATE_NONE = 0,
> + PACKET_STATE_PREPARED,
> + PACKET_STATE_COPIED,
> + PACKET_STATE_LAST,
> +};
> +
>  /* State of each mdev device */
>  struct mdev_state {
>   int irq_fd;
> @@ -138,22 +174,37 @@ struct mdev_state {
>   u8 *vconfig;
>   struct mutex ops_lock;
>   struct mdev_device *mdev;
> - struct mdev_region_info region_info[VFIO_PCI_NUM_REGIONS];
> - u32 bar_mask[VFIO_PCI_NUM_REGIONS];
> + struct mdev_region_info region_info[MTTY_REGIONS_MAX];
> + u32 bar_mask[MTTY_REGIONS_MAX];
>   struct list_head next;
>   struct serial_port s[2];
>   struct mutex rxtx_lock;
>   struct vfio_device_info dev_info;
> - int nr_ports;
> + u32 nr_ports;
>  
>   /* List of pinned gpfns, gpfn as index and content is translated hpfn */
>   unsigned long *gpfn_to_hpfn;
>   struct notifier_block nb;
> +
> + u32 device_state;
> + u64 saved_size;
> + void *mig_region_base;
> + bool is_actual_data_sent;
> + struct packet *pkt;
> + u32 packet_state;
> + u64 dummy_data_size;
>  };
>  
>  static struct mutex mdev_list_lock;
>  static struct list_head mdev_devices_list;
>  
> +/*
> + * Default dummy data size set to 100 MB. To change value of dummy data size 
> at
> + * runtime but before migration write size in MB to sysfs file
> + * dummy_data_size_MB
> + */
> +static unsigned long user_dummy_data_size = (100 * 1024 * 1024);
> +
>  static const struct file_operations vd_fops = {
>   .owner  = THIS_MODULE,
>  };
> @@ -639,6 +690,288 @@ static void mdev_read_base(struct mdev_state 
> *mdev_state)
>   }
>  }
>  
> +static int save_setup(struct mdev_state *mdev_state)
> +{
> + mdev_state->is_actual_data_sent = false;
> +
> + memset(mdev_state->pkt, 0, sizeof(struct packet) +
> +PACKET_DATA_SIZE_MAX);
> +
> + return 0;
> +}
> +
> +static int set_device_state(struct mdev_state *mdev_state, u32 device_state)
> +{
> + int ret = 0;
> +
> + if (mdev_state->device_state == device_state)
> + return 0;
> +
> + if (device_state & VFIO_DEVICE_STATE_RUNNING) {
> +#if defined(DEBUG)
> + if (device_state & VFIO_DEVICE_STATE_SAVING) {
> + pr_info("%s: %s Pre-copy\n", __func__,
> + dev_name(mdev_dev(mdev_state->mdev)));
> + } else
> + pr_info("%s: %s Running\n", __func__,
> + 

[PATCH 5/5] target/i386: fix fscale handling of rounding precision

2020-05-06 Thread Joseph Myers
The fscale implementation uses floatx80_scalbn for the final scaling
operation.  floatx80_scalbn ends up rounding the result using the
dynamic rounding precision configured for the FPU.  But only a limited
set of x87 floating-point instructions are supposed to respect the
dynamic rounding precision, and fscale is not in that set.  Fix the
implementation to save and restore the rounding precision around the
call to floatx80_scalbn.

Signed-off-by: Joseph Myers 
---
 target/i386/fpu_helper.c  |  3 +++
 tests/tcg/i386/test-i386-fscale.c | 13 +
 2 files changed, 16 insertions(+)

diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c
index d4c15728e1..0c3fce933c 100644
--- a/target/i386/fpu_helper.c
+++ b/target/i386/fpu_helper.c
@@ -1001,7 +1001,10 @@ void helper_fscale(CPUX86State *env)
 }
 } else {
 int n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
+signed char save = env->fp_status.floatx80_rounding_precision;
+env->fp_status.floatx80_rounding_precision = 80;
 ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
+env->fp_status.floatx80_rounding_precision = save;
 }
 }
 
diff --git a/tests/tcg/i386/test-i386-fscale.c 
b/tests/tcg/i386/test-i386-fscale.c
index b953e7c563..d23b3cfeec 100644
--- a/tests/tcg/i386/test-i386-fscale.c
+++ b/tests/tcg/i386/test-i386-fscale.c
@@ -8,6 +8,8 @@ union u {
 long double ld;
 };
 
+volatile long double ld_third = 1.0L / 3.0L;
+volatile long double ld_four_thirds = 4.0L / 3.0L;
 volatile union u ld_invalid_1 = { .s = { 1, 1234 } };
 volatile union u ld_invalid_2 = { .s = { 0, 1234 } };
 volatile union u ld_invalid_3 = { .s = { 0, 0x7fff } };
@@ -91,5 +93,16 @@ int main(void)
 printf("FAIL: fscale finite down inf\n");
 ret = 1;
 }
+/* Set round-to-nearest with single-precision rounding.  */
+cw = cw & ~0xf00;
+__asm__ volatile ("fldcw %0" : : "m" (cw));
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (ld_third), "u" (2.0L));
+cw = cw | 0x300;
+__asm__ volatile ("fldcw %0" : : "m" (cw));
+if (ld_res != ld_four_thirds) {
+printf("FAIL: fscale single-precision\n");
+ret = 1;
+}
 return ret;
 }
-- 
2.17.1


-- 
Joseph S. Myers
jos...@codesourcery.com



[PATCH 4/5] target/i386: fix fscale handling of infinite exponents

2020-05-06 Thread Joseph Myers
The fscale implementation passes infinite exponents through to generic
code that rounds the exponent to a 32-bit integer before using
floatx80_scalbn.  In round-to-nearest mode, and ignoring exceptions,
this works in many cases.  But it fails to handle the special cases of
scaling 0 by a +Inf exponent or an infinity by a -Inf exponent, which
should produce a NaN, and because it produces an inexact result for
finite nonzero numbers being scaled, the result is sometimes incorrect
in other rounding modes.  Add appropriate handling of infinite
exponents to produce a NaN or an appropriately signed exact zero or
infinity as a result.

Signed-off-by: Joseph Myers 
---
 target/i386/fpu_helper.c  | 22 ++
 tests/tcg/i386/test-i386-fscale.c | 29 +
 2 files changed, 51 insertions(+)

diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c
index 7709af8fdd..d4c15728e1 100644
--- a/target/i386/fpu_helper.c
+++ b/target/i386/fpu_helper.c
@@ -977,6 +977,28 @@ void helper_fscale(CPUX86State *env)
 float_raise(float_flag_invalid, &env->fp_status);
 ST0 = floatx80_silence_nan(ST0, &env->fp_status);
 }
+} else if (floatx80_is_infinity(ST1) &&
+   !floatx80_invalid_encoding(ST0) &&
+   !floatx80_is_any_nan(ST0)) {
+if (floatx80_is_neg(ST1)) {
+if (floatx80_is_infinity(ST0)) {
+float_raise(float_flag_invalid, &env->fp_status);
+ST0 = floatx80_default_nan(&env->fp_status);
+} else {
+ST0 = (floatx80_is_neg(ST0) ?
+   floatx80_chs(floatx80_zero) :
+   floatx80_zero);
+}
+} else {
+if (floatx80_is_zero(ST0)) {
+float_raise(float_flag_invalid, &env->fp_status);
+ST0 = floatx80_default_nan(&env->fp_status);
+} else {
+ST0 = (floatx80_is_neg(ST0) ?
+   floatx80_chs(floatx80_infinity) :
+   floatx80_infinity);
+}
+}
 } else {
 int n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
 ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
diff --git a/tests/tcg/i386/test-i386-fscale.c 
b/tests/tcg/i386/test-i386-fscale.c
index b65a055d0a..b953e7c563 100644
--- a/tests/tcg/i386/test-i386-fscale.c
+++ b/tests/tcg/i386/test-i386-fscale.c
@@ -31,6 +31,7 @@ int issignaling_ld(long double x)
 
 int main(void)
 {
+short cw;
 int ret = 0;
 __asm__ volatile ("fscale" : "=t" (ld_res) :
   "0" (2.5L), "u" (__builtin_nansl("")));
@@ -62,5 +63,33 @@ int main(void)
 printf("FAIL: fscale invalid 4\n");
 ret = 1;
 }
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (0.0L), "u" (__builtin_infl()));
+if (!isnan_ld(ld_res) || issignaling_ld(ld_res)) {
+printf("FAIL: fscale 0 up inf\n");
+ret = 1;
+}
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (__builtin_infl()), "u" (-__builtin_infl()));
+if (!isnan_ld(ld_res) || issignaling_ld(ld_res)) {
+printf("FAIL: fscale inf down inf\n");
+ret = 1;
+}
+/* Set round-downward.  */
+__asm__ volatile ("fnstcw %0" : "=m" (cw));
+cw = (cw & ~0xc00) | 0x400;
+__asm__ volatile ("fldcw %0" : : "m" (cw));
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (1.0L), "u" (__builtin_infl()));
+if (ld_res != __builtin_infl()) {
+printf("FAIL: fscale finite up inf\n");
+ret = 1;
+}
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (-1.0L), "u" (-__builtin_infl()));
+if (ld_res != -0.0L || __builtin_copysignl(1.0L, ld_res) != -1.0L) {
+printf("FAIL: fscale finite down inf\n");
+ret = 1;
+}
 return ret;
 }
-- 
2.17.1


-- 
Joseph S. Myers
jos...@codesourcery.com



[PATCH 2/5] target/i386: fix fscale handling of signaling NaN

2020-05-06 Thread Joseph Myers
The implementation of the fscale instruction returns a NaN exponent
unchanged.  Fix it to return a quiet NaN when the provided exponent is
a signaling NaN.

Signed-off-by: Joseph Myers 
---
 target/i386/fpu_helper.c  |  4 
 tests/tcg/i386/test-i386-fscale.c | 37 +++
 2 files changed, 41 insertions(+)
 create mode 100644 tests/tcg/i386/test-i386-fscale.c

diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c
index 71a696a863..60012c405c 100644
--- a/target/i386/fpu_helper.c
+++ b/target/i386/fpu_helper.c
@@ -970,6 +970,10 @@ void helper_fscale(CPUX86State *env)
 {
 if (floatx80_is_any_nan(ST1)) {
 ST0 = ST1;
+if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
+float_raise(float_flag_invalid, &env->fp_status);
+ST0 = floatx80_silence_nan(ST0, &env->fp_status);
+}
 } else {
 int n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
 ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
diff --git a/tests/tcg/i386/test-i386-fscale.c 
b/tests/tcg/i386/test-i386-fscale.c
new file mode 100644
index 00..aecac5125f
--- /dev/null
+++ b/tests/tcg/i386/test-i386-fscale.c
@@ -0,0 +1,37 @@
+/* Test fscale instruction.  */
+
+#include 
+#include 
+
+union u {
+struct { uint64_t sig; uint16_t sign_exp; } s;
+long double ld;
+};
+
+volatile long double ld_res;
+
+int isnan_ld(long double x)
+{
+  union u tmp = { .ld = x };
+  return ((tmp.s.sign_exp & 0x7fff) == 0x7fff &&
+  (tmp.s.sig >> 63) != 0 &&
+  (tmp.s.sig << 1) != 0);
+}
+
+int issignaling_ld(long double x)
+{
+union u tmp = { .ld = x };
+return isnan_ld(x) && (tmp.s.sig & UINT64_C(0x4000)) == 0;
+}
+
+int main(void)
+{
+int ret = 0;
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (2.5L), "u" (__builtin_nansl("")));
+if (!isnan_ld(ld_res) || issignaling_ld(ld_res)) {
+printf("FAIL: fscale snan\n");
+ret = 1;
+}
+return ret;
+}
-- 
2.17.1


-- 
Joseph S. Myers
jos...@codesourcery.com



[PATCH 3/5] target/i386: fix fscale handling of invalid exponent encodings

2020-05-06 Thread Joseph Myers
The fscale implementation does not check for invalid encodings in the
exponent operand, thus treating them like INT_MIN (the value returned
for invalid encodings by floatx80_to_int32_round_to_zero).  Fix it to
treat them similarly to signaling NaN exponents, thus generating a
quiet NaN result.

Signed-off-by: Joseph Myers 
---
 target/i386/fpu_helper.c  |  5 -
 tests/tcg/i386/test-i386-fscale.c | 29 +
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c
index 60012c405c..7709af8fdd 100644
--- a/target/i386/fpu_helper.c
+++ b/target/i386/fpu_helper.c
@@ -968,7 +968,10 @@ void helper_frndint(CPUX86State *env)
 
 void helper_fscale(CPUX86State *env)
 {
-if (floatx80_is_any_nan(ST1)) {
+if (floatx80_invalid_encoding(ST1)) {
+float_raise(float_flag_invalid, &env->fp_status);
+ST0 = floatx80_default_nan(&env->fp_status);
+} else if (floatx80_is_any_nan(ST1)) {
 ST0 = ST1;
 if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
 float_raise(float_flag_invalid, &env->fp_status);
diff --git a/tests/tcg/i386/test-i386-fscale.c 
b/tests/tcg/i386/test-i386-fscale.c
index aecac5125f..b65a055d0a 100644
--- a/tests/tcg/i386/test-i386-fscale.c
+++ b/tests/tcg/i386/test-i386-fscale.c
@@ -8,6 +8,11 @@ union u {
 long double ld;
 };
 
+volatile union u ld_invalid_1 = { .s = { 1, 1234 } };
+volatile union u ld_invalid_2 = { .s = { 0, 1234 } };
+volatile union u ld_invalid_3 = { .s = { 0, 0x7fff } };
+volatile union u ld_invalid_4 = { .s = { (UINT64_C(1) << 63) - 1, 0x7fff } };
+
 volatile long double ld_res;
 
 int isnan_ld(long double x)
@@ -33,5 +38,29 @@ int main(void)
 printf("FAIL: fscale snan\n");
 ret = 1;
 }
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (2.5L), "u" (ld_invalid_1.ld));
+if (!isnan_ld(ld_res) || issignaling_ld(ld_res)) {
+printf("FAIL: fscale invalid 1\n");
+ret = 1;
+}
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (2.5L), "u" (ld_invalid_2.ld));
+if (!isnan_ld(ld_res) || issignaling_ld(ld_res)) {
+printf("FAIL: fscale invalid 2\n");
+ret = 1;
+}
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (2.5L), "u" (ld_invalid_3.ld));
+if (!isnan_ld(ld_res) || issignaling_ld(ld_res)) {
+printf("FAIL: fscale invalid 3\n");
+ret = 1;
+}
+__asm__ volatile ("fscale" : "=t" (ld_res) :
+  "0" (2.5L), "u" (ld_invalid_4.ld));
+if (!isnan_ld(ld_res) || issignaling_ld(ld_res)) {
+printf("FAIL: fscale invalid 4\n");
+ret = 1;
+}
 return ret;
 }
-- 
2.17.1


-- 
Joseph S. Myers
jos...@codesourcery.com



[PATCH 1/5] target/i386: implement special cases for fxtract

2020-05-06 Thread Joseph Myers
The implementation of the fxtract instruction treats all nonzero
operands as normal numbers, so yielding incorrect results for invalid
formats, infinities, NaNs and subnormal and pseudo-denormal operands.
Implement appropriate handling of all those cases.

Signed-off-by: Joseph Myers 
---
 target/i386/fpu_helper.c   |  25 +-
 tests/tcg/i386/test-i386-fxtract.c | 120 +
 2 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/i386/test-i386-fxtract.c

diff --git a/target/i386/fpu_helper.c b/target/i386/fpu_helper.c
index 792a128a6d..71a696a863 100644
--- a/target/i386/fpu_helper.c
+++ b/target/i386/fpu_helper.c
@@ -767,10 +767,33 @@ void helper_fxtract(CPUX86State *env)
>fp_status);
 fpush(env);
 ST0 = temp.d;
+} else if (floatx80_invalid_encoding(ST0)) {
+float_raise(float_flag_invalid, >fp_status);
+ST0 = floatx80_default_nan(>fp_status);
+fpush(env);
+ST0 = ST1;
+} else if (floatx80_is_any_nan(ST0)) {
+if (floatx80_is_signaling_nan(ST0, >fp_status)) {
+float_raise(float_flag_invalid, >fp_status);
+ST0 = floatx80_silence_nan(ST0, >fp_status);
+}
+fpush(env);
+ST0 = ST1;
+} else if (floatx80_is_infinity(ST0)) {
+fpush(env);
+ST0 = ST1;
+ST1 = floatx80_infinity;
 } else {
 int expdif;
 
-expdif = EXPD(temp) - EXPBIAS;
+if (EXPD(temp) == 0) {
+int shift = clz64(temp.l.lower);
+temp.l.lower <<= shift;
+expdif = 1 - EXPBIAS - shift;
+float_raise(float_flag_input_denormal, >fp_status);
+} else {
+expdif = EXPD(temp) - EXPBIAS;
+}
 /* DP exponent bias */
 ST0 = int32_to_floatx80(expdif, >fp_status);
 fpush(env);
diff --git a/tests/tcg/i386/test-i386-fxtract.c 
b/tests/tcg/i386/test-i386-fxtract.c
new file mode 100644
index 00..64fd93d333
--- /dev/null
+++ b/tests/tcg/i386/test-i386-fxtract.c
@@ -0,0 +1,120 @@
+/* Test fxtract instruction.  */
+
+#include 
+#include 
+
+union u {
+struct { uint64_t sig; uint16_t sign_exp; } s;
+long double ld;
+};
+
+volatile union u ld_pseudo_m16382 = { .s = { UINT64_C(1) << 63, 0 } };
+volatile union u ld_invalid_1 = { .s = { 1, 1234 } };
+volatile union u ld_invalid_2 = { .s = { 0, 1234 } };
+volatile union u ld_invalid_3 = { .s = { 0, 0x7fff } };
+volatile union u ld_invalid_4 = { .s = { (UINT64_C(1) << 63) - 1, 0x7fff } };
+
+volatile long double ld_sig, ld_exp;
+
+int isnan_ld(long double x)
+{
+  union u tmp = { .ld = x };
+  return ((tmp.s.sign_exp & 0x7fff) == 0x7fff &&
+  (tmp.s.sig >> 63) != 0 &&
+  (tmp.s.sig << 1) != 0);
+}
+
+int issignaling_ld(long double x)
+{
+union u tmp = { .ld = x };
+return isnan_ld(x) && (tmp.s.sig & UINT64_C(0x4000)) == 0;
+}
+
+int main(void)
+{
+int ret = 0;
+__asm__ volatile ("fxtract" : "=t" (ld_sig), "=u" (ld_exp) : "0" (2.5L));
+if (ld_sig != 1.25L || ld_exp != 1.0L) {
+printf("FAIL: fxtract 2.5\n");
+ret = 1;
+}
+__asm__ volatile ("fxtract" : "=t" (ld_sig), "=u" (ld_exp) : "0" (0.0L));
+if (ld_sig != 0.0L || __builtin_copysignl(1.0L, ld_sig) != 1.0L ||
+ld_exp != -__builtin_infl()) {
+printf("FAIL: fxtract 0.0\n");
+ret = 1;
+}
+__asm__ volatile ("fxtract" : "=t" (ld_sig), "=u" (ld_exp) : "0" (-0.0L));
+if (ld_sig != -0.0L || __builtin_copysignl(1.0L, ld_sig) != -1.0L ||
+ld_exp != -__builtin_infl()) {
+printf("FAIL: fxtract -0.0\n");
+ret = 1;
+}
+__asm__ volatile ("fxtract" : "=t" (ld_sig), "=u" (ld_exp) :
+  "0" (__builtin_infl()));
+if (ld_sig != __builtin_infl() || ld_exp != __builtin_infl()) {
+printf("FAIL: fxtract inf\n");
+ret = 1;
+}
+__asm__ volatile ("fxtract" : "=t" (ld_sig), "=u" (ld_exp) :
+  "0" (-__builtin_infl()));
+if (ld_sig != -__builtin_infl() || ld_exp != __builtin_infl()) {
+printf("FAIL: fxtract -inf\n");
+ret = 1;
+}
+__asm__ volatile ("fxtract" : "=t" (ld_sig), "=u" (ld_exp) :
+  "0" (__builtin_nanl("")));
+if (!isnan_ld(ld_sig) || issignaling_ld(ld_sig) ||
+!isnan_ld(ld_exp) || issignaling_ld(ld_exp)) {
+printf("FAIL: fxtract qnan\n");
+ret = 1;
+}
+__asm__ volatile ("fxtract" : "=t" (ld_sig), "=u" (ld_exp) :
+  "0" (__builtin_nansl("")));
+if (!isnan_ld(ld_sig) || issignaling_ld(ld_sig) ||
+!isnan_ld(ld_exp) || issignaling_ld(ld_exp)) {
+printf("FAIL: fxtract snan\n");
+ret = 1;
+}
+__asm__ volatile ("fxtract" : "=t" (ld_sig), "=u" (ld_exp) :
+  "0" (0x1p-16445L));
+if (ld_sig != 1.0L || ld_exp != -16445.0L) {
+printf("FAIL: fxtract subnormal\n");
+ret 

[PATCH 0/5] target/i386: fxtract, fscale fixes

2020-05-06 Thread Joseph Myers
Among the various bugs in the x87 floating-point emulation that show
up through a combination of glibc testing and code inspection, there
are several in the implementations of the fxtract and fscale
instructions.  This series fixes those bugs.

Bugs in other instructions, and bugs relating to floating-point
exceptions and flag setting, will be addressed separately.  In
particular, while some of these patches add code that sets exception
flags in the softfloat state, it's generally the case that the x87
emulation ignores exceptions in that state rather than propagating
them to the status word (and to generating traps where appropriate).
I intend to address that missing propagation of exceptions in a
subsequent patch series; until it's addressed, the code setting
exceptions won't actually do anything useful.  (There is also code in
the x87 emulation, including that of fscale, that would result in
spurious exceptions being set from a naive propagation of exceptions
from the softfloat state, and thus will need updating to avoid
propagating inappropriate exceptions when such propagation is
implemented.)

Joseph Myers (5):
  target/i386: implement special cases for fxtract
  target/i386: fix fscale handling of signaling NaN
  target/i386: fix fscale handling of invalid exponent encodings
  target/i386: fix fscale handling of infinite exponents
  target/i386: fix fscale handling of rounding precision

 target/i386/fpu_helper.c   |  59 +-
 tests/tcg/i386/test-i386-fscale.c  | 108 ++
 tests/tcg/i386/test-i386-fxtract.c | 120 +
 3 files changed, 285 insertions(+), 2 deletions(-)
 create mode 100644 tests/tcg/i386/test-i386-fscale.c
 create mode 100644 tests/tcg/i386/test-i386-fxtract.c

-- 
2.17.1


-- 
Joseph S. Myers
jos...@codesourcery.com



[PATCH] hw/net: Added basic IPv6 fragmentation. Fixed IPv6 payload length. Fixed CSO for IPv6.

2020-05-06 Thread andrew
From: Andrew Melnychenko 

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1708065
Overall, there was an issue where big IPv6 frames were not sent.
With network backend with 'virtual header' - there was an issue
in 'plen' field. Overall, during TSO, 'plen' would be changed,
but with 'vheader' this field should be set to the size of the
payload itself instead of '0'.
For software offload - there is added basic IPv6 fragmentation.
Also fixed checksum offload for IPv6.
The basic IPv6 fragmentation - adding 'frag' extension to
the packet, overall shares some logic with IPv4. It works,
but there are still issues with a combination of
extensions - in the future, it would require refactoring
work to implement workflow with IPv6 and extension.
e1000e driver doesn't set the 'plen' field for IPv6 for big packets
if TSO is enabled. "Jumbo option" isn't added yet, until
qemu supports packets greater than 64K.

Signed-off-by: Andrew Melnychenko 
---
 hw/net/net_tx_pkt.c | 53 ---
 hw/net/net_tx_pkt.h |  7 
 include/net/eth.h   | 15 ++--
 net/eth.c   | 89 ++---
 4 files changed, 150 insertions(+), 14 deletions(-)

diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c
index 162f802dd7..e3c7850b54 100644
--- a/hw/net/net_tx_pkt.c
+++ b/hw/net/net_tx_pkt.c
@@ -468,8 +468,8 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
 /* num of iovec without vhdr */
 uint32_t iov_len = pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1;
 uint16_t csl;
-struct ip_header *iphdr;
 size_t csum_offset = pkt->virt_hdr.csum_start + pkt->virt_hdr.csum_offset;
+uint16_t l3_proto = eth_get_l3_proto(iov, 1, iov->iov_len);
 
 /* Put zero to checksum field */
 iov_from_buf(iov, iov_len, csum_offset, , sizeof csum);
@@ -477,9 +477,17 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
 /* Calculate L4 TCP/UDP checksum */
 csl = pkt->payload_len;
 
+csum_cntr = 0;
 /* add pseudo header to csum */
-iphdr = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;
-csum_cntr = eth_calc_ip4_pseudo_hdr_csum(iphdr, csl, );
+if (l3_proto == ETH_P_IP) {
+csum_cntr = eth_calc_ip4_pseudo_hdr_csum(
+pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base,
+csl, );
+} else if (l3_proto == ETH_P_IPV6) {
+csum_cntr = eth_calc_ip6_pseudo_hdr_csum(
+pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base,
+csl, pkt->l4proto, );
+}
 
 /* data checksum */
 csum_cntr +=
@@ -580,10 +588,11 @@ static bool net_tx_pkt_do_sw_fragmentation(struct 
NetTxPkt *pkt,
 
 more_frags = (fragment_offset + fragment_len < pkt->payload_len);
 
-eth_setup_ip4_fragmentation(l2_iov_base, l2_iov_len, l3_iov_base,
-l3_iov_len, fragment_len, fragment_offset, more_frags);
+eth_setup_ip_fragmentation(l2_iov_base, l2_iov_len, l3_iov_base,
+_iov_len, ETH_MAX_IP_DGRAM_LEN,
+fragment_len, fragment_offset, more_frags);
 
-eth_fix_ip4_checksum(l3_iov_base, l3_iov_len);
+fragment[NET_TX_PKT_FRAGMENT_L3_HDR_POS].iov_len = l3_iov_len;
 
 net_tx_pkt_sendv(pkt, nc, fragment, dst_idx);
 
@@ -617,6 +626,7 @@ bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState 
*nc)
 
 if (pkt->has_virt_hdr ||
 pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) {
+net_tx_pkt_fix_ip6_payload_len(pkt);
 net_tx_pkt_sendv(pkt, nc, pkt->vec,
 pkt->payload_frags + NET_TX_PKT_PL_START_FRAG);
 return true;
@@ -635,3 +645,34 @@ bool net_tx_pkt_send_loopback(struct NetTxPkt *pkt, 
NetClientState *nc)
 
 return res;
 }
+
+void net_tx_pkt_fix_ip6_payload_len(struct NetTxPkt *pkt)
+{
+/*
+ * If ipv6 payload length field is 0 - then there should be Hop-by-Hop
+ * option for packets greater than 65,535.
+ * For packets with payload less than 65,535: fix 'plen' field.
+ * For now, qemu drops every packet with size greater 64K
+ * (see net_tx_pkt_send()) so, there is no reason to add jumbo option to 
ip6
+ * hop-by-hop extension if it's missed
+ */
+
+struct iovec *l2 = >vec[NET_TX_PKT_L2HDR_FRAG];
+if (eth_get_l3_proto(l2, 1, l2->iov_len) == ETH_P_IPV6) {
+struct ip6_header *ip6 = (struct ip6_header *) pkt->l3_hdr;
+/*
+ * TODO: if qemu would support >64K packets - add jumbo option check
+ * something like that:
+ * 'if (ip6->ip6_plen == 0 && !has_jumbo_option(ip6)) {'
+ */
+if (ip6->ip6_plen == 0) {
+if (pkt->payload_len <= ETH_MAX_IP_DGRAM_LEN) {
+ip6->ip6_plen = htons(pkt->payload_len);
+}
+/*
+ * TODO: if qemu would support >64K packets
+ * add jumbo option for packets greater then 65,535 bytes
+ */
+}
+}
+}
diff --git a/hw/net/net_tx_pkt.h b/hw/net/net_tx_pkt.h
index 212ecc62fc..912d56ef13 

[PATCH] Fix stack corruption when handling PR_GETDEATHSIG

2020-05-06 Thread Stephen Long
From: Ana Pazos 

Signed-off-by: Ana Pazos 
---
Submitting this patch on behalf of Ana Pazos. The bug was triggered by
the following c file on aarch64-linux-user.

> #include 
> #include 
>
> int main() {
>   int PDeachSig = 0;
>   if (prctl(PR_GET_PDEATHSIG, ) == 0 && PDeachSig == SIGKILL)
> prctl(PR_SET_PDEATHSIG, 0);
>   return (PDeachSig == SIGKILL);
> }

 linux-user/syscall.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 05f03919ff..4eac567f97 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -10253,10 +10253,10 @@ static abi_long do_syscall1(void *cpu_env, int num, 
abi_long arg1,
 switch (arg1) {
 case PR_GET_PDEATHSIG:
 {
-int deathsig;
+uint32_t deathsig;
 ret = get_errno(prctl(arg1, , arg3, arg4, arg5));
 if (!is_error(ret) && arg2
-&& put_user_ual(deathsig, arg2)) {
+&& put_user_u32(deathsig, arg2)) {
 return -TARGET_EFAULT;
 }
 return ret;
-- 
2.17.1




Re: [PATCH V3 01/14] KVM: MIPS: Define KVM_ENTRYHI_ASID to cpu_asid_mask(_cpu_data)

2020-05-06 Thread Sasha Levin
Hi

[This is an automated email]

This commit has been processed because it contains a -stable tag.
The stable tag indicates that it's relevant for the following trees: all

The bot has tested the following trees: v5.6.10, v5.4.38, v4.19.120, v4.14.178, 
v4.9.221, v4.4.221.

v5.6.10: Build OK!
v5.4.38: Build OK!
v4.19.120: Build OK!
v4.14.178: Build OK!
v4.9.221: Build OK!
v4.4.221: Failed to apply! Possible dependencies:
029499b47738 ("KVM: x86: MMU: Make mmu_set_spte() return emulate value")
19d194c62b25 ("MIPS: KVM: Simplify TLB_* macros")
403015b323a2 ("MIPS: KVM: Move non-TLB handling code out of tlb.c")
7ee0e5b29d27 ("KVM: x86: MMU: Remove unused parameter of __direct_map()")
9fbfb06a4065 ("MIPS: KVM: Arrayify struct kvm_mips_tlb::tlb_lo*")
ba049e93aef7 ("kvm: rename pfn_t to kvm_pfn_t")
bdb7ed8608f8 ("MIPS: KVM: Convert headers to kernel sized types")
ca64c2beecd4 ("MIPS: KVM: Abstract guest ASID mask")
caa1faa7aba6 ("MIPS: KVM: Trivial whitespace and style fixes")
e6207bbea16c ("MIPS: KVM: Use MIPS_ENTRYLO_* defs from mipsregs.h")


NOTE: The patch will not be queued to stable trees until it is upstream.

How should we proceed with this patch?

-- 
Thanks
Sasha



Re: [PATCH Kernel v18 6/7] vfio iommu: Add migration capability to report supported features

2020-05-06 Thread Alex Williamson
On Mon, 4 May 2020 21:28:58 +0530
Kirti Wankhede  wrote:

> Added migration capability in IOMMU info chain.
> User application should check IOMMU info chain for migration capability
> to use dirty page tracking feature provided by kernel module.
> 
> Signed-off-by: Kirti Wankhede 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 15 +++
>  include/uapi/linux/vfio.h   | 14 ++
>  2 files changed, 29 insertions(+)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 8b27faf1ec38..b38d278d7bff 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -2378,6 +2378,17 @@ static int vfio_iommu_iova_build_caps(struct 
> vfio_iommu *iommu,
>   return ret;
>  }
>  
> +static int vfio_iommu_migration_build_caps(struct vfio_info_cap *caps)
> +{
> + struct vfio_iommu_type1_info_cap_migration cap_mig;
> +
> + cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
> + cap_mig.header.version = 1;
> + cap_mig.flags = VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK;
> +
> + return vfio_info_add_capability(caps, _mig.header, sizeof(cap_mig));
> +}
> +
>  static long vfio_iommu_type1_ioctl(void *iommu_data,
>  unsigned int cmd, unsigned long arg)
>  {
> @@ -2427,6 +2438,10 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>   if (ret)
>   return ret;
>  
> + ret = vfio_iommu_migration_build_caps();
> + if (ret)
> + return ret;
> +
>   if (caps.size) {
>   info.flags |= VFIO_IOMMU_INFO_CAPS;
>  
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index e3cbf8b78623..df9ce8aaafab 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -1013,6 +1013,20 @@ struct vfio_iommu_type1_info_cap_iova_range {
>   struct  vfio_iova_range iova_ranges[];
>  };
>  
> +/*
> + * The migration capability allows to report supported features for 
> migration.
> + *
> + * The structures below define version 1 of this capability.
> + */
> +#define VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION  1
> +
> +struct vfio_iommu_type1_info_cap_migration {
> + struct  vfio_info_cap_header header;
> + __u32   flags;
> + /* supports dirty page tracking */
> +#define VFIO_IOMMU_INFO_CAPS_MIGRATION_DIRTY_PAGE_TRACK  (1 << 0)
> +};
> +

What about exposing the maximum supported dirty bitmap size and the
supported page sizes?  Thanks,

Alex

>  #define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
>  
>  /**




Re: [PATCH Kernel v18 5/7] vfio iommu: Update UNMAP_DMA ioctl to get dirty bitmap before unmap

2020-05-06 Thread Alex Williamson
On Mon, 4 May 2020 21:28:57 +0530
Kirti Wankhede  wrote:

> DMA mapped pages, including those pinned by mdev vendor drivers, might
> get unpinned and unmapped while migration is active and device is still
> running. For example, in pre-copy phase while guest driver could access
> those pages, host device or vendor driver can dirty these mapped pages.
> Such pages should be marked dirty so as to maintain memory consistency
> for a user making use of dirty page tracking.
> 
> To get bitmap during unmap, user should allocate memory for bitmap, set
> size of allocated memory, set page size to be considered for bitmap and
> set flag VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP.
> 
> Signed-off-by: Kirti Wankhede 
> Reviewed-by: Neo Jia 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 84 
> +++--
>  include/uapi/linux/vfio.h   | 10 +
>  2 files changed, 90 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 01dcb417836f..8b27faf1ec38 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -983,12 +983,14 @@ static int verify_bitmap_size(uint64_t npages, uint64_t 
> bitmap_size)
>  }
>  
>  static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> -  struct vfio_iommu_type1_dma_unmap *unmap)
> +  struct vfio_iommu_type1_dma_unmap *unmap,
> +  struct vfio_bitmap *bitmap)
>  {
>   uint64_t mask;
>   struct vfio_dma *dma, *dma_last = NULL;
>   size_t unmapped = 0;
>   int ret = 0, retries = 0;
> + unsigned long *final_bitmap = NULL, *temp_bitmap = NULL;
>  
>   mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
>  
> @@ -1041,6 +1043,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>   ret = -EINVAL;
>   goto unlock;
>   }
> +
>   dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
>   if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
>   ret = -EINVAL;
> @@ -1048,6 +1051,22 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>   }
>   }
>  
> + if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +  iommu->dirty_page_tracking) {

Why do we even accept VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP when not
dirty page tracking rather than returning -EINVAL?  It would simplify
things here to reject it at the ioctl and silently ignoring a flag is
rarely if ever the right approach.

> + final_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
> + if (!final_bitmap) {
> + ret = -ENOMEM;
> + goto unlock;
> + }
> +
> + temp_bitmap = kvzalloc(bitmap->size, GFP_KERNEL);
> + if (!temp_bitmap) {
> + ret = -ENOMEM;
> + kfree(final_bitmap);
> + goto unlock;
> + }

YIKES!  So the user can instantly trigger the kernel to internally
allocate 2 x 256MB, regardless of how much they can actually map.

> + }
> +
>   while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
>   if (!iommu->v2 && unmap->iova > dma->iova)
>   break;
> @@ -1058,6 +1077,24 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>   if (dma->task->mm != current->mm)
>   break;
>  
> + if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
> +  iommu->dirty_page_tracking) {
> + unsigned long pgshift = __ffs(bitmap->pgsize);
> + unsigned int npages = dma->size >> pgshift;
> + unsigned int shift;
> +
> + vfio_iova_dirty_bitmap(iommu, dma->iova, dma->size,
> + bitmap->pgsize, (u64 *)temp_bitmap);

vfio_iova_dirty_bitmap() takes a __user bitmap, we're doing
copy_to_user() on a kernel allocated buffer???

> +
> + shift = (dma->iova - unmap->iova) >> pgshift;
> + if (shift)
> + bitmap_shift_left(temp_bitmap, temp_bitmap,
> +   shift, npages);
> + bitmap_or(final_bitmap, final_bitmap, temp_bitmap,
> +   shift + npages);
> + memset(temp_bitmap, 0, bitmap->size);
> + }

It seems like if the per vfio_dma dirty bitmap was oversized by a long
that we could shift it in place, then we'd only need one working bitmap
buffer and we could size that to fit the vfio_dma (or the largest
vfio_dma if we don't want to free and re-alloc for each vfio_dma).
We'd need to do more copy_to/from_user()s, but we'd also avoid copying
between sparse mappings (user zero'd bitmap required) and we'd have a
far more 

Re: [PATCH v2 5/5] vhost: add device started check in migration set log

2020-05-06 Thread Raphael Norwitz
As you correctly point out, this code needs to be looked at more
carefully so that
if the device does disconnect in the background we can handle the migration path
gracefully. In particular, we need to decide whether a migration
should be allowed
to continue if a device disconnects during the migration stage.

mst, any thoughts?

Have you looked at the suggestion I gave Li Feng to move vhost_dev_cleanup()
into the connection path in vhost-user-blk? I’m not sure if he’s
actively working on it,
but I would prefer if we can find a way to keep some state around
between reconnects
so we aren’t constantly checking dev->started. A device can be stopped
for reasons
other than backend disconnect so I’d rather not reuse this field to
check for backend
disconnect failures.

On Thu, Apr 30, 2020 at 9:57 AM Dima Stepanov  wrote:
>
> If vhost-user daemon is used as a backend for the vhost device, then we
> should consider a possibility of disconnect at any moment. If such
> disconnect happened in the vhost_migration_log() routine the vhost
> device structure will be clean up.
> At the start of the vhost_migration_log() function there is a check:
>   if (!dev->started) {
>   dev->log_enabled = enable;
>   return 0;
>   }
> To be consistent with this check add the same check after calling the
> vhost_dev_set_log() routine. This in general help not to break a

Could you point to the specific asserts which are being triggered?

> migration due the assert() message. But it looks like that this code
> should be revised to handle these errors more carefully.
>
> In case of vhost-user device backend the fail paths should consider the
> state of the device. In this case we should skip some function calls
> during rollback on the error paths, so not to get the NULL dereference
> errors.
>
> Signed-off-by: Dima Stepanov 
> ---
>  hw/virtio/vhost.c | 39 +++
>  1 file changed, 35 insertions(+), 4 deletions(-)
>
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 3ee50c4..d5ab96d 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -787,6 +787,17 @@ static int vhost_dev_set_features(struct vhost_dev *dev,
>  static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
>  {
>  int r, i, idx;

A couple points here


(1) This will fail the live migration if the device is disconnected.
That my be the right thing
  to do, but if there are cases where migrations can proceed with
a disconnected device,
  this may not be desirable.

(2) This looks racy. As far as I can tell vhost_dev_set_log() is only
called by vhost_migration_log(),
  and as you say one of the first things vhost_migration_log does
is return if dev->started is not
  set. What’s to stop a disconnect from clearing the vdev right
after this check, just before
  vhost_dev_set_features() is called?

As stated above, I would prefer it if we could add some state which
would persist between
reconnects which could then be checked in the vhost-user code before
interacting with
the backend. I understand this will be a much more involved change and
will require a lot
of thought.

Also, regarding (1) above, if the original check in
vhost_migration_log() returns success if the
device is not started why return an error here? I imagine this could
lead to some inconsistent
behavior if the device disconnects before the first check verses
before the second.

> +
> +if (!dev->started) {
> +/*
> + * If vhost-user daemon is used as a backend for the
> + * device and the connection is broken, then the vhost_dev
> + * structure will be reset all its values to 0.
> + * Add additional check for the device state.
> + */
> +return -1;
> +}
> +
>  r = vhost_dev_set_features(dev, enable_log);
>  if (r < 0) {
>  goto err_features;
> @@ -801,12 +812,19 @@ static int vhost_dev_set_log(struct vhost_dev *dev, 
> bool enable_log)
>  }



Re: [PATCH v2 5/5] qemu-img: Add --start-offset and --max-length to map

2020-05-06 Thread Eric Blake

On 5/6/20 4:34 PM, Eyal Moscovici wrote:

The mapping operation of large disks especially ones stored over a
long chain of QCOW2 files can take a long time to finish.
Additionally, when mapping failed there was no way to recover by
restarting the mapping from the failed location.

The new options, --start-offset and --max-length, allow the user to
divide these types of map operations into shorter independent tasks.

Reviewed-by: Eric Blake 


This patch has some changes from v1.  Among others,...


@@ -3041,6 +3045,18 @@ static int img_map(int argc, char **argv)
  case OPTION_OUTPUT:
  output = optarg;
  break;
+case 's':
+start_offset = cvtnum("start offset", optarg);
+if (start_offset < 0) {
+return 1;
+}
+break;


the new semantics of cvtnum() in this series is enough of a difference 
that I would have removed R-b to make sure the updated patch gets 
re-reviewed, if it had been me as author.  But in this case, it does 
look like the changes are all addressed to comments I suggested in v1, 
so I'm fine that you left my R-b.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 3/5] qemu-img: validate image length in img_map

2020-05-06 Thread Eric Blake

On 5/6/20 4:34 PM, Eyal Moscovici wrote:

The code handles this case correctly; we merely skip the loop. However, it
is probably best to return an explicit error.

Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
  qemu-img.c | 5 +
  1 file changed, 5 insertions(+)


Reviewed-by: Eric Blake 



diff --git a/qemu-img.c b/qemu-img.c
index 4a06ab7fe8..a1b507a0be 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3086,6 +3086,11 @@ static int img_map(int argc, char **argv)
  }
  
  length = blk_getlength(blk);

+if (length < 0) {
+error_report("Failed to get size for '%s'", filename);
+return 1;
+}
+
  while (curr.start + curr.length < length) {
  int64_t offset = curr.start + curr.length;
  int64_t n;



--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 2/5] qemu_img: add error report to cvtnum

2020-05-06 Thread Eric Blake

On 5/6/20 4:34 PM, Eyal Moscovici wrote:

All calls to cvtnum check the return value and print more or less the same
error message. Error reporting was therefore moved into cvtnum to reduce
duplicated code and provide a single error message.

Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
  qemu-img.c | 63 --
  tests/qemu-iotests/049.out |  4 +--
  2 files changed, 28 insertions(+), 39 deletions(-)





-err = qemu_strtosz(s, NULL, );
-if (err < 0) {
+err = qemu_strtosz(arg_value, NULL, );
+if (err < 0 && err != -ERANGE) {
+error_report("Invalid %s specified! You may use "
+ "k, M, G, T, P or E suffixes for ", arg_name);
+error_report("kilobytes, megabytes, gigabytes, terabytes, "
+ "petabytes and exabytes.");
 return err;
 }
-if (value > INT64_MAX) {
+if (err == -ERANGE || value > INT64_MAX) {
+error_report("Invalid %s specified! Must be less than 8 EiB!",


Copied from our pre-existing errors, but why are we shouting at our 
user?  This would be a good time to s/!/./ to tone it down a bit.



@@ -4491,10 +4488,12 @@ static int img_dd_bs(const char *arg,
  {
  int64_t res;
  
-res = cvtnum(arg);

+res = cvtnum("bs", arg);
  
-if (res <= 0) {

-error_report("invalid number: '%s'", arg);
+if (res < 0) {
+return 1;
+} else if (res == 0) {
+error_report("Invalid bs specified! It cannot be 0.");


Maybe it's worth two functions:

int64_t cvtnum_full(const char *name, const char *value, int64_t min, 
int64_t max)


and then a common helper:

int64_t cvtnum(const char *name, const char *value) {
return cvtnum_full(name, value, 0, INT64_MAX);
}

many existing callers remain with cvtnum(), but callers like this could 
be cvtnum("bs", arg, 1, INT64_MAX).  You'd still have to special-case 
other restrictions, such as whether a number must a power-of-2, but 
that's fewer places.



+++ b/tests/qemu-iotests/049.out
@@ -92,13 +92,13 @@ Formatting 'TEST_DIR/t.qcow2', fmt=qcow2 size=1649267441664 
cluster_size=65536 l
  == 3. Invalid sizes ==
  
  qemu-img create -f qcow2 TEST_DIR/t.qcow2 -- -1024

-qemu-img: Image size must be less than 8 EiB!
+qemu-img: Invalid image size specified! Must be less than 8 EiB!


Nice that you checked for iotest fallout.  Is this really the only 
impacted test?


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




[Bug 1805256] Re: qemu-img hangs on rcu_call_ready_event logic in Aarch64 when converting images

2020-05-06 Thread Launchpad Bug Tracker
** Merge proposal linked:
   
https://code.launchpad.net/~rafaeldtinoco/ubuntu/+source/qemu/+git/qemu/+merge/383551

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on rcu_call_ready_event logic in Aarch64 when
  converting images

Status in kunpeng920:
  Triaged
Status in kunpeng920 ubuntu-18.04 series:
  Triaged
Status in kunpeng920 ubuntu-18.04-hwe series:
  Triaged
Status in kunpeng920 ubuntu-19.10 series:
  Triaged
Status in kunpeng920 ubuntu-20.04 series:
  Triaged
Status in kunpeng920 upstream-kernel series:
  Fix Committed
Status in QEMU:
  Fix Released
Status in qemu package in Ubuntu:
  In Progress
Status in qemu source package in Bionic:
  In Progress
Status in qemu source package in Disco:
  In Progress
Status in qemu source package in Eoan:
  In Progress
Status in qemu source package in Focal:
  In Progress

Bug description:
  [Impact]

  * QEMU locking primitives might face a race condition in QEMU Async
  I/O bottom halves scheduling. This leads to a dead lock making either
  QEMU or one of its tools to hang indefinitely.

  [Test Case]

  * qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs in Aarch64.

  [Regression Potential]

  * This is a change to a core part of QEMU: The AIO scheduling. It
  works like a "kernel" scheduler, whereas kernel schedules OS tasks,
  the QEMU AIO code is responsible to schedule QEMU coroutines or event
  listeners callbacks.

  * There was a long discussion upstream about primitives and Aarch64.
  After quite sometime Paolo released this patch and it solves the
  issue. Tested platforms were: amd64 and aarch64 based on his commit
  log.

  * Christian suggests that this fix stay little longer in -proposed to
  make sure it won't cause any regressions.

  * dannf suggests we also check for performance regressions; e.g. how
  long it takes to convert a cloud image on high-core systems.

  [Other Info]

   * Original Description bellow:

  Command:

  qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs.

  

  Workaround:

  qemu-img convert -m 1 -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Run "qemu-img convert" with "a single coroutine" to avoid this issue.

  

  (gdb) thread 1
  ...
  (gdb) bt
  #0 0xbf1ad81c in __GI_ppoll
  #1 0xaabcf73c in ppoll
  #2 qemu_poll_ns
  #3 0xaabd0764 in os_host_main_loop_wait
  #4 main_loop_wait
  ...

  (gdb) thread 2
  ...
  (gdb) bt
  #0 syscall ()
  #1 0xaabd41cc in qemu_futex_wait
  #2 qemu_event_wait (ev=ev@entry=0xaac86ce8 )
  #3 0xaabed05c in call_rcu_thread
  #4 0xaabd34c8 in qemu_thread_start
  #5 0xbf25c880 in start_thread
  #6 0xbf1b6b9c in thread_start ()

  (gdb) thread 3
  ...
  (gdb) bt
  #0 0xbf11aa20 in __GI___sigtimedwait
  #1 0xbf2671b4 in __sigwait
  #2 0xaabd1ddc in sigwait_compat
  #3 0xaabd34c8 in qemu_thread_start
  #4 0xbf25c880 in start_thread
  #5 0xbf1b6b9c in thread_start

  

  (gdb) run
  Starting program: /usr/bin/qemu-img convert -f qcow2 -O qcow2
  ./disk01.ext4.qcow2 ./output.qcow2

  [New Thread 0xbec5ad90 (LWP 72839)]
  [New Thread 0xbe459d90 (LWP 72840)]
  [New Thread 0xbdb57d90 (LWP 72841)]
  [New Thread 0xacac9d90 (LWP 72859)]
  [New Thread 0xa7ffed90 (LWP 72860)]
  [New Thread 0xa77fdd90 (LWP 72861)]
  [New Thread 0xa6ffcd90 (LWP 72862)]
  [New Thread 0xa67fbd90 (LWP 72863)]
  [New Thread 0xa5ffad90 (LWP 72864)]

  [Thread 0xa5ffad90 (LWP 72864) exited]
  [Thread 0xa6ffcd90 (LWP 72862) exited]
  [Thread 0xa77fdd90 (LWP 72861) exited]
  [Thread 0xbdb57d90 (LWP 72841) exited]
  [Thread 0xa67fbd90 (LWP 72863) exited]
  [Thread 0xacac9d90 (LWP 72859) exited]
  [Thread 0xa7ffed90 (LWP 72860) exited]

  
  """

  All the tasks left are blocked in a system call, so no task left to call
  qemu_futex_wake() to unblock thread #2 (in futex()), which would unblock
  thread #1 (doing poll() in a pipe with thread #2).

  Those 7 threads exit before disk conversion is complete (sometimes in
  the beginning, sometimes at the end).

  

  On the HiSilicon D06 system - a 96 core NUMA arm64 box - qemu-img
  frequently hangs (~50% of the time) with this command:

  qemu-img convert -f qcow2 -O qcow2 /tmp/cloudimg /tmp/cloudimg2

  Where "cloudimg" is a standard qcow2 Ubuntu cloud image. This
  qcow2->qcow2 conversion happens to be something uvtool does every time
  it fetches images.

  Once hung, attaching gdb gives the following backtrace:

  (gdb) bt
  #0  0xae4f8154 in __GI_ppoll (fds=0xe8a67dc0, 
nfds=187650274213760,
  timeout=, timeout@entry=0x0, sigmask=0xc123b950)
  at ../sysdeps/unix/sysv/linux/ppoll.c:39
  #1  

Re: [PATCH v2 1/5] qemu-img: remove check that cvtnum value > MAX_INT

2020-05-06 Thread Eric Blake

On 5/6/20 4:34 PM, Eyal Moscovici wrote:

Following commit f46bfdbfc8f95cf65d7818ef68a801e063c40332 (util/cutils: Change
qemu_strtosz*() from int64_t to uint64_t) which added a similar check to
cvtnum. As a result there is no need to check it separately outside of cvtnum.

Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
  qemu-img.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 6a4327aaba..116a9c6349 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -4307,7 +4307,7 @@ static int img_bench(int argc, char **argv)
  int64_t sval;
  
  sval = cvtnum(optarg);

-if (sval < 0 || sval > INT_MAX) {
+if (sval < 0) {
  error_report("Invalid buffer size specified");


INT_MAX is smaller than cvtnum's check for INT64_MAX.  This code change 
allows larger buffer sizes, which is probably not a good idea.



  return 1;
  }
@@ -4320,7 +4320,7 @@ static int img_bench(int argc, char **argv)
  int64_t sval;
  
  sval = cvtnum(optarg);

-if (sval < 0 || sval > INT_MAX) {
+if (sval < 0) {
  error_report("Invalid step size specified");
  return 1;
  }
@@ -4493,7 +4493,7 @@ static int img_dd_bs(const char *arg,
  
  res = cvtnum(arg);
  
-if (res <= 0 || res > INT_MAX) {

+if (res <= 0) {
  error_report("invalid number: '%s'", arg);
  return 1;
  }



NACK.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 0/5] Additional parameters for qemu_img map

2020-05-06 Thread Eric Blake

On 5/6/20 4:34 PM, Eyal Moscovici wrote:

Hi,

The following series adds two parameters to qemu-img map:
1. start-offset: mapping starting offset.
2. max-length: the length of the mapping.

These parameters proved useful when mapping large disks spread across
long store-file chains. They allow us to bound the execution time of each
qemu-img map execution as well as recover from failed mapping
operations. In addition, the map operation can be divided into
multiple independent tasks.

V2 changes:
1. add error reporting to cvtnum.
2. add image length validation in img_map.
3. rebase over QEMU 5.0.


It's better to post a v2 as a new top-level thread rather than buried 
in-reply-to the v1 thread; among other things, burying a reply can cause 
automated patch tooling to miss the updated series.


But since I see it, I'll review.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 6/8] block/vhdx: drop unallocated_blocks_are_zero

2020-05-06 Thread Eric Blake

On 5/6/20 4:25 AM, Vladimir Sementsov-Ogievskiy wrote:

vhdx doesn't have .bdrv_co_block_status handler, so DATA|ALLOCATED is
always assumed for it. unallocated_blocks_are_zero is useless, drop it.



After the analysis I did in patch 1, this is correct.  No behavior change.

Reviewed-by: Eric Blake 


Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/vhdx.c | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/block/vhdx.c b/block/vhdx.c
index aedd782604..45963a3166 100644
--- a/block/vhdx.c
+++ b/block/vhdx.c
@@ -1164,9 +1164,6 @@ static int vhdx_get_info(BlockDriverState *bs, 
BlockDriverInfo *bdi)
  
  bdi->cluster_size = s->block_size;
  
-bdi->unallocated_blocks_are_zero =

-(s->params.data_bits & VHDX_PARAMS_HAS_PARENT) == 0;
-
  return 0;
  }
  



--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 5/8] block/file-posix: drop unallocated_blocks_are_zero

2020-05-06 Thread Eric Blake

On 5/6/20 4:25 AM, Vladimir Sementsov-Ogievskiy wrote:

raw_co_block_status() in block/file-posix.c never returns 0, so
unallocated_blocks_are_zero is useless. Drop it.


As in 4/8, you are correct that it had no impact on block_status, but it 
did affect qemu-img convert.  So again, removing the clients first makes 
this easier to justify as a cleanup patch.


That said...



Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/file-posix.c | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 05e094be29..5c01735108 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2878,9 +2878,6 @@ static int coroutine_fn raw_co_pwrite_zeroes(
  
  static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)

  {
-BDRVRawState *s = bs->opaque;
-
-bdi->unallocated_blocks_are_zero = s->discard_zeroes;
  return 0;
  }


the function now does nothing.  Hmm - why does bdrv_get_info() return 
-ENOTSUP if the driver does not implement this function?  Wouldn't it be 
better if the block layer could allow us to omit .bdrv_get_info and do 
the same thing, without us having to write a dummy function that does 
nothing but return 0?  As separate patches, of course, as it would 
require changing several existing bdrv_get_info() callers to behave 
sanely when getting an all-0 success return where they now deal with an 
-ENOTSUP return.


So in the meantime, yes, we need this placeholder.
Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




[PATCH v2 5/5] qemu-img: Add --start-offset and --max-length to map

2020-05-06 Thread Eyal Moscovici
The mapping operation of large disks especially ones stored over a
long chain of QCOW2 files can take a long time to finish.
Additionally, when mapping fails there was no way to recover by
restarting the mapping from the failed location.

The new options, --start-offset and --max-length allows the user to
divide these type of map operations into shorter independent tasks.

Reviewed-by: Eric Blake 
Acked-by: Mark Kanda 
Co-developed-by: Yoav Elnekave 
Signed-off-by: Yoav Elnekave 
Signed-off-by: Eyal Moscovici 
---
 docs/tools/qemu-img.rst |  2 +-
 qemu-img-cmds.hx|  4 ++--
 qemu-img.c  | 22 +-
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
index 0080f83a76..f4ffe528ea 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -519,7 +519,7 @@ Command description:
 ``ImageInfoSpecific*`` QAPI object (e.g. ``ImageInfoSpecificQCow2``
 for qcow2 images).
 
-.. option:: map [--object OBJECTDEF] [--image-opts] [-f FMT] [--output=OFMT] 
[-U] FILENAME
+.. option:: map [--object OBJECTDEF] [--image-opts] [-f FMT] 
[--start-offset=OFFSET] [--max-length=LEN] [--output=OFMT] [-U] FILENAME
 
   Dump the metadata of image *FILENAME* and its backing file chain.
   In particular, this commands dumps the allocation state of every sector
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index c9c54de1df..35f832816f 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -63,9 +63,9 @@ SRST
 ERST
 
 DEF("map", img_map,
-"map [--object objectdef] [--image-opts] [-f fmt] [--output=ofmt] [-U] 
filename")
+"map [--object objectdef] [--image-opts] [-f fmt] [--start-offset=offset] 
[--max-length=len] [--output=ofmt] [-U] filename")
 SRST
-.. option:: map [--object OBJECTDEF] [--image-opts] [-f FMT] [--output=OFMT] 
[-U] FILENAME
+.. option:: map [--object OBJECTDEF] [--image-opts] [-f FMT] 
[--start-offset=OFFSET] [--max-length=LEN] [--output=OFMT] [-U] FILENAME
 ERST
 
 DEF("measure", img_measure,
diff --git a/qemu-img.c b/qemu-img.c
index 0a140fe564..f59b2c0a7c 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3003,6 +3003,8 @@ static int img_map(int argc, char **argv)
 int ret = 0;
 bool image_opts = false;
 bool force_share = false;
+int64_t start_offset = 0;
+int64_t max_length = -1;
 
 fmt = NULL;
 output = NULL;
@@ -3015,9 +3017,11 @@ static int img_map(int argc, char **argv)
 {"object", required_argument, 0, OPTION_OBJECT},
 {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
 {"force-share", no_argument, 0, 'U'},
+{"start-offset", required_argument, 0, 's'},
+{"max-length", required_argument, 0, 'l'},
 {0, 0, 0, 0}
 };
-c = getopt_long(argc, argv, ":f:hU",
+c = getopt_long(argc, argv, ":f:s:l:hU",
 long_options, _index);
 if (c == -1) {
 break;
@@ -3041,6 +3045,18 @@ static int img_map(int argc, char **argv)
 case OPTION_OUTPUT:
 output = optarg;
 break;
+case 's':
+start_offset = cvtnum("start offset", optarg);
+if (start_offset < 0) {
+return 1;
+}
+break;
+case 'l':
+max_length = cvtnum("max length", optarg);
+if (max_length < 0) {
+return 1;
+}
+break;
 case OPTION_OBJECT: {
 QemuOpts *opts;
 opts = qemu_opts_parse_noisily(_object_opts,
@@ -3091,7 +3107,11 @@ static int img_map(int argc, char **argv)
 error_report("Failed to get size for '%s'", filename);
 return 1;
 }
+if (max_length != -1) {
+length = MIN(start_offset + max_length, length);
+}
 
+curr.start = start_offset;
 while (curr.start + curr.length < length) {
 int64_t offset = curr.start + curr.length;
 int64_t n;
-- 
2.17.2 (Apple Git-113)




[PATCH v2 4/5] qemu-img: refactor dump_map_entry JSON format output

2020-05-06 Thread Eyal Moscovici
Previously dump_map_entry identified whether we need to start a new JSON
array based on whether start address == 0. In this refactor we remove
this assumption as in following patches we will allow map to start from
an arbitrary position.

Reviewed-by: Eric Blake 
Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
 qemu-img.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index a1b507a0be..0a140fe564 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2896,9 +2896,8 @@ static int dump_map_entry(OutputFormat output_format, 
MapEntry *e,
 }
 break;
 case OFORMAT_JSON:
-printf("%s{ \"start\": %"PRId64", \"length\": %"PRId64","
+printf("{ \"start\": %"PRId64", \"length\": %"PRId64","
" \"depth\": %"PRId64", \"zero\": %s, \"data\": %s",
-   (e->start == 0 ? "[" : ",\n"),
e->start, e->length, e->depth,
e->zero ? "true" : "false",
e->data ? "true" : "false");
@@ -2907,8 +2906,8 @@ static int dump_map_entry(OutputFormat output_format, 
MapEntry *e,
 }
 putchar('}');
 
-if (!next) {
-printf("]\n");
+if (next) {
+puts(",");
 }
 break;
 }
@@ -3083,6 +3082,8 @@ static int img_map(int argc, char **argv)
 
 if (output_format == OFORMAT_HUMAN) {
 printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
+} else if (output_format == OFORMAT_JSON) {
+putchar('[');
 }
 
 length = blk_getlength(blk);
@@ -3119,6 +3120,9 @@ static int img_map(int argc, char **argv)
 }
 
 ret = dump_map_entry(output_format, , NULL);
+if (output_format == OFORMAT_JSON) {
+puts("]");
+}
 
 out:
 blk_unref(blk);
-- 
2.17.2 (Apple Git-113)




[PATCH v2 3/5] qemu-img: validate image length in img_map

2020-05-06 Thread Eyal Moscovici
The code handles this case correctly; we merely skip the loop. However, it
is probably best to return an explicit error.

Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
 qemu-img.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/qemu-img.c b/qemu-img.c
index 4a06ab7fe8..a1b507a0be 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3086,6 +3086,11 @@ static int img_map(int argc, char **argv)
 }
 
 length = blk_getlength(blk);
+if (length < 0) {
+error_report("Failed to get size for '%s'", filename);
+return 1;
+}
+
 while (curr.start + curr.length < length) {
 int64_t offset = curr.start + curr.length;
 int64_t n;
-- 
2.17.2 (Apple Git-113)




[PATCH v2 2/5] qemu_img: add error report to cvtnum

2020-05-06 Thread Eyal Moscovici
All calls to cvtnum check the return value and print more or less the same
error message. Error reporting was therefore moved into cvtnum to reduce
duplicated code and provide a single error message.

Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
 qemu-img.c | 63 --
 tests/qemu-iotests/049.out |  4 +--
 2 files changed, 28 insertions(+), 39 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 116a9c6349..4a06ab7fe8 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -470,16 +470,22 @@ static int add_old_style_options(const char *fmt, 
QemuOpts *opts,
 return 0;
 }
 
-static int64_t cvtnum(const char *s)
+static int64_t cvtnum(const char *arg_name, const char *arg_value)
 {
 int err;
 uint64_t value;
 
-err = qemu_strtosz(s, NULL, );
-if (err < 0) {
+err = qemu_strtosz(arg_value, NULL, );
+if (err < 0 && err != -ERANGE) {
+error_report("Invalid %s specified! You may use "
+ "k, M, G, T, P or E suffixes for ", arg_name);
+error_report("kilobytes, megabytes, gigabytes, terabytes, "
+ "petabytes and exabytes.");
 return err;
 }
-if (value > INT64_MAX) {
+if (err == -ERANGE || value > INT64_MAX) {
+error_report("Invalid %s specified! Must be less than 8 EiB!",
+ arg_name);
 return -ERANGE;
 }
 return value;
@@ -572,16 +578,8 @@ static int img_create(int argc, char **argv)
 if (optind < argc) {
 int64_t sval;
 
-sval = cvtnum(argv[optind++]);
+sval = cvtnum("image size", argv[optind++]);
 if (sval < 0) {
-if (sval == -ERANGE) {
-error_report("Image size must be less than 8 EiB!");
-} else {
-error_report("Invalid image size specified! You may use k, M, "
-  "G, T, P or E suffixes for ");
-error_report("kilobytes, megabytes, gigabytes, terabytes, "
- "petabytes and exabytes.");
-}
 goto fail;
 }
 img_size = (uint64_t)sval;
@@ -2187,8 +2185,10 @@ static int img_convert(int argc, char **argv)
 {
 int64_t sval;
 
-sval = cvtnum(optarg);
-if (sval < 0 || !QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) ||
+sval = cvtnum("buffer size for sparse output", optarg);
+if (sval < 0) {
+goto fail_getopt;
+} else if (!QEMU_IS_ALIGNED(sval, BDRV_SECTOR_SIZE) ||
 sval / BDRV_SECTOR_SIZE > MAX_BUF_SECTORS) {
 error_report("Invalid buffer size for sparse output specified. 
"
 "Valid sizes are multiples of %llu up to %llu. Select "
@@ -4291,9 +4291,8 @@ static int img_bench(int argc, char **argv)
 break;
 case 'o':
 {
-offset = cvtnum(optarg);
+offset = cvtnum("offset", optarg);
 if (offset < 0) {
-error_report("Invalid offset specified");
 return 1;
 }
 break;
@@ -4306,9 +4305,8 @@ static int img_bench(int argc, char **argv)
 {
 int64_t sval;
 
-sval = cvtnum(optarg);
+sval = cvtnum("buffer size", optarg);
 if (sval < 0) {
-error_report("Invalid buffer size specified");
 return 1;
 }
 
@@ -4319,9 +4317,8 @@ static int img_bench(int argc, char **argv)
 {
 int64_t sval;
 
-sval = cvtnum(optarg);
+sval = cvtnum("step_size", optarg);
 if (sval < 0) {
-error_report("Invalid step size specified");
 return 1;
 }
 
@@ -4491,10 +4488,12 @@ static int img_dd_bs(const char *arg,
 {
 int64_t res;
 
-res = cvtnum(arg);
+res = cvtnum("bs", arg);
 
-if (res <= 0) {
-error_report("invalid number: '%s'", arg);
+if (res < 0) {
+return 1;
+} else if (res == 0) {
+error_report("Invalid bs specified! It cannot be 0.");
 return 1;
 }
 in->bsz = out->bsz = res;
@@ -4506,10 +4505,9 @@ static int img_dd_count(const char *arg,
 struct DdIo *in, struct DdIo *out,
 struct DdInfo *dd)
 {
-dd->count = cvtnum(arg);
+dd->count = cvtnum("count", arg);
 
 if (dd->count < 0) {
-error_report("invalid number: '%s'", arg);
 return 1;
 }
 
@@ -4538,10 +4536,9 @@ static int img_dd_skip(const char *arg,
struct DdIo *in, struct DdIo *out,
struct DdInfo *dd)
 {
-in->offset = cvtnum(arg);
+in->offset = cvtnum("skip", arg);
 
 if (in->offset < 0) {
-error_report("invalid number: '%s'", arg);
 return 1;
 }
 
@@ -4923,16 +4920,8 @@ static int img_measure(int argc, char **argv)
 {
 int64_t sval;
 
-  

[PATCH v2 0/5] Additional parameters for qemu_img map

2020-05-06 Thread Eyal Moscovici
Hi,

The following series adds two parameters to qemu-img map:
1. start-offset: mapping starting offset.
2. max-length: the length of the mapping.

These parameters proved useful when mapping large disks spread across
long store-file chains. They allow us to bound the execution time of each
qemu-img map execution as well as recover from failed mapping
operations. In addition, the map operation can be divided into
multiple independent tasks.

V2 changes:
1. add error reporting to cvtnum.
2. add image length validation in img_map.
3. rebase over QEMU 5.0.

Eyal Moscovici (5):
  qemu-img: remove check that cvtnum value > MAX_INT
  qemu_img: add error report to cvtnum
  qemu-img: validate image length in img_map
  qemu-img: refactor dump_map_entry JSON format output
  qemu-img: Add --start-offset and --max-length to map

 docs/tools/qemu-img.rst|   2 +-
 qemu-img-cmds.hx   |   4 +-
 qemu-img.c | 106 ++---
 tests/qemu-iotests/049.out |   4 +-
 4 files changed, 67 insertions(+), 49 deletions(-)

-- 
2.17.2 (Apple Git-113)




[PATCH v2 1/5] qemu-img: remove check that cvtnum value > MAX_INT

2020-05-06 Thread Eyal Moscovici
Following commit f46bfdbfc8f95cf65d7818ef68a801e063c40332 (util/cutils: Change
qemu_strtosz*() from int64_t to uint64_t) which added a similar check to
cvtnum. As a result there is no need to check it separately outside of cvtnum.

Acked-by: Mark Kanda 
Signed-off-by: Eyal Moscovici 
---
 qemu-img.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 6a4327aaba..116a9c6349 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -4307,7 +4307,7 @@ static int img_bench(int argc, char **argv)
 int64_t sval;
 
 sval = cvtnum(optarg);
-if (sval < 0 || sval > INT_MAX) {
+if (sval < 0) {
 error_report("Invalid buffer size specified");
 return 1;
 }
@@ -4320,7 +4320,7 @@ static int img_bench(int argc, char **argv)
 int64_t sval;
 
 sval = cvtnum(optarg);
-if (sval < 0 || sval > INT_MAX) {
+if (sval < 0) {
 error_report("Invalid step size specified");
 return 1;
 }
@@ -4493,7 +4493,7 @@ static int img_dd_bs(const char *arg,
 
 res = cvtnum(arg);
 
-if (res <= 0 || res > INT_MAX) {
+if (res <= 0) {
 error_report("invalid number: '%s'", arg);
 return 1;
 }
-- 
2.17.2 (Apple Git-113)




Re: [PATCH 4/8] block/iscsi: drop unallocated_blocks_are_zero

2020-05-06 Thread Eric Blake

On 5/6/20 4:25 AM, Vladimir Sementsov-Ogievskiy wrote:

We set bdi->unallocated_blocks_are_zero = iscsilun->lbprz, but
iscsi_co_block_status doesn't return 0 in case of iscsilun->lbprz, it
returns ZERO when appropriate. So actually unallocated_blocks_are_zero
is useless. Drop it now.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/iscsi.c | 1 -
  1 file changed, 1 deletion(-)


This one is easier to justify after removing the 2 clients.  But it's 
simpler than patch 1 in that because block_status never returned 0, this 
has no visible impact to 'qemu-io -c map' or similar, so it doesn't need 
the commit message justification about any change in behavior like patch 
1 needed.


Reviewed-by: Eric Blake 



diff --git a/block/iscsi.c b/block/iscsi.c
index a8b76979d8..767e3e75fd 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2163,7 +2163,6 @@ static int coroutine_fn 
iscsi_co_truncate(BlockDriverState *bs, int64_t offset,
  static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
  {
  IscsiLun *iscsilun = bs->opaque;
-bdi->unallocated_blocks_are_zero = iscsilun->lbprz;
  bdi->cluster_size = iscsilun->cluster_size;
  return 0;
  }



--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 3/8] block/crypto: drop unallocated_blocks_are_zero

2020-05-06 Thread Eric Blake

On 5/6/20 4:25 AM, Vladimir Sementsov-Ogievskiy wrote:

It's false by default, so there is no need to set it. We are going to drop
this variable entirely, so drop it here now; it doesn't hurt.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/crypto.c | 1 -
  1 file changed, 1 deletion(-)


Trivially correct, regardless of clients.

Reviewed-by: Eric Blake 



diff --git a/block/crypto.c b/block/crypto.c
index e02f343590..7685e61844 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -694,7 +694,6 @@ static int block_crypto_get_info_luks(BlockDriverState *bs,
  return ret;
  }
  
-bdi->unallocated_blocks_are_zero = false;

  bdi->cluster_size = subbdi.cluster_size;
  
  return 0;




--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 2/8] block/vpc: return ZERO block-status when appropriate

2020-05-06 Thread Eric Blake

On 5/6/20 4:25 AM, Vladimir Sementsov-Ogievskiy wrote:

In case when get_image_offset() returns -1, we do zero out the
corresponding chunk of qiov. So, this should be reported as ZERO.

After block-status update, it never reports 0, so setting
unallocated_blocks_are_zero doesn't make sense. Drop it.


Same analysis as in patch 1 as to the lone two clients that cared, and 
the fact that we are changing 'qemu-io -c map' output by reporting data 
as allocated now.  But I concur that as there is never a backing file, 
the change is not a regression, but rather a bug fix.




Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/vpc.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)


While the commit message could be improved, the code change itself looks 
correct.


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2] target/riscv: fix check of guest pa top bits

2020-05-06 Thread Alistair Francis
On Tue, May 5, 2020 at 1:40 PM Alistair Francis  wrote:
>
> On Fri, May 1, 2020 at 11:51 AM Jose Martins  wrote:
> >
> > The spec states that on sv39x4 guest physical  "address bits 63:41 must
> > all be zeros, or else a guest-page-fault exception occurs.".  However,
> > the check performed for these top bits of the virtual address on the
> > second stage is the same as the one performed for virtual addresses on
> > the first stage except with the 2-bit extension, effectively creating
> > the same kind of "hole" in the guest's physical address space. I believe
> > the following patch fixes this issue:
> >
> > Signed-off-by: Jose Martins 
>
> Reviewed-by: Alistair Francis 
>
> Applied to RISC-V tree.

This breaks 32-bit Hypervisors, can you look into it?

Alistair

>
> Alistair
>
> > ---
> >  target/riscv/cpu_helper.c | 20 +---
> >  1 file changed, 13 insertions(+), 7 deletions(-)
> >
> > diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> > index 247304d850..ae22c30bdd 100644
> > --- a/target/riscv/cpu_helper.c
> > +++ b/target/riscv/cpu_helper.c
> > @@ -426,15 +426,21 @@ static int get_physical_address(CPURISCVState *env, 
> > hwaddr *physical,
> >  int va_bits = PGSHIFT + levels * ptidxbits + widened;
> >  target_ulong mask, masked_msbs;
> >
> > -if (TARGET_LONG_BITS > (va_bits - 1)) {
> > -mask = (1L << (TARGET_LONG_BITS - (va_bits - 1))) - 1;
> > +if (!first_stage) {
> > +if ((addr >> va_bits) != 0) {
> > +return TRANSLATE_FAIL;
> > +}
> >  } else {
> > -mask = 0;
> > -}
> > -masked_msbs = (addr >> (va_bits - 1)) & mask;
> > +if (TARGET_LONG_BITS > (va_bits - 1)) {
> > +mask = (1L << (TARGET_LONG_BITS - (va_bits - 1))) - 1;
> > +} else {
> > +mask = 0;
> > +}
> > +masked_msbs = (addr >> (va_bits - 1)) & mask;
> >
> > -if (masked_msbs != 0 && masked_msbs != mask) {
> > -return TRANSLATE_FAIL;
> > +if (masked_msbs != 0 && masked_msbs != mask) {
> > +return TRANSLATE_FAIL;
> > +}
> >  }
> >
> >  int ptshift = (levels - 1) * ptidxbits;
> > --
> > 2.25.1
> >
> >



[Bug 1805256] Re: qemu-img hangs on rcu_call_ready_event logic in Aarch64 when converting images

2020-05-06 Thread Launchpad Bug Tracker
** Merge proposal linked:
   
https://code.launchpad.net/~rafaeldtinoco/ubuntu/+source/qemu/+git/qemu/+merge/383545

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on rcu_call_ready_event logic in Aarch64 when
  converting images

Status in kunpeng920:
  Triaged
Status in kunpeng920 ubuntu-18.04 series:
  Triaged
Status in kunpeng920 ubuntu-18.04-hwe series:
  Triaged
Status in kunpeng920 ubuntu-19.10 series:
  Triaged
Status in kunpeng920 ubuntu-20.04 series:
  Triaged
Status in kunpeng920 upstream-kernel series:
  Fix Committed
Status in QEMU:
  Fix Released
Status in qemu package in Ubuntu:
  In Progress
Status in qemu source package in Bionic:
  In Progress
Status in qemu source package in Disco:
  In Progress
Status in qemu source package in Eoan:
  In Progress
Status in qemu source package in Focal:
  In Progress

Bug description:
  [Impact]

  * QEMU locking primitives might face a race condition in QEMU Async
  I/O bottom halves scheduling. This leads to a dead lock making either
  QEMU or one of its tools to hang indefinitely.

  [Test Case]

  * qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs in Aarch64.

  [Regression Potential]

  * This is a change to a core part of QEMU: The AIO scheduling. It
  works like a "kernel" scheduler, whereas kernel schedules OS tasks,
  the QEMU AIO code is responsible to schedule QEMU coroutines or event
  listeners callbacks.

  * There was a long discussion upstream about primitives and Aarch64.
  After quite sometime Paolo released this patch and it solves the
  issue. Tested platforms were: amd64 and aarch64 based on his commit
  log.

  * Christian suggests that this fix stay little longer in -proposed to
  make sure it won't cause any regressions.

  * dannf suggests we also check for performance regressions; e.g. how
  long it takes to convert a cloud image on high-core systems.

  [Other Info]

   * Original Description bellow:

  Command:

  qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs.

  

  Workaround:

  qemu-img convert -m 1 -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Run "qemu-img convert" with "a single coroutine" to avoid this issue.

  

  (gdb) thread 1
  ...
  (gdb) bt
  #0 0xbf1ad81c in __GI_ppoll
  #1 0xaabcf73c in ppoll
  #2 qemu_poll_ns
  #3 0xaabd0764 in os_host_main_loop_wait
  #4 main_loop_wait
  ...

  (gdb) thread 2
  ...
  (gdb) bt
  #0 syscall ()
  #1 0xaabd41cc in qemu_futex_wait
  #2 qemu_event_wait (ev=ev@entry=0xaac86ce8 )
  #3 0xaabed05c in call_rcu_thread
  #4 0xaabd34c8 in qemu_thread_start
  #5 0xbf25c880 in start_thread
  #6 0xbf1b6b9c in thread_start ()

  (gdb) thread 3
  ...
  (gdb) bt
  #0 0xbf11aa20 in __GI___sigtimedwait
  #1 0xbf2671b4 in __sigwait
  #2 0xaabd1ddc in sigwait_compat
  #3 0xaabd34c8 in qemu_thread_start
  #4 0xbf25c880 in start_thread
  #5 0xbf1b6b9c in thread_start

  

  (gdb) run
  Starting program: /usr/bin/qemu-img convert -f qcow2 -O qcow2
  ./disk01.ext4.qcow2 ./output.qcow2

  [New Thread 0xbec5ad90 (LWP 72839)]
  [New Thread 0xbe459d90 (LWP 72840)]
  [New Thread 0xbdb57d90 (LWP 72841)]
  [New Thread 0xacac9d90 (LWP 72859)]
  [New Thread 0xa7ffed90 (LWP 72860)]
  [New Thread 0xa77fdd90 (LWP 72861)]
  [New Thread 0xa6ffcd90 (LWP 72862)]
  [New Thread 0xa67fbd90 (LWP 72863)]
  [New Thread 0xa5ffad90 (LWP 72864)]

  [Thread 0xa5ffad90 (LWP 72864) exited]
  [Thread 0xa6ffcd90 (LWP 72862) exited]
  [Thread 0xa77fdd90 (LWP 72861) exited]
  [Thread 0xbdb57d90 (LWP 72841) exited]
  [Thread 0xa67fbd90 (LWP 72863) exited]
  [Thread 0xacac9d90 (LWP 72859) exited]
  [Thread 0xa7ffed90 (LWP 72860) exited]

  
  """

  All the tasks left are blocked in a system call, so no task left to call
  qemu_futex_wake() to unblock thread #2 (in futex()), which would unblock
  thread #1 (doing poll() in a pipe with thread #2).

  Those 7 threads exit before disk conversion is complete (sometimes in
  the beginning, sometimes at the end).

  

  On the HiSilicon D06 system - a 96 core NUMA arm64 box - qemu-img
  frequently hangs (~50% of the time) with this command:

  qemu-img convert -f qcow2 -O qcow2 /tmp/cloudimg /tmp/cloudimg2

  Where "cloudimg" is a standard qcow2 Ubuntu cloud image. This
  qcow2->qcow2 conversion happens to be something uvtool does every time
  it fetches images.

  Once hung, attaching gdb gives the following backtrace:

  (gdb) bt
  #0  0xae4f8154 in __GI_ppoll (fds=0xe8a67dc0, 
nfds=187650274213760,
  timeout=, timeout@entry=0x0, sigmask=0xc123b950)
  at ../sysdeps/unix/sysv/linux/ppoll.c:39
  #1  

Re: [PATCH] riscv: Change the default behavior if no -bios option is specified

2020-05-06 Thread Alistair Francis
On Tue, May 5, 2020 at 6:34 PM Bin Meng  wrote:
>
> Hi Alistair,
>
> On Wed, May 6, 2020 at 6:37 AM Alistair Francis  wrote:
> >
> > On Tue, May 5, 2020 at 1:34 PM Alistair Francis  
> > wrote:
> > >
> > > On Fri, May 1, 2020 at 5:21 AM Bin Meng  wrote:
> > > >
> > > > From: Bin Meng 
> > > >
> > > > Per QEMU deprecated doc, QEMU 4.1 introduced support for the -bios
> > > > option in QEMU for RISC-V for the virt machine and sifive_u machine.
> > > > The default behavior has been that QEMU does not automatically load
> > > > any firmware if no -bios option is included.
> > > >
> > > > Now 2 releases passed, it's time to change the default behavior to
> > > > load the default OpenSBI firmware automatically. The firmware is
> > > > included with the QEMU release and no user interaction is required.
> > > > All a user needs to do is specify the kernel they want to boot with
> > > > the -kernel option.
> > > >
> > > > Signed-off-by: Bin Meng 
> > >
> > > Thanks!
> > >
> > > Reviewed-by: Alistair Francis 
> > >
> > > Applied to the RISC-V tree.
> >
> > This fails `make check`
> >
> > qemu-system-riscv64: Unable to load the RISC-V firmware
> > "opensbi-riscv64-spike-fw_jump.elf"
> > Broken pipe
> > /scratch/alistair/software/qemu/tests/qtest/libqtest.c:166:
> > kill_qemu() tried to terminate QEMU process but encountered exit
> > status 1 (expected 0)
> > ERROR - too few tests run (expected 7, got 2)
> > make: *** [/scratch/alistair/software/qemu/tests/Makefile.include:637:
> > check-qtest-riscv64] Error 1
> >
>
> Please apply this patch to fix the "make check" as well.
>
> [5/5] riscv: Suppress the error report for QEMU testing with
> riscv_find_firmware()
> http://patchwork.ozlabs.org/project/qemu-devel/patch/1588348254-7241-6-git-send-email-bmeng...@gmail.com/

In future please send all related patches in a single series.

I have applied those two patches.

Alistair

>
> Regards,
> Bin



[PATCH v2] e1000e: Added ICR clearing by corresponding IMS bit.

2020-05-06 Thread andrew
From: Andrew Melnychenko 

Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1707441
Added clearing of ICR when the corresponding IMS bit is set, according to
the note in section 13.3.27 of the 8257X developer's manual.

Signed-off-by: Andrew Melnychenko 
---
 hw/net/e1000e_core.c | 9 +
 hw/net/trace-events  | 1 +
 2 files changed, 10 insertions(+)

diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index d5676871fa..302e99ff46 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -2624,6 +2624,15 @@ e1000e_mac_icr_read(E1000ECore *core, int index)
 e1000e_clear_ims_bits(core, core->mac[IAM]);
 }
 
+/*
+ * PCIe* GbE Controllers Open Source Software Developer's Manual
+ * 13.3.27 Interrupt Cause Read Register
+ */
+if (core->mac[ICR] & core->mac[IMS]) {
+trace_e1000e_irq_icr_clear_icr_bit_ims(core->mac[ICR], core->mac[IMS]);
+core->mac[ICR] = 0;
+}
+
 trace_e1000e_irq_icr_read_exit(core->mac[ICR]);
 e1000e_update_interrupt_state(core);
 return ret;
diff --git a/hw/net/trace-events b/hw/net/trace-events
index e18f883cfd..46e40fcfa9 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -237,6 +237,7 @@ e1000e_irq_icr_read_entry(uint32_t icr) "Starting ICR read. 
Current ICR: 0x%x"
 e1000e_irq_icr_read_exit(uint32_t icr) "Ending ICR read. Current ICR: 0x%x"
 e1000e_irq_icr_clear_zero_ims(void) "Clearing ICR on read due to zero IMS"
 e1000e_irq_icr_clear_iame(void) "Clearing ICR on read due to IAME"
+e1000e_irq_icr_clear_icr_bit_ims(uint32_t icr, uint32_t ims) "Clearing ICR on 
read due corresponding IMS bit: 0x%x & 0x%x"
 e1000e_irq_iam_clear_eiame(uint32_t iam, uint32_t cause) "Clearing IMS due to 
EIAME, IAM: 0x%X, cause: 0x%X"
 e1000e_irq_icr_clear_eiac(uint32_t icr, uint32_t eiac) "Clearing ICR bits due 
to EIAC, ICR: 0x%X, EIAC: 0x%X"
 e1000e_irq_ims_clear_set_imc(uint32_t val) "Clearing IMS bits due to IMC write 
0x%x"
-- 
2.24.1




Re: [PATCH v25 00/10] Add ARMv8 RAS virtualization support in QEMU

2020-05-06 Thread Michael S. Tsirkin
On Wed, May 06, 2020 at 07:42:19PM +0800, gengdongjiu wrote:
> On 2020/4/17 21:32, Peter Maydell wrote:
> > On Fri, 10 Apr 2020 at 12:46, Dongjiu Geng  wrote:
> >>
> >> In the ARMv8 platform, the CPU error types includes synchronous external 
> >> abort(SEA)
> >> and SError Interrupt (SEI). If exception happens in guest, host does not 
> >> know the detailed
> >> information of guest, so it is expected that guest can do the recovery. 
> >> For example, if an
> >> exception happens in a guest user-space application, host does not know 
> >> which application
> >> encounters errors, only guest knows it.
> >>
> >> For the ARMv8 SEA/SEI, KVM or host kernel delivers SIGBUS to notify 
> >> userspace.
> >> After user space gets the notification, it will record the CPER into guest 
> >> GHES
> >> buffer and inject an exception or IRQ to guest.
> >>
> >> In the current implementation, if the type of SIGBUS is BUS_MCEERR_AR, we 
> >> will
> >> treat it as a synchronous exception, and notify guest with ARMv8 SEA
> >> notification type after recording CPER into guest.
> > 
> > Hi. I left a comment on patch 1. The other 3 patches unreviewed
> > are 5, 6 and 8, which are all ACPI core code, so that's for
> > MST, Igor or Shannon to review.
> > 
> > Once those have been reviewed, please ping me if you want this
> > to go via target-arm.next.
> 
> Hi Peter,
>    Igor has reviewed all the ACPI core code. Could you apply this series 
> to target-arm.next? I can make further patches to address your comments on 
> patch1 and the other ACPI comment.
> Thanks very much in advance.

Given it all starts with patch 1, it's probably easier to address the
comment and repost.


> > 
> > thanks
> > -- PMM
> > 
> > .
> > 




Re: [PATCH v25 08/10] ACPI: Record Generic Error Status Block(GESB) table

2020-05-06 Thread Michael S. Tsirkin
On Fri, Apr 10, 2020 at 07:46:37PM +0800, Dongjiu Geng wrote:
> kvm_arch_on_sigbus_vcpu() error injection uses source_id as
> index in etc/hardware_errors to find out Error Status Data
> Block entry corresponding to error source. So supported source_id
> values should be assigned here and not be changed afterwards to
> make sure that guest will write error into expected Error Status
> Data Block.
> 
> Before QEMU writes a new error to ACPI table, it will check whether
> previous error has been acknowledged. If not acknowledged, the new
> errors will be ignored and not be recorded. For the errors section
> type, QEMU simulates it as a memory section error.
> 
> Signed-off-by: Dongjiu Geng 
> Signed-off-by: Xiang Zheng 

Reviewed-by: Michael S. Tsirkin 

> ---
> change since v24:
> 1. Using g_array_append_vals() to replace build_append_int_noprefix() to 
> build FRU Text.
> 2. Remove the judgement that judge whether acpi_ged_state is NULL.
> 3. Add le64_to_cpu() to error_block_address
> ---
>  hw/acpi/ghes.c | 219 
> +
>  include/hw/acpi/ghes.h |   1 +
>  2 files changed, 220 insertions(+)
> 
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index e74af23..a3ab2e4 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -26,6 +26,7 @@
>  #include "qemu/error-report.h"
>  #include "hw/acpi/generic_event_device.h"
>  #include "hw/nvram/fw_cfg.h"
> +#include "qemu/uuid.h"
>  
>  #define ACPI_GHES_ERRORS_FW_CFG_FILE"etc/hardware_errors"
>  #define ACPI_GHES_DATA_ADDR_FW_CFG_FILE "etc/hardware_errors_addr"
> @@ -43,6 +44,36 @@
>  #define GAS_ADDR_OFFSET 4
>  
>  /*
> + * The total size of Generic Error Data Entry
> + * ACPI 6.1/6.2: 18.3.2.7.1 Generic Error Data,
> + * Table 18-343 Generic Error Data Entry
> + */
> +#define ACPI_GHES_DATA_LENGTH   72
> +
> +/* The memory section CPER size, UEFI 2.6: N.2.5 Memory Error Section */
> +#define ACPI_GHES_MEM_CPER_LENGTH   80
> +
> +/* Masks for block_status flags */
> +#define ACPI_GEBS_UNCORRECTABLE 1
> +
> +/*
> + * Total size for Generic Error Status Block except Generic Error Data 
> Entries
> + * ACPI 6.2: 18.3.2.7.1 Generic Error Data,
> + * Table 18-380 Generic Error Status Block
> + */
> +#define ACPI_GHES_GESB_SIZE 20
> +
> +/*
> + * Values for error_severity field
> + */
> +enum AcpiGenericErrorSeverity {
> +ACPI_CPER_SEV_RECOVERABLE = 0,
> +ACPI_CPER_SEV_FATAL = 1,
> +ACPI_CPER_SEV_CORRECTED = 2,
> +ACPI_CPER_SEV_NONE = 3,
> +};
> +
> +/*
>   * Hardware Error Notification
>   * ACPI 4.0: 17.3.2.7 Hardware Error Notification
>   * Composes dummy Hardware Error Notification descriptor of specified type
> @@ -73,6 +104,138 @@ static void build_ghes_hw_error_notification(GArray 
> *table, const uint8_t type)
>  }
>  
>  /*
> + * Generic Error Data Entry
> + * ACPI 6.1: 18.3.2.7.1 Generic Error Data
> + */
> +static void acpi_ghes_generic_error_data(GArray *table,
> +const uint8_t *section_type, uint32_t error_severity,
> +uint8_t validation_bits, uint8_t flags,
> +uint32_t error_data_length, QemuUUID fru_id,
> +uint64_t time_stamp)
> +{
> +const uint8_t fru_text[20] = {0};
> +
> +/* Section Type */
> +g_array_append_vals(table, section_type, 16);
> +
> +/* Error Severity */
> +build_append_int_noprefix(table, error_severity, 4);
> +/* Revision */
> +build_append_int_noprefix(table, 0x300, 2);
> +/* Validation Bits */
> +build_append_int_noprefix(table, validation_bits, 1);
> +/* Flags */
> +build_append_int_noprefix(table, flags, 1);
> +/* Error Data Length */
> +build_append_int_noprefix(table, error_data_length, 4);
> +
> +/* FRU Id */
> +g_array_append_vals(table, fru_id.data, ARRAY_SIZE(fru_id.data));
> +
> +/* FRU Text */
> +g_array_append_vals(table, fru_text, sizeof(fru_text));
> +
> +/* Timestamp */
> +build_append_int_noprefix(table, time_stamp, 8);
> +}
> +
> +/*
> + * Generic Error Status Block
> + * ACPI 6.1: 18.3.2.7.1 Generic Error Data
> + */
> +static void acpi_ghes_generic_error_status(GArray *table, uint32_t 
> block_status,
> +uint32_t raw_data_offset, uint32_t raw_data_length,
> +uint32_t data_length, uint32_t error_severity)
> +{
> +/* Block Status */
> +build_append_int_noprefix(table, block_status, 4);
> +/* Raw Data Offset */
> +build_append_int_noprefix(table, raw_data_offset, 4);
> +/* Raw Data Length */
> +build_append_int_noprefix(table, raw_data_length, 4);
> +/* Data Length */
> +build_append_int_noprefix(table, data_length, 4);
> +/* Error Severity */
> +build_append_int_noprefix(table, error_severity, 4);
> +}
> +
> +/* UEFI 2.6: N.2.5 Memory Error Section */
> +static void acpi_ghes_build_append_mem_cper(GArray *table,
> +uint64_t error_physical_addr)
> 

Re: [PATCH v25 06/10] ACPI: Record the Generic Error Status Block address

2020-05-06 Thread Michael S. Tsirkin
On Fri, Apr 10, 2020 at 07:46:35PM +0800, Dongjiu Geng wrote:
> Record the GHEB address via fw_cfg file, when recording
> a error to CPER, it will use this address to find out
> Generic Error Data Entries and write the error.
> 
> In order to avoid migration failure, make hardware
> error table address to a part of GED device instead
> of global variable, then this address will be migrated
> to target QEMU.
> 
> Acked-by: Xiang Zheng 
> Signed-off-by: Dongjiu Geng 

Reviewed-by: Michael S. Tsirkin 

> ---
> change since v24:
> 1. Use s->ghes_state.ghes_addr_le to check in ghes_needed()
> 2. Using hardware_error->len instead of request_block_size to calculate in 
> acpi_ghes_add_fw_cfg()
> 3. Remove assert(vms->acpi_dev) be build APEI table
> 4. Directly use ACPI_GED(vms->acpi_dev) instead of ACPI_GED(vms->acpi_dev)
> ---
>  hw/acpi/generic_event_device.c | 19 +++
>  hw/acpi/ghes.c | 14 ++
>  hw/arm/virt-acpi-build.c   |  8 
>  include/hw/acpi/generic_event_device.h |  2 ++
>  include/hw/acpi/ghes.h |  6 ++
>  5 files changed, 49 insertions(+)
> 
> diff --git a/hw/acpi/generic_event_device.c b/hw/acpi/generic_event_device.c
> index 021ed2b..1491291 100644
> --- a/hw/acpi/generic_event_device.c
> +++ b/hw/acpi/generic_event_device.c
> @@ -234,6 +234,24 @@ static const VMStateDescription vmstate_ged_state = {
>  }
>  };
>  
> +static bool ghes_needed(void *opaque)
> +{
> +AcpiGedState *s = opaque;
> +return s->ghes_state.ghes_addr_le;
> +}
> +
> +static const VMStateDescription vmstate_ghes_state = {
> +.name = "acpi-ged/ghes",
> +.version_id = 1,
> +.minimum_version_id = 1,
> +.needed = ghes_needed,
> +.fields  = (VMStateField[]) {
> +VMSTATE_STRUCT(ghes_state, AcpiGedState, 1,
> +   vmstate_ghes_state, AcpiGhesState),
> +VMSTATE_END_OF_LIST()
> +}
> +};
> +
>  static const VMStateDescription vmstate_acpi_ged = {
>  .name = "acpi-ged",
>  .version_id = 1,
> @@ -244,6 +262,7 @@ static const VMStateDescription vmstate_acpi_ged = {
>  },
>  .subsections = (const VMStateDescription * []) {
> &vmstate_memhp_state,
> +&vmstate_ghes_state,
>  NULL
>  }
>  };
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index 091fd87..e74af23 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -24,6 +24,8 @@
>  #include "hw/acpi/ghes.h"
>  #include "hw/acpi/aml-build.h"
>  #include "qemu/error-report.h"
> +#include "hw/acpi/generic_event_device.h"
> +#include "hw/nvram/fw_cfg.h"
>  
>  #define ACPI_GHES_ERRORS_FW_CFG_FILE"etc/hardware_errors"
>  #define ACPI_GHES_DATA_ADDR_FW_CFG_FILE "etc/hardware_errors_addr"
> @@ -213,3 +215,15 @@ void acpi_build_hest(GArray *table_data, BIOSLinker 
> *linker)
>  build_header(linker, table_data, (void *)(table_data->data + hest_start),
>  "HEST", table_data->len - hest_start, 1, NULL, NULL);
>  }
> +
> +void acpi_ghes_add_fw_cfg(AcpiGhesState *ags, FWCfgState *s,
> +  GArray *hardware_error)
> +{
> +/* Create a read-only fw_cfg file for GHES */
> +fw_cfg_add_file(s, ACPI_GHES_ERRORS_FW_CFG_FILE, hardware_error->data,
> +hardware_error->len);
> +
> +/* Create a read-write fw_cfg file for Address */
> +fw_cfg_add_file_callback(s, ACPI_GHES_DATA_ADDR_FW_CFG_FILE, NULL, NULL,
> +NULL, &(ags->ghes_addr_le), sizeof(ags->ghes_addr_le), false);
> +}
> diff --git a/hw/arm/virt-acpi-build.c b/hw/arm/virt-acpi-build.c
> index f611bce..2726aac 100644
> --- a/hw/arm/virt-acpi-build.c
> +++ b/hw/arm/virt-acpi-build.c
> @@ -911,6 +911,7 @@ void virt_acpi_setup(VirtMachineState *vms)
>  {
>  AcpiBuildTables tables;
>  AcpiBuildState *build_state;
> +AcpiGedState *acpi_ged_state;
>  
>  if (!vms->fw_cfg) {
>  trace_virt_acpi_setup();
> @@ -941,6 +942,13 @@ void virt_acpi_setup(VirtMachineState *vms)
>  fw_cfg_add_file(vms->fw_cfg, ACPI_BUILD_TPMLOG_FILE, 
> tables.tcpalog->data,
>  acpi_data_len(tables.tcpalog));
>  
> +if (vms->ras) {
> +assert(vms->acpi_dev);
> +acpi_ged_state = ACPI_GED(vms->acpi_dev);
> +acpi_ghes_add_fw_cfg(&acpi_ged_state->ghes_state,
> + vms->fw_cfg, tables.hardware_errors);
> +}
> +
>  build_state->rsdp_mr = acpi_add_rom_blob(virt_acpi_build_update,
>   build_state, tables.rsdp,
>   ACPI_BUILD_RSDP_FILE, 0);
> diff --git a/include/hw/acpi/generic_event_device.h 
> b/include/hw/acpi/generic_event_device.h
> index d157eac..037d2b5 100644
> --- a/include/hw/acpi/generic_event_device.h
> +++ b/include/hw/acpi/generic_event_device.h
> @@ -61,6 +61,7 @@
>  
>  #include "hw/sysbus.h"
>  #include "hw/acpi/memory_hotplug.h"
> +#include "hw/acpi/ghes.h"
>  
>  #define ACPI_POWER_BUTTON_DEVICE "PWRB"
>  
> 

Re: [PATCH v25 05/10] ACPI: Build Hardware Error Source Table

2020-05-06 Thread Michael S. Tsirkin
On Fri, Apr 10, 2020 at 07:46:34PM +0800, Dongjiu Geng wrote:
> This patch builds Hardware Error Source Table(HEST) via fw_cfg blobs.
> Now it only supports ARMv8 SEA, a type of Generic Hardware Error
> Source version 2(GHESv2) error source. Afterwards, we can extend
> the supported types if needed. For the CPER section, currently it
> is memory section because kernel mainly wants userspace to handle
> the memory errors.
> 
> This patch follows the spec ACPI 6.2 to build the Hardware Error
> Source table. For more detailed information, please refer to
> document: docs/specs/acpi_hest_ghes.rst
> 
> build_ghes_hw_error_notification() helper will help to add Hardware
> Error Notification to ACPI tables without using packed C structures
> and avoid endianness issues as API doesn't need explicit conversion.
> 
> Signed-off-by: Xiang Zheng 
> Signed-off-by: Dongjiu Geng 

Reviewed-by: Michael S. Tsirkin 

> ---
> change since v24:
> 1. Add acpi_add_table() before acpi_build_hest()
> 2. Pass NULL for oem_table_id in build_header() to build Hardware
>Error Source Table header
> ---
>  hw/acpi/ghes.c   | 126 
> +++
>  hw/arm/virt-acpi-build.c |   2 +
>  include/hw/acpi/ghes.h   |  39 +++
>  3 files changed, 167 insertions(+)
> 
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> index e1b3f8f..091fd87 100644
> --- a/hw/acpi/ghes.c
> +++ b/hw/acpi/ghes.c
> @@ -23,6 +23,7 @@
>  #include "qemu/units.h"
>  #include "hw/acpi/ghes.h"
>  #include "hw/acpi/aml-build.h"
> +#include "qemu/error-report.h"
>  
>  #define ACPI_GHES_ERRORS_FW_CFG_FILE"etc/hardware_errors"
>  #define ACPI_GHES_DATA_ADDR_FW_CFG_FILE "etc/hardware_errors_addr"
> @@ -33,6 +34,42 @@
>  /* Now only support ARMv8 SEA notification type error source */
>  #define ACPI_GHES_ERROR_SOURCE_COUNT1
>  
> +/* Generic Hardware Error Source version 2 */
> +#define ACPI_GHES_SOURCE_GENERIC_ERROR_V2   10
> +
> +/* Address offset in Generic Address Structure(GAS) */
> +#define GAS_ADDR_OFFSET 4
> +
> +/*
> + * Hardware Error Notification
> + * ACPI 4.0: 17.3.2.7 Hardware Error Notification
> + * Composes dummy Hardware Error Notification descriptor of specified type
> + */
> +static void build_ghes_hw_error_notification(GArray *table, const uint8_t 
> type)
> +{
> +/* Type */
> +build_append_int_noprefix(table, type, 1);
> +/*
> + * Length:
> + * Total length of the structure in bytes
> + */
> +build_append_int_noprefix(table, 28, 1);
> +/* Configuration Write Enable */
> +build_append_int_noprefix(table, 0, 2);
> +/* Poll Interval */
> +build_append_int_noprefix(table, 0, 4);
> +/* Vector */
> +build_append_int_noprefix(table, 0, 4);
> +/* Switch To Polling Threshold Value */
> +build_append_int_noprefix(table, 0, 4);
> +/* Switch To Polling Threshold Window */
> +build_append_int_noprefix(table, 0, 4);
> +/* Error Threshold Value */
> +build_append_int_noprefix(table, 0, 4);
> +/* Error Threshold Window */
> +build_append_int_noprefix(table, 0, 4);
> +}
> +
>  /*
>   * Build table for the hardware error fw_cfg blob.
>   * Initialize "etc/hardware_errors" and "etc/hardware_errors_addr" fw_cfg 
> blobs.
> @@ -87,3 +124,92 @@ void build_ghes_error_table(GArray *hardware_errors, 
> BIOSLinker *linker)
>  bios_linker_loader_write_pointer(linker, ACPI_GHES_DATA_ADDR_FW_CFG_FILE,
>  0, sizeof(uint64_t), ACPI_GHES_ERRORS_FW_CFG_FILE, 0);
>  }
> +
> +/* Build Generic Hardware Error Source version 2 (GHESv2) */
> +static void build_ghes_v2(GArray *table_data, int source_id, BIOSLinker 
> *linker)
> +{
> +uint64_t address_offset;
> +/*
> + * Type:
> + * Generic Hardware Error Source version 2(GHESv2 - Type 10)
> + */
> +build_append_int_noprefix(table_data, ACPI_GHES_SOURCE_GENERIC_ERROR_V2, 
> 2);
> +/* Source Id */
> +build_append_int_noprefix(table_data, source_id, 2);
> +/* Related Source Id */
> +build_append_int_noprefix(table_data, 0x, 2);
> +/* Flags */
> +build_append_int_noprefix(table_data, 0, 1);
> +/* Enabled */
> +build_append_int_noprefix(table_data, 1, 1);
> +
> +/* Number of Records To Pre-allocate */
> +build_append_int_noprefix(table_data, 1, 4);
> +/* Max Sections Per Record */
> +build_append_int_noprefix(table_data, 1, 4);
> +/* Max Raw Data Length */
> +build_append_int_noprefix(table_data, ACPI_GHES_MAX_RAW_DATA_LENGTH, 4);
> +
> +address_offset = table_data->len;
> +/* Error Status Address */
> +build_append_gas(table_data, AML_AS_SYSTEM_MEMORY, 0x40, 0,
> + 4 /* QWord access */, 0);
> +bios_linker_loader_add_pointer(linker, ACPI_BUILD_TABLE_FILE,
> +address_offset + GAS_ADDR_OFFSET, sizeof(uint64_t),
> +ACPI_GHES_ERRORS_FW_CFG_FILE, source_id * sizeof(uint64_t));
> +
> +switch (source_id) {
> +case ACPI_HEST_SRC_ID_SEA:
> +   

Re: [PATCH v25 04/10] ACPI: Build related register address fields via hardware error fw_cfg blob

2020-05-06 Thread Michael S. Tsirkin
On Fri, Apr 10, 2020 at 07:46:33PM +0800, Dongjiu Geng wrote:
> This patch builds error_block_address and read_ack_register fields
> in hardware errors table , the error_block_address points to Generic
> Error Status Block(GESB) via bios_linker. The max size for one GESB
> is 1kb, For more detailed information, please refer to
> document: docs/specs/acpi_hest_ghes.rst
> 
> Now we only support one Error source, if necessary, we can extend to
> support more.
> 
> Suggested-by: Laszlo Ersek 
> Signed-off-by: Xiang Zheng 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: Igor Mammedov 
> Signed-off-by: Dongjiu Geng 

Reviewed-by: Michael S. Tsirkin 

> ---
> change since v24:
> 1.move acpi_add_table() to the patch that adds acpi_build_hest()
> ---
>  default-configs/arm-softmmu.mak |  1 +
>  hw/acpi/Kconfig |  4 ++
>  hw/acpi/Makefile.objs   |  1 +
>  hw/acpi/aml-build.c |  2 +
>  hw/acpi/ghes.c  | 89 
> +
>  hw/arm/virt-acpi-build.c|  5 +++
>  include/hw/acpi/aml-build.h |  1 +
>  include/hw/acpi/ghes.h  | 28 +
>  8 files changed, 131 insertions(+)
>  create mode 100644 hw/acpi/ghes.c
>  create mode 100644 include/hw/acpi/ghes.h
> 
> diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak
> index 36a0e89..8fc09a4 100644
> --- a/default-configs/arm-softmmu.mak
> +++ b/default-configs/arm-softmmu.mak
> @@ -42,3 +42,4 @@ CONFIG_FSL_IMX7=y
>  CONFIG_FSL_IMX6UL=y
>  CONFIG_SEMIHOSTING=y
>  CONFIG_ALLWINNER_H3=y
> +CONFIG_ACPI_APEI=y
> diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
> index 54209c6..1932f66 100644
> --- a/hw/acpi/Kconfig
> +++ b/hw/acpi/Kconfig
> @@ -28,6 +28,10 @@ config ACPI_HMAT
>  bool
>  depends on ACPI
>  
> +config ACPI_APEI
> +bool
> +depends on ACPI
> +
>  config ACPI_PCI
>  bool
>  depends on ACPI && PCI
> diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs
> index 777da07..28c5ddb 100644
> --- a/hw/acpi/Makefile.objs
> +++ b/hw/acpi/Makefile.objs
> @@ -8,6 +8,7 @@ common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
>  common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o
>  common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o
>  common-obj-$(CONFIG_ACPI_HMAT) += hmat.o
> +common-obj-$(CONFIG_ACPI_APEI) += ghes.o
>  common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o
>  common-obj-$(call lnot,$(CONFIG_PC)) += acpi-x86-stub.o
>  
> diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
> index 2c3702b..3681ec6 100644
> --- a/hw/acpi/aml-build.c
> +++ b/hw/acpi/aml-build.c
> @@ -1578,6 +1578,7 @@ void acpi_build_tables_init(AcpiBuildTables *tables)
>  tables->table_data = g_array_new(false, true /* clear */, 1);
>  tables->tcpalog = g_array_new(false, true /* clear */, 1);
>  tables->vmgenid = g_array_new(false, true /* clear */, 1);
> +tables->hardware_errors = g_array_new(false, true /* clear */, 1);
>  tables->linker = bios_linker_loader_init();
>  }
>  
> @@ -1588,6 +1589,7 @@ void acpi_build_tables_cleanup(AcpiBuildTables *tables, 
> bool mfre)
>  g_array_free(tables->table_data, true);
>  g_array_free(tables->tcpalog, mfre);
>  g_array_free(tables->vmgenid, mfre);
> +g_array_free(tables->hardware_errors, mfre);
>  }
>  
>  /*
> diff --git a/hw/acpi/ghes.c b/hw/acpi/ghes.c
> new file mode 100644
> index 000..e1b3f8f
> --- /dev/null
> +++ b/hw/acpi/ghes.c
> @@ -0,0 +1,89 @@
> +/*
> + * Support for generating APEI tables and recording CPER for Guests
> + *
> + * Copyright (c) 2020 HUAWEI TECHNOLOGIES CO., LTD.
> + *
> + * Author: Dongjiu Geng 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> +
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +
> + * You should have received a copy of the GNU General Public License along
> + * with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/units.h"
> +#include "hw/acpi/ghes.h"
> +#include "hw/acpi/aml-build.h"
> +
> +#define ACPI_GHES_ERRORS_FW_CFG_FILE"etc/hardware_errors"
> +#define ACPI_GHES_DATA_ADDR_FW_CFG_FILE "etc/hardware_errors_addr"
> +
> +/* The max size in bytes for one error block */
> +#define ACPI_GHES_MAX_RAW_DATA_LENGTH   (1 * KiB)
> +
> +/* Now only support ARMv8 SEA notification type error source */
> +#define ACPI_GHES_ERROR_SOURCE_COUNT1
> +
> +/*
> + * Build table for the hardware error fw_cfg blob.
> + * Initialize "etc/hardware_errors" and "etc/hardware_errors_addr" fw_cfg 
> blobs.
> + * See 

[Bug 1805256] Re: qemu-img hangs on rcu_call_ready_event logic in Aarch64 when converting images

2020-05-06 Thread Rafael David Tinoco
FYIO, from now on all the "merge" work will be done in the merge
requests being linked to this BUG (at the top). @paelzer will be
verifying those.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on rcu_call_ready_event logic in Aarch64 when
  converting images

Status in kunpeng920:
  Triaged
Status in kunpeng920 ubuntu-18.04 series:
  Triaged
Status in kunpeng920 ubuntu-18.04-hwe series:
  Triaged
Status in kunpeng920 ubuntu-19.10 series:
  Triaged
Status in kunpeng920 ubuntu-20.04 series:
  Triaged
Status in kunpeng920 upstream-kernel series:
  Fix Committed
Status in QEMU:
  Fix Released
Status in qemu package in Ubuntu:
  In Progress
Status in qemu source package in Bionic:
  In Progress
Status in qemu source package in Disco:
  In Progress
Status in qemu source package in Eoan:
  In Progress
Status in qemu source package in Focal:
  In Progress

Bug description:
  [Impact]

  * QEMU locking primitives might face a race condition in QEMU Async
  I/O bottom halves scheduling. This leads to a dead lock making either
  QEMU or one of its tools to hang indefinitely.

  [Test Case]

  * qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs in Aarch64.

  [Regression Potential]

  * This is a change to a core part of QEMU: The AIO scheduling. It
  works like a "kernel" scheduler, whereas kernel schedules OS tasks,
  the QEMU AIO code is responsible to schedule QEMU coroutines or event
  listeners callbacks.

  * There was a long discussion upstream about primitives and Aarch64.
  After quite sometime Paolo released this patch and it solves the
  issue. Tested platforms were: amd64 and aarch64 based on his commit
  log.

  * Christian suggests that this fix stay little longer in -proposed to
  make sure it won't cause any regressions.

  * dannf suggests we also check for performance regressions; e.g. how
  long it takes to convert a cloud image on high-core systems.

  [Other Info]

   * Original Description below:

  Command:

  qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs.

  

  Workaround:

  qemu-img convert -m 1 -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Run "qemu-img convert" with "a single coroutine" to avoid this issue.

  

  (gdb) thread 1
  ...
  (gdb) bt
  #0 0xbf1ad81c in __GI_ppoll
  #1 0xaabcf73c in ppoll
  #2 qemu_poll_ns
  #3 0xaabd0764 in os_host_main_loop_wait
  #4 main_loop_wait
  ...

  (gdb) thread 2
  ...
  (gdb) bt
  #0 syscall ()
  #1 0xaabd41cc in qemu_futex_wait
  #2 qemu_event_wait (ev=ev@entry=0xaac86ce8 )
  #3 0xaabed05c in call_rcu_thread
  #4 0xaabd34c8 in qemu_thread_start
  #5 0xbf25c880 in start_thread
  #6 0xbf1b6b9c in thread_start ()

  (gdb) thread 3
  ...
  (gdb) bt
  #0 0xbf11aa20 in __GI___sigtimedwait
  #1 0xbf2671b4 in __sigwait
  #2 0xaabd1ddc in sigwait_compat
  #3 0xaabd34c8 in qemu_thread_start
  #4 0xbf25c880 in start_thread
  #5 0xbf1b6b9c in thread_start

  

  (gdb) run
  Starting program: /usr/bin/qemu-img convert -f qcow2 -O qcow2
  ./disk01.ext4.qcow2 ./output.qcow2

  [New Thread 0xbec5ad90 (LWP 72839)]
  [New Thread 0xbe459d90 (LWP 72840)]
  [New Thread 0xbdb57d90 (LWP 72841)]
  [New Thread 0xacac9d90 (LWP 72859)]
  [New Thread 0xa7ffed90 (LWP 72860)]
  [New Thread 0xa77fdd90 (LWP 72861)]
  [New Thread 0xa6ffcd90 (LWP 72862)]
  [New Thread 0xa67fbd90 (LWP 72863)]
  [New Thread 0xa5ffad90 (LWP 72864)]

  [Thread 0xa5ffad90 (LWP 72864) exited]
  [Thread 0xa6ffcd90 (LWP 72862) exited]
  [Thread 0xa77fdd90 (LWP 72861) exited]
  [Thread 0xbdb57d90 (LWP 72841) exited]
  [Thread 0xa67fbd90 (LWP 72863) exited]
  [Thread 0xacac9d90 (LWP 72859) exited]
  [Thread 0xa7ffed90 (LWP 72860) exited]

  
  """

  All the tasks left are blocked in a system call, so no task left to call
  qemu_futex_wake() to unblock thread #2 (in futex()), which would unblock
  thread #1 (doing poll() in a pipe with thread #2).

  Those 7 threads exit before disk conversion is complete (sometimes in
  the beginning, sometimes at the end).

  

  On the HiSilicon D06 system - a 96 core NUMA arm64 box - qemu-img
  frequently hangs (~50% of the time) with this command:

  qemu-img convert -f qcow2 -O qcow2 /tmp/cloudimg /tmp/cloudimg2

  Where "cloudimg" is a standard qcow2 Ubuntu cloud image. This
  qcow2->qcow2 conversion happens to be something uvtool does every time
  it fetches images.

  Once hung, attaching gdb gives the following backtrace:

  (gdb) bt
  #0  0xae4f8154 in __GI_ppoll (fds=0xe8a67dc0, 
nfds=187650274213760,
  timeout=, timeout@entry=0x0, sigmask=0xc123b950)
  at 

Re: [PULL 0/1] Register API Queue

2020-05-06 Thread Peter Maydell
On Wed, 6 May 2020 at 00:18, Alistair Francis  wrote:
>
> The following changes since commit f19d118bed77bb95681b07f4e76dbb700c16918d:
>
>   Merge remote-tracking branch 'remotes/ericb/tags/pull-nbd-2020-05-04' into 
> staging (2020-05-05 15:47:44 +0100)
>
> are available in the Git repository at:
>
>   g...@github.com:alistair23/qemu.git tags/pull-reg-to-apply-20200505
>
> for you to fetch changes up to f08085f49fb66a5cdc86653bd896d0e728bcee50:
>
>   hw/core/register: Add register_init_block8 helper. (2020-05-05 13:37:51 
> -0700)
>
> 
> Pull request for RegisterAPI
>
> This is a single patch to add support to the RegisterAPI for different
> data sizes.


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/5.1
for any user-visible changes.

-- PMM



Re: [Bug 1805256] Re: qemu-img hangs on rcu_call_ready_event logic in Aarch64 when converting images

2020-05-06 Thread dann frazier
On Wed, May 6, 2020 at 1:20 PM Philippe Mathieu-Daudé
<1805...@bugs.launchpad.net> wrote:
>
> Isn't this fixed by commit 5710a3e09f9?

See comment #43. The discussions hence are about testing/integration
of that fix.

  -dann

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on rcu_call_ready_event logic in Aarch64 when
  converting images

Status in kunpeng920:
  Triaged
Status in kunpeng920 ubuntu-18.04 series:
  Triaged
Status in kunpeng920 ubuntu-18.04-hwe series:
  Triaged
Status in kunpeng920 ubuntu-19.10 series:
  Triaged
Status in kunpeng920 ubuntu-20.04 series:
  Triaged
Status in kunpeng920 upstream-kernel series:
  Fix Committed
Status in QEMU:
  Fix Released
Status in qemu package in Ubuntu:
  In Progress
Status in qemu source package in Bionic:
  In Progress
Status in qemu source package in Disco:
  In Progress
Status in qemu source package in Eoan:
  In Progress
Status in qemu source package in Focal:
  In Progress

Bug description:
  [Impact]

  * QEMU locking primitives might face a race condition in QEMU Async
  I/O bottom halves scheduling. This leads to a dead lock making either
  QEMU or one of its tools to hang indefinitely.

  [Test Case]

  * qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs in Aarch64.

  [Regression Potential]

  * This is a change to a core part of QEMU: The AIO scheduling. It
  works like a "kernel" scheduler, whereas kernel schedules OS tasks,
  the QEMU AIO code is responsible to schedule QEMU coroutines or event
  listeners callbacks.

  * There was a long discussion upstream about primitives and Aarch64.
  After quite sometime Paolo released this patch and it solves the
  issue. Tested platforms were: amd64 and aarch64 based on his commit
  log.

  * Christian suggests that this fix stay little longer in -proposed to
  make sure it won't cause any regressions.

  * dannf suggests we also check for performance regressions; e.g. how
  long it takes to convert a cloud image on high-core systems.

  [Other Info]

   * Original Description below:

  Command:

  qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs.

  

  Workaround:

  qemu-img convert -m 1 -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Run "qemu-img convert" with "a single coroutine" to avoid this issue.

  

  (gdb) thread 1
  ...
  (gdb) bt
  #0 0xbf1ad81c in __GI_ppoll
  #1 0xaabcf73c in ppoll
  #2 qemu_poll_ns
  #3 0xaabd0764 in os_host_main_loop_wait
  #4 main_loop_wait
  ...

  (gdb) thread 2
  ...
  (gdb) bt
  #0 syscall ()
  #1 0xaabd41cc in qemu_futex_wait
  #2 qemu_event_wait (ev=ev@entry=0xaac86ce8 )
  #3 0xaabed05c in call_rcu_thread
  #4 0xaabd34c8 in qemu_thread_start
  #5 0xbf25c880 in start_thread
  #6 0xbf1b6b9c in thread_start ()

  (gdb) thread 3
  ...
  (gdb) bt
  #0 0xbf11aa20 in __GI___sigtimedwait
  #1 0xbf2671b4 in __sigwait
  #2 0xaabd1ddc in sigwait_compat
  #3 0xaabd34c8 in qemu_thread_start
  #4 0xbf25c880 in start_thread
  #5 0xbf1b6b9c in thread_start

  

  (gdb) run
  Starting program: /usr/bin/qemu-img convert -f qcow2 -O qcow2
  ./disk01.ext4.qcow2 ./output.qcow2

  [New Thread 0xbec5ad90 (LWP 72839)]
  [New Thread 0xbe459d90 (LWP 72840)]
  [New Thread 0xbdb57d90 (LWP 72841)]
  [New Thread 0xacac9d90 (LWP 72859)]
  [New Thread 0xa7ffed90 (LWP 72860)]
  [New Thread 0xa77fdd90 (LWP 72861)]
  [New Thread 0xa6ffcd90 (LWP 72862)]
  [New Thread 0xa67fbd90 (LWP 72863)]
  [New Thread 0xa5ffad90 (LWP 72864)]

  [Thread 0xa5ffad90 (LWP 72864) exited]
  [Thread 0xa6ffcd90 (LWP 72862) exited]
  [Thread 0xa77fdd90 (LWP 72861) exited]
  [Thread 0xbdb57d90 (LWP 72841) exited]
  [Thread 0xa67fbd90 (LWP 72863) exited]
  [Thread 0xacac9d90 (LWP 72859) exited]
  [Thread 0xa7ffed90 (LWP 72860) exited]

  
  """

  All the tasks left are blocked in a system call, so no task left to call
  qemu_futex_wake() to unblock thread #2 (in futex()), which would unblock
  thread #1 (doing poll() in a pipe with thread #2).

  Those 7 threads exit before disk conversion is complete (sometimes in
  the beginning, sometimes at the end).

  

  On the HiSilicon D06 system - a 96 core NUMA arm64 box - qemu-img
  frequently hangs (~50% of the time) with this command:

  qemu-img convert -f qcow2 -O qcow2 /tmp/cloudimg /tmp/cloudimg2

  Where "cloudimg" is a standard qcow2 Ubuntu cloud image. This
  qcow2->qcow2 conversion happens to be something uvtool does every time
  it fetches images.

  Once hung, attaching gdb gives the following backtrace:

  (gdb) bt
  #0  0xae4f8154 in __GI_ppoll (fds=0xe8a67dc0, 
nfds=187650274213760,
  

Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices

2020-05-06 Thread Alex Williamson
On Thu, 7 May 2020 01:18:19 +0530
Kirti Wankhede  wrote:

> On 5/6/2020 11:41 AM, Yan Zhao wrote:
> > On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:  
> >> On Tue, 5 May 2020 04:48:37 +0530
> >> Kirti Wankhede  wrote:
> >>  
> >>> On 3/26/2020 1:26 AM, Alex Williamson wrote:  
>  On Wed, 25 Mar 2020 02:39:02 +0530
>  Kirti Wankhede  wrote:
>   
> > These functions save and restore PCI device specific data - config
> > space of PCI device.
> > Tested save and restore with MSI and MSIX type.
> >
> > Signed-off-by: Kirti Wankhede 
> > Reviewed-by: Neo Jia 
> > ---
> >hw/vfio/pci.c | 163 
> > ++
> >include/hw/vfio/vfio-common.h |   2 +
> >2 files changed, 165 insertions(+)
> >
> > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > index 6c77c12e44b9..8deb11e87ef7 100644
> > --- a/hw/vfio/pci.c
> > +++ b/hw/vfio/pci.c
> > @@ -41,6 +41,7 @@
> >#include "trace.h"
> >#include "qapi/error.h"
> >#include "migration/blocker.h"
> > +#include "migration/qemu-file.h"
> >
> >#define TYPE_VFIO_PCI "vfio-pci"
> >#define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, 
> > TYPE_VFIO_PCI)
> > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice 
> > *vdev)
> >}
> >}
> >
> > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)
> > +{
> > +PCIDevice *pdev = >pdev;
> > +VFIOBAR *bar = >bars[nr];
> > +uint64_t addr;
> > +uint32_t addr_lo, addr_hi = 0;
> > +
> > +/* Skip unimplemented BARs and the upper half of 64bit BARS. */
> > +if (!bar->size) {
> > +return 0;
> > +}
> > +
> > +addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 
> > 4, 4);
> > +
> > +addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
> > +   PCI_BASE_ADDRESS_MEM_MASK);  
> 
>  Nit, &= or combine with previous set.
>   
> > +if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > +addr_hi = pci_default_read_config(pdev,
> > + PCI_BASE_ADDRESS_0 + (nr + 1) 
> > * 4, 4);
> > +}
> > +
> > +addr = ((uint64_t)addr_hi << 32) | addr_lo;  
> 
>  Could we use a union?
>   
> > +
> > +if (!QEMU_IS_ALIGNED(addr, bar->size)) {
> > +return -EINVAL;
> > +}  
> 
>  What specifically are we validating here?  This should be true no
>  matter what we wrote to the BAR or else BAR emulation is broken.  The
>  bits that could make this unaligned are not implemented in the BAR.
>   
> > +
> > +return 0;
> > +}
> > +
> > +static int vfio_bars_validate(VFIOPCIDevice *vdev)
> > +{
> > +int i, ret;
> > +
> > +for (i = 0; i < PCI_ROM_SLOT; i++) {
> > +ret = vfio_bar_validate(vdev, i);
> > +if (ret) {
> > +error_report("vfio: BAR address %d validation failed", i);
> > +return ret;
> > +}
> > +}
> > +return 0;
> > +}
> > +
> >static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
> >{
> >VFIOBAR *bar = >bars[nr];
> > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice 
> > *vbasedev)
> >return OBJECT(vdev);
> >}
> >
> > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)
> > +{
> > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, 
> > vbasedev);
> > +PCIDevice *pdev = >pdev;
> > +uint16_t pci_cmd;
> > +int i;
> > +
> > +for (i = 0; i < PCI_ROM_SLOT; i++) {
> > +uint32_t bar;
> > +
> > +bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 
> > 4, 4);
> > +qemu_put_be32(f, bar);
> > +}
> > +
> > +qemu_put_be32(f, vdev->interrupt);
> > +if (vdev->interrupt == VFIO_INT_MSI) {
> > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
> > +bool msi_64bit;
> > +
> > +msi_flags = pci_default_read_config(pdev, pdev->msi_cap + 
> > PCI_MSI_FLAGS,
> > +2);
> > +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
> > +
> > +msi_addr_lo = pci_default_read_config(pdev,
> > + pdev->msi_cap + 
> > PCI_MSI_ADDRESS_LO, 4);
> > +qemu_put_be32(f, msi_addr_lo);
> > +
> > +if (msi_64bit) {
> > +msi_addr_hi = pci_default_read_config(pdev,
> > + 

Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices

2020-05-06 Thread Kirti Wankhede




On 5/6/2020 11:41 AM, Yan Zhao wrote:

On Tue, May 05, 2020 at 12:37:11PM +0800, Alex Williamson wrote:

On Tue, 5 May 2020 04:48:37 +0530
Kirti Wankhede  wrote:


On 3/26/2020 1:26 AM, Alex Williamson wrote:

On Wed, 25 Mar 2020 02:39:02 +0530
Kirti Wankhede  wrote:
   

These functions save and restore PCI device specific data - config
space of PCI device.
Tested save and restore with MSI and MSIX type.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
---
   hw/vfio/pci.c | 163 
++
   include/hw/vfio/vfio-common.h |   2 +
   2 files changed, 165 insertions(+)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 6c77c12e44b9..8deb11e87ef7 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -41,6 +41,7 @@
   #include "trace.h"
   #include "qapi/error.h"
   #include "migration/blocker.h"
+#include "migration/qemu-file.h"
   
   #define TYPE_VFIO_PCI "vfio-pci"

   #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI)
@@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev)
   }
   }
   
+static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr)

+{
+PCIDevice *pdev = >pdev;
+VFIOBAR *bar = >bars[nr];
+uint64_t addr;
+uint32_t addr_lo, addr_hi = 0;
+
+/* Skip unimplemented BARs and the upper half of 64bit BARS. */
+if (!bar->size) {
+return 0;
+}
+
+addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4);
+
+addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK :
+   PCI_BASE_ADDRESS_MEM_MASK);


Nit, &= or combine with previous set.
   

+if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) {
+addr_hi = pci_default_read_config(pdev,
+ PCI_BASE_ADDRESS_0 + (nr + 1) * 4, 4);
+}
+
+addr = ((uint64_t)addr_hi << 32) | addr_lo;


Could we use a union?
   

+
+if (!QEMU_IS_ALIGNED(addr, bar->size)) {
+return -EINVAL;
+}


What specifically are we validating here?  This should be true no
matter what we wrote to the BAR or else BAR emulation is broken.  The
bits that could make this unaligned are not implemented in the BAR.
   

+
+return 0;
+}
+
+static int vfio_bars_validate(VFIOPCIDevice *vdev)
+{
+int i, ret;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+ret = vfio_bar_validate(vdev, i);
+if (ret) {
+error_report("vfio: BAR address %d validation failed", i);
+return ret;
+}
+}
+return 0;
+}
+
   static void vfio_bar_register(VFIOPCIDevice *vdev, int nr)
   {
   VFIOBAR *bar = >bars[nr];
@@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice 
*vbasedev)
   return OBJECT(vdev);
   }
   
+static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f)

+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = >pdev;
+uint16_t pci_cmd;
+int i;
+
+for (i = 0; i < PCI_ROM_SLOT; i++) {
+uint32_t bar;
+
+bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4);
+qemu_put_be32(f, bar);
+}
+
+qemu_put_be32(f, vdev->interrupt);
+if (vdev->interrupt == VFIO_INT_MSI) {
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+bool msi_64bit;
+
+msi_flags = pci_default_read_config(pdev, pdev->msi_cap + 
PCI_MSI_FLAGS,
+2);
+msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT);
+
+msi_addr_lo = pci_default_read_config(pdev,
+ pdev->msi_cap + PCI_MSI_ADDRESS_LO, 
4);
+qemu_put_be32(f, msi_addr_lo);
+
+if (msi_64bit) {
+msi_addr_hi = pci_default_read_config(pdev,
+ pdev->msi_cap + 
PCI_MSI_ADDRESS_HI,
+ 4);
+}
+qemu_put_be32(f, msi_addr_hi);
+
+msi_data = pci_default_read_config(pdev,
+pdev->msi_cap + (msi_64bit ? PCI_MSI_DATA_64 : 
PCI_MSI_DATA_32),
+2);
+qemu_put_be32(f, msi_data);


Isn't the data field only a u16?
   


Yes, fixing it.


+} else if (vdev->interrupt == VFIO_INT_MSIX) {
+uint16_t offset;
+
+/* save enable bit and maskall bit */
+offset = pci_default_read_config(pdev,
+   pdev->msix_cap + PCI_MSIX_FLAGS + 1, 2);
+qemu_put_be16(f, offset);
+msix_save(pdev, f);
+}
+pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2);
+qemu_put_be16(f, pci_cmd);
+}
+
+static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
+{
+VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
+PCIDevice *pdev = >pdev;
+uint32_t interrupt_type;
+uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data;
+uint16_t pci_cmd;
+bool msi_64bit;
+

Re: [PATCH Kernel v18 4/7] vfio iommu: Implementation of ioctl for dirty pages tracking.

2020-05-06 Thread Kirti Wankhede




On 5/6/2020 1:45 PM, Yan Zhao wrote:

On Mon, May 04, 2020 at 11:58:56PM +0800, Kirti Wankhede wrote:





  /*
   * Helper Functions for host iova-pfn list
   */
@@ -567,6 +654,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
vfio_unpin_page_external(dma, iova, do_accounting);
goto pin_unwind;
}
+
+   if (iommu->dirty_page_tracking) {
+   unsigned long pgshift =
+__ffs(vfio_pgsize_bitmap(iommu));
+

hi Kirti,
may I know if there's any vfio_pin_pages() happening during NVidia's vGPU 
migration?
the code would enter into deadlock as I reported in last version.



Hm, you are right and same is the case in vfio_iommu_type1_dma_rw_chunk().

Instead of calling vfio_pgsize_bitmap() from lots of places, I'm 
thinking of saving pgsize_bitmap in struct vfio_iommu, which should be 
populated whenever domain_list is updated. Alex, will that be fine?


Thanks,
Kirti



Thanks
Yan





Re: [PATCH v16 QEMU 08/16] vfio: Register SaveVMHandlers for VFIO device

2020-05-06 Thread Kirti Wankhede




On 5/6/2020 10:23 PM, Dr. David Alan Gilbert wrote:

* Cornelia Huck (coh...@redhat.com) wrote:

On Wed, 6 May 2020 02:38:46 -0400
Yan Zhao  wrote:


On Tue, May 05, 2020 at 12:37:26PM +0800, Alex Williamson wrote:

It's been a long time, but that doesn't seem like what I was asking.
The sysfs version checking is used to select a target that is likely to
succeed, but the migration stream is still generated by a user and the
vendor driver is still ultimately responsible for validating that
stream.  I would hope that a vendor migration stream therefore starts
with information similar to that found in the sysfs interface, allowing
the receiving vendor driver to validate the source device and vendor
software version, such that we can fail an incoming migration that the
vendor driver deems incompatible.  Ideally the vendor driver might also
include consistency and sequence checking throughout the stream to
prevent a malicious user from exploiting the internal operation of the
vendor driver.  Thanks,


Some kind of somewhat standardized marker for driver/version seems like
a good idea. Further checking is also a good idea, but I think the
details of that need to be left to the individual drivers.


Standardised markers like that would be useful; although the rules of
how to compare them might be a bit vendor specific; but still - it would
be good for us to be able to dump something out when it all goes wrong.



Such checking should already be there in the vendor driver. The vendor driver 
might also support cross-version migration. I think checking in QEMU again 
would be redundant. Let the vendor driver handle version checks.


Thanks,
Kirti

   

maybe we can add a rw field migration_version in
struct vfio_device_migration_info besides sysfs interface ?

when reading it in src, it gets the same string as that from sysfs;
when writing it in target, it returns success or not to check
compatibility and fails the migration early in setup phase.


Getting both populated from the same source seems like a good idea.

Not sure if a string is the best value to put into a migration stream;
maybe the sysfs interface can derive a human-readable string from a
more compact value to be put into the migration region (and ultimately
the stream)? Might be overengineering, just thinking out aloud here.


A string might be OK if you specify a little about it.

Dave

--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK





Re: [PATCH v18 QEMU 02/18] vfio: Add function to unmap VFIO region

2020-05-06 Thread Kirti Wankhede




On 5/5/2020 11:46 AM, Philippe Mathieu-Daudé wrote:

Hi Kirti,

On 5/5/20 12:44 AM, Kirti Wankhede wrote:

This function will be used for migration region.
Migration region is mmaped when migration starts and will be unmapped 
when

migration is complete.

Signed-off-by: Kirti Wankhede 
Reviewed-by: Neo Jia 
Reviewed-by: Cornelia Huck 
---
  hw/vfio/common.c  | 20 
  hw/vfio/trace-events  |  1 +
  include/hw/vfio/vfio-common.h |  1 +
  3 files changed, 22 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 0b3593b3c0c4..4a2f0d6a2233 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -983,6 +983,26 @@ int vfio_region_mmap(VFIORegion *region)
  return 0;
  }
+void vfio_region_unmap(VFIORegion *region)
+{
+    int i;
+
+    if (!region->mem) {
+    return;
+    }
+
+    for (i = 0; i < region->nr_mmaps; i++) {


I'd refactor this  block <...
+
trace_vfio_region_unmap(memory_region_name(>mmaps[i].mem),

+    region->mmaps[i].offset,
+    region->mmaps[i].offset +
+    region->mmaps[i].size - 1);
+    memory_region_del_subregion(region->mem, >mmaps[i].mem);
+    munmap(region->mmaps[i].mmap, region->mmaps[i].size);
+    object_unparent(OBJECT(>mmaps[i].mem));
+    region->mmaps[i].mmap = NULL;


...> into a helper and reuse it in vfio_region_mmap(). Well, actually 
I'd factor it out from vfio_region_mmap() then reuse it here. Anyway 
this is v18 so can be done later on top.




Nevermind, this is not the last version, I'll do suggested change.


Reviewed-by: Philippe Mathieu-Daudé 



Thanks,
Kirti


+    }
+}
+
  void vfio_region_exit(VFIORegion *region)
  {
  int i;
diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
index b1ef55a33ffd..8cdc27946cb8 100644
--- a/hw/vfio/trace-events
+++ b/hw/vfio/trace-events
@@ -111,6 +111,7 @@ vfio_region_mmap(const char *name, unsigned long 
offset, unsigned long end) "Reg

  vfio_region_exit(const char *name, int index) "Device %s, region %d"
  vfio_region_finalize(const char *name, int index) "Device %s, region 
%d"
  vfio_region_mmaps_set_enabled(const char *name, bool enabled) 
"Region %s mmaps enabled: %d"
+vfio_region_unmap(const char *name, unsigned long offset, unsigned 
long end) "Region %s unmap [0x%lx - 0x%lx]"
  vfio_region_sparse_mmap_header(const char *name, int index, int 
nr_areas) "Device %s region %d: %d sparse mmap entries"
  vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned 
long end) "sparse entry %d [0x%lx - 0x%lx]"
  vfio_get_dev_region(const char *name, int index, uint32_t type, 
uint32_t subtype) "%s index %d, %08x/%0x8"
diff --git a/include/hw/vfio/vfio-common.h 
b/include/hw/vfio/vfio-common.h

index fd564209ac71..8d7a0fbb1046 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -171,6 +171,7 @@ int vfio_region_setup(Object *obj, VFIODevice 
*vbasedev, VFIORegion *region,

    int index, const char *name);
  int vfio_region_mmap(VFIORegion *region);
  void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled);
+void vfio_region_unmap(VFIORegion *region);
  void vfio_region_exit(VFIORegion *region);
  void vfio_region_finalize(VFIORegion *region);
  void vfio_reset_handler(void *opaque);







[Bug 1805256] Re: qemu-img hangs on rcu_call_ready_event logic in Aarch64 when converting images

2020-05-06 Thread Philippe Mathieu-Daudé
Isn't this fixed by commit 5710a3e09f9?

commit 5710a3e09f9b85801e5ce70797a4a511e5fc9e2c
Author: Paolo Bonzini 
Date:   Tue Apr 7 10:07:46 2020 -0400

async: use explicit memory barriers

When using C11 atomics, non-seqcst reads and writes do not participate
in the total order of seqcst operations.  In util/async.c and 
util/aio-posix.c,
in particular, the pattern that we use

  write ctx->notify_me write bh->scheduled
  read bh->scheduled   read ctx->notify_me
  if !bh->scheduled, sleep if ctx->notify_me, notify

needs to use seqcst operations for both the write and the read.  In
general this is something that we do not want, because there can be
many sources that are polled in addition to bottom halves.  The
alternative is to place a seqcst memory barrier between the write
and the read.  This also comes with a disadvantage, in that the
memory barrier is implicit on strongly-ordered architectures and
it wastes a few dozen clock cycles.

Fortunately, ctx->notify_me is never written concurrently by two
threads, so we can assert that and relax the writes to ctx->notify_me.
The resulting solution works and performs well on both aarch64 and x86.

Note that the atomic_set/atomic_read combination is not an atomic
read-modify-write, and therefore it is even weaker than C11 ATOMIC_RELAXED;
on x86, ATOMIC_RELAXED compiles to a locked operation.

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on rcu_call_ready_event logic in Aarch64 when
  converting images

Status in kunpeng920:
  Triaged
Status in kunpeng920 ubuntu-18.04 series:
  Triaged
Status in kunpeng920 ubuntu-18.04-hwe series:
  Triaged
Status in kunpeng920 ubuntu-19.10 series:
  Triaged
Status in kunpeng920 ubuntu-20.04 series:
  Triaged
Status in kunpeng920 upstream-kernel series:
  Fix Committed
Status in QEMU:
  Fix Released
Status in qemu package in Ubuntu:
  In Progress
Status in qemu source package in Bionic:
  In Progress
Status in qemu source package in Disco:
  In Progress
Status in qemu source package in Eoan:
  In Progress
Status in qemu source package in Focal:
  In Progress

Bug description:
  [Impact]

  * QEMU locking primitives might face a race condition in QEMU Async
  I/O bottom halves scheduling. This leads to a deadlock making either
  QEMU or one of its tools hang indefinitely.

  [Test Case]

  * qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs in Aarch64.

  [Regression Potential]

  * This is a change to a core part of QEMU: The AIO scheduling. It
  works like a "kernel" scheduler, whereas kernel schedules OS tasks,
  the QEMU AIO code is responsible to schedule QEMU coroutines or event
  listeners callbacks.

  * There was a long discussion upstream about primitives and Aarch64.
  After quite sometime Paolo released this patch and it solves the
  issue. Tested platforms were: amd64 and aarch64 based on his commit
  log.

  * Christian suggests that this fix stay a little longer in -proposed to
  make sure it won't cause any regressions.

  * dannf suggests we also check for performance regressions; e.g. how
  long it takes to convert a cloud image on high-core systems.

  [Other Info]

   * Original Description below:

  Command:

  qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs.

  

  Workaround:

  qemu-img convert -m 1 -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Run "qemu-img convert" with "a single coroutine" to avoid this issue.

  

  (gdb) thread 1
  ...
  (gdb) bt
  #0 0xbf1ad81c in __GI_ppoll
  #1 0xaabcf73c in ppoll
  #2 qemu_poll_ns
  #3 0xaabd0764 in os_host_main_loop_wait
  #4 main_loop_wait
  ...

  (gdb) thread 2
  ...
  (gdb) bt
  #0 syscall ()
  #1 0xaabd41cc in qemu_futex_wait
  #2 qemu_event_wait (ev=ev@entry=0xaac86ce8 )
  #3 0xaabed05c in call_rcu_thread
  #4 0xaabd34c8 in qemu_thread_start
  #5 0xbf25c880 in start_thread
  #6 0xbf1b6b9c in thread_start ()

  (gdb) thread 3
  ...
  (gdb) bt
  #0 0xbf11aa20 in __GI___sigtimedwait
  #1 0xbf2671b4 in __sigwait
  #2 0xaabd1ddc in sigwait_compat
  #3 0xaabd34c8 in qemu_thread_start
  #4 0xbf25c880 in start_thread
  #5 0xbf1b6b9c in thread_start

  

  (gdb) run
  Starting program: /usr/bin/qemu-img convert -f qcow2 -O qcow2
  ./disk01.ext4.qcow2 ./output.qcow2

  [New Thread 0xbec5ad90 (LWP 72839)]
  [New Thread 0xbe459d90 (LWP 72840)]
  [New Thread 0xbdb57d90 (LWP 72841)]
  [New Thread 0xacac9d90 (LWP 72859)]
  [New Thread 0xa7ffed90 (LWP 72860)]
  [New Thread 0xa77fdd90 (LWP 

[Bug 1805256] Re: qemu-img hangs on rcu_call_ready_event logic in Aarch64 when converting images

2020-05-06 Thread Launchpad Bug Tracker
** Merge proposal linked:
   
https://code.launchpad.net/~rafaeldtinoco/ubuntu/+source/qemu/+git/qemu/+merge/383530

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1805256

Title:
  qemu-img hangs on rcu_call_ready_event logic in Aarch64 when
  converting images

Status in kunpeng920:
  Triaged
Status in kunpeng920 ubuntu-18.04 series:
  Triaged
Status in kunpeng920 ubuntu-18.04-hwe series:
  Triaged
Status in kunpeng920 ubuntu-19.10 series:
  Triaged
Status in kunpeng920 ubuntu-20.04 series:
  Triaged
Status in kunpeng920 upstream-kernel series:
  Fix Committed
Status in QEMU:
  Fix Released
Status in qemu package in Ubuntu:
  In Progress
Status in qemu source package in Bionic:
  In Progress
Status in qemu source package in Disco:
  In Progress
Status in qemu source package in Eoan:
  In Progress
Status in qemu source package in Focal:
  In Progress

Bug description:
  [Impact]

  * QEMU locking primitives might face a race condition in QEMU Async
  I/O bottom halves scheduling. This leads to a deadlock making either
  QEMU or one of its tools hang indefinitely.

  [Test Case]

  * qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs in Aarch64.

  [Regression Potential]

  * This is a change to a core part of QEMU: The AIO scheduling. It
  works like a "kernel" scheduler, whereas kernel schedules OS tasks,
  the QEMU AIO code is responsible to schedule QEMU coroutines or event
  listeners callbacks.

  * There was a long discussion upstream about primitives and Aarch64.
  After quite sometime Paolo released this patch and it solves the
  issue. Tested platforms were: amd64 and aarch64 based on his commit
  log.

  * Christian suggests that this fix stay a little longer in -proposed to
  make sure it won't cause any regressions.

  * dannf suggests we also check for performance regressions; e.g. how
  long it takes to convert a cloud image on high-core systems.

  [Other Info]

   * Original Description below:

  Command:

  qemu-img convert -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Hangs indefinitely approximately 30% of the runs.

  

  Workaround:

  qemu-img convert -m 1 -f qcow2 -O qcow2 ./disk01.qcow2 ./output.qcow2

  Run "qemu-img convert" with "a single coroutine" to avoid this issue.

  

  (gdb) thread 1
  ...
  (gdb) bt
  #0 0xbf1ad81c in __GI_ppoll
  #1 0xaabcf73c in ppoll
  #2 qemu_poll_ns
  #3 0xaabd0764 in os_host_main_loop_wait
  #4 main_loop_wait
  ...

  (gdb) thread 2
  ...
  (gdb) bt
  #0 syscall ()
  #1 0xaabd41cc in qemu_futex_wait
  #2 qemu_event_wait (ev=ev@entry=0xaac86ce8 )
  #3 0xaabed05c in call_rcu_thread
  #4 0xaabd34c8 in qemu_thread_start
  #5 0xbf25c880 in start_thread
  #6 0xbf1b6b9c in thread_start ()

  (gdb) thread 3
  ...
  (gdb) bt
  #0 0xbf11aa20 in __GI___sigtimedwait
  #1 0xbf2671b4 in __sigwait
  #2 0xaabd1ddc in sigwait_compat
  #3 0xaabd34c8 in qemu_thread_start
  #4 0xbf25c880 in start_thread
  #5 0xbf1b6b9c in thread_start

  

  (gdb) run
  Starting program: /usr/bin/qemu-img convert -f qcow2 -O qcow2
  ./disk01.ext4.qcow2 ./output.qcow2

  [New Thread 0xbec5ad90 (LWP 72839)]
  [New Thread 0xbe459d90 (LWP 72840)]
  [New Thread 0xbdb57d90 (LWP 72841)]
  [New Thread 0xacac9d90 (LWP 72859)]
  [New Thread 0xa7ffed90 (LWP 72860)]
  [New Thread 0xa77fdd90 (LWP 72861)]
  [New Thread 0xa6ffcd90 (LWP 72862)]
  [New Thread 0xa67fbd90 (LWP 72863)]
  [New Thread 0xa5ffad90 (LWP 72864)]

  [Thread 0xa5ffad90 (LWP 72864) exited]
  [Thread 0xa6ffcd90 (LWP 72862) exited]
  [Thread 0xa77fdd90 (LWP 72861) exited]
  [Thread 0xbdb57d90 (LWP 72841) exited]
  [Thread 0xa67fbd90 (LWP 72863) exited]
  [Thread 0xacac9d90 (LWP 72859) exited]
  [Thread 0xa7ffed90 (LWP 72860) exited]

  
  """

  All the tasks left are blocked in a system call, so no task left to call
  qemu_futex_wake() to unblock thread #2 (in futex()), which would unblock
  thread #1 (doing poll() in a pipe with thread #2).

  Those 7 threads exit before disk conversion is complete (sometimes in
  the beginning, sometimes at the end).

  

  On the HiSilicon D06 system - a 96 core NUMA arm64 box - qemu-img
  frequently hangs (~50% of the time) with this command:

  qemu-img convert -f qcow2 -O qcow2 /tmp/cloudimg /tmp/cloudimg2

  Where "cloudimg" is a standard qcow2 Ubuntu cloud image. This
  qcow2->qcow2 conversion happens to be something uvtool does every time
  it fetches images.

  Once hung, attaching gdb gives the following backtrace:

  (gdb) bt
  #0  0xae4f8154 in __GI_ppoll (fds=0xe8a67dc0, 
nfds=187650274213760,
  timeout=, timeout@entry=0x0, sigmask=0xc123b950)
  at ../sysdeps/unix/sysv/linux/ppoll.c:39
  #1  

Re: [PATCH] virtiofsd: Use clone() and not unshare(), support non-root

2020-05-06 Thread Dr. David Alan Gilbert
* Colin Walters (walt...@verbum.org) wrote:
> I'd like to make use of virtiofs as part of our tooling in
> https://github.com/coreos/coreos-assembler
> Most of the code runs as non-root today; qemu also runs as non-root.
> We use 9p right now.
> 
> virtiofsd's builtin sandboxing effectively assumes it runs as
> root.
> 
> First, change the code to use `clone()` and not `unshare()+fork()`.
> 
> Next, automatically use `CLONE_NEWUSER` if we're running as non root.

Is it ever useful for root to run the code in a new user namespace?

Dave

> This is similar logic to that in https://github.com/containers/bubblewrap
> (Which...BTW, it could make sense for virtiofs to depend on bubblewrap
>  and re-exec itself rather than re-implementing the containerization
>  itself)
> 
> Signed-off-by: Colin Walters 
> ---
>  tools/virtiofsd/passthrough_ll.c | 26 +-
>  1 file changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/tools/virtiofsd/passthrough_ll.c 
> b/tools/virtiofsd/passthrough_ll.c
> index 4c35c95b25..468617f6d6 100644
> --- a/tools/virtiofsd/passthrough_ll.c
> +++ b/tools/virtiofsd/passthrough_ll.c
> @@ -2530,6 +2530,21 @@ static void print_capabilities(void)
>  printf("}\n");
>  }
>  
> +/* Copied from bubblewrap */
> +static int
> +raw_clone(unsigned long flags, void *child_stack)
> +{
> +#if defined(__s390__) || defined(__CRIS__)
> +  /*
> +   * On s390 and cris the order of the first and second arguments
> +   * of the raw clone() system call is reversed.
> +   */
> +return (int) syscall(__NR_clone, child_stack, flags);
> +#else
> +return (int) syscall(__NR_clone, flags, child_stack);
> +#endif
> +}
> +
>  /*
>   * Move to a new mount, net, and pid namespaces to isolate this process.
>   */
> @@ -2547,14 +2562,15 @@ static void setup_namespaces(struct lo_data *lo, 
> struct fuse_session *se)
>   * an empty network namespace to prevent TCP/IP and other network
>   * activity in case this process is compromised.
>   */
> -if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
> -fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
> -exit(1);
> +int clone_flags = SIGCHLD | CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET;
> +/* If we're non root, we need a new user namespace */
> +if (getuid() != 0) {
> +clone_flags |= CLONE_NEWUSER;
>  }
>  
> -child = fork();
> +child = raw_clone(clone_flags, NULL);
>  if (child < 0) {
> -fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
> +fuse_log(FUSE_LOG_ERR, "clone() failed: %m\n");
>  exit(1);
>  }
>  if (child > 0) {
> -- 
> 2.24.1
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH v1 13/17] hmp: Handle virtio-mem when printing memory device info

2020-05-06 Thread Pankaj Gupta
> Print the memory device info just like for other memory devices.
>
> Cc: "Dr. David Alan Gilbert" 
> Cc: "Michael S. Tsirkin" 
> Signed-off-by: David Hildenbrand 
> ---
>  monitor/hmp-cmds.c | 16 
>  1 file changed, 16 insertions(+)
>
> diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
> index 7f6e982dc8..4b3638a2a6 100644
> --- a/monitor/hmp-cmds.c
> +++ b/monitor/hmp-cmds.c
> @@ -1805,6 +1805,7 @@ void hmp_info_memory_devices(Monitor *mon, const QDict 
> *qdict)
>  MemoryDeviceInfoList *info_list = qmp_query_memory_devices();
>  MemoryDeviceInfoList *info;
>  VirtioPMEMDeviceInfo *vpi;
> +VirtioMEMDeviceInfo *vmi;
>  MemoryDeviceInfo *value;
>  PCDIMMDeviceInfo *di;
>
> @@ -1839,6 +1840,21 @@ void hmp_info_memory_devices(Monitor *mon, const QDict 
> *qdict)
>  monitor_printf(mon, "  size: %" PRIu64 "\n", vpi->size);
>  monitor_printf(mon, "  memdev: %s\n", vpi->memdev);
>  break;
> +case MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM:
> +vmi = value->u.virtio_mem.data;
> +monitor_printf(mon, "Memory device [%s]: \"%s\"\n",
> +   MemoryDeviceInfoKind_str(value->type),
> +   vmi->id ? vmi->id : "");
> +monitor_printf(mon, "  memaddr: 0x%" PRIx64 "\n", 
> vmi->memaddr);
> +monitor_printf(mon, "  node: %" PRId64 "\n", vmi->node);
> +monitor_printf(mon, "  requested-size: %" PRIu64 "\n",
> +   vmi->requested_size);
> +monitor_printf(mon, "  size: %" PRIu64 "\n", vmi->size);
> +monitor_printf(mon, "  max-size: %" PRIu64 "\n", 
> vmi->max_size);
> +monitor_printf(mon, "  block-size: %" PRIu64 "\n",
> +   vmi->block_size);
> +monitor_printf(mon, "  memdev: %s\n", vmi->memdev);
> +break;
>  default:
>  g_assert_not_reached();
>  }
> --
> 2.25.3

Reviewed-by: Pankaj Gupta 



Re: [PATCH v1 11/17] virtio-pci: Proxy for virtio-mem

2020-05-06 Thread Pankaj Gupta
> Let's add a proxy for virtio-mem, make it a memory device, and
> pass-through the properties.
>
> Cc: "Michael S. Tsirkin" 
> Cc: Marcel Apfelbaum 
> Cc: "Dr. David Alan Gilbert" 
> Cc: Igor Mammedov 
> Signed-off-by: David Hildenbrand 
> ---
>  hw/virtio/Makefile.objs|   1 +
>  hw/virtio/virtio-mem-pci.c | 131 +
>  hw/virtio/virtio-mem-pci.h |  33 ++
>  include/hw/pci/pci.h   |   1 +
>  4 files changed, 166 insertions(+)
>  create mode 100644 hw/virtio/virtio-mem-pci.c
>  create mode 100644 hw/virtio/virtio-mem-pci.h
>
> diff --git a/hw/virtio/Makefile.objs b/hw/virtio/Makefile.objs
> index 7df70e977e..b9661f9c01 100644
> --- a/hw/virtio/Makefile.objs
> +++ b/hw/virtio/Makefile.objs
> @@ -19,6 +19,7 @@ obj-$(call 
> land,$(CONFIG_VHOST_USER_FS),$(CONFIG_VIRTIO_PCI)) += vhost-user-fs-p
>  obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
>  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock.o
>  obj-$(CONFIG_VIRTIO_MEM) += virtio-mem.o
> +common-obj-$(call land,$(CONFIG_VIRTIO_MEM),$(CONFIG_VIRTIO_PCI)) += 
> virtio-mem-pci.o
>
>  ifeq ($(CONFIG_VIRTIO_PCI),y)
>  obj-$(CONFIG_VHOST_VSOCK) += vhost-vsock-pci.o
> diff --git a/hw/virtio/virtio-mem-pci.c b/hw/virtio/virtio-mem-pci.c
> new file mode 100644
> index 00..a47d21c81f
> --- /dev/null
> +++ b/hw/virtio/virtio-mem-pci.c
> @@ -0,0 +1,131 @@
> +/*
> + * Virtio MEM PCI device
> + *
> + * Copyright (C) 2020 Red Hat, Inc.
> + *
> + * Authors:
> + *  David Hildenbrand 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +
Don't think we need the blank line here.

> +#include "virtio-mem-pci.h"
> +#include "hw/mem/memory-device.h"
> +#include "qapi/error.h"
> +
> +static void virtio_mem_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
> +{
> +VirtIOMEMPCI *mem_pci = VIRTIO_MEM_PCI(vpci_dev);
> +DeviceState *vdev = DEVICE(_pci->vdev);
> +
> +qdev_set_parent_bus(vdev, BUS(_dev->bus));
> +object_property_set_bool(OBJECT(vdev), true, "realized", errp);
> +}
> +
> +static void virtio_mem_pci_set_addr(MemoryDeviceState *md, uint64_t addr,
> +Error **errp)
> +{
> +object_property_set_uint(OBJECT(md), addr, VIRTIO_MEM_ADDR_PROP, errp);
> +}
> +
> +static uint64_t virtio_mem_pci_get_addr(const MemoryDeviceState *md)
> +{
> +return object_property_get_uint(OBJECT(md), VIRTIO_MEM_ADDR_PROP,
> +_abort);
> +}
> +
> +static MemoryRegion *virtio_mem_pci_get_memory_region(MemoryDeviceState *md,
> +  Error **errp)
> +{
> +VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
> +VirtIOMEM *vmem = VIRTIO_MEM(_mem->vdev);
> +VirtIOMEMClass *vmc = VIRTIO_MEM_GET_CLASS(vmem);
> +
> +return vmc->get_memory_region(vmem, errp);
> +}
> +
> +static uint64_t virtio_mem_pci_get_plugged_size(const MemoryDeviceState *md,
> +Error **errp)
> +{
> +return object_property_get_uint(OBJECT(md), VIRTIO_MEM_SIZE_PROP,
> +errp);
> +}
> +
> +static void virtio_mem_pci_fill_device_info(const MemoryDeviceState *md,
> +MemoryDeviceInfo *info)
> +{
> +VirtioMEMDeviceInfo *vi = g_new0(VirtioMEMDeviceInfo, 1);
> +VirtIOMEMPCI *pci_mem = VIRTIO_MEM_PCI(md);
> +VirtIOMEM *vmem = VIRTIO_MEM(_mem->vdev);
> +VirtIOMEMClass *vpc = VIRTIO_MEM_GET_CLASS(vmem);
> +DeviceState *dev = DEVICE(md);
> +
> +if (dev->id) {
> +vi->has_id = true;
> +vi->id = g_strdup(dev->id);
> +}
> +
> +/* let the real device handle everything else */
> +vpc->fill_device_info(vmem, vi);
> +
> +info->u.virtio_mem.data = vi;
> +info->type = MEMORY_DEVICE_INFO_KIND_VIRTIO_MEM;
> +}
> +
> +static void virtio_mem_pci_class_init(ObjectClass *klass, void *data)
> +{
> +DeviceClass *dc = DEVICE_CLASS(klass);
> +VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
> +PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
> +MemoryDeviceClass *mdc = MEMORY_DEVICE_CLASS(klass);
> +
> +k->realize = virtio_mem_pci_realize;
> +set_bit(DEVICE_CATEGORY_MISC, dc->categories);
> +pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
> +pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_MEM;
> +pcidev_k->revision = VIRTIO_PCI_ABI_VERSION;
> +pcidev_k->class_id = PCI_CLASS_OTHERS;
> +
> +mdc->get_addr = virtio_mem_pci_get_addr;
> +mdc->set_addr = virtio_mem_pci_set_addr;
> +mdc->get_plugged_size = virtio_mem_pci_get_plugged_size;
> +mdc->get_memory_region = virtio_mem_pci_get_memory_region;
> +mdc->fill_device_info = virtio_mem_pci_fill_device_info;
> +}
> +
> +static void virtio_mem_pci_instance_init(Object *obj)
> +{
> +VirtIOMEMPCI *dev = VIRTIO_MEM_PCI(obj);
> +
> +virtio_instance_init_common(obj, 

[PATCH v3] aspeed: Add support for the sonorapass-bmc board

2020-05-06 Thread Patrick Williams
Sonora Pass is a 2 socket x86 motherboard designed by Facebook
and supported by OpenBMC.  Strapping configuration was obtained
from hardware and i2c configuration is based on dts found at:

https://github.com/facebook/openbmc-linux/blob/1633c87b8ba7c162095787c988979b748ba65dc8/arch/arm/boot/dts/aspeed-bmc-facebook-sonorapass.dts

Booted a test image of http://github.com/facebook/openbmc to login
prompt.

Signed-off-by: Patrick Williams 
Reviewed-by: Amithash Prasad 
Reviewed-by: Cédric Le Goater 
---
 hw/arm/aspeed.c | 77 +
 1 file changed, 77 insertions(+)

diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
index 6f4d7075c4..74c46681e8 100644
--- a/hw/arm/aspeed.c
+++ b/hw/arm/aspeed.c
@@ -74,6 +74,21 @@ struct AspeedBoardState {
 SCU_AST2500_HW_STRAP_ACPI_ENABLE |  \
 SCU_HW_STRAP_SPI_MODE(SCU_HW_STRAP_SPI_MASTER))
 
+/* Sonorapass hardware value: 0xF100D216 */
+#define SONORAPASS_BMC_HW_STRAP1 (  \
+SCU_AST2500_HW_STRAP_SPI_AUTOFETCH_ENABLE | \
+SCU_AST2500_HW_STRAP_GPIO_STRAP_ENABLE |\
+SCU_AST2500_HW_STRAP_UART_DEBUG |   \
+SCU_AST2500_HW_STRAP_RESERVED28 |   \
+SCU_AST2500_HW_STRAP_DDR4_ENABLE |  \
+SCU_HW_STRAP_VGA_CLASS_CODE |   \
+SCU_HW_STRAP_LPC_RESET_PIN |\
+SCU_HW_STRAP_SPI_MODE(SCU_HW_STRAP_SPI_MASTER) |\
+SCU_AST2500_HW_STRAP_SET_AXI_AHB_RATIO(AXI_AHB_RATIO_2_1) | \
+SCU_HW_STRAP_VGA_BIOS_ROM | \
+SCU_HW_STRAP_VGA_SIZE_SET(VGA_16M_DRAM) |   \
+SCU_AST2500_HW_STRAP_RESERVED1)
+
 /* Swift hardware value: 0xF11AD206 */
 #define SWIFT_BMC_HW_STRAP1 (   \
 AST2500_HW_STRAP1_DEFAULTS |\
@@ -434,6 +449,49 @@ static void swift_bmc_i2c_init(AspeedBoardState *bmc)
 i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 12), "tmp105", 
0x4a);
 }
 
+static void sonorapass_bmc_i2c_init(AspeedBoardState *bmc)
+{
+AspeedSoCState *soc = >soc;
+
+/* bus 2 : */
+i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 2), "tmp105", 0x48);
+i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 2), "tmp105", 0x49);
+/* bus 2 : pca9546 @ 0x73 */
+
+/* bus 3 : pca9548 @ 0x70 */
+
+/* bus 4 : */
+uint8_t *eeprom4_54 = g_malloc0(8 * 1024);
+smbus_eeprom_init_one(aspeed_i2c_get_bus(DEVICE(>i2c), 4), 0x54,
+  eeprom4_54);
+/* PCA9539 @ 0x76, but PCA9552 is compatible */
+i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 4), "pca9552", 
0x76);
+/* PCA9539 @ 0x77, but PCA9552 is compatible */
+i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 4), "pca9552", 
0x77);
+
+/* bus 6 : */
+i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 6), "tmp105", 0x48);
+i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 6), "tmp105", 0x49);
+/* bus 6 : pca9546 @ 0x73 */
+
+/* bus 8 : */
+uint8_t *eeprom8_56 = g_malloc0(8 * 1024);
+smbus_eeprom_init_one(aspeed_i2c_get_bus(DEVICE(>i2c), 8), 0x56,
+  eeprom8_56);
+i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 8), "pca9552", 
0x60);
+i2c_create_slave(aspeed_i2c_get_bus(DEVICE(>i2c), 8), "pca9552", 
0x61);
+/* bus 8 : adc128d818 @ 0x1d */
+/* bus 8 : adc128d818 @ 0x1f */
+
+/* bus 13 : pca9548 @ 0x71
+ *  - channel 3:
+ *  - tmm421 @ 0x4c
+ *  - tmp421 @ 0x4e
+ *  - tmp421 @ 0x4f
+ */
+
+}
+
 static void witherspoon_bmc_i2c_init(AspeedBoardState *bmc)
 {
 AspeedSoCState *soc = >soc;
@@ -552,6 +610,21 @@ static void aspeed_machine_romulus_class_init(ObjectClass 
*oc, void *data)
 mc->default_ram_size   = 512 * MiB;
 };
 
+static void aspeed_machine_sonorapass_class_init(ObjectClass *oc, void *data)
+{
+MachineClass *mc = MACHINE_CLASS(oc);
+AspeedMachineClass *amc = ASPEED_MACHINE_CLASS(oc);
+
+mc->desc   = "OCP SonoraPass BMC (ARM1176)";
+amc->soc_name  = "ast2500-a1";
+amc->hw_strap1 = SONORAPASS_BMC_HW_STRAP1;
+amc->fmc_model = "mx66l1g45g";
+amc->spi_model = "mx66l1g45g";
+amc->num_cs= 2;
+amc->i2c_init  = sonorapass_bmc_i2c_init;
+mc->default_ram_size   = 512 * MiB;
+};
+
 static void aspeed_machine_swift_class_init(ObjectClass *oc, void *data)
 {
 MachineClass *mc = MACHINE_CLASS(oc);
@@ -631,6 +704,10 @@ static const TypeInfo aspeed_machine_types[] = {
 .name  = MACHINE_TYPE_NAME("swift-bmc"),
 .parent= TYPE_ASPEED_MACHINE,
 .class_init= aspeed_machine_swift_class_init,
+}, {
+.name  = 

[PULL 08/10] tcg: Improve vector tail clearing

2020-05-06 Thread Richard Henderson
Better handling of non-power-of-2 tails as seen with Arm 8-byte
vector operations.

Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 tcg/tcg-op-gvec.c | 82 ---
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 5a6cc19812..43cac1a0bf 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -326,11 +326,34 @@ void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, 
uint32_t bofs,
in units of LNSZ.  This limits the expansion of inline code.  */
 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 {
-if (oprsz % lnsz == 0) {
-uint32_t lnct = oprsz / lnsz;
-return lnct >= 1 && lnct <= MAX_UNROLL;
+uint32_t q, r;
+
+if (oprsz < lnsz) {
+return false;
 }
-return false;
+
+q = oprsz / lnsz;
+r = oprsz % lnsz;
+tcg_debug_assert((r & 7) == 0);
+
+if (lnsz < 16) {
+/* For sizes below 16, accept no remainder. */
+if (r != 0) {
+return false;
+}
+} else {
+/*
+ * Recall that ARM SVE allows vector sizes that are not a
+ * power of 2, but always a multiple of 16.  The intent is
+ * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+ * In addition, expand_clr needs to handle a multiple of 8.
+ * Thus we can handle the tail with one more operation per
+ * diminishing power of 2.
+ */
+q += ctpop32(r);
+}
+
+return q <= MAX_UNROLL;
 }
 
 static void expand_clr(uint32_t dofs, uint32_t maxsz);
@@ -402,22 +425,31 @@ static void gen_dup_i64(unsigned vece, TCGv_i64 out, 
TCGv_i64 in)
 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
   uint32_t size, bool prefer_i64)
 {
-if (TCG_TARGET_HAS_v256 && check_size_impl(size, 32)) {
-/*
- * Recall that ARM SVE allows vector sizes that are not a
- * power of 2, but always a multiple of 16.  The intent is
- * that e.g. size == 80 would be expanded with 2x32 + 1x16.
- * It is hard to imagine a case in which v256 is supported
- * but v128 is not, but check anyway.
- */
-if (tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece)
-&& (size % 32 == 0
-|| tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) {
-return TCG_TYPE_V256;
-}
+/*
+ * Recall that ARM SVE allows vector sizes that are not a
+ * power of 2, but always a multiple of 16.  The intent is
+ * that e.g. size == 80 would be expanded with 2x32 + 1x16.
+ * It is hard to imagine a case in which v256 is supported
+ * but v128 is not, but check anyway.
+ * In addition, expand_clr needs to handle a multiple of 8.
+ */
+if (TCG_TARGET_HAS_v256 &&
+check_size_impl(size, 32) &&
+tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
+(!(size & 16) ||
+ (TCG_TARGET_HAS_v128 &&
+  tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
+(!(size & 8) ||
+ (TCG_TARGET_HAS_v64 &&
+  tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece {
+return TCG_TYPE_V256;
 }
-if (TCG_TARGET_HAS_v128 && check_size_impl(size, 16)
-&& tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece)) {
+if (TCG_TARGET_HAS_v128 &&
+check_size_impl(size, 16) &&
+tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
+(!(size & 8) ||
+ (TCG_TARGET_HAS_v64 &&
+  tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece {
 return TCG_TYPE_V128;
 }
 if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
@@ -432,6 +464,18 @@ static void do_dup_store(TCGType type, uint32_t dofs, 
uint32_t oprsz,
 {
 uint32_t i = 0;
 
+tcg_debug_assert(oprsz >= 8);
+
+/*
+ * This may be expand_clr for the tail of an operation, e.g.
+ * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
+ * are misaligned wrt the maximum vector size, so do that first.
+ */
+if (dofs & 8) {
+tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
+i += 8;
+}
+
 switch (type) {
 case TCG_TYPE_V256:
 /*
-- 
2.20.1




[PULL 07/10] tcg: Add tcg_gen_gvec_dup_tl

2020-05-06 Thread Richard Henderson
For use when a target needs to pass a configure-specific
target_ulong value to duplicate.

Reviewed-by: LIU Zhiwei 
Reviewed-by: David Hildenbrand 
Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-op-gvec.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index fa8a0c8d03..d89f91f40e 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -320,6 +320,12 @@ void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, 
uint32_t s,
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
   uint32_t m, TCGv_i64);
 
+#if TARGET_LONG_BITS == 64
+# define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i64
+#else
+# define tcg_gen_gvec_dup_tl  tcg_gen_gvec_dup_i32
+#endif
+
 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
int64_t shift, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
-- 
2.20.1




[PULL 01/10] tcg: Add tcg_gen_gvec_dup_imm

2020-05-06 Thread Richard Henderson
Add a version of tcg_gen_dup_* that takes both immediate and
a vector element size operand.  This will replace the set of
tcg_gen_gvec_dup{8,16,32,64}i functions that encode the element
size within the function name.

Reviewed-by: LIU Zhiwei 
Reviewed-by: David Hildenbrand 
Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-op-gvec.h | 2 ++
 tcg/tcg-op-gvec.c | 7 +++
 2 files changed, 9 insertions(+)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index 74534e2480..eb0d47a42b 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -313,6 +313,8 @@ void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 
 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
   uint32_t s, uint32_t m);
+void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t s,
+  uint32_t m, uint64_t imm);
 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t s,
   uint32_t m, TCGv_i32);
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 327d9588e0..593bb4542e 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -1569,6 +1569,13 @@ void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
 do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
 }
 
+void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
+  uint32_t maxsz, uint64_t x)
+{
+check_size_align(oprsz, maxsz, dofs);
+do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
+}
+
 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
   uint32_t oprsz, uint32_t maxsz)
 {
-- 
2.20.1




[PULL 10/10] tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32, 64}

2020-05-06 Thread Richard Henderson
For the benefit of compatibility of function pointer types,
we have standardized on int32_t and int64_t as the integral
argument to tcg expanders.

We converted most of them in 474b2e8f0f7, but missed the rotates.

Reviewed-by: Alex Bennée 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-op.h |  8 
 tcg/tcg-op.c | 16 
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/tcg/tcg-op.h b/include/tcg/tcg-op.h
index 230db6e022..e3399d6a5e 100644
--- a/include/tcg/tcg-op.h
+++ b/include/tcg/tcg-op.h
@@ -297,9 +297,9 @@ void tcg_gen_ctzi_i32(TCGv_i32 ret, TCGv_i32 arg1, uint32_t 
arg2);
 void tcg_gen_clrsb_i32(TCGv_i32 ret, TCGv_i32 arg);
 void tcg_gen_ctpop_i32(TCGv_i32 a1, TCGv_i32 a2);
 void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
 void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2);
-void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2);
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2);
 void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
  unsigned int ofs, unsigned int len);
 void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
@@ -493,9 +493,9 @@ void tcg_gen_ctzi_i64(TCGv_i64 ret, TCGv_i64 arg1, uint64_t 
arg2);
 void tcg_gen_clrsb_i64(TCGv_i64 ret, TCGv_i64 arg);
 void tcg_gen_ctpop_i64(TCGv_i64 a1, TCGv_i64 a2);
 void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
 void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2);
-void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2);
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
 void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
  unsigned int ofs, unsigned int len);
 void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index e2e25ebf7d..e60b74fb82 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -540,9 +540,9 @@ void tcg_gen_rotl_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 
arg2)
 }
 }
 
-void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_rotli_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
-tcg_debug_assert(arg2 < 32);
+tcg_debug_assert(arg2 >= 0 && arg2 < 32);
 /* some cases can be optimized here */
 if (arg2 == 0) {
 tcg_gen_mov_i32(ret, arg1);
@@ -580,9 +580,9 @@ void tcg_gen_rotr_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 
arg2)
 }
 }
 
-void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, unsigned arg2)
+void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 {
-tcg_debug_assert(arg2 < 32);
+tcg_debug_assert(arg2 >= 0 && arg2 < 32);
 /* some cases can be optimized here */
 if (arg2 == 0) {
 tcg_gen_mov_i32(ret, arg1);
@@ -1962,9 +1962,9 @@ void tcg_gen_rotl_i64(TCGv_i64 ret, TCGv_i64 arg1, 
TCGv_i64 arg2)
 }
 }
 
-void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_rotli_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
-tcg_debug_assert(arg2 < 64);
+tcg_debug_assert(arg2 >= 0 && arg2 < 64);
 /* some cases can be optimized here */
 if (arg2 == 0) {
 tcg_gen_mov_i64(ret, arg1);
@@ -2001,9 +2001,9 @@ void tcg_gen_rotr_i64(TCGv_i64 ret, TCGv_i64 arg1, 
TCGv_i64 arg2)
 }
 }
 
-void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, unsigned arg2)
+void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 {
-tcg_debug_assert(arg2 < 64);
+tcg_debug_assert(arg2 >= 0 && arg2 < 64);
 /* some cases can be optimized here */
 if (arg2 == 0) {
 tcg_gen_mov_i64(ret, arg1);
-- 
2.20.1




[PULL 03/10] target/ppc: Use tcg_gen_gvec_dup_imm

2020-05-06 Thread Richard Henderson
We can now unify the implementation of the 3 VSPLTI instructions.

Acked-by: David Gibson 
Signed-off-by: Richard Henderson 
---
 target/ppc/translate/vmx-impl.inc.c | 32 -
 target/ppc/translate/vsx-impl.inc.c |  2 +-
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/target/ppc/translate/vmx-impl.inc.c 
b/target/ppc/translate/vmx-impl.inc.c
index 81d5a7a341..403ed3a01c 100644
--- a/target/ppc/translate/vmx-impl.inc.c
+++ b/target/ppc/translate/vmx-impl.inc.c
@@ -1035,21 +1035,25 @@ GEN_VXRFORM_DUAL(vcmpbfp, PPC_ALTIVEC, PPC_NONE, \
 GEN_VXRFORM_DUAL(vcmpgtfp, PPC_ALTIVEC, PPC_NONE, \
  vcmpgtud, PPC_NONE, PPC2_ALTIVEC_207)
 
-#define GEN_VXFORM_DUPI(name, tcg_op, opc2, opc3)   \
-static void glue(gen_, name)(DisasContext *ctx) \
-{   \
-int simm;   \
-if (unlikely(!ctx->altivec_enabled)) {  \
-gen_exception(ctx, POWERPC_EXCP_VPU);   \
-return; \
-}   \
-simm = SIMM5(ctx->opcode);  \
-tcg_op(avr_full_offset(rD(ctx->opcode)), 16, 16, simm); \
+static void gen_vsplti(DisasContext *ctx, int vece)
+{
+int simm;
+
+if (unlikely(!ctx->altivec_enabled)) {
+gen_exception(ctx, POWERPC_EXCP_VPU);
+return;
 }
 
-GEN_VXFORM_DUPI(vspltisb, tcg_gen_gvec_dup8i, 6, 12);
-GEN_VXFORM_DUPI(vspltish, tcg_gen_gvec_dup16i, 6, 13);
-GEN_VXFORM_DUPI(vspltisw, tcg_gen_gvec_dup32i, 6, 14);
+simm = SIMM5(ctx->opcode);
+tcg_gen_gvec_dup_imm(vece, avr_full_offset(rD(ctx->opcode)), 16, 16, simm);
+}
+
+#define GEN_VXFORM_VSPLTI(name, vece, opc2, opc3) \
+static void glue(gen_, name)(DisasContext *ctx) { gen_vsplti(ctx, vece); }
+
+GEN_VXFORM_VSPLTI(vspltisb, MO_8, 6, 12);
+GEN_VXFORM_VSPLTI(vspltish, MO_16, 6, 13);
+GEN_VXFORM_VSPLTI(vspltisw, MO_32, 6, 14);
 
 #define GEN_VXFORM_NOA(name, opc2, opc3)\
 static void glue(gen_, name)(DisasContext *ctx) \
@@ -1559,7 +1563,7 @@ GEN_VXFORM_DUAL(vsldoi, PPC_ALTIVEC, PPC_NONE,
 #undef GEN_VXRFORM_DUAL
 #undef GEN_VXRFORM1
 #undef GEN_VXRFORM
-#undef GEN_VXFORM_DUPI
+#undef GEN_VXFORM_VSPLTI
 #undef GEN_VXFORM_NOA
 #undef GEN_VXFORM_UIMM
 #undef GEN_VAFORM_PAIRED
diff --git a/target/ppc/translate/vsx-impl.inc.c 
b/target/ppc/translate/vsx-impl.inc.c
index 8287e272f5..b518de46db 100644
--- a/target/ppc/translate/vsx-impl.inc.c
+++ b/target/ppc/translate/vsx-impl.inc.c
@@ -1579,7 +1579,7 @@ static void gen_xxspltib(DisasContext *ctx)
 return;
 }
 }
-tcg_gen_gvec_dup8i(vsr_full_offset(rt), 16, 16, uim8);
+tcg_gen_gvec_dup_imm(MO_8, vsr_full_offset(rt), 16, 16, uim8);
 }
 
 static void gen_xxsldwi(DisasContext *ctx)
-- 
2.20.1




[PULL 05/10] tcg: Use tcg_gen_gvec_dup_imm in logical simplifications

2020-05-06 Thread Richard Henderson
Replace the outgoing interface.

Reviewed-by: LIU Zhiwei 
Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 tcg/tcg-op-gvec.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 593bb4542e..de16c027b3 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -2326,7 +2326,7 @@ void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 };
 
 if (aofs == bofs) {
-tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
 } else {
 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, );
 }
@@ -2343,7 +2343,7 @@ void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 };
 
 if (aofs == bofs) {
-tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, 0);
+tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
 } else {
 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, );
 }
@@ -2360,7 +2360,7 @@ void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 };
 
 if (aofs == bofs) {
-tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
 } else {
 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, );
 }
@@ -2411,7 +2411,7 @@ void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 };
 
 if (aofs == bofs) {
-tcg_gen_gvec_dup8i(dofs, oprsz, maxsz, -1);
+tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
 } else {
 tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, );
 }
-- 
2.20.1




[PULL 09/10] tcg: Add load_dest parameter to GVecGen2

2020-05-06 Thread Richard Henderson
We have this same parameter for GVecGen2i, GVecGen3,
and GVecGen3i.  This will make some SVE2 insns easier
to parameterize.

Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-op-gvec.h |  2 ++
 tcg/tcg-op-gvec.c | 45 ---
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index d89f91f40e..cea6497341 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -109,6 +109,8 @@ typedef struct {
 uint8_t vece;
 /* Prefer i64 to v64.  */
 bool prefer_i64;
+/* Load dest as a 2nd source operand.  */
+bool load_dest;
 } GVecGen2;
 
 typedef struct {
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index 43cac1a0bf..049a55e700 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -663,17 +663,22 @@ static void expand_clr(uint32_t dofs, uint32_t maxsz)
 
 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
- void (*fni)(TCGv_i32, TCGv_i32))
+ bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 {
 TCGv_i32 t0 = tcg_temp_new_i32();
+TCGv_i32 t1 = tcg_temp_new_i32();
 uint32_t i;
 
 for (i = 0; i < oprsz; i += 4) {
 tcg_gen_ld_i32(t0, cpu_env, aofs + i);
-fni(t0, t0);
-tcg_gen_st_i32(t0, cpu_env, dofs + i);
+if (load_dest) {
+tcg_gen_ld_i32(t1, cpu_env, dofs + i);
+}
+fni(t1, t0);
+tcg_gen_st_i32(t1, cpu_env, dofs + i);
 }
 tcg_temp_free_i32(t0);
+tcg_temp_free_i32(t1);
 }
 
 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
@@ -793,17 +798,22 @@ static void expand_4_i32(uint32_t dofs, uint32_t aofs, 
uint32_t bofs,
 
 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
- void (*fni)(TCGv_i64, TCGv_i64))
+ bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 {
 TCGv_i64 t0 = tcg_temp_new_i64();
+TCGv_i64 t1 = tcg_temp_new_i64();
 uint32_t i;
 
 for (i = 0; i < oprsz; i += 8) {
 tcg_gen_ld_i64(t0, cpu_env, aofs + i);
-fni(t0, t0);
-tcg_gen_st_i64(t0, cpu_env, dofs + i);
+if (load_dest) {
+tcg_gen_ld_i64(t1, cpu_env, dofs + i);
+}
+fni(t1, t0);
+tcg_gen_st_i64(t1, cpu_env, dofs + i);
 }
 tcg_temp_free_i64(t0);
+tcg_temp_free_i64(t1);
 }
 
 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
@@ -924,17 +934,23 @@ static void expand_4_i64(uint32_t dofs, uint32_t aofs, 
uint32_t bofs,
 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
  uint32_t oprsz, uint32_t tysz, TCGType type,
+ bool load_dest,
  void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 {
 TCGv_vec t0 = tcg_temp_new_vec(type);
+TCGv_vec t1 = tcg_temp_new_vec(type);
 uint32_t i;
 
 for (i = 0; i < oprsz; i += tysz) {
 tcg_gen_ld_vec(t0, cpu_env, aofs + i);
-fni(vece, t0, t0);
-tcg_gen_st_vec(t0, cpu_env, dofs + i);
+if (load_dest) {
+tcg_gen_ld_vec(t1, cpu_env, dofs + i);
+}
+fni(vece, t1, t0);
+tcg_gen_st_vec(t1, cpu_env, dofs + i);
 }
 tcg_temp_free_vec(t0);
+tcg_temp_free_vec(t1);
 }
 
 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
@@ -1088,7 +1104,8 @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
  * that e.g. size == 80 would be expanded with 2x32 + 1x16.
  */
 some = QEMU_ALIGN_DOWN(oprsz, 32);
-expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
+expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
+ g->load_dest, g->fniv);
 if (some == oprsz) {
 break;
 }
@@ -1098,17 +1115,19 @@ void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
 maxsz -= some;
 /* fallthru */
 case TCG_TYPE_V128:
-expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
+expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
+ g->load_dest, g->fniv);
 break;
 case TCG_TYPE_V64:
-expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
+expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
+ g->load_dest, g->fniv);
 break;
 
 case 0:
 if (g->fni8 && check_size_impl(oprsz, 8)) {
-expand_2_i64(dofs, aofs, oprsz, g->fni8);
+expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
 } else if (g->fni4 && 

[PULL 00/10] tcg patch queue

2020-05-06 Thread Richard Henderson
The following changes since commit a36d64f43325fa503075cc9408ddabb69b32f829:

  Merge remote-tracking branch 
'remotes/stsquad/tags/pull-testing-and-gdbstub-060520-1' into staging 
(2020-05-06 14:06:00 +0100)

are available in the Git repository at:

  https://github.com/rth7680/qemu.git tags/pull-tcg-20200506

for you to fetch changes up to 07dada0336a83002dfa8673a9220a88e13d9a45c:

  tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32,64} (2020-05-06 
09:25:10 -0700)


Add tcg_gen_gvec_dup_imm
Misc tcg patches


Richard Henderson (10):
  tcg: Add tcg_gen_gvec_dup_imm
  target/s390x: Use tcg_gen_gvec_dup_imm
  target/ppc: Use tcg_gen_gvec_dup_imm
  target/arm: Use tcg_gen_gvec_dup_imm
  tcg: Use tcg_gen_gvec_dup_imm in logical simplifications
  tcg: Remove tcg_gen_gvec_dup{8,16,32,64}i
  tcg: Add tcg_gen_gvec_dup_tl
  tcg: Improve vector tail clearing
  tcg: Add load_dest parameter to GVecGen2
  tcg: Fix integral argument type to tcg_gen_rot[rl]i_i{32,64}

 include/tcg/tcg-op-gvec.h   |  13 ++-
 include/tcg/tcg-op.h|   8 +-
 target/arm/translate-a64.c  |  10 +--
 target/arm/translate-sve.c  |  12 ++-
 target/arm/translate.c  |   9 +-
 target/ppc/translate/vmx-impl.inc.c |  32 +++
 target/ppc/translate/vsx-impl.inc.c |   2 +-
 target/s390x/translate_vx.inc.c |  41 ++---
 tcg/tcg-op-gvec.c   | 162 +++-
 tcg/tcg-op.c|  16 ++--
 10 files changed, 166 insertions(+), 139 deletions(-)



[PULL 04/10] target/arm: Use tcg_gen_gvec_dup_imm

2020-05-06 Thread Richard Henderson
In a few cases, we're able to remove some manual replication.

Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 target/arm/translate-a64.c | 10 +-
 target/arm/translate-sve.c | 12 +---
 target/arm/translate.c |  9 ++---
 3 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index a896f9c4b8..62e5729904 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -502,7 +502,7 @@ static void clear_vec_high(DisasContext *s, bool is_q, int 
rd)
 tcg_temp_free_i64(tcg_zero);
 }
 if (vsz > 16) {
-tcg_gen_gvec_dup8i(ofs + 16, vsz - 16, vsz - 16, 0);
+tcg_gen_gvec_dup_imm(MO_64, ofs + 16, vsz - 16, vsz - 16, 0);
 }
 }
 
@@ -7785,8 +7785,8 @@ static void disas_simd_mod_imm(DisasContext *s, uint32_t 
insn)
 
 if (!((cmode & 0x9) == 0x1 || (cmode & 0xd) == 0x9)) {
 /* MOVI or MVNI, with MVNI negation handled above.  */
-tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), is_q ? 16 : 8,
-vec_full_reg_size(s), imm);
+tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), is_q ? 16 : 8,
+ vec_full_reg_size(s), imm);
 } else {
 /* ORR or BIC, with BIC negation to AND handled above.  */
 if (is_neg) {
@@ -10214,8 +10214,8 @@ static void handle_vec_simd_shri(DisasContext *s, bool 
is_q, bool is_u,
 if (is_u) {
 if (shift == 8 << size) {
 /* Shift count the same size as element size produces zero.  */
-tcg_gen_gvec_dup8i(vec_full_reg_offset(s, rd),
-   is_q ? 16 : 8, vec_full_reg_size(s), 0);
+tcg_gen_gvec_dup_imm(size, vec_full_reg_offset(s, rd),
+ is_q ? 16 : 8, vec_full_reg_size(s), 0);
 } else {
 gen_gvec_fn2i(s, is_q, rd, rn, shift, tcg_gen_gvec_shri, size);
 }
diff --git a/target/arm/translate-sve.c b/target/arm/translate-sve.c
index b35bad245e..6c8bda4e4c 100644
--- a/target/arm/translate-sve.c
+++ b/target/arm/translate-sve.c
@@ -177,7 +177,7 @@ static bool do_mov_z(DisasContext *s, int rd, int rn)
 static void do_dupi_z(DisasContext *s, int rd, uint64_t word)
 {
 unsigned vsz = vec_full_reg_size(s);
-tcg_gen_gvec_dup64i(vec_full_reg_offset(s, rd), vsz, vsz, word);
+tcg_gen_gvec_dup_imm(MO_64, vec_full_reg_offset(s, rd), vsz, vsz, word);
 }
 
 /* Invoke a vector expander on two Pregs.  */
@@ -1453,7 +1453,7 @@ static bool do_predset(DisasContext *s, int esz, int rd, 
int pat, bool setflag)
 unsigned oprsz = size_for_gvec(setsz / 8);
 
 if (oprsz * 8 == setsz) {
-tcg_gen_gvec_dup64i(ofs, oprsz, maxsz, word);
+tcg_gen_gvec_dup_imm(MO_64, ofs, oprsz, maxsz, word);
 goto done;
 }
 }
@@ -2044,7 +2044,7 @@ static bool trans_DUP_x(DisasContext *s, arg_DUP_x *a)
 unsigned nofs = vec_reg_offset(s, a->rn, index, esz);
 tcg_gen_gvec_dup_mem(esz, dofs, nofs, vsz, vsz);
 } else {
-tcg_gen_gvec_dup64i(dofs, vsz, vsz, 0);
+tcg_gen_gvec_dup_imm(esz, dofs, vsz, vsz, 0);
 }
 }
 return true;
@@ -3260,9 +3260,7 @@ static bool trans_FDUP(DisasContext *s, arg_FDUP *a)
 
 /* Decode the VFP immediate.  */
 imm = vfp_expand_imm(a->esz, a->imm);
-imm = dup_const(a->esz, imm);
-
-tcg_gen_gvec_dup64i(dofs, vsz, vsz, imm);
+tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, imm);
 }
 return true;
 }
@@ -3276,7 +3274,7 @@ static bool trans_DUP_i(DisasContext *s, arg_DUP_i *a)
 unsigned vsz = vec_full_reg_size(s);
 int dofs = vec_full_reg_offset(s, a->rd);
 
-tcg_gen_gvec_dup64i(dofs, vsz, vsz, dup_const(a->esz, a->imm));
+tcg_gen_gvec_dup_imm(a->esz, dofs, vsz, vsz, a->imm);
 }
 return true;
 }
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 025747c0bd..74fac1d09c 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -5209,7 +5209,8 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t 
insn)
   MIN(shift, (8 << size) - 1),
   vec_size, vec_size);
 } else if (shift >= 8 << size) {
-tcg_gen_gvec_dup8i(rd_ofs, vec_size, vec_size, 0);
+tcg_gen_gvec_dup_imm(MO_8, rd_ofs, vec_size,
+ vec_size, 0);
 } else {
 tcg_gen_gvec_shri(size, rd_ofs, rm_ofs, shift,
   vec_size, vec_size);
@@ -5260,7 +5261,8 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t 
insn)
  * architecturally valid and results in zero.
  */
 

[PULL 06/10] tcg: Remove tcg_gen_gvec_dup{8,16,32,64}i

2020-05-06 Thread Richard Henderson
These interfaces are now unused.

Reviewed-by: LIU Zhiwei 
Reviewed-by: David Hildenbrand 
Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-op-gvec.h |  5 -
 tcg/tcg-op-gvec.c | 28 
 2 files changed, 33 deletions(-)

diff --git a/include/tcg/tcg-op-gvec.h b/include/tcg/tcg-op-gvec.h
index eb0d47a42b..fa8a0c8d03 100644
--- a/include/tcg/tcg-op-gvec.h
+++ b/include/tcg/tcg-op-gvec.h
@@ -320,11 +320,6 @@ void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, 
uint32_t s,
 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t s,
   uint32_t m, TCGv_i64);
 
-void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t s, uint32_t m, uint8_t x);
-void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t s, uint32_t m, uint16_t x);
-void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t s, uint32_t m, uint32_t x);
-void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t s, uint32_t m, uint64_t x);
-
 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
int64_t shift, uint32_t oprsz, uint32_t maxsz);
 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
index de16c027b3..5a6cc19812 100644
--- a/tcg/tcg-op-gvec.c
+++ b/tcg/tcg-op-gvec.c
@@ -1541,34 +1541,6 @@ void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, 
uint32_t aofs,
 }
 }
 
-void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
- uint32_t maxsz, uint64_t x)
-{
-check_size_align(oprsz, maxsz, dofs);
-do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
- uint32_t maxsz, uint32_t x)
-{
-check_size_align(oprsz, maxsz, dofs);
-do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
- uint32_t maxsz, uint16_t x)
-{
-check_size_align(oprsz, maxsz, dofs);
-do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
-void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
- uint32_t maxsz, uint8_t x)
-{
-check_size_align(oprsz, maxsz, dofs);
-do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
-}
-
 void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
   uint32_t maxsz, uint64_t x)
 {
-- 
2.20.1




[PULL 02/10] target/s390x: Use tcg_gen_gvec_dup_imm

2020-05-06 Thread Richard Henderson
The gen_gvec_dupi switch is unnecessary with the new function.
Replace it with a local gen_gvec_dup_imm that takes care of the
register to offset conversion and length arguments.

Drop zero_vec and use gen_gvec_dup_imm with 0.

Reviewed-by: David Hildenbrand 
Reviewed-by: Alex Bennée 
Signed-off-by: Richard Henderson 
---
 target/s390x/translate_vx.inc.c | 41 +++--
 1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/target/s390x/translate_vx.inc.c b/target/s390x/translate_vx.inc.c
index 24558cce80..12347f8a03 100644
--- a/target/s390x/translate_vx.inc.c
+++ b/target/s390x/translate_vx.inc.c
@@ -231,8 +231,8 @@ static void get_vec_element_ptr_i64(TCGv_ptr ptr, uint8_t 
reg, TCGv_i64 enr,
 #define gen_gvec_mov(v1, v2) \
 tcg_gen_gvec_mov(0, vec_full_reg_offset(v1), vec_full_reg_offset(v2), 16, \
  16)
-#define gen_gvec_dup64i(v1, c) \
-tcg_gen_gvec_dup64i(vec_full_reg_offset(v1), 16, 16, c)
+#define gen_gvec_dup_imm(es, v1, c) \
+tcg_gen_gvec_dup_imm(es, vec_full_reg_offset(v1), 16, 16, c);
 #define gen_gvec_fn_2(fn, es, v1, v2) \
 tcg_gen_gvec_##fn(es, vec_full_reg_offset(v1), vec_full_reg_offset(v2), \
   16, 16)
@@ -316,31 +316,6 @@ static void gen_gvec128_4_i64(gen_gvec128_4_i64_fn fn, 
uint8_t d, uint8_t a,
 tcg_temp_free_i64(cl);
 }
 
-static void gen_gvec_dupi(uint8_t es, uint8_t reg, uint64_t c)
-{
-switch (es) {
-case ES_8:
-tcg_gen_gvec_dup8i(vec_full_reg_offset(reg), 16, 16, c);
-break;
-case ES_16:
-tcg_gen_gvec_dup16i(vec_full_reg_offset(reg), 16, 16, c);
-break;
-case ES_32:
-tcg_gen_gvec_dup32i(vec_full_reg_offset(reg), 16, 16, c);
-break;
-case ES_64:
-gen_gvec_dup64i(reg, c);
-break;
-default:
-g_assert_not_reached();
-}
-}
-
-static void zero_vec(uint8_t reg)
-{
-tcg_gen_gvec_dup8i(vec_full_reg_offset(reg), 16, 16, 0);
-}
-
 static void gen_addi2_i64(TCGv_i64 dl, TCGv_i64 dh, TCGv_i64 al, TCGv_i64 ah,
   uint64_t b)
 {
@@ -396,8 +371,8 @@ static DisasJumpType op_vgbm(DisasContext *s, DisasOps *o)
  * Masks for both 64 bit elements of the vector are the same.
  * Trust tcg to produce a good constant loading.
  */
-gen_gvec_dup64i(get_field(s, v1),
-generate_byte_mask(i2 & 0xff));
+gen_gvec_dup_imm(ES_64, get_field(s, v1),
+ generate_byte_mask(i2 & 0xff));
 } else {
 TCGv_i64 t = tcg_temp_new_i64();
 
@@ -432,7 +407,7 @@ static DisasJumpType op_vgm(DisasContext *s, DisasOps *o)
 }
 }
 
-gen_gvec_dupi(es, get_field(s, v1), mask);
+gen_gvec_dup_imm(es, get_field(s, v1), mask);
 return DISAS_NEXT;
 }
 
@@ -585,7 +560,7 @@ static DisasJumpType op_vllez(DisasContext *s, DisasOps *o)
 
 t = tcg_temp_new_i64();
 tcg_gen_qemu_ld_i64(t, o->addr1, get_mem_index(s), MO_TE | es);
-zero_vec(get_field(s, v1));
+gen_gvec_dup_imm(es, get_field(s, v1), 0);
 write_vec_element_i64(t, get_field(s, v1), enr, es);
 tcg_temp_free_i64(t);
 return DISAS_NEXT;
@@ -892,7 +867,7 @@ static DisasJumpType op_vrepi(DisasContext *s, DisasOps *o)
 return DISAS_NORETURN;
 }
 
-gen_gvec_dupi(es, get_field(s, v1), data);
+gen_gvec_dup_imm(es, get_field(s, v1), data);
 return DISAS_NEXT;
 }
 
@@ -1372,7 +1347,7 @@ static DisasJumpType op_vcksm(DisasContext *s, DisasOps 
*o)
 read_vec_element_i32(tmp, get_field(s, v2), i, ES_32);
 tcg_gen_add2_i32(tmp, sum, sum, sum, tmp, tmp);
 }
-zero_vec(get_field(s, v1));
+gen_gvec_dup_imm(ES_32, get_field(s, v1), 0);
 write_vec_element_i32(sum, get_field(s, v1), 1, ES_32);
 
 tcg_temp_free_i32(tmp);
-- 
2.20.1




Re: [PATCH v2] aspeed: Add support for the sonorapass-bmc board

2020-05-06 Thread Patrick Williams
On Wed, May 06, 2020 at 06:06:34PM +, Amithash Prasad wrote:
> >> +    mc->desc   = "OpenPOWER SonoraPass BMC (ARM1176)";
> Open Compute Project?

Oops.  Yeah, this is not an OpenPOWER machine.  Will send a v3.

-- 
Patrick Williams


signature.asc
Description: PGP signature


Re: [PATCH v5 30/31] qcow2: Add subcluster support to qcow2_measure()

2020-05-06 Thread Eric Blake

On 5/5/20 12:38 PM, Alberto Garcia wrote:

Extended L2 entries are bigger than normal L2 entries so this has an
impact on the amount of metadata needed for a qcow2 file.

Signed-off-by: Alberto Garcia 
Reviewed-by: Max Reitz 
---
  block/qcow2.c | 19 ---
  1 file changed, 12 insertions(+), 7 deletions(-)


Should this be hoisted earlier in the series, before 28/31?

Should there be iotest coverage?

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v5 29/31] qcow2: Assert that expand_zero_clusters_in_l1() does not support subclusters

2020-05-06 Thread Eric Blake

On 5/5/20 12:38 PM, Alberto Garcia wrote:

This function is only used by qcow2_expand_zero_clusters() to
downgrade a qcow2 image to a previous version. It is however not
possible to downgrade an image with extended L2 entries because older
versions of qcow2 do not have this feature.

Signed-off-by: Alberto Garcia 
---
  block/qcow2-cluster.c  | 8 +++-
  tests/qemu-iotests/061 | 6 ++
  tests/qemu-iotests/061.out | 5 +
  3 files changed, 18 insertions(+), 1 deletion(-)


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v5 28/31] qcow2: Add the 'extended_l2' option and the QCOW2_INCOMPAT_EXTL2 bit

2020-05-06 Thread Eric Blake

On 5/5/20 12:38 PM, Alberto Garcia wrote:

Now that the implementation of subclusters is complete we can finally
add the necessary options to create and read images with this feature,
which we call "extended L2 entries".

Signed-off-by: Alberto Garcia 
Reviewed-by: Max Reitz 
---


What you have looks good, but I didn't notice anything affecting amend. 
The simplest option: amend can reject attempts to toggle the extended L2 
option (the zstd compression patches take that path).   More complicated 
is actually supporting it (in either direction, turning it on or off), 
which requires rewriting ALL L2 tables in the entry (including any in 
internal snapshots), which could be quite time-intensive, and where you 
must be careful to stage things so that failures during partial 
conversion merely leave leaked clusters rather than a header pointing to 
a half-converted state.  Either way, one of the iotests should probably 
add coverage on what happens when you attempt to amend the bit on or off.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2] aspeed: Add support for the sonorapass-bmc board

2020-05-06 Thread Amithash Prasad via
>> +    mc->desc   = "OpenPOWER SonoraPass BMC (ARM1176)";
Open Compute Project?



Re: [PATCH v5 23/31] qcow2: Add subcluster support to check_refcounts_l2()

2020-05-06 Thread Eric Blake

On 5/5/20 12:38 PM, Alberto Garcia wrote:

Setting the QCOW_OFLAG_ZERO bit of the L2 entry is forbidden if an
image has subclusters. Instead, the individual 'all zeroes' bits must
be used.


Should we s/is forbidden/is ignored/ based on your spec changes?

But the patch itself is right.
Reviewed-by: Eric Blake 



Signed-off-by: Alberto Garcia 
Reviewed-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
  block/qcow2-refcount.c | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index dfdcdd3c25..9bb161481e 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1686,8 +1686,13 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
  int ign = active ? QCOW2_OL_ACTIVE_L2 :
 QCOW2_OL_INACTIVE_L2;
  
-l2_entry = QCOW_OFLAG_ZERO;

-set_l2_entry(s, l2_table, i, l2_entry);
+if (has_subclusters(s)) {
+set_l2_entry(s, l2_table, i, 0);
+set_l2_bitmap(s, l2_table, i,
+  QCOW_L2_BITMAP_ALL_ZEROES);
+} else {
+set_l2_entry(s, l2_table, i, QCOW_OFLAG_ZERO);
+}
  ret = qcow2_pre_write_overlap_check(bs, ign,
  l2e_offset, l2_entry_size(s), false);
  if (ret < 0) {



--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v5 20/31] qcow2: Add subcluster support to qcow2_get_host_offset()

2020-05-06 Thread Eric Blake

On 5/5/20 12:38 PM, Alberto Garcia wrote:

The logic of this function remains pretty much the same, except that
it uses count_contiguous_subclusters(), which combines the logic of
count_contiguous_clusters() / count_contiguous_clusters_unallocated()
and checks individual subclusters.



Maybe mention that qcow2_cluster_to_subcluster_type() is now inlined 
into its lone remaining caller.



Signed-off-by: Alberto Garcia 
---
  block/qcow2.h |  38 +---
  block/qcow2-cluster.c | 141 --
  2 files changed, 82 insertions(+), 97 deletions(-)




+++ b/block/qcow2-cluster.c
@@ -376,66 +376,58 @@ fail:



+static int count_contiguous_subclusters(BlockDriverState *bs, int nb_clusters,
+unsigned sc_index, uint64_t *l2_slice,
+int l2_index)
  {



+
+assert(type != QCOW2_SUBCLUSTER_INVALID); /* The caller should check this 
*/
+assert(l2_index + nb_clusters <= s->l2_size);
+
+if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
+/* Compressed clusters are always processed one by one */
+return s->subclusters_per_cluster - sc_index;


Should this assert(sc_index == 0)?


  for (i = 0; i < nb_clusters; i++) {
-uint64_t entry = get_l2_entry(s, l2_slice, l2_index + i);
-QCow2ClusterType type = qcow2_get_cluster_type(bs, entry);
-
-if (type != wanted_type) {
-break;
+l2_entry = get_l2_entry(s, l2_slice, l2_index + i);
+l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
+if (check_offset && expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
+return count;
+}
+for (j = (i == 0) ? sc_index : 0; j < s->subclusters_per_cluster; j++) 
{
+if (qcow2_get_subcluster_type(bs, l2_entry, l2_bitmap, j) != type) 
{
+return count;
+}


This really is checking that sub-clusters have the exact same type.


@@ -604,24 +604,17 @@ int qcow2_get_host_offset(BlockDriverState *bs, uint64_t 
offset,
  ret = -EIO;
  goto fail;
  }
-/* Compressed clusters can only be processed one by one */
-c = 1;
  *host_offset = l2_entry & L2E_COMPRESSED_OFFSET_SIZE_MASK;
  break;
-case QCOW2_CLUSTER_ZERO_PLAIN:
-case QCOW2_CLUSTER_UNALLOCATED:
-/* how many empty clusters ? */
-c = count_contiguous_clusters_unallocated(bs, nb_clusters,
-  l2_slice, l2_index, type);


The old code was counting how many contiguous clusters have similar 
types, coalescing ranges of two different cluster types into one 
nb_clusters result.



+case QCOW2_SUBCLUSTER_ZERO_PLAIN:
+case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
  *host_offset = 0;
  break;
-case QCOW2_CLUSTER_ZERO_ALLOC:
-case QCOW2_CLUSTER_NORMAL: {
+case QCOW2_SUBCLUSTER_ZERO_ALLOC:
+case QCOW2_SUBCLUSTER_NORMAL:
+case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC: {
  uint64_t host_cluster_offset = l2_entry & L2E_OFFSET_MASK;
  *host_offset = host_cluster_offset + offset_in_cluster;
-/* how many allocated clusters ? */
-c = count_contiguous_clusters(bs, nb_clusters, s->cluster_size,
-  l2_slice, l2_index, QCOW_OFLAG_ZERO);


and here coalescing three different cluster types into one nb_clusters 
result.



  if (offset_into_cluster(s, host_cluster_offset)) {
  qcow2_signal_corruption(bs, true, -1, -1,
  "Cluster allocation offset %#"
@@ -647,9 +640,11 @@ int qcow2_get_host_offset(BlockDriverState *bs, uint64_t 
offset,
  abort();
  }
  
+sc = count_contiguous_subclusters(bs, nb_clusters, sc_index,

+  l2_slice, l2_index);


But the new code is stopping at the first different subcluster type, 
rather than trying to coalesce as many compatible types into one larger 
nb_clusters.  When coupled with patch 19, that factors into my concern 
over whether patch 19 needed to check for INVALID clusters in the 
middle, or whether its fail: label was unreachable.  But it also means 
that you are potentially fragmenting the write in more places (every 
time a subcluster status changes) rather than coalescing similar status, 
the way the old code used to operate.


I don't think the extra fragmentation causes any correctness issues, but 
it may cause performance issues.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH] 9pfs: Fix potential deadlock of QEMU mainloop

2020-05-06 Thread Greg Kurz
On Wed, 06 May 2020 15:36:16 +0200
Christian Schoenebeck  wrote:

> On Mittwoch, 6. Mai 2020 15:05:23 CEST Christian Schoenebeck wrote:
> > > diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
> > > index 9e046f7acb51..ac84ae804496 100644
> > > --- a/hw/9pfs/9p.c
> > > +++ b/hw/9pfs/9p.c
> > > @@ -2170,7 +2170,7 @@ static int coroutine_fn
> > > v9fs_do_readdir_with_stat(V9fsPDU *pdu, int32_t count = 0;
> > > 
> > >  struct stat stbuf;
> > >  off_t saved_dir_pos;
> > > 
> > > -struct dirent *dent;
> > > +struct dirent dent;
> 
> One more: since this dirent structure is now on the stack, it should better 
> be 
> initialized for safety reasons.
> 

I don't think so, for two reasons:
- I can't think of an initializer that would make sense for a dirent
- if a future change introduces a branch where dent could be used
  uninitialized, I'd rather give a chance to the compiler to bark

> Best regards,
> Christian Schoenebeck
> 
> 



  1   2   3   >