date:20230116

[PATCH v7 18/51] i386/xen: implement HYPERVISOR_hvm_op

2023-01-16 Thread David Woodhouse

From: Joao Martins 

This is when guest queries for support for HVMOP_pagetable_dying.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/kvm/xen-emu.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 5ff739cc82..e8c5a8db53 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -25,6 +25,7 @@
 #include "standard-headers/xen/version.h"
 #include "standard-headers/xen/sched.h"
 #include "standard-headers/xen/memory.h"
+#include "standard-headers/xen/hvm/hvm_op.h"
 
 #include "xen-compat.h"
 
@@ -346,6 +347,19 @@ static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 return true;
 }
 
+static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
+ int cmd, uint64_t arg)
+{
+switch (cmd) {
+case HVMOP_pagetable_dying:
+exit->u.hcall.result = -ENOSYS;
+return true;
+
+default:
+return false;
+}
+}
+
 static int kvm_xen_soft_reset(void)
 {
 int err;
@@ -441,6 +455,9 @@ static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct 
kvm_xen_exit *exit)
 case __HYPERVISOR_sched_op:
 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
   exit->u.hcall.params[1]);
+case __HYPERVISOR_hvm_op:
+return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
+exit->u.hcall.params[1]);
 case __HYPERVISOR_memory_op:
 return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
exit->u.hcall.params[1]);
-- 
2.39.0

[PATCH v7 23/51] i386/xen: implement HYPERVISOR_event_channel_op

2023-01-16 Thread David Woodhouse

From: Joao Martins 

Additionally set XEN_INTERFACE_VERSION to most recent in order to
exercise the "new" event_channel_op.

Signed-off-by: Joao Martins 
[dwmw2: Ditch event_channel_op_compat which was never available to HVM guests]
Signed-off-by: David Woodhouse 
---
 target/i386/kvm/xen-emu.c | 25 +
 1 file changed, 25 insertions(+)

diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index b0ff03dbeb..686e5dfd38 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -27,6 +27,7 @@
 #include "standard-headers/xen/memory.h"
 #include "standard-headers/xen/hvm/hvm_op.h"
 #include "standard-headers/xen/vcpu.h"
+#include "standard-headers/xen/event_channel.h"
 
 #include "xen-compat.h"
 
@@ -585,6 +586,27 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 return true;
 }
 
+static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit,
+int cmd, uint64_t arg)
+{
+int err = -ENOSYS;
+
+switch (cmd) {
+case EVTCHNOP_init_control:
+   case EVTCHNOP_expand_array:
+   case EVTCHNOP_set_priority:
+/* We do not support FIFO channels at this point */
+err = -ENOSYS;
+break;
+
+default:
+return false;
+}
+
+exit->u.hcall.result = err;
+return true;
+}
+
 static int kvm_xen_soft_reset(void)
 {
 CPUState *cpu;
@@ -685,6 +707,9 @@ static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct 
kvm_xen_exit *exit)
 case __HYPERVISOR_sched_op:
 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
   exit->u.hcall.params[1]);
+case __HYPERVISOR_event_channel_op:
+return kvm_xen_hcall_evtchn_op(exit, exit->u.hcall.params[0],
+   exit->u.hcall.params[1]);
 case __HYPERVISOR_vcpu_op:
 return kvm_xen_hcall_vcpu_op(exit, cpu,
  exit->u.hcall.params[0],
-- 
2.39.0

[PATCH v7 44/51] i386/xen: Implement HYPERVISOR_grant_table_op and GNTTABOP_[gs]et_verson

2023-01-16 Thread David Woodhouse

From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_gnttab.c  | 31 
 hw/i386/kvm/xen_gnttab.h  |  5 
 target/i386/kvm/xen-emu.c | 60 +++
 3 files changed, 96 insertions(+)

diff --git a/hw/i386/kvm/xen_gnttab.c b/hw/i386/kvm/xen_gnttab.c
index 311f48bfdb..cdcfea5be3 100644
--- a/hw/i386/kvm/xen_gnttab.c
+++ b/hw/i386/kvm/xen_gnttab.c
@@ -186,3 +186,34 @@ int xen_gnttab_map_page(uint64_t idx, uint64_t gfn)
 return 0;
 }
 
+int xen_gnttab_set_version_op(struct gnttab_set_version *set)
+{
+int ret;
+
+switch (set->version) {
+case 1:
+ret = 0;
+break;
+
+case 2:
+/* Behave as before set_version was introduced. */
+ret = -ENOSYS;
+break;
+
+default:
+ret = -EINVAL;
+}
+
+set->version = 1;
+return ret;
+}
+
+int xen_gnttab_get_version_op(struct gnttab_get_version *get)
+{
+if (get->dom != DOMID_SELF && get->dom != xen_domid) {
+return -ESRCH;
+}
+
+get->version = 1;
+return 0;
+}
diff --git a/hw/i386/kvm/xen_gnttab.h b/hw/i386/kvm/xen_gnttab.h
index a7caa94c83..79579677ba 100644
--- a/hw/i386/kvm/xen_gnttab.h
+++ b/hw/i386/kvm/xen_gnttab.h
@@ -15,4 +15,9 @@
 void xen_gnttab_create(void);
 int xen_gnttab_map_page(uint64_t idx, uint64_t gfn);
 
+struct gnttab_set_version;
+struct gnttab_get_version;
+int xen_gnttab_set_version_op(struct gnttab_set_version *set);
+int xen_gnttab_get_version_op(struct gnttab_get_version *get);
+
 #endif /* QEMU_XEN_GNTTAB_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 59e4a9d4ac..fa2d3e5615 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -33,6 +33,7 @@
 #include "standard-headers/xen/hvm/params.h"
 #include "standard-headers/xen/vcpu.h"
 #include "standard-headers/xen/event_channel.h"
+#include "standard-headers/xen/grant_table.h"
 
 #include "xen-compat.h"
 
@@ -1137,6 +1138,61 @@ static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 return true;
 }
 
+static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
+int cmd, uint64_t arg, int count)
+{
+CPUState *cs = CPU(cpu);
+int err;
+
+switch (cmd) {
+case GNTTABOP_set_version: {
+struct gnttab_set_version set;
+
+qemu_build_assert(sizeof(set) == 4);
+if (kvm_copy_from_gva(cs, arg, , sizeof(set))) {
+err = -EFAULT;
+break;
+}
+
+err = xen_gnttab_set_version_op();
+if (!err && kvm_copy_to_gva(cs, arg, , sizeof(set))) {
+err = -EFAULT;
+}
+break;
+}
+case GNTTABOP_get_version: {
+struct gnttab_get_version get;
+
+qemu_build_assert(sizeof(get) == 8);
+if (kvm_copy_from_gva(cs, arg, , sizeof(get))) {
+err = -EFAULT;
+break;
+}
+
+err = xen_gnttab_get_version_op();
+if (!err && kvm_copy_to_gva(cs, arg, , sizeof(get))) {
+err = -EFAULT;
+}
+break;
+}
+case GNTTABOP_query_size:
+case GNTTABOP_setup_table:
+case GNTTABOP_copy:
+case GNTTABOP_map_grant_ref:
+case GNTTABOP_unmap_grant_ref:
+case GNTTABOP_swap_grant_ref:
+return false;
+
+default:
+/* Xen explicitly returns -ENOSYS to HVM guests for all others */
+err = -ENOSYS;
+break;
+}
+
+exit->u.hcall.result = err;
+return true;
+}
+
 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
 {
 uint16_t code = exit->u.hcall.input;
@@ -1147,6 +1203,10 @@ static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct 
kvm_xen_exit *exit)
 }
 
 switch (code) {
+case __HYPERVISOR_grant_table_op:
+return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
+   exit->u.hcall.params[1],
+   exit->u.hcall.params[2]);
 case __HYPERVISOR_sched_op:
 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
   exit->u.hcall.params[1]);
-- 
2.39.0

[PATCH v7 34/51] hw/xen: Implement EVTCHNOP_alloc_unbound

2023-01-16 Thread David Woodhouse

From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_evtchn.c  | 32 
 hw/i386/kvm/xen_evtchn.h  |  2 ++
 target/i386/kvm/xen-emu.c | 15 +++
 3 files changed, 49 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 107249c48c..15c467ad7f 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -828,6 +828,38 @@ int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
 return ret;
 }
 
+int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+uint16_t type_val;
+int ret;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+if (alloc->dom != DOMID_SELF && alloc->dom != xen_domid) {
+return -ESRCH;
+}
+
+if (alloc->remote_dom == DOMID_QEMU) {
+type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+} else if (alloc->remote_dom == DOMID_SELF ||
+   alloc->remote_dom == xen_domid) {
+type_val = 0;
+} else {
+return -EPERM;
+}
+
+qemu_mutex_lock(>port_lock);
+
+ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val, >port);
+
+qemu_mutex_unlock(>port_lock);
+
+return ret;
+}
+
 int xen_evtchn_send_op(struct evtchn_send *send)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 500fdbe8b8..fc080138e3 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -21,11 +21,13 @@ struct evtchn_unmask;
 struct evtchn_bind_virq;
 struct evtchn_bind_ipi;
 struct evtchn_send;
+struct evtchn_alloc_unbound;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
 int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 int xen_evtchn_send_op(struct evtchn_send *send);
+int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
 
 #endif /* QEMU_XEN_EVTCHN_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 8497677c0c..9dd8abdd9a 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -910,6 +910,21 @@ static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 err = xen_evtchn_send_op();
 break;
 }
+case EVTCHNOP_alloc_unbound: {
+struct evtchn_alloc_unbound alloc;
+
+qemu_build_assert(sizeof(alloc) == 8);
+if (kvm_copy_from_gva(cs, arg, , sizeof(alloc))) {
+err = -EFAULT;
+break;
+}
+
+err = xen_evtchn_alloc_unbound_op();
+if (!err && kvm_copy_to_gva(cs, arg, , sizeof(alloc))) {
+err = -EFAULT;
+}
+break;
+}
 default:
 return false;
 }
-- 
2.39.0

[PATCH v7 21/51] i386/xen: handle VCPUOP_register_vcpu_time_info

2023-01-16 Thread David Woodhouse

From: Joao Martins 

In order to support Linux vdso in Xen.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/cpu.h |   1 +
 target/i386/kvm/xen-emu.c | 100 +-
 target/i386/machine.c |   1 +
 3 files changed, 90 insertions(+), 12 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 109b2e5669..96c2d0d5cb 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1790,6 +1790,7 @@ typedef struct CPUArchState {
 struct kvm_nested_state *nested_state;
 uint64_t xen_vcpu_info_gpa;
 uint64_t xen_vcpu_info_default_gpa;
+uint64_t xen_vcpu_time_info_gpa;
 #endif
 #if defined(CONFIG_HVF)
 HVFX86LazyFlags hvf_lflags;
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 817196c988..4f905eda74 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -36,28 +36,41 @@
 #define hypercall_compat32(longmode) (false)
 #endif
 
-static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
-  bool is_write)
+static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
+   size_t *len, bool is_write)
 {
-uint8_t *buf = (uint8_t *)_buf;
-int ret;
-
-while (sz) {
 struct kvm_translation tr = {
 .linear_address = gva,
 };
 
-size_t len = TARGET_PAGE_SIZE - (tr.linear_address & 
~TARGET_PAGE_MASK);
-if (len > sz) {
-len = sz;
+if (len) {
+*len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
+}
+
+if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, ) || !tr.valid ||
+(is_write && !tr.writeable)) {
+return false;
 }
+*gpa = tr.physical_address;
+return true;
+}
+
+static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
+  bool is_write)
+{
+uint8_t *buf = (uint8_t *)_buf;
+uint64_t gpa;
+size_t len;
 
-ret = kvm_vcpu_ioctl(cs, KVM_TRANSLATE, );
-if (ret || !tr.valid || (is_write && !tr.writeable)) {
+while (sz) {
+if (!kvm_gva_to_gpa(cs, gva, , , is_write)) {
 return -EFAULT;
 }
+if (len > sz) {
+len = sz;
+}
 
-cpu_physical_memory_rw(tr.physical_address, buf, len, is_write);
+cpu_physical_memory_rw(gpa, buf, len, is_write);
 
 buf += len;
 sz -= len;
@@ -145,6 +158,7 @@ int kvm_xen_init_vcpu(CPUState *cs)
 
 env->xen_vcpu_info_gpa = INVALID_GPA;
 env->xen_vcpu_info_default_gpa = INVALID_GPA;
+env->xen_vcpu_time_info_gpa = INVALID_GPA;
 
 return 0;
 }
@@ -228,6 +242,17 @@ static void do_set_vcpu_info_gpa(CPUState *cs, 
run_on_cpu_data data)
   env->xen_vcpu_info_gpa);
 }
 
+static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
+{
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = >env;
+
+env->xen_vcpu_time_info_gpa = data.host_ulong;
+
+kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
+  env->xen_vcpu_time_info_gpa);
+}
+
 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
 {
 X86CPU *cpu = X86_CPU(cs);
@@ -235,8 +260,11 @@ static void do_vcpu_soft_reset(CPUState *cs, 
run_on_cpu_data data)
 
 env->xen_vcpu_info_gpa = INVALID_GPA;
 env->xen_vcpu_info_default_gpa = INVALID_GPA;
+env->xen_vcpu_time_info_gpa = INVALID_GPA;
 
 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, INVALID_GPA);
+kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
+  INVALID_GPA);
 }
 
 static int xen_set_shared_info(uint64_t gfn)
@@ -450,6 +478,42 @@ static int vcpuop_register_vcpu_info(CPUState *cs, 
CPUState *target,
 return 0;
 }
 
+static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
+  uint64_t arg)
+{
+struct vcpu_register_time_memory_area tma;
+uint64_t gpa;
+size_t len;
+
+/* No need for 32/64 compat handling */
+qemu_build_assert(sizeof(tma) == 8);
+qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
+
+if (!target) {
+return -ENOENT;
+}
+
+if (kvm_copy_from_gva(cs, arg, , sizeof(tma))) {
+return -EFAULT;
+}
+
+/*
+ * Xen actually uses the GVA and does the translation through the guest
+ * page tables each time. But Linux/KVM uses the GPA, on the assumption
+ * that guests only ever use *global* addresses (kernel virtual addresses)
+ * for it. If Linux is changed to redo the GVA→GPA translation each time,
+ * it will offer a new vCPU attribute for that, and we'll use it instead.
+ */
+if (!kvm_gva_to_gpa(cs, tma.addr.p, , , false) ||
+len < sizeof(struct vcpu_time_info)) {
+return -EFAULT;
+}
+
+async_run_on_cpu(target,

[PATCH v7 07/51] xen-platform: exclude vfio-pci from the PCI platform unplug

2023-01-16 Thread David Woodhouse

From: Joao Martins 

Such that PCI passthrough devices work for Xen emulated guests.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/xen/xen_platform.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
index 7db0d94ec2..50174c2269 100644
--- a/hw/i386/xen/xen_platform.c
+++ b/hw/i386/xen/xen_platform.c
@@ -109,12 +109,25 @@ static void log_writeb(PCIXenPlatformState *s, char val)
 #define _UNPLUG_NVME_DISKS 3
 #define UNPLUG_NVME_DISKS (1u << _UNPLUG_NVME_DISKS)
 
+static bool pci_device_is_passthrough(PCIDevice *d)
+{
+if (!strcmp(d->name, "xen-pci-passthrough")) {
+return true;
+}
+
+if (xen_mode == XEN_EMULATE && !strcmp(d->name, "vfio-pci")) {
+return true;
+}
+
+return false;
+}
+
 static void unplug_nic(PCIBus *b, PCIDevice *d, void *o)
 {
 /* We have to ignore passthrough devices */
 if (pci_get_word(d->config + PCI_CLASS_DEVICE) ==
 PCI_CLASS_NETWORK_ETHERNET
-&& strcmp(d->name, "xen-pci-passthrough") != 0) {
+&& !pci_device_is_passthrough(d)) {
 object_unparent(OBJECT(d));
 }
 }
@@ -187,9 +200,8 @@ static void unplug_disks(PCIBus *b, PCIDevice *d, void 
*opaque)
 !(flags & UNPLUG_IDE_SCSI_DISKS);
 
 /* We have to ignore passthrough devices */
-if (!strcmp(d->name, "xen-pci-passthrough")) {
+if (pci_device_is_passthrough(d))
 return;
-}
 
 switch (pci_get_word(d->config + PCI_CLASS_DEVICE)) {
 case PCI_CLASS_STORAGE_IDE:
-- 
2.39.0

[PATCH v7 35/51] hw/xen: Implement EVTCHNOP_bind_interdomain

2023-01-16 Thread David Woodhouse

From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_evtchn.c  | 78 +++
 hw/i386/kvm/xen_evtchn.h  |  2 +
 target/i386/kvm/xen-emu.c | 16 
 3 files changed, 96 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 15c467ad7f..511d52a31d 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -713,6 +713,23 @@ static int close_port(XenEvtchnState *s, evtchn_port_t 
port)
 }
 break;
 
+case EVTCHNSTAT_interdomain:
+if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+/* Not yet implemented. This can't happen! */
+} else {
+/* Loopback interdomain */
+XenEvtchnPort *rp = >port_table[p->type_val];
+if (!valid_port(p->type_val) || rp->type_val != port ||
+rp->type != EVTCHNSTAT_interdomain) {
+error_report("Inconsistent state for interdomain unbind");
+} else {
+/* Set the other end back to unbound */
+rp->type = EVTCHNSTAT_unbound;
+rp->type_val = 0;
+}
+}
+break;
+
 default:
 break;
 }
@@ -828,6 +845,67 @@ int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
 return ret;
 }
 
+int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain *interdomain)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+uint16_t type_val;
+int ret;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+if (interdomain->remote_dom == DOMID_QEMU) {
+type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+} else if (interdomain->remote_dom == DOMID_SELF ||
+   interdomain->remote_dom == xen_domid) {
+type_val = 0;
+} else {
+return -ESRCH;
+}
+
+if (!valid_port(interdomain->remote_port)) {
+return -EINVAL;
+}
+
+qemu_mutex_lock(>port_lock);
+
+/* The newly allocated port starts out as unbound */
+ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val,
+>local_port);
+if (ret) {
+goto out;
+}
+
+if (interdomain->remote_dom == DOMID_QEMU) {
+/* We haven't hooked up QEMU's PV drivers to this yet */
+ret = -ENOSYS;
+} else {
+/* Loopback */
+XenEvtchnPort *rp = >port_table[interdomain->remote_port];
+XenEvtchnPort *lp = >port_table[interdomain->local_port];
+
+if (rp->type == EVTCHNSTAT_unbound && rp->type_val == 0) {
+/* It's a match! */
+rp->type = EVTCHNSTAT_interdomain;
+rp->type_val = interdomain->local_port;
+
+lp->type = EVTCHNSTAT_interdomain;
+lp->type_val = interdomain->remote_port;
+} else {
+ret = -EINVAL;
+}
+}
+
+if (ret) {
+free_port(s, interdomain->local_port);
+}
+ out:
+qemu_mutex_unlock(>port_lock);
+
+return ret;
+
+}
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index fc080138e3..1ebc7580eb 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -22,6 +22,7 @@ struct evtchn_bind_virq;
 struct evtchn_bind_ipi;
 struct evtchn_send;
 struct evtchn_alloc_unbound;
+struct evtchn_bind_interdomain;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
@@ -29,5 +30,6 @@ int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 int xen_evtchn_send_op(struct evtchn_send *send);
 int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
+int xen_evtchn_bind_interdomain_op(struct evtchn_bind_interdomain 
*interdomain);
 
 #endif /* QEMU_XEN_EVTCHN_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 9dd8abdd9a..5f2309d282 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -925,6 +925,22 @@ static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 }
 break;
 }
+case EVTCHNOP_bind_interdomain: {
+struct evtchn_bind_interdomain interdomain;
+
+qemu_build_assert(sizeof(interdomain) == 12);
+if (kvm_copy_from_gva(cs, arg, , sizeof(interdomain))) {
+err = -EFAULT;
+break;
+}
+
+err = xen_evtchn_bind_interdomain_op();
+if (!err &&
+kvm_copy_to_gva(cs, arg, , sizeof(interdomain))) {
+err = -EFAULT;
+}
+break;
+}
 default:
 return false;
 }
-- 
2.39.0

[PATCH v7 47/51] i386/xen: Reserve Xen special pages for console, xenstore rings

2023-01-16 Thread David Woodhouse

From: David Woodhouse 

Xen has eight frames at 0xfeff8000 for this; we only really need two for
now and KVM puts the identity map at 0xfeffc000, so limit ourselves to
four.

Signed-off-by: David Woodhouse 
---
 include/sysemu/kvm_xen.h  |  8 
 target/i386/kvm/xen-emu.c | 10 ++
 2 files changed, 18 insertions(+)

diff --git a/include/sysemu/kvm_xen.h b/include/sysemu/kvm_xen.h
index df92205a7d..ed12b3aaec 100644
--- a/include/sysemu/kvm_xen.h
+++ b/include/sysemu/kvm_xen.h
@@ -29,4 +29,12 @@ uint16_t kvm_xen_get_gnttab_max_frames(void);
 #define kvm_xen_has_cap(cap) (!!(kvm_xen_get_caps() &   \
  KVM_XEN_HVM_CONFIG_ ## cap))
 
+#define XEN_SPECIAL_AREA_ADDR 0xfeff8000UL
+#define XEN_SPECIAL_AREA_SIZE 0x4000UL
+
+#define XEN_SPECIALPAGE_CONSOLE 0
+#define XEN_SPECIALPAGE_XENSTORE1
+
+#define XEN_SPECIAL_PFN(x) ((XEN_SPECIAL_AREA_ADDR >> TARGET_PAGE_BITS) + 
XEN_SPECIALPAGE_##x)
+
 #endif /* QEMU_SYSEMU_KVM_XEN_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index c90ac7790a..0110ef55c7 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -22,6 +22,7 @@
 
 #include "hw/pci/msi.h"
 #include "hw/i386/apic-msidef.h"
+#include "hw/i386/e820_memory_layout.h"
 #include "hw/i386/kvm/xen_overlay.h"
 #include "hw/i386/kvm/xen_evtchn.h"
 #include "hw/i386/kvm/xen_gnttab.h"
@@ -168,6 +169,15 @@ int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
 }
 
 s->xen_caps = xen_caps;
+
+/* Tell fw_cfg to notify the BIOS to reserve the range. */
+ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
+ E820_RESERVED);
+if (ret < 0) {
+fprintf(stderr, "e820_add_entry() table is full\n");
+return ret;
+}
+
 return 0;
 }
 
-- 
2.39.0

[PATCH v7 08/51] xen-platform: allow its creation with XEN_EMULATE mode

2023-01-16 Thread David Woodhouse

From: Joao Martins 

The only thing we need to handle on KVM side is to change the
pfn from R/W to R/O.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
---
 hw/i386/xen/meson.build|  5 -
 hw/i386/xen/xen_platform.c | 39 +-
 2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build
index be84130300..79d75cc927 100644
--- a/hw/i386/xen/meson.build
+++ b/hw/i386/xen/meson.build
@@ -2,6 +2,9 @@ i386_ss.add(when: 'CONFIG_XEN', if_true: files(
   'xen-hvm.c',
   'xen-mapcache.c',
   'xen_apic.c',
-  'xen_platform.c',
   'xen_pvdevice.c',
 ))
+
+i386_ss.add(when: 'CONFIG_XENFV_MACHINE', if_true: files(
+  'xen_platform.c',
+))
diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
index 50174c2269..00f0527b30 100644
--- a/hw/i386/xen/xen_platform.c
+++ b/hw/i386/xen/xen_platform.c
@@ -28,9 +28,9 @@
 #include "hw/ide.h"
 #include "hw/ide/pci.h"
 #include "hw/pci/pci.h"
-#include "hw/xen/xen_common.h"
 #include "migration/vmstate.h"
-#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen.h"
+#include "net/net.h"
 #include "trace.h"
 #include "sysemu/xen.h"
 #include "sysemu/block-backend.h"
@@ -38,6 +38,11 @@
 #include "qemu/module.h"
 #include "qom/object.h"
 
+#ifdef CONFIG_XEN
+#include "hw/xen/xen_common.h"
+#include "hw/xen/xen-legacy-backend.h"
+#endif
+
 //#define DEBUG_PLATFORM
 
 #ifdef DEBUG_PLATFORM
@@ -280,18 +285,26 @@ static void platform_fixed_ioport_writeb(void *opaque, 
uint32_t addr, uint32_t v
 PCIXenPlatformState *s = opaque;
 
 switch (addr) {
-case 0: /* Platform flags */ {
-hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
-HVMMEM_ram_ro : HVMMEM_ram_rw;
-if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
-DPRINTF("unable to change ro/rw state of ROM memory area!\n");
-} else {
+case 0: /* Platform flags */
+if (xen_mode == XEN_EMULATE) {
+/* XX: Use i440gx/q35 PAM setup to do this? */
 s->flags = val & PFFLAG_ROM_LOCK;
-DPRINTF("changed ro/rw state of ROM memory area. now is %s 
state.\n",
-(mem_type == HVMMEM_ram_ro ? "ro":"rw"));
+#ifdef CONFIG_XEN
+} else {
+hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
+HVMMEM_ram_ro : HVMMEM_ram_rw;
+
+if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
+DPRINTF("unable to change ro/rw state of ROM memory area!\n");
+} else {
+s->flags = val & PFFLAG_ROM_LOCK;
+DPRINTF("changed ro/rw state of ROM memory area. now is %s 
state.\n",
+(mem_type == HVMMEM_ram_ro ? "ro" : "rw"));
+}
+#endif
 }
 break;
-}
+
 case 2:
 log_writeb(s, val);
 break;
@@ -509,8 +522,8 @@ static void xen_platform_realize(PCIDevice *dev, Error 
**errp)
 uint8_t *pci_conf;
 
 /* Device will crash on reset if xen is not initialized */
-if (!xen_enabled()) {
-error_setg(errp, "xen-platform device requires the Xen accelerator");
+if (xen_mode == XEN_DISABLED) {
+error_setg(errp, "xen-platform device requires a Xen guest");
 return;
 }
 
-- 
2.39.0

[PATCH v7 39/51] hw/xen: Support HVM_PARAM_CALLBACK_TYPE_GSI callback

2023-01-16 Thread David Woodhouse

From: David Woodhouse 

The GSI callback (and later PCI_INTX) is a level triggered interrupt. It
is asserted when an event channel is delivered to vCPU0, and is supposed
to be cleared when the vcpu_info->evtchn_upcall_pending field for vCPU0
is cleared again.

Thankfully, Xen does *not* assert the GSI if the guest sets its own
evtchn_upcall_pending field; we only need to assert the GSI when we
have delivered an event for ourselves. So that's the easy part, kind of.

There's a slight complexity in that we need to hold the BQL before we
can call qemu_set_irq(), and we definitely can't do that while holding
our own port_lock (because we'll need to take that from the qemu-side
functions that the PV backend drivers will call). So if we end up
wanting to set the IRQ in a context where we *don't* already hold the
BQL, defer to a BH.

However, we *do* need to poll for the evtchn_upcall_pending flag being
cleared. In an ideal world we would poll that when the EOI happens on
the PIC/IOAPIC. That's how it works in the kernel with the VFIO eventfd
pairs — one is used to trigger the interrupt, and the other works in the
other direction to 'resample' on EOI, and trigger the first eventfd
again if the line is still active.

However, QEMU doesn't seem to do that. Even VFIO level interrupts seem
to be supported by temporarily unmapping the device's BARs from the
guest when an interrupt happens, then trapping *all* MMIO to the device
and sending the 'resample' event on *every* MMIO access until the IRQ
is cleared! Maybe in future we'll plumb the 'resample' concept through
QEMU's irq framework but for now we'll do what Xen itself does: just
check the flag on every vmexit if the upcall GSI is known to be
asserted.

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_evtchn.c  | 97 +++
 hw/i386/kvm/xen_evtchn.h  |  4 ++
 hw/i386/pc.c  |  6 +++
 include/sysemu/kvm_xen.h  |  1 +
 target/i386/cpu.h |  1 +
 target/i386/kvm/kvm.c | 13 ++
 target/i386/kvm/xen-emu.c | 32 +
 target/i386/kvm/xen-emu.h |  1 +
 8 files changed, 155 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index a73db5d2bc..e2ecee9a6f 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -26,6 +26,8 @@
 
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
+#include "hw/i386/x86.h"
+#include "hw/irq.h"
 
 #include "xen_evtchn.h"
 #include "xen_overlay.h"
@@ -99,9 +101,12 @@ struct XenEvtchnState {
 uint64_t callback_param;
 bool evtchn_in_kernel;
 
+QEMUBH *gsi_bh;
+
 QemuMutex port_lock;
 uint32_t nr_ports;
 XenEvtchnPort port_table[EVTCHN_2L_NR_CHANNELS];
+qemu_irq gsis[GSI_NUM_PINS];
 };
 
 struct XenEvtchnState *xen_evtchn_singleton;
@@ -166,13 +171,42 @@ static const TypeInfo xen_evtchn_info = {
 .class_init= xen_evtchn_class_init,
 };
 
+static void gsi_assert_bh(void *opaque)
+{
+struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
+if (vi) {
+xen_evtchn_set_callback_level(!!vi->evtchn_upcall_pending);
+}
+}
+
 void xen_evtchn_create(void)
 {
 XenEvtchnState *s = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN,
 -1, NULL));
+int i;
+
 xen_evtchn_singleton = s;
 
 qemu_mutex_init(>port_lock);
+s->gsi_bh = aio_bh_new(qemu_get_aio_context(), gsi_assert_bh, s);
+
+for (i = 0; i < GSI_NUM_PINS; i++) {
+sysbus_init_irq(SYS_BUS_DEVICE(s), >gsis[i]);
+}
+}
+
+void xen_evtchn_connect_gsis(qemu_irq *system_gsis)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+int i;
+
+if (!s) {
+return;
+}
+
+for (i = 0; i < GSI_NUM_PINS; i++) {
+sysbus_connect_irq(SYS_BUS_DEVICE(s), i, system_gsis[i]);
+}
 }
 
 static void xen_evtchn_register_types(void)
@@ -182,6 +216,64 @@ static void xen_evtchn_register_types(void)
 
 type_init(xen_evtchn_register_types)
 
+void xen_evtchn_set_callback_level(int level)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+uint32_t param;
+
+if (!s) {
+return;
+}
+
+/*
+ * We get to this function in a number of ways:
+ *
+ *  • From I/O context, via PV backend drivers sending a notification to
+ *the guest.
+ *
+ *  • From guest vCPU context, via loopback interdomain event channels
+ *(or theoretically even IPIs but guests don't use those with GSI
+ *delivery because that's pointless. We don't want a malicious guest
+ *to be able to trigger a deadlock though, so we can't rule it out.)
+ *
+ *  • From guest vCPU context when the HVM_PARAM_CALLBACK_IRQ is being
+ *configured.
+ *
+ *  • From guest vCPU context in the KVM exit handler, if the upcall
+ *pending flag has been cleared and the GSI needs to be deasserted.
+ *
+ *  • Maybe in future, in an interrupt ack/eoi notifier when the GSI has
+ *been acked in the irqchip.
+

Re: [PATCH] scsi/lsi53c895a: restrict DMA engine to memory regions (CVE-2023-0330)

2023-01-16 Thread Mauro Matteo Cascella

On Mon, Jan 16, 2023 at 9:42 PM Mauro Matteo Cascella
 wrote:
>
> This prevents the well known DMA-MMIO reentrancy problem (upstream issue #556)
> leading to memory corruption bugs like stack overflow or use-after-free.
>
> Fixes: CVE-2023-0330
> Signed-off-by: Mauro Matteo Cascella 
> Reported-by: Zheyu Ma 
> ---
>  hw/scsi/lsi53c895a.c   | 14 +
>  tests/qtest/fuzz-lsi53c895a-test.c | 32 ++
>  2 files changed, 42 insertions(+), 4 deletions(-)
>
> diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
> index af93557a9a..89c52594eb 100644
> --- a/hw/scsi/lsi53c895a.c
> +++ b/hw/scsi/lsi53c895a.c
> @@ -446,22 +446,28 @@ static void lsi_reselect(LSIState *s, lsi_request *p);
>  static inline void lsi_mem_read(LSIState *s, dma_addr_t addr,
> void *buf, dma_addr_t len)
>  {
> +const MemTxAttrs attrs = { .memory = true };
> +
>  if (s->dmode & LSI_DMODE_SIOM) {
> -address_space_read(>pci_io_as, addr, MEMTXATTRS_UNSPECIFIED,
> +address_space_read(>pci_io_as, addr, attrs,
> buf, len);
>  } else {
> -pci_dma_read(PCI_DEVICE(s), addr, buf, len);
> +pci_dma_rw(PCI_DEVICE(s), addr, buf, len,
> +  DMA_DIRECTION_TO_DEVICE, attrs);
>  }
>  }
>
>  static inline void lsi_mem_write(LSIState *s, dma_addr_t addr,
>  const void *buf, dma_addr_t len)
>  {
> +const MemTxAttrs attrs = { .memory = true };
> +
>  if (s->dmode & LSI_DMODE_DIOM) {
> -address_space_write(>pci_io_as, addr, MEMTXATTRS_UNSPECIFIED,
> +address_space_write(>pci_io_as, addr, attrs,
>  buf, len);
>  } else {
> -pci_dma_write(PCI_DEVICE(s), addr, buf, len);
> +pci_dma_rw(PCI_DEVICE(s), addr, (void *) buf, len,
> +  DMA_DIRECTION_FROM_DEVICE, attrs);
>  }
>  }
>
> diff --git a/tests/qtest/fuzz-lsi53c895a-test.c 
> b/tests/qtest/fuzz-lsi53c895a-test.c
> index 392a7ae7ed..35c02e89f3 100644
> --- a/tests/qtest/fuzz-lsi53c895a-test.c
> +++ b/tests/qtest/fuzz-lsi53c895a-test.c
> @@ -8,6 +8,35 @@
>  #include "qemu/osdep.h"
>  #include "libqtest.h"
>
> +/*
> + * This used to trigger a DMA reentrancy issue
> + * leading to memory corruption bugs like stack
> + * overflow or use-after-free
> + */
> +static void test_lsi_dma_reentrancy(void)
> +{
> +QTestState *s;
> +
> +s = qtest_init("-M q35 -m 512M -nodefaults "
> +   "-blockdev driver=null-co,node-name=null0 "
> +   "-device lsi53c810 -device scsi-cd,drive=null0");
> +
> +qtest_outl(s, 0xcf8, 0x8804); /* PCI Command Register */
> +qtest_outw(s, 0xcfc, 0x7);/* Enables accesses */
> +qtest_outl(s, 0xcf8, 0x8814); /* Memory Bar 1 */
> +qtest_outl(s, 0xcfc, 0xff10); /* Set MMIO Address*/
> +qtest_outl(s, 0xcf8, 0x8818); /* Memory Bar 2 */
> +qtest_outl(s, 0xcfc, 0xff00); /* Set RAM Address*/
> +qtest_writel(s, 0xff00, 0xc024);
> +qtest_writel(s, 0xff000114, 0x0080);
> +qtest_writel(s, 0xff00012c, 0xff00);
> +qtest_writel(s, 0xff04, 0xff000114);
> +qtest_writel(s, 0xff08, 0xff100014);
> +qtest_writel(s, 0xff10002f, 0x00ff);
> +
> +qtest_quit(s);
> +}
> +
>  /*
>   * This used to trigger a UAF in lsi_do_msgout()
>   * https://gitlab.com/qemu-project/qemu/-/issues/972
> @@ -120,5 +149,8 @@ int main(int argc, char **argv)
>  qtest_add_func("fuzz/lsi53c895a/lsi_do_msgout_cancel_req",
> test_lsi_do_msgout_cancel_req);
>
> +qtest_add_func("fuzz/lsi53c895a/lsi_dma_reentrancy",
> +   test_lsi_dma_reentrancy);
> +
>  return g_test_run();
>  }
> --
> 2.39.0
>

Reproducer:

cat << EOF | ./x86_64-softmmu/qemu-system-x86_64 -machine accel=qtest \
-m 512M -machine q35 -nodefaults -device lsi53c810 -device scsi-cd,drive=null0 \
-display none -blockdev driver=null-co,node-name=null0 -qtest stdio
outl 0xcf8 0x8804   /* PCI Command Register */
outl 0xcfc 0x7 /* Enable accesses */
outl 0xcf8 0x8814   /* Memory Bar 1 */
outl 0xcfc 0xff10 /* Set MMIO Address*/
outl 0xcf8 0x8818   /* Memory Bar 2 */
outl 0xcfc 0xff00 /* Set RAM Address*/
writel 0xff00 0xc024
writel 0xff000114 0x0080
writel 0xff00012c 0xff00
writel 0xff04 0xff000114
writel 0xff08 0xff100014
writel 0xff10002f 0x00ff
EOF

-- 
Mauro Matteo Cascella
Red Hat Product Security
PGP-Key ID: BB3410B0

Re: completion timeouts with pin-based interrupts in QEMU hw/nvme

2023-01-16 Thread Klaus Jensen

On Jan 12 14:10, Klaus Jensen wrote:
> Hi all (linux-nvme, qemu-devel, maintainers),
> 
> On QEMU riscv64, which does not use MSI/MSI-X and thus relies on
> pin-based interrupts, I'm seeing occasional completion timeouts, i.e.
> 
>   nvme nvme0: I/O 333 QID 1 timeout, completion polled
> 
> To rule out issues with shadow doorbells (which have been a source of
> frustration in the past), those are disabled. FWIW I'm also seeing the
> issue with shadow doorbells.
> 
>   diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
>   index f25cc2c235e9..28d8e7f4b56c 100644
>   --- a/hw/nvme/ctrl.c
>   +++ b/hw/nvme/ctrl.c
>   @@ -7407,7 +7407,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>id->mdts = n->params.mdts;
>id->ver = cpu_to_le32(NVME_SPEC_VER);
>id->oacs =
>   -cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | 
> NVME_OACS_DBBUF);
>   +cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
>id->cntrltype = 0x1;
> 
>/*
> 
> 
> I captured a trace from QEMU when this happens:
> 
> pci_nvme_mmio_write addr 0x1008 data 0x4e size 4
> pci_nvme_mmio_doorbell_sq sqid 1 new_tail 78
> pci_nvme_io_cmd cid 4428 nsid 0x1 sqid 1 opc 0x2 opname 'NVME_NVM_CMD_READ'
> pci_nvme_read cid 4428 nsid 1 nlb 32 count 16384 lba 0x1324
> pci_nvme_map_prp trans_len 4096 len 16384 prp1 0x80aca000 prp2 0x82474100 
> num_prps 5
> pci_nvme_map_addr addr 0x80aca000 len 4096
> pci_nvme_map_addr addr 0x80ac9000 len 4096
> pci_nvme_map_addr addr 0x80ac8000 len 4096
> pci_nvme_map_addr addr 0x80ac7000 len 4096
> pci_nvme_io_cmd cid 4429 nsid 0x1 sqid 1 opc 0x2 opname 'NVME_NVM_CMD_READ'
> pci_nvme_read cid 4429 nsid 1 nlb 224 count 114688 lba 0x1242
> pci_nvme_map_prp trans_len 4096 len 114688 prp1 0x80ae6000 prp2 0x82474000 
> num_prps 29
> pci_nvme_map_addr addr 0x80ae6000 len 4096
> pci_nvme_map_addr addr 0x80ae5000 len 4096
> pci_nvme_map_addr addr 0x80ae4000 len 4096
> pci_nvme_map_addr addr 0x80ae3000 len 4096
> pci_nvme_map_addr addr 0x80ae2000 len 4096
> pci_nvme_map_addr addr 0x80ae1000 len 4096
> pci_nvme_map_addr addr 0x80ae len 4096
> pci_nvme_map_addr addr 0x80adf000 len 4096
> pci_nvme_map_addr addr 0x80ade000 len 4096
> pci_nvme_map_addr addr 0x80add000 len 4096
> pci_nvme_map_addr addr 0x80adc000 len 4096
> pci_nvme_map_addr addr 0x80adb000 len 4096
> pci_nvme_map_addr addr 0x80ada000 len 4096
> pci_nvme_map_addr addr 0x80ad9000 len 4096
> pci_nvme_map_addr addr 0x80ad8000 len 4096
> pci_nvme_map_addr addr 0x80ad7000 len 4096
> pci_nvme_map_addr addr 0x80ad6000 len 4096
> pci_nvme_map_addr addr 0x80ad5000 len 4096
> pci_nvme_map_addr addr 0x80ad4000 len 4096
> pci_nvme_map_addr addr 0x80ad3000 len 4096
> pci_nvme_map_addr addr 0x80ad2000 len 4096
> pci_nvme_map_addr addr 0x80ad1000 len 4096
> pci_nvme_map_addr addr 0x80ad len 4096
> pci_nvme_map_addr addr 0x80acf000 len 4096
> pci_nvme_map_addr addr 0x80ace000 len 4096
> pci_nvme_map_addr addr 0x80acd000 len 4096
> pci_nvme_map_addr addr 0x80acc000 len 4096
> pci_nvme_map_addr addr 0x80acb000 len 4096
> pci_nvme_rw_cb cid 4428 blk 'd0'
> pci_nvme_rw_complete_cb cid 4428 blk 'd0'
> pci_nvme_enqueue_req_completion cid 4428 cqid 1 dw0 0x0 dw1 0x0 status 0x0
> [1]: pci_nvme_irq_pin pulsing IRQ pin
> pci_nvme_rw_cb cid 4429 blk 'd0'
> pci_nvme_rw_complete_cb cid 4429 blk 'd0'
> pci_nvme_enqueue_req_completion cid 4429 cqid 1 dw0 0x0 dw1 0x0 status 0x0
> [2]: pci_nvme_irq_pin pulsing IRQ pin
> [3]: pci_nvme_mmio_write addr 0x100c data 0x4d size 4
> [4]: pci_nvme_mmio_doorbell_cq cqid 1 new_head 77
>  TIMEOUT HERE (30s) ---
> [5]: pci_nvme_mmio_read addr 0x1c size 4
> [6]: pci_nvme_mmio_write addr 0x100c data 0x4e size 4
> [7]: pci_nvme_mmio_doorbell_cq cqid 1 new_head 78
> --- Interrupt deasserted (cq->tail == cq->head)
> [   31.757821] nvme nvme0: I/O 333 QID 1 timeout, completion polled
> 
> Following the timeout, everything returns to "normal" and device/driver
> happily continues.
> 
> The pin-based interrupt logic in hw/nvme seems sound enough to me, so I
> am wondering if there is something going on with the kernel driver (but
> I certainly do not rule out that hw/nvme is at fault here, since
> pin-based interrupts has also been a source of several issues in the
> past).
> 
> What I'm thinking is that following the interrupt in [1], the driver
> picks up completion for cid 4428 but does not find cid 4429 in the queue
> since it has not been posted yet. Before getting a cq head doorbell
> write (which would cause the pin to be deasserted), the device posts the
> completion for cid 4429 which just keeps the interrupt asserted in [2].
> The trace then shows the cq head doorbell update in [3,4] for cid 4428
> and then we hit the timeout since the driver is not aware that cid 4429
> has been posted in between this (why is it not aware of this?) Timing
> out, the driver then polls the queue and notices cid 4429 and updates
> the cq head doorbell in [5-7],

Re: [PATCH v14 08/11] qapi/s390/cpu topology: change-topology monitor command

2023-01-16 Thread Nina Schoetterl-Glausch

On Thu, 2023-01-05 at 15:53 +0100, Pierre Morel wrote:
> The modification of the CPU attributes are done through a monitor
> commands.
> 
> It allows to move the core inside the topology tree to optimise
> the cache usage in the case the host's hypervizor previously
> moved the CPU.
> 
> The same command allows to modifiy the CPU attributes modifiers
> like polarization entitlement and the dedicated attribute to notify
> the guest if the host admin modified scheduling or dedication of a vCPU.
> 
> With this knowledge the guest has the possibility to optimize the
> usage of the vCPUs.
> 
> Signed-off-by: Pierre Morel 
> ---
>  qapi/machine-target.json |  29 
>  include/monitor/hmp.h|   1 +
>  hw/s390x/cpu-topology.c  | 141 +++
>  hmp-commands.hx  |  16 +
>  4 files changed, 187 insertions(+)
> 
> diff --git a/qapi/machine-target.json b/qapi/machine-target.json
> index 2e267fa458..75b0aa254d 100644
> --- a/qapi/machine-target.json
> +++ b/qapi/machine-target.json
> @@ -342,3 +342,32 @@
> 'TARGET_S390X',
> 'TARGET_MIPS',
> 'TARGET_LOONGARCH64' ] } }
> +
> +##
> +# @change-topology:
> +#
> +# @core: the vCPU ID to be moved
> +# @socket: the destination socket where to move the vCPU
> +# @book: the destination book where to move the vCPU
> +# @drawer: the destination drawer where to move the vCPU
> +# @polarity: optional polarity, default is last polarity set by the guest
> +# @dedicated: optional, if the vCPU is dedicated to a real CPU
> +#
> +# Modifies the topology by moving the CPU inside the topology
> +# tree or by changing a modifier attribute of a CPU.
> +#
> +# Returns: Nothing on success, the reason on failure.
> +#
> +# Since: 
> +##
> +{ 'command': 'change-topology',
> +  'data': {
> +  'core': 'int',
> +  'socket': 'int',
> +  'book': 'int',
> +  'drawer': 'int',
> +  '*polarity': 'int',
> +  '*dedicated': 'bool'
> +  },
> +  'if': { 'all': [ 'TARGET_S390X', 'CONFIG_KVM' ] }
> +}
> diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
> index 27f86399f7..15c36bf549 100644
> --- a/include/monitor/hmp.h
> +++ b/include/monitor/hmp.h
> @@ -144,5 +144,6 @@ void hmp_human_readable_text_helper(Monitor *mon,
>  HumanReadableText *(*qmp_handler)(Error 
> **));
>  void hmp_info_stats(Monitor *mon, const QDict *qdict);
>  void hmp_pcie_aer_inject_error(Monitor *mon, const QDict *qdict);
> +void hmp_change_topology(Monitor *mon, const QDict *qdict);
>  
>  #endif
> diff --git a/hw/s390x/cpu-topology.c b/hw/s390x/cpu-topology.c
> index b69955a1cd..0faffe657e 100644
> --- a/hw/s390x/cpu-topology.c
> +++ b/hw/s390x/cpu-topology.c
> @@ -18,6 +18,10 @@
>  #include "target/s390x/cpu.h"
>  #include "hw/s390x/s390-virtio-ccw.h"
>  #include "hw/s390x/cpu-topology.h"
> +#include "qapi/qapi-commands-machine-target.h"
> +#include "qapi/qmp/qdict.h"
> +#include "monitor/hmp.h"
> +#include "monitor/monitor.h"
>  
>  /*
>   * s390_topology is used to keep the topology information.
> @@ -203,6 +207,21 @@ static void s390_topology_set_entry(S390TopologyEntry 
> *entry,
>  s390_topology.sockets[s390_socket_nb(id)]++;
>  }
>  
> +/**
> + * s390_topology_clear_entry:
> + * @entry: Topology entry to setup
> + * @id: topology id to use for the setup
> + *
> + * Clear the core bit inside the topology mask and
> + * decrements the number of cores for the socket.
> + */
> +static void s390_topology_clear_entry(S390TopologyEntry *entry,
> +  s390_topology_id id)
> +{
> +clear_bit(63 - id.core, >mask);

This doesn't take the origin into account.

> +s390_topology.sockets[s390_socket_nb(id)]--;

I suppose this function cannot run concurrently, so the same CPU doesn't get 
removed twice.

> +}
> +
>  /**
>   * s390_topology_new_entry:
>   * @id: s390_topology_id to add
> @@ -383,3 +402,125 @@ void s390_topology_set_cpu(MachineState *ms, S390CPU 
> *cpu, Error **errp)
>  
>  s390_topology_insert(id);
>  }
> +
> +/*
> + * qmp and hmp implementations
> + */
> +
> +static S390TopologyEntry *s390_topology_core_to_entry(int core)
> +{
> +S390TopologyEntry *entry;
> +
> +QTAILQ_FOREACH(entry, _topology.list, next) {
> +if (entry->mask & (1UL << (63 - core))) {

origin here also.

> +return entry;
> +}
> +}
> +return NULL;

This should not return NULL unless the core id is invalid.
Might be better to validate that somewhere else.

> +}
> +
> +static void s390_change_topology(Error **errp, int64_t core, int64_t socket,
> + int64_t book, int64_t drawer,
> + int64_t polarity, bool dedicated)
> +{
> +S390TopologyEntry *entry;
> +s390_topology_id new_id;
> +s390_topology_id old_id;
> +Error *local_error = NULL;

I think you could use ERRP_GUARD here also.
> +
> +/* Get the old entry */
> +

Re: [RFC PATCH for 8.0 10/13] virtio-net: Migrate vhost inflight descriptors

2023-01-16 Thread Michael S. Tsirkin

On Wed, Dec 07, 2022 at 09:56:20AM +0100, Eugenio Perez Martin wrote:
> > A dumb question, any reason we need bother with virtio-net? It looks
> > to me it's not a must and would complicate migration compatibility.
> >
> > I guess virtio-blk is the better place.
> >
> 
> I'm fine to start with -blk, but if -net devices are processing
> buffers out of order we have chances of losing descriptors too.
> 
> We can wait for more feedback to prioritize correctly this though.
> 
> Thanks!

Traditionally vhost serialized everything when dropping the VQ.
Would be interesting to hear from hardware vendors on whether
it's hard or easy to do in hardware.
But I suspect all devices will want the capability eventually
because why not, if we have the code let's just do it everywhere.

-- 
MST

Re: [RFC PATCH for 8.0 10/13] virtio-net: Migrate vhost inflight descriptors

2023-01-16 Thread Michael S. Tsirkin

On Wed, Jan 11, 2023 at 01:51:06PM +0800, Jason Wang wrote:
> On Wed, Jan 11, 2023 at 12:40 PM Parav Pandit  wrote:
> >
> >
> > > From: Jason Wang 
> > > Sent: Tuesday, January 10, 2023 11:35 PM
> > >
> > > On Tue, Jan 10, 2023 at 11:02 AM Parav Pandit  wrote:
> > > >
> > > > Hi Jason,
> > > >
> > > > > From: Jason Wang 
> > > > > Sent: Monday, December 5, 2022 10:25 PM
> > > >
> > > > >
> > > > > A dumb question, any reason we need bother with virtio-net? It looks
> > > > > to me it's not a must and would complicate migration compatibility.
> > > >
> > > > Virtio net vdpa device is processing the descriptors out of order.
> > > > This vdpa device doesn’t offer IN_ORDER flag.
> > > >
> > > > And when a VQ is suspended it cannot complete these descriptors as some
> > > dummy zero length completions.
> > > > The guest VM is flooded with [1].
> > >
> > > Yes, but any reason for the device to do out-of-order for RX?
> > >
> > For some devices it is more optimal to process them out of order.
> > And its not limited to RX.
> 
> TX should be fine, since the device can anyhow pretend to send all
> packets, so we won't have any in-flight descriptors.

And drop them all? You end up with multisecond delays for things like
DHCP. Yes theoretically packets can be dropped at any time, but
practically people expect this to happen on busy systems, not randomly
out of the blue.

> >
> > > >
> > > > So it is needed for the devices that doesn’t offer IN_ORDER feature.
> > > >
> > > > [1]
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tre
> > > > e/drivers/net/virtio_net.c?h=v6.2-rc3#n1252
> > >
> > > It is only enabled in a debug kernel which should be harmless?
> > it is KERN_DEBUG log level. Its is not debug kernel, just the debug log 
> > level.
> 
> Ok, but the production environment should not use that level anyhow.

It's just one example.  And it's enough in my eyes to prove we really
can't start sending zero length RX buffers to drivers and expect all to be
well. If we want to we need to negotiate a new feature bit.

> > And regardless, generating zero length packets for debug kernel is even 
> > more confusing.
> 
> Note that it is allowed in the virtio-spec[1] (we probably can fix
> that in the driver) and we have pr_debug() all over this drivers and
> other places. It doesn't cause any side effects except for the
> debugging purpose.
> 
> So I think having inflight tracking is useful, but I'm not sure it's
> worth bothering with virtio-net (or worth to bothering now):
> 
> - zero length is allowed
> - it only helps for debugging
> - may cause issues for migration compatibility
> - requires new infrastructure to be invented
> 
> Thanks
> 
> [1] spec said
> 
> "
> Note: len is particularly useful for drivers using untrusted buffers:
> if a driver does not know exactly how much has been written by the
> device, the driver would have to zero the buffer in advance to ensure
> no data leakage occurs.
> "

I don't think this talks about zero length at all.
Let me try to explain what this talk about in my opinion.

There are cases where device does not know exactly how much
data it wrote into buffer. Should it over-estimate
such that driver can be sure that buffer after the reported
length is unchanged? Or should it instead under-estimate
such that driver can be sure that the reported length has
been initialized by device?

What this text in the spec says is that it must always
under-estimate and not over-estimate. And it attempts to
explain why this is useful: imagine driver that trusts the
device and wants to make sure buffer is initialized.
With the definition in the spec, it only needs to initialize
data after the reported length. Initialize how? It's up to the
driver but for example it can zero this buffer.

In short, all the text says is "do not over-report length,
only set it to part of buffer you wrote".

Besides that, the text itself is from the original spec and it did not
age well:

1)- no one actually relies on this

2)- rather than untrusted "buffers" what we commonly have is untrusted
  devices so length can't be trusted either

3)- writes on PCI are posted and if your security model
  depends on buffer being initialized and you want to
  recover from errors you really can't expect device to
  give you this info. Luckily no one cares see 1) above.

-- 
MST

[PATCH] scsi/lsi53c895a: restrict DMA engine to memory regions (CVE-2023-0330)

2023-01-16 Thread Mauro Matteo Cascella

This prevents the well known DMA-MMIO reentrancy problem (upstream issue #556)
leading to memory corruption bugs like stack overflow or use-after-free.

Fixes: CVE-2023-0330
Signed-off-by: Mauro Matteo Cascella 
Reported-by: Zheyu Ma 
---
 hw/scsi/lsi53c895a.c   | 14 +
 tests/qtest/fuzz-lsi53c895a-test.c | 32 ++
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/hw/scsi/lsi53c895a.c b/hw/scsi/lsi53c895a.c
index af93557a9a..89c52594eb 100644
--- a/hw/scsi/lsi53c895a.c
+++ b/hw/scsi/lsi53c895a.c
@@ -446,22 +446,28 @@ static void lsi_reselect(LSIState *s, lsi_request *p);
 static inline void lsi_mem_read(LSIState *s, dma_addr_t addr,
void *buf, dma_addr_t len)
 {
+const MemTxAttrs attrs = { .memory = true };
+
 if (s->dmode & LSI_DMODE_SIOM) {
-address_space_read(>pci_io_as, addr, MEMTXATTRS_UNSPECIFIED,
+address_space_read(>pci_io_as, addr, attrs,
buf, len);
 } else {
-pci_dma_read(PCI_DEVICE(s), addr, buf, len);
+pci_dma_rw(PCI_DEVICE(s), addr, buf, len,
+  DMA_DIRECTION_TO_DEVICE, attrs);
 }
 }
 
 static inline void lsi_mem_write(LSIState *s, dma_addr_t addr,
 const void *buf, dma_addr_t len)
 {
+const MemTxAttrs attrs = { .memory = true };
+
 if (s->dmode & LSI_DMODE_DIOM) {
-address_space_write(>pci_io_as, addr, MEMTXATTRS_UNSPECIFIED,
+address_space_write(>pci_io_as, addr, attrs,
 buf, len);
 } else {
-pci_dma_write(PCI_DEVICE(s), addr, buf, len);
+pci_dma_rw(PCI_DEVICE(s), addr, (void *) buf, len,
+  DMA_DIRECTION_FROM_DEVICE, attrs);
 }
 }
 
diff --git a/tests/qtest/fuzz-lsi53c895a-test.c 
b/tests/qtest/fuzz-lsi53c895a-test.c
index 392a7ae7ed..35c02e89f3 100644
--- a/tests/qtest/fuzz-lsi53c895a-test.c
+++ b/tests/qtest/fuzz-lsi53c895a-test.c
@@ -8,6 +8,35 @@
 #include "qemu/osdep.h"
 #include "libqtest.h"
 
+/*
+ * This used to trigger a DMA reentrancy issue
+ * leading to memory corruption bugs like stack
+ * overflow or use-after-free
+ */
+static void test_lsi_dma_reentrancy(void)
+{
+QTestState *s;
+
+s = qtest_init("-M q35 -m 512M -nodefaults "
+   "-blockdev driver=null-co,node-name=null0 "
+   "-device lsi53c810 -device scsi-cd,drive=null0");
+
+qtest_outl(s, 0xcf8, 0x8804); /* PCI Command Register */
+qtest_outw(s, 0xcfc, 0x7);/* Enables accesses */
+qtest_outl(s, 0xcf8, 0x8814); /* Memory Bar 1 */
+qtest_outl(s, 0xcfc, 0xff10); /* Set MMIO Address*/
+qtest_outl(s, 0xcf8, 0x8818); /* Memory Bar 2 */
+qtest_outl(s, 0xcfc, 0xff00); /* Set RAM Address*/
+qtest_writel(s, 0xff00, 0xc024);
+qtest_writel(s, 0xff000114, 0x0080);
+qtest_writel(s, 0xff00012c, 0xff00);
+qtest_writel(s, 0xff04, 0xff000114);
+qtest_writel(s, 0xff08, 0xff100014);
+qtest_writel(s, 0xff10002f, 0x00ff);
+
+qtest_quit(s);
+}
+
 /*
  * This used to trigger a UAF in lsi_do_msgout()
  * https://gitlab.com/qemu-project/qemu/-/issues/972
@@ -120,5 +149,8 @@ int main(int argc, char **argv)
 qtest_add_func("fuzz/lsi53c895a/lsi_do_msgout_cancel_req",
test_lsi_do_msgout_cancel_req);
 
+qtest_add_func("fuzz/lsi53c895a/lsi_dma_reentrancy",
+   test_lsi_dma_reentrancy);
+
 return g_test_run();
 }
-- 
2.39.0

Re: [PATCH v14 01/11] s390x/cpu topology: adding s390 specificities to CPU topology

2023-01-16 Thread Nina Schoetterl-Glausch

On Mon, 2023-01-16 at 18:28 +0100, Pierre Morel wrote:
> 
> On 1/13/23 17:58, Nina Schoetterl-Glausch wrote:
> > On Thu, 2023-01-05 at 15:53 +0100, Pierre Morel wrote:
> > > S390 adds two new SMP levels, drawers and books to the CPU
> > > topology.
> > > The S390 CPU have specific toplogy features like dedication
> > > and polarity to give to the guest indications on the host
> > > vCPUs scheduling and help the guest take the best decisions
> > > on the scheduling of threads on the vCPUs.
> > > 
> > > Let us provide the SMP properties with books and drawers levels
> > > and S390 CPU with dedication and polarity,
> > > 
> > > Signed-off-by: Pierre Morel 
> > > ---
> > >   qapi/machine.json   | 14 --
> > >   include/hw/boards.h | 10 ++-
> > >   include/hw/s390x/cpu-topology.h | 23 
> > >   target/s390x/cpu.h  |  6 +
> > >   hw/core/machine-smp.c   | 48 -
> > >   hw/core/machine.c   |  4 +++
> > >   hw/s390x/s390-virtio-ccw.c  |  2 ++
> > >   softmmu/vl.c|  6 +
> > >   target/s390x/cpu.c  | 10 +++
> > >   qemu-options.hx |  6 +++--
> > >   10 files changed, 117 insertions(+), 12 deletions(-)
> > >   create mode 100644 include/hw/s390x/cpu-topology.h
> > > 
> > [...]
> > 
> > > diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
> > > index 7d6d01325b..39ea63a416 100644
> > > --- a/target/s390x/cpu.h
> > > +++ b/target/s390x/cpu.h
> > > @@ -131,6 +131,12 @@ struct CPUArchState {
> > >   
> > >   #if !defined(CONFIG_USER_ONLY)
> > >   uint32_t core_id; /* PoP "CPU address", same as cpu_index */
> > > +int32_t socket_id;
> > > +int32_t book_id;
> > > +int32_t drawer_id;
> > > +int32_t dedicated;
> > > +int32_t polarity;
> > 
> > If I understood the architecture correctly, the polarity is a property of 
> > the configuration,
> > not the cpus. So this should be vertical_entitlement, and there should be a 
> > machine (?) property
> > specifying if the polarity is horizontal or vertical.
> 
> You are right, considering PTF only, the documentation says PTF([01]) 
> does the following:
> 
> "... a process is initiated to place all CPUs in the configuration into 
> the polarization specified by the function code, ..."
> 
> So on one side the polarization property is explicitly set on the CPU, 
> and on the other side all CPU are supposed to be in the same 
> polarization state.

I'm worried about STSI showing both horizontal and vertical CPUs at the same 
time.
I don't know if this is allowed.
If it is not, you need a way to switch between those atomically, which is harder
if every CPU has this property.
> 
> So yes we can make the horizontal/vertical a machine property.
> However, we do not need to set this tunable as the documentation says 
> that the machine always start with horizontal polarization.
> 
> On the other hand the documentation mixes a lot vertical with different 
> entitlement and horizontal polarization, for TLE order and slacks so I 
> prefer to keep the complete description of the polarization as CPU 
> properties in case we miss something.
> 
> PTF([01]) are no performance bottle neck and the number of CPU is likely 
> to be small, even a maximum of 248 is possible KVM warns above 16 CPU so 
> the loop for setting all CPU inside PTF interception is not very 
> problematic I think.

Yeah, I'm not worried about that.
> 
> Doing like you say should simplify PTF interception (no loop) but 
> complicates (some more if/else) TLE handling and QMP information display 
> on CPU.
> So I will have a look at the implications and answer again on this.
> 
> Thanks,
> 
> Regards,
> Pierre
>

Re: [RFC PATCH] tests/tcg: skip the vma-pthread test on CI

2023-01-16 Thread Alex Bennée

I did consider it but it would involve messing about with filter to remove
the test from the wildcards. This way we don't forget about it when looking
through the logs.

I've not been able to get this to fail on any other machine though. It's
been rock solid over several thousand runs.

On Mon, 16 Jan 2023, 19:25 Richard Henderson, 
wrote:

> On 1/16/23 07:32, Alex Bennée wrote:
> > diff --git a/tests/tcg/multiarch/Makefile.target
> b/tests/tcg/multiarch/Makefile.target
> > index e7213af492..ae8b3d7268 100644
> > --- a/tests/tcg/multiarch/Makefile.target
> > +++ b/tests/tcg/multiarch/Makefile.target
> > @@ -42,6 +42,15 @@ munmap-pthread: LDFLAGS+=-pthread
> >   vma-pthread: CFLAGS+=-pthread
> >   vma-pthread: LDFLAGS+=-pthread
> >
> > +# The vma-pthread seems very sensitive on gitlab and we currently
> > +# don't know if its exposing a real bug or the test is flaky.
> > +ifneq ($(GITLAB_CI),)
> > +run-vma-pthread: vma-pthread
> > + $(call skip-test, $<, "flaky on CI?")
> > +run-plugin-vma-pthread-with-%: vma-pthread
> > + $(call skip-test, $<, "flaky on CI?")
> > +endif
> > +
>
> Ok I guess.  I'd have thought the ifdef around the entire mention of the
> test would be
> better -- no point in even building it.  But,
>
> Acked-by: Richard Henderson 
>
>
> r~
>

Re: [PATCH 0/1] hw/ide: share bmdma read and write functions

2023-01-16 Thread John Snow

On Fri, Jan 13, 2023 at 9:10 AM Liav Albani  wrote:
>
>
> On 1/11/23 01:07, Bernhard Beschow wrote:
>
> Am 9. Januar 2023 19:24:16 UTC schrieb John Snow :
>
> On Tue, Sep 6, 2022 at 10:27 AM Bernhard Beschow  wrote:
>
> Am 19. Februar 2022 08:08:17 UTC schrieb Liav Albani :
>
> This is a preparation before I send v3 of ich6-ide controller emulation patch.
> I figured that it's more trivial to split the changes this way, by extracting
> the bmdma functions from via.c and piix.c and sharing them together. Then,
> I could easily put these into use when I send v3 of the ich6-ide patch by just
> using the already separated functions. This was suggested by BALATON Zoltan 
> when
> he submitted a code review on my ich6-ide controller emulation patch.
>
> Ping. Any news?
>
> *cough*.
>
> Has this been folded into subsequent series, or does this still need 
> attention?
>
> Both piix and via still have their own bmdma implementations. This patch 
> might be worth having.
>
> Best regards,
> Bernhard
>
> I see. Since you are still interested, I will try to see what was the outcome 
> of that patch as I really don't remember if it passed the CI tests, etc. If 
> applicable, I will send this as v2, or if it's already approved, then I guess 
> we could just let it be merged to the tree?
>

I was just going to run some smoke tests on it and as long as it
didn't hurt anything, I'd wave it in. If you want it alongside other
patches that I also should stage, you can bundle them if you'd like.
Just let me know what you plan on doing.

--js

Re: [PATCH 0/2] target/arm: Look up ARMCPRegInfo at runtime

2023-01-16 Thread Richard Henderson


Ping.

r~

On 1/6/23 09:44, Richard Henderson wrote:

Here's a short-to-medium term alternative to moving all of the ARMCPU
cp_regs hash table to the ARMCPUClass, so that we're no longer leaving
dangling pointers to freed objects encoded in the compiled
TranslationBlocks.  (I still think we ought to do less work at
object_{init,realize}, but that may be a much longer term project.)

Instead of giving the helper a direct pointer, pass the cpreg hash key,
which will be constant across cpus.  Perform this lookup in the existing
helper_access_check_cp_reg (which had a return value going spare), or a
new helper_lookup_cp_reg.  The other cp_regs functions are unchanged,
because they still get a pointer.

This ought to be enough to re-instate Alex's linux-user patch
to free the cpu object after thread termination.


r~


Richard Henderson (2):
   target/arm: Reorg do_coproc_insn
   target/arm: Look up ARMCPRegInfo at runtime

  target/arm/helper.h|  11 +-
  target/arm/translate.h |   7 +
  target/arm/op_helper.c |  27 ++-
  target/arm/translate-a64.c |  49 +++--
  target/arm/translate.c | 430 +++--
  5 files changed, 285 insertions(+), 239 deletions(-)

Re: [PATCH 1/1] Fix some typos

2023-01-16 Thread John Snow

On Mon, Jan 16, 2023 at 1:11 PM Laurent Vivier  wrote:
>
> Le 30/11/2022 à 09:29, Philippe Mathieu-Daudé a écrit :
> > On 30/11/22 02:53, Dongdong Zhang wrote:
> >> Fix some typos in 'python' directory.
> >>
> >> Signed-off-by: Dongdong Zhang 
> >
> > Reviewed-by: Philippe Mathieu-Daudé 
> >
> >> ---
> >>   python/qemu/machine/console_socket.py | 2 +-
> >>   python/qemu/machine/qtest.py  | 2 +-
> >>   python/qemu/qmp/protocol.py   | 2 +-
> >>   python/qemu/qmp/qmp_tui.py| 6 +++---
> >>   4 files changed, 6 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/python/qemu/machine/console_socket.py 
> >> b/python/qemu/machine/console_socket.py
> >> index 8c4ff598ad..4e28ba9bb2 100644
> >> --- a/python/qemu/machine/console_socket.py
> >> +++ b/python/qemu/machine/console_socket.py
> >> @@ -68,7 +68,7 @@ def _thread_start(self) -> threading.Thread:
> >>   """Kick off a thread to drain the socket."""
> >>   # Configure socket to not block and timeout.
> >>   # This allows our drain thread to not block
> >> -# on recieve and exit smoothly.
> >> +# on receive and exit smoothly.
> >>   socket.socket.setblocking(self, False)
> >>   socket.socket.settimeout(self, 1)
> >>   drain_thread = threading.Thread(target=self._drain_fn)
> >> diff --git a/python/qemu/machine/qtest.py b/python/qemu/machine/qtest.py
> >> index 1a1fc6c9b0..906bd13298 100644
> >> --- a/python/qemu/machine/qtest.py
> >> +++ b/python/qemu/machine/qtest.py
> >> @@ -42,7 +42,7 @@ class QEMUQtestProtocol:
> >>   :raise socket.error: on socket connection errors
> >>   .. note::
> >> -   No conection is estabalished by __init__(), this is done
> >> +   No connection is estabalished by __init__(), this is done
> >>  by the connect() or accept() methods.
> >>   """
> >>   def __init__(self, address: SocketAddrT,
> >> diff --git a/python/qemu/qmp/protocol.py b/python/qemu/qmp/protocol.py
> >> index 6ea86650ad..15909b7dba 100644
> >> --- a/python/qemu/qmp/protocol.py
> >> +++ b/python/qemu/qmp/protocol.py
> >> @@ -812,7 +812,7 @@ def _done(task: Optional['asyncio.Future[Any]']) -> 
> >> bool:
> >>   @bottom_half
> >>   async def _bh_close_stream(self, error_pathway: bool = False) -> 
> >> None:
> >> -# NB: Closing the writer also implcitly closes the reader.
> >> +# NB: Closing the writer also implicitly closes the reader.
> >>   if not self._writer:
> >>   return
> >> diff --git a/python/qemu/qmp/qmp_tui.py b/python/qemu/qmp/qmp_tui.py
> >> index ce239d8979..8369144723 100644
> >> --- a/python/qemu/qmp/qmp_tui.py
> >> +++ b/python/qemu/qmp/qmp_tui.py
> >> @@ -71,7 +71,7 @@ def format_json(msg: str) -> str:
> >>   due to an decoding error then a simple string manipulation is done to
> >>   achieve a single line JSON string.
> >> -Converting into single line is more asthetically pleasing when looking
> >> +Converting into single line is more aesthetically pleasing when 
> >> looking
> >>   along with error messages.
> >>   Eg:
> >> @@ -91,7 +91,7 @@ def format_json(msg: str) -> str:
> >>   [1, true, 3]: QMP message is not a JSON object.
> >> -The single line mode is more asthetically pleasing.
> >> +The single line mode is more aesthetically pleasing.
> >>   :param msg:
> >>   The message to formatted into single line.
> >> @@ -498,7 +498,7 @@ def __init__(self, parent: App) -> None:
> >>   class HistoryBox(urwid.ListBox):
> >>   """
> >>   This widget is modelled using the ListBox widget, contains the list 
> >> of
> >> -all messages both QMP messages and log messsages to be shown in the 
> >> TUI.
> >> +all messages both QMP messages and log messages to be shown in the 
> >> TUI.
> >>   The messages are urwid.Text widgets. On every append of a message, 
> >> the
> >>   focus is shifted to the last appended message.
> >
> >
>
> Applied to my trivial-patches branch.
>
> Thanks,
> Laurent

Laurent, I'll grab this one, sorry!

RE: [RFC PATCH for 8.0 10/13] virtio-net: Migrate vhost inflight descriptors

2023-01-16 Thread Parav Pandit


> From: Jason Wang 
> Sent: Wednesday, January 11, 2023 12:51 AM
> 
> On Wed, Jan 11, 2023 at 12:40 PM Parav Pandit  wrote:
> >
> >
> > > From: Jason Wang 
> > > Sent: Tuesday, January 10, 2023 11:35 PM
> > >
> > > On Tue, Jan 10, 2023 at 11:02 AM Parav Pandit  wrote:
> > > >
> > > > Hi Jason,
> > > >
> > > > > From: Jason Wang 
> > > > > Sent: Monday, December 5, 2022 10:25 PM
> > > >
> > > > >
> > > > > A dumb question, any reason we need bother with virtio-net? It
> > > > > looks to me it's not a must and would complicate migration
> compatibility.
> > > >
> > > > Virtio net vdpa device is processing the descriptors out of order.
> > > > This vdpa device doesn’t offer IN_ORDER flag.
> > > >
> > > > And when a VQ is suspended it cannot complete these descriptors as
> > > > some
> > > dummy zero length completions.
> > > > The guest VM is flooded with [1].
> > >
> > > Yes, but any reason for the device to do out-of-order for RX?
> > >
> > For some devices it is more optimal to process them out of order.
> > And its not limited to RX.
> 
> TX should be fine, since the device can anyhow pretend to send all packets, so
> we won't have any in-flight descriptors.
> 
> >
> > > >
> > > > So it is needed for the devices that doesn’t offer IN_ORDER feature.
> > > >
> > > > [1]
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
> > > > /tre
> > > > e/drivers/net/virtio_net.c?h=v6.2-rc3#n1252
> > >
> > > It is only enabled in a debug kernel which should be harmless?
> > it is KERN_DEBUG log level. Its is not debug kernel, just the debug log 
> > level.
> 
> Ok, but the production environment should not use that level anyhow.
> 
> > And regardless, generating zero length packets for debug kernel is even more
> confusing.
> 
> Note that it is allowed in the virtio-spec[1] (we probably can fix that in the
> driver) and we have pr_debug() all over this drivers and other places. It 
> doesn't
> cause any side effects except for the debugging purpose.
> 
> So I think having inflight tracking is useful, but I'm not sure it's worth 
> bothering
> with virtio-net (or worth to bothering now):
> 
> - zero length is allowed
This isn’t explicitly called out. It may be worth to add to the spec.

> - it only helps for debugging
> - may cause issues for migration compatibility
We have this missing for long time regardless of this feature. So let's not mix 
here.

The mlx5 vdpa device can do eventual in-order completion before/at time of 
suspend, so I think we can wait for now to until more advance hw arrives.

> - requires new infrastructure to be invented
> 
> Thanks
> 
> [1] spec said
> 
This doesn’t say that its zero-length completion. Len is a mandatory field to 
tell how many bytes device wrote.
> "
> Note: len is particularly useful for drivers using untrusted buffers:
> if a driver does not know exactly how much has been written by the device, the
> driver would have to zero the buffer in advance to ensure no data leakage
> occurs.
> "

Re: [PATCH v6 23/51] i386/xen: implement HYPERVISOR_event_channel_op

2023-01-16 Thread David Woodhouse

On Mon, 2023-01-16 at 17:59 +, Paul Durrant wrote:
> > +
> > +    switch (cmd) {
> > +    case EVTCHNOP_init_control:
> > +    err = -ENOSYS;
> > +    break;
> 
> The commit comment doesn't explain why the above op is singled out for 
> this treatment. I assume it is because there is no intention to 
> implement FIFO event channels in subsequent patches, but it'd be nice to 
> say so here.

Indeed. I added EVTCHNOP_expand_array and EVTCHNOP_set_priority too, while I'm 
at it.


smime.p7s
Description: S/MIME cryptographic signature

Re: [PATCH v6 14/51] i386/xen: add pc_machine_kvm_type to initialize XEN_EMULATE mode

2023-01-16 Thread David Woodhouse

On Mon, 2023-01-16 at 17:17 +, Paul Durrant wrote:
> ASS(oc); \
> >    optsfn(mc); \
> >    mc->init = initfn; \
> > +    mc->kvm_type = pc_machine_kvm_type; \
> 
> Given that it does nothing in the non-Xen-emulate case, would it not be 
> neater to simply wrap the above line, and the definition of the 
> function, in #ifdef CONFIG_XEN_EMU?

I did it that way first, but you just end up with *more* ifdefs that way.

I'll fix up what looks like a stray tab vs. space issue though; I
thought I ran it all through checkpatch and fixed that kind of thing.

smime.p7s
Description: S/MIME cryptographic signature

Re: [RFC PATCH] tests/tcg: skip the vma-pthread test on CI

2023-01-16 Thread Richard Henderson


On 1/16/23 07:32, Alex Bennée wrote:

diff --git a/tests/tcg/multiarch/Makefile.target 
b/tests/tcg/multiarch/Makefile.target
index e7213af492..ae8b3d7268 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -42,6 +42,15 @@ munmap-pthread: LDFLAGS+=-pthread
  vma-pthread: CFLAGS+=-pthread
  vma-pthread: LDFLAGS+=-pthread
  
+# The vma-pthread seems very sensitive on gitlab and we currently

+# don't know if its exposing a real bug or the test is flaky.
+ifneq ($(GITLAB_CI),)
+run-vma-pthread: vma-pthread
+   $(call skip-test, $<, "flaky on CI?")
+run-plugin-vma-pthread-with-%: vma-pthread
+   $(call skip-test, $<, "flaky on CI?")
+endif
+


Ok I guess.  I'd have thought the ifdef around the entire mention of the test would be 
better -- no point in even building it.  But,


Acked-by: Richard Henderson 


r~

Re: [PATCH 4/4] tests/tcg/multiarch: add vma-pthread.c

2023-01-16 Thread Richard Henderson


On 1/16/23 06:27, Alex Bennée wrote:

Although looking at the test I'm beginning to wonder what the sync point
is between the mutator and the read/write threads?


What do you mean?

There is no explicit sync, because the mutator always leaves each page with the (one) 
permission that the read/write/execute thread requires.


Within qemu... the sync point is primarily the syscall for read/write, and the tb 
invalidate (during the mprotect) or the mmap lock for new translation for execute.



r~

Re: [PATCH trivial for 7.2 2/2] hw/virtio/virtio.c: spelling: suppoted

2023-01-16 Thread Laurent Vivier


Le 05/11/2022 à 12:48, Michael Tokarev a écrit :

Fixes: f3034ad71fcd0a6a58bc37830f182b307f089159
Signed-off-by: Michael Tokarev 
---
  hw/virtio/virtio.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 808446b4c9..e76218bdd5 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -340,7 +340,7 @@ qmp_virtio_feature_map_t virtio_net_feature_map[] = {
  qmp_virtio_feature_map_t virtio_scsi_feature_map[] = {
  FEATURE_ENTRY(VIRTIO_SCSI_F_INOUT, \
  "VIRTIO_SCSI_F_INOUT: Requests including read and writable data "
-"buffers suppoted"),
+"buffers supported"),
  FEATURE_ENTRY(VIRTIO_SCSI_F_HOTPLUG, \
  "VIRTIO_SCSI_F_HOTPLUG: Reporting and handling hot-plug events "
  "supported"),


This patch needs to be rebased.

This typo has been moved to hw/virtio/virtio-qmp.c

Thanks,
Laurent

Re: [PATCH trivial for 7.2] hw/ssi/sifive_spi.c: spelling: reigster

2023-01-16 Thread Laurent Vivier


Le 05/11/2022 à 12:53, Michael Tokarev a écrit :

Fixes: 0694dabe9763847f3010b54ab3ec7d367d2f0ff0
Signed-off-by: Michael Tokarev 
---
  hw/ssi/sifive_spi.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/ssi/sifive_spi.c b/hw/ssi/sifive_spi.c
index 03540cf5ca..1b4a401ca1 100644
--- a/hw/ssi/sifive_spi.c
+++ b/hw/ssi/sifive_spi.c
@@ -267,7 +267,7 @@ static void sifive_spi_write(void *opaque, hwaddr addr,
  case R_RXDATA:
  case R_IP:
  qemu_log_mask(LOG_GUEST_ERROR,
-  "%s: invalid write to read-only reigster 0x%"
+  "%s: invalid write to read-only register 0x%"
HWADDR_PRIx " with 0x%x\n", __func__, addr << 2, value);
  break;
  


Applied to my trivial-patches branch.

Thanks,
Laurent

Re: [PATCH v14 06/11] s390x/cpu topology: interception of PTF instruction

2023-01-16 Thread Nina Schoetterl-Glausch

On Thu, 2023-01-05 at 15:53 +0100, Pierre Morel wrote:
> When the host supports the CPU topology facility, the PTF
> instruction with function code 2 is interpreted by the SIE,
> provided that the userland hypervizor activates the interpretation
> by using the KVM_CAP_S390_CPU_TOPOLOGY KVM extension.
> 
> The PTF instructions with function code 0 and 1 are intercepted
> and must be emulated by the userland hypervizor.
> 
> During RESET all CPU of the configuration are placed in
> horizontal polarity.
> 
> Signed-off-by: Pierre Morel 
> ---
>  include/hw/s390x/cpu-topology.h|  3 +
>  include/hw/s390x/s390-virtio-ccw.h |  6 ++
>  target/s390x/cpu.h |  1 +
>  hw/s390x/cpu-topology.c| 92 ++
>  target/s390x/cpu-sysemu.c  | 16 ++
>  target/s390x/kvm/kvm.c | 11 
>  6 files changed, 129 insertions(+)
> 
> diff --git a/include/hw/s390x/cpu-topology.h b/include/hw/s390x/cpu-topology.h
> index 9571aa70e5..33e23d78b9 100644
> --- a/include/hw/s390x/cpu-topology.h
> +++ b/include/hw/s390x/cpu-topology.h
> @@ -55,11 +55,13 @@ typedef struct S390Topology {
>  QTAILQ_HEAD(, S390TopologyEntry) list;
>  uint8_t *sockets;
>  CpuTopology *smp;
> +int polarity;
>  } S390Topology;
>  
>  #ifdef CONFIG_KVM
>  bool s390_has_topology(void);
>  void s390_topology_set_cpu(MachineState *ms, S390CPU *cpu, Error **errp);
> +void s390_topology_set_polarity(int polarity);
>  #else
>  static inline bool s390_has_topology(void)
>  {
> @@ -68,6 +70,7 @@ static inline bool s390_has_topology(void)
>  static inline void s390_topology_set_cpu(MachineState *ms,
>   S390CPU *cpu,
>   Error **errp) {}
> +static inline void s390_topology_set_polarity(int polarity) {}
>  #endif
>  extern S390Topology s390_topology;
>  
> diff --git a/include/hw/s390x/s390-virtio-ccw.h 
> b/include/hw/s390x/s390-virtio-ccw.h
> index 9bba21a916..c1d46e78af 100644
> --- a/include/hw/s390x/s390-virtio-ccw.h
> +++ b/include/hw/s390x/s390-virtio-ccw.h
> @@ -30,6 +30,12 @@ struct S390CcwMachineState {
>  uint8_t loadparm[8];
>  };
>  
> +#define S390_PTF_REASON_NONE (0x00 << 8)
> +#define S390_PTF_REASON_DONE (0x01 << 8)
> +#define S390_PTF_REASON_BUSY (0x02 << 8)
> +#define S390_TOPO_FC_MASK 0xffUL
> +void s390_handle_ptf(S390CPU *cpu, uint8_t r1, uintptr_t ra);
> +
>  struct S390CcwMachineClass {
>  /*< private >*/
>  MachineClass parent_class;
> diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
> index 01ade07009..5da4041576 100644
> --- a/target/s390x/cpu.h
> +++ b/target/s390x/cpu.h
> @@ -864,6 +864,7 @@ void s390_do_cpu_set_diag318(CPUState *cs, 
> run_on_cpu_data arg);
>  int s390_assign_subch_ioeventfd(EventNotifier *notifier, uint32_t sch_id,
>  int vq, bool assign);
>  void s390_cpu_topology_reset(void);
> +void s390_cpu_topology_set(void);

I don't like this name much, it's nondescript.
s390_cpu_topology_set_modified ?

>  #ifndef CONFIG_USER_ONLY
>  unsigned int s390_cpu_set_state(uint8_t cpu_state, S390CPU *cpu);
>  #else
> diff --git a/hw/s390x/cpu-topology.c b/hw/s390x/cpu-topology.c
> index 438055c612..e6b4692581 100644
> --- a/hw/s390x/cpu-topology.c
> +++ b/hw/s390x/cpu-topology.c
> @@ -97,6 +97,98 @@ static s390_topology_id s390_topology_from_cpu(S390CPU 
> *cpu)
>  }
>  
>  /**
> + * s390_topology_set_polarity
> + * @polarity: horizontal or vertical
> + *
> + * Changes the polarity of all the CPU in the configuration.
> + *
> + * If the dedicated CPU modifier attribute is set a vertical
> + * polarization is always high (Architecture).
> + * Otherwise we decide to set it as medium.
> + *
> + * Once done, advertise a topology change.
> + */
> +void s390_topology_set_polarity(int polarity)

I don't like that this function ignores what kind of vertical polarization is 
passed,
it's confusing.
That seems like a further reason to split horizontal/vertical from the 
entitlement.

> +{
> +S390TopologyEntry *entry;

I also expected this function to set s390_topology.polarization, but it doesn't.
> +
> +QTAILQ_FOREACH(entry, _topology.list, next) {
> +if (polarity == S390_TOPOLOGY_POLARITY_HORIZONTAL) {
> +entry->id.p = polarity;
> +} else {
> +if (entry->id.d) {
> +entry->id.p = S390_TOPOLOGY_POLARITY_VERTICAL_HIGH;
> +} else {
> +entry->id.p = S390_TOPOLOGY_POLARITY_VERTICAL_MEDIUM;
> +}
> +}
> +}
> +s390_cpu_topology_set();
> +}
> +
> +/*
> + * s390_handle_ptf:
> + *
> + * @register 1: contains the function code
> + *
> + * Function codes 0 and 1 handle the CPU polarization.
> + * We assume an horizontal topology, the only one supported currently
> + * by Linux, consequently we answer to function code 0, requesting
> + * horizontal polarization that it is already the current polarization
> + * and reject vertical

Re: [PATCH v14 02/11] s390x/cpu topology: add topology entries on CPU hotplug

2023-01-16 Thread Pierre Morel





On 1/10/23 14:00, Thomas Huth wrote:

On 05/01/2023 15.53, Pierre Morel wrote:

The topology information are attributes of the CPU and are
specified during the CPU device creation.

On hot plug, we gather the topology information on the core,
creates a list of topology entries, each entry contains a single
core mask of each core with identical topology and finaly we
orders the list in topological order.
The topological order is, from higher to lower priority:
- physical topology
 - drawer
 - book
 - socket
 - core origin, offset in 64bit increment from core 0.
- modifier attributes
 - CPU type
 - polarization entitlement
 - dedication

The possibility to insert a CPU in a mask is dependent on the
number of cores allowed in a socket, a book or a drawer, the
checking is done during the hot plug of the CPU to have an
immediate answer.

If the complete topology is not specified, the core is added
in the physical topology based on its core ID and it gets
defaults values for the modifier attributes.

This way, starting QEMU without specifying the topology can
still get some adventage of the CPU topology.


s/adventage/advantage/


Signed-off-by: Pierre Morel 
---
  include/hw/s390x/cpu-topology.h |  48 ++
  hw/s390x/cpu-topology.c | 293 
  hw/s390x/s390-virtio-ccw.c  |  10 ++
  hw/s390x/meson.build    |   1 +
  4 files changed, 352 insertions(+)
  create mode 100644 hw/s390x/cpu-topology.c

diff --git a/include/hw/s390x/cpu-topology.h 
b/include/hw/s390x/cpu-topology.h

index d945b57fc3..b3fd752d8d 100644
--- a/include/hw/s390x/cpu-topology.h
+++ b/include/hw/s390x/cpu-topology.h
@@ -10,7 +10,11 @@
  #ifndef HW_S390X_CPU_TOPOLOGY_H
  #define HW_S390X_CPU_TOPOLOGY_H
+#include "qemu/queue.h"
+#include "hw/boards.h"
+
  #define S390_TOPOLOGY_CPU_IFL   0x03
+#define S390_TOPOLOGY_MAX_ORIGIN ((63 + S390_MAX_CPUS) / 64)
  #define S390_TOPOLOGY_POLARITY_HORIZONTAL  0x00
  #define S390_TOPOLOGY_POLARITY_VERTICAL_LOW    0x01
@@ -20,4 +24,48 @@
  #define S390_TOPOLOGY_SHARED    0x00
  #define S390_TOPOLOGY_DEDICATED 0x01
+typedef union s390_topology_id {
+    uint64_t id;
+    struct {
+    uint64_t level_6:8; /* byte 0 BE */
+    uint64_t level_5:8; /* byte 1 BE */
+    uint64_t drawer:8;  /* byte 2 BE */
+    uint64_t book:8;    /* byte 3 BE */
+    uint64_t socket:8;  /* byte 4 BE */
+    uint64_t rsrv:5;
+    uint64_t d:1;
+    uint64_t p:2;   /* byte 5 BE */
+    uint64_t type:8;    /* byte 6 BE */
+    uint64_t origin:2;
+    uint64_t core:6;    /* byte 7 BE */
+    };
+} s390_topology_id;


Bitmasks are OK for code that will definitely only ever work with KVM 
... but this will certainly fail completely if we ever try to get it 
running with TCG later. Do we care? ... if so, you should certainly 
avoid a bitfield here. Especially since most of the fields are 8-bit 
anyway and could easily be represented by a "uint8_t" variable. 
Otherwise, just ignore my comment.


The goal of using a bit mask here is not to use it with KVM but to have 
an easy way to order the TLE using the natural order of the placement of 
the fields in the uint64_t
However, if I remove the two unused levels 5 and 6 I can use uint8_t for 
all the entries.


I doubt we use the levels 5 and 6 in a short future.

So I switch on 1 uint8_t for each entry.

...


diff --git a/hw/s390x/cpu-topology.c b/hw/s390x/cpu-topology.c
new file mode 100644
index 00..438055c612
--- /dev/null
+++ b/hw/s390x/cpu-topology.c
@@ -0,0 +1,293 @@
+/*
+ * CPU Topology
+ *
+ * Copyright IBM Corp. 2022


Want to update to 2023 now?


+ * Author(s): Pierre Morel 
+
+ * This work is licensed under the terms of the GNU GPL, version 2 or 
(at

+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "hw/qdev-properties.h"
+#include "hw/boards.h"
+#include "qemu/typedefs.h"
+#include "target/s390x/cpu.h"
+#include "hw/s390x/s390-virtio-ccw.h"
+#include "hw/s390x/cpu-topology.h"
+
+/*
+ * s390_topology is used to keep the topology information.
+ * .list: queue the topology entries inside which
+ *    we keep the information on the CPU topology.
+ *
+ * .smp: keeps track of the machine topology.
+ *
+ * .socket: tracks information on the count of cores per socket.
+ *
+ */
+S390Topology s390_topology = {
+    .list = QTAILQ_HEAD_INITIALIZER(s390_topology.list),
+    .sockets = NULL, /* will be initialized after the cpu model is 
realized */

+};
+
+/**
+ * s390_socket_nb:
+ * @id: s390_topology_id
+ *
+ * Returns the socket number used inside the socket array.
+ */
+static int s390_socket_nb(s390_topology_id id)
+{
+    return (id.socket + 1) * (id.book + 1) * (id.drawer + 1); > +}
I think there might be an off-by-one error in here - you likely need a 
"- 1" at the very end.


For example, assume that

Re: [PATCH] hw/cxl/cxl-host: Fix an error message typo

2023-01-16 Thread Laurent Vivier


Le 27/11/2022 à 04:22, Hoa Nguyen a écrit :

Signed-off-by: Hoa Nguyen 
---
  hw/cxl/cxl-host.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-host.c b/hw/cxl/cxl-host.c
index 1adf61231a..3c1ec8732a 100644
--- a/hw/cxl/cxl-host.c
+++ b/hw/cxl/cxl-host.c
@@ -47,7 +47,7 @@ static void cxl_fixed_memory_window_config(CXLState 
*cxl_state,
  
  if (object->size % (256 * MiB)) {

  error_setg(errp,
-   "Size of a CXL fixed memory window must my a multiple of 
256MiB");
+   "Size of a CXL fixed memory window must be a multiple of 
256MiB");
  return;
  }
  fw->size = object->size;


Applied to my trivial-patches branch.

Thanks,
Laurent

Re: [PATCH] ui/vnc.c: Allow websocket connections over AF_UNIX sockets

2023-01-16 Thread Laurent Vivier


Hi Gerd,

If this patch is correct I can queue it via trivial branch.

Thanks,
Laurent

Le 02/12/2022 à 16:12, Pierre-Yves Ritschard a écrit :

Hi,

The provided patch allows the VNC websocket server of a qemu process to
be provided over AF_UNIX as it is already possible for standard TCP VNC
servers.

Now that many clients support websocket connections, some exclusively,
it can be useful to expose the VNC server. One such case is when a
proxy is present on a host machine, allowing it to proxy to a
deterministic location withouth having to keep track of port mappings.

Removing the condition as is done in the provided patch creates a
functional server. If there's a downside to this approach I could not
figure it out while reading the code. My hunch was that the condition
was there for a reason, but it did not jump at me.

Cheers,
   - pyr

---
  ui/vnc.c | 5 -
  1 file changed, 5 deletions(-)

diff --git ui/vnc.c ui/vnc.c
index 88f55cbf3c..b01a655400 100644
--- ui/vnc.c
+++ ui/vnc.c
@@ -3729,11 +3729,6 @@ static int vnc_display_get_address(const char *addrstr,
  addr->type = SOCKET_ADDRESS_TYPE_UNIX;
  addr->u.q_unix.path = g_strdup(addrstr + 5);
  
-if (websocket) {

-error_setg(errp, "UNIX sockets not supported with websock");
-goto cleanup;
-}
-
  if (to) {
  error_setg(errp, "Port range not support with UNIX socket");
  goto cleanup;

Re: [PATCH 1/1] Fix some typos

2023-01-16 Thread Laurent Vivier


Le 30/11/2022 à 09:29, Philippe Mathieu-Daudé a écrit :

On 30/11/22 02:53, Dongdong Zhang wrote:

Fix some typos in 'python' directory.

Signed-off-by: Dongdong Zhang 


Reviewed-by: Philippe Mathieu-Daudé 


---
  python/qemu/machine/console_socket.py | 2 +-
  python/qemu/machine/qtest.py  | 2 +-
  python/qemu/qmp/protocol.py   | 2 +-
  python/qemu/qmp/qmp_tui.py    | 6 +++---
  4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/qemu/machine/console_socket.py 
b/python/qemu/machine/console_socket.py
index 8c4ff598ad..4e28ba9bb2 100644
--- a/python/qemu/machine/console_socket.py
+++ b/python/qemu/machine/console_socket.py
@@ -68,7 +68,7 @@ def _thread_start(self) -> threading.Thread:
  """Kick off a thread to drain the socket."""
  # Configure socket to not block and timeout.
  # This allows our drain thread to not block
-    # on recieve and exit smoothly.
+    # on receive and exit smoothly.
  socket.socket.setblocking(self, False)
  socket.socket.settimeout(self, 1)
  drain_thread = threading.Thread(target=self._drain_fn)
diff --git a/python/qemu/machine/qtest.py b/python/qemu/machine/qtest.py
index 1a1fc6c9b0..906bd13298 100644
--- a/python/qemu/machine/qtest.py
+++ b/python/qemu/machine/qtest.py
@@ -42,7 +42,7 @@ class QEMUQtestProtocol:
  :raise socket.error: on socket connection errors
  .. note::
-   No conection is estabalished by __init__(), this is done
+   No connection is estabalished by __init__(), this is done
 by the connect() or accept() methods.
  """
  def __init__(self, address: SocketAddrT,
diff --git a/python/qemu/qmp/protocol.py b/python/qemu/qmp/protocol.py
index 6ea86650ad..15909b7dba 100644
--- a/python/qemu/qmp/protocol.py
+++ b/python/qemu/qmp/protocol.py
@@ -812,7 +812,7 @@ def _done(task: Optional['asyncio.Future[Any]']) -> bool:
  @bottom_half
  async def _bh_close_stream(self, error_pathway: bool = False) -> None:
-    # NB: Closing the writer also implcitly closes the reader.
+    # NB: Closing the writer also implicitly closes the reader.
  if not self._writer:
  return
diff --git a/python/qemu/qmp/qmp_tui.py b/python/qemu/qmp/qmp_tui.py
index ce239d8979..8369144723 100644
--- a/python/qemu/qmp/qmp_tui.py
+++ b/python/qemu/qmp/qmp_tui.py
@@ -71,7 +71,7 @@ def format_json(msg: str) -> str:
  due to an decoding error then a simple string manipulation is done to
  achieve a single line JSON string.
-    Converting into single line is more asthetically pleasing when looking
+    Converting into single line is more aesthetically pleasing when looking
  along with error messages.
  Eg:
@@ -91,7 +91,7 @@ def format_json(msg: str) -> str:
  [1, true, 3]: QMP message is not a JSON object.
-    The single line mode is more asthetically pleasing.
+    The single line mode is more aesthetically pleasing.
  :param msg:
  The message to formatted into single line.
@@ -498,7 +498,7 @@ def __init__(self, parent: App) -> None:
  class HistoryBox(urwid.ListBox):
  """
  This widget is modelled using the ListBox widget, contains the list of
-    all messages both QMP messages and log messsages to be shown in the TUI.
+    all messages both QMP messages and log messages to be shown in the TUI.
  The messages are urwid.Text widgets. On every append of a message, the
  focus is shifted to the last appended message.





Applied to my trivial-patches branch.

Thanks,
Laurent

Re: [PATCH v8] xen/pt: reserve PCI slot 2 for Intel igd-passthru

2023-01-16 Thread Chuck Zmudzinski

On 1/16/23 10:33, Igor Mammedov wrote:
> On Fri, 13 Jan 2023 16:31:26 -0500
> Chuck Zmudzinski  wrote:
> 
>> On 1/13/23 4:33 AM, Igor Mammedov wrote:
>> > On Thu, 12 Jan 2023 23:14:26 -0500
>> > Chuck Zmudzinski  wrote:
>> >   
>> >> On 1/12/23 6:03 PM, Michael S. Tsirkin wrote:  
>> >> > On Thu, Jan 12, 2023 at 10:55:25PM +, Bernhard Beschow wrote:
>> >> >> I think the change Michael suggests is very minimalistic: Move the if
>> >> >> condition around xen_igd_reserve_slot() into the function itself and
>> >> >> always call it there unconditionally -- basically turning three lines
>> >> >> into one. Since xen_igd_reserve_slot() seems very problem specific,
>> >> >> Michael further suggests to rename it to something more general. All
>> >> >> in all no big changes required.
>> >> > 
>> >> > yes, exactly.
>> >> > 
>> >> 
>> >> OK, got it. I can do that along with the other suggestions.  
>> > 
>> > have you considered instead of reservation, putting a slot check in device 
>> > model
>> > and if it's intel igd being passed through, fail at realize time  if it 
>> > can't take
>> > required slot (with a error directing user to fix command line)?  
>> 
>> Yes, but the core pci code currently already fails at realize time
>> with a useful error message if the user tries to use slot 2 for the
>> igd, because of the xen platform device which has slot 2. The user
>> can fix this without patching qemu, but having the user fix it on
>> the command line is not the best way to solve the problem, primarily
>> because the user would need to hotplug the xen platform device via a
>> command line option instead of having the xen platform device added by
>> pc_xen_hvm_init functions almost immediately after creating the pci
>> bus, and that delay in adding the xen platform device degrades
>> startup performance of the guest.
>> 
>> > That could be less complicated than dealing with slot reservations at the 
>> > cost of
>> > being less convenient.  
>> 
>> And also a cost of reduced startup performance
> 
> Could you clarify how it affects performance (and how much).
> (as I see, setup done at board_init time is roughly the same
> as with '-device foo' CLI options, modulo time needed to parse
> options which should be negligible. and both ways are done before
> guest runs)

I preface my answer by saying there is a v9, but you don't
need to look at that. I will answer all your questions here.

I am going by what I observe on the main HDMI display with the
different approaches. With the approach of not patching Qemu
to fix this, which requires adding the Xen platform device a
little later, the length of time it takes to fully load the
guest is increased. I also noticed with Linux guests that use
the grub bootoader, the grub vga driver cannot display the
grub boot menu at the native resolution of the display, which
in the tested case is 1920x1080, when the Xen platform device
is added via a command line option instead of by the
pc_xen_hvm_init_pci fucntion in pc_piix.c, but with this patch
to Qemu, the grub menu is displayed at the full, 1920x1080
native resolution of the display. Once the guest fully loads,
there is no noticeable difference in performance. It is mainly
a degradation in startup performance, not performance once
the guest OS is fully loaded.

> 
>> However, the performance hit can be prevented by assigning slot
>> 3 instead of slot 2 for the xen platform device if igd passthrough
>> is configured on the command line instead of doing slot reservation,
>> but there would still be less convenience and, for libxl users, an
>> inability to easily configure the command line so that the igd can
>> still have slot 2 without a hacky and error-prone patch to libxl to
>> deal with this problem.
> libvirt manages to get it right on management side without quirks on
> QEMU side.

I think the reason libvirt/kvm gets it right is simply because the
code implementing the libvirt/kvm approach got more attention and testing
than the code that implements the libxl/Xen approach. This patch
really represents what should have been done when support for the
igd-passthru=on option for xenfv machines was added seven years ago,
but the code was apparently added without much testing and is stale now
and needs this fix which is entirely implemented in either files maintained
by Xen maintainers or, in the case of the small patch to pc_piix.c,
entirely within a section guarded by #ifdef CONFIG_XEN. Not much
maintenance burden for hw/i386 maintainers.

> 
>> I did post a patch on xen-devel to fix this using libxl, but so far
>> it has not yet been reviewed and I mentioned in that patch that the
>> approach of patching qemu so qemu reserves slot 2 for the igd is less
>> prone to coding errors and is easier to maintain than the patch that
>> would be required to implement the fix in libxl.
> 
> the patch is not trivial, and adds maintenance on QEMU.

For all practical purposes, the only additional maintenance would
be handled

Re: [PATCH v6 25/51] i386/xen: implement HVMOP_set_param

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Ankur Arora 

This is the hook for adding the HVM_PARAM_CALLBACK_IRQ parameter in a
subsequent commit.

Signed-off-by: Ankur Arora 
Signed-off-by: Joao Martins 
[dwmw2: Split out from another commit]
Signed-off-by: David Woodhouse 


Reviewed-by: Paul Durrant

Re: [PATCH v6 23/51] i386/xen: implement HYPERVISOR_event_channel_op

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

Additionally set XEN_INTERFACE_VERSION to most recent in order to
exercise the "new" event_channel_op.

Signed-off-by: Joao Martins 
[dwmw2: Ditch event_channel_op_compat which was never available to HVM guests]
Signed-off-by: David Woodhouse 
---
  target/i386/kvm/xen-emu.c | 22 ++
  1 file changed, 22 insertions(+)

diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 7803e4a7a7..ff093328d7 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -27,6 +27,7 @@
  #include "standard-headers/xen/memory.h"
  #include "standard-headers/xen/hvm/hvm_op.h"
  #include "standard-headers/xen/vcpu.h"
+#include "standard-headers/xen/event_channel.h"
  
  #include "xen-compat.h"
  
@@ -585,6 +586,24 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,

  return true;
  }
  
+static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit,

+int cmd, uint64_t arg)
+{
+int err = -ENOSYS;
+
+switch (cmd) {
+case EVTCHNOP_init_control:
+err = -ENOSYS;
+break;


The commit comment doesn't explain why the above op is singled out for 
this treatment. I assume it is because there is no intention to 
implement FIFO event channels in subsequent patches, but it'd be nice to 
say so here.


  Paul


+
+default:
+return false;
+}
+
+exit->u.hcall.result = err;
+return true;
+}
+
  static int kvm_xen_soft_reset(void)
  {
  CPUState *cpu;
@@ -684,6 +703,9 @@ static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct 
kvm_xen_exit *exit)
  case __HYPERVISOR_sched_op:
  return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
exit->u.hcall.params[1]);
+case __HYPERVISOR_event_channel_op:
+return kvm_xen_hcall_evtchn_op(exit, exit->u.hcall.params[0],
+   exit->u.hcall.params[1]);
  case __HYPERVISOR_vcpu_op:
  return kvm_xen_hcall_vcpu_op(exit, cpu,
   exit->u.hcall.params[0],

Re: [PATCH v6 09/51] i386/xen: handle guest hypercalls

2023-01-16 Thread David Woodhouse

On Mon, 2023-01-16 at 16:24 +, Paul Durrant wrote:
> > +    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
> > +    exit->u.hcall.input, exit->u.hcall.params[0],
> > +    exit->u.hcall.params[1], 
> > exit->u.hcall.params[2],
> > +    exit->u.hcall.result);
> 
> It seems odd to have the trace message after the hypercall is handled. 
> Any additional tracing in the handler if going to come out before we're 
> told what hypercall it is.

Well yeah, but if we do it sooner then we don't have the result, and
end up with two separate trace lines to print the result too.


smime.p7s
Description: S/MIME cryptographic signature

Re: [PATCH Trivial] hw/cxl/cxl-cdat.c: spelling: missmatch

2023-01-16 Thread Laurent Vivier


Le 15/12/2022 à 13:37, Michael Tokarev a écrit :

Introduced by: aba578bdace5303a441f8a37aad781b5cb06f38c

Signed-off-by: Michael Tokarev 
---
  hw/cxl/cxl-cdat.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-cdat.c b/hw/cxl/cxl-cdat.c
index 3653aa56f0..137abd0992 100644
--- a/hw/cxl/cxl-cdat.c
+++ b/hw/cxl/cxl-cdat.c
@@ -146,7 +146,7 @@ static void ct3_load_cdat(CDATObject *cdat, Error **errp)
  num_ent++;
  }
  if (i != file_size) {
-error_setg(errp, "CDAT: File length missmatch");
+error_setg(errp, "CDAT: File length mismatch");
  return;
  }
  


Applied to my trivial-patches branch.

Thanks,
Laurent

Re: [PATCH v6 08/51] xen-platform: allow its creation with XEN_EMULATE mode

2023-01-16 Thread David Woodhouse

On Mon, 2023-01-16 at 16:20 +, Paul Durrant wrote:
> > +    case 0: /* Platform flags */
> > +    if (xen_mode == XEN_EMULATE) {
> > +    /* XX: Use i440gx/q35 PAM setup to do this? */
> >    s->flags = val & PFFLAG_ROM_LOCK;
> 
> Given that this is not RFC, do you have a definite plan? TBH I think 
> only ancient (Bochs) ROMBIOS messes with this; I can't find any trace in 
> SeaBIOS anyway. So maybe we just don't care.

Indeed, I just don't think we care. If using the pam_config was easy
I'd have done it but it just isn't worth bothering with. So I'll leave
the note for anyone who comes later and finds a reason to care, but I
think it's fine.

smime.p7s
Description: S/MIME cryptographic signature

Re: [PATCH v6 22/51] i386/xen: handle VCPUOP_register_runstate_memory_area

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

Allow guest to setup the vcpu runstates which is used as
steal clock.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
---
  target/i386/cpu.h |  1 +
  target/i386/kvm/xen-emu.c | 57 +++
  target/i386/machine.c |  1 +
  3 files changed, 59 insertions(+)



Reviewed-by: Paul Durrant

Re: [PATCH v6 21/51] i386/xen: handle VCPUOP_register_vcpu_time_info

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

In order to support Linux vdso in Xen.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
---
  target/i386/cpu.h |   1 +
  target/i386/kvm/xen-emu.c | 100 +-
  target/i386/machine.c |   1 +
  3 files changed, 90 insertions(+), 12 deletions(-)



Reviewed-by: Paul Durrant

Re: [PATCH v3] hw/pvrdma: Protect against buggy or malicious guest driver

2023-01-16 Thread Laurent Vivier


Le 28/12/2022 à 20:32, Thomas Huth a écrit :

On 19/12/2022 12.21, Marcel Apfelbaum wrote:

On Mon, Dec 19, 2022 at 10:57 AM Yuval Shaia  wrote:


Can anyone else pick this one?


Adding Thomas,

I dropped the ball with this one, I am sorry about that, maybe it
doesn't worth a Pull Request only for it.


Why not? Pull request for single patches aren't that uncommon.


Maybe it can go through the Misc tree?


hw/rdma/ is really not my turf, but since the patch is small, it sounds like a good candidate for 
qemu-trivial, I think.


Applied to my trivial-patches branch.

Thanks,
Laurent



  Thomas



On Wed, 7 Dec 2022 at 17:05, Claudio Fontana  wrote:


On 4/5/22 12:31, Marcel Apfelbaum wrote:

Hi Yuval,
Thank you for the changes.

On Sun, Apr 3, 2022 at 11:54 AM Yuval Shaia  wrote:


Guest driver might execute HW commands when shared buffers are not yet
allocated.
This could happen on purpose (malicious guest) or because of some other
guest/host address mapping error.
We need to protect againts such case.

Fixes: CVE-2022-1050

Reported-by: Raven 
Signed-off-by: Yuval Shaia 
---
v1 -> v2:
 * Commit message changes
v2 -> v3:
 * Exclude cosmetic changes
---
  hw/rdma/vmw/pvrdma_cmd.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
index da7ddfa548..89db963c46 100644
--- a/hw/rdma/vmw/pvrdma_cmd.c
+++ b/hw/rdma/vmw/pvrdma_cmd.c
@@ -796,6 +796,12 @@ int pvrdma_exec_cmd(PVRDMADev *dev)

  dsr_info = >dsr_info;

+    if (!dsr_info->dsr) {
+    /* Buggy or malicious guest driver */
+    rdma_error_report("Exec command without dsr, req or rsp buffers");
+    goto out;
+    }
+
  if (dsr_info->req->hdr.cmd >= sizeof(cmd_handlers) /
    sizeof(struct cmd_handler)) {
  rdma_error_report("Unsupported command");
--
2.20.1



cc-ing Peter and Philippe for a question:
Do we have a "Security Fixes" or a "Misc" subtree? Otherwise it will
have to wait a week or so.

Reviewed by: Marcel Apfelbaum 
Thanks,
Marcel



Hi all,

patch is reviewed, anything holding back the inclusion of this security fix?

Thanks,

Claudio

Re: [PATCH 01/10] ccid-card-emulated: fix cast warning/error

2023-01-16 Thread Laurent Vivier


Le 03/01/2023 à 12:08, marcandre.lur...@redhat.com a écrit :

From: Marc-André Lureau 

../hw/usb/ccid-card-emulated.c: In function 'handle_apdu_thread':
../hw/usb/ccid-card-emulated.c:251:24: error: cast from pointer to integer of 
different size [-Werror=pointer-to-int-cast]
   251 | assert((unsigned long)event > 1000);

Signed-off-by: Marc-André Lureau 
---
  hw/usb/ccid-card-emulated.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/usb/ccid-card-emulated.c b/hw/usb/ccid-card-emulated.c
index ee41a81801..c328660075 100644
--- a/hw/usb/ccid-card-emulated.c
+++ b/hw/usb/ccid-card-emulated.c
@@ -248,7 +248,7 @@ static void *handle_apdu_thread(void* arg)
  WITH_QEMU_LOCK_GUARD(>vreader_mutex) {
  while (!QSIMPLEQ_EMPTY(>guest_apdu_list)) {
  event = QSIMPLEQ_FIRST(>guest_apdu_list);
-assert((unsigned long)event > 1000);
+assert(event != NULL);
  QSIMPLEQ_REMOVE_HEAD(>guest_apdu_list, entry);
  if (event->p.data.type != EMUL_GUEST_APDU) {
  DPRINTF(card, 1, "unexpected message in 
handle_apdu_thread\n");


Applied to my trivial-patches branch.

Thanks,
Laurent

[PATCH v3 3/3] s390x/pv: Move check on hugepage under s390_pv_guest_check()

2023-01-16 Thread Cédric Le Goater

From: Cédric Le Goater 

Such conditions on Protected Virtualization can now be checked at init
time. This is possible because Protected Virtualization is initialized
after memory where the page size is set.

Reviewed-by: Thomas Huth 
Signed-off-by: Cédric Le Goater 
---
 hw/s390x/pv.c   | 13 -
 target/s390x/diag.c |  7 ---
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 8405e73a1b..679d860f54 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -280,9 +280,20 @@ static bool s390_pv_check_cpus(Error **errp)
 return true;
 }
 
+static bool s390_pv_check_hpage(Error **errp)
+{
+if (kvm_s390_get_hpage_1m()) {
+error_setg(errp, "Protected VMs can currently not be backed with "
+   "huge pages");
+return false;
+}
+
+return true;
+}
+
 static bool s390_pv_guest_check(ConfidentialGuestSupport *cgs, Error **errp)
 {
-return s390_pv_check_cpus(errp);
+return s390_pv_check_cpus(errp) && s390_pv_check_hpage(errp);
 }
 
 int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
diff --git a/target/s390x/diag.c b/target/s390x/diag.c
index 9b16e25930..28f4350aed 100644
--- a/target/s390x/diag.c
+++ b/target/s390x/diag.c
@@ -170,13 +170,6 @@ out:
 return;
 }
 
-if (kvm_enabled() && kvm_s390_get_hpage_1m()) {
-error_report("Protected VMs can currently not be backed with "
- "huge pages");
-env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV;
-return;
-}
-
 if (!s390_pv_check(_err)) {
 error_report_err(local_err);
 env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV;
-- 
2.39.0

[PATCH v3 1/3] s390x/pv: Implement a CGS check helper

2023-01-16 Thread Cédric Le Goater

From: Cédric Le Goater 

When a protected VM is started with the maximum number of CPUs (248),
the service call providing information on the CPUs requires more
buffer space than allocated and QEMU disgracefully aborts :

LOADPARM=[]
Using virtio-blk.
Using SCSI scheme.

...
qemu-system-s390x: KVM_S390_MEM_OP failed: Argument list too long

When protected virtualization is initialized, compute the maximum
number of vCPUs supported by the machine and return useful information
to the user before the machine starts in case of error.

Suggested-by: Thomas Huth 
Reviewed-by: Thomas Huth 
Signed-off-by: Cédric Le Goater 
---
 hw/s390x/pv.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 8dfe92d8df..8a1c71436b 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -20,6 +20,7 @@
 #include "exec/confidential-guest-support.h"
 #include "hw/s390x/ipl.h"
 #include "hw/s390x/pv.h"
+#include "hw/s390x/sclp.h"
 #include "target/s390x/kvm/kvm_s390x.h"
 
 static bool info_valid;
@@ -249,6 +250,41 @@ struct S390PVGuestClass {
 ConfidentialGuestSupportClass parent_class;
 };
 
+/*
+ * If protected virtualization is enabled, the amount of data that the
+ * Read SCP Info Service Call can use is limited to one page. The
+ * available space also depends on the Extended-Length SCCB (ELS)
+ * feature which can take more buffer space to store feature
+ * information. This impacts the maximum number of CPUs supported in
+ * the machine.
+ */
+static uint32_t s390_pv_get_max_cpus(void)
+{
+int offset_cpu = s390_has_feat(S390_FEAT_EXTENDED_LENGTH_SCCB) ?
+offsetof(ReadInfo, entries) : SCLP_READ_SCP_INFO_FIXED_CPU_OFFSET;
+
+return (TARGET_PAGE_SIZE - offset_cpu) / sizeof(CPUEntry);
+}
+
+static bool s390_pv_check_cpus(Error **errp)
+{
+MachineState *ms = MACHINE(qdev_get_machine());
+uint32_t pv_max_cpus = s390_pv_get_max_cpus();
+
+if (ms->smp.max_cpus > pv_max_cpus) {
+error_setg(errp, "Protected VMs support a maximum of %d CPUs",
+   pv_max_cpus);
+return false;
+}
+
+return true;
+}
+
+static bool s390_pv_guest_check(ConfidentialGuestSupport *cgs, Error **errp)
+{
+return s390_pv_check_cpus(errp);
+}
+
 int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp)
 {
 if (!object_dynamic_cast(OBJECT(cgs), TYPE_S390_PV_GUEST)) {
@@ -261,6 +297,10 @@ int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error 
**errp)
 return -1;
 }
 
+if (!s390_pv_guest_check(cgs, errp)) {
+return -1;
+}
+
 cgs->ready = true;
 
 return 0;
-- 
2.39.0

[PATCH v3 0/3]s390x/pv: Improve error reporting of protected VMs

2023-01-16 Thread Cédric Le Goater

Hello,

Here is a little series improving error reporting of protected VMs.

Thanks,

C.

Changes in v3:

 - dropped 's390x/pv: Check for support on the host'. This is already
   covered by the KVM capability.   
 - in s390_pv_check() (patch 2) drop the call to s390_pv_guest_check()
   since init time has already checked the same conditions. However,
   to report an error when the machine is not protected and the kernel
   secure, we still need s390_pv_check().

Changes in v2:

 - dropped ConfidentialGuestSupportClass handler. The check is now
   done from s390_pv_init() which is called after memory and CPU
   initialization. This gives us a better chance to tune the limits
   correctly.
 - pv_max_cpus now computed from the available size of the response
   buffer of the Read SCP Info Service Call (Thomas)
 
Cédric Le Goater (3):
  s390x/pv: Implement a CGS check helper
  s390x/pv: Introduce a s390_pv_check() helper for runtime
  s390x/pv: Move check on hugepage under s390_pv_guest_check()

 include/hw/s390x/pv.h |  2 ++
 hw/s390x/pv.c | 64 +++
 target/s390x/diag.c   |  6 ++--
 3 files changed, 69 insertions(+), 3 deletions(-)

-- 
2.39.0

Re: [PATCH v6 20/51] i386/xen: handle VCPUOP_register_vcpu_info

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

Handle the hypercall to set a per vcpu info, and also wire up the default
vcpu_info in the shared_info page for the first 32 vCPUs.

To avoid deadlock within KVM a vCPU thread must set its *own* vcpu_info
rather than it being set from the context in which the hypercall is
invoked.

Add the vcpu_info (and default) GPA to the vmstate_x86_cpu for migration,
and restore it in kvm_arch_put_registers() appropriately.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
---
  target/i386/cpu.h|   2 +
  target/i386/kvm/kvm.c|  17 
  target/i386/kvm/trace-events |   1 +
  target/i386/kvm/xen-emu.c| 152 ++-
  target/i386/kvm/xen-emu.h|   2 +
  target/i386/machine.c|  19 +
  6 files changed, 190 insertions(+), 3 deletions(-)



Reviewed-by: Paul Durrant

[PATCH v3 2/3] s390x/pv: Introduce a s390_pv_check() helper for runtime

2023-01-16 Thread Cédric Le Goater

From: Cédric Le Goater 

If a secure kernel is started in a non-protected VM, the OS will hang
during boot without giving a proper error message to the user.

Perform the checks on Confidential Guest support at runtime with an
helper called from the service call switching the guest to protected
mode.

Signed-off-by: Cédric Le Goater 
---

  In s390_pv_check(), drop the call to s390_pv_guest_check() since
  init time has already checked the same conditions. However, to
  report an error when the machine is not protected and the kernel
  secure, we still need s390_pv_check().

 include/hw/s390x/pv.h |  2 ++
 hw/s390x/pv.c | 13 +
 target/s390x/diag.c   |  7 +++
 3 files changed, 22 insertions(+)

diff --git a/include/hw/s390x/pv.h b/include/hw/s390x/pv.h
index 9360aa1091..ca7dac2e20 100644
--- a/include/hw/s390x/pv.h
+++ b/include/hw/s390x/pv.h
@@ -55,6 +55,7 @@ int kvm_s390_dump_init(void);
 int kvm_s390_dump_cpu(S390CPU *cpu, void *buff);
 int kvm_s390_dump_mem_state(uint64_t addr, size_t len, void *dest);
 int kvm_s390_dump_completion_data(void *buff);
+bool s390_pv_check(Error **errp);
 #else /* CONFIG_KVM */
 static inline bool s390_is_pv(void) { return false; }
 static inline int s390_pv_query_info(void) { return 0; }
@@ -75,6 +76,7 @@ static inline int kvm_s390_dump_cpu(S390CPU *cpu, void *buff) 
{ return 0; }
 static inline int kvm_s390_dump_mem_state(uint64_t addr, size_t len,
   void *dest) { return 0; }
 static inline int kvm_s390_dump_completion_data(void *buff) { return 0; }
+static inline bool s390_pv_check(Error **errp) { return false; }
 #endif /* CONFIG_KVM */
 
 int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error **errp);
diff --git a/hw/s390x/pv.c b/hw/s390x/pv.c
index 8a1c71436b..8405e73a1b 100644
--- a/hw/s390x/pv.c
+++ b/hw/s390x/pv.c
@@ -306,6 +306,19 @@ int s390_pv_kvm_init(ConfidentialGuestSupport *cgs, Error 
**errp)
 return 0;
 }
 
+bool s390_pv_check(Error **errp)
+{
+MachineState *ms = MACHINE(qdev_get_machine());
+
+if (!ms->cgs) {
+error_setg(errp, "Protected VM is started without Confidential"
+   " Guest support");
+return false;
+}
+
+return true;
+}
+
 OBJECT_DEFINE_TYPE_WITH_INTERFACES(S390PVGuest,
s390_pv_guest,
S390_PV_GUEST,
diff --git a/target/s390x/diag.c b/target/s390x/diag.c
index 76b01dcd68..9b16e25930 100644
--- a/target/s390x/diag.c
+++ b/target/s390x/diag.c
@@ -79,6 +79,7 @@ void handle_diag_308(CPUS390XState *env, uint64_t r1, 
uint64_t r3, uintptr_t ra)
 uint64_t addr =  env->regs[r1];
 uint64_t subcode = env->regs[r3];
 IplParameterBlock *iplb;
+Error *local_err = NULL;
 
 if (env->psw.mask & PSW_MASK_PSTATE) {
 s390_program_interrupt(env, PGM_PRIVILEGED, ra);
@@ -176,6 +177,12 @@ out:
 return;
 }
 
+if (!s390_pv_check(_err)) {
+error_report_err(local_err);
+env->regs[r1 + 1] = DIAG_308_RC_INVAL_FOR_PV;
+return;
+}
+
 s390_ipl_reset_request(cs, S390_RESET_PV);
 break;
 default:
-- 
2.39.0

Re: [PATCH v2] hw/i386/pc: Remove unused 'owner' argument from pc_pci_as_mapping_init

2023-01-16 Thread Laurent Vivier


Le 05/01/2023 à 18:38, Philippe Mathieu-Daudé a écrit :

This argument was added 9 years ago in commit 83d08f2673
("pc: map PCI address space as catchall region for not mapped
addresses") and has never been used since, so remove it.

Signed-off-by: Philippe Mathieu-Daudé 
---
  hw/i386/pc.c | 2 +-
  hw/pci-host/i440fx.c | 3 +--
  hw/pci-host/q35.c| 3 +--
  include/hw/i386/pc.h | 2 +-
  4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index d489ecc0d1..6e592bd969 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -782,7 +782,7 @@ void pc_guest_info_init(PCMachineState *pcms)
  }
  
  /* setup pci memory address space mapping into system address space */

-void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,
+void pc_pci_as_mapping_init(MemoryRegion *system_memory,
  MemoryRegion *pci_address_space)
  {
  /* Set to lower priority than RAM */
diff --git a/hw/pci-host/i440fx.c b/hw/pci-host/i440fx.c
index d5426ef4a5..262f82c303 100644
--- a/hw/pci-host/i440fx.c
+++ b/hw/pci-host/i440fx.c
@@ -272,8 +272,7 @@ PCIBus *i440fx_init(const char *pci_type,
   IO_APIC_DEFAULT_ADDRESS - 1);
  
  /* setup pci memory mapping */

-pc_pci_as_mapping_init(OBJECT(f), f->system_memory,
-   f->pci_address_space);
+pc_pci_as_mapping_init(f->system_memory, f->pci_address_space);
  
  /* if *disabled* show SMRAM to all CPUs */

  memory_region_init_alias(>smram_region, OBJECT(d), "smram-region",
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index 20da121374..26390863d6 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -574,8 +574,7 @@ static void mch_realize(PCIDevice *d, Error **errp)
  }
  
  /* setup pci memory mapping */

-pc_pci_as_mapping_init(OBJECT(mch), mch->system_memory,
-   mch->pci_address_space);
+pc_pci_as_mapping_init(mch->system_memory, mch->pci_address_space);
  
  /* if *disabled* show SMRAM to all CPUs */

  memory_region_init_alias(>smram_region, OBJECT(mch), "smram-region",
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 991f905f5d..88a120bc23 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -156,7 +156,7 @@ void pc_guest_info_init(PCMachineState *pcms);
  #define PCI_HOST_ABOVE_4G_MEM_SIZE "above-4g-mem-size"
  
  
-void pc_pci_as_mapping_init(Object *owner, MemoryRegion *system_memory,

+void pc_pci_as_mapping_init(MemoryRegion *system_memory,
  MemoryRegion *pci_address_space);
  
  void xen_load_linux(PCMachineState *pcms);


Applied to my trivial-patches branch.

Thanks,
Laurent

Re: [PATCH v6 19/51] i386/xen: implement HYPERVISOR_vcpu_op

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

This is simply when guest tries to register a vcpu_info
and since vcpu_info placement is optional in the minimum ABI
therefore we can just fail with -ENOSYS

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
---
  target/i386/kvm/xen-emu.c | 25 +
  1 file changed, 25 insertions(+)



Reviewed-by: Paul Durrant

Re: [PATCH v6 18/51] i386/xen: implement HYPERVISOR_hvm_op

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

This is when guest queries for support for HVMOP_pagetable_dying.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
---
  target/i386/kvm/xen-emu.c | 17 +
  1 file changed, 17 insertions(+)



Reviewed-by: Paul Durrant

Re: [PATCH] tests/qtest/test-hmp: Improve the check for verbose mode

2023-01-16 Thread Laurent Vivier


Le 09/01/2023 à 11:13, Thomas Huth a écrit :

Running the test-hmp with V=2 up to V=9 runs the test in verbose mode,
but running for example with V=10 falls back to non-verbose mode ...
Improve this oddity by properly treating the argument as a number.

Signed-off-by: Thomas Huth 
---
  tests/qtest/test-hmp.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qtest/test-hmp.c b/tests/qtest/test-hmp.c
index f8b22abe4c..b4a920df89 100644
--- a/tests/qtest/test-hmp.c
+++ b/tests/qtest/test-hmp.c
@@ -151,7 +151,7 @@ int main(int argc, char **argv)
  {
  char *v_env = getenv("V");
  
-if (v_env && *v_env >= '2') {

+if (v_env && atoi(v_env) >= 2) {
  verbose = true;
  }
  


Applied to my trivial-patches branch.

Thanks,
Laurent

Re: [PATCH v6 17/51] i386/xen: implement XENMEM_add_to_physmap_batch

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: David Woodhouse 

Signed-off-by: David Woodhouse 


Reviewed-by: Paul Durrant

Re: [PATCH-for-8.0] block/nbd: Add missing include

2023-01-16 Thread Laurent Vivier


Le 12/01/2023 à 13:00, Philippe Mathieu-Daudé a écrit :

Hi, can this reviewed patch get merged via a block tree?


I can take this via trivial.

Thanks,
Laurent



On 25/11/22 18:53, Philippe Mathieu-Daudé wrote:

The inlined nbd_readXX() functions call beXX_to_cpu(), themselves
declared in . This fixes when refactoring:

   In file included from ../../block/nbd.c:44:
   include/block/nbd.h: In function 'nbd_read16':
   include/block/nbd.h:383:12: error: implicit declaration of function 'be16_to_cpu' 
[-Werror=implicit-function-declaration]

 383 | *val = be##bits##_to_cpu(*val);  
   \
 |    ^~
   include/block/nbd.h:387:1: note: in expansion of macro 'DEF_NBD_READ_N'
 387 | DEF_NBD_READ_N(16) /* Defines nbd_read16(). */
 | ^~

Signed-off-by: Philippe Mathieu-Daudé 
---
  include/block/nbd.h | 1 +
  1 file changed, 1 insertion(+)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index 4ede3b2bd0..a4c98169c3 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -24,6 +24,7 @@
  #include "io/channel-socket.h"
  #include "crypto/tlscreds.h"
  #include "qapi/error.h"
+#include "qemu/bswap.h"
  extern const BlockExportDriver blk_exp_nbd;

[PATCH v2 5/6] hw/riscv/virt.c: rename MachineState 'mc' pointers to 'ms'

2023-01-16 Thread Daniel Henrique Barboza

We have a convention in other QEMU boards/archs to name MachineState
pointers as either 'machine' or 'ms'. MachineClass pointers are usually
called 'mc'.

The 'virt' RISC-V machine has a lot of instances where MachineState
pointers are named 'mc'. There is nothing wrong with that, but we gain
more compatibility with the rest of the QEMU code base, and easier
reviews, if we follow QEMU conventions.

Rename all 'mc' MachineState pointers to 'ms'. This is a very tedious
and mechanical patch that was produced by doing the following:

- find/replace all 'MachineState *mc' to 'MachineState *ms';
- find/replace all 'mc->fdt' to 'ms->fdt';
- find/replace all 'mc->smp.cpus' to 'ms->smp.cpus';
- replace any remaining occurrences of 'mc' that the compiler complained
about.

Suggested-by: Philippe Mathieu-Daudé 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Daniel Henrique Barboza 
---
 hw/riscv/virt.c | 434 
 1 file changed, 217 insertions(+), 217 deletions(-)

diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
index 8ff89c217f..479a90b3d5 100644
--- a/hw/riscv/virt.c
+++ b/hw/riscv/virt.c
@@ -227,7 +227,7 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int 
socket,
 {
 int cpu;
 uint32_t cpu_phandle;
-MachineState *mc = MACHINE(s);
+MachineState *ms = MACHINE(s);
 char *name, *cpu_name, *core_name, *intc_name;
 bool is_32_bit = riscv_is_32bit(>soc[0]);
 
@@ -236,40 +236,40 @@ static void create_fdt_socket_cpus(RISCVVirtState *s, int 
socket,
 
 cpu_name = g_strdup_printf("/cpus/cpu@%d",
 s->soc[socket].hartid_base + cpu);
-qemu_fdt_add_subnode(mc->fdt, cpu_name);
+qemu_fdt_add_subnode(ms->fdt, cpu_name);
 if (riscv_feature(>soc[socket].harts[cpu].env,
   RISCV_FEATURE_MMU)) {
-qemu_fdt_setprop_string(mc->fdt, cpu_name, "mmu-type",
+qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type",
 (is_32_bit) ? "riscv,sv32" : "riscv,sv48");
 } else {
-qemu_fdt_setprop_string(mc->fdt, cpu_name, "mmu-type",
+qemu_fdt_setprop_string(ms->fdt, cpu_name, "mmu-type",
 "riscv,none");
 }
 name = riscv_isa_string(>soc[socket].harts[cpu]);
-qemu_fdt_setprop_string(mc->fdt, cpu_name, "riscv,isa", name);
+qemu_fdt_setprop_string(ms->fdt, cpu_name, "riscv,isa", name);
 g_free(name);
-qemu_fdt_setprop_string(mc->fdt, cpu_name, "compatible", "riscv");
-qemu_fdt_setprop_string(mc->fdt, cpu_name, "status", "okay");
-qemu_fdt_setprop_cell(mc->fdt, cpu_name, "reg",
+qemu_fdt_setprop_string(ms->fdt, cpu_name, "compatible", "riscv");
+qemu_fdt_setprop_string(ms->fdt, cpu_name, "status", "okay");
+qemu_fdt_setprop_cell(ms->fdt, cpu_name, "reg",
 s->soc[socket].hartid_base + cpu);
-qemu_fdt_setprop_string(mc->fdt, cpu_name, "device_type", "cpu");
-riscv_socket_fdt_write_id(mc, cpu_name, socket);
-qemu_fdt_setprop_cell(mc->fdt, cpu_name, "phandle", cpu_phandle);
+qemu_fdt_setprop_string(ms->fdt, cpu_name, "device_type", "cpu");
+riscv_socket_fdt_write_id(ms, cpu_name, socket);
+qemu_fdt_setprop_cell(ms->fdt, cpu_name, "phandle", cpu_phandle);
 
 intc_phandles[cpu] = (*phandle)++;
 
 intc_name = g_strdup_printf("%s/interrupt-controller", cpu_name);
-qemu_fdt_add_subnode(mc->fdt, intc_name);
-qemu_fdt_setprop_cell(mc->fdt, intc_name, "phandle",
+qemu_fdt_add_subnode(ms->fdt, intc_name);
+qemu_fdt_setprop_cell(ms->fdt, intc_name, "phandle",
 intc_phandles[cpu]);
-qemu_fdt_setprop_string(mc->fdt, intc_name, "compatible",
+qemu_fdt_setprop_string(ms->fdt, intc_name, "compatible",
 "riscv,cpu-intc");
-qemu_fdt_setprop(mc->fdt, intc_name, "interrupt-controller", NULL, 0);
-qemu_fdt_setprop_cell(mc->fdt, intc_name, "#interrupt-cells", 1);
+qemu_fdt_setprop(ms->fdt, intc_name, "interrupt-controller", NULL, 0);
+qemu_fdt_setprop_cell(ms->fdt, intc_name, "#interrupt-cells", 1);
 
 core_name = g_strdup_printf("%s/core%d", clust_name, cpu);
-qemu_fdt_add_subnode(mc->fdt, core_name);
-qemu_fdt_setprop_cell(mc->fdt, core_name, "cpu", cpu_phandle);
+qemu_fdt_add_subnode(ms->fdt, core_name);
+qemu_fdt_setprop_cell(ms->fdt, core_name, "cpu", cpu_phandle);
 
 g_free(core_name);
 g_free(intc_name);
@@ -282,16 +282,16 @@ static void create_fdt_socket_memory(RISCVVirtState *s,
 {
 char *mem_name;
 uint64_t addr, size;
-MachineState *mc = MACHINE(s);
+MachineState *ms = MACHINE(s);
 
-addr = memmap[VIRT_DRAM].base + riscv_socket_mem_offset(mc, socket);
-size = riscv_socket_mem_size(mc, socket);
+addr = memmap[VIRT_DRAM].base + riscv_socket_mem_offset(ms,

[PATCH v2 4/6] hw/riscv/virt.c: calculate socket count once in create_fdt_imsic()

2023-01-16 Thread Daniel Henrique Barboza

riscv_socket_count() returns either ms->numa_state->num_nodes or 1
depending on NUMA support. In any case the value can be retrieved only
once and used in the rest of the function.

This will also alleviate the rename we're going to do next by reducing
the instances of MachineState 'mc' inside hw/riscv/virt.c.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Daniel Henrique Barboza 
---
 hw/riscv/virt.c | 34 +++---
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
index cbba0b8930..8ff89c217f 100644
--- a/hw/riscv/virt.c
+++ b/hw/riscv/virt.c
@@ -505,13 +505,14 @@ static void create_fdt_imsic(RISCVVirtState *s, const 
MemMapEntry *memmap,
 int cpu, socket;
 char *imsic_name;
 MachineState *mc = MACHINE(s);
+int socket_count = riscv_socket_count(mc);
 uint32_t imsic_max_hart_per_socket, imsic_guest_bits;
 uint32_t *imsic_cells, *imsic_regs, imsic_addr, imsic_size;
 
 *msi_m_phandle = (*phandle)++;
 *msi_s_phandle = (*phandle)++;
 imsic_cells = g_new0(uint32_t, mc->smp.cpus * 2);
-imsic_regs = g_new0(uint32_t, riscv_socket_count(mc) * 4);
+imsic_regs = g_new0(uint32_t, socket_count * 4);
 
 /* M-level IMSIC node */
 for (cpu = 0; cpu < mc->smp.cpus; cpu++) {
@@ -519,7 +520,7 @@ static void create_fdt_imsic(RISCVVirtState *s, const 
MemMapEntry *memmap,
 imsic_cells[cpu * 2 + 1] = cpu_to_be32(IRQ_M_EXT);
 }
 imsic_max_hart_per_socket = 0;
-for (socket = 0; socket < riscv_socket_count(mc); socket++) {
+for (socket = 0; socket < socket_count; socket++) {
 imsic_addr = memmap[VIRT_IMSIC_M].base +
  socket * VIRT_IMSIC_GROUP_MAX_SIZE;
 imsic_size = IMSIC_HART_SIZE(0) * s->soc[socket].num_harts;
@@ -545,14 +546,14 @@ static void create_fdt_imsic(RISCVVirtState *s, const 
MemMapEntry *memmap,
 qemu_fdt_setprop(mc->fdt, imsic_name, "interrupts-extended",
 imsic_cells, mc->smp.cpus * sizeof(uint32_t) * 2);
 qemu_fdt_setprop(mc->fdt, imsic_name, "reg", imsic_regs,
-riscv_socket_count(mc) * sizeof(uint32_t) * 4);
+socket_count * sizeof(uint32_t) * 4);
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,num-ids",
 VIRT_IRQCHIP_NUM_MSIS);
-if (riscv_socket_count(mc) > 1) {
+if (socket_count > 1) {
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,hart-index-bits",
 imsic_num_bits(imsic_max_hart_per_socket));
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,group-index-bits",
-imsic_num_bits(riscv_socket_count(mc)));
+imsic_num_bits(socket_count));
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,group-index-shift",
 IMSIC_MMIO_GROUP_MIN_SHIFT);
 }
@@ -567,7 +568,7 @@ static void create_fdt_imsic(RISCVVirtState *s, const 
MemMapEntry *memmap,
 }
 imsic_guest_bits = imsic_num_bits(s->aia_guests + 1);
 imsic_max_hart_per_socket = 0;
-for (socket = 0; socket < riscv_socket_count(mc); socket++) {
+for (socket = 0; socket < socket_count; socket++) {
 imsic_addr = memmap[VIRT_IMSIC_S].base +
  socket * VIRT_IMSIC_GROUP_MAX_SIZE;
 imsic_size = IMSIC_HART_SIZE(imsic_guest_bits) *
@@ -594,18 +595,18 @@ static void create_fdt_imsic(RISCVVirtState *s, const 
MemMapEntry *memmap,
 qemu_fdt_setprop(mc->fdt, imsic_name, "interrupts-extended",
 imsic_cells, mc->smp.cpus * sizeof(uint32_t) * 2);
 qemu_fdt_setprop(mc->fdt, imsic_name, "reg", imsic_regs,
-riscv_socket_count(mc) * sizeof(uint32_t) * 4);
+socket_count * sizeof(uint32_t) * 4);
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,num-ids",
 VIRT_IRQCHIP_NUM_MSIS);
 if (imsic_guest_bits) {
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,guest-index-bits",
 imsic_guest_bits);
 }
-if (riscv_socket_count(mc) > 1) {
+if (socket_count > 1) {
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,hart-index-bits",
 imsic_num_bits(imsic_max_hart_per_socket));
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,group-index-bits",
-imsic_num_bits(riscv_socket_count(mc)));
+imsic_num_bits(socket_count));
 qemu_fdt_setprop_cell(mc->fdt, imsic_name, "riscv,group-index-shift",
 IMSIC_MMIO_GROUP_MIN_SHIFT);
 }
@@ -733,6 +734,7 @@ static void create_fdt_sockets(RISCVVirtState *s, const 
MemMapEntry *memmap,
 MachineState *mc = MACHINE(s);
 uint32_t msi_m_phandle = 0, msi_s_phandle = 0;
 uint32_t *intc_phandles, xplic_phandles[MAX_NODES];
+int socket_count = riscv_socket_count(mc);
 
 qemu_fdt_add_subnode(mc->fdt, "/cpus");
 qemu_fdt_setprop_cell(mc->fdt, "/cpus", "timebase-frequency",
@@ -744,7 +746,7 @@ static void create_fdt_sockets(RISCVVirtState *s, const 
MemMapEntry *memmap,
 intc_phandles = g_new0(uint32_t, mc->smp.cpus);

[PATCH v2 0/6] riscv: fdt related cleanups

2023-01-16 Thread Daniel Henrique Barboza

Hi,

In this version I included a rework in riscv_load_fdt() to separate the
fdt address calculation from the fdt load process. Having both in the
same function doesn't give us much and can lead to confusion due to how
other archs handle their respective load_fdt() functions.

Patches are based on riscv-to-apply.next.

Changes from v1:
- former patches 1-6: already applied to riscv-to-apply.next
- former patch 7: removed
- patch 1 (new):
  - fix a potential issue with fdt_pack() called after fdt_totalsize()
- patch 2 (new):  
  - split fdt address compute from fdt load logic
- patch 3 (new):
  - simplify the new riscv_compute_fdt_addr() by using MachineState
- patches 4,5,6:
  - added Phil's r-b
v1 link: https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg02246.html


Daniel Henrique Barboza (6):
  hw/riscv/boot.c: calculate fdt size after fdt_pack()
  hw/riscv: split fdt address calculation from fdt load
  hw/riscv: simplify riscv_compute_fdt_addr()
  hw/riscv/virt.c: calculate socket count once in create_fdt_imsic()
  hw/riscv/virt.c: rename MachineState 'mc' pointers to 'ms'
  hw/riscv/spike.c: rename MachineState 'mc' pointers to' ms'

 hw/riscv/boot.c|  33 ++-
 hw/riscv/microchip_pfsoc.c |   6 +-
 hw/riscv/sifive_u.c|   7 +-
 hw/riscv/spike.c   |  24 +-
 hw/riscv/virt.c| 468 +++--
 include/hw/riscv/boot.h|   3 +-
 6 files changed, 281 insertions(+), 260 deletions(-)

-- 
2.39.0

[PATCH v2 2/6] hw/riscv: split fdt address calculation from fdt load

2023-01-16 Thread Daniel Henrique Barboza

A common trend in other archs is to calculate the fdt address, which is
usually straightforward, and then calling a function that loads the
fdt/dtb by using that address.

riscv_load_fdt() is doing a bit too much in comparison. It's calculating
the fdt address via an elaborated heuristic to put the FDT at the bottom
of DRAM, and "bottom of DRAM" will vary across boards and
configurations, then it's actually loading the fdt, and finally it's
returning the fdt address used to the caller.

Reduce the existing complexity of riscv_load_fdt() by splitting its code
into a new function, riscv_compute_fdt_addr(), that will take care of
all fdt address logic. riscv_load_fdt() can then be a simple function
that just loads a fdt at the given fdt address.

Signed-off-by: Daniel Henrique Barboza 
---
 hw/riscv/boot.c| 24 
 hw/riscv/microchip_pfsoc.c |  6 --
 hw/riscv/sifive_u.c|  7 ---
 hw/riscv/spike.c   |  6 +++---
 hw/riscv/virt.c|  7 ---
 include/hw/riscv/boot.h|  3 ++-
 6 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
index dc14d8cd14..b213a32157 100644
--- a/hw/riscv/boot.c
+++ b/hw/riscv/boot.c
@@ -249,9 +249,16 @@ void riscv_load_initrd(MachineState *machine, uint64_t 
kernel_entry)
 }
 }
 
-uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t mem_size, void *fdt)
+/*
+ * The FDT should be put at the farthest point possible to
+ * avoid overwriting it with the kernel/initrd.
+ *
+ * The FDT is fdt_packed() during the calculation.
+ */
+uint32_t riscv_compute_fdt_addr(hwaddr dram_base, uint64_t mem_size,
+void *fdt)
 {
-uint64_t temp, fdt_addr;
+uint64_t temp;
 hwaddr dram_end = dram_base + mem_size;
 int ret = fdt_pack(fdt);
 int fdtsize;
@@ -272,11 +279,14 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
mem_size, void *fdt)
  * end of dram or 3GB whichever is lesser.
  */
 temp = (dram_base < 3072 * MiB) ? MIN(dram_end, 3072 * MiB) : dram_end;
-fdt_addr = QEMU_ALIGN_DOWN(temp - fdtsize, 2 * MiB);
 
-ret = fdt_pack(fdt);
-/* Should only fail if we've built a corrupted tree */
-g_assert(ret == 0);
+return QEMU_ALIGN_DOWN(temp - fdtsize, 2 * MiB);
+}
+
+void riscv_load_fdt(uint32_t fdt_addr, void *fdt)
+{
+uint32_t fdtsize = fdt_totalsize(fdt);
+
 /* copy in the device tree */
 qemu_fdt_dumpdtb(fdt, fdtsize);
 
@@ -284,8 +294,6 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
mem_size, void *fdt)
   _space_memory);
 qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds,
 rom_ptr_for_as(_space_memory, fdt_addr, 
fdtsize));
-
-return fdt_addr;
 }
 
 void riscv_rom_copy_firmware_info(MachineState *machine, hwaddr rom_base,
diff --git a/hw/riscv/microchip_pfsoc.c b/hw/riscv/microchip_pfsoc.c
index 82ae5e7023..dcdbc2cac3 100644
--- a/hw/riscv/microchip_pfsoc.c
+++ b/hw/riscv/microchip_pfsoc.c
@@ -641,8 +641,10 @@ static void microchip_icicle_kit_machine_init(MachineState 
*machine)
 }
 
 /* Compute the fdt load address in dram */
-fdt_load_addr = riscv_load_fdt(memmap[MICROCHIP_PFSOC_DRAM_LO].base,
-   machine->ram_size, machine->fdt);
+fdt_load_addr = 
riscv_compute_fdt_addr(memmap[MICROCHIP_PFSOC_DRAM_LO].base,
+  machine->ram_size, machine->fdt);
+riscv_load_fdt(fdt_load_addr, machine->fdt);
+
 /* Load the reset vector */
 riscv_setup_rom_reset_vec(machine, >soc.u_cpus, firmware_load_addr,
   memmap[MICROCHIP_PFSOC_ENVM_DATA].base,
diff --git a/hw/riscv/sifive_u.c b/hw/riscv/sifive_u.c
index 2fb6ee231f..626d4dc2f3 100644
--- a/hw/riscv/sifive_u.c
+++ b/hw/riscv/sifive_u.c
@@ -616,9 +616,10 @@ static void sifive_u_machine_init(MachineState *machine)
 kernel_entry = 0;
 }
 
-/* Compute the fdt load address in dram */
-fdt_load_addr = riscv_load_fdt(memmap[SIFIVE_U_DEV_DRAM].base,
-   machine->ram_size, machine->fdt);
+fdt_load_addr = riscv_compute_fdt_addr(memmap[SIFIVE_U_DEV_DRAM].base,
+   machine->ram_size, machine->fdt);
+riscv_load_fdt(fdt_load_addr, machine->fdt);
+
 if (!riscv_is_32bit(>soc.u_cpus)) {
 start_addr_hi32 = (uint64_t)start_addr >> 32;
 }
diff --git a/hw/riscv/spike.c b/hw/riscv/spike.c
index badc11ec43..88b9fdfc36 100644
--- a/hw/riscv/spike.c
+++ b/hw/riscv/spike.c
@@ -324,9 +324,9 @@ static void spike_board_init(MachineState *machine)
 kernel_entry = 0;
 }
 
-/* Compute the fdt load address in dram */
-fdt_load_addr = riscv_load_fdt(memmap[SPIKE_DRAM].base,
-   machine->ram_size, machine->fdt);
+fdt_load_addr = riscv_compute_fdt_addr(memmap[SPIKE_DRAM].base,
+

[PATCH v2 1/6] hw/riscv/boot.c: calculate fdt size after fdt_pack()

2023-01-16 Thread Daniel Henrique Barboza

fdt_pack() can change the fdt size, meaning that fdt_totalsize() can
contain a now deprecated (bigger) value.

Signed-off-by: Daniel Henrique Barboza 
---
 hw/riscv/boot.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
index 2594276223..dc14d8cd14 100644
--- a/hw/riscv/boot.c
+++ b/hw/riscv/boot.c
@@ -253,8 +253,13 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
mem_size, void *fdt)
 {
 uint64_t temp, fdt_addr;
 hwaddr dram_end = dram_base + mem_size;
-int ret, fdtsize = fdt_totalsize(fdt);
+int ret = fdt_pack(fdt);
+int fdtsize;
 
+/* Should only fail if we've built a corrupted tree */
+g_assert(ret == 0);
+
+fdtsize = fdt_totalsize(fdt);
 if (fdtsize <= 0) {
 error_report("invalid device-tree");
 exit(1);
-- 
2.39.0

[PATCH v2 3/6] hw/riscv: simplify riscv_compute_fdt_addr()

2023-01-16 Thread Daniel Henrique Barboza

All callers are using attributes from the MachineState object. Use a
pointer to it instead of passing dram_size (which is always
machine->ram_size) and fdt (always machine->fdt).

Signed-off-by: Daniel Henrique Barboza 
---
 hw/riscv/boot.c| 6 +++---
 hw/riscv/microchip_pfsoc.c | 4 ++--
 hw/riscv/sifive_u.c| 4 ++--
 hw/riscv/spike.c   | 4 ++--
 hw/riscv/virt.c| 3 +--
 include/hw/riscv/boot.h| 2 +-
 6 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
index b213a32157..508da3f5c7 100644
--- a/hw/riscv/boot.c
+++ b/hw/riscv/boot.c
@@ -255,11 +255,11 @@ void riscv_load_initrd(MachineState *machine, uint64_t 
kernel_entry)
  *
  * The FDT is fdt_packed() during the calculation.
  */
-uint32_t riscv_compute_fdt_addr(hwaddr dram_base, uint64_t mem_size,
-void *fdt)
+uint32_t riscv_compute_fdt_addr(MachineState *machine, hwaddr dram_base)
 {
+void *fdt = machine->fdt;
 uint64_t temp;
-hwaddr dram_end = dram_base + mem_size;
+hwaddr dram_end = dram_base + machine->ram_size;
 int ret = fdt_pack(fdt);
 int fdtsize;
 
diff --git a/hw/riscv/microchip_pfsoc.c b/hw/riscv/microchip_pfsoc.c
index dcdbc2cac3..a53e48e996 100644
--- a/hw/riscv/microchip_pfsoc.c
+++ b/hw/riscv/microchip_pfsoc.c
@@ -641,8 +641,8 @@ static void microchip_icicle_kit_machine_init(MachineState 
*machine)
 }
 
 /* Compute the fdt load address in dram */
-fdt_load_addr = 
riscv_compute_fdt_addr(memmap[MICROCHIP_PFSOC_DRAM_LO].base,
-  machine->ram_size, machine->fdt);
+fdt_load_addr = riscv_compute_fdt_addr(machine,
+   
memmap[MICROCHIP_PFSOC_DRAM_LO].base);
 riscv_load_fdt(fdt_load_addr, machine->fdt);
 
 /* Load the reset vector */
diff --git a/hw/riscv/sifive_u.c b/hw/riscv/sifive_u.c
index 626d4dc2f3..ebfddf161d 100644
--- a/hw/riscv/sifive_u.c
+++ b/hw/riscv/sifive_u.c
@@ -616,8 +616,8 @@ static void sifive_u_machine_init(MachineState *machine)
 kernel_entry = 0;
 }
 
-fdt_load_addr = riscv_compute_fdt_addr(memmap[SIFIVE_U_DEV_DRAM].base,
-   machine->ram_size, machine->fdt);
+fdt_load_addr = riscv_compute_fdt_addr(machine,
+   memmap[SIFIVE_U_DEV_DRAM].base);
 riscv_load_fdt(fdt_load_addr, machine->fdt);
 
 if (!riscv_is_32bit(>soc.u_cpus)) {
diff --git a/hw/riscv/spike.c b/hw/riscv/spike.c
index 88b9fdfc36..afd581436b 100644
--- a/hw/riscv/spike.c
+++ b/hw/riscv/spike.c
@@ -324,8 +324,8 @@ static void spike_board_init(MachineState *machine)
 kernel_entry = 0;
 }
 
-fdt_load_addr = riscv_compute_fdt_addr(memmap[SPIKE_DRAM].base,
-   machine->ram_size, machine->fdt);
+fdt_load_addr = riscv_compute_fdt_addr(machine,
+   memmap[SPIKE_DRAM].base);
 riscv_load_fdt(fdt_load_addr, machine->fdt);
 
 /* load the reset vector */
diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
index 839dfaa125..cbba0b8930 100644
--- a/hw/riscv/virt.c
+++ b/hw/riscv/virt.c
@@ -1307,8 +1307,7 @@ static void virt_machine_done(Notifier *notifier, void 
*data)
 start_addr = virt_memmap[VIRT_FLASH].base;
 }
 
-fdt_load_addr = riscv_compute_fdt_addr(memmap[VIRT_DRAM].base,
-   machine->ram_size, machine->fdt);
+fdt_load_addr = riscv_compute_fdt_addr(machine, memmap[VIRT_DRAM].base);
 riscv_load_fdt(fdt_load_addr, machine->fdt);
 
 /* load the reset vector */
diff --git a/include/hw/riscv/boot.h b/include/hw/riscv/boot.h
index 9aea7b9c46..f933de88fb 100644
--- a/include/hw/riscv/boot.h
+++ b/include/hw/riscv/boot.h
@@ -47,7 +47,7 @@ target_ulong riscv_load_kernel(MachineState *machine,
target_ulong firmware_end_addr,
symbol_fn_t sym_cb);
 void riscv_load_initrd(MachineState *machine, uint64_t kernel_entry);
-uint32_t riscv_compute_fdt_addr(hwaddr dram_start, uint64_t dram_size, void 
*fdt);
+uint32_t riscv_compute_fdt_addr(MachineState *machine, hwaddr dram_start);
 void riscv_load_fdt(uint32_t fdt_addr, void *fdt);
 void riscv_setup_rom_reset_vec(MachineState *machine, RISCVHartArrayState 
*harts,
hwaddr saddr,
-- 
2.39.0

[PATCH v2 6/6] hw/riscv/spike.c: rename MachineState 'mc' pointers to' ms'

2023-01-16 Thread Daniel Henrique Barboza

Follow the QEMU convention of naming MachineState pointers as 'ms' by
renaming the instances where we're calling it 'mc'.

Suggested-by: Philippe Mathieu-Daudé 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Daniel Henrique Barboza 
---
 hw/riscv/spike.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/hw/riscv/spike.c b/hw/riscv/spike.c
index afd581436b..222fde0c5c 100644
--- a/hw/riscv/spike.c
+++ b/hw/riscv/spike.c
@@ -56,7 +56,7 @@ static void create_fdt(SpikeState *s, const MemMapEntry 
*memmap,
 uint64_t addr, size;
 unsigned long clint_addr;
 int cpu, socket;
-MachineState *mc = MACHINE(s);
+MachineState *ms = MACHINE(s);
 uint32_t *clint_cells;
 uint32_t cpu_phandle, intc_phandle, phandle = 1;
 char *name, *mem_name, *clint_name, *clust_name;
@@ -65,7 +65,7 @@ static void create_fdt(SpikeState *s, const MemMapEntry 
*memmap,
 "sifive,clint0", "riscv,clint0"
 };
 
-fdt = mc->fdt = create_device_tree(_size);
+fdt = ms->fdt = create_device_tree(_size);
 if (!fdt) {
 error_report("create_device_tree() failed");
 exit(1);
@@ -96,7 +96,7 @@ static void create_fdt(SpikeState *s, const MemMapEntry 
*memmap,
 qemu_fdt_setprop_cell(fdt, "/cpus", "#address-cells", 0x1);
 qemu_fdt_add_subnode(fdt, "/cpus/cpu-map");
 
-for (socket = (riscv_socket_count(mc) - 1); socket >= 0; socket--) {
+for (socket = (riscv_socket_count(ms) - 1); socket >= 0; socket--) {
 clust_name = g_strdup_printf("/cpus/cpu-map/cluster%d", socket);
 qemu_fdt_add_subnode(fdt, clust_name);
 
@@ -121,7 +121,7 @@ static void create_fdt(SpikeState *s, const MemMapEntry 
*memmap,
 qemu_fdt_setprop_cell(fdt, cpu_name, "reg",
 s->soc[socket].hartid_base + cpu);
 qemu_fdt_setprop_string(fdt, cpu_name, "device_type", "cpu");
-riscv_socket_fdt_write_id(mc, cpu_name, socket);
+riscv_socket_fdt_write_id(ms, cpu_name, socket);
 qemu_fdt_setprop_cell(fdt, cpu_name, "phandle", cpu_phandle);
 
 intc_name = g_strdup_printf("%s/interrupt-controller", cpu_name);
@@ -147,14 +147,14 @@ static void create_fdt(SpikeState *s, const MemMapEntry 
*memmap,
 g_free(cpu_name);
 }
 
-addr = memmap[SPIKE_DRAM].base + riscv_socket_mem_offset(mc, socket);
-size = riscv_socket_mem_size(mc, socket);
+addr = memmap[SPIKE_DRAM].base + riscv_socket_mem_offset(ms, socket);
+size = riscv_socket_mem_size(ms, socket);
 mem_name = g_strdup_printf("/memory@%lx", (long)addr);
 qemu_fdt_add_subnode(fdt, mem_name);
 qemu_fdt_setprop_cells(fdt, mem_name, "reg",
 addr >> 32, addr, size >> 32, size);
 qemu_fdt_setprop_string(fdt, mem_name, "device_type", "memory");
-riscv_socket_fdt_write_id(mc, mem_name, socket);
+riscv_socket_fdt_write_id(ms, mem_name, socket);
 g_free(mem_name);
 
 clint_addr = memmap[SPIKE_CLINT].base +
@@ -167,14 +167,14 @@ static void create_fdt(SpikeState *s, const MemMapEntry 
*memmap,
 0x0, clint_addr, 0x0, memmap[SPIKE_CLINT].size);
 qemu_fdt_setprop(fdt, clint_name, "interrupts-extended",
 clint_cells, s->soc[socket].num_harts * sizeof(uint32_t) * 4);
-riscv_socket_fdt_write_id(mc, clint_name, socket);
+riscv_socket_fdt_write_id(ms, clint_name, socket);
 
 g_free(clint_name);
 g_free(clint_cells);
 g_free(clust_name);
 }
 
-riscv_socket_fdt_write_distance_matrix(mc);
+riscv_socket_fdt_write_distance_matrix(ms);
 
 qemu_fdt_add_subnode(fdt, "/chosen");
 qemu_fdt_setprop_string(fdt, "/chosen", "stdout-path", "/htif");
-- 
2.39.0

[RFC PATCH] tests/tcg: skip the vma-pthread test on CI

2023-01-16 Thread Alex Bennée

We are getting a lot of failures that are not related to changes so
this could be a flaky test.

Signed-off-by: Alex Bennée 
Cc: Richard Henderson 
---
 tests/tcg/multiarch/Makefile.target | 9 +
 1 file changed, 9 insertions(+)

diff --git a/tests/tcg/multiarch/Makefile.target 
b/tests/tcg/multiarch/Makefile.target
index e7213af492..ae8b3d7268 100644
--- a/tests/tcg/multiarch/Makefile.target
+++ b/tests/tcg/multiarch/Makefile.target
@@ -42,6 +42,15 @@ munmap-pthread: LDFLAGS+=-pthread
 vma-pthread: CFLAGS+=-pthread
 vma-pthread: LDFLAGS+=-pthread
 
+# The vma-pthread seems very sensitive on gitlab and we currently
+# don't know if its exposing a real bug or the test is flaky.
+ifneq ($(GITLAB_CI),)
+run-vma-pthread: vma-pthread
+   $(call skip-test, $<, "flaky on CI?")
+run-plugin-vma-pthread-with-%: vma-pthread
+   $(call skip-test, $<, "flaky on CI?")
+endif
+
 # We define the runner for test-mmap after the individual
 # architectures have defined their supported pages sizes. If no
 # additional page sizes are defined we only run the default test.
-- 
2.34.1

Re: [PATCH v3 0/5] hw/i2c/bitbang_i2c: Housekeeping

2023-01-16 Thread Laurent Vivier


Le 11/01/2023 à 09:50, Philippe Mathieu-Daudé a écrit :

Series fully reviewed.

Since v2:
- Use array of const pointers to const (Richard)

Since v1:
- Fixed overwritten RECEIVING_BIT7 entry (Richard)
- Picked R-b tags

- Remove unused dummy MemoryRegion
- Convert DPRINTF() to using trace events

(series used as base for follow-up, better if merged via ARM tree)


I can take this via trivial, what do you prefer?

Laurent



Philippe Mathieu-Daudé (5):
   hw/i2c/bitbang_i2c: Define TYPE_GPIO_I2C in public header
   hw/i2c/bitbang_i2c: Remove unused dummy MemoryRegion
   hw/i2c/bitbang_i2c: Change state calling bitbang_i2c_set_state()
 helper
   hw/i2c/bitbang_i2c: Trace state changes
   hw/i2c/bitbang_i2c: Convert DPRINTF() to trace events

  hw/arm/musicpal.c|  3 +-
  hw/i2c/bitbang_i2c.c | 80 ++--
  hw/i2c/trace-events  |  7 
  include/hw/i2c/bitbang_i2c.h |  2 +
  4 files changed, 61 insertions(+), 31 deletions(-)

Re: [PATCH v6 16/51] i386/xen: implement HYPERVISOR_memory_op

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

Specifically XENMEM_add_to_physmap with space XENMAPSPACE_shared_info to
allow the guest to set its shared_info page.

Signed-off-by: Joao Martins 
[dwmw2: Use the xen_overlay device, add compat support]
Signed-off-by: David Woodhouse 


Reviewed-by: Paul Durrant

Re: [PATCH v14 01/11] s390x/cpu topology: adding s390 specificities to CPU topology

2023-01-16 Thread Pierre Morel





On 1/13/23 17:58, Nina Schoetterl-Glausch wrote:

On Thu, 2023-01-05 at 15:53 +0100, Pierre Morel wrote:

S390 adds two new SMP levels, drawers and books to the CPU
topology.
The S390 CPU have specific toplogy features like dedication
and polarity to give to the guest indications on the host
vCPUs scheduling and help the guest take the best decisions
on the scheduling of threads on the vCPUs.

Let us provide the SMP properties with books and drawers levels
and S390 CPU with dedication and polarity,

Signed-off-by: Pierre Morel 
---
  qapi/machine.json   | 14 --
  include/hw/boards.h | 10 ++-
  include/hw/s390x/cpu-topology.h | 23 
  target/s390x/cpu.h  |  6 +
  hw/core/machine-smp.c   | 48 -
  hw/core/machine.c   |  4 +++
  hw/s390x/s390-virtio-ccw.c  |  2 ++
  softmmu/vl.c|  6 +
  target/s390x/cpu.c  | 10 +++
  qemu-options.hx |  6 +++--
  10 files changed, 117 insertions(+), 12 deletions(-)
  create mode 100644 include/hw/s390x/cpu-topology.h


[...]


diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index 7d6d01325b..39ea63a416 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -131,6 +131,12 @@ struct CPUArchState {
  
  #if !defined(CONFIG_USER_ONLY)

  uint32_t core_id; /* PoP "CPU address", same as cpu_index */
+int32_t socket_id;
+int32_t book_id;
+int32_t drawer_id;
+int32_t dedicated;
+int32_t polarity;


If I understood the architecture correctly, the polarity is a property of the 
configuration,
not the cpus. So this should be vertical_entitlement, and there should be a 
machine (?) property
specifying if the polarity is horizontal or vertical.


You are right, considering PTF only, the documentation says PTF([01]) 
does the following:


"... a process is initiated to place all CPUs in the configuration into 
the polarization specified by the function code, ..."


So on one side the polarization property is explicitly set on the CPU, 
and on the other side all CPU are supposed to be in the same 
polarization state.


So yes we can make the horizontal/vertical a machine property.
However, we do not need to set this tunable as the documentation says 
that the machine always start with horizontal polarization.


On the other hand the documentation mixes a lot vertical with different 
entitlement and horizontal polarization, for TLE order and slacks so I 
prefer to keep the complete description of the polarization as CPU 
properties in case we miss something.


PTF([01]) are no performance bottle neck and the number of CPU is likely 
to be small, even a maximum of 248 is possible KVM warns above 16 CPU so 
the loop for setting all CPU inside PTF interception is not very 
problematic I think.


Doing like you say should simplify PTF interception (no loop) but 
complicates (some more if/else) TLE handling and QMP information display 
on CPU.

So I will have a look at the implications and answer again on this.

Thanks,

Regards,
Pierre

--
Pierre Morel
IBM Lab Boeblingen

Re: [PATCH 00/12] More cleanups and fixes for drain

2023-01-16 Thread Kevin Wolf

Am 12.12.2022 um 13:59 hat Paolo Bonzini geschrieben:
> There are a few more lines of code that can be removed around draining
> code, but especially the logic can be simplified by removing unnecessary
> parameters.
> 
> Due to the failure of the block-next branch, the first three patches
> drop patches 14+15 of Kevin's drain cleanup series, and then redo
> patch 15 in a slightly less satisfactory way that still enables the
> remaining cleanups.  These reverts are not supposed to be applied;
> either the offending patches are dropped from the branch, or if the
> issue is fixed then my first three patches can go away.
> 
> The next three are taken from Emanuele's old subtree drain attempt
> at removing the AioContext.  The main one is the second, which is needed
> to avoid testcase failures, but I included all of them for simplicity.
> 
> Patch 7 fixes another latent bug exposed by the later cleanups, and while
> looking for a fix I noticed a general lack of thread-safety in BlockBackend's
> drain code.  There are some global properties that only need to be documented
> and enforced to be set only at creation time (patches 8/9), but also
> queued_requests is not protected by any mutex, which is fixed in patch 10.
> 
> Finally, patches 11-15 are the actual simplification.

Patches 12-15 actually look independent from the rest of the series, and
they look good to me. Could they be applied now even if the rest of the
series needs more discussion and a v2?

Kevin

Re: [PATCH 6/6] hw/arm/aspeed: Init fby35 BMC FRUID EEPROM

2023-01-16 Thread Peter Delevoryas

On Mon, Jan 16, 2023 at 01:30:19PM +0100, Philippe Mathieu-Daudé wrote:
> On 14/1/23 18:01, Peter Delevoryas wrote:
> > Signed-off-by: Peter Delevoryas 
> > ---
> >   hw/arm/aspeed.c | 49 +
> >   1 file changed, 49 insertions(+)
> > 
> > diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
> > index c929c61d582a..4ac8ff11a835 100644
> > --- a/hw/arm/aspeed.c
> > +++ b/hw/arm/aspeed.c
> > @@ -922,6 +922,52 @@ static void bletchley_bmc_i2c_init(AspeedMachineState 
> > *bmc)
> >   i2c_slave_create_simple(i2c[12], TYPE_PCA9552, 0x67);
> >   }
> > +static const uint8_t fby35_bmc_fruid[] = {
> [...]
> 
> > +};
> > +
> >   static void fby35_i2c_init(AspeedMachineState *bmc)
> >   {
> >   AspeedSoCState *soc = >soc;
> > @@ -1363,6 +1409,9 @@ static void fby35_reset(MachineState *state, 
> > ShutdownCause reason)
> >   object_property_set_bool(OBJECT(gpio), "gpioB3", false, _fatal);
> >   object_property_set_bool(OBJECT(gpio), "gpioB4", false, _fatal);
> >   object_property_set_bool(OBJECT(gpio), "gpioB5", false, _fatal);
> > +
> > +at24c_eeprom_write(aspeed_i2c_get_bus(>soc.i2c, 11),
> > +   0x54, 0, fby35_bmc_fruid, sizeof(fby35_bmc_fruid));
> 
> Why transfer the prom content on the i2c bus at each reset?
> 
> In particular this looks wrong if the prom is initialized with a 'drive'
> block backend (using -global).

Yeah, it looks like this might not be the right way to model it. I'm going
to try Cedric's suggestions.

> 
> >   }
> 
>

Re: [PATCH 2/6] hw/arm/aspeed: Remove local copy of at24c_eeprom_init

2023-01-16 Thread Peter Delevoryas

On Mon, Jan 16, 2023 at 01:24:36PM +0100, Philippe Mathieu-Daudé wrote:
> On 14/1/23 18:01, Peter Delevoryas wrote:
> > Signed-off-by: Peter Delevoryas 
> > ---
> >   hw/arm/aspeed.c | 10 +-
> >   1 file changed, 1 insertion(+), 9 deletions(-)
> 
> > -static void at24c_eeprom_init(I2CBus *bus, uint8_t addr, uint32_t rsize)
> > -{
> > -I2CSlave *i2c_dev = i2c_slave_new("at24c-eeprom", addr);
> > -DeviceState *dev = DEVICE(i2c_dev);
> > -
> > -qdev_prop_set_uint32(dev, "rom-size", rsize);
> > -i2c_slave_realize_and_unref(i2c_dev, bus, _abort);
> > -}
> 
> Why not squash in previous commit as 'extract helper' change?

+1, I'll squash this.

> 
> Anyhow,
> Reviewed-by: Philippe Mathieu-Daudé 
> 
>

Re: [PATCH 1/6] hw/nvram/eeprom_at24c: Add header w/ init helper

2023-01-16 Thread Peter Delevoryas

On Mon, Jan 16, 2023 at 01:23:01PM +0100, Philippe Mathieu-Daudé wrote:
> On 14/1/23 18:01, Peter Delevoryas wrote:
> > Signed-off-by: Peter Delevoryas 
> > ---
> >   hw/nvram/eeprom_at24c.c | 10 ++
> >   include/hw/nvram/eeprom_at24c.h | 10 ++
> >   2 files changed, 20 insertions(+)
> >   create mode 100644 include/hw/nvram/eeprom_at24c.h
> 
> > +void at24c_eeprom_init(I2CBus *bus, uint8_t address, uint32_t rom_size)
> > +{
> > +I2CSlave *i2c_dev = i2c_slave_new("at24c-eeprom", address);
> 
> Please use the type definition: TYPE_AT24C_EE.
> 
> > +DeviceState *dev = DEVICE(i2c_dev);
> > +
> > +qdev_prop_set_uint32(dev, "rom-size", rom_size);
> > +i2c_slave_realize_and_unref(i2c_dev, bus, _abort);
> 
> Although the allocated object is somehow reachable from the i2c bus
> object, it would be simpler to deallocate allowing the parent to keep
> a reference to it. So consider this prototype instead:
> 
>   I2CSlave *at24c_eeprom_create(I2CBus *bus, uint8_t address,
> uint32_t rom_size);
> 

Oh ok, yeah that sounds good. In this case, if I let the parent keep a
reference, maybe I shouldn't use i2c_slave_realize_and_unref, and just use
qdev_realize/etc (to avoid the unref?). I'll try just returning the pointer
from the function to start with though.

> > +}

Re: [PATCH v6 15/51] i386/xen: manage and save/restore Xen guest long_mode setting

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: David Woodhouse 

Xen will "latch" the guest's 32-bit or 64-bit ("long mode") setting when
the guest writes the MSR to fill in the hypercall page, or when the guest
sets the event channel callback in HVM_PARAM_CALLBACK_IRQ.

KVM handles the former and sets the kernel's long_mode flag accordingly.
The latter will be handled in userspace. Keep them in sync by noticing
when a hypercall is made in a mode that doesn't match qemu's idea of
the guest mode, and resyncing from the kernel. Do that same sync right
before serialization too, in case the guest has set the hypercall page
but hasn't yet made a system call.

Signed-off-by: David Woodhouse 


Reviewed-by: Paul Durrant 

... with one suggestion...


---
  hw/i386/kvm/xen_overlay.c | 65 +++
  hw/i386/kvm/xen_overlay.h |  4 +++
  target/i386/kvm/xen-emu.c | 12 
  3 files changed, 81 insertions(+)

diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c
index 3e85bf912f..6fd63ff906 100644
--- a/hw/i386/kvm/xen_overlay.c
+++ b/hw/i386/kvm/xen_overlay.c
@@ -44,6 +44,7 @@ struct XenOverlayState {
  MemoryRegion shinfo_mem;
  void *shinfo_ptr;
  uint64_t shinfo_gpa;
+bool long_mode;
  };
  
  struct XenOverlayState *xen_overlay_singleton;

@@ -96,9 +97,21 @@ static void xen_overlay_realize(DeviceState *dev, Error 
**errp)
  
  s->shinfo_ptr = memory_region_get_ram_ptr(>shinfo_mem);

  s->shinfo_gpa = INVALID_GPA;
+s->long_mode = false;
  memset(s->shinfo_ptr, 0, XEN_PAGE_SIZE);
  }
  
+static int xen_overlay_pre_save(void *opaque)

+{
+/*
+ * Fetch the kernel's idea of long_mode to avoid the race condition
+ * where the guest has set the hypercall page up in 64-bit mode but
+ * not yet made a hypercall by the time migration happens, so qemu
+ * hasn't yet noticed.
+ */
+return xen_sync_long_mode();
+}
+
  static int xen_overlay_post_load(void *opaque, int version_id)
  {
  XenOverlayState *s = opaque;
@@ -107,6 +120,9 @@ static int xen_overlay_post_load(void *opaque, int 
version_id)
  xen_overlay_map_page_locked(>shinfo_mem, s->shinfo_gpa);
  xen_overlay_set_be_shinfo(s->shinfo_gpa >> XEN_PAGE_SHIFT);
  }
+if (s->long_mode) {
+xen_set_long_mode(true);
+}
  
  return 0;

  }
@@ -121,9 +137,11 @@ static const VMStateDescription xen_overlay_vmstate = {
  .version_id = 1,
  .minimum_version_id = 1,
  .needed = xen_overlay_is_needed,
+.pre_save = xen_overlay_pre_save,
  .post_load = xen_overlay_post_load,
  .fields = (VMStateField[]) {
  VMSTATE_UINT64(shinfo_gpa, XenOverlayState),
+VMSTATE_BOOL(long_mode, XenOverlayState),
  VMSTATE_END_OF_LIST()
  }
  };
@@ -198,3 +216,50 @@ void *xen_overlay_get_shinfo_ptr(void)
  
  return s->shinfo_ptr;

  }
+
+int xen_sync_long_mode(void)
+{
+int ret;
+struct kvm_xen_hvm_attr xa = {
+.type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+};
+
+if (!xen_overlay_singleton) {
+return -ENOENT;
+}
+
+ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_GET_ATTR, );
+if (!ret) {
+xen_overlay_singleton->long_mode = xa.u.long_mode;
+}
+
+return ret;
+}
+
+int xen_set_long_mode(bool long_mode)
+{
+int ret;
+struct kvm_xen_hvm_attr xa = {
+.type = KVM_XEN_ATTR_TYPE_LONG_MODE,
+.u.long_mode = long_mode,
+};
+
+if (!xen_overlay_singleton) {
+return -ENOENT;
+}
+
+ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, );
+if (!ret) {
+xen_overlay_singleton->long_mode = xa.u.long_mode;
+}
+
+return ret;
+}
+
+bool xen_is_long_mode(void)
+{
+if (xen_overlay_singleton) {
+return xen_overlay_singleton->long_mode;
+}
+return false;


return xen_overlay_singleton && xen_overlay_singleton->long_mode;

perhaps?


+}
diff --git a/hw/i386/kvm/xen_overlay.h b/hw/i386/kvm/xen_overlay.h
index 00cff05bb0..5c46a0b036 100644
--- a/hw/i386/kvm/xen_overlay.h
+++ b/hw/i386/kvm/xen_overlay.h
@@ -17,4 +17,8 @@ void xen_overlay_create(void);
  int xen_overlay_map_shinfo_page(uint64_t gpa);
  void *xen_overlay_get_shinfo_ptr(void);
  
+int xen_sync_long_mode(void);

+int xen_set_long_mode(bool long_mode);
+bool xen_is_long_mode(void);
+
  #endif /* QEMU_XEN_OVERLAY_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 80005ea527..80f09f33df 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -19,6 +19,8 @@
  #include "trace.h"
  #include "sysemu/runstate.h"
  
+#include "hw/i386/kvm/xen_overlay.h"

+
  #include "standard-headers/xen/version.h"
  #include "standard-headers/xen/sched.h"
  
@@ -274,6 +276,16 @@ int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)

  return -1;
  }
  
+/*

+ * The kernel latches the guest 32/64 mode when the MSR is used to fill
+ * the hypercall page. So if we see a hypercall in

Re: [PATCH 6/6] hw/arm/aspeed: Init fby35 BMC FRUID EEPROM

2023-01-16 Thread Peter Delevoryas

On Mon, Jan 16, 2023 at 01:42:48PM +0100, Cédric Le Goater wrote:
> On 1/14/23 18:01, Peter Delevoryas wrote:
> > Signed-off-by: Peter Delevoryas 
> > ---
> >   hw/arm/aspeed.c | 49 +
> >   1 file changed, 49 insertions(+)
> > 
> > diff --git a/hw/arm/aspeed.c b/hw/arm/aspeed.c
> > index c929c61d582a..4ac8ff11a835 100644
> > --- a/hw/arm/aspeed.c
> > +++ b/hw/arm/aspeed.c
> > @@ -922,6 +922,52 @@ static void bletchley_bmc_i2c_init(AspeedMachineState 
> > *bmc)
> >   i2c_slave_create_simple(i2c[12], TYPE_PCA9552, 0x67);
> >   }
> > +static const uint8_t fby35_bmc_fruid[] = {
> > +0x01, 0x00, 0x00, 0x01, 0x0d, 0x00, 0x00, 0xf1, 0x01, 0x0c, 0x00, 0x36,
> > +0xe6, 0xd0, 0xc6, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x42, 0x4d,
> > +0x43, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x4d, 0x6f,
> > +0x64, 0x75, 0x6c, 0x65, 0xcd, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58,
> > +0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xce, 0x58, 0x58, 0x58, 0x58, 0x58,
> > +0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc3, 0x31, 0x2e,
> > +0x30, 0xc9, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2,
> > +0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58,
> > +0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc1, 0x39, 0x01, 0x0c, 0x00, 0xc6,
> > +0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xd2, 0x59, 0x6f, 0x73, 0x65, 0x6d,
> > +0x69, 0x74, 0x65, 0x20, 0x56, 0x33, 0x2e, 0x35, 0x20, 0x45, 0x56, 0x54,
> > +0x32, 0xce, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58,
> > +0x58, 0x58, 0x58, 0x58, 0xc4, 0x45, 0x56, 0x54, 0x32, 0xcd, 0x58, 0x58,
> > +0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc7,
> > +0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc3, 0x31, 0x2e, 0x30, 0xc9,
> > +0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0x58, 0xc8, 0x43, 0x6f,
> > +0x6e, 0x66, 0x69, 0x67, 0x20, 0x41, 0xc1, 0x45, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
> > +};
> 
> 
> I would introduce a new aspeed_eeprom.c file for these definitions because
> each machine could have its own set of eeproms and aspeed.c is already big
> enough.

+1

> 
> >   static void fby35_i2c_init(AspeedMachineState *bmc)
> >   {
> >   AspeedSoCState *soc = >soc;
> > @@ -1363,6 +1409,9 @@ static void fby35_reset(MachineState *state, 
> > ShutdownCause reason)
> >   object_property_set_bool(OBJECT(gpio), "gpioB3", false, _fatal);
> >   object_property_set_bool(OBJECT(gpio), "gpioB4", false, _fatal);
> >   object_property_set_bool(OBJECT(gpio), "gpioB5", false, _fatal);
> > +
> > +at24c_eeprom_write(aspeed_i2c_get_bus(>soc.i2c, 11),
> > +   0x54, 0, fby35_bmc_fruid, sizeof(fby35_bmc_fruid));
> >   }
> 
> That's one way to model the default reset values of the eeprom, we would
> loose any writes though.
> 
> I think we should have a

Re: [PATCH v6 14/51] i386/xen: add pc_machine_kvm_type to initialize XEN_EMULATE mode

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: David Woodhouse 

The xen_overlay device (and later similar devices for event channels and
grant tables) need to be instantiated. Do this from a kvm_type method on
the PC machine derivatives, since KVM is only way to support Xen emulation
for now.

Signed-off-by: David Woodhouse 
---
  hw/i386/pc.c | 11 +++
  include/hw/i386/pc.h |  3 +++
  2 files changed, 14 insertions(+)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index d489ecc0d1..0ddae2f6ad 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -89,6 +89,7 @@
  #include "hw/virtio/virtio-iommu.h"
  #include "hw/virtio/virtio-pmem-pci.h"
  #include "hw/virtio/virtio-mem-pci.h"
+#include "hw/i386/kvm/xen_overlay.h"
  #include "hw/mem/memory-device.h"
  #include "sysemu/replay.h"
  #include "target/i386/cpu.h"
@@ -1844,6 +1845,16 @@ static void pc_machine_initfn(Object *obj)
  cxl_machine_init(obj, >cxl_devices_state);
  }
  
+int pc_machine_kvm_type(MachineState *machine, const char *kvm_type)

+{
+#ifdef CONFIG_XEN_EMU
+if (xen_mode == XEN_EMULATE) {
+xen_overlay_create();
+}
+#endif
+return 0;
+}
+
  static void pc_machine_reset(MachineState *machine, ShutdownCause reason)
  {
  CPUState *cs;
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 991f905f5d..b866567b7b 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -293,12 +293,15 @@ extern const size_t pc_compat_1_5_len;
  extern GlobalProperty pc_compat_1_4[];
  extern const size_t pc_compat_1_4_len;
  
+extern int pc_machine_kvm_type(MachineState *machine, const char *vm_type);

+
  #define DEFINE_PC_MACHINE(suffix, namestr, initfn, optsfn) \
  static void pc_machine_##suffix##_class_init(ObjectClass *oc, void *data) 
\
  { \
  MachineClass *mc = MACHINE_CLASS(oc); \
  optsfn(mc); \
  mc->init = initfn; \
+mc->kvm_type = pc_machine_kvm_type; \


Given that it does nothing in the non-Xen-emulate case, would it not be 
neater to simply wrap the above line, and the definition of the 
function, in #ifdef CONFIG_XEN_EMU?


  Paul


  } \
  static const TypeInfo pc_machine_type_##suffix = { \
  .name   = namestr TYPE_MACHINE_SUFFIX, \

Re: [PATCH 4/4] tests/tcg/multiarch: add vma-pthread.c

2023-01-16 Thread Alex Bennée



Peter Maydell  writes:

> On Mon, 16 Jan 2023 at 16:33, Alex Bennée  wrote:
>>
>>
>> Peter Maydell  writes:
>>
>> > On Mon, 16 Jan 2023 at 12:40, Philippe Mathieu-Daudé  
>> > wrote:
>> >>
>> >> On 13/1/23 18:10, Alex Bennée wrote:
>> > Yep. Could somebody write a patch to disable this test while
>> > we figure out why it's flaky, please?
>>
>> I don't think the test is flaky - I think it is triggering a race in
>> QEMU code. I have not however been able to replicate it in anything other
>> than CI.
>
> My definition of "flaky" here is "the CI fails randomly".
> Unless you have a patch to fix the underlying bug ready to
> go right now, please can we disable this so we don't keep
> getting CI failures trying to test merges?

Yes - just testing the patch now.

>
> thanks
> -- PMM


-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro

Re: [PATCH 0/7] Trivial: Mark some more files as target-independant

2023-01-16 Thread Laurent Vivier


Le 12/01/2023 à 14:49, Thomas Huth a écrit :

Here's a collection of low-hanging fruits to mark some more files
as target-independent (so that they do not have to be compiled twice,
once for qemu-system-arm and once for qemu-system-aarch64).

Philippe's patches have been on the list before, but I slightly
modified some of them (like fixing typos in the subject etc.).
My patches are new.

Philippe Mathieu-Daudé (4):
   hw/display: Move omap_lcdc.c out of target-specific source set
   hw/intc: Move some files out of the target-specific source set
   hw/tpm: Move tpm_ppi.c out of target-specific source set
   hw/arm: Move various units to softmmu_ss[]

Thomas Huth (3):
   hw/cpu: Mark arm11 and realview mpcore as target-independent code
   hw/intc: Mark more interrupt-controller files as target independent
   hw/usb: Mark the XLNX_VERSAL-related files as target-independent

  hw/arm/meson.build | 11 +++
  hw/cpu/meson.build |  4 ++--
  hw/display/meson.build |  2 +-
  hw/intc/meson.build| 12 ++--
  hw/tpm/meson.build |  4 ++--
  hw/usb/meson.build |  4 ++--
  6 files changed, 20 insertions(+), 17 deletions(-)



Series applied to my trivial-patches branch.

Thanks,
Laurent

Re: [PATCH v6 13/51] hw/xen: Add xen_overlay device for emulating shared xenheap pages

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: David Woodhouse 

For the shared info page and for grant tables, Xen shares its own pages
from the "Xen heap" to the guest. The guest requests that a given page
from a certain address space (XENMAPSPACE_shared_info, etc.) be mapped
to a given GPA using the XENMEM_add_to_physmap hypercall.

To support that in qemu when *emulating* Xen, create a memory region
(migratable) and allow it to be mapped as an overlay when requested.

Xen theoretically allows the same page to be mapped multiple times
into the guest, but that's hard to track and reinstate over migration,
so we automatically *unmap* any previous mapping when creating a new
one. This approach has been used in production with a non-trivial
number of guests expecting true Xen, without any problems yet being
noticed.

This adds just the shared info page for now. The grant tables will be
a larger region, and will need to be overlaid one page at a time. I
think that means I need to create separate aliases for each page of
the overall grant_frames region, so that they can be mapped individually.

Signed-off-by: David Woodhouse 


Reviewed-by: Paul Durrant

Re: [PATCH 11/15] block-backend: make queued_requests thread-safe

2023-01-16 Thread Kevin Wolf

Am 12.12.2022 um 13:59 hat Paolo Bonzini geschrieben:
> Protect quiesce_counter and queued_requests with a QemuMutex, so that
> they are protected from concurrent access in the main thread (for example
> blk_root_drained_end() reached from bdrv_drain_all()) and in the iothread
> (where any I/O operation will call blk_inc_in_flight()).
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/block-backend.c | 44 +++
>  1 file changed, 36 insertions(+), 8 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 627d491d4155..fdf82f1f1599 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -23,6 +23,7 @@
>  #include "qapi/error.h"
>  #include "qapi/qapi-events-block.h"
>  #include "qemu/id.h"
> +#include "qemu/thread.h"
>  #include "qemu/main-loop.h"
>  #include "qemu/option.h"
>  #include "trace.h"
> @@ -85,6 +86,8 @@ struct BlockBackend {
>  NotifierList remove_bs_notifiers, insert_bs_notifiers;
>  QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
>  
> +/* Protected by quiesce_lock.  */
> +QemuMutex quiesce_lock;
>  int quiesce_counter;
>  CoQueue queued_requests;
>  
> @@ -372,6 +375,7 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, 
> uint64_t shared_perm)
>  
>  block_acct_init(>stats);
>  
> +qemu_mutex_init(>quiesce_lock);
>  qemu_co_queue_init(>queued_requests);
>  notifier_list_init(>remove_bs_notifiers);
>  notifier_list_init(>insert_bs_notifiers);
> @@ -490,6 +494,7 @@ static void blk_delete(BlockBackend *blk)
>  assert(QLIST_EMPTY(>insert_bs_notifiers.notifiers));
>  assert(QLIST_EMPTY(>aio_notifiers));
>  QTAILQ_REMOVE(_backends, blk, link);
> +qemu_mutex_destroy(>quiesce_lock);
>  drive_info_del(blk->legacy_dinfo);
>  block_acct_cleanup(>stats);
>  g_free(blk);
> @@ -1451,11 +1456,25 @@ void blk_inc_in_flight(BlockBackend *blk)
>  {
>  IO_CODE();
>  qatomic_inc(>in_flight);
> -if (!blk->disable_request_queuing) {
> -/* TODO: this is not thread-safe! */
> +
> +/*
> + * Avoid a continuous stream of requests from AIO callbacks, which
> + * call a user-provided callback while blk->in_flight is elevated,
> + * if the BlockBackend is being quiesced.
> + *
> + * This initial test does not have to be perfect: a race might
> + * cause an extra I/O to be queued, but sooner or later a nonzero
> + * quiesce_counter will be observed.

This is true in the initial drain phase while we're still polling. But
generally this is not only about avoiding a continuous stream of
requests, but about making sure that absolutely no new requests come in
while a node is drained.

> + */
> +if (!blk->disable_request_queuing && 
> qatomic_read(>quiesce_counter)) {

So if no other requests were pending and we didn't even call aio_poll()
because the AIO_WAIT_WHILE() condition was false from the start, could
it be that bdrv_drained_begin() has already returned on the thread that
drains, but another thread still sees the old value here?

Starting a new request after bdrv_drained_begin() has returned would be
a bug.

Kevin

Re: [PATCH v2] tests/qtest/qom-test: Do not print tested properties by default

2023-01-16 Thread Peter Maydell

On Thu, 15 Dec 2022 at 15:30, Thomas Huth  wrote:
>
> We're still running into the problem that some logs are cut in the
> gitlab-CI since they got too big. The biggest part of the log is
> still the output of the qom-test. Let's stop printing the properties
> by default to get to a saner size here. The full output can still
> be enabled by setting V=2 (or higher) in the environment.
>
> Signed-off-by: Thomas Huth 
> ---
>  v2: Use atoi() to do proper checking of the verbosity level

Applied to master in the hope of improving the CI logs; thanks.

-- PMM

Re: [PATCH 10/15] block-backend: always wait for drain before starting operation

2023-01-16 Thread Kevin Wolf

Am 12.12.2022 um 13:59 hat Paolo Bonzini geschrieben:
> All I/O operations call blk_wait_while_drained() immediately after
> blk_inc_in_flight(), except for blk_abort_aio_request() where it
> does not hurt to add such a call.  Merge the two functions into one,
> and add a note about a disturbing lack of thread-safety that will
> be fixed shortly.
> 
> While at it, make the quiesce_counter check a loop.  While the check
> does not have to be "perfect", i.e. it only matters that an endless
> stream of I/Os is stopped sooner or later, it is more logical to check
> the counter repeatedly until it is zero.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/block-backend.c | 27 ---
>  1 file changed, 8 insertions(+), 19 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index fe42d53d655d..627d491d4155 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1270,18 +1270,6 @@ static int blk_check_byte_request(BlockBackend *blk, 
> int64_t offset,
>  return 0;
>  }
>  
> -/* To be called between exactly one pair of blk_inc/dec_in_flight() */
> -static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
> -{
> -assert(blk->in_flight > 0);
> -
> -if (blk->quiesce_counter && !blk->disable_request_queuing) {
> -blk_dec_in_flight(blk);
> -qemu_co_queue_wait(>queued_requests, NULL);
> -blk_inc_in_flight(blk);
> -}
> -}
> -
>  /* To be called between exactly one pair of blk_inc/dec_in_flight() */
>  static int coroutine_fn
>  blk_co_do_preadv_part(BlockBackend *blk, int64_t offset, int64_t bytes,
> @@ -1334,7 +1322,6 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, 
> int64_t offset,
>  IO_OR_GS_CODE();
>  
>  blk_inc_in_flight(blk);
> -blk_wait_while_drained(blk);
>  ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, 0, flags);
>  blk_dec_in_flight(blk);
>  
> @@ -1349,7 +1336,6 @@ int coroutine_fn blk_co_preadv_part(BlockBackend *blk, 
> int64_t offset,
>  IO_OR_GS_CODE();
>  
>  blk_inc_in_flight(blk);
> -blk_wait_while_drained(blk);
>  ret = blk_co_do_preadv_part(blk, offset, bytes, qiov, qiov_offset, 
> flags);
>  blk_dec_in_flight(blk);
>  
> @@ -1401,7 +1387,6 @@ int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, 
> int64_t offset,
>  IO_OR_GS_CODE();
>  
>  blk_inc_in_flight(blk);
> -blk_wait_while_drained(blk);
>  ret = blk_co_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, 
> flags);
>  blk_dec_in_flight(blk);
>  
> @@ -1466,6 +1451,14 @@ void blk_inc_in_flight(BlockBackend *blk)
>  {
>  IO_CODE();
>  qatomic_inc(>in_flight);
> +if (!blk->disable_request_queuing) {
> +/* TODO: this is not thread-safe! */
> +while (blk->quiesce_counter) {
> +qatomic_dec(>in_flight);
> +qemu_co_queue_wait(>queued_requests, NULL);

blk_inc_in_flight() must be a coroutine_fn now.

blk_abort_aio_request() and blk_aio_prwv() aren't, but still call it.

> +qatomic_inc(>in_flight);
> +}
> +}
>  }

Kevin

Re: [PATCH 4/4] tests/tcg/multiarch: add vma-pthread.c

2023-01-16 Thread Peter Maydell

On Mon, 16 Jan 2023 at 16:33, Alex Bennée  wrote:
>
>
> Peter Maydell  writes:
>
> > On Mon, 16 Jan 2023 at 12:40, Philippe Mathieu-Daudé  
> > wrote:
> >>
> >> On 13/1/23 18:10, Alex Bennée wrote:
> > Yep. Could somebody write a patch to disable this test while
> > we figure out why it's flaky, please?
>
> I don't think the test is flaky - I think it is triggering a race in
> QEMU code. I have not however been able to replicate it in anything other
> than CI.

My definition of "flaky" here is "the CI fails randomly".
Unless you have a patch to fix the underlying bug ready to
go right now, please can we disable this so we don't keep
getting CI failures trying to test merges?

thanks
-- PMM

Re: [PATCH v3 5/7] hw/i386/acpi-build: Remove unused attributes

2023-01-16 Thread Igor Mammedov

On Mon, 16 Jan 2023 16:29:06 +0100
Bernhard Beschow  wrote:

> Ammends commit 3db119da7915 'pc: acpi: switch to AML API composed DSDT'.
> 
> Signed-off-by: Bernhard Beschow 
> Reviewed-by: Philippe Mathieu-Daudé 

Reviewed-by: Igor Mammedov 

> ---
>  hw/i386/acpi-build.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
> index 0be3960a37..428328dc2d 100644
> --- a/hw/i386/acpi-build.c
> +++ b/hw/i386/acpi-build.c
> @@ -117,8 +117,6 @@ typedef struct AcpiMiscInfo {
>  #ifdef CONFIG_TPM
>  TPMVersion tpm_version;
>  #endif
> -const unsigned char *dsdt_code;
> -unsigned dsdt_size;
>  } AcpiMiscInfo;
>  
>  typedef struct FwCfgTPMConfig {

Re: Call qemu_socketpair() instead of socketpair() when possible

2023-01-16 Thread Laurent Vivier


Le 16/01/2023 à 05:56, Guoyi Tu a écrit :

As qemu_socketpair() was introduced in commit 3c63b4e9
("oslib-posix: Introduce qemu_socketpair()"), it's time
to replace the other existing socketpair() calls with
qemu_socketpair() if possible

Signed-off-by: Guoyi Tu 
---
  backends/tpm/tpm_emulator.c | 2 +-
  tests/qtest/dbus-display-test.c | 5 +++--
  tests/qtest/migration-test.c    | 2 +-
  tests/unit/test-crypto-tlssession.c | 4 ++--
  tests/unit/test-io-channel-tls.c    | 2 +-
  5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/backends/tpm/tpm_emulator.c b/backends/tpm/tpm_emulator.c
index 49cc3d749d..67e7b212e3 100644
--- a/backends/tpm/tpm_emulator.c
+++ b/backends/tpm/tpm_emulator.c
@@ -553,7 +553,7 @@ static int tpm_emulator_prepare_data_fd(TPMEmulator 
*tpm_emu)
  Error *err = NULL;
  int fds[2] = { -1, -1 };

-    if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0) {
+    if (qemu_socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0) {
  error_report("tpm-emulator: Failed to create socketpair");
  return -1;
  }
diff --git a/tests/qtest/dbus-display-test.c b/tests/qtest/dbus-display-test.c
index cb1b62d1d1..fef025ac6f 100644
--- a/tests/qtest/dbus-display-test.c
+++ b/tests/qtest/dbus-display-test.c
@@ -1,5 +1,6 @@
  #include "qemu/osdep.h"
  #include "qemu/dbus.h"
+#include "qemu/sockets.h"
  #include 
  #include 
  #include "libqtest.h"
@@ -36,7 +37,7 @@ test_setup(QTestState **qts, GDBusConnection **conn)

  *qts = qtest_init("-display dbus,p2p=yes -name dbus-test");

-    g_assert_cmpint(socketpair(AF_UNIX, SOCK_STREAM, 0, pair), ==, 0);
+    g_assert_cmpint(qemu_socketpair(AF_UNIX, SOCK_STREAM, 0, pair), ==, 0);

  qtest_qmp_add_client(*qts, "@dbus-display", pair[1]);

@@ -152,7 +153,7 @@ test_dbus_display_console(void)

  test_setup(, );

-    g_assert_cmpint(socketpair(AF_UNIX, SOCK_STREAM, 0, pair), ==, 0);
+    g_assert_cmpint(qemu_socketpair(AF_UNIX, SOCK_STREAM, 0, pair), ==, 0);
  fd_list = g_unix_fd_list_new();
  idx = g_unix_fd_list_append(fd_list, pair[1], NULL);

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index dbde726adf..1dd32c9506 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -1661,7 +1661,7 @@ static void *test_migrate_fd_start_hook(QTestState *from,
  int pair[2];

  /* Create two connected sockets for migration */
-    ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
+    ret = qemu_socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
  g_assert_cmpint(ret, ==, 0);

  /* Send the 1st socket to the target */
diff --git a/tests/unit/test-crypto-tlssession.c 
b/tests/unit/test-crypto-tlssession.c
index 615a1344b4..b12e7b6879 100644
--- a/tests/unit/test-crypto-tlssession.c
+++ b/tests/unit/test-crypto-tlssession.c
@@ -82,7 +82,7 @@ static void test_crypto_tls_session_psk(void)
  int ret;

  /* We'll use this for our fake client-server connection */
-    ret = socketpair(AF_UNIX, SOCK_STREAM, 0, channel);
+    ret = qemu_socketpair(AF_UNIX, SOCK_STREAM, 0, channel);
  g_assert(ret == 0);

  /*
@@ -236,7 +236,7 @@ static void test_crypto_tls_session_x509(const void *opaque)
  int ret;

  /* We'll use this for our fake client-server connection */
-    ret = socketpair(AF_UNIX, SOCK_STREAM, 0, channel);
+    ret = qemu_socketpair(AF_UNIX, SOCK_STREAM, 0, channel);
  g_assert(ret == 0);

  /*
diff --git a/tests/unit/test-io-channel-tls.c b/tests/unit/test-io-channel-tls.c
index cc39247556..e036ac5df4 100644
--- a/tests/unit/test-io-channel-tls.c
+++ b/tests/unit/test-io-channel-tls.c
@@ -121,7 +121,7 @@ static void test_io_channel_tls(const void *opaque)
  GMainContext *mainloop;

  /* We'll use this for our fake client-server connection */
-    g_assert(socketpair(AF_UNIX, SOCK_STREAM, 0, channel) == 0);
+    g_assert(qemu_socketpair(AF_UNIX, SOCK_STREAM, 0, channel) == 0);

  #define CLIENT_CERT_DIR "tests/test-io-channel-tls-client/"
  #define SERVER_CERT_DIR "tests/test-io-channel-tls-server/"


Applied to my trivial-patches branch.

Thanks,
Laurent

Re: [PATCH v2] semihosting: add O_BINARY flag in host_open for NT compatibility

2023-01-16 Thread Peter Maydell

On Mon, 16 Jan 2023 at 15:56,  wrote:
>
>
>
> On 1/6/23 7:58 PM, Peter Maydell  wrote:
> > On Fri, 6 Jan 2023 at 18:22, Evgeny Iakovlev
> >  wrote:
> > >
> > >
> > > On 1/6/2023 17:28, Peter Maydell wrote:
> > >> On Fri, 6 Jan 2023 at 15:44, Alex Bennée  wrote:
> > >>> Peter Maydell  writes:
> > >> I think the theory when the semihosting API was originally designed
> > >> decades ago was basically "when the guest does fopen(...) this
> > >> should act like it does on the host". So as a bit of portable
> > >> guest code you would say whether you wanted a binary or a text
> > >> file, and the effect would be that if you were running on Windows
> > >> and you output a text file then you'd get \r\n like the user
> > >> probably expected, and if on Linux you get \n.
> >
> > > If SYS_OPEN is supposed to call fopen (i didn't actually know that..)
> > > then it does make more sense for binary/text mode to be propagated from
> > > guest.
> >
> > It's not required to literally call fopen(). It just has to
> > give the specified semantics for when the guest passes it a
> > mode integer, which is defined in terms of the ISO C
> > fopen() string semantics for "r", "rb", "r+", "r+b", etc.
> >
> > > Qemu's implementation calls open(2) though, which is not correct
> > > at all then. Well, as long as qemu does that, there is no
> > > posix-compliant way to tell open(2) if it should use binary or text
> > > mode, there is no notion of that as far as posix (and most
> > > implementations) is concerned.
> >
> > QEMU doesn't have to be pure POSIX compliant: we know what our
> > supported host platforms are and we can freely use extensions
> > they provide. If we want to achieve the semantics that semihosting
> > asks for then we can do that with open(), by passing O_BINARY when
> > the mode integer from the guest corresponds to a string with "b" in it.

> Thanks Peter, i think i see your point. However, if you ask me, i feel like 
> advertising a feature to guest code and only implementing it on 1 platform 
> that supports it just because it has a non-standard POSIX implementation will 
> only confuse the issue further.

Huh? We can implement it, if we want, on *all* hosts that
we support:
 * On Windows hosts, plumb the binary indication from the
   semihosting SYS_OPEN call through to whether we pass O_BINARY
   to open(2)
 * On all other hosts, do nothing: on these hosts, text and
   binary files are identical so there is nothing to do

Note that semihosting is not an API that QEMU has specified:
it's an external one provided by multiple platforms. We do
not "advertise" the existence of the 'binary' flag to SYS_OPEN:
it is part of the pre-existing decades-old specification we
implement.

> Guest code doesn't want to care whether or not an emulator is
> running on Linux or Windows, there is no notion of that leaking
> to guest code. What it cares about is being able to consistently
> use a certain feature in their code.

The trouble here is that we have two different choices
about how to be consistent:

(1) Consistently have guests that use semihosting to open
a file in text mode get the text-mode file that they asked for,
regardless of the host operating system and its definition of
what a text file is
(2) Consistently have guest code produce a binary-identical
output file regardless of host operating system

It is not possible to have both; we have to pick one.

On balance, I agree with Alex that option (2) is probably
better, especially with the file-I/O-via-gdbstub part of it;
but we are genuinely giving up property (1) in the process.

thanks
-- PMM

Re: [PATCH v3 1/7] hw/acpi/acpi_dev_interface: Remove unused parameter from AcpiDeviceIfClass::madt_cpu

2023-01-16 Thread Igor Mammedov

On Mon, 16 Jan 2023 16:29:02 +0100
Bernhard Beschow  wrote:

> The only function ever assigned to AcpiDeviceIfClass::madt_cpu is
> pc_madt_cpu_entry() which doesn't use the AcpiDeviceIf parameter.
> 
> Signed-off-by: Bernhard Beschow 

Reviewed-by: Igor Mammedov 

> ---
>  include/hw/acpi/acpi_dev_interface.h | 3 +--
>  include/hw/i386/pc.h | 6 ++
>  hw/acpi/acpi-x86-stub.c  | 5 ++---
>  hw/acpi/cpu.c| 3 +--
>  hw/i386/acpi-common.c| 7 +++
>  5 files changed, 9 insertions(+), 15 deletions(-)
> 
> diff --git a/include/hw/acpi/acpi_dev_interface.h 
> b/include/hw/acpi/acpi_dev_interface.h
> index ea6056ab92..a1648220ff 100644
> --- a/include/hw/acpi/acpi_dev_interface.h
> +++ b/include/hw/acpi/acpi_dev_interface.h
> @@ -52,8 +52,7 @@ struct AcpiDeviceIfClass {
>  /*  */
>  void (*ospm_status)(AcpiDeviceIf *adev, ACPIOSTInfoList ***list);
>  void (*send_event)(AcpiDeviceIf *adev, AcpiEventStatusBits ev);
> -void (*madt_cpu)(AcpiDeviceIf *adev, int uid,
> - const CPUArchIdList *apic_ids, GArray *entry,
> +void (*madt_cpu)(int uid, const CPUArchIdList *apic_ids, GArray *entry,
>   bool force_enabled);
>  };
>  #endif
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 991f905f5d..a0647165d1 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -9,7 +9,6 @@
>  #include "hw/block/flash.h"
>  #include "hw/i386/x86.h"
>  
> -#include "hw/acpi/acpi_dev_interface.h"
>  #include "hw/hotplug.h"
>  #include "qom/object.h"
>  #include "hw/i386/sgx-epc.h"
> @@ -193,9 +192,8 @@ bool pc_system_ovmf_table_find(const char *entry, uint8_t 
> **data,
>  void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size);
>  
>  /* hw/i386/acpi-common.c */
> -void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
> -   const CPUArchIdList *apic_ids, GArray *entry,
> -   bool force_enabled);
> +void pc_madt_cpu_entry(int uid, const CPUArchIdList *apic_ids,
> +   GArray *entry, bool force_enabled);
>  
>  /* sgx.c */
>  void pc_machine_init_sgx_epc(PCMachineState *pcms);
> diff --git a/hw/acpi/acpi-x86-stub.c b/hw/acpi/acpi-x86-stub.c
> index 3df1e090f4..d0d399d26b 100644
> --- a/hw/acpi/acpi-x86-stub.c
> +++ b/hw/acpi/acpi-x86-stub.c
> @@ -2,9 +2,8 @@
>  #include "hw/i386/pc.h"
>  #include "hw/i386/acpi-build.h"
>  
> -void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
> -   const CPUArchIdList *apic_ids, GArray *entry,
> -   bool force_enabled)
> +void pc_madt_cpu_entry(int uid, const CPUArchIdList *apic_ids,
> +   GArray *entry, bool force_enabled)
>  {
>  }
>  
> diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
> index 4e580959a2..19c154d78f 100644
> --- a/hw/acpi/cpu.c
> +++ b/hw/acpi/cpu.c
> @@ -355,7 +355,6 @@ void build_cpus_aml(Aml *table, MachineState *machine, 
> CPUHotplugFeatures opts,
>  char *cphp_res_path = g_strdup_printf("%s." CPUHP_RES_DEVICE, res_root);
>  Object *obj = object_resolve_path_type("", TYPE_ACPI_DEVICE_IF, NULL);
>  AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
> -AcpiDeviceIf *adev = ACPI_DEVICE_IF(obj);
>  
>  cpu_ctrl_dev = aml_device("%s", cphp_res_path);
>  {
> @@ -666,7 +665,7 @@ void build_cpus_aml(Aml *table, MachineState *machine, 
> CPUHotplugFeatures opts,
>  
>  /* build _MAT object */
>  assert(adevc && adevc->madt_cpu);
> -adevc->madt_cpu(adev, i, arch_ids, madt_buf,
> +adevc->madt_cpu(i, arch_ids, madt_buf,
>  true); /* set enabled flag */
>  aml_append(dev, aml_name_decl("_MAT",
>  aml_buffer(madt_buf->len, (uint8_t *)madt_buf->data)));
> diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
> index 4aaafbdd7b..52e5c1439a 100644
> --- a/hw/i386/acpi-common.c
> +++ b/hw/i386/acpi-common.c
> @@ -33,9 +33,8 @@
>  #include "acpi-build.h"
>  #include "acpi-common.h"
>  
> -void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
> -   const CPUArchIdList *apic_ids, GArray *entry,
> -   bool force_enabled)
> +void pc_madt_cpu_entry(int uid, const CPUArchIdList *apic_ids,
> +   GArray *entry, bool force_enabled)
>  {
>  uint32_t apic_id = apic_ids->cpus[uid].arch_id;
>  /* Flags – Local APIC Flags */
> @@ -112,7 +111,7 @@ void acpi_build_madt(GArray *table_data, BIOSLinker 
> *linker,
>  build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags 
> */
>  
>  for (i = 0; i < apic_ids->len; i++) {
> -adevc->madt_cpu(adev, i, apic_ids, table_data, false);
> +adevc->madt_cpu(i, apic_ids, table_data, false);
>  if (apic_ids->cpus[i].arch_id > 254) {
>  x2apic_mode = true;
>  }

Re: [PATCH v6 12/51] i386/xen: Implement SCHEDOP_poll and SCHEDOP_yield

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: David Woodhouse 

They both do the same thing and just call sched_yield. This is enough to
stop the Linux guest panicking when running on a host kernel which doesn't
intercept SCHEDOP_poll and lets it reach userspace.

Signed-off-by: David Woodhouse 


Reviewed-by: Paul Durrant 

... with some observations...


---
  target/i386/kvm/xen-emu.c | 12 
  1 file changed, 12 insertions(+)

diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 5f2b55ef10..80005ea527 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -227,6 +227,18 @@ static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
  err = schedop_shutdown(cs, arg);
  break;
  
+case SCHEDOP_poll:

+/*
+ * Linux will panic if this doesn't work. Just yield; it's not
+ * worth overthinking it because wWith event channel handling


Typo 'wWith'. Also possibly worth mentioning that the reason a yield is 
ok is because the semantics of the hypercall allow for spurious wake-up.



+ * in KVM, the kernel will intercept this and it will never
+ * reach QEMU anyway.
+ */
+case SCHEDOP_yield:
+sched_yield();
+err = 0;
+break;
+
  default:
  return false;
  }

Re: [PATCH 4/4] tests/tcg/multiarch: add vma-pthread.c

2023-01-16 Thread Alex Bennée



Peter Maydell  writes:

> On Mon, 16 Jan 2023 at 12:40, Philippe Mathieu-Daudé  
> wrote:
>>
>> On 13/1/23 18:10, Alex Bennée wrote:
>> >
>> > Peter Maydell  writes:
>> >
>> >> On Sat, 24 Dec 2022 at 15:19, Richard Henderson
>> >>  wrote:
>> >>>
>> >>> From: Ilya Leoshkevich 
>> >>>
>> >>> Add a test that locklessly changes and exercises page protection bits
>> >>> from various threads. This helps catch race conditions in the VMA
>> >>> handling.
>> >>>
>> >>> Signed-off-by: Ilya Leoshkevich 
>> >>> Message-Id: <20221223120252.513319-1-...@linux.ibm.com>
>> >>> Signed-off-by: Richard Henderson 
>> >>
>> >> I've noticed that this newly added vma-pthread test seems to
>> >> be flaky. Here's an example from a clang-user job:
>> >> https://gitlab.com/qemu-project/qemu/-/jobs/3600385176
>> >>
>> >> TEST vma-pthread-with-libbb.so on aarch64
>> >> fail indirect write 0x5500b1eff0 (Bad address)
>> >> timeout: the monitored command dumped core
>> >> Aborted
>> >> make[1]: *** [Makefile:173: run-plugin-vma-pthread-with-libbb.so] Error 
>> >> 134
>> >>
>> >> and another from a few days earlier:
>> >> https://gitlab.com/qemu-project/qemu/-/jobs/3572970612
>> >>
>> >> TEST vma-pthread-with-libsyscall.so on s390x
>> >> fail indirect read 0x4000999000 (Bad address)
>> >> timeout: the monitored command dumped core
>> >> Aborted
>> >> make[1]: *** [Makefile:173: run-plugin-vma-pthread-with-libsyscall.so] 
>> >> Error 134
>>
>> Yet again:
>> https://gitlab.com/qemu-project/qemu/-/jobs/3608436731
>
> Yep. Could somebody write a patch to disable this test while
> we figure out why it's flaky, please?

I don't think the test is flaky - I think it is triggering a race in
QEMU code. I have not however been able to replicate it in anything other
than CI.

Although looking at the test I'm beginning to wonder what the sync point
is between the mutator and the read/write threads?

>
> thanks
> -- PMM


-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro

Re: [PATCH v14 01/11] s390x/cpu topology: adding s390 specificities to CPU topology

2023-01-16 Thread Pierre Morel





On 1/10/23 12:37, Thomas Huth wrote:

On 05/01/2023 15.53, Pierre Morel wrote:

S390 adds two new SMP levels, drawers and books to the CPU
topology.
The S390 CPU have specific toplogy features like dedication
and polarity to give to the guest indications on the host
vCPUs scheduling and help the guest take the best decisions
on the scheduling of threads on the vCPUs.

Let us provide the SMP properties with books and drawers levels
and S390 CPU with dedication and polarity,

Signed-off-by: Pierre Morel 
---

...

diff --git a/qapi/machine.json b/qapi/machine.json
index b9228a5e46..ff8f2b0e84 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -900,13 +900,15 @@
  # a CPU is being hotplugged.
  #
  # @node-id: NUMA node ID the CPU belongs to
-# @socket-id: socket number within node/board the CPU belongs to
+# @drawer-id: drawer number within node/board the CPU belongs to
+# @book-id: book number within drawer/node/board the CPU belongs to
+# @socket-id: socket number within book/node/board the CPU belongs to


I think the new entries need a "(since 8.0)" comment (similar to die-id 
and cluster-id below).


right



Other question: Do we have "node-id"s on s390x? If not, is that similar 
to books or drawers, i.e. just another word? If so, we should maybe 
rather re-use "nodes" instead of introducing a new name for the same thing?


We have theoretically nodes-id on s390x, it is the level 5 of the 
topology, above drawers.
Currently it is not used in s390x topology, the maximum level returned 
to a LPAR host is 4.
I suppose that it adds a possibility to link several s390x with a fast 
network.





  # @die-id: die number within socket the CPU belongs to (since 4.1)
  # @cluster-id: cluster number within die the CPU belongs to (since 7.1)
  # @core-id: core number within cluster the CPU belongs to
  # @thread-id: thread number within core the CPU belongs to
  #
-# Note: currently there are 6 properties that could be present
+# Note: currently there are 8 properties that could be present
  #   but management should be prepared to pass through other
  #   properties with device_add command to allow for future
  #   interface extension. This also requires the filed names to 
be kept in

@@ -916,6 +918,8 @@
  ##
  { 'struct': 'CpuInstanceProperties',
    'data': { '*node-id': 'int',
+    '*drawer-id': 'int',
+    '*book-id': 'int',
  '*socket-id': 'int',
  '*die-id': 'int',
  '*cluster-id': 'int',
@@ -1465,6 +1469,10 @@
  #
  # @cpus: number of virtual CPUs in the virtual machine
  #
+# @drawers: number of drawers in the CPU topology
+#
+# @books: number of books in the CPU topology
+#


These also need a "(since 8.0)" comment at the end.


right again, I will add this.




  # @sockets: number of sockets in the CPU topology
  #
  # @dies: number of dies per socket in the CPU topology
@@ -1481,6 +1489,8 @@
  ##
  { 'struct': 'SMPConfiguration', 'data': {
   '*cpus': 'int',
+ '*drawers': 'int',
+ '*books': 'int',
   '*sockets': 'int',
   '*dies': 'int',
   '*clusters': 'int',

...

diff --git a/qemu-options.hx b/qemu-options.hx
index 7f99d15b23..8dc9a4c052 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -250,11 +250,13 @@ SRST
  ERST
  DEF("smp", HAS_ARG, QEMU_OPTION_smp,
-    "-smp 
[[cpus=]n][,maxcpus=maxcpus][,sockets=sockets][,dies=dies][,clusters=clusters][,cores=cores][,threads=threads]\n"
+    "-smp 
[[cpus=]n][,maxcpus=maxcpus][,drawers=drawers][,books=books][,sockets=sockets][,dies=dies][,clusters=clusters][,cores=cores][,threads=threads]\n"


This line now got too long. Please add a newline inbetween.


OK

Thanks.

Regards,
Pierre

--
Pierre Morel
IBM Lab Boeblingen

Re: [PATCH v3 2/7] hw/acpi/acpi_dev_interface: Resolve AcpiDeviceIfClass::madt_cpu

2023-01-16 Thread Igor Mammedov

On Mon, 16 Jan 2023 16:29:03 +0100
Bernhard Beschow  wrote:

> This class attribute was always set to pc_madt_cpu_entry().
> pc_madt_cpu_entry() is architecture dependent and was assigned to the
> attribute even in architecture agnostic code such as in hw/acpi/piix4.c
> and hw/isa/lpc_ich9. Not having to set madt_cpu there resolves the
> assumption that these device models are only ever used with ACPI on x86
> targets.
> 
> The only target independent location where madt_cpu was called was hw/
> acpi/cpu.c. Here a function pointer can be passed via an argument
> instead. The other locations where it was called was in x86-specific code
> where pc_madt_cpu_entry() can be used directly.
>
> While at it, move pc_madt_cpu_entry() from the public include/hw/i386/
> pc.h to the private hw/i386/acpi-common where it is also implemented.

I'm not sure about this approach,
the callback is intend to be used not only by x86 but also in
the end by ARM (it's just that arm/virt CPU hotplug patches are
still work in progress and haven't been merged).

So I'd prefer to keep AcpiDeviceIfClass::madt_cpu.

What's the end goal you are trying to achieve by getting
rid of this callback?

> Signed-off-by: Bernhard Beschow 
> ---
>  hw/i386/acpi-common.h|  7 +--
>  include/hw/acpi/acpi_dev_interface.h |  2 --
>  include/hw/acpi/cpu.h|  6 +-
>  include/hw/i386/pc.h |  4 
>  hw/acpi/acpi-x86-stub.c  |  6 --
>  hw/acpi/cpu.c| 10 --
>  hw/acpi/piix4.c  |  2 --
>  hw/i386/acpi-build.c |  5 ++---
>  hw/i386/acpi-common.c|  5 ++---
>  hw/i386/acpi-microvm.c   |  3 +--
>  hw/i386/generic_event_device_x86.c   |  9 -
>  hw/isa/lpc_ich9.c|  1 -
>  12 files changed, 19 insertions(+), 41 deletions(-)
> 
> diff --git a/hw/i386/acpi-common.h b/hw/i386/acpi-common.h
> index a68825acf5..968d625d88 100644
> --- a/hw/i386/acpi-common.h
> +++ b/hw/i386/acpi-common.h
> @@ -1,15 +1,18 @@
>  #ifndef HW_I386_ACPI_COMMON_H
>  #define HW_I386_ACPI_COMMON_H
>  
> -#include "hw/acpi/acpi_dev_interface.h"
>  #include "hw/acpi/bios-linker-loader.h"
>  #include "hw/i386/x86.h"
> +#include "hw/boards.h"
>  
>  /* Default IOAPIC ID */
>  #define ACPI_BUILD_IOAPIC_ID 0x0
>  
> +void pc_madt_cpu_entry(int uid, const CPUArchIdList *apic_ids, GArray *entry,
> +   bool force_enabled);
> +
>  void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
> - X86MachineState *x86ms, AcpiDeviceIf *adev,
> + X86MachineState *x86ms,
>   const char *oem_id, const char *oem_table_id);
>  
>  #endif
> diff --git a/include/hw/acpi/acpi_dev_interface.h 
> b/include/hw/acpi/acpi_dev_interface.h
> index a1648220ff..ca92928124 100644
> --- a/include/hw/acpi/acpi_dev_interface.h
> +++ b/include/hw/acpi/acpi_dev_interface.h
> @@ -52,7 +52,5 @@ struct AcpiDeviceIfClass {
>  /*  */
>  void (*ospm_status)(AcpiDeviceIf *adev, ACPIOSTInfoList ***list);
>  void (*send_event)(AcpiDeviceIf *adev, AcpiEventStatusBits ev);
> -void (*madt_cpu)(int uid, const CPUArchIdList *apic_ids, GArray *entry,
> - bool force_enabled);
>  };
>  #endif
> diff --git a/include/hw/acpi/cpu.h b/include/hw/acpi/cpu.h
> index 999caaf510..25b25bb594 100644
> --- a/include/hw/acpi/cpu.h
> +++ b/include/hw/acpi/cpu.h
> @@ -15,6 +15,7 @@
>  #include "hw/qdev-core.h"
>  #include "hw/acpi/acpi.h"
>  #include "hw/acpi/aml-build.h"
> +#include "hw/boards.h"
>  #include "hw/hotplug.h"
>  
>  typedef struct AcpiCpuStatus {
> @@ -55,8 +56,11 @@ typedef struct CPUHotplugFeatures {
>  const char *smi_path;
>  } CPUHotplugFeatures;
>  
> +typedef void (*madt_cpu_fn)(int uid, const CPUArchIdList *apic_ids,
> +GArray *entry, bool force_enabled);
> +
>  void build_cpus_aml(Aml *table, MachineState *machine, CPUHotplugFeatures 
> opts,
> -hwaddr io_base,
> +hwaddr io_base, madt_cpu_fn madt_cpu,
>  const char *res_root,
>  const char *event_handler_method);
>  
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index a0647165d1..a5cce88653 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -191,10 +191,6 @@ bool pc_system_ovmf_table_find(const char *entry, 
> uint8_t **data,
> int *data_len);
>  void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size);
>  
> -/* hw/i386/acpi-common.c */
> -void pc_madt_cpu_entry(int uid, const CPUArchIdList *apic_ids,
> -   GArray *entry, bool force_enabled);
> -
>  /* sgx.c */
>  void pc_machine_init_sgx_epc(PCMachineState *pcms);
>  
> diff --git a/hw/acpi/acpi-x86-stub.c b/hw/acpi/acpi-x86-stub.c
> index d0d399d26b..9662a594ad 100644
> --- a/hw/acpi/acpi-x86-stub.c
> +++

Re: [PATCH v6 11/51] i386/xen: implement HYPERVISOR_sched_op, SCHEDOP_shutdown

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

It allows to shutdown itself via hypercall with any of the 3 reasons:
   1) self-reboot
   2) shutdown
   3) crash

Implementing SCHEDOP_shutdown sub op let us handle crashes gracefully rather
than leading to triple faults if it remains unimplemented.

In addition, the SHUTDOWN_soft_reset reason is used for kexec, to reset
Xen shared pages and other enlightenments and leave a clean slate for the
new kernel without the hypervisor helpfully writing information at
unexpected addresses.

Signed-off-by: Joao Martins 
[dwmw2: Ditch sched_op_compat which was never available for HVM guests,
 Add SCHEDOP_soft_reset]
Signed-off-by: David Woodhouse 


Reviewed-by: Paul Durrant

Re: [PATCH v2] semihosting: add O_BINARY flag in host_open for NT compatibility

2023-01-16 Thread Alex Bennée



eiakov...@linux.microsoft.com writes:

> On 1/6/23 7:58 PM, Peter Maydell  wrote:
>> On Fri, 6 Jan 2023 at 18:22, Evgeny Iakovlev
>>  wrote:
>> >
>> >
>> > On 1/6/2023 17:28, Peter Maydell wrote:
>> >> On Fri, 6 Jan 2023 at 15:44, Alex Bennée  wrote:
>> >>> Peter Maydell  writes:
>> >> I think the theory when the semihosting API was originally designed
>> >> decades ago was basically "when the guest does fopen(...) this
>> >> should act like it does on the host". So as a bit of portable
>> >> guest code you would say whether you wanted a binary or a text
>> >> file, and the effect would be that if you were running on Windows
>> >> and you output a text file then you'd get \r\n like the user
>> >> probably expected, and if on Linux you get \n.
>>  > If SYS_OPEN is supposed to call fopen (i didn't actually know
>> that..)
>> > then it does make more sense for binary/text mode to be propagated from
>> > guest.
>> It's not required to literally call fopen(). It just has to
>> give the specified semantics for when the guest passes it a
>> mode integer, which is defined in terms of the ISO C
>> fopen() string semantics for "r", "rb", "r+", "r+b", etc.
>>  > Qemu's implementation calls open(2) though, which is not correct
>> > at all then. Well, as long as qemu does that, there is no
>> > posix-compliant way to tell open(2) if it should use binary or text
>> > mode, there is no notion of that as far as posix (and most
>> > implementations) is concerned.
>> QEMU doesn't have to be pure POSIX compliant: we know what our
>> supported host platforms are and we can freely use extensions
>> they provide. If we want to achieve the semantics that semihosting
>> asks for then we can do that with open(), by passing O_BINARY when
>> the mode integer from the guest corresponds to a string with "b" in it.
>> I'm about 50:50 on whether we should do that vs documenting and
>> commenting that we deliberately produce the same behaviour on all
>> platforms by ignoring the 'b' flag, though.
>> thanks
>> -- PMM
>> 
>
> Thanks Peter, i think i see your point. However, if you ask me, i feel
> like advertising a feature to guest code and only implementing it on 1
> platform that supports it just because it has a non-standard POSIX
> implementation will only confuse the issue further.
> Guest code doesn't want to care whether or not an emulator is running
> on Linux or Windows, there is no notion of that leaking to guest code.
> What it cares about is being able to consistently use a certain
> feature in their code.
> So i think it would be rather useless to implement it on Windows-only
> given there is a clear alternative to switch to fopen. Just my 2
> cents.

It's not switching to fopen() that is the issue - it's the interaction
with gdb (via gdbstub) which has no idea about the distinction. Anyway I
already have the patch queued with an additional note in the
documentation that all file accesses are in binary mode.

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro

Re: [PATCH v6 09/51] i386/xen: handle guest hypercalls

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:20, David Woodhouse wrote:

From: Joao Martins 

This means handling the new exit reason for Xen but still
crashing on purpose. As we implement each of the hypercalls
we will then return the right return code.

Signed-off-by: Joao Martins 
[dwmw2: Add CPL to hypercall tracing, disallow hypercalls from CPL > 0]
Signed-off-by: David Woodhouse 
---
  target/i386/kvm/kvm.c|  5 
  target/i386/kvm/trace-events |  3 +++
  target/i386/kvm/xen-emu.c| 44 
  target/i386/kvm/xen-emu.h|  1 +
  4 files changed, 53 insertions(+)

diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 4ab2c08af6..7cbfbed492 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -5477,6 +5477,11 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run 
*run)
  assert(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER);
  ret = kvm_handle_wrmsr(cpu, run);
  break;
+#ifdef CONFIG_XEN_EMU
+case KVM_EXIT_XEN:
+ret = kvm_xen_handle_exit(cpu, >xen);
+break;
+#endif
  default:
  fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason);
  ret = -1;
diff --git a/target/i386/kvm/trace-events b/target/i386/kvm/trace-events
index 7c369db1e1..cd6f842b1f 100644
--- a/target/i386/kvm/trace-events
+++ b/target/i386/kvm/trace-events
@@ -5,3 +5,6 @@ kvm_x86_fixup_msi_error(uint32_t gsi) "VT-d failed to remap 
interrupt for GSI %"
  kvm_x86_add_msi_route(int virq) "Adding route entry for virq %d"
  kvm_x86_remove_msi_route(int virq) "Removing route entry for virq %d"
  kvm_x86_update_msi_routes(int num) "Updated %d MSI routes"
+
+# xen-emu.c
+kvm_xen_hypercall(int cpu, uint8_t cpl, uint64_t input, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t ret) 
"xen_hypercall: cpu %d cpl %d input %" PRIu64 " a0 0x%" PRIx64 " a1 0x%" PRIx64 " a2 0x%" 
PRIx64" ret 0x%" PRIx64
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 4883b95d9d..476f464ee2 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -10,10 +10,12 @@
   */
  
  #include "qemu/osdep.h"

+#include "qemu/log.h"
  #include "sysemu/kvm_int.h"
  #include "sysemu/kvm_xen.h"
  #include "kvm/kvm_i386.h"
  #include "xen-emu.h"
+#include "trace.h"
  
  int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)

  {
@@ -84,3 +86,45 @@ uint32_t kvm_xen_get_caps(void)
  {
  return kvm_state->xen_caps;
  }
+
+static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
+{
+uint16_t code = exit->u.hcall.input;
+
+if (exit->u.hcall.cpl > 0) {
+exit->u.hcall.result = -EPERM;
+return true;
+}
+
+switch (code) {
+default:
+return false;
+}
+}
+
+int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
+{
+if (exit->type != KVM_EXIT_XEN_HCALL) {
+return -1;
+}
+
+if (!do_kvm_xen_handle_exit(cpu, exit)) {
+/*
+ * Some hypercalls will be deliberately "implemented" by returning
+ * -ENOSYS. This case is for hypercalls which are unexpected.
+ */
+exit->u.hcall.result = -ENOSYS;
+qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
+  PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
+  (uint64_t)exit->u.hcall.input,
+  (uint64_t)exit->u.hcall.params[0],
+  (uint64_t)exit->u.hcall.params[1],
+  (uint64_t)exit->u.hcall.params[2]);
+}
+
+trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
+exit->u.hcall.input, exit->u.hcall.params[0],
+exit->u.hcall.params[1], exit->u.hcall.params[2],
+exit->u.hcall.result);


It seems odd to have the trace message after the hypercall is handled. 
Any additional tracing in the handler if going to come out before we're 
told what hypercall it is.


  Paul


+return 0;
+}
diff --git a/target/i386/kvm/xen-emu.h b/target/i386/kvm/xen-emu.h
index d62f1d8ed8..21faf6bf38 100644
--- a/target/i386/kvm/xen-emu.h
+++ b/target/i386/kvm/xen-emu.h
@@ -25,5 +25,6 @@
  
  int kvm_xen_init(KVMState *s, uint32_t hypercall_msr);

  int kvm_xen_init_vcpu(CPUState *cs);
+int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit);
  
  #endif /* QEMU_I386_KVM_XEN_EMU_H */

Re: [PATCH v6 08/51] xen-platform: allow its creation with XEN_EMULATE mode

2023-01-16 Thread Paul Durrant


On 10/01/2023 12:19, David Woodhouse wrote:

From: Joao Martins 

The only thing we need to handle on KVM side is to change the
pfn from R/W to R/O.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
---
  hw/i386/xen/meson.build|  5 -
  hw/i386/xen/xen_platform.c | 39 +-
  2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/hw/i386/xen/meson.build b/hw/i386/xen/meson.build
index be84130300..79d75cc927 100644
--- a/hw/i386/xen/meson.build
+++ b/hw/i386/xen/meson.build
@@ -2,6 +2,9 @@ i386_ss.add(when: 'CONFIG_XEN', if_true: files(
'xen-hvm.c',
'xen-mapcache.c',
'xen_apic.c',
-  'xen_platform.c',
'xen_pvdevice.c',
  ))
+
+i386_ss.add(when: 'CONFIG_XENFV_MACHINE', if_true: files(
+  'xen_platform.c',
+))
diff --git a/hw/i386/xen/xen_platform.c b/hw/i386/xen/xen_platform.c
index 50174c2269..00f0527b30 100644
--- a/hw/i386/xen/xen_platform.c
+++ b/hw/i386/xen/xen_platform.c
@@ -28,9 +28,9 @@
  #include "hw/ide.h"
  #include "hw/ide/pci.h"
  #include "hw/pci/pci.h"
-#include "hw/xen/xen_common.h"
  #include "migration/vmstate.h"
-#include "hw/xen/xen-legacy-backend.h"
+#include "hw/xen/xen.h"
+#include "net/net.h"
  #include "trace.h"
  #include "sysemu/xen.h"
  #include "sysemu/block-backend.h"
@@ -38,6 +38,11 @@
  #include "qemu/module.h"
  #include "qom/object.h"
  
+#ifdef CONFIG_XEN

+#include "hw/xen/xen_common.h"
+#include "hw/xen/xen-legacy-backend.h"
+#endif
+
  //#define DEBUG_PLATFORM
  
  #ifdef DEBUG_PLATFORM

@@ -280,18 +285,26 @@ static void platform_fixed_ioport_writeb(void *opaque, 
uint32_t addr, uint32_t v
  PCIXenPlatformState *s = opaque;
  
  switch (addr) {

-case 0: /* Platform flags */ {
-hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
-HVMMEM_ram_ro : HVMMEM_ram_rw;
-if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
-DPRINTF("unable to change ro/rw state of ROM memory area!\n");
-} else {
+case 0: /* Platform flags */
+if (xen_mode == XEN_EMULATE) {
+/* XX: Use i440gx/q35 PAM setup to do this? */
  s->flags = val & PFFLAG_ROM_LOCK;


Given that this is not RFC, do you have a definite plan? TBH I think 
only ancient (Bochs) ROMBIOS messes with this; I can't find any trace in 
SeaBIOS anyway. So maybe we just don't care.


  Paul


-DPRINTF("changed ro/rw state of ROM memory area. now is %s 
state.\n",
-(mem_type == HVMMEM_ram_ro ? "ro":"rw"));
+#ifdef CONFIG_XEN
+} else {
+hvmmem_type_t mem_type = (val & PFFLAG_ROM_LOCK) ?
+HVMMEM_ram_ro : HVMMEM_ram_rw;
+
+if (xen_set_mem_type(xen_domid, mem_type, 0xc0, 0x40)) {
+DPRINTF("unable to change ro/rw state of ROM memory area!\n");
+} else {
+s->flags = val & PFFLAG_ROM_LOCK;
+DPRINTF("changed ro/rw state of ROM memory area. now is %s 
state.\n",
+(mem_type == HVMMEM_ram_ro ? "ro" : "rw"));
+}
+#endif
  }
  break;
-}
+
  case 2:
  log_writeb(s, val);
  break;
@@ -509,8 +522,8 @@ static void xen_platform_realize(PCIDevice *dev, Error 
**errp)
  uint8_t *pci_conf;
  
  /* Device will crash on reset if xen is not initialized */

-if (!xen_enabled()) {
-error_setg(errp, "xen-platform device requires the Xen accelerator");
+if (xen_mode == XEN_DISABLED) {
+error_setg(errp, "xen-platform device requires a Xen guest");
  return;
  }

Re: [RFC v2 08/13] vdpa: Negotiate _F_SUSPEND feature

2023-01-16 Thread Eugenio Perez Martin

On Mon, Jan 16, 2023 at 7:48 AM Jason Wang  wrote:
>
>
> 在 2023/1/13 16:45, Eugenio Perez Martin 写道:
> > On Fri, Jan 13, 2023 at 5:39 AM Jason Wang  wrote:
> >> On Fri, Jan 13, 2023 at 1:25 AM Eugenio Pérez  wrote:
> >>> This is needed for qemu to know it can suspend the device to retrieve
> >>> its status and enable SVQ with it, so all the process is transparent to
> >>> the guest.
> >>>
> >>> Signed-off-by: Eugenio Pérez 
> >> Acked-by: Jason Wang 
> >>
> >> We probably need to add the resume in the future to have a quick
> >> recovery from migration failures.
> >>
> > The capability of a resume can be useful here but only in a small
> > window. During the most time of the migration SVQ is enabled, so in
> > the event of a migration failure we may need to reset the whole device
> > to enable passthrough again.
>
>
> Yes.
>
>
> >
> > But maybe is it worth giving a quick review and adding some TODOs
> > where it can be useful in this series?
>
>
> We can start by having a TODO in this series, and leave resume in for
> the future.
>

Got it, I'll add in the next series.

Thanks!

> Thanks
>
>
> >
> > Thanks!
> >
> >> Thanks
> >>
> >>> ---
> >>>   hw/virtio/vhost-vdpa.c | 3 ++-
> >>>   1 file changed, 2 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> >>> index 4296427a69..a61a6b2a74 100644
> >>> --- a/hw/virtio/vhost-vdpa.c
> >>> +++ b/hw/virtio/vhost-vdpa.c
> >>> @@ -659,7 +659,8 @@ static int vhost_vdpa_set_backend_cap(struct 
> >>> vhost_dev *dev)
> >>>   uint64_t features;
> >>>   uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
> >>>   0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
> >>> -0x1ULL << VHOST_BACKEND_F_IOTLB_ASID;
> >>> +0x1ULL << VHOST_BACKEND_F_IOTLB_ASID |
> >>> +0x1ULL << VHOST_BACKEND_F_SUSPEND;
> >>>   int r;
> >>>
> >>>   if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, )) {
> >>> --
> >>> 2.31.1
> >>>
>

Re: [RFC v2 06/13] vhost: delay set_vring_ready after DRIVER_OK

2023-01-16 Thread Eugenio Perez Martin

On Mon, Jan 16, 2023 at 7:37 AM Jason Wang  wrote:
>
>
> 在 2023/1/13 16:19, Eugenio Perez Martin 写道:
> > On Fri, Jan 13, 2023 at 5:36 AM Jason Wang  wrote:
> >> On Fri, Jan 13, 2023 at 1:25 AM Eugenio Pérez  wrote:
> >>> To restore the device at the destination of a live migration we send the
> >>> commands through control virtqueue. For a device to read CVQ it must
> >>> have received the DRIVER_OK status bit.
> >> This probably requires the support from the parent driver and requires
> >> some changes or fixes in the parent driver.
> >>
> >> Some drivers did:
> >>
> >> parent_set_status():
> >> if (DRIVER_OK)
> >>  if (queue_enable)
> >>  write queue_enable to the device
> >>
> >> Examples are IFCVF or even vp_vdpa at least. MLX5 seems to be fine.
> >>
> > I don't get your point here. No device should start reading CVQ (or
> > any other VQ) without having received DRIVER_OK.
>
>
> If I understand the code correctly:
>
> For CVQ, we do SET_VRING_ENABLE before DRIVER_OK, that's fine.
>
> For datapath_vq, we do SET_VRING_ENABLE after DRIVER_OK, this requires
> parent driver support (explained above)
>
>
> >
> > Some parent drivers do not support sending the queue enable command
> > after DRIVER_OK, usually because they clean part of the state like the
> > set by set_vring_base. Even vdpa_net_sim needs fixes here.
>
>
> Yes, so the question is:
>
> Do we need another backend feature for this? (otherwise thing may break
> silently)
>
>
> >
> > But my understanding is that it should be supported so I consider it a
> > bug.
>
>
> Probably, we need fine some proof in the spec, e.g in 3.1.1:
>
> """
>
> 7.Perform device-specific setup, including discovery of virtqueues for
> the device, optional per-bus setup, reading and possibly writing the
> device’s virtio configuration space, and population of virtqueues.
> 8.Set the DRIVER_OK status bit. At this point the device is “live”.
>
> """
>
> So if my understanding is correct, "discovery of virtqueues for the
> device" implies queue_enable here which is expected to be done before
> DRIVER_OK. But it doesn't say anything regrading to the behaviour of
> setting queue ready after DRIVER_OK.
>
> I'm not sure it's a real bug or not, may Michael and comment on this.
>

Right, input on this topic would be really appreciated.

>
> >   Especially after queue_reset patches. Is that what you mean?
>
>
> We haven't supported queue_reset yet in Qemu. But it allows to write 1
> to queue_enable after DRIVER_OK for sure.
>

I was not clear, I meant in the emulated device. I'm testing this
series with the proposal of _F_STATE.

>
> >
> >>> However this opens a window where the device could start receiving
> >>> packets in rx queue 0 before it receives the RSS configuration. To avoid
> >>> that, we will not send vring_enable until all configuration is used by
> >>> the device.
> >>>
> >>> As a first step, run vhost_set_vring_ready for all vhost_net backend after
> >>> all of them are started (with DRIVER_OK). This code should not affect
> >>> vdpa.
> >>>
> >>> Signed-off-by: Eugenio Pérez 
> >>> ---
> >>>   hw/net/vhost_net.c | 17 -
> >>>   1 file changed, 12 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
> >>> index c4eecc6f36..3900599465 100644
> >>> --- a/hw/net/vhost_net.c
> >>> +++ b/hw/net/vhost_net.c
> >>> @@ -399,6 +399,18 @@ int vhost_net_start(VirtIODevice *dev, 
> >>> NetClientState *ncs,
> >>>   } else {
> >>>   peer = qemu_get_peer(ncs, n->max_queue_pairs);
> >>>   }
> >>> +r = vhost_net_start_one(get_vhost_net(peer), dev);
> >>> +if (r < 0) {
> >>> +goto err_start;
> >>> +}
> >>> +}
> >>> +
> >>> +for (int j = 0; j < nvhosts; j++) {
> >>> +if (j < data_queue_pairs) {
> >>> +peer = qemu_get_peer(ncs, j);
> >>> +} else {
> >>> +peer = qemu_get_peer(ncs, n->max_queue_pairs);
> >>> +}
> >> I fail to understand why we need to change the vhost_net layer? This
> >> is vhost-vDPA specific, so I wonder if we can limit the changes to e.g
> >> vhost_vdpa_dev_start()?
> >>
> > The vhost-net layer explicitly calls vhost_set_vring_enable before
> > vhost_dev_start, and this is exactly the behavior we want to avoid.
> > Even if we make changes to vhost_dev, this change is still needed.
>
>
> Note that the only user of vhost_set_vring_enable() is vhost-user where
> the semantic is different:
>
> It uses that to changes the number of active queues:
>
> static int peer_attach(VirtIONet *n, int index)
>
>  if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
> =>  vhost_set_vring_enable(nc->peer, 1);
>  }
>
> This is not the semantic of vhost-vDPA that tries to be complaint with
> virtio-spec. So I'm not sure how it can help here.
>

Right, but previous changes use enable callback to delay the enable of
the datapath virtqueues. I'll try to fit the changes in

Re: [QUESTION] About virtio and eventloop

2023-01-16 Thread Stefan Hajnoczi

On Mon, 16 Jan 2023 at 03:20, zhukeqian via  wrote:
> And if IO operation is blocked, is vCPU thread will blocked when do 
> deactivate?

Yes, blk_drain() is a synchronous function. It blocks until in-flight
I/O has finished. The vcpu thread will be blocked in
virtio_pci_common_write().

Stefan

Re: [PATCH 07/15] block-backend: enter aio coroutine only after drain

2023-01-16 Thread Kevin Wolf

Am 12.12.2022 um 13:59 hat Paolo Bonzini geschrieben:
> When called from within (another) coroutine, aio_co_enter will not
> enter a coroutine immediately; instead the new coroutine is scheduled
> to run after qemu_coroutine_yield().  This however might cause the
> currently-running coroutine to yield without having raised blk->in_flight.

I assume you're talking about the blk_aio_prwv() path here. However,
calling blk_inc_in_flight() is the very first thing it does (before even
calling bdrv_coroutine_enter -> aio_co_enter), so I don't understand how
it could happen that it yields before increasing the counter.

> If it was a ->drained_begin() callback who scheduled the coroutine,

Which one? The one that executes blk_aio_prwv()?

> bdrv_drained_begin() might exit without waiting for the I/O operation
> to finish.  Right now, this is masked by unnecessary polling done by
> bdrv_drained_begin() after the callbacks return, but it is wrong and
> a latent bug.
> 
> So, ensure that blk_inc_in_flight() and blk_wait_while_drained()
> are called before aio_co_enter().  To do so, pull the call to
> blk_wait_while_drained() out of the blk_co_do_* functions, which are
> called from the AIO coroutines, and place them separately in the public
> blk_co_* functions and in blk_aio_prwv.

You can't call blk_wait_while_drained() in blk_aio_prwv() because the
latter isn't a coroutine_fn.

> Signed-off-by: Paolo Bonzini 
> ---
>  block/block-backend.c | 16 +++-
>  1 file changed, 7 insertions(+), 9 deletions(-)

Kevin

Re: [PATCH v2] semihosting: add O_BINARY flag in host_open for NT compatibility

2023-01-16 Thread eiakovlev

On 1/6/23 7:58 PM, Peter Maydell  wrote:

On Fri, 6 Jan 2023 at 18:22, Evgeny Iakovlev
 wrote:
>
>
> On 1/6/2023 17:28, Peter Maydell wrote:
>> On Fri, 6 Jan 2023 at 15:44, Alex Bennée  wrote:
>>> Peter Maydell  writes:
>> I think the theory when the semihosting API was originally designed
>> decades ago was basically "when the guest does fopen(...) this
>> should act like it does on the host". So as a bit of portable
>> guest code you would say whether you wanted a binary or a text
>> file, and the effect would be that if you were running on Windows
>> and you output a text file then you'd get \r\n like the user
>> probably expected, and if on Linux you get \n.

> If SYS_OPEN is supposed to call fopen (i didn't actually know that..)
> then it does make more sense for binary/text mode to be propagated from
> guest.

It's not required to literally call fopen(). It just has to
give the specified semantics for when the guest passes it a
mode integer, which is defined in terms of the ISO C
fopen() string semantics for "r", "rb", "r+", "r+b", etc.

> Qemu's implementation calls open(2) though, which is not correct
> at all then. Well, as long as qemu does that, there is no
> posix-compliant way to tell open(2) if it should use binary or text
> mode, there is no notion of that as far as posix (and most
> implementations) is concerned.

QEMU doesn't have to be pure POSIX compliant: we know what our
supported host platforms are and we can freely use extensions
they provide. If we want to achieve the semantics that semihosting
asks for then we can do that with open(), by passing O_BINARY when
the mode integer from the guest corresponds to a string with "b" in it.

I'm about 50:50 on whether we should do that vs documenting and
commenting that we deliberately produce the same behaviour on all
platforms by ignoring the 'b' flag, though.

thanks
-- PMM

Thanks Peter, i think i see your point. However, if you ask me, i feel like 
advertising a feature to guest code and only implementing it on 1 platform that 
supports it just because it has a non-standard POSIX implementation will only 
confuse the issue further.
Guest code doesn't want to care whether or not an emulator is running on Linux 
or Windows, there is no notion of that leaking to guest code. What it cares 
about is being able to consistently use a certain feature in their code.
So i think it would be rather useless to implement it on Windows-only given 
there is a clear alternative to switch to fopen. Just my 2 cents.

Re: [PATCH 00/12] More cleanups and fixes for drain

2023-01-16 Thread Kevin Wolf

Am 12.12.2022 um 13:59 hat Paolo Bonzini geschrieben:
> There are a few more lines of code that can be removed around draining
> code, but especially the logic can be simplified by removing unnecessary
> parameters.
> 
> Due to the failure of the block-next branch, the first three patches
> drop patches 14+15 of Kevin's drain cleanup series, and then redo
> patch 15 in a slightly less satisfactory way that still enables the
> remaining cleanups.  These reverts are not supposed to be applied;
> either the offending patches are dropped from the branch, or if the
> issue is fixed then my first three patches can go away.

Can you remind me which problem this was? The patches are in master now,
but I'm not sure if the latest version fixed whatever you had in mind.

> The next three are taken from Emanuele's old subtree drain attempt
> at removing the AioContext.  The main one is the second, which is needed
> to avoid testcase failures, but I included all of them for simplicity.
> 
> Patch 7 fixes another latent bug exposed by the later cleanups, and while
> looking for a fix I noticed a general lack of thread-safety in BlockBackend's
> drain code.  There are some global properties that only need to be documented
> and enforced to be set only at creation time (patches 8/9), but also
> queued_requests is not protected by any mutex, which is fixed in patch 10.
> 
> Finally, patches 11-15 are the actual simplification.
> 
> Applies on top of block-next.

Not any more. :-)

I found out that it applies on top of 6355f90eef, which may work for
some basic review, but the conflicts when rebasing seem non-trivial, so
we'll need a v2.

Kevin

Re: [PATCH 2/7] hw/acpi/acpi_dev_interface: Remove unused parameter from AcpiDeviceIfClass::madt_cpu

2023-01-16 Thread Igor Mammedov

On Sat, 14 Jan 2023 23:27:33 +0100
Bernhard Beschow  wrote:

> The only function ever assigned to AcpiDeviceIfClass::madt_cpu is
> pc_madt_cpu_entry() which doesn't use the AcpiDeviceIf parameter.

intent for AcpiDeviceIfClass::madt_cpu is to make cpu hotplug AML
reusable (so it's not x86, specific and applicable to other target
that use ACPI with it's own madt_entry definition (think about
arm/virt machine)).

Indeed  (AcpiDeviceIf *adev) is unused and it is there only for
AcpiDeviceIfClass callbacks consistent signature. Other than
I don't see any possible use for adev within madt_cpu()

so
Reviewed-by: Igor Mammedov 

> 
> Signed-off-by: Bernhard Beschow 
> ---
>  include/hw/acpi/acpi_dev_interface.h | 3 +--
>  include/hw/i386/pc.h | 6 ++
>  hw/acpi/acpi-x86-stub.c  | 5 ++---
>  hw/acpi/cpu.c| 3 +--
>  hw/i386/acpi-common.c| 7 +++
>  5 files changed, 9 insertions(+), 15 deletions(-)
> 
> diff --git a/include/hw/acpi/acpi_dev_interface.h 
> b/include/hw/acpi/acpi_dev_interface.h
> index ea6056ab92..a1648220ff 100644
> --- a/include/hw/acpi/acpi_dev_interface.h
> +++ b/include/hw/acpi/acpi_dev_interface.h
> @@ -52,8 +52,7 @@ struct AcpiDeviceIfClass {
>  /*  */
>  void (*ospm_status)(AcpiDeviceIf *adev, ACPIOSTInfoList ***list);
>  void (*send_event)(AcpiDeviceIf *adev, AcpiEventStatusBits ev);
> -void (*madt_cpu)(AcpiDeviceIf *adev, int uid,
> - const CPUArchIdList *apic_ids, GArray *entry,
> +void (*madt_cpu)(int uid, const CPUArchIdList *apic_ids, GArray *entry,
>   bool force_enabled);
>  };
>  #endif
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index 991f905f5d..a0647165d1 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -9,7 +9,6 @@
>  #include "hw/block/flash.h"
>  #include "hw/i386/x86.h"
>  
> -#include "hw/acpi/acpi_dev_interface.h"
>  #include "hw/hotplug.h"
>  #include "qom/object.h"
>  #include "hw/i386/sgx-epc.h"
> @@ -193,9 +192,8 @@ bool pc_system_ovmf_table_find(const char *entry, uint8_t 
> **data,
>  void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size);
>  
>  /* hw/i386/acpi-common.c */
> -void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
> -   const CPUArchIdList *apic_ids, GArray *entry,
> -   bool force_enabled);
> +void pc_madt_cpu_entry(int uid, const CPUArchIdList *apic_ids,
> +   GArray *entry, bool force_enabled);
>  
>  /* sgx.c */
>  void pc_machine_init_sgx_epc(PCMachineState *pcms);
> diff --git a/hw/acpi/acpi-x86-stub.c b/hw/acpi/acpi-x86-stub.c
> index 3df1e090f4..d0d399d26b 100644
> --- a/hw/acpi/acpi-x86-stub.c
> +++ b/hw/acpi/acpi-x86-stub.c
> @@ -2,9 +2,8 @@
>  #include "hw/i386/pc.h"
>  #include "hw/i386/acpi-build.h"
>  
> -void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
> -   const CPUArchIdList *apic_ids, GArray *entry,
> -   bool force_enabled)
> +void pc_madt_cpu_entry(int uid, const CPUArchIdList *apic_ids,
> +   GArray *entry, bool force_enabled)
>  {
>  }
>  
> diff --git a/hw/acpi/cpu.c b/hw/acpi/cpu.c
> index 9148b3a49e..c59a0acbf1 100644
> --- a/hw/acpi/cpu.c
> +++ b/hw/acpi/cpu.c
> @@ -357,7 +357,6 @@ void build_cpus_aml(Aml *table, MachineState *machine, 
> CPUHotplugFeatures opts,
>  char *cphp_res_path = g_strdup_printf("%s." CPUHP_RES_DEVICE, res_root);
>  Object *obj = object_resolve_path_type("", TYPE_ACPI_DEVICE_IF, NULL);
>  AcpiDeviceIfClass *adevc = ACPI_DEVICE_IF_GET_CLASS(obj);
> -AcpiDeviceIf *adev = ACPI_DEVICE_IF(obj);
>  
>  cpu_ctrl_dev = aml_device("%s", cphp_res_path);
>  {
> @@ -668,7 +667,7 @@ void build_cpus_aml(Aml *table, MachineState *machine, 
> CPUHotplugFeatures opts,
>  
>  /* build _MAT object */
>  assert(adevc && adevc->madt_cpu);
> -adevc->madt_cpu(adev, i, arch_ids, madt_buf,
> +adevc->madt_cpu(i, arch_ids, madt_buf,
>  true); /* set enabled flag */
>  aml_append(dev, aml_name_decl("_MAT",
>  aml_buffer(madt_buf->len, (uint8_t *)madt_buf->data)));
> diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
> index 4aaafbdd7b..52e5c1439a 100644
> --- a/hw/i386/acpi-common.c
> +++ b/hw/i386/acpi-common.c
> @@ -33,9 +33,8 @@
>  #include "acpi-build.h"
>  #include "acpi-common.h"
>  
> -void pc_madt_cpu_entry(AcpiDeviceIf *adev, int uid,
> -   const CPUArchIdList *apic_ids, GArray *entry,
> -   bool force_enabled)
> +void pc_madt_cpu_entry(int uid, const CPUArchIdList *apic_ids,
> +   GArray *entry, bool force_enabled)
>  {
>  uint32_t apic_id = apic_ids->cpus[uid].arch_id;
>  /* Flags – Local APIC Flags */
> @@ -112,7 +111,7 @@ void acpi_build_madt(GArray *table_data, BIOSLinker 
> *linker,
>

[PATCH] usb-ccid: make ids and descriptor configurable

2023-01-16 Thread Ripke, Klaus

Signed-off-by: Klaus Ripke 

hw/usb/dev-smartcard-reader.c:
Set some static values from ccid_properties.

---
 hw/usb/dev-smartcard-reader.c | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/hw/usb/dev-smartcard-reader.c b/hw/usb/dev-smartcard-
reader.c
index 28164d89be..4002157773 100644
--- a/hw/usb/dev-smartcard-reader.c
+++ b/hw/usb/dev-smartcard-reader.c
@@ -311,6 +311,11 @@ struct USBCCIDState {
 uint8_t  powered;
 uint8_t  notify_slot_change;
 uint8_t  debug;
+    /* the following are copied to static on initial realize */
+    uint16_t vendor;
+    uint16_t product;
+    uint8_t  maxslot;
+    uint8_t  feat2;
 };
 
 /*
@@ -323,7 +328,11 @@ struct USBCCIDState {
  *   0dc3:1004 Athena Smartcard Solutions, Inc.
  */
 
-static const uint8_t qemu_ccid_descriptor[] = {
+enum {
+    DESC_MAXSLOT = 4,
+    DESC_FEAT2 = 42 /* dwFeatures byte 2 */
+};
+static uint8_t qemu_ccid_descriptor[] = {
 /* Smart Card Device Class Descriptor */
 0x36,   /* u8  bLength; */
 0x21,   /* u8  bDescriptorType; Functional */
@@ -472,7 +481,7 @@ static const USBDescDevice desc_device = {
 },
 };
 
-static const USBDesc desc_ccid = {
+static USBDesc desc_ccid = {
 .id = {
 .idVendor  = CCID_VENDOR_ID,
 .idProduct = CCID_PRODUCT_ID,
@@ -1295,9 +1304,10 @@ static void ccid_card_realize(DeviceState *qdev,
Error **errp)
 USBCCIDState *s = USB_CCID_DEV(dev);
 Error *local_err = NULL;
 
-    if (card->slot != 0) {
-    error_setg(errp, "usb-ccid supports one slot, can't add %d",
-   card->slot);
+    DPRINTF(s, D_VERBOSE, "%s: slot %d\n", __func__, card->slot);
+    if (card->slot > qemu_ccid_descriptor[DESC_MAXSLOT]) {
+    error_setg(errp, "usb-ccid supports %d slot, can't add %d",
+   qemu_ccid_descriptor[DESC_MAXSLOT] + 1, card-
>slot);
 return;
 }
 if (s->card != NULL) {
@@ -1317,6 +1327,14 @@ static void ccid_card_realize(DeviceState *qdev,
Error **errp)
 static void ccid_realize(USBDevice *dev, Error **errp)
 {
 USBCCIDState *s = USB_CCID_DEV(dev);
+    static int initialized;
+    if (!initialized) {
+    desc_ccid.id.idVendor = s->vendor;
+    desc_ccid.id.idProduct = s->product;
+    qemu_ccid_descriptor[DESC_MAXSLOT] = s->maxslot;
+    qemu_ccid_descriptor[DESC_FEAT2] = s->feat2;
+    initialized = !0;
+    }
 
 usb_desc_create_serial(dev);
 usb_desc_init(dev);
@@ -1339,6 +1357,8 @@ static void ccid_realize(USBDevice *dev, Error
**errp)
 ccid_reset_parameters(s);
 ccid_reset(s);
 s->debug = parse_debug_env("QEMU_CCID_DEBUG", D_VERBOSE, s-
>debug);
+    DPRINTF(s, D_VERBOSE, "ccid_realize %d %x %x %x %x\n",
+    initialized, s->vendor, s->product, s->maxslot, s->feat2);
 }
 
 static int ccid_post_load(void *opaque, int version_id)
@@ -1434,9 +1454,14 @@ static const VMStateDescription ccid_vmstate = {
 
 static Property ccid_properties[] = {
 DEFINE_PROP_UINT8("debug", USBCCIDState, debug, 0),
+    DEFINE_PROP_UINT16("vendor", USBCCIDState, vendor,
CCID_VENDOR_ID),
+    DEFINE_PROP_UINT16("product", USBCCIDState, product,
CCID_PRODUCT_ID),
+    DEFINE_PROP_UINT8("maxslot", USBCCIDState, maxslot, 0),
+    DEFINE_PROP_UINT8("feat2", USBCCIDState, feat2, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
+
 static void ccid_class_initfn(ObjectClass *klass, void *data)
 {
 DeviceClass *dc = DEVICE_CLASS(klass);
-- 
2.34.1

-- 
Klaus Ripke
Senior Developer
Public Authorities Division
secunet Security Networks AG

Telefon:  +49 201 5454-2982

Re: [PATCH v14 03/11] target/s390x/cpu topology: handle STSI(15) and build the SYSIB

2023-01-16 Thread Pierre Morel





On 1/16/23 14:11, Nina Schoetterl-Glausch wrote:

On Thu, 2023-01-05 at 15:53 +0100, Pierre Morel wrote:

On interception of STSI(15.1.x) the System Information Block
(SYSIB) is built from the list of pre-ordered topology entries.

Signed-off-by: Pierre Morel 
---
  include/hw/s390x/cpu-topology.h |   3 +
  include/hw/s390x/sclp.h |   1 +
  target/s390x/cpu.h  |  78 ++
  target/s390x/kvm/cpu_topology.c | 136 
  target/s390x/kvm/kvm.c  |   5 +-
  target/s390x/kvm/meson.build|   3 +-
  6 files changed, 224 insertions(+), 2 deletions(-)
  create mode 100644 target/s390x/kvm/cpu_topology.c

diff --git a/include/hw/s390x/cpu-topology.h b/include/hw/s390x/cpu-topology.h
index b3fd752d8d..9571aa70e5 100644
--- a/include/hw/s390x/cpu-topology.h
+++ b/include/hw/s390x/cpu-topology.h
@@ -41,6 +41,9 @@ typedef union s390_topology_id {
  };
  } s390_topology_id;
  #define TOPO_CPU_MASK   0x003fUL
+#define TOPO_SOCKET_MASK0xff00UL
+#define TOPO_BOOK_MASK  0xUL
+#define TOPO_DRAWER_MASK0xff00UL
  
  typedef struct S390TopologyEntry {

  s390_topology_id id;
diff --git a/include/hw/s390x/sclp.h b/include/hw/s390x/sclp.h
index d3ade40a5a..712fd68123 100644
--- a/include/hw/s390x/sclp.h
+++ b/include/hw/s390x/sclp.h
@@ -112,6 +112,7 @@ typedef struct CPUEntry {
  } QEMU_PACKED CPUEntry;
  
  #define SCLP_READ_SCP_INFO_FIXED_CPU_OFFSET 128

+#define SCLP_READ_SCP_INFO_MNEST2
  typedef struct ReadInfo {
  SCCBHeader h;
  uint16_t rnmax;
diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index 39ea63a416..78988048dd 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -561,6 +561,25 @@ typedef struct SysIB_322 {
  } SysIB_322;
  QEMU_BUILD_BUG_ON(sizeof(SysIB_322) != 4096);
  
+#define S390_TOPOLOGY_MAG  6

+#define S390_TOPOLOGY_MAG6 0
+#define S390_TOPOLOGY_MAG5 1
+#define S390_TOPOLOGY_MAG4 2
+#define S390_TOPOLOGY_MAG3 3
+#define S390_TOPOLOGY_MAG2 4
+#define S390_TOPOLOGY_MAG1 5
+/* Configuration topology */
+typedef struct SysIB_151x {
+uint8_t  reserved0[2];
+uint16_t length;
+uint8_t  mag[S390_TOPOLOGY_MAG];
+uint8_t  reserved1;
+uint8_t  mnest;
+uint32_t reserved2;
+char tle[];
+} QEMU_PACKED QEMU_ALIGNED(8) SysIB_151x;
+QEMU_BUILD_BUG_ON(sizeof(SysIB_151x) != 16);
+
  typedef union SysIB {
  SysIB_111 sysib_111;
  SysIB_121 sysib_121;
@@ -568,9 +587,68 @@ typedef union SysIB {
  SysIB_221 sysib_221;
  SysIB_222 sysib_222;
  SysIB_322 sysib_322;
+SysIB_151x sysib_151x;
  } SysIB;
  QEMU_BUILD_BUG_ON(sizeof(SysIB) != 4096);
  
+/*

+ * CPU Topology List provided by STSI with fc=15 provides a list
+ * of two different Topology List Entries (TLE) types to specify
+ * the topology hierarchy.
+ *
+ * - Container Topology List Entry
+ *   Defines a container to contain other Topology List Entries
+ *   of any type, nested containers or CPU.
+ * - CPU Topology List Entry
+ *   Specifies the CPUs position, type, entitlement and polarization
+ *   of the CPUs contained in the last Container TLE.
+ *
+ * There can be theoretically up to five levels of containers, QEMU
+ * uses only one level, the socket level.
+ *
+ * A container of with a nesting level (NL) greater than 1 can only
+ * contain another container of nesting level NL-1.
+ *
+ * A container of nesting level 1 (socket), contains as many CPU TLE
+ * as needed to describe the position and qualities of all CPUs inside
+ * the container.
+ * The qualities of a CPU are polarization, entitlement and type.
+ *
+ * The CPU TLE defines the position of the CPUs of identical qualities
+ * using a 64bits mask which first bit has its offset defined by
+ * the CPU address orgin field of the CPU TLE like in:
+ * CPU address = origin * 64 + bit position within the mask
+ *
+ */
+/* Container type Topology List Entry */
+/* Container type Topology List Entry */
+typedef struct SysIBTl_container {
+uint8_t nl;
+uint8_t reserved[6];
+uint8_t id;
+} QEMU_PACKED QEMU_ALIGNED(8) SysIBTl_container;
+QEMU_BUILD_BUG_ON(sizeof(SysIBTl_container) != 8);
+
+/* CPU type Topology List Entry */
+typedef struct SysIBTl_cpu {
+uint8_t nl;
+uint8_t reserved0[3];
+uint8_t reserved1:5;
+uint8_t dedicated:1;
+uint8_t polarity:2;
+uint8_t type;
+uint16_t origin;
+uint64_t mask;
+} QEMU_PACKED QEMU_ALIGNED(8) SysIBTl_cpu;
+QEMU_BUILD_BUG_ON(sizeof(SysIBTl_cpu) != 16);
+
+/* Max size of a SYSIB structure is when all CPU are alone in a container */
+#define S390_TOPOLOGY_SYSIB_SIZE (sizeof(SysIB_151x) + 
\
+  S390_MAX_CPUS * (sizeof(SysIBTl_container) + 
\
+   sizeof(SysIBTl_cpu)))


I don't think this is accurate anymore, if you have drawers and books.
In that case you

< 1 2 3 4 >

101 - 200 of 337 matches

Mail list logo