date:20220725

Re: [PATCH v6 1/3] hw/intc: Move mtimer/mtimecmp to aclint

2022-07-25 Thread Atish Patra

On Sat, Jul 23, 2022 at 2:43 AM Andrew Jones  wrote:
>
> On Thu, Jul 21, 2022 at 06:00:44PM -0700, Atish Patra wrote:
> > Historically, The mtime/mtimecmp has been part of the CPU because
> > they are per hart entities. However, they actually belong to aclint
> > which is a MMIO device.
> >
> > Move them to the ACLINT device. This also emulates the real hardware
> > more closely.
> >
> > Reviewed-by: Anup Patel 
> > Reviewed-by: Alistair Francis 
> > Signed-off-by: Atish Patra 
> > ---
> >  hw/intc/riscv_aclint.c | 41 --
> >  hw/timer/ibex_timer.c  | 18 ++-
> >  include/hw/intc/riscv_aclint.h |  2 ++
> >  include/hw/timer/ibex_timer.h  |  2 ++
> >  target/riscv/cpu.h |  2 --
> >  target/riscv/machine.c |  5 ++---
> >  6 files changed, 42 insertions(+), 28 deletions(-)
> >
> > diff --git a/hw/intc/riscv_aclint.c b/hw/intc/riscv_aclint.c
> > index e7942c4e5a32..47f355224612 100644
> > --- a/hw/intc/riscv_aclint.c
> > +++ b/hw/intc/riscv_aclint.c
> > @@ -32,6 +32,7 @@
> >  #include "hw/intc/riscv_aclint.h"
> >  #include "qemu/timer.h"
> >  #include "hw/irq.h"
> > +#include "migration/vmstate.h"
> >
> >  typedef struct riscv_aclint_mtimer_callback {
> >  RISCVAclintMTimerState *s;
> > @@ -65,8 +66,8 @@ static void 
> > riscv_aclint_mtimer_write_timecmp(RISCVAclintMTimerState *mtimer,
> >
> >  uint64_t rtc_r = cpu_riscv_read_rtc(mtimer);
> >
> > -cpu->env.timecmp = value;
> > -if (cpu->env.timecmp <= rtc_r) {
> > +mtimer->timecmp[hartid] = value;
> > +if (mtimer->timecmp[hartid] <= rtc_r) {
> >  /*
> >   * If we're setting an MTIMECMP value in the "past",
> >   * immediately raise the timer interrupt
> > @@ -77,7 +78,7 @@ static void 
> > riscv_aclint_mtimer_write_timecmp(RISCVAclintMTimerState *mtimer,
> >
> >  /* otherwise, set up the future timer interrupt */
> >  qemu_irq_lower(mtimer->timer_irqs[hartid - mtimer->hartid_base]);
> > -diff = cpu->env.timecmp - rtc_r;
> > +diff = mtimer->timecmp[hartid] - rtc_r;
> >  /* back to ns (note args switched in muldiv64) */
> >  uint64_t ns_diff = muldiv64(diff, NANOSECONDS_PER_SECOND, 
> > timebase_freq);
> >
> > @@ -102,7 +103,7 @@ static void 
> > riscv_aclint_mtimer_write_timecmp(RISCVAclintMTimerState *mtimer,
> >  next = MIN(next, INT64_MAX);
> >  }
> >
> > -timer_mod(cpu->env.timer, next);
> > +timer_mod(mtimer->timers[hartid], next);
> >  }
> >
> >  /*
> > @@ -133,11 +134,11 @@ static uint64_t riscv_aclint_mtimer_read(void 
> > *opaque, hwaddr addr,
> >"aclint-mtimer: invalid hartid: %zu", hartid);
> >  } else if ((addr & 0x7) == 0) {
> >  /* timecmp_lo for RV32/RV64 or timecmp for RV64 */
> > -uint64_t timecmp = env->timecmp;
> > +uint64_t timecmp = mtimer->timecmp[hartid];
> >  return (size == 4) ? (timecmp & 0x) : timecmp;
> >  } else if ((addr & 0x7) == 4) {
> >  /* timecmp_hi */
> > -uint64_t timecmp = env->timecmp;
> > +uint64_t timecmp = mtimer->timecmp[hartid];
> >  return (timecmp >> 32) & 0x;
> >  } else {
> >  qemu_log_mask(LOG_UNIMP,
> > @@ -177,7 +178,7 @@ static void riscv_aclint_mtimer_write(void *opaque, 
> > hwaddr addr,
> >  } else if ((addr & 0x7) == 0) {
> >  if (size == 4) {
> >  /* timecmp_lo for RV32/RV64 */
> > -uint64_t timecmp_hi = env->timecmp >> 32;
> > +uint64_t timecmp_hi = mtimer->timecmp[hartid] >> 32;
> >  riscv_aclint_mtimer_write_timecmp(mtimer, RISCV_CPU(cpu), 
> > hartid,
> >  timecmp_hi << 32 | (value & 0x));
> >  } else {
> > @@ -188,7 +189,7 @@ static void riscv_aclint_mtimer_write(void *opaque, 
> > hwaddr addr,
> >  } else if ((addr & 0x7) == 4) {
> >  if (size == 4) {
> >  /* timecmp_hi for RV32/RV64 */
> > -uint64_t timecmp_lo = env->timecmp;
> > +uint64_t timecmp_lo = mtimer->timecmp[hartid];
> >  riscv_aclint_mtimer_write_timecmp(mtimer, RISCV_CPU(cpu), 
> > hartid,
> >  value << 32 | (timecmp_lo & 0x));
> >  } else {
> > @@ -234,7 +235,7 @@ static void riscv_aclint_mtimer_write(void *opaque, 
> > hwaddr addr,
> >  }
> >  riscv_aclint_mtimer_write_timecmp(mtimer, RISCV_CPU(cpu),
> >mtimer->hartid_base + i,
> > -  env->timecmp);
> > +  mtimer->timecmp[i]);
> >  }
> >  return;
> >  }
> > @@ -284,6 +285,8 @@ static void riscv_aclint_mtimer_realize(DeviceState 
> > *dev, Error **errp)
> >  s->timer_irqs = g_new(qemu_irq, s->num_harts);
> >

Re: [PATCH 09/16] vhost-user: enable/disable a single vring

2022-07-25 Thread Kangjie Xu




在 2022/7/26 12:07, Jason Wang 写道:


在 2022/7/18 19:17, Kangjie Xu 写道:

Implement the vhost_set_single_vring_enable, which is to enable or
disable a single vring.

The parameter wait_for_reply is added to help for some cases such as
vq reset.

Meanwhile, vhost_user_set_vring_enable() is refactored.

Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 
---
  hw/virtio/vhost-user.c | 55 --
  1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 75b8df21a4..5a80a415f0 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -267,6 +267,8 @@ struct scrub_regions {
  int fd_idx;
  };
  +static int enforce_reply(struct vhost_dev *dev, const VhostUserMsg 
*msg);

+
  static bool ioeventfd_enabled(void)
  {
  return !kvm_enabled() || kvm_eventfds_enabled();
@@ -1198,6 +1200,49 @@ static int vhost_user_set_vring_base(struct 
vhost_dev *dev,

  return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
  }
  +
+static int vhost_user_set_single_vring_enable(struct vhost_dev *dev,
+  int index,
+  int enable,
+  bool wait_for_reply)
+{
+    int ret;
+
+    if (index < dev->vq_index || index >= dev->vq_index + dev->nvqs) {
+    return -EINVAL;
+    }
+
+    struct vhost_vring_state state = {
+    .index = index,
+    .num   = enable,
+    };
+
+    VhostUserMsg msg = {
+    .hdr.request = VHOST_USER_SET_VRING_ENABLE,
+    .hdr.flags = VHOST_USER_VERSION,
+    .payload.state = state,
+    .hdr.size = sizeof(msg.payload.state),
+    };
+
+    bool reply_supported = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_REPLY_ACK);
+
+    if (reply_supported && wait_for_reply) {
+    msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
+    }



Do we need to fail if !realy_supported && wait_for_reply?

Thanks


I guess you mean "should we fail if VHOST_USER_PROTOCOL_F_REPLY_ACK 
feature is not supported?".


The implementation here is similar to that in vhost_user_set_vring_addr().

If this feature is not supported, it will call enforce_reply(), then 
call vhost_user_get_features() to get a reply.


Since the messages will be processed sequentailly in DPDK, success of 
enforce_reply() means the previous message VHOST_USER_SET_VRING_ENABLE 
has been processed.


Thanks




+
+    ret = vhost_user_write(dev, , NULL, 0);
+    if (ret < 0) {
+    return ret;
+    }
+
+    if (wait_for_reply) {
+    return enforce_reply(dev, );
+    }
+
+    return ret;
+}
+
  static int vhost_user_set_vring_enable(struct vhost_dev *dev, int 
enable)

  {
  int i;
@@ -1207,13 +1252,8 @@ static int vhost_user_set_vring_enable(struct 
vhost_dev *dev, int enable)

  }
    for (i = 0; i < dev->nvqs; ++i) {
-    int ret;
-    struct vhost_vring_state state = {
-    .index = dev->vq_index + i,
-    .num   = enable,
-    };
-
-    ret = vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, 
);
+    int ret = vhost_user_set_single_vring_enable(dev, 
dev->vq_index + i,

+ enable, false);
  if (ret < 0) {
  /*
   * Restoring the previous state is likely infeasible, 
as well as

@@ -2627,6 +2667,7 @@ const VhostOps user_ops = {
  .vhost_set_owner = vhost_user_set_owner,
  .vhost_reset_device = vhost_user_reset_device,
  .vhost_get_vq_index = vhost_user_get_vq_index,
+    .vhost_set_single_vring_enable = 
vhost_user_set_single_vring_enable,

  .vhost_set_vring_enable = vhost_user_set_vring_enable,
  .vhost_requires_shm_log = vhost_user_requires_shm_log,
  .vhost_migration_done = vhost_user_migration_done,

[PATCH v2 1/3] target/arm: Use kvm_arm_sve_supported in kvm_arm_get_host_cpu_features

2022-07-25 Thread Richard Henderson

Indication for support for SVE will not depend on whether we
perform the query on the main kvm_state or the temp vcpu.

Signed-off-by: Richard Henderson 
---
 target/arm/kvm64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index d16d4ea250..bb1516b3d5 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -675,7 +675,7 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
 }
 }
 
-sve_supported = ioctl(fdarray[0], KVM_CHECK_EXTENSION, KVM_CAP_ARM_SVE) > 
0;
+sve_supported = kvm_arm_sve_supported();
 
 /* Add feature bits that can't appear until after VCPU init. */
 if (sve_supported) {
-- 
2.34.1

[PATCH v2 2/3] target/arm: Set KVM_ARM_VCPU_SVE while probing the host

2022-07-25 Thread Richard Henderson

Because we weren't setting this flag, our probe of ID_AA64ZFR0
was always returning zero.  This also obviates the adjustment
of ID_AA64PFR0, which had sanitized the SVE field.

The effects of the bug are not visible, because the only thing that
ID_AA64ZFR0 is used for within qemu at present is tcg translation.
The other tests for SVE within KVM are via ID_AA64PFR0.SVE.

Reported-by: Zenghui Yu 
Signed-off-by: Richard Henderson 
---
 target/arm/kvm64.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index bb1516b3d5..43cd7eb890 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -507,7 +507,6 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
 bool sve_supported;
 bool pmu_supported = false;
 uint64_t features = 0;
-uint64_t t;
 int err;
 
 /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
@@ -528,10 +527,17 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)
 struct kvm_vcpu_init init = { .target = -1, };
 
 /*
- * Ask for Pointer Authentication if supported. We can't play the
- * SVE trick of synthesising the ID reg as KVM won't tell us
- * whether we have the architected or IMPDEF version of PAuth, so
- * we have to use the actual ID regs.
+ * Ask for SVE if supported, so that we can query ID_AA64ZFR0,
+ * which is otherwise RAZ.
+ */
+sve_supported = kvm_arm_sve_supported();
+if (sve_supported) {
+init.features[0] |= 1 << KVM_ARM_VCPU_SVE;
+}
+
+/*
+ * Ask for Pointer Authentication if supported, so that we get
+ * the unsanitized field values for AA64ISAR1_EL1.
  */
 if (kvm_arm_pauth_supported()) {
 init.features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
@@ -675,20 +681,13 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)
 }
 }
 
-sve_supported = kvm_arm_sve_supported();
-
-/* Add feature bits that can't appear until after VCPU init. */
 if (sve_supported) {
-t = ahcf->isar.id_aa64pfr0;
-t = FIELD_DP64(t, ID_AA64PFR0, SVE, 1);
-ahcf->isar.id_aa64pfr0 = t;
-
 /*
  * There is a range of kernels between kernel commit 73433762fcae
  * and f81cb2c3ad41 which have a bug where the kernel doesn't expose
  * SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has enabled
- * SVE support, so we only read it here, rather than together with all
- * the other ID registers earlier.
+ * SVE support, which resulted in an error rather than RAZ.
+ * So only read the register if we set KVM_ARM_VCPU_SVE above.
  */
 err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
   ARM64_SYS_REG(3, 0, 0, 4, 4));
-- 
2.34.1

[PATCH v2 3/3] target/arm: Move sve probe inside kvm >= 4.15 branch

2022-07-25 Thread Richard Henderson

The test for the IF block indicates no ID registers are exposed, much
less host support for SVE.  Move the SVE probe into the ELSE block.

Signed-off-by: Richard Henderson 
---
 target/arm/kvm64.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 43cd7eb890..9b9dd46d78 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -679,18 +679,18 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)
 err |= read_sys_reg64(fdarray[2], >isar.reset_pmcr_el0,
   ARM64_SYS_REG(3, 3, 9, 12, 0));
 }
-}
 
-if (sve_supported) {
-/*
- * There is a range of kernels between kernel commit 73433762fcae
- * and f81cb2c3ad41 which have a bug where the kernel doesn't expose
- * SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has enabled
- * SVE support, which resulted in an error rather than RAZ.
- * So only read the register if we set KVM_ARM_VCPU_SVE above.
- */
-err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
-  ARM64_SYS_REG(3, 0, 0, 4, 4));
+if (sve_supported) {
+/*
+ * There is a range of kernels between kernel commit 73433762fcae
+ * and f81cb2c3ad41 which have a bug where the kernel doesn't
+ * expose SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has
+ * enabled SVE support, which resulted in an error rather than RAZ.
+ * So only read the register if we set KVM_ARM_VCPU_SVE above.
+ */
+err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
+  ARM64_SYS_REG(3, 0, 0, 4, 4));
+}
 }
 
 kvm_arm_destroy_scratch_host_vcpu(fdarray);
-- 
2.34.1

[PATCH v2 0/3] target/arm: Fix kvm probe of ID_AA64ZFR0

2022-07-25 Thread Richard Henderson

Our probing of this SVE register was done within an incorrect
vCPU environment, so that the id register was always RAZ.

Changes for v2:
  * Include the commit text I forgot.
  * Fix svm thinko.


r~


Richard Henderson (3):
  target/arm: Use kvm_arm_sve_supported in kvm_arm_get_host_cpu_features
  target/arm: Set KVM_ARM_VCPU_SVE while probing the host
  target/arm: Move sve probe inside kvm >= 4.15 branch

 target/arm/kvm64.c | 45 ++---
 1 file changed, 22 insertions(+), 23 deletions(-)

-- 
2.34.1

Re: [PATCH 1/3] target/arm: Create kvm_arm_svm_supported

2022-07-25 Thread Richard Henderson


On 7/25/22 19:02, Zenghui Yu wrote:

Hi Richard,

On 2022/7/26 2:14, Richard Henderson wrote:

Indication for support for SVE will not depend on whether we
perform the query on the main kvm_state or the temp vcpu.
Mirror kvm_arm_pauth_supported.

Signed-off-by: Richard Henderson 
---
 target/arm/kvm64.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index d16d4ea250..36ff20c9e3 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -496,6 +496,11 @@ static bool kvm_arm_pauth_supported(void)
 kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_GENERIC));
 }

+static bool kvm_arm_svm_supported(void)
+{
+    return kvm_check_extension(kvm_state, KVM_CAP_ARM_SVE);
+}
+


Not sure if it's a typo. Maybe we should instead use
kvm_arm_sve_supported() which was introduced in commit 14e99e0fbbc6.


Oof, it certainly is.

r~

Re: [PATCH v2] vhost: Get vring base from vq, not svq

2022-07-25 Thread Jason Wang

On Mon, Jul 25, 2022 at 3:07 PM Jason Wang  wrote:
>
> On Mon, Jul 18, 2022 at 8:05 PM Eugenio Pérez  wrote:
> >
> > The SVQ vring used idx usually match with the guest visible one, as long
> > as all the guest buffers (GPA) maps to exactly one buffer within qemu's
> > VA. However, as we can see in virtqueue_map_desc, a single guest buffer
> > could map to many buffers in SVQ vring.
> >
> > Also, its also a mistake to rewind them at the source of migration.
> > Since VirtQueue is able to migrate the inflight descriptors, its
> > responsability of the destination to perform the rewind just in case it
> > cannot report the inflight descriptors to the device.
> >
> > This makes easier to migrate between backends or to recover them in
> > vhost devices that support set in flight descriptors.
> >
> > Fixes: 6d0b22266633 ("vdpa: Adapt vhost_vdpa_get_vring_base to SVQ")
> > Signed-off-by: Eugenio Pérez 
>
> Acked-by: Jason Wang 

I've queued this for hard-freeze.

Thanks

>
> >
> > --
> > v2: Squash both fixes in one.
> > ---
> >  hw/virtio/vhost-vdpa.c | 24 
> >  1 file changed, 12 insertions(+), 12 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index 795ed5a049..4458c8d23e 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -1178,7 +1178,18 @@ static int vhost_vdpa_set_vring_base(struct 
> > vhost_dev *dev,
> > struct vhost_vring_state *ring)
> >  {
> >  struct vhost_vdpa *v = dev->opaque;
> > +VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
> >
> > +/*
> > + * vhost-vdpa devices does not support in-flight requests. Set all of 
> > them
> > + * as available.
> > + *
> > + * TODO: This is ok for networking, but other kinds of devices might
> > + * have problems with these retransmissions.
> > + */
> > +while (virtqueue_rewind(vq, 1)) {
> > +continue;
> > +}
> >  if (v->shadow_vqs_enabled) {
> >  /*
> >   * Device vring base was set at device start. SVQ base is handled 
> > by
> > @@ -1194,21 +1205,10 @@ static int vhost_vdpa_get_vring_base(struct 
> > vhost_dev *dev,
> > struct vhost_vring_state *ring)
> >  {
> >  struct vhost_vdpa *v = dev->opaque;
> > -int vdpa_idx = ring->index - dev->vq_index;
> >  int ret;
> >
> >  if (v->shadow_vqs_enabled) {
> > -VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, 
> > vdpa_idx);
> > -
> > -/*
> > - * Setting base as last used idx, so destination will see as 
> > available
> > - * all the entries that the device did not use, including the 
> > in-flight
> > - * processing ones.
> > - *
> > - * TODO: This is ok for networking, but other kinds of devices 
> > might
> > - * have problems with these retransmissions.
> > - */
> > -ring->num = svq->last_used_idx;
> > +ring->num = virtio_queue_get_last_avail_idx(dev->vdev, 
> > ring->index);
> >  return 0;
> >  }
> >
> > --
> > 2.31.1
> >

Re: [PATCH] vdpa: Fix memory listener deletions of iova tree

2022-07-25 Thread Jason Wang

On Mon, Jul 25, 2022 at 3:05 PM Jason Wang  wrote:
>
> On Fri, Jul 22, 2022 at 4:26 PM Eugenio Pérez  wrote:
> >
> > vhost_vdpa_listener_region_del is always deleting the first iova entry
> > of the tree, since it's using the needle iova instead of the result's
> > one.
> >
> > This was detected using a vga virtual device in the VM using vdpa SVQ.
> > It makes some extra memory adding and deleting, so the wrong one was
> > mapped / unmapped. This was undetected before since all the memory was
> > mappend and unmapped totally without that device, but other conditions
> > could trigger it too:
> >
> > * mem_region was with .iova = 0, .translated_addr = (correct GPA).
> > * iova_tree_find_iova returned right result, but does not update
> >   mem_region.
> > * iova_tree_remove always removed region with .iova = 0. Right iova were
> >   sent to the device.
> > * Next map will fill the first region with .iova = 0, causing a mapping
> >   with the same iova and device complains, if the next action is a map.
> > * Next unmap will cause to try to unmap again iova = 0, causing the
> >   device to complain that no region was mapped at iova = 0.
> >
> > Fixes: 34e3c94edaef ("vdpa: Add custom IOTLB translations to SVQ")
> > Reported-by: Lei Yang 
> > Signed-off-by: Eugenio Pérez 
>
> Acked-by: Jason Wang 

I've queued this for hard-freeze.

Thanks

>
> > ---
> >  hw/virtio/vhost-vdpa.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index 291cd19054..00e990ea40 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -290,7 +290,7 @@ static void 
> > vhost_vdpa_listener_region_del(MemoryListener *listener,
> >
> >  result = vhost_iova_tree_find_iova(v->iova_tree, _region);
> >  iova = result->iova;
> > -vhost_iova_tree_remove(v->iova_tree, _region);
> > +vhost_iova_tree_remove(v->iova_tree, result);
> >  }
> >  vhost_vdpa_iotlb_batch_begin_once(v);
> >  ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
> > --
> > 2.31.1
> >

Re: [PATCH] e1000e: Fix possible interrupt loss when using MSI

2022-07-25 Thread Jason Wang

On Wed, Jul 20, 2022 at 7:14 PM Ake Koomsin  wrote:
>
> Commit "e1000e: Prevent MSI/MSI-X storms" introduced msi_causes_pending
> to prevent interrupt storms problem. It was tested with MSI-X.
>
> In case of MSI, the guest can rely solely on interrupts to clear ICR.
> Upon clearing all pending interrupts, msi_causes_pending gets cleared.
> However, when e1000e_itr_should_postpone() in e1000e_send_msi() returns
> true, MSI never gets fired by e1000e_intrmgr_on_throttling_timer()
> because msi_causes_pending is still set. This results in interrupt loss.
>
> To prevent this, we need to clear msi_causes_pending when MSI is going
> to get fired by the throttling timer. The guest can then receive
> interrupts eventually.
>
> Signed-off-by: Ake Koomsin 

I've queued this.

Thanks

> ---
>  hw/net/e1000e_core.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> index 2c51089a82..208e3e0d79 100644
> --- a/hw/net/e1000e_core.c
> +++ b/hw/net/e1000e_core.c
> @@ -159,6 +159,8 @@ e1000e_intrmgr_on_throttling_timer(void *opaque)
>
>  if (msi_enabled(timer->core->owner)) {
>  trace_e1000e_irq_msi_notify_postponed();
> +/* Clear msi_causes_pending to fire MSI eventually */
> +timer->core->msi_causes_pending = 0;
>  e1000e_set_interrupt_cause(timer->core, 0);
>  } else {
>  trace_e1000e_irq_legacy_notify_postponed();
> --
> 2.25.1
>

Re: [PATCH 16/16] vhost-net: vq reset feature bit support

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

Add support for negotation of vq reset feature bit.

Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 



I'd suggest to add support for vhost-net kernel as well. It looks much 
more easier than vhost-user (I guess a stop/start would do the trick).


Thanks



---
  hw/net/vhost_net.c  | 1 +
  hw/net/virtio-net.c | 3 ++-
  2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index 4f5f034c11..de910f6466 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -73,6 +73,7 @@ static const int user_feature_bits[] = {
  VIRTIO_NET_F_MTU,
  VIRTIO_F_IOMMU_PLATFORM,
  VIRTIO_F_RING_PACKED,
+VIRTIO_F_RING_RESET,
  VIRTIO_NET_F_RSS,
  VIRTIO_NET_F_HASH_REPORT,
  
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c

index 0747ffe71c..a8b299067a 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -757,6 +757,8 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, 
uint64_t features,
  
  virtio_add_feature(, VIRTIO_NET_F_MAC);
  
+virtio_add_feature(, VIRTIO_F_RING_RESET);

+
  if (!peer_has_vnet_hdr(n)) {
  virtio_clear_feature(, VIRTIO_NET_F_CSUM);
  virtio_clear_feature(, VIRTIO_NET_F_HOST_TSO4);
@@ -777,7 +779,6 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, 
uint64_t features,
  }
  
  if (!get_vhost_net(nc->peer)) {

-virtio_add_feature(, VIRTIO_F_RING_RESET);
  return features;
  }

Re: [PATCH 14/16] virtio-net: support queue_enable for vhost-user

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

Support queue enable in vhost-user scenario. It will be called when
a vq reset operation is performed and the vq will be restared.

It should be noted that we can restart the vq when the vhost has
already started. When launching a new vhost-user device, the vhost
is not started and all vqs are not initalized until
VIRTIO_PCI_COMMON_STATUS is written. Thus, we should use vhost_started
to differentiate the two cases: vq reset and device start.

Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 
---
  hw/net/virtio-net.c | 20 
  1 file changed, 20 insertions(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 8396e21a67..2c26e2ef73 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -544,6 +544,25 @@ static void virtio_net_queue_reset(VirtIODevice *vdev, 
uint32_t queue_index)
  assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
  }
  
+static void virtio_net_queue_enable(VirtIODevice *vdev, uint32_t queue_index)

+{
+VirtIONet *n = VIRTIO_NET(vdev);
+NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
+int r;
+
+if (!nc->peer || !vdev->vhost_started) {
+return;
+}
+
+if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
+r = vhost_virtqueue_restart(vdev, nc, queue_index);
+if (r < 0) {
+error_report("unable to restart vhost net virtqueue: %d, "
+"when resetting the queue", queue_index);
+}
+}
+}



So we don't check queue_enable in vhost_dev_start(), does this mean the 
queue_enable is actually meaningless (since the virtqueue is already 
started there)?


And another issue is

peet_attach/peer_detach() "abuse" vhost_set_vring_enable(). This 
probably means we need to invent new type of request instead of re-using 
VHOST_USER_SET_VRING_ENABLE.


Thanks



+
  static void virtio_net_reset(VirtIODevice *vdev)
  {
  VirtIONet *n = VIRTIO_NET(vdev);
@@ -3781,6 +3800,7 @@ static void virtio_net_class_init(ObjectClass *klass, 
void *data)
  vdc->bad_features = virtio_net_bad_features;
  vdc->reset = virtio_net_reset;
  vdc->queue_reset = virtio_net_queue_reset;
+vdc->queue_enable = virtio_net_queue_enable;
  vdc->set_status = virtio_net_set_status;
  vdc->guest_notifier_mask = virtio_net_guest_notifier_mask;
  vdc->guest_notifier_pending = virtio_net_guest_notifier_pending;

Re: [PATCH 13/16] virtio: introduce queue_enable in virtio

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

Introduce the interface queue_enable() in VirtioDeviceClass and the
fucntion virtio_queue_enable() in virtio, it can be called when
VIRTIO_PCI_COMMON_Q_ENABLE is written.



I'd suggest to split this series into two.

1) queue_enable

2) queue_reset

The patch looks good to me.

Thanks




Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 
---
  hw/virtio/virtio-pci.c | 1 +
  hw/virtio/virtio.c | 9 +
  include/hw/virtio/virtio.h | 2 ++
  3 files changed, 12 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 35e8a5101a..85e1840479 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1335,6 +1335,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr 
addr,
 proxy->vqs[vdev->queue_sel].avail[0],
 ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
 proxy->vqs[vdev->queue_sel].used[0]);
+virtio_queue_enable(vdev, vdev->queue_sel);
  proxy->vqs[vdev->queue_sel].enabled = 1;
  proxy->vqs[vdev->queue_sel].reset = 0;
  } else {
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 0e9d41366f..82eb9dd4f2 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2050,6 +2050,15 @@ void virtio_queue_reset(VirtIODevice *vdev, uint32_t 
queue_index)
  __virtio_queue_reset(vdev, queue_index);
  }
  
+void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index)

+{
+VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+if (k->queue_enable) {
+k->queue_enable(vdev, queue_index);
+}
+}
+
  void virtio_reset(void *opaque)
  {
  VirtIODevice *vdev = opaque;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 879394299b..085997d8f3 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -131,6 +131,7 @@ struct VirtioDeviceClass {
  void (*reset)(VirtIODevice *vdev);
  void (*set_status)(VirtIODevice *vdev, uint8_t val);
  void (*queue_reset)(VirtIODevice *vdev, uint32_t queue_index);
+void (*queue_enable)(VirtIODevice *vdev, uint32_t queue_index);
  /* For transitional devices, this is a bitmap of features
   * that are only exposed on the legacy interface but not
   * the modern one.
@@ -270,6 +271,7 @@ int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, 
int n,
  int virtio_set_status(VirtIODevice *vdev, uint8_t val);
  void virtio_reset(void *opaque);
  void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index);
+void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index);
  void virtio_update_irq(VirtIODevice *vdev);
  int virtio_set_features(VirtIODevice *vdev, uint64_t val);

Re: [PATCH 12/16] vhost-net: introduce restart and stop for vhost_net's vqs

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

Introduce vhost_virtqueue_restart(), which can restart the
virtqueue when the vhost net started running before.

Introduce vhost_virtqueue_stop(), which can disable the vq
and unmap vrings and the desc of the vq. When disabling the
vq, the function is blocked and waits for a reply.

Combining the two functions, we can reset a virtqueue with a
started vhost net.

Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 
---
  hw/net/vhost_net.c  | 55 +
  include/net/vhost_net.h |  5 
  2 files changed, 60 insertions(+)

diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index ccac5b7a64..4f5f034c11 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -514,3 +514,58 @@ int vhost_net_set_mtu(struct vhost_net *net, uint16_t mtu)
  
  return vhost_ops->vhost_net_set_mtu(>dev, mtu);

  }
+
+void vhost_virtqueue_stop(VirtIODevice *vdev, NetClientState *nc,
+  int vq_index)



Let's rename this as vhost_net_virtqueue_stop()



+{
+VHostNetState *net = get_vhost_net(nc->peer);
+const VhostOps *vhost_ops = net->dev.vhost_ops;
+int r;
+
+assert(vhost_ops);
+
+r = vhost_ops->vhost_set_single_vring_enable(>dev, vq_index, 0, true);
+if (r < 0) {
+goto err_queue_disable;
+}
+
+vhost_dev_virtqueue_release(>dev, vdev, vq_index);
+
+return;
+
+err_queue_disable:
+error_report("Error when releasing the qeuue.");
+}
+
+int vhost_virtqueue_restart(VirtIODevice *vdev, NetClientState *nc,
+int vq_index)
+{
+VHostNetState *net = get_vhost_net(nc->peer);
+const VhostOps *vhost_ops = net->dev.vhost_ops;
+int r;
+
+if (!net->dev.started) {
+return 0;
+}
+
+assert(vhost_ops);
+
+r = vhost_dev_virtqueue_restart(>dev, vdev, vq_index);
+if (r < 0) {
+goto err_start;
+}
+
+r = vhost_ops->vhost_set_single_vring_enable(>dev, vq_index, 1,
+ false);



So it looks nothing vhost_net specific but vhost. And why not do 
set_single_vring_enable in vhost_dev_virtqueue_restart() (btw, the name 
of this function looks confusing).


Thanks



+if (r < 0) {
+goto err_start;
+}
+
+return 0;
+
+err_start:
+error_report("Error when restarting the queue.");
+vhost_dev_stop(>dev, vdev);
+
+return r;
+}
diff --git a/include/net/vhost_net.h b/include/net/vhost_net.h
index 387e913e4e..fcb09e36ef 100644
--- a/include/net/vhost_net.h
+++ b/include/net/vhost_net.h
@@ -48,4 +48,9 @@ uint64_t vhost_net_get_acked_features(VHostNetState *net);
  
  int vhost_net_set_mtu(struct vhost_net *net, uint16_t mtu);
  
+void vhost_virtqueue_stop(VirtIODevice *vdev, NetClientState *nc,

+  int vq_index);
+int vhost_virtqueue_restart(VirtIODevice *vdev, NetClientState *nc,
+int vq_index);
+
  #endif

Re: [PATCH 11/16] vhost: introduce restart and release for vhost_dev's vqs

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

Introduce vhost_dev_virtqueue_restart(), which can restart the
virtqueue when the vhost has already started running.

Meanwhile, vhost_dev_virtqueue_release(), which can ummap the
vrings and the desc of a specific vq of a device.

Combining the two functions, we can reset a virtqueue with a
started vhost.

Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 
---
  hw/virtio/vhost.c | 29 +
  include/hw/virtio/vhost.h |  6 ++
  2 files changed, 35 insertions(+)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index e467dfc7bc..d158d71866 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1904,3 +1904,32 @@ int vhost_net_set_backend(struct vhost_dev *hdev,
  
  return -ENOSYS;

  }
+
+void vhost_dev_virtqueue_release(struct vhost_dev *hdev, VirtIODevice *vdev,
+ int vq_index)
+{
+int idx = vq_index - hdev->vq_index;
+
+idx = hdev->vhost_ops->vhost_get_vq_index(hdev, idx);
+
+vhost_virtqueue_unmap(hdev,
+  vdev,
+  hdev->vqs + idx,
+  hdev->vq_index + idx);
+}



Anything wrong that makes you can't use vhost_virtqueue_stop() here?

Thanks



+
+int vhost_dev_virtqueue_restart(struct vhost_dev *hdev, VirtIODevice *vdev,
+int vq_index)
+{
+int idx = vq_index - hdev->vq_index;
+int r = 0;
+
+idx = hdev->vhost_ops->vhost_get_vq_index(hdev, idx);
+
+r = vhost_virtqueue_start(hdev,
+  vdev,
+  hdev->vqs + idx,
+  hdev->vq_index + idx);
+
+return r;
+}
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index a346f23d13..7df7dbe24d 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -277,6 +277,12 @@ bool vhost_has_free_slot(void);
  int vhost_net_set_backend(struct vhost_dev *hdev,
struct vhost_vring_file *file);
  
+

+void vhost_dev_virtqueue_release(struct vhost_dev *hdev, VirtIODevice *vdev,
+ int vq_index);
+int vhost_dev_virtqueue_restart(struct vhost_dev *hdev, VirtIODevice *vdev,
+int vq_index);
+
  int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write);
  
  void vhost_dev_reset_inflight(struct vhost_inflight *inflight);

Re: [PATCH 10/16] vhost: extract the logic of unmapping the vrings and desc

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

Introduce vhost_virtqueue_unmap() to ummap the vrings and desc
of a virtqueue.

The function will be used later.

Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 



Acked-by: Jason Wang 



---
  hw/virtio/vhost.c | 20 ++--
  1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 0827d631c0..e467dfc7bc 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1197,6 +1197,19 @@ fail_alloc_desc:
  return r;
  }
  
+static void vhost_virtqueue_unmap(struct vhost_dev *dev,

+  struct VirtIODevice *vdev,
+  struct vhost_virtqueue *vq,
+  unsigned idx)
+{
+vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
+   1, virtio_queue_get_used_size(vdev, idx));
+vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
+   0, virtio_queue_get_avail_size(vdev, idx));
+vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
+   0, virtio_queue_get_desc_size(vdev, idx));
+}
+
  static void vhost_virtqueue_stop(struct vhost_dev *dev,
  struct VirtIODevice *vdev,
  struct vhost_virtqueue *vq,
@@ -1235,12 +1248,7 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
  vhost_vq_index);
  }
  
-vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),

-   1, virtio_queue_get_used_size(vdev, idx));
-vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
-   0, virtio_queue_get_avail_size(vdev, idx));
-vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
-   0, virtio_queue_get_desc_size(vdev, idx));
+vhost_virtqueue_unmap(dev, vdev, vq, idx);
  }
  
  static void vhost_eventfd_add(MemoryListener *listener,

Re: [PATCH 09/16] vhost-user: enable/disable a single vring

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

Implement the vhost_set_single_vring_enable, which is to enable or
disable a single vring.

The parameter wait_for_reply is added to help for some cases such as
vq reset.

Meanwhile, vhost_user_set_vring_enable() is refactored.

Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 
---
  hw/virtio/vhost-user.c | 55 --
  1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 75b8df21a4..5a80a415f0 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -267,6 +267,8 @@ struct scrub_regions {
  int fd_idx;
  };
  
+static int enforce_reply(struct vhost_dev *dev, const VhostUserMsg *msg);

+
  static bool ioeventfd_enabled(void)
  {
  return !kvm_enabled() || kvm_eventfds_enabled();
@@ -1198,6 +1200,49 @@ static int vhost_user_set_vring_base(struct vhost_dev 
*dev,
  return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
  }
  
+

+static int vhost_user_set_single_vring_enable(struct vhost_dev *dev,
+  int index,
+  int enable,
+  bool wait_for_reply)
+{
+int ret;
+
+if (index < dev->vq_index || index >= dev->vq_index + dev->nvqs) {
+return -EINVAL;
+}
+
+struct vhost_vring_state state = {
+.index = index,
+.num   = enable,
+};
+
+VhostUserMsg msg = {
+.hdr.request = VHOST_USER_SET_VRING_ENABLE,
+.hdr.flags = VHOST_USER_VERSION,
+.payload.state = state,
+.hdr.size = sizeof(msg.payload.state),
+};
+
+bool reply_supported = virtio_has_feature(dev->protocol_features,
+  VHOST_USER_PROTOCOL_F_REPLY_ACK);
+
+if (reply_supported && wait_for_reply) {
+msg.hdr.flags |= VHOST_USER_NEED_REPLY_MASK;
+}



Do we need to fail if !realy_supported && wait_for_reply?

Thanks




+
+ret = vhost_user_write(dev, , NULL, 0);
+if (ret < 0) {
+return ret;
+}
+
+if (wait_for_reply) {
+return enforce_reply(dev, );
+}
+
+return ret;
+}
+
  static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable)
  {
  int i;
@@ -1207,13 +1252,8 @@ static int vhost_user_set_vring_enable(struct vhost_dev 
*dev, int enable)
  }
  
  for (i = 0; i < dev->nvqs; ++i) {

-int ret;
-struct vhost_vring_state state = {
-.index = dev->vq_index + i,
-.num   = enable,
-};
-
-ret = vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, );
+int ret = vhost_user_set_single_vring_enable(dev, dev->vq_index + i,
+ enable, false);
  if (ret < 0) {
  /*
   * Restoring the previous state is likely infeasible, as well as
@@ -2627,6 +2667,7 @@ const VhostOps user_ops = {
  .vhost_set_owner = vhost_user_set_owner,
  .vhost_reset_device = vhost_user_reset_device,
  .vhost_get_vq_index = vhost_user_get_vq_index,
+.vhost_set_single_vring_enable = vhost_user_set_single_vring_enable,
  .vhost_set_vring_enable = vhost_user_set_vring_enable,
  .vhost_requires_shm_log = vhost_user_requires_shm_log,
  .vhost_migration_done = vhost_user_migration_done,

Re: [PATCH 08/16] vhost: add op to enable or disable a single vring

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

The interface to set enable status for a single vring is lacked in
VhostOps, since the vhost_set_vring_enable_op will manipulate all
virtqueues in a device.

Resetting a single vq will rely on this interface. It requires a
reply to indicate that the reset operation is finished, so the
parameter, wait_for_reply, is added.



The wait reply seems to be a implementation specific thing. Can we hide it?

Thanks




Signed-off-by: Kangjie Xu 
Signed-off-by: Xuan Zhuo 
---
  include/hw/virtio/vhost-backend.h | 4 
  1 file changed, 4 insertions(+)

diff --git a/include/hw/virtio/vhost-backend.h 
b/include/hw/virtio/vhost-backend.h
index eab46d7f0b..7bddd1e9a0 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -81,6 +81,9 @@ typedef int (*vhost_set_backend_cap_op)(struct vhost_dev 
*dev);
  typedef int (*vhost_set_owner_op)(struct vhost_dev *dev);
  typedef int (*vhost_reset_device_op)(struct vhost_dev *dev);
  typedef int (*vhost_get_vq_index_op)(struct vhost_dev *dev, int idx);
+typedef int (*vhost_set_single_vring_enable_op)(struct vhost_dev *dev,
+int index, int enable,
+bool wait_for_reply);
  typedef int (*vhost_set_vring_enable_op)(struct vhost_dev *dev,
   int enable);
  typedef bool (*vhost_requires_shm_log_op)(struct vhost_dev *dev);
@@ -155,6 +158,7 @@ typedef struct VhostOps {
  vhost_set_owner_op vhost_set_owner;
  vhost_reset_device_op vhost_reset_device;
  vhost_get_vq_index_op vhost_get_vq_index;
+vhost_set_single_vring_enable_op vhost_set_single_vring_enable;
  vhost_set_vring_enable_op vhost_set_vring_enable;
  vhost_requires_shm_log_op vhost_requires_shm_log;
  vhost_migration_done_op vhost_migration_done;

Re: [PATCH 07/16] virtio-net: support queue reset

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

From: Xuan Zhuo 

virtio-net implements queue reset. Queued packets in the corresponding
queue pair are flushed or purged.

Queue reset is currently only implemented for non-vhosts.

Signed-off-by: Xuan Zhuo 
---
  hw/net/virtio-net.c | 15 +++
  1 file changed, 15 insertions(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 7ad948ee7c..8396e21a67 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -531,6 +531,19 @@ static RxFilterInfo 
*virtio_net_query_rxfilter(NetClientState *nc)
  return info;
  }
  
+static void virtio_net_queue_reset(VirtIODevice *vdev, uint32_t queue_index)

+{
+VirtIONet *n = VIRTIO_NET(vdev);
+NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
+
+if (!nc->peer) {
+return;
+}
+
+qemu_flush_or_purge_queued_packets(nc->peer, true);
+assert(!virtio_net_get_subqueue(nc)->async_tx.elem);



Let's try to reuse this function in virtio_net_reset().



+}
+
  static void virtio_net_reset(VirtIODevice *vdev)
  {
  VirtIONet *n = VIRTIO_NET(vdev);
@@ -741,6 +754,7 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, 
uint64_t features,
  }
  
  if (!get_vhost_net(nc->peer)) {

+virtio_add_feature(, VIRTIO_F_RING_RESET);



This breaks migration compatibility.

We probably need:

1) a new command line parameter
2) make it disabled for pre-7.2 machine

Thanks



  return features;
  }
  
@@ -3766,6 +3780,7 @@ static void virtio_net_class_init(ObjectClass *klass, void *data)

  vdc->set_features = virtio_net_set_features;
  vdc->bad_features = virtio_net_bad_features;
  vdc->reset = virtio_net_reset;
+vdc->queue_reset = virtio_net_queue_reset;
  vdc->set_status = virtio_net_set_status;
  vdc->guest_notifier_mask = virtio_net_guest_notifier_mask;
  vdc->guest_notifier_pending = virtio_net_guest_notifier_pending;

Re: [PATCH 06/16] virtio-pci: support queue reset

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

From: Xuan Zhuo 

PCI devices support vq reset.

Based on this function, the driver can adjust the size of the ring, and
quickly recycle the buffer in the ring.

Signed-off-by: Xuan Zhuo 
---
  hw/virtio/virtio-pci.c | 16 
  include/hw/virtio/virtio-pci.h |  1 +
  2 files changed, 17 insertions(+)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 45327f0b31..35e8a5101a 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1246,6 +1246,9 @@ static uint64_t virtio_pci_common_read(void *opaque, 
hwaddr addr,
  case VIRTIO_PCI_COMMON_Q_USEDHI:
  val = proxy->vqs[vdev->queue_sel].used[1];
  break;
+case VIRTIO_PCI_COMMON_Q_RESET:
+val = proxy->vqs[vdev->queue_sel].reset;
+break;
  default:
  val = 0;
  }
@@ -1333,6 +1336,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr 
addr,
 ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
 proxy->vqs[vdev->queue_sel].used[0]);
  proxy->vqs[vdev->queue_sel].enabled = 1;
+proxy->vqs[vdev->queue_sel].reset = 0;
  } else {
  virtio_error(vdev, "wrong value for queue_enable %"PRIx64, val);
  }
@@ -1355,6 +1359,17 @@ static void virtio_pci_common_write(void *opaque, hwaddr 
addr,
  case VIRTIO_PCI_COMMON_Q_USEDHI:
  proxy->vqs[vdev->queue_sel].used[1] = val;
  break;
+case VIRTIO_PCI_COMMON_Q_RESET:
+if (val == 1) {
+proxy->vqs[vdev->queue_sel].reset = 1;
+
+virtio_queue_reset(vdev, vdev->queue_sel);
+
+/* mark reset complete */



The code explain itself, so the comment could be removed.



+proxy->vqs[vdev->queue_sel].reset = 0;
+proxy->vqs[vdev->queue_sel].enabled = 0;
+}
+break;
  default:
  break;
  }
@@ -1950,6 +1965,7 @@ static void virtio_pci_reset(DeviceState *qdev)
  
  for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {

  proxy->vqs[i].enabled = 0;
+proxy->vqs[i].reset = 0;
  proxy->vqs[i].num = 0;
  proxy->vqs[i].desc[0] = proxy->vqs[i].desc[1] = 0;
  proxy->vqs[i].avail[0] = proxy->vqs[i].avail[1] = 0;
diff --git a/include/hw/virtio/virtio-pci.h b/include/hw/virtio/virtio-pci.h
index 2446dcd9ae..e9290e2b94 100644
--- a/include/hw/virtio/virtio-pci.h
+++ b/include/hw/virtio/virtio-pci.h
@@ -117,6 +117,7 @@ typedef struct VirtIOPCIRegion {
  typedef struct VirtIOPCIQueue {
uint16_t num;
bool enabled;
+  bool reset;



How is this migrated?

Thanks



uint32_t desc[2];
uint32_t avail[2];
uint32_t used[2];

Re: [PATCH 05/16] virtio: introduce virtio_queue_reset()

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

From: Xuan Zhuo 

Introduce a new interface function virtio_queue_reset() to implement
reset for vq.

Add a new callback to VirtioDeviceClass for queue reset operation for
each child device.

Signed-off-by: Xuan Zhuo 



Acked-by: Jason Wang 



---
  hw/virtio/virtio.c | 11 +++
  include/hw/virtio/virtio.h |  2 ++
  2 files changed, 13 insertions(+)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 67d54832a9..0e9d41366f 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2039,6 +2039,17 @@ static void __virtio_queue_reset(VirtIODevice *vdev, 
uint32_t i)
  virtio_virtqueue_reset_region_cache(>vq[i]);
  }
  
+void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index)

+{
+VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
+
+if (k->queue_reset) {
+k->queue_reset(vdev, queue_index);
+}
+
+__virtio_queue_reset(vdev, queue_index);
+}
+
  void virtio_reset(void *opaque)
  {
  VirtIODevice *vdev = opaque;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index db1c0ddf6b..879394299b 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -130,6 +130,7 @@ struct VirtioDeviceClass {
  void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
  void (*reset)(VirtIODevice *vdev);
  void (*set_status)(VirtIODevice *vdev, uint8_t val);
+void (*queue_reset)(VirtIODevice *vdev, uint32_t queue_index);
  /* For transitional devices, this is a bitmap of features
   * that are only exposed on the legacy interface but not
   * the modern one.
@@ -268,6 +269,7 @@ int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, 
int n,
MemoryRegion *mr, bool assign);
  int virtio_set_status(VirtIODevice *vdev, uint8_t val);
  void virtio_reset(void *opaque);
+void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index);
  void virtio_update_irq(VirtIODevice *vdev);
  int virtio_set_features(VirtIODevice *vdev, uint64_t val);

Re: [PATCH 04/16] virtio: introduce __virtio_queue_reset()

2022-07-25 Thread Jason Wang




在 2022/7/18 19:17, Kangjie Xu 写道:

From: Xuan Zhuo 

Separate the logic of vq reset. This logic will be called directly
later.

Signed-off-by: Xuan Zhuo 



Acked-by: Jason Wang 



---
  hw/virtio/virtio.c | 37 +
  1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 5d607aeaa0..67d54832a9 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2019,6 +2019,26 @@ static enum virtio_device_endian 
virtio_current_cpu_endian(void)
  }
  }
  
+static void __virtio_queue_reset(VirtIODevice *vdev, uint32_t i)

+{
+vdev->vq[i].vring.desc = 0;
+vdev->vq[i].vring.avail = 0;
+vdev->vq[i].vring.used = 0;
+vdev->vq[i].last_avail_idx = 0;
+vdev->vq[i].shadow_avail_idx = 0;
+vdev->vq[i].used_idx = 0;
+vdev->vq[i].last_avail_wrap_counter = true;
+vdev->vq[i].shadow_avail_wrap_counter = true;
+vdev->vq[i].used_wrap_counter = true;
+virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR);
+vdev->vq[i].signalled_used = 0;
+vdev->vq[i].signalled_used_valid = false;
+vdev->vq[i].notification = true;
+vdev->vq[i].vring.num = vdev->vq[i].vring.num_default;
+vdev->vq[i].inuse = 0;
+virtio_virtqueue_reset_region_cache(>vq[i]);
+}
+
  void virtio_reset(void *opaque)
  {
  VirtIODevice *vdev = opaque;
@@ -2050,22 +2070,7 @@ void virtio_reset(void *opaque)
  virtio_notify_vector(vdev, vdev->config_vector);
  
  for(i = 0; i < VIRTIO_QUEUE_MAX; i++) {

-vdev->vq[i].vring.desc = 0;
-vdev->vq[i].vring.avail = 0;
-vdev->vq[i].vring.used = 0;
-vdev->vq[i].last_avail_idx = 0;
-vdev->vq[i].shadow_avail_idx = 0;
-vdev->vq[i].used_idx = 0;
-vdev->vq[i].last_avail_wrap_counter = true;
-vdev->vq[i].shadow_avail_wrap_counter = true;
-vdev->vq[i].used_wrap_counter = true;
-virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR);
-vdev->vq[i].signalled_used = 0;
-vdev->vq[i].signalled_used_valid = false;
-vdev->vq[i].notification = true;
-vdev->vq[i].vring.num = vdev->vq[i].vring.num_default;
-vdev->vq[i].inuse = 0;
-virtio_virtqueue_reset_region_cache(>vq[i]);
+__virtio_queue_reset(vdev, i);
  }
  }

Re: [PATCH 01/16] virtio-pci: virtio_pci_common_cfg add queue_notify_data

2022-07-25 Thread Jason Wang




在 2022/7/18 19:16, Kangjie Xu 写道:

From: Xuan Zhuo 

Add queue_notify_data in struct virtio_pci_common_cfg, which comes from
here https://github.com/oasis-tcs/virtio-spec/issues/89

Since I want to add queue_reset after queue_notify_data, I submitted
this patch first.

Signed-off-by: Xuan Zhuo 



This should be done by script/update-linux-headers.sh.

Thanks



---
  include/standard-headers/linux/virtio_pci.h | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/include/standard-headers/linux/virtio_pci.h 
b/include/standard-headers/linux/virtio_pci.h
index db7a8e2fcb..598ebe9825 100644
--- a/include/standard-headers/linux/virtio_pci.h
+++ b/include/standard-headers/linux/virtio_pci.h
@@ -164,6 +164,7 @@ struct virtio_pci_common_cfg {
uint32_t queue_avail_hi;/* read-write */
uint32_t queue_used_lo; /* read-write */
uint32_t queue_used_hi; /* read-write */
+   uint16_t queue_notify_data; /* read-write */
  };
  
  /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */

@@ -202,6 +203,7 @@ struct virtio_pci_cfg_cap {
  #define VIRTIO_PCI_COMMON_Q_AVAILHI   44
  #define VIRTIO_PCI_COMMON_Q_USEDLO48
  #define VIRTIO_PCI_COMMON_Q_USEDHI52
+#define VIRTIO_PCI_COMMON_Q_NOTIFY_DATA56
  
  #endif /* VIRTIO_PCI_NO_MODERN */

Re: [PATCH v2 7/7] vdpa: Always start CVQ in SVQ mode

2022-07-25 Thread Jason Wang




在 2022/7/22 21:43, Eugenio Pérez 写道:

Isolate control virtqueue in its own group, allowing to intercept control
commands but letting dataplane run totally passthrough to the guest.

Signed-off-by: Eugenio Pérez 
---
  hw/virtio/vhost-vdpa.c |   3 +-
  net/vhost-vdpa.c   | 158 +++--
  2 files changed, 156 insertions(+), 5 deletions(-)

diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
index 79623badf2..fe1c85b086 100644
--- a/hw/virtio/vhost-vdpa.c
+++ b/hw/virtio/vhost-vdpa.c
@@ -668,7 +668,8 @@ static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
  {
  uint64_t features;
  uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
-0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
+0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
+0x1ULL << VHOST_BACKEND_F_IOTLB_ASID;
  int r;
  
  if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, )) {

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 6c1c64f9b1..f5075ef487 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -37,6 +37,9 @@ typedef struct VhostVDPAState {
  /* Control commands shadow buffers */
  void *cvq_cmd_out_buffer, *cvq_cmd_in_buffer;
  
+/* Number of address spaces supported by the device */

+unsigned address_space_num;
+
  /* The device always have SVQ enabled */
  bool always_svq;
  bool started;
@@ -100,6 +103,8 @@ static const uint64_t vdpa_svq_device_features =
  BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
  BIT_ULL(VIRTIO_NET_F_STANDBY);
  
+#define VHOST_VDPA_NET_CVQ_ASID 1

+
  VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
  {
  VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
@@ -214,6 +219,109 @@ static ssize_t vhost_vdpa_receive(NetClientState *nc, 
const uint8_t *buf,
  return 0;
  }
  
+static int vhost_vdpa_get_vring_group(int device_fd,

+  struct vhost_vring_state *state)
+{
+int r = ioctl(device_fd, VHOST_VDPA_GET_VRING_GROUP, state);
+return r < 0 ? -errno : 0;
+}



It would be more convenient for the caller if we can simply return 0 here.



+
+/**
+ * Check if all the virtqueues of the virtio device are in a different vq than
+ * the last vq. VQ group of last group passed in cvq_group.
+ */
+static bool vhost_vdpa_cvq_group_is_independent(struct vhost_vdpa *v,
+struct vhost_vring_state cvq_group)
+{
+struct vhost_dev *dev = v->dev;
+int ret;
+
+for (int i = 0; i < (dev->vq_index_end - 1); ++i) {
+struct vhost_vring_state vq_group = {
+.index = i,
+};
+
+ret = vhost_vdpa_get_vring_group(v->device_fd, _group);
+if (unlikely(ret)) {
+goto call_err;
+}
+if (unlikely(vq_group.num == cvq_group.num)) {
+error_report("CVQ %u group is the same as VQ %u one (%u)",
+ cvq_group.index, vq_group.index, cvq_group.num);



Any reason we need error_report() here?

Btw, I'd suggest to introduce new field in vhost_vdpa, then we can get 
and store the group_id there during init.


This could be useful for the future e.g PASID virtualization.



+return false;
+}
+}
+
+return true;
+
+call_err:
+error_report("Can't read vq group, errno=%d (%s)", -ret, g_strerror(-ret));
+return false;
+}
+
+static int vhost_vdpa_set_address_space_id(struct vhost_vdpa *v,
+   unsigned vq_group,
+   unsigned asid_num)
+{
+struct vhost_vring_state asid = {
+.index = vq_group,
+.num = asid_num,
+};
+int ret;
+
+ret = ioctl(v->device_fd, VHOST_VDPA_SET_GROUP_ASID, );
+if (unlikely(ret < 0)) {
+error_report("Can't set vq group %u asid %u, errno=%d (%s)",
+asid.index, asid.num, errno, g_strerror(errno));
+}
+return ret;
+}
+
+static void vhost_vdpa_net_prepare(NetClientState *nc)
+{
+VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
+struct vhost_vdpa *v = >vhost_vdpa;
+struct vhost_dev *dev = v->dev;
+struct vhost_vring_state cvq_group = {
+.index = v->dev->vq_index_end - 1,
+};
+int r;
+
+assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+
+if (dev->nvqs != 1 || dev->vq_index + dev->nvqs != dev->vq_index_end) {
+/* Only interested in CVQ */
+return;
+}
+
+if (s->always_svq) {
+/* SVQ is already enabled */
+return;
+}
+
+if (s->address_space_num < 2) {
+v->shadow_vqs_enabled = false;
+return;
+}
+
+r = vhost_vdpa_get_vring_group(v->device_fd, _group);
+if (unlikely(r)) {
+error_report("Can't read cvq group, errno=%d (%s)", r, g_strerror(-r));
+v->shadow_vqs_enabled = false;
+return;
+}
+
+if (!vhost_vdpa_cvq_group_is_independent(v, cvq_group)) {
+v->shadow_vqs_enabled = false;
+

Re: [PATCH v4 4/7] vdpa: add NetClientState->start() callback

2022-07-25 Thread Jason Wang




在 2022/7/22 19:12, Eugenio Pérez 写道:

It allows per-net client operations right after device's successful
start.

Vhost-vdpa net will use it to add the CVQ buffers to restore the device
status.

Signed-off-by: Eugenio Pérez 
---
  include/net/net.h  | 2 ++
  hw/net/vhost_net.c | 7 +++
  2 files changed, 9 insertions(+)

diff --git a/include/net/net.h b/include/net/net.h
index 523136c7ac..ad9e80083a 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -44,6 +44,7 @@ typedef struct NICConf {
  
  typedef void (NetPoll)(NetClientState *, bool enable);

  typedef bool (NetCanReceive)(NetClientState *);
+typedef int (NetStart)(NetClientState *);
  typedef ssize_t (NetReceive)(NetClientState *, const uint8_t *, size_t);
  typedef ssize_t (NetReceiveIOV)(NetClientState *, const struct iovec *, int);
  typedef void (NetCleanup) (NetClientState *);
@@ -71,6 +72,7 @@ typedef struct NetClientInfo {
  NetReceive *receive_raw;
  NetReceiveIOV *receive_iov;
  NetCanReceive *can_receive;
+NetStart *start;



I think we probably need a better name here. (start should go with 
DRIVER_OK or SET_VRING_ENABLE)


How about load or other (not a native speaker).

Thanks



  NetCleanup *cleanup;
  LinkStatusChanged *link_status_changed;
  QueryRxFilter *query_rx_filter;
diff --git a/hw/net/vhost_net.c b/hw/net/vhost_net.c
index ccac5b7a64..ddd9ee0441 100644
--- a/hw/net/vhost_net.c
+++ b/hw/net/vhost_net.c
@@ -274,6 +274,13 @@ static int vhost_net_start_one(struct vhost_net *net,
  }
  }
  }
+
+if (net->nc->info->start) {
+r = net->nc->info->start(net->nc);
+if (r < 0) {
+goto fail;
+}
+}
  return 0;
  fail:
  file.fd = -1;

Re: [PATCH v4 2/7] vdpa: Extract vhost_vdpa_net_cvq_add from vhost_vdpa_net_handle_ctrl_avail

2022-07-25 Thread Jason Wang




在 2022/7/22 19:12, Eugenio Pérez 写道:

So we can reuse to inject state messages.

Signed-off-by: Eugenio Pérez 
---
  net/vhost-vdpa.c | 74 ++--
  1 file changed, 47 insertions(+), 27 deletions(-)

diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 6abad276a6..1b82ac2e07 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -334,6 +334,46 @@ static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s,
  return true;
  }
  
+static virtio_net_ctrl_ack vhost_vdpa_net_cvq_add(VhostShadowVirtqueue *svq,

+   const struct iovec *dev_buffers)



Let's make this support any layout by accepting in/out sg.



+{
+/* in buffer used for device model */
+virtio_net_ctrl_ack status;
+size_t dev_written;
+int r;
+
+/*
+ * Add a fake non-NULL VirtQueueElement since we'll remove before SVQ
+ * event loop can get it.
+ */
+r = vhost_svq_add(svq, _buffers[0], 1, _buffers[1], 1, (void *)1);



I'd suggest to avoid the trick like (void *)1, which is usually a hint 
of the defect of the API.


We can either:

1) make vhost_svq_get() check ndescs instead of elem

or

2) simple pass sg

Thanks



+if (unlikely(r != 0)) {
+if (unlikely(r == -ENOSPC)) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
+  __func__);
+}
+return VIRTIO_NET_ERR;
+}
+
+/*
+ * We can poll here since we've had BQL from the time we sent the
+ * descriptor. Also, we need to take the answer before SVQ pulls by itself,
+ * when BQL is released
+ */
+dev_written = vhost_svq_poll(svq);
+if (unlikely(dev_written < sizeof(status))) {
+error_report("Insufficient written data (%zu)", dev_written);
+return VIRTIO_NET_ERR;
+}
+
+memcpy(, dev_buffers[1].iov_base, sizeof(status));
+if (status != VIRTIO_NET_OK) {
+return VIRTIO_NET_ERR;
+}
+
+return VIRTIO_NET_OK;
+}
+
  /**
   * Do not forward commands not supported by SVQ. Otherwise, the device could
   * accept it and qemu would not know how to update the device model.
@@ -380,19 +420,18 @@ static int 
vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
  void *opaque)
  {
  VhostVDPAState *s = opaque;
-size_t in_len, dev_written;
+size_t in_len;
  virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
  /* out and in buffers sent to the device */
  struct iovec dev_buffers[2] = {
  { .iov_base = s->cvq_cmd_out_buffer },
  { .iov_base = s->cvq_cmd_in_buffer },
  };
-/* in buffer used for device model */
+/* in buffer seen by virtio-net device model */
  const struct iovec in = {
  .iov_base = ,
  .iov_len = sizeof(status),
  };
-int r = -EINVAL;
  bool ok;
  
  ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers);

@@ -405,35 +444,16 @@ static int 
vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
  goto out;
  }
  
-r = vhost_svq_add(svq, _buffers[0], 1, _buffers[1], 1, elem);

-if (unlikely(r != 0)) {
-if (unlikely(r == -ENOSPC)) {
-qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
-  __func__);
-}
-goto out;
-}
-
-/*
- * We can poll here since we've had BQL from the time we sent the
- * descriptor. Also, we need to take the answer before SVQ pulls by itself,
- * when BQL is released
- */
-dev_written = vhost_svq_poll(svq);
-if (unlikely(dev_written < sizeof(status))) {
-error_report("Insufficient written data (%zu)", dev_written);
-goto out;
-}
-
-memcpy(, dev_buffers[1].iov_base, sizeof(status));
+status = vhost_vdpa_net_cvq_add(svq, dev_buffers);
  if (status != VIRTIO_NET_OK) {
  goto out;
  }
  
  status = VIRTIO_NET_ERR;

-virtio_net_handle_ctrl_iov(svq->vdev, , 1, dev_buffers, 1);
-if (status != VIRTIO_NET_OK) {
+in_len = virtio_net_handle_ctrl_iov(svq->vdev, , 1, dev_buffers, 1);
+if (in_len != sizeof(status) || status != VIRTIO_NET_OK) {
  error_report("Bad CVQ processing in model");
+return VIRTIO_NET_ERR;
  }
  
  out:

@@ -450,7 +470,7 @@ out:
  if (dev_buffers[1].iov_base) {
  vhost_vdpa_cvq_unmap_buf(>vhost_vdpa, dev_buffers[1].iov_base);
  }
-return r;
+return status == VIRTIO_NET_OK ? 0 : 1;
  }
  
  static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {

Re: [PATCH 1/3] target/arm: Create kvm_arm_svm_supported

2022-07-25 Thread Zenghui Yu via


Hi Richard,

On 2022/7/26 2:14, Richard Henderson wrote:

Indication for support for SVE will not depend on whether we
perform the query on the main kvm_state or the temp vcpu.
Mirror kvm_arm_pauth_supported.

Signed-off-by: Richard Henderson 
---
 target/arm/kvm64.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index d16d4ea250..36ff20c9e3 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -496,6 +496,11 @@ static bool kvm_arm_pauth_supported(void)
 kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_GENERIC));
 }
 
+static bool kvm_arm_svm_supported(void)

+{
+return kvm_check_extension(kvm_state, KVM_CAP_ARM_SVE);
+}
+


Not sure if it's a typo. Maybe we should instead use
kvm_arm_sve_supported() which was introduced in commit 14e99e0fbbc6.

Zenghui

Re: [PATCH] .gitlab-ci.d/windows.yml: Enable native Windows symlink

2022-07-25 Thread Bin Meng

On Mon, Jul 25, 2022 at 9:48 PM Alex Bennée  wrote:
>
>
> Bin Meng  writes:
>
> > From: Bin Meng 
> >
> > The following error message was seen during the configure:
> >
> >   "ln: failed to create symbolic link
> >   'x86_64-softmmu/qemu-system-x86_64.exe': No such file or directory"
> >
> > By default the MSYS environment variable is not defined, so the runtime
> > behavior of winsymlinks is: if  does not exist, 'ln -s' fails.
> > At the configure phase, the qemu-system-x86_64.exe has not been built
> > so creation of the symbolic link fails hence the error message.
> >
> > Set winsymlinks to 'native' whose behavior is most similar to the
> > behavior of 'ln -s' on *nix, that is:
> >
> >   a) if native symlinks are enabled, and whether  exists
> >  or not, creates  as a native Windows symlink;
> >   b) else if native symlinks are not enabled, and whether 
> >  exists or not, 'ln -s' creates as a Windows shortcut file.
> >
> > Signed-off-by: Bin Meng 
>
> I'm still seeing Windows build failures such as:
>
>   https://gitlab.com/stsquad/qemu/-/jobs/2765579269

I've seen this one before. Looks like this one can be easily reproduced.

>
> and
>
>   https://gitlab.com/stsquad/qemu/-/jobs/2765579267

This one seems to be a random failure?

>
> Any idea what's falling over?
>

Regards,
Bin

Re: How to read RISC-V mcycle CSR from Linux userspace app?

2022-07-25 Thread Bin Meng

On Tue, Jul 26, 2022 at 12:58 AM Maxim Blinov  wrote:
>
> Hi all, stupid question but I can't for the life of me figure this out
> even with all the docs open.
>
> I need to get an estimate figure for the cyclecount of a busy loop in
> one of my small Linux userspace apps. The app in question is running
> in qemu-system-riscv64. I've compiled QEMU myself, and the full code
> is like this:
>
> #include 
> #include 
> #include 
> #include 
>
> uint64_t get_mcycle() {
>   uint64_t mcycle = 0;
>
>   asm volatile ("csrr %0,mcycle"   : "=r" (mcycle)  );

Change this to "csrr %0,cycle" and you should be able to run your program.

>
>   return mcycle;
> }
>
> int main(int argc, char **argv) {
>   printf("Hello\n");
>   printf("mcycle is %lu\n", get_mcycle());
>
>   return 0;
> }
>
> Now I get SIGILL when I hit the `csrr` insn, which makes sense.
> According to the "Privileged Architecture Version 1.10", page 32, [1]
> we need to set mcounteren, hcounteren, and scounteren low bits to 1 in
> order to get the mcycle csr to become available in userspace. So I add
> the following function:
>
> void enable_mcount() {
>   /* Enable IR, TM, CY */
>   uint64_t mcounteren = 0x5;
>   asm volatile ("csrw mcounteren,%0" : "=r" (mcounteren));
>   asm volatile ("csrw hcounteren,%0" : "=r" (mcounteren));
>   asm volatile ("csrw scounteren,%0" : "=r" (mcounteren));
> }
>
> And call it before I call get_mcycle(), but this triggers SIGILL
> (unsurprisingly) also, since these CSRs are also privileged. So
> there's a bit of a chicken and egg problem.
>
> Could someone more knowledgeable please suggest what the course of
> action here is? I've got QEMU revision f45fd24c90 checked out, and I'm
> staring at qemu/target/riscv/csr.c:71, which seems to deal with
> whether or not to raise a SIGILL upon access. I can see a condition
> for when we're in 'S' mode, but nothing for 'U' mode. Does that mean
> there is fundamentally no access to these CSR's from 'U' mode? Is it
> possible to just hack it in?

In the user space, you can only read the U-mode CSRs like cycle, but NOT mcycle.

>
> Maxim
>
> [1]: https://riscv.org/wp-content/uploads/2017/05/riscv-privileged-v1.10.pdf
>

Regards,
Bin

[PATCH v1 1/1] migration: add remaining params->has_* = true in migration_instance_init()

2022-07-25 Thread Leonardo Bras

Some of params->has_* = true are missing in migration_instance_init, this
causes migrate_params_check() to skip some tests, allowing some
unsupported scenarios.

Fix this by adding all missing params->has_* = true in
migration_instance_init().

Signed-off-by: Leonardo Bras 
---
 migration/migration.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index e03f698a3c..82fbe0cf55 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -4451,6 +4451,7 @@ static void migration_instance_init(Object *obj)
 /* Set has_* up only for parameter checks */
 params->has_compress_level = true;
 params->has_compress_threads = true;
+params->has_compress_wait_thread = true;
 params->has_decompress_threads = true;
 params->has_throttle_trigger_threshold = true;
 params->has_cpu_throttle_initial = true;
@@ -4471,6 +4472,9 @@ static void migration_instance_init(Object *obj)
 params->has_announce_max = true;
 params->has_announce_rounds = true;
 params->has_announce_step = true;
+params->has_tls_creds = true;
+params->has_tls_hostname = true;
+params->has_tls_authz = true;
 
 qemu_sem_init(>postcopy_pause_sem, 0);
 qemu_sem_init(>postcopy_pause_rp_sem, 0);
-- 
2.37.1

PING: [PATCH] monitor: Support specified vCPU registers

2022-07-25 Thread zhenwei pi


PING!

On 7/19/22 15:55, zhenwei pi wrote:

Originally we have to get all the vCPU registers and parse the
specified one. To improve the performance of this usage, allow user
specified vCPU id to query registers.

Run a VM with 16 vCPU, use bcc tool to track the latency of
'hmp_info_registers':
'info registers -a' uses about 3ms;
'info register 12' uses about 150us.

Signed-off-by: zhenwei pi 
---
  hmp-commands-info.hx |  6 +++---
  monitor/misc.c   | 19 +++
  2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 3ffa24bd67..6023e2b5c5 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -100,9 +100,9 @@ ERST
  
  {

  .name   = "registers",
-.args_type  = "cpustate_all:-a",
-.params = "[-a]",
-.help   = "show the cpu registers (-a: all - show register info for all 
cpus)",
+.args_type  = "cpustate_all:-a,vcpu:i?",
+.params = "[-a] [vcpu]",
+.help   = "show the cpu registers (-a: all - show register info for all 
cpus; vcpu: vCPU to query)",
  .cmd= hmp_info_registers,
  },
  
diff --git a/monitor/misc.c b/monitor/misc.c

index 3d2312ba8d..b12309faad 100644
--- a/monitor/misc.c
+++ b/monitor/misc.c
@@ -307,6 +307,7 @@ int monitor_get_cpu_index(Monitor *mon)
  static void hmp_info_registers(Monitor *mon, const QDict *qdict)
  {
  bool all_cpus = qdict_get_try_bool(qdict, "cpustate_all", false);
+int vcpu = qdict_get_try_int(qdict, "vcpu", -1);
  CPUState *cs;
  
  if (all_cpus) {

@@ -314,6 +315,24 @@ static void hmp_info_registers(Monitor *mon, const QDict 
*qdict)
  monitor_printf(mon, "\nCPU#%d\n", cs->cpu_index);
  cpu_dump_state(cs, NULL, CPU_DUMP_FPU);
  }
+} else if (vcpu >= 0) {
+CPUState *target_cs = NULL;
+
+CPU_FOREACH(cs) {
+if (cs->cpu_index == vcpu) {
+target_cs = cs;
+break;
+}
+}
+
+if (!target_cs) {
+monitor_printf(mon, "CPU#%d not available\n", vcpu);
+return;
+}
+
+monitor_printf(mon, "\nCPU#%d\n", target_cs->cpu_index);
+cpu_dump_state(target_cs, NULL, CPU_DUMP_FPU);
+return;
  } else {
  cs = mon_get_cpu(mon);
  


--
zhenwei pi

[PATCH v3 2/2] tests/tcg/s390x: Test unaligned accesses to lowcore

2022-07-25 Thread Ilya Leoshkevich

Add a small test to avoid regressions.

Signed-off-by: Ilya Leoshkevich 
Acked-by: Richard Henderson 
Acked-by: Thomas Huth 
---
 tests/tcg/s390x/Makefile.softmmu-target |  9 +
 tests/tcg/s390x/unaligned-lowcore.S | 19 +++
 2 files changed, 28 insertions(+)
 create mode 100644 tests/tcg/s390x/Makefile.softmmu-target
 create mode 100644 tests/tcg/s390x/unaligned-lowcore.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
new file mode 100644
index 00..a34fa68473
--- /dev/null
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -0,0 +1,9 @@
+S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
+VPATH+=$(S390X_SRC)
+QEMU_OPTS=-action panic=exit-failure -kernel
+
+%: %.S
+   $(CC) -march=z13 -m64 -nostartfiles -static -Wl,-Ttext=0 \
+   -Wl,--build-id=none $< -o $@
+
+TESTS += unaligned-lowcore
diff --git a/tests/tcg/s390x/unaligned-lowcore.S 
b/tests/tcg/s390x/unaligned-lowcore.S
new file mode 100644
index 00..f5da2ae64c
--- /dev/null
+++ b/tests/tcg/s390x/unaligned-lowcore.S
@@ -0,0 +1,19 @@
+.org 0x1D0 /* program new PSW */
+.quad 0x2,0/* disabled wait */
+.org 0x200 /* lowcore padding */
+
+.globl _start
+_start:
+lctlg %c0,%c0,_c0
+vst %v0,_unaligned
+lpswe quiesce_psw
+
+.align 8
+quiesce_psw:
+.quad 0x2,0xfff/* see is_special_wait_psw() */
+_c0:
+.quad 0x1006   /* lowcore protection, AFP, VX */
+
+.byte 0
+_unaligned:
+.octa 0
-- 
2.35.3

[PATCH v3 0/2] accel/tcg: Test unaligned stores to s390x low-address-protected lowcore

2022-07-25 Thread Ilya Leoshkevich

Hi,

This is a follow-up series for [1].

The fix has been committed.

I asked Christian what might be a good alternative for the
mmio-debug-exit device for testing, and he suggested to look into
shutdown/panic actions.

Patch 1 adds a new panic action.
Patch 2 tests unaligned stores to s390x low-address-protected lowcore;
it performs a shutdown on success and panic on failure.

Best regards,
Ilya

[1] https://lists.gnu.org/archive/html/qemu-devel/2022-07/msg01876.html

v2: https://lists.gnu.org/archive/html/qemu-devel/2022-07/msg04129.html
v2 -> v3: Add @since tag (Eric).
  Fix a small style issue in the test.

Ilya Leoshkevich (2):
  qapi: Add exit-failure PanicAction
  tests/tcg/s390x: Test unaligned accesses to lowcore

 include/sysemu/sysemu.h |  2 +-
 qapi/run-state.json |  5 -
 qemu-options.hx |  2 +-
 softmmu/main.c  |  6 --
 softmmu/runstate.c  | 17 +
 tests/tcg/s390x/Makefile.softmmu-target |  9 +
 tests/tcg/s390x/unaligned-lowcore.S | 19 +++
 7 files changed, 51 insertions(+), 9 deletions(-)
 create mode 100644 tests/tcg/s390x/Makefile.softmmu-target
 create mode 100644 tests/tcg/s390x/unaligned-lowcore.S

-- 
2.35.3

[PATCH v3 1/2] qapi: Add exit-failure PanicAction

2022-07-25 Thread Ilya Leoshkevich

Currently QEMU exits with code 0 on both panic an shutdown. For tests
it is useful to return 1 on panic, so that it counts as a test
failure.

Introduce a new exit-failure PanicAction that makes main() return
EXIT_FAILURE. Tests can use -action panic=exit-failure option to
activate this behavior.

Signed-off-by: Ilya Leoshkevich 
Reviewed-by: Richard Henderson 
Reviewed-by: David Hildenbrand 
---
 include/sysemu/sysemu.h |  2 +-
 qapi/run-state.json |  5 -
 qemu-options.hx |  2 +-
 softmmu/main.c  |  6 --
 softmmu/runstate.c  | 17 +
 5 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 812f66a31a..31aa45160b 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -103,7 +103,7 @@ void qemu_boot_set(const char *boot_order, Error **errp);
 bool defaults_enabled(void);
 
 void qemu_init(int argc, char **argv, char **envp);
-void qemu_main_loop(void);
+int qemu_main_loop(void);
 void qemu_cleanup(void);
 
 extern QemuOptsList qemu_legacy_drive_opts;
diff --git a/qapi/run-state.json b/qapi/run-state.json
index 6e2162d7b3..9273ea6516 100644
--- a/qapi/run-state.json
+++ b/qapi/run-state.json
@@ -364,10 +364,13 @@
 #
 # @shutdown: Shutdown the VM and exit, according to the shutdown action
 #
+# @exit-failure: Shutdown the VM and exit with nonzero status
+#(since 7.1)
+#
 # Since: 6.0
 ##
 { 'enum': 'PanicAction',
-  'data': [ 'pause', 'shutdown', 'none' ] }
+  'data': [ 'pause', 'shutdown', 'exit-failure', 'none' ] }
 
 ##
 # @watchdog-set-action:
diff --git a/qemu-options.hx b/qemu-options.hx
index 79e00916a1..8e17c5064a 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4239,7 +4239,7 @@ DEF("action", HAS_ARG, QEMU_OPTION_action,
 "   action when guest reboots [default=reset]\n"
 "-action shutdown=poweroff|pause\n"
 "   action when guest shuts down [default=poweroff]\n"
-"-action panic=pause|shutdown|none\n"
+"-action panic=pause|shutdown|exit-failure|none\n"
 "   action when guest panics [default=shutdown]\n"
 "-action watchdog=reset|shutdown|poweroff|inject-nmi|pause|debug|none\n"
 "   action when watchdog fires [default=reset]\n",
diff --git a/softmmu/main.c b/softmmu/main.c
index c00432ff09..1b675a8c03 100644
--- a/softmmu/main.c
+++ b/softmmu/main.c
@@ -32,11 +32,13 @@
 
 int qemu_main(int argc, char **argv, char **envp)
 {
+int status;
+
 qemu_init(argc, argv, envp);
-qemu_main_loop();
+status = qemu_main_loop();
 qemu_cleanup();
 
-return 0;
+return status;
 }
 
 #ifndef CONFIG_COCOA
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 168e1b78a0..1e68680b9d 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -482,7 +482,8 @@ void qemu_system_guest_panicked(GuestPanicInformation *info)
 qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_PAUSE,
 !!info, info);
 vm_stop(RUN_STATE_GUEST_PANICKED);
-} else if (panic_action == PANIC_ACTION_SHUTDOWN) {
+} else if (panic_action == PANIC_ACTION_SHUTDOWN ||
+   panic_action == PANIC_ACTION_EXIT_FAILURE) {
 qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_POWEROFF,
!!info, info);
 vm_stop(RUN_STATE_GUEST_PANICKED);
@@ -662,7 +663,7 @@ void qemu_system_debug_request(void)
 qemu_notify_event();
 }
 
-static bool main_loop_should_exit(void)
+static bool main_loop_should_exit(int *status)
 {
 RunState r;
 ShutdownCause request;
@@ -680,6 +681,10 @@ static bool main_loop_should_exit(void)
 if (shutdown_action == SHUTDOWN_ACTION_PAUSE) {
 vm_stop(RUN_STATE_SHUTDOWN);
 } else {
+if (request == SHUTDOWN_CAUSE_GUEST_PANIC &&
+panic_action == PANIC_ACTION_EXIT_FAILURE) {
+*status = EXIT_FAILURE;
+}
 return true;
 }
 }
@@ -715,12 +720,14 @@ static bool main_loop_should_exit(void)
 return false;
 }
 
-void qemu_main_loop(void)
+int qemu_main_loop(void)
 {
+int status = EXIT_SUCCESS;
 #ifdef CONFIG_PROFILER
 int64_t ti;
 #endif
-while (!main_loop_should_exit()) {
+
+while (!main_loop_should_exit()) {
 #ifdef CONFIG_PROFILER
 ti = profile_getclock();
 #endif
@@ -729,6 +736,8 @@ void qemu_main_loop(void)
 dev_time += profile_getclock() - ti;
 #endif
 }
+
+return status;
 }
 
 void qemu_add_exit_notifier(Notifier *notify)
-- 
2.35.3

Re: [PATCH] hw/intc: sifive_plic: Fix multi-socket plic configuraiton

2022-07-25 Thread Atish Kumar Patra

On Sun, Jul 24, 2022 at 6:14 PM Alistair Francis 
wrote:

> On Sat, Jul 23, 2022 at 7:22 PM Atish Patra  wrote:
> >
> > Since commit 40244040a7ac, multi-socket configuration with plic is
> > broken as the hartid for second socket is calculated incorrectly.
> > The hartid stored in addr_config already includes the offset
> > for the base hartid for that socket. Adding it again would lead
> > to segfault while creating the plic device for the virt machine.
> > qdev_connect_gpio_out was also invoked with incorrect number of gpio
> > lines.
> >
> > Fixes: 40244040a7ac (hw/intc: sifive_plic: Avoid overflowing the
> addr_config buffer)
> >
> > Signed-off-by: Atish Patra 
>
> Can you share the -cpu options that causes the segfault? I'll add it
> to my test case
>
>
"-cpu rv64 -M virt -m 2G -smp 4 -object
memory-backend-ram,size=1G,policy=bind,host-nodes=0,id=ram-node0  \
-numa node,memdev=ram-node0   \
-object memory-backend-ram,size=1G,policy=bind,host-nodes=0,id=ram-node1 \
-numa node,memdev=ram-node1"

You also need to enable  CONFIG_NUMA in kernel.

Reviewed-by: Alistair Francis 
>
> Alistair
>
> > ---
> >  hw/intc/sifive_plic.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/hw/intc/sifive_plic.c b/hw/intc/sifive_plic.c
> > index 56d60e9ac935..fdac028a521f 100644
> > --- a/hw/intc/sifive_plic.c
> > +++ b/hw/intc/sifive_plic.c
> > @@ -454,10 +454,10 @@ DeviceState *sifive_plic_create(hwaddr addr, char
> *hart_config,
> >
> >  for (i = 0; i < plic->num_addrs; i++) {
> >  int cpu_num = plic->addr_config[i].hartid;
> > -CPUState *cpu = qemu_get_cpu(hartid_base + cpu_num);
> > +CPUState *cpu = qemu_get_cpu(cpu_num);
> >
> >  if (plic->addr_config[i].mode == PLICMode_M) {
> > -qdev_connect_gpio_out(dev, num_harts + cpu_num,
> > +qdev_connect_gpio_out(dev, cpu_num,
> >qdev_get_gpio_in(DEVICE(cpu),
> IRQ_M_EXT));
> >  }
> >  if (plic->addr_config[i].mode == PLICMode_S) {
> > --
> > 2.25.1
> >
> >
>

Re: Attaching qcow2 images to containers

2022-07-25 Thread Kirill Tkhai

Hi, Stefan,

sorry for the late reply. I missed your message since I don't use that email 
address anymore.
Sent a patch to fix the stale address in .mailmap.

On 18.05.2022 09:30, Stefan Hajnoczi wrote:
> Hi Kirill,
> I saw your "[PATCH 0/4] dm: Introduce dm-qcow2 driver to attach QCOW2
> files as block device" patch series:
> https://lore.kernel.org/linux-kernel/ykme5zs2cpxun...@infradead.org/T/
> 
> There has been recent work in vDPA (VIRTIO Data Path Acceleration) to
> achieve similar functionality. The qemu-storage-daemon VDUSE export
> attaches a virtio-blk device to the host kernel and QEMU's qcow2
> implementation can be used:
> https://patchew.org/QEMU/20220504074051.90-1-xieyon...@bytedance.com/
> 
> A container can then access this virtio-blk device (/dev/vda). Note that
> the virtio-blk device is implemented in software using vDPA/VDUSE, there
> is no virtio-pci device.
> 
> As a quick comparison with a dm-qcow2 target, this approach keeps the
> qcow2 code in QEMU userspace and can take advantage of QEMU block layer
> features (storage migration/mirroring/backup, snapshots, etc). On the
> other hand, it's likely to be more heavyweight because bounce buffers
> are required in VDUSE for security reasons, there is a separate
> userspace process involved, and there's the virtio_blk.ko driver and an
> emulated virtio-blk device involved.

The main idea is to reach the best performance and density. This is possible 
only,
if driver's hot path is implemented in kernel. Comparing to NBD the driver shows
much better results in these criteria.

This has a continuation, and I mean DAX here. IO handling with any 
userspace-based
implementation will be slower, than DAX in case of kernel-based implementation. 
So,
in my driver IO handling is done in kernel, while DAX support is a subject of
the future development.

And this driver and advantages of QEMU block layer are not mutually exclusive.
This driver *does not implement* snapshot or backup support, here is only 
hot-path
doing IO handling.

> Another similar feature that was recently added to QEMU is the
> qemu-storage-daemon FUSE export:
> 
>   $ qemu-storage-daemon \
> --blockdev file,filename=test.img,node-name=drive0 \
>   --export fuse,node-name=drive0,id=fuse0,mountpoint=/tmp/foo
>   $ ls -alF /tmp/foo
>   -r. 1 me me 10737418240 May 18 07:22 /tmp/foo
> 
> This exports a disk image as a file via FUSE. Programs can access it
> like a regular file and qemu-storage-daemon will do the qcow2 I/O on the
> underlying file.
> 
> I wanted to mention these options for exposing qcow2 disk images to
> processes/containers on the host. Depending on your use cases they might
> be interesting. Performance comparisons against VDUSE and FUSE exports
> would be interesting since these new approaches seem to be replacing
> qemu-nbd.
> 
> Can you share more about your use cases for the dm-qcow2 target? It
> could be useful for everyone I've CCed to be aware of various efforts in
> this area.

The use case is containers, and they are the requestor for high density.
The userspace-based implementation overhead will be noticeable on nodes
running a lot of containers (just because of any overhead is noticeable
there :)). Also, it's very useful to use the same disk image for VMs and
containers giving people to choose what they want in the moment.

Best wishes,
Kirill

[RFC PATCH 1/1] block: add vhost-blk backend

2022-07-25 Thread Andrey Zhadchenko via

Although QEMU virtio is quite fast, there is still some room for
improvements. Disk latency can be reduced if we handle virito-blk requests
in host kernel istead of passing them to QEMU. The patch adds vhost-blk
backend which sets up vhost-blk kernel module to process requests.

test setup and results:
fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
QEMU drive options: cache=none
filesystem: xfs

SSD:
   | randread, IOPS  | randwrite, IOPS |
Host   |  95.8k  |  85.3k  |
QEMU virtio|  57.5k  |  79.4k  |
QEMU vhost-blk |  95.6k  |  84.3k  |

RAMDISK (vq == vcpu):
 | randread, IOPS | randwrite, IOPS |
virtio, 1vcpu|  123k  |  129k   |
virtio, 2vcpu|  253k (??) |  250k (??)  |
virtio, 4vcpu|  158k  |  154k   |
vhost-blk, 1vcpu |  110k  |  113k   |
vhost-blk, 2vcpu |  247k  |  252k   |
vhost-blk, 4vcpu |  576k  |  567k   |

Signed-off-by: Andrey Zhadchenko 
---
 hw/block/Kconfig  |   5 +
 hw/block/meson.build  |   4 +
 hw/block/vhost-blk.c  | 394 ++
 hw/virtio/meson.build |   3 +
 hw/virtio/vhost-blk-pci.c | 102 +
 include/hw/virtio/vhost-blk.h |  50 +
 meson.build   |   5 +
 7 files changed, 563 insertions(+)
 create mode 100644 hw/block/vhost-blk.c
 create mode 100644 hw/virtio/vhost-blk-pci.c
 create mode 100644 include/hw/virtio/vhost-blk.h

diff --git a/hw/block/Kconfig b/hw/block/Kconfig
index 9e8f28f982..b4286ad10e 100644
--- a/hw/block/Kconfig
+++ b/hw/block/Kconfig
@@ -36,6 +36,11 @@ config VIRTIO_BLK
 default y
 depends on VIRTIO
 
+config VHOST_BLK
+bool
+default n
+depends on VIRTIO && LINUX
+
 config VHOST_USER_BLK
 bool
 # Only PCI devices are provided for now
diff --git a/hw/block/meson.build b/hw/block/meson.build
index 2389326112..caf9bedff3 100644
--- a/hw/block/meson.build
+++ b/hw/block/meson.build
@@ -19,4 +19,8 @@ softmmu_ss.add(when: 'CONFIG_TC58128', if_true: 
files('tc58128.c'))
 specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
 specific_ss.add(when: 'CONFIG_VHOST_USER_BLK', if_true: 
files('vhost-user-blk.c'))
 
+if have_vhost_blk
+  specific_ss.add(files('vhost-blk.c'))
+endif
+
 subdir('dataplane')
diff --git a/hw/block/vhost-blk.c b/hw/block/vhost-blk.c
new file mode 100644
index 00..33d90af270
--- /dev/null
+++ b/hw/block/vhost-blk.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2022 Virtuozzo International GmbH.
+ * Author: Andrey Zhadchenko 
+ *
+ * vhost-blk is host kernel accelerator for virtio-blk.
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qom/object.h"
+#include "hw/qdev-core.h"
+#include "hw/boards.h"
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-blk.h"
+#include "hw/virtio/virtio.h"
+#include "hw/virtio/virtio-blk.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-pci.h"
+#include "sysemu/sysemu.h"
+#include "linux-headers/linux/vhost.h"
+#include 
+#include 
+
+static int vhost_blk_start(VirtIODevice *vdev)
+{
+VHostBlk *s = VHOST_BLK(vdev);
+struct vhost_vring_file backend;
+int ret, i;
+int *fd = blk_bs(s->conf.conf.blk)->file->bs->opaque;
+BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+if (!k->set_guest_notifiers) {
+error_report("vhost-blk: binding does not support guest notifiers");
+return -ENOSYS;
+}
+
+if (s->vhost_started) {
+return 0;
+}
+
+if (ioctl(s->vhostfd, VHOST_SET_OWNER, NULL)) {
+error_report("vhost-blk: unable to set owner");
+return -ENOSYS;
+}
+
+ret = vhost_dev_enable_notifiers(>dev, vdev);
+if (ret < 0) {
+error_report("vhost-blk: unable to enable dev notifiers", errno);
+return ret;
+}
+
+s->dev.acked_features = vdev->guest_features & s->dev.backend_features;
+
+ret = vhost_dev_start(>dev, vdev);
+if (ret < 0) {
+error_report("vhost-blk: unable to start vhost dev");
+return ret;
+}
+
+ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, true);
+if (ret < 0) {
+error_report("vhost-blk: unable to bind guest notifiers");
+goto out;
+}
+
+memset(, 0, sizeof(backend));
+backend.index = 0;
+backend.fd = *fd;
+if (ioctl(s->vhostfd, VHOST_BLK_SET_BACKEND, )) {
+error_report("vhost-blk: unable to set backend");
+ret = -errno;
+goto out;
+}
+
+for (i = 0; i < s->dev.nvqs; i++) {
+vhost_virtqueue_mask(>dev, vdev, i, false);
+}
+
+

[RFC patch 0/1] block: vhost-blk backend

2022-07-25 Thread Andrey Zhadchenko via

Although QEMU virtio-blk is quite fast, there is still some room for
improvements. Disk latency can be reduced if we handle virito-blk requests
in host kernel so we avoid a lot of syscalls and context switches.

The biggest disadvantage of this vhost-blk flavor is raw format.
Luckily Kirill Thai proposed device mapper driver for QCOW2 format to attach
files as block devices: https://www.spinics.net/lists/kernel/msg4292965.html

Also by using kernel modules we can bypass iothread limitation and finaly scale
block requests with cpus for high-performance devices. This is planned to be
implemented in next version.

Linux kernel module part:
https://lore.kernel.org/kvm/20220725202753.298725-1-andrey.zhadche...@virtuozzo.com/

test setups and results:
fio --direct=1 --rw=randread  --bs=4k  --ioengine=libaio --iodepth=128
QEMU drive options: cache=none
filesystem: xfs

SSD:
   | randread, IOPS  | randwrite, IOPS |
Host   |  95.8k  |  85.3k  |
QEMU virtio|  57.5k  |  79.4k  |
QEMU vhost-blk |  95.6k  |  84.3k  |

RAMDISK (vq == vcpu):
 | randread, IOPS | randwrite, IOPS |
virtio, 1vcpu|  123k  |  129k   |
virtio, 2vcpu|  253k (??) |  250k (??)  |
virtio, 4vcpu|  158k  |  154k   |
vhost-blk, 1vcpu |  110k  |  113k   |
vhost-blk, 2vcpu |  247k  |  252k   |
vhost-blk, 4vcpu |  576k  |  567k   |

Andrey Zhadchenko (1):
  block: add vhost-blk backend

 configure |  13 ++
 hw/block/Kconfig  |   5 +
 hw/block/meson.build  |   1 +
 hw/block/vhost-blk.c  | 395 ++
 hw/virtio/meson.build |   1 +
 hw/virtio/vhost-blk-pci.c | 102 +
 include/hw/virtio/vhost-blk.h |  44 
 linux-headers/linux/vhost.h   |   3 +
 8 files changed, 564 insertions(+)
 create mode 100644 hw/block/vhost-blk.c
 create mode 100644 hw/virtio/vhost-blk-pci.c
 create mode 100644 include/hw/virtio/vhost-blk.h

-- 
2.31.1

Re: [PATCH v4] hw/nvme: Use ioeventfd to handle doorbell updates

2022-07-25 Thread Klaus Jensen

On Jul  5 22:24, Jinhao Fan wrote:
> Add property "ioeventfd" which is enabled by default. When this is
> enabled, updates on the doorbell registers will cause KVM to signal
> an event to the QEMU main loop to handle the doorbell updates.
> Therefore, instead of letting the vcpu thread run both guest VM and
> IO emulation, we now use the main loop thread to do IO emulation and
> thus the vcpu thread has more cycles for the guest VM.
> 
> Since ioeventfd does not tell us the exact value that is written, it is
> only useful when shadow doorbell buffer is enabled, where we check
> for the value in the shadow doorbell buffer when we get the doorbell
> update event.
> 
> IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
> 
> qd   1   4  16  64
> qemu35 121 176 153
> ioeventfd   41 133 258 313
> 
> Changes since v3:
>  - Do not deregister ioeventfd when it was not enabled on a SQ/CQ
> 
> Signed-off-by: Jinhao Fan 
> ---
>  hw/nvme/ctrl.c | 114 -
>  hw/nvme/nvme.h |   5 +++
>  2 files changed, 118 insertions(+), 1 deletion(-)
> 

We have a regression following this patch that we need to address.

With this patch, issuing a reset on the device (`nvme reset /dev/nvme0`
will do the trick) causes QEMU to hog my host cpu at 100%.

I'm still not sure what causes this. The trace output is a bit
inconclusive still.

I'll keep looking into it.


signature.asc
Description: PGP signature

[python-qemu-qmp MR #12] Post-release trivial fix roundup

2022-07-25 Thread GitLab Bot

Author: John Snow - https://gitlab.com/jsnow
Merge Request: 
https://gitlab.com/qemu-project/python-qemu-qmp/-/merge_requests/12
... from: jsnow/python-qemu-qmp:fixes-roundup
... into: qemu-project/python-qemu-qmp:main

This MR collects a number of small, trivial fixes that follow the first 
official release; covering graceful exits if dependencies are missing, better 
interactive debugging information, and improving packaging, publishing and 
GitLab CI definitions.

---

This is an automated message. This bot will only relay the creation of new merge
requests and will not relay review comments, new revisions, or concluded merges.
Please follow the GitLab link to participate in review.

[PATCH v2 1/1] target/ppc: fix unreachable code in do_ldst_quad()

2022-07-25 Thread Daniel Henrique Barboza

Coverity reports that commit fc34e81acd51 ("target/ppc: add macros to
check privilege level") turned the following code unreachable:

if (!prefixed && !(ctx->insns_flags2 & PPC2_LSQ_ISA207)) {
/* lq and stq were privileged prior to V. 2.07 */
REQUIRE_SV(ctx);

>>> CID 1490757:  Control flow issues  (UNREACHABLE)
>>> This code cannot be reached: "if (ctx->le_mode) {
if (ctx->le_mode) {
gen_align_no_le(ctx);
return true;
}
}

This happens because the macro REQUIRE_SV(), in CONFIG_USER_MODE, will
always result in a 'return true' statement. In fact, all REQUIRE_*
macros for target/ppc/translate.c behave the same way: if a condition
isn't met, an exception is generated and a 'return' statement is issued.

The difference is that all other callers are using it in insns that are
not implemented in user mode. do_ldst_quad(), on the other hand, is user
mode compatible.

Fixes include wrapping these lines in "if !defined(CONFIG_USER_MODE)",
making it explicit that these lines are not user mode anymore. Another
fix would be, for example, to change REQUIRE_SV() to not issue a
'return' and check if we're running in privileged mode or not by hand,
but this would change all other callers of the macro that are using it
in an adequate manner.

The code that was in place before fc34e81acd51 was good enough, so let's
go back to that: open code the ctx->pr condition and fire the exception
if we're not privileged. The difference from the code back then to what
we're doing now is an 'unlikely' compiler hint to ctx->pr and the use of
gen_priv_opc() instead of gen_priv_exception().

Fixes: Coverity CID 1490757
Cc: Matheus Ferst 
Signed-off-by: Daniel Henrique Barboza 
---
 target/ppc/translate/fixedpoint-impl.c.inc | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/target/ppc/translate/fixedpoint-impl.c.inc 
b/target/ppc/translate/fixedpoint-impl.c.inc
index db14d3bebc..a3ade4fe2b 100644
--- a/target/ppc/translate/fixedpoint-impl.c.inc
+++ b/target/ppc/translate/fixedpoint-impl.c.inc
@@ -79,8 +79,11 @@ static bool do_ldst_quad(DisasContext *ctx, arg_D *a, bool 
store, bool prefixed)
 REQUIRE_INSNS_FLAGS(ctx, 64BX);
 
 if (!prefixed && !(ctx->insns_flags2 & PPC2_LSQ_ISA207)) {
-/* lq and stq were privileged prior to V. 2.07 */
-REQUIRE_SV(ctx);
+if (unlikely(ctx->pr)) {
+/* lq and stq were privileged prior to V. 2.07 */
+gen_priv_opc(ctx);
+return true;
+}
 
 if (ctx->le_mode) {
 gen_align_no_le(ctx);
-- 
2.36.1

[PATCH v2 0/1] target/ppc: fix unreachable code in do_ldst_quad()

2022-07-25 Thread Daniel Henrique Barboza

changes from v1:
- do not use ifdefs
- returned to the code that was in placed before the macro
- v1 link: https://lists.gnu.org/archive/html/qemu-devel/2022-07/msg03697.html

Daniel Henrique Barboza (1):
  target/ppc: fix unreachable code in do_ldst_quad()

 target/ppc/translate/fixedpoint-impl.c.inc | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

-- 
2.36.1

Re: [PATCH v1 01/13] tests: refresh to latest libvirt-ci module

2022-07-25 Thread Richard Henderson


On 7/25/22 07:05, Alex Bennée wrote:

From: Daniel P. Berrangé 

Notable changes:

   - libvirt-ci source tree was re-arranged, so the script we
 run now lives in a bin/ sub-dir


It would have been nice to keep the submodule update plus the bin/ sub-dir change in one 
patch, and the generated file updates in a separate patch.  But I won't insist.


Acked-by: Richard Henderson 


r~



   - opensuse 15.2 is replaced by opensuse 15.3

   - libslirp is temporarily dropped on opensuse as the
 libslirp-version.h is broken

  https://bugzilla.opensuse.org/show_bug.cgi?id=1201551

   - The incorrectly named python3-virtualenv module was
 changed to python3-venv, but most distros don't need
 any package as 'venv' is a standard part of python

   - glibc-static was renamed to libc-static, to reflect
 fact that it isn't going to be glibc on all distros

   - The cmocka/json-c deps that were manually added to
 the centos dockerfile and are now consistently added
 to all targets

Acked-by: Thomas Huth 
Signed-off-by: Daniel P. Berrangé 
Message-Id: <20220722130431.2319019-2-berra...@redhat.com>
Signed-off-by: Alex Bennée 
---
  .gitlab-ci.d/cirrus/freebsd-12.vars   | 3 +--
  .gitlab-ci.d/cirrus/freebsd-13.vars   | 3 +--
  .gitlab-ci.d/cirrus/macos-11.vars | 4 ++--
  tests/docker/dockerfiles/alpine.docker| 4 +++-
  tests/docker/dockerfiles/centos8.docker   | 6 +++---
  tests/docker/dockerfiles/debian-amd64.docker  | 2 ++
  tests/docker/dockerfiles/debian-arm64-cross.docker| 2 ++
  tests/docker/dockerfiles/debian-armel-cross.docker| 2 ++
  tests/docker/dockerfiles/debian-armhf-cross.docker| 2 ++
  tests/docker/dockerfiles/debian-mips64el-cross.docker | 2 ++
  tests/docker/dockerfiles/debian-mipsel-cross.docker   | 2 ++
  tests/docker/dockerfiles/debian-ppc64el-cross.docker  | 2 ++
  tests/docker/dockerfiles/debian-s390x-cross.docker| 2 ++
  tests/docker/dockerfiles/fedora.docker| 3 ++-
  tests/docker/dockerfiles/opensuse-leap.docker | 7 ---
  tests/docker/dockerfiles/ubuntu2004.docker| 2 ++
  tests/lcitool/libvirt-ci  | 2 +-
  tests/lcitool/projects/qemu.yml   | 6 --
  tests/lcitool/refresh | 4 ++--
  19 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/.gitlab-ci.d/cirrus/freebsd-12.vars 
b/.gitlab-ci.d/cirrus/freebsd-12.vars
index f59263731f..8fa5a320e9 100644
--- a/.gitlab-ci.d/cirrus/freebsd-12.vars
+++ b/.gitlab-ci.d/cirrus/freebsd-12.vars
@@ -1,5 +1,4 @@
  # THIS FILE WAS AUTO-GENERATED
-# ... and then edited to fix py39, pending proper lcitool update.
  #
  #  $ lcitool variables freebsd-12 qemu
  #
@@ -12,6 +11,6 @@ MAKE='/usr/local/bin/gmake'
  NINJA='/usr/local/bin/ninja'
  PACKAGING_COMMAND='pkg'
  PIP3='/usr/local/bin/pip-3.8'
-PKGS='alsa-lib bash bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage 
ctags curl cyrus-sasl dbus diffutils dtc fusefs-libs3 gettext git glib gmake 
gnutls gsed gtk3 libepoxy libffi libgcrypt libjpeg-turbo libnfs libspice-server 
libssh libtasn1 llvm lzo2 meson ncurses nettle ninja opencv perl5 pixman 
pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme 
py39-virtualenv py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy 
spice-protocol tesseract texinfo usbredir virglrenderer vte3 zstd'
+PKGS='alsa-lib bash bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage 
cmocka ctags curl cyrus-sasl dbus diffutils dtc fusefs-libs3 gettext git glib 
gmake gnutls gsed gtk3 json-c libepoxy libffi libgcrypt libjpeg-turbo libnfs 
libspice-server libssh libtasn1 llvm lzo2 meson ncurses nettle ninja opencv 
perl5 pixman pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx 
py39-sphinx_rtd_theme py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy 
spice-protocol tesseract texinfo usbredir virglrenderer vte3 zstd'
  PYPI_PKGS=''
  PYTHON='/usr/local/bin/python3'
diff --git a/.gitlab-ci.d/cirrus/freebsd-13.vars 
b/.gitlab-ci.d/cirrus/freebsd-13.vars
index 40fc961398..8ed7e33a77 100644
--- a/.gitlab-ci.d/cirrus/freebsd-13.vars
+++ b/.gitlab-ci.d/cirrus/freebsd-13.vars
@@ -1,5 +1,4 @@
  # THIS FILE WAS AUTO-GENERATED
-# ... and then edited to fix py39, pending proper lcitool update.
  #
  #  $ lcitool variables freebsd-13 qemu
  #
@@ -12,6 +11,6 @@ MAKE='/usr/local/bin/gmake'
  NINJA='/usr/local/bin/ninja'
  PACKAGING_COMMAND='pkg'
  PIP3='/usr/local/bin/pip-3.8'
-PKGS='alsa-lib bash bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage 
ctags curl cyrus-sasl dbus diffutils dtc fusefs-libs3 gettext git glib gmake 
gnutls gsed gtk3 libepoxy libffi libgcrypt libjpeg-turbo libnfs libspice-server 
libssh libtasn1 llvm lzo2 meson ncurses nettle ninja opencv perl5 pixman 
pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme 
py39-virtualenv py39-yaml python3 rpm2cpio sdl2 sdl2_image

Re: [PATCH] linux-user: Use memfd for open syscall emulation

2022-07-25 Thread Richard Henderson


On 7/25/22 09:28, Rainer Müller wrote:

For certain paths in /proc, the open syscall is intercepted and the
returned file descriptor points to a temporary file with emulated
contents.

If TMPDIR is not accessible or writable for the current user (for
example in a read-only mounted chroot or container) tools such as ps
from procps may fail unexpectedly. Trying to read one of these paths
such as /proc/self/stat would return an error such as ENOENT or EROFS.

To relax the requirement on a writable TMPDIR, use memfd_create()
instead to create an anonymous file and return its file descriptor.

Signed-off-by: Rainer Müller 
---
  linux-user/syscall.c | 10 +-
  1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 991b85e6b4..3e4af930ad 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -8265,9 +8265,11 @@ static int do_openat(CPUArchState *cpu_env, int dirfd, 
const char *pathname, int
  }
  
  if (fake_open->filename) {

+int fd, r;
+
+#ifndef CONFIG_MEMFD
  const char *tmpdir;
  char filename[PATH_MAX];
-int fd, r;
  
  /* create temporary file to map stat to */

  tmpdir = getenv("TMPDIR");
@@ -8279,6 +8281,12 @@ static int do_openat(CPUArchState *cpu_env, int dirfd, 
const char *pathname, int
  return fd;
  }
  unlink(filename);
+#else
+fd = memfd_create("qemu-open", 0);
+if (fd < 0) {
+return fd;
+}
+#endif


Even without CONFIG_MEMFD, we will have the memfd_create function available in 
util/.
I think you should drop the ifdefs like so:

#include "qemu/memfd.h"

fd = memfd_create(...);
if (fd < 0) {
if (errno != ENOSYS) {
return fd;
}
// tmpdir fallback
}


r~

Re: [PATCH 2/3] target/arm: Set KVM_ARM_VCPU_SVE while probing the host

2022-07-25 Thread Richard Henderson


On 7/25/22 11:14, Richard Henderson wrote:

Because we weren't setting this flag, our probe of ID_AA64ZFR0
was always returning zero.  This also obviates the adjustment
of ID_AA64PFR0, which had sanitized the SVE field.



Oh, I meant to say here that this the effects of the bug are not visible, because the only 
thing that ISAR.ID_AA64ZFR0 is used for within qemu at present is tcg translation.  The 
other tests for SVE within KVM are via ISAR.ID_AA64PFR0.SVE.



r~




Reported-by: Zenghui Yu 
Signed-off-by: Richard Henderson 
---
  target/arm/kvm64.c | 27 +--
  1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 36ff20c9e3..8b2ae9948a 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -512,7 +512,6 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
  bool sve_supported;
  bool pmu_supported = false;
  uint64_t features = 0;
-uint64_t t;
  int err;
  
  /* Old kernels may not know about the PREFERRED_TARGET ioctl: however

@@ -533,10 +532,17 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)
  struct kvm_vcpu_init init = { .target = -1, };
  
  /*

- * Ask for Pointer Authentication if supported. We can't play the
- * SVE trick of synthesising the ID reg as KVM won't tell us
- * whether we have the architected or IMPDEF version of PAuth, so
- * we have to use the actual ID regs.
+ * Ask for SVE if supported, so that we can query ID_AA64ZFR0,
+ * which is otherwise RAZ.
+ */
+sve_supported = kvm_arm_svm_supported();
+if (sve_supported) {
+init.features[0] |= 1 << KVM_ARM_VCPU_SVE;
+}
+
+/*
+ * Ask for Pointer Authentication if supported, so that we get
+ * the unsanitized field values for AA64ISAR1_EL1.
   */
  if (kvm_arm_pauth_supported()) {
  init.features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
@@ -680,20 +686,13 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)
  }
  }
  
-sve_supported = kvm_arm_svm_supported();

-
-/* Add feature bits that can't appear until after VCPU init. */
  if (sve_supported) {
-t = ahcf->isar.id_aa64pfr0;
-t = FIELD_DP64(t, ID_AA64PFR0, SVE, 1);
-ahcf->isar.id_aa64pfr0 = t;
-
  /*
   * There is a range of kernels between kernel commit 73433762fcae
   * and f81cb2c3ad41 which have a bug where the kernel doesn't expose
   * SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has enabled
- * SVE support, so we only read it here, rather than together with all
- * the other ID registers earlier.
+ * SVE support, which resulted in an error rather than RAZ.
+ * So only read the register if we set KVM_ARM_VCPU_SVE above.
   */
  err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
ARM64_SYS_REG(3, 0, 0, 4, 4));

[PATCH 2/3] target/arm: Set KVM_ARM_VCPU_SVE while probing the host

2022-07-25 Thread Richard Henderson

Because we weren't setting this flag, our probe of ID_AA64ZFR0
was always returning zero.  This also obviates the adjustment
of ID_AA64PFR0, which had sanitized the SVE field.

Reported-by: Zenghui Yu 
Signed-off-by: Richard Henderson 
---
 target/arm/kvm64.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 36ff20c9e3..8b2ae9948a 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -512,7 +512,6 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
 bool sve_supported;
 bool pmu_supported = false;
 uint64_t features = 0;
-uint64_t t;
 int err;
 
 /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
@@ -533,10 +532,17 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)
 struct kvm_vcpu_init init = { .target = -1, };
 
 /*
- * Ask for Pointer Authentication if supported. We can't play the
- * SVE trick of synthesising the ID reg as KVM won't tell us
- * whether we have the architected or IMPDEF version of PAuth, so
- * we have to use the actual ID regs.
+ * Ask for SVE if supported, so that we can query ID_AA64ZFR0,
+ * which is otherwise RAZ.
+ */
+sve_supported = kvm_arm_svm_supported();
+if (sve_supported) {
+init.features[0] |= 1 << KVM_ARM_VCPU_SVE;
+}
+
+/*
+ * Ask for Pointer Authentication if supported, so that we get
+ * the unsanitized field values for AA64ISAR1_EL1.
  */
 if (kvm_arm_pauth_supported()) {
 init.features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
@@ -680,20 +686,13 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)
 }
 }
 
-sve_supported = kvm_arm_svm_supported();
-
-/* Add feature bits that can't appear until after VCPU init. */
 if (sve_supported) {
-t = ahcf->isar.id_aa64pfr0;
-t = FIELD_DP64(t, ID_AA64PFR0, SVE, 1);
-ahcf->isar.id_aa64pfr0 = t;
-
 /*
  * There is a range of kernels between kernel commit 73433762fcae
  * and f81cb2c3ad41 which have a bug where the kernel doesn't expose
  * SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has enabled
- * SVE support, so we only read it here, rather than together with all
- * the other ID registers earlier.
+ * SVE support, which resulted in an error rather than RAZ.
+ * So only read the register if we set KVM_ARM_VCPU_SVE above.
  */
 err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
   ARM64_SYS_REG(3, 0, 0, 4, 4));
-- 
2.34.1

[PATCH 3/3] target/arm: Move sve probe inside kvm >= 4.15 branch

2022-07-25 Thread Richard Henderson

The test for the IF block indicates no ID registers are exposed, much
less host support for SVE.  Move the SVE probe into the ELSE block.

Signed-off-by: Richard Henderson 
---
 target/arm/kvm64.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 8b2ae9948a..bc3d7d9883 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -684,18 +684,18 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)
 err |= read_sys_reg64(fdarray[2], >isar.reset_pmcr_el0,
   ARM64_SYS_REG(3, 3, 9, 12, 0));
 }
-}
 
-if (sve_supported) {
-/*
- * There is a range of kernels between kernel commit 73433762fcae
- * and f81cb2c3ad41 which have a bug where the kernel doesn't expose
- * SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has enabled
- * SVE support, which resulted in an error rather than RAZ.
- * So only read the register if we set KVM_ARM_VCPU_SVE above.
- */
-err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
-  ARM64_SYS_REG(3, 0, 0, 4, 4));
+if (sve_supported) {
+/*
+ * There is a range of kernels between kernel commit 73433762fcae
+ * and f81cb2c3ad41 which have a bug where the kernel doesn't
+ * expose SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has
+ * enabled SVE support, which resulted in an error rather than RAZ.
+ * So only read the register if we set KVM_ARM_VCPU_SVE above.
+ */
+err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
+  ARM64_SYS_REG(3, 0, 0, 4, 4));
+}
 }
 
 kvm_arm_destroy_scratch_host_vcpu(fdarray);
-- 
2.34.1

[PATCH 1/3] target/arm: Create kvm_arm_svm_supported

2022-07-25 Thread Richard Henderson

Indication for support for SVE will not depend on whether we
perform the query on the main kvm_state or the temp vcpu.
Mirror kvm_arm_pauth_supported.

Signed-off-by: Richard Henderson 
---
 target/arm/kvm64.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index d16d4ea250..36ff20c9e3 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -496,6 +496,11 @@ static bool kvm_arm_pauth_supported(void)
 kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_GENERIC));
 }
 
+static bool kvm_arm_svm_supported(void)
+{
+return kvm_check_extension(kvm_state, KVM_CAP_ARM_SVE);
+}
+
 bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
 {
 /* Identify the feature bits corresponding to the host CPU, and
@@ -675,7 +680,7 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
 }
 }
 
-sve_supported = ioctl(fdarray[0], KVM_CHECK_EXTENSION, KVM_CAP_ARM_SVE) > 
0;
+sve_supported = kvm_arm_svm_supported();
 
 /* Add feature bits that can't appear until after VCPU init. */
 if (sve_supported) {
-- 
2.34.1

[PATCH 0/3] target/arm: Fix kvm probe of ID_AA64ZFR0

2022-07-25 Thread Richard Henderson

Our probing of this SVE register was done within an incorrect
vCPU environment, so that the id register was always RAZ.


r~


Richard Henderson (3):
  target/arm: Create kvm_arm_svm_supported
  target/arm: Set KVM_ARM_VCPU_SVE while probing the host
  target/arm: Move sve probe inside kvm >= 4.15 branch

 target/arm/kvm64.c | 50 +-
 1 file changed, 27 insertions(+), 23 deletions(-)

-- 
2.34.1

Re: [PATCH] contrib/vhost-user-blk: Clean up deallocation of VuVirtqElement

2022-07-25 Thread Markus Armbruster

Could this go via qemu-trivial now?

Markus Armbruster  writes:

> We allocate VuVirtqElement with g_malloc() in
> virtqueue_alloc_element(), but free it with free() in
> vhost-user-blk.c.  Harmless, but use g_free() anyway.
>
> One of the calls is guarded by a "not null" condition.  Useless,
> because it cannot be null (it's dereferenced right before), and even
> it it could be, free() and g_free() do the right thing.  Drop the
> conditional.
>
> Fixes: Coverity CID 1490290
> Signed-off-by: Markus Armbruster 
> ---
> Not even compile-tested, because I can't figure out how this thing is
> supposed to be built.  Its initial commit message says "make
> vhost-user-blk", but that doesn't work anymore.
>
>  contrib/vhost-user-blk/vhost-user-blk.c | 9 +++--
>  1 file changed, 3 insertions(+), 6 deletions(-)
>
> diff --git a/contrib/vhost-user-blk/vhost-user-blk.c 
> b/contrib/vhost-user-blk/vhost-user-blk.c
> index 9cb78ca1d0..d6932a2645 100644
> --- a/contrib/vhost-user-blk/vhost-user-blk.c
> +++ b/contrib/vhost-user-blk/vhost-user-blk.c
> @@ -106,10 +106,7 @@ static void vub_req_complete(VubReq *req)
>req->size + 1);
>  vu_queue_notify(vu_dev, req->vq);
>  
> -if (req->elem) {
> -free(req->elem);
> -}
> -
> +g_free(req->elem);
>  g_free(req);
>  }
>  
> @@ -243,7 +240,7 @@ static int vub_virtio_process_req(VubDev *vdev_blk,
>  /* refer to hw/block/virtio_blk.c */
>  if (elem->out_num < 1 || elem->in_num < 1) {
>  fprintf(stderr, "virtio-blk request missing headers\n");
> -free(elem);
> +g_free(elem);
>  return -1;
>  }
>  
> @@ -325,7 +322,7 @@ static int vub_virtio_process_req(VubDev *vdev_blk,
>  return 0;
>  
>  err:
> -free(elem);
> +g_free(elem);
>  g_free(req);
>  return -1;
>  }

Re: [PATCH v2 07/11] acpi/tests/bits: add python test that exercizes QEMU bios tables using biosbits

2022-07-25 Thread Ani Sinha



On Sat, 16 Jul 2022, Michael S. Tsirkin wrote:

> On Sat, Jul 16, 2022 at 12:06:00PM +0530, Ani Sinha wrote:
> >
> >
> > On Fri, Jul 15, 2022 at 11:20 Michael S. Tsirkin  wrote:
> >
> > On Fri, Jul 15, 2022 at 09:47:27AM +0530, Ani Sinha wrote:
> > > > Instead of all this mess, can't we just spawn e.g. "git clone 
> > --depth
> > 1"?
> > > > And if the directory exists I would fetch and checkout.
> > >
> > > There are two reasons I can think of why I do not like this idea:
> > >
> > > (a) a git clone of a whole directory would download all versions of 
> > the
> > > binary whereas we want only a specific version.
> >
> > You mention shallow clone yourself, and I used --depth 1 above.
> >
> > > Downloading a single file
> > > by shallow cloning or creating a git archive is overkill IMHO when a 
> > wget
> > > style retrieval works just fine.
> >
> > However, it does not provide for versioning, tagging etc so you have
> > to implement your own schema.
> >
> >
> > Hmm I’m not sure if we need all that. Bits has its own versioning mechanism 
> > and
> > I think all we need to do is maintain the same versioning logic and maintain
> > binaries of different  versions. Do we really need the power of git/version
> > control here? Dunno.
>
> Well we need some schema. Given we are not using official bits releases
> I don't think we can reuse theirs.

OK fine. Lets figuire out how to push bits somewhere in git.qemu.org and
the binaries in some other repo first. Everything else hinges on that. We
can fix the rest of the bits later incrementally.

>
> >
> >
> >
> >
> > > (b) we may later move the binary archives to a ftp server or a google
> > > drive. git/version control mechanisms are not the best place to store
> > > binary blobs IMHO. In this case also, wget also works.
> >
> > surely neither ftp nor google drive are reasonable dependencies
> > for a free software project. But qemu does maintain an http server
> > already so that't a plus.
> >
> >
> >
> > I am not insisting on git, but I do not like it that security,
> > mirroring, caching, versioning all have to be hand rolled and then
> > figured out by users and maintainers. Who frankly have other things to
> > do besides learning yet another boutique configuration language.
> >
> >
> > Yeah we do not want to reinvent the wheel all over again. 
> >
> >
> >
> >
> > And I worry that after a while we come up with a new organization schema
> > for the files, old ones are moved around and nothing relying on the URL
> > works.  git is kind of good in that it enforces the idea that history is
> > immutable.
> >
> >
> > Ah I see your point here.
> >
> >
> >
> >
> > If not vanilla git can we find another utility we can reuse?
> >
> > git lfs? It seems to be supported by both github and gitlab though
> > bizarrely github has bandwidth limits on git lfs but apparently not on
> > vanilla git. Hosting on qemu.org will require maintaining a server
> > there though.
> >
> >
> >
> > All that said maybe we should just run with it as it is, just so we get
> > *something* in the door, and then worry about getting the storage side
> > straight before making this test a requirement for all acpi developers.
> >
> >
> >
> >
> >
>
>

Re: [PULL 06/35] hw/acpi: refactor acpi hp modules so that targets can just use what they need

2022-07-25 Thread Ani Sinha




On Thu, 21 Jul 2022, Dr. David Alan Gilbert wrote:

> * Ani Sinha (a...@anisinha.ca) wrote:
> >
> >
> > On Wed, 20 Jul 2022, Peter Maydell wrote:
> >
> > > On Wed, 20 Jul 2022 at 19:37, Ani Sinha  wrote:
> > > >
> > > >
> > > >
> > > > On Tue, 19 Jul 2022, Peter Maydell wrote:
> > > >
> > > > > On Sat, 4 Sept 2021 at 22:36, Michael S. Tsirkin  
> > > > > wrote:
> > > > > How is this intended to work? The obvious fix from my point
> > > > > of view would just be to say "piix4.c requires pcihp.c"
> > > > > and cause CONFIG_ACPI_PIIX4 to pull in CONFIG_ACPI_PCIHP,
> > > > > but that seems like it would be rather undoing the point
> > > > > of this change.
> > > >
> > > > Yes. From the commit log and the vague recollection I have about this
> > > > change :
> > > >
> > > > > For example, mips only needs support for PIIX4 and does not
> > > > > need acpi pci hotplug support or cpu hotplug support or memory hotplug
> > > > support
> > > > > etc
> > > >
> > > > So does malta really need acpi hotplug? If not, then the stubbing out of
> > > > the vmstate struct is correct.
> > >
> > > It's not, because the vmstate struct is actually used when you
> > > savevm/loadvm a malta machine. If the malta shouldn't have
> > > acpi hotplug then we need to arrange for the hotplug code
> > > to be avoided at an earlier point, not just stub in the
> > > vmstate struct field.
> >
> > yes I think that would be more appropriate fix, I agree. Since mst added
> > that vmstate, maybe he can comment on this.
>
> Can't we just change the stub to be a valid vmstate structure?

This would be only a short term solution until we can re-org the code so
that Malta does not use acpi hotplug anymore.

>
> Dave
> (I coincidentally found this today because I'd been cc'd on
> https://gitlab.com/qemu-project/qemu/-/issues/995 a few months back
> and only just noticed)
>

Re: [PATCH v2 1/2] qapi: Add exit-failure PanicAction

2022-07-25 Thread Eric Blake

On Sat, Jul 23, 2022 at 01:36:13AM +0200, Ilya Leoshkevich wrote:
> Currently QEMU exits with code 0 on both panic an shutdown. For tests
> it is useful to return 1 on panic, so that it counts as a test
> failure.
> 
> Introduce a new exit-failure PanicAction that makes main() return
> EXIT_FAILURE. Tests can use -action panic=exit-failure option to
> activate this behavior.
> 
> Signed-off-by: Ilya Leoshkevich 
> ---

> +++ b/qapi/run-state.json
> @@ -364,10 +364,12 @@
>  #
>  # @shutdown: Shutdown the VM and exit, according to the shutdown action
>  #
> +# @exit-failure: Shutdown the VM and exit with nonzero status

Missing a '(since 7.1)' tag.  Otherwise a nice addition.

> +#
>  # Since: 6.0
>  ##
>  { 'enum': 'PanicAction',
> -  'data': [ 'pause', 'shutdown', 'none' ] }
> +  'data': [ 'pause', 'shutdown', 'exit-failure', 'none' ] }

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

Re: TARGET_SYS_HEAPINFO and Cortex-A15 memory map

2022-07-25 Thread Liviu Ionescu

> On 25 Jul 2022, at 20:33, Peter Maydell  wrote:
> 
> ... I'll try a setup with code in flash and
> data in RAM, that sounds like it might be a way to cause the
> problem if there's not much code and a lot of data (because
> then the remaining space in the flash is larger than the
> remaining space in the RAM.)

My tests use several hundred KB of text (the C++ runtime) and a few tens of KB 
of bss+data; I don't know how to compute the remaining space, since I don't 
know the default sizes of flash & ram.

I do have some experience with linker scripts for Cortex-M devices, and I can 
tweak the projects generated by my template to allocate text in flash, but I 
have no experience with Cortex-A devices; if you tell me exactly at what 
address you want the program started, I can try to provide you a binary.

Liviu

Re: TARGET_SYS_HEAPINFO and Cortex-A15 memory map

2022-07-25 Thread Peter Maydell

On Mon, 25 Jul 2022 at 18:02, Liviu Ionescu  wrote:
> > On 25 Jul 2022, at 19:02, Peter Maydell  wrote:
> > The one where SYS_HEAPINFO produces the bogus result putting the
> > heap at 0x0400, that you mentioned in the original report with
> > the command line
> >
> > .../qemu-system-arm "--machine" "virt" "--cpu" "cortex-a15"
> > "--nographic" "-d" "unimp,guest_errors" "--semihosting-config"
> > "enable=on,target=native,arg=sample-test,arg=one,arg=two" -s -S
>
> ah, the bogus one... that's a bit more complicated, since it happened in the 
> early tests, and I don't remember how I did it, it might be that I tried to 
> load my code in flash and my data in ram, but I'm not sure.
>
> try to check the logic and avoid the cases when flash addresses are returned 
> for heap, if possible.

Yeah, that's my plan. I think it'll be a one-line fix. But it'll
be much easier to be sure it works if I have a test case that
triggers the problem. I'll try a setup with code in flash and
data in RAM, that sounds like it might be a way to cause the
problem if there's not much code and a lot of data (because
then the remaining space in the flash is larger than the
remaining space in the RAM.)

> btw, this might be a different topic, but on Cortex-M devices
> I'm used to configure the linker scripts to allocate the text
> in flash and the data+bss in ram; for qemu aarch32/64 devices
> I could not make this work, and I had to allocate everything
> in ram, which is functional, but probably not very accurate
> for some tests, that might fail when running from flash.

I'm not a linker/toolchain expert, I'm afraid. Certainly in
theory it should be possible to have a split text/data setup
on A-profile devices, the same as M-profile ones.

thanks
-- PMM

Re: TARGET_SYS_HEAPINFO and Cortex-A15 memory map

2022-07-25 Thread Liviu Ionescu

> On 25 Jul 2022, at 19:02, Peter Maydell  wrote:
> 
> 
> We document what the guest can assume about the virt board
> memory layout here:
> 
> https://www.qemu.org/docs/master/system/arm/virt.html#hardware-configuration-information-for-bare-metal-programming
> 
> Flash at 0, RAM at 0x4000_, must look in the DTB blob
> for all other information.

ah, ok, I probably missed this. :-(

btw, looking in the dtb blob is probably not a problem for a linux kernel, but 
if I want only to write a simple test that needs a timer, getting the timer 
address is probably more complicated than my entire test... :-(

> The one where SYS_HEAPINFO produces the bogus result putting the
> heap at 0x0400, that you mentioned in the original report with
> the command line
> 
> .../qemu-system-arm "--machine" "virt" "--cpu" "cortex-a15"
> "--nographic" "-d" "unimp,guest_errors" "--semihosting-config"
> "enable=on,target=native,arg=sample-test,arg=one,arg=two" -s -S

ah, the bogus one... that's a bit more complicated, since it happened in the 
early tests, and I don't remember how I did it, it might be that I tried to 
load my code in flash and my data in ram, but I'm not sure.

try to check the logic and avoid the cases when flash addresses are returned 
for heap, if possible.

btw, this might be a different topic, but on Cortex-M devices I'm used to 
configure the linker scripts to allocate the text in flash and the data+bss in 
ram; for qemu aarch32/64 devices I could not make this work, and I had to 
allocate everything in ram, which is functional, but probably not very accurate 
for some tests, that might fail when running from flash.

regards,

Liviu

How to read RISC-V mcycle CSR from Linux userspace app?

2022-07-25 Thread Maxim Blinov

Hi all, stupid question but I can't for the life of me figure this out
even with all the docs open.

I need to get an estimate figure for the cyclecount of a busy loop in
one of my small Linux userspace apps. The app in question is running
in qemu-system-riscv64. I've compiled QEMU myself, and the full code
is like this:

#include 
#include 
#include 
#include 

uint64_t get_mcycle() {
  uint64_t mcycle = 0;

  asm volatile ("csrr %0,mcycle"   : "=r" (mcycle)  );

  return mcycle;
}

int main(int argc, char **argv) {
  printf("Hello\n");
  printf("mcycle is %lu\n", get_mcycle());

  return 0;
}

Now I get SIGILL when I hit the `csrr` insn, which makes sense.
According to the "Privileged Architecture Version 1.10", page 32, [1]
we need to set mcounteren, hcounteren, and scounteren low bits to 1 in
order to get the mcycle csr to become available in userspace. So I add
the following function:

void enable_mcount() {
  /* Enable IR, TM, CY */
  uint64_t mcounteren = 0x5;
  asm volatile ("csrw mcounteren,%0" : "=r" (mcounteren));
  asm volatile ("csrw hcounteren,%0" : "=r" (mcounteren));
  asm volatile ("csrw scounteren,%0" : "=r" (mcounteren));
}

And call it before I call get_mcycle(), but this triggers SIGILL
(unsurprisingly) also, since these CSRs are also privileged. So
there's a bit of a chicken and egg problem.

Could someone more knowledgeable please suggest what the course of
action here is? I've got QEMU revision f45fd24c90 checked out, and I'm
staring at qemu/target/riscv/csr.c:71, which seems to deal with
whether or not to raise a SIGILL upon access. I can see a condition
for when we're in 'S' mode, but nothing for 'U' mode. Does that mean
there is fundamentally no access to these CSR's from 'U' mode? Is it
possible to just hack it in?

Maxim

[1]: https://riscv.org/wp-content/uploads/2017/05/riscv-privileged-v1.10.pdf

Re: [PATCH] ui/console: fix qemu_console_resize() regression

2022-07-25 Thread Mark Cave-Ayland


On 25/07/2022 12:58, marcandre.lur...@redhat.com wrote:


From: Marc-André Lureau 

The display may be corrupted when changing screen colour depth in
qemu-system-ppc/MacOS since 7.0.


Is it worth being more specific here? Whilst MacOS with its NDRV driver exhibits the 
issue, it's really only because MacOS has separate selections for depth and 
resolution which allows one to be set without updating the other. I did a quick play 
with the Forth reproducer, and even with current git master the issue goes away if 
you also change the width/height at the same time as the depth.



Do not short-cut qemu_console_resize() if the surface is backed by vga
vram. When the scanout isn't set, or it is already allocated, or opengl,
and the size is fitting, we still avoid the reallocation & replace path.

Fixes: commit cb8962c1 ("ui: do not create a surface when resizing a GL 
scanout")

Reported-by: Mark Cave-Ayland 
Signed-off-by: Marc-André Lureau 
---
  ui/console.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ui/console.c b/ui/console.c
index e139f7115e1f..765892f84f1c 100644
--- a/ui/console.c
+++ b/ui/console.c
@@ -2575,11 +2575,13 @@ static void vc_chr_open(Chardev *chr,
  
  void qemu_console_resize(QemuConsole *s, int width, int height)

  {
-DisplaySurface *surface;
+DisplaySurface *surface = qemu_console_surface(s);
  
  assert(s->console_type == GRAPHIC_CONSOLE);
  
-if (qemu_console_get_width(s, -1) == width &&

+if ((s->scanout.kind != SCANOUT_SURFACE ||
+ (surface && surface->flags & QEMU_ALLOCATED_FLAG)) &&
+qemu_console_get_width(s, -1) == width &&
  qemu_console_get_height(s, -1) == height) {
  return;
  }


The criteria listed for the short-cut in the commit message are quite handy, so is it 
worth adding a comment along the same lines as a reminder? Or is this logic touched 
so rarely that it isn't worthwhile?


Regardless of the above, thanks for coming up with the patch and I can confirm that 
it fixes both the Forth reproducer and the changing of the Monitor colour depth in 
MacOS itself:


Tested-by: Mark Cave-Ayland 


ATB,

Mark.

[PATCH] linux-user: Use memfd for open syscall emulation

2022-07-25 Thread Rainer Müller

For certain paths in /proc, the open syscall is intercepted and the
returned file descriptor points to a temporary file with emulated
contents.

If TMPDIR is not accessible or writable for the current user (for
example in a read-only mounted chroot or container) tools such as ps
from procps may fail unexpectedly. Trying to read one of these paths
such as /proc/self/stat would return an error such as ENOENT or EROFS.

To relax the requirement on a writable TMPDIR, use memfd_create()
instead to create an anonymous file and return its file descriptor.

Signed-off-by: Rainer Müller 
---
 linux-user/syscall.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 991b85e6b4..3e4af930ad 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -8265,9 +8265,11 @@ static int do_openat(CPUArchState *cpu_env, int dirfd, 
const char *pathname, int
 }
 
 if (fake_open->filename) {
+int fd, r;
+
+#ifndef CONFIG_MEMFD
 const char *tmpdir;
 char filename[PATH_MAX];
-int fd, r;
 
 /* create temporary file to map stat to */
 tmpdir = getenv("TMPDIR");
@@ -8279,6 +8281,12 @@ static int do_openat(CPUArchState *cpu_env, int dirfd, 
const char *pathname, int
 return fd;
 }
 unlink(filename);
+#else
+fd = memfd_create("qemu-open", 0);
+if (fd < 0) {
+return fd;
+}
+#endif
 
 if ((r = fake_open->fill(cpu_env, fd))) {
 int e = errno;
-- 
2.25.1

Re: TARGET_SYS_HEAPINFO and Cortex-A15 memory map

2022-07-25 Thread Peter Maydell

On Mon, 25 Jul 2022 at 16:56, Liviu Ionescu  wrote:
>
>
>
> > On 25 Jul 2022, at 18:43, Peter Maydell  wrote:
> >
> > ... the memory map for the 'virt' board doesn't change for the a72
> > versus the a15. In both cases, the memory from 0x0 to 0x0800
> > is the flash memory. So QEMU is incorrect to have reported that as
> > the place to put the heap in SYS_HEAPINFO.
>
> Could you also update the documentation pages to reflect the actual memory 
> map for the Arm devices? It took me some time and experimentation to find 
> this for my tests.

We document what the guest can assume about the virt board
memory layout here:

https://www.qemu.org/docs/master/system/arm/virt.html#hardware-configuration-information-for-bare-metal-programming

Flash at 0, RAM at 0x4000_, must look in the DTB blob
for all other information.

> > I suspect the fix to this bug is going to be "make sure that
> > SYS_HEAPINFO doesn't consider memory regions that are rom-devices"
>
> Sounds ok.
>
> > You don't mention which QEMU version you are using.
>
> 7.0.0
>
> > Do you have a test binary I can reproduce this with? That would save
> > me a little time.
>
> I have a template project that can generate several binaries intended to run 
> on QEMU:
>
> - https://github.com/micro-os-plus/hello-world-qemu-template-xpack/
>
> The only prerequisite is npm/xpm, the rest of the dependencies are handled 
> automatically.
>
> If this does not work for you, please tell me exactly which binary you need, 
> and I can try to generate it.

The one where SYS_HEAPINFO produces the bogus result putting the
heap at 0x0400, that you mentioned in the original report with
the command line

.../qemu-system-arm  "--machine" "virt" "--cpu" "cortex-a15"
"--nographic" "-d" "unimp,guest_errors" "--semihosting-config"
"enable=on,target=native,arg=sample-test,arg=one,arg=two" -s -S

thanks
-- PMM

Re: TARGET_SYS_HEAPINFO and Cortex-A15 memory map

2022-07-25 Thread Liviu Ionescu

> On 25 Jul 2022, at 18:43, Peter Maydell  wrote:
> 
> ... the memory map for the 'virt' board doesn't change for the a72
> versus the a15. In both cases, the memory from 0x0 to 0x0800
> is the flash memory. So QEMU is incorrect to have reported that as
> the place to put the heap in SYS_HEAPINFO.

Could you also update the documentation pages to reflect the actual memory map 
for the Arm devices? It took me some time and experimentation to find this for 
my tests.

> I suspect the fix to this bug is going to be "make sure that
> SYS_HEAPINFO doesn't consider memory regions that are rom-devices"

Sounds ok.

> You don't mention which QEMU version you are using.

7.0.0

> Do you have a test binary I can reproduce this with? That would save
> me a little time.

I have a template project that can generate several binaries intended to run on 
QEMU:

- https://github.com/micro-os-plus/hello-world-qemu-template-xpack/

The only prerequisite is npm/xpm, the rest of the dependencies are handled 
automatically.

If this does not work for you, please tell me exactly which binary you need, 
and I can try to generate it.

Thank you,

Liviu

Re: TARGET_SYS_HEAPINFO and Cortex-A15 memory map

2022-07-25 Thread Peter Maydell

On Fri, 3 Jun 2022 at 07:17, Liviu Ionescu  wrote:
> > On 2 Jun 2022, at 21:36, Liviu Ionescu  wrote:
> >
> > ... SYS_HEAPINFO...
> >
> > 0x0400 - heap base
> > 0x0800 - heap limit
> > 0x0800 - stack base
> > 0x0 - stack limit
>
> For Cortex-A72 I see similar values:
>
> 0x4400
> 0x4800
> 0x4800
> 0x4000
>
> just that in this case the memory is writable, and the startup proceeds as 
> expected.
>
> Any idea why in the Cortex-A15 case the memory below 0x0800 is not 
> writable?

Sorry I didn't get back to this before -- I marked it in my email client
but then forgot about it :-(

Anyway, the memory map for the 'virt' board doesn't change for the a72
versus the a15. In both cases, the memory from 0x0 to 0x0800
is the flash memory. So QEMU is incorrect to have reported that as
the place to put the heap in SYS_HEAPINFO.

I'm not sure why the A72 gives different results, but the heuristic
that tries to figure out where the heap goes is basically trying to
do "find the biggest lump of memory which doesn't have some bit of
the guest code in it", so it's going to be sensitive to things like
just how big that guest binary happens to be and where it's loaded.
Perhaps that is why it's ended up different.

I suspect the fix to this bug is going to be "make sure that
SYS_HEAPINFO doesn't consider memory regions that are rom-devices"
(we already ignore read-only memory, but flash devices are
technically not read-only).

You don't mention which QEMU version you are using. We
rewrote the whole SYS_HEAPINFO handling in commit 5fc983af8b20d6927
(Feb 2022, should be in 7.0.0) because the old implementation
could produce some completely wrong results. Looking at the code
I guess this was with the new implementation, though.

Do you have a test binary I can reproduce this with? That would save
me a little time.

thanks
-- PMM

Re: driver type raw-xz supports discard=unmap?

2022-07-25 Thread Chris Murphy




On Mon, Jul 25, 2022, at 9:53 AM, Daniel P. Berrangé wrote:
> On Mon, Jul 25, 2022 at 08:51:42AM -0400, Chris Murphy wrote:

>> Huh, interesting. I have no idea then. I just happened to notice it in the 
>> (libvirt) XML config that's used by oz.
>> https://kojipkgs.fedoraproject.org//packages/Fedora-Workstation/Rawhide/20220721.n.0/images/libvirt-raw-xz-aarch64.xml
>
> I don't see 'raw-xz' mentioned anywhere in the Oz code at
>
>   https://github.com/clalancette/oz
>
> was it a fork that's being used ?

Must be. I'm not seeing it in either oz or imagefactory source either.

>> I've got no idea what happens if an invalid type is specified in
>> the config. The VM's are definitely running despite this. I'll ask
>> oz devs.
>
> This is pretty surprising if they're actually running as it should
> cause a fatal error message
>
> error: unsupported configuration: unknown driver format value 'raw-xz'

Yep, I'm lost. I guess it's down a rabbit hole or yak shaving time.

-- 
Chris Murphy

Re: [PATCH v3 1/9] tests: introduce tree-wide code style checking

2022-07-25 Thread Peter Maydell

On Wed, 20 Jul 2022 at 17:32, Daniel P. Berrangé  wrote:
> I would certainly like to see us eventually remove
> checkpatch.pl because of the various downsides it has.
>
> The caveat is that I've not actually looked in any detail
> at what things checkpatch.pl actually checks for. Without
> looking my guess-timate is that we could probably replace
> 90% of it without much trouble. Perhaps we'll just decide
> some of the checkjs in checkpatch aren't worth the burden
> of maintaining its usage.

I went through checkpatch, and here are the warnings I think
are worth making sure we still have. I've included all the
ones that look like we've added them specifically for QEMU
on the basis that if we cared enough to edit checkpatch to
add them then we ought to care enough to retain them.

* "Do not add expected files together with tests,
   follow instructions in tests/qtest/bios-tables-test.c"
* "do not set execute permissions for source files"
* "please use python3 interpreter"
* "Author email address is mangled by the mailing list"
* syntax checks on Signed-off-by lines
* "does MAINTAINERS need updating?"
* "Invalid UTF-8"
* "trailing whitespace"
* "DOS line endings" (maybe)
* "Don't use '#' flag of printf format ('%#') in trace-events"
* "Hex numbers must be prefixed with '0x' [in trace-events]"
* line-length checks (though for a "whole codebase must pass"
  test we would want to set the length longer than the current
  "author should consider whether to wrap" value
* hard coded tabs
* the various dodgy-indentation checks
* the various space-required checks eg around operators
* misformatted block comments
* "do not use C99 // comments"
* "do not initialise globals/statics to 0 or NULL"
* "do not use assignment in if condition"
* "braces {} are necessary for all arms of this statement"
* "braces {} are necessary even for single statement blocks"
* "Use of volatile is usually wrong, please add a comment"
* "g_free(NULL) is safe this check is probably not required"
* "memory barrier without comment"
* "unnecessary cast may hide bugs, use g_new/g_renew instead"
* "consider using g_path_get_$1() in preference to g_strdup($1())"
* "use g_memdup2() instead of unsafe g_memdup()"
* "consider using qemu_$1 in preference to $1" (strto* etc)
* "use sigaction to establish signal handlers; signal is not portable"
* "please use block_init(), type_init() etc. instead of module_init()"
* "initializer for struct $1 should normally be const"
* "Error messages should not contain newlines"
* "use ctz32() instead of ffs()"
* ditto, ffsl, ffsll, bzero, getpagesize, _SC_PAGESIZE
* "Use g_assert or g_assert_not_reached" [instead of non-exiting glib asserts]

These seem to me to fall into three categories:

(1) many are easy enough to do with grep
(2) there are some checks we really do want to do on the patch,
not on the codebase (most obviously things like Signed-off-by:
format checks)
(3) there are coding style checks that do need to have at least some
idea of C syntax, to check brace placement, whitespace, etc

thanks
-- PMM

Re: [PATCH] target/riscv: Support SW update of PTE A/D bits and Ssptwad extension

2022-07-25 Thread Jim Shu

Hi Alistair,

Why do we want to support that? We can do either and we are
> implementing the much more usual scheme. I don't see a reason to
> bother implementing the other one. Is anyone ever going to use it?
>

Thanks for your response.
I got it.

Regards,
Jim Shu

Re: [PATCH v1 11/13] tests/tcg/s390x: Test unaligned accesses to lowcore

2022-07-25 Thread Thomas Huth


On 25/07/2022 16.05, Alex Bennée wrote:

From: Ilya Leoshkevich 

Add a small test to avoid regressions.

Signed-off-by: Ilya Leoshkevich 
Acked-by: Richard Henderson 
Message-Id: <20220722233614.7254-3-...@linux.ibm.com>
Signed-off-by: Alex Bennée 
---
  tests/tcg/s390x/Makefile.softmmu-target |  9 +
  tests/tcg/s390x/unaligned-lowcore.S | 19 +++
  2 files changed, 28 insertions(+)
  create mode 100644 tests/tcg/s390x/Makefile.softmmu-target
  create mode 100644 tests/tcg/s390x/unaligned-lowcore.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
new file mode 100644
index 00..a34fa68473
--- /dev/null
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -0,0 +1,9 @@
+S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
+VPATH+=$(S390X_SRC)
+QEMU_OPTS=-action panic=exit-failure -kernel
+
+%: %.S
+   $(CC) -march=z13 -m64 -nostartfiles -static -Wl,-Ttext=0 \
+   -Wl,--build-id=none $< -o $@
+
+TESTS += unaligned-lowcore
diff --git a/tests/tcg/s390x/unaligned-lowcore.S 
b/tests/tcg/s390x/unaligned-lowcore.S
new file mode 100644
index 00..246b517f11
--- /dev/null
+++ b/tests/tcg/s390x/unaligned-lowcore.S
@@ -0,0 +1,19 @@
+.org 0x1D0 /* program new PSW */
+.quad 0x2, 0   /* disabled wait */
+.org 0x200 /* lowcore padding */
+
+.globl _start
+_start:
+lctlg %c0,%c0,_c0
+vst %v0,_unaligned
+lpswe quiesce_psw
+
+.align 8
+quiesce_psw:
+.quad 0x2,0xfff/* see is_special_wait_psw() */
+_c0:
+.quad 0x1006   /* lowcore protection, AFP, VX */
+
+.byte 0
+_unaligned:
+.octa 0


Acked-by: Thomas Huth

Re: [PATCH] .gitlab-ci.d/windows.yml: Enable native Windows symlink

2022-07-25 Thread Thomas Huth


On 25/07/2022 15.47, Alex Bennée wrote:


Bin Meng  writes:


From: Bin Meng 

The following error message was seen during the configure:

   "ln: failed to create symbolic link
   'x86_64-softmmu/qemu-system-x86_64.exe': No such file or directory"

By default the MSYS environment variable is not defined, so the runtime
behavior of winsymlinks is: if  does not exist, 'ln -s' fails.
At the configure phase, the qemu-system-x86_64.exe has not been built
so creation of the symbolic link fails hence the error message.

Set winsymlinks to 'native' whose behavior is most similar to the
behavior of 'ln -s' on *nix, that is:

   a) if native symlinks are enabled, and whether  exists
  or not, creates  as a native Windows symlink;
   b) else if native symlinks are not enabled, and whether 
  exists or not, 'ln -s' creates as a Windows shortcut file.

Signed-off-by: Bin Meng 


I'm still seeing Windows build failures such as:

   https://gitlab.com/stsquad/qemu/-/jobs/2765579269

and

   https://gitlab.com/stsquad/qemu/-/jobs/2765579267

Any idea what's falling over?


No clue, but FWIW, I had the same problem in a run from last Friday here 
(without this symlink patch):


 https://gitlab.com/thuth/qemu/-/jobs/2758244223#L2817

I've never seen this failure before - so I guess it's rather something new?

 Thomas

Re: [PULL v2 0/8] More fixes + random seed patches for QEMU 7.1

2022-07-25 Thread Peter Maydell

On Fri, 22 Jul 2022 at 18:08, Paolo Bonzini  wrote:
>
> The following changes since commit 5288bee45fbd33203b61f8c76e41b15bb5913e6e:
>
>   Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging 
> (2022-07-21 11:13:01 +0100)
>
> are available in the Git repository at:
>
>   https://gitlab.com/bonzini/qemu.git tags/for-upstream2
>
> for you to fetch changes up to 9fa032885583a2f1cb9cacad2f717784ccea02a1:
>
>   hw/i386: pass RNG seed via setup_data entry (2022-07-22 19:01:44 +0200)

Hi -- this tag doesn't seem to be the same commit hash that this
pull request quotes. The tag is commit 67f7e426e53833. Is the
tag definitely pointing at the right thing ?

thanks
-- PMM

[PATCH v2] hw/display/bcm2835_fb: Fix framebuffer allocation address

2022-07-25 Thread Alan Jian

This patch fixes the dedicated framebuffer mailbox interface(marked as
deprecated in official docs, but can still be fixed for emulation purposes)
by removing unneeded offset to make it works like buffer allocate tag in
bcm2835_property interface[1], some baremetal applications like the
Screen01/Screen02 examples from Baking Pi tutorial[2] didn't work
before this patch.

[1] https://github.com/qemu/qemu/blob/master/hw/misc/bcm2835_property.c#L158
[2] https://www.cl.cam.ac.uk/projects/raspberrypi/tutorials/os/screen01.html

Signed-off-by: Alan Jian 
---
This patch is v2 because the previous one is signed by my username, not my full 
name, 
which is not allowed in the submission rule of QEMU.

 hw/display/bcm2835_fb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/display/bcm2835_fb.c b/hw/display/bcm2835_fb.c
index 088fc3d51c..a05277674f 100644
--- a/hw/display/bcm2835_fb.c
+++ b/hw/display/bcm2835_fb.c
@@ -279,8 +279,7 @@ static void bcm2835_fb_mbox_push(BCM2835FBState *s, 
uint32_t value)
 newconf.xoffset = ldl_le_phys(>dma_as, value + 24);
 newconf.yoffset = ldl_le_phys(>dma_as, value + 28);
 
-newconf.base = s->vcram_base | (value & 0xc000);
-newconf.base += BCM2835_FB_OFFSET;
+newconf.base = s->vcram_base + BCM2835_FB_OFFSET;
 
 /* Copy fields which we don't want to change from the existing config */
 newconf.pixo = s->config.pixo;
-- 
2.34.1

Re: [PULL 06/18] vfio-user: build library

2022-07-25 Thread Jag Raman



> On Jul 25, 2022, at 10:50 AM, Daniel P. Berrangé  wrote:
> 
> On Mon, Jul 25, 2022 at 02:45:09PM +, Jag Raman wrote:
>> Hi Daniel,
>> 
>> We’ve created the following issue to update QEMU’s libvfio-user mirror
>> to the latest:
>> https://gitlab.com/qemu-project/libvfio-user/-/issues/1
>> 
>> Will update QEMU’s submodule once this mirror is updated.
> 
> That sounds good, thank you. We should be fine to get the
> submodule reefreshed even in soft freeze, given that it is
> fixing a test failure bug.
> 
> Oh and I just noticed I messed up your name in my message
> below. I'm very sorry about that.

No worries. :)

> 
> With regards,
> Daniel 
> 
>> On Jul 21, 2022, at 6:25 AM, Daniel P. Berrangé 
>> mailto:berra...@redhat.com>> wrote:
>> 
>> Hi Jay / Stefan,
>> 
>> We've got a non-determinsitic hang in QEMU CI since this series
>> merged, which we tracked down to a libvfio-user test that is
>> flakey:
>> 
>> https://gitlab.com/qemu-project/qemu/-/issues/1114
>> 
>> John Levon has proposed a PR to libvfio-user to turn off the
>> test, but we'll need one of you to update the git submodule
>> for libvfio-user on the QEMU side, as I can't find a nice way
>> to selectively skip the test from QEMU side alone.
>> 
>> With regards
>> Daniel
>> 
> 
> With regards,
> Daniel
> -- 
> |: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o-https://fstop138.berrange.com :|
> |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|
>

Re: [PULL 06/18] vfio-user: build library

2022-07-25 Thread Daniel P . Berrangé

On Mon, Jul 25, 2022 at 02:45:09PM +, Jag Raman wrote:
> Hi Daniel,
> 
> We’ve created the following issue to update QEMU’s libvfio-user mirror
> to the latest:
> https://gitlab.com/qemu-project/libvfio-user/-/issues/1
> 
> Will update QEMU’s submodule once this mirror is updated.

That sounds good, thank you. We should be fine to get the
submodule reefreshed even in soft freeze, given that it is
fixing a test failure bug.

Oh and I just noticed I messed up your name in my message
below. I'm very sorry about that.

With regards,
Daniel 

> On Jul 21, 2022, at 6:25 AM, Daniel P. Berrangé 
> mailto:berra...@redhat.com>> wrote:
> 
> Hi Jay / Stefan,
> 
> We've got a non-determinsitic hang in QEMU CI since this series
> merged, which we tracked down to a libvfio-user test that is
> flakey:
> 
>  https://gitlab.com/qemu-project/qemu/-/issues/1114
> 
> John Levon has proposed a PR to libvfio-user to turn off the
> test, but we'll need one of you to update the git submodule
> for libvfio-user on the QEMU side, as I can't find a nice way
> to selectively skip the test from QEMU side alone.
> 
> With regards
> Daniel
> 

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v7 01/14] mm: Add F_SEAL_AUTO_ALLOCATE seal to memfd

2022-07-25 Thread Gupta, Pankaj





Normally, a write to unallocated space of a file or the hole of a sparse
file automatically causes space allocation, for memfd, this equals to
memory allocation. This new seal prevents such automatically allocating,
either this is from a direct write() or a write on the previously
mmap-ed area. The seal does not prevent fallocate() so an explicit
fallocate() can still cause allocating and can be used to reserve
memory.

This is used to prevent unintentional allocation from userspace on a
stray or careless write and any intentional allocation should use an
explicit fallocate(). One of the main usecases is to avoid memory double
allocation for confidential computing usage where we use two memfds to
back guest memory and at a single point only one memfd is alive and we
want to prevent memory allocation for the other memfd which may have
been mmap-ed previously. More discussion can be found at:

https://lkml.org/lkml/2022/6/14/1255

Suggested-by: Sean Christopherson 
Signed-off-by: Chao Peng 
---
   include/uapi/linux/fcntl.h |  1 +
   mm/memfd.c |  3 ++-
   mm/shmem.c | 16 ++--
   3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 2f86b2ad6d7e..98bdabc8e309 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -43,6 +43,7 @@
   #define F_SEAL_GROW  0x0004  /* prevent file from growing */
   #define F_SEAL_WRITE 0x0008  /* prevent writes */
   #define F_SEAL_FUTURE_WRITE  0x0010  /* prevent future writes while mapped */
+#define F_SEAL_AUTO_ALLOCATE   0x0020  /* prevent allocation for writes */


Why only "on writes" and not "on reads". IIRC, shmem doesn't support the
shared zeropage, so you'll simply allocate a new page via read() or on
read faults.


Also, I *think* you can place pages via userfaultfd into shmem. Not sure
if that would count "auto alloc", but it would certainly bypass fallocate().


I was also thinking this at the same time, but for different reason:

"Want to populate private preboot memory with firmware payload", so was
thinking userfaulftd could be an option as direct writes are restricted?


If that can be a side effect, I definitely glad to see it, though I'm
still not clear how userfaultfd can be particularly helpful for that.


Was thinking if we can use userfaultfd to monitor the pagefault on 
virtual firmware memory range and use to populate the private memory.


Not sure if it is a side effect. Was just theoretically thinking (for 
now kept the idea aside as these enhancements can be worked later).


Thanks,
Pankaj

Re: [PATCH v7 01/92] target/arm: Add ID_AA64ZFR0 fields and isar_feature_aa64_sve2

2022-07-25 Thread Richard Henderson


On 7/25/22 00:05, Zenghui Yu wrote:

diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index dff85f6db9..37ceadd9a9 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -647,17 +647,26 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures 
*ahcf)

 sve_supported = ioctl(fdarray[0], KVM_CHECK_EXTENSION, KVM_CAP_ARM_SVE) > 
0;

-    kvm_arm_destroy_scratch_host_vcpu(fdarray);
-
-    if (err < 0) {
-    return false;
-    }
-
 /* Add feature bits that can't appear until after VCPU init. */
 if (sve_supported) {
 t = ahcf->isar.id_aa64pfr0;
 t = FIELD_DP64(t, ID_AA64PFR0, SVE, 1);
 ahcf->isar.id_aa64pfr0 = t;
+
+    /*
+ * Before v5.1, KVM did not support SVE and did not expose
+ * ID_AA64ZFR0_EL1 even as RAZ.  After v5.1, KVM still does
+ * not expose the register to "user" requests like this
+ * unless the host supports SVE.
+ */
+    err |= read_sys_reg64(fdarray[2], >isar.id_aa64zfr0,
+  ARM64_SYS_REG(3, 0, 0, 4, 4));


If I read it correctly, we haven't yet enabled SVE for the scratch vcpu
(using the KVM_ARM_VCPU_INIT ioctl with KVM_ARM_VCPU_SVE). KVM will
therefore expose ID_AA64ZFR0_EL1 to userspace as RAZ at this point and
isar.id_aa64zfr0 is reset to 0. I wonder if it was intentional?


You are correct, this is a bug.  It appears this is hidden because nothing else actually 
depends on the value within the context of --accel=kvm, e.g. migration.



r~

Re: [PULL 06/18] vfio-user: build library

2022-07-25 Thread Jag Raman

Hi Daniel,

We’ve created the following issue to update QEMU’s libvfio-user mirror
to the latest:
https://gitlab.com/qemu-project/libvfio-user/-/issues/1

Will update QEMU’s submodule once this mirror is updated.

Thank you!
--
Jag

On Jul 21, 2022, at 6:25 AM, Daniel P. Berrangé 
mailto:berra...@redhat.com>> wrote:

Hi Jay / Stefan,

We've got a non-determinsitic hang in QEMU CI since this series
merged, which we tracked down to a libvfio-user test that is
flakey:

 https://gitlab.com/qemu-project/qemu/-/issues/1114

John Levon has proposed a PR to libvfio-user to turn off the
test, but we'll need one of you to update the git submodule
for libvfio-user on the QEMU side, as I can't find a nice way
to selectively skip the test from QEMU side alone.

With regards
Daniel

On Wed, Jun 15, 2022 at 04:51:17PM +0100, Stefan Hajnoczi wrote:
From: Jagannathan Raman mailto:jag.ra...@oracle.com>>

add the libvfio-user library as a submodule. build it as a meson
subproject.

libvfio-user is distributed with BSD 3-Clause license and
json-c with MIT (Expat) license

Signed-off-by: Elena Ufimtseva 
mailto:elena.ufimts...@oracle.com>>
Signed-off-by: John G Johnson 
mailto:john.g.john...@oracle.com>>
Signed-off-by: Jagannathan Raman 
mailto:jag.ra...@oracle.com>>
Reviewed-by: Stefan Hajnoczi mailto:stefa...@redhat.com>>
Message-id: 
c2adec87958b081d1dc8775d4aa05c897912f025.1655151679.git.jag.ra...@oracle.com

[Changed submodule URL to QEMU's libvfio-user mirror on GitLab. The QEMU
project mirrors its dependencies so that it can provide full source code
even in the event that its dependencies become unavailable. Note that
the mirror repo is manually updated, so please contact me to make newer
libvfio-user commits available. If I become a bottleneck we can set up a
cronjob.

Updated scripts/meson-buildoptions.sh to match the meson_options.txt
change. Failure to do so can result in scripts/meson-buildoptions.sh
being modified by the build system later on and you end up with a dirty
working tree.
--Stefan]

Signed-off-by: Stefan Hajnoczi mailto:stefa...@redhat.com>>
---
MAINTAINERS |  1 +
meson_options.txt   |  2 ++
configure   | 17 +
meson.build | 23 ++-
.gitlab-ci.d/buildtest.yml  |  1 +
.gitmodules |  3 +++
Kconfig.host|  4 
hw/remote/Kconfig   |  4 
hw/remote/meson.build   |  2 ++
scripts/meson-buildoptions.sh   |  4 
subprojects/libvfio-user|  1 +
tests/docker/dockerfiles/centos8.docker |  2 ++
12 files changed, 63 insertions(+), 1 deletion(-)
create mode 16 subprojects/libvfio-user

diff --git a/MAINTAINERS b/MAINTAINERS
index 5ba93348aa..d0fcaf0edb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3642,6 +3642,7 @@ F: hw/remote/proxy-memory-listener.c
F: include/hw/remote/proxy-memory-listener.h
F: hw/remote/iohub.c
F: include/hw/remote/iohub.h
+F: subprojects/libvfio-user

EBPF:
M: Jason Wang mailto:jasow...@redhat.com>>
diff --git a/meson_options.txt b/meson_options.txt
index 0e8197386b..f3e2f22c1e 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -88,6 +88,8 @@ option('cfi_debug', type: 'boolean', value: 'false',
   description: 'Verbose errors in case of CFI violation')
option('multiprocess', type: 'feature', value: 'auto',
   description: 'Out of process device emulation support')
+option('vfio_user_server', type: 'feature', value: 'disabled',
+   description: 'vfio-user server support')
option('dbus_display', type: 'feature', value: 'auto',
   description: '-display dbus support')
option('tpm', type : 'feature', value : 'auto',
diff --git a/configure b/configure
index 4b12a8094c..c14e7f590a 100755
--- a/configure
+++ b/configure
@@ -315,6 +315,7 @@ meson_args=""
ninja=""
bindir="bin"
skip_meson=no
+vfio_user_server="disabled"

# The following Meson options are handled manually (still they
# are included in the automatically generated help message)
@@ -909,6 +910,10 @@ for opt do
  ;;
  --disable-blobs) meson_option_parse --disable-install-blobs ""
  ;;
+  --enable-vfio-user-server) vfio_user_server="enabled"
+  ;;
+  --disable-vfio-user-server) vfio_user_server="disabled"
+  ;;
  --enable-tcmalloc) meson_option_parse --enable-malloc=tcmalloc tcmalloc
  ;;
  --enable-jemalloc) meson_option_parse --enable-malloc=jemalloc jemalloc
@@ -2132,6 +2137,17 @@ write_container_target_makefile() {



+##
+# check for vfio_user_server
+
+case "$vfio_user_server" in
+  enabled )
+if test "$git_submodules_action" != "ignore"; then
+  git_submodules="${git_submodules} subprojects/libvfio-user"
+fi
+;;
+esac
+
##
# End of CC checks
# After here, no more

Re: [PATCH] target/sh4: Honor QEMU_LOG_FILENAME with QEMU_LOG=cpu

2022-07-25 Thread Peter Maydell

On Mon, 25 Jul 2022 at 15:32, Ilya Leoshkevich  wrote:
>
> When using QEMU_LOG=cpu on sh4, QEMU_LOG_FILENAME is partially ignored.
> Fix by using qemu_fprintf() instead of qemu_printf() in the respective
> places.
>
> Fixes: 90c84c560067 ("qom/cpu: Simplify how CPUClass:cpu_dump_state() prints")
> Signed-off-by: Ilya Leoshkevich 

Reviewed-by: Peter Maydell 

Looks like sh4 was the only target where 90c84c560067 introduced this bug.

thanks
-- PMM

[PATCH RESEND] tests/tcg/linux-test: Fix random hangs in test_socket

2022-07-25 Thread Ilya Leoshkevich

test_socket hangs randomly in connect(), especially when run without
qemu. Apparently the reason is that linux started treating backlog
value of 0 literally instead of rounding it up since v4.4 (commit
ef547f2ac16b).

So set it to 1 instead.

Signed-off-by: Ilya Leoshkevich 
---

This is a rebase of the previous submission:
https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg00095.html

 tests/tcg/multiarch/linux/linux-test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/tcg/multiarch/linux/linux-test.c 
b/tests/tcg/multiarch/linux/linux-test.c
index 019d8175ca..5a2a4f2258 100644
--- a/tests/tcg/multiarch/linux/linux-test.c
+++ b/tests/tcg/multiarch/linux/linux-test.c
@@ -263,7 +263,7 @@ static int server_socket(void)
 sockaddr.sin_port = htons(0); /* choose random ephemeral port) */
 sockaddr.sin_addr.s_addr = 0;
 chk_error(bind(fd, (struct sockaddr *), sizeof(sockaddr)));
-chk_error(listen(fd, 0));
+chk_error(listen(fd, 1));
 return fd;
 
 }
-- 
2.35.3

[PATCH] target/sh4: Honor QEMU_LOG_FILENAME with QEMU_LOG=cpu

2022-07-25 Thread Ilya Leoshkevich

When using QEMU_LOG=cpu on sh4, QEMU_LOG_FILENAME is partially ignored.
Fix by using qemu_fprintf() instead of qemu_printf() in the respective
places.

Fixes: 90c84c560067 ("qom/cpu: Simplify how CPUClass:cpu_dump_state() prints")
Signed-off-by: Ilya Leoshkevich 
---
 target/sh4/translate.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index f1b190e7cf..9aadaf52cd 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -171,16 +171,16 @@ void superh_cpu_dump_state(CPUState *cs, FILE *f, int 
flags)
 qemu_fprintf(f, "sgr=0x%08x dbr=0x%08x delayed_pc=0x%08x fpul=0x%08x\n",
  env->sgr, env->dbr, env->delayed_pc, env->fpul);
 for (i = 0; i < 24; i += 4) {
-qemu_printf("r%d=0x%08x r%d=0x%08x r%d=0x%08x r%d=0x%08x\n",
-   i, env->gregs[i], i + 1, env->gregs[i + 1],
-   i + 2, env->gregs[i + 2], i + 3, env->gregs[i + 3]);
+qemu_fprintf(f, "r%d=0x%08x r%d=0x%08x r%d=0x%08x r%d=0x%08x\n",
+ i, env->gregs[i], i + 1, env->gregs[i + 1],
+ i + 2, env->gregs[i + 2], i + 3, env->gregs[i + 3]);
 }
 if (env->flags & DELAY_SLOT) {
-qemu_printf("in delay slot (delayed_pc=0x%08x)\n",
-   env->delayed_pc);
+qemu_fprintf(f, "in delay slot (delayed_pc=0x%08x)\n",
+ env->delayed_pc);
 } else if (env->flags & DELAY_SLOT_CONDITIONAL) {
-qemu_printf("in conditional delay slot (delayed_pc=0x%08x)\n",
-   env->delayed_pc);
+qemu_fprintf(f, "in conditional delay slot (delayed_pc=0x%08x)\n",
+ env->delayed_pc);
 } else if (env->flags & DELAY_SLOT_RTE) {
 qemu_fprintf(f, "in rte delay slot (delayed_pc=0x%08x)\n",
  env->delayed_pc);
-- 
2.35.3

Re: [PATCH for-7.2 00/10] add hmp 'save-fdt' and 'info fdt' commands

2022-07-25 Thread Daniel P . Berrangé

On Mon, Jul 25, 2022 at 10:16:11AM -0300, Daniel Henrique Barboza wrote:
> 
> 
> On 7/25/22 06:11, Daniel P. Berrangé wrote:
> > On Fri, Jul 22, 2022 at 04:59:57PM -0300, Daniel Henrique Barboza wrote:
> > > Hi,
> > > 
> > > After dealing with a FDT element that isn't being shown in the userspace
> > > and having to shutdown the guest, dump the FDT using 'machine -dumpdtb' 
> > > and
> > > then using 'dtc' to see what was inside the FDT, I thought it was a good
> > > idea to add extra support for FDT handling in QEMU.
> > > 
> > > This series introduces 2 commands. 'fdt-save' behaves similar to what
> > > 'machine -dumpdtb' does, with the advantage of saving the FDT of a running
> > > guest on demand. This command is implemented in patch 03.
> > > 
> > > The second command, 'info fdt ' is more sophisticated. This
> > > command can print specific nodes and properties of the FDT. A few
> > > examples:
> > > 
> > > - print the /cpus/cpu@0 from an ARM 'virt' machine:
> > > 
> > > (qemu) info fdt /cpus/cpu@0
> > > /cpus/cpu@0 {
> > >  phandle = <0x8001>
> > >  reg = <0x0>
> > >  compatible = 'arm,cortex-a57'
> > >  device_type = 'cpu'
> > > }
> > > (qemu)
> > > 
> > > - print the device_type property of the interrupt-controller node of a
> > > pSeries machine:
> > > 
> > > (qemu) info fdt /interrupt-controller/device_type
> > > /interrupt-controller/device_type = 
> > > 'PowerPC-External-Interrupt-Presentation'
> > > (qemu)
> > 
> > Please don't add new HMP-only commands. These should be provided
> > as QMP commands, where the HMP is a tiny shim to the QMP.
> 
> I wasn't sure if this would be useful to be in QMP, but perhaps it's better to
> let QMP consumers to decide whether use it or not.

That's not a relevant question to consider. The end goal is for HMP
to be 100% implemented in terms of QMP commands. So if anything is
required for HMP, then by definition it is also required for QMP.

The only question is whether the QMP command should be marked stable
or unstable. If there's any doubt, that pushes towards the 'unstable'
side, such that we don't have to promise API compat

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

[PATCH v1 09/13] semihosting: Fix handling of buffer in TARGET_SYS_TMPNAM

2022-07-25 Thread Alex Bennée

From: Peter Maydell 

The TARGET_SYS_TMPNAM implementation has two bugs spotted by
Coverity:
 * confusion about whether 'len' has the length of the string
   including or excluding the terminating NUL means we
   lock_user() len bytes of memory but memcpy() len + 1 bytes
 * In the error-exit cases we forget to free() the buffer
   that asprintf() returned to us

Resolves: Coverity CID 1490285, 1490289
Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Message-Id: <20220719121110.225657-5-peter.mayd...@linaro.org>
Signed-off-by: Alex Bennée 
---
 semihosting/arm-compat-semi.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/semihosting/arm-compat-semi.c b/semihosting/arm-compat-semi.c
index d12288fc80..e741674238 100644
--- a/semihosting/arm-compat-semi.c
+++ b/semihosting/arm-compat-semi.c
@@ -504,16 +504,25 @@ void do_common_semihosting(CPUState *cs)
 GET_ARG(1);
 GET_ARG(2);
 len = asprintf(, "/tmp/qemu-%x%02x", getpid(), (int)arg1 & 0xff);
+if (len < 0) {
+common_semi_set_ret(cs, -1);
+break;
+}
+
+/* Allow for trailing NUL */
+len++;
 /* Make sure there's enough space in the buffer */
-if (len < 0 || len >= arg2) {
+if (len > arg2) {
+free(s);
 common_semi_set_ret(cs, -1);
 break;
 }
 p = lock_user(VERIFY_WRITE, arg0, len, 0);
 if (!p) {
+free(s);
 goto do_fault;
 }
-memcpy(p, s, len + 1);
+memcpy(p, s, len);
 unlock_user(p, arg0, len);
 free(s);
 common_semi_set_ret(cs, 0);
-- 
2.30.2

[PATCH v1 12/13] docs/devel: fix description of OBJECT_DECLARE_SIMPLE_TYPE

2022-07-25 Thread Alex Bennée

Since 30b5707c26 (qom: Remove module_obj_name parameter from
OBJECT_DECLARE* macros) we don't need the additional two parameters.
Fix the documentation.

Signed-off-by: Alex Bennée 
Reviewed-by: Richard Henderson 
Reviewed-by: Philippe Mathieu-Daudé 
Message-Id: <20220712103131.2006653-1-alex.ben...@linaro.org>
---
 docs/devel/qom.rst | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/devel/qom.rst b/docs/devel/qom.rst
index e5fe3597cd..0cf9a714f0 100644
--- a/docs/devel/qom.rst
+++ b/docs/devel/qom.rst
@@ -292,8 +292,7 @@ in the header file:
 .. code-block:: c
:caption: Declaring a simple type
 
-   OBJECT_DECLARE_SIMPLE_TYPE(MyDevice, my_device,
-  MY_DEVICE, DEVICE)
+   OBJECT_DECLARE_SIMPLE_TYPE(MyDevice, MY_DEVICE)
 
 This is equivalent to the following:
 
-- 
2.30.2

[PATCH v1 13/13] qemu-options: bring the kernel and image options together

2022-07-25 Thread Alex Bennée

How to control the booting of QEMU is often a source of confusion for
users. Bring the options that control this together in the manual
pages and add some verbiage to describe when each option is
appropriate. This attempts to codify some of the knowledge expressed
in:

  
https://stackoverflow.com/questions/58420670/qemu-bios-vs-kernel-vs-device-loader-file/58434837#58434837

Signed-off-by: Alex Bennée 
Reviewed-by: Peter Maydell 
Reviewed-by: Cédric Le Goater 
Message-Id: <20220707151037.397324-1-alex.ben...@linaro.org>
---
 qemu-options.hx | 96 +++--
 1 file changed, 78 insertions(+), 18 deletions(-)

diff --git a/qemu-options.hx b/qemu-options.hx
index 8e17c5064a..3f23a42fa8 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1585,13 +1585,6 @@ SRST
 Use file as SecureDigital card image.
 ERST
 
-DEF("pflash", HAS_ARG, QEMU_OPTION_pflash,
-"-pflash fileuse 'file' as a parallel flash image\n", QEMU_ARCH_ALL)
-SRST
-``-pflash file``
-Use file as a parallel flash image.
-ERST
-
 DEF("snapshot", 0, QEMU_OPTION_snapshot,
 "-snapshot   write to temporary files instead of disk image files\n",
 QEMU_ARCH_ALL)
@@ -3684,12 +3677,67 @@ DEFHEADING()
 
 #endif
 
-DEFHEADING(Linux/Multiboot boot specific:)
+DEFHEADING(Boot Image or Kernel specific:)
+SRST
+There are broadly 4 ways you can boot a system with QEMU.
+
+ - specify a firmware and let it control finding a kernel
+ - specify a firmware and pass a hint to the kernel to boot
+ - direct kernel image boot
+ - manually load files into the guest's address space
+
+The third method is useful for quickly testing kernels but as there is
+no firmware to pass configuration information to the kernel the
+hardware must either be probeable, the kernel built for the exact
+configuration or passed some configuration data (e.g. a DTB blob)
+which tells the kernel what drivers it needs. This exact details are
+often hardware specific.
+
+The final method is the most generic way of loading images into the
+guest address space and used mostly for ``bare metal`` type
+development where the reset vectors of the processor are taken into
+account.
+
+ERST
+
 SRST
-When using these options, you can use a given Linux or Multiboot kernel
-without installing it in the disk image. It can be useful for easier
-testing of various kernels.
 
+For x86 machines and some other architectures ``-bios`` will generally
+do the right thing with whatever it is given. For other machines the
+more strict ``-pflash`` option needs an image that is sized for the
+flash device for the given machine type.
+
+Please see the :ref:`system-targets-ref` section of the manual for
+more detailed documentation.
+
+ERST
+
+DEF("bios", HAS_ARG, QEMU_OPTION_bios, \
+"-bios file  set the filename for the BIOS\n", QEMU_ARCH_ALL)
+SRST
+``-bios file``
+Set the filename for the BIOS.
+ERST
+
+DEF("pflash", HAS_ARG, QEMU_OPTION_pflash,
+"-pflash fileuse 'file' as a parallel flash image\n", QEMU_ARCH_ALL)
+SRST
+``-pflash file``
+Use file as a parallel flash image.
+ERST
+
+SRST
+
+The kernel options were designed to work with Linux kernels although
+other things (like hypervisors) can be packaged up as a kernel
+executable image. The exact format of a executable image is usually
+architecture specific.
+
+The way in which the kernel is started (what address it is loaded at,
+what if any information is passed to it via CPU registers, the state
+of the hardware when it is started, and so on) is also architecture
+specific. Typically it follows the specification laid down by the
+Linux kernel for how kernels for that architecture must be started.
 
 ERST
 
@@ -3729,6 +3777,25 @@ SRST
 kernel on boot.
 ERST
 
+SRST
+
+Finally you can also manually load images directly into the address
+space of the guest. This is most useful for developers who already
+know the layout of their guest and take care to ensure something sane
+will happen when the reset vector executes.
+
+The generic loader can be invoked by using the loader device:
+
+``-device 
loader,addr=,data=,data-len=[,data-be=][,cpu-num=]``
+
+there is also the guest loader which operates in a similar way but
+tweaks the DTB so a hypervisor loaded via ``-kernel`` can find where
+the guest image is:
+
+``-device 
guest-loader,addr=[,kernel=,[bootargs=]][,initrd=]``
+
+ERST
+
 DEFHEADING()
 
 DEFHEADING(Debug/Expert options:)
@@ -4179,13 +4246,6 @@ SRST
 To list all the data directories, use ``-L help``.
 ERST
 
-DEF("bios", HAS_ARG, QEMU_OPTION_bios, \
-"-bios file  set the filename for the BIOS\n", QEMU_ARCH_ALL)
-SRST
-``-bios file``
-Set the filename for the BIOS.
-ERST
-
 DEF("enable-kvm", 0, QEMU_OPTION_enable_kvm, \
 "-enable-kvm enable KVM full virtualization support\n",
 QEMU_ARCH_ARM | QEMU_ARCH_I386 | QEMU_ARCH_MIPS | QEMU_ARCH_PPC |
-- 
2.30.2

[PATCH v1 10/13] qapi: Add exit-failure PanicAction

2022-07-25 Thread Alex Bennée

From: Ilya Leoshkevich 

Currently QEMU exits with code 0 on both panic an shutdown. For tests
it is useful to return 1 on panic, so that it counts as a test
failure.

Introduce a new exit-failure PanicAction that makes main() return
EXIT_FAILURE. Tests can use -action panic=exit-failure option to
activate this behavior.

Signed-off-by: Ilya Leoshkevich 
Reviewed-by: David Hildenbrand 
Reviewed-by: Richard Henderson 
Message-Id: <20220722233614.7254-2-...@linux.ibm.com>
Signed-off-by: Alex Bennée 
---
 qapi/run-state.json |  4 +++-
 include/sysemu/sysemu.h |  2 +-
 softmmu/main.c  |  6 --
 softmmu/runstate.c  | 17 +
 qemu-options.hx |  2 +-
 5 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/qapi/run-state.json b/qapi/run-state.json
index 6e2162d7b3..d42c370c4f 100644
--- a/qapi/run-state.json
+++ b/qapi/run-state.json
@@ -364,10 +364,12 @@
 #
 # @shutdown: Shutdown the VM and exit, according to the shutdown action
 #
+# @exit-failure: Shutdown the VM and exit with nonzero status
+#
 # Since: 6.0
 ##
 { 'enum': 'PanicAction',
-  'data': [ 'pause', 'shutdown', 'none' ] }
+  'data': [ 'pause', 'shutdown', 'exit-failure', 'none' ] }
 
 ##
 # @watchdog-set-action:
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 812f66a31a..31aa45160b 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -103,7 +103,7 @@ void qemu_boot_set(const char *boot_order, Error **errp);
 bool defaults_enabled(void);
 
 void qemu_init(int argc, char **argv, char **envp);
-void qemu_main_loop(void);
+int qemu_main_loop(void);
 void qemu_cleanup(void);
 
 extern QemuOptsList qemu_legacy_drive_opts;
diff --git a/softmmu/main.c b/softmmu/main.c
index c00432ff09..1b675a8c03 100644
--- a/softmmu/main.c
+++ b/softmmu/main.c
@@ -32,11 +32,13 @@
 
 int qemu_main(int argc, char **argv, char **envp)
 {
+int status;
+
 qemu_init(argc, argv, envp);
-qemu_main_loop();
+status = qemu_main_loop();
 qemu_cleanup();
 
-return 0;
+return status;
 }
 
 #ifndef CONFIG_COCOA
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index 168e1b78a0..1e68680b9d 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -482,7 +482,8 @@ void qemu_system_guest_panicked(GuestPanicInformation *info)
 qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_PAUSE,
 !!info, info);
 vm_stop(RUN_STATE_GUEST_PANICKED);
-} else if (panic_action == PANIC_ACTION_SHUTDOWN) {
+} else if (panic_action == PANIC_ACTION_SHUTDOWN ||
+   panic_action == PANIC_ACTION_EXIT_FAILURE) {
 qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_POWEROFF,
!!info, info);
 vm_stop(RUN_STATE_GUEST_PANICKED);
@@ -662,7 +663,7 @@ void qemu_system_debug_request(void)
 qemu_notify_event();
 }
 
-static bool main_loop_should_exit(void)
+static bool main_loop_should_exit(int *status)
 {
 RunState r;
 ShutdownCause request;
@@ -680,6 +681,10 @@ static bool main_loop_should_exit(void)
 if (shutdown_action == SHUTDOWN_ACTION_PAUSE) {
 vm_stop(RUN_STATE_SHUTDOWN);
 } else {
+if (request == SHUTDOWN_CAUSE_GUEST_PANIC &&
+panic_action == PANIC_ACTION_EXIT_FAILURE) {
+*status = EXIT_FAILURE;
+}
 return true;
 }
 }
@@ -715,12 +720,14 @@ static bool main_loop_should_exit(void)
 return false;
 }
 
-void qemu_main_loop(void)
+int qemu_main_loop(void)
 {
+int status = EXIT_SUCCESS;
 #ifdef CONFIG_PROFILER
 int64_t ti;
 #endif
-while (!main_loop_should_exit()) {
+
+while (!main_loop_should_exit()) {
 #ifdef CONFIG_PROFILER
 ti = profile_getclock();
 #endif
@@ -729,6 +736,8 @@ void qemu_main_loop(void)
 dev_time += profile_getclock() - ti;
 #endif
 }
+
+return status;
 }
 
 void qemu_add_exit_notifier(Notifier *notify)
diff --git a/qemu-options.hx b/qemu-options.hx
index 79e00916a1..8e17c5064a 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4239,7 +4239,7 @@ DEF("action", HAS_ARG, QEMU_OPTION_action,
 "   action when guest reboots [default=reset]\n"
 "-action shutdown=poweroff|pause\n"
 "   action when guest shuts down [default=poweroff]\n"
-"-action panic=pause|shutdown|none\n"
+"-action panic=pause|shutdown|exit-failure|none\n"
 "   action when guest panics [default=shutdown]\n"
 "-action watchdog=reset|shutdown|poweroff|inject-nmi|pause|debug|none\n"
 "   action when watchdog fires [default=reset]\n",
-- 
2.30.2

[PATCH v1 11/13] tests/tcg/s390x: Test unaligned accesses to lowcore

2022-07-25 Thread Alex Bennée

From: Ilya Leoshkevich 

Add a small test to avoid regressions.

Signed-off-by: Ilya Leoshkevich 
Acked-by: Richard Henderson 
Message-Id: <20220722233614.7254-3-...@linux.ibm.com>
Signed-off-by: Alex Bennée 
---
 tests/tcg/s390x/Makefile.softmmu-target |  9 +
 tests/tcg/s390x/unaligned-lowcore.S | 19 +++
 2 files changed, 28 insertions(+)
 create mode 100644 tests/tcg/s390x/Makefile.softmmu-target
 create mode 100644 tests/tcg/s390x/unaligned-lowcore.S

diff --git a/tests/tcg/s390x/Makefile.softmmu-target 
b/tests/tcg/s390x/Makefile.softmmu-target
new file mode 100644
index 00..a34fa68473
--- /dev/null
+++ b/tests/tcg/s390x/Makefile.softmmu-target
@@ -0,0 +1,9 @@
+S390X_SRC=$(SRC_PATH)/tests/tcg/s390x
+VPATH+=$(S390X_SRC)
+QEMU_OPTS=-action panic=exit-failure -kernel
+
+%: %.S
+   $(CC) -march=z13 -m64 -nostartfiles -static -Wl,-Ttext=0 \
+   -Wl,--build-id=none $< -o $@
+
+TESTS += unaligned-lowcore
diff --git a/tests/tcg/s390x/unaligned-lowcore.S 
b/tests/tcg/s390x/unaligned-lowcore.S
new file mode 100644
index 00..246b517f11
--- /dev/null
+++ b/tests/tcg/s390x/unaligned-lowcore.S
@@ -0,0 +1,19 @@
+.org 0x1D0 /* program new PSW */
+.quad 0x2, 0   /* disabled wait */
+.org 0x200 /* lowcore padding */
+
+.globl _start
+_start:
+lctlg %c0,%c0,_c0
+vst %v0,_unaligned
+lpswe quiesce_psw
+
+.align 8
+quiesce_psw:
+.quad 0x2,0xfff/* see is_special_wait_psw() */
+_c0:
+.quad 0x1006   /* lowcore protection, AFP, VX */
+
+.byte 0
+_unaligned:
+.octa 0
-- 
2.30.2

[PATCH v1 05/13] .gitlab-ci.d/windows.yml: Enable native Windows symlink

2022-07-25 Thread Alex Bennée

From: Bin Meng 

The following error message was seen during the configure:

  "ln: failed to create symbolic link
  'x86_64-softmmu/qemu-system-x86_64.exe': No such file or directory"

By default the MSYS environment variable is not defined, so the runtime
behavior of winsymlinks is: if  does not exist, 'ln -s' fails.
At the configure phase, the qemu-system-x86_64.exe has not been built
so creation of the symbolic link fails hence the error message.

Set winsymlinks to 'native' whose behavior is most similar to the
behavior of 'ln -s' on *nix, that is:

  a) if native symlinks are enabled, and whether  exists
 or not, creates  as a native Windows symlink;
  b) else if native symlinks are not enabled, and whether 
 exists or not, 'ln -s' creates as a Windows shortcut file.

Signed-off-by: Bin Meng 
Message-Id: <20220725123000.807608-1-bmeng...@gmail.com>
---
 .gitlab-ci.d/windows.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitlab-ci.d/windows.yml b/.gitlab-ci.d/windows.yml
index 1b2ede49e1..0b9572a8a3 100644
--- a/.gitlab-ci.d/windows.yml
+++ b/.gitlab-ci.d/windows.yml
@@ -57,6 +57,7 @@ msys2-64bit:
   mingw-w64-x86_64-zstd "
   - $env:CHERE_INVOKING = 'yes'  # Preserve the current working directory
   - $env:MSYSTEM = 'MINGW64' # Start a 64 bit Mingw environment
+  - $env:MSYS = 'winsymlinks:native' # Enable native Windows symlink
   - .\msys64\usr\bin\bash -lc './configure --target-list=x86_64-softmmu
   --enable-capstone --without-default-devices'
   - .\msys64\usr\bin\bash -lc "sed -i '/^ROMS=/d' build/config-host.mak"
@@ -89,6 +90,7 @@ msys2-32bit:
   mingw-w64-i686-usbredir "
   - $env:CHERE_INVOKING = 'yes'  # Preserve the current working directory
   - $env:MSYSTEM = 'MINGW32' # Start a 32-bit MinG environment
+  - $env:MSYS = 'winsymlinks:native' # Enable native Windows symlink
   - mkdir output
   - cd output
   - ..\msys64\usr\bin\bash -lc "../configure --target-list=ppc64-softmmu"
-- 
2.30.2

[PATCH v1 07/13] semihosting: Don't copy buffer after console_write()

2022-07-25 Thread Alex Bennée

From: Peter Maydell 

The console_write() semihosting function outputs guest data from a
buffer; it doesn't update that buffer.  It therefore doesn't need to
pass a length value to unlock_user(), but can pass 0, meaning "do not
copy any data back to the guest memory".

Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Message-Id: <20220719121110.225657-3-peter.mayd...@linaro.org>
Signed-off-by: Alex Bennée 
---
 semihosting/syscalls.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/semihosting/syscalls.c b/semihosting/syscalls.c
index 4847f66c02..508a0ad88c 100644
--- a/semihosting/syscalls.c
+++ b/semihosting/syscalls.c
@@ -627,7 +627,7 @@ static void console_write(CPUState *cs, 
gdb_syscall_complete_cb complete,
 }
 ret = qemu_semihosting_console_write(ptr, len);
 complete(cs, ret ? ret : -1, ret ? 0 : EIO);
-unlock_user(ptr, buf, ret);
+unlock_user(ptr, buf, 0);
 }
 
 static void console_fstat(CPUState *cs, gdb_syscall_complete_cb complete,
-- 
2.30.2

[PATCH v1 02/13] gitlab: show testlog.txt contents when cirrus/custom-runner jobs fail

2022-07-25 Thread Alex Bennée

From: Daniel P. Berrangé 

When tests fail meson just displays a summary and tells you to look at
the testlog.txt file for details. The native jobs on shared runners
publish testlog.txt as an artifact. For the Cirrus jobs and custom
runner jobs this is not currently possible. The best we can do is cat
the log contents on failure, to give maintainers a fighting chance
of diagnosing the problem.

Reviewed-by: Thomas Huth 
Signed-off-by: Daniel P. Berrangé 
Message-Id: <20220722130431.2319019-3-berra...@redhat.com>
Signed-off-by: Alex Bennée 
---
 .gitlab-ci.d/cirrus/build.yml|  3 ++-
 .../custom-runners/centos-stream-8-x86_64.yml|  2 ++
 .gitlab-ci.d/custom-runners/ubuntu-20.04-aarch32.yml |  2 ++
 .gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml | 12 
 .gitlab-ci.d/custom-runners/ubuntu-20.04-s390x.yml   | 12 
 5 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci.d/cirrus/build.yml b/.gitlab-ci.d/cirrus/build.yml
index c555f5d36e..7ef6af8d33 100644
--- a/.gitlab-ci.d/cirrus/build.yml
+++ b/.gitlab-ci.d/cirrus/build.yml
@@ -32,5 +32,6 @@ build_task:
 - $MAKE -j$(sysctl -n hw.ncpu)
 - for TARGET in $TEST_TARGETS ;
   do
-$MAKE -j$(sysctl -n hw.ncpu) $TARGET V=1 ;
+$MAKE -j$(sysctl -n hw.ncpu) $TARGET V=1
+|| { cat meson-logs/testlog.txt; exit 1; } ;
   done
diff --git a/.gitlab-ci.d/custom-runners/centos-stream-8-x86_64.yml 
b/.gitlab-ci.d/custom-runners/centos-stream-8-x86_64.yml
index 49aa703f55..068b0c4335 100644
--- a/.gitlab-ci.d/custom-runners/centos-stream-8-x86_64.yml
+++ b/.gitlab-ci.d/custom-runners/centos-stream-8-x86_64.yml
@@ -23,6 +23,8 @@ centos-stream-8-x86_64:
  - mkdir build
  - cd build
  - ../scripts/ci/org.centos/stream/8/x86_64/configure
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make -j"$JOBS"
  - make NINJA=":" check
+   || { cat meson-logs/testlog.txt; exit 1; } ;
  - ../scripts/ci/org.centos/stream/8/x86_64/test-avocado
diff --git a/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch32.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch32.yml
index 1998460d06..cbfa9cc164 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch32.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch32.yml
@@ -19,5 +19,7 @@ ubuntu-20.04-aarch32-all:
  - mkdir build
  - cd build
  - ../configure --cross-prefix=arm-linux-gnueabihf-
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc --ignore=40`
  - make --output-sync -j`nproc --ignore=40` check V=1
+   || { cat meson-logs/testlog.txt; exit 1; } ;
diff --git a/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml 
b/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml
index 65718a188a..3d878914e7 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-20.04-aarch64.yml
@@ -17,9 +17,12 @@ ubuntu-20.04-aarch64-all-linux-static:
  - mkdir build
  - cd build
  - ../configure --enable-debug --static --disable-system --disable-glusterfs 
--disable-libssh
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc --ignore=40`
  - make --output-sync -j`nproc --ignore=40` check V=1
+   || { cat meson-logs/testlog.txt; exit 1; } ;
  - make --output-sync -j`nproc --ignore=40` check-tcg V=1
+   || { cat meson-logs/testlog.txt; exit 1; } ;
 
 ubuntu-20.04-aarch64-all:
  needs: []
@@ -38,8 +41,10 @@ ubuntu-20.04-aarch64-all:
  - mkdir build
  - cd build
  - ../configure --disable-libssh
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc --ignore=40`
  - make --output-sync -j`nproc --ignore=40` check V=1
+   || { cat meson-logs/testlog.txt; exit 1; } ;
 
 ubuntu-20.04-aarch64-alldbg:
  needs: []
@@ -54,9 +59,11 @@ ubuntu-20.04-aarch64-alldbg:
  - mkdir build
  - cd build
  - ../configure --enable-debug --disable-libssh
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make clean
  - make --output-sync -j`nproc --ignore=40`
  - make --output-sync -j`nproc --ignore=40` check V=1
+   || { cat meson-logs/testlog.txt; exit 1; } ;
 
 ubuntu-20.04-aarch64-clang:
  needs: []
@@ -75,8 +82,10 @@ ubuntu-20.04-aarch64-clang:
  - mkdir build
  - cd build
  - ../configure --disable-libssh --cc=clang-10 --cxx=clang++-10 
--enable-sanitizers
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc --ignore=40`
  - make --output-sync -j`nproc --ignore=40` check V=1
+   || { cat meson-logs/testlog.txt; exit 1; } ;
 
 ubuntu-20.04-aarch64-tci:
  needs: []
@@ -95,6 +104,7 @@ ubuntu-20.04-aarch64-tci:
  - mkdir build
  - cd build
  - ../configure --disable-libssh --enable-tcg-interpreter
+   || { cat config.log meson-logs/meson-log.txt; exit 1; }
  - make --output-sync -j`nproc --ignore=40`
 
 ubuntu-20.04-aarch64-notcg:
@@ -114,5 +124,7 @@ ubuntu-20.04-aarch64-notcg:
  - mkdir build
  - cd build
  - ../configure --disable-libssh

[PATCH v1 06/13] semihosting: Don't return negative values on qemu_semihosting_console_write() failure

2022-07-25 Thread Alex Bennée

From: Peter Maydell 

The documentation comment for qemu_semihosting_console_write() says
 * Returns: number of bytes written -- this should only ever be short
 * on some sort of i/o error.

and the callsites rely on this.  However, the implementation code
path which sends console output to a chardev doesn't honour this,
and will return negative values on error.  Bring it into line with
the other implementation codepaths and the documentation, so that
it returns 0 on error.

Spotted by Coverity, because console_write() passes the return value
to unlock_user(), which doesn't accept a negative length.

Resolves: Coverity CID 1490288
Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Message-Id: <20220719121110.225657-2-peter.mayd...@linaro.org>
Signed-off-by: Alex Bennée 
---
 semihosting/console.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/semihosting/console.c b/semihosting/console.c
index 5b1ec0a1c3..0f976fe8cb 100644
--- a/semihosting/console.c
+++ b/semihosting/console.c
@@ -111,7 +111,8 @@ int qemu_semihosting_console_read(CPUState *cs, void *buf, 
int len)
 int qemu_semihosting_console_write(void *buf, int len)
 {
 if (console.chr) {
-return qemu_chr_write_all(console.chr, (uint8_t *)buf, len);
+int r = qemu_chr_write_all(console.chr, (uint8_t *)buf, len);
+return r < 0 ? 0 : r;
 } else {
 return fwrite(buf, 1, len, stderr);
 }
-- 
2.30.2

[PATCH v1 04/13] .cirrus.yml: Change winsymlinks to 'native'

2022-07-25 Thread Alex Bennée

From: Bin Meng 

At present winsymlinks is set to 'nativestrict', and its behavior is:

  a) if native symlinks are enabled and  exists, creates
  as a native Windows symlink;
  b) else if native symlinks are not enabled or if  does
 not exist, 'ln -s' fails.

This causes the following error message was seen during the configure:

  "ln: failed to create symbolic link
  'x86_64-softmmu/qemu-system-x86_64.exe': No such file or directory"

Change winsymlinks to 'native' whose behavior is most similar to the
behavior of 'ln -s' on *nix, that is:

  a) if native symlinks are enabled, and whether  exists
 or not, creates  as a native Windows symlink;
  b) else if native symlinks are not enabled, and whether 
 exists or not, 'ln -s' creates as a Windows shortcut file.

Signed-off-by: Bin Meng 
Acked-by: Thomas Huth 
Reviewed-by: Yonggang Luo 
Message-Id: <20220719161230.766063-1-bmeng...@gmail.com>
Signed-off-by: Alex Bennée 
---
 .cirrus.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 20843a420c..eac39024f2 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -10,7 +10,7 @@ windows_msys2_task:
 memory: 8G
   env:
 CIRRUS_SHELL: powershell
-MSYS: winsymlinks:nativestrict
+MSYS: winsymlinks:native
 MSYSTEM: MINGW64
 MSYS2_URL: 
https://github.com/msys2/msys2-installer/releases/download/2022-05-03/msys2-base-x86_64-20220503.sfx.exe
 MSYS2_FINGERPRINT: 0
-- 
2.30.2

[PATCH v1 03/13] gitlab: drop 'containers-layer2' stage

2022-07-25 Thread Alex Bennée

From: Daniel P. Berrangé 

Since we express dependancies via a 'needs' clause, we don't need to
split container builds into separate stages. GitLab happily lets jobs
depend on other jobs in the same stage and will run them when possible.

Acked-by: Thomas Huth 
Signed-off-by: Daniel P. Berrangé 
Message-Id: <20220722130431.2319019-4-berra...@redhat.com>
Signed-off-by: Alex Bennée 
---
 .gitlab-ci.d/container-cross.yml | 24 
 .gitlab-ci.d/stages.yml  |  1 -
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/.gitlab-ci.d/container-cross.yml b/.gitlab-ci.d/container-cross.yml
index b7963498a3..505b267542 100644
--- a/.gitlab-ci.d/container-cross.yml
+++ b/.gitlab-ci.d/container-cross.yml
@@ -1,20 +1,20 @@
 alpha-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-alpha-cross
 
 amd64-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-amd64-cross
 
 amd64-debian-user-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-all-test-cross
@@ -65,21 +65,21 @@ hexagon-cross-container:
 
 hppa-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-hppa-cross
 
 m68k-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-m68k-cross
 
 mips64-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-mips64-cross
@@ -92,7 +92,7 @@ mips64el-debian-cross-container:
 
 mips-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-mips-cross
@@ -105,7 +105,7 @@ mipsel-debian-cross-container:
 
 powerpc-test-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian11-container']
   variables:
 NAME: debian-powerpc-test-cross
@@ -127,7 +127,7 @@ riscv64-debian-cross-container:
 # we can however build TCG tests using a non-sid base
 riscv64-debian-test-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian11-container']
   variables:
 NAME: debian-riscv64-test-cross
@@ -140,21 +140,21 @@ s390x-debian-cross-container:
 
 sh4-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-sh4-cross
 
 sparc64-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-sparc64-cross
 
 tricore-debian-cross-container:
   extends: .container_job_template
-  stage: containers-layer2
+  stage: containers
   needs: ['amd64-debian10-container']
   variables:
 NAME: debian-tricore-cross
diff --git a/.gitlab-ci.d/stages.yml b/.gitlab-ci.d/stages.yml
index f50826018d..f92f57a27d 100644
--- a/.gitlab-ci.d/stages.yml
+++ b/.gitlab-ci.d/stages.yml
@@ -3,6 +3,5 @@
 #  - test (for test stages, using build artefacts from a build stage)
 stages:
   - containers
-  - containers-layer2
   - build
   - test
-- 
2.30.2

[PATCH v1 01/13] tests: refresh to latest libvirt-ci module

2022-07-25 Thread Alex Bennée

From: Daniel P. Berrangé 

Notable changes:

  - libvirt-ci source tree was re-arranged, so the script we
run now lives in a bin/ sub-dir

  - opensuse 15.2 is replaced by opensuse 15.3

  - libslirp is temporarily dropped on opensuse as the
libslirp-version.h is broken

 https://bugzilla.opensuse.org/show_bug.cgi?id=1201551

  - The incorrectly named python3-virtualenv module was
changed to python3-venv, but most distros don't need
any package as 'venv' is a standard part of python

  - glibc-static was renamed to libc-static, to reflect
fact that it isn't going to be glibc on all distros

  - The cmocka/json-c deps that were manually added to
the centos dockerfile and are now consistently added
to all targets

Acked-by: Thomas Huth 
Signed-off-by: Daniel P. Berrangé 
Message-Id: <20220722130431.2319019-2-berra...@redhat.com>
Signed-off-by: Alex Bennée 
---
 .gitlab-ci.d/cirrus/freebsd-12.vars   | 3 +--
 .gitlab-ci.d/cirrus/freebsd-13.vars   | 3 +--
 .gitlab-ci.d/cirrus/macos-11.vars | 4 ++--
 tests/docker/dockerfiles/alpine.docker| 4 +++-
 tests/docker/dockerfiles/centos8.docker   | 6 +++---
 tests/docker/dockerfiles/debian-amd64.docker  | 2 ++
 tests/docker/dockerfiles/debian-arm64-cross.docker| 2 ++
 tests/docker/dockerfiles/debian-armel-cross.docker| 2 ++
 tests/docker/dockerfiles/debian-armhf-cross.docker| 2 ++
 tests/docker/dockerfiles/debian-mips64el-cross.docker | 2 ++
 tests/docker/dockerfiles/debian-mipsel-cross.docker   | 2 ++
 tests/docker/dockerfiles/debian-ppc64el-cross.docker  | 2 ++
 tests/docker/dockerfiles/debian-s390x-cross.docker| 2 ++
 tests/docker/dockerfiles/fedora.docker| 3 ++-
 tests/docker/dockerfiles/opensuse-leap.docker | 7 ---
 tests/docker/dockerfiles/ubuntu2004.docker| 2 ++
 tests/lcitool/libvirt-ci  | 2 +-
 tests/lcitool/projects/qemu.yml   | 6 --
 tests/lcitool/refresh | 4 ++--
 19 files changed, 41 insertions(+), 19 deletions(-)

diff --git a/.gitlab-ci.d/cirrus/freebsd-12.vars 
b/.gitlab-ci.d/cirrus/freebsd-12.vars
index f59263731f..8fa5a320e9 100644
--- a/.gitlab-ci.d/cirrus/freebsd-12.vars
+++ b/.gitlab-ci.d/cirrus/freebsd-12.vars
@@ -1,5 +1,4 @@
 # THIS FILE WAS AUTO-GENERATED
-# ... and then edited to fix py39, pending proper lcitool update.
 #
 #  $ lcitool variables freebsd-12 qemu
 #
@@ -12,6 +11,6 @@ MAKE='/usr/local/bin/gmake'
 NINJA='/usr/local/bin/ninja'
 PACKAGING_COMMAND='pkg'
 PIP3='/usr/local/bin/pip-3.8'
-PKGS='alsa-lib bash bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage 
ctags curl cyrus-sasl dbus diffutils dtc fusefs-libs3 gettext git glib gmake 
gnutls gsed gtk3 libepoxy libffi libgcrypt libjpeg-turbo libnfs libspice-server 
libssh libtasn1 llvm lzo2 meson ncurses nettle ninja opencv perl5 pixman 
pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme 
py39-virtualenv py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy 
spice-protocol tesseract texinfo usbredir virglrenderer vte3 zstd'
+PKGS='alsa-lib bash bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage 
cmocka ctags curl cyrus-sasl dbus diffutils dtc fusefs-libs3 gettext git glib 
gmake gnutls gsed gtk3 json-c libepoxy libffi libgcrypt libjpeg-turbo libnfs 
libspice-server libssh libtasn1 llvm lzo2 meson ncurses nettle ninja opencv 
perl5 pixman pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx 
py39-sphinx_rtd_theme py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy 
spice-protocol tesseract texinfo usbredir virglrenderer vte3 zstd'
 PYPI_PKGS=''
 PYTHON='/usr/local/bin/python3'
diff --git a/.gitlab-ci.d/cirrus/freebsd-13.vars 
b/.gitlab-ci.d/cirrus/freebsd-13.vars
index 40fc961398..8ed7e33a77 100644
--- a/.gitlab-ci.d/cirrus/freebsd-13.vars
+++ b/.gitlab-ci.d/cirrus/freebsd-13.vars
@@ -1,5 +1,4 @@
 # THIS FILE WAS AUTO-GENERATED
-# ... and then edited to fix py39, pending proper lcitool update.
 #
 #  $ lcitool variables freebsd-13 qemu
 #
@@ -12,6 +11,6 @@ MAKE='/usr/local/bin/gmake'
 NINJA='/usr/local/bin/ninja'
 PACKAGING_COMMAND='pkg'
 PIP3='/usr/local/bin/pip-3.8'
-PKGS='alsa-lib bash bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage 
ctags curl cyrus-sasl dbus diffutils dtc fusefs-libs3 gettext git glib gmake 
gnutls gsed gtk3 libepoxy libffi libgcrypt libjpeg-turbo libnfs libspice-server 
libssh libtasn1 llvm lzo2 meson ncurses nettle ninja opencv perl5 pixman 
pkgconf png py39-numpy py39-pillow py39-pip py39-sphinx py39-sphinx_rtd_theme 
py39-virtualenv py39-yaml python3 rpm2cpio sdl2 sdl2_image snappy 
spice-protocol tesseract texinfo usbredir virglrenderer vte3 zstd'
+PKGS='alsa-lib bash bzip2 ca_root_nss capstone4 ccache cdrkit-genisoimage 
cmocka ctags curl cyrus-sasl dbus diffutils dtc fusefs-libs3 gettext git glib 
gmake gnutls gsed gtk3 json-c libepoxy libffi libgcrypt libjpeg-turbo

[PATCH v1 08/13] semihosting: Check for errors on SET_ARG()

2022-07-25 Thread Alex Bennée

From: Peter Maydell 

The SET_ARG() macro returns an error indication; we check this in the
TARGET_SYS_GET_CMDLINE case but not when we use it in implementing
TARGET_SYS_ELAPSED.  Check for and handle the errors via the do_fault
codepath, and update the comment documenting the SET_ARG() and
GET_ARG() macros to note how they handle memory access errors.

Resolves: Coverity CID 1490287
Signed-off-by: Peter Maydell 
Reviewed-by: Richard Henderson 
Message-Id: <20220719121110.225657-4-peter.mayd...@linaro.org>
Signed-off-by: Alex Bennée 
---
 semihosting/arm-compat-semi.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/semihosting/arm-compat-semi.c b/semihosting/arm-compat-semi.c
index 1a1e2a6960..d12288fc80 100644
--- a/semihosting/arm-compat-semi.c
+++ b/semihosting/arm-compat-semi.c
@@ -171,6 +171,12 @@ static LayoutInfo common_semi_find_bases(CPUState *cs)
  * Read the input value from the argument block; fail the semihosting
  * call if the memory read fails. Eventually we could use a generic
  * CPUState helper function here.
+ * Note that GET_ARG() handles memory access errors by jumping to
+ * do_fault, so must be used as the first thing done in handling a
+ * semihosting call, to avoid accidentally leaking allocated resources.
+ * SET_ARG(), since it unavoidably happens late, instead returns an
+ * error indication (0 on success, non-0 for error) which the caller
+ * should check.
  */
 
 #define GET_ARG(n) do { \
@@ -739,10 +745,14 @@ void do_common_semihosting(CPUState *cs)
 case TARGET_SYS_ELAPSED:
 elapsed = get_clock() - clock_start;
 if (sizeof(target_ulong) == 8) {
-SET_ARG(0, elapsed);
+if (SET_ARG(0, elapsed)) {
+goto do_fault;
+}
 } else {
-SET_ARG(0, (uint32_t) elapsed);
-SET_ARG(1, (uint32_t) (elapsed >> 32));
+if (SET_ARG(0, (uint32_t) elapsed) ||
+SET_ARG(1, (uint32_t) (elapsed >> 32))) {
+goto do_fault;
+}
 }
 common_semi_set_ret(cs, 0);
 break;
-- 
2.30.2

[PATCH v1 00/13] fixes for 7.1 (testing, docs, semihosting)

2022-07-25 Thread Alex Bennée

As per usual I've opened up a tree for fixes for the 7.1 release. It
started as testing/next but I've included some fixes for semihosting
and a few minor doc updates as well. I'll roll a PR from this at the
end of the week (unless it doesn't meet the bar for missing rc0
tomorrow).

Alex Bennée (2):
  docs/devel: fix description of OBJECT_DECLARE_SIMPLE_TYPE
  qemu-options: bring the kernel and image options together

Bin Meng (2):
  .cirrus.yml: Change winsymlinks to 'native'
  .gitlab-ci.d/windows.yml: Enable native Windows symlink

Daniel P. Berrangé (3):
  tests: refresh to latest libvirt-ci module
  gitlab: show testlog.txt contents when cirrus/custom-runner jobs fail
  gitlab: drop 'containers-layer2' stage

Ilya Leoshkevich (2):
  qapi: Add exit-failure PanicAction
  tests/tcg/s390x: Test unaligned accesses to lowcore

Peter Maydell (4):
  semihosting: Don't return negative values on
qemu_semihosting_console_write() failure
  semihosting: Don't copy buffer after console_write()
  semihosting: Check for errors on SET_ARG()
  semihosting: Fix handling of buffer in TARGET_SYS_TMPNAM

 docs/devel/qom.rst|  3 +-
 qapi/run-state.json   |  4 +-
 include/sysemu/sysemu.h   |  2 +-
 semihosting/arm-compat-semi.c | 29 +-
 semihosting/console.c |  3 +-
 semihosting/syscalls.c|  2 +-
 softmmu/main.c|  6 +-
 softmmu/runstate.c| 17 +++-
 .cirrus.yml   |  2 +-
 .gitlab-ci.d/cirrus/build.yml |  3 +-
 .gitlab-ci.d/cirrus/freebsd-12.vars   |  3 +-
 .gitlab-ci.d/cirrus/freebsd-13.vars   |  3 +-
 .gitlab-ci.d/cirrus/macos-11.vars |  4 +-
 .gitlab-ci.d/container-cross.yml  | 24 ++---
 .../custom-runners/centos-stream-8-x86_64.yml |  2 +
 .../custom-runners/ubuntu-20.04-aarch32.yml   |  2 +
 .../custom-runners/ubuntu-20.04-aarch64.yml   | 12 +++
 .../custom-runners/ubuntu-20.04-s390x.yml | 12 +++
 .gitlab-ci.d/stages.yml   |  1 -
 .gitlab-ci.d/windows.yml  |  2 +
 qemu-options.hx   | 98 +++
 tests/docker/dockerfiles/alpine.docker|  4 +-
 tests/docker/dockerfiles/centos8.docker   |  6 +-
 tests/docker/dockerfiles/debian-amd64.docker  |  2 +
 .../dockerfiles/debian-arm64-cross.docker |  2 +
 .../dockerfiles/debian-armel-cross.docker |  2 +
 .../dockerfiles/debian-armhf-cross.docker |  2 +
 .../dockerfiles/debian-mips64el-cross.docker  |  2 +
 .../dockerfiles/debian-mipsel-cross.docker|  2 +
 .../dockerfiles/debian-ppc64el-cross.docker   |  2 +
 .../dockerfiles/debian-s390x-cross.docker |  2 +
 tests/docker/dockerfiles/fedora.docker|  3 +-
 tests/docker/dockerfiles/opensuse-leap.docker |  7 +-
 tests/docker/dockerfiles/ubuntu2004.docker|  2 +
 tests/lcitool/libvirt-ci  |  2 +-
 tests/lcitool/projects/qemu.yml   |  6 +-
 tests/lcitool/refresh |  4 +-
 tests/tcg/s390x/Makefile.softmmu-target   |  9 ++
 tests/tcg/s390x/unaligned-lowcore.S   | 19 
 39 files changed, 242 insertions(+), 70 deletions(-)
 create mode 100644 tests/tcg/s390x/Makefile.softmmu-target
 create mode 100644 tests/tcg/s390x/unaligned-lowcore.S

-- 
2.30.2

Re: [PATCH RFC 0/7] hostmem: NUMA-aware memory preallocation using ThreadContext

2022-07-25 Thread Michal Prívozník

On 7/21/22 14:07, David Hildenbrand wrote:
> This is a follow-up on "util: NUMA aware memory preallocation" [1] by
> Michal.

I've skimmed through patches and haven't spotted anything obviously
wrong. I'll test these more once I write libvirt support for them (which
I plan to do soon).

Michal

Re: [PATCH v7 01/14] mm: Add F_SEAL_AUTO_ALLOCATE seal to memfd

2022-07-25 Thread Chao Peng

On Thu, Jul 21, 2022 at 12:27:03PM +0200, Gupta, Pankaj wrote:
> 
> > > Normally, a write to unallocated space of a file or the hole of a sparse
> > > file automatically causes space allocation, for memfd, this equals to
> > > memory allocation. This new seal prevents such automatically allocating,
> > > either this is from a direct write() or a write on the previously
> > > mmap-ed area. The seal does not prevent fallocate() so an explicit
> > > fallocate() can still cause allocating and can be used to reserve
> > > memory.
> > > 
> > > This is used to prevent unintentional allocation from userspace on a
> > > stray or careless write and any intentional allocation should use an
> > > explicit fallocate(). One of the main usecases is to avoid memory double
> > > allocation for confidential computing usage where we use two memfds to
> > > back guest memory and at a single point only one memfd is alive and we
> > > want to prevent memory allocation for the other memfd which may have
> > > been mmap-ed previously. More discussion can be found at:
> > > 
> > >https://lkml.org/lkml/2022/6/14/1255
> > > 
> > > Suggested-by: Sean Christopherson 
> > > Signed-off-by: Chao Peng 
> > > ---
> > >   include/uapi/linux/fcntl.h |  1 +
> > >   mm/memfd.c |  3 ++-
> > >   mm/shmem.c | 16 ++--
> > >   3 files changed, 17 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> > > index 2f86b2ad6d7e..98bdabc8e309 100644
> > > --- a/include/uapi/linux/fcntl.h
> > > +++ b/include/uapi/linux/fcntl.h
> > > @@ -43,6 +43,7 @@
> > >   #define F_SEAL_GROW 0x0004  /* prevent file from growing */
> > >   #define F_SEAL_WRITE0x0008  /* prevent writes */
> > >   #define F_SEAL_FUTURE_WRITE 0x0010  /* prevent future writes while 
> > > mapped */
> > > +#define F_SEAL_AUTO_ALLOCATE 0x0020  /* prevent allocation for 
> > > writes */
> > 
> > Why only "on writes" and not "on reads". IIRC, shmem doesn't support the
> > shared zeropage, so you'll simply allocate a new page via read() or on
> > read faults.
> > 
> > 
> > Also, I *think* you can place pages via userfaultfd into shmem. Not sure
> > if that would count "auto alloc", but it would certainly bypass fallocate().
> 
> I was also thinking this at the same time, but for different reason:
> 
> "Want to populate private preboot memory with firmware payload", so was
> thinking userfaulftd could be an option as direct writes are restricted?

If that can be a side effect, I definitely glad to see it, though I'm
still not clear how userfaultfd can be particularly helpful for that.

Chao
> 
> Thanks,
> Pankaj
> 
> 
> 
>

Re: [PATCH v3] hw/pci/pci_bridge: ensure PCIe slots have only one slot

2022-07-25 Thread Vladimir Sementsov-Ogievskiy


On 7/20/22 14:04, Daniel P. Berrangé wrote:

On Wed, Jul 20, 2022 at 02:00:16PM +0300, Roman Kagan wrote:

On Wed, Jul 20, 2022 at 11:44:26AM +0100, Daniel P. Berrangé wrote:

On Wed, Jul 20, 2022 at 01:25:55PM +0300, Roman Kagan wrote:

It's possible to create non-working configurations by attaching a device
to a derivative of PCIe slot (pcie-root-port, ioh3420, etc) and
specifying a slot number other that zero, e.g.:

 -device pcie-root-port,id=s0,... \
 -device virtio-blk-pci,bus=s0,addr=4,...

Make QEMU reject such configurations and only allow addr=0 on the
secondary bus of a PCIe slot.


What do you mean by 'non-working' in this case.  The guest OS boots
OK, but I indeed don't see the device in the guest, but IIUC it was
said that was just because Linux doesn't scan for a non-zero slot.


Right.  I don't remember if it was Linux or firmware or both but indeed
at least Linux guests don't see devices if attached to a PCIe slot at
addr != 0.  (Which is kinda natural for a thing called "slot", isn't it?)


I vaguely recall there was an option to tell linux to scan all slots,
not just slot 0, not sure if that's applicable here.




That wouldn't be a broken config from QEMU's POV though, merely a
guest OS limitation ?


Strictly speaking it wouldn't, indeed.  But we've had created such a
configuration (due to a bug in our management layer) and spent
non-negligible time trying to figure out why the attached device didn't
appear in the guest.  So I thought it made sense to reject a
configuration which is known to confuse guests.  Doesn't it?


If a configuration is a permissible per the hardware design / spec, then
QEMU should generally allow it.  We don't want to constrain host side
configs based on the current limitations of guest OS whose behaviour can
change over time, or where a different guest OS may have a different POV.



If I understand correctly further answers the configration that we try to 
forbid is not permissible by PCIe spec. So seems valid to forbid it. We still 
need to mention specification in commit message and in the comment.

If we still afraid to forbid at once that invalid configuration that was 
previously allowed, may be we can proceed with some of the following:

1. Make a deprecation period of three releases and print only warning during 
this period. And forbid the invalid configuration three releases later. Still 
I'm not sure that someone will see these warnings in logs..

2. Make a boolean config option, like CONFIG_PCIE_STRICT, which forbids invalid 
configurations. This way we keep default behavior, that allows to test 
something unusual, but add an option that we can use for production solution 
where it's important to reduce number of possibilities to break the VM.

What do you think?

--
Best regards,
Vladimir

Re: [PATCH v7 01/14] mm: Add F_SEAL_AUTO_ALLOCATE seal to memfd

2022-07-25 Thread Chao Peng

On Thu, Jul 21, 2022 at 03:05:09PM +, Sean Christopherson wrote:
> On Thu, Jul 21, 2022, David Hildenbrand wrote:
> > On 21.07.22 11:44, David Hildenbrand wrote:
> > > On 06.07.22 10:20, Chao Peng wrote:
> > >> Normally, a write to unallocated space of a file or the hole of a sparse
> > >> file automatically causes space allocation, for memfd, this equals to
> > >> memory allocation. This new seal prevents such automatically allocating,
> > >> either this is from a direct write() or a write on the previously
> > >> mmap-ed area. The seal does not prevent fallocate() so an explicit
> > >> fallocate() can still cause allocating and can be used to reserve
> > >> memory.
> > >>
> > >> This is used to prevent unintentional allocation from userspace on a
> > >> stray or careless write and any intentional allocation should use an
> > >> explicit fallocate(). One of the main usecases is to avoid memory double
> > >> allocation for confidential computing usage where we use two memfds to
> > >> back guest memory and at a single point only one memfd is alive and we
> > >> want to prevent memory allocation for the other memfd which may have
> > >> been mmap-ed previously. More discussion can be found at:
> > >>
> > >>   https://lkml.org/lkml/2022/6/14/1255
> > >>
> > >> Suggested-by: Sean Christopherson 
> > >> Signed-off-by: Chao Peng 
> > >> ---
> > >>  include/uapi/linux/fcntl.h |  1 +
> > >>  mm/memfd.c |  3 ++-
> > >>  mm/shmem.c | 16 ++--
> > >>  3 files changed, 17 insertions(+), 3 deletions(-)
> > >>
> > >> diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> > >> index 2f86b2ad6d7e..98bdabc8e309 100644
> > >> --- a/include/uapi/linux/fcntl.h
> > >> +++ b/include/uapi/linux/fcntl.h
> > >> @@ -43,6 +43,7 @@
> > >>  #define F_SEAL_GROW 0x0004  /* prevent file from growing */
> > >>  #define F_SEAL_WRITE0x0008  /* prevent writes */
> > >>  #define F_SEAL_FUTURE_WRITE 0x0010  /* prevent future writes while 
> > >> mapped */
> > >> +#define F_SEAL_AUTO_ALLOCATE0x0020  /* prevent allocation for 
> > >> writes */
> > > 
> > > Why only "on writes" and not "on reads". IIRC, shmem doesn't support the
> > > shared zeropage, so you'll simply allocate a new page via read() or on
> > > read faults.
> > 
> > Correction: on read() we don't allocate a fresh page. But on read faults
> > we would. So this comment here needs clarification.
> 
> Not just the comment, the code too.  The intent of F_SEAL_AUTO_ALLOCATE is 
> very
> much to block _all_ implicit allocations (or maybe just fault-based 
> allocations
> if "implicit" is too broad of a description).

So maybe still your initial suggestion F_SEAL_FAULT_ALLOCATIONS? One
reason I don't like it is the write() ioctl also cause allocation and we
want to prevent it.

Chao

Re: driver type raw-xz supports discard=unmap?

2022-07-25 Thread Daniel P . Berrangé

On Mon, Jul 25, 2022 at 08:51:42AM -0400, Chris Murphy wrote:
> 
> 
> On Mon, Jul 25, 2022, at 5:13 AM, Daniel P. Berrangé wrote:
> > On Fri, Jul 22, 2022 at 04:03:52PM -0400, Chris Murphy wrote:
> >> Is this valid?
> >> 
> >> `
> >> 
> >> 
> >> `
> >> `/>
> >> `
> >> 
> >> I know type="raw" works fine, I'm wondering if there'd be any problem
> >> with type "raw-xz" combined with discards?
> >
> > This is libvirt configuration, so libvirt-us...@redhat.com is the better
> > list in general. That said, what is this 'raw-xz' type you refer to ?
> >
> > AFAIK, that is not a disk driver type that exists in either libvirt or
> > QEMU releases.
> 
> Huh, interesting. I have no idea then. I just happened to notice it in the 
> (libvirt) XML config that's used by oz.
> https://kojipkgs.fedoraproject.org//packages/Fedora-Workstation/Rawhide/20220721.n.0/images/libvirt-raw-xz-aarch64.xml

I don't see 'raw-xz' mentioned anywhere in the Oz code at

  https://github.com/clalancette/oz

was it a fork that's being used ?

> I've got no idea what happens if an invalid type is specified in
> the config. The VM's are definitely running despite this. I'll ask
> oz devs.

This is pretty surprising if they're actually running as it should
cause a fatal error message

error: unsupported configuration: unknown driver format value 'raw-xz'


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH] .gitlab-ci.d/windows.yml: Enable native Windows symlink

2022-07-25 Thread Alex Bennée



Bin Meng  writes:

> From: Bin Meng 
>
> The following error message was seen during the configure:
>
>   "ln: failed to create symbolic link
>   'x86_64-softmmu/qemu-system-x86_64.exe': No such file or directory"
>
> By default the MSYS environment variable is not defined, so the runtime
> behavior of winsymlinks is: if  does not exist, 'ln -s' fails.
> At the configure phase, the qemu-system-x86_64.exe has not been built
> so creation of the symbolic link fails hence the error message.
>
> Set winsymlinks to 'native' whose behavior is most similar to the
> behavior of 'ln -s' on *nix, that is:
>
>   a) if native symlinks are enabled, and whether  exists
>  or not, creates  as a native Windows symlink;
>   b) else if native symlinks are not enabled, and whether 
>  exists or not, 'ln -s' creates as a Windows shortcut file.
>
> Signed-off-by: Bin Meng 

I'm still seeing Windows build failures such as:

  https://gitlab.com/stsquad/qemu/-/jobs/2765579269

and

  https://gitlab.com/stsquad/qemu/-/jobs/2765579267

Any idea what's falling over?

> ---
>
>  .gitlab-ci.d/windows.yml | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/.gitlab-ci.d/windows.yml b/.gitlab-ci.d/windows.yml
> index 1b2ede49e1..0b9572a8a3 100644
> --- a/.gitlab-ci.d/windows.yml
> +++ b/.gitlab-ci.d/windows.yml
> @@ -57,6 +57,7 @@ msys2-64bit:
>mingw-w64-x86_64-zstd "
>- $env:CHERE_INVOKING = 'yes'  # Preserve the current working directory
>- $env:MSYSTEM = 'MINGW64' # Start a 64 bit Mingw environment
> +  - $env:MSYS = 'winsymlinks:native' # Enable native Windows symlink
>- .\msys64\usr\bin\bash -lc './configure --target-list=x86_64-softmmu
>--enable-capstone --without-default-devices'
>- .\msys64\usr\bin\bash -lc "sed -i '/^ROMS=/d' build/config-host.mak"
> @@ -89,6 +90,7 @@ msys2-32bit:
>mingw-w64-i686-usbredir "
>- $env:CHERE_INVOKING = 'yes'  # Preserve the current working directory
>- $env:MSYSTEM = 'MINGW32' # Start a 32-bit MinG environment
> +  - $env:MSYS = 'winsymlinks:native' # Enable native Windows symlink
>- mkdir output
>- cd output
>- ..\msys64\usr\bin\bash -lc "../configure --target-list=ppc64-softmmu"


-- 
Alex Bennée

Re: [PATCH v7 01/14] mm: Add F_SEAL_AUTO_ALLOCATE seal to memfd

2022-07-25 Thread Chao Peng

On Thu, Jul 21, 2022 at 11:44:11AM +0200, David Hildenbrand wrote:
> On 06.07.22 10:20, Chao Peng wrote:
> > Normally, a write to unallocated space of a file or the hole of a sparse
> > file automatically causes space allocation, for memfd, this equals to
> > memory allocation. This new seal prevents such automatically allocating,
> > either this is from a direct write() or a write on the previously
> > mmap-ed area. The seal does not prevent fallocate() so an explicit
> > fallocate() can still cause allocating and can be used to reserve
> > memory.
> > 
> > This is used to prevent unintentional allocation from userspace on a
> > stray or careless write and any intentional allocation should use an
> > explicit fallocate(). One of the main usecases is to avoid memory double
> > allocation for confidential computing usage where we use two memfds to
> > back guest memory and at a single point only one memfd is alive and we
> > want to prevent memory allocation for the other memfd which may have
> > been mmap-ed previously. More discussion can be found at:
> > 
> >   https://lkml.org/lkml/2022/6/14/1255
> > 
> > Suggested-by: Sean Christopherson 
> > Signed-off-by: Chao Peng 
> > ---
> >  include/uapi/linux/fcntl.h |  1 +
> >  mm/memfd.c |  3 ++-
> >  mm/shmem.c | 16 ++--
> >  3 files changed, 17 insertions(+), 3 deletions(-)
> > 
> > diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
> > index 2f86b2ad6d7e..98bdabc8e309 100644
> > --- a/include/uapi/linux/fcntl.h
> > +++ b/include/uapi/linux/fcntl.h
> > @@ -43,6 +43,7 @@
> >  #define F_SEAL_GROW0x0004  /* prevent file from growing */
> >  #define F_SEAL_WRITE   0x0008  /* prevent writes */
> >  #define F_SEAL_FUTURE_WRITE0x0010  /* prevent future writes while 
> > mapped */
> > +#define F_SEAL_AUTO_ALLOCATE   0x0020  /* prevent allocation for 
> > writes */
> 
> Why only "on writes" and not "on reads". IIRC, shmem doesn't support the
> shared zeropage, so you'll simply allocate a new page via read() or on
> read faults.

Right, it also prevents read faults.

> 
> 
> Also, I *think* you can place pages via userfaultfd into shmem. Not sure
> if that would count "auto alloc", but it would certainly bypass fallocate().

Userfaultfd sounds interesting, will further investigate it. But a rough
look sounds it only faults to usrspace for write/read fault, not
write()? Also sounds it operates on vma and userfaultfd_register() takes
mmap_lock which is what we want to avoid for frequent
register/unregister during private/shared memory conversion.

Chao
> 
> -- 
> Thanks,
> 
> David / dhildenb

[PATCH] linux-user: Implement stracing madvise()

2022-07-25 Thread Ilya Leoshkevich

The default implementation has several problems: the first argument is
not displayed as a pointer, making it harder to grep; the third
argument is not symbolized; and there are several extra unused
arguments.

Signed-off-by: Ilya Leoshkevich 
---
 linux-user/strace.c| 29 +
 linux-user/strace.list |  2 +-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/linux-user/strace.c b/linux-user/strace.c
index 7d882526da..32a6987844 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -2969,6 +2969,35 @@ print_stat(CPUArchState *cpu_env, const struct 
syscallname *name,
 #define print_lstat64   print_stat
 #endif
 
+#if defined(TARGET_NR_madvise)
+#define TARGET_MADV_NORMAL 0
+#define TARGET_MADV_RANDOM 1
+#define TARGET_MADV_SEQUENTIAL 2
+#define TARGET_MADV_WILLNEED 3
+#define TARGET_MADV_DONTNEED 4
+
+static struct enums madvise_advice[] = {
+ENUM_TARGET(MADV_NORMAL),
+ENUM_TARGET(MADV_RANDOM),
+ENUM_TARGET(MADV_SEQUENTIAL),
+ENUM_TARGET(MADV_WILLNEED),
+ENUM_TARGET(MADV_DONTNEED),
+ENUM_END,
+};
+
+static void
+print_madvise(CPUArchState *cpu_env, const struct syscallname *name,
+  abi_long arg0, abi_long arg1, abi_long arg2,
+  abi_long arg3, abi_long arg4, abi_long arg5)
+{
+print_syscall_prologue(name);
+print_pointer(arg0, 0);
+print_raw_param("%d", arg1, 0);
+print_enums(madvise_advice, arg2, 1);
+print_syscall_epilogue(name);
+}
+#endif
+
 #if defined(TARGET_NR_fstat) || defined(TARGET_NR_fstat64)
 static void
 print_fstat(CPUArchState *cpu_env, const struct syscallname *name,
diff --git a/linux-user/strace.list b/linux-user/strace.list
index 72e17b1acf..c93effdbc8 100644
--- a/linux-user/strace.list
+++ b/linux-user/strace.list
@@ -541,7 +541,7 @@
 { TARGET_NR_lstat64, "lstat64" , NULL, print_lstat64, NULL },
 #endif
 #ifdef TARGET_NR_madvise
-{ TARGET_NR_madvise, "madvise" , NULL, NULL, NULL },
+{ TARGET_NR_madvise, "madvise" , NULL, print_madvise, NULL },
 #endif
 #ifdef TARGET_NR_madvise1
 { TARGET_NR_madvise1, "madvise1" , NULL, NULL, NULL },
-- 
2.35.3

1 2 >

1 - 100 of 175 matches

Mail list logo