Re: [PATCH v1 2/2] virtio-net: virtio_net_flush_tx() check for per-queue reset

2023-01-28 Thread Xuan Zhuo
On Sun, 29 Jan 2023 14:23:21 +0800, Jason Wang  wrote:
> On Sun, Jan 29, 2023 at 10:52 AM Xuan Zhuo  wrote:
> >
> > Check whether it is per-queue reset state in virtio_net_flush_tx().
> >
> > Before per-queue reset, we need to recover async tx resources. At this
> > time, virtio_net_flush_tx() is called, but we should not try to send
> > new packets, so virtio_net_flush_tx() should check the current
> > per-queue reset state.
> >
> > Fixes: 7dc6be52 ("virtio-net: support queue reset")
> > Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1451
> > Reported-by: Alexander Bulekov 
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  hw/net/virtio-net.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > index 3ae909041a..fba6451a50 100644
> > --- a/hw/net/virtio-net.c
> > +++ b/hw/net/virtio-net.c
> > @@ -2627,7 +2627,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
> >  VirtQueueElement *elem;
> >  int32_t num_packets = 0;
> >  int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
> > -if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
> > +if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) ||
> > +virtio_queue_reset_state(q->tx_vq)) {
>
> We have other places that check DRIVER_OK do we need to check queue
> reset as well?

I checked it again. I still think that the other locations that check DRIVER_OK
do not need to check the queue reset.

Thanks.


>
> E.g:
> virtio_net_can_receive()
> virtio_net_tx_{timer|bh}()
>
> Thanks
>
> >  return num_packets;
> >  }
> >
> > --
> > 2.32.0.3.g01195cf9f
> >
>



Re: [PATCH v1 1/2] virtio: struct VirtQueue introduce reset

2023-01-28 Thread Xuan Zhuo
On Sun, 29 Jan 2023 02:37:22 -0500, "Michael S. Tsirkin"  
wrote:
> On Sun, Jan 29, 2023 at 03:15:16PM +0800, Xuan Zhuo wrote:
> > On Sun, 29 Jan 2023 02:12:36 -0500, "Michael S. Tsirkin"  
> > wrote:
> > >
> > > subject seems wrong.
> >
> >
> > Will fix.
> >
> >
> > >
> > > On Sun, Jan 29, 2023 at 10:51:49AM +0800, Xuan Zhuo wrote:
> > > > In the current design, we stop the device from operating on the vring
> > > > during per-queue reset by resetting the structure VirtQueue.
> > > >
> > > > But before the reset operation, when recycling some resources, we should
> > > > stop referencing new vring resources. For example, when recycling
> > > > virtio-net's asynchronous sending resources, virtio-net should be able
> > > > to perceive that the current queue is in the per-queue reset state, and
> > > > stop sending new packets from the tx queue.
> > > >
> > > > Signed-off-by: Xuan Zhuo 
> > > > ---
> > > >  hw/virtio/virtio.c | 15 +++
> > > >  include/hw/virtio/virtio.h |  1 +
> > > >  2 files changed, 16 insertions(+)
> > > >
> > > > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > > > index f35178f5fc..c954f2a2b3 100644
> > > > --- a/hw/virtio/virtio.c
> > > > +++ b/hw/virtio/virtio.c
> > > > @@ -142,6 +142,8 @@ struct VirtQueue
> > > >  /* Notification enabled? */
> > > >  bool notification;
> > > >
> > > > +bool disabled_by_reset;
> > > > +
> > > >  uint16_t queue_index;
> > > >
> > > >  unsigned int inuse;
> > > > @@ -2079,6 +2081,12 @@ void virtio_queue_reset(VirtIODevice *vdev, 
> > > > uint32_t queue_index)
> > > >  {
> > > >  VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
> > > >
> > > > +/*
> > > > + * Mark this queue is per-queue reset status. The device should 
> > > > release the
> > > > + * references of the vring, and not refer more new vring item.
> > >
> > > items
> >
> >
> > Will fix.
> >
> > >
> > > > + */
> > > > +vdev->vq[queue_index].disabled_by_reset = true;
> > > > +
> > > >  if (k->queue_reset) {
> > > >  k->queue_reset(vdev, queue_index);
> > > >  }
> > >
> > > can we set this after calling queue_reset? For symmetry with enable.
> >
> >
> > In fact,  queue_reset() will check it.
> >
>
> when you disable you first set it then disable.
> so when we are not 100% ready it's already set.
> when you enable you first clear it then enable.
> so we are not 100% ready but it's no longer set.
> inconsistent.
>
>
> > >
> > > > @@ -2102,11 +2110,18 @@ void virtio_queue_enable(VirtIODevice *vdev, 
> > > > uint32_t queue_index)
> > > >  }
> > > >  */
> > > >
> > > > +vdev->vq[queue_index].disabled_by_reset = false;
> > > > +
> > > >  if (k->queue_enable) {
> > > >  k->queue_enable(vdev, queue_index);
> > > >  }
> > > >  }
> > > >
> > > > +bool virtio_queue_reset_state(VirtQueue *vq)
> > > > +{
> > > > +return vq->disabled_by_reset;
> > > > +}
> > > > +
> > > >  void virtio_reset(void *opaque)
> > > >  {
> > > >  VirtIODevice *vdev = opaque;
> > > > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > > > index 77c6c55929..00e91af7c4 100644
> > > > --- a/include/hw/virtio/virtio.h
> > > > +++ b/include/hw/virtio/virtio.h
> > > > @@ -319,6 +319,7 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t 
> > > > val);
> > > >  void virtio_reset(void *opaque);
> > > >  void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index);
> > > >  void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index);
> > > > +bool virtio_queue_reset_state(VirtQueue *vq);
> > > >  void virtio_update_irq(VirtIODevice *vdev);
> > > >  int virtio_set_features(VirtIODevice *vdev, uint64_t val);
> > >
> > > OK I guess ... what about migration. This state won't be
> > > set correctly will it?
> >
> > I think it has no effect. After the reset, there is actually no state. We 
> > can
> > migrate.
> >
> > The current variable is only used by ->queue_reset().
> >
> > Thanks.
> >
>
> Yea maybe it works for this bug but ... yack. This means the state has
> no logic consistency.  It's just there because you found a bug and
> wanted to fix it.
> An ultra specific
>   bool this_weird_state_fuzzer_gets_in_issue_1451;
> is hard to maintain, not happy :(


I agree.


Thanks.


>
>
> > >
> > >
> > > >
> > > > --
> > > > 2.32.0.3.g01195cf9f
> > >
>



Re: [PATCH v1 1/2] virtio: struct VirtQueue introduce reset

2023-01-28 Thread Michael S. Tsirkin
On Sun, Jan 29, 2023 at 03:15:16PM +0800, Xuan Zhuo wrote:
> On Sun, 29 Jan 2023 02:12:36 -0500, "Michael S. Tsirkin"  
> wrote:
> >
> > subject seems wrong.
> 
> 
> Will fix.
> 
> 
> >
> > On Sun, Jan 29, 2023 at 10:51:49AM +0800, Xuan Zhuo wrote:
> > > In the current design, we stop the device from operating on the vring
> > > during per-queue reset by resetting the structure VirtQueue.
> > >
> > > But before the reset operation, when recycling some resources, we should
> > > stop referencing new vring resources. For example, when recycling
> > > virtio-net's asynchronous sending resources, virtio-net should be able
> > > to perceive that the current queue is in the per-queue reset state, and
> > > stop sending new packets from the tx queue.
> > >
> > > Signed-off-by: Xuan Zhuo 
> > > ---
> > >  hw/virtio/virtio.c | 15 +++
> > >  include/hw/virtio/virtio.h |  1 +
> > >  2 files changed, 16 insertions(+)
> > >
> > > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > > index f35178f5fc..c954f2a2b3 100644
> > > --- a/hw/virtio/virtio.c
> > > +++ b/hw/virtio/virtio.c
> > > @@ -142,6 +142,8 @@ struct VirtQueue
> > >  /* Notification enabled? */
> > >  bool notification;
> > >
> > > +bool disabled_by_reset;
> > > +
> > >  uint16_t queue_index;
> > >
> > >  unsigned int inuse;
> > > @@ -2079,6 +2081,12 @@ void virtio_queue_reset(VirtIODevice *vdev, 
> > > uint32_t queue_index)
> > >  {
> > >  VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
> > >
> > > +/*
> > > + * Mark this queue is per-queue reset status. The device should 
> > > release the
> > > + * references of the vring, and not refer more new vring item.
> >
> > items
> 
> 
> Will fix.
> 
> >
> > > + */
> > > +vdev->vq[queue_index].disabled_by_reset = true;
> > > +
> > >  if (k->queue_reset) {
> > >  k->queue_reset(vdev, queue_index);
> > >  }
> >
> > can we set this after calling queue_reset? For symmetry with enable.
> 
> 
> In fact,  queue_reset() will check it.
> 

when you disable you first set it then disable.
so when we are not 100% ready it's already set.
when you enable you first clear it then enable.
so we are not 100% ready but it's no longer set.
inconsistent.


> >
> > > @@ -2102,11 +2110,18 @@ void virtio_queue_enable(VirtIODevice *vdev, 
> > > uint32_t queue_index)
> > >  }
> > >  */
> > >
> > > +vdev->vq[queue_index].disabled_by_reset = false;
> > > +
> > >  if (k->queue_enable) {
> > >  k->queue_enable(vdev, queue_index);
> > >  }
> > >  }
> > >
> > > +bool virtio_queue_reset_state(VirtQueue *vq)
> > > +{
> > > +return vq->disabled_by_reset;
> > > +}
> > > +
> > >  void virtio_reset(void *opaque)
> > >  {
> > >  VirtIODevice *vdev = opaque;
> > > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > > index 77c6c55929..00e91af7c4 100644
> > > --- a/include/hw/virtio/virtio.h
> > > +++ b/include/hw/virtio/virtio.h
> > > @@ -319,6 +319,7 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t 
> > > val);
> > >  void virtio_reset(void *opaque);
> > >  void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index);
> > >  void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index);
> > > +bool virtio_queue_reset_state(VirtQueue *vq);
> > >  void virtio_update_irq(VirtIODevice *vdev);
> > >  int virtio_set_features(VirtIODevice *vdev, uint64_t val);
> >
> > OK I guess ... what about migration. This state won't be
> > set correctly will it?
> 
> I think it has no effect. After the reset, there is actually no state. We can
> migrate.
> 
> The current variable is only used by ->queue_reset().
> 
> Thanks.
> 

Yea maybe it works for this bug but ... yack. This means the state has
no logic consistency.  It's just there because you found a bug and
wanted to fix it.
An ultra specific
bool this_weird_state_fuzzer_gets_in_issue_1451;
is hard to maintain, not happy :(


> >
> >
> > >
> > > --
> > > 2.32.0.3.g01195cf9f
> >




Re: [PATCH v1 2/2] virtio-net: virtio_net_flush_tx() check for per-queue reset

2023-01-28 Thread Xuan Zhuo
On Sun, 29 Jan 2023 02:25:43 -0500, "Michael S. Tsirkin"  
wrote:
> On Sun, Jan 29, 2023 at 10:51:50AM +0800, Xuan Zhuo wrote:
> > Check whether it is per-queue reset state in virtio_net_flush_tx().
> >
> > Before per-queue reset, we need to recover async tx resources. At this
> > time, virtio_net_flush_tx() is called, but we should not try to send
> > new packets, so virtio_net_flush_tx() should check the current
> > per-queue reset state.
>
>
> What does "at this time" mean here?
> Do you in fact mean it's called from flush_or_purge_queued_packets?

Yes

virtio_queue_reset
k->queue_reset
virtio_net_queue_reset
flush_or_purge_queued_packets
qemu_flush_or_purge_queued_packets
.
(callback) virtio_net_tx_complete
virtio_net_flush_tx <-- here 
send new packet. We need stop it.


Because it is inside the callback, I can't pass information through the stack. I
originally thought it was a general situation, so I wanted to put it in
struct VirtQueue.

If it is not very suitable, it may be better to put it in VirtIONetQueue.

Thanks.

> What does the call stack look like?
>
> If yes introducing a vq state just so virtio_net_flush_tx
> knows we are in the process of reset would be a bad idea.
> We want something much more local, ideally on stack even ...
>
>
> >
> > Fixes: 7dc6be52 ("virtio-net: support queue reset")
> > Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1451
> > Reported-by: Alexander Bulekov 
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  hw/net/virtio-net.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > index 3ae909041a..fba6451a50 100644
> > --- a/hw/net/virtio-net.c
> > +++ b/hw/net/virtio-net.c
> > @@ -2627,7 +2627,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
> >  VirtQueueElement *elem;
> >  int32_t num_packets = 0;
> >  int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
> > -if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
> > +if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) ||
> > +virtio_queue_reset_state(q->tx_vq)) {
> >  return num_packets;
> >  }
> >
> > --
> > 2.32.0.3.g01195cf9f
>



Re: [PATCH v1 0/2] virtio: fix for assertion failure: virtio_net_get_subqueue(nc)->async_tx.elem failed

2023-01-28 Thread Michael S. Tsirkin
On Sun, Jan 29, 2023 at 10:51:48AM +0800, Xuan Zhuo wrote:
> In the current design, we stop the device from operating on the vring
> during per-queue reset by resetting the structure VirtQueue.
> 
> But before the reset operation, when recycling some resources, we should
> stop referencing new vring resources.
> 
> This bug is caused by this reason.
> 
> https://gitlab.com/qemu-project/qemu/-/issues/1451
> 
> Before we reset the structure, we called the ->queue_reset callback to let the
> device reclaim resources. Here virtio-net tries to release the packets sent
> asynchronously, but during this process virtio_net_flush_tx() will be called,
> and new data will be sent again. This leads to the assertion failure.
> 
>  assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
> 
> This patch set introduces a new item "reset" into struct VirtQueue, so the 
> device can
> know this virtqueue is in the per-queue reset state.

Better but I still don't exactly understand what this state means.
Sent some questions on the patches themselves.
Thanks!


> v1:
> 1. rename "reset" to disabled_by_reset
> 2. add api: virtio_queue_reset_state()
> 
> Xuan Zhuo (2):
>   virtio: struct VirtQueue introduce reset
>   virtio-net: virtio_net_flush_tx() check for per-queue reset
> 
>  hw/net/virtio-net.c|  3 ++-
>  hw/virtio/virtio.c | 15 +++
>  include/hw/virtio/virtio.h |  1 +
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> --
> 2.32.0.3.g01195cf9f




Re: [PATCH v1 1/2] virtio: struct VirtQueue introduce reset

2023-01-28 Thread Xuan Zhuo
On Sun, 29 Jan 2023 02:12:36 -0500, "Michael S. Tsirkin"  
wrote:
>
> subject seems wrong.


Will fix.


>
> On Sun, Jan 29, 2023 at 10:51:49AM +0800, Xuan Zhuo wrote:
> > In the current design, we stop the device from operating on the vring
> > during per-queue reset by resetting the structure VirtQueue.
> >
> > But before the reset operation, when recycling some resources, we should
> > stop referencing new vring resources. For example, when recycling
> > virtio-net's asynchronous sending resources, virtio-net should be able
> > to perceive that the current queue is in the per-queue reset state, and
> > stop sending new packets from the tx queue.
> >
> > Signed-off-by: Xuan Zhuo 
> > ---
> >  hw/virtio/virtio.c | 15 +++
> >  include/hw/virtio/virtio.h |  1 +
> >  2 files changed, 16 insertions(+)
> >
> > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > index f35178f5fc..c954f2a2b3 100644
> > --- a/hw/virtio/virtio.c
> > +++ b/hw/virtio/virtio.c
> > @@ -142,6 +142,8 @@ struct VirtQueue
> >  /* Notification enabled? */
> >  bool notification;
> >
> > +bool disabled_by_reset;
> > +
> >  uint16_t queue_index;
> >
> >  unsigned int inuse;
> > @@ -2079,6 +2081,12 @@ void virtio_queue_reset(VirtIODevice *vdev, uint32_t 
> > queue_index)
> >  {
> >  VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
> >
> > +/*
> > + * Mark this queue is per-queue reset status. The device should 
> > release the
> > + * references of the vring, and not refer more new vring item.
>
> items


Will fix.

>
> > + */
> > +vdev->vq[queue_index].disabled_by_reset = true;
> > +
> >  if (k->queue_reset) {
> >  k->queue_reset(vdev, queue_index);
> >  }
>
> can we set this after calling queue_reset? For symmetry with enable.


In fact,  queue_reset() will check it.


>
> > @@ -2102,11 +2110,18 @@ void virtio_queue_enable(VirtIODevice *vdev, 
> > uint32_t queue_index)
> >  }
> >  */
> >
> > +vdev->vq[queue_index].disabled_by_reset = false;
> > +
> >  if (k->queue_enable) {
> >  k->queue_enable(vdev, queue_index);
> >  }
> >  }
> >
> > +bool virtio_queue_reset_state(VirtQueue *vq)
> > +{
> > +return vq->disabled_by_reset;
> > +}
> > +
> >  void virtio_reset(void *opaque)
> >  {
> >  VirtIODevice *vdev = opaque;
> > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > index 77c6c55929..00e91af7c4 100644
> > --- a/include/hw/virtio/virtio.h
> > +++ b/include/hw/virtio/virtio.h
> > @@ -319,6 +319,7 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val);
> >  void virtio_reset(void *opaque);
> >  void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index);
> >  void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index);
> > +bool virtio_queue_reset_state(VirtQueue *vq);
> >  void virtio_update_irq(VirtIODevice *vdev);
> >  int virtio_set_features(VirtIODevice *vdev, uint64_t val);
>
> OK I guess ... what about migration. This state won't be
> set correctly will it?

I think it has no effect. After the reset, there is actually no state. We can
migrate.

The current variable is only used by ->queue_reset().

Thanks.


>
>
> >
> > --
> > 2.32.0.3.g01195cf9f
>



Re: [PATCH v1 2/2] virtio-net: virtio_net_flush_tx() check for per-queue reset

2023-01-28 Thread Michael S. Tsirkin
On Sun, Jan 29, 2023 at 10:51:50AM +0800, Xuan Zhuo wrote:
> Check whether it is per-queue reset state in virtio_net_flush_tx().
> 
> Before per-queue reset, we need to recover async tx resources. At this
> time, virtio_net_flush_tx() is called, but we should not try to send
> new packets, so virtio_net_flush_tx() should check the current
> per-queue reset state.


What does "at this time" mean here?
Do you in fact mean it's called from flush_or_purge_queued_packets?
What does the call stack look like?

If yes introducing a vq state just so virtio_net_flush_tx
knows we are in the process of reset would be a bad idea.
We want something much more local, ideally on stack even ...


> 
> Fixes: 7dc6be52 ("virtio-net: support queue reset")
> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1451
> Reported-by: Alexander Bulekov 
> Signed-off-by: Xuan Zhuo 
> ---
>  hw/net/virtio-net.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 3ae909041a..fba6451a50 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -2627,7 +2627,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
>  VirtQueueElement *elem;
>  int32_t num_packets = 0;
>  int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
> -if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
> +if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) ||
> +virtio_queue_reset_state(q->tx_vq)) {
>  return num_packets;
>  }
>  
> -- 
> 2.32.0.3.g01195cf9f




Re: [PATCH 5/9] igb: respect E1000_VMOLR_RSSE

2023-01-28 Thread Akihiko Odaki

On 2023/01/28 22:46, Sriram Yagnaraman wrote:

RSS for VFs is only enabled if VMOLR[n].RSSE is set.

Signed-off-by: Sriram Yagnaraman 
---
  hw/net/igb_core.c | 18 +-
  1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 1eb7ba168f..e4fd4a1a5f 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -69,7 +69,7 @@ typedef struct IGBTxPktVmdqCallbackContext {
  
  static ssize_t

  igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt,
- bool has_vnet, bool *assigned);
+ bool has_vnet, bool *external_tx);


I admit external_tx is somewhat confusing, but it is more than just 
telling if it is assigned to Rx queue or not. If external_tx is not 
NULL, it indicates it is part of Tx packet switching. In that case, a 
bool value which describes whether the packet should be routed to 
external LAN must be assigned. The value can be false even if the packet 
is assigned to Rx queues; it will be always false if it is multicast, 
for example.


  
  static inline void

  igb_set_interrupt_cause(IGBCore *core, uint32_t val);
@@ -942,7 +942,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
  
  if (core->mac[MRQC] & 1) {

  if (is_broadcast_ether_addr(ehdr->h_dest)) {
-for (i = 0; i < 8; i++) {
+for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {


I just left it as 8 because VMDq is not specific to VF. Perhaps it is 
better to have another macro to denote the number of VMDq pools, but it 
is not done yet.



  if (core->mac[VMOLR0 + i] & E1000_VMOLR_BAM) {
  queues |= BIT(i);
  }
@@ -976,7 +976,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
  f = ta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3];
  f = (((ehdr->h_dest[5] << 8) | ehdr->h_dest[4]) >> f) & 0xfff;
  if (macp[f >> 5] & (1 << (f & 0x1f))) {
-for (i = 0; i < 8; i++) {
+for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {
  if (core->mac[VMOLR0 + i] & E1000_VMOLR_ROMPE) {
  queues |= BIT(i);
  }
@@ -999,7 +999,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
  }
  }
  } else {
-for (i = 0; i < 8; i++) {
+for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {
  if (core->mac[VMOLR0 + i] & E1000_VMOLR_AUPE) {
  mask |= BIT(i);
  }
@@ -1018,7 +1018,15 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
  queues &= core->mac[VFRE];
  igb_rss_parse_packet(core, core->rx_pkt, external_tx != NULL, 
rss_info);
  if (rss_info->queue & 1) {
-queues <<= 8;
+for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {
+if (!(queues & BIT(i))) {
+continue;
+}
+if (core->mac[VMOLR0 + i] & E1000_VMOLR_RSSE) {
+queues |= BIT(i + IGB_MAX_VF_FUNCTIONS);
+queues &= ~BIT(i);
+}
+}
  }
  } else {
  switch (net_rx_pkt_get_packet_type(core->rx_pkt)) {




Re: [PATCH v1 1/2] virtio: struct VirtQueue introduce reset

2023-01-28 Thread Michael S. Tsirkin


subject seems wrong.

On Sun, Jan 29, 2023 at 10:51:49AM +0800, Xuan Zhuo wrote:
> In the current design, we stop the device from operating on the vring
> during per-queue reset by resetting the structure VirtQueue.
> 
> But before the reset operation, when recycling some resources, we should
> stop referencing new vring resources. For example, when recycling
> virtio-net's asynchronous sending resources, virtio-net should be able
> to perceive that the current queue is in the per-queue reset state, and
> stop sending new packets from the tx queue.
> 
> Signed-off-by: Xuan Zhuo 
> ---
>  hw/virtio/virtio.c | 15 +++
>  include/hw/virtio/virtio.h |  1 +
>  2 files changed, 16 insertions(+)
> 
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index f35178f5fc..c954f2a2b3 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -142,6 +142,8 @@ struct VirtQueue
>  /* Notification enabled? */
>  bool notification;
>  
> +bool disabled_by_reset;
> +
>  uint16_t queue_index;
>  
>  unsigned int inuse;
> @@ -2079,6 +2081,12 @@ void virtio_queue_reset(VirtIODevice *vdev, uint32_t 
> queue_index)
>  {
>  VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
>  
> +/*
> + * Mark this queue is per-queue reset status. The device should release 
> the
> + * references of the vring, and not refer more new vring item.

items

> + */
> +vdev->vq[queue_index].disabled_by_reset = true;
> +
>  if (k->queue_reset) {
>  k->queue_reset(vdev, queue_index);
>  }

can we set this after calling queue_reset? For symmetry with enable.

> @@ -2102,11 +2110,18 @@ void virtio_queue_enable(VirtIODevice *vdev, uint32_t 
> queue_index)
>  }
>  */
>  
> +vdev->vq[queue_index].disabled_by_reset = false;
> +
>  if (k->queue_enable) {
>  k->queue_enable(vdev, queue_index);
>  }
>  }
>  
> +bool virtio_queue_reset_state(VirtQueue *vq)
> +{
> +return vq->disabled_by_reset;
> +}
> +
>  void virtio_reset(void *opaque)
>  {
>  VirtIODevice *vdev = opaque;
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index 77c6c55929..00e91af7c4 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -319,6 +319,7 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val);
>  void virtio_reset(void *opaque);
>  void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index);
>  void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index);
> +bool virtio_queue_reset_state(VirtQueue *vq);
>  void virtio_update_irq(VirtIODevice *vdev);
>  int virtio_set_features(VirtIODevice *vdev, uint64_t val);

OK I guess ... what about migration. This state won't be
set correctly will it?


>  
> -- 
> 2.32.0.3.g01195cf9f




Re: [PATCH 4/9] igb: check oversized packets for VMDq

2023-01-28 Thread Akihiko Odaki

On 2023/01/28 22:46, Sriram Yagnaraman wrote:

Signed-off-by: Sriram Yagnaraman 
---
  hw/net/igb_core.c | 74 ++-
  1 file changed, 54 insertions(+), 20 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 6bca5459b9..1eb7ba168f 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1476,6 +1476,30 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt 
*pkt,
  igb_update_rx_stats(core, size, total_size);
  }
  
+static inline bool


Please remove inline qualifier. inline qualifier has some adverse effects:
- It suppresses GCC warnings for unused functions. This behavior is 
useful when you write a function in a header file, but it is not in this 
case.
- It confuses the compiler if the function later grows and becomes 
unsuitable for inlining.

- It is noise in source code.

In this case, the compiler should be able to decide to inline the 
function or not by its own.



+igb_is_oversized(IGBCore *core, const E1000E_RingInfo *rxi, size_t size)
+{
+bool vmdq = core->mac[MRQC] & 1;
+uint16_t qn = rxi->idx;
+uint16_t pool = (qn > IGB_MAX_VF_FUNCTIONS) ?
+   (qn - IGB_MAX_VF_FUNCTIONS) : qn;


Write as qn % 8; this pattern is already prevalent.


+
+bool lpe = (vmdq ? core->mac[VMOLR0 + pool] & E1000_VMOLR_LPE :
+core->mac[RCTL] & E1000_RCTL_LPE);


RCTL.LPE should be checked even if VMDq is enabled; In section 7.10.3.4, 
Size Filtering is defined to check RCTL.LPE, and it is part of packet 
switching procedure for virtualized environment. Linux also ensures it 
sets the maximum value to RLPML.RLPML if VMDq is enabled:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cfbc871c2174f352542053d25659920d6841ed41


+bool sbp = core->mac[RCTL] & E1000_RCTL_SBP;
+int maximum_ethernet_vlan_size = 1522;
+int maximum_ethernet_lpe_size =
+(vmdq ? core->mac[VMOLR0 + pool] & E1000_VMOLR_RLPML_MASK :
+ core->mac[RLPML] & E1000_VMOLR_RLPML_MASK);
+
+if (size > maximum_ethernet_lpe_size ||
+(size > maximum_ethernet_vlan_size && !lpe && !sbp)) {
+return true;
+}
+
+return false;
+}
+
  static inline void
  igb_rx_fix_l4_csum(IGBCore *core, struct NetRxPkt *pkt)
  {
@@ -1499,7 +1523,8 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
  static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4);
  
  uint16_t queues = 0;

-uint32_t n = 0;
+uint16_t oversized = 0;
+uint32_t icr_bits = 0;
  uint8_t min_buf[ETH_ZLEN];
  struct iovec min_iov;
  struct eth_header *ehdr;
@@ -1509,7 +1534,7 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
  E1000E_RxRing rxr;
  E1000E_RSSInfo rss_info;
  size_t total_size;
-ssize_t retval;
+ssize_t retval = 0;
  int i;
  
  trace_e1000e_rx_receive_iov(iovcnt);

@@ -1550,11 +1575,6 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
  filter_buf = min_buf;
  }
  
-/* Discard oversized packets if !LPE and !SBP. */

-if (e1000x_is_oversized(core->mac, size)) {
-return orig_size;
-}
-
  ehdr = PKT_GET_ETH_HDR(filter_buf);
  net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr));
  
@@ -1571,8 +1591,6 @@ igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt,

  total_size = net_rx_pkt_get_total_len(core->rx_pkt) +
  e1000x_fcs_len(core->mac);
  
-retval = orig_size;

-
  for (i = 0; i < IGB_NUM_QUEUES; i++) {
  if (!(queues & BIT(i))) {
  continue;
@@ -1580,42 +1598,58 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
  
  igb_rx_ring_init(core, &rxr, i);

  if (!igb_has_rxbufs(core, rxr.i, total_size)) {
-retval = 0;
+icr_bits |= E1000_ICS_RXO;
  }
  }
  
-if (retval) {

+if (!icr_bits) {
+retval = orig_size;
  igb_rx_fix_l4_csum(core, core->rx_pkt);
  
  for (i = 0; i < IGB_NUM_QUEUES; i++) {

-if (!(queues & BIT(i)) ||
-!(core->mac[E1000_RXDCTL(i) >> 2] & 
E1000_RXDCTL_QUEUE_ENABLE)) {
+if (!(queues & BIT(i))) {
  continue;
  }
  
  igb_rx_ring_init(core, &rxr, i);

+if (igb_is_oversized(core, rxr.i, size)) {
+oversized |= BIT(i);
+continue;
+}


VMOLR.RLPML is checked during Rx queue assignment, which is implemented 
in igb_receive_assign(). The oversize check should be moved to the function.



+
+if (!(core->mac[RXDCTL0 + (i * 16)] & E1000_RXDCTL_QUEUE_ENABLE)) {
+continue;
+}
+
  trace_e1000e_rx_rss_dispatched_to_queue(rxr.i->idx);
  igb_write_packet_to_guest(core, core->rx_pkt, &rxr, &rss_info);
  
  /* Check if receive descriptor

Re: [PATCH v1 2/2] virtio-net: virtio_net_flush_tx() check for per-queue reset

2023-01-28 Thread Jason Wang
On Sun, Jan 29, 2023 at 10:52 AM Xuan Zhuo  wrote:
>
> Check whether it is per-queue reset state in virtio_net_flush_tx().
>
> Before per-queue reset, we need to recover async tx resources. At this
> time, virtio_net_flush_tx() is called, but we should not try to send
> new packets, so virtio_net_flush_tx() should check the current
> per-queue reset state.
>
> Fixes: 7dc6be52 ("virtio-net: support queue reset")
> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1451
> Reported-by: Alexander Bulekov 
> Signed-off-by: Xuan Zhuo 
> ---
>  hw/net/virtio-net.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 3ae909041a..fba6451a50 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -2627,7 +2627,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
>  VirtQueueElement *elem;
>  int32_t num_packets = 0;
>  int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
> -if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
> +if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) ||
> +virtio_queue_reset_state(q->tx_vq)) {

We have other places that check DRIVER_OK do we need to check queue
reset as well?

E.g:
virtio_net_can_receive()
virtio_net_tx_{timer|bh}()

Thanks

>  return num_packets;
>  }
>
> --
> 2.32.0.3.g01195cf9f
>




Re: [PATCH 2/9] igb: handle PF/VF reset properly

2023-01-28 Thread Akihiko Odaki

On 2023/01/29 14:58, Akihiko Odaki wrote:

On 2023/01/28 22:46, Sriram Yagnaraman wrote:

Use PFRSTD to reset RSTI bit for VFs, and raise VFLRE interrupt when VF
is reset.

Signed-off-by: Sriram Yagnaraman 
---
  hw/net/e1000x_regs.h |  1 +
  hw/net/igb_core.c    | 33 +
  hw/net/trace-events  |  2 ++
  3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h
index fb5b861135..bb3fb36b8d 100644
--- a/hw/net/e1000x_regs.h
+++ b/hw/net/e1000x_regs.h
@@ -548,6 +548,7 @@
  #define E1000_CTRL_EXT_ASDCHK  0x1000 /* auto speed detection 
check */

  #define E1000_CTRL_EXT_EE_RST  0x2000 /* EEPROM reset */
+#define E1000_CTRL_EXT_PFRSTD  0x4000 /* PF reset done indication */
  #define E1000_CTRL_EXT_LINK_EN 0x0001 /* enable link status from 
external LINK_0 and LINK_1 pins */
  #define E1000_CTRL_EXT_DRV_LOAD 0x1000 /* Driver loaded bit for 
FW */

  #define E1000_CTRL_EXT_EIAME   0x0100
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index abeb9c7889..9bd53cc25f 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1902,14 +1902,6 @@ static void igb_set_eims(IGBCore *core, int 
index, uint32_t val)

  igb_update_interrupt_state(core);
  }
-static void igb_vf_reset(IGBCore *core, uint16_t vfn)
-{
-    /* TODO: Reset of the queue enable and the interrupt registers of 
the VF. */

-
-    core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_RSTI;
-    core->mac[V2PMAILBOX0 + vfn] = E1000_V2PMAILBOX_RSTD;
-}
-
  static void mailbox_interrupt_to_vf(IGBCore *core, uint16_t vfn)
  {
  uint32_t ent = core->mac[VTIVAR_MISC + vfn];
@@ -1987,6 +1979,17 @@ static void igb_set_vfmailbox(IGBCore *core, 
int index, uint32_t val)

  }
  }
+static void igb_vf_reset(IGBCore *core, uint16_t vfn)
+{
+    /* disable Rx and Tx for the VF*/
+    core->mac[VFTE] &= ~BIT(vfn);
+    core->mac[VFRE] &= ~BIT(vfn);
+    /* indicate VF reset to PF */
+    core->mac[VFLRE] |= BIT(vfn);
+    /* VFLRE and mailbox use the same interrupt cause */
+    mailbox_interrupt_to_pf(core);
+}
+


Please do not move the function unless you have a legitimate reason for 
that.


I got it. It is necessary to refer mailbox_interrupt_to_pf().

Reviewed-by: Akihiko Odaki 




  static void igb_w1c(IGBCore *core, int index, uint32_t val)
  {
  core->mac[index] &= ~val;
@@ -2241,14 +2244,20 @@ igb_set_status(IGBCore *core, int index, 
uint32_t val)

  static void
  igb_set_ctrlext(IGBCore *core, int index, uint32_t val)
  {
-    trace_e1000e_link_set_ext_params(!!(val & E1000_CTRL_EXT_ASDCHK),
- !!(val & E1000_CTRL_EXT_SPD_BYPS));
-
-    /* TODO: PFRSTD */
+    trace_igb_link_set_ext_params(!!(val & E1000_CTRL_EXT_ASDCHK),
+  !!(val & E1000_CTRL_EXT_SPD_BYPS),
+  !!(val & E1000_CTRL_EXT_PFRSTD));
  /* Zero self-clearing bits */
  val &= ~(E1000_CTRL_EXT_ASDCHK | E1000_CTRL_EXT_EE_RST);
  core->mac[CTRL_EXT] = val;
+
+    if (core->mac[CTRL_EXT] & E1000_CTRL_EXT_PFRSTD) {
+    for (int vfn = 0; vfn < IGB_MAX_VF_FUNCTIONS; vfn++) {
+    core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_RSTI;
+    core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_RSTD;
+    }
+    }
  }
  static void
diff --git a/hw/net/trace-events b/hw/net/trace-events
index 2f791b9b57..e94172e748 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -281,6 +281,8 @@ igb_core_mdic_read_unhandled(uint32_t addr) "MDIC 
READ: PHY[%u] UNHANDLED"
  igb_core_mdic_write(uint32_t addr, uint32_t data) "MDIC WRITE: 
PHY[%u] = 0x%x"
  igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] 
UNHANDLED"
+igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, 
bool pfrstd) "Set extended link params: ASD check: %d, Speed select 
bypass: %d, PF reset done: %d"

+
  igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
  igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* 
source, uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, 
length: %u"




Re: [PATCH 2/9] igb: handle PF/VF reset properly

2023-01-28 Thread Akihiko Odaki

On 2023/01/28 22:46, Sriram Yagnaraman wrote:

Use PFRSTD to reset RSTI bit for VFs, and raise VFLRE interrupt when VF
is reset.

Signed-off-by: Sriram Yagnaraman 
---
  hw/net/e1000x_regs.h |  1 +
  hw/net/igb_core.c| 33 +
  hw/net/trace-events  |  2 ++
  3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h
index fb5b861135..bb3fb36b8d 100644
--- a/hw/net/e1000x_regs.h
+++ b/hw/net/e1000x_regs.h
@@ -548,6 +548,7 @@
  
  #define E1000_CTRL_EXT_ASDCHK  0x1000 /* auto speed detection check */

  #define E1000_CTRL_EXT_EE_RST  0x2000 /* EEPROM reset */
+#define E1000_CTRL_EXT_PFRSTD  0x4000 /* PF reset done indication */
  #define E1000_CTRL_EXT_LINK_EN 0x0001 /* enable link status from external 
LINK_0 and LINK_1 pins */
  #define E1000_CTRL_EXT_DRV_LOAD 0x1000 /* Driver loaded bit for FW */
  #define E1000_CTRL_EXT_EIAME   0x0100
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index abeb9c7889..9bd53cc25f 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1902,14 +1902,6 @@ static void igb_set_eims(IGBCore *core, int index, 
uint32_t val)
  igb_update_interrupt_state(core);
  }
  
-static void igb_vf_reset(IGBCore *core, uint16_t vfn)

-{
-/* TODO: Reset of the queue enable and the interrupt registers of the VF. 
*/
-
-core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_RSTI;
-core->mac[V2PMAILBOX0 + vfn] = E1000_V2PMAILBOX_RSTD;
-}
-
  static void mailbox_interrupt_to_vf(IGBCore *core, uint16_t vfn)
  {
  uint32_t ent = core->mac[VTIVAR_MISC + vfn];
@@ -1987,6 +1979,17 @@ static void igb_set_vfmailbox(IGBCore *core, int index, 
uint32_t val)
  }
  }
  
+static void igb_vf_reset(IGBCore *core, uint16_t vfn)

+{
+/* disable Rx and Tx for the VF*/
+core->mac[VFTE] &= ~BIT(vfn);
+core->mac[VFRE] &= ~BIT(vfn);
+/* indicate VF reset to PF */
+core->mac[VFLRE] |= BIT(vfn);
+/* VFLRE and mailbox use the same interrupt cause */
+mailbox_interrupt_to_pf(core);
+}
+


Please do not move the function unless you have a legitimate reason for 
that.



  static void igb_w1c(IGBCore *core, int index, uint32_t val)
  {
  core->mac[index] &= ~val;
@@ -2241,14 +2244,20 @@ igb_set_status(IGBCore *core, int index, uint32_t val)
  static void
  igb_set_ctrlext(IGBCore *core, int index, uint32_t val)
  {
-trace_e1000e_link_set_ext_params(!!(val & E1000_CTRL_EXT_ASDCHK),
- !!(val & E1000_CTRL_EXT_SPD_BYPS));
-
-/* TODO: PFRSTD */
+trace_igb_link_set_ext_params(!!(val & E1000_CTRL_EXT_ASDCHK),
+  !!(val & E1000_CTRL_EXT_SPD_BYPS),
+  !!(val & E1000_CTRL_EXT_PFRSTD));
  
  /* Zero self-clearing bits */

  val &= ~(E1000_CTRL_EXT_ASDCHK | E1000_CTRL_EXT_EE_RST);
  core->mac[CTRL_EXT] = val;
+
+if (core->mac[CTRL_EXT] & E1000_CTRL_EXT_PFRSTD) {
+for (int vfn = 0; vfn < IGB_MAX_VF_FUNCTIONS; vfn++) {
+core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_RSTI;
+core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_RSTD;
+}
+}
  }
  
  static void

diff --git a/hw/net/trace-events b/hw/net/trace-events
index 2f791b9b57..e94172e748 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -281,6 +281,8 @@ igb_core_mdic_read_unhandled(uint32_t addr) "MDIC READ: PHY[%u] 
UNHANDLED"
  igb_core_mdic_write(uint32_t addr, uint32_t data) "MDIC WRITE: PHY[%u] = 0x%x"
  igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
  
+igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) "Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset done: %d"

+
  igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
  igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, uint32_t len) 
"addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
  




Re: [PATCH v8 2/5] riscv: Change type of valid_vm_1_10_[32|64] to bool

2023-01-28 Thread Bin Meng
On Thu, Jan 26, 2023 at 12:22 AM Alexandre Ghiti  wrote:
>
> This array is actually used as a boolean so swap its current char type
> to a boolean and at the same time, change the type of validate_vm to
> bool since it returns valid_vm_1_10_[32|64].
>
> Suggested-by: Andrew Jones 
> Signed-off-by: Alexandre Ghiti 
> Reviewed-by: Andrew Jones 
> Reviewed-by: Alistair Francis 
> ---
>  target/riscv/csr.c | 21 +++--
>  1 file changed, 11 insertions(+), 10 deletions(-)
>

Reviewed-by: Bin Meng 



Re: [PATCH v8 1/5] riscv: Pass Object to register_cpu_props instead of DeviceState

2023-01-28 Thread Bin Meng
On Thu, Jan 26, 2023 at 12:21 AM Alexandre Ghiti  wrote:
>
> One can extract the DeviceState pointer from the Object pointer, so pass
> the Object for future commits to access other fields of Object.
>
> No functional changes intended.
>
> Signed-off-by: Alexandre Ghiti 
> Reviewed-by: Alistair Francis 
> Reviewed-by: Frank Chang 
> Reviewed-by: Andrew Jones 
> ---
>  target/riscv/cpu.c | 15 ---
>  1 file changed, 8 insertions(+), 7 deletions(-)
>

Reviewed-by: Bin Meng 



Re: [PATCH] linux-user: move target_flat.h to target subdirs

2023-01-28 Thread Richard Henderson

On 1/28/23 14:46, Mike Frysinger wrote:

This makes target_flat.h behave like every other target_xxx.h header.
It also makes it actually work -- while the current header says adding
a header to the target subdir overrides the common one, it doesn't.
This is for two reasons:
* meson.build adds -Ilinux-user before -Ilinux-user/$arch
* the compiler search path for "target_flat.h" looks in the same dir
   as the source file before searching -I paths.

This can be seen with the xtensa port -- the subdir settings aren't
used which breaks stack setup.

Move it to the generic/ subdir and add include stubs like every
other target_xxx.h header is handled.

Signed-off-by: Mike Frysinger
---
  linux-user/aarch64/target_flat.h   | 1 +
  linux-user/arm/target_flat.h   | 1 +
  linux-user/{ => generic}/target_flat.h | 0
  linux-user/m68k/target_flat.h  | 1 +
  linux-user/microblaze/target_flat.h| 1 +
  linux-user/sh4/target_flat.h   | 1 +
  6 files changed, 5 insertions(+)
  create mode 100644 linux-user/aarch64/target_flat.h
  create mode 100644 linux-user/arm/target_flat.h
  rename linux-user/{ => generic}/target_flat.h (100%)
  create mode 100644 linux-user/m68k/target_flat.h
  create mode 100644 linux-user/microblaze/target_flat.h
  create mode 100644 linux-user/sh4/target_flat.h


Reviewed-by: Richard Henderson 


r~



Re: [PATCH v4 3/3] hw/riscv: change riscv_compute_fdt_addr() semantics

2023-01-28 Thread Bin Meng
On Thu, Jan 26, 2023 at 9:54 PM Daniel Henrique Barboza
 wrote:
>
> As it is now, riscv_compute_fdt_addr() is receiving a dram_base, a
> mem_size (which is defaulted to MachineState::ram_size in all boards)
> and the FDT pointer. And it makes a very important assumption: the DRAM
> interval dram_base + mem_size is contiguous. This is indeed the case for
> most boards that uses a FDT.

s/uses/use

>
> The Icicle Kit board works with 2 distinct RAM banks that are separated
> by a gap. We have a lower bank with 1GiB size, a gap follows, then at
> 64GiB the high memory starts. MachineClass::default_ram_size for this
> board is set to 1.5Gb, and machine_init() is enforcing it as minimal RAM
> size, meaning that there we'll always have at least 512 MiB in the Hi
> RAM area.
>
> Using riscv_compute_fdt_addr() in this board is weird because not only
> the board has sparse RAM, and it's calling it using the base address of
> the Lo RAM area, but it's also using a mem_size that we have guarantees
> that it will go up to the Hi RAM. All the function assumptions don't
> work for this board.
>
> In fact, what makes the function works at all in this case is a
> coincidence.  Commit 1a475d39ef54 introduced a 3GB boundary for the FDT,
> down from 4Gb, that is enforced if dram_base is lower than 3072 MiB. For
> the Icicle Kit board, memmap[MICROCHIP_PFSOC_DRAM_LO].base is 0x8000
> (2 Gb) and it has a 1Gb size, so it will fall in the conditions to put
> the FDT under a 3Gb address, which happens to be exactly at the end of
> DRAM_LO. If the base address of the Lo area started later than 3Gb this
> function would be unusable by the board. Changing any assumptions inside
> riscv_compute_fdt_addr() can also break it by accident as well.
>
> Let's change riscv_compute_fdt_addr() semantics to be appropriate to the
> Icicle Kit board and for future boards that might have sparse RAM
> topologies to worry about:
>
> - relieve the condition that the dram_base + mem_size area is contiguous,
> since this is already not the case today;
>
> - receive an extra 'dram_size' size attribute that refers to a contiguous
> RAM block that the board wants the FDT to reside on.
>
> Together with 'mem_size' and 'fdt', which are now being consumed by a
> MachineState pointer, we're able to make clear assumptions based on the
> DRAM block and total mem_size available to ensure that the FDT will be put
> in a valid RAM address.
>

Well written commit message. Thanks!

> Signed-off-by: Daniel Henrique Barboza 
> ---
>  hw/riscv/boot.c| 38 ++
>  hw/riscv/microchip_pfsoc.c |  3 ++-
>  hw/riscv/sifive_u.c|  3 ++-
>  hw/riscv/spike.c   |  3 ++-
>  hw/riscv/virt.c|  3 ++-
>  include/hw/riscv/boot.h|  4 ++--
>  6 files changed, 36 insertions(+), 18 deletions(-)
>
> diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
> index a6f7b8ae8e..8f4991480b 100644
> --- a/hw/riscv/boot.c
> +++ b/hw/riscv/boot.c
> @@ -284,33 +284,47 @@ out:
>  }
>
>  /*
> - * The FDT should be put at the farthest point possible to
> - * avoid overwriting it with the kernel/initrd.
> + * This function makes an assumption that the DRAM interval
> + * 'dram_base' + 'dram_size' is contiguous.
>   *
> - * This function makes an assumption that the DRAM is
> - * contiguous. It also cares about 32-bit systems and
> - * will limit fdt_addr to be addressable by them even for
> - * 64-bit CPUs.
> + * Considering that 'dram_end' is the lowest value between
> + * the end of the DRAM block and MachineState->ram_size, the
> + * FDT location will vary according to 'dram_base':
> + *
> + * - if 'dram_base' is less that 3072 MiB, the FDT will be
> + * put at the lowest value between 3072 MiB and 'dram_end';
> + *
> + * - if 'dram_base' is higher than 3072 MiB, the FDT will be
> + * put at 'dram_end'.
>   *
>   * The FDT is fdt_packed() during the calculation.
>   */
> -uint32_t riscv_compute_fdt_addr(hwaddr dram_base, uint64_t mem_size,
> -void *fdt)
> +hwaddr riscv_compute_fdt_addr(hwaddr dram_base, hwaddr dram_size,

Using hwaddr to represent a size looks weird. Although technically
they are the same ... I would leave this as it is.

> +  MachineState *ms)
>  {
> -uint64_t temp;
> -hwaddr dram_end = dram_base + mem_size;
> -int ret = fdt_pack(fdt);
> +int ret = fdt_pack(ms->fdt);
> +hwaddr dram_end, temp;
>  int fdtsize;
>
>  /* Should only fail if we've built a corrupted tree */
>  g_assert(ret == 0);
>
> -fdtsize = fdt_totalsize(fdt);
> +fdtsize = fdt_totalsize(ms->fdt);
>  if (fdtsize <= 0) {
>  error_report("invalid device-tree");
>  exit(1);
>  }
>
> +/*
> + * A dram_size == 0, usually from a MemMapEntry[].size element,
> + * means that the DRAM block goes all the way to ms->ram_size.
> + */
> +if (dram_size == 0x0) {
> +dram_end = dram_base + ms->ram_size;
> +   

Re: [PATCH] vdpa: fix VHOST_BACKEND_F_IOTLB_ASID flag check

2023-01-28 Thread Jason Wang
On Mon, Jan 23, 2023 at 10:39 PM Michael S. Tsirkin  wrote:
>
> On Tue, Jan 17, 2023 at 11:53:08AM +0100, Eugenio Pérez wrote:
> > VHOST_BACKEND_F_IOTLB_ASID is the feature bit, not the bitmask. Since
> > the device under test also provided VHOST_BACKEND_F_IOTLB_MSG_V2 and
> > VHOST_BACKEND_F_IOTLB_BATCH, this went unnoticed.
> >
> > Fixes: c1a1008685 ("vdpa: always start CVQ in SVQ mode if possible")
> > Signed-off-by: Eugenio Pérez 
> > Acked-by: Jason Wang 
>
> Reviewed-by: Michael S. Tsirkin 
>
> Jason are you merging this?

Queued.

Thanks

>
> > ---
> > Originally on SUSPEND series, but it is a fix that it is worth to send
> > and apply individually:
> > https://lists.nongnu.org/archive/html/qemu-devel/2023-01/msg02574.html
> >
> > ---
> >  net/vhost-vdpa.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> > index 1a13a34d35..de5ed8ff22 100644
> > --- a/net/vhost-vdpa.c
> > +++ b/net/vhost-vdpa.c
> > @@ -384,7 +384,7 @@ static int vhost_vdpa_net_cvq_start(NetClientState *nc)
> >  g_strerror(errno), errno);
> >  return -1;
> >  }
> > -if (!(backend_features & VHOST_BACKEND_F_IOTLB_ASID) ||
> > +if (!(backend_features & BIT_ULL(VHOST_BACKEND_F_IOTLB_ASID)) ||
> >  !vhost_vdpa_net_valid_svq_features(v->dev->features, NULL)) {
> >  return 0;
> >  }
> > --
> > 2.31.1
>




Re: [PATCH 0/3] Fix UNMAP notifier for intel-iommu

2023-01-28 Thread Jason Wang
On Fri, Jan 27, 2023 at 9:17 PM Michael S. Tsirkin  wrote:
>
> On Mon, Jan 16, 2023 at 03:06:44PM +0800, Jason Wang wrote:
> > On Mon, Jan 16, 2023 at 7:30 AM Viktor Prutyanov  wrote:
> > >
> > > On Tue, Nov 29, 2022 at 11:10 AM Jason Wang  wrote:
> > > >
> > > > Hi All:
> > > >
> > > > According to ATS, device should work if ATS is disabled. This is not
> > > > correctly implemented in the current intel-iommu since it doesn't
> > > > handle the UNMAP notifier correctly. This breaks the vhost-net +
> > > > vIOMMU without dt.
> > > >
> > > > The root casue is that the when there's a device IOTLB miss (note that
> > > > it's not specific to PCI so it can work without ATS), Qemu doesn't
> > > > build the IOVA tree, so when guest start an IOTLB invalidation, Qemu
> > > > won't trigger the UNMAP notifier.
> > > >
> > > > Fixing by building the IOVA tree during IOMMU translation.
> > > >
> > > > Thanks
> > > >
> > > > Jason Wang (3):
> > > >   intel-iommu: fail MAP notifier without caching mode
> > > >   intel-iommu: fail DEVIOTLB_UNMAP without dt mode
> > > >   intel-iommu: build iova tree during IOMMU translation
> > > >
> > > >  hw/i386/intel_iommu.c | 58 ---
> > > >  1 file changed, 33 insertions(+), 25 deletions(-)
> > > >
> > > > --
> > > > 2.25.1
> > > >
> > >
> > > Hi Jason,
> > >
> > > I've tried the series with Windows Server 2022 guest with vhost and
> > > intel-iommu (device-iotlb=off) and now networking on this system has
> > > become working.
> > > So, as we discussed, I'm waiting for the series to be accepted in some
> > > form to continue my work about supporting guests who refuse Device-TLB
> > > on systems with device-iotlb=on.
> > >
> > > Tested-by: Viktor Prutyanov 
> >
> > Great, Peter has some comments on this series, so I will probably send
> > a new version (probably after the chinese new year).
> >
> > Thanks
>
> Were you going to post a new version?

Yes.

Thanks

>
> > >
> > > Best regards,
> > > Viktor Prutyanov
> > >
>




Re: [PATCH v4 2/3] hw/riscv: split fdt address calculation from fdt load

2023-01-28 Thread Bin Meng
On Thu, Jan 26, 2023 at 9:53 PM Daniel Henrique Barboza
 wrote:
>
> A common trend in other archs is to calculate the fdt address, which is
> usually straightforward, and then calling a function that loads the
> fdt/dtb by using that address.
>
> riscv_load_fdt() is doing a bit too much in comparison. It's calculating
> the fdt address via an elaborated heuristic to put the FDT at the bottom
> of DRAM, and "bottom of DRAM" will vary across boards and
> configurations, then it's actually loading the fdt, and finally it's
> returning the fdt address used to the caller.
>
> Reduce the existing complexity of riscv_load_fdt() by splitting its code
> into a new function, riscv_compute_fdt_addr(), that will take care of
> all fdt address logic. riscv_load_fdt() can then be a simple function
> that just loads a fdt at the given fdt address.
>
> We're also taking the opportunity to clarify the intentions and
> assumptions made by these functions. riscv_load_fdt() is now receiving a
> hwaddr as fdt_addr because there is no restriction of having to load the
> fdt in higher addresses that doesn't fit in an uint32_t.
>
> Reviewed-by: Alistair Francis 
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  hw/riscv/boot.c| 33 +
>  hw/riscv/microchip_pfsoc.c |  6 --
>  hw/riscv/sifive_u.c|  7 ---
>  hw/riscv/spike.c   |  6 +++---
>  hw/riscv/virt.c|  7 ---
>  include/hw/riscv/boot.h|  4 +++-
>  6 files changed, 43 insertions(+), 20 deletions(-)
>
> diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
> index a563b7482a..a6f7b8ae8e 100644
> --- a/hw/riscv/boot.c
> +++ b/hw/riscv/boot.c
> @@ -283,9 +283,21 @@ out:
>  return kernel_entry;
>  }
>
> -uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t mem_size, void *fdt)
> +/*
> + * The FDT should be put at the farthest point possible to
> + * avoid overwriting it with the kernel/initrd.
> + *
> + * This function makes an assumption that the DRAM is
> + * contiguous. It also cares about 32-bit systems and
> + * will limit fdt_addr to be addressable by them even for
> + * 64-bit CPUs.
> + *
> + * The FDT is fdt_packed() during the calculation.
> + */
> +uint32_t riscv_compute_fdt_addr(hwaddr dram_base, uint64_t mem_size,
> +void *fdt)

The original code returns a uint64_t for fdt_addr but now this is uint32_t?

>  {
> -uint64_t temp, fdt_addr;
> +uint64_t temp;
>  hwaddr dram_end = dram_base + mem_size;
>  int ret = fdt_pack(fdt);
>  int fdtsize;
> @@ -306,11 +318,18 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
> mem_size, void *fdt)
>   * end of dram or 3GB whichever is lesser.
>   */
>  temp = (dram_base < 3072 * MiB) ? MIN(dram_end, 3072 * MiB) : dram_end;
> -fdt_addr = QEMU_ALIGN_DOWN(temp - fdtsize, 2 * MiB);
>
> -ret = fdt_pack(fdt);
> -/* Should only fail if we've built a corrupted tree */
> -g_assert(ret == 0);
> +return QEMU_ALIGN_DOWN(temp - fdtsize, 2 * MiB);
> +}
> +
> +/*
> + * 'fdt_addr' is received as hwaddr because boards might put
> + * the FDT beyond 32-bit addressing boundary.
> + */
> +void riscv_load_fdt(hwaddr fdt_addr, void *fdt)
> +{
> +uint32_t fdtsize = fdt_totalsize(fdt);
> +
>  /* copy in the device tree */
>  qemu_fdt_dumpdtb(fdt, fdtsize);
>
> @@ -318,8 +337,6 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
> mem_size, void *fdt)
>&address_space_memory);
>  qemu_register_reset_nosnapshotload(qemu_fdt_randomize_seeds,
>  rom_ptr_for_as(&address_space_memory, fdt_addr, 
> fdtsize));
> -
> -return fdt_addr;
>  }
>
>  void riscv_rom_copy_firmware_info(MachineState *machine, hwaddr rom_base,
> diff --git a/hw/riscv/microchip_pfsoc.c b/hw/riscv/microchip_pfsoc.c
> index b7e171b605..a30203db85 100644
> --- a/hw/riscv/microchip_pfsoc.c
> +++ b/hw/riscv/microchip_pfsoc.c
> @@ -633,8 +633,10 @@ static void 
> microchip_icicle_kit_machine_init(MachineState *machine)
>   kernel_start_addr, true, NULL);
>
>  /* Compute the fdt load address in dram */
> -fdt_load_addr = riscv_load_fdt(memmap[MICROCHIP_PFSOC_DRAM_LO].base,
> -   machine->ram_size, machine->fdt);
> +fdt_load_addr = 
> riscv_compute_fdt_addr(memmap[MICROCHIP_PFSOC_DRAM_LO].base,
> +   machine->ram_size, 
> machine->fdt);
> +riscv_load_fdt(fdt_load_addr, machine->fdt);
> +
>  /* Load the reset vector */
>  riscv_setup_rom_reset_vec(machine, &s->soc.u_cpus, 
> firmware_load_addr,
>memmap[MICROCHIP_PFSOC_ENVM_DATA].base,
> diff --git a/hw/riscv/sifive_u.c b/hw/riscv/sifive_u.c
> index b0b3e6f03a..6bbdbe5fb7 100644
> --- a/hw/riscv/sifive_u.c
> +++ b/hw/riscv/sifive_u.c
> @@ -608,9 +608,10 @@ static void sifive_u_machine_init(MachineState *machine)
>  

Re: [PATCH v7] tests/qtest: netdev: test stream and dgram backends

2023-01-28 Thread Jason Wang
On Thu, Jan 26, 2023 at 8:48 PM Thomas Huth  wrote:
>
> On 18/01/2023 13.04, Laurent Vivier wrote:
> > Signed-off-by: Laurent Vivier 
> > Acked-by: Michael S. Tsirkin 
> > Acked-by: Thomas Huth 
> > ---
> >
> > Notes:
> >  v7:
> >- disable test_dgram_mcast() on windows
> >- disable test_dgram_unix() on windows as it also fails
> >  (we test for unix support dynamically but the test is done with
> >   SOCK_STREAM, and it fails here with SOCK_DGRAM)
> >- Tested with cirrus-ci (Thank you Thomas)
>
> Thanks, added to my staging branch:
>
>   https://gitlab.com/thuth/qemu/-/commits/staging
>
>Thomas

I've also queued this since it is required for the patch:

[PATCH v4] net: stream: add a new option to automatically reconnect

Thanks

>




Re: [PATCH v4] net: stream: add a new option to automatically reconnect

2023-01-28 Thread Jason Wang
On Thu, Jan 19, 2023 at 6:16 PM Laurent Vivier  wrote:
>
> In stream mode, if the server shuts down there is currently
> no way to reconnect the client to a new server without removing
> the NIC device and the netdev backend (or to reboot).
>
> This patch introduces a reconnect option that specifies a delay
> to try to reconnect with the same parameters.
>
> Add a new test in qtest to test the reconnect option and the
> connect/disconnect events.
>
> Signed-off-by: Laurent Vivier 

Applied.

Thanks

> ---
> Based-on: <20230118120405.1876329-1-lviv...@redhat.com>
>
> v4:
> - rebase
>
> v3:
> - add "since 8.0" in net.json
>
> v2:
> - rebase
>
>  net/stream.c|  53 ++-
>  qapi/net.json   |   7 ++-
>  qemu-options.hx |   6 +--
>  tests/qtest/netdev-socket.c | 101 
>  4 files changed, 162 insertions(+), 5 deletions(-)
>
> diff --git a/net/stream.c b/net/stream.c
> index 37ff727e0c42..9204b4c96e40 100644
> --- a/net/stream.c
> +++ b/net/stream.c
> @@ -39,6 +39,8 @@
>  #include "io/channel-socket.h"
>  #include "io/net-listener.h"
>  #include "qapi/qapi-events-net.h"
> +#include "qapi/qapi-visit-sockets.h"
> +#include "qapi/clone-visitor.h"
>
>  typedef struct NetStreamState {
>  NetClientState nc;
> @@ -49,11 +51,15 @@ typedef struct NetStreamState {
>  guint ioc_write_tag;
>  SocketReadState rs;
>  unsigned int send_index;  /* number of bytes sent*/
> +uint32_t reconnect;
> +guint timer_tag;
> +SocketAddress *addr;
>  } NetStreamState;
>
>  static void net_stream_listen(QIONetListener *listener,
>QIOChannelSocket *cioc,
>void *opaque);
> +static void net_stream_arm_reconnect(NetStreamState *s);
>
>  static gboolean net_stream_writable(QIOChannel *ioc,
>  GIOCondition condition,
> @@ -170,6 +176,7 @@ static gboolean net_stream_send(QIOChannel *ioc,
>  qemu_set_info_str(&s->nc, "%s", "");
>
>  qapi_event_send_netdev_stream_disconnected(s->nc.name);
> +net_stream_arm_reconnect(s);
>
>  return G_SOURCE_REMOVE;
>  }
> @@ -187,6 +194,14 @@ static gboolean net_stream_send(QIOChannel *ioc,
>  static void net_stream_cleanup(NetClientState *nc)
>  {
>  NetStreamState *s = DO_UPCAST(NetStreamState, nc, nc);
> +if (s->timer_tag) {
> +g_source_remove(s->timer_tag);
> +s->timer_tag = 0;
> +}
> +if (s->addr) {
> +qapi_free_SocketAddress(s->addr);
> +s->addr = NULL;
> +}
>  if (s->ioc) {
>  if (QIO_CHANNEL_SOCKET(s->ioc)->fd != -1) {
>  if (s->ioc_read_tag) {
> @@ -346,12 +361,37 @@ static void net_stream_client_connected(QIOTask *task, 
> gpointer opaque)
>  error:
>  object_unref(OBJECT(s->ioc));
>  s->ioc = NULL;
> +net_stream_arm_reconnect(s);
> +}
> +
> +static gboolean net_stream_reconnect(gpointer data)
> +{
> +NetStreamState *s = data;
> +QIOChannelSocket *sioc;
> +
> +s->timer_tag = 0;
> +
> +sioc = qio_channel_socket_new();
> +s->ioc = QIO_CHANNEL(sioc);
> +qio_channel_socket_connect_async(sioc, s->addr,
> + net_stream_client_connected, s,
> + NULL, NULL);
> +return G_SOURCE_REMOVE;
> +}
> +
> +static void net_stream_arm_reconnect(NetStreamState *s)
> +{
> +if (s->reconnect && s->timer_tag == 0) {
> +s->timer_tag = g_timeout_add_seconds(s->reconnect,
> + net_stream_reconnect, s);
> +}
>  }
>
>  static int net_stream_client_init(NetClientState *peer,
>const char *model,
>const char *name,
>SocketAddress *addr,
> +  uint32_t reconnect,
>Error **errp)
>  {
>  NetStreamState *s;
> @@ -364,6 +404,10 @@ static int net_stream_client_init(NetClientState *peer,
>  s->ioc = QIO_CHANNEL(sioc);
>  s->nc.link_down = true;
>
> +s->reconnect = reconnect;
> +if (reconnect) {
> +s->addr = QAPI_CLONE(SocketAddress, addr);
> +}
>  qio_channel_socket_connect_async(sioc, addr,
>   net_stream_client_connected, s,
>   NULL, NULL);
> @@ -380,7 +424,14 @@ int net_init_stream(const Netdev *netdev, const char 
> *name,
>  sock = &netdev->u.stream;
>
>  if (!sock->has_server || !sock->server) {
> -return net_stream_client_init(peer, "stream", name, sock->addr, 
> errp);
> +return net_stream_client_init(peer, "stream", name, sock->addr,
> +  sock->has_reconnect ? sock->reconnect 
> : 0,
> +  errp);
> +}
> +if (sock->has_reconnect) {
> +error_setg(errp, "'recon

Re: [PATCH v4 1/3] hw/riscv/boot.c: calculate fdt size after fdt_pack()

2023-01-28 Thread Bin Meng
On Sun, Jan 29, 2023 at 10:20 AM Bin Meng  wrote:
>
> Hi Daniel,
>
> On Thu, Jan 26, 2023 at 9:53 PM Daniel Henrique Barboza
>  wrote:
> >
> > fdt_pack() can change the fdt size, meaning that fdt_totalsize() can
> > contain a now deprecated (bigger) value.
>
> The commit message is a bit confusing.
>
> The original code in this patch does not call fdt_pack(). So not sure
> where the issue of "deprecated (bigger) value" happens?

I see where the call to fdt_pack() happens.

I think you should move the following changes in patch#2 of this
series to this commit.

-ret = fdt_pack(fdt);
-/* Should only fail if we've built a corrupted tree */
-g_assert(ret == 0);

After that, your commit message makes sense, as it describes the
problem and how your patch fixes the problem.

>
> >
> > Reviewed-by: Alistair Francis 
> > Signed-off-by: Daniel Henrique Barboza 
> > ---
> >  hw/riscv/boot.c | 7 ++-
> >  1 file changed, 6 insertions(+), 1 deletion(-)
> >
> > diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
> > index 3172a76220..a563b7482a 100644
> > --- a/hw/riscv/boot.c
> > +++ b/hw/riscv/boot.c
> > @@ -287,8 +287,13 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
> > mem_size, void *fdt)
> >  {
> >  uint64_t temp, fdt_addr;
> >  hwaddr dram_end = dram_base + mem_size;
> > -int ret, fdtsize = fdt_totalsize(fdt);
> > +int ret = fdt_pack(fdt);
> > +int fdtsize;
> >
> > +/* Should only fail if we've built a corrupted tree */
> > +g_assert(ret == 0);
> > +
> > +fdtsize = fdt_totalsize(fdt);
> >  if (fdtsize <= 0) {
> >  error_report("invalid device-tree");
> >  exit(1);
>

Regards,
Bin



Re: [PATCH] pci: add enforce_slot_reserved_mask_manual property

2023-01-28 Thread Chuck Zmudzinski
On 1/28/2023 4:58 PM, Mark Cave-Ayland wrote:
> On 28/01/2023 03:39, Chuck Zmudzinski wrote:
>
> > On 1/27/2023 8:28 AM, Michael S. Tsirkin wrote:
> >> On Sun, Jan 15, 2023 at 07:49:51PM -0500, Chuck Zmudzinski wrote:
> >>> The current reserved slot check in do_pci_register_device(), added with
> >>> commit 8b8849844fd6
> >>
> >> add ("subject here") please
> >>
> >>> ,is done even if the pci device being added is
> >>> configured manually for a particular slot. The new property, when set
> >>> to false, disables the check when the device is configured to request a
> >>> particular slot. This allows an administrator or management tool to
> >>> override slot_reserved_mask for a pci device by requesting a particular
> >>> slot for the device. The new property is initialized to true which
> >>> preserves the existing behavior of slot_reserved_mask by default.
> >>>
> >>> Signed-off-by: Chuck Zmudzinski 
> >>
> >> Thanks!
> >> I'm trying to think of the best default for this.
> > 
> > I think it would be better for the default value of
> > enforce_slot_reserved_mask_manual to be false, so that a
> > user-specified slot will by default override slot_reserved_mask.
> > But doing that would change the current behavior of
> > slot_reserved_mask.
> > 
> > Currently, this is the only place where slot_reserved_mask is used in all
> > of the Qemu source (code from hw/sparc64/sun4u.c):
> > 
> > -- snip ---
> >      /* Only in-built Simba APBs can exist on the root bus, slot 0 on busA 
> > is
> >     reserved (leaving no slots free after on-board devices) however 
> > slots
> >     0-3 are free on busB */
> >      pci_bus->slot_reserved_mask = 0xfffc;
> >      pci_busA->slot_reserved_mask = 0xfff1;
> >      pci_busB->slot_reserved_mask = 0xfff0;
> > -- snip ---
> > 
> > I think we could safely change the default value of
> > enforce_slot_reserved_mask_manual to false but set
> > it to true for the sparc64 sun4u board here to preserve
> > the current behavior of the only existing board in Qemu
> > that uses slot_reserved_mask.
> > 
> > What do you think?
> > 
> >> Users is trying to configure a specific device on a reserved
> >> slot. Should we
> >> CC a bunch more people for visibility. Input, anyone?
>
> For a bit of background, slot_reserved_mask was added by me to solve a 
> problem with 
> the sun4u machine: on a real Ultra-5, the pci "A" bus has 2 free slots and 
> the pci 
> "B" bus has 4 free slots. Whilst it is possible to plug a PCI device into any 
> slot in 
> QEMU, the PCI bridges only have IRQ mapping registers for those 6 slots, so 
> you can 
> easily end up with an auto-allocated slot where it is impossible for the OS 
> to map 
> the IRQ.
>
> Hence slot_reserved_mask was originally intended to mark slots as being 
> unavailable 
> for both manual and automatic allocation to ensure that devices plugged into 
> both PCI 
> buses would always work.
>
> If there is a need to change/refactor the logic then I can test the sun4u 
> machine to 
> ensure the original test case still works.
>
>
> ATB,
>
> Mark.

Thanks, I will let you know if there is a patch to test on the
sun4u machine. For now, we are waiting to see if the xen
maintainers will accept a patch that uses slot_reserved_mask
to prevent other devices from using the slot that is required
by the Intel igd in the xenfv machine. That patch does not change
the way slot_reserved_mask works, but if that patch is added
some users might want to add a capability for a user to override
slot_reserved_mask, and that is what this patch attempts to
implement.

Kind regards,

Chuck



[PATCH v4 12/12] MAINTAINERS: add myself as the maintainer for cryptodev

2023-01-28 Thread zhenwei pi
I developed the akcipher service, QoS setting, QMP/HMP commands and
statistics accounting for crypto device. Making myself as the
maintainer for QEMU's cryptodev.

Cc: Gonglei 
Signed-off-by: zhenwei pi 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9f6c54b145..e21a6ee470 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2857,6 +2857,7 @@ T: git https://gitlab.com/ehabkost/qemu.git machine-next
 
 Cryptodev Backends
 M: Gonglei 
+M: zhenwei pi 
 S: Maintained
 F: include/sysemu/cryptodev*.h
 F: backends/cryptodev*.c
-- 
2.34.1




[PATCH v4 11/12] cryptodev: Support query-stats QMP command

2023-01-28 Thread zhenwei pi
Now we can use the "query-stats" QMP command to query statistics of
crypto devices. (Originally this was designed to show statistics
via '{"execute": "query-cryptodev"}'. Daniel Berrangé suggested
querying configuration info with "query-cryptodev" and runtime
performance info with "query-stats". This makes sense!)

Example:
~# virsh qemu-monitor-command vm '{"execute": "query-stats", \
   "arguments": {"target": "cryptodev"} }' | jq
{
  "return": [
{
  "provider": "cryptodev",
  "stats": [
{
  "name": "asym-verify-bytes",
  "value": 7680
},
...
{
  "name": "asym-decrypt-ops",
  "value": 32
},
{
  "name": "asym-encrypt-ops",
  "value": 48
}
  ],
  "qom-path": "/objects/cryptodev0" # support asym only
},
{
  "provider": "cryptodev",
  "stats": [
{
  "name": "asym-verify-bytes",
  "value": 0
},
...
{
  "name": "sym-decrypt-bytes",
  "value": 5376
},
...
  ],
  "qom-path": "/objects/cryptodev1" # support asym/sym
}
  ],
  "id": "libvirt-422"
}

Suggested-by: Daniel P. Berrangé 
Signed-off-by: zhenwei pi 
---
 backends/cryptodev.c | 141 +++
 monitor/hmp-cmds.c   |   5 ++
 monitor/qmp-cmds.c   |   2 +
 qapi/stats.json  |  10 ++-
 4 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index 09ffdd345f..9d52220772 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -22,9 +22,11 @@
  */
 
 #include "qemu/osdep.h"
+#include "monitor/stats.h"
 #include "sysemu/cryptodev.h"
 #include "qapi/error.h"
 #include "qapi/qapi-commands-cryptodev.h"
+#include "qapi/qapi-types-stats.h"
 #include "qapi/visitor.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
@@ -32,6 +34,14 @@
 #include "qom/object_interfaces.h"
 #include "hw/virtio/virtio-crypto.h"
 
+typedef struct StatsArgs {
+union StatsResultsType {
+StatsResultList **stats;
+StatsSchemaList **schema;
+} result;
+strList *names;
+Error **errp;
+} StatsArgs;
 
 static QTAILQ_HEAD(, CryptoDevBackendClient) crypto_clients;
 
@@ -435,6 +445,134 @@ static void cryptodev_backend_finalize(Object *obj)
 }
 }
 
+static StatsList *cryptodev_backend_stats_add(const char *name, int64_t *val,
+  StatsList *stats_list)
+{
+Stats *stats = g_new0(Stats, 1);
+
+stats->name = g_strdup(name);
+stats->value = g_new0(StatsValue, 1);
+stats->value->type = QTYPE_QNUM;
+stats->value->u.scalar = *val;
+
+QAPI_LIST_PREPEND(stats_list, stats);
+return stats_list;
+}
+
+static int cryptodev_backend_stats_query(Object *obj, void *data)
+{
+StatsArgs *stats_args = data;
+StatsResultList **stats_results = stats_args->result.stats;
+StatsList *stats_list = NULL;
+StatsResult *entry;
+CryptoDevBackend *backend;
+QCryptodevBackendSymStat *sym_stat;
+QCryptodevBackendAsymStat *asym_stat;
+
+if (!object_dynamic_cast(obj, TYPE_CRYPTODEV_BACKEND)) {
+return 0;
+}
+
+backend = CRYPTODEV_BACKEND(obj);
+sym_stat = backend->sym_stat;
+if (sym_stat) {
+stats_list = cryptodev_backend_stats_add("sym-encrypt-ops",
+ &sym_stat->encrypt_ops, stats_list);
+stats_list = cryptodev_backend_stats_add("sym-decrypt-ops",
+ &sym_stat->decrypt_ops, stats_list);
+stats_list = cryptodev_backend_stats_add("sym-encrypt-bytes",
+ &sym_stat->encrypt_bytes, stats_list);
+stats_list = cryptodev_backend_stats_add("sym-decrypt-bytes",
+ &sym_stat->decrypt_bytes, stats_list);
+}
+
+asym_stat = backend->asym_stat;
+if (asym_stat) {
+stats_list = cryptodev_backend_stats_add("asym-encrypt-ops",
+ &asym_stat->encrypt_ops, stats_list);
+stats_list = cryptodev_backend_stats_add("asym-decrypt-ops",
+ &asym_stat->decrypt_ops, stats_list);
+stats_list = cryptodev_backend_stats_add("asym-sign-ops",
+ &asym_stat->sign_ops, stats_list);
+stats_list = cryptodev_backend_stats_add("asym-verify-ops",
+ &asym_stat->verify_ops, stats_list);
+stats_list = cryptodev_backend_stats_add("asym-encrypt-bytes",
+ &asym_stat->encrypt_bytes, stats_list);
+stats_list = cryptodev_backend_stats_add("asym-decrypt-bytes",
+ &asym_stat->decrypt_bytes, stats_list);
+stats_list = cryptodev_backend_stats_add("asym-sign-bytes",
+ &asym_stat->sign_bytes, stats_list);
+stats_list = cryptodev_backend_stats_add("asym-verify-bytes",
+ &asym_stat->verify_bytes, stats_list);
+ 

[PATCH v4 04/12] cryptodev: Introduce server type in QAPI

2023-01-28 Thread zhenwei pi
Introduce cryptodev service type in cryptodev.json, then apply this
to related codes. Now we can remove VIRTIO_CRYPTO_SERVICE_xxx
dependence from QEMU cryptodev.

Reviewed-by: Daniel P. Berrangé 
Signed-off-by: zhenwei pi 
---
 backends/cryptodev-builtin.c|  8 
 backends/cryptodev-lkcf.c   |  2 +-
 backends/cryptodev-vhost-user.c |  6 +++---
 hw/virtio/virtio-crypto.c   | 27 +--
 qapi/cryptodev.json | 11 +++
 5 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c
index e70dcd5dad..c0fbb650d7 100644
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -79,10 +79,10 @@ static void cryptodev_builtin_init(
 backend->conf.peers.ccs[0] = cc;
 
 backend->conf.crypto_services =
- 1u << VIRTIO_CRYPTO_SERVICE_CIPHER |
- 1u << VIRTIO_CRYPTO_SERVICE_HASH |
- 1u << VIRTIO_CRYPTO_SERVICE_MAC |
- 1u << VIRTIO_CRYPTO_SERVICE_AKCIPHER;
+ 1u << QCRYPTODEV_BACKEND_SERVICE_CIPHER |
+ 1u << QCRYPTODEV_BACKEND_SERVICE_HASH |
+ 1u << QCRYPTODEV_BACKEND_SERVICE_MAC |
+ 1u << QCRYPTODEV_BACKEND_SERVICE_AKCIPHER;
 backend->conf.cipher_algo_l = 1u << VIRTIO_CRYPTO_CIPHER_AES_CBC;
 backend->conf.hash_algo = 1u << VIRTIO_CRYPTO_HASH_SHA1;
 backend->conf.akcipher_algo = 1u << VIRTIO_CRYPTO_AKCIPHER_RSA;
diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c
index 53a932b58d..edec99f104 100644
--- a/backends/cryptodev-lkcf.c
+++ b/backends/cryptodev-lkcf.c
@@ -230,7 +230,7 @@ static void cryptodev_lkcf_init(CryptoDevBackend *backend, 
Error **errp)
 backend->conf.peers.ccs[0] = cc;
 
 backend->conf.crypto_services =
-1u << VIRTIO_CRYPTO_SERVICE_AKCIPHER;
+1u << QCRYPTODEV_BACKEND_SERVICE_AKCIPHER;
 backend->conf.akcipher_algo = 1u << VIRTIO_CRYPTO_AKCIPHER_RSA;
 lkcf->running = true;
 
diff --git a/backends/cryptodev-vhost-user.c b/backends/cryptodev-vhost-user.c
index 580bd1abb0..b1d9eb735f 100644
--- a/backends/cryptodev-vhost-user.c
+++ b/backends/cryptodev-vhost-user.c
@@ -221,9 +221,9 @@ static void cryptodev_vhost_user_init(
  cryptodev_vhost_user_event, NULL, s, NULL, true);
 
 backend->conf.crypto_services =
- 1u << VIRTIO_CRYPTO_SERVICE_CIPHER |
- 1u << VIRTIO_CRYPTO_SERVICE_HASH |
- 1u << VIRTIO_CRYPTO_SERVICE_MAC;
+ 1u << QCRYPTODEV_BACKEND_SERVICE_CIPHER |
+ 1u << QCRYPTODEV_BACKEND_SERVICE_HASH |
+ 1u << QCRYPTODEV_BACKEND_SERVICE_MAC;
 backend->conf.cipher_algo_l = 1u << VIRTIO_CRYPTO_CIPHER_AES_CBC;
 backend->conf.hash_algo = 1u << VIRTIO_CRYPTO_HASH_SHA1;
 
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 0d1be0ada9..e4f0de4d1c 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -997,12 +997,35 @@ static void virtio_crypto_reset(VirtIODevice *vdev)
 }
 }
 
+static uint32_t virtio_crypto_init_services(uint32_t qservices)
+{
+uint32_t vservices = 0;
+
+if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_CIPHER)) {
+vservices |= (1 << VIRTIO_CRYPTO_SERVICE_CIPHER);
+}
+if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_HASH)) {
+vservices |= (1 << VIRTIO_CRYPTO_SERVICE_HASH);
+}
+if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_MAC)) {
+vservices |= (1 << VIRTIO_CRYPTO_SERVICE_MAC);
+}
+if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_AEAD)) {
+vservices |= (1 << VIRTIO_CRYPTO_SERVICE_AEAD);
+}
+if (qservices & (1 << QCRYPTODEV_BACKEND_SERVICE_AKCIPHER)) {
+vservices |= (1 << VIRTIO_CRYPTO_SERVICE_AKCIPHER);
+}
+
+return vservices;
+}
+
 static void virtio_crypto_init_config(VirtIODevice *vdev)
 {
 VirtIOCrypto *vcrypto = VIRTIO_CRYPTO(vdev);
 
-vcrypto->conf.crypto_services =
- vcrypto->conf.cryptodev->conf.crypto_services;
+vcrypto->conf.crypto_services = virtio_crypto_init_services(
+ vcrypto->conf.cryptodev->conf.crypto_services);
 vcrypto->conf.cipher_algo_l =
  vcrypto->conf.cryptodev->conf.cipher_algo_l;
 vcrypto->conf.cipher_algo_h =
diff --git a/qapi/cryptodev.json b/qapi/cryptodev.json
index ebb6852035..8732a30524 100644
--- a/qapi/cryptodev.json
+++ b/qapi/cryptodev.json
@@ -18,6 +18,17 @@
   'prefix': 'QCRYPTODEV_BACKEND_ALG',
   'data': ['sym', 'asym']}
 
+##
+# @QCryptodevBackendServiceType:
+#
+# The supported service types of a crypto device.
+#
+# Since: 8.0
+##
+{ 'enum': 'QCryptodevBackendServiceType',
+  'prefix': 'QCRYPTODEV_BACKEND_SERVICE',
+  'data': ['cipher', 'hash', 'mac', 'aead', 'akcipher']}
+
 ##
 # @QCryptodev

[PATCH v4 06/12] cryptodev-builtin: Detect akcipher capability

2023-01-28 Thread zhenwei pi
Rather than exposing akcipher service/RSA algorithm to virtio crypto
device unconditionally, detect akcipher capability from akcipher
crypto framework. This avoids unsuccessful requests.

Reviewed-by: Daniel P. Berrangé 
Signed-off-by: zhenwei pi 
---
 backends/cryptodev-builtin.c | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c
index c0fbb650d7..c45b5906c5 100644
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -59,6 +59,19 @@ struct CryptoDevBackendBuiltin {
 CryptoDevBackendBuiltinSession *sessions[MAX_NUM_SESSIONS];
 };
 
+static void cryptodev_builtin_init_akcipher(CryptoDevBackend *backend)
+{
+QCryptoAkCipherOptions opts;
+
+opts.alg = QCRYPTO_AKCIPHER_ALG_RSA;
+opts.u.rsa.padding_alg = QCRYPTO_RSA_PADDING_ALG_RAW;
+if (qcrypto_akcipher_supports(&opts)) {
+backend->conf.crypto_services |=
+ (1u << QCRYPTODEV_BACKEND_SERVICE_AKCIPHER);
+backend->conf.akcipher_algo = 1u << VIRTIO_CRYPTO_AKCIPHER_RSA;
+}
+}
+
 static void cryptodev_builtin_init(
  CryptoDevBackend *backend, Error **errp)
 {
@@ -81,11 +94,9 @@ static void cryptodev_builtin_init(
 backend->conf.crypto_services =
  1u << QCRYPTODEV_BACKEND_SERVICE_CIPHER |
  1u << QCRYPTODEV_BACKEND_SERVICE_HASH |
- 1u << QCRYPTODEV_BACKEND_SERVICE_MAC |
- 1u << QCRYPTODEV_BACKEND_SERVICE_AKCIPHER;
+ 1u << QCRYPTODEV_BACKEND_SERVICE_MAC;
 backend->conf.cipher_algo_l = 1u << VIRTIO_CRYPTO_CIPHER_AES_CBC;
 backend->conf.hash_algo = 1u << VIRTIO_CRYPTO_HASH_SHA1;
-backend->conf.akcipher_algo = 1u << VIRTIO_CRYPTO_AKCIPHER_RSA;
 /*
  * Set the Maximum length of crypto request.
  * Why this value? Just avoid to overflow when
@@ -94,6 +105,7 @@ static void cryptodev_builtin_init(
 backend->conf.max_size = LONG_MAX - sizeof(CryptoDevBackendOpInfo);
 backend->conf.max_cipher_key_len = CRYPTODEV_BUITLIN_MAX_CIPHER_KEY_LEN;
 backend->conf.max_auth_key_len = CRYPTODEV_BUITLIN_MAX_AUTH_KEY_LEN;
+cryptodev_builtin_init_akcipher(backend);
 
 cryptodev_backend_set_ready(backend, true);
 }
-- 
2.34.1




[PATCH v4 10/12] cryptodev: support QoS

2023-01-28 Thread zhenwei pi
Add 'throttle-bps' and 'throttle-ops' limitation to set QoS. The
two arguments work with both QEMU command line and QMP command.

Example of QEMU command line:
-object cryptodev-backend-builtin,id=cryptodev1,throttle-bps=1600,\
throttle-ops=100

Example of QMP command:
virsh qemu-monitor-command buster --hmp qom-set /objects/cryptodev1 \
throttle-ops 100

or cancel limitation:
virsh qemu-monitor-command buster --hmp qom-set /objects/cryptodev1 \
throttle-ops 0

Signed-off-by: zhenwei pi 
---
 backends/cryptodev.c   | 138 +
 include/sysemu/cryptodev.h |   7 ++
 qapi/qom.json  |   8 ++-
 3 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index cc824e9665..09ffdd345f 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -28,6 +28,7 @@
 #include "qapi/visitor.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
 #include "qom/object_interfaces.h"
 #include "hw/virtio/virtio-crypto.h"
 
@@ -203,17 +204,53 @@ static int cryptodev_backend_account(CryptoDevBackend 
*backend,
 return len;
 }
 
+static void cryptodev_backend_throttle_timer_cb(void *opaque)
+{
+CryptoDevBackend *backend = (CryptoDevBackend *)opaque;
+CryptoDevBackendOpInfo *op_info, *tmpop;
+int ret;
+
+QTAILQ_FOREACH_SAFE(op_info, &backend->opinfos, next, tmpop) {
+QTAILQ_REMOVE(&backend->opinfos, op_info, next);
+ret = cryptodev_backend_account(backend, op_info);
+if (ret < 0) {
+op_info->cb(op_info->opaque, ret);
+continue;
+}
+
+throttle_account(&backend->ts, true, ret);
+cryptodev_backend_operation(backend, op_info);
+if (throttle_enabled(&backend->tc) &&
+throttle_schedule_timer(&backend->ts, &backend->tt, true)) {
+break;
+}
+}
+}
+
 int cryptodev_backend_crypto_operation(
  CryptoDevBackend *backend,
  CryptoDevBackendOpInfo *op_info)
 {
 int ret;
 
+if (!throttle_enabled(&backend->tc)) {
+goto do_account;
+}
+
+if (throttle_schedule_timer(&backend->ts, &backend->tt, true) ||
+!QTAILQ_EMPTY(&backend->opinfos)) {
+QTAILQ_INSERT_TAIL(&backend->opinfos, op_info, next);
+return 0;
+}
+
+do_account:
 ret = cryptodev_backend_account(backend, op_info);
 if (ret < 0) {
 return ret;
 }
 
+throttle_account(&backend->ts, true, ret);
+
 return cryptodev_backend_operation(backend, op_info);
 }
 
@@ -245,12 +282,98 @@ cryptodev_backend_set_queues(Object *obj, Visitor *v, 
const char *name,
 backend->conf.peers.queues = value;
 }
 
+static void cryptodev_backend_set_throttle(CryptoDevBackend *backend, int 
field,
+   uint64_t value, Error **errp)
+{
+uint64_t orig = backend->tc.buckets[field].avg;
+bool enabled = throttle_enabled(&backend->tc);
+
+if (orig == value) {
+return;
+}
+
+backend->tc.buckets[field].avg = value;
+if (!throttle_enabled(&backend->tc)) {
+throttle_timers_destroy(&backend->tt);
+cryptodev_backend_throttle_timer_cb(backend); /* drain opinfos */
+return;
+}
+
+if (!throttle_is_valid(&backend->tc, errp)) {
+backend->tc.buckets[field].avg = orig; /* revert change */
+return;
+}
+
+if (!enabled) {
+throttle_init(&backend->ts);
+throttle_timers_init(&backend->tt, qemu_get_aio_context(),
+ QEMU_CLOCK_REALTIME,
+ cryptodev_backend_throttle_timer_cb, /* FIXME */
+ cryptodev_backend_throttle_timer_cb, backend);
+}
+
+throttle_config(&backend->ts, QEMU_CLOCK_REALTIME, &backend->tc);
+}
+
+static void cryptodev_backend_get_bps(Object *obj, Visitor *v,
+  const char *name, void *opaque,
+  Error **errp)
+{
+CryptoDevBackend *backend = CRYPTODEV_BACKEND(obj);
+uint64_t value = backend->tc.buckets[THROTTLE_BPS_TOTAL].avg;
+
+visit_type_uint64(v, name, &value, errp);
+}
+
+static void cryptodev_backend_set_bps(Object *obj, Visitor *v, const char 
*name,
+  void *opaque, Error **errp)
+{
+CryptoDevBackend *backend = CRYPTODEV_BACKEND(obj);
+uint64_t value;
+
+if (!visit_type_uint64(v, name, &value, errp)) {
+return;
+}
+
+cryptodev_backend_set_throttle(backend, THROTTLE_BPS_TOTAL, value, errp);
+}
+
+static void cryptodev_backend_get_ops(Object *obj, Visitor *v, const char 
*name,
+  void *opaque, Error **errp)
+{
+CryptoDevBackend *backend = CRYPTODEV_BACKEND(obj);
+uint64_t value = backend->tc.buckets[THROTTLE_OPS_TOTAL].avg;
+
+visit_type_uint64(v, name, &value, errp);
+}
+
+static void cryptodev_backend_set_ops(Object 

[PATCH v4 02/12] cryptodev: Remove 'name' & 'model' fields

2023-01-28 Thread zhenwei pi
We have already used qapi to generate crypto device types, this allows
to convert type to a string 'model', so the 'model' field is not
needed.

And the 'name' field is not used by any backend driver, drop it.

Reviewed-by: Daniel P. Berrangé 
Signed-off-by: zhenwei pi 
---
 backends/cryptodev-builtin.c|  3 +--
 backends/cryptodev-lkcf.c   |  2 +-
 backends/cryptodev-vhost-user.c |  3 +--
 backends/cryptodev.c| 11 +--
 include/sysemu/cryptodev.h  | 12 +++-
 5 files changed, 7 insertions(+), 24 deletions(-)

diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c
index 8c7c10847d..08895271eb 100644
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -72,8 +72,7 @@ static void cryptodev_builtin_init(
 return;
 }
 
-cc = cryptodev_backend_new_client(
-  "cryptodev-builtin", NULL);
+cc = cryptodev_backend_new_client();
 cc->info_str = g_strdup_printf("cryptodev-builtin0");
 cc->queue_index = 0;
 cc->type = QCRYPTODEV_BACKEND_TYPE_BUILTIN;
diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c
index 91e02c0df9..de3d1867c5 100644
--- a/backends/cryptodev-lkcf.c
+++ b/backends/cryptodev-lkcf.c
@@ -223,7 +223,7 @@ static void cryptodev_lkcf_init(CryptoDevBackend *backend, 
Error **errp)
 return;
 }
 
-cc = cryptodev_backend_new_client("cryptodev-lkcf", NULL);
+cc = cryptodev_backend_new_client();
 cc->info_str = g_strdup_printf("cryptodev-lkcf0");
 cc->queue_index = 0;
 cc->type = QCRYPTODEV_BACKEND_TYPE_LKCF;
diff --git a/backends/cryptodev-vhost-user.c b/backends/cryptodev-vhost-user.c
index c165a1b1d6..580bd1abb0 100644
--- a/backends/cryptodev-vhost-user.c
+++ b/backends/cryptodev-vhost-user.c
@@ -198,8 +198,7 @@ static void cryptodev_vhost_user_init(
 s->opened = true;
 
 for (i = 0; i < queues; i++) {
-cc = cryptodev_backend_new_client(
-  "cryptodev-vhost-user", NULL);
+cc = cryptodev_backend_new_client();
 cc->info_str = g_strdup_printf("cryptodev-vhost-user%zu to %s ",
i, chr->label);
 cc->queue_index = i;
diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index 54ee8c81f5..81941af816 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -34,18 +34,11 @@
 static QTAILQ_HEAD(, CryptoDevBackendClient) crypto_clients;
 
 
-CryptoDevBackendClient *
-cryptodev_backend_new_client(const char *model,
-const char *name)
+CryptoDevBackendClient *cryptodev_backend_new_client(void)
 {
 CryptoDevBackendClient *cc;
 
 cc = g_new0(CryptoDevBackendClient, 1);
-cc->model = g_strdup(model);
-if (name) {
-cc->name = g_strdup(name);
-}
-
 QTAILQ_INSERT_TAIL(&crypto_clients, cc, next);
 
 return cc;
@@ -55,8 +48,6 @@ void cryptodev_backend_free_client(
   CryptoDevBackendClient *cc)
 {
 QTAILQ_REMOVE(&crypto_clients, cc, next);
-g_free(cc->name);
-g_free(cc->model);
 g_free(cc->info_str);
 g_free(cc);
 }
diff --git a/include/sysemu/cryptodev.h b/include/sysemu/cryptodev.h
index 8d2adda974..af152d09db 100644
--- a/include/sysemu/cryptodev.h
+++ b/include/sysemu/cryptodev.h
@@ -218,8 +218,6 @@ struct CryptoDevBackendClass {
 
 struct CryptoDevBackendClient {
 QCryptodevBackendType type;
-char *model;
-char *name;
 char *info_str;
 unsigned int queue_index;
 int vring_enable;
@@ -264,11 +262,8 @@ struct CryptoDevBackend {
 
 /**
  * cryptodev_backend_new_client:
- * @model: the cryptodev backend model
- * @name: the cryptodev backend name, can be NULL
  *
- * Creates a new cryptodev backend client object
- * with the @name in the model @model.
+ * Creates a new cryptodev backend client object.
  *
  * The returned object must be released with
  * cryptodev_backend_free_client() when no
@@ -276,9 +271,8 @@ struct CryptoDevBackend {
  *
  * Returns: a new cryptodev backend client object
  */
-CryptoDevBackendClient *
-cryptodev_backend_new_client(const char *model,
-const char *name);
+CryptoDevBackendClient *cryptodev_backend_new_client(void);
+
 /**
  * cryptodev_backend_free_client:
  * @cc: the cryptodev backend client object
-- 
2.34.1




[PATCH v4 09/12] cryptodev: Account statistics

2023-01-28 Thread zhenwei pi
Account OPS/BPS for crypto device, this will be used for 'query-stats'
QEMU monitor command and QoS in the next step.

Note that a crypto device may support symmetric mode, asymmetric mode,
both symmetric and asymmetric mode. So we use two structure to
describe the statistics of a crypto device.

Signed-off-by: zhenwei pi 
---
 backends/cryptodev.c   | 68 +++---
 include/sysemu/cryptodev.h | 31 +
 qapi/cryptodev.json| 54 ++
 3 files changed, 148 insertions(+), 5 deletions(-)

diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index ba7b0bc770..cc824e9665 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -107,6 +107,9 @@ void cryptodev_backend_cleanup(
 if (bc->cleanup) {
 bc->cleanup(backend, errp);
 }
+
+g_free(backend->sym_stat);
+g_free(backend->asym_stat);
 }
 
 int cryptodev_backend_create_session(
@@ -154,16 +157,61 @@ static int cryptodev_backend_operation(
 return -VIRTIO_CRYPTO_NOTSUPP;
 }
 
+static int cryptodev_backend_account(CryptoDevBackend *backend,
+ CryptoDevBackendOpInfo *op_info)
+{
+enum QCryptodevBackendAlgType algtype = op_info->algtype;
+int len;
+
+if (algtype == QCRYPTODEV_BACKEND_ALG_ASYM) {
+CryptoDevBackendAsymOpInfo *asym_op_info = op_info->u.asym_op_info;
+len = asym_op_info->src_len;
+switch (op_info->op_code) {
+case VIRTIO_CRYPTO_AKCIPHER_ENCRYPT:
+QCryptodevAsymStatIncEncrypt(backend, len);
+break;
+case VIRTIO_CRYPTO_AKCIPHER_DECRYPT:
+QCryptodevAsymStatIncDecrypt(backend, len);
+break;
+case VIRTIO_CRYPTO_AKCIPHER_SIGN:
+QCryptodevAsymStatIncSign(backend, len);
+break;
+case VIRTIO_CRYPTO_AKCIPHER_VERIFY:
+QCryptodevAsymStatIncVerify(backend, len);
+break;
+default:
+return -VIRTIO_CRYPTO_NOTSUPP;
+}
+} else if (algtype == QCRYPTODEV_BACKEND_ALG_SYM) {
+CryptoDevBackendSymOpInfo *sym_op_info = op_info->u.sym_op_info;
+len = sym_op_info->src_len;
+switch (op_info->op_code) {
+case VIRTIO_CRYPTO_CIPHER_ENCRYPT:
+QCryptodevSymStatIncEncrypt(backend, len);
+break;
+case VIRTIO_CRYPTO_CIPHER_DECRYPT:
+QCryptodevSymStatIncDecrypt(backend, len);
+break;
+default:
+return -VIRTIO_CRYPTO_NOTSUPP;
+}
+} else {
+error_report("Unsupported cryptodev alg type: %" PRIu32 "", algtype);
+return -VIRTIO_CRYPTO_NOTSUPP;
+}
+
+return len;
+}
+
 int cryptodev_backend_crypto_operation(
  CryptoDevBackend *backend,
  CryptoDevBackendOpInfo *op_info)
 {
-QCryptodevBackendAlgType algtype = op_info->algtype;
+int ret;
 
-if ((algtype != QCRYPTODEV_BACKEND_ALG_SYM)
-&& (algtype != QCRYPTODEV_BACKEND_ALG_ASYM)) {
-error_report("Unsupported cryptodev alg type: %" PRIu32 "", algtype);
-return -VIRTIO_CRYPTO_NOTSUPP;
+ret = cryptodev_backend_account(backend, op_info);
+if (ret < 0) {
+return ret;
 }
 
 return cryptodev_backend_operation(backend, op_info);
@@ -202,10 +250,20 @@ cryptodev_backend_complete(UserCreatable *uc, Error 
**errp)
 {
 CryptoDevBackend *backend = CRYPTODEV_BACKEND(uc);
 CryptoDevBackendClass *bc = CRYPTODEV_BACKEND_GET_CLASS(uc);
+uint32_t services;
 
 if (bc->init) {
 bc->init(backend, errp);
 }
+
+services = backend->conf.crypto_services;
+if (services & (1 << QCRYPTODEV_BACKEND_SERVICE_CIPHER)) {
+backend->sym_stat = g_new0(QCryptodevBackendSymStat, 1);
+}
+
+if (services & (1 << QCRYPTODEV_BACKEND_SERVICE_AKCIPHER)) {
+backend->asym_stat = g_new0(QCryptodevBackendAsymStat, 1);
+}
 }
 
 void cryptodev_backend_set_used(CryptoDevBackend *backend, bool used)
diff --git a/include/sysemu/cryptodev.h b/include/sysemu/cryptodev.h
index 048a627035..15e8c04dcf 100644
--- a/include/sysemu/cryptodev.h
+++ b/include/sysemu/cryptodev.h
@@ -253,8 +253,39 @@ struct CryptoDevBackend {
 /* Tag the cryptodev backend is used by virtio-crypto or not */
 bool is_used;
 CryptoDevBackendConf conf;
+QCryptodevBackendSymStat *sym_stat;
+QCryptodevBackendAsymStat *asym_stat;
 };
 
+#define QCryptodevSymStatInc(be, op, bytes) do { \
+   be->sym_stat->op##_bytes += (bytes); \
+   be->sym_stat->op##_ops += 1; \
+} while (/*CONSTCOND*/0)
+
+#define QCryptodevSymStatIncEncrypt(be, bytes) \
+QCryptodevSymStatInc(be, encrypt, bytes)
+
+#define QCryptodevSymStatIncDecrypt(be, bytes) \
+QCryptodevSymStatInc(be, decrypt, bytes)
+
+#define QCryptodevAsymStatInc(be, op, bytes) do { \
+be->asym_stat->op##_bytes += (bytes); \
+be->asym_stat->op##_ops += 1; \
+} while (/*CONSTCOND*/0)
+
+#define QCryptodevAsymStatIncEncrypt(be, by

[PATCH v4 07/12] hmp: add cryptodev info command

2023-01-28 Thread zhenwei pi
Example of this command:
 # virsh qemu-monitor-command vm --hmp info cryptodev
cryptodev1: service=[akcipher|mac|hash|cipher]
queue 0: type=builtin
cryptodev0: service=[akcipher]
queue 0: type=lkcf

Signed-off-by: zhenwei pi 
---
 hmp-commands-info.hx  | 14 ++
 include/monitor/hmp.h |  1 +
 monitor/hmp-cmds.c| 37 +
 3 files changed, 52 insertions(+)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 754b1e8408..47d63d26db 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -993,3 +993,17 @@ SRST
   ``info virtio-queue-element`` *path* *queue* [*index*]
 Display element of a given virtio queue
 ERST
+
+{
+.name   = "cryptodev",
+.args_type  = "",
+.params = "",
+.help   = "show the crypto devices",
+.cmd= hmp_info_cryptodev,
+.flags  = "p",
+},
+
+SRST
+  ``info cryptodev``
+Show the crypto devices.
+ERST
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 1b3bdcb446..391a097ffd 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -151,5 +151,6 @@ void hmp_human_readable_text_helper(Monitor *mon,
 HumanReadableText *(*qmp_handler)(Error 
**));
 void hmp_info_stats(Monitor *mon, const QDict *qdict);
 void hmp_pcie_aer_inject_error(Monitor *mon, const QDict *qdict);
+void hmp_info_cryptodev(Monitor *mon, const QDict *qdict);
 
 #endif
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 1dba973092..cda52c2526 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -33,6 +33,7 @@
 #include "qapi/qapi-commands-block.h"
 #include "qapi/qapi-commands-char.h"
 #include "qapi/qapi-commands-control.h"
+#include "qapi/qapi-commands-cryptodev.h"
 #include "qapi/qapi-commands-machine.h"
 #include "qapi/qapi-commands-migration.h"
 #include "qapi/qapi-commands-misc.h"
@@ -2280,3 +2281,39 @@ void hmp_virtio_queue_element(Monitor *mon, const QDict 
*qdict)
 
 qapi_free_VirtioQueueElement(e);
 }
+
+void hmp_info_cryptodev(Monitor *mon, const QDict *qdict)
+{
+QCryptodevInfoList *il;
+QCryptodevBackendServiceTypeList *sl;
+QCryptodevBackendClientList *cl;
+
+for (il = qmp_query_cryptodev(NULL); il; il = il->next) {
+g_autofree char *services = NULL;
+QCryptodevInfo *info = il->value;
+char *tmp_services;
+
+/* build a string like 'service=[akcipher|mac|hash|cipher]' */
+for (sl = info->service; sl; sl = sl->next) {
+const char *service = QCryptodevBackendServiceType_str(sl->value);
+
+if (!services) {
+services = g_strdup(service);
+} else {
+tmp_services = g_strjoin("|", services, service, NULL);
+g_free(services);
+services = tmp_services;
+}
+}
+monitor_printf(mon, "%s: service=[%s]\n", info->id, services);
+
+for (cl = info->client; cl; cl = cl->next) {
+QCryptodevBackendClient *client = cl->value;
+monitor_printf(mon, "queue %" PRIu32 ": type=%s\n",
+   client->queue,
+   QCryptodevBackendType_str(client->type));
+}
+}
+
+qapi_free_QCryptodevInfoList(il);
+}
-- 
2.34.1




[PATCH v4 05/12] cryptodev: Introduce 'query-cryptodev' QMP command

2023-01-28 Thread zhenwei pi
Now we have a QMP command to query crypto devices:
virsh qemu-monitor-command vm '{"execute": "query-cryptodev"}' | jq
{
  "return": [
{
  "service": [
"akcipher",
"mac",
"hash",
"cipher"
  ],
  "id": "cryptodev1",
  "client": [
{
  "queue": 0,
  "type": "builtin"
}
  ]
},
{
  "service": [
"akcipher"
  ],
  "id": "cryptodev0",
  "client": [
{
  "queue": 0,
  "type": "lkcf"
}
  ]
}
  ],
  "id": "libvirt-417"
}

Signed-off-by: zhenwei pi 
---
 backends/cryptodev.c | 45 
 qapi/cryptodev.json  | 44 +++
 2 files changed, 89 insertions(+)

diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index c2a053db0e..3a45d19823 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -24,6 +24,7 @@
 #include "qemu/osdep.h"
 #include "sysemu/cryptodev.h"
 #include "qapi/error.h"
+#include "qapi/qapi-commands-cryptodev.h"
 #include "qapi/visitor.h"
 #include "qemu/config-file.h"
 #include "qemu/error-report.h"
@@ -33,6 +34,50 @@
 
 static QTAILQ_HEAD(, CryptoDevBackendClient) crypto_clients;
 
+static int qmp_query_cryptodev_foreach(Object *obj, void *data)
+{
+CryptoDevBackend *backend;
+QCryptodevInfoList **infolist = data;
+uint32_t services, i;
+
+if (!object_dynamic_cast(obj, TYPE_CRYPTODEV_BACKEND)) {
+return 0;
+}
+
+QCryptodevInfo *info = g_new0(QCryptodevInfo, 1);
+info->id = g_strdup(object_get_canonical_path_component(obj));
+
+backend = CRYPTODEV_BACKEND(obj);
+services = backend->conf.crypto_services;
+for (i = 0; i < QCRYPTODEV_BACKEND_SERVICE__MAX; i++) {
+if (services & (1 << i)) {
+QAPI_LIST_PREPEND(info->service, i);
+}
+}
+
+for (i = 0; i < backend->conf.peers.queues; i++) {
+CryptoDevBackendClient *cc = backend->conf.peers.ccs[i];
+QCryptodevBackendClient *client = g_new0(QCryptodevBackendClient, 1);
+
+client->queue = cc->queue_index;
+client->type = cc->type;
+QAPI_LIST_PREPEND(info->client, client);
+}
+
+QAPI_LIST_PREPEND(*infolist, info);
+
+return 0;
+}
+
+QCryptodevInfoList *qmp_query_cryptodev(Error **errp)
+{
+QCryptodevInfoList *list = NULL;
+Object *objs = container_get(object_get_root(), "/objects");
+
+object_child_foreach(objs, qmp_query_cryptodev_foreach, &list);
+
+return list;
+}
 
 CryptoDevBackendClient *cryptodev_backend_new_client(void)
 {
diff --git a/qapi/cryptodev.json b/qapi/cryptodev.json
index 8732a30524..f33f96a692 100644
--- a/qapi/cryptodev.json
+++ b/qapi/cryptodev.json
@@ -43,3 +43,47 @@
 { 'enum': 'QCryptodevBackendType',
   'prefix': 'QCRYPTODEV_BACKEND_TYPE',
   'data': ['builtin', 'vhost-user', 'lkcf']}
+
+##
+# @QCryptodevBackendClient:
+#
+# Information about a queue of crypto device.
+#
+# @queue: the queue index of the crypto device
+#
+# @type: the type of the crypto device
+#
+# Since: 8.0
+##
+{ 'struct': 'QCryptodevBackendClient',
+  'data': { 'queue': 'uint32',
+'type': 'QCryptodevBackendType' } }
+
+##
+# @QCryptodevInfo:
+#
+# Information about a crypto device.
+#
+# @id: the id of the crypto device
+#
+# @service: supported service types of a crypto device
+#
+# @client: the additional information of the crypto device
+#
+# Since: 8.0
+##
+{ 'struct': 'QCryptodevInfo',
+  'data': { 'id': 'str',
+'service': ['QCryptodevBackendServiceType'],
+'client': ['QCryptodevBackendClient'] } }
+
+##
+# @query-cryptodev:
+#
+# Returns information about current crypto devices.
+#
+# Returns: a list of @QCryptodevInfo
+#
+# Since: 8.0
+##
+{ 'command': 'query-cryptodev', 'returns': ['QCryptodevInfo']}
-- 
2.34.1




[PATCH v4 01/12] cryptodev: Introduce cryptodev.json

2023-01-28 Thread zhenwei pi
Introduce QCryptodevBackendType in cryptodev.json, also apply this to
related codes. Then we can drop 'enum CryptoDevBackendOptionsType'.

Note that `CRYPTODEV_BACKEND_TYPE_NONE` is *NOT* used anywhere, so
drop it (no 'none' enum in QCryptodevBackendType).

Reviewed-by: Daniel P. Berrangé 
Signed-off-by: zhenwei pi 
---
 MAINTAINERS |  1 +
 backends/cryptodev-builtin.c|  2 +-
 backends/cryptodev-lkcf.c   |  2 +-
 backends/cryptodev-vhost-user.c |  4 ++--
 backends/cryptodev-vhost.c  |  4 ++--
 include/sysemu/cryptodev.h  | 11 ++-
 qapi/cryptodev.json | 20 
 qapi/meson.build|  1 +
 qapi/qapi-schema.json   |  1 +
 9 files changed, 31 insertions(+), 15 deletions(-)
 create mode 100644 qapi/cryptodev.json

diff --git a/MAINTAINERS b/MAINTAINERS
index c581c11a64..9f6c54b145 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2860,6 +2860,7 @@ M: Gonglei 
 S: Maintained
 F: include/sysemu/cryptodev*.h
 F: backends/cryptodev*.c
+F: qapi/cryptodev.json
 
 Python library
 M: John Snow 
diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c
index cda6ca3b71..8c7c10847d 100644
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -76,7 +76,7 @@ static void cryptodev_builtin_init(
   "cryptodev-builtin", NULL);
 cc->info_str = g_strdup_printf("cryptodev-builtin0");
 cc->queue_index = 0;
-cc->type = CRYPTODEV_BACKEND_TYPE_BUILTIN;
+cc->type = QCRYPTODEV_BACKEND_TYPE_BUILTIN;
 backend->conf.peers.ccs[0] = cc;
 
 backend->conf.crypto_services =
diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c
index 133bd706a4..91e02c0df9 100644
--- a/backends/cryptodev-lkcf.c
+++ b/backends/cryptodev-lkcf.c
@@ -226,7 +226,7 @@ static void cryptodev_lkcf_init(CryptoDevBackend *backend, 
Error **errp)
 cc = cryptodev_backend_new_client("cryptodev-lkcf", NULL);
 cc->info_str = g_strdup_printf("cryptodev-lkcf0");
 cc->queue_index = 0;
-cc->type = CRYPTODEV_BACKEND_TYPE_LKCF;
+cc->type = QCRYPTODEV_BACKEND_TYPE_LKCF;
 backend->conf.peers.ccs[0] = cc;
 
 backend->conf.crypto_services =
diff --git a/backends/cryptodev-vhost-user.c b/backends/cryptodev-vhost-user.c
index ab3028e045..c165a1b1d6 100644
--- a/backends/cryptodev-vhost-user.c
+++ b/backends/cryptodev-vhost-user.c
@@ -67,7 +67,7 @@ cryptodev_vhost_user_get_vhost(
 {
 CryptoDevBackendVhostUser *s =
   CRYPTODEV_BACKEND_VHOST_USER(b);
-assert(cc->type == CRYPTODEV_BACKEND_TYPE_VHOST_USER);
+assert(cc->type == QCRYPTODEV_BACKEND_TYPE_VHOST_USER);
 assert(queue < MAX_CRYPTO_QUEUE_NUM);
 
 return s->vhost_crypto[queue];
@@ -203,7 +203,7 @@ static void cryptodev_vhost_user_init(
 cc->info_str = g_strdup_printf("cryptodev-vhost-user%zu to %s ",
i, chr->label);
 cc->queue_index = i;
-cc->type = CRYPTODEV_BACKEND_TYPE_VHOST_USER;
+cc->type = QCRYPTODEV_BACKEND_TYPE_VHOST_USER;
 
 backend->conf.peers.ccs[i] = cc;
 
diff --git a/backends/cryptodev-vhost.c b/backends/cryptodev-vhost.c
index 572f87b3be..a2b5a2cb3b 100644
--- a/backends/cryptodev-vhost.c
+++ b/backends/cryptodev-vhost.c
@@ -128,7 +128,7 @@ cryptodev_get_vhost(CryptoDevBackendClient *cc,
 
 switch (cc->type) {
 #if defined(CONFIG_VHOST_USER) && defined(CONFIG_LINUX)
-case CRYPTODEV_BACKEND_TYPE_VHOST_USER:
+case QCRYPTODEV_BACKEND_TYPE_VHOST_USER:
 vhost_crypto = cryptodev_vhost_user_get_vhost(cc, b, queue);
 break;
 #endif
@@ -196,7 +196,7 @@ int cryptodev_vhost_start(VirtIODevice *dev, int 
total_queues)
  * because vhost user doesn't interrupt masking/unmasking
  * properly.
  */
-if (cc->type == CRYPTODEV_BACKEND_TYPE_VHOST_USER) {
+if (cc->type == QCRYPTODEV_BACKEND_TYPE_VHOST_USER) {
 dev->use_guest_notifier_mask = false;
 }
  }
diff --git a/include/sysemu/cryptodev.h b/include/sysemu/cryptodev.h
index cf9b3f07fe..8d2adda974 100644
--- a/include/sysemu/cryptodev.h
+++ b/include/sysemu/cryptodev.h
@@ -25,6 +25,7 @@
 
 #include "qemu/queue.h"
 #include "qom/object.h"
+#include "qapi/qapi-types-cryptodev.h"
 
 /**
  * CryptoDevBackend:
@@ -215,16 +216,8 @@ struct CryptoDevBackendClass {
  void *opaque);
 };
 
-typedef enum CryptoDevBackendOptionsType {
-CRYPTODEV_BACKEND_TYPE_NONE = 0,
-CRYPTODEV_BACKEND_TYPE_BUILTIN = 1,
-CRYPTODEV_BACKEND_TYPE_VHOST_USER = 2,
-CRYPTODEV_BACKEND_TYPE_LKCF = 3,
-CRYPTODEV_BACKEND_TYPE__MAX,
-} CryptoDevBackendOptionsType;
-
 struct CryptoDevBackendClient {
-CryptoDevBackendOptionsType type;
+QCryptodevBackendType type;
 char *model;
 char *name;
 char *info_str;
diff --git a/qapi/cryptodev.json b/qapi/cryptodev.json
new file mode 100644
index 00..b65edbe183
--- /dev/null
+++ b/qapi/cryptodev.json
@@ -0,0 +1,20 @@

[PATCH v4 08/12] cryptodev: Use CryptoDevBackendOpInfo for operation

2023-01-28 Thread zhenwei pi
Move queue_index, CryptoDevCompletionFunc and opaque into struct
CryptoDevBackendOpInfo, then cryptodev_backend_crypto_operation()
needs an argument CryptoDevBackendOpInfo *op_info only. And remove
VirtIOCryptoReq from cryptodev. It's also possible to hide
VirtIOCryptoReq into virtio-crypto.c in the next step. (In theory,
VirtIOCryptoReq is a private structure used by virtio-crypto only)

Signed-off-by: zhenwei pi 
---
 backends/cryptodev-builtin.c |  9 +++--
 backends/cryptodev-lkcf.c|  9 +++--
 backends/cryptodev.c | 18 +-
 hw/virtio/virtio-crypto.c|  7 ---
 include/sysemu/cryptodev.h   | 26 ++
 5 files changed, 25 insertions(+), 44 deletions(-)

diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c
index c45b5906c5..39d0455280 100644
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -539,10 +539,7 @@ static int cryptodev_builtin_asym_operation(
 
 static int cryptodev_builtin_operation(
  CryptoDevBackend *backend,
- CryptoDevBackendOpInfo *op_info,
- uint32_t queue_index,
- CryptoDevCompletionFunc cb,
- void *opaque)
+ CryptoDevBackendOpInfo *op_info)
 {
 CryptoDevBackendBuiltin *builtin =
   CRYPTODEV_BACKEND_BUILTIN(backend);
@@ -574,8 +571,8 @@ static int cryptodev_builtin_operation(
 if (local_error) {
 error_report_err(local_error);
 }
-if (cb) {
-cb(opaque, status);
+if (op_info->cb) {
+op_info->cb(op_info->opaque, status);
 }
 return 0;
 }
diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c
index edec99f104..45aba1ff67 100644
--- a/backends/cryptodev-lkcf.c
+++ b/backends/cryptodev-lkcf.c
@@ -469,10 +469,7 @@ static void *cryptodev_lkcf_worker(void *arg)
 
 static int cryptodev_lkcf_operation(
 CryptoDevBackend *backend,
-CryptoDevBackendOpInfo *op_info,
-uint32_t queue_index,
-CryptoDevCompletionFunc cb,
-void *opaque)
+CryptoDevBackendOpInfo *op_info)
 {
 CryptoDevBackendLKCF *lkcf =
 CRYPTODEV_BACKEND_LKCF(backend);
@@ -495,8 +492,8 @@ static int cryptodev_lkcf_operation(
 
 task = g_new0(CryptoDevLKCFTask, 1);
 task->op_info = op_info;
-task->cb = cb;
-task->opaque = opaque;
+task->cb = op_info->cb;
+task->opaque = op_info->opaque;
 task->sess = sess;
 task->lkcf = lkcf;
 task->status = -VIRTIO_CRYPTO_ERR;
diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index 3a45d19823..ba7b0bc770 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -143,29 +143,22 @@ int cryptodev_backend_close_session(
 
 static int cryptodev_backend_operation(
  CryptoDevBackend *backend,
- CryptoDevBackendOpInfo *op_info,
- uint32_t queue_index,
- CryptoDevCompletionFunc cb,
- void *opaque)
+ CryptoDevBackendOpInfo *op_info)
 {
 CryptoDevBackendClass *bc =
   CRYPTODEV_BACKEND_GET_CLASS(backend);
 
 if (bc->do_op) {
-return bc->do_op(backend, op_info, queue_index, cb, opaque);
+return bc->do_op(backend, op_info);
 }
 return -VIRTIO_CRYPTO_NOTSUPP;
 }
 
 int cryptodev_backend_crypto_operation(
  CryptoDevBackend *backend,
- void *opaque1,
- uint32_t queue_index,
- CryptoDevCompletionFunc cb, void *opaque2)
+ CryptoDevBackendOpInfo *op_info)
 {
-VirtIOCryptoReq *req = opaque1;
-CryptoDevBackendOpInfo *op_info = &req->op_info;
-QCryptodevBackendAlgType algtype = req->flags;
+QCryptodevBackendAlgType algtype = op_info->algtype;
 
 if ((algtype != QCRYPTODEV_BACKEND_ALG_SYM)
 && (algtype != QCRYPTODEV_BACKEND_ALG_ASYM)) {
@@ -173,8 +166,7 @@ int cryptodev_backend_crypto_operation(
 return -VIRTIO_CRYPTO_NOTSUPP;
 }
 
-return cryptodev_backend_operation(backend, op_info, queue_index,
-   cb, opaque2);
+return cryptodev_backend_operation(backend, op_info);
 }
 
 static void
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index e4f0de4d1c..802e1b9659 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -871,6 +871,9 @@ virtio_crypto_handle_request(VirtIOCryptoReq *request)
 opcode = ldl_le_p(&req.header.opcode);
 op_info->session_id = ldq_le_p(&req.header.session_id);
 op_info->op_code = opcode;
+op_info->queue_index = queue_index;
+op_info->cb = virtio_crypto_req_complete;
+op_info->opaque = request;
 
 switch (opcode) {
 case VIRTIO_CRYPTO_CIPHER_ENCRYPT:
@@ -898,9 +901,7 @@ check_result:
 virtio_crypto_req_complete(request, -VIRTIO_CRYPTO_NOTSUPP);
 } else {
 ret = cryptodev_backend_crypto_operation(vcrypto->cryptodev,
-   

[PATCH v4 03/12] cryptodev: Introduce cryptodev alg type in QAPI

2023-01-28 Thread zhenwei pi
Introduce cryptodev alg type in cryptodev.json, then apply this to
related codes, and drop 'enum CryptoDevBackendAlgType'.

There are two options:
1, { 'enum': 'QCryptodevBackendAlgType',
  'prefix': 'CRYPTODEV_BACKEND_ALG',
  'data': ['sym', 'asym']}
Then we can keep 'CRYPTODEV_BACKEND_ALG_SYM' and avoid lots of
changes.
2, changes in this patch(with prefix 'QCRYPTODEV_BACKEND_ALG').

To avoid breaking the rule of QAPI, use 2 here.

Reviewed-by: Daniel P. Berrangé 
Signed-off-by: zhenwei pi 
---
 backends/cryptodev-builtin.c |  6 +++---
 backends/cryptodev-lkcf.c|  4 ++--
 backends/cryptodev.c |  6 +++---
 hw/virtio/virtio-crypto.c| 14 +++---
 include/sysemu/cryptodev.h   |  8 +---
 qapi/cryptodev.json  | 14 ++
 6 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/backends/cryptodev-builtin.c b/backends/cryptodev-builtin.c
index 08895271eb..e70dcd5dad 100644
--- a/backends/cryptodev-builtin.c
+++ b/backends/cryptodev-builtin.c
@@ -537,7 +537,7 @@ static int cryptodev_builtin_operation(
 CryptoDevBackendBuiltinSession *sess;
 CryptoDevBackendSymOpInfo *sym_op_info;
 CryptoDevBackendAsymOpInfo *asym_op_info;
-enum CryptoDevBackendAlgType algtype = op_info->algtype;
+QCryptodevBackendAlgType algtype = op_info->algtype;
 int status = -VIRTIO_CRYPTO_ERR;
 Error *local_error = NULL;
 
@@ -549,11 +549,11 @@ static int cryptodev_builtin_operation(
 }
 
 sess = builtin->sessions[op_info->session_id];
-if (algtype == CRYPTODEV_BACKEND_ALG_SYM) {
+if (algtype == QCRYPTODEV_BACKEND_ALG_SYM) {
 sym_op_info = op_info->u.sym_op_info;
 status = cryptodev_builtin_sym_operation(sess, sym_op_info,
  &local_error);
-} else if (algtype == CRYPTODEV_BACKEND_ALG_ASYM) {
+} else if (algtype == QCRYPTODEV_BACKEND_ALG_ASYM) {
 asym_op_info = op_info->u.asym_op_info;
 status = cryptodev_builtin_asym_operation(sess, op_info->op_code,
   asym_op_info, &local_error);
diff --git a/backends/cryptodev-lkcf.c b/backends/cryptodev-lkcf.c
index de3d1867c5..53a932b58d 100644
--- a/backends/cryptodev-lkcf.c
+++ b/backends/cryptodev-lkcf.c
@@ -477,7 +477,7 @@ static int cryptodev_lkcf_operation(
 CryptoDevBackendLKCF *lkcf =
 CRYPTODEV_BACKEND_LKCF(backend);
 CryptoDevBackendLKCFSession *sess;
-enum CryptoDevBackendAlgType algtype = op_info->algtype;
+QCryptodevBackendAlgType algtype = op_info->algtype;
 CryptoDevLKCFTask *task;
 
 if (op_info->session_id >= MAX_SESSIONS ||
@@ -488,7 +488,7 @@ static int cryptodev_lkcf_operation(
 }
 
 sess = lkcf->sess[op_info->session_id];
-if (algtype != CRYPTODEV_BACKEND_ALG_ASYM) {
+if (algtype != QCRYPTODEV_BACKEND_ALG_ASYM) {
 error_report("algtype not supported: %u", algtype);
 return -VIRTIO_CRYPTO_NOTSUPP;
 }
diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index 81941af816..c2a053db0e 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -120,10 +120,10 @@ int cryptodev_backend_crypto_operation(
 {
 VirtIOCryptoReq *req = opaque1;
 CryptoDevBackendOpInfo *op_info = &req->op_info;
-enum CryptoDevBackendAlgType algtype = req->flags;
+QCryptodevBackendAlgType algtype = req->flags;
 
-if ((algtype != CRYPTODEV_BACKEND_ALG_SYM)
-&& (algtype != CRYPTODEV_BACKEND_ALG_ASYM)) {
+if ((algtype != QCRYPTODEV_BACKEND_ALG_SYM)
+&& (algtype != QCRYPTODEV_BACKEND_ALG_ASYM)) {
 error_report("Unsupported cryptodev alg type: %" PRIu32 "", algtype);
 return -VIRTIO_CRYPTO_NOTSUPP;
 }
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 516425e26a..0d1be0ada9 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -462,7 +462,7 @@ static void virtio_crypto_init_request(VirtIOCrypto 
*vcrypto, VirtQueue *vq,
 req->in_iov = NULL;
 req->in_num = 0;
 req->in_len = 0;
-req->flags = CRYPTODEV_BACKEND_ALG__MAX;
+req->flags = QCRYPTODEV_BACKEND_ALG__MAX;
 memset(&req->op_info, 0x00, sizeof(req->op_info));
 }
 
@@ -472,7 +472,7 @@ static void virtio_crypto_free_request(VirtIOCryptoReq *req)
 return;
 }
 
-if (req->flags == CRYPTODEV_BACKEND_ALG_SYM) {
+if (req->flags == QCRYPTODEV_BACKEND_ALG_SYM) {
 size_t max_len;
 CryptoDevBackendSymOpInfo *op_info = req->op_info.u.sym_op_info;
 
@@ -485,7 +485,7 @@ static void virtio_crypto_free_request(VirtIOCryptoReq *req)
 /* Zeroize and free request data structure */
 memset(op_info, 0, sizeof(*op_info) + max_len);
 g_free(op_info);
-} else if (req->flags == CRYPTODEV_BACKEND_ALG_ASYM) {
+} else if (req->flags == QCRYPTODEV_BACKEND_ALG_ASYM) {
 CryptoDevBackendAsymOpInfo *op_info = req->op_info.u.asym_op_info;
 if (op_info) {
 g_free(op_info->src);
@@ -5

[PATCH v4 00/12] Refactor cryptodev

2023-01-28 Thread zhenwei pi
v4 -> v5:
- suggested by MST, use 'PRIu32' instead of '%u' to print a uint32_t value
- correct *QCryptodevBackendClient* and *QCryptodevInfo* in qapi/cryptodev.json

v3 -> v4:
- a small change in 
'0005-cryptodev-Introduce-query-cryptodev-QMP-command.patch':
  use 'uint32' instead of 'int' to describe CryptodevBackendClient:queue
- fix compling warning(gcc)/error(clang-11) on 32 bit platform in
  '0007-hmp-add-cryptodev-info-command.patch':
  use 'printf("%u", client->queue)' instead of 'printf("%ld", client->queue)'

v2 -> v3:
- rebase code against the lastest commist: fb7e7990342e59cf67d
- document the missing fields in qapi/cryptodev.json
- rework statistics part: use 'query-stats' command instead of
  'query-cryptodev'(cryptodev: Support query-stats QMP command)

v1 -> v2:
- fix coding style and use 'g_strjoin()' instead of 'char services[128]'
   (suggested by Dr. David Alan Gilbert)
- wrapper function 'cryptodev_backend_account' to record statistics, and
   allocate sym_stat/asym_stat in cryptodev base class. see patch:
   'cryptodev: Support statistics'.
- add more arguments into struct CryptoDevBackendOpInfo, then
   cryptodev_backend_crypto_operation() uses *op_info only.
- support cryptodev QoS settings(BPS&OPS), both QEMU command line and QMP
   command works fine.
- add myself as the maintainer for cryptodev.

v1:
- introduce cryptodev.json to describe the attributes of crypto device, then
   drop duplicated type declare, remove some virtio related dependence.
- add statistics: OPS and bandwidth.
- add QMP command: query-cryptodev
- add HMP info command: cryptodev
- misc fix: detect akcipher capability instead of exposing akcipher service
   unconditionally.

Zhenwei Pi (12):
  cryptodev: Introduce cryptodev.json
  cryptodev: Remove 'name' & 'model' fields
  cryptodev: Introduce cryptodev alg type in QAPI
  cryptodev: Introduce server type in QAPI
  cryptodev: Introduce 'query-cryptodev' QMP command
  cryptodev-builtin: Detect akcipher capability
  hmp: add cryptodev info command
  cryptodev: Use CryptoDevBackendOpInfo for operation
  cryptodev: Account statistics
  cryptodev: support QoS
  cryptodev: Support query-stats QMP command
  MAINTAINERS: add myself as the maintainer for cryptodev

 MAINTAINERS |   2 +
 backends/cryptodev-builtin.c|  42 ++--
 backends/cryptodev-lkcf.c   |  19 +-
 backends/cryptodev-vhost-user.c |  13 +-
 backends/cryptodev-vhost.c  |   4 +-
 backends/cryptodev.c| 419 ++--
 hmp-commands-info.hx|  14 ++
 hw/virtio/virtio-crypto.c   |  48 +++-
 include/monitor/hmp.h   |   1 +
 include/sysemu/cryptodev.h  |  95 
 monitor/hmp-cmds.c  |  42 
 monitor/qmp-cmds.c  |   2 +
 qapi/cryptodev.json | 143 +++
 qapi/meson.build|   1 +
 qapi/qapi-schema.json   |   1 +
 qapi/qom.json   |   8 +-
 qapi/stats.json |  10 +-
 17 files changed, 744 insertions(+), 120 deletions(-)
 create mode 100644 qapi/cryptodev.json

-- 
2.34.1




[PATCH v1 0/2] virtio: fix for assertion failure: virtio_net_get_subqueue(nc)->async_tx.elem failed

2023-01-28 Thread Xuan Zhuo
In the current design, we stop the device from operating on the vring
during per-queue reset by resetting the structure VirtQueue.

But before the reset operation, when recycling some resources, we should
stop referencing new vring resources.

This is what causes the bug.

https://gitlab.com/qemu-project/qemu/-/issues/1451

Before we reset the structure, we called the ->queue_reset callback to let the
device reclaim resources. Here virtio-net tries to release the packets sent
asynchronously, but during this process virtio_net_flush_tx() will be called,
and new data will be sent again. This leads to asserted.

 assert(!virtio_net_get_subqueue(nc)->async_tx.elem);

This patch set introduce new item "reset" into struct VirtQueue, then device can
know this virtqueue is per-queue reset state.

v1:
1. rename "reset" to disabled_by_reset
2. add api: virtio_queue_reset_state()

Xuan Zhuo (2):
  virtio: struct VirtQueue introduce reset
  virtio-net: virtio_net_flush_tx() check for per-queue reset

 hw/net/virtio-net.c|  3 ++-
 hw/virtio/virtio.c | 15 +++
 include/hw/virtio/virtio.h |  1 +
 3 files changed, 18 insertions(+), 1 deletion(-)

--
2.32.0.3.g01195cf9f




[PATCH v1 1/2] virtio: struct VirtQueue introduce reset

2023-01-28 Thread Xuan Zhuo
In the current design, we stop the device from operating on the vring
during per-queue reset by resetting the structure VirtQueue.

But before the reset operation, when recycling some resources, we should
stop referencing new vring resources. For example, when recycling
virtio-net's asynchronous sending resources, virtio-net should be able
to perceive that the current queue is in the per-queue reset state, and
stop sending new packets from the tx queue.

Signed-off-by: Xuan Zhuo 
---
 hw/virtio/virtio.c | 15 +++
 include/hw/virtio/virtio.h |  1 +
 2 files changed, 16 insertions(+)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index f35178f5fc..c954f2a2b3 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -142,6 +142,8 @@ struct VirtQueue
 /* Notification enabled? */
 bool notification;
 
+bool disabled_by_reset;
+
 uint16_t queue_index;
 
 unsigned int inuse;
@@ -2079,6 +2081,12 @@ void virtio_queue_reset(VirtIODevice *vdev, uint32_t 
queue_index)
 {
 VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
 
+/*
+ * Mark this queue is per-queue reset status. The device should release the
+ * references of the vring, and not refer more new vring item.
+ */
+vdev->vq[queue_index].disabled_by_reset = true;
+
 if (k->queue_reset) {
 k->queue_reset(vdev, queue_index);
 }
@@ -2102,11 +2110,18 @@ void virtio_queue_enable(VirtIODevice *vdev, uint32_t 
queue_index)
 }
 */
 
+vdev->vq[queue_index].disabled_by_reset = false;
+
 if (k->queue_enable) {
 k->queue_enable(vdev, queue_index);
 }
 }
 
+bool virtio_queue_reset_state(VirtQueue *vq)
+{
+return vq->disabled_by_reset;
+}
+
 void virtio_reset(void *opaque)
 {
 VirtIODevice *vdev = opaque;
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 77c6c55929..00e91af7c4 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -319,6 +319,7 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val);
 void virtio_reset(void *opaque);
 void virtio_queue_reset(VirtIODevice *vdev, uint32_t queue_index);
 void virtio_queue_enable(VirtIODevice *vdev, uint32_t queue_index);
+bool virtio_queue_reset_state(VirtQueue *vq);
 void virtio_update_irq(VirtIODevice *vdev);
 int virtio_set_features(VirtIODevice *vdev, uint64_t val);
 
-- 
2.32.0.3.g01195cf9f




[PATCH v1 2/2] virtio-net: virtio_net_flush_tx() check for per-queue reset

2023-01-28 Thread Xuan Zhuo
Check whether it is per-queue reset state in virtio_net_flush_tx().

Before per-queue reset, we need to recover async tx resources. At this
time, virtio_net_flush_tx() is called, but we should not try to send
new packets, so virtio_net_flush_tx() should check the current
per-queue reset state.

Fixes: 7dc6be52 ("virtio-net: support queue reset")
Fixes: https://gitlab.com/qemu-project/qemu/-/issues/1451
Reported-by: Alexander Bulekov 
Signed-off-by: Xuan Zhuo 
---
 hw/net/virtio-net.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 3ae909041a..fba6451a50 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -2627,7 +2627,8 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 VirtQueueElement *elem;
 int32_t num_packets = 0;
 int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
-if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) ||
+virtio_queue_reset_state(q->tx_vq)) {
 return num_packets;
 }
 
-- 
2.32.0.3.g01195cf9f




Re: [PATCH 0/3] Misc sm501 clean ups

2023-01-28 Thread BALATON Zoltan

On Mon, 23 Jan 2023, Philippe Mathieu-Daudé wrote:

On 21/1/23 21:35, BALATON Zoltan wrote:

Some small trivial clean ups I've found while looking at this file.

BALATON Zoltan (3):
   hw/display/sm501: Remove parenthesis around consant macro definitions
   hw/display/sm501: Remove unneeded casts from void pointer
   hw/display/sm501: Code style fix

  hw/display/sm501.c | 419 +++--
  1 file changed, 210 insertions(+), 209 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 


Ping? Who will merge this series? Should Daniel take it via PPC or Gerd 
for display? I only care that it gets in one way or another and not lost 
between maintainers.


Regards,
BALATON Zoltan

Re: [PATCH v4 1/3] hw/riscv/boot.c: calculate fdt size after fdt_pack()

2023-01-28 Thread Bin Meng
Hi Daniel,

On Thu, Jan 26, 2023 at 9:53 PM Daniel Henrique Barboza
 wrote:
>
> fdt_pack() can change the fdt size, meaning that fdt_totalsize() can
> contain a now deprecated (bigger) value.

The commit message is a bit confusing.

The original code in this patch does not call fdt_pack(). So not sure
where the issue of "deprecated (bigger) value" happens?

>
> Reviewed-by: Alistair Francis 
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  hw/riscv/boot.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c
> index 3172a76220..a563b7482a 100644
> --- a/hw/riscv/boot.c
> +++ b/hw/riscv/boot.c
> @@ -287,8 +287,13 @@ uint64_t riscv_load_fdt(hwaddr dram_base, uint64_t 
> mem_size, void *fdt)
>  {
>  uint64_t temp, fdt_addr;
>  hwaddr dram_end = dram_base + mem_size;
> -int ret, fdtsize = fdt_totalsize(fdt);
> +int ret = fdt_pack(fdt);
> +int fdtsize;
>
> +/* Should only fail if we've built a corrupted tree */
> +g_assert(ret == 0);
> +
> +fdtsize = fdt_totalsize(fdt);
>  if (fdtsize <= 0) {
>  error_report("invalid device-tree");
>  exit(1);

Regards,
Bin



[PATCH] linux-user: move target_flat.h to target subdirs

2023-01-28 Thread Mike Frysinger
This makes target_flat.h behave like every other target_xxx.h header.
It also makes it actually work -- while the current header says adding
a header to the target subdir overrides the common one, it doesn't.
This is for two reasons:
* meson.build adds -Ilinux-user before -Ilinux-user/$arch
* the compiler search path for "target_flat.h" looks in the same dir
  as the source file before searching -I paths.

This can be seen with the xtensa port -- the subdir settings aren't
used which breaks stack setup.

Move it to the generic/ subdir and add include stubs like every
other target_xxx.h header is handled.

Signed-off-by: Mike Frysinger 
---
 linux-user/aarch64/target_flat.h   | 1 +
 linux-user/arm/target_flat.h   | 1 +
 linux-user/{ => generic}/target_flat.h | 0
 linux-user/m68k/target_flat.h  | 1 +
 linux-user/microblaze/target_flat.h| 1 +
 linux-user/sh4/target_flat.h   | 1 +
 6 files changed, 5 insertions(+)
 create mode 100644 linux-user/aarch64/target_flat.h
 create mode 100644 linux-user/arm/target_flat.h
 rename linux-user/{ => generic}/target_flat.h (100%)
 create mode 100644 linux-user/m68k/target_flat.h
 create mode 100644 linux-user/microblaze/target_flat.h
 create mode 100644 linux-user/sh4/target_flat.h

diff --git a/linux-user/aarch64/target_flat.h b/linux-user/aarch64/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/aarch64/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
diff --git a/linux-user/arm/target_flat.h b/linux-user/arm/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/arm/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
diff --git a/linux-user/target_flat.h b/linux-user/generic/target_flat.h
similarity index 100%
rename from linux-user/target_flat.h
rename to linux-user/generic/target_flat.h
diff --git a/linux-user/m68k/target_flat.h b/linux-user/m68k/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/m68k/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
diff --git a/linux-user/microblaze/target_flat.h 
b/linux-user/microblaze/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/microblaze/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
diff --git a/linux-user/sh4/target_flat.h b/linux-user/sh4/target_flat.h
new file mode 100644
index ..bc83224cea12
--- /dev/null
+++ b/linux-user/sh4/target_flat.h
@@ -0,0 +1 @@
+#include "../generic/target_flat.h"
-- 
2.39.0




Re: [PATCH 23/23] target/arm: Enable FEAT_FGT on '-cpu max'

2023-01-28 Thread Richard Henderson

On 1/27/23 07:55, Peter Maydell wrote:

Update the ID registers for TCG's '-cpu max' to report the
presence of FEAT_FGT Fine-Grained Traps support.

Signed-off-by: Peter Maydell
---
  docs/system/arm/emulation.rst | 1 +
  target/arm/cpu64.c| 1 +
  2 files changed, 2 insertions(+)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 22/23] target/arm: Implement MDCR_EL2.TDCC and MDCR_EL3.TDCC traps

2023-01-28 Thread Richard Henderson

On 1/27/23 07:55, Peter Maydell wrote:

FEAT_FGT also implements an extra trap bit in the MDCR_EL2 and
MDCR_EL3 registers: bit TDCC enables trapping of use of the Debug
Comms Channel registers OSDTRRX_EL1, OSDTRTX_EL1, MDCCSR_EL0,
MDCCINT_EL0, DBGDTR_EL0, DBGDTRRX_EL0 and DBGDTRTX_EL0 (and their
AArch32 equivalents).  This trapping is independent of whether
fine-grained traps are enabled or not.

Implement these extra traps.  (We don't implement DBGDTR_EL0,
DBGDTRRX_EL0 and DBGDTRTX_EL0.)

Signed-off-by: Peter Maydell
---
  target/arm/debug_helper.c | 35 +++
  1 file changed, 31 insertions(+), 4 deletions(-)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 21/23] target/arm: Implement the HFGITR_EL2.SVC_EL0 and SVC_EL1 traps

2023-01-28 Thread Richard Henderson

On 1/27/23 07:55, Peter Maydell wrote:

Implement the HFGITR_EL2.SVC_EL0 and SVC_EL1 fine-grained traps.
These trap execution of the SVC instruction from AArch32 and AArch64.
(As usual, AArch32 can only trap from EL0, as fine grained traps are
disabled with an AArch32 EL1.)

Signed-off-by: Peter Maydell
---
  target/arm/cpu.h   |  1 +
  target/arm/translate.h |  2 ++
  target/arm/helper.c| 20 
  target/arm/translate-a64.c |  9 -
  target/arm/translate.c | 12 +---
  5 files changed, 40 insertions(+), 4 deletion


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 20/23] target/arm: Implement the HFGITR_EL2.ERET trap

2023-01-28 Thread Richard Henderson

On 1/27/23 07:55, Peter Maydell wrote:

Implement the HFGITR_EL2.ERET fine-grained trap.  This traps
execution from AArch64 EL1 of ERET, ERETAA and ERETAB.  The trap is
reported with a syndrome value of 0x1a.

The trap must take precedence over a possible pointer-authentication
trap for ERETAA and ERETAB.

Signed-off-by: Peter Maydell
---
  target/arm/cpu.h   |  1 +
  target/arm/syndrome.h  | 10 ++
  target/arm/translate.h |  2 ++
  target/arm/helper.c|  3 +++
  target/arm/translate-a64.c | 10 ++
  5 files changed, 26 insertions(+)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 19/23] target/arm: Mark up sysregs for HFGITR bits 48..63

2023-01-28 Thread Richard Henderson

On 1/27/23 07:55, Peter Maydell wrote:

Mark up the sysreg definitions for the system instructions
trapped by HFGITR bits 48..63.

Some of these bits are for trapping instructions which are
not in the system instruction encoding (i.e. which are
not handled by the ARMCPRegInfo mechanism):
  * ERET, ERETAA, ERETAB
  * SVC

We will have to handle those separately and manually.

Signed-off-by: Peter Maydell
---
  target/arm/cpregs.h | 4 
  target/arm/helper.c | 9 +
  2 files changed, 13 insertions(+)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 18/23] target/arm: Mark up sysregs for HFGITR bits 18..47

2023-01-28 Thread Richard Henderson

On 1/27/23 07:55, Peter Maydell wrote:

Mark up the sysreg definitions for the system instructions
trapped by HFGITR bits 18..47. These bits cover TLBI
TLB maintenance instructions.

(If we implemented FEAT_XS we would need to trap some of the
instructions added by that feature using these bits; but we don't
yet, so will need to add the .fgt markup when we do.)

Signed-off-by: Peter Maydell
---
  target/arm/cpregs.h | 30 ++
  target/arm/helper.c | 30 ++
  2 files changed, 60 insertions(+)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 17/23] target/arm: Mark up sysregs for HFGITR bits 12..17

2023-01-28 Thread Richard Henderson

On 1/27/23 07:55, Peter Maydell wrote:

Mark up the sysreg definitions for the system instructions
trapped by HFGITR bits 12..17. These bits cover AT address
translation instructions.

Signed-off-by: Peter Maydell
---
  target/arm/cpregs.h | 6 ++
  target/arm/helper.c | 6 ++
  2 files changed, 12 insertions(+)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 16/23] target/arm: Mark up sysregs for HFGITR bits 0..11

2023-01-28 Thread Richard Henderson

On 1/27/23 07:55, Peter Maydell wrote:

Mark up the sysreg definitions for the system instructions
trapped by HFGITR bits 0..11. These bits cover various
cache maintenance operations.

Signed-off-by: Peter Maydell
---
  target/arm/cpregs.h | 14 ++
  target/arm/helper.c | 28 
  2 files changed, 42 insertions(+)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 15/23] target/arm: Mark up sysregs for HDFGRTR bits 12..63

2023-01-28 Thread Richard Henderson

On 1/27/23 07:54, Peter Maydell wrote:

Mark up the sysreg definitions for the registers trapped
by HDFGRTR/HDFGWTR bits 12..x.

Bits 12..22 and bit 58 are for PMU registers.

The remaining bits in HDFGRTR/HDFGWTR are for traps on
registers that are part of features we don't implement:

Bits 23..32 and 63 : FEAT_SPE
Bits 33..48 : FEAT_ETE
Bits 50..56 : FEAT_TRBE
Bits 59..61 : FEAT_BRBE
Bit 62 : FEAT_SPEv1p2.

Signed-off-by: Peter Maydell
---
  target/arm/cpregs.h | 12 
  target/arm/helper.c | 37 +
  2 files changed, 49 insertions(+)


Reviewed-by: Richard Henderson 

r~



Re: [PATCH 0/4] E500 cleanups and enhancements

2023-01-28 Thread Daniel Henrique Barboza




On 1/25/23 10:00, Bernhard Beschow wrote:

This series includes some cleanups I came across when working on the ppce500
machine. Furthermore, it enables support for the 'dumpdtb' QMP/HMP command
which was missing so far.

Bernhard Beschow (4):
   hw/ppc: Set machine->fdt in e500 machines
   hw/ppc/e500{,plat}: Drop redundant checks for presence of platform bus
   hw/ppc/e500.c: Avoid hardcoding parent device in
 create_devtree_etsec()
   hw/ppc/e500.c: Attach eSDHC unimplemented region to ccsr_addr_space


Reviewed-by: Daniel Henrique Barboza 

And queued in gitlab.com/danielhb/qemu/tree/ppc-next. Thanks,


Daniel



  hw/ppc/e500.c | 24 
  hw/ppc/e500plat.c |  9 +++--
  2 files changed, 19 insertions(+), 14 deletions(-)





Re: [PATCH 0/4] PCI-related cleanup for ppc/powernv

2023-01-28 Thread Daniel Henrique Barboza




On 1/27/23 09:28, Frederic Barrat wrote:

Pretty simple PCI-related cleanup for powernv

Frederic Barrat (4):
   ppc/pnv/pci: Cleanup PnvPHBPecState structure
   ppc/pnv/pci: Remove duplicate definition of PNV_PHB5_DEVICE_ID
   ppc/pnv/pci: Update PHB5 version register
   ppc/pnv/pci: Fix PHB xscom registers memory region name


Queued in gitlab.com/danielhb/qemu/tree/ppc-next. Thanks,


Daniel



  hw/pci-host/pnv_phb4.c | 2 +-
  include/hw/pci-host/pnv_phb4.h | 5 +
  2 files changed, 2 insertions(+), 5 deletions(-)





[PATCH v2] hvf: arm: Add support for GICv3

2023-01-28 Thread Alexander Graf
We currently only support GICv2 emulation. To also support GICv3, we will
need to pass a few system registers into their respective handler functions.

This patch adds support for HVF to call into the TCG callbacks for GICv3
system register handlers. This is safe because the GICv3 TCG code is generic
as long as we limit ourselves to EL0 and EL1 - which are the only modes
supported by HVF.

To make sure nobody trips over that, we also annotate callbacks that don't
work in HVF mode, such as EL state change hooks.

With GICv3 support in place, we can run with more than 8 vCPUs.

Signed-off-by: Alexander Graf 

---

v1 -> v2:

  - assert when guest has EL2/EL3 and uses non-TCG GICv3
  - use defines for sysreg masks
---
 hw/intc/arm_gicv3_cpuif.c   |  15 +++-
 target/arm/hvf/hvf.c| 151 
 target/arm/hvf/trace-events |   2 +
 3 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index b17b29288c..c4ff595742 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -21,6 +21,7 @@
 #include "hw/irq.h"
 #include "cpu.h"
 #include "target/arm/cpregs.h"
+#include "sysemu/tcg.h"
 
 /*
  * Special case return value from hppvi_index(); must be larger than
@@ -2810,6 +2811,8 @@ void gicv3_init_cpuif(GICv3State *s)
  * which case we'd get the wrong value.
  * So instead we define the regs with no ri->opaque info, and
  * get back to the GICv3CPUState from the CPUARMState.
+ *
+ * These CP regs callbacks can be called from either TCG or HVF code.
  */
 define_arm_cp_regs(cpu, gicv3_cpuif_reginfo);
 
@@ -2905,6 +2908,16 @@ void gicv3_init_cpuif(GICv3State *s)
 define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr23_reginfo);
 }
 }
-arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, cs);
+if (tcg_enabled()) {
+/*
+ * We can only trap EL changes with TCG. However the GIC interrupt
+ * state only changes on EL changes involving EL2 or EL3, so for
+ * the non-TCG case this is OK, as EL2 and EL3 can't exist.
+ */
+arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, cs);
+} else {
+assert(!arm_feature(&cpu->env, ARM_FEATURE_EL2));
+assert(!arm_feature(&cpu->env, ARM_FEATURE_EL3));
+}
 }
 }
diff --git a/target/arm/hvf/hvf.c b/target/arm/hvf/hvf.c
index 060aa0ccf4..ad65603445 100644
--- a/target/arm/hvf/hvf.c
+++ b/target/arm/hvf/hvf.c
@@ -80,6 +80,33 @@
 #define SYSREG_PMCCNTR_EL0SYSREG(3, 3, 9, 13, 0)
 #define SYSREG_PMCCFILTR_EL0  SYSREG(3, 3, 14, 15, 7)
 
+#define SYSREG_ICC_AP0R0_EL1 SYSREG(3, 0, 12, 8, 4)
+#define SYSREG_ICC_AP0R1_EL1 SYSREG(3, 0, 12, 8, 5)
+#define SYSREG_ICC_AP0R2_EL1 SYSREG(3, 0, 12, 8, 6)
+#define SYSREG_ICC_AP0R3_EL1 SYSREG(3, 0, 12, 8, 7)
+#define SYSREG_ICC_AP1R0_EL1 SYSREG(3, 0, 12, 9, 0)
+#define SYSREG_ICC_AP1R1_EL1 SYSREG(3, 0, 12, 9, 1)
+#define SYSREG_ICC_AP1R2_EL1 SYSREG(3, 0, 12, 9, 2)
+#define SYSREG_ICC_AP1R3_EL1 SYSREG(3, 0, 12, 9, 3)
+#define SYSREG_ICC_ASGI1R_EL1SYSREG(3, 0, 12, 11, 6)
+#define SYSREG_ICC_BPR0_EL1  SYSREG(3, 0, 12, 8, 3)
+#define SYSREG_ICC_BPR1_EL1  SYSREG(3, 0, 12, 12, 3)
+#define SYSREG_ICC_CTLR_EL1  SYSREG(3, 0, 12, 12, 4)
+#define SYSREG_ICC_DIR_EL1   SYSREG(3, 0, 12, 11, 1)
+#define SYSREG_ICC_EOIR0_EL1 SYSREG(3, 0, 12, 8, 1)
+#define SYSREG_ICC_EOIR1_EL1 SYSREG(3, 0, 12, 12, 1)
+#define SYSREG_ICC_HPPIR0_EL1SYSREG(3, 0, 12, 8, 2)
+#define SYSREG_ICC_HPPIR1_EL1SYSREG(3, 0, 12, 12, 2)
+#define SYSREG_ICC_IAR0_EL1  SYSREG(3, 0, 12, 8, 0)
+#define SYSREG_ICC_IAR1_EL1  SYSREG(3, 0, 12, 12, 0)
+#define SYSREG_ICC_IGRPEN0_EL1   SYSREG(3, 0, 12, 12, 6)
+#define SYSREG_ICC_IGRPEN1_EL1   SYSREG(3, 0, 12, 12, 7)
+#define SYSREG_ICC_PMR_EL1   SYSREG(3, 0, 4, 6, 0)
+#define SYSREG_ICC_RPR_EL1   SYSREG(3, 0, 12, 11, 3)
+#define SYSREG_ICC_SGI0R_EL1 SYSREG(3, 0, 12, 11, 7)
+#define SYSREG_ICC_SGI1R_EL1 SYSREG(3, 0, 12, 11, 5)
+#define SYSREG_ICC_SRE_EL1   SYSREG(3, 0, 12, 12, 5)
+
 #define WFX_IS_WFE (1 << 0)
 
 #define TMR_CTL_ENABLE  (1 << 0)
@@ -788,6 +815,43 @@ static bool is_id_sysreg(uint32_t reg)
SYSREG_CRM(reg) < 8;
 }
 
+static uint32_t hvf_reg2cp_reg(uint32_t reg)
+{
+return ENCODE_AA64_CP_REG(CP_REG_ARM64_SYSREG_CP,
+  (reg >> SYSREG_CRN_SHIFT) & SYSREG_CRN_MASK,
+  (reg >> SYSREG_CRM_SHIFT) & SYSREG_CRM_MASK,
+  (reg >> SYSREG_OP0_SHIFT) & SYSREG_OP0_MASK,
+  (reg >> SYSREG_OP1_SHIFT) & SYSREG_OP1_MASK,
+  (reg >> SYSREG_OP2_SHIFT) & SYSREG_OP2_MASK);
+}
+
+static bool hvf_sysreg_read_cp(CPUState *cpu, uint32_t reg, uint64_t *val)
+{
+ARMCPU *arm_cpu = ARM_CPU(cpu

Re: [PATCH] hvf: arm: Add support for GICv3

2023-01-28 Thread Alexander Graf



On 06.01.23 17:37, Peter Maydell wrote:

On Mon, 19 Dec 2022 at 22:08, Alexander Graf  wrote:

We currently only support GICv2 emulation. To also support GICv3, we will
need to pass a few system registers into their respective handler functions.

This patch adds support for HVF to call into the TCG callbacks for GICv3
system register handlers. This is safe because the GICv3 TCG code is generic
as long as we limit ourselves to EL0 and EL1 - which are the only modes
supported by HVF.

To make sure nobody trips over that, we also annotate callbacks that don't
work in HVF mode, such as EL state change hooks.

With GICv3 support in place, we can run with more than 8 vCPUs.

Signed-off-by: Alexander Graf 
---
  hw/intc/arm_gicv3_cpuif.c   |   8 +-
  target/arm/hvf/hvf.c| 151 
  target/arm/hvf/trace-events |   2 +
  3 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index b17b29288c..b4e387268c 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -21,6 +21,7 @@
  #include "hw/irq.h"
  #include "cpu.h"
  #include "target/arm/cpregs.h"
+#include "sysemu/tcg.h"

  /*
   * Special case return value from hppvi_index(); must be larger than
@@ -2810,6 +2811,8 @@ void gicv3_init_cpuif(GICv3State *s)
   * which case we'd get the wrong value.
   * So instead we define the regs with no ri->opaque info, and
   * get back to the GICv3CPUState from the CPUARMState.
+ *
+ * These CP regs callbacks can be called from either TCG or HVF code.
   */
  define_arm_cp_regs(cpu, gicv3_cpuif_reginfo);

@@ -2905,6 +2908,9 @@ void gicv3_init_cpuif(GICv3State *s)
  define_arm_cp_regs(cpu, gicv3_cpuif_ich_apxr23_reginfo);
  }
  }
-arm_register_el_change_hook(cpu, gicv3_cpuif_el_change_hook, cs);
+if (tcg_enabled()) {
+/* We can only trap EL changes with TCG for now */

We could expand this a bit:

  We can only trap EL changes with TCG. However the GIC interrupt
  state only changes on EL changes involving EL2 or EL3, so for
  the non-TCG case this is OK, as EL2 and EL3 can't exist.

and assert:
  assert(!arm_feature(&cpu->env, ARM_FEATURE_EL2));
  assert(!arm_feature(&cpu->env, ARM_FEATURE_EL3));



Good idea! Let me add that.





+static uint32_t hvf_reg2cp_reg(uint32_t reg)
+{
+return ENCODE_AA64_CP_REG(CP_REG_ARM64_SYSREG_CP,
+  (reg >> 10) & 0xf,
+  (reg >> 1) & 0xf,
+  (reg >> 20) & 0x3,
+  (reg >> 14) & 0x7,
+  (reg >> 17) & 0x7);

This file has #defines for these shift and mask constants
(SYSREG_OP0_SHIFT etc).



Ugh, thanks for catching that!





+}
+
+static bool hvf_sysreg_read_cp(CPUState *cpu, uint32_t reg, uint64_t *val)
+{
+ARMCPU *arm_cpu = ARM_CPU(cpu);
+CPUARMState *env = &arm_cpu->env;
+const ARMCPRegInfo *ri;
+
+ri = get_arm_cp_reginfo(arm_cpu->cp_regs, hvf_reg2cp_reg(reg));
+if (ri) {
+if (ri->accessfn) {
+if (ri->accessfn(env, ri, true) != CP_ACCESS_OK) {
+return false;
+}
+}
+if (ri->type & ARM_CP_CONST) {
+*val = ri->resetvalue;
+} else if (ri->readfn) {
+*val = ri->readfn(env, ri);
+} else {
+*val = CPREG_FIELD64(env, ri);
+}
+trace_hvf_vgic_read(ri->name, *val);
+return true;
+}

Can we get here for attempts by EL0 to access EL1-only
sysregs, or does hvf send the exception to EL1 without
trapping out to us? If we can get here for EL0 accesses we
need to check against ri->access as well as ri->accessfn.



I just validated, GICv3 EL1 registers trap to EL1 inside the guest:


$ cat a.S
.global start
.global _main
_main:
start:
    mrs x0, ICC_AP0R0_EL1
    mov x0, #0x1234
    msr ICC_AP0R0_EL1, x0
    mov x0, #0
    ret
$ gcc -nostdlib a.S
$ gdb ./a.out
(gdb) r
Program received signal SIGILL, Illegal instruction.
0x004000d4 in start ()
(gdb) x/i $pc
=> 0x4000d4 :    mrs x0, icc_ap0r0_el1


So no need to check ri->access :)


Alex




Re: [PATCH 00/17] audio: improve callback interface for audio frontends

2023-01-28 Thread Mark Cave-Ayland

On 28/01/2023 09:03, Volker Rümelin wrote:


Am 22.01.23 um 19:13 schrieb Mark Cave-Ayland:

On 15/01/2023 13:45, Volker Rümelin wrote:


Am 15.01.23 um 14:08 schrieb Volker Rümelin:

Ccing a few more people who might be interested in this patch series.

@Mark:
After this patch series, the code in your out of tree ASC audio device (and a few 
in tree audio devices) could be simplified. write_audio() and the loops calling 
write_audio() could be removed.


Hi Volker,

I know we have discussed this in a separate thread off-list, but this is 
fantastic!

Just out of interest, if the available bytes wraps the circular buffer will the 
audio core call the audio callback twice to maximise the ability of the guest to 
generate samples before the next audio timer? Or does that not make much difference 
in practice?


Hi Mark,

I guess with circular buffer you refer to the mixing engine buffer. The audio system 
calls the callback once on every audio timer event. If the available bytes wrap the 
mixing engine ringbuffer, the audio_pcm_sw_resample_out() function uses two writes to 
write all available bytes. Compared to the unpatched version, nothing has changed in 
this regard. Of course the audio frontend devices are still free to write 'avail' 
bytes with multiple calls to AUD_write().


With best regards,
Volker


Yes that makes sense, thanks for confirming this. I'm sorry that I'm not familiar 
enough with the audio side to do a proper review but obviously the A-B still stands 
and I would certainly be keen to see this merged.



ATB,

Mark.



Re: [PATCH] pci: add enforce_slot_reserved_mask_manual property

2023-01-28 Thread Mark Cave-Ayland

On 28/01/2023 03:39, Chuck Zmudzinski wrote:


On 1/27/2023 8:28 AM, Michael S. Tsirkin wrote:

On Sun, Jan 15, 2023 at 07:49:51PM -0500, Chuck Zmudzinski wrote:

The current reserved slot check in do_pci_register_device(), added with
commit 8b8849844fd6


add ("subject here") please


, is done even if the pci device being added is
configured manually for a particular slot. The new property, when set
to false, disables the check when the device is configured to request a
particular slot. This allows an administrator or management tool to
override slot_reserved_mask for a pci device by requesting a particular
slot for the device. The new property is initialized to true, which
preserves the existing behavior of slot_reserved_mask by default.

Signed-off-by: Chuck Zmudzinski 


Thanks!
I'm trying to think of the best default for this.


I think it would be better for the default value of
enforce_slot_reserved_mask_manual to be false, so that a
user-specified slot will by default override slot_reserved_mask.
But doing that would change the current behavior of
slot_reserved_mask.

Currently, this is the only place where slot_reserved_mask is used in all
of the Qemu source (code from hw/sparc64/sun4u.c):

-- snip ---
     /* Only in-built Simba APBs can exist on the root bus, slot 0 on busA is
    reserved (leaving no slots free after on-board devices) however slots
    0-3 are free on busB */
     pci_bus->slot_reserved_mask = 0xfffc;
     pci_busA->slot_reserved_mask = 0xfff1;
     pci_busB->slot_reserved_mask = 0xfff0;
-- snip ---

I think we could safely change the default value of
enforce_slot_reserved_mask_manual to false but set
it to true for the sparc64 sun4u board here to preserve
the current behavior of the only existing board in Qemu
that uses slot_reserved_mask.

What do you think?


Users are trying to configure a specific device on a reserved
slot. Should we?
CC a bunch more people for visibility. Input, anyone?


For a bit of background, slot_reserved_mask was added by me to solve a problem with 
the sun4u machine: on a real Ultra-5, the pci "A" bus has 2 free slots and the pci 
"B" bus has 4 free slots. Whilst it is possible to plug a PCI device into any slot in 
QEMU, the PCI bridges only have IRQ mapping registers for those 6 slots, so you can 
easily end up with an auto-allocated slot where it is impossible for the OS to map 
the IRQ.


Hence slot_reserved_mask was originally intended to mark slots as being unavailable 
for both manual and automatic allocation to ensure that devices plugged into both PCI 
buses would always work.


If there is a need to change/refactor the logic then I can test the sun4u machine to 
ensure the original test case still works.



ATB,

Mark.



Re: [PATCH] pci: add enforce_slot_reserved_mask_manual property

2023-01-28 Thread Chuck Zmudzinski
On 1/28/23 2:14 PM, Michael S. Tsirkin wrote:
> On Sat, Jan 28, 2023 at 08:20:55AM -0500, Chuck Zmudzinski wrote:
>> On 1/28/23 5:26 AM, Michael S. Tsirkin wrote:
>> > On Fri, Jan 27, 2023 at 10:39:28PM -0500, Chuck Zmudzinski wrote:
>> >> On 1/27/2023 8:28 AM, Michael S. Tsirkin wrote:
>> >> > On Sun, Jan 15, 2023 at 07:49:51PM -0500, Chuck Zmudzinski wrote:
>> >> > > The current reserved slot check in do_pci_register_device(), added 
>> >> > > with
>> >> > > commit 8b8849844fd6
>> >> >
>> >> > add ("subject here") please
>> >> >
>> >> > > , is done even if the pci device being added is
>> >> > > configured manually for a particular slot. The new property, when set
>> >> > > to false, disables the check when the device is configured to request 
>> >> > > a
>> >> > > particular slot. This allows an administrator or management tool to
>> >> > > override slot_reserved_mask for a pci device by requesting a 
>> >> > > particular
>> >> > > slot for the device. The new property is initialized to true which
>> >> > > preserves the existing behavior of slot_reserved_mask by default.
>> >> > > 
>> >> > > Signed-off-by: Chuck Zmudzinski 
>> >> >
>> >> > Thanks!
>> >> > I'm trying to think of the best default for this.
>> >> 
>> >> I think it would be better for the default value of
>> >> enforce_slot_reserved_mask_manual to be false, so that a
>> >> user-specified slot will by default override slot_reserved_mask.
>> >> But doing that would change the current behavior of
>> >> slot_reserved_mask.
>> >> 
>> >> Currently, this is the only place where slot_reserved_mask is used in all
>> >> of the Qemu source (code from hw/sparc64/sun4u.c):
>> >> 
>> >> -- snip ---
>> >>     /* Only in-built Simba APBs can exist on the root bus, slot 0 on busA 
>> >> is
>> >>    reserved (leaving no slots free after on-board devices) however 
>> >> slots
>> >>    0-3 are free on busB */
>> >>     pci_bus->slot_reserved_mask = 0xfffc;
>> >>     pci_busA->slot_reserved_mask = 0xfff1;
>> >>     pci_busB->slot_reserved_mask = 0xfff0;
>> >> -- snip ---
>> >> 
>> >> I think we could safely change the default value of
>> >> enforce_slot_reserved_mask_manual to false but set
>> >> it to true for the sparc64 sun4u board here to preserve
>> >> the current behavior of the only existing board in Qemu
>> >> that uses slot_reserved_mask.
>> >> 
>> >> What do you think?
>> > 
>> > I guess first can you answer whether this is still needed
>> > with the latest Xen patches?
>> > 
>> 
>> It's not really needed except for experimental purposes to allow
>> an administrator to test experimental configurations with a device
>> other than the igd at slot 2. That might be useful in some cases,
>> but it is not really necessary unless someone asks for that capability.
>> If libvirt users who ordinarily like to manually specify all the
>> settings will be OK with the proposed patch to xen that prevents
>> an administrator from being able to override a new setting that
>> reserves slot 2 for the igd for type "xenfv" machines configured for
>> igd passthrough, then there is no need for this patch. I don't think
>> many users need the capability to insert a different device in slot 2 for
>> the "xenfv" machine type configured with igd-passthru=on, so I would be
>> OK if this patch is not included in qemu.
>> 
>> Chuck
> 
> Pls wait and see if that patch gets picked up. Let me know.
> 

A day or two ago Anthony said he would look at the xen patch soon. So we'll
just wait for him, and I'll let you know if he is going to pull it up.



RE: [PATCH v2 00/13] Introduce igb

2023-01-28 Thread Sriram Yagnaraman
> -Original Message-
> From: Akihiko Odaki 
> Sent: Thursday, 26 January 2023 12:32
> To: Sriram Yagnaraman ; Jason Wang
> 
> Cc: Dmitry Fleytman ; Michael S. Tsirkin
> ; Marcel Apfelbaum ;
> Alex Bennée ; Philippe Mathieu-Daudé
> ; Thomas Huth ; Wainer dos Santos
> Moschetta ; Beraldo Leal ;
> Cleber Rosa ; Laurent Vivier ;
> Paolo Bonzini ; Alexander Bulekov ;
> Bandan Das ; Stefan Hajnoczi ;
> Darren Kenny ; Qiuhao Li
> ; qemu-devel@nongnu.org; qemu-
> p...@nongnu.org; de...@daynix.com; Yan Vugenfirer
> ; Yuri Benditovich 
> Subject: Re: [PATCH v2 00/13] Introduce igb
> 
> On 2023/01/26 18:34, Sriram Yagnaraman wrote:
> >
> >> -Original Message-
> >> From: Sriram Yagnaraman
> >> Sent: Tuesday, 24 January 2023 09:54
> >> To: Akihiko Odaki ; Jason Wang
> >> 
> >> Cc: Dmitry Fleytman ; Michael S. Tsirkin
> >> ; Marcel Apfelbaum ;
> Alex
> >> Bennée ; Philippe Mathieu-Daudé
> >> ; Thomas Huth ; Wainer dos
> >> Santos Moschetta ; Beraldo Leal
> >> ; Cleber Rosa ; Laurent Vivier
> >> ; Paolo Bonzini ; Alexander
> >> Bulekov ; Bandan Das ; Stefan
> Hajnoczi
> >> ; Darren Kenny ;
> Qiuhao
> >> Li ; qemu-devel@nongnu.org; qemu-
> >> p...@nongnu.org; de...@daynix.com; Yan Vugenfirer
> >> ; Yuri Benditovich 
> >> Subject: RE: [PATCH v2 00/13] Introduce igb
> >>
> >>
> >>> -Original Message-
> >>> From: Akihiko Odaki 
> >>> Sent: Tuesday, 24 January 2023 05:54
> >>> To: Jason Wang ; Sriram Yagnaraman
> >>> 
> >>> Cc: Dmitry Fleytman ; Michael S. Tsirkin
> >>> ; Marcel Apfelbaum
> ;
> >> Alex
> >>> Bennée ; Philippe Mathieu-Daudé
> >>> ; Thomas Huth ; Wainer dos
> >> Santos
> >>> Moschetta ; Beraldo Leal ;
> >>> Cleber Rosa ; Laurent Vivier ;
> >>> Paolo Bonzini ; Alexander Bulekov
> >>> ; Bandan Das ; Stefan Hajnoczi
> >>> ; Darren Kenny ;
> >> Qiuhao
> >>> Li ; qemu-devel@nongnu.org; qemu-
> >>> p...@nongnu.org; de...@daynix.com; Yan Vugenfirer
> >>> ; Yuri Benditovich
> >>> 
> >>> Subject: Re: [PATCH v2 00/13] Introduce igb
> >>>
> >>> On 2023/01/16 17:01, Jason Wang wrote:
>  On Sat, Jan 14, 2023 at 12:10 PM Akihiko Odaki
> >>>  wrote:
> >
> > Based-on: <20230114035919.35251-1-akihiko.od...@daynix.com>
> > ([PATCH 00/19] e1000x cleanups (preliminary for IGB))
> >
> > igb is a family of Intel's gigabit ethernet controllers. This
> > series implements
> > 82576 emulation in particular. You can see the last patch for the
> >>> documentation.
> >
> > Note that there is another effort to bring 82576 emulation. This
> > series was developed independently by Sriram Yagnaraman.
> > https://lists.gnu.org/archive/html/qemu-devel/2022-
> 12/msg04670.htm
> > l
> >
> > It is possible to merge the work from Sriram Yagnaraman and to
> > cherry-pick useful changes from this series later.
> >
> > I think there are several different ways to get the changes into
> > the
> >> mainline.
> > I'm open to any options.
> 
>  I can only do reviews for the general networking part but not the
>  82576 specific part. It would be better if either of the series can
>  get some ACKs from some ones that they are familiar with 82576,
>  then I can try to merge.
> 
>  Thanks
> >>>
> >>> I have just sent v3 to the list.
> >>>
> >>> Sriram Yagnaraman, who wrote another series for 82576, is the only
> >>> person I know who is familiar with the device.
> >>>
> >>> Sriram, can you take a look at v3 I have just sent?
> >>
> >> I am at best a good interpreter of the 82576 datasheet. I will review
> >> your changes get back here.
> >
> > I have reviewed and tested your changes and it looks great to me in general.
> > I would like to note some features that I would like to add on top of
> > your patch, if you have not worked on these already :)
> > - PFRSTD (PF reset done)
> > - SRRCTL (Rx desc buf size)
> > - RLPML (oversized packet handling)
> > - MAC/VLAN anti-spoof checks
> > - VMOLR_STRVLAN and RPLOLR_STRVLAN (VLAN stripping for VFs)
> > - VMVIR (VLAN insertion for VFs)
> > - VF reset
> > - VFTE, VFRE, VFLRE
> > - VF stats
> > - Set EITR initial value
> >
> > Since this is a new device and there are no existing users, is it possible 
> > to get
> the change into baseline first and fix missing features and bugs soon after?
> 
> Thanks for reviewing,
> 
> I have just submitted v4. The difference from v3 is only that igb now 
> correctly
> specifies VFs associated with queues for DMA.
> 
> RX descriptor buffer size in SRRCTL is respected since v3. I think the other
> features are missing. I am not planning to implement them either, but I'm
> considering to test the code with DPDK and I may add features it requires.

Ok, I just sent a patchset adding most of the features I listed above ([PATCH 
0/9] igb: add missing feature set).

> 
> I also want to get this series into the mainline before adding new features 
> as it
> is already so big, but please tell me if you noticed bugs, especially ones 
> which
> can be fixed withou

[PATCH 0/9] igb: add missing feature set from

2023-01-28 Thread Sriram Yagnaraman
Based-on: <20230126111943.38695-1-akihiko.od...@daynix.com>
([PATCH v4 00/13] Introduce igb)

Rebased on latest changes from Akihiko, and merged changes from my
original patchset:
https://lists.gnu.org/archive/html/qemu-devel/2022-12/msg04670.html

Sriram Yagnaraman (9):
  MAINTAINERS: Add Sriram Yagnaraman as a igb reviewer
  igb: handle PF/VF reset properly
  igb: implement VFRE and VFTE registers
  igb: check oversized packets for VMDq
  igb: respect E1000_VMOLR_RSSE
  igb: add ICR_RXDW
  igb: implement VF Tx and Rx stats
  igb: respect VT_CTL ignore MAC field
  igb: respect VMVIR and VMOLR for VLAN

 MAINTAINERS  |   1 +
 hw/net/e1000x_regs.h |   5 +
 hw/net/igb_core.c| 255 ---
 hw/net/igb_regs.h|   3 +-
 hw/net/trace-events  |   4 +
 5 files changed, 205 insertions(+), 63 deletions(-)

-- 
2.34.1




[PATCH 3/9] igb: implement VFRE and VFTE registers

2023-01-28 Thread Sriram Yagnaraman
Also add checks for RXDCTL/TXDCTL queue enable bits

Signed-off-by: Sriram Yagnaraman 
---
 hw/net/igb_core.c | 42 +++---
 hw/net/igb_regs.h |  3 ++-
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 9bd53cc25f..6bca5459b9 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -778,6 +778,19 @@ igb_txdesc_writeback(IGBCore *core, dma_addr_t base,
 return igb_tx_wb_eic(core, txi->idx);
 }
 
+static inline bool
+igb_tx_enabled(IGBCore *core, const E1000E_RingInfo *txi)
+{
+bool vmdq = core->mac[MRQC] & 1;
+uint16_t qn = txi->idx;
+uint16_t vfn = (qn > IGB_MAX_VF_FUNCTIONS) ?
+   (qn - IGB_MAX_VF_FUNCTIONS) : qn;
+
+return (core->mac[TCTL] & E1000_TCTL_EN) &&
+(vmdq ? (core->mac[VFTE] & BIT(vfn)) : true) &&
+(core->mac[TXDCTL0 + (qn * 16)] & E1000_TXDCTL_QUEUE_ENABLE);
+}
+
 static void
 igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
 {
@@ -787,8 +800,7 @@ igb_start_xmit(IGBCore *core, const IGB_TxRing *txr)
 const E1000E_RingInfo *txi = txr->i;
 uint32_t eic = 0;
 
-/* TODO: check if the queue itself is enabled too. */
-if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
+if (!igb_tx_enabled(core, txi)) {
 trace_e1000e_tx_disabled();
 return;
 }
@@ -1003,6 +1015,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
 queues = BIT(def_pl >> E1000_VT_CTL_DEFAULT_POOL_SHIFT);
 }
 
+queues &= core->mac[VFRE];
 igb_rss_parse_packet(core, core->rx_pkt, external_tx != NULL, 
rss_info);
 if (rss_info->queue & 1) {
 queues <<= 8;
@@ -1486,7 +1499,7 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4);
 
 uint16_t queues = 0;
-uint32_t n;
+uint32_t n = 0;
 uint8_t min_buf[ETH_ZLEN];
 struct iovec min_iov;
 struct eth_header *ehdr;
@@ -1566,26 +1579,22 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 }
 
 igb_rx_ring_init(core, &rxr, i);
-
-trace_e1000e_rx_rss_dispatched_to_queue(rxr.i->idx);
-
 if (!igb_has_rxbufs(core, rxr.i, total_size)) {
 retval = 0;
 }
 }
 
 if (retval) {
-n = E1000_ICR_RXT0;
-
 igb_rx_fix_l4_csum(core, core->rx_pkt);
 
 for (i = 0; i < IGB_NUM_QUEUES; i++) {
-if (!(queues & BIT(i))) {
+if (!(queues & BIT(i)) ||
+!(core->mac[E1000_RXDCTL(i) >> 2] & 
E1000_RXDCTL_QUEUE_ENABLE)) {
 continue;
 }
 
 igb_rx_ring_init(core, &rxr, i);
-
+trace_e1000e_rx_rss_dispatched_to_queue(rxr.i->idx);
 igb_write_packet_to_guest(core, core->rx_pkt, &rxr, &rss_info);
 
 /* Check if receive descriptor minimum threshold hit */
@@ -1594,6 +1603,9 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 }
 
 core->mac[EICR] |= igb_rx_wb_eic(core, rxr.i->idx);
+
+/* same as RXDW (rx descriptor written back)*/
+n = E1000_ICR_RXT0;
 }
 
 trace_e1000e_rx_written_to_guest(n);
@@ -1981,9 +1993,16 @@ static void igb_set_vfmailbox(IGBCore *core, int index, 
uint32_t val)
 
 static void igb_vf_reset(IGBCore *core, uint16_t vfn)
 {
+uint16_t qn0 = vfn;
+uint16_t qn1 = vfn + IGB_MAX_VF_FUNCTIONS;
+
 /* disable Rx and Tx for the VF*/
-core->mac[VFTE] &= ~BIT(vfn);
+core->mac[RXDCTL0 + (qn0 * 16)] &= ~E1000_RXDCTL_QUEUE_ENABLE;
+core->mac[RXDCTL0 + (qn1 * 16)] &= ~E1000_RXDCTL_QUEUE_ENABLE;
+core->mac[TXDCTL0 + (qn0 * 16)] &= ~E1000_TXDCTL_QUEUE_ENABLE;
+core->mac[TXDCTL0 + (qn1 * 16)] &= ~E1000_TXDCTL_QUEUE_ENABLE;
 core->mac[VFRE] &= ~BIT(vfn);
+core->mac[VFTE] &= ~BIT(vfn);
 /* indicate VF reset to PF */
 core->mac[VFLRE] |= BIT(vfn);
 /* VFLRE and mailbox use the same interrupt cause */
@@ -3889,6 +3908,7 @@ igb_phy_reg_init[] = {
 static const uint32_t igb_mac_reg_init[] = {
 [LEDCTL]= 2 | (3 << 8) | BIT(15) | (6 << 16) | (7 << 24),
 [EEMNGCTL]  = BIT(31),
+[TXDCTL0]   = E1000_TXDCTL_QUEUE_ENABLE,
 [RXDCTL0]   = E1000_RXDCTL_QUEUE_ENABLE | (1 << 16),
 [RXDCTL1]   = 1 << 16,
 [RXDCTL2]   = 1 << 16,
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
index ebf3e95023..084e751378 100644
--- a/hw/net/igb_regs.h
+++ b/hw/net/igb_regs.h
@@ -160,7 +160,8 @@ union e1000_adv_rx_desc {
 #define E1000_MRQC_RSS_FIELD_IPV6_UDP   0x0080
 #define E1000_MRQC_RSS_FIELD_IPV6_UDP_EX0x0100
 
-/* Additional Receive Descriptor Control definitions */
+/* Additional RX/TX Descriptor Control definitions */
+#define E1000_TXDCTL_QUEUE_ENABLE  0x0200 /* Enable specific Tx Queue */
 #define E1000_RXDCTL_QUEUE_ENABLE  0x0200 /* Enable specific R

[PATCH 1/9] MAINTAINERS: Add Sriram Yagnaraman as a igb reviewer

2023-01-28 Thread Sriram Yagnaraman
I would like to review and be informed on changes to igb device

Signed-off-by: Sriram Yagnaraman 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index ece23b2b15..7d0e84ce37 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2224,6 +2224,7 @@ F: tests/qtest/libqos/e1000e.*
 
 igb
 M: Akihiko Odaki 
+R: Sriram Yagnaraman 
 S: Maintained
 F: docs/system/devices/igb.rst
 F: hw/net/igb*
-- 
2.34.1




[PATCH 4/9] igb: check oversized packets for VMDq

2023-01-28 Thread Sriram Yagnaraman
Signed-off-by: Sriram Yagnaraman 
---
 hw/net/igb_core.c | 74 ++-
 1 file changed, 54 insertions(+), 20 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 6bca5459b9..1eb7ba168f 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1476,6 +1476,30 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt 
*pkt,
 igb_update_rx_stats(core, size, total_size);
 }
 
+static inline bool
+igb_is_oversized(IGBCore *core, const E1000E_RingInfo *rxi, size_t size)
+{
+bool vmdq = core->mac[MRQC] & 1;
+uint16_t qn = rxi->idx;
+uint16_t pool = (qn > IGB_MAX_VF_FUNCTIONS) ?
+   (qn - IGB_MAX_VF_FUNCTIONS) : qn;
+
+bool lpe = (vmdq ? core->mac[VMOLR0 + pool] & E1000_VMOLR_LPE :
+core->mac[RCTL] & E1000_RCTL_LPE);
+bool sbp = core->mac[RCTL] & E1000_RCTL_SBP;
+int maximum_ethernet_vlan_size = 1522;
+int maximum_ethernet_lpe_size =
+(vmdq ? core->mac[VMOLR0 + pool] & E1000_VMOLR_RLPML_MASK :
+ core->mac[RLPML] & E1000_VMOLR_RLPML_MASK);
+
+if (size > maximum_ethernet_lpe_size ||
+(size > maximum_ethernet_vlan_size && !lpe && !sbp)) {
+return true;
+}
+
+return false;
+}
+
 static inline void
 igb_rx_fix_l4_csum(IGBCore *core, struct NetRxPkt *pkt)
 {
@@ -1499,7 +1523,8 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4);
 
 uint16_t queues = 0;
-uint32_t n = 0;
+uint16_t oversized = 0;
+uint32_t icr_bits = 0;
 uint8_t min_buf[ETH_ZLEN];
 struct iovec min_iov;
 struct eth_header *ehdr;
@@ -1509,7 +1534,7 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 E1000E_RxRing rxr;
 E1000E_RSSInfo rss_info;
 size_t total_size;
-ssize_t retval;
+ssize_t retval = 0;
 int i;
 
 trace_e1000e_rx_receive_iov(iovcnt);
@@ -1550,11 +1575,6 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 filter_buf = min_buf;
 }
 
-/* Discard oversized packets if !LPE and !SBP. */
-if (e1000x_is_oversized(core->mac, size)) {
-return orig_size;
-}
-
 ehdr = PKT_GET_ETH_HDR(filter_buf);
 net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr));
 
@@ -1571,8 +1591,6 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 total_size = net_rx_pkt_get_total_len(core->rx_pkt) +
 e1000x_fcs_len(core->mac);
 
-retval = orig_size;
-
 for (i = 0; i < IGB_NUM_QUEUES; i++) {
 if (!(queues & BIT(i))) {
 continue;
@@ -1580,42 +1598,58 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 
 igb_rx_ring_init(core, &rxr, i);
 if (!igb_has_rxbufs(core, rxr.i, total_size)) {
-retval = 0;
+icr_bits |= E1000_ICS_RXO;
 }
 }
 
-if (retval) {
+if (!icr_bits) {
+retval = orig_size;
 igb_rx_fix_l4_csum(core, core->rx_pkt);
 
 for (i = 0; i < IGB_NUM_QUEUES; i++) {
-if (!(queues & BIT(i)) ||
-!(core->mac[E1000_RXDCTL(i) >> 2] & 
E1000_RXDCTL_QUEUE_ENABLE)) {
+if (!(queues & BIT(i))) {
 continue;
 }
 
 igb_rx_ring_init(core, &rxr, i);
+if (igb_is_oversized(core, rxr.i, size)) {
+oversized |= BIT(i);
+continue;
+}
+
+if (!(core->mac[RXDCTL0 + (i * 16)] & E1000_RXDCTL_QUEUE_ENABLE)) {
+continue;
+}
+
 trace_e1000e_rx_rss_dispatched_to_queue(rxr.i->idx);
 igb_write_packet_to_guest(core, core->rx_pkt, &rxr, &rss_info);
 
 /* Check if receive descriptor minimum threshold hit */
 if (igb_rx_descr_threshold_hit(core, rxr.i)) {
-n |= E1000_ICS_RXDMT0;
+icr_bits |= E1000_ICS_RXDMT0;
 }
 
 core->mac[EICR] |= igb_rx_wb_eic(core, rxr.i->idx);
 
 /* same as RXDW (rx descriptor written back)*/
-n = E1000_ICR_RXT0;
+icr_bits |= E1000_ICR_RXT0;
 }
+}
+
+/* 8.19.37 increment ROC only if packet is oversized for all queues */
+if (oversized == queues) {
+trace_e1000x_rx_oversized(size);
+e1000x_inc_reg_if_not_full(core->mac, ROC);
+}
 
-trace_e1000e_rx_written_to_guest(n);
+if (icr_bits & E1000_ICR_RXT0) {
+trace_e1000e_rx_written_to_guest(icr_bits);
 } else {
-n = E1000_ICS_RXO;
-trace_e1000e_rx_not_written_to_guest(n);
+trace_e1000e_rx_not_written_to_guest(icr_bits);
 }
 
-trace_e1000e_rx_interrupt_set(n);
-igb_set_interrupt_cause(core, n);
+trace_e1000e_rx_interrupt_set(icr_bits);
+igb_set_interrupt_cause(core, icr_bits);
 
 return retval;
 }
-- 
2.34.1




[PATCH 7/9] igb: implement VF Tx and Rx stats

2023-01-28 Thread Sriram Yagnaraman
Please note that loopback counters for VM to VM traffic are not
implemented yet: VFGOTLBC, VFGPTLBC, VFGORLBC and VFGPRLBC.

Signed-off-by: Sriram Yagnaraman 
---
 hw/net/igb_core.c | 31 ++-
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 43ff387b16..375d9d5e34 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -488,7 +488,7 @@ igb_tx_pkt_send(IGBCore *core, struct igb_tx *tx, int 
queue_index)
 }
 
 static void
-igb_on_tx_done_update_stats(IGBCore *core, struct NetTxPkt *tx_pkt)
+igb_on_tx_done_update_stats(IGBCore *core, struct NetTxPkt *tx_pkt, int qn)
 {
 static const int PTCregs[6] = { PTC64, PTC127, PTC255, PTC511,
 PTC1023, PTC1522 };
@@ -515,6 +515,14 @@ igb_on_tx_done_update_stats(IGBCore *core, struct NetTxPkt 
*tx_pkt)
 core->mac[GPTC] = core->mac[TPT];
 core->mac[GOTCL] = core->mac[TOTL];
 core->mac[GOTCH] = core->mac[TOTH];
+
+if (core->mac[MRQC] & 1) {
+uint16_t pool = (qn > IGB_MAX_VF_FUNCTIONS) ?
+(qn - IGB_MAX_VF_FUNCTIONS) : qn;
+
+core->mac[PVFGOTC0 + (pool * 64)] += tot_len;
+core->mac[PVFGPTC0 + (pool * 64)]++;
+}
 }
 
 static void
@@ -577,7 +585,7 @@ igb_process_tx_desc(IGBCore *core,
 core->mac[VET] & 0x);
 }
 if (igb_tx_pkt_send(core, tx, queue_index)) {
-igb_on_tx_done_update_stats(core, tx->tx_pkt);
+igb_on_tx_done_update_stats(core, tx->tx_pkt, queue_index);
 }
 }
 
@@ -1364,7 +1372,8 @@ igb_write_to_rx_buffers(IGBCore *core,
 }
 
 static void
-igb_update_rx_stats(IGBCore *core, size_t data_size, size_t data_fcs_size)
+igb_update_rx_stats(IGBCore *core, const E1000E_RingInfo *rxi,
+size_t data_size, size_t data_fcs_size)
 {
 e1000x_update_rx_total_stats(core->mac, data_size, data_fcs_size);
 
@@ -1380,6 +1389,18 @@ igb_update_rx_stats(IGBCore *core, size_t data_size, 
size_t data_fcs_size)
 default:
 break;
 }
+
+if (core->mac[MRQC] & 1) {
+uint16_t qn = rxi->idx;
+uint16_t pool = (qn > IGB_MAX_VF_FUNCTIONS) ?
+(qn - IGB_MAX_VF_FUNCTIONS) : qn;
+
+core->mac[PVFGORC0 + (pool * 64)] += data_size + 4;
+core->mac[PVFGPRC0 + (pool * 64)]++;
+if (net_rx_pkt_get_packet_type(core->rx_pkt) == ETH_PKT_MCAST) {
+core->mac[PVFMPRC0 + (pool * 64)]++;
+}
+}
 }
 
 static inline bool
@@ -1481,7 +1502,7 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt 
*pkt,
 
 } while (desc_offset < total_size);
 
-igb_update_rx_stats(core, size, total_size);
+igb_update_rx_stats(core, rxi, size, total_size);
 }
 
 static inline bool
@@ -1490,7 +1511,7 @@ igb_is_oversized(IGBCore *core, const E1000E_RingInfo 
*rxi, size_t size)
 bool vmdq = core->mac[MRQC] & 1;
 uint16_t qn = rxi->idx;
 uint16_t pool = (qn > IGB_MAX_VF_FUNCTIONS) ?
-   (qn - IGB_MAX_VF_FUNCTIONS) : qn;
+(qn - IGB_MAX_VF_FUNCTIONS) : qn;
 
 bool lpe = (vmdq ? core->mac[VMOLR0 + pool] & E1000_VMOLR_LPE :
 core->mac[RCTL] & E1000_RCTL_LPE);
-- 
2.34.1




[PATCH 6/9] igb: add ICR_RXDW

2023-01-28 Thread Sriram Yagnaraman
IGB uses RXDW ICR bit to indicate that rx descriptor has been written
back. This is the same as RXT0 bit in older HW.

Signed-off-by: Sriram Yagnaraman 
---
 hw/net/e1000x_regs.h | 4 
 hw/net/igb_core.c| 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h
index bb3fb36b8d..3a3431d878 100644
--- a/hw/net/e1000x_regs.h
+++ b/hw/net/e1000x_regs.h
@@ -335,6 +335,7 @@
 #define E1000_ICR_RXDMT0    0x00000010 /* rx desc min. threshold (0) */
 #define E1000_ICR_RXO       0x00000040 /* rx overrun */
 #define E1000_ICR_RXT0      0x00000080 /* rx timer intr (ring 0) */
+#define E1000_ICR_RXDW      0x00000080 /* rx desc written back */
 #define E1000_ICR_MDAC      0x00000200 /* MDIO access complete */
 #define E1000_ICR_RXCFG     0x00000400 /* RX /c/ ordered set */
 #define E1000_ICR_GPI_EN0   0x00000800 /* GP Int 0 */
@@ -378,6 +379,7 @@
 #define E1000_ICS_RXDMT0E1000_ICR_RXDMT0/* rx desc min. threshold */
 #define E1000_ICS_RXO   E1000_ICR_RXO   /* rx overrun */
 #define E1000_ICS_RXT0  E1000_ICR_RXT0  /* rx timer intr */
+#define E1000_ICS_RXDW  E1000_ICR_RXDW  /* rx desc written back */
 #define E1000_ICS_MDAC  E1000_ICR_MDAC  /* MDIO access complete */
 #define E1000_ICS_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */
 #define E1000_ICS_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
@@ -407,6 +409,7 @@
 #define E1000_IMS_RXDMT0E1000_ICR_RXDMT0/* rx desc min. threshold */
 #define E1000_IMS_RXO   E1000_ICR_RXO   /* rx overrun */
 #define E1000_IMS_RXT0  E1000_ICR_RXT0  /* rx timer intr */
+#define E1000_IMS_RXDW  E1000_ICR_RXDW  /* rx desc written back */
 #define E1000_IMS_MDAC  E1000_ICR_MDAC  /* MDIO access complete */
 #define E1000_IMS_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */
 #define E1000_IMS_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
@@ -441,6 +444,7 @@
 #define E1000_IMC_RXDMT0E1000_ICR_RXDMT0/* rx desc min. threshold */
 #define E1000_IMC_RXO   E1000_ICR_RXO   /* rx overrun */
 #define E1000_IMC_RXT0  E1000_ICR_RXT0  /* rx timer intr */
+#define E1000_IMC_RXDW  E1000_ICR_RXDW  /* rx desc written back */
 #define E1000_IMC_MDAC  E1000_ICR_MDAC  /* MDIO access complete */
 #define E1000_IMC_RXCFG E1000_ICR_RXCFG /* RX /c/ ordered set */
 #define E1000_IMC_GPI_EN0   E1000_ICR_GPI_EN0   /* GP Int 0 */
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index e4fd4a1a5f..43ff387b16 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1640,7 +1640,7 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 core->mac[EICR] |= igb_rx_wb_eic(core, rxr.i->idx);
 
 /* same as RXDW (rx descriptor written back)*/
-icr_bits |= E1000_ICR_RXT0;
+icr_bits |= E1000_ICR_RXDW;
 }
 }
 
@@ -1650,7 +1650,7 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 e1000x_inc_reg_if_not_full(core->mac, ROC);
 }
 
-if (icr_bits & E1000_ICR_RXT0) {
+if (icr_bits & E1000_ICR_RXDW) {
 trace_e1000e_rx_written_to_guest(icr_bits);
 } else {
 trace_e1000e_rx_not_written_to_guest(icr_bits);
-- 
2.34.1




[PATCH 5/9] igb: respect E1000_VMOLR_RSSE

2023-01-28 Thread Sriram Yagnaraman
RSS for VFs is only enabled if VMOLR[n].RSSE is set.

Signed-off-by: Sriram Yagnaraman 
---
 hw/net/igb_core.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 1eb7ba168f..e4fd4a1a5f 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -69,7 +69,7 @@ typedef struct IGBTxPktVmdqCallbackContext {
 
 static ssize_t
 igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt,
- bool has_vnet, bool *assigned);
+ bool has_vnet, bool *external_tx);
 
 static inline void
 igb_set_interrupt_cause(IGBCore *core, uint32_t val);
@@ -942,7 +942,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
 
 if (core->mac[MRQC] & 1) {
 if (is_broadcast_ether_addr(ehdr->h_dest)) {
-for (i = 0; i < 8; i++) {
+for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {
 if (core->mac[VMOLR0 + i] & E1000_VMOLR_BAM) {
 queues |= BIT(i);
 }
@@ -976,7 +976,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
 f = ta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3];
 f = (((ehdr->h_dest[5] << 8) | ehdr->h_dest[4]) >> f) & 0xfff;
 if (macp[f >> 5] & (1 << (f & 0x1f))) {
-for (i = 0; i < 8; i++) {
+for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {
 if (core->mac[VMOLR0 + i] & E1000_VMOLR_ROMPE) {
 queues |= BIT(i);
 }
@@ -999,7 +999,7 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
 }
 }
 } else {
-for (i = 0; i < 8; i++) {
+for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {
 if (core->mac[VMOLR0 + i] & E1000_VMOLR_AUPE) {
 mask |= BIT(i);
 }
@@ -1018,7 +1018,15 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
 queues &= core->mac[VFRE];
 igb_rss_parse_packet(core, core->rx_pkt, external_tx != NULL, 
rss_info);
 if (rss_info->queue & 1) {
-queues <<= 8;
+for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {
+if (!(queues & BIT(i))) {
+continue;
+}
+if (core->mac[VMOLR0 + i] & E1000_VMOLR_RSSE) {
+queues |= BIT(i + IGB_MAX_VF_FUNCTIONS);
+queues &= ~BIT(i);
+}
+}
 }
 } else {
 switch (net_rx_pkt_get_packet_type(core->rx_pkt)) {
-- 
2.34.1




[PATCH 9/9] igb: respect VMVIR and VMOLR for VLAN

2023-01-28 Thread Sriram Yagnaraman
Add support for stripping/inserting VLAN for VFs.

Signed-off-by: Sriram Yagnaraman 
---
 hw/net/igb_core.c | 100 ++
 1 file changed, 65 insertions(+), 35 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 8e33e15505..96a5c5eca3 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -384,6 +384,26 @@ igb_rss_parse_packet(IGBCore *core, struct NetRxPkt *pkt, 
bool tx,
 info->queue = E1000_RSS_QUEUE(&core->mac[RETA], info->hash);
 }
 
+static inline bool
+igb_tx_insert_vlan(IGBCore *core, uint16_t qn,
+   struct igb_tx *tx, bool desc_vle)
+{
+if (core->mac[MRQC] & 1) {
+uint16_t pool = (qn > IGB_MAX_VF_FUNCTIONS) ?
+(qn - IGB_MAX_VF_FUNCTIONS) : qn;
+
+if (core->mac[VMVIR0 + pool] & E1000_VMVIR_VLANA_DEFAULT) {
+/* always insert default VLAN */
+desc_vle = true;
+tx->vlan = core->mac[VMVIR0 + pool] & 0xfff;
+} else if (core->mac[VMVIR0 + pool] & E1000_VMVIR_VLANA_NEVER) {
+return false;
+}
+}
+
+return desc_vle && e1000x_vlan_enabled(core->mac);
+}
+
 static bool
 igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx)
 {
@@ -580,7 +600,8 @@ igb_process_tx_desc(IGBCore *core,
 
 if (cmd_type_len & E1000_TXD_CMD_EOP) {
 if (!tx->skip_cp && net_tx_pkt_parse(tx->tx_pkt)) {
-if (cmd_type_len & E1000_TXD_CMD_VLE) {
+if (igb_tx_insert_vlan(core, queue_index, tx,
+(cmd_type_len & E1000_TXD_CMD_VLE))) {
 net_tx_pkt_setup_vlan_header_ex(tx->tx_pkt, tx->vlan,
 core->mac[VET] & 0xffff);
 }
@@ -1514,6 +1535,22 @@ igb_write_packet_to_guest(IGBCore *core, struct NetRxPkt 
*pkt,
 igb_update_rx_stats(core, rxi, size, total_size);
 }
 
+static inline bool
+igb_rx_strip_vlan(IGBCore *core, const E1000E_RingInfo *rxi,
+eth_pkt_types_e pkt_type)
+{
+if (core->mac[MRQC] & 1) {
+uint16_t qn = rxi->idx;
+uint16_t pool = (qn > IGB_MAX_VF_FUNCTIONS) ?
+(qn - IGB_MAX_VF_FUNCTIONS) : qn;
+return (pkt_type == ETH_PKT_MCAST) ?
+core->mac[RPLOLR] & E1000_RPLOLR_STRVLAN :
+core->mac[VMOLR0 + pool] & E1000_VMOLR_STRVLAN;
+}
+
+return e1000x_vlan_enabled(core->mac);
+}
+
 static inline bool
 igb_is_oversized(IGBCore *core, const E1000E_RingInfo *rxi, size_t size)
 {
@@ -1574,6 +1611,7 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 size_t total_size;
 ssize_t retval = 0;
 int i;
+bool strip_vlan = false;
 
 trace_e1000e_rx_receive_iov(iovcnt);
 
@@ -1615,10 +1653,7 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 
 ehdr = PKT_GET_ETH_HDR(filter_buf);
 net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr));
-
-net_rx_pkt_attach_iovec_ex(core->rx_pkt, iov, iovcnt, iov_ofs,
-   e1000x_vlan_enabled(core->mac),
-   core->mac[VET] & 0xffff);
+net_rx_pkt_set_protocols(core->rx_pkt, filter_buf, size);
 
 queues = igb_receive_assign(core, ehdr, &rss_info, external_tx);
 if (!queues) {
@@ -1626,8 +1661,8 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 return orig_size;
 }
 
-total_size = net_rx_pkt_get_total_len(core->rx_pkt) +
-e1000x_fcs_len(core->mac);
+retval = orig_size;
+total_size = size + e1000x_fcs_len(core->mac);
 
 for (i = 0; i < IGB_NUM_QUEUES; i++) {
 if (!(queues & BIT(i))) {
@@ -1635,43 +1670,38 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 }
 
 igb_rx_ring_init(core, &rxr, i);
+strip_vlan = igb_rx_strip_vlan(core, rxr.i,
+get_eth_packet_type(ehdr));
+net_rx_pkt_attach_iovec_ex(core->rx_pkt, iov, iovcnt, iov_ofs,
+strip_vlan, core->mac[VET] & 0xffff);
+igb_rx_fix_l4_csum(core, core->rx_pkt);
+
 if (!igb_has_rxbufs(core, rxr.i, total_size)) {
 icr_bits |= E1000_ICS_RXO;
+continue;
 }
-}
-
-if (!icr_bits) {
-retval = orig_size;
-igb_rx_fix_l4_csum(core, core->rx_pkt);
-
-for (i = 0; i < IGB_NUM_QUEUES; i++) {
-if (!(queues & BIT(i))) {
-continue;
-}
 
-igb_rx_ring_init(core, &rxr, i);
-if (igb_is_oversized(core, rxr.i, size)) {
-oversized |= BIT(i);
-continue;
-}
+if (igb_is_oversized(core, rxr.i, total_size)) {
+oversized |= BIT(i);
+continue;
+}
 
-if (!(core->mac[RXDCTL0 + (i * 16)] & E1000_RXDCTL_QUEUE_ENABLE)) {
-continue;
-}
+if (!(core->mac[RXDCTL0 + (i * 16)] & E1000_RXDCTL_QUEUE_ENABLE)) {
+continue;
+}
 
-

[PATCH 2/9] igb: handle PF/VF reset properly

2023-01-28 Thread Sriram Yagnaraman
Use PFRSTD to reset RSTI bit for VFs, and raise VFLRE interrupt when VF
is reset.

Signed-off-by: Sriram Yagnaraman 
---
 hw/net/e1000x_regs.h |  1 +
 hw/net/igb_core.c| 33 +
 hw/net/trace-events  |  2 ++
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/hw/net/e1000x_regs.h b/hw/net/e1000x_regs.h
index fb5b861135..bb3fb36b8d 100644
--- a/hw/net/e1000x_regs.h
+++ b/hw/net/e1000x_regs.h
@@ -548,6 +548,7 @@
 
 #define E1000_CTRL_EXT_ASDCHK  0x00001000 /* auto speed detection check */
 #define E1000_CTRL_EXT_EE_RST  0x00002000 /* EEPROM reset */
+#define E1000_CTRL_EXT_PFRSTD  0x00004000 /* PF reset done indication */
 #define E1000_CTRL_EXT_LINK_EN 0x00010000 /* enable link status from external LINK_0 and LINK_1 pins */
 #define E1000_CTRL_EXT_DRV_LOAD 0x10000000 /* Driver loaded bit for FW */
 #define E1000_CTRL_EXT_EIAME   0x01000000
diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index abeb9c7889..9bd53cc25f 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1902,14 +1902,6 @@ static void igb_set_eims(IGBCore *core, int index, 
uint32_t val)
 igb_update_interrupt_state(core);
 }
 
-static void igb_vf_reset(IGBCore *core, uint16_t vfn)
-{
-/* TODO: Reset of the queue enable and the interrupt registers of the VF. 
*/
-
-core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_RSTI;
-core->mac[V2PMAILBOX0 + vfn] = E1000_V2PMAILBOX_RSTD;
-}
-
 static void mailbox_interrupt_to_vf(IGBCore *core, uint16_t vfn)
 {
 uint32_t ent = core->mac[VTIVAR_MISC + vfn];
@@ -1987,6 +1979,17 @@ static void igb_set_vfmailbox(IGBCore *core, int index, 
uint32_t val)
 }
 }
 
+static void igb_vf_reset(IGBCore *core, uint16_t vfn)
+{
+/* disable Rx and Tx for the VF*/
+core->mac[VFTE] &= ~BIT(vfn);
+core->mac[VFRE] &= ~BIT(vfn);
+/* indicate VF reset to PF */
+core->mac[VFLRE] |= BIT(vfn);
+/* VFLRE and mailbox use the same interrupt cause */
+mailbox_interrupt_to_pf(core);
+}
+
 static void igb_w1c(IGBCore *core, int index, uint32_t val)
 {
 core->mac[index] &= ~val;
@@ -2241,14 +2244,20 @@ igb_set_status(IGBCore *core, int index, uint32_t val)
 static void
 igb_set_ctrlext(IGBCore *core, int index, uint32_t val)
 {
-trace_e1000e_link_set_ext_params(!!(val & E1000_CTRL_EXT_ASDCHK),
- !!(val & E1000_CTRL_EXT_SPD_BYPS));
-
-/* TODO: PFRSTD */
+trace_igb_link_set_ext_params(!!(val & E1000_CTRL_EXT_ASDCHK),
+  !!(val & E1000_CTRL_EXT_SPD_BYPS),
+  !!(val & E1000_CTRL_EXT_PFRSTD));
 
 /* Zero self-clearing bits */
 val &= ~(E1000_CTRL_EXT_ASDCHK | E1000_CTRL_EXT_EE_RST);
 core->mac[CTRL_EXT] = val;
+
+if (core->mac[CTRL_EXT] & E1000_CTRL_EXT_PFRSTD) {
+for (int vfn = 0; vfn < IGB_MAX_VF_FUNCTIONS; vfn++) {
+core->mac[V2PMAILBOX0 + vfn] &= ~E1000_V2PMAILBOX_RSTI;
+core->mac[V2PMAILBOX0 + vfn] |= E1000_V2PMAILBOX_RSTD;
+}
+}
 }
 
 static void
diff --git a/hw/net/trace-events b/hw/net/trace-events
index 2f791b9b57..e94172e748 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -281,6 +281,8 @@ igb_core_mdic_read_unhandled(uint32_t addr) "MDIC READ: 
PHY[%u] UNHANDLED"
 igb_core_mdic_write(uint32_t addr, uint32_t data) "MDIC WRITE: PHY[%u] = 0x%x"
 igb_core_mdic_write_unhandled(uint32_t addr) "MDIC WRITE: PHY[%u] UNHANDLED"
 
+igb_link_set_ext_params(bool asd_check, bool speed_select_bypass, bool pfrstd) 
"Set extended link params: ASD check: %d, Speed select bypass: %d, PF reset 
done: %d"
+
 igb_rx_desc_buff_size(uint32_t b) "buffer size: %u"
 igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, const void* source, 
uint32_t len) "addr: 0x%"PRIx64", offset: %u, from: %p, length: %u"
 
-- 
2.34.1




[PATCH 8/9] igb: respect VT_CTL ignore MAC field

2023-01-28 Thread Sriram Yagnaraman
Also trace out a warning if replication mode is disabled, since we only
support replication mode enabled.

Signed-off-by: Sriram Yagnaraman 
---
 hw/net/igb_core.c   | 9 +
 hw/net/trace-events | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 375d9d5e34..8e33e15505 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -949,6 +949,10 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
 }
 
 if (core->mac[MRQC] & 1) {
+if (!(core->mac[VT_CTL] & E1000_VT_CTL_VM_REPL_EN)) {
+trace_igb_rx_vmdq_replication_mode_disabled();
+}
+
 if (is_broadcast_ether_addr(ehdr->h_dest)) {
 for (i = 0; i < IGB_MAX_VF_FUNCTIONS; i++) {
 if (core->mac[VMOLR0 + i] & E1000_VMOLR_BAM) {
@@ -995,6 +999,11 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
struct eth_header *ehdr,
 }
 }
 
+/* assume a full pool list if IGMAC is set */
+if (core->mac[VT_CTL] & E1000_VT_CTL_IGNORE_MAC) {
+queues = BIT(IGB_MAX_VF_FUNCTIONS) - 1;
+}
+
 if (e1000x_vlan_rx_filter_enabled(core->mac)) {
 uint16_t mask = 0;
 
diff --git a/hw/net/trace-events b/hw/net/trace-events
index e94172e748..9bc7658692 100644
--- a/hw/net/trace-events
+++ b/hw/net/trace-events
@@ -288,6 +288,8 @@ igb_rx_desc_buff_write(uint64_t addr, uint16_t offset, 
const void* source, uint3
 
 igb_rx_metadata_rss(uint32_t rss) "RSS data: 0x%X"
 
+igb_rx_vmdq_replication_mode_disabled(void) "WARN: Only replication mode 
enabled is supported"
+
 igb_irq_icr_clear_gpie_nsicr(void) "Clearing ICR on read due to GPIE.NSICR 
enabled"
 igb_irq_icr_write(uint32_t bits, uint32_t old_icr, uint32_t new_icr) "Clearing 
ICR bits 0x%x: 0x%x --> 0x%x"
 igb_irq_set_iam(uint32_t icr) "Update IAM: 0x%x"
-- 
2.34.1




Re: [PATCH] pci: add enforce_slot_reserved_mask_manual property

2023-01-28 Thread Michael S. Tsirkin
On Sat, Jan 28, 2023 at 08:20:55AM -0500, Chuck Zmudzinski wrote:
> On 1/28/23 5:26 AM, Michael S. Tsirkin wrote:
> > On Fri, Jan 27, 2023 at 10:39:28PM -0500, Chuck Zmudzinski wrote:
> >> On 1/27/2023 8:28 AM, Michael S. Tsirkin wrote:
> >> > On Sun, Jan 15, 2023 at 07:49:51PM -0500, Chuck Zmudzinski wrote:
> >> > > The current reserved slot check in do_pci_register_device(), added with
> >> > > commit 8b8849844fd6
> >> >
> >> > add ("subject here") please
> >> >
> >> > > ,is done even if the pci device being added is
> >> > > configured manually for a particular slot. The new property, when set
> >> > > to false, disables the check when the device is configured to request a
> >> > > particular slot. This allows an administrator or management tool to
> >> > > override slot_reserved_mask for a pci device by requesting a particular
> >> > > slot for the device. The new property is initialized to true which
> >> > > preserves the existing behavior of slot_reserved_mask by default.
> >> > > 
> >> > > Signed-off-by: Chuck Zmudzinski 
> >> >
> >> > Thanks!
> >> > I'm trying to think of the best default for this.
> >> 
> >> I think it would be better for the default value of
> >> enforce_slot_reserved_mask_manual to be false, so that a
> >> user-specified slot will by default override slot_reserved_mask.
> >> But doing that would change the current behavior of
> >> slot_reserved_mask.
> >> 
> >> Currently, this is the only place where slot_reserved_mask is used in all
> >> of the Qemu source (code from hw/sparc64/sun4u.c):
> >> 
> >> -- snip ---
> >>     /* Only in-built Simba APBs can exist on the root bus, slot 0 on busA 
> >> is
> >>    reserved (leaving no slots free after on-board devices) however 
> >> slots
> >>    0-3 are free on busB */
>>     pci_bus->slot_reserved_mask = 0xfffffffc;
>>     pci_busA->slot_reserved_mask = 0xfffffff1;
>>     pci_busB->slot_reserved_mask = 0xfffffff0;
> >> -- snip ---
> >> 
> >> I think we could safely change the default value of
> >> enforce_slot_reserved_mask_manual to false but set
> >> it to true for the sparc64 sun4u board here to preserve
> >> the current behavior of the only existing board in Qemu
> >> that uses slot_reserved_mask.
> >> 
> >> What do you think?
> > 
> > I guess first can you answer whether this is still needed
> > with the latest Xen patches?
> > 
> 
> It's not really needed except for experimental purposes to allow
> an administrator to test experimental configurations with a device
> other than the igd at slot 2. That might be useful in some cases,
> but it is not really necessary unless someone asks for that capability.
> If libvirt users who ordinarily like to manually specify all the
> settings will be OK with the proposed patch to xen that prevents
> an administrator from being able to override a new setting that
> reserves slot 2 for the igd for type "xenfv" machines configured for
> igd passthrough, then there is no need for this patch. I don't think
> many users need the capability to insert a different device in slot 2 for
> the "xenfv" machine type configured with igd-passthru=on, so I would be
> OK if this patch is not included in qemu.
> 
> Chuck

Pls wait and see if that patch gets picked up. Let me know.




Re: [PATCH 06/23] target/arm: Make HSTR_EL2 traps take priority over UNDEF-at-EL1

2023-01-28 Thread Peter Maydell
On Sat, 28 Jan 2023 at 01:47, Richard Henderson
 wrote:
>
> On 1/27/23 07:54, Peter Maydell wrote:
> > +void HELPER(hstr_trap_check)(CPUARMState *env, uint32_t mask, uint32_t 
> > syndrome)
> > +{
> > +if (env->cp15.hstr_el2 & mask) {
> > +raise_exception(env, EXCP_UDEF, syndrome, 2);
> > +}
>
> This is so simple...
>
>
> > @@ -4760,6 +4761,28 @@ static void do_coproc_insn(DisasContext *s, int 
> > cpnum, int is64,
> >   break;
> >   }
> >
> > +if (s->hstr_active && cpnum == 15 && s->current_el == 1) {
> > +/*
> > + * At EL1, check for a HSTR_EL2 trap, which must take precedence
> > + * over the UNDEF for "no such register" or the UNDEF for "access
> > + * permissions forbid this EL1 access". HSTR_EL2 traps from EL0
> > + * only happen if the cpreg doesn't UNDEF at EL0, so we do those in
> > + * access_check_cp_reg(), after the checks for whether the access
> > + * configurably trapped to EL1.
> > + */
> > +uint32_t maskbit = is64 ? crm : crn;
> > +
> > +if (maskbit != 4 && maskbit != 14) {
> > +/* T4 and T14 are RES0 so never cause traps */
> > +gen_set_condexec(s);
> > +gen_update_pc(s, 0);
> > +emitted_update_pc = true;
> > +gen_helper_hstr_trap_check(cpu_env,
> > +   tcg_constant_i32(1 << maskbit),
> > +   tcg_constant_i32(syndrome));
> > +}
>
> How about
>
>  if (maskbit...) {
>  TCGv_i32 t = load_cpu_offset(offsetoflow32(CPUARMState, hstr_el2));
>  DisasLabel *over = gen_disas_label(s);
>
>  tcg_gen_andi_i32(t, t, 1u << maskbit);
>  tcg_gen_brcondi_i32(TCG_COND_EQ, t, 0, over.label);
>  tcg_temp_free_i32(t);
>
>  gen_exception_insn(s, 0, EXCP_UDEF, syndrome);
>  set_disas_label(s, over);
>  }
>
> which also eliminates the need for emitted_update_pc.

I really dislike use of brcond in generated TCG, because of the
massive beartrap it sets up where all your temporaries get nuked
but there's no compile-time checking that you didn't try to keep
using one after the brcond. So I generally prefer an approach that
avoids brcond over one that uses it, if it's available.

thanks
-- PMM



Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE

2023-01-28 Thread Chao Peng
On Sat, Jan 14, 2023 at 12:01:01AM +, Sean Christopherson wrote:
> On Fri, Dec 02, 2022, Chao Peng wrote:
... 
> Strongly prefer to use similar logic to existing code that detects wraps:
> 
>   mem->restricted_offset + mem->memory_size < 
> mem->restricted_offset
> 
> This is also where I'd like to add the "gfn is aligned to offset" check, 
> though
> my brain is too fried to figure that out right now.

Used count_trailing_zeros() for this TODO, unsure we have other better
approach.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index afc8c26fa652..fd34c5f7cd2f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
@@ -2087,6 +2088,19 @@ static bool kvm_check_memslot_overlap(struct 
kvm_memslots *slots, int id,
return false;
 }
 
+/*
+ * Return true when ALIGNMENT(offset) >= ALIGNMENT(gpa).
+ */
+static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
+{
+   if (!offset)
+   return true;
+   if (!gpa)
+   return false;
+
+   return !!(count_trailing_zeros(offset) >= count_trailing_zeros(gpa));
+}
+
 /*
  * Allocate some memory and give it an address in the guest physical address
  * space.
@@ -2128,7 +2142,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (mem->flags & KVM_MEM_PRIVATE &&
(mem->restrictedmem_offset & (PAGE_SIZE - 1) ||
 mem->restrictedmem_offset + mem->memory_size < 
mem->restrictedmem_offset ||
-0 /* TODO: require gfn be aligned with restricted offset */))
+!kvm_check_rmem_offset_alignment(mem->restrictedmem_offset,
+ mem->guest_phys_addr)))
return -EINVAL;
if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
return -EINVAL;




Re: [PATCH v10 7/9] KVM: Update lpage info when private/shared memory are mixed

2023-01-28 Thread Chao Peng
On Fri, Jan 13, 2023 at 11:16:27PM +, Sean Christopherson wrote:
> On Fri, Dec 02, 2022, Chao Peng wrote:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 9a07380f8d3c..5aefcff614d2 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -12362,6 +12362,8 @@ static int kvm_alloc_memslot_metadata(struct kvm 
> > *kvm,
> > if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 
> > 1))
> > linfo[lpages - 1].disallow_lpage = 1;
> > ugfn = slot->userspace_addr >> PAGE_SHIFT;
> > +   if (kvm_slot_can_be_private(slot))
> > +   ugfn |= slot->restricted_offset >> PAGE_SHIFT;
> > /*
> >  * If the gfn and userspace address are not aligned wrt each
> >  * other, disable large page support for this slot.
> 
> Forgot to talk about the bug.  This code needs to handle the scenario where a
> memslot is created with existing, non-uniform attributes.  It might be a bit 
> ugly
> (I didn't even try to write the code), but it's definitely possible, and since
> memslot updates are already slow I think it's best to handle things here.
> 
> In the meantime, I added this so we don't forget to fix it before merging.
> 
> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
>   pr_crit_once("FIXME: Walk the memory attributes of the slot and set the 
> mixed status appropriately");
> #endif

Here is the code to fix (based on your latest github repo).

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e552374f2357..609ff1cba9c5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2195,4 +2195,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, 
unsigned long npages);
 KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \
 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
 
+#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
+void kvm_memory_attributes_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot);
+#endif
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index eda615f3951c..8833d7201e41 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7201,10 +7201,11 @@ static bool has_mixed_attrs(struct kvm *kvm, struct 
kvm_memory_slot *slot,
return false;
 }
 
-void kvm_arch_set_memory_attributes(struct kvm *kvm,
-   struct kvm_memory_slot *slot,
-   unsigned long attrs,
-   gfn_t start, gfn_t end)
+static void kvm_update_lpage_mixed_flag(struct kvm *kvm,
+   struct kvm_memory_slot *slot,
+   bool set_attrs,
+   unsigned long attrs,
+   gfn_t start, gfn_t end)
 {
unsigned long pages, mask;
gfn_t gfn, gfn_end, first, last;
@@ -7231,25 +7232,53 @@ void kvm_arch_set_memory_attributes(struct kvm *kvm,
first = start & mask;
last = (end - 1) & mask;
 
-   /*
-* We only need to scan the head and tail page, for middle pages
-* we know they will not be mixed.
-*/
+   /* head page */
gfn = max(first, slot->base_gfn);
gfn_end = min(first + pages, slot->base_gfn + slot->npages);
+   if(!set_attrs)
+   attrs = kvm_get_memory_attributes(kvm, gfn);
mixed = has_mixed_attrs(kvm, slot, level, attrs, gfn, gfn_end);
linfo_update_mixed(gfn, slot, level, mixed);
 
if (first == last)
return;
 
-   for (gfn = first + pages; gfn < last; gfn += pages)
-   linfo_update_mixed(gfn, slot, level, false);
+   /* middle pages */
+   for (gfn = first + pages; gfn < last; gfn += pages) {
+   if (set_attrs) {
+   mixed = false;
+   } else {
+   gfn_end = gfn + pages;
+   attrs = kvm_get_memory_attributes(kvm, gfn);
+   mixed = has_mixed_attrs(kvm, slot, level, attrs,
+   gfn, gfn_end);
+   }
+   linfo_update_mixed(gfn, slot, level, mixed);
+   }
 
+   /* tail page */
gfn = last;
gfn_end = min(last + pages, slot->base_gfn + slot->npages);
+   if(!set_attrs)
+   attrs = kvm_get_memory_attributes(kvm, gfn);
mixed = has_mixed_attrs(kvm, slot, level, attrs, gfn, gfn_end);
linfo_update_mixed(gfn, slot, level, mixed);
}
 }
+
+void kvm_arch_set_memory_att

Re: [PATCH] pci: add enforce_slot_reserved_mask_manual property

2023-01-28 Thread Chuck Zmudzinski
On 1/28/23 5:26 AM, Michael S. Tsirkin wrote:
> On Fri, Jan 27, 2023 at 10:39:28PM -0500, Chuck Zmudzinski wrote:
>> On 1/27/2023 8:28 AM, Michael S. Tsirkin wrote:
>> > On Sun, Jan 15, 2023 at 07:49:51PM -0500, Chuck Zmudzinski wrote:
>> > > The current reserved slot check in do_pci_register_device(), added with
>> > > commit 8b8849844fd6
>> >
>> > add ("subject here") please
>> >
>> > > ,is done even if the pci device being added is
>> > > configured manually for a particular slot. The new property, when set
>> > > to false, disables the check when the device is configured to request a
>> > > particular slot. This allows an administrator or management tool to
>> > > override slot_reserved_mask for a pci device by requesting a particular
>> > > slot for the device. The new property is initialized to true which
>> > > preserves the existing behavior of slot_reserved_mask by default.
>> > > 
>> > > Signed-off-by: Chuck Zmudzinski 
>> >
>> > Thanks!
>> > I'm trying to think of the best default for this.
>> 
>> I think it would be better for the default value of
>> enforce_slot_reserved_mask_manual to be false, so that a
>> user-specified slot will by default override slot_reserved_mask.
>> But doing that would change the current behavior of
>> slot_reserved_mask.
>> 
>> Currently, this is the only place where slot_reserved_mask is used in all
>> of the Qemu source (code from hw/sparc64/sun4u.c):
>> 
>> -- snip ---
>>     /* Only in-built Simba APBs can exist on the root bus, slot 0 on busA is
>>    reserved (leaving no slots free after on-board devices) however slots
>>    0-3 are free on busB */
>>     pci_bus->slot_reserved_mask = 0xfffffffc;
>>     pci_busA->slot_reserved_mask = 0xfffffff1;
>>     pci_busB->slot_reserved_mask = 0xfffffff0;
>> -- snip ---
>> 
>> I think we could safely change the default value of
>> enforce_slot_reserved_mask_manual to false but set
>> it to true for the sparc64 sun4u board here to preserve
>> the current behavior of the only existing board in Qemu
>> that uses slot_reserved_mask.
>> 
>> What do you think?
> 
> I guess first can you answer whether this is still needed
> with the latest Xen patches?
> 

It's not really needed except for experimental purposes to allow
an administrator to test experimental configurations with a device
other than the igd at slot 2. That might be useful in some cases,
but it is not really necessary unless someone asks for that capability.
If libvirt users who ordinarily like to manually specify all the
settings will be OK with the proposed patch to xen that prevents
an administrator from being able to override a new setting that
reserves slot 2 for the igd for type "xenfv" machines configured for
igd passthrough, then there is no need for this patch. I don't think
many users need the capability to insert a different device in slot 2 for
the "xenfv" machine type configured with igd-passthru=on, so I would be
OK if this patch is not included in qemu.

Chuck



Re: [PATCH v5] Emulate dip switch language layout settings on SUN keyboard

2023-01-28 Thread Henrik Carlqvist
On Sun, 22 Jan 2023 18:07:47 +
Mark Cave-Ayland  wrote:
> Did you see my comments re: OpenBIOS for the earlier version of this patch?

Sorry again for missing that comment, I sent a reply (
https://lists.nongnu.org/archive/html/qemu-devel/2023-01/msg05134.html )

I have now unsubscribed from the mailing list and hope that I will get copies
of any replies which then will be easier for me to note with a more reasonable
flow of emails through my inbox.

Having read about the CI gitlab issue I understand that my patch might not be
considered for integration until februari, but I am in no hurry, my first
attempt to submit this patch was in 2020 (
https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03826.html ), that
attempt resulted in a broken patch as my mail client wrapped long lines.

Best regards Henrik



Re: Re: [PATCH v3 05/12] cryptodev: Introduce 'query-cryptodev' QMP command

2023-01-28 Thread zhenwei pi

On 1/28/23 19:30, Michael S. Tsirkin wrote:

On Sat, Jan 28, 2023 at 11:56:26AM +0800, zhenwei pi wrote:

Now we have a QMP command to query crypto devices:
virsh qemu-monitor-command vm '{"execute": "query-cryptodev"}' | jq
{
   "return": [
 {
   "service": [
 "akcipher",
 "mac",
 "hash",
 "cipher"
   ],
   "id": "cryptodev1",
   "client": [
 {
   "queue": 0,
   "type": "builtin"
 }
   ]
 },
 {
   "service": [
 "akcipher"
   ],
   "id": "cryptodev0",
   "client": [
 {
   "queue": 0,
   "type": "lkcf"
 }
   ]
 }
   ],
   "id": "libvirt-417"
}

Signed-off-by: zhenwei pi 
---
  backends/cryptodev.c | 45 
  qapi/cryptodev.json  | 44 +++
  2 files changed, 89 insertions(+)

diff --git a/backends/cryptodev.c b/backends/cryptodev.c
index c2a053db0e..d51eeb5ee4 100644
--- a/backends/cryptodev.c
+++ b/backends/cryptodev.c
@@ -24,6 +24,7 @@
  #include "qemu/osdep.h"
  #include "sysemu/cryptodev.h"
  #include "qapi/error.h"
+#include "qapi/qapi-commands-cryptodev.h"
  #include "qapi/visitor.h"
  #include "qemu/config-file.h"
  #include "qemu/error-report.h"
@@ -33,6 +34,50 @@
  
  static QTAILQ_HEAD(, CryptoDevBackendClient) crypto_clients;
  
+static int qmp_query_cryptodev_foreach(Object *obj, void *data)

+{
+CryptoDevBackend *backend;
+CryptodevInfoList **infolist = data;
+uint32_t services, i;
+
+if (!object_dynamic_cast(obj, TYPE_CRYPTODEV_BACKEND)) {
+return 0;
+}
+
+CryptodevInfo *info = g_new0(CryptodevInfo, 1);
+info->id = g_strdup(object_get_canonical_path_component(obj));
+
+backend = CRYPTODEV_BACKEND(obj);
+services = backend->conf.crypto_services;
+for (i = 0; i < QCRYPTODEV_BACKEND_SERVICE__MAX; i++) {
+if (services & (1 << i)) {
+QAPI_LIST_PREPEND(info->service, i);
+}
+}
+
+for (i = 0; i < backend->conf.peers.queues; i++) {
+CryptoDevBackendClient *cc = backend->conf.peers.ccs[i];
+CryptodevBackendClient *client = g_new0(CryptodevBackendClient, 1);
+
+client->queue = cc->queue_index;
+client->type = cc->type;
+QAPI_LIST_PREPEND(info->client, client);
+}
+
+QAPI_LIST_PREPEND(*infolist, info);
+
+return 0;
+}
+
+CryptodevInfoList *qmp_query_cryptodev(Error **errp)
+{
+CryptodevInfoList *list = NULL;
+Object *objs = container_get(object_get_root(), "/objects");
+
+object_child_foreach(objs, qmp_query_cryptodev_foreach, &list);
+
+return list;
+}
  
  CryptoDevBackendClient *cryptodev_backend_new_client(void)

  {
diff --git a/qapi/cryptodev.json b/qapi/cryptodev.json
index 8732a30524..940078ace0 100644
--- a/qapi/cryptodev.json
+++ b/qapi/cryptodev.json
@@ -43,3 +43,47 @@
  { 'enum': 'QCryptodevBackendType',
'prefix': 'QCRYPTODEV_BACKEND_TYPE',
'data': ['builtin', 'vhost-user', 'lkcf']}
+
+##
+# @CryptodevBackendClient:
+#
+# Information about a queue of crypto device.
+#
+# @queue: the queue index of the crypto device
+#
+# @type: the type of the crypto device
+#
+# Since: 8.0
+##
+{ 'struct': 'CryptodevBackendClient',
+  'data': { 'queue': 'uint32',
+'type': 'QCryptodevBackendType' } }
+
+##
+# @CryptodevInfo:
+#
+# Information about a crypto device.
+#
+# @id: the id of the crypto device
+#
+# @service: supported service types of a crypto device
+#
+# @client: the additional infomation of the crypto device
+#
+# Since: 8.0
+##
+{ 'struct': 'CryptodevInfo',
+  'data': { 'id': 'str',
+'service': ['QCryptodevBackendServiceType'],
+'client': ['CryptodevBackendClient'] } }


So we end up with both CryptodevBackendClient and
CryptoDevBackendClient. Please don't do this.




Sorry, my fault, they should be *QCryptodevBackendClient* and 
*QCryptodevInfo*.



+
+##
+# @query-cryptodev:
+#
+# Returns information about current crypto devices.
+#
+# Returns: a list of @CryptodevInfo
+#
+# Since: 8.0
+##
+{ 'command': 'query-cryptodev', 'returns': ['CryptodevInfo']}
--
2.34.1




--
zhenwei pi



Re: Re: [PATCH v3 00/12] Refactor cryptodev

2023-01-28 Thread zhenwei pi

On 1/28/23 19:30, Michael S. Tsirkin wrote:

On Sat, Jan 28, 2023 at 11:56:21AM +0800, zhenwei pi wrote:

v3 -> v4:
- a small change in 
'0005-cryptodev-Introduce-query-cryptodev-QMP-command.patch':
   use 'uint32' instead of 'int' to describe CryptodevBackendClient:queue


what was the motivation for this change? we generally just use int
unless width is important.


Because we use uint32_t to describe the queues of a cryptodev:
struct CryptoDevBackendPeers {
CryptoDevBackendClient *ccs[MAX_CRYPTO_QUEUE_NUM];
uint32_t queues;
};

By the way, I also notice that 'int' and 'uint32_t' are used inconsistently in 
several places; I think I need a follow-up patch to fix this (use 'uint32_t' 
only for the queue index).



- fix compling warning(gcc)/error(clang-11) on 32 bit platform in
   '0007-hmp-add-cryptodev-info-command.patch':
   use 'printf("%u", client->queue)' instead of 'printf("%ld", client->queue)'



Are you aware of PRIu64? And if you are going to print uint32 use PRIu32


OK, I'll fix this in the next version.


v2 -> v3:
- rebase code against the lastest commist: fb7e7990342e59cf67d
- document the missing fields in qapi/cryptodev.json
- rework statistics part: use 'query-stats' command instead of
   'query-cryptodev'(cryptodev: Support query-stats QMP command)

v1 -> v2:
- fix coding style and use 'g_strjoin()' instead of 'char services[128]'
(suggested by Dr. David Alan Gilbert)
- wrapper function 'cryptodev_backend_account' to record statistics, and
allocate sym_stat/asym_stat in cryptodev base class. see patch:
'cryptodev: Support statistics'.
- add more arguments into struct CryptoDevBackendOpInfo, then
cryptodev_backend_crypto_operation() uses *op_info only.
- support cryptodev QoS settings(BPS&OPS), both QEMU command line and QMP
command works fine.
- add myself as the maintainer for cryptodev.

v1:
- introduce cryptodev.json to describe the attributes of crypto device, then
drop duplicated type declare, remove some virtio related dependence.
- add statistics: OPS and bandwidth.
- add QMP command: query-cryptodev
- add HMP info command: cryptodev
- misc fix: detect akcipher capability instead of exposing akcipher service
unconditionally.

Zhenwei Pi (12):
   cryptodev: Introduce cryptodev.json
   cryptodev: Remove 'name' & 'model' fields
   cryptodev: Introduce cryptodev alg type in QAPI
   cryptodev: Introduce server type in QAPI
   cryptodev: Introduce 'query-cryptodev' QMP command
   cryptodev-builtin: Detect akcipher capability
   hmp: add cryptodev info command
   cryptodev: Use CryptoDevBackendOpInfo for operation
   cryptodev: Account statistics
   cryptodev: support QoS
   cryptodev: Support query-stats QMP command
   MAINTAINERS: add myself as the maintainer for cryptodev

  MAINTAINERS |   2 +
  backends/cryptodev-builtin.c|  42 ++--
  backends/cryptodev-lkcf.c   |  19 +-
  backends/cryptodev-vhost-user.c |  13 +-
  backends/cryptodev-vhost.c  |   4 +-
  backends/cryptodev.c| 419 ++--
  hmp-commands-info.hx|  14 ++
  hw/virtio/virtio-crypto.c   |  48 +++-
  include/monitor/hmp.h   |   1 +
  include/sysemu/cryptodev.h  |  95 
  monitor/hmp-cmds.c  |  41 
  monitor/qmp-cmds.c  |   2 +
  qapi/cryptodev.json | 143 +++
  qapi/meson.build|   1 +
  qapi/qapi-schema.json   |   1 +
  qapi/qom.json   |   8 +-
  qapi/stats.json |  10 +-
  17 files changed, 743 insertions(+), 120 deletions(-)
  create mode 100644 qapi/cryptodev.json

--
2.34.1




--
zhenwei pi



Re: [PATCH v3 00/12] Refactor cryptodev

2023-01-28 Thread Michael S. Tsirkin
On Sat, Jan 28, 2023 at 11:56:21AM +0800, zhenwei pi wrote:
> v3 -> v4:
> - a small change in 
> '0005-cryptodev-Introduce-query-cryptodev-QMP-command.patch':
>   use 'uint32' instead of 'int' to describe CryptodevBackendClient:queue

what was the motivation for this change? we generally just use int
unless width is important.

> - fix compling warning(gcc)/error(clang-11) on 32 bit platform in
>   '0007-hmp-add-cryptodev-info-command.patch':
>   use 'printf("%u", client->queue)' instead of 'printf("%ld", client->queue)'
> 

Are you aware of PRIu64? And if you are going to print uint32 use PRIu32

> v2 -> v3:
> - rebase code against the lastest commist: fb7e7990342e59cf67d
> - document the missing fields in qapi/cryptodev.json
> - rework statistics part: use 'query-stats' command instead of
>   'query-cryptodev'(cryptodev: Support query-stats QMP command)
> 
> v1 -> v2:
> - fix coding style and use 'g_strjoin()' instead of 'char services[128]'
>(suggested by Dr. David Alan Gilbert)
> - wrapper function 'cryptodev_backend_account' to record statistics, and
>allocate sym_stat/asym_stat in cryptodev base class. see patch:
>'cryptodev: Support statistics'.
> - add more arguments into struct CryptoDevBackendOpInfo, then
>cryptodev_backend_crypto_operation() uses *op_info only.
> - support cryptodev QoS settings(BPS&OPS), both QEMU command line and QMP
>command works fine.
> - add myself as the maintainer for cryptodev.
> 
> v1:
> - introduce cryptodev.json to describe the attributes of crypto device, then
>drop duplicated type declare, remove some virtio related dependence.
> - add statistics: OPS and bandwidth.
> - add QMP command: query-cryptodev
> - add HMP info command: cryptodev
> - misc fix: detect akcipher capability instead of exposing akcipher service
>unconditionally.
> 
> Zhenwei Pi (12):
>   cryptodev: Introduce cryptodev.json
>   cryptodev: Remove 'name' & 'model' fields
>   cryptodev: Introduce cryptodev alg type in QAPI
>   cryptodev: Introduce server type in QAPI
>   cryptodev: Introduce 'query-cryptodev' QMP command
>   cryptodev-builtin: Detect akcipher capability
>   hmp: add cryptodev info command
>   cryptodev: Use CryptoDevBackendOpInfo for operation
>   cryptodev: Account statistics
>   cryptodev: support QoS
>   cryptodev: Support query-stats QMP command
>   MAINTAINERS: add myself as the maintainer for cryptodev
> 
>  MAINTAINERS |   2 +
>  backends/cryptodev-builtin.c|  42 ++--
>  backends/cryptodev-lkcf.c   |  19 +-
>  backends/cryptodev-vhost-user.c |  13 +-
>  backends/cryptodev-vhost.c  |   4 +-
>  backends/cryptodev.c| 419 ++--
>  hmp-commands-info.hx|  14 ++
>  hw/virtio/virtio-crypto.c   |  48 +++-
>  include/monitor/hmp.h   |   1 +
>  include/sysemu/cryptodev.h  |  95 
>  monitor/hmp-cmds.c  |  41 
>  monitor/qmp-cmds.c  |   2 +
>  qapi/cryptodev.json | 143 +++
>  qapi/meson.build|   1 +
>  qapi/qapi-schema.json   |   1 +
>  qapi/qom.json   |   8 +-
>  qapi/stats.json |  10 +-
>  17 files changed, 743 insertions(+), 120 deletions(-)
>  create mode 100644 qapi/cryptodev.json
> 
> -- 
> 2.34.1




Re: [PATCH v3 05/12] cryptodev: Introduce 'query-cryptodev' QMP command

2023-01-28 Thread Michael S. Tsirkin
On Sat, Jan 28, 2023 at 11:56:26AM +0800, zhenwei pi wrote:
> Now we have a QMP command to query crypto devices:
> virsh qemu-monitor-command vm '{"execute": "query-cryptodev"}' | jq
> {
>   "return": [
> {
>   "service": [
> "akcipher",
> "mac",
> "hash",
> "cipher"
>   ],
>   "id": "cryptodev1",
>   "client": [
> {
>   "queue": 0,
>   "type": "builtin"
> }
>   ]
> },
> {
>   "service": [
> "akcipher"
>   ],
>   "id": "cryptodev0",
>   "client": [
> {
>   "queue": 0,
>   "type": "lkcf"
> }
>   ]
> }
>   ],
>   "id": "libvirt-417"
> }
> 
> Signed-off-by: zhenwei pi 
> ---
>  backends/cryptodev.c | 45 
>  qapi/cryptodev.json  | 44 +++
>  2 files changed, 89 insertions(+)
> 
> diff --git a/backends/cryptodev.c b/backends/cryptodev.c
> index c2a053db0e..d51eeb5ee4 100644
> --- a/backends/cryptodev.c
> +++ b/backends/cryptodev.c
> @@ -24,6 +24,7 @@
>  #include "qemu/osdep.h"
>  #include "sysemu/cryptodev.h"
>  #include "qapi/error.h"
> +#include "qapi/qapi-commands-cryptodev.h"
>  #include "qapi/visitor.h"
>  #include "qemu/config-file.h"
>  #include "qemu/error-report.h"
> @@ -33,6 +34,50 @@
>  
>  static QTAILQ_HEAD(, CryptoDevBackendClient) crypto_clients;
>  
> +static int qmp_query_cryptodev_foreach(Object *obj, void *data)
> +{
> +CryptoDevBackend *backend;
> +CryptodevInfoList **infolist = data;
> +uint32_t services, i;
> +
> +if (!object_dynamic_cast(obj, TYPE_CRYPTODEV_BACKEND)) {
> +return 0;
> +}
> +
> +CryptodevInfo *info = g_new0(CryptodevInfo, 1);
> +info->id = g_strdup(object_get_canonical_path_component(obj));
> +
> +backend = CRYPTODEV_BACKEND(obj);
> +services = backend->conf.crypto_services;
> +for (i = 0; i < QCRYPTODEV_BACKEND_SERVICE__MAX; i++) {
> +if (services & (1 << i)) {
> +QAPI_LIST_PREPEND(info->service, i);
> +}
> +}
> +
> +for (i = 0; i < backend->conf.peers.queues; i++) {
> +CryptoDevBackendClient *cc = backend->conf.peers.ccs[i];
> +CryptodevBackendClient *client = g_new0(CryptodevBackendClient, 1);
> +
> +client->queue = cc->queue_index;
> +client->type = cc->type;
> +QAPI_LIST_PREPEND(info->client, client);
> +}
> +
> +QAPI_LIST_PREPEND(*infolist, info);
> +
> +return 0;
> +}
> +
> +CryptodevInfoList *qmp_query_cryptodev(Error **errp)
> +{
> +CryptodevInfoList *list = NULL;
> +Object *objs = container_get(object_get_root(), "/objects");
> +
> +object_child_foreach(objs, qmp_query_cryptodev_foreach, &list);
> +
> +return list;
> +}
>  
>  CryptoDevBackendClient *cryptodev_backend_new_client(void)
>  {
> diff --git a/qapi/cryptodev.json b/qapi/cryptodev.json
> index 8732a30524..940078ace0 100644
> --- a/qapi/cryptodev.json
> +++ b/qapi/cryptodev.json
> @@ -43,3 +43,47 @@
>  { 'enum': 'QCryptodevBackendType',
>'prefix': 'QCRYPTODEV_BACKEND_TYPE',
>'data': ['builtin', 'vhost-user', 'lkcf']}
> +
> +##
> +# @CryptodevBackendClient:
> +#
> +# Information about a queue of crypto device.
> +#
> +# @queue: the queue index of the crypto device
> +#
> +# @type: the type of the crypto device
> +#
> +# Since: 8.0
> +##
> +{ 'struct': 'CryptodevBackendClient',
> +  'data': { 'queue': 'uint32',
> +'type': 'QCryptodevBackendType' } }
> +
> +##
> +# @CryptodevInfo:
> +#
> +# Information about a crypto device.
> +#
> +# @id: the id of the crypto device
> +#
> +# @service: supported service types of a crypto device
> +#
> +# @client: the additional infomation of the crypto device
> +#
> +# Since: 8.0
> +##
> +{ 'struct': 'CryptodevInfo',
> +  'data': { 'id': 'str',
> +'service': ['QCryptodevBackendServiceType'],
> +'client': ['CryptodevBackendClient'] } }

So we end up with both CryptodevBackendClient and
CryptoDevBackendClient. Please don't do this.


> +
> +##
> +# @query-cryptodev:
> +#
> +# Returns information about current crypto devices.
> +#
> +# Returns: a list of @CryptodevInfo
> +#
> +# Since: 8.0
> +##
> +{ 'command': 'query-cryptodev', 'returns': ['CryptodevInfo']}
> -- 
> 2.34.1




Re: [PATCH 2/3] virtio: struct VirtQueue introduce reset

2023-01-28 Thread Michael S. Tsirkin
On Sat, Jan 28, 2023 at 06:41:09PM +0800, Xuan Zhuo wrote:
> On Sat, 28 Jan 2023 05:22:05 -0500, "Michael S. Tsirkin"  
> wrote:
> > On Sat, Jan 28, 2023 at 03:17:23PM +0800, Xuan Zhuo wrote:
> > >  In the current design, we stop the device from operating on the vring
> > >  during per-queue reset by resetting the structure VirtQueue.
> > >
> > >  But before the reset operation, when recycling some resources, we should
> > >  stop referencing new vring resources. For example, when recycling
> > >  virtio-net's asynchronous sending resources, virtio-net should be able
> > >  to perceive that the current queue is in the per-queue reset state, and
> > >  stop sending new packets from the tx queue.
> > >
> > >  Signed-off-by: Xuan Zhuo 
> > > ---
> > >  hw/virtio/virtio.c | 8 
> > >  include/hw/virtio/virtio.h | 3 +++
> > >  2 files changed, 11 insertions(+)
> > >
> > > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > > index 03077b2ecf..907d5b8bde 100644
> > > --- a/hw/virtio/virtio.c
> > > +++ b/hw/virtio/virtio.c
> > > @@ -2030,6 +2030,12 @@ void virtio_queue_reset(VirtIODevice *vdev, 
> > > uint32_t queue_index)
> > >  {
> > >  VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
> > >
> > > +/*
> > > + * Mark this queue is per-queue reset status. The device should 
> > > release the
> > > + * references of the vring, and not refer more new vring item.
> > > + */
> > > +vdev->vq[queue_index].reset = true;
> > > +
> > >  if (k->queue_reset) {
> > >  k->queue_reset(vdev, queue_index);
> > >  }
> > > @@ -2053,6 +2059,8 @@ void virtio_queue_enable(VirtIODevice *vdev, 
> > > uint32_t queue_index)
> > >  }
> > >  */
> > >
> > > +vdev->vq[queue_index].reset = false;
> > > +
> > >  if (k->queue_enable) {
> > >  k->queue_enable(vdev, queue_index);
> > >  }
> > > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > > index 1c0d77c670..b888538d09 100644
> > > --- a/include/hw/virtio/virtio.h
> > > +++ b/include/hw/virtio/virtio.h
> > > @@ -251,6 +251,9 @@ struct VirtQueue {
> > >  /* Notification enabled? */
> > >  bool notification;
> > >
> > > +/* Per-Queue Reset status */
> > > +bool reset;
> > > +
> > >  uint16_t queue_index;
> > >
> >
> > Reset state makes no sense. It seems to imply queue_reset
> > in the spec. And for extra fun there's "reset" in the pci
> > proxy which means "virtio_queue_reset is in progress" - I have no
> > idea what uses it though - it is not guest visible.  First what is it?
> It actually means "queue has been reset and has not been enabled since".
> > So disabled_by_reset maybe?
> 
> 
> In fact, when reading this, the queue has not been reset,
> so prepare_for_reset?

Makes it sound like it's some kind of temporary state where
it is not - it will stay like this until enabled.
As it makes no practical difference that it is set too early,
just set it later for consistency.

> >
> > Second this hack helps make the change minimal
> > so it's helpful for stable, but it's ugly in that it
> > duplicates the reverse of enabled value - we don't really
> > care what disabled it in practice.
> >
> > With the fixups above I can apply so it's easier to backport, but later
> > a patch on top should clean it all up, perhaps by adding
> > "enabled" in VirtQueue. We should also get rid of "reset" in the proxy
> > unless there's some way it's useful which I don't currently see.
> >
> 
> I have some confusion, I don't understand what you mean.
> 
> Why did we remove the "reset" in the proxy?

We did not but we should.
Why should we remove "reset" in the proxy?
Because guest can never read it as != 0:

case VIRTIO_PCI_COMMON_Q_RESET:
if (val == 1) {
proxy->vqs[vdev->queue_sel].reset = 1;

virtio_queue_reset(vdev, vdev->queue_sel);

proxy->vqs[vdev->queue_sel].reset = 0;
proxy->vqs[vdev->queue_sel].enabled = 0;
}
break;

from guest's POV reset is atomic and so does not need
a variable to track state.


> I agree to rename the "reset".
> 
> Thanks.
> 
> >
> >
> > >  unsigned int inuse;
> > > --
> > > 2.32.0.3.g01195cf9f
> >




Re: [PATCH qemu v3] x86: don't let decompressed kernel image clobber setup_data

2023-01-28 Thread Michael S. Tsirkin
On Mon, Jan 23, 2023 at 06:37:21AM -0600, Jason A. Donenfeld wrote:
> On Mon, Jan 23, 2023 at 6:12 AM Michael S. Tsirkin  wrote:
> >
> > On Sun, Jan 22, 2023 at 08:21:30PM -0800, Eric Biggers wrote:
> > > Hi Michael,
> > >
> > > On Tue, Jan 10, 2023 at 12:50:42PM -0500, Michael S. Tsirkin wrote:
> > > > On Tue, Jan 10, 2023 at 04:34:49PM +0100, Jason A. Donenfeld wrote:
> > > > > Hi Michael,
> > > > >
> > > > > Could you queue up this patch and mark it as a fix for 7.2.1? It is a
> > > > > straight-up bug fix for a 7.2 regression that's now affected several
> > > > > users.
> > > >
> > > > OK. In the future pls cc me if you want me to merge a patch. Thanks!
> > > >
> > > > > - It has two Tested-by tags on the thread.
> > > > > - hpa, the maintainer of the kernel side of this, confirmed on one of
> > > > >   the various tributary threads that this approach is a correct one.
> > > > > - It doesn't introduce any new functionality.
> > > > >
> > > > > For your convenience, you can grab this out of lore here:
> > > > >
> > > > >   
> > > > > https://lore.kernel.org/lkml/20221230220725.618763-1-ja...@zx2c4.com/
> > > > >
> > > > > Or if you want to yolo it:
> > > > >
> > > > >   curl 
> > > > > https://lore.kernel.org/lkml/20221230220725.618763-1-ja...@zx2c4.com/raw
> > > > >  | git am -s
> > > > >
> > > > > It's now sat silent on the mailing list for a while. So let's please 
> > > > > get
> > > > > this committed and backported so that the bug reports stop coming in.
> > > > >
> > >
> > > This patch still isn't on QEMU's master branch.  What happened to it?
> > >
> > > - Eric
> >
> > Indeed though I remember picking it up. Tagged again now. Thanks!
> 
> Thanks. What branch is this in? I didn't see it on:
> https://gitlab.com/mstredhat/qemu/-/branches/active
> https://github.com/mstsirkin/qemu/branches

I don't use github really. And it was not pushed to gitlab as I was
figuring out issues with other patches before starting CI as CI minutes
are limited.  BTW as checkpatch was unhappy I applied a fixup -
making checkpatch happier and in the process the code change a bit
smaller.  If you want to do cleanups on top be my guest but pls
make it pass checkpatch. Thanks!


commit a00d99e04c4481fca3ee2d7c40d42993b7b059c2
Author: Michael S. Tsirkin 
Date:   Sat Jan 28 06:08:43 2023 -0500

fixup! x86: don't let decompressed kernel image clobber setup_data

diff --git a/hw/i386/microvm.c b/hw/i386/microvm.c
index 1b19d28c02..29f30dd6d3 100644
--- a/hw/i386/microvm.c
+++ b/hw/i386/microvm.c
@@ -378,7 +378,7 @@ static void microvm_fix_kernel_cmdline(MachineState 
*machine)
 MicrovmMachineState *mms = MICROVM_MACHINE(machine);
 BusState *bus;
 BusChild *kid;
-char *cmdline, *existing_cmdline = fw_cfg_read_bytes_ptr(x86ms->fw_cfg, 
FW_CFG_CMDLINE_DATA);
+char *cmdline, *existing_cmdline;
 size_t len;
 
 /*
@@ -388,6 +388,7 @@ static void microvm_fix_kernel_cmdline(MachineState 
*machine)
  * Yes, this is a hack, but one that heavily improves the UX without
  * introducing any significant issues.
  */
+existing_cmdline = fw_cfg_read_bytes_ptr(x86ms->fw_cfg, 
FW_CFG_CMDLINE_DATA);
 cmdline = g_strdup(existing_cmdline);
 bus = sysbus_get_default();
 QTAILQ_FOREACH(kid, &bus->children, sibling) {
@@ -413,10 +414,11 @@ static void microvm_fix_kernel_cmdline(MachineState 
*machine)
 }
 
 len = strlen(cmdline);
-if (len > VIRTIO_CMDLINE_TOTAL_MAX_LEN + strlen(existing_cmdline))
+if (len > VIRTIO_CMDLINE_TOTAL_MAX_LEN + strlen(existing_cmdline)) {
 fprintf(stderr, "qemu: virtio mmio cmdline too large, skipping\n");
-else
+} else {
 memcpy(existing_cmdline, cmdline, len + 1);
+}
 g_free(cmdline);
 }
 
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index b57a993596..eaff4227bd 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -803,7 +803,7 @@ void x86_load_linux(X86MachineState *x86ms,
 bool linuxboot_dma_enabled = 
X86_MACHINE_GET_CLASS(x86ms)->fwcfg_dma_enabled;
 uint16_t protocol;
 int setup_size, kernel_size, cmdline_size;
-int dtb_size;
+int dtb_size, setup_data_offset;
 uint32_t initrd_max;
 uint8_t header[8192], *setup, *kernel;
 hwaddr real_addr, prot_addr, cmdline_addr, initrd_addr = 0, 
first_setup_data = 0;
@@ -818,8 +818,10 @@ void x86_load_linux(X86MachineState *x86ms,
 SevKernelLoaderContext sev_load_ctx = {};
 enum { RNG_SEED_LENGTH = 32 };
 
-/* Add the NUL terminator, some padding for the microvm cmdline fiddling
- * hack, and then align to 16 bytes as a paranoia measure */
+/*
+ * Add the NUL terminator, some padding for the microvm cmdline fiddling
+ * hack, and then align to 16 bytes as a paranoia measure
+ */
 cmdline_size = (strlen(machine->kernel_cmdline) + 1 +
 VIRTIO_CMDLINE_TOTAL_MAX_LEN + 16) & ~15;
 /* Make a copy, since we might append arbitrary bytes to it later. */
@@ -1090,22 +1092,24 @@ void x86_load_linux(X86M

Re: [PATCH 2/3] virtio: struct VirtQueue introduce reset

2023-01-28 Thread Xuan Zhuo
On Sat, 28 Jan 2023 05:22:05 -0500, "Michael S. Tsirkin"  
wrote:
> On Sat, Jan 28, 2023 at 03:17:23PM +0800, Xuan Zhuo wrote:
> >  In the current design, we stop the device from operating on the vring
> >  during per-queue reset by resetting the structure VirtQueue.
> >
> >  But before the reset operation, when recycling some resources, we should
> >  stop referencing new vring resources. For example, when recycling
> >  virtio-net's asynchronous sending resources, virtio-net should be able
> >  to perceive that the current queue is in the per-queue reset state, and
> >  stop sending new packets from the tx queue.
> >
> >  Signed-off-by: Xuan Zhuo 
> > ---
> >  hw/virtio/virtio.c | 8 
> >  include/hw/virtio/virtio.h | 3 +++
> >  2 files changed, 11 insertions(+)
> >
> > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > index 03077b2ecf..907d5b8bde 100644
> > --- a/hw/virtio/virtio.c
> > +++ b/hw/virtio/virtio.c
> > @@ -2030,6 +2030,12 @@ void virtio_queue_reset(VirtIODevice *vdev, uint32_t 
> > queue_index)
> >  {
> >  VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
> >
> > +/*
> > + * Mark this queue is per-queue reset status. The device should 
> > release the
> > + * references of the vring, and not refer more new vring item.
> > + */
> > +vdev->vq[queue_index].reset = true;
> > +
> >  if (k->queue_reset) {
> >  k->queue_reset(vdev, queue_index);
> >  }
> > @@ -2053,6 +2059,8 @@ void virtio_queue_enable(VirtIODevice *vdev, uint32_t 
> > queue_index)
> >  }
> >  */
> >
> > +vdev->vq[queue_index].reset = false;
> > +
> >  if (k->queue_enable) {
> >  k->queue_enable(vdev, queue_index);
> >  }
> > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > index 1c0d77c670..b888538d09 100644
> > --- a/include/hw/virtio/virtio.h
> > +++ b/include/hw/virtio/virtio.h
> > @@ -251,6 +251,9 @@ struct VirtQueue {
> >  /* Notification enabled? */
> >  bool notification;
> >
> > +/* Per-Queue Reset status */
> > +bool reset;
> > +
> >  uint16_t queue_index;
> >
>
> Reset state makes no sense. It seems to imply queue_reset
> in the spec. And for extra fun there's "reset" in the pci
> proxy which means "virtio_queue_reset is in progress" - I have no
> idea what uses it though - it is not guest visible.  First what is it?
> It actually means "queue has been reset and has not been enabled since".
> So disabled_by_reset maybe?


In fact, when reading this, the queue has not been reset,
so prepare_for_reset?

>
> Second this hack helps make the change minimal
> so it's helpful for stable, but it's ugly in that it
> duplicates the reverse of enabled value - we don't really
> care what disabled it in practice.
>
> With the fixups above I can apply so it's easier to backport, but later
> a patch on top should clean it all up, perhaps by adding
> "enabled" in VirtQueue. We should also get rid of "reset" in the proxy
> unless there's some way it's useful which I don't currently see.
>

I have some confusion, I don't understand what you mean.

Why did we remove the "reset" in the proxy?

I agree to rename the "reset".

Thanks.

>
>
> >  unsigned int inuse;
> > --
> > 2.32.0.3.g01195cf9f
>



Re: [PATCH 1/3] virtio: move struct VirtQueue to include file

2023-01-28 Thread Xuan Zhuo
On Sat, 28 Jan 2023 05:23:46 -0500, "Michael S. Tsirkin"  
wrote:
> On Sat, Jan 28, 2023 at 03:17:22PM +0800, Xuan Zhuo wrote:
> > This patch move struct VirtQueue into virtio.h.
> >
> > In order to implement Queue Reset, we have to record the queue reset
> > status of in struct VirtQueue and provide it to device.
> >
> > Signed-off-by: Xuan Zhuo 
>
> So add an API please, no need to move the struct.
> This patch will go away then.

OK.

Thanks.


>
> > ---
> >  hw/virtio/virtio.c | 49 ---
> >  include/hw/virtio/virtio.h | 52 --
> >  2 files changed, 50 insertions(+), 51 deletions(-)
> >
> > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > index f35178f5fc..03077b2ecf 100644
> > --- a/hw/virtio/virtio.c
> > +++ b/hw/virtio/virtio.c
> > @@ -101,60 +101,11 @@ typedef struct VRingMemoryRegionCaches {
> >  MemoryRegionCache used;
> >  } VRingMemoryRegionCaches;
> >
> > -typedef struct VRing
> > -{
> > -unsigned int num;
> > -unsigned int num_default;
> > -unsigned int align;
> > -hwaddr desc;
> > -hwaddr avail;
> > -hwaddr used;
> > -VRingMemoryRegionCaches *caches;
> > -} VRing;
> > -
> >  typedef struct VRingPackedDescEvent {
> >  uint16_t off_wrap;
> >  uint16_t flags;
> >  } VRingPackedDescEvent ;
> >
> > -struct VirtQueue
> > -{
> > -VRing vring;
> > -VirtQueueElement *used_elems;
> > -
> > -/* Next head to pop */
> > -uint16_t last_avail_idx;
> > -bool last_avail_wrap_counter;
> > -
> > -/* Last avail_idx read from VQ. */
> > -uint16_t shadow_avail_idx;
> > -bool shadow_avail_wrap_counter;
> > -
> > -uint16_t used_idx;
> > -bool used_wrap_counter;
> > -
> > -/* Last used index value we have signalled on */
> > -uint16_t signalled_used;
> > -
> > -/* Last used index value we have signalled on */
> > -bool signalled_used_valid;
> > -
> > -/* Notification enabled? */
> > -bool notification;
> > -
> > -uint16_t queue_index;
> > -
> > -unsigned int inuse;
> > -
> > -uint16_t vector;
> > -VirtIOHandleOutput handle_output;
> > -VirtIODevice *vdev;
> > -EventNotifier guest_notifier;
> > -EventNotifier host_notifier;
> > -bool host_notifier_enabled;
> > -QLIST_ENTRY(VirtQueue) node;
> > -};
> > -
> >  const char *virtio_device_names[] = {
> >  [VIRTIO_ID_NET] = "virtio-net",
> >  [VIRTIO_ID_BLOCK] = "virtio-blk",
> > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> > index 77c6c55929..1c0d77c670 100644
> > --- a/include/hw/virtio/virtio.h
> > +++ b/include/hw/virtio/virtio.h
> > @@ -214,6 +214,56 @@ struct VirtioDeviceClass {
> >  struct vhost_dev *(*get_vhost)(VirtIODevice *vdev);
> >  };
> >
> > +typedef struct VRingMemoryRegionCaches VRingMemoryRegionCaches;
> > +typedef void (*VirtIOHandleOutput)(VirtIODevice *, VirtQueue *);
> > +
> > +typedef struct VRing {
> > +unsigned int num;
> > +unsigned int num_default;
> > +unsigned int align;
> > +hwaddr desc;
> > +hwaddr avail;
> > +hwaddr used;
> > +VRingMemoryRegionCaches *caches;
> > +} VRing;
> > +
> > +struct VirtQueue {
> > +VRing vring;
> > +VirtQueueElement *used_elems;
> > +
> > +/* Next head to pop */
> > +uint16_t last_avail_idx;
> > +bool last_avail_wrap_counter;
> > +
> > +/* Last avail_idx read from VQ. */
> > +uint16_t shadow_avail_idx;
> > +bool shadow_avail_wrap_counter;
> > +
> > +uint16_t used_idx;
> > +bool used_wrap_counter;
> > +
> > +/* Last used index value we have signalled on */
> > +uint16_t signalled_used;
> > +
> > +/* Last used index value we have signalled on */
> > +bool signalled_used_valid;
> > +
> > +/* Notification enabled? */
> > +bool notification;
> > +
> > +uint16_t queue_index;
> > +
> > +unsigned int inuse;
> > +
> > +uint16_t vector;
> > +VirtIOHandleOutput handle_output;
> > +VirtIODevice *vdev;
> > +EventNotifier guest_notifier;
> > +EventNotifier host_notifier;
> > +bool host_notifier_enabled;
> > +QLIST_ENTRY(VirtQueue) node;
> > +};
> > +
> >  void virtio_instance_init_common(Object *proxy_obj, void *data,
> >   size_t vdev_size, const char *vdev_name);
> >
> > @@ -226,8 +276,6 @@ void virtio_error(VirtIODevice *vdev, const char *fmt, 
> > ...) G_GNUC_PRINTF(2, 3);
> >  /* Set the child bus name. */
> >  void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name);
> >
> > -typedef void (*VirtIOHandleOutput)(VirtIODevice *, VirtQueue *);
> > -
> >  VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
> >  VirtIOHandleOutput handle_output);
> >
> > --
> > 2.32.0.3.g01195cf9f
>



Re: [PATCH v4 04/19] bsd-user: Clean up includes

2023-01-28 Thread Michael S. Tsirkin
On Fri, Jan 27, 2023 at 10:01:57AM -0500, Michael S. Tsirkin wrote:
> On Fri, Jan 27, 2023 at 02:54:30PM +, Peter Maydell wrote:
> > On Thu, 19 Jan 2023 at 14:42, Warner Losh  wrote:
> > >
> > > Also, why didn't you move sys/resource.h and other such files
> > > to os-dep.h? I'm struggling to understand the rules around what
> > > is or isn't included where?
> > 
> > The rough rule of thumb is that if some OS needs a compatibility
> > fixup or workaround for a system header (eg not every mmap.h
> > defines MAP_ANONYMOUS; on Windows unistd.h has to come before
> > time.h) then we put that header include and the compat workaround
> > into osdep.h. This avoids "only fails on obscure platform" issues
> > where somebody puts a header include into some specific .c file
> > but not the compat workaround, and it works on the Linux host
> > that most people develop and test on and we only find the
> > problem later.
> > 
> > There's also no doubt some includes there for historical
> > reasons, and some which really are "everybody needs these"
> > convenience ones. But we should probably not add new
> > includes to osdep.h unless they fall into the "working around
> > system header issues" bucket.
> > 
> > thanks
> > -- PMM
> 
> 
> BTW maybe we should teach checkpatch about that rule:
> if a header is in osdep do not include it directly.

To be more precise, make checkpatch run clean-includes somehow?
Or just make CI run clean-includes on the tree and verify result
is empty?

> -- 
> MST




Re: [PATCH v4 01/19] scripts/clean-includes: Fully skip / ignore files

2023-01-28 Thread Michael S. Tsirkin
On Thu, Jan 19, 2023 at 07:59:41AM +0100, Markus Armbruster wrote:
> When clean-includes claims to skip or ignore a file, only the part
> that sanitizes use of qemu/osdep.h skips the file.  The part that
> looks for duplicate #include does not, and neither does committing to
> Git.
> 
> The latter can get unrelated stuff included in the commit, but only if
> you run clean-includes in a dirty tree, which is unwise.  Messed up
> when we added skipping in commit fd3e39a40c "scripts/clean-includes:
> Enhance to handle header files".
> 
> The former can cause bogus reports for --check-dup-head.  Added in
> commit d66253e46a "scripts/clean-includes: added duplicate #include
> check", duplicating the prior mistake.
> 
> Fix the script to fully skip files.
> 
> Fixes: fd3e39a40ca2ee26b09a5de3149af8b056b85233
> Fixes: d66253e46ae2b9c36a9dd90b2b74c0dfa5804b22

Isn't
Fixes: %h (\"%s\")

the accepted format for this?

> Signed-off-by: Markus Armbruster 
> ---
>  scripts/clean-includes | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/scripts/clean-includes b/scripts/clean-includes
> index d37bd4f692..86944f27fc 100755
> --- a/scripts/clean-includes
> +++ b/scripts/clean-includes
> @@ -111,6 +111,7 @@ cat >"$COCCIFILE" <  )
>  EOT
>  
> +files=
>  for f in "$@"; do
>case "$f" in
>  *.c.inc)
> @@ -144,6 +145,7 @@ for f in "$@"; do
>continue
>;;
>esac
> +  files="$files $f"
>  
>if [ "$MODE" = "c" ]; then
>  # First, use Coccinelle to add qemu/osdep.h before the first existing 
> include
> @@ -174,8 +176,8 @@ for f in "$@"; do
>  
>  done
>  
> -if [ "$DUPHEAD" = "yes" ]; then
> -egrep "^[[:space:]]*#[[:space:]]*include" "$@" | tr -d '[:blank:]' \
> +if [ "$DUPHEAD" = "yes" ] && [ -n "$files" ]; then
> +egrep "^[[:space:]]*#[[:space:]]*include" $files | tr -d '[:blank:]' \
>  | sort | uniq -c | awk '{if ($1 > 1) print $0}'
>  if [ $? -eq 0 ]; then
>  echo "Found duplicate header file includes. Please check the above 
> files manually."
> @@ -184,7 +186,7 @@ if [ "$DUPHEAD" = "yes" ]; then
>  fi
>  
>  if [ "$GIT" = "yes" ]; then
> -git add -- "$@"
> +git add -- $files
>  git commit --signoff -F - <  $GITSUBJ: Clean up includes
>  
> -- 
> 2.39.0




Re: [PATCH] pci: add enforce_slot_reserved_mask_manual property

2023-01-28 Thread Michael S. Tsirkin
On Fri, Jan 27, 2023 at 10:39:28PM -0500, Chuck Zmudzinski wrote:
> On 1/27/2023 8:28 AM, Michael S. Tsirkin wrote:
> > On Sun, Jan 15, 2023 at 07:49:51PM -0500, Chuck Zmudzinski wrote:
> > > The current reserved slot check in do_pci_register_device(), added with
> > > commit 8b8849844fd6
> >
> > add ("subject here") please
> >
> > > ,is done even if the pci device being added is
> > > configured manually for a particular slot. The new property, when set
> > > to false, disables the check when the device is configured to request a
> > > particular slot. This allows an administrator or management tool to
> > > override slot_reserved_mask for a pci device by requesting a particular
> > > slot for the device. The new property is initialized to true which
> > > preserves the existing behavior of slot_reserved_mask by default.
> > > 
> > > Signed-off-by: Chuck Zmudzinski 
> >
> > Thanks!
> > I'm trying to think of the best default for this.
> 
> I think it would be better for the default value of
> enforce_slot_reserved_mask_manual to be false, so that a
> user-specified slot will by default override slot_reserved_mask.
> But doing that would change the current behavior of
> slot_reserved_mask.
> 
> Currently, this is the only place where slot_reserved_mask is used in all
> of the Qemu source (code from hw/sparc64/sun4u.c):
> 
> -- snip ---
>     /* Only in-built Simba APBs can exist on the root bus, slot 0 on busA is
>    reserved (leaving no slots free after on-board devices) however slots
>    0-3 are free on busB */
>     pci_bus->slot_reserved_mask = 0xfffc;
>     pci_busA->slot_reserved_mask = 0xfff1;
>     pci_busB->slot_reserved_mask = 0xfff0;
> -- snip ---
> 
> I think we could safely change the default value of
> enforce_slot_reserved_mask_manual to false but set
> it to true for the sparc64 sun4u board here to preserve
> the current behavior of the only existing board in Qemu
> that uses slot_reserved_mask.
> 
> What do you think?

I guess first can you answer whether this is still needed
with the latest Xen patches?

-- 
MST




Re: [PATCH 1/3] virtio: move struct VirtQueue to include file

2023-01-28 Thread Michael S. Tsirkin
On Sat, Jan 28, 2023 at 03:17:22PM +0800, Xuan Zhuo wrote:
> This patch move struct VirtQueue into virtio.h.
> 
> In order to implement Queue Reset, we have to record the queue reset
> status of in struct VirtQueue and provide it to device.
> 
> Signed-off-by: Xuan Zhuo 

So add an API please, no need to move the struct.
This patch will go away then.

> ---
>  hw/virtio/virtio.c | 49 ---
>  include/hw/virtio/virtio.h | 52 --
>  2 files changed, 50 insertions(+), 51 deletions(-)
> 
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index f35178f5fc..03077b2ecf 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -101,60 +101,11 @@ typedef struct VRingMemoryRegionCaches {
>  MemoryRegionCache used;
>  } VRingMemoryRegionCaches;
>  
> -typedef struct VRing
> -{
> -unsigned int num;
> -unsigned int num_default;
> -unsigned int align;
> -hwaddr desc;
> -hwaddr avail;
> -hwaddr used;
> -VRingMemoryRegionCaches *caches;
> -} VRing;
> -
>  typedef struct VRingPackedDescEvent {
>  uint16_t off_wrap;
>  uint16_t flags;
>  } VRingPackedDescEvent ;
>  
> -struct VirtQueue
> -{
> -VRing vring;
> -VirtQueueElement *used_elems;
> -
> -/* Next head to pop */
> -uint16_t last_avail_idx;
> -bool last_avail_wrap_counter;
> -
> -/* Last avail_idx read from VQ. */
> -uint16_t shadow_avail_idx;
> -bool shadow_avail_wrap_counter;
> -
> -uint16_t used_idx;
> -bool used_wrap_counter;
> -
> -/* Last used index value we have signalled on */
> -uint16_t signalled_used;
> -
> -/* Last used index value we have signalled on */
> -bool signalled_used_valid;
> -
> -/* Notification enabled? */
> -bool notification;
> -
> -uint16_t queue_index;
> -
> -unsigned int inuse;
> -
> -uint16_t vector;
> -VirtIOHandleOutput handle_output;
> -VirtIODevice *vdev;
> -EventNotifier guest_notifier;
> -EventNotifier host_notifier;
> -bool host_notifier_enabled;
> -QLIST_ENTRY(VirtQueue) node;
> -};
> -
>  const char *virtio_device_names[] = {
>  [VIRTIO_ID_NET] = "virtio-net",
>  [VIRTIO_ID_BLOCK] = "virtio-blk",
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index 77c6c55929..1c0d77c670 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -214,6 +214,56 @@ struct VirtioDeviceClass {
>  struct vhost_dev *(*get_vhost)(VirtIODevice *vdev);
>  };
>  
> +typedef struct VRingMemoryRegionCaches VRingMemoryRegionCaches;
> +typedef void (*VirtIOHandleOutput)(VirtIODevice *, VirtQueue *);
> +
> +typedef struct VRing {
> +unsigned int num;
> +unsigned int num_default;
> +unsigned int align;
> +hwaddr desc;
> +hwaddr avail;
> +hwaddr used;
> +VRingMemoryRegionCaches *caches;
> +} VRing;
> +
> +struct VirtQueue {
> +VRing vring;
> +VirtQueueElement *used_elems;
> +
> +/* Next head to pop */
> +uint16_t last_avail_idx;
> +bool last_avail_wrap_counter;
> +
> +/* Last avail_idx read from VQ. */
> +uint16_t shadow_avail_idx;
> +bool shadow_avail_wrap_counter;
> +
> +uint16_t used_idx;
> +bool used_wrap_counter;
> +
> +/* Last used index value we have signalled on */
> +uint16_t signalled_used;
> +
> +/* Last used index value we have signalled on */
> +bool signalled_used_valid;
> +
> +/* Notification enabled? */
> +bool notification;
> +
> +uint16_t queue_index;
> +
> +unsigned int inuse;
> +
> +uint16_t vector;
> +VirtIOHandleOutput handle_output;
> +VirtIODevice *vdev;
> +EventNotifier guest_notifier;
> +EventNotifier host_notifier;
> +bool host_notifier_enabled;
> +QLIST_ENTRY(VirtQueue) node;
> +};
> +
>  void virtio_instance_init_common(Object *proxy_obj, void *data,
>   size_t vdev_size, const char *vdev_name);
>  
> @@ -226,8 +276,6 @@ void virtio_error(VirtIODevice *vdev, const char *fmt, 
> ...) G_GNUC_PRINTF(2, 3);
>  /* Set the child bus name. */
>  void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name);
>  
> -typedef void (*VirtIOHandleOutput)(VirtIODevice *, VirtQueue *);
> -
>  VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
>  VirtIOHandleOutput handle_output);
>  
> -- 
> 2.32.0.3.g01195cf9f




Re: [PATCH 2/3] virtio: struct VirtQueue introduce reset

2023-01-28 Thread Michael S. Tsirkin
On Sat, Jan 28, 2023 at 03:17:23PM +0800, Xuan Zhuo wrote:
>  In the current design, we stop the device from operating on the vring
>  during per-queue reset by resetting the structure VirtQueue.
> 
>  But before the reset operation, when recycling some resources, we should
>  stop referencing new vring resources. For example, when recycling
>  virtio-net's asynchronous sending resources, virtio-net should be able
>  to perceive that the current queue is in the per-queue reset state, and
>  stop sending new packets from the tx queue.
> 
>  Signed-off-by: Xuan Zhuo 
> ---
>  hw/virtio/virtio.c | 8 
>  include/hw/virtio/virtio.h | 3 +++
>  2 files changed, 11 insertions(+)
> 
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 03077b2ecf..907d5b8bde 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -2030,6 +2030,12 @@ void virtio_queue_reset(VirtIODevice *vdev, uint32_t 
> queue_index)
>  {
>  VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
>  
> +/*
> + * Mark this queue is per-queue reset status. The device should release 
> the
> + * references of the vring, and not refer more new vring item.
> + */
> +vdev->vq[queue_index].reset = true;
> +
>  if (k->queue_reset) {
>  k->queue_reset(vdev, queue_index);
>  }
> @@ -2053,6 +2059,8 @@ void virtio_queue_enable(VirtIODevice *vdev, uint32_t 
> queue_index)
>  }
>  */
>  
> +vdev->vq[queue_index].reset = false;
> +
>  if (k->queue_enable) {
>  k->queue_enable(vdev, queue_index);
>  }
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index 1c0d77c670..b888538d09 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -251,6 +251,9 @@ struct VirtQueue {
>  /* Notification enabled? */
>  bool notification;
>  
> +/* Per-Queue Reset status */
> +bool reset;
> +
>  uint16_t queue_index;
>  

Reset state makes no sense. It seems to imply queue_reset
in the spec. And for extra fun there's "reset" in the pci
proxy which means "virtio_queue_reset is in progress" - I have no
idea what uses it though - it is not guest visible.  First what is it?
It actually means "queue has been reset and has not been enabled since".
So disabled_by_reset maybe?

Second, this hack helps make the change minimal
so it's helpful for stable, but it's ugly in that it
duplicates the reverse of enabled value - we don't really
care what disabled it in practice.

With the fixups above I can apply so it's easier to backport, but later
a patch on top should clean it all up, perhaps by adding
"enabled" in VirtQueue. We should also get rid of "reset" in the proxy
unless there's some way it's useful which I don't currently see.



>  unsigned int inuse;
> -- 
> 2.32.0.3.g01195cf9f




Re: [PATCH 00/17] audio: improve callback interface for audio frontends

2023-01-28 Thread Volker Rümelin

Am 22.01.23 um 19:13 schrieb Mark Cave-Ayland:

On 15/01/2023 13:45, Volker Rümelin wrote:


Am 15.01.23 um 14:08 schrieb Volker Rümelin:

Ccing a few more people who might be interested in this patch series.

@Mark:
After this patch series, the code in your out of tree ASC audio 
device (and a few in tree audio devices) could be simplified. 
write_audio() and the loops calling write_audio() could be removed.


Hi Volker,

I know we have discussed this in a separate thread off-list, but this 
is fantastic!


Just out of interest, if the available bytes wraps the circular buffer 
will the audio core call the audio callback twice to maximise the 
ability of the guest to generate samples before the next audio timer? 
Or does that not make much difference in practice?


Hi Mark,

I guess with circular buffer you refer to the mixing engine buffer. The 
audio system calls the callback once on every audio timer event. If the 
available bytes wrap the mixing engine ringbuffer, the 
audio_pcm_sw_resample_out() function uses two writes to write all 
available bytes. Compared to the unpatched version, nothing has changed 
in this regard. Of course the audio frontend devices are still free to 
write 'avail' bytes with multiple calls to AUD_write().


With best regards,
Volker



I'm not too familiar with the audio subsystem, but a quick skim of the 
series looks good (and being able to remove the write_audio() loops is 
a big plus). So I would certainly give this a thumbs up:


Acked-by: Mark Cave-Ayland 


ATB,

Mark.


With best regards,
Volker


Based-on: <3b1404eb-a7c5-f64c-3e47-1397c54c4...@t-online.de>
([PATCH 00/11] audio: more improvements)

The callback interface for emulated audio devices is strange. The 
callback function has an 'avail' parameter that passes the number of 
bytes that can be written or read. Unfortunately, this value 
sometimes is only an imprecise estimate and the callback functions 
must check the actual bytes written or read. For playback devices, 
this means that they either need a ring buffer or have to write the 
unwritten bytes again the next time. For recording devices, things 
are a bit easier. They only need to continue with the actual number 
of bytes read.


After this patch series, the 'avail' argument for the -audiodev 
out.mixing-engine=on and in.mixing-engine=on cases is exact. Audio 
frontends only need a linear frame buffer and there's a guarantee 
they can write or read 'avail' bytes.


The -audiodev out.mixing-engine=off case is also mostly accurate. 
Only the D-Bus audio backend is still missing a required function. 
The -audiodev in.mixing-engine=off case always passes a much too 
large 'avail' value. I haven't worked on this yet, because there was 
no reason for it so far.


The following logs show the improvements. Not only the audio 
frontends can write or read all needed or available bytes. The same 
is true for the audio backends. For playback, the first six lines in 
the logs are expected. Here you can see how quickly the guest fills 
the empty downstream buffers after playback starts.


QEMU was started with -device ich9-intel-hda,addr=0x1b -device 
hda-duplex,audiodev=audio0 -audiodev 
pa,out.frequency=96000,in.frequency=96000,id=audio0


playback guest 44100Hz => host 96000Hz

unpatched version:
hda_audio_output_cb: to write 8188, written 1704
audio_run_out: free 4458, played 926
hda_audio_output_cb: to write 6488, written 2384
audio_run_out: free 3532, played 1297
hda_audio_output_cb: to write 4104, written 2648
audio_run_out: free 2235, played 1441
audio_run_out: free 794, played 793
audio_run_out: free 897, played 896
audio_run_out: free 831, played 829
...
hda_audio_output_cb: could not write 4 bytes
hda_audio_output_cb: to write 1764, written 1760
audio_run_out: free 960, played 958
...

patched version:
hda_audio_output_cb: to write 8192, written 1620
audio_run_out: free 4458, played 880
hda_audio_output_cb: to write 6576, written 2508
audio_run_out: free 3578, played 1365
hda_audio_output_cb: to write 4068, written 2500
audio_run_out: free 2213, played 1360

record host 96000Hz => guest 44100Hz

unpatched version:
audio_run_in: avail 4458, acquired 4454
audio_run_in: avail 1574, acquired 1572
audio_run_in: avail 766, acquired 764
audio_run_in: avail 1052, acquired 1051
audio_run_in: avail 761, acquired 760
audio_run_in: avail 1123, acquired 1121
...
hda_audio_input_cb: could not read 4 bytes
hda_audio_input_cb: to read 1988, read 1984
audio_run_in: avail 1082, acquired 1080
...

patched version:
(no output)

QEMU was started with -device ich9-intel-hda,addr=0x1b -device 
hda-duplex,audiodev=audio0 -audiodev 
pa,out.frequency=32000,in.frequency=32000,id=audio0


playback guest 44100Hz => host 32000Hz

unpatched version:
hda_audio_output_cb: to write 8188, written 1620
audio_run_out: free 1486, played 294
hda_audio_output_cb: to write 6568, written 2512
audio_run_out: free 1192, played 455
hda_audio_output_cb: to write 4060, written 2504
audio_run_out: fr

[PATCH v9 52/58] hw/xen: Automatically add xen-platform PCI device for emulated Xen guests

2023-01-28 Thread David Woodhouse
From: David Woodhouse 

It isn't strictly mandatory but Linux guests at least will only map their
grant tables over the dummy BAR that it provides, and don't have sufficient
wit to map them in any other unused part of their guest address space. So
include it by default for minimal surprise factor.

As I come to document "how to run a Xen guest in QEMU", this means one
fewer thing to tell the user about, according to the mantra of "if it
needs documenting, fix it first, then document what remains".

Signed-off-by: David Woodhouse 
---
 hw/i386/pc.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index a12a7a67e9..5ec3518b9e 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1313,6 +1313,9 @@ void pc_basic_device_init(struct PCMachineState *pcms,
 #ifdef CONFIG_XEN_EMU
 if (xen_mode == XEN_EMULATE) {
 xen_evtchn_connect_gsis(gsi);
+if (pcms->bus) {
+pci_create_simple(pcms->bus, -1, "xen-platform");
+}
 }
 #endif
 
-- 
2.39.0




[PATCH v9 17/58] i386/xen: implement XENMEM_add_to_physmap_batch

2023-01-28 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/kvm/xen-compat.h | 24 +
 target/i386/kvm/xen-emu.c| 69 
 2 files changed, 93 insertions(+)

diff --git a/target/i386/kvm/xen-compat.h b/target/i386/kvm/xen-compat.h
index 0b7088662a..ff5d20e901 100644
--- a/target/i386/kvm/xen-compat.h
+++ b/target/i386/kvm/xen-compat.h
@@ -15,6 +15,20 @@
 
 typedef uint32_t compat_pfn_t;
 typedef uint32_t compat_ulong_t;
+typedef uint32_t compat_ptr_t;
+
+#define __DEFINE_COMPAT_HANDLE(name, type)  \
+typedef struct {\
+compat_ptr_t c; \
+type *_[0] __attribute__((packed));   \
+} __compat_handle_ ## name; \
+
+#define DEFINE_COMPAT_HANDLE(name) __DEFINE_COMPAT_HANDLE(name, name)
+#define COMPAT_HANDLE(name) __compat_handle_ ## name
+
+DEFINE_COMPAT_HANDLE(compat_pfn_t);
+DEFINE_COMPAT_HANDLE(compat_ulong_t);
+DEFINE_COMPAT_HANDLE(int);
 
 struct compat_xen_add_to_physmap {
 domid_t domid;
@@ -24,4 +38,14 @@ struct compat_xen_add_to_physmap {
 compat_pfn_t gpfn;
 };
 
+struct compat_xen_add_to_physmap_batch {
+domid_t domid;
+uint16_t space;
+uint16_t size;
+uint16_t extra;
+COMPAT_HANDLE(compat_ulong_t) idxs;
+COMPAT_HANDLE(compat_pfn_t) gpfns;
+COMPAT_HANDLE(int) errs;
+};
+
 #endif /* QEMU_I386_XEN_COMPAT_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 533797a126..bd87541125 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -262,6 +262,71 @@ static int do_add_to_physmap(struct kvm_xen_exit *exit, 
X86CPU *cpu,
 return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
 }
 
+static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
+   uint64_t arg)
+{
+struct xen_add_to_physmap_batch xatpb;
+unsigned long idxs_gva, gpfns_gva, errs_gva;
+CPUState *cs = CPU(cpu);
+size_t op_sz;
+
+if (hypercall_compat32(exit->u.hcall.longmode)) {
+struct compat_xen_add_to_physmap_batch xatpb32;
+
+qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 
20);
+if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
+return -EFAULT;
+}
+xatpb.domid = xatpb32.domid;
+xatpb.space = xatpb32.space;
+xatpb.size = xatpb32.size;
+
+idxs_gva = xatpb32.idxs.c;
+gpfns_gva = xatpb32.gpfns.c;
+errs_gva = xatpb32.errs.c;
+op_sz = sizeof(uint32_t);
+} else {
+if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
+return -EFAULT;
+}
+op_sz = sizeof(unsigned long);
+idxs_gva = (unsigned long)xatpb.idxs.p;
+gpfns_gva = (unsigned long)xatpb.gpfns.p;
+errs_gva = (unsigned long)xatpb.errs.p;
+}
+
+if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
+return -ESRCH;
+}
+
+/* Explicitly invalid for the batch op. Not that we implement it anyway. */
+if (xatpb.space == XENMAPSPACE_gmfn_range) {
+return -EINVAL;
+}
+
+while (xatpb.size--) {
+unsigned long idx = 0;
+unsigned long gpfn = 0;
+int err;
+
+/* For 32-bit compat this only copies the low 32 bits of each */
+if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
+kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
+return -EFAULT;
+}
+idxs_gva += op_sz;
+gpfns_gva += op_sz;
+
+err = add_to_physmap_one(xatpb.space, idx, gpfn);
+
+if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
+return -EFAULT;
+}
+errs_gva += sizeof(err);
+}
+return 0;
+}
+
 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
int cmd, uint64_t arg)
 {
@@ -272,6 +337,10 @@ static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 err = do_add_to_physmap(exit, cpu, arg);
 break;
 
+case XENMEM_add_to_physmap_batch:
+err = do_add_to_physmap_batch(exit, cpu, arg);
+break;
+
 default:
 return false;
 }
-- 
2.39.0




[PATCH v9 34/58] hw/xen: Implement EVTCHNOP_alloc_unbound

2023-01-28 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_evtchn.c  | 32 
 hw/i386/kvm/xen_evtchn.h  |  2 ++
 target/i386/kvm/xen-emu.c | 15 +++
 3 files changed, 49 insertions(+)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index e0e5b51e79..fa1d3a04f2 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -835,6 +835,38 @@ int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi)
 return ret;
 }
 
+int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+uint16_t type_val;
+int ret;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+if (alloc->dom != DOMID_SELF && alloc->dom != xen_domid) {
+return -ESRCH;
+}
+
+if (alloc->remote_dom == DOMID_QEMU) {
+type_val = PORT_INFO_TYPEVAL_REMOTE_QEMU;
+} else if (alloc->remote_dom == DOMID_SELF ||
+   alloc->remote_dom == xen_domid) {
+type_val = 0;
+} else {
+return -EPERM;
+}
+
+qemu_mutex_lock(&s->port_lock);
+
+ret = allocate_port(s, 0, EVTCHNSTAT_unbound, type_val, &alloc->port);
+
+qemu_mutex_unlock(&s->port_lock);
+
+return ret;
+}
+
 int xen_evtchn_send_op(struct evtchn_send *send)
 {
 XenEvtchnState *s = xen_evtchn_singleton;
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index 500fdbe8b8..fc080138e3 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -21,11 +21,13 @@ struct evtchn_unmask;
 struct evtchn_bind_virq;
 struct evtchn_bind_ipi;
 struct evtchn_send;
+struct evtchn_alloc_unbound;
 int xen_evtchn_status_op(struct evtchn_status *status);
 int xen_evtchn_close_op(struct evtchn_close *close);
 int xen_evtchn_unmask_op(struct evtchn_unmask *unmask);
 int xen_evtchn_bind_virq_op(struct evtchn_bind_virq *virq);
 int xen_evtchn_bind_ipi_op(struct evtchn_bind_ipi *ipi);
 int xen_evtchn_send_op(struct evtchn_send *send);
+int xen_evtchn_alloc_unbound_op(struct evtchn_alloc_unbound *alloc);
 
 #endif /* QEMU_XEN_EVTCHN_H */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index f6b5a23f79..e8486913bd 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -918,6 +918,21 @@ static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 err = xen_evtchn_send_op(&send);
 break;
 }
+case EVTCHNOP_alloc_unbound: {
+struct evtchn_alloc_unbound alloc;
+
+qemu_build_assert(sizeof(alloc) == 8);
+if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
+err = -EFAULT;
+break;
+}
+
+err = xen_evtchn_alloc_unbound_op(&alloc);
+if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
+err = -EFAULT;
+}
+break;
+}
 default:
 return false;
 }
-- 
2.39.0




[PATCH v9 03/58] xen: Add XEN_DISABLED mode and make it default

2023-01-28 Thread David Woodhouse
From: David Woodhouse 

Also set XEN_ATTACH mode in xen_init() to reflect the truth; not that
anyone ever cared before. It was *only* ever checked in xen_init_pv()
before.

Suggested-by: Paolo Bonzini 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 accel/xen/xen-all.c  | 2 ++
 include/hw/xen/xen.h | 5 +++--
 softmmu/globals.c| 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index 69aa7d018b..2329556595 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -181,6 +181,8 @@ static int xen_init(MachineState *ms)
  * opt out of system RAM being allocated by generic code
  */
 mc->default_ram_id = NULL;
+
+xen_mode = XEN_ATTACH;
 return 0;
 }
 
diff --git a/include/hw/xen/xen.h b/include/hw/xen/xen.h
index 4d412fd4b2..b3873c581b 100644
--- a/include/hw/xen/xen.h
+++ b/include/hw/xen/xen.h
@@ -22,8 +22,9 @@
 
 /* xen-machine.c */
 enum xen_mode {
-XEN_EMULATE = 0,  // xen emulation, using xenner (default)
-XEN_ATTACH// attach to xen domain created by libxl
+XEN_DISABLED = 0, // xen support disabled (default)
+XEN_ATTACH,   // attach to xen domain created by libxl
+XEN_EMULATE,
 };
 
 extern uint32_t xen_domid;
diff --git a/softmmu/globals.c b/softmmu/globals.c
index 527edbefdd..0a4405614e 100644
--- a/softmmu/globals.c
+++ b/softmmu/globals.c
@@ -63,5 +63,5 @@ QemuUUID qemu_uuid;
 bool qemu_uuid_set;
 
 uint32_t xen_domid;
-enum xen_mode xen_mode = XEN_EMULATE;
+enum xen_mode xen_mode = XEN_DISABLED;
 bool xen_domid_restrict;
-- 
2.39.0




[PATCH v9 20/58] i386/xen: handle VCPUOP_register_vcpu_info

2023-01-28 Thread David Woodhouse
From: Joao Martins 

Handle the hypercall to set a per vcpu info, and also wire up the default
vcpu_info in the shared_info page for the first 32 vCPUs.

To avoid deadlock within KVM a vCPU thread must set its *own* vcpu_info
rather than it being set from the context in which the hypercall is
invoked.

Add the vcpu_info (and default) GPA to the vmstate_x86_cpu for migration,
and restore it in kvm_arch_put_registers() appropriately.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/cpu.h|   2 +
 target/i386/kvm/kvm.c|  17 
 target/i386/kvm/trace-events |   1 +
 target/i386/kvm/xen-emu.c| 152 ++-
 target/i386/kvm/xen-emu.h|   2 +
 target/i386/machine.c|  19 +
 6 files changed, 190 insertions(+), 3 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index c6c57baed5..109b2e5669 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1788,6 +1788,8 @@ typedef struct CPUArchState {
 #endif
 #if defined(CONFIG_KVM)
 struct kvm_nested_state *nested_state;
+uint64_t xen_vcpu_info_gpa;
+uint64_t xen_vcpu_info_default_gpa;
 #endif
 #if defined(CONFIG_HVF)
 HVFX86LazyFlags hvf_lflags;
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 018b64ff58..37d6b78105 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -4735,6 +4735,15 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
 kvm_arch_set_tsc_khz(cpu);
 }
 
+#ifdef CONFIG_XEN_EMU
+if (xen_mode == XEN_EMULATE && level == KVM_PUT_FULL_STATE) {
+ret = kvm_put_xen_state(cpu);
+if (ret < 0) {
+return ret;
+}
+}
+#endif
+
 ret = kvm_getput_regs(x86_cpu, 1);
 if (ret < 0) {
 return ret;
@@ -4834,6 +4843,14 @@ int kvm_arch_get_registers(CPUState *cs)
 if (ret < 0) {
 goto out;
 }
+#ifdef CONFIG_XEN_EMU
+if (xen_mode == XEN_EMULATE) {
+ret = kvm_get_xen_state(cs);
+if (ret < 0) {
+goto out;
+}
+}
+#endif
 ret = 0;
  out:
 cpu_sync_bndcs_hflags(&cpu->env);
diff --git a/target/i386/kvm/trace-events b/target/i386/kvm/trace-events
index 8e9f269f56..a840e0333d 100644
--- a/target/i386/kvm/trace-events
+++ b/target/i386/kvm/trace-events
@@ -10,3 +10,4 @@ kvm_x86_update_msi_routes(int num) "Updated %d MSI routes"
 kvm_xen_hypercall(int cpu, uint8_t cpl, uint64_t input, uint64_t a0, uint64_t 
a1, uint64_t a2, uint64_t ret) "xen_hypercall: cpu %d cpl %d input %" PRIu64 " 
a0 0x%" PRIx64 " a1 0x%" PRIx64 " a2 0x%" PRIx64" ret 0x%" PRIx64
 kvm_xen_soft_reset(void) ""
 kvm_xen_set_shared_info(uint64_t gfn) "shared info at gfn 0x%" PRIx64
+kvm_xen_set_vcpu_attr(int cpu, int type, uint64_t gpa) "vcpu attr cpu %d type 
%d gpa 0x%" PRIx64
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index e940901f80..cdb2166a67 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -119,6 +119,8 @@ int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
 
 int kvm_xen_init_vcpu(CPUState *cs)
 {
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
 int err;
 
 /*
@@ -142,6 +144,9 @@ int kvm_xen_init_vcpu(CPUState *cs)
 }
 }
 
+env->xen_vcpu_info_gpa = INVALID_GPA;
+env->xen_vcpu_info_default_gpa = INVALID_GPA;
+
 return 0;
 }
 
@@ -187,10 +192,58 @@ static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit 
*exit, X86CPU *cpu,
 return true;
 }
 
+static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
+{
+struct kvm_xen_vcpu_attr xhsi;
+
+xhsi.type = type;
+xhsi.u.gpa = gpa;
+
+trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);
+
+return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
+}
+
+static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
+{
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
+
+env->xen_vcpu_info_default_gpa = data.host_ulong;
+
+/* Changing the default does nothing if a vcpu_info was explicitly set. */
+if (env->xen_vcpu_info_gpa == INVALID_GPA) {
+kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
+  env->xen_vcpu_info_default_gpa);
+}
+}
+
+static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
+{
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
+
+env->xen_vcpu_info_gpa = data.host_ulong;
+
+kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
+  env->xen_vcpu_info_gpa);
+}
+
+static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
+{
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
+
+env->xen_vcpu_info_gpa = INVALID_GPA;
+env->xen_vcpu_info_default_gpa = INVALID_GPA;
+
+kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, INVALID_GPA);
+}
+
 static int xen_set_shared_info(uint64_t gfn)
 {
 uint64_t g

[PATCH v9 53/58] i386/xen: Document Xen HVM emulation

2023-01-28 Thread David Woodhouse
From: David Woodhouse 

Signed-off-by: David Woodhouse 
---
 docs/system/i386/xen.rst| 50 +
 docs/system/target-i386.rst |  1 +
 2 files changed, 51 insertions(+)
 create mode 100644 docs/system/i386/xen.rst

diff --git a/docs/system/i386/xen.rst b/docs/system/i386/xen.rst
new file mode 100644
index 00..71506eb5c4
--- /dev/null
+++ b/docs/system/i386/xen.rst
@@ -0,0 +1,50 @@
+Xen HVM guest support
+=
+
+
+Description
+---
+
+KVM has support for hosting Xen guests, intercepting Xen hypercalls and event
+channel (Xen PV interrupt) delivery. This allows guests which expect to be
+run under Xen to be hosted in QEMU under Linux/KVM instead.
+
+Setup
+-
+
+Xen mode is enabled by setting the ``xen-version`` property of the KVM
+accelerator to a 32-bit value in the ``XENVER_version`` form, with the Xen
+major version in the top 16 bits and the minor version in the low 16 bits,
+for example for Xen 4.10:
+
+.. parsed-literal::
+
+  |qemu_system| --accel kvm,xen-version=0x4000a
+
+Additionally, virtual APIC support can be advertised to the guest through the
+``xen-vapic`` CPU flag:
+
+.. parsed-literal::
+
+  |qemu_system| --accel kvm,xen-version=0x4000a --cpu host,+xen_vapic
+
+When Xen support is enabled, QEMU changes hypervisor identification (CPUID
+0x4000..0x400A) to Xen. The KVM identification and features are not
+advertised to a Xen guest. If Hyper-V is also enabled, the Xen identification
+moves to leaves 0x4100..0x410A.
+
+The Xen platform device is enabled automatically for a Xen guest. This allows
+a guest to unplug all emulated devices, in order to use Xen PV block and 
network
+drivers instead. Note that until the Xen PV device back ends are enabled to 
work
+with Xen mode in QEMU, that is unlikely to cause significant joy. Linux guests
+can be dissuaded from this by adding 'xen_emul_unplug=never' on their command
+line, and it can also be noted that AHCI disk controllers are exempt from being
+unplugged, as are passthrough VFIO PCI devices.
+
+OS requirements
+---
+
+The minimal Xen support in the KVM accelerator requires the host to be running
+Linux v5.12 or newer. Later versions add optimisations: Linux v5.17 added
+acceleration of interrupt delivery via the Xen PIRQ mechanism, and Linux v5.19
+accelerated Xen PV timers and inter-processor interrupts (IPIs).
diff --git a/docs/system/target-i386.rst b/docs/system/target-i386.rst
index e64c013077..77c2f3b979 100644
--- a/docs/system/target-i386.rst
+++ b/docs/system/target-i386.rst
@@ -27,6 +27,7 @@ Architectural features
 
i386/cpu
i386/hyperv
+   i386/xen
i386/kvm-pv
i386/sgx
i386/amd-memory-encryption
-- 
2.39.0




[PATCH v9 13/58] hw/xen: Add xen_overlay device for emulating shared xenheap pages

2023-01-28 Thread David Woodhouse
From: David Woodhouse 

For the shared info page and for grant tables, Xen shares its own pages
from the "Xen heap" to the guest. The guest requests that a given page
from a certain address space (XENMAPSPACE_shared_info, etc.) be mapped
to a given GPA using the XENMEM_add_to_physmap hypercall.

To support that in qemu when *emulating* Xen, create a memory region
(migratable) and allow it to be mapped as an overlay when requested.

Xen theoretically allows the same page to be mapped multiple times
into the guest, but that's hard to track and reinstate over migration,
so we automatically *unmap* any previous mapping when creating a new
one. This approach has been used in production with a non-trivial
number of guests expecting true Xen, without any problems yet being
noticed.

This adds just the shared info page for now. The grant tables will be
a larger region, and will need to be overlaid one page at a time. I
think that means I need to create separate aliases for each page of
the overall grant_frames region, so that they can be mapped individually.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/meson.build   |   1 +
 hw/i386/kvm/xen_overlay.c | 205 ++
 hw/i386/kvm/xen_overlay.h |  20 
 include/sysemu/kvm_xen.h  |   4 +
 4 files changed, 230 insertions(+)
 create mode 100644 hw/i386/kvm/xen_overlay.c
 create mode 100644 hw/i386/kvm/xen_overlay.h

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 95467f1ded..6165cbf019 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -4,5 +4,6 @@ i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
 i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
+i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen_overlay.c'))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/xen_overlay.c b/hw/i386/kvm/xen_overlay.c
new file mode 100644
index 00..0cd623992c
--- /dev/null
+++ b/hw/i386/kvm/xen_overlay.c
@@ -0,0 +1,205 @@
+/*
+ * QEMU Xen emulation: Shared/overlay pages support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_overlay.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+#include 
+
+#include "standard-headers/xen/memory.h"
+
+
+#define TYPE_XEN_OVERLAY "xen-overlay"
+OBJECT_DECLARE_SIMPLE_TYPE(XenOverlayState, XEN_OVERLAY)
+
+#define XEN_PAGE_SHIFT 12
+#define XEN_PAGE_SIZE (1ULL << XEN_PAGE_SHIFT)
+
+struct XenOverlayState {
+/*< private >*/
+SysBusDevice busdev;
+/*< public >*/
+
+MemoryRegion shinfo_mem;
+void *shinfo_ptr;
+uint64_t shinfo_gpa;
+};
+
+struct XenOverlayState *xen_overlay_singleton;
+
+static void xen_overlay_do_map_page(MemoryRegion *page, uint64_t gpa)
+{
+/*
+ * Xen allows guests to map the same page as many times as it likes
+ * into guest physical frames. We don't, because it would be hard
+ * to track and restore them all. One mapping of each page is
+ * perfectly sufficient for all known guests... and we've tested
+ * that theory on a few now in other implementations. dwmw2.
+ */
+if (memory_region_is_mapped(page)) {
+if (gpa == INVALID_GPA) {
+memory_region_del_subregion(get_system_memory(), page);
+} else {
+/* Just move it */
+memory_region_set_address(page, gpa);
+}
+} else if (gpa != INVALID_GPA) {
+memory_region_add_subregion_overlap(get_system_memory(), gpa, page, 0);
+}
+}
+
+/* KVM is the only existing back end for now. Let's not overengineer it yet. */
+static int xen_overlay_set_be_shinfo(uint64_t gfn)
+{
+struct kvm_xen_hvm_attr xa = {
+.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+.u.shared_info.gfn = gfn,
+};
+
+return kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+}
+
+
+static void xen_overlay_realize(DeviceState *dev, Error **errp)
+{
+XenOverlayState *s = XEN_OVERLAY(dev);
+
+if (xen_mode != XEN_EMULATE) {
+error_setg(errp, "Xen overlay page support is for Xen emulation");
+return;
+}
+
+memory_region_init_ram(&s->shinfo_mem, OBJECT(dev), "xen:shared_info",
+   XEN_PAGE_SIZE, &error_abort);
+memory_region_set_enabled(&s->shinfo_mem, true);
+
+s->shinfo_ptr = memo

[PATCH v9 26/58] hw/xen: Add xen_evtchn device for event channel emulation

2023-01-28 Thread David Woodhouse
From: David Woodhouse 

Include basic support for setting HVM_PARAM_CALLBACK_IRQ to the global
vector method HVM_PARAM_CALLBACK_TYPE_VECTOR, which is handled in-kernel
by raising the vector whenever the vCPU's vcpu_info->evtchn_upcall_pending
flag is set.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/meson.build   |   5 +-
 hw/i386/kvm/xen_evtchn.c  | 155 ++
 hw/i386/kvm/xen_evtchn.h  |  18 +
 hw/i386/pc.c  |   2 +
 target/i386/kvm/xen-emu.c |  15 
 5 files changed, 194 insertions(+), 1 deletion(-)
 create mode 100644 hw/i386/kvm/xen_evtchn.c
 create mode 100644 hw/i386/kvm/xen_evtchn.h

diff --git a/hw/i386/kvm/meson.build b/hw/i386/kvm/meson.build
index 6165cbf019..cab64df339 100644
--- a/hw/i386/kvm/meson.build
+++ b/hw/i386/kvm/meson.build
@@ -4,6 +4,9 @@ i386_kvm_ss.add(when: 'CONFIG_APIC', if_true: files('apic.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8254', if_true: files('i8254.c'))
 i386_kvm_ss.add(when: 'CONFIG_I8259', if_true: files('i8259.c'))
 i386_kvm_ss.add(when: 'CONFIG_IOAPIC', if_true: files('ioapic.c'))
-i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen_overlay.c'))
+i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files(
+  'xen_overlay.c',
+  'xen_evtchn.c',
+  ))
 
 i386_ss.add_all(when: 'CONFIG_KVM', if_true: i386_kvm_ss)
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
new file mode 100644
index 00..99d9d84716
--- /dev/null
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -0,0 +1,155 @@
+/*
+ * QEMU Xen emulation: Event channel support
+ *
+ * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ * Authors: David Woodhouse 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/module.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "exec/target_page.h"
+#include "exec/address-spaces.h"
+#include "migration/vmstate.h"
+
+#include "hw/sysbus.h"
+#include "hw/xen/xen.h"
+#include "xen_evtchn.h"
+
+#include "sysemu/kvm.h"
+#include "sysemu/kvm_xen.h"
+#include 
+
+#include "standard-headers/xen/memory.h"
+#include "standard-headers/xen/hvm/params.h"
+
+#define TYPE_XEN_EVTCHN "xen-evtchn"
+OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
+
+struct XenEvtchnState {
+/*< private >*/
+SysBusDevice busdev;
+/*< public >*/
+
+uint64_t callback_param;
+bool evtchn_in_kernel;
+
+QemuMutex port_lock;
+};
+
+struct XenEvtchnState *xen_evtchn_singleton;
+
+/* Top bits of callback_param are the type (HVM_PARAM_CALLBACK_TYPE_xxx) */
+#define CALLBACK_VIA_TYPE_SHIFT 56
+
+static int xen_evtchn_post_load(void *opaque, int version_id)
+{
+XenEvtchnState *s = opaque;
+
+if (s->callback_param) {
+xen_evtchn_set_callback_param(s->callback_param);
+}
+
+return 0;
+}
+
+static bool xen_evtchn_is_needed(void *opaque)
+{
+return xen_mode == XEN_EMULATE;
+}
+
+static const VMStateDescription xen_evtchn_vmstate = {
+.name = "xen_evtchn",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = xen_evtchn_is_needed,
+.post_load = xen_evtchn_post_load,
+.fields = (VMStateField[]) {
+VMSTATE_UINT64(callback_param, XenEvtchnState),
+VMSTATE_END_OF_LIST()
+}
+};
+
+static void xen_evtchn_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+
+dc->vmsd = &xen_evtchn_vmstate;
+}
+
+static const TypeInfo xen_evtchn_info = {
+.name  = TYPE_XEN_EVTCHN,
+.parent= TYPE_SYS_BUS_DEVICE,
+.instance_size = sizeof(XenEvtchnState),
+.class_init= xen_evtchn_class_init,
+};
+
+void xen_evtchn_create(void)
+{
+XenEvtchnState *s = XEN_EVTCHN(sysbus_create_simple(TYPE_XEN_EVTCHN,
+-1, NULL));
+xen_evtchn_singleton = s;
+
+qemu_mutex_init(&s->port_lock);
+}
+
+static void xen_evtchn_register_types(void)
+{
+type_register_static(&xen_evtchn_info);
+}
+
+type_init(xen_evtchn_register_types)
+
+int xen_evtchn_set_callback_param(uint64_t param)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+struct kvm_xen_hvm_attr xa = {
+.type = KVM_XEN_ATTR_TYPE_UPCALL_VECTOR,
+.u.vector = 0,
+};
+bool in_kernel = false;
+int ret;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+qemu_mutex_lock(&s->port_lock);
+
+switch (param >> CALLBACK_VIA_TYPE_SHIFT) {
+case HVM_PARAM_CALLBACK_TYPE_VECTOR: {
+xa.u.vector = (uint8_t)param,
+
+ret = kvm_vm_ioctl(kvm_state, KVM_XEN_HVM_SET_ATTR, &xa);
+if (!ret && kvm_xen_has_cap(EVTCHN_SEND)) {
+in_kernel = true;
+}
+break;
+}
+default:
+/* Xen doesn't return error even if you set something bogus */
+ret = 0;
+break;

[PATCH v9 21/58] i386/xen: handle VCPUOP_register_vcpu_time_info

2023-01-28 Thread David Woodhouse
From: Joao Martins 

In order to support Linux vdso in Xen.

Signed-off-by: Joao Martins 
Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 target/i386/cpu.h |   1 +
 target/i386/kvm/xen-emu.c | 100 +-
 target/i386/machine.c |   1 +
 3 files changed, 90 insertions(+), 12 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 109b2e5669..96c2d0d5cb 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1790,6 +1790,7 @@ typedef struct CPUArchState {
 struct kvm_nested_state *nested_state;
 uint64_t xen_vcpu_info_gpa;
 uint64_t xen_vcpu_info_default_gpa;
+uint64_t xen_vcpu_time_info_gpa;
 #endif
 #if defined(CONFIG_HVF)
 HVFX86LazyFlags hvf_lflags;
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index cdb2166a67..735ccc3869 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -37,28 +37,41 @@
 #define hypercall_compat32(longmode) (false)
 #endif
 
-static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
-  bool is_write)
+static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
+   size_t *len, bool is_write)
 {
-uint8_t *buf = (uint8_t *)_buf;
-int ret;
-
-while (sz) {
 struct kvm_translation tr = {
 .linear_address = gva,
 };
 
-size_t len = TARGET_PAGE_SIZE - (tr.linear_address & ~TARGET_PAGE_MASK);
-if (len > sz) {
-len = sz;
+if (len) {
+*len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
+}
+
+if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
+(is_write && !tr.writeable)) {
+return false;
 }
+*gpa = tr.physical_address;
+return true;
+}
+
+static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
+  bool is_write)
+{
+uint8_t *buf = (uint8_t *)_buf;
+uint64_t gpa;
+size_t len;
 
-ret = kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr);
-if (ret || !tr.valid || (is_write && !tr.writeable)) {
+while (sz) {
+if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
 return -EFAULT;
 }
+if (len > sz) {
+len = sz;
+}
 
-cpu_physical_memory_rw(tr.physical_address, buf, len, is_write);
+cpu_physical_memory_rw(gpa, buf, len, is_write);
 
 buf += len;
 sz -= len;
@@ -146,6 +159,7 @@ int kvm_xen_init_vcpu(CPUState *cs)
 
 env->xen_vcpu_info_gpa = INVALID_GPA;
 env->xen_vcpu_info_default_gpa = INVALID_GPA;
+env->xen_vcpu_time_info_gpa = INVALID_GPA;
 
 return 0;
 }
@@ -229,6 +243,17 @@ static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
   env->xen_vcpu_info_gpa);
 }
 
+static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
+{
+X86CPU *cpu = X86_CPU(cs);
+CPUX86State *env = &cpu->env;
+
+env->xen_vcpu_time_info_gpa = data.host_ulong;
+
+kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
+  env->xen_vcpu_time_info_gpa);
+}
+
 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
 {
 X86CPU *cpu = X86_CPU(cs);
@@ -236,8 +261,11 @@ static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
 
 env->xen_vcpu_info_gpa = INVALID_GPA;
 env->xen_vcpu_info_default_gpa = INVALID_GPA;
+env->xen_vcpu_time_info_gpa = INVALID_GPA;
 
 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, INVALID_GPA);
+kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
+  INVALID_GPA);
 }
 
 static int xen_set_shared_info(uint64_t gfn)
@@ -453,6 +481,42 @@ static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
 return 0;
 }
 
+static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
+  uint64_t arg)
+{
+struct vcpu_register_time_memory_area tma;
+uint64_t gpa;
+size_t len;
+
+/* No need for 32/64 compat handling */
+qemu_build_assert(sizeof(tma) == 8);
+qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
+
+if (!target) {
+return -ENOENT;
+}
+
+if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
+return -EFAULT;
+}
+
+/*
+ * Xen actually uses the GVA and does the translation through the guest
+ * page tables each time. But Linux/KVM uses the GPA, on the assumption
+ * that guests only ever use *global* addresses (kernel virtual addresses)
+ * for it. If Linux is changed to redo the GVA→GPA translation each time,
+ * it will offer a new vCPU attribute for that, and we'll use it instead.
+ */
+if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
+len < sizeof(struct vcpu_time_info)) {
+return -EFAULT;
+}
+
+async_run_on_cpu(tar

[PATCH v9 28/58] hw/xen: Implement EVTCHNOP_status

2023-01-28 Thread David Woodhouse
From: David Woodhouse 

This adds the basic structure for maintaining the port table and reporting
the status of ports therein.

Signed-off-by: David Woodhouse 
Reviewed-by: Paul Durrant 
---
 hw/i386/kvm/xen_evtchn.c  | 104 ++
 hw/i386/kvm/xen_evtchn.h  |   3 ++
 include/sysemu/kvm_xen.h  |   3 ++
 target/i386/kvm/xen-emu.c |  20 +++-
 4 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 99d9d84716..f43d904ca0 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -22,6 +22,7 @@
 #include "hw/sysbus.h"
 #include "hw/xen/xen.h"
 #include "xen_evtchn.h"
+#include "xen_overlay.h"
 
 #include "sysemu/kvm.h"
 #include "sysemu/kvm_xen.h"
@@ -33,6 +34,22 @@
 #define TYPE_XEN_EVTCHN "xen-evtchn"
 OBJECT_DECLARE_SIMPLE_TYPE(XenEvtchnState, XEN_EVTCHN)
 
+typedef struct XenEvtchnPort {
+uint32_t vcpu;  /* Xen/ACPI vcpu_id */
+uint16_t type;  /* EVTCHNSTAT_ */
+uint16_t type_val;  /* pirq# / virq# / remote port according to type */
+} XenEvtchnPort;
+
+#define COMPAT_EVTCHN_2L_NR_CHANNELS1024
+
+/*
+ * For unbound/interdomain ports there are only two possible remote
+ * domains; self and QEMU. Use a single high bit in type_val for that,
+ * and the low bits for the remote port number (or 0 for unbound).
+ */
+#define PORT_INFO_TYPEVAL_REMOTE_QEMU   0x8000
+#define PORT_INFO_TYPEVAL_REMOTE_PORT_MASK  0x7FFF
+
 struct XenEvtchnState {
 /*< private >*/
 SysBusDevice busdev;
@@ -42,6 +59,8 @@ struct XenEvtchnState {
 bool evtchn_in_kernel;
 
 QemuMutex port_lock;
+uint32_t nr_ports;
+XenEvtchnPort port_table[EVTCHN_2L_NR_CHANNELS];
 };
 
 struct XenEvtchnState *xen_evtchn_singleton;
@@ -65,6 +84,18 @@ static bool xen_evtchn_is_needed(void *opaque)
 return xen_mode == XEN_EMULATE;
 }
 
+static const VMStateDescription xen_evtchn_port_vmstate = {
+.name = "xen_evtchn_port",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_UINT32(vcpu, XenEvtchnPort),
+VMSTATE_UINT16(type, XenEvtchnPort),
+VMSTATE_UINT16(type_val, XenEvtchnPort),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static const VMStateDescription xen_evtchn_vmstate = {
 .name = "xen_evtchn",
 .version_id = 1,
@@ -73,6 +104,9 @@ static const VMStateDescription xen_evtchn_vmstate = {
 .post_load = xen_evtchn_post_load,
 .fields = (VMStateField[]) {
 VMSTATE_UINT64(callback_param, XenEvtchnState),
+VMSTATE_UINT32(nr_ports, XenEvtchnState),
+VMSTATE_STRUCT_VARRAY_UINT32(port_table, XenEvtchnState, nr_ports, 1,
+ xen_evtchn_port_vmstate, XenEvtchnPort),
 VMSTATE_END_OF_LIST()
 }
 };
@@ -153,3 +187,73 @@ int xen_evtchn_set_callback_param(uint64_t param)
 
 return ret;
 }
+
+static bool valid_port(evtchn_port_t port)
+{
+if (!port) {
+return false;
+}
+
+if (xen_is_long_mode()) {
+return port < EVTCHN_2L_NR_CHANNELS;
+} else {
+return port < COMPAT_EVTCHN_2L_NR_CHANNELS;
+}
+}
+
+int xen_evtchn_status_op(struct evtchn_status *status)
+{
+XenEvtchnState *s = xen_evtchn_singleton;
+XenEvtchnPort *p;
+
+if (!s) {
+return -ENOTSUP;
+}
+
+if (status->dom != DOMID_SELF && status->dom != xen_domid) {
+return -ESRCH;
+}
+
+if (!valid_port(status->port)) {
+return -EINVAL;
+}
+
+qemu_mutex_lock(&s->port_lock);
+
+p = &s->port_table[status->port];
+
+status->status = p->type;
+status->vcpu = p->vcpu;
+
+switch (p->type) {
+case EVTCHNSTAT_unbound:
+if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+status->u.unbound.dom = DOMID_QEMU;
+} else {
+status->u.unbound.dom = xen_domid;
+}
+break;
+
+case EVTCHNSTAT_interdomain:
+if (p->type_val & PORT_INFO_TYPEVAL_REMOTE_QEMU) {
+status->u.interdomain.dom = DOMID_QEMU;
+} else {
+status->u.interdomain.dom = xen_domid;
+}
+
+status->u.interdomain.port = p->type_val &
+PORT_INFO_TYPEVAL_REMOTE_PORT_MASK;
+break;
+
+case EVTCHNSTAT_pirq:
+status->u.pirq = p->type_val;
+break;
+
+case EVTCHNSTAT_virq:
+status->u.virq = p->type_val;
+break;
+}
+
+qemu_mutex_unlock(&s->port_lock);
+return 0;
+}
diff --git a/hw/i386/kvm/xen_evtchn.h b/hw/i386/kvm/xen_evtchn.h
index c9b7f9d11f..76467636ee 100644
--- a/hw/i386/kvm/xen_evtchn.h
+++ b/hw/i386/kvm/xen_evtchn.h
@@ -15,4 +15,7 @@
 void xen_evtchn_create(void);
 int xen_evtchn_set_callback_param(uint64_t param);
 
+struct evtchn_status;
+int xen_evtchn_status_op(struct evtchn_status *status);
+
 #endif /* QEMU_XEN_EVTCHN_H */
diff --git a/include/sysemu/kvm_xen.h b/include/sysemu/kvm_xen.h
index a7260f5d72..0c0efbe699 100644
--- a/in

  1   2   >