date:20230919

Re: [QEMU PATCH v5 09/13] virtio-gpu: Handle resource blob commands

2023-09-19 Thread Huang Rui via

On Sat, Sep 16, 2023 at 12:37:29AM +0800, Akihiko Odaki wrote:
> On 2023/09/16 1:04, Akihiko Odaki wrote:
> > On 2023/09/15 20:11, Huang Rui wrote:
> >> From: Antonio Caggiano 
> >>
> >> Support BLOB resources creation, mapping and unmapping by calling the
> >> new stable virglrenderer 0.10 interface. Only enabled when available and
> >> via the blob config. E.g. -device virtio-vga-gl,blob=true
> >>
> >> Signed-off-by: Antonio Caggiano 
> >> Signed-off-by: Dmitry Osipenko 
> >> Signed-off-by: Xenia Ragiadakou 
> >> Signed-off-by: Huang Rui 
> >> ---
> >>
> >> V4 -> V5:
> >>  - Use memory_region_init_ram_ptr() instead of
> >>    memory_region_init_ram_device_ptr() (Akihiko)
> >>
> >>   hw/display/virtio-gpu-virgl.c  | 213 +
> >>   hw/display/virtio-gpu.c    |   4 +-
> >>   include/hw/virtio/virtio-gpu.h |   5 +
> >>   meson.build    |   4 +
> >>   4 files changed, 225 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/hw/display/virtio-gpu-virgl.c 
> >> b/hw/display/virtio-gpu-virgl.c
> >> index 312953ec16..563a6f2f58 100644
> >> --- a/hw/display/virtio-gpu-virgl.c
> >> +++ b/hw/display/virtio-gpu-virgl.c
> >> @@ -17,6 +17,7 @@
> >>   #include "trace.h"
> >>   #include "hw/virtio/virtio.h"
> >>   #include "hw/virtio/virtio-gpu.h"
> >> +#include "hw/virtio/virtio-gpu-bswap.h"
> >>   #include "ui/egl-helpers.h"
> >> @@ -78,9 +79,24 @@ static void virgl_cmd_create_resource_3d(VirtIOGPU *g,
> >>   virgl_renderer_resource_create(, NULL, 0);
> >>   }
> >> +static void virgl_resource_destroy(VirtIOGPU *g,
> >> +   struct virtio_gpu_simple_resource 
> >> *res)
> >> +{
> >> +    if (!res)
> >> +    return;
> >> +
> >> +    QTAILQ_REMOVE(>reslist, res, next);
> >> +
> >> +    virtio_gpu_cleanup_mapping_iov(g, res->iov, res->iov_cnt);
> >> +    g_free(res->addrs);
> >> +
> >> +    g_free(res);
> >> +}
> >> +
> >>   static void virgl_cmd_resource_unref(VirtIOGPU *g,
> >>    struct virtio_gpu_ctrl_command 
> >> *cmd)
> >>   {
> >> +    struct virtio_gpu_simple_resource *res;
> >>   struct virtio_gpu_resource_unref unref;
> >>   struct iovec *res_iovs = NULL;
> >>   int num_iovs = 0;
> >> @@ -88,13 +104,22 @@ static void virgl_cmd_resource_unref(VirtIOGPU *g,
> >>   VIRTIO_GPU_FILL_CMD(unref);
> >>   trace_virtio_gpu_cmd_res_unref(unref.resource_id);
> >> +    res = virtio_gpu_find_resource(g, unref.resource_id);
> >> +
> >>   virgl_renderer_resource_detach_iov(unref.resource_id,
> >>  _iovs,
> >>  _iovs);
> >>   if (res_iovs != NULL && num_iovs != 0) {
> >>   virtio_gpu_cleanup_mapping_iov(g, res_iovs, num_iovs);
> >> +    if (res) {
> >> +    res->iov = NULL;
> >> +    res->iov_cnt = 0;
> >> +    }
> >>   }
> >> +
> >>   virgl_renderer_resource_unref(unref.resource_id);
> >> +
> >> +    virgl_resource_destroy(g, res);
> 
> This may leak memory region.

The memory region should be freed under virgl_cmd_resource_unmap_blob()
which is calling memory_region_del_subregion(>hostmem, res->region).
Because this region is created by map_blob(). Do we have the case to call
virgl_cmd_resource_unref() without calling virgl_cmd_resource_unmap_blob()
for blob memory?

Thanks,
Ray

Re: [QEMU PATCH v5 09/13] virtio-gpu: Handle resource blob commands

2023-09-19 Thread Akihiko Odaki


On 2023/09/20 14:50, Huang Rui wrote:

On Sat, Sep 16, 2023 at 12:37:29AM +0800, Akihiko Odaki wrote:

On 2023/09/16 1:04, Akihiko Odaki wrote:

On 2023/09/15 20:11, Huang Rui wrote:

From: Antonio Caggiano 

Support BLOB resources creation, mapping and unmapping by calling the
new stable virglrenderer 0.10 interface. Only enabled when available and
via the blob config. E.g. -device virtio-vga-gl,blob=true

Signed-off-by: Antonio Caggiano 
Signed-off-by: Dmitry Osipenko 
Signed-off-by: Xenia Ragiadakou 
Signed-off-by: Huang Rui 
---

V4 -> V5:
  - Use memory_region_init_ram_ptr() instead of
    memory_region_init_ram_device_ptr() (Akihiko)

   hw/display/virtio-gpu-virgl.c  | 213 +
   hw/display/virtio-gpu.c    |   4 +-
   include/hw/virtio/virtio-gpu.h |   5 +
   meson.build    |   4 +
   4 files changed, 225 insertions(+), 1 deletion(-)

diff --git a/hw/display/virtio-gpu-virgl.c
b/hw/display/virtio-gpu-virgl.c
index 312953ec16..563a6f2f58 100644
--- a/hw/display/virtio-gpu-virgl.c
+++ b/hw/display/virtio-gpu-virgl.c
@@ -17,6 +17,7 @@
   #include "trace.h"
   #include "hw/virtio/virtio.h"
   #include "hw/virtio/virtio-gpu.h"
+#include "hw/virtio/virtio-gpu-bswap.h"
   #include "ui/egl-helpers.h"
@@ -78,9 +79,24 @@ static void virgl_cmd_create_resource_3d(VirtIOGPU *g,
   virgl_renderer_resource_create(, NULL, 0);
   }
+static void virgl_resource_destroy(VirtIOGPU *g,
+   struct virtio_gpu_simple_resource
*res)
+{
+    if (!res)
+    return;
+
+    QTAILQ_REMOVE(>reslist, res, next);
+
+    virtio_gpu_cleanup_mapping_iov(g, res->iov, res->iov_cnt);
+    g_free(res->addrs);
+
+    g_free(res);
+}
+
   static void virgl_cmd_resource_unref(VirtIOGPU *g,
    struct virtio_gpu_ctrl_command
*cmd)
   {
+    struct virtio_gpu_simple_resource *res;
   struct virtio_gpu_resource_unref unref;
   struct iovec *res_iovs = NULL;
   int num_iovs = 0;
@@ -88,13 +104,22 @@ static void virgl_cmd_resource_unref(VirtIOGPU *g,
   VIRTIO_GPU_FILL_CMD(unref);
   trace_virtio_gpu_cmd_res_unref(unref.resource_id);
+    res = virtio_gpu_find_resource(g, unref.resource_id);
+
   virgl_renderer_resource_detach_iov(unref.resource_id,
  _iovs,
  _iovs);
   if (res_iovs != NULL && num_iovs != 0) {
   virtio_gpu_cleanup_mapping_iov(g, res_iovs, num_iovs);
+    if (res) {
+    res->iov = NULL;
+    res->iov_cnt = 0;
+    }
   }
+
   virgl_renderer_resource_unref(unref.resource_id);
+
+    virgl_resource_destroy(g, res);


This may leak memory region.


The memory region should be freed under virgl_cmd_resource_unmap_blob()
which is calling memory_region_del_subregion(>hostmem, res->region).
Because this region is created by map_blob(). Do we have the case to call
virgl_cmd_resource_unref() without calling virgl_cmd_resource_unmap_blob()
for blob memory?


Calling virgl_cmd_resource_unmap_blob() and virgl_cmd_resource_unref() 
in order is a guest's responsibility, and we are required to prepare for 
broken guests.

Re: [PATCH v3 2/4] hw/cxl: Use switch statements for read and write of cachemem registers

2023-09-19 Thread Michael Tokarev


19.09.2023 12:34, Jonathan Cameron via wrote:

Establishing that only register accesses of size 4 and 8 can occur
using these functions requires looking at their callers. Make it
easier to see that by using switch statements.
Assertions are used to enforce that the register storage is of the
matching size, allowing fixed values to be used for divisors of
the array indices.

Suggested-by: Michael Tokarev 
Signed-off-by: Jonathan Cameron 
Reviewed-by: Fan Ni 



@@ -117,25 +125,36 @@ static void cxl_cache_mem_write_reg(void *opaque, hwaddr 
offset, uint64_t value,
  ComponentRegisters *cregs = _cstate->crb;
  uint32_t mask;

..
This hunk does not apply to qemu/master.  Is it based on some other
change missing in this area?

I thought about collecting all this and pushing trivial-patches but
stumbled upon this one.

Thanks,

/mjt

Re: [PATCH v2 0/3] hw/cxl: Misc small fixes

2023-09-19 Thread Michael Tokarev


19.09.2023 13:19, Jonathan Cameron via wrote:

v2:
  - Tag collection.
  - Patch 2 discussion on appropriate license concluded that this should
have originally only been accepted on GPL-v2 and later. However, I've
left it as GPL-v2-only as that is what was used for other CXL files and
for the license to be usefully relaxed we need to do them all - which
is a job for another day.
  - Added SPDX
  - Added similar header to cxl_type3_stubs.c
  
Misc set of trivial fixes.  No conflicts with other sets outstanding

so can go with main CXL patches or perhaps via the trivial tree.


Applied to my trivial-patches tree.  Thank you!

/mjt

Re: [VIRTIO PCI PATCH v5 1/1] transport-pci: Add freeze_mode to virtio_pci_common_cfg

2023-09-19 Thread Chen, Jiqian

Hi Michael S. Tsirkin,

On 2023/9/19 20:31, Michael S. Tsirkin wrote:
> On Tue, Sep 19, 2023 at 07:42:42PM +0800, Jiqian Chen wrote:
>> When guest vm does S3, Qemu will reset and clear some things of virtio
>> devices, but guest can't aware that, so that may cause some problems.
>> For excample, Qemu calls virtio_reset->virtio_gpu_gl_reset when guest
>> resume, that function will destroy render resources of virtio-gpu. As
>> a result, after guest resume, the display can't come back and we only
>> saw a black screen. Due to guest can't re-create all the resources, so
>> we need to let Qemu not to destroy them when S3.
>>
>> For above purpose, we need a mechanism that allows guests and QEMU to
>> negotiate their reset behavior. So this patch add a new parameter
>> named freeze_mode to struct virtio_pci_common_cfg. And when guest
>> suspends, it can write freeze_mode to be FREEZE_S3, and then virtio
>> devices can change their reset behavior on Qemu side according to
>> freeze_mode. What's more, freeze_mode can be used for all virtio
>> devices to affect the behavior of Qemu, not just virtio gpu device.
>>
>> Signed-off-by: Jiqian Chen 
>> ---
>>  transport-pci.tex | 7 +++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/transport-pci.tex b/transport-pci.tex
>> index a5c6719..2543536 100644
>> --- a/transport-pci.tex
>> +++ b/transport-pci.tex
>> @@ -319,6 +319,7 @@ \subsubsection{Common configuration structure 
>> layout}\label{sec:Virtio Transport
>>  le64 queue_desc;/* read-write */
>>  le64 queue_driver;  /* read-write */
>>  le64 queue_device;  /* read-write */
>> +le16 freeze_mode;   /* read-write */
>>  le16 queue_notif_config_data;   /* read-only for driver */
>>  le16 queue_reset;   /* read-write */
>>
> 
> we can't add fields in the middle of the structure like this -
> offset of queue_notif_config_data and queue_reset changes.
I am confused about this. I found in the latest kernel code (master branch):
struct virtio_pci_common_cfg {
/* About the whole device. */
__le32 device_feature_select;   /* read-write */
__le32 device_feature;  /* read-only */
__le32 guest_feature_select;/* read-write */
__le32 guest_feature;   /* read-write */
__le16 msix_config; /* read-write */
__le16 num_queues;  /* read-only */
__u8 device_status; /* read-write */
__u8 config_generation; /* read-only */

/* About a specific virtqueue. */
__le16 queue_select;/* read-write */
__le16 queue_size;  /* read-write, power of 2. */
__le16 queue_msix_vector;   /* read-write */
__le16 queue_enable;/* read-write */
__le16 queue_notify_off;/* read-only */
__le32 queue_desc_lo;   /* read-write */
__le32 queue_desc_hi;   /* read-write */
__le32 queue_avail_lo;  /* read-write */
__le32 queue_avail_hi;  /* read-write */
__le32 queue_used_lo;   /* read-write */
__le32 queue_used_hi;   /* read-write */

__le16 freeze_mode; /* read-write */
};
There is no queue_notif_config_data or queue_reset, and freeze_mode I added is 
at the end. Why is it different from virtio-spec?
I add the offset for freeze_mode(VIRTIO_PCI_COMMON_F_MODE), and change the 
offset of Q_NDATA and Q_RESET
-#define VIRTIO_PCI_COMMON_Q_NDATA  56
-#define VIRTIO_PCI_COMMON_Q_RESET  58
+#define VIRTIO_PCI_COMMON_F_MODE   56
+#define VIRTIO_PCI_COMMON_Q_NDATA  58
+#define VIRTIO_PCI_COMMON_Q_RESET  60

> 
>   
>> @@ -393,6 +394,12 @@ \subsubsection{Common configuration structure 
>> layout}\label{sec:Virtio Transport
>>  \item[\field{queue_device}]
>>  The driver writes the physical address of Device Area here.  See 
>> section \ref{sec:Basic Facilities of a Virtio Device / Virtqueues}.
>>  
>> +\item[\field{freeze_mode}]
>> +The driver writes this to set the freeze mode of virtio pci.
>> +VIRTIO_PCI_FREEZE_MODE_UNFREEZE - virtio-pci is running;
>> +VIRTIO_PCI_FREEZE_MODE_FREEZE_S3 - guest vm is doing S3, and 
>> virtio-pci enters S3 suspension;
>> +Other values are reserved for future use, like S4, etc.
>> +
> 
> we need to specify these values then.
Thanks, I will add the values.

> 
> we also need
> - feature bit to detect support for S3
Do I need to add a feature bit to DEFINE_VIRTIO_COMMON_FEATURES? And each time 
I write the freeze_mode field on the kernel driver side, do I need to check this bit?

> - conformance statements documenting behavious under S3
Sorry, I am not sure I understand. Do you mean which operations the driver and 
device should perform when freeze_mode is set to FREEZE_S3? Can you elaborate 
on it, or give an example?

> 
> 
>>

stable-8.1.1: which bug do we keep?

2023-09-19 Thread Michael Tokarev


Hi!

I'm somewhat in doubt about what to do with the 8.1.1 release.

There are 2 compelling issues, fixing one discovers the other.

https://gitlab.com/qemu-project/qemu/-/issues/1864
"x86 VM with TCG and SMP fails to start on 8.1.0"
is fixed by 0d58c660689f "softmmu: Use async_run_on_cpu in tcg_commit"

But this brings up

https://gitlab.com/qemu-project/qemu/-/issues/1866
"mips/mip64 virtio broken on master (and 8.1.0 with tcg fix)"
(which is actually more than mips, as I've shown down the line,
https://gitlab.com/qemu-project/qemu/-/issues/1866#note_1558221926 )

Also, one commit alone,
86e4f93d827 "softmmu: Assert data in bounds in iotlb_to_section",
when not followed with "async_run_on_cpu in tcg_commit", causes
assertion failures, eg
https://www.mail-archive.com/qemu-devel@nongnu.org/msg989846.html
I don't know if "async_run_on_cpu in tcg_commit" was supposed to
fix this assertion or not, or maybe some additional fix is needed, -
but I haven't seen this triggered with 0d58c660689f applied.

There were at least two attempts by Richard to fix issues after
0d58c660689f, one "accel/tcg: Always require can_do_io", which fixes
both reproducers for #1866 but at a high cost, and another,
"softmmu: Introduce cpu_address_space_sync", which addresses the
mips regression but does not fix my reproducer with ovmf
and none of the 2 landed on master so far.

Right now I have a "which bug to keep?" situation for 8.1.1, and
I'd love to have at least *some* comments about all this.  I've got
no replies to my earlier emails in this area.

To me, it *feels* like 0d58c660689f should be there.
Note: the scheduled deadline for staging-8.1.1 passed yesterday.
But this stuff seems to be important enough to delay 8.1.1 further.

Just some comments please? :)

Thank you!

/mjt

RE: [virtio-dev] RE: [VIRTIO PCI PATCH v5 1/1] transport-pci: Add freeze_mode to virtio_pci_common_cfg

2023-09-19 Thread Parav Pandit



> From: Chen, Jiqian 
> Sent: Wednesday, September 20, 2023 9:28 AM

> >> For above purpose, we need a mechanism that allows guests and QEMU to
> >> negotiate their reset behavior. So this patch add a new parameter
> >> named
> > Freeze != reset. :)
> > Please fix it to say freeze or suspend.
> But in my virtio-gpu scene, I want to prevent Qemu destroying resources when
> Guest do resuming(pci_pm_resume-> virtio_pci_restore->
> virtio_device_restore-> virtio_reset_device-> vp_modern_set_status->Qemu
> virtio_pci_reset->virtio_gpu_gl_reset-> virtio_gpu_reset). And I add check in
> virtio_gpu_gl_reset and virtio_gpu_reset, if freeze_mode was set to FREEZE_S3
> during Guest suspending, Qemu will not destroy resources. So the reason why I
> add this mechanism is to affect the reset behavior. And I think this also can 
> help
> other virtio devices to affect their behavior, like the issue of virtio-video 
> which
> Mikhail Golubev-Ciuchea encountered.
>
The point is when driver tells to freeze, it is freeze command and not reset.
So resume() should not invoke device_reset() when FREEZE+RESUME supported.
 
> >
> >> freeze_mode to struct virtio_pci_common_cfg. And when guest suspends,
> >> it can write freeze_mode to be FREEZE_S3, and then virtio devices can
> >> change their reset behavior on Qemu side according to freeze_mode.
> >> What's more,
> > Not reset, but suspend behavior.
> The same reason as above.
>
Reset should not be done by the guest driver when the device supports unfreeze.
 
> >
> >> freeze_mode can be used for all virtio devices to affect the behavior
> >> of Qemu, not just virtio gpu device.
> >>
> >> Signed-off-by: Jiqian Chen 
> >> ---
> >>  transport-pci.tex | 7 +++
> >>  1 file changed, 7 insertions(+)
> >>
> >> diff --git a/transport-pci.tex b/transport-pci.tex index
> >> a5c6719..2543536 100644
> >> --- a/transport-pci.tex
> >> +++ b/transport-pci.tex
> >> @@ -319,6 +319,7 @@ \subsubsection{Common configuration structure
> >> layout}\label{sec:Virtio Transport
> >>  le64 queue_desc;/* read-write */
> >>  le64 queue_driver;  /* read-write */
> >>  le64 queue_device;  /* read-write */
> >> +le16 freeze_mode;   /* read-write */
> >>  le16 queue_notif_config_data;   /* read-only for driver */
> >>  le16 queue_reset;   /* read-write */
> >>
> > The new field cannot be in the middle of the structure.
> > Otherwise, the location of the queue_notif_config_data depends on
> completely unrelated feature bit, breaking the backward compatibility.
> > So please move it at the end.
> I have confused about this. I found in latest kernel code(master branch):
> struct virtio_pci_common_cfg {
>   /* About the whole device. */
>   __le32 device_feature_select;   /* read-write */
>   __le32 device_feature;  /* read-only */
>   __le32 guest_feature_select;/* read-write */
>   __le32 guest_feature;   /* read-write */
>   __le16 msix_config; /* read-write */
>   __le16 num_queues;  /* read-only */
>   __u8 device_status; /* read-write */
>   __u8 config_generation; /* read-only */
> 
>   /* About a specific virtqueue. */
>   __le16 queue_select;/* read-write */
>   __le16 queue_size;  /* read-write, power of 2. */
>   __le16 queue_msix_vector;   /* read-write */
>   __le16 queue_enable;/* read-write */
>   __le16 queue_notify_off;/* read-only */
>   __le32 queue_desc_lo;   /* read-write */
>   __le32 queue_desc_hi;   /* read-write */
>   __le32 queue_avail_lo;  /* read-write */
>   __le32 queue_avail_hi;  /* read-write */
>   __le32 queue_used_lo;   /* read-write */
>   __le32 queue_used_hi;   /* read-write */
> 
>   __le16 freeze_mode; /* read-write */
> };
> There is no queue_notif_config_data or queue_reset, and freeze_mode I added
> is at the end. Why is it different from virtio-spec?
>
Because notify data may not be used by Linux driver so it may be shorter.
I didn't dig into the code yet.
 
> >
> >> @@ -393,6 +394,12 @@ \subsubsection{Common configuration structure
> >> layout}\label{sec:Virtio Transport  \item[\field{queue_device}]
> >>  The driver writes the physical address of Device Area here.
> >> See section \ref{sec:Basic Facilities of a Virtio Device / Virtqueues}.
> >>
> >> +\item[\field{freeze_mode}]
> >> +The driver writes this to set the freeze mode of virtio pci.
> >> +VIRTIO_PCI_FREEZE_MODE_UNFREEZE - virtio-pci is running;
> >> +VIRTIO_PCI_FREEZE_MODE_FREEZE_S3 - guest vm is doing S3, and
> >> +virtio-
> > For above names, please define the actual values in the spec.
> Ok, I will add them.
> 
> >
> >> pci enters S3 suspension;
> >> +Other values are reserved for future use, like S4, etc.

Re: [virtio-dev] RE: [VIRTIO PCI PATCH v5 1/1] transport-pci: Add freeze_mode to virtio_pci_common_cfg

2023-09-19 Thread Chen, Jiqian

Hi Parav,

On 2023/9/19 20:10, Parav Pandit wrote:
> Hi Jiqian,
> 
>> From: Jiqian Chen 
>> Sent: Tuesday, September 19, 2023 5:13 PM
>>
>> When guest vm does S3, Qemu will reset and clear some things of virtio
>> devices, but guest can't aware that, so that may cause some problems.
> It is not true that guest VM is not aware of it.
> As you show in your kernel patch, it is freeze/unfreeze in the guest VM PCI 
> PM driver callback. So please update the commit log.
Thanks, I will update it.

> 
>> For excample, Qemu calls virtio_reset->virtio_gpu_gl_reset when guest
> s/excample/example
> 
>> resume, that function will destroy render resources of virtio-gpu. As a 
>> result,
>> after guest resume, the display can't come back and we only saw a black
>> screen. Due to guest can't re-create all the resources, so we need to let 
>> Qemu
>> not to destroy them when S3.
> Above QEMU specific details to go in cover letter, instead of commit log, but 
> no strong opinion.
> Explaining the use case is good.
Thanks, I will also add it to cover letter.

> 
>>
>> For above purpose, we need a mechanism that allows guests and QEMU to
>> negotiate their reset behavior. So this patch add a new parameter named
> Freeze != reset. :)
> Please fix it to say freeze or suspend.
But in my virtio-gpu scene, I want to prevent Qemu destroying resources when 
Guest do resuming(pci_pm_resume-> virtio_pci_restore-> virtio_device_restore-> 
virtio_reset_device-> vp_modern_set_status->Qemu 
virtio_pci_reset->virtio_gpu_gl_reset-> virtio_gpu_reset). And I add check in 
virtio_gpu_gl_reset and virtio_gpu_reset, if freeze_mode was set to FREEZE_S3 
during Guest suspending, Qemu will not destroy resources. So the reason why I 
add this mechanism is to affect the reset behavior. And I think this also can 
help other virtio devices to affect their behavior, like the issue of 
virtio-video which Mikhail Golubev-Ciuchea encountered.

> 
>> freeze_mode to struct virtio_pci_common_cfg. And when guest suspends, it can
>> write freeze_mode to be FREEZE_S3, and then virtio devices can change their
>> reset behavior on Qemu side according to freeze_mode. What's more,
> Not reset, but suspend behavior.
The same reason as above.

> 
>> freeze_mode can be used for all virtio devices to affect the behavior of 
>> Qemu,
>> not just virtio gpu device.
>>
>> Signed-off-by: Jiqian Chen 
>> ---
>>  transport-pci.tex | 7 +++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/transport-pci.tex b/transport-pci.tex index a5c6719..2543536 
>> 100644
>> --- a/transport-pci.tex
>> +++ b/transport-pci.tex
>> @@ -319,6 +319,7 @@ \subsubsection{Common configuration structure
>> layout}\label{sec:Virtio Transport
>>  le64 queue_desc;/* read-write */
>>  le64 queue_driver;  /* read-write */
>>  le64 queue_device;  /* read-write */
>> +le16 freeze_mode;   /* read-write */
>>  le16 queue_notif_config_data;   /* read-only for driver */
>>  le16 queue_reset;   /* read-write */
>>
> The new field cannot be in the middle of the structure.
> Otherwise, the location of the queue_notif_config_data depends on completely 
> unrelated feature bit, breaking the backward compatibility.
> So please move it at the end.
I am confused about this. I found in the latest kernel code (master branch):
struct virtio_pci_common_cfg {
/* About the whole device. */
__le32 device_feature_select;   /* read-write */
__le32 device_feature;  /* read-only */
__le32 guest_feature_select;/* read-write */
__le32 guest_feature;   /* read-write */
__le16 msix_config; /* read-write */
__le16 num_queues;  /* read-only */
__u8 device_status; /* read-write */
__u8 config_generation; /* read-only */

/* About a specific virtqueue. */
__le16 queue_select;/* read-write */
__le16 queue_size;  /* read-write, power of 2. */
__le16 queue_msix_vector;   /* read-write */
__le16 queue_enable;/* read-write */
__le16 queue_notify_off;/* read-only */
__le32 queue_desc_lo;   /* read-write */
__le32 queue_desc_hi;   /* read-write */
__le32 queue_avail_lo;  /* read-write */
__le32 queue_avail_hi;  /* read-write */
__le32 queue_used_lo;   /* read-write */
__le32 queue_used_hi;   /* read-write */

__le16 freeze_mode; /* read-write */
};
There is no queue_notif_config_data or queue_reset, and freeze_mode I added is 
at the end. Why is it different from virtio-spec?

> 
>> @@ -393,6 +394,12 @@ \subsubsection{Common configuration structure
>> layout}\label{sec:Virtio Transport  \item[\field{queue_device}]
>>  The driver writes the physical address of Device Area here.

RE: [PATCH v1 15/22] Add iommufd configure option

2023-09-19 Thread Duan, Zhenzhong

>-Original Message-
>From: Cédric Le Goater 
>Sent: Wednesday, September 20, 2023 1:08 AM
>Subject: Re: [PATCH v1 15/22] Add iommufd configure option
>
>On 8/30/23 12:37, Zhenzhong Duan wrote:
>> This adds "--enable-iommufd/--disable-iommufd" to enable or disable
>> iommufd support, enabled by default.
>
>Why would someone want to disable support at compile time ? It might

For those users who only want to support legacy container feature?
Let me know if you still prefer to drop this patch, I'm fine with that.

>have been useful for dev but now QEMU should self-adjust at runtime
>depending only on the host capabilities AFAIUI. Am I missing something ?

IOMMUFD doesn't support all features of legacy container, so QEMU
doesn't self-adjust at runtime by checking if host supports IOMMUFD.
We need to specify it explicitly to use IOMMUFD as below:

-object iommufd,id=iommufd0
-device vfio-pci,host=0000:02:00.0,iommufd=iommufd0

Thanks
Zhenzhong

RE: [PATCH v1 14/22] vfio/common: Simplify vfio_viommu_preset()

2023-09-19 Thread Duan, Zhenzhong



>-Original Message-
>From: Cédric Le Goater 
>Sent: Wednesday, September 20, 2023 12:01 AM
>Subject: Re: [PATCH v1 14/22] vfio/common: Simplify vfio_viommu_preset()
>
>On 8/30/23 12:37, Zhenzhong Duan wrote:
>> Commit "vfio/container-base: Introduce [attach/detach]_device container
>callbacks"
>> add support to link to address space, we can utilize it to simplify
>> vfio_viommu_preset().
>>
>> Signed-off-by: Zhenzhong Duan 
>
>This looks like a revert of patch 07. Can it be avoided in v2 ?

Yes, I will redesign the related part so that I could have this patch dropped.

Thanks
Zhenzhong

Re: [PATCH v2 0/8] ACPI: X86 AML generation and GPE tracing cleanup

2023-09-19 Thread Michael S. Tsirkin

On Tue, Sep 19, 2023 at 07:47:09PM +0000, Bernhard Beschow wrote:
> 
> 
> Am 8. September 2023 08:42:26 UTC schrieb Bernhard Beschow 
> :
> >This series contains changes from my effort to bring the VIA south bridges to
> >
> >the PC machine [1]. The first part of the series resolves the
> >
> >AcpiCpuAmlIfClass::madt_cpu virtual method which frees ACPI controllers from
> >
> >worrying about CPU AML generation. The second part minimizes an 
> >Intel-specific
> >
> >assumption in AML generation to just one place. The third part contains two
> >
> >ACPI tracing patches which have been reviewed a long time ago but weren't 
> >merged
> >
> >yet.
> >
> >
> >
> >The removal of AcpiCpuAmlIfClass::madt_cpu is essentially a respin of [2] 
> >with
> >
> >a different approach. Igor wasn't generally against it but wasn't convinced
> >
> >either [3]. The new approach causes much less churn and instead allows to
> >
> >remove code. So I think it's worth to be reconsidered.
> >
> >
> >
> >The motivation for removing this virtual method didn't change: It frees the 
> >ACPI
> >
> >controllers in general and PIIX4 PM in particular from generating X86 CPU 
> >AML.
> >
> >The latter is also used in MIPS context where X86 CPU AML generation is
> >
> >stubbed out. This indicates a design issue where a problem was solved at the
> >
> >wrong place. Moreover, it turned out that TYPE_ACPI_GED_X86 could be removed 
> >as
> >
> >well, further supporting this claim.
> >
> >
> >
> >The second part of this series limits SMI command port determination during 
> >AML
> >
> >generation to just one place. Currently the ACPI_PORT_SMI_CMD constant is 
> >used
> >
> >multiple times which has an Intel-specific value. In order to make the code a
> >
> >microscopic bit more compatible with our VIA south bridge models its usage 
> >gets
> >
> >limited to one place, allowing the constant to be turned into a device model
> >
> >property in the future.
> >
> >
> >
> >The third part improves the tracing experience for ACPI general purpose 
> >events.
> >
> >It originates from an old series: [4].
> >
> >
> >
> >Testing done:
> >
> >* `make check`
> >
> >* `make check-avocado`
> >
> >
> >
> >v2:
> >
> >* Trace ACPI GPE values with "0x%02" (Phil)
> >
> 
> Ping
> 
> All patches reviewed. Michael, are you the one going to queue it?
> 
> Thanks,
> Bernhard

yes, thanks!

> >
> >
> >[1] https://github.com/shentok/qemu/tree/pc-via
> >
> >[2] 
> >https://lore.kernel.org/qemu-devel/20230121151941.24120-1-shen...@gmail.com/
> >
> >[3] 
> >https://lore.kernel.org/qemu-devel/20230125174842.395fd...@imammedo.users.ipa.redhat.com/
> >
> >[4] https://patchew.org/QEMU/20230122170724.21868-1-shen...@gmail.com/
> >
> >
> >
> >Bernhard Beschow (8):
> >
> >  hw/i386/acpi-build: Use pc_madt_cpu_entry() directly
> >
> >  hw/acpi/cpu: Have build_cpus_aml() take a build_madt_cpu_fn callback
> >
> >  hw/acpi/acpi_dev_interface: Remove now unused madt_cpu virtual method
> >
> >  hw/acpi/acpi_dev_interface: Remove now unused #include "hw/boards.h"
> >
> >  hw/i386: Remove now redundant TYPE_ACPI_GED_X86
> >
> >  hw/i386/acpi-build: Determine SMI command port just once
> >
> >  hw/acpi: Trace GPE access in all device models, not just PIIX4
> >
> >  hw/acpi/core: Trace enable and status registers of GPE separately
> >
> >
> >
> > hw/acpi/hmat.h |  3 ++-
> >
> > hw/i386/acpi-common.h  |  3 +--
> >
> > include/hw/acpi/acpi_dev_interface.h   |  3 ---
> >
> > include/hw/acpi/cpu.h  |  6 -
> >
> > include/hw/acpi/generic_event_device.h |  2 --
> >
> > hw/acpi/acpi-x86-stub.c|  6 -
> >
> > hw/acpi/core.c |  9 +++
> >
> > hw/acpi/cpu.c  |  9 +++
> >
> > hw/acpi/hmat.c |  1 +
> >
> > hw/acpi/memory_hotplug.c   |  1 +
> >
> > hw/acpi/piix4.c|  5 
> >
> > hw/i386/acpi-build.c   | 13 +-
> >
> > hw/i386/acpi-common.c  |  5 ++--
> >
> > hw/i386/acpi-microvm.c |  3 +--
> >
> > hw/i386/generic_event_device_x86.c | 36 --
> >
> > hw/i386/microvm.c  |  2 +-
> >
> > hw/isa/lpc_ich9.c  |  1 -
> >
> > hw/acpi/trace-events   | 10 ---
> >
> > hw/i386/meson.build|  1 -
> >
> > 19 files changed, 38 insertions(+), 81 deletions(-)
> >
> > delete mode 100644 hw/i386/generic_event_device_x86.c
> >
> >
> >
> >-- >
> >2.42.0
> >
> >
> >

ping: Re: [PATCH] cpu-throttle: Fix vcpu missed throttle work

2023-09-19 Thread alloc young


Hi pbonzini:
please take some time to review this patch. It fixes
an autoconverge migration issue for guests that dirty
memory pages heavily. Any comment will be welcome, Thx.


On 2023/9/18 11:29, alloc.yo...@outlook.com wrote:

From: alloc 

During migrations, vcpu may run longer than 10ms and not exit
on time. If the vcpu runs over 20ms, then it'll miss a throttle
kick and will run the whole tick. When this happens and vcpu
dirties pages fast, the migration will take a long time or even
not be able to auto converge. To fix this issue, take overrun
vcpu time into account and adjust the whole sleep time.

Signed-off-by: yangchunguang 
---
  include/hw/core/cpu.h  |  5 
  softmmu/cpu-throttle.c | 58 +-
  2 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 92a4234439..0b3cc3e81e 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -430,6 +430,11 @@ struct CPUState {
   */
  bool throttle_thread_scheduled;
  
+/* Used to keep last cpu throttle tick

+ *
+ */
+int64_t throttle_last_tick;
+
  /*
   * Sleep throttle_us_per_full microseconds once dirty ring is full
   * if dirty page rate limit is enabled.
diff --git a/softmmu/cpu-throttle.c b/softmmu/cpu-throttle.c
index d9bb30a223..bdec8dc954 100644
--- a/softmmu/cpu-throttle.c
+++ b/softmmu/cpu-throttle.c
@@ -36,22 +36,66 @@ static unsigned int throttle_percentage;
  #define CPU_THROTTLE_PCT_MIN 1
  #define CPU_THROTTLE_PCT_MAX 99
  #define CPU_THROTTLE_TIMESLICE_NS 1000
+#define CPU_THROTTLE_RUN_MIN_NS (CPU_THROTTLE_TIMESLICE_NS / 100)
  
  static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)

  {
  double pct;
  double throttle_ratio;
-int64_t sleeptime_ns, endtime_ns;
+int64_t sleeptime_ns, endtime_ns, now, overrun_ns;
  
  if (!cpu_throttle_get_percentage()) {

  return;
  }
  
+now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

  pct = (double)cpu_throttle_get_percentage() / 100;
  throttle_ratio = pct / (1 - pct);
-/* Add 1ns to fix double's rounding error (like 0.999...) */
-sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
-endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
+overrun_ns = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) - 
cpu->throttle_last_tick;
+/* If vcpu runs longer than 20ms, then the vcpu will miss next throttle 
tick and
+*  will run almost the full tick frame. When this happens and vcpu runs 
fast dirty
+*  pages, migration may take long time or can't converge at all.
+*
+*  Example of guest run longer than 30ms when cpu throttle is 99%
+*
+*  guest run(x) throttle tick(*) guest sleep(+)
+*
+*+...+x xx+...++x...xx  vcpu
+*
+*  --*...--*--...-*--...*-- 
timeframe
+*
+*/
+if (overrun_ns > (CPU_THROTTLE_TIMESLICE_NS - CPU_THROTTLE_RUN_MIN_NS)) {
+int64_t timeframe = CPU_THROTTLE_TIMESLICE_NS / (1 - pct) + 1;
+int64_t new_ns = overrun_ns / (1 - pct) + 1;
+int frames;
+int64_t adj, remainder;
+
+frames = overrun_ns / CPU_THROTTLE_TIMESLICE_NS;
+sleeptime_ns = overrun_ns * throttle_ratio + 1;
+remainder = new_ns - frames * timeframe;
+if (remainder > 0) {
+int64_t left_ns = timeframe - remainder;
+int64_t left_run = (1 - pct) * left_ns;
+
+adj = left_run < CPU_THROTTLE_RUN_MIN_NS ? CPU_THROTTLE_RUN_MIN_NS 
- left_run : 0;
+sleeptime_ns += left_ns * pct;
+} else
+adj = CPU_THROTTLE_RUN_MIN_NS;
+
+/* Limit max vcpu sleep time to avoid guest hang,
+ * max sleep time is 10s when cpu throttle is 99%
+ */
+if (sleeptime_ns > 10 * timeframe) {
+adj = remainder + CPU_THROTTLE_RUN_MIN_NS;
+sleeptime_ns = 10 * timeframe;
+}
+sleeptime_ns -=  adj;
+} else
+/* Add 1ns to fix double's rounding error (like 0.999...) */
+sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 
1);
+
+endtime_ns = now + sleeptime_ns;
  while (sleeptime_ns > 0 && !cpu->stop) {
  if (sleeptime_ns > SCALE_MS) {
  qemu_cond_timedwait_iothread(cpu->halt_cond,
@@ -70,6 +114,7 @@ static void cpu_throttle_timer_tick(void *opaque)
  {
  CPUState *cpu;
  double pct;
+int64_t now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
  
  /* Stop the timer if needed */

  if (!cpu_throttle_get_percentage()) {
@@ -77,14 +122,13 @@ static void cpu_throttle_timer_tick(void *opaque)
  }
  CPU_FOREACH(cpu) {
  if (!qatomic_xchg(>throttle_thread_scheduled, 1)) {
+cpu->throttle_last_tick = now;
  async_run_on_cpu(cpu, cpu_throttle_thread,

Re: [RFC PATCH v2 03/21] HostMem: Add private property and associate it with RAM_KVM_GMEM

2023-09-19 Thread Xiaoyao Li


On 9/19/2023 5:46 PM, Markus Armbruster wrote:

Xiaoyao Li  writes:


From: Isaku Yamahata 

Add a new property "private" to memory backends. When it's set to true,
it indicates the RAMblock of the backend also requires kvm gmem.


Can you add a brief explanation why you need the property?


It provides a mechanism for the user to specify whether the memory can serve 
as private memory (which requires requesting kvm gmem).



Signed-off-by: Isaku Yamahata 
Signed-off-by: Xiaoyao Li 


[...]


diff --git a/qapi/qom.json b/qapi/qom.json
index fa3e88c8e6ab..d28c5403bc0f 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -605,6 +605,9 @@
  # @reserve: if true, reserve swap space (or huge pages) if applicable
  # (default: true) (since 6.1)
  #
+# @private: if true, use KVM gmem private memory (default: false)
+# (since 8.2)
+#
  # @size: size of the memory region in bytes
  #
  # @x-use-canonical-path-for-ramblock-id: if true, the canonical path
@@ -631,6 +634,7 @@
  '*prealloc-context': 'str',
  '*share': 'bool',
  '*reserve': 'bool',
+'*private': 'bool',
  'size': 'size',
  '*x-use-canonical-path-for-ramblock-id': 'bool' } }

[PATCH v2 4/4] target/ppc: Add migration support for BHRB

2023-09-19 Thread Glenn Miles

Adds migration support for Branch History Rolling
Buffer (BHRB) internal state.

Signed-off-by: Glenn Miles 
---
 target/ppc/machine.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/target/ppc/machine.c b/target/ppc/machine.c
index d42e475bfb..ba328ad5e2 100644
--- a/target/ppc/machine.c
+++ b/target/ppc/machine.c
@@ -711,6 +711,26 @@ static const VMStateDescription vmstate_reservation = {
 }
 };
 
+#ifdef TARGET_PPC64
+static bool bhrb_needed(void *opaque)
+{
+PowerPCCPU *cpu = opaque;
+return (cpu->env.flags & POWERPC_FLAG_BHRB) != 0;
+}
+
+static const VMStateDescription vmstate_bhrb = {
+.name = "cpu/bhrb",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = bhrb_needed,
+.fields = (VMStateField[]) {
+VMSTATE_UINTTL(env.bhrb_offset, PowerPCCPU),
+VMSTATE_UINT64_ARRAY(env.bhrb, PowerPCCPU, BHRB_MAX_NUM_ENTRIES),
+VMSTATE_END_OF_LIST()
+}
+};
+#endif
+
 const VMStateDescription vmstate_ppc_cpu = {
 .name = "cpu",
 .version_id = 5,
@@ -756,6 +776,7 @@ const VMStateDescription vmstate_ppc_cpu = {
 #ifdef TARGET_PPC64
 _tm,
 _slb,
+_bhrb,
 #endif /* TARGET_PPC64 */
 _tlb6xx,
 _tlbemb,
-- 
2.31.1

[PATCH v2 1/4] target/ppc: Add new hflags to support BHRB

2023-09-19 Thread Glenn Miles

This commit is preparatory to the addition of Branch History
Rolling Buffer (BHRB) functionality, which is being provided
today starting with the P8 processor.

BHRB uses several SPR register fields to control whether or not
a branch instruction's address (and sometimes target address)
should be recorded.  Checking each of these fields with each
branch instruction using jitted code would lead to a significant
decrease in performance.

Therefore, it was decided that BHRB configuration bits that are
not expected to change frequently should have their state summarized
in an hflag so that the amount of checking done by jitted code can
be reduced.

This commit contains the changes for summarizing the state of the
following register fields in the HFLAGS_BHRB_ENABLE hflag:

MMCR0[FCP] - Determines if BHRB recording is frozen in the
 problem state

MMCR0[FCPC] - A modifier for MMCR0[FCP]

MMCRA[BHRBRD] - Disables all BHRB recording for a thread

Signed-off-by: Glenn Miles 
---
 target/ppc/cpu.h |  5 +
 target/ppc/cpu_init.c|  4 ++--
 target/ppc/helper.h  |  1 +
 target/ppc/helper_regs.c | 35 
 target/ppc/machine.c |  2 +-
 target/ppc/power8-pmu-regs.c.inc |  5 +
 target/ppc/power8-pmu.c  | 15 ++
 target/ppc/power8-pmu.h  |  4 ++--
 target/ppc/spr_common.h  |  1 +
 target/ppc/translate.c   |  2 ++
 10 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 173e4c351a..55985fb84f 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -439,6 +439,8 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR0_FC56   PPC_BIT(59) /* PMC Freeze Counters 5-6 bit */
 #define MMCR0_PMC1CE PPC_BIT(48) /* MMCR0 PMC1 Condition Enabled */
 #define MMCR0_PMCjCE PPC_BIT(49) /* MMCR0 PMCj Condition Enabled */
+#define MMCR0_FCPPPC_BIT(34) /* Freeze Counters/BHRB if PR=1 */
+#define MMCR0_FCPC   PPC_BIT(51) /* Condition for FCP bit */
 /* MMCR0 userspace r/w mask */
 #define MMCR0_UREG_MASK (MMCR0_FC | MMCR0_PMAO | MMCR0_PMAE)
 /* MMCR2 userspace r/w mask */
@@ -451,6 +453,8 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR2_UREG_MASK (MMCR2_FC1P0 | MMCR2_FC2P0 | MMCR2_FC3P0 | \
  MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0)
 
+#define MMCRA_BHRBRDPPC_BIT(26) /* BHRB Recording Disable */
+
 #define MMCR1_EVT_SIZE 8
 /* extract64() does a right shift before extracting */
 #define MMCR1_PMC1SEL_START 32
@@ -703,6 +707,7 @@ enum {
 HFLAGS_PMCJCE = 17, /* MMCR0 PMCjCE bit */
 HFLAGS_PMC_OTHER = 18, /* PMC other than PMC5-6 is enabled */
 HFLAGS_INSN_CNT = 19, /* PMU instruction count enabled */
+HFLAGS_BHRB_ENABLE = 20, /* Summary flag for enabling BHRB */
 HFLAGS_VSX = 23, /* MSR_VSX if cpu has VSX */
 HFLAGS_VR = 25,  /* MSR_VR if cpu has VRE */
 
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 7ab5ee92d9..8c81a75416 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -5152,7 +5152,7 @@ static void register_book3s_pmu_sup_sprs(CPUPPCState *env)
  KVM_REG_PPC_MMCR1, 0x);
 spr_register_kvm(env, SPR_POWER_MMCRA, "MMCRA",
  SPR_NOACCESS, SPR_NOACCESS,
- _read_generic, _write_generic,
+ _read_generic, _write_MMCRA,
  KVM_REG_PPC_MMCRA, 0x);
 spr_register_kvm(env, SPR_POWER_PMC1, "PMC1",
  SPR_NOACCESS, SPR_NOACCESS,
@@ -7164,7 +7164,7 @@ static void ppc_cpu_reset_hold(Object *obj)
 if (env->mmu_model != POWERPC_MMU_REAL) {
 ppc_tlb_invalidate_all(env);
 }
-pmu_mmcr01_updated(env);
+pmu_mmcr01a_updated(env);
 }
 
 /* clean any pending stop state */
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 86f97ee1e7..3df360efe9 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -30,6 +30,7 @@ DEF_HELPER_2(store_dawr0, void, env, tl)
 DEF_HELPER_2(store_dawrx0, void, env, tl)
 DEF_HELPER_2(store_mmcr0, void, env, tl)
 DEF_HELPER_2(store_mmcr1, void, env, tl)
+DEF_HELPER_2(store_mmcrA, void, env, tl)
 DEF_HELPER_3(store_pmc, void, env, i32, i64)
 DEF_HELPER_2(read_pmc, tl, env, i32)
 DEF_HELPER_2(insns_inc, void, env, i32)
diff --git a/target/ppc/helper_regs.c b/target/ppc/helper_regs.c
index f380342d4d..5696338137 100644
--- a/target/ppc/helper_regs.c
+++ b/target/ppc/helper_regs.c
@@ -47,6 +47,37 @@ void hreg_swap_gpr_tgpr(CPUPPCState *env)
 env->tgpr[3] = tmp;
 }
 
+static bool hreg_check_bhrb_enable(CPUPPCState *env)
+{
+bool pr = !!(env->msr & (1 << MSR_PR));
+target_long mmcr0;
+bool fcp;
+bool hv;
+
+/* ISA 3.1 adds the MMCRA[BHRBRD] and problem state checks */
+if ((env->insns_flags2 & PPC2_ISA310) &&
+((env->spr[SPR_POWER_MMCRA] & MMCRA_BHRBRD) || !pr))

[PATCH v2 3/4] target/ppc: Add clrbhrb and mfbhrbe instructions

2023-09-19 Thread Glenn Miles

Add support for the clrbhrb and mfbhrbe instructions.

Since neither instruction is believed to be critical to
performance, both instructions were implemented using helper
functions.

Access to both instructions is controlled by bits in the
HFSCR (for privileged state) and MMCR0 (for problem state).
A new function, helper_mmcr0_facility_check, was added for
checking MMCR0[BHRBA] and raising a facility_unavailable exception
if required.

Signed-off-by: Glenn Miles 
---
 target/ppc/cpu.h |  2 ++
 target/ppc/helper.h  |  4 +++
 target/ppc/insn32.decode |  8 ++
 target/ppc/misc_helper.c | 43 
 target/ppc/translate.c   |  2 ++
 target/ppc/translate/bhrb-impl.c.inc | 43 
 6 files changed, 102 insertions(+)
 create mode 100644 target/ppc/translate/bhrb-impl.c.inc

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 396b1f1a6c..15326c4d40 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -441,6 +441,7 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR0_PMCjCE PPC_BIT(49) /* MMCR0 PMCj Condition Enabled */
 #define MMCR0_FCPPPC_BIT(34) /* Freeze Counters/BHRB if PR=1 */
 #define MMCR0_FCPC   PPC_BIT(51) /* Condition for FCP bit */
+#define MMCR0_BHRBA_NR PPC_BIT_NR(42)/* BHRB Available */
 /* MMCR0 userspace r/w mask */
 #define MMCR0_UREG_MASK (MMCR0_FC | MMCR0_PMAO | MMCR0_PMAE)
 /* MMCR2 userspace r/w mask */
@@ -540,6 +541,7 @@ FIELD(MSR, LE, MSR_LE, 1)
 
 /* HFSCR bits */
 #define HFSCR_MSGP PPC_BIT(53) /* Privileged Message Send Facilities */
+#define HFSCR_BHRB PPC_BIT(59) /* BHRB Instructions */
 #define HFSCR_IC_MSGP  0xA
 
 #define DBCR0_ICMP (1 << 27)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 3df360efe9..a62d32d786 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -820,3 +820,7 @@ DEF_HELPER_4(DSCLIQ, void, env, fprp, fprp, i32)
 
 DEF_HELPER_1(tbegin, void, env)
 DEF_HELPER_FLAGS_1(fixup_thrm, TCG_CALL_NO_RWG, void, env)
+
+DEF_HELPER_1(clrbhrb, void, env)
+DEF_HELPER_FLAGS_2(mfbhrbe, TCG_CALL_NO_WG, i64, env, i32)
+
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 4fcf3af8d0..00d3ddda02 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -972,3 +972,11 @@ MSGSND  01 - - . 0011001110 -   
@X_rb
 MSGCLRP 01 - - . 0010101110 -   @X_rb
 MSGSNDP 01 - - . 0010001110 -   @X_rb
 MSGSYNC 01 - - - 1101110110 -
+
+# Branch History Rolling Buffer (BHRB) Instructions
+
+_bhrbe  rt bhrbe
+@XFX_bhrbe  .. rt:5 bhrbe:10 .. -   _bhrbe
+
+MFBHRBE 01 . . . 0100101110 -   @XFX_bhrbe
+CLRBHRB 01 - - - 0110101110 -
diff --git a/target/ppc/misc_helper.c b/target/ppc/misc_helper.c
index a05bdf78c9..c923766f0e 100644
--- a/target/ppc/misc_helper.c
+++ b/target/ppc/misc_helper.c
@@ -139,6 +139,17 @@ void helper_fscr_facility_check(CPUPPCState *env, uint32_t 
bit,
 #endif
 }
 
+static void helper_mmcr0_facility_check(CPUPPCState *env, uint32_t bit,
+ uint32_t sprn, uint32_t cause)
+{
+#ifdef TARGET_PPC64
+if (FIELD_EX64(env->msr, MSR, PR) &&
+!(env->spr[SPR_POWER_MMCR0] & (1ULL << bit))) {
+raise_fu_exception(env, bit, sprn, cause, GETPC());
+}
+#endif
+}
+
 void helper_msr_facility_check(CPUPPCState *env, uint32_t bit,
uint32_t sprn, uint32_t cause)
 {
@@ -366,3 +377,35 @@ void helper_fixup_thrm(CPUPPCState *env)
 env->spr[i] = v;
 }
 }
+
+void helper_clrbhrb(CPUPPCState *env)
+{
+helper_hfscr_facility_check(env, HFSCR_BHRB, "clrbhrb", FSCR_IC_BHRB);
+
+helper_mmcr0_facility_check(env, MMCR0_BHRBA_NR, 0, FSCR_IC_BHRB);
+
+memset(env->bhrb, 0, sizeof(env->bhrb));
+}
+
+uint64_t helper_mfbhrbe(CPUPPCState *env, uint32_t bhrbe)
+{
+unsigned int index;
+
+helper_hfscr_facility_check(env, HFSCR_BHRB, "mfbhrbe", FSCR_IC_BHRB);
+
+helper_mmcr0_facility_check(env, MMCR0_BHRBA_NR, 0, FSCR_IC_BHRB);
+
+if ((bhrbe >= env->bhrb_num_entries) ||
+(env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE)) {
+return 0;
+}
+
+/*
+ * Note: bhrb_offset is the byte offset for writing the
+ * next entry (over the oldest entry), which is why we
+ * must offset bhrbe by 1 to get to the 0th entry.
+ */
+index = ((env->bhrb_offset / sizeof(uint64_t)) - (bhrbe + 1)) %
+env->bhrb_num_entries;
+return env->bhrb[index];
+}
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 5f0c79923f..68a8395a23 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -6505,6 +6505,8 @@ static bool resolve_PLS_D(DisasContext *ctx, arg_D *d, 
arg_PLS_D *a)
 
 #include "translate/storage-ctrl-impl.c.inc"
 
+#include "translate/bhrb-impl.c.inc"
+
 /* Handles lfdp */
 static

Re: [PATCH 4/4] target/ppc: Add migration support for BHRB

2023-09-19 Thread Glenn Miles


On 2023-09-14 20:20, Nicholas Piggin wrote:

On Wed Sep 13, 2023 at 6:25 AM AEST, Glenn Miles wrote:

Adds migration support for Branch History Rolling
Buffer (BHRB) internal state.

Signed-off-by: Glenn Miles 
---
 target/ppc/machine.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/target/ppc/machine.c b/target/ppc/machine.c
index b195fb4dc8..89146969c8 100644
--- a/target/ppc/machine.c
+++ b/target/ppc/machine.c
@@ -314,6 +314,7 @@ static int cpu_post_load(void *opaque, int 
version_id)


 if (tcg_enabled()) {
 pmu_mmcr01a_updated(env);
+hreg_bhrb_filter_update(env);
 }

 return 0;
@@ -670,6 +671,27 @@ static const VMStateDescription vmstate_compat = 
{

 }
 };

+#ifdef TARGET_PPC64
+static bool bhrb_needed(void *opaque)
+{
+PowerPCCPU *cpu = opaque;
+return (cpu->env.flags & POWERPC_FLAG_BHRB) != 0;
+}
+
+static const VMStateDescription vmstate_bhrb = {
+.name = "cpu/bhrb",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = bhrb_needed,
+.fields = (VMStateField[]) {
+VMSTATE_UINTTL(env.bhrb_num_entries, PowerPCCPU),


Maybe don't need bhrb_num_entries since target machine should have the
same?



Removed.


+VMSTATE_UINTTL(env.bhrb_offset, PowerPCCPU),
+VMSTATE_UINT64_ARRAY(env.bhrb, PowerPCCPU, 
BHRB_MAX_NUM_ENTRIES),


Is it possible to migrate only bhrb_num_entries items? Wants a VARRAY
AFAIKS but there is no VARRAY_UINT64?

Since all sizes are the same 32 now, would it be possible to turn it
into a VARRAY sometime later if supposing a new CPU changed to a
different size, and would the wire format for the VARRAY still be
compatible with this fixed size array, or does a VARRAY look different
I wonder?



I looked into this some more.  It turns out that the UINT32 in 
VARRAY_UINT32
is referring to the size of the field that holds the number of entries 
in
the array, not the size of the array elements.  So, it is possible to do 
this
with the VARRAY_UINT32 type.  I would need to change the type for 
bhrb_num_entries
to a uint32_t and also, since VARRAY_UINT32 requires the array field to 
be a
pointer to an array, I would need to store the address of the array in 
another

field.



Thanks,
Nick


Thank you for taking the time to review my code!

Glenn

[PATCH v2 2/4] target/ppc: Add recording of taken branches to BHRB

2023-09-19 Thread Glenn Miles

This commit continues adding support for the Branch History
Rolling Buffer (BHRB) as is provided starting with the P8
processor and continuing with its successors.  This commit
is limited to the recording and filtering of taken branches.

The following changes were made:

  - Added a BHRB buffer for storing branch instruction and
target addresses for taken branches
  - Renamed gen_update_cfar to gen_update_branch_history and
added a 'target' parameter to hold the branch target
address and 'inst_type' parameter to use for filtering
  - Added TCG code to gen_update_branch_history that stores
data to the BHRB and updates the BHRB offset.
  - Added BHRB resource initialization and reset functions
  - Enabled functionality for P8, P9 and P10 processors.

Signed-off-by: Glenn Miles 
---
 target/ppc/cpu.h   | 17 +
 target/ppc/cpu_init.c  | 41 ++-
 target/ppc/power8-pmu.c| 33 +
 target/ppc/power8-pmu.h|  7 ++
 target/ppc/translate.c | 97 --
 target/ppc/translate/branch-impl.c.inc |  2 +-
 6 files changed, 187 insertions(+), 10 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 55985fb84f..396b1f1a6c 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -454,6 +454,8 @@ FIELD(MSR, LE, MSR_LE, 1)
  MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0)
 
 #define MMCRA_BHRBRDPPC_BIT(26) /* BHRB Recording Disable */
+#define MMCRA_IFM_MASK  PPC_BITMASK(32, 33) /* BHRB Instruction Filtering */
+#define MMCRA_IFM_SHIFT PPC_BIT_NR(33)
 
 #define MMCR1_EVT_SIZE 8
 /* extract64() does a right shift before extracting */
@@ -680,6 +682,8 @@ enum {
 POWERPC_FLAG_SMT  = 0x0040,
 /* Using "LPAR per core" mode  (as opposed to per-thread)*/
 POWERPC_FLAG_SMT_1LPAR = 0x0080,
+/* Has BHRB */
+POWERPC_FLAG_BHRB  = 0x0100,
 };
 
 /*
@@ -1106,6 +1110,9 @@ DEXCR_ASPECT(PHIE, 6)
 #define PPC_CPU_OPCODES_LEN  0x40
 #define PPC_CPU_INDIRECT_OPCODES_LEN 0x20
 
+#define BHRB_MAX_NUM_ENTRIES_LOG2 (5)
+#define BHRB_MAX_NUM_ENTRIES  (1 << BHRB_MAX_NUM_ENTRIES_LOG2)
+
 struct CPUArchState {
 /* Most commonly used resources during translated code execution first */
 target_ulong gpr[32];  /* general purpose registers */
@@ -1196,6 +1203,16 @@ struct CPUArchState {
 int dcache_line_size;
 int icache_line_size;
 
+#ifdef TARGET_PPC64
+/* Branch History Rolling Buffer (BHRB) resources */
+target_ulong bhrb_num_entries;
+target_ulong bhrb_base;
+target_ulong bhrb_filter;
+target_ulong bhrb_offset;
+target_ulong bhrb_offset_mask;
+uint64_t bhrb[BHRB_MAX_NUM_ENTRIES];
+#endif
+
 /* These resources are used during exception processing */
 /* CPU model definition */
 target_ulong msr_mask;
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 8c81a75416..09d9faf942 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6110,6 +6110,28 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void *data)
 pcc->l1_icache_size = 0x8000;
 }
 
+static void bhrb_init_state(CPUPPCState *env, target_long num_entries_log2)
+{
+if (env->flags & POWERPC_FLAG_BHRB) {
+if (num_entries_log2 > BHRB_MAX_NUM_ENTRIES_LOG2) {
+num_entries_log2 = BHRB_MAX_NUM_ENTRIES_LOG2;
+}
+env->bhrb_num_entries = 1 << num_entries_log2;
+env->bhrb_base = (target_long)>bhrb[0];
+env->bhrb_offset_mask = (env->bhrb_num_entries * sizeof(uint64_t)) - 1;
+}
+}
+
+static void bhrb_reset_state(CPUPPCState *env)
+{
+if (env->flags & POWERPC_FLAG_BHRB) {
+env->bhrb_offset = 0;
+env->bhrb_filter = 0;
+memset(env->bhrb, 0, sizeof(env->bhrb));
+}
+}
+
+#define POWER8_BHRB_ENTRIES_LOG2 5
 static void init_proc_POWER8(CPUPPCState *env)
 {
 /* Common Registers */
@@ -6151,6 +6173,8 @@ static void init_proc_POWER8(CPUPPCState *env)
 env->dcache_line_size = 128;
 env->icache_line_size = 128;
 
+bhrb_init_state(env, POWER8_BHRB_ENTRIES_LOG2);
+
 /* Allocate hardware IRQ controller */
 init_excp_POWER8(env);
 ppcPOWER7_irq_init(env_archcpu(env));
@@ -6251,7 +6275,8 @@ POWERPC_FAMILY(POWER8)(ObjectClass *oc, void *data)
 pcc->flags = POWERPC_FLAG_VRE | POWERPC_FLAG_SE |
  POWERPC_FLAG_BE | POWERPC_FLAG_PMM |
  POWERPC_FLAG_BUS_CLK | POWERPC_FLAG_CFAR |
- POWERPC_FLAG_VSX | POWERPC_FLAG_TM;
+ POWERPC_FLAG_VSX | POWERPC_FLAG_TM |
+ POWERPC_FLAG_BHRB;
 pcc->l1_dcache_size = 0x8000;
 pcc->l1_icache_size = 0x8000;
 }
@@ -6275,6 +6300,7 @@ static struct ppc_radix_page_info POWER9_radix_page_info 
= {
 };
 #endif /* CONFIG_USER_ONLY */
 
+#define POWER9_BHRB_ENTRIES_LOG2 5
 static void init_proc_POWER9(CPUPPCState *env)
 {
 /* Common Registers */
@@ -6325,6 +6351,8 @@ static void

[PATCH v2 0/4] Add BHRB Facility Support

2023-09-19 Thread Glenn Miles

This is a series of patches for adding support for the Branch History
Rolling Buffer (BHRB) facility.  This was added to the Power ISA
starting with version 2.07.  Changes were subsequently made in version
3.1 to limit BHRB recording to instructions run in problem state only
and to add a control bit to disable recording (MMCRA[BHRBRD]).

Glenn Miles (4):
  target/ppc: Add new hflags to support BHRB
  target/ppc: Add recording of taken branches to BHRB
  target/ppc: Add clrbhrb and mfbhrbe instructions
  target/ppc: Add migration support for BHRB

 target/ppc/cpu.h   |  24 ++
 target/ppc/cpu_init.c  |  45 +--
 target/ppc/helper.h|   5 ++
 target/ppc/helper_regs.c   |  35 +
 target/ppc/insn32.decode   |   8 ++
 target/ppc/machine.c   |  23 +-
 target/ppc/misc_helper.c   |  43 +++
 target/ppc/power8-pmu-regs.c.inc   |   5 ++
 target/ppc/power8-pmu.c|  48 +++-
 target/ppc/power8-pmu.h|  11 ++-
 target/ppc/spr_common.h|   1 +
 target/ppc/translate.c | 101 +++--
 target/ppc/translate/bhrb-impl.c.inc   |  43 +++
 target/ppc/translate/branch-impl.c.inc |   2 +-
 14 files changed, 375 insertions(+), 19 deletions(-)
 create mode 100644 target/ppc/translate/bhrb-impl.c.inc

-- 
2.31.1

Re: [QEMU PATCH v5 07/13] softmmu/memory: enable automatic deallocation of memory regions

2023-09-19 Thread Akihiko Odaki


On 2023/09/19 23:21, Xenia Ragiadakou wrote:


On 19/9/23 13:44, Akihiko Odaki wrote:

On 2023/09/19 19:28, Xenia Ragiadakou wrote:


On 15/9/23 18:11, Akihiko Odaki wrote:

On 2023/09/15 20:11, Huang Rui wrote:

From: Xenia Ragiadakou 

When the memory region has a different life-cycle from that of its parent,
it could be automatically released, once it has been unparented and once
all of its references have gone away, via the object's free callback.

However, currently, references to the memory region are held by its 
owner

without first incrementing the memory region object's reference count.
As a result, the automatic deallocation of the object, not taking into
account those references, results in use-after-free memory corruption.

This patch increases the reference count of an owned memory region 
object
on each memory_region_ref() and decreases it on each 
memory_region_unref().


Signed-off-by: Xenia Ragiadakou 
Signed-off-by: Huang Rui 
---

V4 -> V5:
 - ref/unref only owned memory regions (Akihiko)

  softmmu/memory.c | 5 +
  1 file changed, 5 insertions(+)

diff --git a/softmmu/memory.c b/softmmu/memory.c
index 7d9494ce70..15e1699750 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -1800,6 +1800,9 @@ void memory_region_ref(MemoryRegion *mr)
  /* MMIO callbacks most likely will access data that belongs
   * to the owner, hence the need to ref/unref the owner whenever
   * the memory region is in use.
+ * Likewise, the owner keeps references to the memory region,
+ * hence the need to ref/unref the memory region object to 
prevent
+ * its automatic deallocation while still referenced by its 
owner.


This comment does not make sense. Traditionally no such automatic 
deallocation happens so the owner has been always required to free 
the memory region when it gets finalized.


"[QEMU PATCH v5 09/13] virtio-gpu: Handle resource blob commands" 
introduces a different kind of memory region, which can be freed 
anytime before the device gets finalized. Even in this case, the 
owner removes the reference to the memory owner by doing res->region 
= NULL;


Hi Akihiko,

You are right, the word "owner" is not correct. The issue observed 
was due to the references kept in flatview ranges and the fact that 
flatview_destroy() is asynchronous and was called after memory 
region's destruction.


If I replace the word "owner" with "memory subsystem" in the commit 
message and drop the comment, would that be ok with you? Or do you want 
to suggest something else?


This will extend the lifetime of the memory region, but the underlying 
memory is still synchronously freed. Can you show that the flatview 
range will not be used to read the freed memory?


Yes, the intention of this patch is to delay the mr object finalization 
until all memory_region_unref() calls on this mr have taken place.


What do you mean by "the underlying memory is still synchronously freed"?



A pointer is passed to memory_region_init_ram_ptr() with the ptr 
parameter when initializing the memory region and the memory region 
keeps the pointer.


In virtio_gpu_virgl_resource_unmap(), the memory pointed with the 
pointer is unmapped with virgl_renderer_resource_unmap() and makes the 
pointer kept by the memory region dangling though the lifetime of the 
memory region is extended with this patch. Can you show that the 
dangling pointer the memory region has will never be referenced?

Re: [PATCH v11 6/9] gfxstream + rutabaga: add initial support for gfxstream

2023-09-19 Thread Akihiko Odaki


On 2023/09/20 3:36, Bernhard Beschow wrote:



Am 15. September 2023 02:38:02 UTC schrieb Gurchetan Singh 
:

On Thu, Sep 14, 2023 at 12:23 AM Bernhard Beschow  wrote:




Am 14. September 2023 04:38:51 UTC schrieb Gurchetan Singh <
gurchetansi...@chromium.org>:

On Wed, Sep 13, 2023 at 4:58 AM Bernhard Beschow 

wrote:





Am 23. August 2023 01:25:38 UTC schrieb Gurchetan Singh <
gurchetansi...@chromium.org>:

This adds initial support for gfxstream and cross-domain.  Both
features rely on virtio-gpu blob resources and context types, which
are also implemented in this patch.

gfxstream has a long and illustrious history in Android graphics
paravirtualization.  It has been powering graphics in the Android
Studio Emulator for more than a decade, which is the main developer
platform.

Originally conceived by Jesse Hall, it was first known as "EmuGL" [a].
The key design characteristic was a 1:1 threading model and
auto-generation, which fit nicely with the OpenGLES spec.  It also
allowed easy layering with ANGLE on the host, which provides the GLES
implementations on Windows or MacOS environments.

gfxstream has traditionally been maintained by a single engineer, and
between 2015 to 2021, the goldfish throne passed to Frank Yang.
Historians often remark this glorious reign ("pax gfxstreama" is the
academic term) was comparable to that of Augustus and both Queen
Elizabeths.  Just to name a few accomplishments in a resplendent
panoply: higher versions of GLES, address space graphics, snapshot
support and CTS compliant Vulkan [b].

One major drawback was the use of out-of-tree goldfish drivers.
Android engineers didn't know much about DRM/KMS and especially TTM so
a simple guest to host pipe was conceived.

Luckily, virtio-gpu 3D started to emerge in 2016 due to the work of
the Mesa/virglrenderer communities.  In 2018, the initial virtio-gpu
port of gfxstream was done by Cuttlefish enthusiast Alistair Delva.
It was a symbol compatible replacement of virglrenderer [c] and named
"AVDVirglrenderer".  This implementation forms the basis of the
current gfxstream host implementation still in use today.

cross-domain support follows a similar arc.  Originally conceived by
Wayland aficionado David Reveman and crosvm enjoyer Zach Reizner in
2018, it initially relied on the downstream "virtio-wl" device.

In 2020 and 2021, virtio-gpu was extended to include blob resources
and multiple timelines by yours truly, features gfxstream/cross-domain
both require to function correctly.

Right now, we stand at the precipice of a truly fantastic possibility:
the Android Emulator powered by upstream QEMU and upstream Linux
kernel.  gfxstream will then be packaged properly, and app
developers can even fix gfxstream bugs on their own if they encounter
them.

It's been quite the ride, my friends.  Where will gfxstream head next,
nobody really knows.  I wouldn't be surprised if it's around for
another decade, maintained by a new generation of Android graphics
enthusiasts.

Technical details:
  - Very simple initial display integration: just used Pixman
  - Largely, 1:1 mapping of virtio-gpu hypercalls to rutabaga function
calls

Next steps for Android VMs:
  - The next step would be improving display integration and UI

interfaces

with the goal of the QEMU upstream graphics being in an emulator
release [d].

Next steps for Linux VMs for display virtualization:
  - For widespread distribution, someone needs to package Sommelier or

the

wayland-proxy-virtwl [e] ideally into Debian main. In addition,

newer

versions of the Linux kernel come with DRM_VIRTIO_GPU_KMS option,
which allows disabling KMS hypercalls.  If anyone cares enough,

it'll

probably be possible to build a custom VM variant that uses this

display

virtualization strategy.

[a]

https://android-review.googlesource.com/c/platform/development/+/34470

[b]



https://android-review.googlesource.com/q/topic:%22vulkan-hostconnection-start%22

[c]



https://android-review.googlesource.com/c/device/generic/goldfish-opengl/+/761927

[d] https://developer.android.com/studio/releases/emulator
[e] https://github.com/talex5/wayland-proxy-virtwl

Signed-off-by: Gurchetan Singh 
Tested-by: Alyssa Ross 
Tested-by: Emmanouil Pitsidianakis 
Reviewed-by: Emmanouil Pitsidianakis 
---
v1: Incorporated various suggestions by Akihiko Odaki and Bernard

Berschow

- Removed GET_VIRTIO_GPU_GL / GET_RUTABAGA macros
- Used error_report(..)
- Used g_autofree to fix leaks on error paths
- Removed unnecessary casts
- added virtio-gpu-pci-rutabaga.c + virtio-vga-rutabaga.c files

v2: Incorporated various suggestions by Akihiko Odaki, Marc-André Lureau

and

Bernard Berschow:
- Parenthesis in CHECK macro
- CHECK_RESULT(result, ..) --> CHECK(!result, ..)
- delay until g->parent_obj.enable = 1
- Additional cast fixes
- initialize directly in virtio_gpu_rutabaga_realize(..)
- add debug callback to hook into QEMU error's

Re: [PATCH 3/4] target/ppc: Add clrbhrb and mfbhrbe instructions

2023-09-19 Thread Glenn Miles


On 2023-09-14 20:13, Nicholas Piggin wrote:

On Wed Sep 13, 2023 at 6:24 AM AEST, Glenn Miles wrote:

Add support for the clrbhrb and mfbhrbe instructions.

Since neither instruction is believed to be critical to
performance, both instructions were implemented using helper
functions.

Access to both instructions is controlled by bits in the
HFSCR (for privileged state) and MMCR0 (for problem state).
A new function, helper_mmcr0_facility_check, was added for
checking MMCR0[BHRBA] and raising a facility_unavailable exception
if required.

Signed-off-by: Glenn Miles 
---
 target/ppc/cpu.h |  1 +
 target/ppc/helper.h  |  4 
 target/ppc/misc_helper.c | 43 


 target/ppc/translate.c   | 13 
 4 files changed, 61 insertions(+)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index bda1afb700..ee81ede4ee 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -541,6 +541,7 @@ FIELD(MSR, LE, MSR_LE, 1)

 /* HFSCR bits */
 #define HFSCR_MSGP PPC_BIT(53) /* Privileged Message Send 
Facilities */

+#define HFSCR_BHRB PPC_BIT(59) /* BHRB Instructions */
 #define HFSCR_IC_MSGP  0xA

 #define DBCR0_ICMP (1 << 27)
diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 1a3d9a7e57..bbc32ff114 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -816,3 +816,7 @@ DEF_HELPER_4(DSCLIQ, void, env, fprp, fprp, i32)

 DEF_HELPER_1(tbegin, void, env)
 DEF_HELPER_FLAGS_1(fixup_thrm, TCG_CALL_NO_RWG, void, env)
+
+DEF_HELPER_1(clrbhrb, void, env)
+DEF_HELPER_FLAGS_2(mfbhrbe, TCG_CALL_NO_WG, i64, env, i32)
+
diff --git a/target/ppc/misc_helper.c b/target/ppc/misc_helper.c
index 692d058665..45abe04f66 100644
--- a/target/ppc/misc_helper.c
+++ b/target/ppc/misc_helper.c
@@ -139,6 +139,17 @@ void helper_fscr_facility_check(CPUPPCState *env, 
uint32_t bit,

 #endif
 }

+static void helper_mmcr0_facility_check(CPUPPCState *env, uint32_t 
bit,

+ uint32_t sprn, uint32_t cause)
+{
+#ifdef TARGET_PPC64
+if (FIELD_EX64(env->msr, MSR, PR) &&
+!(env->spr[SPR_POWER_MMCR0] & (1ULL << bit))) {
+raise_fu_exception(env, bit, sprn, cause, GETPC());
+}
+#endif
+}
+
 void helper_msr_facility_check(CPUPPCState *env, uint32_t bit,
uint32_t sprn, uint32_t cause)
 {
@@ -351,3 +362,35 @@ void helper_fixup_thrm(CPUPPCState *env)
 env->spr[i] = v;
 }
 }
+
+void helper_clrbhrb(CPUPPCState *env)
+{
+helper_hfscr_facility_check(env, HFSCR_BHRB, "clrbhrb", 
FSCR_IC_BHRB);

+
+helper_mmcr0_facility_check(env, MMCR0_BHRBA, 0, FSCR_IC_BHRB);


Repeating the comment about MMCR0_BHRBA and PPC_BIT_NR discrepancy here
for posterity.



Added NR suffix.


+
+memset(env->bhrb, 0, sizeof(env->bhrb));
+}
+
+uint64_t helper_mfbhrbe(CPUPPCState *env, uint32_t bhrbe)
+{
+unsigned int index;
+
+helper_hfscr_facility_check(env, HFSCR_BHRB, "mfbhrbe", 
FSCR_IC_BHRB);

+
+helper_mmcr0_facility_check(env, MMCR0_BHRBA, 0, FSCR_IC_BHRB);
+
+if ((bhrbe >= env->bhrb_num_entries) ||
+   (env->spr[SPR_POWER_MMCR0] & MMCR0_PMAE)) {


Nitpick, but a multi-line statement starts again inside the first
parenthesis after a keyword like this.



Fixed.


+return 0;
+}
+
+/*
+ * Note: bhrb_offset is the byte offset for writing the
+ * next entry (over the oldest entry), which is why we
+ * must offset bhrbe by 1 to get to the 0th entry.
+ */
+index = ((env->bhrb_offset / sizeof(uint64_t)) - (bhrbe + 1)) %
+env->bhrb_num_entries;
+return env->bhrb[index];
+}
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 7824475f54..b330871793 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -6549,12 +6549,25 @@ static void gen_brh(DisasContext *ctx)
 }
 #endif

+static void gen_clrbhrb(DisasContext *ctx)
+{
+gen_helper_clrbhrb(cpu_env);
+}
+
+static void gen_mfbhrbe(DisasContext *ctx)
+{
+TCGv_i32 bhrbe = tcg_constant_i32(_SPR(ctx->opcode));
+gen_helper_mfbhrbe(cpu_gpr[rD(ctx->opcode)], cpu_env, bhrbe);
+}
+
 static opcode_t opcodes[] = {
 #if defined(TARGET_PPC64)
 GEN_HANDLER_E(brd, 0x1F, 0x1B, 0x05, 0xF801, PPC_NONE, 
PPC2_ISA310),
 GEN_HANDLER_E(brw, 0x1F, 0x1B, 0x04, 0xF801, PPC_NONE, 
PPC2_ISA310),
 GEN_HANDLER_E(brh, 0x1F, 0x1B, 0x06, 0xF801, PPC_NONE, 
PPC2_ISA310),

 #endif
+GEN_HANDLER_E(clrbhrb, 0x1F, 0x0E, 0x0D, 0x3FFF801, PPC_NONE, 
PPC2_ISA207S),
+GEN_HANDLER_E(mfbhrbe, 0x1F, 0x0E, 0x09, 0x001, PPC_NONE, 
PPC2_ISA207S),


How much of a pain would it be to add it as decodetree? If there is an
addition to a family of existing instructions here it makes sense to add it
here; for a new family it would be nice to use decodetree.

I think they're only supported in 64-bit ISA so it could be ifdef
TARGET_PPC64.



Ok, switched to using decodetree.


Thanks,
Nick



Thanks for the review!

Glenn


 GEN_HANDLER(invalid, 0x00, 0x00, 0x00, 0x, PPC_NONE),
 #if

Re: [PATCH 2/4] target/ppc: Add recording of taken branches to BHRB

2023-09-19 Thread Glenn Miles


On 2023-09-14 20:02, Nicholas Piggin wrote:

On Wed Sep 13, 2023 at 6:24 AM AEST, Glenn Miles wrote:

This commit continues adding support for the Branch History
Rolling Buffer (BHRB) as is provided starting with the P8
processor and continuing with its successors.  This commit
is limited to the recording and filtering of taken branches.

The following changes were made:

  - Added a BHRB buffer for storing branch instruction and
target addresses for taken branches
  - Renamed gen_update_cfar to gen_update_branch_history and
added a 'target' parameter to hold the branch target
address and 'inst_type' parameter to use for filtering
  - Added a combination of jit-time and run-time checks to
gen_update_branch_history for determining if a branch
should be recorded
  - Added TCG code to gen_update_branch_history that stores
data to the BHRB and updates the BHRB offset.
  - Added BHRB resource initialization and reset functions
  - Enabled functionality for P8, P9 and P10 processors.

Signed-off-by: Glenn Miles 
---
 target/ppc/cpu.h   |  18 +++-
 target/ppc/cpu_init.c  |  41 -
 target/ppc/helper_regs.c   |  32 +++
 target/ppc/helper_regs.h   |   1 +
 target/ppc/power8-pmu.c|   2 +
 target/ppc/power8-pmu.h|   7 ++
 target/ppc/translate.c | 114 
+++--

 target/ppc/translate/branch-impl.c.inc |   2 +-
 8 files changed, 205 insertions(+), 12 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 20ae1466a5..bda1afb700 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -454,8 +454,9 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR2_UREG_MASK (MMCR2_FC1P0 | MMCR2_FC2P0 | MMCR2_FC3P0 | \
  MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0)

-#define MMCRA_BHRBRDPPC_BIT(26)/* BHRB Recording 
Disable */

-
+#define MMCRA_BHRBRDPPC_BIT(26) /* BHRB Recording Disable 
*/


Fold this tidying into patch 1.


Done.



+#define MMCRA_IFM_MASK  PPC_BITMASK(32, 33) /* BHRB Instruction 
Filtering */

+#define MMCRA_IFM_SHIFT PPC_BIT_NR(33)

 #define MMCR1_EVT_SIZE 8
 /* extract64() does a right shift before extracting */
@@ -682,6 +683,8 @@ enum {
 POWERPC_FLAG_SMT  = 0x0040,
 /* Using "LPAR per core" mode  (as opposed to per-thread) 
   */

 POWERPC_FLAG_SMT_1LPAR = 0x0080,
+/* Has BHRB */
+POWERPC_FLAG_BHRB  = 0x0100,
 };


Interesting question of which patch to add different flags. I'm
strongly in favor of adding them when you add the code that uses them,
like this one, but it's a matter of taste and not always practical to be an
absolute rule. I don't mind too much what others do, but maybe
this and the pcc->flags init should go in patch 1 since that's adding
flags that aren't yet used?



I think I prefer keeping it in patch 2 since patch 1 is more about
the hflags, which seems unrelated to this flag.



 /*
@@ -1110,6 +1113,9 @@ DEXCR_ASPECT(PHIE, 6)
 #define PPC_CPU_OPCODES_LEN  0x40
 #define PPC_CPU_INDIRECT_OPCODES_LEN 0x20

+#define BHRB_MAX_NUM_ENTRIES_LOG2 (5)
+#define BHRB_MAX_NUM_ENTRIES  (1 << BHRB_MAX_NUM_ENTRIES_LOG2)
+
 struct CPUArchState {
 /* Most commonly used resources during translated code execution 
first */

 target_ulong gpr[32];  /* general purpose registers */
@@ -1196,6 +1202,14 @@ struct CPUArchState {
 int dcache_line_size;
 int icache_line_size;

+/* Branch History Rolling Buffer (BHRB) resources */
+target_ulong bhrb_num_entries;
+target_ulong bhrb_base;
+target_ulong bhrb_filter;
+target_ulong bhrb_offset;
+target_ulong bhrb_offset_mask;
+uint64_t bhrb[BHRB_MAX_NUM_ENTRIES];


Put these under ifdef TARGET_PPC64?



Ok.


+
 /* These resources are used during exception processing */
 /* CPU model definition */
 target_ulong msr_mask;
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 568f9c3b88..19d7505a73 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6100,6 +6100,28 @@ POWERPC_FAMILY(POWER7)(ObjectClass *oc, void 
*data)

 pcc->l1_icache_size = 0x8000;
 }

+static void bhrb_init_state(CPUPPCState *env, target_long 
num_entries_log2)

+{
+if (env->flags & POWERPC_FLAG_BHRB) {
+if (num_entries_log2 > BHRB_MAX_NUM_ENTRIES_LOG2) {
+num_entries_log2 = BHRB_MAX_NUM_ENTRIES_LOG2;
+}
+env->bhrb_num_entries = 1 << num_entries_log2;
+env->bhrb_base = (target_long)>bhrb[0];
+env->bhrb_offset_mask = (env->bhrb_num_entries * 
sizeof(uint64_t)) - 1;

+}
+}
+
+static void bhrb_reset_state(CPUPPCState *env)
+{
+if (env->flags & POWERPC_FLAG_BHRB) {
+env->bhrb_offset = 0;
+env->bhrb_filter = 0;
+memset(env->bhrb, 0, sizeof(env->bhrb));
+}
+}
+
+#define POWER8_BHRB_ENTRIES_LOG2 5
 static void init_proc_POWER8(CPUPPCState *env)
 {
 /* Common Registers */
@@ -6141,6

Re: [PATCH 1/4] target/ppc: Add new hflags to support BHRB

2023-09-19 Thread Glenn Miles


On 2023-09-14 19:39, Nicholas Piggin wrote:

On Wed Sep 13, 2023 at 6:23 AM AEST, Glenn Miles wrote:

This commit is preparatory to the addition of Branch History
Rolling Buffer (BHRB) functionality, which is being provided
today starting with the P8 processor.

BHRB uses several SPR register fields to control whether or not
a branch instruction's address (and sometimes target address)
should be recorded.  Checking each of these fields with each
branch instruction using jitted code would lead to a significant
decrease in performance.

Therefore, it was decided that BHRB configuration bits that are
not expected to change frequently should have their state stored in
hflags so that the amount of checking done by jitted code can
be reduced.

This commit contains the changes for storing the state of the
following register fields as hflags:

MMCR0[FCP] - Determines if BHRB recording is frozen in the
 problem state

MMCR0[FCPC] - A modifier for MMCR0[FCP]

MMCRA[BHRBRD] - Disables all BHRB recording for a thread

Signed-off-by: Glenn Miles 
---
 target/ppc/cpu.h |  9 +
 target/ppc/cpu_init.c|  4 ++--
 target/ppc/helper.h  |  1 +
 target/ppc/helper_regs.c | 12 
 target/ppc/machine.c |  2 +-
 target/ppc/power8-pmu-regs.c.inc |  5 +
 target/ppc/power8-pmu.c  | 15 +++
 target/ppc/power8-pmu.h  |  4 ++--
 target/ppc/spr_common.h  |  1 +
 target/ppc/translate.c   |  6 ++
 10 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index 25fac9577a..20ae1466a5 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -439,6 +439,9 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR0_FC56   PPC_BIT(59) /* PMC Freeze Counters 5-6 
bit */
 #define MMCR0_PMC1CE PPC_BIT(48) /* MMCR0 PMC1 Condition 
Enabled */
 #define MMCR0_PMCjCE PPC_BIT(49) /* MMCR0 PMCj Condition 
Enabled */

+#define MMCR0_BHRBA  PPC_BIT_NR(42)  /* BHRB Available */


It's confusing to use NR for this. Either call it MMCR0_BHRBA_NR or have
the facility check in patch 3 take the bit value. I'd move it to patch 3
too.



Ok, adding NR suffix.

+#define MMCR0_FCPPPC_BIT(34) /* Freeze Counters/BHRB if 
PR=1 */

+#define MMCR0_FCPC   PPC_BIT(51) /* Condition for FCP bit */
 /* MMCR0 userspace r/w mask */
 #define MMCR0_UREG_MASK (MMCR0_FC | MMCR0_PMAO | MMCR0_PMAE)
 /* MMCR2 userspace r/w mask */
@@ -451,6 +454,9 @@ FIELD(MSR, LE, MSR_LE, 1)
 #define MMCR2_UREG_MASK (MMCR2_FC1P0 | MMCR2_FC2P0 | MMCR2_FC3P0 | \
  MMCR2_FC4P0 | MMCR2_FC5P0 | MMCR2_FC6P0)

+#define MMCRA_BHRBRDPPC_BIT(26)/* BHRB Recording 
Disable */

+
+
 #define MMCR1_EVT_SIZE 8
 /* extract64() does a right shift before extracting */
 #define MMCR1_PMC1SEL_START 32
@@ -703,6 +709,9 @@ enum {
 HFLAGS_PMCJCE = 17, /* MMCR0 PMCjCE bit */
 HFLAGS_PMC_OTHER = 18, /* PMC other than PMC5-6 is enabled */
 HFLAGS_INSN_CNT = 19, /* PMU instruction count enabled */
+HFLAGS_FCPC = 20,   /* MMCR0 FCPC bit */
+HFLAGS_FCP = 21,/* MMCR0 FCP bit */
+HFLAGS_BHRBRD = 22, /* MMCRA BHRBRD bit */
 HFLAGS_VSX = 23, /* MSR_VSX if cpu has VSX */
 HFLAGS_VR = 25,  /* MSR_VR if cpu has VRE */


hflags are an interesting tradeoff. You can specialise some code but
at the cost of duplicating your jit footprint, which is often the
most costly thing. The ideal hflag is one where code is not shared
between flag set/clear like PR and HV. Rarely used features is another
good one, that BHRB falls into.

But, we do want flags that carry stronger or more direct semantics
wrt code generation because you want to avoid redundant hflags values
that result in the same code generation. I might have missed something
but AFAIKS BHRB_ENABLED could be a combination of this logic (from
later patch):

+/* ISA 3.1 adds the PMCRA[BRHBRD] and problem state checks */
+if ((ctx->insns_flags2 & PPC2_ISA310) && (ctx->mmcra_bhrbrd || 
!ctx->pr)) {

+return;
+}
+
+/* Check for BHRB "frozen" conditions */
+if (ctx->mmcr0_fcpc) {
+if (ctx->mmcr0_fcp) {
+if ((ctx->hv) && (ctx->pr)) {
+return;
+}
+} else if (!(ctx->hv) && (ctx->pr)) {
+return;
+}
+} else if ((ctx->mmcr0_fcp) && (ctx->pr)) {
+return;
+}



Ok, Combining above logic into a single hflag.


Otherwise the patch looks good to me.

Thanks,
Nick

Re: qemu-riscv32 usermode still broken?

2023-09-19 Thread Andreas K. Huettel

Hi Alistair, 

> It would be great to get a strace of the failure to narrow down what
> it is. From there it should be not too hard to find and fix.

thanks a lot. Here's as much info as I could get with strace mechanisms.

1) What I did, without any tracing

pinacolada ~ # qemu-riscv32 -L /var/lib/machines/riscv32 
/var/lib/machines/riscv32/bin/bash
pinacolada ~ # python
Python 3.11.5 (main, Aug 27 2023, 18:39:05) [GCC 12.3.1 20230623] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>
[1]+  Stopped python
^C^C
pinacolada ~ # ^C
pinacolada ~ # fg
python

pinacolada ~ #
exit

* When I type Ctrl-Z at the python prompt, the terminal hangs.
* With several Ctrl-C I can get back to the riscv32 bash, and then python is 
suspended in the background.

* Now I did this again, first with qemu tracing system calls, then with strace 
tracing qemu
* In both cases, the log starts when I type "python", and ends (with quickly 
repeated output lines) 
  after pressing Ctrl-Z

2) 
pinacolada ~ # QEMU_STRACE=1 qemu-riscv32 -L /var/lib/machines/riscv32 
/var/lib/machines/riscv32/bin/bash
(QEMU_STRACE is getting unset in my bashrc, so no subprocesses are traced)

(...)
2472050 write(2,0xe56c0,58)pinacolada ~ #  = 58
2472050 Unknown syscall 413
2472050 read(0,0x2b2aa29b,1) = 1
2472050 Unknown syscall 413
2472050 write(2,0xe56c0,1)p = 1
2472050 Unknown syscall 413
2472050 read(0,0x2b2aa29b,1) = 1
2472050 Unknown syscall 413
2472050 write(2,0xe56c0,1)y = 1
2472050 Unknown syscall 413
2472050 read(0,0x2b2aa29b,1) = 1
2472050 Unknown syscall 413
2472050 write(2,0xe56c0,1)t = 1
2472050 Unknown syscall 413
2472050 read(0,0x2b2aa29b,1) = 1
2472050 Unknown syscall 413
2472050 write(2,0xe56c0,1)h = 1
2472050 Unknown syscall 413
2472050 read(0,0x2b2aa29b,1) = 1
2472050 Unknown syscall 413
2472050 write(2,0xe56c0,1)o = 1
2472050 Unknown syscall 413
2472050 read(0,0x2b2aa29b,1) = 1
2472050 Unknown syscall 413
2472050 write(2,0xe56c0,1)n = 1
2472050 Unknown syscall 413
2472050 read(0,0x2b2aa29b,1) = 1
2472050 write(2,0xe56c0,1)
 = 1
 = 9050 write(2,0xe56c0,9)
2472050 ioctl(0,TCSETSW,{c_iflag = ICRNL|IXON|IXOFF|IUTF8,c_oflag = 
OPOST|ONLCR,c_cflag = B38400,CS8,CREAD,c_lflag = 
ISIG|ICANON|ECHO|ECHOE|ECHOK|ECHOCTL|ECHOKE|IEXTEN,c_cc = "",c_line = ''}) = 0
2472050 rt_sigaction(SIGINT,0x2b2aa1bc,0x2b2aa244) = 0
2472050 rt_sigaction(SIGHUP,0x2b2aa1bc,0x2b2aa244) = 0
2472050 rt_sigaction(SIGALRM,0x2b2aa1bc,0x2b2aa244) = 0
2472050 rt_sigaction(SIGWINCH,0x2b2aa1bc,0x2b2aa244) = 0
2472050 rt_sigaction(SIGINT,0x2b2aa14c,0x2b2aa1d4) = 0
2472050 clock_gettime64(CLOCK_REALTIME_COARSE,0x2b2aa268) = 0 
({tv_sec=1695154794,tv_nsec=760883171})
2472050 
statx(AT_FDCWD,".",AT_NO_AUTOMOUNT|AT_STATX_SYNC_AS_STAT,STATX_BASIC_STATS,0x2b2aaa78)
 = 0
2472050 
statx(AT_FDCWD,"/usr/local/sbin/python",AT_NO_AUTOMOUNT|AT_STATX_SYNC_AS_STAT,STATX_BASIC_STATS,0x2b2aa998)
 = -1 errno=2 (No such file or directory)
2472050 
statx(AT_FDCWD,"/usr/local/bin/python",AT_NO_AUTOMOUNT|AT_STATX_SYNC_AS_STAT,STATX_BASIC_STATS,0x2b2aa998)
 = -1 errno=2 (No such file or directory)
2472050 
statx(AT_FDCWD,"/usr/sbin/python",AT_NO_AUTOMOUNT|AT_STATX_SYNC_AS_STAT,STATX_BASIC_STATS,0x2b2aa998)
 = -1 errno=2 (No such file or directory)
2472050 
statx(AT_FDCWD,"/usr/bin/python",AT_NO_AUTOMOUNT|AT_STATX_SYNC_AS_STAT,STATX_BASIC_STATS,0x2b2aa998)
 = 0
2472050 
statx(AT_FDCWD,"/usr/bin/python",AT_NO_AUTOMOUNT|AT_STATX_SYNC_AS_STAT,STATX_BASIC_STATS,0x2b2aa8e8)
 = 0
2472050 geteuid() = 0
2472050 getegid() = 0
2472050 getuid() = 0
2472050 getgid() = 0
2472050 
faccessat(AT_FDCWD,"/usr/bin/python",X_OK,AT_SYMLINK_NOFOLLOW|0x1da42089) = 0
2472050 
statx(AT_FDCWD,"/usr/bin/python",AT_NO_AUTOMOUNT|AT_STATX_SYNC_AS_STAT,STATX_BASIC_STATS,0x2b2aa8e8)
 = 0
2472050 geteuid() = 0
2472050 getegid() = 0
2472050 getuid() = 0
2472050 getgid() = 0
2472050 
faccessat(AT_FDCWD,"/usr/bin/python",R_OK,AT_SYMLINK_NOFOLLOW|0x1da42089) = 0
2472050 rt_sigprocmask(SIG_BLOCK,NULL,0x2b2aabec,8) = 0
2472050 rt_sigprocmask(SIG_BLOCK,0x2b2aaaec,0x2b2aab6c,8) = 0
2472050 rt_sigaction(SIGTERM,0x2b2aa85c,0x2b2aa8e4) = 0
2472050 rt_sigprocmask(SIG_BLOCK,0x2b2aa98c,0x2b2aaa0c,8) = 0
2472050 rt_sigprocmask(SIG_SETMASK,0x2b2aaa0c,NULL,8) = 0
2472050 pipe2(0x5560d3f4,0) = 0
2472050 
clone(CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|0x11,child_stack=0x,parent_tidptr=0x,tls=0x,child_tidptr=0x2b2d20c8)
 = 2472055
2472050 rt_sigaction(SIGTERM,0x2b2aa85c,0x2b2aa8e4) = 0
 = 0
2472050 setpgid(2472055,2472055) = 0
2472055 set_robust_list(0x2b2d20cc,12) = 2472050 
rt_sigprocmask(SIG_SETMASK,0x2b2aab6c,-1 errno=38 (Function not 
implemented)NULL,
8) = 0
2472055 getpid() = 2472055
2472055 rt_sigprocmask(SIG_SETMASK,0x55616e24,NULL,8) = 0
2472050 rt_sigprocmask(SIG_BLOCK,0x2b2c,0x2b2aab2c,8) = 0
2472050 close(3) = 0
2472050 close(4) = 0
2472055 rt_sigaction(SIGTSTP,0x2b2aa84c,0x2b2aa8d4) = 0
2472055 rt_sigaction(SIGTTIN,0x2b2aa84c,0x2b2aa8d4) = 0
2472055

Re: [PATCH v3 13/32] target/riscv: Use generic helper to show CPU model names

2023-09-19 Thread Daniel Henrique Barboza





On 9/6/23 21:35, Gavin Shan wrote:

For target/riscv, the CPU type name is always the combination of the
CPU model name and suffix. The CPU model names have been correctly
shown in riscv_cpu_list_entry() and riscv_cpu_add_definition()

Use generic helper cpu_model_from_type() to show the CPU model names
in the above two functions, and adjusted format of the output from
riscv_cpu_list_entry() to match with other targets. Besides, the
function riscv_cpu_class_by_name() is improved by renaming @cpuname
to @model since it's for the CPU model name, and merging the condtion


typo: "condition"


of "@oc == NULL" to object_class_dynamic_cast().

Signed-off-by: Gavin Shan 
---



Tested with "-cpu help" and "query-cpu-definitions". LGTM


Reviewed-by: Daniel Henrique Barboza 



  target/riscv/cpu.c| 23 +--
  target/riscv/riscv-qmp-cmds.c |  3 +--
  2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 6b93b04453..a525e24c5a 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -612,18 +612,19 @@ static ObjectClass *riscv_cpu_class_by_name(const char 
*cpu_model)
  {
  ObjectClass *oc;
  char *typename;
-char **cpuname;
+char **model;
  
-cpuname = g_strsplit(cpu_model, ",", 1);

-typename = g_strdup_printf(RISCV_CPU_TYPE_NAME("%s"), cpuname[0]);
+model = g_strsplit(cpu_model, ",", 1);
+typename = g_strdup_printf(RISCV_CPU_TYPE_NAME("%s"), model[0]);
  oc = object_class_by_name(typename);
-g_strfreev(cpuname);
+g_strfreev(model);
  g_free(typename);
-if (!oc || !object_class_dynamic_cast(oc, TYPE_RISCV_CPU) ||
-object_class_is_abstract(oc)) {
-return NULL;
+if (object_class_dynamic_cast(oc, TYPE_RISCV_CPU) &&
+!object_class_is_abstract(oc)) {
+return oc;
  }
-return oc;
+
+return NULL;
  }
  
  static void riscv_cpu_dump_state(CPUState *cs, FILE *f, int flags)

@@ -2211,9 +2212,10 @@ static gint riscv_cpu_list_compare(gconstpointer a, 
gconstpointer b)
  static void riscv_cpu_list_entry(gpointer data, gpointer user_data)
  {
  const char *typename = object_class_get_name(OBJECT_CLASS(data));
-int len = strlen(typename) - strlen(RISCV_CPU_TYPE_SUFFIX);
+char *model = cpu_model_from_type(typename);
  
-qemu_printf("%.*s\n", len, typename);

+qemu_printf("  %s\n", model);
+g_free(model);
  }
  
  void riscv_cpu_list(void)

@@ -,6 +2224,7 @@ void riscv_cpu_list(void)
  
  list = object_class_get_list(TYPE_RISCV_CPU, false);

  list = g_slist_sort(list, riscv_cpu_list_compare);
+qemu_printf("Available CPUs:\n");
  g_slist_foreach(list, riscv_cpu_list_entry, NULL);
  g_slist_free(list);
  }
diff --git a/target/riscv/riscv-qmp-cmds.c b/target/riscv/riscv-qmp-cmds.c
index 5ecff1afb3..22f728673f 100644
--- a/target/riscv/riscv-qmp-cmds.c
+++ b/target/riscv/riscv-qmp-cmds.c
@@ -35,8 +35,7 @@ static void riscv_cpu_add_definition(gpointer data, gpointer 
user_data)
  const char *typename = object_class_get_name(oc);
  ObjectClass *dyn_class;
  
-info->name = g_strndup(typename,

-   strlen(typename) - strlen("-" TYPE_RISCV_CPU));
+info->name = cpu_model_from_type(typename);
  info->q_typename = g_strdup(typename);
  
  dyn_class = object_class_dynamic_cast(oc, TYPE_RISCV_DYNAMIC_CPU);

[PATCH v3 01/10] qapi: fix example of get-win32-socket command

2023-09-19 Thread Victor Toso

Example output lacks double quotes. Fix it.

Fixes: 4cda177c60 "qmp: add 'get-win32-socket'"
Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Markus Armbruster 
---
 qapi/misc.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi/misc.json b/qapi/misc.json
index cda2effa81..be302cadeb 100644
--- a/qapi/misc.json
+++ b/qapi/misc.json
@@ -290,7 +290,7 @@
 #
 # Example:
 #
-# -> { "execute": "get-win32-socket", "arguments": { "info": "abcd123..", 
fdname": "skclient" } }
+# -> { "execute": "get-win32-socket", "arguments": { "info": "abcd123..", 
"fdname": "skclient" } }
 # <- { "return": {} }
 ##
 { 'command': 'get-win32-socket', 'data': {'info': 'str', 'fdname': 'str'}, 
'if': 'CONFIG_WIN32' }
-- 
2.41.0

[PATCH v3 09/10] qapi: fix example of query-spice command

2023-09-19 Thread Victor Toso

Example output has a comment embedded in the array. Remove it.
The end result is a list of size 2.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
---
 qapi/ui.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/qapi/ui.json b/qapi/ui.json
index 006616aa77..6ed36c45ea 100644
--- a/qapi/ui.json
+++ b/qapi/ui.json
@@ -363,8 +363,7 @@
 #"host": "127.0.0.1",
 #"channel-id": 0,
 #"tls": false
-# },
-# [ ... more channels follow ... ]
+# }
 #  ]
 #   }
 #}
-- 
2.41.0

[PATCH v3 07/10] qapi: fix example of query-blockstats command

2023-09-19 Thread Victor Toso

Example output has several missing commas. Add them.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Markus Armbruster 
---
 qapi/block-core.json | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 2b1d493d6e..6a81103594 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1167,10 +1167,10 @@
 #   "wr_bytes":9786368,
 #   "wr_operations":751,
 #   "rd_bytes":122567168,
-#   "rd_operations":36772
-#   "wr_total_times_ns":313253456
-#   "rd_total_times_ns":3465673657
-#   "flush_total_times_ns":49653
+#   "rd_operations":36772,
+#   "wr_total_times_ns":313253456,
+#   "rd_total_times_ns":3465673657,
+#   "flush_total_times_ns":49653,
 #   "flush_operations":61,
 #   "rd_merged":0,
 #   "wr_merged":0,
@@ -1184,10 +1184,10 @@
 #"wr_bytes":9786368,
 #"wr_operations":692,
 #"rd_bytes":122739200,
-#"rd_operations":36604
+#"rd_operations":36604,
 #"flush_operations":51,
-#"wr_total_times_ns":313253456
-#"rd_total_times_ns":3465673657
+#"wr_total_times_ns":313253456,
+#"rd_total_times_ns":3465673657,
 #"flush_total_times_ns":49653,
 #"rd_merged":0,
 #"wr_merged":0,
@@ -1204,10 +1204,10 @@
 #"wr_bytes":0,
 #"wr_operations":0,
 #"rd_bytes":0,
-#"rd_operations":0
+#"rd_operations":0,
 #"flush_operations":0,
-#"wr_total_times_ns":0
-#"rd_total_times_ns":0
+#"wr_total_times_ns":0,
+#"rd_total_times_ns":0,
 #"flush_total_times_ns":0,
 #"rd_merged":0,
 #"wr_merged":0,
@@ -1223,10 +1223,10 @@
 #"wr_bytes":0,
 #"wr_operations":0,
 #"rd_bytes":0,
-#"rd_operations":0
+#"rd_operations":0,
 #"flush_operations":0,
-#"wr_total_times_ns":0
-#"rd_total_times_ns":0
+#"wr_total_times_ns":0,
+#"rd_total_times_ns":0,
 #"flush_total_times_ns":0,
 #"rd_merged":0,
 #"wr_merged":0,
@@ -1242,10 +1242,10 @@
 #"wr_bytes":0,
 #"wr_operations":0,
 #"rd_bytes":0,
-#"rd_operations":0
+#"rd_operations":0,
 #"flush_operations":0,
-#"wr_total_times_ns":0
-#"rd_total_times_ns":0
+#"wr_total_times_ns":0,
+#"rd_total_times_ns":0,
 #"flush_total_times_ns":0,
 #"rd_merged":0,
 #"wr_merged":0,
-- 
2.41.0

[PATCH v3 05/10] qapi: fix example of calc-dirty-rate command

2023-09-19 Thread Victor Toso

Example output has property name with single quotes. Fix it.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Markus Armbruster 
---
 qapi/migration.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 2658cdbcbe..45dac41f67 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1922,7 +1922,7 @@
 # Example:
 #
 # -> {"execute": "calc-dirty-rate", "arguments": {"calc-time": 1,
-# 'sample-pages': 512} }
+# "sample-pages": 512} }
 # <- { "return": {} }
 ##
 { 'command': 'calc-dirty-rate', 'data': {'calc-time': 'int64',
-- 
2.41.0

[PATCH v3 02/10] qapi: fix example of dumpdtb command

2023-09-19 Thread Victor Toso

Example output has extra end curly bracket. Switch with comma.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Markus Armbruster 
---
 qapi/machine.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi/machine.json b/qapi/machine.json
index a08b6576ca..9eb76193e0 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -1684,7 +1684,7 @@
 #
 # Example:
 #
-# -> { "execute": "dumpdtb" }
+# -> { "execute": "dumpdtb",
 #  "arguments": { "filename": "fdt.dtb" } }
 # <- { "return": {} }
 ##
-- 
2.41.0

[PATCH v3 10/10] qapi: scripts: add a generator for qapi's examples

2023-09-19 Thread Victor Toso

This generator has two goals:
 1. Mechanical validation of QAPI examples
 2. Generate the examples in a JSON format to be consumed for extra
validation.

The generator iterates over every Example section, parsing both server
and client messages. The generator prints any inconsistency found, for
example:

 |  Error: Extra data: line 1 column 39 (char 38)
 |  Location: cancel-vcpu-dirty-limit at qapi/migration.json:2017
 |  Data: {"execute": "cancel-vcpu-dirty-limit"},
 |  "arguments": { "cpu-index": 1 } }

The generator will output another JSON file with all the examples in the
QAPI module that they came from. This can be used to validate the
introspection between QAPI/QMP to language bindings, for example:

 | { "examples": [
 |   {
 | "id": "ksuxwzfayw",
 | "client": [
 | {
 |   "sequence-order": 1
 |   "message-type": "command",
 |   "message":
 |   { "arguments":
 | { "device": "scratch", "size": 1073741824 },
 | "execute": "block_resize"
 |   },
 |} ],
 |"server": [
 |{
 |  "sequence-order": 2
 |  "message-type": "return",
 |  "message": { "return": {} },
 |} ]
 |}
 |  ] }

Note that the order matters, as read by the Example section and
translated into "sequence-order". A language binding project can then
consume these files to marshal and unmarshal, comparing if the results
are what is to be expected.

RFC discussion:
https://lists.gnu.org/archive/html/qemu-devel/2022-08/msg04641.html

Signed-off-by: Victor Toso 
---
 scripts/qapi/dumpexamples.py | 208 +++
 scripts/qapi/main.py |   3 +-
 2 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 scripts/qapi/dumpexamples.py

diff --git a/scripts/qapi/dumpexamples.py b/scripts/qapi/dumpexamples.py
new file mode 100644
index 00..55d9f13ab7
--- /dev/null
+++ b/scripts/qapi/dumpexamples.py
@@ -0,0 +1,208 @@
+"""
+Dump examples for Developers
+"""
+# Copyright (c) 2023 Red Hat Inc.
+#
+# Authors:
+#  Victor Toso 
+#
+# This work is licensed under the terms of the GNU GPL, version 2.
+# See the COPYING file in the top-level directory.
+
+# Just for type hint on self
+from __future__ import annotations
+
+import os
+import json
+import random
+import string
+
+from typing import Dict, List, Optional
+
+from .schema import (
+QAPISchema,
+QAPISchemaType,
+QAPISchemaVisitor,
+QAPISchemaEnumMember,
+QAPISchemaFeature,
+QAPISchemaIfCond,
+QAPISchemaObjectType,
+QAPISchemaObjectTypeMember,
+QAPISchemaVariants,
+)
+from .source import QAPISourceInfo
+
+
+def gen_examples(schema: QAPISchema,
+ output_dir: str,
+ prefix: str) -> None:
+vis = QAPISchemaGenExamplesVisitor(prefix)
+schema.visit(vis)
+vis.write(output_dir)
+
+
+def get_id(random, size: int) -> str:
+letters = string.ascii_lowercase
+return ''.join(random.choice(letters) for i in range(size))
+
+
+def next_object(text, start, end, context) -> (Dict, bool):
+# Start of json object
+start = text.find("{", start)
+end = text.rfind("}", start, end+1)
+
+# try catch, pretty print issues
+try:
+ret = json.loads(text[start:end+1])
+except Exception as e:
+print("Error: {}\nLocation: {}\nData: {}\n".format(
+  str(e), context, text[start:end+1]))
+return {}, True
+else:
+return ret, False
+
+
+def parse_text_to_dicts(text: str, context: str) -> (List[Dict], bool):
+examples, clients, servers = [], [], []
+failed = False
+
+count = 1
+c, s = text.find("->"), text.find("<-")
+while c != -1 or s != -1:
+if c == -1 or (s != -1 and s < c):
+start, target = s, servers
+else:
+start, target = c, clients
+
+# Find the client and server, if any
+if c != -1:
+c = text.find("->", start + 1)
+if s != -1:
+s = text.find("<-", start + 1)
+
+# Find the limit of current's object.
+# We first look for the next message, either client or server. If none
+# is avaible, we set the end of the text as limit.
+if c == -1 and s != -1:
+end = s
+elif c != -1 and s == -1:
+end = c
+elif c != -1 and s != -1:
+end = (c < s) and c or s
+else:
+end = len(text) - 1
+
+message, error = next_object(text, start, end, context)
+if error:
+failed = True
+
+if len(message) > 0:
+message_type = "return"
+if "execute" in message:
+message_type = "command"
+elif "event" in message:
+message_type = "event"
+
+target.append({
+"sequence-order": count,
+"message-type": message_type,
+"message": message
+})
+count += 1
+
+examples.append({"client": clients,

[PATCH v3 06/10] qapi: fix example of NETDEV_STREAM_CONNECTED event

2023-09-19 Thread Victor Toso

Example output was using single quotes. Fix it.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Markus Armbruster 
---
 qapi/net.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/qapi/net.json b/qapi/net.json
index 313c8a606e..81988e499a 100644
--- a/qapi/net.json
+++ b/qapi/net.json
@@ -930,9 +930,9 @@
 #
 # Example:
 #
-# <- { 'event': 'NETDEV_STREAM_DISCONNECTED',
-#  'data': {'netdev-id': 'netdev0'},
-#  'timestamp': {'seconds': 1663330937, 'microseconds': 526695} }
+# <- { "event": "NETDEV_STREAM_DISCONNECTED",
+#  "data": {"netdev-id": "netdev0"},
+#  "timestamp": {"seconds": 1663330937, "microseconds": 526695} }
 ##
 { 'event': 'NETDEV_STREAM_DISCONNECTED',
   'data': { 'netdev-id': 'str' } }
-- 
2.41.0

[PATCH v3 04/10] qapi: fix example of set-vcpu-dirty-limit command

2023-09-19 Thread Victor Toso

Example output has extra end curly bracket. Remove it.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Markus Armbruster 
---
 qapi/migration.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 9385b9f87c..2658cdbcbe 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -1986,7 +1986,7 @@
 #
 # Example:
 #
-# -> {"execute": "set-vcpu-dirty-limit"}
+# -> {"execute": "set-vcpu-dirty-limit",
 # "arguments": { "dirty-rate": 200,
 #"cpu-index": 1 } }
 # <- { "return": {} }
-- 
2.41.0

[PATCH v3 03/10] qapi: fix example of cancel-vcpu-dirty-limit command

2023-09-19 Thread Victor Toso

Example output has extra end curly bracket. Remove it.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
Reviewed-by: Markus Armbruster 
---
 qapi/migration.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qapi/migration.json b/qapi/migration.json
index 8843e74b59..9385b9f87c 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -2010,7 +2010,7 @@
 #
 # Example:
 #
-# -> {"execute": "cancel-vcpu-dirty-limit"},
+# -> {"execute": "cancel-vcpu-dirty-limit",
 # "arguments": { "cpu-index": 1 } }
 # <- { "return": {} }
 ##
-- 
2.41.0

[PATCH v3 08/10] qapi: fix example of query-rocker-of-dpa-flows command

2023-09-19 Thread Victor Toso

Example output has a comment embedded in the array. Remove it.
The end result is a list of size 1.

Signed-off-by: Victor Toso 
Reviewed-by: Daniel P. Berrangé 
---
 qapi/rocker.json | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/qapi/rocker.json b/qapi/rocker.json
index 31ce0b36f6..858e4f4a45 100644
--- a/qapi/rocker.json
+++ b/qapi/rocker.json
@@ -249,8 +249,7 @@
 #   "cookie": 0,
 #   "action": {"goto-tbl": 10},
 #   "mask": {"in-pport": 4294901760}
-#  },
-#  {...more...},
+#  }
 #]}
 ##
 { 'command': 'query-rocker-of-dpa-flows',
-- 
2.41.0

[PATCH v3 00/10] Validate and test qapi examples

2023-09-19 Thread Victor Toso

Hi,

v2: https://lists.gnu.org/archive/html/qemu-devel/2023-09/msg02383.html

- Sorry Markus, I kept the two last 'fix example' patches as I don't
  fully remember how we should go with it. Not taking them but taking
  the generator would be bad as we would fail the build.

- Removed the meson flag suggested by Philippe in order to take the pragma
  suggestion from Markus; the interesting diff is:

--- a/scripts/qapi/dumpexamples.py
+++ b/scripts/qapi/dumpexamples.py
@@ -119,6 +119,10 @@ def parse_examples_of(self: 
QAPISchemaGenExamplesVisitor,

 assert(name in self.schema._entity_dict)
 obj = self.schema._entity_dict[name]
+
+if not obj.info.pragma.doc_required:
+return
+
 assert((obj.doc is not None))
 module_name = obj._module.name

  which avoids failures with tests that don't have any docs.

Cheers,
Victor

Victor Toso (10):
  qapi: fix example of get-win32-socket command
  qapi: fix example of dumpdtb command
  qapi: fix example of cancel-vcpu-dirty-limit command
  qapi: fix example of set-vcpu-dirty-limit command
  qapi: fix example of calc-dirty-rate command
  qapi: fix example of NETDEV_STREAM_CONNECTED event
  qapi: fix example of query-blockstats command
  qapi: fix example of query-rocker-of-dpa-flows command
  qapi: fix example of query-spice command
  qapi: scripts: add a generator for qapi's examples

 qapi/block-core.json |  32 +++---
 qapi/machine.json|   2 +-
 qapi/migration.json  |   6 +-
 qapi/misc.json   |   2 +-
 qapi/net.json|   6 +-
 qapi/rocker.json |   3 +-
 qapi/ui.json |   3 +-
 scripts/qapi/dumpexamples.py | 208 +++
 scripts/qapi/main.py |   3 +-
 9 files changed, 236 insertions(+), 29 deletions(-)
 create mode 100644 scripts/qapi/dumpexamples.py

-- 
2.41.0

Re: [PULL 00/28] Block layer patches

2023-09-19 Thread Stefan Hajnoczi

On Tue, 19 Sept 2023 at 06:26, Kevin Wolf  wrote:
> Am 18.09.2023 um 20:56 hat Stefan Hajnoczi geschrieben:
> If we could fully get rid of the AioContext lock (as we originally
> stated as a goal), that would automatically solve this kind of
> deadlocks.

Grepping for "ctx locked", "context acquired", etc does not bring up a
lot of comments describing variables that are protected by the
AioContext lock.

However, there are at least hundreds of functions that assume they are
called with the AioContext lock held.

There are a few strategies:

Top-down
--------
Shorten AioContext lock critical sections to cover only APIs that need them.
Then push the lock down into the API and repeat at the next lower level until
aio_context_acquire() + AIO_WAIT_WHILE() + aio_context_release() can be
replaced with AIO_WAIT_WHILE_UNLOCKED().

Bottom-up
---------
Switch AIO_WAIT_WHILE() to aio_context_release() + AIO_WAIT_WHILE_UNLOCKED() +
aio_context_acquire(). Then move the lock up into callers and repeat at the
next higher level until aio_context_acquire() + aio_context_release() cancel
each other out.

Big bang
--------
Remove aio_context_acquire/release() and fix tests until they pass.

I think top-down is safer than bottom-up, because bottom-up is more
likely to cause issues with callers that do not tolerate temporarily
dropping the lock.

The big bang approach is only reasonable if the AioContext lock is no
longer used to protect variables (which we don't know for sure because
that requires auditing every line of code).

My concern with the top-down approach is that so much code needs to be
audited and the conversions are temporary steps (it's almost a waste
of time for maintainers to review them).

I'm tempted to go for the big bang approach but also don't want to
introduce a slew of new race conditions. :/

Stefan

[PATCH 05/14] hw/net: Add NPCMXXX GMAC device

2023-09-19 Thread Nabih Estefan

From: Hao Wu 

This patch implements the basic registers of GMAC device. Actual network
communications are not supported yet.

Signed-off-by: Hao Wu 

include/hw: Fix type problem in NPCMGMACState

- Fix type problem in NPCMGMACState
- Fix Register Initialization which was breaking boot-up in driver
- Added trace for NPCM_GMAC reset
- Added nd_table to npcm8xx.c for GMAC bootup

Signed-off-by: Nabih Estefan Diaz 

hw/net: Add BCM54612E PHY regs for GMAC

This patch adds default values for PHYs to make the driver happy.
The device is derived from an actual Izumi machine.

Signed-off-by: Hao Wu 

hw/net: change GMAC PHY regs to indicate link is up

This change makes the NPCM GMAC module use BCM54612E unconditionally
and fakes some PHY registers so that the kernel driver thinks
the link partner is up.

Tested:
The following message shows up with the change:
Broadcom BCM54612E stmmac-0:00: attached PHY driver [Broadcom BCM54612E] 
(mii_bus:phy_addr=stmmac-0:00, irq=POLL)
stmmaceth f0802000.eth eth0: Link is Up - 1Gbps/Full - flow control rx/tx

Signed-off-by: Hao Wu 
---
 hw/arm/npcm8xx.c   | 905 +
 hw/net/meson.build |   3 +-
 hw/net/npcm_gmac.c | 395 
 hw/net/trace-events|  11 +
 include/hw/net/npcm_gmac.h | 172 +++
 5 files changed, 1485 insertions(+), 1 deletion(-)
 create mode 100644 hw/arm/npcm8xx.c
 create mode 100644 hw/net/npcm_gmac.c
 create mode 100644 include/hw/net/npcm_gmac.h

diff --git a/hw/arm/npcm8xx.c b/hw/arm/npcm8xx.c
new file mode 100644
index 00..a05dcfed5c
--- /dev/null
+++ b/hw/arm/npcm8xx.c
@@ -0,0 +1,905 @@
+/*
+ * Nuvoton NPCM8xx SoC family.
+ *
+ * Copyright 2022 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "qemu/osdep.h"
+
+#include "hw/arm/boot.h"
+#include "hw/arm/npcm8xx.h"
+#include "hw/char/serial.h"
+#include "hw/intc/arm_gic.h"
+#include "hw/loader.h"
+#include "hw/misc/unimp.h"
+#include "hw/qdev-clock.h"
+#include "hw/qdev-properties.h"
+#include "qapi/error.h"
+#include "qemu/units.h"
+#include "sysemu/sysemu.h"
+
+#define ARM_PHYS_TIMER_PPI  30
+#define ARM_VIRT_TIMER_PPI  27
+#define ARM_HYP_TIMER_PPI   26
+#define ARM_SEC_TIMER_PPI   29
+
+/*
+ * This covers the whole MMIO space. We'll use this to catch any MMIO accesses
+ * that aren't handled by a device.
+ */
+#define NPCM8XX_MMIO_BA (0x8000)
+#define NPCM8XX_MMIO_SZ (0x7ffd)
+
+/* OTP fuse array */
+#define NPCM8XX_OTP_BA  (0xf0189000)
+
+/* GIC Distributor */
+#define NPCM8XX_GICD_BA (0xdfff9000)
+#define NPCM8XX_GICC_BA (0xdfffa000)
+
+/* Core system modules. */
+#define NPCM8XX_CPUP_BA (0xf03fe000)
+#define NPCM8XX_GCR_BA  (0xf080)
+#define NPCM8XX_CLK_BA  (0xf0801000)
+#define NPCM8XX_MC_BA   (0xf0824000)
+#define NPCM8XX_RNG_BA  (0xf000b000)
+#define NPCM8XX_KCS_BA  (0xf0007000)
+#define NPCM8XX_PECI_BA (0xf010)
+#define NPCM8XX_PCIERC_BA   (0xe100)
+#define NPCM8XX_PCIE_ROOT_BA(0xe800)
+
+/* ADC Module */
+#define NPCM8XX_ADC_BA  (0xf000c000)
+
+/* Internal AHB SRAM */
+#define NPCM8XX_RAM3_BA (0xc0008000)
+#define NPCM8XX_RAM3_SZ (4 * KiB)
+
+/* Memory blocks at the end of the address space */
+#define NPCM8XX_RAM2_BA (0xfffb)
+#define NPCM8XX_RAM2_SZ (256 * KiB)
+#define NPCM8XX_ROM_BA  (0x0100)
+#define NPCM8XX_ROM_SZ  (64 * KiB)
+
+/* SDHCI Modules */
+#define NPCM8XX_MMC_BA  (0xf0842000)
+
+/* PCS Module */
+#define NPCM8XX_PCS_BA  (0xf078)
+
+/* Run PLL1 at 1600 MHz */
+#define NPCM8XX_PLLCON1_FIXUP_VAL   (0x00402101)
+/* Run the CPU from PLL1 and UART from PLL2 */
+#define NPCM8XX_CLKSEL_FIXUP_VAL(0x004aaba9)
+
+/* Clock configuration values to be fixed up when bypassing bootloader */
+
+/*
+ * Interrupt lines going into the GIC. This does not include internal Cortex-A9
+ * interrupts.
+ */
+enum NPCM8xxInterrupt {
+NPCM8XX_ADC_IRQ = 0,
+NPCM8XX_PECI_IRQ= 6,
+NPCM8XX_KCS_HIB_IRQ = 9,
+NPCM8XX_GMAC1_IRQ   = 14,
+NPCM8XX_GMAC2_IRQ,
+NPCM8XX_GMAC3_IRQ,
+NPCM8XX_GMAC4_IRQ,
+NPCM8XX_MMC_IRQ = 26,
+NPCM8XX_TIMER0_IRQ  = 32,   /* Timer Module 0 */
+NPCM8XX_TIMER1_IRQ,
+NPCM8XX_TIMER2_IRQ,
+NPCM8XX_TIMER3_IRQ,
+NPCM8XX_TIMER4_IRQ,
+NPCM8XX_TIMER5_IRQ, /* Timer Module 1 */
+

[PATCH 12/14] hw/net: GMAC Tx Implementation

2023-09-19 Thread Nabih Estefan

From: Nabih Estefan Diaz 

- Implementation of Transmit function for packets
- Implementation for reading and writing from and to descriptors in
  memory for Tx

NOTE: This function implements the steps detailed in the datasheet for
transmitting messages from the GMAC.

Signed-off-by: Nabih Estefan Diaz 
---
 hw/net/npcm_gmac.c | 150 +
 1 file changed, 150 insertions(+)

diff --git a/hw/net/npcm_gmac.c b/hw/net/npcm_gmac.c
index 67f123e3c4..c457b11e4d 100644
--- a/hw/net/npcm_gmac.c
+++ b/hw/net/npcm_gmac.c
@@ -266,6 +266,7 @@ static int gmac_write_tx_desc(dma_addr_t addr, struct 
NPCMGMACTxDesc *desc)
 }
 return 0;
 }
+
 static int gmac_rx_transfer_frame_to_buffer(uint32_t rx_buf_len,
 uint32_t *left_frame,
 uint32_t rx_buf_addr,
@@ -484,6 +485,155 @@ static ssize_t gmac_receive(NetClientState *nc, const 
uint8_t *buf, size_t len)
 gmac->regs[R_NPCM_DMA_HOST_RX_DESC] = desc_addr;
 return len;
 }
+
+static int gmac_tx_get_csum(uint32_t tdes1)
+{
+uint32_t mask = TX_DESC_TDES1_CHKSM_INS_CTRL_MASK(tdes1);
+int csum = 0;
+
+if (likely(mask > 0)) {
+csum |= CSUM_IP;
+}
+if (likely(mask > 1)) {
+csum |= CSUM_TCP | CSUM_UDP;
+}
+
+return csum;
+}
+
+static void gmac_try_send_next_packet(NPCMGMACState *gmac)
+{
+/*
+ * Comments about steps refer to steps for
+ * transmitting in page 384 of datasheet
+ */
+uint16_t tx_buffer_size = 2048;
+g_autofree uint8_t *tx_send_buffer = g_malloc(tx_buffer_size);
+uint32_t desc_addr;
+struct NPCMGMACTxDesc tx_desc;
+uint32_t tx_buf_addr, tx_buf_len;
+uint16_t length = 0;
+uint8_t *buf = tx_send_buffer;
+uint32_t prev_buf_size = 0;
+int csum = 0;
+
+/* steps 1&2 */
+if (!gmac->regs[R_NPCM_DMA_HOST_TX_DESC]) {
+gmac->regs[R_NPCM_DMA_HOST_TX_DESC] =
+NPCM_DMA_HOST_TX_DESC_MASK(gmac->regs[R_NPCM_DMA_TX_BASE_ADDR]);
+}
+desc_addr = gmac->regs[R_NPCM_DMA_HOST_TX_DESC];
+
+while (true) {
+gmac_dma_set_state(gmac, NPCM_DMA_STATUS_TX_PROCESS_STATE_SHIFT,
+NPCM_DMA_STATUS_TX_RUNNING_FETCHING_STATE);
+trace_npcm_gmac_packet_transmit(DEVICE(gmac)->canonical_path, length);
+if (gmac_read_tx_desc(desc_addr, _desc)) {
+qemu_log_mask(LOG_GUEST_ERROR, "TX Descriptor @ 0x%x can't be 
read\n",
+  desc_addr);
+return;
+}
+/* step 3 */
+
+trace_npcm_gmac_packet_desc_read(DEVICE(gmac)->canonical_path, 
desc_addr);
+trace_npcm_gmac_debug_desc_data(DEVICE(gmac)->canonical_path, _desc,
+tx_desc.tdes0, tx_desc.tdes1, tx_desc.tdes2, tx_desc.tdes3);
+
+/* 1 = DMA Owned, 0 = Software Owned */
+if (!(tx_desc.tdes0 & TX_DESC_TDES0_OWN)) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "TX Descriptor @ 0x%x is owned by software\n",
+  desc_addr);
+gmac->regs[R_NPCM_DMA_STATUS] |= NPCM_DMA_STATUS_TU;
+gmac_dma_set_state(gmac, NPCM_DMA_STATUS_TX_PROCESS_STATE_SHIFT,
+NPCM_DMA_STATUS_TX_SUSPENDED_STATE);
+gmac_update_irq(gmac);
+return;
+}
+
+gmac_dma_set_state(gmac, NPCM_DMA_STATUS_TX_PROCESS_STATE_SHIFT,
+NPCM_DMA_STATUS_TX_RUNNING_READ_STATE);
+/* Give the descriptor back regardless of what happens. */
+tx_desc.tdes0 &= ~TX_DESC_TDES0_OWN;
+
+if (tx_desc.tdes1 & TX_DESC_TDES1_FIRST_SEG_MASK) {
+csum = gmac_tx_get_csum(tx_desc.tdes1);
+}
+
+/* step 4 */
+tx_buf_addr = tx_desc.tdes2;
+gmac->regs[R_NPCM_DMA_CUR_TX_BUF_ADDR] = tx_buf_addr;
+tx_buf_len = TX_DESC_TDES1_BFFR1_SZ_MASK(tx_desc.tdes1);
+buf = _send_buffer[prev_buf_size];
+
+if ((prev_buf_size + tx_buf_len) > sizeof(buf)) {
+tx_buffer_size = prev_buf_size + tx_buf_len;
+tx_send_buffer = g_realloc(tx_send_buffer, tx_buffer_size);
+buf = _send_buffer[prev_buf_size];
+}
+
+/* step 5 */
+if (dma_memory_read(_space_memory, tx_buf_addr, buf,
+tx_buf_len, MEMTXATTRS_UNSPECIFIED)) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: Failed to read packet @ 
0x%x\n",
+__func__, tx_buf_addr);
+return;
+}
+length += tx_buf_len;
+prev_buf_size += tx_buf_len;
+
+/* If not chained we'll have a second buffer. */
+if (!(tx_desc.tdes1 & TX_DESC_TDES1_SEC_ADDR_CHND_MASK)) {
+tx_buf_addr = tx_desc.tdes3;
+gmac->regs[R_NPCM_DMA_CUR_TX_BUF_ADDR] = tx_buf_addr;
+tx_buf_len = TX_DESC_TDES1_BFFR2_SZ_MASK(tx_desc.tdes1);
+buf = _send_buffer[prev_buf_size];
+
+if ((prev_buf_size + tx_buf_len) > sizeof(buf)) {
+

[PATCH 04/14] hw/net: Add NPCM8XX PCS Module

2023-09-19 Thread Nabih Estefan

From: Hao Wu 

The PCS exists in NPCM8XX's GMAC1 and is used to control the SGMII
PHY. This implementation contains all the default registers and
the soft reset feature that are required to load the Linux kernel
driver. Further features have not been implemented yet.

Signed-off-by: Hao Wu 
---
 hw/net/npcm_pcs.c | 409 ++
 include/hw/net/npcm_pcs.h |  42 
 2 files changed, 451 insertions(+)
 create mode 100644 hw/net/npcm_pcs.c
 create mode 100644 include/hw/net/npcm_pcs.h

diff --git a/hw/net/npcm_pcs.c b/hw/net/npcm_pcs.c
new file mode 100644
index 00..efe5f68d9c
--- /dev/null
+++ b/hw/net/npcm_pcs.c
@@ -0,0 +1,409 @@
+/*
+ * Nuvoton NPCM8xx PCS Module
+ *
+ * Copyright 2022 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+/*
+ * Disclaimer:
+ * Currently we only implemented the default values of the registers and
+ * the soft reset feature. These are required to boot up the GMAC module
+ * in Linux kernel for NPCM845 boards. Other functionalities are not modeled.
+ */
+
+#include "qemu/osdep.h"
+
+#include "exec/hwaddr.h"
+#include "hw/registerfields.h"
+#include "hw/net/npcm_pcs.h"
+#include "migration/vmstate.h"
+#include "qemu/log.h"
+#include "qemu/units.h"
+#include "trace.h"
+
+#define NPCM_PCS_IND_AC_BA  0x1fe
+#define NPCM_PCS_IND_SR_CTL 0x1e00
+#define NPCM_PCS_IND_SR_MII 0x1f00
+#define NPCM_PCS_IND_SR_TIM 0x1f07
+#define NPCM_PCS_IND_VR_MII 0x1f80
+
+REG16(NPCM_PCS_SR_CTL_ID1, 0x08)
+REG16(NPCM_PCS_SR_CTL_ID2, 0x0a)
+REG16(NPCM_PCS_SR_CTL_STS, 0x10)
+
+REG16(NPCM_PCS_SR_MII_CTRL, 0x00)
+REG16(NPCM_PCS_SR_MII_STS, 0x02)
+REG16(NPCM_PCS_SR_MII_DEV_ID1, 0x04)
+REG16(NPCM_PCS_SR_MII_DEV_ID2, 0x06)
+REG16(NPCM_PCS_SR_MII_AN_ADV, 0x08)
+REG16(NPCM_PCS_SR_MII_LP_BABL, 0x0a)
+REG16(NPCM_PCS_SR_MII_AN_EXPN, 0x0c)
+REG16(NPCM_PCS_SR_MII_EXT_STS, 0x1e)
+
+REG16(NPCM_PCS_SR_TIM_SYNC_ABL, 0x10)
+REG16(NPCM_PCS_SR_TIM_SYNC_TX_MAX_DLY_LWR, 0x12)
+REG16(NPCM_PCS_SR_TIM_SYNC_TX_MAX_DLY_UPR, 0x14)
+REG16(NPCM_PCS_SR_TIM_SYNC_TX_MIN_DLY_LWR, 0x16)
+REG16(NPCM_PCS_SR_TIM_SYNC_TX_MIN_DLY_UPR, 0x18)
+REG16(NPCM_PCS_SR_TIM_SYNC_RX_MAX_DLY_LWR, 0x1a)
+REG16(NPCM_PCS_SR_TIM_SYNC_RX_MAX_DLY_UPR, 0x1c)
+REG16(NPCM_PCS_SR_TIM_SYNC_RX_MIN_DLY_LWR, 0x1e)
+REG16(NPCM_PCS_SR_TIM_SYNC_RX_MIN_DLY_UPR, 0x20)
+
+REG16(NPCM_PCS_VR_MII_MMD_DIG_CTRL1, 0x000)
+REG16(NPCM_PCS_VR_MII_AN_CTRL, 0x002)
+REG16(NPCM_PCS_VR_MII_AN_INTR_STS, 0x004)
+REG16(NPCM_PCS_VR_MII_TC, 0x006)
+REG16(NPCM_PCS_VR_MII_DBG_CTRL, 0x00a)
+REG16(NPCM_PCS_VR_MII_EEE_MCTRL0, 0x00c)
+REG16(NPCM_PCS_VR_MII_EEE_TXTIMER, 0x010)
+REG16(NPCM_PCS_VR_MII_EEE_RXTIMER, 0x012)
+REG16(NPCM_PCS_VR_MII_LINK_TIMER_CTRL, 0x014)
+REG16(NPCM_PCS_VR_MII_EEE_MCTRL1, 0x016)
+REG16(NPCM_PCS_VR_MII_DIG_STS, 0x020)
+REG16(NPCM_PCS_VR_MII_ICG_ERRCNT1, 0x022)
+REG16(NPCM_PCS_VR_MII_MISC_STS, 0x030)
+REG16(NPCM_PCS_VR_MII_RX_LSTS, 0x040)
+REG16(NPCM_PCS_VR_MII_MP_TX_BSTCTRL0, 0x070)
+REG16(NPCM_PCS_VR_MII_MP_TX_LVLCTRL0, 0x074)
+REG16(NPCM_PCS_VR_MII_MP_TX_GENCTRL0, 0x07a)
+REG16(NPCM_PCS_VR_MII_MP_TX_GENCTRL1, 0x07c)
+REG16(NPCM_PCS_VR_MII_MP_TX_STS, 0x090)
+REG16(NPCM_PCS_VR_MII_MP_RX_GENCTRL0, 0x0b0)
+REG16(NPCM_PCS_VR_MII_MP_RX_GENCTRL1, 0x0b2)
+REG16(NPCM_PCS_VR_MII_MP_RX_LOS_CTRL0, 0x0ba)
+REG16(NPCM_PCS_VR_MII_MP_MPLL_CTRL0, 0x0f0)
+REG16(NPCM_PCS_VR_MII_MP_MPLL_CTRL1, 0x0f2)
+REG16(NPCM_PCS_VR_MII_MP_MPLL_STS, 0x110)
+REG16(NPCM_PCS_VR_MII_MP_MISC_CTRL2, 0x126)
+REG16(NPCM_PCS_VR_MII_MP_LVL_CTRL, 0x130)
+REG16(NPCM_PCS_VR_MII_MP_MISC_CTRL0, 0x132)
+REG16(NPCM_PCS_VR_MII_MP_MISC_CTRL1, 0x134)
+REG16(NPCM_PCS_VR_MII_DIG_CTRL2, 0x1c2)
+REG16(NPCM_PCS_VR_MII_DIG_ERRCNT_SEL, 0x1c4)
+
+/* Register Fields */
+#define NPCM_PCS_SR_MII_CTRL_RSTBIT(15)
+
+static const uint16_t npcm_pcs_sr_ctl_cold_reset_values[NPCM_PCS_NR_SR_CTLS] = 
{
+[R_NPCM_PCS_SR_CTL_ID1] = 0x699e,
+[R_NPCM_PCS_SR_CTL_STS] = 0x8000,
+};
+
+static const uint16_t npcm_pcs_sr_mii_cold_reset_values[NPCM_PCS_NR_SR_MIIS] = 
{
+[R_NPCM_PCS_SR_MII_CTRL]= 0x1140,
+[R_NPCM_PCS_SR_MII_STS] = 0x0109,
+[R_NPCM_PCS_SR_MII_DEV_ID1] = 0x699e,
+[R_NPCM_PCS_SR_MII_DEV_ID2] = 0xced0,
+[R_NPCM_PCS_SR_MII_AN_ADV]  = 0x0020,
+[R_NPCM_PCS_SR_MII_EXT_STS] = 0xc000,
+};
+
+static const uint16_t npcm_pcs_sr_tim_cold_reset_values[NPCM_PCS_NR_SR_TIMS] = 
{
+[R_NPCM_PCS_SR_TIM_SYNC_ABL]= 0x0003,
+

[PATCH 02/14] hw/arm: Add PCI mailbox module to Nuvoton SoC

2023-09-19 Thread Nabih Estefan

From: Hao Wu 

This patch wires the PCI mailbox module to Nuvoton SoC.

hw/misc: Add chardev to PCI mailbox

This patch adds a chardev to the PCI mailbox that can be used to
receive external read and write requests from the host.

Signed-off-by: Hao Wu 
---
 hw/arm/npcm7xx.c   |  16 +++-
 hw/misc/npcm7xx_pci_mbox.c | 147 +
 include/hw/arm/npcm7xx.h   |   1 +
 include/hw/misc/npcm7xx_pci_mbox.h |  18 
 4 files changed, 181 insertions(+), 1 deletion(-)

diff --git a/hw/arm/npcm7xx.c b/hw/arm/npcm7xx.c
index 15ff21d047..c69e936669 100644
--- a/hw/arm/npcm7xx.c
+++ b/hw/arm/npcm7xx.c
@@ -53,6 +53,9 @@
 /* ADC Module */
 #define NPCM7XX_ADC_BA  (0xf000c000)
 
+/* PCI Mailbox Module */
+#define NPCM7XX_PCI_MBOX_BA (0xf0848000)
+
 /* Internal AHB SRAM */
 #define NPCM7XX_RAM3_BA (0xc0008000)
 #define NPCM7XX_RAM3_SZ (4 * KiB)
@@ -83,6 +86,10 @@ enum NPCM7xxInterrupt {
 NPCM7XX_UART1_IRQ,
 NPCM7XX_UART2_IRQ,
 NPCM7XX_UART3_IRQ,
+NPCM7XX_PECI_IRQ= 6,
+NPCM7XX_PCI_MBOX_IRQ= 8,
+NPCM7XX_KCS_HIB_IRQ = 9,
+NPCM7XX_GMAC1_IRQ   = 14,
 NPCM7XX_EMC1RX_IRQ  = 15,
 NPCM7XX_EMC1TX_IRQ,
 NPCM7XX_MMC_IRQ = 26,
@@ -706,6 +713,14 @@ static void npcm7xx_realize(DeviceState *dev, Error **errp)
 }
 }
 
+/* PCI Mailbox. Cannot fail */
+sysbus_realize(SYS_BUS_DEVICE(>pci_mbox), _abort);
+sysbus_mmio_map(SYS_BUS_DEVICE(>pci_mbox), 0, NPCM7XX_PCI_MBOX_BA);
+sysbus_mmio_map(SYS_BUS_DEVICE(>pci_mbox), 1,
+NPCM7XX_PCI_MBOX_BA + NPCM7XX_PCI_MBOX_RAM_SIZE);
+sysbus_connect_irq(SYS_BUS_DEVICE(>pci_mbox), 0,
+   npcm7xx_irq(s, NPCM7XX_PCI_MBOX_IRQ));
+
 /* RAM2 (SRAM) */
 memory_region_init_ram(>sram, OBJECT(dev), "ram2",
NPCM7XX_RAM2_SZ, _abort);
@@ -765,7 +780,6 @@ static void npcm7xx_realize(DeviceState *dev, Error **errp)
 create_unimplemented_device("npcm7xx.usbd[8]",  0xf0838000,   4 * KiB);
 create_unimplemented_device("npcm7xx.usbd[9]",  0xf0839000,   4 * KiB);
 create_unimplemented_device("npcm7xx.sd",   0xf084,   8 * KiB);
-create_unimplemented_device("npcm7xx.pcimbx",   0xf0848000, 512 * KiB);
 create_unimplemented_device("npcm7xx.aes",  0xf0858000,   4 * KiB);
 create_unimplemented_device("npcm7xx.des",  0xf0859000,   4 * KiB);
 create_unimplemented_device("npcm7xx.sha",  0xf085a000,   4 * KiB);
diff --git a/hw/misc/npcm7xx_pci_mbox.c b/hw/misc/npcm7xx_pci_mbox.c
index d82a87fc41..8f971a1b0d 100644
--- a/hw/misc/npcm7xx_pci_mbox.c
+++ b/hw/misc/npcm7xx_pci_mbox.c
@@ -15,6 +15,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "chardev/char-fe.h"
 #include "hw/irq.h"
 #include "hw/qdev-clock.h"
 #include "hw/qdev-properties-system.h"
@@ -35,6 +36,18 @@ REG32(NPCM7XX_PCI_MBOX_BMBXSTAT, 0x00);
 REG32(NPCM7XX_PCI_MBOX_BMBXCTL, 0x04);
 REG32(NPCM7XX_PCI_MBOX_BMBXCMD, 0x08);
 
+enum NPCM7xxPCIMBoxOperation {
+NPCM7XX_PCI_MBOX_OP_READ = 1,
+NPCM7XX_PCI_MBOX_OP_WRITE,
+};
+
+#define NPCM7XX_PCI_MBOX_OFFSET_BYTES 8
+
+/* Response code */
+#define NPCM7XX_PCI_MBOX_OK 0
+#define NPCM7XX_PCI_MBOX_INVALID_OP 0xa0
+#define NPCM7XX_PCI_MBOX_INVALID_SIZE 0xa1
+#define NPCM7XX_PCI_MBOX_UNSPECIFIED_ERROR 0xff
 
 #define NPCM7XX_PCI_MBOX_NR_CI 8
 #define NPCM7XX_PCI_MBOX_CI_MASK MAKE_64BIT_MASK(0, NPCM7XX_PCI_MBOX_NR_CI)
@@ -53,6 +66,92 @@ static void npcm7xx_pci_mbox_update_irq(NPCM7xxPCIMBoxState 
*s)
 }
 }
 
+static void npcm7xx_pci_mbox_send_response(NPCM7xxPCIMBoxState *s, uint8_t 
code)
+{
+qemu_chr_fe_write(>chr, , 1);
+if (code == NPCM7XX_PCI_MBOX_OK && s->op == NPCM7XX_PCI_MBOX_OP_READ) {
+qemu_chr_fe_write(>chr, (uint8_t *)(>data), s->size);
+}
+}
+
+static void npcm7xx_pci_mbox_handle_read(NPCM7xxPCIMBoxState *s)
+{
+MemTxResult r = memory_region_dispatch_read(
+>ram, s->offset, >data, MO_LE | size_memop(s->size),
+MEMTXATTRS_UNSPECIFIED);
+
+npcm7xx_pci_mbox_send_response(s, (uint8_t)r);
+}
+
+static void npcm7xx_pci_mbox_handle_write(NPCM7xxPCIMBoxState *s)
+{
+MemTxResult r = memory_region_dispatch_write(
+>ram, s->offset, s->data, MO_LE | size_memop(s->size),
+MEMTXATTRS_UNSPECIFIED);
+
+npcm7xx_pci_mbox_send_response(s, (uint8_t)r);
+}
+
+static void npcm7xx_pci_mbox_receive_char(NPCM7xxPCIMBoxState *s, uint8_t byte)
+{
+switch (s->state) {
+case NPCM7XX_PCI_MBOX_STATE_IDLE:
+switch (byte) {
+case NPCM7XX_PCI_MBOX_OP_READ:
+case NPCM7XX_PCI_MBOX_OP_WRITE:
+s->op = byte;
+s->state = NPCM7XX_PCI_MBOX_STATE_OFFSET;
+s->offset = 0;
+s->receive_count = 0;
+break;
+
+default:
+qemu_log_mask(LOG_GUEST_ERROR,
+"received invalid op type: 0x%" PRIx8, byte);
+

[PATCH 06/14] hw/arm: Add GMAC devices to NPCM8XX SoC

2023-09-19 Thread Nabih Estefan

From: Hao Wu 

Signed-off-by: Hao Wu 
---
 hw/arm/npcm8xx.c |  12 
 include/hw/arm/npcm8xx.h | 118 +++
 2 files changed, 118 insertions(+), 12 deletions(-)
 create mode 100644 include/hw/arm/npcm8xx.h

diff --git a/hw/arm/npcm8xx.c b/hw/arm/npcm8xx.c
index a05dcfed5c..a9eb2b894c 100644
--- a/hw/arm/npcm8xx.c
+++ b/hw/arm/npcm8xx.c
@@ -440,9 +440,6 @@ static void npcm8xx_init(Object *obj)
 object_initialize_child(obj, "gpio[*]", >gpio[i], 
TYPE_NPCM7XX_GPIO);
 }
 
-object_initialize_child(obj, "gpiotx", >gpiotx,
-TYPE_GOOGLE_GPIO_TRANSMITTER);
-
 for (i = 0; i < ARRAY_SIZE(s->smbus); i++) {
 object_initialize_child(obj, "smbus[*]", >smbus[i],
 TYPE_NPCM8XX_SMBUS);
@@ -633,12 +630,9 @@ static void npcm8xx_realize(DeviceState *dev, Error **errp)
 
 /* GPIO modules. Cannot fail. */
 QEMU_BUILD_BUG_ON(ARRAY_SIZE(npcm8xx_gpio) != ARRAY_SIZE(s->gpio));
-sysbus_realize(SYS_BUS_DEVICE(>gpiotx), _abort);
 for (i = 0; i < ARRAY_SIZE(s->gpio); i++) {
 Object *obj = OBJECT(>gpio[i]);
 
-object_property_set_link(obj, "gpio-tx", OBJECT(>gpiotx),
- _abort);
 object_property_set_uint(obj, "reset-pullup",
  npcm8xx_gpio[i].reset_pu, _abort);
 object_property_set_uint(obj, "reset-pulldown",
@@ -725,12 +719,6 @@ static void npcm8xx_realize(DeviceState *dev, Error **errp)
 for (i = 0; i < ARRAY_SIZE(s->gmac); i++) {
 SysBusDevice *sbd = SYS_BUS_DEVICE(>gmac[i]);
 
-/* This is used to make sure that the NIC can create the device */
-if (nd_table[i].used) {
-qemu_check_nic_model(_table[i], TYPE_NPCM_GMAC);
-qdev_set_nic_properties(DEVICE(sbd), _table[i]);
-}
-
 /*
  * The device exists regardless of whether it's connected to a QEMU
  * netdev backend. So always instantiate it even if there is no
diff --git a/include/hw/arm/npcm8xx.h b/include/hw/arm/npcm8xx.h
new file mode 100644
index 00..0c0488b641
--- /dev/null
+++ b/include/hw/arm/npcm8xx.h
@@ -0,0 +1,118 @@
+/*
+ * Nuvoton NPCM8xx SoC family.
+ *
+ * Copyright 2022 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+#ifndef NPCM8XX_H
+#define NPCM8XX_H
+
+#include "hw/boards.h"
+#include "hw/adc/npcm7xx_adc.h"
+#include "hw/core/split-irq.h"
+#include "hw/cpu/cluster.h"
+#include "hw/gpio/npcm7xx_gpio.h"
+#include "hw/i2c/npcm_smbus.h"
+#include "hw/ipmi/npcm7xx_kcs.h"
+#include "hw/intc/arm_gic_common.h"
+#include "hw/mem/npcm7xx_mc.h"
+#include "hw/misc/npcm_clk.h"
+#include "hw/misc/npcm_gcr.h"
+#include "hw/misc/npcm7xx_mft.h"
+#include "hw/misc/npcm7xx_pci_mbox.h"
+#include "hw/misc/npcm7xx_pwm.h"
+#include "hw/misc/npcm7xx_rng.h"
+#include "hw/net/npcm_gmac.h"
+#include "hw/net/npcm_pcs.h"
+#include "hw/nvram/npcm7xx_otp.h"
+#include "hw/peci/npcm7xx_peci.h"
+#include "hw/pci-host/npcm_pcierc.h"
+#include "hw/sd/npcm7xx_sdhci.h"
+#include "hw/timer/npcm7xx_timer.h"
+#include "hw/ssi/npcm7xx_fiu.h"
+#include "hw/usb/hcd-ehci.h"
+#include "hw/usb/hcd-ohci.h"
+#include "target/arm/cpu.h"
+
+#define NPCM8XX_MAX_NUM_CPUS(4)
+
+/* The first half of the address space is reserved for DDR4 DRAM. */
+#define NPCM8XX_DRAM_BA (0x)
+#define NPCM8XX_DRAM_SZ (2 * GiB)
+
+/* Magic addresses for setting up direct kernel booting and SMP boot stubs. */
+#define NPCM8XX_LOADER_START(0x)  /* Start of SDRAM */
+#define NPCM8XX_SMP_LOADER_START(0x)  /* Boot ROM */
+#define NPCM8XX_SMP_BOOTREG_ADDR(0xf080013c)  /* GCR.SCRPAD */
+#define NPCM8XX_BOARD_SETUP_ADDR(0x1000)  /* Boot ROM */
+
+#define NPCM8XX_NR_PWM_MODULES 3
+
+typedef struct NPCM8xxState {
+DeviceState parent;
+
+ARMCPU  cpu[NPCM8XX_MAX_NUM_CPUS];
+CPUClusterState cpu_cluster;
+GICStategic;
+
+MemoryRegionsram;
+MemoryRegionirom;
+MemoryRegionram3;
+MemoryRegion*dram;
+
+NPCMGCRStategcr;
+NPCMCLKStateclk;
+NPCM7xxTimerCtrlState tim[3];
+NPCM7xxADCState adc;
+NPCM7xxPWMState pwm[NPCM8XX_NR_PWM_MODULES];
+NPCM7xxMFTState mft[8];
+NPCM7xxOTPState fuse_array;
+NPCM7xxMCState  mc;
+NPCM7xxRNGState rng;
+NPCM7xxGPIOStategpio[8];
+NPCMSMBusState  smbus[27];
+

[PATCH 13/14] hw/arm: Connect to chardev backend for NPCM8XX

2023-09-19 Thread Nabih Estefan

From: Hao Wu 

As NPCM8XX SoCs have 2 mailboxes, we can't use -global to connect
the mailboxes to their specific chardevs. So we add the search
for chardev code here, similar to what we did for the GMAC devices.

Signed-off-by: Hao Wu 
---
 hw/arm/npcm8xx.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/hw/arm/npcm8xx.c b/hw/arm/npcm8xx.c
index a9eb2b894c..8859cb274d 100644
--- a/hw/arm/npcm8xx.c
+++ b/hw/arm/npcm8xx.c
@@ -16,6 +16,7 @@
 
 #include "qemu/osdep.h"
 
+#include "chardev/char.h"
 #include "hw/arm/boot.h"
 #include "hw/arm/npcm8xx.h"
 #include "hw/char/serial.h"
@@ -25,7 +26,9 @@
 #include "hw/qdev-clock.h"
 #include "hw/qdev-properties.h"
 #include "qapi/error.h"
+#include "qemu/error-report.h"
 #include "qemu/units.h"
+#include "qom/object.h"
 #include "sysemu/sysemu.h"
 
 #define ARM_PHYS_TIMER_PPI  30
@@ -762,6 +765,14 @@ static void npcm8xx_realize(DeviceState *dev, Error **errp)
 
 /* PCI Mailbox. Cannot fail */
 for (i = 0; i < ARRAY_SIZE(s->pci_mbox); i++) {
+g_autofree char *char_name = g_strdup_printf("pci%d", i);
+Chardev *chardev = qemu_chr_find(char_name);
+
+if (chardev) {
+qdev_prop_set_chr(DEVICE(>pci_mbox[i]), "chardev", chardev);
+} else {
+warn_report("PCI Mailbox %d does not have a chardev backend.", i);
+}
 sysbus_realize(SYS_BUS_DEVICE(>pci_mbox[i]), _abort);
 sysbus_mmio_map(SYS_BUS_DEVICE(>pci_mbox[i]), 0,
npcm8xx_pci_mbox_addr[i]);
-- 
2.42.0.459.ge4e396fd5e-goog

[PATCH 07/14] hw/arm: Add GMAC devices to NPCM7XX SoC

2023-09-19 Thread Nabih Estefan

From: Hao Wu 

Signed-off-by: Hao Wu 
---
 hw/arm/npcm7xx.c | 38 --
 include/hw/arm/npcm7xx.h |  3 +++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/hw/arm/npcm7xx.c b/hw/arm/npcm7xx.c
index c69e936669..15c58ef4a9 100644
--- a/hw/arm/npcm7xx.c
+++ b/hw/arm/npcm7xx.c
@@ -92,6 +92,7 @@ enum NPCM7xxInterrupt {
 NPCM7XX_GMAC1_IRQ   = 14,
 NPCM7XX_EMC1RX_IRQ  = 15,
 NPCM7XX_EMC1TX_IRQ,
+NPCM7XX_GMAC2_IRQ,
 NPCM7XX_MMC_IRQ = 26,
 NPCM7XX_PSPI2_IRQ   = 28,
 NPCM7XX_PSPI1_IRQ   = 31,
@@ -235,6 +236,12 @@ static const hwaddr npcm7xx_pspi_addr[] = {
 0xf0201000,
 };
 
+/* Register base address for each GMAC Module */
+static const hwaddr npcm7xx_gmac_addr[] = {
+0xf0802000,
+0xf0804000,
+};
+
 static const struct {
 hwaddr regs_addr;
 uint32_t unconnected_pins;
@@ -463,6 +470,12 @@ static void npcm7xx_init(Object *obj)
 object_initialize_child(obj, "pspi[*]", >pspi[i], TYPE_NPCM_PSPI);
 }
 
+for (i = 0; i < ARRAY_SIZE(s->gmac); i++) {
+object_initialize_child(obj, "gmac[*]", >gmac[i], TYPE_NPCM_GMAC);
+}
+
+object_initialize_child(obj, "pci-mbox", >pci_mbox,
+TYPE_NPCM7XX_PCI_MBOX);
 object_initialize_child(obj, "mmc", >mmc, TYPE_NPCM7XX_SDHCI);
 }
 
@@ -694,6 +707,29 @@ static void npcm7xx_realize(DeviceState *dev, Error **errp)
 sysbus_connect_irq(sbd, 1, npcm7xx_irq(s, rx_irq));
 }
 
+/*
+ * GMAC Modules. Cannot fail.
+ */
+QEMU_BUILD_BUG_ON(ARRAY_SIZE(npcm7xx_gmac_addr) != ARRAY_SIZE(s->gmac));
+QEMU_BUILD_BUG_ON(ARRAY_SIZE(s->gmac) != 2);
+for (i = 0; i < ARRAY_SIZE(s->gmac); i++) {
+SysBusDevice *sbd = SYS_BUS_DEVICE(>gmac[i]);
+
+/*
+ * The device exists regardless of whether it's connected to a QEMU
+ * netdev backend. So always instantiate it even if there is no
+ * backend.
+ */
+sysbus_realize(sbd, _abort);
+sysbus_mmio_map(sbd, 0, npcm7xx_gmac_addr[i]);
+int irq = i == 0 ? NPCM7XX_GMAC1_IRQ : NPCM7XX_GMAC2_IRQ;
+/*
+ * N.B. The values for the second argument sysbus_connect_irq are
+ * chosen to match the registration order in npcm7xx_emc_realize.
+ */
+sysbus_connect_irq(sbd, 0, npcm7xx_irq(s, irq));
+}
+
 /*
  * Flash Interface Unit (FIU). Can fail if incorrect number of chip selects
  * specified, but this is a programming error.
@@ -764,8 +800,6 @@ static void npcm7xx_realize(DeviceState *dev, Error **errp)
 create_unimplemented_device("npcm7xx.siox[2]",  0xf0102000,   4 * KiB);
 create_unimplemented_device("npcm7xx.ahbpci",   0xf040,   1 * MiB);
 create_unimplemented_device("npcm7xx.mcphy",0xf05f,  64 * KiB);
-create_unimplemented_device("npcm7xx.gmac1",0xf0802000,   8 * KiB);
-create_unimplemented_device("npcm7xx.gmac2",0xf0804000,   8 * KiB);
 create_unimplemented_device("npcm7xx.vcd",  0xf081,  64 * KiB);
 create_unimplemented_device("npcm7xx.ece",  0xf082,   8 * KiB);
 create_unimplemented_device("npcm7xx.vdma", 0xf0822000,   8 * KiB);
diff --git a/include/hw/arm/npcm7xx.h b/include/hw/arm/npcm7xx.h
index 273090ac60..9e5cf639a2 100644
--- a/include/hw/arm/npcm7xx.h
+++ b/include/hw/arm/npcm7xx.h
@@ -30,6 +30,7 @@
 #include "hw/misc/npcm7xx_pwm.h"
 #include "hw/misc/npcm7xx_rng.h"
 #include "hw/net/npcm7xx_emc.h"
+#include "hw/net/npcm_gmac.h"
 #include "hw/nvram/npcm7xx_otp.h"
 #include "hw/timer/npcm7xx_timer.h"
 #include "hw/ssi/npcm7xx_fiu.h"
@@ -105,6 +106,8 @@ struct NPCM7xxState {
 OHCISysBusState ohci;
 NPCM7xxFIUState fiu[2];
 NPCM7xxEMCState emc[2];
+NPCMGMACState   gmac[2];
+NPCM7xxPCIMBoxState pci_mbox;
 NPCM7xxSDHCIState   mmc;
 NPCMPSPIState   pspi[2];
 };
-- 
2.42.0.459.ge4e396fd5e-goog

[PATCH 10/14] hw/net: General GMAC Implementation

2023-09-19 Thread Nabih Estefan

From: Nabih Estefan Diaz 

- General GMAC Register handling
- GMAC IRQ Handling
- Added traces in some methods for debugging
- Lots of declarations for accessing information on GMAC Descriptors 
(npcm_gmac.h file)

NOTE: With the code in this state, the GMAC can boot up properly and will show up 
in the output of the ifconfig command on the BMC

Signed-off-by: Nabih Estefan Diaz 
---
 hw/net/npcm_gmac.c   | 183 ---
 hw/net/trace-events  |   9 ++
 include/hw/net/npcm_gmac.h   | 233 ---
 tests/qtest/npcm_gmac-test.c |   2 +-
 4 files changed, 338 insertions(+), 89 deletions(-)

diff --git a/hw/net/npcm_gmac.c b/hw/net/npcm_gmac.c
index 5ce632858d..6f8109e0ee 100644
--- a/hw/net/npcm_gmac.c
+++ b/hw/net/npcm_gmac.c
@@ -32,7 +32,7 @@
 REG32(NPCM_DMA_BUS_MODE, 0x1000)
 REG32(NPCM_DMA_XMT_POLL_DEMAND, 0x1004)
 REG32(NPCM_DMA_RCV_POLL_DEMAND, 0x1008)
-REG32(NPCM_DMA_RCV_BASE_ADDR, 0x100c)
+REG32(NPCM_DMA_RX_BASE_ADDR, 0x100c)
 REG32(NPCM_DMA_TX_BASE_ADDR, 0x1010)
 REG32(NPCM_DMA_STATUS, 0x1014)
 REG32(NPCM_DMA_CONTROL, 0x1018)
@@ -91,7 +91,8 @@ REG32(NPCM_GMAC_PTP_TTSR, 0x71c)
 #define NPCM_DMA_BUS_MODE_SWR   BIT(0)
 
 static const uint32_t npcm_gmac_cold_reset_values[NPCM_GMAC_NR_REGS] = {
-[R_NPCM_GMAC_VERSION] = 0x1037,
+/* Reduce version to 3.2 so that the kernel can enable interrupt. */
+[R_NPCM_GMAC_VERSION] = 0x1032,
 [R_NPCM_GMAC_TIMER_CTRL]  = 0x03e8,
 [R_NPCM_GMAC_MAC0_ADDR_HI]= 0x8000,
 [R_NPCM_GMAC_MAC0_ADDR_LO]= 0x,
@@ -125,12 +126,12 @@ static const uint16_t phy_reg_init[] = {
 [MII_EXTSTAT]   = 0x3000, /* 1000BASTE_T full-duplex capable */
 };
 
-static void npcm_gmac_soft_reset(NPCMGMACState *s)
+static void npcm_gmac_soft_reset(NPCMGMACState *gmac)
 {
-memcpy(s->regs, npcm_gmac_cold_reset_values,
+memcpy(gmac->regs, npcm_gmac_cold_reset_values,
NPCM_GMAC_NR_REGS * sizeof(uint32_t));
 /* Clear reset bits */
-s->regs[R_NPCM_DMA_BUS_MODE] &= ~NPCM_DMA_BUS_MODE_SWR;
+gmac->regs[R_NPCM_DMA_BUS_MODE] &= ~NPCM_DMA_BUS_MODE_SWR;
 }
 
 static void gmac_phy_set_link(NPCMGMACState *s, bool active)
@@ -148,11 +149,53 @@ static bool gmac_can_receive(NetClientState *nc)
 return true;
 }
 
-static ssize_t gmac_receive(NetClientState *nc, const uint8_t *buf, size_t 
len1)
+/*
+ * Function that updates the GMAC IRQ
+ * It finds the logical OR of the enabled bits for NIS (if enabled)
+ * It finds the logical OR of the enabled bits for AIS (if enabled)
+ */
+static void gmac_update_irq(NPCMGMACState *gmac)
 {
-return 0;
+/*
+ * Check if the normal interrupts summery is enabled
+ * if so, add the bits for the summary that are enabled
+ */
+if (gmac->regs[R_NPCM_DMA_INTR_ENA] & gmac->regs[R_NPCM_DMA_STATUS] &
+(NPCM_DMA_INTR_ENAB_NIE_BITS))
+{
+gmac->regs[R_NPCM_DMA_STATUS] |=  NPCM_DMA_STATUS_NIS;
+}
+/*
+ * Check if the abnormal interrupts summery is enabled
+ * if so, add the bits for the summary that are enabled
+ */
+if (gmac->regs[R_NPCM_DMA_INTR_ENA] & gmac->regs[R_NPCM_DMA_STATUS] &
+(NPCM_DMA_INTR_ENAB_AIE_BITS))
+{
+gmac->regs[R_NPCM_DMA_STATUS] |=  NPCM_DMA_STATUS_AIS;
+}
+
+/* Get the logical OR of both normal and abnormal interrupts */
+int level = !!((gmac->regs[R_NPCM_DMA_STATUS] &
+gmac->regs[R_NPCM_DMA_INTR_ENA] &
+NPCM_DMA_STATUS_NIS) |
+   (gmac->regs[R_NPCM_DMA_STATUS] &
+   gmac->regs[R_NPCM_DMA_INTR_ENA] &
+   NPCM_DMA_STATUS_AIS));
+
+/* Set the IRQ */
+trace_npcm_gmac_update_irq(DEVICE(gmac)->canonical_path,
+   gmac->regs[R_NPCM_DMA_STATUS],
+   gmac->regs[R_NPCM_DMA_INTR_ENA],
+   level);
+qemu_set_irq(gmac->irq, level);
 }
 
+static ssize_t gmac_receive(NetClientState *nc, const uint8_t *buf, size_t len)
+{
+/* Placeholder */
+return 0;
+}
 static void gmac_cleanup(NetClientState *nc)
 {
 /* Nothing to do yet. */
@@ -166,7 +209,7 @@ static void gmac_set_link(NetClientState *nc)
 gmac_phy_set_link(s, !nc->link_down);
 }
 
-static void npcm_gmac_mdio_access(NPCMGMACState *s, uint16_t v)
+static void npcm_gmac_mdio_access(NPCMGMACState *gmac, uint16_t v)
 {
 bool busy = v & NPCM_GMAC_MII_ADDR_BUSY;
 uint8_t is_write;
@@ -183,33 +226,38 @@ static void npcm_gmac_mdio_access(NPCMGMACState *s, 
uint16_t v)
 
 
 if (v & NPCM_GMAC_MII_ADDR_WRITE) {
-data = s->regs[R_NPCM_GMAC_MII_DATA];
+data = gmac->regs[R_NPCM_GMAC_MII_DATA];
 /* Clear reset bit for BMCR register */
 switch (gr) {
 case MII_BMCR:
 data &= ~MII_BMCR_RESET;
-/* Complete auto-negotiation immediately and set as complete */
-if (data &

[PATCH 11/14] hw/net: GMAC Rx Implementation

2023-09-19 Thread Nabih Estefan

From: Nabih Estefan Diaz 

- Implementation of Receive function for packets
- Implementation for reading and writing from and to descriptors in
  memory for Rx

NOTE: At this point in development we believe this function is working
as intended, and the kernel supports these findings, but we need the
Transmit function to work before we upload

Signed-off-by: Nabih Estefan Diaz 

hw/net: npcm_gmac Flush queued packets when starting RX

When RX starts, we need to flush the queued packets so that they
can be received by the GMAC device. Without this it won't work
with TAP NIC device.

Signed-off-by: Hao Wu 

hw/net: Handle RX desc full in NPCM GMAC

When the RX descriptor list is full, it returns a DMA_STATUS for software to handle 
it. But there's no way to indicate that the software has handled all RX descriptors, 
and the whole pipeline stalls.

We do something similar to NPCM7XX EMC to handle this case.

1. Return packet size when RX descriptor is full, effectively dropping these 
packets in such a case.
2. When software clears RX descriptor full bit, continue receiving further 
packets by flushing QEMU packet queue.

Signed-off-by: Hao Wu 

hw/net: Receive and drop packets when descriptors are full in GMAC

Effectively this allows QEMU to receive and drop incoming packets when
RX descriptors are full. Similar to EMC, this lets GMAC drop packets
faster, especially during the bootup sequence.

Signed-off-by: Hao Wu 
---
 hw/net/npcm_gmac.c | 353 +
 1 file changed, 324 insertions(+), 29 deletions(-)

diff --git a/hw/net/npcm_gmac.c b/hw/net/npcm_gmac.c
index 6f8109e0ee..67f123e3c4 100644
--- a/hw/net/npcm_gmac.c
+++ b/hw/net/npcm_gmac.c
@@ -23,7 +23,11 @@
 #include "hw/registerfields.h"
 #include "hw/net/mii.h"
 #include "hw/net/npcm_gmac.h"
+#include "linux/if_ether.h"
 #include "migration/vmstate.h"
+#include "net/checksum.h"
+#include "net/net.h"
+#include "qemu/cutils.h"
 #include "qemu/log.h"
 #include "qemu/units.h"
 #include "sysemu/dma.h"
@@ -91,7 +95,6 @@ REG32(NPCM_GMAC_PTP_TTSR, 0x71c)
 #define NPCM_DMA_BUS_MODE_SWR   BIT(0)
 
 static const uint32_t npcm_gmac_cold_reset_values[NPCM_GMAC_NR_REGS] = {
-/* Reduce version to 3.2 so that the kernel can enable interrupt. */
 [R_NPCM_GMAC_VERSION] = 0x1032,
 [R_NPCM_GMAC_TIMER_CTRL]  = 0x03e8,
 [R_NPCM_GMAC_MAC0_ADDR_HI]= 0x8000,
@@ -146,6 +149,17 @@ static void gmac_phy_set_link(NPCMGMACState *s, bool 
active)
 
 static bool gmac_can_receive(NetClientState *nc)
 {
+NPCMGMACState *gmac = NPCM_GMAC(qemu_get_nic_opaque(nc));
+
+/* If GMAC receive is disabled. */
+if (!(gmac->regs[R_NPCM_GMAC_MAC_CONFIG] & NPCM_GMAC_MAC_CONFIG_RX_EN)) {
+return false;
+}
+
+/* If GMAC DMA RX is stopped. */
+if (!(gmac->regs[R_NPCM_DMA_CONTROL] & NPCM_DMA_CONTROL_START_STOP_RX)) {
+return false;
+}
 return true;
 }
 
@@ -191,11 +205,285 @@ static void gmac_update_irq(NPCMGMACState *gmac)
 qemu_set_irq(gmac->irq, level);
 }
 
-static ssize_t gmac_receive(NetClientState *nc, const uint8_t *buf, size_t len)
+static int gmac_read_rx_desc(dma_addr_t addr, struct NPCMGMACRxDesc *desc)
 {
-/* Placeholder */
+if (dma_memory_read(_space_memory, addr, desc,
+sizeof(*desc), MEMTXATTRS_UNSPECIFIED)) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: Failed to read descriptor @ 0x%"
+  HWADDR_PRIx "\n", __func__, addr);
+return -1;
+}
+desc->rdes0 = le32_to_cpu(desc->rdes0);
+desc->rdes1 = le32_to_cpu(desc->rdes1);
+desc->rdes2 = le32_to_cpu(desc->rdes2);
+desc->rdes3 = le32_to_cpu(desc->rdes3);
+return 0;
+}
+
+static int gmac_write_rx_desc(dma_addr_t addr, struct NPCMGMACRxDesc *desc)
+{
+struct NPCMGMACRxDesc le_desc;
+le_desc.rdes0 = cpu_to_le32(desc->rdes0);
+le_desc.rdes1 = cpu_to_le32(desc->rdes1);
+le_desc.rdes2 = cpu_to_le32(desc->rdes2);
+le_desc.rdes3 = cpu_to_le32(desc->rdes3);
+if (dma_memory_write(_space_memory, addr, _desc,
+sizeof(le_desc), MEMTXATTRS_UNSPECIFIED)) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: Failed to write descriptor @ 0x%"
+  HWADDR_PRIx "\n", __func__, addr);
+return -1;
+}
+return 0;
+}
+
+static int gmac_read_tx_desc(dma_addr_t addr, struct NPCMGMACTxDesc *desc)
+{
+if (dma_memory_read(_space_memory, addr, desc,
+sizeof(*desc), MEMTXATTRS_UNSPECIFIED)) {
+qemu_log_mask(LOG_GUEST_ERROR, "%s: Failed to read descriptor @ 0x%"
+  HWADDR_PRIx "\n", __func__, addr);
+return -1;
+}
+desc->tdes0 = le32_to_cpu(desc->tdes0);
+desc->tdes1 = le32_to_cpu(desc->tdes1);
+desc->tdes2 = le32_to_cpu(desc->tdes2);
+desc->tdes3 = le32_to_cpu(desc->tdes3);
+return 0;
+}
+
+static int gmac_write_tx_desc(dma_addr_t addr, struct NPCMGMACTxDesc *desc)
+{
+struct NPCMGMACTxDesc

[PATCH 03/14] hw/misc: Add qtest for NPCM7xx PCI Mailbox

2023-09-19 Thread Nabih Estefan

From: Hao Wu 

This patch adds a qtest for the NPCM7XX PCI Mailbox module.
It sends read and write requests to the module, and verifies that
the module contains the correct data after the requests.

Signed-off-by: Hao Wu 
---
 tests/qtest/meson.build |   1 +
 tests/qtest/npcm7xx_pci_mbox-test.c | 238 
 2 files changed, 239 insertions(+)
 create mode 100644 tests/qtest/npcm7xx_pci_mbox-test.c

diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index b071d400b3..5adf12b45f 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -183,6 +183,7 @@ qtests_sparc64 = \
 qtests_npcm7xx = \
   ['npcm7xx_adc-test',
'npcm7xx_gpio-test',
+   'npcm7xx_pci_mbox-test',
'npcm7xx_pwm-test',
'npcm7xx_rng-test',
'npcm7xx_sdhci-test',
diff --git a/tests/qtest/npcm7xx_pci_mbox-test.c 
b/tests/qtest/npcm7xx_pci_mbox-test.c
new file mode 100644
index 00..24eec18e3c
--- /dev/null
+++ b/tests/qtest/npcm7xx_pci_mbox-test.c
@@ -0,0 +1,238 @@
+/*
+ * QTests for Nuvoton NPCM7xx PCI Mailbox Modules.
+ *
+ * Copyright 2021 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qnum.h"
+#include "libqtest-single.h"
+
+#define PCI_MBOX_BA 0xf0848000
+#define PCI_MBOX_IRQ8
+
+/* register offset */
+#define PCI_MBOX_STAT   0x00
+#define PCI_MBOX_CTL0x04
+#define PCI_MBOX_CMD0x08
+
+#define CODE_OK 0x00
+#define CODE_INVALID_OP 0xa0
+#define CODE_INVALID_SIZE   0xa1
+#define CODE_ERROR  0xff
+
+#define OP_READ 0x01
+#define OP_WRITE0x02
+#define OP_INVALID  0x41
+
+
+static int sock;
+static int fd;
+
+/*
+ * Create a local TCP socket with any port, then save off the port we got.
+ */
+static in_port_t open_socket(void)
+{
+struct sockaddr_in myaddr;
+socklen_t addrlen;
+
+myaddr.sin_family = AF_INET;
+myaddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+myaddr.sin_port = 0;
+sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
+g_assert(sock != -1);
+g_assert(bind(sock, (struct sockaddr *) , sizeof(myaddr)) != -1);
+addrlen = sizeof(myaddr);
+g_assert(getsockname(sock, (struct sockaddr *)  , ) != -1);
+g_assert(listen(sock, 1) != -1);
+return ntohs(myaddr.sin_port);
+}
+
+static void setup_fd(void)
+{
+fd_set readfds;
+
+FD_ZERO();
+FD_SET(sock, );
+g_assert(select(sock + 1, , NULL, NULL, NULL) == 1);
+
+fd = accept(sock, NULL, 0);
+g_assert(fd >= 0);
+}
+
+static uint8_t read_response(uint8_t *buf, size_t len)
+{
+uint8_t code;
+ssize_t ret = read(fd, , 1);
+
+if (ret == -1) {
+return CODE_ERROR;
+}
+if (code != CODE_OK) {
+return code;
+}
+g_test_message("response code: %x", code);
+if (len > 0) {
+ret = read(fd, buf, len);
+if (ret < len) {
+return CODE_ERROR;
+}
+}
+return CODE_OK;
+}
+
+static void receive_data(uint64_t offset, uint8_t *buf, size_t len)
+{
+uint8_t op = OP_READ;
+uint8_t code;
+ssize_t rv;
+
+while (len > 0) {
+uint8_t size;
+
+if (len >= 8) {
+size = 8;
+} else if (len >= 4) {
+size = 4;
+} else if (len >= 2) {
+size = 2;
+} else {
+size = 1;
+}
+
+g_test_message("receiving %u bytes", size);
+/* Write op */
+rv = write(fd, , 1);
+g_assert_cmpint(rv, ==, 1);
+/* Write offset */
+rv = write(fd, (uint8_t *), sizeof(uint64_t));
+g_assert_cmpint(rv, ==, sizeof(uint64_t));
+/* Write size */
+g_assert_cmpint(write(fd, , 1), ==, 1);
+
+/* Read data and Expect response */
+code = read_response(buf, size);
+g_assert_cmphex(code, ==, CODE_OK);
+
+buf += size;
+offset += size;
+len -= size;
+}
+}
+
+static void send_data(uint64_t offset, const uint8_t *buf, size_t len)
+{
+uint8_t op = OP_WRITE;
+uint8_t code;
+ssize_t rv;
+
+while (len > 0) {
+uint8_t size;
+
+if (len >= 8) {
+size = 8;
+} else if (len >= 4) {
+size = 4;
+} else if (len >= 2) {
+size = 2;
+} else {
+size = 1;
+}
+
+g_test_message("sending %u bytes", size);
+/* Write op */
+rv = write(fd, , 1);
+

[PATCH 08/14] \tests/qtest: Creating qtest for GMAC Module

2023-09-19 Thread Nabih Estefan

From: Nabih Estefan Diaz 

 - Created qtest to check initialization of registers in GMAC Module.
 - Implemented test into Build File.

Signed-off-by: Nabih Estefan Diaz 
---
 tests/qtest/meson.build  |  11 +-
 tests/qtest/npcm_gmac-test.c | 209 +++
 2 files changed, 215 insertions(+), 5 deletions(-)
 create mode 100644 tests/qtest/npcm_gmac-test.c

diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index 5adf12b45f..4d0e00444d 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -191,6 +191,8 @@ qtests_npcm7xx = \
'npcm7xx_timer-test',
'npcm7xx_watchdog_timer-test'] + \
(slirp.found() ? ['npcm7xx_emc-test'] : [])
+qtests_npcm8xx = \
+  ['npcm_gmac-test']
 qtests_aspeed = \
   ['aspeed_hace-test',
'aspeed_smc-test',
@@ -205,9 +207,7 @@ qtests_arm = \
   (config_all_devices.has_key('CONFIG_ASPEED_SOC') ? qtests_aspeed : []) + \
   (config_all_devices.has_key('CONFIG_NPCM7XX') ? qtests_npcm7xx : []) + \
   (config_all_devices.has_key('CONFIG_GENERIC_LOADER') ? ['hexloader-test'] : 
[]) + \
-  (config_all_devices.has_key('CONFIG_TPM_TIS_I2C') ? ['tpm-tis-i2c-test'] : 
[]) + \
-  (config_all_devices.has_key('CONFIG_VEXPRESS') ? ['test-arm-mptimer'] : []) 
+ \
-  (config_all_devices.has_key('CONFIG_MICROBIT') ? ['microbit-test'] : []) + \
+  (config_all_devices.has_key('CONFIG_NPCM8XX') ? qtests_npcm8xx : []) + \
   ['arm-cpu-features',
'boot-serial-test']
 
@@ -219,8 +219,9 @@ qtests_aarch64 = \
   (config_all_devices.has_key('CONFIG_XLNX_ZYNQMP_ARM') ? ['xlnx-can-test', 
'fuzz-xlnx-dp-test'] : []) + \
   (config_all_devices.has_key('CONFIG_XLNX_VERSAL') ? ['xlnx-canfd-test'] : 
[]) + \
   (config_all_devices.has_key('CONFIG_RASPI') ? ['bcm2835-dma-test'] : []) +  \
-  (config_all.has_key('CONFIG_TCG') and
\
-   config_all_devices.has_key('CONFIG_TPM_TIS_I2C') ? ['tpm-tis-i2c-test'] : 
[]) + \
+  (config_all_devices.has_key('CONFIG_ASPEED_SOC') ? qtests_aspeed : []) + \
+  (config_all_devices.has_key('CONFIG_NPCM7XX') ? qtests_npcm7xx : []) + \
+  (config_all_devices.has_key('CONFIG_NPCM8XX') ? qtests_npcm8xx : []) + \
   ['arm-cpu-features',
'numa-test',
'boot-serial-test',
diff --git a/tests/qtest/npcm_gmac-test.c b/tests/qtest/npcm_gmac-test.c
new file mode 100644
index 00..30d27e8dcc
--- /dev/null
+++ b/tests/qtest/npcm_gmac-test.c
@@ -0,0 +1,209 @@
+/*
+ * QTests for Nuvoton NPCM7xx/8xx GMAC Modules.
+ *
+ * Copyright 2022 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "qemu/osdep.h"
+#include "libqos/libqos.h"
+
+/* Name of the GMAC Device */
+#define TYPE_NPCM_GMAC "npcm-gmac"
+
+typedef struct GMACModule {
+int irq;
+uint64_t base_addr;
+} GMACModule;
+
+typedef struct TestData {
+const GMACModule *module;
+} TestData;
+
+/* Values extracted from hw/arm/npcm8xx.c */
+static const GMACModule gmac_module_list[] = {
+{
+.irq= 14,
+.base_addr  = 0xf0802000
+},
+{
+.irq= 15,
+.base_addr  = 0xf0804000
+},
+{
+.irq= 16,
+.base_addr  = 0xf0806000
+},
+{
+.irq= 17,
+.base_addr  = 0xf0808000
+}
+};
+
+/* Returns the index of the GMAC module. */
+static int gmac_module_index(const GMACModule *mod)
+{
+ptrdiff_t diff = mod - gmac_module_list;
+
+g_assert_true(diff >= 0 && diff < ARRAY_SIZE(gmac_module_list));
+
+return diff;
+}
+
+/* 32-bit register indices. Taken from npcm_gmac.c */
+typedef enum NPCMRegister {
+/* DMA Registers */
+NPCM_DMA_BUS_MODE = 0x1000,
+NPCM_DMA_XMT_POLL_DEMAND = 0x1004,
+NPCM_DMA_RCV_POLL_DEMAND = 0x1008,
+NPCM_DMA_RCV_BASE_ADDR = 0x100c,
+NPCM_DMA_TX_BASE_ADDR = 0x1010,
+NPCM_DMA_STATUS = 0x1014,
+NPCM_DMA_CONTROL = 0x1018,
+NPCM_DMA_INTR_ENA = 0x101c,
+NPCM_DMA_MISSED_FRAME_CTR = 0x1020,
+NPCM_DMA_HOST_TX_DESC = 0x1048,
+NPCM_DMA_HOST_RX_DESC = 0x104c,
+NPCM_DMA_CUR_TX_BUF_ADDR = 0x1050,
+NPCM_DMA_CUR_RX_BUF_ADDR = 0x1054,
+NPCM_DMA_HW_FEATURE = 0x1058,
+
+/* GMAC Registers */
+NPCM_GMAC_MAC_CONFIG = 0x0,
+NPCM_GMAC_FRAME_FILTER = 0x4,
+NPCM_GMAC_HASH_HIGH = 0x8,
+NPCM_GMAC_HASH_LOW = 0xc,
+NPCM_GMAC_MII_ADDR = 0x10,
+NPCM_GMAC_MII_DATA = 0x14,
+NPCM_GMAC_FLOW_CTRL = 0x18,
+NPCM_GMAC_VLAN_FLAG = 0x1c,
+NPCM_GMAC_VERSION = 0x20,
+NPCM_GMAC_WAKEUP_FILTER = 0x28,
+NPCM_GMAC_PMT = 0x2c,
+

[PATCH 00/14] Implementation of NPI Mailbox and GMAC Networking Module

2023-09-19 Thread Nabih Estefan

From: Nabih Estefan Diaz 

Creates NPI Mailbox Module with data verification for read and write (internal 
and external),
wiring to the Nuvoton SoC, and QTests.

Also creates the GMAC Networking Module. Implements read and write 
functionalities with corresponding descriptors
and registers. Also includes QTests for the different functionalities.

Hao Wu (8):
  hw/misc: Add Nuvoton's PCI Mailbox Module
  hw/arm: Add PCI mailbox module to Nuvoton SoC
  hw/misc: Add qtest for NPCM7xx PCI Mailbox
  hw/net: Add NPCM8XX PCS Module
  hw/net: Add NPCMXXX GMAC device
  hw/arm: Add GMAC devices to NPCM8XX SoC
  hw/arm: Add GMAC devices to NPCM7XX SoC
  hw/arm: Connect to chardev backend for NPCM8XX

Nabih Estefan Diaz (6):
  \tests/qtest: Creating qtest for GMAC Module
  include/hw/net: Implemented Classes and Masks for GMAC Descriptors
  hw/net: General GMAC Implementation
  hw/net: GMAC Rx Implementation
  hw/net: GMAC Tx Implementation
  tests/qtest: Adding PCS Module test to GMAC Qtest

 hw/arm/npcm7xx.c|  54 +-
 hw/arm/npcm8xx.c| 904 +++
 hw/misc/meson.build |   1 +
 hw/misc/npcm7xx_pci_mbox.c  | 323 ++
 hw/misc/trace-events|   5 +
 hw/net/meson.build  |   3 +-
 hw/net/npcm_gmac.c  | 937 
 hw/net/npcm_pcs.c   | 409 
 hw/net/trace-events |  20 +
 include/hw/arm/npcm7xx.h|   4 +
 include/hw/arm/npcm8xx.h| 118 
 include/hw/misc/npcm7xx_pci_mbox.h  |  81 +++
 include/hw/net/npcm_gmac.h  | 342 ++
 include/hw/net/npcm_pcs.h   |  42 ++
 tests/qtest/meson.build |  12 +-
 tests/qtest/npcm7xx_pci_mbox-test.c | 238 +++
 tests/qtest/npcm_gmac-test.c| 342 ++
 17 files changed, 3826 insertions(+), 9 deletions(-)
 create mode 100644 hw/arm/npcm8xx.c
 create mode 100644 hw/misc/npcm7xx_pci_mbox.c
 create mode 100644 hw/net/npcm_gmac.c
 create mode 100644 hw/net/npcm_pcs.c
 create mode 100644 include/hw/arm/npcm8xx.h
 create mode 100644 include/hw/misc/npcm7xx_pci_mbox.h
 create mode 100644 include/hw/net/npcm_gmac.h
 create mode 100644 include/hw/net/npcm_pcs.h
 create mode 100644 tests/qtest/npcm7xx_pci_mbox-test.c
 create mode 100644 tests/qtest/npcm_gmac-test.c

-- 
2.42.0.459.ge4e396fd5e-goog

[PATCH 01/14] hw/misc: Add Nuvoton's PCI Mailbox Module

2023-09-19 Thread Nabih Estefan

From: Hao Wu 

The PCI Mailbox Module is a high-bandwidth communication module
between a Nuvoton BMC and CPU. It features 16KB of RAM that is
accessible by both the BMC and the core CPU, and supports interrupts
for both sides.

This patch implements the BMC side of the PCI mailbox module.
Communication with the core CPU is emulated via a chardev and
will be in a follow-up patch.

Signed-off-by: Hao Wu 
---
 hw/misc/meson.build|   1 +
 hw/misc/npcm7xx_pci_mbox.c | 176 +
 hw/misc/trace-events   |   5 +
 include/hw/misc/npcm7xx_pci_mbox.h |  63 +++
 4 files changed, 245 insertions(+)
 create mode 100644 hw/misc/npcm7xx_pci_mbox.c
 create mode 100644 include/hw/misc/npcm7xx_pci_mbox.h

diff --git a/hw/misc/meson.build b/hw/misc/meson.build
index 892f8b91c5..1f4ec94584 100644
--- a/hw/misc/meson.build
+++ b/hw/misc/meson.build
@@ -70,6 +70,7 @@ system_ss.add(when: 'CONFIG_NPCM7XX', if_true: files(
   'npcm7xx_clk.c',
   'npcm7xx_gcr.c',
   'npcm7xx_mft.c',
+  'npcm7xx_pci_mbox.c',
   'npcm7xx_pwm.c',
   'npcm7xx_rng.c',
 ))
diff --git a/hw/misc/npcm7xx_pci_mbox.c b/hw/misc/npcm7xx_pci_mbox.c
new file mode 100644
index 00..d82a87fc41
--- /dev/null
+++ b/hw/misc/npcm7xx_pci_mbox.c
@@ -0,0 +1,176 @@
+/*
+ * Nuvoton NPCM7xx PCI Mailbox Module
+ *
+ * Copyright 2021 Google LLC
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/irq.h"
+#include "hw/qdev-clock.h"
+#include "hw/qdev-properties-system.h"
+#include "hw/misc/npcm7xx_pci_mbox.h"
+#include "hw/registerfields.h"
+#include "migration/vmstate.h"
+#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include "qemu/bitops.h"
+#include "qemu/error-report.h"
+#include "qemu/log.h"
+#include "qemu/module.h"
+#include "qemu/timer.h"
+#include "qemu/units.h"
+#include "trace.h"
+
+REG32(NPCM7XX_PCI_MBOX_BMBXSTAT, 0x00);
+REG32(NPCM7XX_PCI_MBOX_BMBXCTL, 0x04);
+REG32(NPCM7XX_PCI_MBOX_BMBXCMD, 0x08);
+
+
+#define NPCM7XX_PCI_MBOX_NR_CI 8
+#define NPCM7XX_PCI_MBOX_CI_MASK MAKE_64BIT_MASK(0, NPCM7XX_PCI_MBOX_NR_CI)
+
+static void npcm7xx_pci_mbox_update_irq(NPCM7xxPCIMBoxState *s)
+{
+/* We should send an interrupt when one of the CIE and CIF are both 1. */
+if (s->regs[R_NPCM7XX_PCI_MBOX_BMBXSTAT] &
+s->regs[R_NPCM7XX_PCI_MBOX_BMBXCTL] &
+NPCM7XX_PCI_MBOX_CI_MASK) {
+qemu_irq_raise(s->irq);
+trace_npcm7xx_pci_mbox_irq(1);
+} else {
+qemu_irq_lower(s->irq);
+trace_npcm7xx_pci_mbox_irq(0);
+}
+}
+
+static uint64_t npcm7xx_pci_mbox_read(void *opaque, hwaddr offset, unsigned 
size)
+{
+NPCM7xxPCIMBoxState *s = NPCM7XX_PCI_MBOX(opaque);
+uint16_t value = 0;
+
+if (offset / sizeof(uint32_t) >= NPCM7XX_PCI_MBOX_NR_REGS) {
+qemu_log_mask(LOG_GUEST_ERROR,
+  "%s: offset 0x%04" HWADDR_PRIx " out of range\n",
+  __func__, offset);
+return 0;
+}
+
+value = s->regs[offset / sizeof(uint32_t)];
+trace_npcm7xx_pci_mbox_read(DEVICE(s)->canonical_path, offset, value, 
size);
+return value;
+}
+
+static void npcm7xx_pci_mbox_write(void *opaque, hwaddr offset,
+  uint64_t v, unsigned size)
+{
+NPCM7xxPCIMBoxState *s = NPCM7XX_PCI_MBOX(opaque);
+
+trace_npcm7xx_pci_mbox_write(DEVICE(s)->canonical_path, offset, v, size);
+switch (offset) {
+case A_NPCM7XX_PCI_MBOX_BMBXSTAT:
+/* Clear bits that are 1. */
+s->regs[R_NPCM7XX_PCI_MBOX_BMBXSTAT] &= ~v;
+break;
+
+case A_NPCM7XX_PCI_MBOX_BMBXCTL:
+s->regs[R_NPCM7XX_PCI_MBOX_BMBXCTL] = v;
+break;
+
+case A_NPCM7XX_PCI_MBOX_BMBXCMD:
+/* Set the bits that are 1. */
+s->regs[R_NPCM7XX_PCI_MBOX_BMBXCMD] |= v;
+/* TODO: Set interrupt to host. */
+break;
+
+default:
+qemu_log_mask(LOG_GUEST_ERROR,
+  "%s: offset 0x%04" HWADDR_PRIx " out of range\n",
+  __func__, offset);
+}
+npcm7xx_pci_mbox_update_irq(s);
+}
+
+static const struct MemoryRegionOps npcm7xx_pci_mbox_ops = {
+.read   = npcm7xx_pci_mbox_read,
+.write  = npcm7xx_pci_mbox_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.valid  = {
+.min_access_size= 4,
+.max_access_size= 4,
+.unaligned  = false,
+},
+};
+
+static void npcm7xx_pci_mbox_enter_reset(Object *obj, ResetType type)
+{
+NPCM7xxPCIMBoxState *s =

[PATCH 09/14] include/hw/net: Implemented Classes and Masks for GMAC Descriptors

2023-09-19 Thread Nabih Estefan

From: Nabih Estefan Diaz 

 - Implemented classes for GMAC Receive and Transmit Descriptors
 - Implemented Masks for said descriptors

Signed-off-by: Nabih Estefan Diaz 
---
 include/hw/net/npcm_gmac.h | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/include/hw/net/npcm_gmac.h b/include/hw/net/npcm_gmac.h
index 03529db1d6..067928fe0b 100644
--- a/include/hw/net/npcm_gmac.h
+++ b/include/hw/net/npcm_gmac.h
@@ -38,12 +38,19 @@ struct NPCMGMACRxDesc {
 /* RDES2 and RDES3 are buffer address pointers */
 /* Owner: 0 = software, 1 = gmac */
 #define RX_DESC_RDES0_OWNER_MASK BIT(31)
+<<< HEAD
 /* Owner*/
 #define RX_DESC_RDES0_OWNER_SHIFT 31
 /* Destination Address Filter Fail */
 #define RX_DESC_RDES0_DEST_ADDR_FILT_FAIL_MASK BIT(30)
 /* Frame length*/
 #define RX_DESC_RDES0_FRAME_LEN_MASK(word) extract32(word, 16, 29)
+===
+/* Destination Address Filter Fail */
+#define RX_DESC_RDES0_DEST_ADDR_FILT_FAIL_MASK BIT(30)
+/* Frame length*/
+#define RX_DESC_RDES0_FRAME_LEN_MASK extract32(rdes0, 16, 29)
+>>> f17fd3e311 (include/hw/net: Implemented Classes and Masks for GMAC 
Descriptors)
 /* Error Summary */
 #define RX_DESC_RDES0_ERR_SUMM_MASK BIT(15)
 /* Descriptor Error */
@@ -84,11 +91,17 @@ struct NPCMGMACRxDesc {
 /* Second Address Chained */
 #define RX_DESC_RDES1_SEC_ADDR_CHND_MASK BIT(24)
 /* Receive Buffer 2 Size */
+<<< HEAD
 #define RX_DESC_RDES1_BFFR2_SZ_SHIFT 11
 #define RX_DESC_RDES1_BFFR2_SZ_MASK(word) extract32(word, \
 RX_DESC_RDES1_BFFR2_SZ_SHIFT, 10 + RX_DESC_RDES1_BFFR2_SZ_SHIFT)
 /* Receive Buffer 1 Size */
 #define RX_DESC_RDES1_BFFR1_SZ_MASK(word) extract32(word, 0, 10)
+===
+#define RX_DESC_RDES1_BFFR2_SZ_MASK extract32(rdes1, 11, 21)
+/* Receive Buffer 1 Size */
+#define RX_DESC_RDES1_BFFR1_SZ_MASK extract32(rdes1, 0, 10)
+>>> f17fd3e311 (include/hw/net: Implemented Classes and Masks for GMAC 
Descriptors)
 
 
 struct NPCMGMACTxDesc {
@@ -125,7 +138,11 @@ struct NPCMGMACTxDesc {
 /* VLAN Frame */
 #define TX_DESC_TDES0_VLAN_FRM_MASK BIT(7)
 /* Collision Count */
+<<< HEAD
 #define TX_DESC_TDES0_COLL_CNT_MASK(word) extract32(word, 3, 6)
+===
+#define TX_DESC_TDES0_COLL_CNT_MASK extract32(tdes0, 3, 6)
+>>> f17fd3e311 (include/hw/net: Implemented Classes and Masks for GMAC 
Descriptors)
 /* Excessive Deferral */
 #define TX_DESC_TDES0_EXCS_DEF_MASK BIT(2)
 /* Underflow Error */
@@ -140,7 +157,11 @@ struct NPCMGMACTxDesc {
 /* Last Segment */
 #define TX_DESC_TDES1_FIRST_SEG_MASK BIT(29)
 /* Checksum Insertion Control */
+<<< HEAD
 #define TX_DESC_TDES1_CHKSM_INS_CTRL_MASK(word) extract32(word, 27, 28)
+===
+#define TX_DESC_TDES1_CHKSM_INS_CTRL_MASK extract32(tdes1, 27, 28)
+>>> f17fd3e311 (include/hw/net: Implemented Classes and Masks for GMAC 
Descriptors)
 /* Disable Cyclic Redundancy Check */
 #define TX_DESC_TDES1_DIS_CDC_MASK BIT(26)
 /* Transmit End of Ring */
@@ -148,9 +169,15 @@ struct NPCMGMACTxDesc {
 /* Secondary Address Chained */
 #define TX_DESC_TDES1_SEC_ADDR_CHND_MASK BIT(24)
 /* Transmit Buffer 2 Size */
+<<< HEAD
 #define TX_DESC_TDES1_BFFR2_SZ_MASK(word) extract32(word, 11, 21)
 /* Transmit Buffer 1 Size */
 #define TX_DESC_TDES1_BFFR1_SZ_MASK(word) extract32(word, 0, 10)
+===
+#define TX_DESC_TDES1_BFFR2_SZ_MASK extract32(tdes1, 11, 21)
+/* Transmit Buffer 1 Size */
+#define TX_DESC_TDES1_BFFR1_SZ_MASK extract32(tdes1, 0, 10)
+>>> f17fd3e311 (include/hw/net: Implemented Classes and Masks for GMAC 
Descriptors)
 
 typedef struct NPCMGMACState {
 SysBusDevice parent;
-- 
2.42.0.459.ge4e396fd5e-goog

[PATCH 14/14] tests/qtest: Adding PCS Module test to GMAC Qtest

2023-09-19 Thread Nabih Estefan

From: Nabih Estefan Diaz 

 - Add PCS Register check to npcm_gmac-test

Signed-off-by: Nabih Estefan Diaz 
---
 tests/qtest/npcm_gmac-test.c | 135 ++-
 1 file changed, 134 insertions(+), 1 deletion(-)

diff --git a/tests/qtest/npcm_gmac-test.c b/tests/qtest/npcm_gmac-test.c
index 84511fd915..99b914f198 100644
--- a/tests/qtest/npcm_gmac-test.c
+++ b/tests/qtest/npcm_gmac-test.c
@@ -20,6 +20,10 @@
 /* Name of the GMAC Device */
 #define TYPE_NPCM_GMAC "npcm-gmac"
 
+/* Address of the PCS Module */
+#define PCS_BASE_ADDRESS 0xf078
+#define NPCM_PCS_IND_AC_BA 0x1fe
+
 typedef struct GMACModule {
 int irq;
 uint64_t base_addr;
@@ -111,6 +115,62 @@ typedef enum NPCMRegister {
 NPCM_GMAC_PTP_STNSUR = 0x714,
 NPCM_GMAC_PTP_TAR = 0x718,
 NPCM_GMAC_PTP_TTSR = 0x71c,
+
+/* PCS Registers */
+NPCM_PCS_SR_CTL_ID1 = 0x3c0008,
+NPCM_PCS_SR_CTL_ID2 = 0x3c000a,
+NPCM_PCS_SR_CTL_STS = 0x3c0010,
+
+NPCM_PCS_SR_MII_CTRL = 0x3e,
+NPCM_PCS_SR_MII_STS = 0x3e0002,
+NPCM_PCS_SR_MII_DEV_ID1 = 0x3e0004,
+NPCM_PCS_SR_MII_DEV_ID2 = 0x3e0006,
+NPCM_PCS_SR_MII_AN_ADV = 0x3e0008,
+NPCM_PCS_SR_MII_LP_BABL = 0x3e000a,
+NPCM_PCS_SR_MII_AN_EXPN = 0x3e000c,
+NPCM_PCS_SR_MII_EXT_STS = 0x3e001e,
+
+NPCM_PCS_SR_TIM_SYNC_ABL = 0x3e0e10,
+NPCM_PCS_SR_TIM_SYNC_TX_MAX_DLY_LWR = 0x3e0e12,
+NPCM_PCS_SR_TIM_SYNC_TX_MAX_DLY_UPR = 0x3e0e14,
+NPCM_PCS_SR_TIM_SYNC_TX_MIN_DLY_LWR = 0x3e0e16,
+NPCM_PCS_SR_TIM_SYNC_TX_MIN_DLY_UPR = 0x3e0e18,
+NPCM_PCS_SR_TIM_SYNC_RX_MAX_DLY_LWR = 0x3e0e1a,
+NPCM_PCS_SR_TIM_SYNC_RX_MAX_DLY_UPR = 0x3e0e1c,
+NPCM_PCS_SR_TIM_SYNC_RX_MIN_DLY_LWR = 0x3e0e1e,
+NPCM_PCS_SR_TIM_SYNC_RX_MIN_DLY_UPR = 0x3e0e20,
+
+NPCM_PCS_VR_MII_MMD_DIG_CTRL1 = 0x3f,
+NPCM_PCS_VR_MII_AN_CTRL = 0x3f0002,
+NPCM_PCS_VR_MII_AN_INTR_STS = 0x3f0004,
+NPCM_PCS_VR_MII_TC = 0x3f0006,
+NPCM_PCS_VR_MII_DBG_CTRL = 0x3f000a,
+NPCM_PCS_VR_MII_EEE_MCTRL0 = 0x3f000c,
+NPCM_PCS_VR_MII_EEE_TXTIMER = 0x3f0010,
+NPCM_PCS_VR_MII_EEE_RXTIMER = 0x3f0012,
+NPCM_PCS_VR_MII_LINK_TIMER_CTRL = 0x3f0014,
+NPCM_PCS_VR_MII_EEE_MCTRL1 = 0x3f0016,
+NPCM_PCS_VR_MII_DIG_STS = 0x3f0020,
+NPCM_PCS_VR_MII_ICG_ERRCNT1 = 0x3f0022,
+NPCM_PCS_VR_MII_MISC_STS = 0x3f0030,
+NPCM_PCS_VR_MII_RX_LSTS = 0x3f0040,
+NPCM_PCS_VR_MII_MP_TX_BSTCTRL0 = 0x3f0070,
+NPCM_PCS_VR_MII_MP_TX_LVLCTRL0 = 0x3f0074,
+NPCM_PCS_VR_MII_MP_TX_GENCTRL0 = 0x3f007a,
+NPCM_PCS_VR_MII_MP_TX_GENCTRL1 = 0x3f007c,
+NPCM_PCS_VR_MII_MP_TX_STS = 0x3f0090,
+NPCM_PCS_VR_MII_MP_RX_GENCTRL0 = 0x3f00b0,
+NPCM_PCS_VR_MII_MP_RX_GENCTRL1 = 0x3f00b2,
+NPCM_PCS_VR_MII_MP_RX_LOS_CTRL0 = 0x3f00ba,
+NPCM_PCS_VR_MII_MP_MPLL_CTRL0 = 0x3f00f0,
+NPCM_PCS_VR_MII_MP_MPLL_CTRL1 = 0x3f00f2,
+NPCM_PCS_VR_MII_MP_MPLL_STS = 0x3f0110,
+NPCM_PCS_VR_MII_MP_MISC_CTRL2 = 0x3f0126,
+NPCM_PCS_VR_MII_MP_LVL_CTRL = 0x3f0130,
+NPCM_PCS_VR_MII_MP_MISC_CTRL0 = 0x3f0132,
+NPCM_PCS_VR_MII_MP_MISC_CTRL1 = 0x3f0134,
+NPCM_PCS_VR_MII_DIG_CTRL2 = 0x3f01c2,
+NPCM_PCS_VR_MII_DIG_ERRCNT_SEL = 0x3f01c4,
 } NPCMRegister;
 
 static uint32_t gmac_read(QTestState *qts, const GMACModule *mod,
@@ -119,6 +179,15 @@ static uint32_t gmac_read(QTestState *qts, const 
GMACModule *mod,
 return qtest_readl(qts, mod->base_addr + regno);
 }
 
+static uint16_t pcs_read(QTestState *qts, const GMACModule *mod,
+  NPCMRegister regno)
+{
+uint32_t write_value = (regno & 0x3ffe00) >> 9;
+qtest_writel(qts, PCS_BASE_ADDRESS + NPCM_PCS_IND_AC_BA, write_value);
+uint32_t read_offset = regno & 0x1ff;
+return qtest_readl(qts, PCS_BASE_ADDRESS + read_offset);
+}
+
 /* Check that GMAC registers are reset to default value */
 static void test_init(gconstpointer test_data)
 {
@@ -129,7 +198,12 @@ static void test_init(gconstpointer test_data)
 #define CHECK_REG32(regno, value) \
 do { \
 g_assert_cmphex(gmac_read(qts, mod, (regno)), ==, (value)); \
-} while (0)
+} while (0);
+
+#define CHECK_REG_PCS(regno, value) \
+do { \
+g_assert_cmphex(pcs_read(qts, mod, (regno)), ==, (value)); \
+} while (0);
 
 CHECK_REG32(NPCM_DMA_BUS_MODE, 0x00020100);
 CHECK_REG32(NPCM_DMA_XMT_POLL_DEMAND, 0);
@@ -180,6 +254,65 @@ static void test_init(gconstpointer test_data)
 CHECK_REG32(NPCM_GMAC_PTP_TAR, 0);
 CHECK_REG32(NPCM_GMAC_PTP_TTSR, 0);
 
+/* TODO Add registers PCS */
+if (mod->base_addr == 0xf0802000)
+{
+CHECK_REG_PCS(NPCM_PCS_SR_CTL_ID1, 0x699e)
+CHECK_REG_PCS(NPCM_PCS_SR_CTL_ID2, 0)
+CHECK_REG_PCS(NPCM_PCS_SR_CTL_STS, 0x8000)
+
+CHECK_REG_PCS(NPCM_PCS_SR_MII_CTRL, 0x1140)
+CHECK_REG_PCS(NPCM_PCS_SR_MII_STS, 0x0109)
+CHECK_REG_PCS(NPCM_PCS_SR_MII_DEV_ID1, 0x699e)
+CHECK_REG_PCS(NPCM_PCS_SR_MII_DEV_ID2, 0x0ced0)
+CHECK_REG_PCS(NPCM_PCS_SR_MII_AN_ADV, 0x0020)
+

Re: [PATCH v3 2/6] hw/isa/piix3: Reuse piix3_realize() in piix3_xen_realize()

2023-09-19 Thread Bernhard Beschow




Am 3. April 2023 12:27:14 UTC schrieb Jason Andryuk :
>On Mon, Apr 3, 2023 at 5:33 AM Anthony PERARD  
>wrote:
>>
>> On Sat, Apr 01, 2023 at 10:36:45PM +0000, Bernhard Beschow wrote:
>> >
>> >
>> > Am 30. März 2023 13:00:25 UTC schrieb Anthony PERARD 
>> > :
>> > >On Sun, Mar 12, 2023 at 01:02:17PM +0100, Bernhard Beschow wrote:
>> > >> This is a preparational patch for the next one to make the following
>> > >> more obvious:
>> > >>
>> > >> First, pci_bus_irqs() is now called twice in case of Xen where the
>> > >> second call overrides the pci_set_irq_fn with the Xen variant.
>> > >
>> > >pci_bus_irqs() does allocate pci_bus->irq_count, so the second call in
>> > >piix3_xen_realize() will leak `pci_bus->irq_count`. Could you look if
>> > >pci_bus_irqs_cleanup() can be called before the second pci_bus_irqs()
>> > >call, or maybe some other way to avoid the leak?
>> >
>> > Thanks for catching this! I'll post a v4.
>> >
>> > I think the most fool-proof way to fix this is to free irq_count just 
>> > before the assignment. pci_bus_irqs_cleanup() would then have to NULL the 
>> > attribute such that pci_bus_irqs() can be called afterwards.
>> >
>> > BTW: I tried running qemu-system-x86_64 with PIIX4 rather than PIIX3 as 
>> > Xen guest with my pc-piix4 branch without success. This branch essentially 
>> > just provides slightly different PCI IDs for PIIX. Does xl or something 
>> > else in Xen check these? If not then this means I'm still missing 
>> > something. Under KVM this branch works just fine. Any idea?
>>
>> Maybe the ACPI tables provided by libxl needs to be updated.
>> Or maybe something in the firmware (SeaBIOS or OVMF/OvmfXen) check the
>> id (I know that the PCI id of the root bus is checked, but I don't know
>> if that's the one that's been changed).
>
>Xen also has hvmloader, which runs before SeaBIOS/OVMF.  Looking at
>tools/firmware/hvmloader/pci.c, it has
>ASSERT((devfn != PCI_ISA_DEVFN) ||
>   ((vendor_id == 0x8086) && (device_id == 0x7000)));
>
>From QEMU, it looks like 0x7000 is PCI_DEVICE_ID_INTEL_82371SB_0, but
>PIIX4 uses 0x7110 (PCI_DEVICE_ID_INTEL_82371AB_0).  Maybe try removing
>that check?

I was finally able to build Xen successfully (without my distribution providing 
too recent dependencies that prevent compilation). With 0x7110 added in the 
line above I could indeed run a Xen guest with PIIX4. Yay!

Now I just need to respin my PIIX consolidation series...

Best regards,
Bernhard

>
>Regards,
>Jason

Re: [PATCH v2 0/8] ACPI: X86 AML generation and GPE tracing cleanup

2023-09-19 Thread Bernhard Beschow




Am 8. September 2023 08:42:26 UTC schrieb Bernhard Beschow :
>This series contains changes from my effort to bring the VIA south bridges to
>
>the PC machine [1]. The first part of the series resolves the
>
>AcpiCpuAmlIfClass::madt_cpu virtual method which frees ACPI controllers from
>
>worrying about CPU AML generation. The second part minimizes an Intel-specific
>
>assumption in AML generation to just one place. The third part contains two
>
>ACPI tracing patches which have been reviewed a long time ago but weren't 
>merged
>
>yet.
>
>
>
>The removal of AcpiCpuAmlIfClass::madt_cpu is essentially a respin of [2] with
>
>a different approach. Igor wasn't generally against it but wasn't convinced
>
>either [3]. The new approach causes much less churn and instead allows to
>
>remove code. So I think it's worth to be reconsidered.
>
>
>
>The motivation for removing this virtual method didn't change: It frees the 
>ACPI
>
>controllers in general and PIIX4 PM in particular from generating X86 CPU AML.
>
>The latter is also used in MIPS context where X86 CPU AML generation is
>
>stubbed out. This indicates a design issue where a problem was solved at the
>
>wrong place. Moreover, it turned out that TYPE_ACPI_GED_X86 could be removed as
>
>well, further supporting this claim.
>
>
>
>The second part of this series limits SMI command port determination during AML
>
>generation to just one place. Currently the ACPI_PORT_SMI_CMD constant is used
>
>multiple times which has an Intel-specific value. In order to make the code a
>
>microscopic bit more compatible with our VIA south bridge models its usage gets
>
>limited to one place, allowing the constant to be turned into a device model
>
>property in the future.
>
>
>
>The third part improves the tracing experience for ACPI general purpose events.
>
>It originates from an old series: [4].
>
>
>
>Testing done:
>
>* `make check`
>
>* `make check-avocado`
>
>
>
>v2:
>
>* Trace ACPI GPE values with "0x%02" (Phil)
>

Ping

All patches reviewed. Michael, are you the one going to queue it?

Thanks,
Bernhard

>
>
>[1] https://github.com/shentok/qemu/tree/pc-via
>
>[2] 
>https://lore.kernel.org/qemu-devel/20230121151941.24120-1-shen...@gmail.com/
>
>[3] 
>https://lore.kernel.org/qemu-devel/20230125174842.395fd...@imammedo.users.ipa.redhat.com/
>
>[4] https://patchew.org/QEMU/20230122170724.21868-1-shen...@gmail.com/
>
>
>
>Bernhard Beschow (8):
>
>  hw/i386/acpi-build: Use pc_madt_cpu_entry() directly
>
>  hw/acpi/cpu: Have build_cpus_aml() take a build_madt_cpu_fn callback
>
>  hw/acpi/acpi_dev_interface: Remove now unused madt_cpu virtual method
>
>  hw/acpi/acpi_dev_interface: Remove now unused #include "hw/boards.h"
>
>  hw/i386: Remove now redundant TYPE_ACPI_GED_X86
>
>  hw/i386/acpi-build: Determine SMI command port just once
>
>  hw/acpi: Trace GPE access in all device models, not just PIIX4
>
>  hw/acpi/core: Trace enable and status registers of GPE separately
>
>
>
> hw/acpi/hmat.h |  3 ++-
>
> hw/i386/acpi-common.h  |  3 +--
>
> include/hw/acpi/acpi_dev_interface.h   |  3 ---
>
> include/hw/acpi/cpu.h  |  6 -
>
> include/hw/acpi/generic_event_device.h |  2 --
>
> hw/acpi/acpi-x86-stub.c|  6 -
>
> hw/acpi/core.c |  9 +++
>
> hw/acpi/cpu.c  |  9 +++
>
> hw/acpi/hmat.c |  1 +
>
> hw/acpi/memory_hotplug.c   |  1 +
>
> hw/acpi/piix4.c|  5 
>
> hw/i386/acpi-build.c   | 13 +-
>
> hw/i386/acpi-common.c  |  5 ++--
>
> hw/i386/acpi-microvm.c |  3 +--
>
> hw/i386/generic_event_device_x86.c | 36 --
>
> hw/i386/microvm.c  |  2 +-
>
> hw/isa/lpc_ich9.c  |  1 -
>
> hw/acpi/trace-events   | 10 ---
>
> hw/i386/meson.build|  1 -
>
> 19 files changed, 38 insertions(+), 81 deletions(-)
>
> delete mode 100644 hw/i386/generic_event_device_x86.c
>
>
>
>-- >
>2.42.0
>
>
>

Re: [PATCH v11 6/9] gfxstream + rutabaga: add initial support for gfxstream

2023-09-19 Thread Bernhard Beschow




Am 15. September 2023 02:38:02 UTC schrieb Gurchetan Singh 
:
>On Thu, Sep 14, 2023 at 12:23 AM Bernhard Beschow  wrote:
>
>>
>>
>> Am 14. September 2023 04:38:51 UTC schrieb Gurchetan Singh <
>> gurchetansi...@chromium.org>:
>> >On Wed, Sep 13, 2023 at 4:58 AM Bernhard Beschow 
>> wrote:
>> >
>> >>
>> >>
>> >> Am 23. August 2023 01:25:38 UTC schrieb Gurchetan Singh <
>> >> gurchetansi...@chromium.org>:
>> >> >This adds initial support for gfxstream and cross-domain.  Both
>> >> >features rely on virtio-gpu blob resources and context types, which
>> >> >are also implemented in this patch.
>> >> >
>> >> >gfxstream has a long and illustrious history in Android graphics
>> >> >paravirtualization.  It has been powering graphics in the Android
>> >> >Studio Emulator for more than a decade, which is the main developer
>> >> >platform.
>> >> >
>> >> >Originally conceived by Jesse Hall, it was first known as "EmuGL" [a].
>> >> >The key design characteristic was a 1:1 threading model and
>> >> >auto-generation, which fit nicely with the OpenGLES spec.  It also
>> >> >allowed easy layering with ANGLE on the host, which provides the GLES
>> >> >implementations on Windows or MacOS environments.
>> >> >
>> >> >gfxstream has traditionally been maintained by a single engineer, and
>> >> >between 2015 to 2021, the goldfish throne passed to Frank Yang.
>> >> >Historians often remark this glorious reign ("pax gfxstreama" is the
>> >> >academic term) was comparable to that of Augustus and both Queen
>> >> >Elizabeths.  Just to name a few accomplishments in a resplendent
>> >> >panoply: higher versions of GLES, address space graphics, snapshot
>> >> >support and CTS compliant Vulkan [b].
>> >> >
>> >> >One major drawback was the use of out-of-tree goldfish drivers.
>> >> >Android engineers didn't know much about DRM/KMS and especially TTM so
>> >> >a simple guest to host pipe was conceived.
>> >> >
>> >> >Luckily, virtio-gpu 3D started to emerge in 2016 due to the work of
>> >> >the Mesa/virglrenderer communities.  In 2018, the initial virtio-gpu
>> >> >port of gfxstream was done by Cuttlefish enthusiast Alistair Delva.
>> >> >It was a symbol compatible replacement of virglrenderer [c] and named
>> >> >"AVDVirglrenderer".  This implementation forms the basis of the
>> >> >current gfxstream host implementation still in use today.
>> >> >
>> >> >cross-domain support follows a similar arc.  Originally conceived by
>> >> >Wayland aficionado David Reveman and crosvm enjoyer Zach Reizner in
>> >> >2018, it initially relied on the downstream "virtio-wl" device.
>> >> >
>> >> >In 2020 and 2021, virtio-gpu was extended to include blob resources
>> >> >and multiple timelines by yours truly, features gfxstream/cross-domain
>> >> >both require to function correctly.
>> >> >
>> >> >Right now, we stand at the precipice of a truly fantastic possibility:
>> >> >the Android Emulator powered by upstream QEMU and upstream Linux
>> >> >kernel.  gfxstream will then be packaged properly, and app
>> >> >developers can even fix gfxstream bugs on their own if they encounter
>> >> >them.
>> >> >
>> >> >It's been quite the ride, my friends.  Where will gfxstream head next,
>> >> >nobody really knows.  I wouldn't be surprised if it's around for
>> >> >another decade, maintained by a new generation of Android graphics
>> >> >enthusiasts.
>> >> >
>> >> >Technical details:
>> >> >  - Very simple initial display integration: just used Pixman
>> >> >  - Largely, 1:1 mapping of virtio-gpu hypercalls to rutabaga function
>> >> >calls
>> >> >
>> >> >Next steps for Android VMs:
>> >> >  - The next step would be improving display integration and UI
>> interfaces
>> >> >with the goal of the QEMU upstream graphics being in an emulator
>> >> >release [d].
>> >> >
>> >> >Next steps for Linux VMs for display virtualization:
>> >> >  - For widespread distribution, someone needs to package Sommelier or
>> the
>> >> >wayland-proxy-virtwl [e] ideally into Debian main. In addition,
>> newer
>> >> >versions of the Linux kernel come with DRM_VIRTIO_GPU_KMS option,
>> >> >which allows disabling KMS hypercalls.  If anyone cares enough,
>> it'll
>> >> >probably be possible to build a custom VM variant that uses this
>> >> display
>> >> >virtualization strategy.
>> >> >
>> >> >[a]
>> >> https://android-review.googlesource.com/c/platform/development/+/34470
>> >> >[b]
>> >>
>> https://android-review.googlesource.com/q/topic:%22vulkan-hostconnection-start%22
>> >> >[c]
>> >>
>> https://android-review.googlesource.com/c/device/generic/goldfish-opengl/+/761927
>> >> >[d] https://developer.android.com/studio/releases/emulator
>> >> >[e] https://github.com/talex5/wayland-proxy-virtwl
>> >> >
>> >> >Signed-off-by: Gurchetan Singh 
>> >> >Tested-by: Alyssa Ross 
>> >> >Tested-by: Emmanouil Pitsidianakis 
>> >> >Reviewed-by: Emmanouil Pitsidianakis 
>> >> >---
>> >> >v1: Incorporated various suggestions by Akihiko Odaki and Bernhard
>> Beschow
>> >> >

Re: [PULL 00/28] Block layer patches

2023-09-19 Thread Stefan Hajnoczi

On Tue, 19 Sept 2023 at 06:26, Kevin Wolf  wrote:
>
> Am 18.09.2023 um 20:56 hat Stefan Hajnoczi geschrieben:
> > Hi Kevin,
> > I believe that my own commit "block-coroutine-wrapper: use
> > qemu_get_current_aio_context()" breaks this test. The failure is
> > non-deterministic (happens about 1 out of 4 runs).
> >
> > It seems the job hangs and the test times out in vm.run_job('job1', 
> > wait=5.0).
> >
> > I haven't debugged it yet but wanted to share this information to save
> > some time. Tomorrow I'll investigate further.
>
> Yes, it's relatively easily reproducible if I run the test in a loop,
> and I can't seem to reproduce it without the last patch. Should I
> unstage the full series again, or do you think that the last patch is
> really optional this time?
>
> However, I'm unsure how the stack traces I'm seeing are related to your
> patch. Maybe it just made an existing bug more likely to be triggered?
>
> What I'm seeing is that the reader lock is held by an iothread that is
> waiting for its AioContext lock to make progress:
>
> Thread 3 (Thread 0x7f811e9346c0 (LWP 26390) "qemu-system-x86"):
> #0  0x7f81250aaf80 in __lll_lock_wait () at /lib64/libc.so.6
> #1  0x7f81250b149a in pthread_mutex_lock@@GLIBC_2.2.5 () at 
> /lib64/libc.so.6
> #2  0x55b7b170967e in qemu_mutex_lock_impl (mutex=0x55b7b34e3080, 
> file=0x55b7b199e1f7 "../util/async.c", line=728) at 
> ../util/qemu-thread-posix.c:94
> #3  0x55b7b1709953 in qemu_rec_mutex_lock_impl (mutex=0x55b7b34e3080, 
> file=0x55b7b199e1f7 "../util/async.c", line=728) at 
> ../util/qemu-thread-posix.c:149
> #4  0x55b7b1728318 in aio_context_acquire (ctx=0x55b7b34e3020) at 
> ../util/async.c:728
> #5  0x55b7b1727c49 in co_schedule_bh_cb (opaque=0x55b7b34e3020) at 
> ../util/async.c:565
> #6  0x55b7b1726f1c in aio_bh_call (bh=0x55b7b34e2e70) at 
> ../util/async.c:169
> #7  0x55b7b17270ee in aio_bh_poll (ctx=0x55b7b34e3020) at 
> ../util/async.c:216
> #8  0x55b7b170351d in aio_poll (ctx=0x55b7b34e3020, blocking=true) at 
> ../util/aio-posix.c:722
> #9  0x55b7b1518604 in iothread_run (opaque=0x55b7b2904460) at 
> ../iothread.c:63
> #10 0x55b7b170a955 in qemu_thread_start (args=0x55b7b34e36b0) at 
> ../util/qemu-thread-posix.c:541
> #11 0x7f81250ae15d in start_thread () at /lib64/libc.so.6
> #12 0x7f812512fc00 in clone3 () at /lib64/libc.so.6
>
> On the other hand, the main thread wants to acquire the writer lock,
> but it holds the AioContext lock of the iothread (it takes it in
> job_prepare_locked()):
>
> Thread 1 (Thread 0x7f811f4b7b00 (LWP 26388) "qemu-system-x86"):
> #0  0x7f8125122356 in ppoll () at /lib64/libc.so.6
> #1  0x55b7b172eae0 in qemu_poll_ns (fds=0x55b7b34ec910, nfds=1, 
> timeout=-1) at ../util/qemu-timer.c:339
> #2  0x55b7b1704ebd in fdmon_poll_wait (ctx=0x55b7b3269210, 
> ready_list=0x7ffc90b05680, timeout=-1) at ../util/fdmon-poll.c:79
> #3  0x55b7b1703284 in aio_poll (ctx=0x55b7b3269210, blocking=true) at 
> ../util/aio-posix.c:670
> #4  0x55b7b1567c3b in bdrv_graph_wrlock (bs=0x0) at 
> ../block/graph-lock.c:145
> #5  0x55b7b1554c1c in blk_remove_bs (blk=0x55b7b4425800) at 
> ../block/block-backend.c:916
> #6  0x55b7b1554779 in blk_delete (blk=0x55b7b4425800) at 
> ../block/block-backend.c:497
> #7  0x55b7b1554133 in blk_unref (blk=0x55b7b4425800) at 
> ../block/block-backend.c:557
> #8  0x55b7b157a149 in mirror_exit_common (job=0x55b7b4419000) at 
> ../block/mirror.c:696
> #9  0x55b7b1577015 in mirror_prepare (job=0x55b7b4419000) at 
> ../block/mirror.c:807
> #10 0x55b7b153a1a7 in job_prepare_locked (job=0x55b7b4419000) at 
> ../job.c:988
> #11 0x55b7b153a0d9 in job_txn_apply_locked (job=0x55b7b4419000, 
> fn=0x55b7b153a110 ) at ../job.c:191
> #12 0x55b7b1538b6d in job_do_finalize_locked (job=0x55b7b4419000) at 
> ../job.c:1011
> #13 0x55b7b153a886 in job_completed_txn_success_locked 
> (job=0x55b7b4419000) at ../job.c:1068
> #14 0x55b7b1539372 in job_completed_locked (job=0x55b7b4419000) at 
> ../job.c:1082
> #15 0x55b7b153a71b in job_exit (opaque=0x55b7b4419000) at ../job.c:1103
> #16 0x55b7b1726f1c in aio_bh_call (bh=0x7f8110005470) at 
> ../util/async.c:169
> #17 0x55b7b17270ee in aio_bh_poll (ctx=0x55b7b3269210) at 
> ../util/async.c:216
> #18 0x55b7b1702c05 in aio_dispatch (ctx=0x55b7b3269210) at 
> ../util/aio-posix.c:423
> #19 0x55b7b1728a14 in aio_ctx_dispatch (source=0x55b7b3269210, 
> callback=0x0, user_data=0x0) at ../util/async.c:358
> #20 0x7f8126c31c7f in g_main_dispatch (context=0x55b7b3269720) at 
> ../glib/gmain.c:3454
> #21 g_main_context_dispatch (context=0x55b7b3269720) at ../glib/gmain.c:4172
> #22 0x55b7b1729c98 in glib_pollfds_poll () at ../util/main-loop.c:290
> #23 0x55b7b1729572 in os_host_main_loop_wait (timeout=27462700) at 
> ../util/main-loop.c:313
> #24 0x55b7b1729452 in main_loop_wait (nonblocking=0) at 
> ../util/main-loop.c:592
> #25 0x55b7b119a1eb in

[PULL v2 4/8] target/hppa: Add BTLB support to hppa TLB functions

2023-09-19 Thread deller

From: Helge Deller 

Change the TLB code to store the Block-TLBs at the beginning
of the TLB table. New 4k TLB entries which are added later
shall not overwrite any of the BTLB entries.

Make sure that when the TLB is cleared by the OS via the ptlbe
instruction, the Block-TLBs will not be dropped.

Signed-off-by: Helge Deller 
---
 target/hppa/cpu.h|  3 +-
 target/hppa/int_helper.c |  2 +-
 target/hppa/mem_helper.c | 87 +++-
 target/hppa/op_helper.c  |  3 +-
 4 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 23852d89b2..730f35231a 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -350,7 +350,8 @@ bool hppa_cpu_tlb_fill(CPUState *cs, vaddr address, int 
size,
 void hppa_cpu_do_interrupt(CPUState *cpu);
 bool hppa_cpu_exec_interrupt(CPUState *cpu, int int_req);
 int hppa_get_physical_address(CPUHPPAState *env, vaddr addr, int mmu_idx,
-  int type, hwaddr *pphys, int *pprot);
+  int type, hwaddr *pphys, int *pprot,
+  hppa_tlb_entry **tlb_entry);
 extern const MemoryRegionOps hppa_io_eir_ops;
 extern const VMStateDescription vmstate_hppa_cpu;
 void hppa_cpu_alarm_timer(void *);
diff --git a/target/hppa/int_helper.c b/target/hppa/int_helper.c
index bebc732c97..3ab9934a1d 100644
--- a/target/hppa/int_helper.c
+++ b/target/hppa/int_helper.c
@@ -154,7 +154,7 @@ void hppa_cpu_do_interrupt(CPUState *cs)
 
 vaddr = hppa_form_gva_psw(old_psw, iasq_f, vaddr);
 t = hppa_get_physical_address(env, vaddr, MMU_KERNEL_IDX,
-  0, &paddr, &prot);
+  0, &paddr, &prot, NULL);
 if (t >= 0) {
 /* We can't re-load the instruction.  */
 env->cr[CR_IIR] = 0;
diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c
index 46c3dcaf15..ea33b58ddd 100644
--- a/target/hppa/mem_helper.c
+++ b/target/hppa/mem_helper.c
@@ -41,16 +41,24 @@ static hppa_tlb_entry *hppa_find_tlb(CPUHPPAState *env, 
vaddr addr)
 return NULL;
 }
 
-static void hppa_flush_tlb_ent(CPUHPPAState *env, hppa_tlb_entry *ent)
+static void hppa_flush_tlb_ent(CPUHPPAState *env, hppa_tlb_entry *ent,
+   bool force_flush_btlb)
 {
 CPUState *cs = env_cpu(env);
-unsigned i, n = 1 << (2 * ent->page_size);
-uint64_t addr = ent->va_b;
+
+if (!ent->entry_valid) {
+return;
+}
 
 trace_hppa_tlb_flush_ent(env, ent, ent->va_b, ent->va_e, ent->pa);
 
-for (i = 0; i < n; ++i, addr += TARGET_PAGE_SIZE) {
-tlb_flush_page_by_mmuidx(cs, addr, HPPA_MMU_FLUSH_MASK);
+tlb_flush_range_by_mmuidx(cs, ent->va_b,
+ent->va_e - ent->va_b + 1,
+HPPA_MMU_FLUSH_MASK, TARGET_LONG_BITS);
+
+/* never clear BTLBs, unless forced to do so. */
+if (ent < &env->tlb[HPPA_BTLB_ENTRIES] && !force_flush_btlb) {
+return;
 }
 
 memset(ent, 0, sizeof(*ent));
@@ -60,23 +68,35 @@ static void hppa_flush_tlb_ent(CPUHPPAState *env, 
hppa_tlb_entry *ent)
 static hppa_tlb_entry *hppa_alloc_tlb_ent(CPUHPPAState *env)
 {
 hppa_tlb_entry *ent;
-uint32_t i = env->tlb_last;
+uint32_t i;
+
+if (env->tlb_last < HPPA_BTLB_ENTRIES || env->tlb_last >= 
ARRAY_SIZE(env->tlb)) {
+i = HPPA_BTLB_ENTRIES;
+env->tlb_last = HPPA_BTLB_ENTRIES + 1;
+} else {
+i = env->tlb_last;
+env->tlb_last++;
+}
 
-env->tlb_last = (i == ARRAY_SIZE(env->tlb) - 1 ? 0 : i + 1);
 ent = &env->tlb[i];
 
-hppa_flush_tlb_ent(env, ent);
+hppa_flush_tlb_ent(env, ent, false);
 return ent;
 }
 
 int hppa_get_physical_address(CPUHPPAState *env, vaddr addr, int mmu_idx,
-  int type, hwaddr *pphys, int *pprot)
+  int type, hwaddr *pphys, int *pprot,
+  hppa_tlb_entry **tlb_entry)
 {
 hwaddr phys;
 int prot, r_prot, w_prot, x_prot, priv;
 hppa_tlb_entry *ent;
 int ret = -1;
 
+if (tlb_entry) {
+*tlb_entry = NULL;
+}
+
 /* Virtual translation disabled.  Direct map virtual to physical.  */
 if (mmu_idx == MMU_PHYS_IDX) {
 phys = addr;
@@ -93,8 +113,12 @@ int hppa_get_physical_address(CPUHPPAState *env, vaddr 
addr, int mmu_idx,
 goto egress;
 }
 
+if (tlb_entry) {
+*tlb_entry = ent;
+}
+
 /* We now know the physical address.  */
-phys = ent->pa + (addr & ~TARGET_PAGE_MASK);
+phys = ent->pa + (addr - ent->va_b);
 
 /* Map TLB access_rights field to QEMU protection.  */
 priv = MMU_IDX_TO_PRIV(mmu_idx);
@@ -193,7 +217,7 @@ hwaddr hppa_cpu_get_phys_page_debug(CPUState *cs, vaddr 
addr)
 }
 
 excp = hppa_get_physical_address(&cpu->env, addr, MMU_KERNEL_IDX, 0,
-

Re: [PULL 0/8] Hppa btlb patches

2023-09-19 Thread Helge Deller


Hi Stefan,

On 9/19/23 19:18, Stefan Hajnoczi wrote:

Please take a look at the following CI failure and resend when you
have fixed the error:
...
In file included from ../target/hppa/mem_helper.c:21:
../target/hppa/mem_helper.c: In function ‘helper_diag_btlb’:
../target/hppa/mem_helper.c:461:36: error: format ‘%lx’ expects
argument of type ‘long unsigned int’, but argument 4 has type
‘uint64_t’ {aka ‘long long unsigned int’} [-Werror=format=]


I just sent out a v2 pull request with this fixed.

Thanks!
Helge

[PULL v2 7/8] linux-user/hppa: clear the PSW 'N' bit when delivering signals

2023-09-19 Thread deller

From: Mikulas Patocka 

qemu-hppa may crash when delivering a signal. It can be demonstrated with
this program. Compile the program with "hppa-linux-gnu-gcc -O2 signal.c"
and run it with "qemu-hppa -one-insn-per-tb a.out". It reports that the
address of the flag is 0xb4 and it crashes when attempting to touch it.

#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/time.h>

/* Written by sig(); the reproducer crashes when the handler touches it. */
sig_atomic_t flag;

/*
 * Signal handler: print the address of 'flag', then store 1 into it.
 * 'n' is the signal number and is unused.
 */
void sig(int n)
{
    printf("&flag: %p\n", &flag);
    flag = 1;
}

/*
 * Reproducer entry point: install sig() as the SIGALRM handler, arm a
 * periodic real-time timer and then spin forever.
 */
int main(void)
{
    struct sigaction sa;
    struct itimerval it;

    /* Route SIGALRM to sig() with no extra signals blocked. */
    sa.sa_handler = sig;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = SA_RESTART;
    if (sigaction(SIGALRM, &sa, NULL)) perror("sigaction"), exit(1);

    /* First expiry and reload interval are both 100 microseconds. */
    it.it_interval.tv_sec = 0;
    it.it_interval.tv_usec = 100;
    it.it_value.tv_sec = it.it_interval.tv_sec;
    it.it_value.tv_usec = it.it_interval.tv_usec;

    if (setitimer(ITIMER_REAL, &it, NULL)) perror("setitimer"), exit(1);

    /* Busy-wait so the timer signal interrupts running code. */
    while (1) {
    }
}

The reason for the crash is that the signal handling routine doesn't clear
the 'N' flag in the PSW. If the signal interrupts a thread when the 'N'
flag is set, the flag remains set at the beginning of the signal handler
and the first instruction of the signal handler is skipped.

Signed-off-by: Mikulas Patocka 
Acked-by: Helge Deller 
Cc: qemu-sta...@nongnu.org
Signed-off-by: Helge Deller 
---
 linux-user/hppa/signal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/linux-user/hppa/signal.c b/linux-user/hppa/signal.c
index f253a15864..3a976ac693 100644
--- a/linux-user/hppa/signal.c
+++ b/linux-user/hppa/signal.c
@@ -159,6 +159,7 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 }
 env->iaoq_f = haddr;
 env->iaoq_b = haddr + 4;
+env->psw_n = 0;
 return;
 
  give_sigsegv:
-- 
2.41.0

[PULL v2 5/8] target/hppa: Extract diagnose immediate value

2023-09-19 Thread deller

From: Helge Deller 

Extract the immediate value given by the diagnose CPU instruction.
This is needed to distinguish the various diagnose calls.

Signed-off-by: Helge Deller 
---
 target/hppa/insns.decode | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/hppa/insns.decode b/target/hppa/insns.decode
index 27341d27b2..aebe03ccfd 100644
--- a/target/hppa/insns.decode
+++ b/target/hppa/insns.decode
@@ -528,4 +528,4 @@ fdiv_d  001110 . . 011 . ... .  
@f0e_d_3
 xmpyu   001110 . . 010 .0111 .00 t:5r1=%ra64 r2=%rb64
 
 # diag
-diag000101 - -    
+diag000101 i:26
-- 
2.41.0

[PULL v2 8/8] linux-user/hppa: lock both words of function descriptor

2023-09-19 Thread deller

From: Mikulas Patocka 

The code in setup_rt_frame reads two words at haddr, but locks only one.
This patch fixes it to lock both.

Signed-off-by: Mikulas Patocka 
Acked-by: Helge Deller 
Cc: qemu-sta...@nongnu.org
Signed-off-by: Helge Deller 
---
 linux-user/hppa/signal.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/linux-user/hppa/signal.c b/linux-user/hppa/signal.c
index 3a976ac693..bda6e54655 100644
--- a/linux-user/hppa/signal.c
+++ b/linux-user/hppa/signal.c
@@ -149,12 +149,13 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 target_ulong *fdesc, dest;
 
 haddr &= -4;
-if (!lock_user_struct(VERIFY_READ, fdesc, haddr, 1)) {
+fdesc = lock_user(VERIFY_READ, haddr, 2 * sizeof(target_ulong), 1);
+if (!fdesc) {
 goto give_sigsegv;
 }
 __get_user(dest, fdesc);
 __get_user(env->gr[19], fdesc + 1);
-unlock_user_struct(fdesc, haddr, 1);
+unlock_user(fdesc, haddr, 0);
 haddr = dest;
 }
 env->iaoq_f = haddr;
-- 
2.41.0

[PULL v2 0/8] Hppa btlb patches

2023-09-19 Thread deller

From: Helge Deller 

The following changes since commit 9ef497755afc252fb8e060c9ea6b0987abfd20b6:

  Merge tag 'pull-vfio-20230911' of https://github.com/legoater/qemu into 
staging (2023-09-11 09:13:08 -0400)

are available in the Git repository at:

  https://github.com/hdeller/qemu-hppa.git tags/hppa-btlb-pull-request

for you to fetch changes up to 5b1270ef1477bb7f240c3bfe2cd8b0fe4721fd51:

  linux-user/hppa: lock both words of function descriptor (2023-09-19 21:12:18 
+0200)


Block-TLB support and linux-user fixes for hppa target

All 32-bit hppa CPUs allow a fixed number of TLB entries to have a
different page size than the default 4k.
Those are called "Block-TLBs" and are created at startup by the
operating system and managed by the firmware of hppa machines
through the firmware PDC_BLOCK_TLB call.

This patchset adds the necessary glue to SeaBIOS-hppa and
qemu to allow up to 16 BTLB entries in the emulation.

Two patches from Mikulas Patocka fix signal delivery issues
in linux-user on hppa.



Helge Deller (6):
  target/hppa: Update to SeaBIOS-hppa version 9
  target/hppa: Allow up to 16 BTLB entries
  target/hppa: Report and clear BTLBs via fw_cfg at startup
  target/hppa: Add BTLB support to hppa TLB functions
  target/hppa: Extract diagnose immediate value
  target/hppa: Wire up diag instruction to support BTLB

Mikulas Patocka (2):
  linux-user/hppa: clear the PSW 'N' bit when delivering signals
  linux-user/hppa: lock both words of function descriptor

 hw/hppa/machine.c |  10 +--
 linux-user/hppa/signal.c  |   6 +-
 pc-bios/hppa-firmware.img | Bin 720216 -> 732376 bytes
 roms/seabios-hppa |   2 +-
 target/hppa/cpu.h |  11 ++-
 target/hppa/helper.h  |   1 +
 target/hppa/insns.decode  |   2 +-
 target/hppa/int_helper.c  |   2 +-
 target/hppa/mem_helper.c  | 179 --
 target/hppa/op_helper.c   |   3 +-
 target/hppa/translate.c   |  15 +++-
 11 files changed, 188 insertions(+), 43 deletions(-)

-- 
2.41.0

[PULL v2 2/8] target/hppa: Allow up to 16 BTLB entries

2023-09-19 Thread deller

From: Helge Deller 

Reserve 16 out of the 256 TLB entries for Block-TLBs.

Signed-off-by: Helge Deller 
---
 target/hppa/cpu.h | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index fa13694dab..23852d89b2 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -211,8 +211,14 @@ typedef struct CPUArchState {
 target_ureg shadow[7];   /* shadow registers */
 
 /* ??? The number of entries isn't specified by the architecture.  */
+#ifdef TARGET_HPPA64
+#define HPPA_BTLB_FIXED 0   /* BTLBs are not supported in 64-bit 
machines */
+#else
+#define HPPA_BTLB_FIXED 16
+#endif
+#define HPPA_BTLB_VARIABLE  0
 #define HPPA_TLB_ENTRIES256
-#define HPPA_BTLB_ENTRIES   0
+#define HPPA_BTLB_ENTRIES   (HPPA_BTLB_FIXED + HPPA_BTLB_VARIABLE)
 
 /* ??? Implement a unified itlb/dtlb for the moment.  */
 /* ??? We should use a more intelligent data structure.  */
-- 
2.41.0

[PULL v2 3/8] target/hppa: Report and clear BTLBs via fw_cfg at startup

2023-09-19 Thread deller

From: Helge Deller 

Report the new number of TLB entries (without BTLBs) to the
guest and drop reporting of BTLB entries which weren't used at all.

Clear all BTLB and TLB entries at machine reset.

Signed-off-by: Helge Deller 
---
 hw/hppa/machine.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/hppa/machine.c b/hw/hppa/machine.c
index 866e11d208..cf28cb9586 100644
--- a/hw/hppa/machine.c
+++ b/hw/hppa/machine.c
@@ -133,14 +133,10 @@ static FWCfgState *create_fw_cfg(MachineState *ms)
 fw_cfg_add_file(fw_cfg, "/etc/firmware-min-version",
 g_memdup(&val, sizeof(val)), sizeof(val));
 
-val = cpu_to_le64(HPPA_TLB_ENTRIES);
+val = cpu_to_le64(HPPA_TLB_ENTRIES - HPPA_BTLB_ENTRIES);
 fw_cfg_add_file(fw_cfg, "/etc/cpu/tlb_entries",
 g_memdup(&val, sizeof(val)), sizeof(val));
 
-val = cpu_to_le64(HPPA_BTLB_ENTRIES);
-fw_cfg_add_file(fw_cfg, "/etc/cpu/btlb_entries",
-g_memdup(&val, sizeof(val)), sizeof(val));
-
 val = cpu_to_le64(HPA_POWER_BUTTON);
 fw_cfg_add_file(fw_cfg, "/etc/power-button-addr",
 g_memdup(&val, sizeof(val)), sizeof(val));
@@ -433,6 +429,10 @@ static void hppa_machine_reset(MachineState *ms, 
ShutdownCause reason)
 
 cs->exception_index = -1;
 cs->halted = 0;
+
+/* clear any existing TLB and BTLB entries */
+memset(cpu[i]->env.tlb, 0, sizeof(cpu[i]->env.tlb));
+cpu[i]->env.tlb_last = HPPA_BTLB_ENTRIES;
 }
 
 /* already initialized by machine_hppa_init()? */
-- 
2.41.0

[PULL v2 6/8] target/hppa: Wire up diag instruction to support BTLB

2023-09-19 Thread deller

From: Helge Deller 

Wire up the hppa diag instruction to support Block-TLBs
when called with the 0x100 value.

The diag_btlb() helper function does all necessary steps
to emulate the PDC BTLB firmware function, which includes
providing BTLB info, adding a new BTLB, deleting a BTLB
and removing all BTLBs.

Signed-off-by: Helge Deller 
---
 target/hppa/helper.h |  1 +
 target/hppa/mem_helper.c | 92 
 target/hppa/translate.c  | 15 +--
 3 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/target/hppa/helper.h b/target/hppa/helper.h
index c7e35ce8c7..647f043c85 100644
--- a/target/hppa/helper.h
+++ b/target/hppa/helper.h
@@ -95,4 +95,5 @@ DEF_HELPER_FLAGS_2(ptlb, TCG_CALL_NO_RWG, void, env, tl)
 DEF_HELPER_FLAGS_1(ptlbe, TCG_CALL_NO_RWG, void, env)
 DEF_HELPER_FLAGS_2(lpa, TCG_CALL_NO_WG, tr, env, tl)
 DEF_HELPER_FLAGS_1(change_prot_id, TCG_CALL_NO_RWG, void, env)
+DEF_HELPER_1(diag_btlb, void, env)
 #endif
diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c
index ea33b58ddd..520fd311f8 100644
--- a/target/hppa/mem_helper.c
+++ b/target/hppa/mem_helper.c
@@ -412,3 +412,95 @@ int hppa_artype_for_page(CPUHPPAState *env, target_ulong 
vaddr)
 hppa_tlb_entry *ent = hppa_find_tlb(env, vaddr);
 return ent ? ent->ar_type : -1;
 }
+
+/*
+ * diag_btlb() emulates the PDC PDC_BLOCK_TLB firmware call to
+ * allow operating systems to modify the Block TLB (BTLB) entries.
+ * For implementation details see page 1-13 in
+ * https://parisc.wiki.kernel.org/images-parisc/e/ef/Pdc11-v0.96-Ch1-procs.pdf
+ */
+void HELPER(diag_btlb)(CPUHPPAState *env)
+{
+unsigned int phys_page, len, slot;
+int mmu_idx = cpu_mmu_index(env, 0);
+uintptr_t ra = GETPC();
+hppa_tlb_entry *btlb;
+uint64_t virt_page;
+uint32_t *vaddr;
+
+#ifdef TARGET_HPPA64
+/* BTLBs are not supported on 64-bit CPUs */
+env->gr[28] = -1; /* nonexistent procedure */
+return;
+#endif
+env->gr[28] = 0; /* PDC_OK */
+
+switch (env->gr[25]) {
+case 0:
+/* return BTLB parameters */
+qemu_log_mask(CPU_LOG_MMU, "PDC_BLOCK_TLB: PDC_BTLB_INFO\n");
+vaddr = probe_access(env, env->gr[24], 4 * sizeof(target_ulong),
+ MMU_DATA_STORE, mmu_idx, ra);
+if (vaddr == NULL) {
+env->gr[28] = -10; /* invalid argument */
+} else {
+vaddr[0] = cpu_to_be32(1);
+vaddr[1] = cpu_to_be32(16 * 1024);
+vaddr[2] = cpu_to_be32(HPPA_BTLB_FIXED);
+vaddr[3] = cpu_to_be32(HPPA_BTLB_VARIABLE);
+}
+break;
+case 1:
+/* insert BTLB entry */
+virt_page = env->gr[24];/* upper 32 bits */
+virt_page <<= 32;
+virt_page |= env->gr[23];   /* lower 32 bits */
+phys_page = env->gr[22];
+len = env->gr[21];
+slot = env->gr[19];
+qemu_log_mask(CPU_LOG_MMU, "PDC_BLOCK_TLB: PDC_BTLB_INSERT "
+"0x%08llx-0x%08llx: vpage 0x%llx for phys page 0x%04x len %d "
+"into slot %d\n",
+(long long) virt_page << TARGET_PAGE_BITS,
+(long long) (virt_page + len) << TARGET_PAGE_BITS,
+(long long) virt_page, phys_page, len, slot);
+if (slot < HPPA_BTLB_ENTRIES) {
+btlb = &env->tlb[slot];
+/* force flush of possibly existing BTLB entry */
+hppa_flush_tlb_ent(env, btlb, true);
+/* create new BTLB entry */
+btlb->va_b = virt_page << TARGET_PAGE_BITS;
+btlb->va_e = btlb->va_b + len * TARGET_PAGE_SIZE - 1;
+btlb->pa = phys_page << TARGET_PAGE_BITS;
+set_access_bits(env, btlb, env->gr[20]);
+btlb->t = 0;
+btlb->d = 1;
+} else {
+env->gr[28] = -10; /* invalid argument */
+}
+break;
+case 2:
+/* Purge BTLB entry */
+slot = env->gr[22];
+qemu_log_mask(CPU_LOG_MMU, "PDC_BLOCK_TLB: PDC_BTLB_PURGE slot %d\n",
+slot);
+if (slot < HPPA_BTLB_ENTRIES) {
+btlb = &env->tlb[slot];
+hppa_flush_tlb_ent(env, btlb, true);
+} else {
+env->gr[28] = -10; /* invalid argument */
+}
+break;
+case 3:
+/* Purge all BTLB entries */
+qemu_log_mask(CPU_LOG_MMU, "PDC_BLOCK_TLB: PDC_BTLB_PURGE_ALL\n");
+for (slot = 0; slot < HPPA_BTLB_ENTRIES; slot++) {
+btlb = &env->tlb[slot];
+hppa_flush_tlb_ent(env, btlb, true);
+}
+break;
+default:
+env->gr[28] = -2; /* nonexistent option */
+break;
+}
+}
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index c04dc15228..650bbcfe95 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -4042,9 +4042,18 @@ static bool trans_fmpyfadd_d(DisasContext *ctx, 
arg_fmpyfadd_d *a)
 
 static bool

Re: [GIT PULL 00/12] Host Memory Backends and Memory devices queue 2023-09-19

2023-09-19 Thread Stefan Hajnoczi

Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.2 for any 
user-visible changes.


signature.asc
Description: PGP signature

Re: [PULL 0/8] Firmware/edk2 20230918 patches

2023-09-19 Thread Stefan Hajnoczi

Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.2 for any 
user-visible changes.


signature.asc
Description: PGP signature

Re: [PULL v2 00/39] tcg patch queue

2023-09-19 Thread Stefan Hajnoczi

Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.2 for any 
user-visible changes.


signature.asc
Description: PGP signature

Re: [PULL 0/3] ppc queue

2023-09-19 Thread Stefan Hajnoczi

Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.2 for any 
user-visible changes.


signature.asc
Description: PGP signature

Re: [PULL V2 00/17] Net patches

2023-09-19 Thread Stefan Hajnoczi

Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/8.2 for any 
user-visible changes.


signature.asc
Description: PGP signature

Re: [PATCH 00/52] migration/rdma: Error handling fixes

2023-09-19 Thread Fabiano Rosas

Daniel P. Berrangé  writes:

> On Tue, Sep 19, 2023 at 12:49:46PM -0400, Peter Xu wrote:
>> On Mon, Sep 18, 2023 at 04:41:14PM +0200, Markus Armbruster wrote:
>> > Oh dear, where to start.  There's so much wrong, and in pretty obvious
>> > ways.  This code should never have passed review.  I'm refraining from
>> > saying more; see the commit messages instead.
>> > 
>> > Issues remaining after this series include:
>> > 
>> > * Terrible error messages
>> > 
>> > * Some error message cascades remain
>> > 
>> > * There is no written contract for QEMUFileHooks, and the
>> >   responsibility for reporting errors is unclear
>> 
>> Even being removed.. because no one is really extending that..
>> 
>> https://lore.kernel.org/all/20230509120700.78359-1-quint...@redhat.com/#t
>
> One day (in another 5-10 years) I still hope we'll get to
> the point where QEMUFile itself is obsolete :-) Getting
> rid of QEMUFileHooks is a great step in that direction.
> Me finishing an old PoC to bring buffering to QIOChannel
> would be another big step.
>

If you need any help with that let me know. I've been tripping over
QEMUFile weirdness on a daily basis.

Just last week I was looking into restricting the usage of
qemu_file_set_error() to qemu-file.c so we can get rid of this situation
where any piece of code that has a pointer to the QEMUFile can put
whatever it wants in f->last_error* and the rest of the code has to
guess when to call qemu_file_get_error().

*last_error actually stores the first error

Moving all the interesting parts into the channel and removing QEMUFile
would of course be the better solution. Multifd already ignores it
completely, so there's probably more code that could be made generic
after that change.

Also, looking at what people do with iovs in the block layer, it seems
the migration code is a little behind.

> The data rate limiting would be the biggest missing piece
> to enable migration/vmstate logic to directly consume
> a QIOChannel.
>
> Eliminating QEMUFile would help to bring Error **errp
> to all the vmstate codepaths.
>
>> > * There seem to be no tests whatsoever
>> 
>> I always see rdma as "odd fixes" stage.. for a long time.  But maybe I was
>> wrong.
>
> In the MAINTAINERS file RDMA still get classified as formally
> supported under the migration maintainers.  I'm not convinced
> that is an accurate description of its status.  I tend to agree
> with you that it is 'odd fixes' at the very best.
>
> Dave Gilbert had previously speculated about whether we should
> even consider deprecating it on the basis that latest non-RDMA
> migration is too much better than in the past, with multifd
> and zerocopy, that RDMA might not even offer a significant
> enough performance win to justify.
>
>> Copying Zhijian for status of rdma; Zhijian, I saw that you just replied to
>> the hwpoison issue.  Maybe we should have one entry for rdma too, just like
>> colo?
>
> With regards,
> Daniel

Re: [PATCH 00/52] migration/rdma: Error handling fixes

2023-09-19 Thread Daniel P . Berrangé

On Tue, Sep 19, 2023 at 12:49:46PM -0400, Peter Xu wrote:
> On Mon, Sep 18, 2023 at 04:41:14PM +0200, Markus Armbruster wrote:
> > Oh dear, where to start.  There's so much wrong, and in pretty obvious
> > ways.  This code should never have passed review.  I'm refraining from
> > saying more; see the commit messages instead.
> > 
> > Issues remaining after this series include:
> > 
> > * Terrible error messages
> > 
> > * Some error message cascades remain
> > 
> > * There is no written contract for QEMUFileHooks, and the
> >   responsibility for reporting errors is unclear
> 
> Even being removed.. because no one is really extending that..
> 
> https://lore.kernel.org/all/20230509120700.78359-1-quint...@redhat.com/#t

One day (in another 5-10 years) I still hope we'll get to
the point where QEMUFile itself is obsolete :-) Getting
rid of QEMUFileHooks is a great step in that direction.
Me finishing an old PoC to bring buffering to QIOChannel
would be another big step.

The data rate limiting would be the biggest missing piece
to enable migration/vmstate logic to directly consume
a QIOChannel.

Eliminating QEMUFile would help to bring Error **errp
to all the vmstate codepaths.

> > * There seem to be no tests whatsoever
> 
> I always see rdma as "odd fixes" stage.. for a long time.  But maybe I was
> wrong.

In the MAINTAINERS file RDMA still get classified as formally
supported under the migration maintainers.  I'm not convinced
that is an accurate description of its status.  I tend to agree
with you that it is 'odd fixes' at the very best.

Dave Gilbert had previously speculated about whether we should
even consider deprecating it on the basis that latest non-RDMA
migration is too much better than in the past, with multifd
and zerocopy, that RDMA might not even offer a significant
enough performance win to justify.

> Copying Zhijian for status of rdma; Zhijian, I saw that you just replied to
> the hwpoison issue.  Maybe we should have one entry for rdma too, just like
> colo?

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v4 2/5] softmmu: Support concurrent bounce buffers

2023-09-19 Thread Mattias Nissler

On Tue, Sep 19, 2023 at 7:14 PM Peter Xu  wrote:
>
> On Tue, Sep 19, 2023 at 09:08:10AM -0700, Mattias Nissler wrote:
> > @@ -3119,31 +3143,35 @@ void *address_space_map(AddressSpace *as,
> >  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
> >   bool is_write, hwaddr access_len)
> >  {
> > -if (buffer != as->bounce.buffer) {
> > -MemoryRegion *mr;
> > -ram_addr_t addr1;
> > +MemoryRegion *mr;
> > +ram_addr_t addr1;
> > +
> > +mr = memory_region_from_host(buffer, &addr1);
> > +if (mr == NULL) {
> > +BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
> > +assert(bounce->magic == BOUNCE_BUFFER_MAGIC);
> >
> > -mr = memory_region_from_host(buffer, &addr1);
> > -assert(mr != NULL);
> >  if (is_write) {
> > -invalidate_and_set_dirty(mr, addr1, access_len);
> > -}
> > -if (xen_enabled()) {
> > -xen_invalidate_map_cache_entry(buffer);
> > +address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
> > +bounce->buffer, access_len);
> >  }
> > -memory_region_unref(mr);
> > +
> > +memory_region_unref(bounce->mr);
> > +qatomic_sub(&as->bounce_buffer_size, bounce->len);
> > +/* Write bounce_buffer_size before reading map_client_list. */
> > +smp_mb();
> > +address_space_notify_map_clients(as);
> > +bounce->magic = ~BOUNCE_BUFFER_MAGIC;
> > +g_free(bounce);
> >  return;
> >  }
> > +
> > +if (xen_enabled()) {
> > +xen_invalidate_map_cache_entry(buffer);
> > +}
> >  if (is_write) {
> > -address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
> > -as->bounce.buffer, access_len);
> > -}
> > -qemu_vfree(as->bounce.buffer);
> > -as->bounce.buffer = NULL;
> > -memory_region_unref(as->bounce.mr);
>
> This line needs to be kept?

Yes, good catch. Thanks!

>
> > -/* Clear in_use before reading map_client_list.  */
> > -qatomic_set_mb(&as->bounce.in_use, false);
> > -address_space_notify_map_clients(as);
> > +invalidate_and_set_dirty(mr, addr1, access_len);
> > +}
> >  }
>
> --
> Peter Xu
>

Re: npcm7xx_timer-test.c is unreliable

2023-09-19 Thread Hao Wu

>
> Let me take a look at that. I suspect the timer is off by 1 tick due to
> some rounding errors.

Re: [PATCH v23 01/20] CPU topology: extend with s390 specifics

2023-09-19 Thread Nina Schoetterl-Glausch

On Tue, 2023-09-19 at 14:47 +0200, Markus Armbruster wrote:
> Nina Schoetterl-Glausch  writes:
> 
> > From: Pierre Morel 
> > 
> > S390 adds two new SMP levels, drawers and books to the CPU
> > topology.
> > S390 CPUs have specific topology features like dedication and
> > entitlement. These indicate to the guest information on host
> > vCPU scheduling and help the guest make better scheduling decisions.
> > 
> > Let us provide the SMP properties with books and drawers levels
> > and S390 CPU with dedication and entitlement,
> > 
> > Signed-off-by: Pierre Morel 
> > Reviewed-by: Nina Schoetterl-Glausch 
> > Co-developed-by: Nina Schoetterl-Glausch 
> > Signed-off-by: Nina Schoetterl-Glausch 
> > ---
> >  qapi/machine-common.json| 21 +
> >  qapi/machine.json   | 19 ++--
> >  include/hw/boards.h | 10 +-
> >  include/hw/qdev-properties-system.h |  4 +++
> >  target/s390x/cpu.h  |  6 
> >  hw/core/machine-smp.c   | 48 -
> >  hw/core/machine.c   |  4 +++
> >  hw/core/qdev-properties-system.c| 13 
> >  hw/s390x/s390-virtio-ccw.c  |  4 +++
> >  softmmu/vl.c|  6 
> >  target/s390x/cpu.c  |  7 +
> >  qapi/meson.build|  1 +
> >  qemu-options.hx |  7 +++--
> >  13 files changed, 137 insertions(+), 13 deletions(-)
> >  create mode 100644 qapi/machine-common.json
> > 
> > diff --git a/qapi/machine-common.json b/qapi/machine-common.json
> > new file mode 100644
> > index 00..e40421bb37
> > --- /dev/null
> > +++ b/qapi/machine-common.json
> 
> Why do you need a separate QAPI sub-module?

See here 
https://lore.kernel.org/qemu-devel/d8da6f7d1e3addcb63614f548ed77ac1b8895e63.ca...@linux.ibm.com/
> 
> > @@ -0,0 +1,21 @@
> > +# -*- Mode: Python -*-
> > +# vim: filetype=python
> > +#
> > +# This work is licensed under the terms of the GNU GPL, version 2 or later.
> > +# See the COPYING file in the top-level directory.
> > +
> > +##
> > +# = Machines S390 data types
> > +##
> > +
> > +##
> > +# @CpuS390Entitlement:
> > +#
> > +# An enumeration of cpu entitlements that can be assumed by a virtual
> > +# S390 CPU
> > +#
> > +# Since: 8.2
> > +##
> > +{ 'enum': 'CpuS390Entitlement',
> > +  'prefix': 'S390_CPU_ENTITLEMENT',
> > +  'data': [ 'auto', 'low', 'medium', 'high' ] }
> > diff --git a/qapi/machine.json b/qapi/machine.json
> > index a08b6576ca..a63cb951d2 100644
> > --- a/qapi/machine.json
> > +++ b/qapi/machine.json
> > @@ -9,6 +9,7 @@
>##
># = Machines
> >  ##
> >  
> >  { 'include': 'common.json' }
> > +{ 'include': 'machine-common.json' }
> 
> Section structure is borked :)
> 
> Existing section "Machine" now ends at the new "Machines S390 data
> types" you pull in here.  The contents of below moves from "Machines" to
> "Machines S390 data types".
> 
> Before I explain how to avoid this, I'd like to understand why we need a
> new sub-module.
> 
> >  
> >  ##
> >  # @SysEmuTarget:
> > @@ -71,7 +72,7 @@
>##
># @CpuInfoFast:
>#
># Information about a virtual CPU
>#
># @cpu-index: index of the virtual CPU
>#
># @qom-path: path to the CPU object in the QOM tree
> >  #
> >  # @thread-id: ID of the underlying host thread
> >  #
> > -# @props: properties describing to which node/socket/core/thread
> > +# @props: properties describing to which 
> > node/drawer/book/socket/core/thread
> >  # virtual CPU belongs to, provided if supported by board
> 
> Is this description accurate?

Kinda, although the wording might not be the best.
All the CpuInstanceProperties fields are optional, it's like a superset of 
possible
properties across architectures.
Only a subset might be returned by query-cpus-fast.
Also die and cluster are missing.
> 
> @props is of type CpuInstanceProperties, shown below.  Its documentation
> describes it as "properties to be used for hotplugging a CPU instance,
> it should be passed by management with device_add command when a CPU is
> being hotplugged."  Hmm.
> 
> I figure details ("node/drawer/book/socket/core/thread") are better left
> to CpuInstanceProperties.
> 
> The "provided if supported by board" part makes no sense to me.  If
> @props is there, it lists the properties we need to provide with
> device_add.  What if it's not there?  Same as empty list, i.e. we don't
> need to provide properties with device_add?

There are default values/default logic.
For s390x, socket, book, drawer are calculated from the core id
if not provided with device_add.
Partial specifications are rejected.

> 
> Not your patch's fault, but let's get this in shape if we can.
> 
> >  #
> >  # @target: the QEMU system emulation target, which determines which
> > @@ -901,7 +902,11 @@
> >  #
> >  # @node-id: NUMA node ID the CPU belongs to
> >  #
> > -# @socket-id: socket number within node/board the CPU belongs to
> > +# @drawer-id:

npcm7xx_timer-test.c is unreliable

2023-09-19 Thread Stefan Hajnoczi

Hi,
Sometimes npcm7xx_timer-test fails intermittently: 
https://gitlab.com/qemu-project/qemu/-/jobs/5121787250

38/96 qemu:qtest+qtest-arm / qtest-arm/npcm7xx_timer-test   ERROR   
0.95s   exit status 1
>>> QTEST_QEMU_BINARY=./qemu-system-arm 
>>> QTEST_QEMU_STORAGE_DAEMON_BINARY=./storage-daemon/qemu-storage-daemon 
>>> G_TEST_DBUS_DAEMON=/builds/qemu-project/qemu/tests/dbus-vmstate-daemon.sh 
>>> QTEST_QEMU_IMG=./qemu-img MALLOC_PERTURB_=103 
>>> /builds/qemu-project/qemu/build/tests/qtest/npcm7xx_timer-test --tap -k
― ✀  ―
stderr:
**
ERROR:../tests/qtest/npcm7xx_timer-test.c:475:test_periodic_interrupt: 
assertion failed (tim_read(td, TISR) == tim_timer_bit(td)): (0x == 
0x0004)
**
ERROR:../tests/qtest/npcm7xx_timer-test.c:476:test_periodic_interrupt: 
'qtest_get_irq(global_qtest, tim_timer_irq(td))' should be TRUE
(test program exited with status code 1)
――

When I reran the CI job, it passed.

Please investigate why this test is unreliable and fix it. Thanks!

There is a GitLab Issue to track this here:
https://gitlab.com/qemu-project/qemu/-/issues/1897

Stefan


signature.asc
Description: PGP signature

Concerns regarding e17bebd049 ("dump: Set correct vaddr for ELF dump")

2023-09-19 Thread Stephen Brennan

Hello all,

I've started working on better support and documentation around
hypervisor vmcores in the Drgn debugger[1]. Of course there's quite a
lot of different implementations out there, but recently I'm looking at
Qemu kdump and ELF vmcores generated via dump-guest-memory, and one
thing caught my eye. I generated an ELF vmcore without the paging option
enabled, and without the guest note loaded, and the resulting core
dump's program header looked like this:

$ eu-readelf -l dumpfile2
Program Headers:
  Type   Offset   VirtAddr   PhysAddr   FileSiz  MemSiz 
  Flg Align
  NOTE   0x000168 0x 0x 0x001980 
0x001980 0x0
  LOAD   0x001ae8 0x 0x 0x8000 
0x8000 0x0
  LOAD   0x80001ae8 0xfffc 0xfffc 0x04 
0x04 0x0

In particular, the "VirtAddr" field for the loadable segment shows a
confusing address - it appears to reuse the segment's physical address,
despite the fact that there's no actual corresponding mapping.

By comparison, the /proc/kcore and /proc/vmcore ELF vmcores use the
VirtAddr in the program header to represent the real virtual memory
mappings in use by the kernel. Debuggers can directly use these without
needing to walk page tables. If there is no virtual memory mapping
information available, I would have expected a placeholder value such as
... or ... to take the place of VirtAddr here so a debugger can
detect the lack of virtual mappings and know that it needs to use
architecture-specific details (and the vmcoreinfo) to find the page
tables and accurately determine memory mappings. As it is, this program
header seems to advertise to a debugger, "yes, we have the virtual
memory mappings" when in fact, that's not the case.

It seems that this behavior was introduced in e17bebd049 ("dump: Set
correct vaddr for ELF dump")[2], a small commit I'll reproduce below.
The justification seems to be that it fixes an issue reading the vmcore
with GDB, but I wonder if that's not a GDB bug which should have been
fixed with them? If GDB aims to support ELF kernel core dumps,
presumably it should be handling physical addresses separately from
virtual addresses. And if GDB doesn't aim for this, but you'd like to
con it into reading your core dump, presumably the onus is on you to
edit the ELF VirtAddr field to suit your needs? It should be QEMU's
primary goal to produce a *correct* vmcore, not work around limitations
or bugs in GDB.

I'd like to propose reverting this, since it makes it impossible to
interpret QEMU ELF vmcores, unless you discard all the virtual addresses
in the program headers, and unconditionally do all the page table walks
yourself. But I wanted to see if there was some justification for this
behavior that I missed.

Thanks,
Stephen

[1]: https://github.com/osandov/drgn
[2]: https://lore.kernel.org/qemu-devel/20181225125344.4482-1-ari...@gmail.com/

---

commit e17bebd049d78f489c2cff755e2b66a0536a156e
Author: Jon Doron 
Date:   Wed Jan 9 10:22:03 2019 +0200

dump: Set correct vaddr for ELF dump

vaddr needs to be equal to the paddr since the dump file represents the
physical memory image.

Without setting vaddr correctly, GDB would load all the different memory
regions on top of each other to vaddr 0, thus making GDB showing the wrong
memory data for a given address.

Signed-off-by: Jon Doron 
Message-Id: <20190109082203.27142-1-ari...@gmail.com>
Reviewed-by: Marc-André Lureau 
Tested-by: Marc-André Lureau 
Acked-by: Laszlo Ersek 

diff --git a/dump.c b/dump.c
index ef1d8025c9..107a67165a 100644
--- a/dump.c
+++ b/dump.c
@@ -192,7 +192,7 @@ static void write_elf64_load(DumpState *s, MemoryMapping 
*memory_mapping,
 phdr.p_paddr = cpu_to_dump64(s, memory_mapping->phys_addr);
 phdr.p_filesz = cpu_to_dump64(s, filesz);
 phdr.p_memsz = cpu_to_dump64(s, memory_mapping->length);
-phdr.p_vaddr = cpu_to_dump64(s, memory_mapping->virt_addr);
+phdr.p_vaddr = cpu_to_dump64(s, memory_mapping->virt_addr) ?: phdr.p_paddr;
 
 assert(memory_mapping->length >= filesz);
 
@@ -216,7 +216,8 @@ static void write_elf32_load(DumpState *s, MemoryMapping 
*memory_mapping,
 phdr.p_paddr = cpu_to_dump32(s, memory_mapping->phys_addr);
 phdr.p_filesz = cpu_to_dump32(s, filesz);
 phdr.p_memsz = cpu_to_dump32(s, memory_mapping->length);
-phdr.p_vaddr = cpu_to_dump32(s, memory_mapping->virt_addr);
+phdr.p_vaddr =
+cpu_to_dump32(s, memory_mapping->virt_addr) ?: phdr.p_paddr;
 
 assert(memory_mapping->length >= filesz);
 
diff --git a/scripts/dump-guest-memory.py b/scripts/dump-guest-memory.py
index 198cd0fe40..2c587cbefc 100644
--- a/scripts/dump-guest-memory.py
+++ b/scripts/dump-guest-memory.py
@@ -163,6 +163,7 @@ def add_segment(self, p_type, p_paddr, p_size):
 phdr = get_arch_phdr(self.endianness, self.elfclass)
 phdr.p_type =

Re: [PULL 00/28] Block layer patches

2023-09-19 Thread Stefan Hajnoczi

On Tue, 19 Sept 2023 at 06:26, Kevin Wolf  wrote:
>
> Am 18.09.2023 um 20:56 hat Stefan Hajnoczi geschrieben:
> > Hi Kevin,
> > I believe that my own commit "block-coroutine-wrapper: use
> > qemu_get_current_aio_context()" breaks this test. The failure is
> > non-deterministic (happens about 1 out of 4 runs).
> >
> > It seems the job hangs and the test times out in vm.run_job('job1', 
> > wait=5.0).
> >
> > I haven't debugged it yet but wanted to share this information to save
> > some time. Tomorrow I'll investigate further.
>
> Yes, it's relatively easily reproducible if I run the test in a loop,
> and I can't seem to reproduce it without the last patch. Should I
> unstage the full series again, or do you think that the last patch is
> really optional this time?

Please drop the last patch. I'm not aware of dependencies on the last patch.

> However, I'm unsure how the stack traces I'm seeing are related to your
> patch. Maybe it just made an existing bug more likely to be triggered?

I'll share my thoughts once I've looked at the crash today.

Regarding AioContext lock removal: I'll work on that and see what
still depends on the lock.

Stefan

> What I'm seeing is that the reader lock is held by an iothread that is
> waiting for its AioContext lock to make progress:
>
> Thread 3 (Thread 0x7f811e9346c0 (LWP 26390) "qemu-system-x86"):
> #0  0x7f81250aaf80 in __lll_lock_wait () at /lib64/libc.so.6
> #1  0x7f81250b149a in pthread_mutex_lock@@GLIBC_2.2.5 () at 
> /lib64/libc.so.6
> #2  0x55b7b170967e in qemu_mutex_lock_impl (mutex=0x55b7b34e3080, 
> file=0x55b7b199e1f7 "../util/async.c", line=728) at 
> ../util/qemu-thread-posix.c:94
> #3  0x55b7b1709953 in qemu_rec_mutex_lock_impl (mutex=0x55b7b34e3080, 
> file=0x55b7b199e1f7 "../util/async.c", line=728) at 
> ../util/qemu-thread-posix.c:149
> #4  0x55b7b1728318 in aio_context_acquire (ctx=0x55b7b34e3020) at 
> ../util/async.c:728
> #5  0x55b7b1727c49 in co_schedule_bh_cb (opaque=0x55b7b34e3020) at 
> ../util/async.c:565
> #6  0x55b7b1726f1c in aio_bh_call (bh=0x55b7b34e2e70) at 
> ../util/async.c:169
> #7  0x55b7b17270ee in aio_bh_poll (ctx=0x55b7b34e3020) at 
> ../util/async.c:216
> #8  0x55b7b170351d in aio_poll (ctx=0x55b7b34e3020, blocking=true) at 
> ../util/aio-posix.c:722
> #9  0x55b7b1518604 in iothread_run (opaque=0x55b7b2904460) at 
> ../iothread.c:63
> #10 0x55b7b170a955 in qemu_thread_start (args=0x55b7b34e36b0) at 
> ../util/qemu-thread-posix.c:541
> #11 0x7f81250ae15d in start_thread () at /lib64/libc.so.6
> #12 0x7f812512fc00 in clone3 () at /lib64/libc.so.6
>
> On the other hand, the main thread wants to acquire the writer lock,
> but it holds the AioContext lock of the iothread (it takes it in
> job_prepare_locked()):
>
> Thread 1 (Thread 0x7f811f4b7b00 (LWP 26388) "qemu-system-x86"):
> #0  0x7f8125122356 in ppoll () at /lib64/libc.so.6
> #1  0x55b7b172eae0 in qemu_poll_ns (fds=0x55b7b34ec910, nfds=1, 
> timeout=-1) at ../util/qemu-timer.c:339
> #2  0x55b7b1704ebd in fdmon_poll_wait (ctx=0x55b7b3269210, 
> ready_list=0x7ffc90b05680, timeout=-1) at ../util/fdmon-poll.c:79
> #3  0x55b7b1703284 in aio_poll (ctx=0x55b7b3269210, blocking=true) at 
> ../util/aio-posix.c:670
> #4  0x55b7b1567c3b in bdrv_graph_wrlock (bs=0x0) at 
> ../block/graph-lock.c:145
> #5  0x55b7b1554c1c in blk_remove_bs (blk=0x55b7b4425800) at 
> ../block/block-backend.c:916
> #6  0x55b7b1554779 in blk_delete (blk=0x55b7b4425800) at 
> ../block/block-backend.c:497
> #7  0x55b7b1554133 in blk_unref (blk=0x55b7b4425800) at 
> ../block/block-backend.c:557
> #8  0x55b7b157a149 in mirror_exit_common (job=0x55b7b4419000) at 
> ../block/mirror.c:696
> #9  0x55b7b1577015 in mirror_prepare (job=0x55b7b4419000) at 
> ../block/mirror.c:807
> #10 0x55b7b153a1a7 in job_prepare_locked (job=0x55b7b4419000) at 
> ../job.c:988
> #11 0x55b7b153a0d9 in job_txn_apply_locked (job=0x55b7b4419000, 
> fn=0x55b7b153a110 ) at ../job.c:191
> #12 0x55b7b1538b6d in job_do_finalize_locked (job=0x55b7b4419000) at 
> ../job.c:1011
> #13 0x55b7b153a886 in job_completed_txn_success_locked 
> (job=0x55b7b4419000) at ../job.c:1068
> #14 0x55b7b1539372 in job_completed_locked (job=0x55b7b4419000) at 
> ../job.c:1082
> #15 0x55b7b153a71b in job_exit (opaque=0x55b7b4419000) at ../job.c:1103
> #16 0x55b7b1726f1c in aio_bh_call (bh=0x7f8110005470) at 
> ../util/async.c:169
> #17 0x55b7b17270ee in aio_bh_poll (ctx=0x55b7b3269210) at 
> ../util/async.c:216
> #18 0x55b7b1702c05 in aio_dispatch (ctx=0x55b7b3269210) at 
> ../util/aio-posix.c:423
> #19 0x55b7b1728a14 in aio_ctx_dispatch (source=0x55b7b3269210, 
> callback=0x0, user_data=0x0) at ../util/async.c:358
> #20 0x7f8126c31c7f in g_main_dispatch (context=0x55b7b3269720) at 
> ../glib/gmain.c:3454
> #21 g_main_context_dispatch (context=0x55b7b3269720) at ../glib/gmain.c:4172
> #22 0x55b7b1729c98 in glib_pollfds_poll () at

Re: [PATCH v1 13/22] vfio: Add base container

2023-09-19 Thread Cédric Le Goater


On 8/30/23 12:37, Zhenzhong Duan wrote:

From: Yi Liu 

Abstract the VFIOContainer to be a base object. It is supposed to be
embedded by legacy VFIO container and later on, into the new iommufd
based container.

The base container implements generic code such as code related to
memory_listener and address space management. The VFIOContainerOps
implements callbacks that depend on the kernel user space being used.

'common.c' and vfio device code only manipulates the base container with
wrapper functions that calls the functions defined in VFIOContainerOpsClass.
Existing 'container.c' code is converted to implement the legacy container
ops functions.

Below is the base container. It's named as VFIOContainer, old VFIOContainer
is replaced with VFIOLegacyContainer.


Usually, we introduce the new interface solely, port the current models
on top of the new interface, wire the new models in the current
implementation and remove the old implementation. Then, we can start
adding extensions to support other implementations.

spapr should be taken care of separately following the principle above.
With my PPC hat, I would not even read such a massive change, too risky
for the subsystem. This path will need (much) further splitting to be
understandable and acceptable.

Also, please include the .h file first, it helps in reading. Have you
considered using an InterfaceClass ?

Thanks,

C.



struct VFIOContainer {
 VFIOIOMMUBackendOpsClass *ops;
 VFIOAddressSpace *space;
 MemoryListener listener;
 Error *error;
 bool initialized;
 bool dirty_pages_supported;
 uint64_t dirty_pgsizes;
 uint64_t max_dirty_bitmap_size;
 unsigned long pgsizes;
 unsigned int dma_max_mappings;
 QLIST_HEAD(, VFIOGuestIOMMU) giommu_list;
 QLIST_HEAD(, VFIOHostDMAWindow) hostwin_list;
 QLIST_HEAD(, VFIORamDiscardListener) vrdl_list;
 QLIST_ENTRY(VFIOContainer) next;
};

struct VFIOLegacyContainer {
 VFIOContainer bcontainer;
 int fd; /* /dev/vfio/vfio, empowered by the attached groups */
 MemoryListener prereg_listener;
 unsigned iommu_type;
 QLIST_HEAD(, VFIOGroup) group_list;
};

Co-authored-by: Eric Auger 
Signed-off-by: Eric Auger 
Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
  hw/vfio/common.c  |  72 +---
  hw/vfio/container-base.c  | 160 +
  hw/vfio/container.c   | 247 --
  hw/vfio/meson.build   |   1 +
  hw/vfio/spapr.c   |  22 +--
  hw/vfio/trace-events  |   4 +-
  include/hw/vfio/vfio-common.h |  85 ++---
  include/hw/vfio/vfio-container-base.h | 155 
  8 files changed, 540 insertions(+), 206 deletions(-)
  create mode 100644 hw/vfio/container-base.c
  create mode 100644 include/hw/vfio/vfio-container-base.h

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 044710fc1f..86b6af5740 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -379,19 +379,20 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, 
IOMMUTLBEntry *iotlb)
   * of vaddr will always be there, even if the memory object is
   * destroyed and its backing memory munmap-ed.
   */
-ret = vfio_dma_map(container, iova,
-   iotlb->addr_mask + 1, vaddr,
-   read_only);
+ret = vfio_container_dma_map(container, iova,
+ iotlb->addr_mask + 1, vaddr,
+ read_only);
  if (ret) {
-error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
+error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
   "0x%"HWADDR_PRIx", %p) = %d (%s)",
   container, iova,
   iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
  }
  } else {
-ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
+ret = vfio_container_dma_unmap(container, iova,
+   iotlb->addr_mask + 1, iotlb);
  if (ret) {
-error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
+error_report("vfio_container_dma_unmap(%p, 0x%"HWADDR_PRIx", "
   "0x%"HWADDR_PRIx") = %d (%s)",
   container, iova,
   iotlb->addr_mask + 1, ret, strerror(-ret));
@@ -407,14 +408,15 @@ static void 
vfio_ram_discard_notify_discard(RamDiscardListener *rdl,
  {
  VFIORamDiscardListener *vrdl = container_of(rdl, VFIORamDiscardListener,
  listener);
+VFIOContainer *container = vrdl->container;
  const hwaddr size = int128_get64(section->size);
  const hwaddr iova = section->offset_within_address_space;
  int ret;
  
  /* Unmap with a single call. */

-ret =

Re: [PATCH v2 12/12] vfio: Remove 64-bit IOVA address space assumption

2023-09-19 Thread Alex Williamson

On Wed, 13 Sep 2023 10:01:47 +0200
Eric Auger  wrote:

> Now that we retrieve the usable IOVA ranges from the host,
> we know the physical IOMMU aperture and we can remove
> the assumption of 64b IOVA space when calling
> vfio_host_win_add().
> 
> This works fine in general but in case of an IOMMU memory
> region this becomes more tricky. For instance the virtio-iommu
> MR has a 64b aperture by default. If the physical IOMMU has a
> smaller aperture (typically the case for VTD), this means we
> would need to resize the IOMMU MR when this latter is linked
> to a container. However this happens on vfio_listener_region_add()
> when calling the IOMMU MR set_iova_ranges() callback and this
> would mean we would have a recursive call to
> vfio_listener_region_add(). This looks like a wrong usage of
> the memory API causing duplicate IOMMU MR notifier registration
> for instance.
> 
> Until we find a better solution, make sure the vfio_find_hostwin()
> is not called anymore for IOMMU region.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> I have not found any working solution to the IOMMU MR resizing.
> So I can remove this patch or remove the check for IOMMU MR. Maybe
> this is an issue which can be handled separately?
> ---
>  hw/vfio/common.c | 25 -
>  1 file changed, 12 insertions(+), 13 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index 26da38de05..40cac1ca91 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -1112,13 +1112,6 @@ static void vfio_listener_region_add(MemoryListener 
> *listener,
>  #endif
>  }
>  
> -hostwin = vfio_find_hostwin(container, iova, end);
> -if (!hostwin) {
> -error_setg(, "Container %p can't map guest IOVA region"
> -   " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, 
> end);
> -goto fail;
> -}
> -
>  memory_region_ref(section->mr);
>  
>  if (memory_region_is_iommu(section->mr)) {
> @@ -1177,6 +1170,14 @@ static void vfio_listener_region_add(MemoryListener 
> *listener,
>  return;
>  }
>  
> +hostwin = vfio_find_hostwin(container, iova, end);
> +if (!hostwin) {
> +error_setg(, "Container %p can't map guest IOVA region"
> +   " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, 
> end);
> +goto fail;
> +}
> +
> +
>  /* Here we assume that memory_region_is_ram(section->mr)==true */
>  
>  /*
> @@ -2594,12 +2595,10 @@ static int vfio_connect_container(VFIOGroup *group, 
> AddressSpace *as,
>  vfio_get_iommu_info_migration(container, info);
>  g_free(info);
>  
> -/*
> - * FIXME: We should parse VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE
> - * information to get the actual window extent rather than assume
> - * a 64-bit IOVA address space.
> - */
> -vfio_host_win_add(container, 0, (hwaddr)-1, container->pgsizes);
> +g_assert(container->nr_iovas);

This assert is a problem for older kernels.

> +vfio_host_win_add(container, 0,
> +  container->iova_ranges[container->nr_iovas - 
> 1].end,
> +  container->pgsizes);

This doesn't address the assumption about the min_iova and adds an
assumption that the kernel provided list is sorted.  Thanks,

Alex

Re: [PULL 0/8] Hppa btlb patches

2023-09-19 Thread Stefan Hajnoczi

Please take a look at the following CI failure and resend when you
have fixed the error:

mipsel-linux-gnu-gcc -Ilibqemu-hppa-softmmu.fa.p -I. -I..
-Itarget/hppa -I../target/hppa -Iqapi -Itrace -Iui -Iui/shader
-I/usr/include/pixman-1 -I/usr/include/capstone
-I/usr/include/spice-server -I/usr/include/spice-1
-I/usr/include/glib-2.0 -I/usr/lib/mipsel-linux-gnu/glib-2.0/include
-fdiagnostics-color=auto -Wall -Winvalid-pch -Werror -std=gnu11 -O2 -g
-fstack-protector-strong -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -Wundef
-Wwrite-strings -Wmissing-prototypes -Wstrict-prototypes
-Wredundant-decls -Wold-style-declaration -Wold-style-definition
-Wtype-limits -Wformat-security -Wformat-y2k -Winit-self
-Wignored-qualifiers -Wempty-body -Wnested-externs -Wendif-labels
-Wexpansion-to-defined -Wimplicit-fallthrough=2
-Wmissing-format-attribute -Wno-missing-include-dirs
-Wno-shift-negative-value -Wno-psabi -isystem
/builds/qemu-project/qemu/linux-headers -isystem linux-headers -iquote
. -iquote /builds/qemu-project/qemu -iquote
/builds/qemu-project/qemu/include -iquote
/builds/qemu-project/qemu/host/include/generic -iquote
/builds/qemu-project/qemu/tcg/mips -pthread -D_GNU_SOURCE
-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -fno-strict-aliasing
-fno-common -fwrapv -fPIE -isystem../linux-headers
-isystemlinux-headers -DNEED_CPU_H
'-DCONFIG_TARGET="hppa-softmmu-config-target.h"'
'-DCONFIG_DEVICES="hppa-softmmu-config-devices.h"' -MD -MQ
libqemu-hppa-softmmu.fa.p/target_hppa_mem_helper.c.o -MF
libqemu-hppa-softmmu.fa.p/target_hppa_mem_helper.c.o.d -o
libqemu-hppa-softmmu.fa.p/target_hppa_mem_helper.c.o -c
../target/hppa/mem_helper.c
In file included from ../target/hppa/mem_helper.c:21:
../target/hppa/mem_helper.c: In function ‘helper_diag_btlb’:
../target/hppa/mem_helper.c:461:36: error: format ‘%lx’ expects
argument of type ‘long unsigned int’, but argument 4 has type
‘uint64_t’ {aka ‘long long unsigned int’} [-Werror=format=]
461 | qemu_log_mask(CPU_LOG_MMU, "PDC_BLOCK_TLB: PDC_BTLB_INSERT "
| ^
..
466 | virt_page, phys_page, len, slot);
| ~
| |
| uint64_t {aka long long unsigned int}
../include/qemu/log.h:55:22: note: in definition of macro ‘qemu_log_mask’
55 | qemu_log(FMT, ## __VA_ARGS__); \
| ^~~
cc1: all warnings being treated as errors

Thanks,
Stefan

On Sat, 16 Sept 2023 at 15:33,  wrote:
>
> From: Helge Deller 
>
> The following changes since commit 9ef497755afc252fb8e060c9ea6b0987abfd20b6:
>
>   Merge tag 'pull-vfio-20230911' of https://github.com/legoater/qemu into 
> staging (2023-09-11 09:13:08 -0400)
>
> are available in the Git repository at:
>
>   https://github.com/hdeller/qemu-hppa.git tags/hppa-btlb-pull-request
>
> for you to fetch changes up to 303b1febe3dcd519314d6ed80d97a706cdd21f64:
>
>   linux-user/hppa: lock both words of function descriptor (2023-09-16 
> 21:13:08 +0200)
>
> 
> Block-TLB support and linux-user fixes for hppa target
>
> All 32-bit hppa CPUs allow a fixed number of TLB entries to have a
> different page size than the default 4k.
> Those are called "Block-TLBs" and are created at startup by the
> operating system and managed by the firmware of hppa machines
> through the firmware PDC_BLOCK_TLB call.
>
> This patchset adds the necessary glue to SeaBIOS-hppa and
> qemu to allow up to 16 BTLB entries in the emulation.
>
> Two patches from Mikulas Patocka fix signal delivery issues
> in linux-user on hppa.
>
> 
>
> Helge Deller (6):
>   target/hppa: Update to SeaBIOS-hppa version 9
>   target/hppa: Allow up to 16 BTLB entries
>   target/hppa: Report and clear BTLBs via fw_cfg at startup
>   target/hppa: Add BTLB support to hppa TLB functions
>   target/hppa: Extract diagnose immediate value
>   target/hppa: Wire up diag instruction to support BTLB
>
> Mikulas Patocka (2):
>   linux-user/hppa: clear the PSW 'N' bit when delivering signals
>   linux-user/hppa: lock both words of function descriptor
>
>  hw/hppa/machine.c |  10 +--
>  linux-user/hppa/signal.c  |   6 +-
>  pc-bios/hppa-firmware.img | Bin 720216 -> 732376 bytes
>  roms/seabios-hppa |   2 +-
>  target/hppa/cpu.h |  11 ++-
>  target/hppa/helper.h  |   1 +
>  target/hppa/insns.decode  |   2 +-
>  target/hppa/int_helper.c  |   2 +-
>  target/hppa/mem_helper.c  | 179 --
>  target/hppa/op_helper.c   |   3 +-
>  target/hppa/translate.c   |  15 +++-
>  11 files changed, 188 insertions(+), 43 deletions(-)
>
> --
> 2.41.0
>
>

Re: [PATCH v4 2/5] softmmu: Support concurrent bounce buffers

2023-09-19 Thread Peter Xu

On Tue, Sep 19, 2023 at 09:08:10AM -0700, Mattias Nissler wrote:
> @@ -3119,31 +3143,35 @@ void *address_space_map(AddressSpace *as,
>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>   bool is_write, hwaddr access_len)
>  {
> -if (buffer != as->bounce.buffer) {
> -MemoryRegion *mr;
> -ram_addr_t addr1;
> +MemoryRegion *mr;
> +ram_addr_t addr1;
> +
> +mr = memory_region_from_host(buffer, );
> +if (mr == NULL) {
> +BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
> +assert(bounce->magic == BOUNCE_BUFFER_MAGIC);
>  
> -mr = memory_region_from_host(buffer, );
> -assert(mr != NULL);
>  if (is_write) {
> -invalidate_and_set_dirty(mr, addr1, access_len);
> -}
> -if (xen_enabled()) {
> -xen_invalidate_map_cache_entry(buffer);
> +address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
> +bounce->buffer, access_len);
>  }
> -memory_region_unref(mr);
> +
> +memory_region_unref(bounce->mr);
> +qatomic_sub(>bounce_buffer_size, bounce->len);
> +/* Write bounce_buffer_size before reading map_client_list. */
> +smp_mb();
> +address_space_notify_map_clients(as);
> +bounce->magic = ~BOUNCE_BUFFER_MAGIC;
> +g_free(bounce);
>  return;
>  }
> +
> +if (xen_enabled()) {
> +xen_invalidate_map_cache_entry(buffer);
> +}
>  if (is_write) {
> -address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
> -as->bounce.buffer, access_len);
> -}
> -qemu_vfree(as->bounce.buffer);
> -as->bounce.buffer = NULL;
> -memory_region_unref(as->bounce.mr);

This line needs to be kept?

> -/* Clear in_use before reading map_client_list.  */
> -qatomic_set_mb(>bounce.in_use, false);
> -address_space_notify_map_clients(as);
> +invalidate_and_set_dirty(mr, addr1, access_len);
> +}
>  }

-- 
Peter Xu

Re: [PATCH v4 1/5] softmmu: Per-AddressSpace bounce buffering

2023-09-19 Thread Peter Xu

On Tue, Sep 19, 2023 at 09:08:09AM -0700, Mattias Nissler wrote:
> Instead of using a single global bounce buffer, give each AddressSpace
> its own bounce buffer. The MapClient callback mechanism moves to
> AddressSpace accordingly.
> 
> This is in preparation for generalizing bounce buffer handling further
> to allow multiple bounce buffers, with a total allocation limit
> configured per AddressSpace.
> 
> Signed-off-by: Mattias Nissler 

Reviewed-by: Peter Xu 

-- 
Peter Xu

Re: [PATCH v1 15/22] Add iommufd configure option

2023-09-19 Thread Cédric Le Goater


On 8/30/23 12:37, Zhenzhong Duan wrote:

This adds "--enable-iommufd/--disable-iommufd" to enable or disable
iommufd support, enabled by default.


Why would someone want to disable support at compile time ? It might
have been useful for dev but now QEMU should self-adjust at runtime
depending only on the host capabilities AFAIUI. Am I missing something ?

Thanks,

C.




Signed-off-by: Zhenzhong Duan 
---
  meson.build   | 6 ++
  meson_options.txt | 2 ++
  scripts/meson-buildoptions.sh | 3 +++
  3 files changed, 11 insertions(+)

diff --git a/meson.build b/meson.build
index 98e68ef0b1..6526d8cc9b 100644
--- a/meson.build
+++ b/meson.build
@@ -574,6 +574,10 @@ have_tpm = get_option('tpm') \
.require(targetos != 'windows', error_message: 'TPM emulation only 
available on POSIX systems') \
.allowed()
  
+have_iommufd = get_option('iommufd') \

+  .require(targetos == 'linux', error_message: 'iommufd is supported only on 
Linux') \
+  .allowed()
+
  # vhost
  have_vhost_user = get_option('vhost_user') \
.disable_auto_if(targetos != 'linux') \
@@ -2129,6 +2133,7 @@ endif
  config_host_data.set('CONFIG_SNAPPY', snappy.found())
  config_host_data.set('CONFIG_TPM', have_tpm)
  config_host_data.set('CONFIG_TSAN', get_option('tsan'))
+config_host_data.set('CONFIG_IOMMUFD', have_iommufd)
  config_host_data.set('CONFIG_USB_LIBUSB', libusb.found())
  config_host_data.set('CONFIG_VDE', vde.found())
  config_host_data.set('CONFIG_VHOST_NET', have_vhost_net)
@@ -4051,6 +4056,7 @@ summary_info += {'vhost-user-crypto support': 
have_vhost_user_crypto}
  summary_info += {'vhost-user-blk server support': have_vhost_user_blk_server}
  summary_info += {'vhost-vdpa support': have_vhost_vdpa}
  summary_info += {'build guest agent': have_ga}
+summary_info += {'iommufd support': have_iommufd}
  summary(summary_info, bool_yn: true, section: 'Configurable features')
  
  # Compilation information

diff --git a/meson_options.txt b/meson_options.txt
index aaea5ddd77..aed91d173b 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -105,6 +105,8 @@ option('dbus_display', type: 'feature', value: 'auto',
 description: '-display dbus support')
  option('tpm', type : 'feature', value : 'auto',
 description: 'TPM support')
+option('iommufd', type : 'feature', value : 'auto',
+   description: 'iommufd support')
  
  # Do not enable it by default even for Mingw32, because it doesn't

  # work on Wine.
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 9da3fe299b..719401ffb0 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -113,6 +113,7 @@ meson_options_help() {
printf "%s\n" '  hax HAX acceleration support'
printf "%s\n" '  hvf HVF acceleration support'
printf "%s\n" '  iconv   Font glyph conversion support'
+  printf "%s\n" '  iommufd iommufd support'
printf "%s\n" '  jackJACK sound support'
printf "%s\n" '  keyring Linux keyring support'
printf "%s\n" '  kvm KVM acceleration support'
@@ -325,6 +326,8 @@ _meson_option_parse() {
  --enable-install-blobs) printf "%s" -Dinstall_blobs=true ;;
  --disable-install-blobs) printf "%s" -Dinstall_blobs=false ;;
  --interp-prefix=*) quote_sh "-Dinterp_prefix=$2" ;;
+--enable-iommufd) printf "%s" -Diommufd=enabled ;;
+--disable-iommufd) printf "%s" -Diommufd=disabled ;;
  --enable-jack) printf "%s" -Djack=enabled ;;
  --disable-jack) printf "%s" -Djack=disabled ;;
  --enable-keyring) printf "%s" -Dkeyring=enabled ;;

[PATCH v3 1/8] qemu-img: rebase: stop when reaching EOF of old backing file

2023-09-19 Thread Andrey Drobyshev via

In case when we're rebasing within one backing chain, and when target image
is larger than old backing file, bdrv_is_allocated_above() ends up setting
*pnum = 0.  As a result, target offset isn't getting incremented, and we
get stuck in an infinite for loop.  Let's detect this case and proceed
further down the loop body, as the offsets beyond the old backing size need
to be explicitly zeroed.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Denis V. Lunev 
Reviewed-by: Hanna Czenczek 
---
 qemu-img.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index a48edb7101..50660ba920 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3805,6 +3805,8 @@ static int img_rebase(int argc, char **argv)
 }
 
 if (prefix_chain_bs) {
+uint64_t bytes = n;
+
 /*
  * If cluster wasn't changed since prefix_chain, we don't need
  * to take action
@@ -3817,9 +3819,18 @@ static int img_rebase(int argc, char **argv)
  strerror(-ret));
 goto out;
 }
-if (!ret) {
+if (!ret && n) {
 continue;
 }
+if (!n) {
+/*
+ * If we've reached EOF of the old backing, it means that
+ * offsets beyond the old backing size were read as zeroes.
+ * Now we will need to explicitly zero the cluster in
+ * order to preserve that state after the rebase.
+ */
+n = bytes;
+}
 }
 
 /*
-- 
2.39.3

[PATCH v3 5/8] qemu-img: rebase: avoid unnecessary COW operations

2023-09-19 Thread Andrey Drobyshev via

When rebasing an image from one backing file to another, we need to
compare data from old and new backings.  If the diff between that data
happens to be unaligned to the target cluster size, we might end up
doing partial writes, which would lead to copy-on-write and additional IO.

Consider the following simple case (virtual_size == cluster_size == 64K):

base <-- inc1 <-- inc2

qemu-io -c "write -P 0xaa 0 32K" base.qcow2
qemu-io -c "write -P 0xcc 32K 32K" base.qcow2
qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2
qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2
qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2

While doing rebase, we'll write a half of the cluster to inc2, and block
layer will have to read the 2nd half of the same cluster from the base image
inc1 while doing this write operation, although the whole cluster is already
read earlier to perform data comparison.

In order to avoid these unnecessary IO cycles, let's make sure every
write request is aligned to the overlay subcluster boundaries.  Using
subcluster size is universal, as for the images which don't have them
this size equals the cluster size, so in any case we end up aligning
to the smallest unit of allocation.

Signed-off-by: Andrey Drobyshev 
---
 qemu-img.c | 74 +++---
 1 file changed, 54 insertions(+), 20 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 0f67b021f7..a2d6241648 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3523,6 +3523,7 @@ static int img_rebase(int argc, char **argv)
 uint8_t *buf_new = NULL;
 BlockDriverState *bs = NULL, *prefix_chain_bs = NULL;
 BlockDriverState *unfiltered_bs;
+BlockDriverInfo bdi = {0};
 char *filename;
 const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg;
 int c, flags, src_flags, ret;
@@ -3533,6 +3534,7 @@ static int img_rebase(int argc, char **argv)
 bool quiet = false;
 Error *local_err = NULL;
 bool image_opts = false;
+int64_t write_align;
 
 /* Parse commandline parameters */
 fmt = NULL;
@@ -3656,6 +3658,20 @@ static int img_rebase(int argc, char **argv)
 }
 }
 
+/*
+ * We need overlay subcluster size to make sure write requests are
+ * aligned.
+ */
+ret = bdrv_get_info(unfiltered_bs, );
+if (ret < 0) {
+error_report("could not get block driver info");
+goto out;
+} else if (bdi.subcluster_size == 0) {
+bdi.subcluster_size = 1;
+}
+
+write_align = bdi.subcluster_size;
+
 /* For safe rebasing we need to compare old and new backing file */
 if (!unsafe) {
 QDict *options = NULL;
@@ -3753,7 +3769,7 @@ static int img_rebase(int argc, char **argv)
 int64_t old_backing_size = 0;
 int64_t new_backing_size = 0;
 uint64_t offset;
-int64_t n;
+int64_t n, n_old = 0, n_new = 0;
 float local_progress = 0;
 
 if (blk_old_backing && bdrv_opt_mem_align(blk_bs(blk_old_backing)) >
@@ -3799,7 +3815,8 @@ static int img_rebase(int argc, char **argv)
 }
 
 for (offset = 0; offset < size; offset += n) {
-bool buf_old_is_zero = false;
+bool old_backing_eof = false;
+int64_t n_alloc;
 
 /* How many bytes can we handle with the next read? */
 n = MIN(IO_BUF_SIZE, size - offset);
@@ -3844,33 +3861,46 @@ static int img_rebase(int argc, char **argv)
 }
 }
 
+/*
+ * At this point we know that the region [offset; offset + n)
+ * is unallocated within the target image.  This region might be
+ * unaligned to the target image's (sub)cluster boundaries, as
+ * old backing may have smaller clusters (or have subclusters).
+ * We extend it to the aligned boundaries to avoid CoW on
+ * partial writes in blk_pwrite(),
+ */
+n += offset - QEMU_ALIGN_DOWN(offset, write_align);
+offset = QEMU_ALIGN_DOWN(offset, write_align);
+n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
+n = MIN(n, size - offset);
+assert(!bdrv_is_allocated(unfiltered_bs, offset, n, _alloc) &&
+   n_alloc == n);
+
+/*
+ * Much like with the target image, we'll try to read as much
+ * of the old and new backings as we can.
+ */
+n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
+n_new = MIN(n, MAX(0, new_backing_size - (int64_t) offset));
+
 /*
  * Read old and new backing file and take into consideration that
  * backing files may be smaller than the COW image.
  */
-if (offset >= old_backing_size) {
-memset(buf_old, 0, n);
-buf_old_is_zero = true;
+memset(buf_old + n_old, 0, n - n_old);
+if (!n_old) {
+

[PATCH v3 8/8] iotests: add tests for "qemu-img rebase" with compression

2023-09-19 Thread Andrey Drobyshev via

The test cases considered so far:

314 (new test suite):

1. Check that compression mode isn't compatible with "-f raw" (raw
   format doesn't support compression).
2. Check that rebasing an image onto no backing file preserves the data
   and writes the copied clusters actually compressed.
3. Same as 2, but with a raw backing file (i.e. the clusters copied from the
   backing are originally uncompressed -- we check they end up compressed
   after being merged).
4. Remove a single delta from a backing chain, perform the same checks
   as in 2.
5. Check that even when backing and overlay are initially uncompressed,
   copied clusters end up compressed when rebase with compression is
   performed.

271:

1. Check that when target image has subclusters, rebase with compression
   will make an entire cluster containing the written subcluster
   compressed.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Hanna Czenczek 
---
 tests/qemu-iotests/271 |  65 +++
 tests/qemu-iotests/271.out |  40 +
 tests/qemu-iotests/314 | 165 +
 tests/qemu-iotests/314.out |  75 +
 4 files changed, 345 insertions(+)
 create mode 100755 tests/qemu-iotests/314
 create mode 100644 tests/qemu-iotests/314.out

diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271
index e243f57ba7..59a6fafa2f 100755
--- a/tests/qemu-iotests/271
+++ b/tests/qemu-iotests/271
@@ -965,6 +965,71 @@ echo
 
 TEST_IMG="$TEST_IMG.top" alloc="1 30" zero="" _verify_l2_bitmap 0
 
+# Check that rebase with compression works correctly with images containing
+# subclusters.  When compression is enabled and we allocate a new
+# subcluster within the target (overlay) image, we expect the entire cluster
+# containing that subcluster to become compressed.
+#
+# Here we expect 1st and 3rd clusters of the top (overlay) image to become
+# compressed after the rebase, while cluster 2 to remain unallocated and
+# be read from the base (new backing) image.
+#
+# Base (new backing): |-- -- .. -- --|11 11 .. 11 11|-- -- .. -- --|
+# Mid (old backing):  |-- -- .. -- 22|-- -- .. -- --|33 -- .. -- --|
+# Top:|-- -- .. -- --|-- -- -- -- --|-- -- .. -- --|
+
+echo
+echo "### Rebase with compression for images with subclusters ###"
+echo
+
+echo "# create backing chain"
+echo
+
+TEST_IMG="$TEST_IMG.base" _make_test_img -o cluster_size=1M,extended_l2=on 3M
+TEST_IMG="$TEST_IMG.mid" _make_test_img -o cluster_size=1M,extended_l2=on \
+-b "$TEST_IMG.base" -F qcow2 3M
+TEST_IMG="$TEST_IMG.top" _make_test_img -o cluster_size=1M,extended_l2=on \
+-b "$TEST_IMG.mid" -F qcow2 3M
+
+echo
+echo "# fill old and new backing with data"
+echo
+
+$QEMU_IO -c "write -P 0x11 1M 1M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -P 0x22 $(( 31 * 32 ))k 32k" \
+ -c "write -P 0x33 $(( 64 * 32 ))k 32k" \
+ "$TEST_IMG.mid" | _filter_qemu_io
+
+echo
+echo "# rebase topmost image onto the new backing, with compression"
+echo
+
+$QEMU_IMG rebase -c -b "$TEST_IMG.base" -F qcow2 "$TEST_IMG.top"
+
+echo "# verify that the 1st and 3rd clusters've become compressed"
+echo
+
+$QEMU_IMG map --output=json "$TEST_IMG.top" | _filter_testdir
+
+echo
+echo "# verify that data is read the same before and after rebase"
+echo
+
+$QEMU_IO -c "read -P 0x22 $(( 31 * 32 ))k 32k" \
+ -c "read -P 0x11 1M 1M" \
+ -c "read -P 0x33 $(( 64 * 32 ))k 32k" \
+ "$TEST_IMG.top" | _filter_qemu_io
+
+echo
+echo "# verify image bitmap"
+echo
+
+# For compressed clusters bitmap is always 0.  For unallocated cluster
+# there should be no entry at all, thus bitmap is also 0.
+TEST_IMG="$TEST_IMG.top" alloc="" zero="" _verify_l2_bitmap 0
+TEST_IMG="$TEST_IMG.top" alloc="" zero="" _verify_l2_bitmap 1
+TEST_IMG="$TEST_IMG.top" alloc="" zero="" _verify_l2_bitmap 2
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/271.out b/tests/qemu-iotests/271.out
index c335a6c608..0b24d50159 100644
--- a/tests/qemu-iotests/271.out
+++ b/tests/qemu-iotests/271.out
@@ -765,4 +765,44 @@ Offset  Length  Mapped to   File
 # verify image bitmap
 
 L2 entry #0: 0x8050 4002
+
+### Rebase with compression for images with subclusters ###
+
+# create backing chain
+
+Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=3145728
+Formatting 'TEST_DIR/t.IMGFMT.mid', fmt=IMGFMT size=3145728 
backing_file=TEST_DIR/t.IMGFMT.base backing_fmt=IMGFMT
+Formatting 'TEST_DIR/t.IMGFMT.top', fmt=IMGFMT size=3145728 
backing_file=TEST_DIR/t.IMGFMT.mid backing_fmt=IMGFMT
+
+# fill old and new backing with data
+
+wrote 1048576/1048576 bytes at offset 1048576
+1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 32768/32768 bytes at offset 1015808
+32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 32768/32768 bytes at offset 2097152
+32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+# rebase topmost image onto the new

[PATCH v3 4/8] qemu-img: add chunk size parameter to compare_buffers()

2023-09-19 Thread Andrey Drobyshev via

Add @chsize param to the function which, if non-zero, would represent
the chunk size to be used for comparison.  If it's zero, then
BDRV_SECTOR_SIZE is used as default chunk size, which is the previous
behaviour.

In particular, we're going to use this param in img_rebase() to make the
write requests aligned to a predefined alignment value.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Eric Blake 
Reviewed-by: Hanna Czenczek 
---
 qemu-img.c | 24 +++-
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 4dc91505bf..0f67b021f7 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1274,23 +1274,29 @@ static int is_allocated_sectors_min(const uint8_t *buf, 
int n, int *pnum,
 }
 
 /*
- * Compares two buffers sector by sector. Returns 0 if the first
- * sector of each buffer matches, non-zero otherwise.
+ * Compares two buffers chunk by chunk, where @chsize is the chunk size.
+ * If @chsize is 0, default chunk size of BDRV_SECTOR_SIZE is used.
+ * Returns 0 if the first chunk of each buffer matches, non-zero otherwise.
  *
- * pnum is set to the sector-aligned size of the buffer prefix that
- * has the same matching status as the first sector.
+ * @pnum is set to the size of the buffer prefix aligned to @chsize that
+ * has the same matching status as the first chunk.
  */
 static int compare_buffers(const uint8_t *buf1, const uint8_t *buf2,
-   int64_t bytes, int64_t *pnum)
+   int64_t bytes, uint64_t chsize, int64_t *pnum)
 {
 bool res;
-int64_t i = MIN(bytes, BDRV_SECTOR_SIZE);
+int64_t i;
 
 assert(bytes > 0);
 
+if (!chsize) {
+chsize = BDRV_SECTOR_SIZE;
+}
+i = MIN(bytes, chsize);
+
 res = !!memcmp(buf1, buf2, i);
 while (i < bytes) {
-int64_t len = MIN(bytes - i, BDRV_SECTOR_SIZE);
+int64_t len = MIN(bytes - i, chsize);
 
 if (!!memcmp(buf1 + i, buf2 + i, len) != res) {
 break;
@@ -1559,7 +1565,7 @@ static int img_compare(int argc, char **argv)
 ret = 4;
 goto out;
 }
-ret = compare_buffers(buf1, buf2, chunk, );
+ret = compare_buffers(buf1, buf2, chunk, 0, );
 if (ret || pnum != chunk) {
 qprintf(quiet, "Content mismatch at offset %" PRId64 "!\n",
 offset + (ret ? 0 : pnum));
@@ -3878,7 +3884,7 @@ static int img_rebase(int argc, char **argv)
 int64_t pnum;
 
 if (compare_buffers(buf_old + written, buf_new + written,
-n - written, ))
+n - written, 0, ))
 {
 if (buf_old_is_zero) {
 ret = blk_pwrite_zeroes(blk, offset + written, pnum, 
0);
-- 
2.39.3

[PATCH v3 2/8] qemu-iotests: 024: add rebasing test case for overlay_size > backing_size

2023-09-19 Thread Andrey Drobyshev via

Before the previous commit, rebase was getting infinitely stuck in case of
rebasing within the same backing chain and when overlay_size > backing_size.
Let's add this case to the rebasing test 024 to make sure it doesn't
break again.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Denis V. Lunev 
Reviewed-by: Hanna Czenczek 
---
 tests/qemu-iotests/024 | 57 ++
 tests/qemu-iotests/024.out | 30 
 2 files changed, 87 insertions(+)

diff --git a/tests/qemu-iotests/024 b/tests/qemu-iotests/024
index 25a564a150..98a7c8fd65 100755
--- a/tests/qemu-iotests/024
+++ b/tests/qemu-iotests/024
@@ -199,6 +199,63 @@ echo
 # $BASE_OLD and $BASE_NEW)
 $QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map
 
+# Check that rebase within the chain is working when
+# overlay_size > old_backing_size
+#
+# base_new <-- base_old <-- overlay
+#
+# Backing (new): 11 11 11 11 11
+# Backing (old): 22 22 22 22
+# Overlay:   -- -- -- -- --
+#
+# As a result, overlay should contain data identical to base_old, with the
+# last cluster remaining unallocated.
+
+echo
+echo "=== Test rebase within one backing chain ==="
+echo
+
+echo "Creating backing chain"
+echo
+
+TEST_IMG=$BASE_NEW _make_test_img $(( CLUSTER_SIZE * 5 ))
+TEST_IMG=$BASE_OLD _make_test_img -b "$BASE_NEW" -F $IMGFMT \
+$(( CLUSTER_SIZE * 4 ))
+TEST_IMG=$OVERLAY _make_test_img -b "$BASE_OLD" -F $IMGFMT \
+$(( CLUSTER_SIZE * 5 ))
+
+echo
+echo "Fill backing files with data"
+echo
+
+$QEMU_IO "$BASE_NEW" -c "write -P 0x11 0 $(( CLUSTER_SIZE * 5 ))" \
+| _filter_qemu_io
+$QEMU_IO "$BASE_OLD" -c "write -P 0x22 0 $(( CLUSTER_SIZE * 4 ))" \
+| _filter_qemu_io
+
+echo
+echo "Check the last cluster is zeroed in overlay before the rebase"
+echo
+$QEMU_IO "$OVERLAY" -c "read -P 0x00 $(( CLUSTER_SIZE * 4 )) $CLUSTER_SIZE" \
+| _filter_qemu_io
+
+echo
+echo "Rebase onto another image in the same chain"
+echo
+
+$QEMU_IMG rebase -b "$BASE_NEW" -F $IMGFMT "$OVERLAY"
+
+echo "Verify that data is read the same before and after rebase"
+echo
+
+# Verify the first 4 clusters are still read the same as in the old base
+$QEMU_IO "$OVERLAY" -c "read -P 0x22 0 $(( CLUSTER_SIZE * 4 ))" \
+| _filter_qemu_io
+# Verify the last cluster still reads as zeroes
+$QEMU_IO "$OVERLAY" -c "read -P 0x00 $(( CLUSTER_SIZE * 4 )) $CLUSTER_SIZE" \
+| _filter_qemu_io
+
+echo
 
 # success, all done
 echo "*** done"
diff --git a/tests/qemu-iotests/024.out b/tests/qemu-iotests/024.out
index 973a5a3711..245fe8b1d1 100644
--- a/tests/qemu-iotests/024.out
+++ b/tests/qemu-iotests/024.out
@@ -171,4 +171,34 @@ read 65536/65536 bytes at offset 196608
 Offset  Length  File
 0   0x3 TEST_DIR/subdir/t.IMGFMT
 0x3 0x1 TEST_DIR/subdir/t.IMGFMT.base_new
+
+=== Test rebase within one backing chain ===
+
+Creating backing chain
+
+Formatting 'TEST_DIR/subdir/t.IMGFMT.base_new', fmt=IMGFMT size=327680
+Formatting 'TEST_DIR/subdir/t.IMGFMT.base_old', fmt=IMGFMT size=262144 
backing_file=TEST_DIR/subdir/t.IMGFMT.base_new backing_fmt=IMGFMT
+Formatting 'TEST_DIR/subdir/t.IMGFMT', fmt=IMGFMT size=327680 
backing_file=TEST_DIR/subdir/t.IMGFMT.base_old backing_fmt=IMGFMT
+
+Fill backing files with data
+
+wrote 327680/327680 bytes at offset 0
+320 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 262144/262144 bytes at offset 0
+256 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Check the last cluster is zeroed in overlay before the rebase
+
+read 65536/65536 bytes at offset 262144
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Rebase onto another image in the same chain
+
+Verify that data is read the same before and after rebase
+
+read 262144/262144 bytes at offset 0
+256 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 262144
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
 *** done
-- 
2.39.3

[PATCH v3 6/8] iotests/{024, 271}: add testcases for qemu-img rebase

2023-09-19 Thread Andrey Drobyshev via

As the previous commit changes the logic of "qemu-img rebase" (it's using
write alignment now), let's add a couple more test cases which would
ensure it works correctly.  In particular, the following scenarios:

024: add test case for rebase within one backing chain when the overlay
 cluster size > backings cluster size;
271: add test case for rebase images that contain subclusters.  Check
 that no extra allocations are being made.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Hanna Czenczek 
---
 tests/qemu-iotests/024 | 60 ++
 tests/qemu-iotests/024.out | 43 +
 tests/qemu-iotests/271 | 66 ++
 tests/qemu-iotests/271.out | 42 
 4 files changed, 211 insertions(+)

diff --git a/tests/qemu-iotests/024 b/tests/qemu-iotests/024
index 98a7c8fd65..285f17e79f 100755
--- a/tests/qemu-iotests/024
+++ b/tests/qemu-iotests/024
@@ -257,6 +257,66 @@ $QEMU_IO "$OVERLAY" -c "read -P 0x00 $(( CLUSTER_SIZE * 4 
)) $CLUSTER_SIZE" \
 
 echo
 
+# Check that rebase within the chain is working when
+# overlay cluster size > backings cluster size
+# (here overlay cluster size == 2 * backings cluster size)
+#
+# base_new <-- base_old <-- overlay
+#
+# Backing (new): -- -- -- -- -- --
+# Backing (old): -- 11 -- -- 22 --
+# Overlay:  |-- --|-- --|-- --|
+#
+# We should end up having 1st and 3rd cluster allocated, and their halves
+# being read as zeroes.
+
+echo
+echo "=== Test rebase with different cluster sizes ==="
+echo
+
+echo "Creating backing chain"
+echo
+
+TEST_IMG=$BASE_NEW _make_test_img $(( CLUSTER_SIZE * 6 ))
+TEST_IMG=$BASE_OLD _make_test_img -b "$BASE_NEW" -F $IMGFMT \
+$(( CLUSTER_SIZE * 6 ))
+CLUSTER_SIZE=$(( CLUSTER_SIZE * 2 )) TEST_IMG=$OVERLAY \
+_make_test_img -b "$BASE_OLD" -F $IMGFMT $(( CLUSTER_SIZE * 6 ))
+
+TEST_IMG=$OVERLAY _img_info
+
+echo
+echo "Fill backing files with data"
+echo
+
+$QEMU_IO "$BASE_OLD" -c "write -P 0x11 $CLUSTER_SIZE $CLUSTER_SIZE" \
+-c "write -P 0x22 $(( CLUSTER_SIZE * 4 )) $CLUSTER_SIZE" \
+| _filter_qemu_io
+
+echo
+echo "Rebase onto another image in the same chain"
+echo
+
+$QEMU_IMG rebase -b "$BASE_NEW" -F $IMGFMT "$OVERLAY"
+
+echo "Verify that data is read the same before and after rebase"
+echo
+
+$QEMU_IO "$OVERLAY" -c "read -P 0x00 0 $CLUSTER_SIZE" \
+-c "read -P 0x11 $CLUSTER_SIZE $CLUSTER_SIZE" \
+-c "read -P 0x00 $(( CLUSTER_SIZE * 2 )) $(( CLUSTER_SIZE * 2 ))" \
+-c "read -P 0x22 $(( CLUSTER_SIZE * 4 )) $CLUSTER_SIZE" \
+-c "read -P 0x00 $(( CLUSTER_SIZE * 5 )) $CLUSTER_SIZE" \
+| _filter_qemu_io
+
+echo
+echo "Verify that untouched cluster remains unallocated"
+echo
+
+$QEMU_IMG map "$OVERLAY" | _filter_qemu_img_map
+
+echo
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/024.out b/tests/qemu-iotests/024.out
index 245fe8b1d1..e1e8eea863 100644
--- a/tests/qemu-iotests/024.out
+++ b/tests/qemu-iotests/024.out
@@ -201,4 +201,47 @@ read 262144/262144 bytes at offset 0
 read 65536/65536 bytes at offset 262144
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
+
+=== Test rebase with different cluster sizes ===
+
+Creating backing chain
+
+Formatting 'TEST_DIR/subdir/t.IMGFMT.base_new', fmt=IMGFMT size=393216
+Formatting 'TEST_DIR/subdir/t.IMGFMT.base_old', fmt=IMGFMT size=393216 
backing_file=TEST_DIR/subdir/t.IMGFMT.base_new backing_fmt=IMGFMT
+Formatting 'TEST_DIR/subdir/t.IMGFMT', fmt=IMGFMT size=393216 
backing_file=TEST_DIR/subdir/t.IMGFMT.base_old backing_fmt=IMGFMT
+image: TEST_DIR/subdir/t.IMGFMT
+file format: IMGFMT
+virtual size: 384 KiB (393216 bytes)
+cluster_size: 131072
+backing file: TEST_DIR/subdir/t.IMGFMT.base_old
+backing file format: IMGFMT
+
+Fill backing files with data
+
+wrote 65536/65536 bytes at offset 65536
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 65536/65536 bytes at offset 262144
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Rebase onto another image in the same chain
+
+Verify that data is read the same before and after rebase
+
+read 65536/65536 bytes at offset 0
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 65536
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 131072/131072 bytes at offset 131072
+128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 262144
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 65536/65536 bytes at offset 327680
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+Verify that untouched cluster remains unallocated
+
+Offset  Length  File
+0   0x2 TEST_DIR/subdir/t.IMGFMT
+0x4 0x2 TEST_DIR/subdir/t.IMGFMT
+
 *** done
diff --git a/tests/qemu-iotests/271 b/tests/qemu-iotests/271
index c7c2cadda0..e243f57ba7 100755
--- a/tests/qemu-iotests/271
+++ b/tests/qemu-iotests/271
@@ -899,6

[PATCH v3 0/8] qemu-img: rebase: add compression support

2023-09-19 Thread Andrey Drobyshev via

v2 --> v3:
 * Patch 3/8: fixed logic in the if statement, so that we align on blk
   when blk_old_backing == NULL;
 * Patch 4/8: comment fix;
 * Patch 5/8: comment fix; dropped redundant "if (blk_new_backing)"
   statements.

v2: https://lists.nongnu.org/archive/html/qemu-block/2023-09/msg00448.html

Andrey Drobyshev (8):
  qemu-img: rebase: stop when reaching EOF of old backing file
  qemu-iotests: 024: add rebasing test case for overlay_size >
backing_size
  qemu-img: rebase: use backing files' BlockBackend for buffer alignment
  qemu-img: add chunk size parameter to compare_buffers()
  qemu-img: rebase: avoid unnecessary COW operations
  iotests/{024, 271}: add testcases for qemu-img rebase
  qemu-img: add compression option to rebase subcommand
  iotests: add tests for "qemu-img rebase" with compression

 docs/tools/qemu-img.rst|   6 +-
 qemu-img-cmds.hx   |   4 +-
 qemu-img.c | 136 ++
 tests/qemu-iotests/024 | 117 ++
 tests/qemu-iotests/024.out |  73 
 tests/qemu-iotests/271 | 131 +
 tests/qemu-iotests/271.out |  82 ++
 tests/qemu-iotests/314 | 165 +
 tests/qemu-iotests/314.out |  75 +
 9 files changed, 752 insertions(+), 37 deletions(-)
 create mode 100755 tests/qemu-iotests/314
 create mode 100644 tests/qemu-iotests/314.out

-- 
2.39.3

[PATCH v3 3/8] qemu-img: rebase: use backing files' BlockBackend for buffer alignment

2023-09-19 Thread Andrey Drobyshev via

Since commit bb1c05973cf ("qemu-img: Use qemu_blockalign"), buffers for
the data read from the old and new backing files are aligned using
BlockDriverState (or BlockBackend later on) referring to the target image.
However, this isn't quite right, because buf_new is only being used for
reading from the new backing, while buf_old is being used for both reading
from the old backing and writing to the target.  Let's take that into account
and use more appropriate values as alignments.

Signed-off-by: Andrey Drobyshev 
---
 qemu-img.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 50660ba920..4dc91505bf 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3750,8 +3750,13 @@ static int img_rebase(int argc, char **argv)
 int64_t n;
 float local_progress = 0;
 
-buf_old = blk_blockalign(blk, IO_BUF_SIZE);
-buf_new = blk_blockalign(blk, IO_BUF_SIZE);
+if (blk_old_backing && bdrv_opt_mem_align(blk_bs(blk_old_backing)) >
+bdrv_opt_mem_align(blk_bs(blk))) {
+buf_old = blk_blockalign(blk_old_backing, IO_BUF_SIZE);
+} else {
+buf_old = blk_blockalign(blk, IO_BUF_SIZE);
+}
+buf_new = blk_blockalign(blk_new_backing, IO_BUF_SIZE);
 
 size = blk_getlength(blk);
 if (size < 0) {
-- 
2.39.3

[PATCH v3 7/8] qemu-img: add compression option to rebase subcommand

2023-09-19 Thread Andrey Drobyshev via

If we rebase an image whose backing file has compressed clusters, we
might end up wasting disk space since the copied clusters are now
uncompressed.  In order to have better control over this, let's add
"--compress" option to the "qemu-img rebase" command.

Note that this option affects only the clusters which are actually being
copied from the original backing file.  The clusters which were
uncompressed in the target image will remain so.

Signed-off-by: Andrey Drobyshev 
Reviewed-by: Denis V. Lunev 
Reviewed-by: Hanna Czenczek 
---
 docs/tools/qemu-img.rst |  6 --
 qemu-img-cmds.hx|  4 ++--
 qemu-img.c  | 26 --
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst
index ca5a2773cf..4459c065f1 100644
--- a/docs/tools/qemu-img.rst
+++ b/docs/tools/qemu-img.rst
@@ -667,7 +667,7 @@ Command description:
 
   List, apply, create or delete snapshots in image *FILENAME*.
 
-.. option:: rebase [--object OBJECTDEF] [--image-opts] [-U] [-q] [-f FMT] [-t 
CACHE] [-T SRC_CACHE] [-p] [-u] -b BACKING_FILE [-F BACKING_FMT] FILENAME
+.. option:: rebase [--object OBJECTDEF] [--image-opts] [-U] [-q] [-f FMT] [-t 
CACHE] [-T SRC_CACHE] [-p] [-u] [-c] -b BACKING_FILE [-F BACKING_FMT] FILENAME
 
   Changes the backing file of an image. Only the formats ``qcow2`` and
   ``qed`` support changing the backing file.
@@ -694,7 +694,9 @@ Command description:
 
 In order to achieve this, any clusters that differ between
 *BACKING_FILE* and the old backing file of *FILENAME* are merged
-into *FILENAME* before actually changing the backing file.
+into *FILENAME* before actually changing the backing file. With the
+``-c`` option specified, the clusters which are being merged (but not
+the entire *FILENAME* image) are compressed when written.
 
 Note that the safe mode is an expensive operation, comparable to
 converting an image. It only works if the old backing file still
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 1b1dab5b17..068692d13e 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -88,9 +88,9 @@ SRST
 ERST
 
 DEF("rebase", img_rebase,
-"rebase [--object objectdef] [--image-opts] [-U] [-q] [-f fmt] [-t cache] 
[-T src_cache] [-p] [-u] -b backing_file [-F backing_fmt] filename")
+"rebase [--object objectdef] [--image-opts] [-U] [-q] [-f fmt] [-t cache] 
[-T src_cache] [-p] [-u] [-c] -b backing_file [-F backing_fmt] filename")
 SRST
-.. option:: rebase [--object OBJECTDEF] [--image-opts] [-U] [-q] [-f FMT] [-t 
CACHE] [-T SRC_CACHE] [-p] [-u] -b BACKING_FILE [-F BACKING_FMT] FILENAME
+.. option:: rebase [--object OBJECTDEF] [--image-opts] [-U] [-q] [-f FMT] [-t 
CACHE] [-T SRC_CACHE] [-p] [-u] [-c] -b BACKING_FILE [-F BACKING_FMT] FILENAME
 ERST
 
 DEF("resize", img_resize,
diff --git a/qemu-img.c b/qemu-img.c
index a2d6241648..9d7a4a3566 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3527,11 +3527,13 @@ static int img_rebase(int argc, char **argv)
 char *filename;
 const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg;
 int c, flags, src_flags, ret;
+BdrvRequestFlags write_flags = 0;
 bool writethrough, src_writethrough;
 int unsafe = 0;
 bool force_share = false;
 int progress = 0;
 bool quiet = false;
+bool compress = false;
 Error *local_err = NULL;
 bool image_opts = false;
 int64_t write_align;
@@ -3548,9 +3550,10 @@ static int img_rebase(int argc, char **argv)
 {"object", required_argument, 0, OPTION_OBJECT},
 {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
 {"force-share", no_argument, 0, 'U'},
+{"compress", no_argument, 0, 'c'},
 {0, 0, 0, 0}
 };
-c = getopt_long(argc, argv, ":hf:F:b:upt:T:qU",
+c = getopt_long(argc, argv, ":hf:F:b:upt:T:qUc",
 long_options, NULL);
 if (c == -1) {
 break;
@@ -3598,6 +3601,9 @@ static int img_rebase(int argc, char **argv)
 case 'U':
 force_share = true;
 break;
+case 'c':
+compress = true;
+break;
 }
 }
 
@@ -3650,6 +3656,14 @@ static int img_rebase(int argc, char **argv)
 
 unfiltered_bs = bdrv_skip_filters(bs);
 
+if (compress && !block_driver_can_compress(unfiltered_bs->drv)) {
+error_report("Compression not supported for this file format");
+ret = -1;
+goto out;
+} else if (compress) {
+write_flags |= BDRV_REQ_WRITE_COMPRESSED;
+}
+
 if (out_basefmt != NULL) {
 if (bdrv_find_format(out_basefmt) == NULL) {
 error_report("Invalid format name: '%s'", out_basefmt);
@@ -3659,18 +3673,18 @@ static int img_rebase(int argc, char **argv)
 }
 
 /*
- * We need overlay subcluster size to make sure write requests are
- * aligned.
+ * We need overlay subcluster size (or cluster size

Re: [PATCH 00/52] migration/rdma: Error handling fixes

2023-09-19 Thread Peter Xu

On Mon, Sep 18, 2023 at 04:41:14PM +0200, Markus Armbruster wrote:
> Oh dear, where to start.  There's so much wrong, and in pretty obvious
> ways.  This code should never have passed review.  I'm refraining from
> saying more; see the commit messages instead.
> 
> Issues remaining after this series include:
> 
> * Terrible error messages
> 
> * Some error message cascades remain
> 
> * There is no written contract for QEMUFileHooks, and the
>   responsibility for reporting errors is unclear

It is even being removed, because no one is really extending it:

https://lore.kernel.org/all/20230509120700.78359-1-quint...@redhat.com/#t

> 
> * There seem to be no tests whatsoever

I have seen rdma as being at the "odd fixes" stage for a long time.  But maybe
I was wrong.

Copying Zhijian for status of rdma; Zhijian, I saw that you just replied to
the hwpoison issue.  Maybe we should have one entry for rdma too, just like
colo?

Thanks,

-- 
Peter Xu

Re: [PATCH v2 06/12] range: Introduce range_inverse_array()

2023-09-19 Thread Alex Williamson

On Wed, 13 Sep 2023 10:01:41 +0200
Eric Auger  wrote:

> This helper reverses an array of regions, turning original
> regions into holes and original holes into actual regions,
> covering the whole UINT64_MAX span.
> 
> Signed-off-by: Eric Auger 
> 
> ---
> 
> v1 -> v2:
> - Move range_inverse_array description comment in the header
> - Take low/high params
> ---
>  include/qemu/range.h |  8 
>  util/range.c | 45 
>  2 files changed, 53 insertions(+)
> 
> diff --git a/include/qemu/range.h b/include/qemu/range.h
> index 7e2b1cc447..2b59e3bf0c 100644
> --- a/include/qemu/range.h
> +++ b/include/qemu/range.h
> @@ -219,4 +219,12 @@ static inline int ranges_overlap(uint64_t first1, 
> uint64_t len1,
>  
>  GList *range_list_insert(GList *list, Range *data);
>  
> +/*
> + * Invert an array of sorted ranges over the [low, high] span, i.e.
> + * original ranges become holes in the newly allocated inv_ranges
> + */
> +void range_inverse_array(uint32_t nr_ranges, Range *ranges,
> + uint32_t *nr_inv_ranges, Range **inv_ranges,
> + uint64_t low, uint64_t high);
> +
>  #endif
> diff --git a/util/range.c b/util/range.c
> index 098d9d2dc0..4baeb588cc 100644
> --- a/util/range.c
> +++ b/util/range.c
> @@ -70,3 +70,48 @@ GList *range_list_insert(GList *list, Range *data)
>  
>  return list;
>  }
> +
> +void range_inverse_array(uint32_t nr_ranges, Range *ranges,
> + uint32_t *nr_inv_ranges, Range **inv_ranges,
> + uint64_t low, uint64_t high)

Rare be it for me to suggest GLib, but we already appear to have
range_list_insert() making use of GList for an ordered list of Ranges.
Doesn't this function become a lot easier if we take a sorted GList,
walk it to create the inverse, and return a new GList of the inverted
Ranges?  Seems the initial sorted GList would be created by making use
of the existing range_list_insert() function.  Thanks,

Alex

> +{
> +Range *resv;
> +int i = 0, j = 0;
> +
> +resv = g_malloc0_n(nr_ranges + 1, sizeof(Range));
> +
> +for (; j < nr_ranges  && (range_upb([j]) < low); j++) {
> +continue; /* skip all ranges below low */
> +}
> +
> +if (j == nr_ranges) {
> +range_set_bounds([i++], low, high);
> +goto realloc;
> +}
> +
> +/* first range lob is greater than min, insert a first range */
> +if (range_lob([j]) > low) {
> +range_set_bounds([i++], low,
> + MIN(range_lob([j]) - 1, high));
> +}
> +
> +/* insert a range inbetween each original range until we reach max */
> +for (; j < nr_ranges - 1; j++) {
> +if (range_lob([j]) >= high) {
> +goto realloc;
> +}
> +if (range_compare([j], [j + 1])) {
> +range_set_bounds([i++], range_upb([j]) + 1,
> + MIN(range_lob([j + 1]) - 1, high));
> +}
> +}
> +/* last range upb is less than max, insert a last range */
> +if (range_upb([j]) <  high) {
> +range_set_bounds([i++],
> +  range_upb([j]) + 1, high);
> +}
> +realloc:
> +*nr_inv_ranges = i;
> +resv = g_realloc(resv, i * sizeof(Range));
> +*inv_ranges = resv;
> +}

Re: [PATCH v2 5/8] qemu-img: rebase: avoid unnecessary COW operations

2023-09-19 Thread Andrey Drobyshev

On 9/19/23 13:46, Hanna Czenczek wrote:
> On 15.09.23 18:20, Andrey Drobyshev wrote:
>> When rebasing an image from one backing file to another, we need to
>> compare data from old and new backings.  If the diff between that data
>> happens to be unaligned to the target cluster size, we might end up
>> doing partial writes, which would lead to copy-on-write and additional
>> IO.
>>
>> Consider the following simple case (virtual_size == cluster_size == 64K):
>>
>> base <-- inc1 <-- inc2
>>
>> qemu-io -c "write -P 0xaa 0 32K" base.qcow2
>> qemu-io -c "write -P 0xcc 32K 32K" base.qcow2
>> qemu-io -c "write -P 0xbb 0 32K" inc1.qcow2
>> qemu-io -c "write -P 0xcc 32K 32K" inc1.qcow2
>> qemu-img rebase -f qcow2 -b base.qcow2 -F qcow2 inc2.qcow2
>>
>> While doing rebase, we'll write half of the cluster to inc2, and the block
>> layer will have to read the 2nd half of the same cluster from the base
>> image inc1 while doing this write operation, although the whole cluster
>> was already read earlier to perform data comparison.
>>
>> In order to avoid these unnecessary IO cycles, let's make sure every
>> write request is aligned to the overlay subcluster boundaries.  Using
>> subcluster size is universal as for the images which don't have them
>> this size equals to the cluster size, so in any case we end up aligning
>> to the smallest unit of allocation.
>>
>> Signed-off-by: Andrey Drobyshev 
>> ---
>>   qemu-img.c | 76 --
>>   1 file changed, 56 insertions(+), 20 deletions(-)
> 
> Looks good, I like the changes from v1!  Two minor things:
> 
>> diff --git a/qemu-img.c b/qemu-img.c
>> index fcd31d7b5b..83950af42b 100644
>> --- a/qemu-img.c
>> +++ b/qemu-img.c
> 
> [...]
> 
>> @@ -3844,33 +3861,48 @@ static int img_rebase(int argc, char **argv)
>>   }
>>   }
>>   +    /*
>> + * At this point we know that the region [offset; offset
>> + n)
>> + * is unallocated within the target image.  This region
>> might be
>> + * unaligned to the target image's (sub)cluster
>> boundaries, as
>> + * old backing may have smaller clusters (or have
>> subclusters).
>> + * We extend it to the aligned boundaries to avoid CoW on
>> + * partial writes in blk_pwrite(),
>> + */
>> +    n += offset - QEMU_ALIGN_DOWN(offset, write_align);
>> +    offset = QEMU_ALIGN_DOWN(offset, write_align);
>> +    n += QEMU_ALIGN_UP(offset + n, write_align) - (offset + n);
>> +    n = MIN(n, size - offset);
>> +    assert(!bdrv_is_allocated(unfiltered_bs, offset, n,
>> _alloc) &&
>> +   n_alloc == n);
>> +
>> +    /*
>> + * Much like the with the target image, we'll try to read
>> as much
> 
> s/the with the/with the/
>

Noted.

>> + * of the old and new backings as we can.
>> + */
>> +    n_old = MIN(n, MAX(0, old_backing_size - (int64_t) offset));
>> +    if (blk_new_backing) {
>> +    n_new = MIN(n, MAX(0, new_backing_size - (int64_t)
>> offset));
>> +    }
> 
> If we don’t have a check for blk_old_backing (old_backing_size is 0 if
> blk_old_backing is NULL), why do we have a check for blk_new_backing
> (new_backing_size is 0 if blk_new_backing is NULL)?
> 
> (Perhaps because the previous check was `offset >= new_backing_size ||
> !blk_new_backing`, i.e. included exactly such a check – but I don’t
> think it’s necessary, new_backing_size will be 0 if blk_new_backing is
> NULL.)
> 
>> +
>>   /*
>>    * Read old and new backing file and take into
>> consideration that
>>    * backing files may be smaller than the COW image.
>>    */
>> -    if (offset >= old_backing_size) {
>> -    memset(buf_old, 0, n);
>> -    buf_old_is_zero = true;
>> +    memset(buf_old + n_old, 0, n - n_old);
>> +    if (!n_old) {
>> +    old_backing_eof = true;
>>   } else {
>> -    if (offset + n > old_backing_size) {
>> -    n = old_backing_size - offset;
>> -    }
>> -
>> -    ret = blk_pread(blk_old_backing, offset, n, buf_old, 0);
>> +    ret = blk_pread(blk_old_backing, offset, n_old,
>> buf_old, 0);
>>   if (ret < 0) {
>>   error_report("error while reading from old
>> backing file");
>>   goto out;
>>   }
>>   }
>>   -    if (offset >= new_backing_size || !blk_new_backing) {
>> -    memset(buf_new, 0, n);
>> -    } else {
>> -    if (offset + n > new_backing_size) {
>> -    n = new_backing_size - offset;
>> -    }
>> -
>> -    ret = blk_pread(blk_new_backing, offset, n, buf_new, 0);
>> +    memset(buf_new + n_new, 0,

Re: [PULL v2 0/9] testing updates (back to green!)

2023-09-19 Thread Stefan Hajnoczi

On Tue, 19 Sept 2023 at 12:00, Alex Bennée  wrote:
>
>
> Stefan Hajnoczi  writes:
>
> > There is some funny business with tests/lcitool/libvirt-ci. Please
> > rebase on master and send a v3. Sorry for the trouble, I am afraid I
> > would mess something up with the submodule if I attempted to resolve
> > it myself.
> >
> > (If you don't see a conflict when rebasing, please wait until the end
> > of the day when the other pull requests queued on the staging branch
> > are pushed to master.)
>
> That's weird, was there another PR in flight which touched libvirt-ci?

It's probably a conflict with Ilya Maximets' patches in Jason Wang's
net pull request:

https://lore.kernel.org/qemu-devel/20230918083132.55423-1-jasow...@redhat.com/

>
> >
> > Thanks!
> >
> > Auto-merging tests/docker/dockerfiles/debian-amd64-cross.docker
> > Auto-merging tests/docker/dockerfiles/debian-amd64.docker
> > Auto-merging tests/docker/dockerfiles/debian-arm64-cross.docker
> > Auto-merging tests/docker/dockerfiles/debian-armhf-cross.docker
> > Auto-merging tests/docker/dockerfiles/debian-ppc64el-cross.docker
> > Auto-merging tests/docker/dockerfiles/debian-s390x-cross.docker
> > Failed to merge submodule tests/lcitool/libvirt-ci (not checked out)
> > CONFLICT (submodule): Merge conflict in tests/lcitool/libvirt-ci
> > Recursive merging with submodules currently only supports trivial cases.
> > Please manually handle the merging of each conflicted submodule.
> > This can be accomplished with the following steps:
> >  - come back to superproject and run:
> >
> >   git add tests/lcitool/libvirt-ci
> >
> >to record the above merge or update
> >  - resolve any other conflicts in the superproject
> >  - commit the resulting index in the superproject
> > Automatic merge failed; fix conflicts and then commit the result.
> >
> > Stefan
> >
> > On Tue, 19 Sept 2023 at 02:59, Alex Bennée  wrote:
> >>
> >> The following changes since commit 
> >> 13d6b1608160de40ec65ae4c32419e56714bbadf:
> >>
> >>   Merge tag 'pull-crypto-20230915' of https://gitlab.com/rth7680/qemu into 
> >> staging (2023-09-18 11:04:21 -0400)
> >>
> >> are available in the Git repository at:
> >>
> >>   https://gitlab.com/stsquad/qemu.git tags/pull-maintainer-ominbus-190923-1
> >>
> >> for you to fetch changes up to bb3c01212b54595f5bbdbe235cb353b220f94943:
> >>
> >>   tests/avocado: Disable MIPS Malta tests due to GitLab issue #1884 
> >> (2023-09-19 07:46:02 +0100)
> >>
> >> 
> >> testing updates:
> >>
> >>   - update most Debian to bookworm
> >>   - fix some typos
> >>   - update loongarch toolchain
> >>   - fix microbit test
> >>   - handle GitLab/Cirrus timeout discrepancy
> >>   - improve avocado console handling
> >>   - disable mips avocado images pending bugfix
> >>
> >> 
> >> Alex Bennée (2):
> >>   tests: update most Debian images to Bookworm
> >>   gitlab: fix typo/spelling in comments
> >>
> >> Daniel P. Berrangé (4):
> >>   microbit: add missing qtest_quit() call
> >>   qtest: kill orphaned qtest QEMU processes on FreeBSD
> >>   gitlab: make Cirrus CI timeout explicit
> >>   gitlab: make Cirrus CI jobs gating
> >>
> >> Nicholas Piggin (1):
> >>   tests/avocado: Fix console data loss
> >>
> >> Philippe Mathieu-Daudé (1):
> >>   tests/avocado: Disable MIPS Malta tests due to GitLab issue #1884
> >>
> >> Richard Henderson (1):
> >>   tests/docker: Update docker-loongarch-cross toolchain
> >>
> >>  tests/qtest/libqtest.c|  7 +++
> >>  tests/qtest/microbit-test.c   |  2 ++
> >>  .gitlab-ci.d/base.yml |  2 +-
> >>  .gitlab-ci.d/cirrus.yml   |  4 +++-
> >>  .gitlab-ci.d/cirrus/build.yml |  2 ++
> >>  python/qemu/machine/machine.py| 19 
> >> +++
> >>  tests/avocado/avocado_qemu/__init__.py|  2 +-
> >>  tests/avocado/boot_linux_console.py   |  7 +++
> >>  tests/avocado/machine_mips_malta.py   |  6 ++
> >>  tests/avocado/replay_kernel.py|  7 +++
> >>  tests/avocado/tuxrun_baselines.py |  4 
> >>  tests/docker/dockerfiles/debian-amd64-cross.docker| 10 +++---
> >>  tests/docker/dockerfiles/debian-amd64.docker  | 10 +++---
> >>  tests/docker/dockerfiles/debian-arm64-cross.docker| 10 +++---
> >>  tests/docker/dockerfiles/debian-armel-cross.docker|  2 +-
> >>  tests/docker/dockerfiles/debian-armhf-cross.docker| 10 +++---
> >>  .../docker/dockerfiles/debian-loongarch-cross.docker  |  2 +-
> >>  tests/docker/dockerfiles/debian-ppc64el-cross.docker  | 10 +++---
> >>  tests/docker/dockerfiles/debian-s390x-cross.docker| 10 +++---
> >>

[PATCH v4 2/5] softmmu: Support concurrent bounce buffers

2023-09-19 Thread Mattias Nissler

When DMA memory can't be directly accessed, as is the case when
running the device model in a separate process without shareable DMA
file descriptors, bounce buffering is used.

It is not uncommon for device models to request mapping of several DMA
regions at the same time. Examples include:
 * net devices, e.g. when transmitting a packet that is split across
   several TX descriptors (observed with igb)
 * USB host controllers, when handling a packet with multiple data TRBs
   (observed with xhci)

Previously, qemu only provided a single bounce buffer per AddressSpace
and would fail DMA map requests while the buffer was already in use. In
turn, this would cause DMA failures that ultimately manifest as hardware
errors from the guest perspective.

This change allocates DMA bounce buffers dynamically instead of
supporting only a single buffer. Thus, multiple DMA mappings work
correctly also when RAM can't be mmap()-ed.

The total bounce buffer allocation size is limited individually for each
AddressSpace. The default limit is 4096 bytes, matching the previous
maximum buffer size. A new x-max-bounce-buffer-size parameter is
provided to configure the limit for PCI devices.

Signed-off-by: Mattias Nissler 
---
 hw/pci/pci.c|  8 
 include/exec/memory.h   | 14 +++---
 include/hw/pci/pci_device.h |  3 ++
 softmmu/memory.c|  5 ++-
 softmmu/physmem.c   | 88 -
 5 files changed, 77 insertions(+), 41 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 881d774fb6..d071ac8091 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -85,6 +85,8 @@ static Property pci_props[] = {
 QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
 DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
 QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
+DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
+ max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE),
 DEFINE_PROP_END_OF_LIST()
 };
 
@@ -1208,6 +1210,8 @@ static PCIDevice *do_pci_register_device(PCIDevice 
*pci_dev,
"bus master container", UINT64_MAX);
 address_space_init(_dev->bus_master_as,
_dev->bus_master_container_region, pci_dev->name);
+pci_dev->bus_master_as.max_bounce_buffer_size =
+pci_dev->max_bounce_buffer_size;
 
 if (phase_check(PHASE_MACHINE_READY)) {
 pci_init_bus_master(pci_dev);
@@ -2664,6 +2668,10 @@ static void pci_device_class_init(ObjectClass *klass, 
void *data)
 k->unrealize = pci_qdev_unrealize;
 k->bus_type = TYPE_PCI_BUS;
 device_class_set_props(k, pci_props);
+object_class_property_set_description(
+klass, "x-max-bounce-buffer-size",
+"Maximum buffer size allocated for bounce buffers used for mapped "
+"access to indirect DMA memory");
 }
 
 static void pci_device_class_base_init(ObjectClass *klass, void *data)
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 7d68936157..67379bd9cc 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1081,13 +1081,7 @@ typedef struct AddressSpaceMapClient {
 QLIST_ENTRY(AddressSpaceMapClient) link;
 } AddressSpaceMapClient;
 
-typedef struct {
-MemoryRegion *mr;
-void *buffer;
-hwaddr addr;
-hwaddr len;
-bool in_use;
-} BounceBuffer;
+#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096)
 
 /**
  * struct AddressSpace: describes a mapping of addresses to #MemoryRegion 
objects
@@ -1106,8 +1100,10 @@ struct AddressSpace {
 QTAILQ_HEAD(, MemoryListener) listeners;
 QTAILQ_ENTRY(AddressSpace) address_spaces_link;
 
-/* Bounce buffer to use for this address space. */
-BounceBuffer bounce;
+/* Maximum DMA bounce buffer size used for indirect memory map requests */
+uint64_t max_bounce_buffer_size;
+/* Total size of bounce buffers currently allocated, atomically accessed */
+uint64_t bounce_buffer_size;
 /* List of callbacks to invoke when buffers free up */
 QemuMutex map_client_list_lock;
 QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index d3dd0f64b2..f4027c5379 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -160,6 +160,9 @@ struct PCIDevice {
 /* ID of standby device in net_failover pair */
 char *failover_pair_id;
 uint32_t acpi_index;
+
+/* Maximum DMA bounce buffer size used for indirect memory map requests */
+uint64_t max_bounce_buffer_size;
 };
 
 static inline int pci_intx(PCIDevice *pci_dev)
diff --git a/softmmu/memory.c b/softmmu/memory.c
index ffa37fc327..24d90b10b2 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, MemoryRegion 
*root, const char *name)
 as->ioeventfds = NULL;
 QTAILQ_INIT(>listeners);
 QTAILQ_INSERT_TAIL(_spaces, as, address_spaces_link);
-

[PATCH v4 3/5] Update subprojects/libvfio-user

2023-09-19 Thread Mattias Nissler

Brings in assorted bug fixes. The following are of particular interest
with respect to message-based DMA support:

* bb308a2 "Fix address calculation for message-based DMA"
  Corrects a bug in DMA address calculation.

* 1569a37 "Pass server->client command over a separate socket pair"
  Adds support for separate sockets for either command direction,
  addressing a bug where libvfio-user gets confused if both client and
  server send commands concurrently.

Signed-off-by: Mattias Nissler 
---
 subprojects/libvfio-user.wrap | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/subprojects/libvfio-user.wrap b/subprojects/libvfio-user.wrap
index 416955ca45..cdf0a7a375 100644
--- a/subprojects/libvfio-user.wrap
+++ b/subprojects/libvfio-user.wrap
@@ -1,4 +1,4 @@
 [wrap-git]
 url = https://gitlab.com/qemu-project/libvfio-user.git
-revision = 0b28d205572c80b568a1003db2c8f37ca333e4d7
+revision = 1569a37a54ecb63bd4008708c76339ccf7d06115
 depth = 1
-- 
2.34.1

1 2 3 >

1 - 100 of 298 matches

Mail list logo