Re: [PATCH] hw/net/net_tx_pkt: Fix virtio header without checksum offloading
On Sun, Mar 24, 2024 at 4:32 PM Akihiko Odaki wrote:
>
> It is incorrect to have the VIRTIO_NET_HDR_F_NEEDS_CSUM bit set when
> checksum offloading is disabled, so clear the bit. Set the
> VIRTIO_NET_HDR_F_DATA_VALID bit instead to tell that the checksum is valid.
>
> TCP/UDP checksum is usually offloaded when the peer requires virtio
> headers, because they can instruct the peer to compute the checksum.
> However, igb disables TX checksum offloading when a VF is enabled,
> whether or not the peer requires virtio headers, because a transmitted
> packet can be routed to the VF and it expects the packet to have a proper
> checksum. Therefore, it is necessary to have a correct virtio header even
> when checksum offloading is disabled.
>
> A real TCP/UDP checksum will be computed and saved in the buffer when
> checksum offloading is disabled. The virtio specification requires
> setting the packet checksum stored in the buffer to the TCP/UDP pseudo-
> header checksum when the VIRTIO_NET_HDR_F_NEEDS_CSUM bit is set, so the
> bit must be cleared in that case.
>
> The VIRTIO_NET_HDR_F_NEEDS_CSUM bit also tells the peer to skip checksum
> validation. Even if checksum offloading is disabled, it is desirable to
> skip checksum validation because the checksum is always correct. Use the
> VIRTIO_NET_HDR_F_DATA_VALID bit to claim the validity of the checksum.
>
> Fixes: ffbd2dbd8e64 ("e1000e: Perform software segmentation for loopback")
> Buglink: https://issues.redhat.com/browse/RHEL-23067
> Signed-off-by: Akihiko Odaki
> ---
>  hw/net/net_tx_pkt.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c
> index 2e5f58b3c9cc..c225cf706513 100644
> --- a/hw/net/net_tx_pkt.c
> +++ b/hw/net/net_tx_pkt.c
> @@ -833,6 +833,9 @@ bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload,
>
>      if (offload || gso_type == VIRTIO_NET_HDR_GSO_NONE) {
>          if (!offload && pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
> +            pkt->virt_hdr.flags =
> +                (pkt->virt_hdr.flags & ~VIRTIO_NET_HDR_F_NEEDS_CSUM) |
> +                VIRTIO_NET_HDR_F_DATA_VALID;

Why is VIRTIO_NET_HDR_F_DATA_VALID used in the TX path?

Thanks

>              net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG],
>                                    pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1,
>                                    pkt->payload_len);
>
> ---
> base-commit: ba49d760eb04630e7b15f423ebecf6c871b8f77b
> change-id: 20240324-tx-c57d3c22ad73
>
> Best regards,
> --
> Akihiko Odaki
>
[PATCH 2/3] target/hppa: Replace c with uv in do_cond
Prepare for proper indication of shladd unsigned overflow.
The UV indicator will be zero/not-zero instead of a single bit.

Signed-off-by: Richard Henderson
---
 target/hppa/translate.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index a70d644c0b..9d31ef5764 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -707,7 +707,7 @@ static bool cond_need_cb(int c)
  */
 static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
-                         TCGv_i64 res, TCGv_i64 cb_msb, TCGv_i64 sv)
+                         TCGv_i64 res, TCGv_i64 uv, TCGv_i64 sv)
 {
     DisasCond cond;
     TCGv_i64 tmp;
@@ -754,14 +754,12 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
         }
         cond = cond_make_0_tmp(TCG_COND_EQ, tmp);
         break;
-    case 4: /* NUV / UV      (!C / C) */
-        /* Only bit 0 of cb_msb is ever set. */
-        cond = cond_make_0(TCG_COND_EQ, cb_msb);
+    case 4: /* NUV / UV      (!UV / UV) */
+        cond = cond_make_0(TCG_COND_EQ, uv);
         break;
-    case 5: /* ZNV / VNZ   (!C | Z / C & !Z) */
+    case 5: /* ZNV / VNZ   (!UV | Z / UV & !Z) */
         tmp = tcg_temp_new_i64();
-        tcg_gen_neg_i64(tmp, cb_msb);
-        tcg_gen_and_i64(tmp, tmp, res);
+        tcg_gen_movcond_i64(TCG_COND_EQ, tmp, uv, ctx->zero, ctx->zero, res);
         if (!d) {
             tcg_gen_ext32u_i64(tmp, tmp);
         }
-- 
2.34.1
[PATCH 3/3] target/hppa: Fix overflow computation for shladd
Overflow indicator should include the effect of the shift step.
We had previously left ??? comments about the issue.

Signed-off-by: Richard Henderson
---
 target/hppa/translate.c | 85 ++++++++++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 19 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 9d31ef5764..0976372d16 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -994,7 +994,8 @@ static TCGv_i64 get_psw_carry(DisasContext *ctx, bool d)
 
 /* Compute signed overflow for addition. */
 static TCGv_i64 do_add_sv(DisasContext *ctx, TCGv_i64 res,
-                          TCGv_i64 in1, TCGv_i64 in2)
+                          TCGv_i64 in1, TCGv_i64 in2,
+                          TCGv_i64 orig_in1, int shift, bool d)
 {
     TCGv_i64 sv = tcg_temp_new_i64();
     TCGv_i64 tmp = tcg_temp_new_i64();
@@ -1003,9 +1004,50 @@ static TCGv_i64 do_add_sv(DisasContext *ctx, TCGv_i64 res,
     tcg_gen_xor_i64(tmp, in1, in2);
     tcg_gen_andc_i64(sv, sv, tmp);
 
+    switch (shift) {
+    case 0:
+        break;
+    case 1:
+        /* Shift left by one and compare the sign. */
+        tcg_gen_add_i64(tmp, orig_in1, orig_in1);
+        tcg_gen_xor_i64(tmp, tmp, orig_in1);
+        /* Incorporate into the overflow. */
+        tcg_gen_or_i64(sv, sv, tmp);
+        break;
+    default:
+        {
+            int sign_bit = d ? 63 : 31;
+            uint64_t mask = MAKE_64BIT_MASK(sign_bit - shift, shift);
+
+            /* Compare the sign against all lower bits. */
+            tcg_gen_sextract_i64(tmp, orig_in1, sign_bit, 1);
+            tcg_gen_xor_i64(tmp, tmp, orig_in1);
+            /*
+             * If one of the bits shifting into or through the sign
+             * differs, then we have overflow.
+             */
+            tcg_gen_movcond_i64(TCG_COND_TSTNE, sv,
+                                tmp, tcg_constant_i64(mask),
+                                tcg_constant_i64(-1), sv);
+        }
+    }
     return sv;
 }
 
+/* Compute unsigned overflow for addition. */
+static TCGv_i64 do_add_uv(DisasContext *ctx, TCGv_i64 cb, TCGv_i64 cb_msb,
+                          TCGv_i64 in1, int shift, bool d)
+{
+    if (shift == 0) {
+        return get_carry(ctx, d, cb, cb_msb);
+    } else {
+        TCGv_i64 tmp = tcg_temp_new_i64();
+        tcg_gen_extract_i64(tmp, in1, (d ? 63 : 31) - shift, shift);
+        tcg_gen_or_i64(tmp, tmp, get_carry(ctx, d, cb, cb_msb));
+        return tmp;
+    }
+}
+
 /* Compute signed overflow for subtraction. */
 static TCGv_i64 do_sub_sv(DisasContext *ctx, TCGv_i64 res,
                           TCGv_i64 in1, TCGv_i64 in2)
@@ -1020,19 +1062,19 @@ static TCGv_i64 do_sub_sv(DisasContext *ctx, TCGv_i64 res,
     return sv;
 }
 
-static void do_add(DisasContext *ctx, unsigned rt, TCGv_i64 in1,
+static void do_add(DisasContext *ctx, unsigned rt, TCGv_i64 orig_in1,
                    TCGv_i64 in2, unsigned shift, bool is_l,
                    bool is_tsv, bool is_tc, bool is_c, unsigned cf, bool d)
 {
-    TCGv_i64 dest, cb, cb_msb, cb_cond, sv, tmp;
+    TCGv_i64 dest, cb, cb_msb, in1, uv, sv, tmp;
     unsigned c = cf >> 1;
     DisasCond cond;
 
     dest = tcg_temp_new_i64();
     cb = NULL;
     cb_msb = NULL;
-    cb_cond = NULL;
+    in1 = orig_in1;
     if (shift) {
         tmp = tcg_temp_new_i64();
         tcg_gen_shli_i64(tmp, in1, shift);
@@ -1050,9 +1092,6 @@ static void do_add(DisasContext *ctx, unsigned rt, TCGv_i64 in1,
         }
         tcg_gen_xor_i64(cb, in1, in2);
         tcg_gen_xor_i64(cb, cb, dest);
-        if (cond_need_cb(c)) {
-            cb_cond = get_carry(ctx, d, cb, cb_msb);
-        }
     } else {
         tcg_gen_add_i64(dest, in1, in2);
         if (is_c) {
@@ -1063,18 +1102,23 @@ static void do_add(DisasContext *ctx, unsigned rt, TCGv_i64 in1,
     /* Compute signed overflow if required. */
     sv = NULL;
     if (is_tsv || cond_need_sv(c)) {
-        sv = do_add_sv(ctx, dest, in1, in2);
+        sv = do_add_sv(ctx, dest, in1, in2, orig_in1, shift, d);
         if (is_tsv) {
             if (!d) {
                 tcg_gen_ext32s_i64(sv, sv);
             }
-            /* ??? Need to include overflow from shift. */
             gen_helper_tsv(tcg_env, sv);
         }
     }
 
+    /* Compute unsigned overflow if required. */
+    uv = NULL;
+    if (cond_need_cb(c)) {
+        uv = do_add_uv(ctx, cb, cb_msb, orig_in1, shift, d);
+    }
+
     /* Emit any conditional trap before any writeback. */
-    cond = do_cond(ctx, cf, d, dest, cb_cond, sv);
+    cond = do_cond(ctx, cf, d, dest, uv, sv);
     if (is_tc) {
         tmp = tcg_temp_new_i64();
         tcg_gen_setcond_i64(cond.c, tmp, cond.a0, cond.a1);
@@ -2843,7 +2887,6 @@ static bool trans_dcor_i(DisasContext *ctx, arg_rr_cf_d *a)
 static bool trans_ds(DisasContext *ctx, arg_rrr_cf *a)
 {
     TCGv_i64 dest, add1, add2, addc, in1, in2;
-    TCGv_i64 cout;
 
     nullify_over(ctx);
 
@@ -2880,19 +2
[PATCH 1/3] target/hppa: Squash d for pa1.x during decode
The cond_need_ext predicate was created while we still had a 32-bit
compilation mode. It now makes more sense to treat D as an absolute
indicator of a 64-bit operation.

Signed-off-by: Richard Henderson
---
 target/hppa/insns.decode | 20 ++++++++++--------
 target/hppa/translate.c  | 38 ++++++++++++++++------------------
 2 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/target/hppa/insns.decode b/target/hppa/insns.decode
index f58455dfdb..6a74cf23cd 100644
--- a/target/hppa/insns.decode
+++ b/target/hppa/insns.decode
@@ -57,6 +57,9 @@
 %neg_to_m    0:1  !function=neg_to_m
 %a_to_m      2:1  !function=neg_to_m
 %cmpbid_c    13:2 !function=cmpbid_c
+%d_5         5:1  !function=pa20_d
+%d_11        11:1 !function=pa20_d
+%d_13        13:1 !function=pa20_d
 
 # Argument set definitions
 
@@ -84,15 +87,16 @@
 # Format definitions
 
-@rr_cf_d      .. r:5 . cf:4 .. d:1 t:5      &rr_cf_d
+@rr_cf_d      .. r:5 . cf:4 .. .   t:5      &rr_cf_d d=%d_5
 @rrr          .. r2:5 r1:5 ... t:5          &rrr
 @rrr_cf       .. r2:5 r1:5 cf:4 ... t:5     &rrr_cf
-@rrr_cf_d     .. r2:5 r1:5 cf:4 .. d:1 t:5  &rrr_cf_d
+@rrr_cf_d     .. r2:5 r1:5 cf:4 .. .   t:5  &rrr_cf_d d=%d_5
 @rrr_sh       .. r2:5 r1:5 sh:2 . t:5       &rrr_sh
-@rrr_cf_d_sh  .. r2:5 r1:5 cf:4 sh:2 d:1 t:5   &rrr_cf_d_sh
-@rrr_cf_d_sh0 .. r2:5 r1:5 cf:4 .. d:1 t:5     &rrr_cf_d_sh sh=0
+@rrr_cf_d_sh  .. r2:5 r1:5 cf:4 sh:2 .   t:5   &rrr_cf_d_sh d=%d_5
+@rrr_cf_d_sh0 .. r2:5 r1:5 cf:4 .. .     t:5   &rrr_cf_d_sh d=%d_5 sh=0
 @rri_cf       .. r:5 t:5 cf:4 . ...         &rri_cf i=%lowsign_11
-@rri_cf_d     .. r:5 t:5 cf:4 d:1 ...       &rri_cf_d i=%lowsign_11
+@rri_cf_d     .. r:5 t:5 cf:4 .   ...       \
+              &rri_cf_d d=%d_11 i=%lowsign_11
 
 @rrb_cf       .. r2:5 r1:5 c:3 ... n:1 . \
               &rrb_c_f disp=%assemble_12
@@ -368,8 +372,10 @@ fmpysub_d       100110 . . . . 1 .      @mpyadd
 
 # Conditional Branches
 
-bb_sar          11 0 r:5 c:1 1 d:1 ... n:1 .  disp=%assemble_12
-bb_imm          110001 p:5 r:5 c:1 1 d:1 ... n:1 .  disp=%assemble_12
+bb_sar          11 0 r:5 c:1 1 . ... n:1 . \
+                disp=%assemble_12 d=%d_13
+bb_imm          110001 p:5 r:5 c:1 1 . ... n:1 . \
+                disp=%assemble_12 d=%d_13
 movb            110010 . . ... ... . .      @rrb_cf f=0
 movbi           110011 . . ... ... . .      @rib_cf f=0

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 99c5c4cbca..a70d644c0b 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -200,6 +200,14 @@ static int cmpbid_c(DisasContext *ctx, int val)
     return val ? val : 4; /* 0 == "*<<" */
 }
 
+/*
+ * In many places pa1.x did not decode the bit that later became
+ * the pa2.0 D bit.  Suppress D unless the cpu is pa2.0.
+ */
+static int pa20_d(DisasContext *ctx, int val)
+{
+    return ctx->is_pa20 & val;
+}
+
 /* Include the auto-generated decoder. */
 #include "decode-insns.c.inc"
@@ -693,12 +701,6 @@ static bool cond_need_cb(int c)
     return c == 4 || c == 5;
 }
 
-/* Need extensions from TCGv_i32 to TCGv_i64. */
-static bool cond_need_ext(DisasContext *ctx, bool d)
-{
-    return !(ctx->is_pa20 && d);
-}
-
 /*
  * Compute conditional for arithmetic.  See Page 5-3, Table 5-1, of
  * the Parisc 1.1 Architecture Reference Manual for details.
@@ -715,7 +717,7 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
         cond = cond_make_f();
         break;
     case 1: /* = / <>        (Z / !Z) */
-        if (cond_need_ext(ctx, d)) {
+        if (!d) {
             tmp = tcg_temp_new_i64();
             tcg_gen_ext32u_i64(tmp, res);
             res = tmp;
@@ -725,7 +727,7 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
     case 2: /* < / >=        (N ^ V / !(N ^ V) */
         tmp = tcg_temp_new_i64();
         tcg_gen_xor_i64(tmp, res, sv);
-        if (cond_need_ext(ctx, d)) {
+        if (!d) {
             tcg_gen_ext32s_i64(tmp, tmp);
         }
         cond = cond_make_0_tmp(TCG_COND_LT, tmp);
@@ -742,7 +744,7 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
          */
         tmp = tcg_temp_new_i64();
         tcg_gen_eqv_i64(tmp, res, sv);
-        if (cond_need_ext(ctx, d)) {
+        if (!d) {
             tcg_gen_sextract_i64(tmp, tmp, 31, 1);
             tcg_gen_and_i64(tmp, tmp, res);
             tcg_gen_ext32u_i64(tmp, tmp);
@@ -760,13 +762,13 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
         tmp = tcg_temp_new_i64();
         tcg_gen_neg_i64(tmp, cb_msb);
         tcg_gen_and_i64(tmp, tmp, res);
-        if (cond_need_ext(ctx, d)) {
+        if (!d) {
             tcg_gen_e
[PATCH for-9.0 0/3] target/hppa: Fix overflow computation for shladd
These ??? notes have been there since day one.
This fixes l2diag test 59.

r~

Richard Henderson (3):
  target/hppa: Squash d for pa1.x during decode
  target/hppa: Replace c with uv in do_cond
  target/hppa: Fix overflow computation for shladd

 target/hppa/insns.decode |  20 ++++--
 target/hppa/translate.c  | 135 ++++++++++++++++++++++++++++----------
 2 files changed, 104 insertions(+), 51 deletions(-)

-- 
2.34.1
Re: [PATCH 0/2] tap: Use g_spawn_sync() and g_spawn_check_wait_status()
On Tue, Dec 19, 2023 at 7:59 PM Akihiko Odaki wrote:
>
> g_spawn_sync() gives an informative message if it fails to execute
> the script, instead of reporting exit status 1.
>
> g_spawn_check_wait_status() also gives a message easier to understand
> than the raw value returned by waitpid().
>
> Signed-off-by: Akihiko Odaki
> ---
> Akihiko Odaki (2):
>       glib-compat: Define g_spawn_check_wait_status()
>       tap: Use g_spawn_sync() and g_spawn_check_wait_status()
>
>  include/glib-compat.h |  2 ++
>  net/tap.c             | 52 ++++++++++++++++-----------------------
>  2 files changed, 24 insertions(+), 30 deletions(-)
> ---
> base-commit: 9c74490bff6c8886a922008d0c9ce6cae70dd17e
> change-id: 20231219-glib-034a34bb05d8
>
> Best regards,
> --
> Akihiko Odaki

I've queued this for 9.1.

Thanks
Re: [PATCH v2] Revert "tap: setting error appropriately when calling net_init_tap_one()"
On Thu, Sep 21, 2023 at 5:48 PM Akihiko Odaki wrote:
>
> This reverts commit 46d4d36d0bf2b24b205f2f604f0905db80264eef.
>
> The reverted commit changed to emit warnings instead of errors when
> vhost is requested but vhost initialization fails, if the vhostforce
> option is not set.
>
> However, vhostforce is not meant to change the error handling. It was
> once introduced as an option to commit 5430a28fe4 ("vhost: force vhost
> off for non-MSI guests") to force enabling vhost for non-MSI guests,
> which will have worse performance with vhost. It was deprecated with
> commit 1e7398a140 ("vhost: enable vhost without without MSI-X") and
> changed to behave identically with the vhost option for compatibility.
>
> Worse, commit bf769f742c ("virtio: del net client if net_init_tap_one
> failed") changed to delete the client when vhost fails, even when the
> failure only results in a warning. This leads to an assertion failure
> for the -netdev command line option.
>
> The reverted commit was intended to ensure that the vhost initialization
> failure won't result in a corrupted netdev. This problem should have
> been fixed by deleting the netdev when the initialization fails, instead
> of ignoring the failure by converting it into a warning. Fortunately,
> commit bf769f742c ("virtio: del net client if net_init_tap_one failed"),
> mentioned earlier, implements this behavior.
>
> Restore the correct semantics and fix the assertion failure for the
> -netdev command line option by reverting the problematic commit.
>
> Signed-off-by: Akihiko Odaki
> ---
> V1 -> V2: Corrected the message.

Queued.

Thanks
Re: [PATCH v2] tap-win32: Remove unnecessary stubs
On Mon, Feb 12, 2024 at 10:04 PM Akihiko Odaki wrote:
>
> Some of them are only necessary for POSIX systems. The others are
> assigned to function pointers in NetClientInfo that can actually be
> NULL.
>
> Signed-off-by: Akihiko Odaki
> ---
> Changes in v2:
> - Rebased.
> - Link to v1:
>   https://lore.kernel.org/r/20231006051127.5429-1-akihiko.od...@daynix.com
> ---

Queued.

Thanks
Re: [External] Re: [PATCH v4 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info
On Mon, Mar 25, 2024 at 8:08 PM Huang, Ying wrote:
>
> "Ho-Ren (Jack) Chuang" writes:
>
> > On Fri, Mar 22, 2024 at 1:41 AM Huang, Ying wrote:
> >>
> >> "Ho-Ren (Jack) Chuang" writes:
> >>
> >> > The current implementation treats emulated memory devices, such as
> >> > CXL 1.1 type 3 memory, as normal DRAM when they are emulated as
> >> > normal memory (E820_TYPE_RAM). However, these emulated devices have
> >> > different characteristics than traditional DRAM, making it important
> >> > to distinguish them. Thus, we modify the tiered memory initialization
> >> > process to introduce a delay specifically for CPUless NUMA nodes.
> >> > This delay ensures that the memory tier initialization for these
> >> > nodes is deferred until HMAT information is obtained during the boot
> >> > process. Finally, demotion tables are recalculated at the end.
> >> >
> >> > * late_initcall(memory_tier_late_init);
> >> > Some device drivers may have initialized memory tiers between
> >> > `memory_tier_init()` and `memory_tier_late_init()`, potentially
> >> > bringing online memory nodes and configuring memory tiers. They
> >> > should be excluded in the late init.
> >> >
> >> > * Handle cases where there is no HMAT when creating memory tiers
> >> > There is a scenario where a CPUless node does not provide HMAT
> >> > information. If no HMAT is specified, it falls back to using the
> >> > default DRAM tier.
> >> >
> >> > * Introduce another new lock `default_dram_perf_lock` for adist
> >> > calculation
> >> > In the current implementation, iterating through CPUless nodes
> >> > requires holding the `memory_tier_lock`. However,
> >> > `mt_calc_adistance()` will end up trying to acquire the same lock,
> >> > leading to a potential deadlock. Therefore, we propose introducing a
> >> > standalone `default_dram_perf_lock` to protect `default_dram_perf_*`.
> >> > This approach not only avoids deadlock but also prevents holding a
> >> > large lock simultaneously.
> >> >
> >> > * Upgrade `set_node_memory_tier` to support additional cases,
> >> > including default DRAM, late CPUless, and hot-plugged initializations.
> >> > To cover hot-plugged memory nodes, `mt_calc_adistance()` and
> >> > `mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()`
> >> > to handle cases where memtype is not initialized and where HMAT
> >> > information is available.
> >> >
> >> > * Introduce `default_memory_types` for those memory types that are
> >> > not initialized by device drivers.
> >> > Because late initialized memory and default DRAM memory need to be
> >> > managed, a default memory type is created for storing all memory
> >> > types that are not initialized by device drivers and as a fallback.
> >> >
> >> > Signed-off-by: Ho-Ren (Jack) Chuang
> >> > Signed-off-by: Hao Xiang
> >> > ---
> >> >  mm/memory-tiers.c | 73 ++++++++++++++++++++++++++++++++++---------
> >> >  1 file changed, 63 insertions(+), 10 deletions(-)
> >> >
> >> > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> >> > index 974af10cfdd8..9396330fa162 100644
> >> > --- a/mm/memory-tiers.c
> >> > +++ b/mm/memory-tiers.c
> >> > @@ -36,6 +36,11 @@ struct node_memory_type_map {
> >> >
> >> >  static DEFINE_MUTEX(memory_tier_lock);
> >> >  static LIST_HEAD(memory_tiers);
> >> > +/*
> >> > + * The list is used to store all memory types that are not created
> >> > + * by a device driver.
> >> > + */
> >> > +static LIST_HEAD(default_memory_types);
> >> >  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
> >> >  struct memory_dev_type *default_dram_type;
> >> >
> >> > @@ -108,6 +113,7 @@ static struct demotion_nodes *node_demotion __read_mostly;
> >> >
> >> >  static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
> >> >
> >> > +static DEFINE_MUTEX(default_dram_perf_lock);
> >>
> >> Better to add comments about what is protected by this lock.
> >>
> >
> > Thank you. I will add a comment like this:
> >
> > +/* The lock is used to protect `default_dram_perf*` info and nid. */
> > +static DEFINE_MUTEX(default_dram_perf_lock);
> >
> > I also found an error path was not handled and
> > found the lock could be put closer to what it protects.
> > I will have them fixed in V5.
> >
> >> >  static bool default_dram_perf_error;
> >> >  static struct access_coordinate default_dram_perf;
> >> >  static int default_dram_perf_ref_nid = NUMA_NO_NODE;
> >> > @@ -505,7 +511,8 @@ static inline void __init_node_memory_type(int node, struct memory_dev_type *mem
> >> >  static struct memory_tier *set_node_memory_tier(int node)
> >> >  {
> >> >      struct memory_tier *memtier;
> >> > -    struct memory_dev_type *memtype;
> >> > +    struct memory_dev_type *mtype;
> >>
> >> mtype may be referenced without initialization now below.
> >>
> >
> > Good catch! Thank you.
> >
> > Please check below.
> > I may have found a potential NULL pointer dereference.
> >
> >> > +    int adist = MEMTIER_ADISTANCE_DRAM;
Re: [PATCH] hw/ppc/spapr: Include missing 'sysemu/tcg.h' header
On 3/22/24 21:54, Philippe Mathieu-Daudé wrote:
> "sysemu/tcg.h" declares tcg_enabled(), and is implicitly included.
> Include it explicitly to avoid the following error when refactoring
> headers:
>
>   hw/ppc/spapr.c:2612:9: error: call to undeclared function 'tcg_enabled';
>   ISO C99 and later do not support implicit function declarations
>   [-Wimplicit-function-declaration]
>       if (tcg_enabled()) {
>           ^
>
> Signed-off-by: Philippe Mathieu-Daudé

Reviewed-by: Harsh Prateek Bora

> ---
>  hw/ppc/spapr.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index c417f9dd52..e9bc97fee0 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -35,6 +35,7 @@
>  #include "sysemu/sysemu.h"
>  #include "sysemu/hostmem.h"
>  #include "sysemu/numa.h"
> +#include "sysemu/tcg.h"
>  #include "sysemu/qtest.h"
>  #include "sysemu/reset.h"
>  #include "sysemu/runstate.h"
Re: [PATCH v2] target/riscv/kvm/kvm-cpu.c: kvm_riscv_handle_sbi() fail with vendor-specific SBI
On Mon, Mar 25, 2024 at 11:46 PM Alexei Filippov wrote:
>
> kvm_riscv_handle_sbi() may return a not-supported return code so as not
> to trigger a QEMU abort on vendor-specific SBI calls.
>
> Added SBI-related return code defines.
>
> Signed-off-by: Alexei Filippov
> Fixes: 4eb47125 ("target/riscv: Handle KVM_EXIT_RISCV_SBI exit")
> Reviewed-by: Daniel Henrique Barboza

Reviewed-by: Alistair Francis

Alistair

> ---
>
> Changes since v1:
> - Add Fixes and Reviewed-by lines.
>
>  target/riscv/kvm/kvm-cpu.c         |  5 +++--
>  target/riscv/sbi_ecall_interface.h | 11 +++++++++++
>  2 files changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/target/riscv/kvm/kvm-cpu.c b/target/riscv/kvm/kvm-cpu.c
> index 6a6c6cae80..a4f84ad950 100644
> --- a/target/riscv/kvm/kvm-cpu.c
> +++ b/target/riscv/kvm/kvm-cpu.c
> @@ -1404,7 +1404,7 @@ static int kvm_riscv_handle_sbi(CPUState *cs, struct kvm_run *run)
>          if (ret == sizeof(ch)) {
>              run->riscv_sbi.ret[0] = ch;
>          } else {
> -            run->riscv_sbi.ret[0] = -1;
> +            run->riscv_sbi.ret[0] = SBI_ERR_FAILURE;
>          }
>          ret = 0;
>          break;
> @@ -1412,7 +1412,8 @@ static int kvm_riscv_handle_sbi(CPUState *cs, struct kvm_run *run)
>          qemu_log_mask(LOG_UNIMP,
>                        "%s: un-handled SBI EXIT, specific reasons is %lu\n",
>                        __func__, run->riscv_sbi.extension_id);
> -        ret = -1;
> +        run->riscv_sbi.ret[0] = SBI_ERR_NOT_SUPPORTED;
> +        ret = 0;
>          break;
>      }
>      return ret;
> diff --git a/target/riscv/sbi_ecall_interface.h b/target/riscv/sbi_ecall_interface.h
> index 43899d08f6..0279e92a36 100644
> --- a/target/riscv/sbi_ecall_interface.h
> +++ b/target/riscv/sbi_ecall_interface.h
> @@ -69,4 +69,15 @@
>  #define SBI_EXT_VENDOR_END 0x09FF
>  /* clang-format on */
>
> +/* SBI return error codes */
> +#define SBI_SUCCESS                0
> +#define SBI_ERR_FAILURE           -1
> +#define SBI_ERR_NOT_SUPPORTED     -2
> +#define SBI_ERR_INVALID_PARAM     -3
> +#define SBI_ERR_DENIED            -4
> +#define SBI_ERR_INVALID_ADDRESS   -5
> +#define SBI_ERR_ALREADY_AVAILABLE -6
> +#define SBI_ERR_ALREADY_STARTED   -7
> +#define SBI_ERR_ALREADY_STOPPED   -8
> +
>  #endif
> --
> 2.34.1
>
Re: [PATCH-for-9.0 v3 3/3] hw/misc/stm32l4x5_rcc: Propagate period when enabling a clock
On Tue, Mar 26, 2024 at 1:29 AM Philippe Mathieu-Daudé wrote:
>
> From: Arnaud Minier
>
> The "clock_set_mul_div" function doesn't propagate the clock period
> to the children if it is changed (e.g. by enabling/disabling a clock
> multiplexer).
> This was overlooked during the implementation due to late changes.
>
> This commit propagates the change if the multiplier or divider changes.
>
> Fixes: ec7d83acbd ("hw/misc/stm32l4x5_rcc: Add an internal clock multiplexer object")
> Signed-off-by: Arnaud Minier
> Signed-off-by: Inès Varhol
> Message-ID: <20240317103918.44375-2-arnaud.min...@telecom-paris.fr>
> [PMD: Check clock_set_mul_div() return value]
> Signed-off-by: Philippe Mathieu-Daudé

Reviewed-by: Alistair Francis

Alistair

> ---
>  hw/misc/stm32l4x5_rcc.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/hw/misc/stm32l4x5_rcc.c b/hw/misc/stm32l4x5_rcc.c
> index 49b90afdf0..ed2dbd9dc3 100644
> --- a/hw/misc/stm32l4x5_rcc.c
> +++ b/hw/misc/stm32l4x5_rcc.c
> @@ -61,7 +61,7 @@ static void clock_mux_update(RccClockMuxState *mux, bool bypass_source)
>          freq_multiplier = mux->divider;
>      }
>
> -    clock_set_mul_div(mux->out, freq_multiplier, mux->multiplier);
> +    clk_changed |= clock_set_mul_div(mux->out, freq_multiplier, mux->multiplier);
>      clk_changed |= clock_set(mux->out, clock_get(current_source));
>      if (clk_changed) {
>          clock_propagate(mux->out);
> --
> 2.41.0
>
Re: [PATCH-for-9.0 v3 2/3] hw/misc/stm32l4x5_rcc: Inline clock_update() in clock_mux_update()
On Tue, Mar 26, 2024 at 1:29 AM Philippe Mathieu-Daudé wrote:
>
> Trivial inlining in a preliminary patch to make the next
> one easier to review.
>
> Signed-off-by: Philippe Mathieu-Daudé

Reviewed-by: Alistair Francis

Alistair

> ---
>  hw/misc/stm32l4x5_rcc.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/hw/misc/stm32l4x5_rcc.c b/hw/misc/stm32l4x5_rcc.c
> index bc2d63528b..49b90afdf0 100644
> --- a/hw/misc/stm32l4x5_rcc.c
> +++ b/hw/misc/stm32l4x5_rcc.c
> @@ -48,6 +48,8 @@ static void clock_mux_update(RccClockMuxState *mux, bool bypass_source)
>      uint64_t src_freq;
>      Clock *current_source = mux->srcs[mux->src];
>      uint32_t freq_multiplier = 0;
> +    bool clk_changed = false;
> +
>      /*
>       * To avoid rounding errors, we use the clock period instead of the
>       * frequency.
> @@ -60,7 +62,10 @@ static void clock_mux_update(RccClockMuxState *mux, bool bypass_source)
>      }
>
>      clock_set_mul_div(mux->out, freq_multiplier, mux->multiplier);
> -    clock_update(mux->out, clock_get(current_source));
> +    clk_changed |= clock_set(mux->out, clock_get(current_source));
> +    if (clk_changed) {
> +        clock_propagate(mux->out);
> +    }
>
>      src_freq = clock_get_hz(current_source);
>      /* TODO: can we simply detect if the config changed so that we reduce log spam ? */
> --
> 2.41.0
>
Re: [PATCH-for-9.0 v3 1/3] hw/clock: Let clock_set_mul_div() return a boolean value
On Tue, Mar 26, 2024 at 1:29 AM Philippe Mathieu-Daudé wrote:
>
> Let clock_set_mul_div() return a boolean value indicating whether the
> clock has been updated or not, similarly to clock_set().
>
> Return early when clock_set_mul_div() is called with the
> same mul/div values the clock already has.
>
> Acked-by: Luc Michel
> Signed-off-by: Philippe Mathieu-Daudé

Reviewed-by: Alistair Francis

Alistair

> ---
>  docs/devel/clocks.rst | 4 ++++
>  include/hw/clock.h    | 4 +++-
>  hw/core/clock.c       | 8 +++++++-
>  3 files changed, 14 insertions(+), 2 deletions(-)
>
> diff --git a/docs/devel/clocks.rst b/docs/devel/clocks.rst
> index c4d14bde04..b2d1148cdb 100644
> --- a/docs/devel/clocks.rst
> +++ b/docs/devel/clocks.rst
> @@ -279,6 +279,10 @@ You can change the multiplier and divider of a clock at runtime,
>  so you can use this to model clock controller devices which
>  have guest-programmable frequency multipliers or dividers.
>
> +Similarly to ``clock_set()``, ``clock_set_mul_div()`` returns ``true`` if
> +the clock state was modified; that is, if the multiplier or the divider
> +or both were changed by the call.
> +
>  Note that ``clock_set_mul_div()`` does not automatically call
>  ``clock_propagate()``. If you make a runtime change to the
>  multiplier or divider you must call clock_propagate() yourself.
> diff --git a/include/hw/clock.h b/include/hw/clock.h
> index bb12117f67..eb58599131 100644
> --- a/include/hw/clock.h
> +++ b/include/hw/clock.h
> @@ -357,6 +357,8 @@ char *clock_display_freq(Clock *clk);
>   * @multiplier: multiplier value
>   * @divider: divider value
>   *
> + * @return: true if the clock is changed.
> + *
>   * By default, a Clock's children will all run with the same period
>   * as their parent. This function allows you to adjust the multiplier
>   * and divider used to derive the child clock frequency.
> @@ -374,6 +376,6 @@ char *clock_display_freq(Clock *clk);
>   * Note that this function does not call clock_propagate(); the
>   * caller should do that if necessary.
>   */
> -void clock_set_mul_div(Clock *clk, uint32_t multiplier, uint32_t divider);
> +bool clock_set_mul_div(Clock *clk, uint32_t multiplier, uint32_t divider);
>
>  #endif /* QEMU_HW_CLOCK_H */
> diff --git a/hw/core/clock.c b/hw/core/clock.c
> index d82e44cd1a..a19c7db7df 100644
> --- a/hw/core/clock.c
> +++ b/hw/core/clock.c
> @@ -143,14 +143,20 @@ char *clock_display_freq(Clock *clk)
>      return freq_to_str(clock_get_hz(clk));
>  }
>
> -void clock_set_mul_div(Clock *clk, uint32_t multiplier, uint32_t divider)
> +bool clock_set_mul_div(Clock *clk, uint32_t multiplier, uint32_t divider)
>  {
>      assert(divider != 0);
>
> +    if (clk->multiplier == multiplier && clk->divider == divider) {
> +        return false;
> +    }
> +
>      trace_clock_set_mul_div(CLOCK_PATH(clk), clk->multiplier, multiplier,
>                              clk->divider, divider);
>      clk->multiplier = multiplier;
>      clk->divider = divider;
> +
> +    return true;
> }
>
> static void clock_initfn(Object *obj)
> --
> 2.41.0
>
Re: [External] : Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
On Tue, Mar 26, 2024 at 7:21 AM Si-Wei Liu wrote:
>
> On 3/24/2024 11:13 PM, Jason Wang wrote:
> > On Sat, Mar 23, 2024 at 5:14 AM Si-Wei Liu wrote:
> > >
> > > On 3/21/2024 10:08 PM, Jason Wang wrote:
> > > > On Fri, Mar 22, 2024 at 5:43 AM Si-Wei Liu wrote:
> > > > >
> > > > > On 3/20/2024 8:56 PM, Jason Wang wrote:
> > > > > > On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu wrote:
> > > > > > > On 3/19/2024 8:27 PM, Jason Wang wrote:
> > > > > > > > On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu wrote:
> > > > > > > > > On 3/17/2024 8:22 PM, Jason Wang wrote:
> > > > > > > > > > On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu wrote:
> > > > > > > > > > > On 3/14/2024 9:03 PM, Jason Wang wrote:
> > > > > > > > > > > > On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu wrote:
> > > > > > > > > > > > > On setups with one or more virtio-net devices with vhost on,
> > > > > > > > > > > > > dirty tracking iteration increases cost the bigger the number
> > > > > > > > > > > > > of queues that are set up, e.g. on idle guest migration the
> > > > > > > > > > > > > following is observed with virtio-net with vhost=on:
> > > > > > > > > > > > >
> > > > > > > > > > > > > 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13
> > > > > > > > > > > > > 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13
> > > > > > > > > > > > > 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13
> > > > > > > > > > > > > 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14
> > > > > > > > > > > > >
> > > > > > > > > > > > > With high memory rates the symptom is lack of convergence as soon
> > > > > > > > > > > > > as it has a vhost device with a sufficiently high number of queues,
> > > > > > > > > > > > > or a sufficient number of vhost devices.
> > > > > > > > > > > > >
> > > > > > > > > > > > > On every migration iteration (every 100 msecs) it will redundantly
> > > > > > > > > > > > > query the *shared log* for the number of queues configured with vhost
> > > > > > > > > > > > > that exist in the guest. For the virtqueue data, this is necessary,
> > > > > > > > > > > > > but not for the memory sections, which are the same. So essentially
> > > > > > > > > > > > > we end up scanning the dirty log too often.
> > > > > > > > > > > > >
> > > > > > > > > > > > > To fix that, select a vhost device responsible for scanning the
> > > > > > > > > > > > > log with regards to memory sections dirty tracking. It is selected
> > > > > > > > > > > > > when we enable the logger (during migration) and cleared when we
> > > > > > > > > > > > > disable the logger. If the vhost logger device goes away for some
> > > > > > > > > > > > > reason, the logger will be re-selected from the rest of the vhost
> > > > > > > > > > > > > devices.
> > > > > > > > > > > > >
> > > > > > > > > > > > > After making the mem-section logger a singleton instance, a constant
> > > > > > > > > > > > > cost of 7%-9% (like the 1 queue report) will be seen, no matter how
> > > > > > > > > > > > > many queues or how many vhost devices are configured:
> > > > > > > > > > > > >
> > > > > > > > > > > > > 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13
> > > > > > > > > > > > > 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14
> > > > > > > > > > > > >
> > > > > > > > > > > > > Co-developed-by: Joao Martins
> > > > > > > > > > > > > Signed-off-by: Joao Martins
> > > > > > > > > > > > > Signed-off-by: Si-Wei Liu
> > > > > > > > > > > > >
> > > > > > > > > > > > > ---
> > > > > > > > > > > > > v3 -> v4:
> > > > > > > > > > > > > - add comment to clarify effect on cache locality and performance
> > > > > > > > > > > > >
> > > > > > > > > > > > > v2 -> v3:
> > > > > > > > > > > > > - add after-fix benchmark to commit log
> > > > > > > > > > > > > - rename vhost_log_dev_enabled to vhost_dev_should_log
> > > > > > > > > > > > > - remove unneeded comparisons for backend_type
> > > > > > > > > > > > > - use QLIST array instead of single flat list to store vhost
> > > > > > > > > > > > >   logger devices
> > > > > > > > > > > > > - simplify logger election logic
> > > > > > > > > > > > > ---
> > > > > > > > > > > > >  hw/virtio/vhost.c         | 67 ++++++++++++++++++++++++++++---
> > > > > > > > > > > > >  include/hw/virtio/vhost.h |  1 +
> > > > > > > > > > > > >  2 files changed, 62 insertions(+), 6 deletions(-)
> > > > > > > > > > > > >
> > > > > > > > > > > > > diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> > > > > > > > > > > > > index 612f4db..58522f1 100644
> > > > > > > > > > > > > --- a/hw/virtio/vhost.c
> > > > > > > > > > > > > +++ b/hw/virtio/vhost.c
> > > > > > > > > > > > > @@ -45,6 +45,7 @@
> > > > > > > > > > > > >
> > > > > > > > > > > > >  static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
> > > > > > > > > > > > >  static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
> > > > > > > > > > > > > +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
> > > > > > > > > > > > >
> > > > > > > > > > > > >  /* Memslots used by backends that support private memslots (without an fd). */
> > > > > > > > > > > > >  static unsigned int used_memslots;
> > > > > > > > > > > > > @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev)
> > > > > > > > > > > > >      }
> > > > > > > > > > > > >  }
> > > > > > > > > > > > >
> > > > > > > > > > > > > +static inline bool vhost_dev_should_log(struct vhost_dev *dev)
> > > > > > > > > > > > > +{
Re: [PATCH v3] contrib/plugins/execlog: Fix compiler warning
On 3/26/24 05:52, Yao Xingtao wrote: 1. The g_pattern_match_string() is deprecated when glib2 version >= 2.70. Use g_pattern_spec_match_string() instead to avoid this problem. 2. The type of second parameter in g_ptr_array_add() is 'gpointer' {aka 'void *'}, but the type of reg->name is 'const char*'. Cast the type of reg->name to 'gpointer' to avoid this problem. compiler warning message: /root/qemu/contrib/plugins/execlog.c:330:17: warning: ‘g_pattern_match_string’ is deprecated: Use 'g_pattern_spec_match_string' instead [-Wdeprecated-declarations] 330 | if (g_pattern_match_string(pat, rd->name) || | ^~ In file included from /usr/include/glib-2.0/glib.h:67, from /root/qemu/contrib/plugins/execlog.c:9: /usr/include/glib-2.0/glib/gpattern.h:57:15: note: declared here 57 | gboolean g_pattern_match_string (GPatternSpec *pspec, | ^~ /root/qemu/contrib/plugins/execlog.c:331:21: warning: ‘g_pattern_match_string’ is deprecated: Use 'g_pattern_spec_match_string' instead [-Wdeprecated-declarations] 331 | g_pattern_match_string(pat, rd_lower)) { | ^~ /usr/include/glib-2.0/glib/gpattern.h:57:15: note: declared here 57 | gboolean g_pattern_match_string (GPatternSpec *pspec, | ^~ /root/qemu/contrib/plugins/execlog.c:339:63: warning: passing argument 2 of ‘g_ptr_array_add’ discards ‘const’ qualifier from pointer target type [-Wdiscarded-qualifiers] 339 | g_ptr_array_add(all_reg_names, reg->name); |~~~^~ In file included from /usr/include/glib-2.0/glib.h:33: /usr/include/glib-2.0/glib/garray.h:198:62: note: expected ‘gpointer’ {aka ‘void *’} but argument is of type ‘const char *’ 198 |gpointer data); |~~^~~~ Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2210 Signed-off-by: Yao Xingtao --- contrib/plugins/execlog.c | 24 +--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/contrib/plugins/execlog.c b/contrib/plugins/execlog.c index a1dfd59ab7..fab18113d4 100644 --- a/contrib/plugins/execlog.c +++ b/contrib/plugins/execlog.c @@ -311,6 +311,24 @@ static 
Register *init_vcpu_register(qemu_plugin_reg_descriptor *desc) return reg; } +/* + * g_pattern_match_string has been deprecated in Glib since 2.70 and + * will complain about it if you try to use it. Fortunately the + * signature of both functions is the same making it easy to work + * around. + */ +static inline +gboolean g_pattern_spec_match_string_qemu(GPatternSpec *pspec, + const gchar *string) +{ +#if GLIB_CHECK_VERSION(2, 70, 0) +return g_pattern_spec_match_string(pspec, string); +#else +return g_pattern_match_string(pspec, string); +#endif +}; +#define g_pattern_spec_match_string(p, s) g_pattern_spec_match_string_qemu(p, s) + static GPtrArray *registers_init(int vcpu_index) { g_autoptr(GPtrArray) registers = g_ptr_array_new(); @@ -327,8 +345,8 @@ static GPtrArray *registers_init(int vcpu_index) for (int p = 0; p < rmatches->len; p++) { g_autoptr(GPatternSpec) pat = g_pattern_spec_new(rmatches->pdata[p]); g_autofree gchar *rd_lower = g_utf8_strdown(rd->name, -1); -if (g_pattern_match_string(pat, rd->name) || -g_pattern_match_string(pat, rd_lower)) { +if (g_pattern_spec_match_string(pat, rd->name) || +g_pattern_spec_match_string(pat, rd_lower)) { Register *reg = init_vcpu_register(rd); g_ptr_array_add(registers, reg); @@ -336,7 +354,7 @@ static GPtrArray *registers_init(int vcpu_index) if (disas_assist) { g_mutex_lock(&add_reg_name_lock); if (!g_ptr_array_find(all_reg_names, reg->name, NULL)) { -g_ptr_array_add(all_reg_names, reg->name); +g_ptr_array_add(all_reg_names, (gpointer)reg->name); } g_mutex_unlock(&add_reg_name_lock); } Would be nice if it's still possible to merge this in 9.0 Peter. Reviewed-by: Pierrick Bouvier
Re: [PATCH-for-9.0 v2 1/8] hw/clock: Have clock_set_mul_div() return early when nothing to change
On Mon, Mar 25, 2024 at 11:34 PM Philippe Mathieu-Daudé wrote: > > Return early when clock_set_mul_div() is called with > same mul/div values the clock has. > > Signed-off-by: Philippe Mathieu-Daudé Reviewed-by: Alistair Francis Alistair > --- > hw/core/clock.c | 4 > 1 file changed, 4 insertions(+) > > diff --git a/hw/core/clock.c b/hw/core/clock.c > index d82e44cd1a..c73f0c2f98 100644 > --- a/hw/core/clock.c > +++ b/hw/core/clock.c > @@ -147,6 +147,10 @@ void clock_set_mul_div(Clock *clk, uint32_t multiplier, > uint32_t divider) > { > assert(divider != 0); > > +if (clk->multiplier == multiplier && clk->divider == divider) { > +return; > +} > + > trace_clock_set_mul_div(CLOCK_PATH(clk), clk->multiplier, multiplier, > clk->divider, divider); > clk->multiplier = multiplier; > -- > 2.41.0 > >
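The early return above is the standard idempotent-setter guard: skip trace events and downstream propagation when the new values match the current ones. A minimal toy sketch of the pattern (hypothetical ToyClock type and names, not the QEMU Clock API):

```c
#include <assert.h>
#include <stdint.h>

/* Toy model of an idempotent setter: skip tracing and downstream
 * propagation when the new values match the current ones, exactly as
 * the early return in the patch does. */
typedef struct {
    uint32_t multiplier;
    uint32_t divider;
    unsigned updates;   /* stands in for trace events + period recomputation */
} ToyClock;

/* Returns 1 if the clock changed, 0 if the call was a no-op. */
static int toy_clock_set_mul_div(ToyClock *clk, uint32_t multiplier,
                                 uint32_t divider)
{
    assert(divider != 0);

    if (clk->multiplier == multiplier && clk->divider == divider) {
        return 0;       /* nothing to change: no trace, no propagation */
    }

    clk->multiplier = multiplier;
    clk->divider = divider;
    clk->updates++;     /* downstream work happens only on a real change */
    return 1;
}
```

Calling the setter twice with the same values touches downstream state only once, which is the point of the guard in the patch.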
Re: [External] Re: [PATCH v4 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info
"Ho-Ren (Jack) Chuang" writes: > On Fri, Mar 22, 2024 at 1:41 AM Huang, Ying wrote: >> >> "Ho-Ren (Jack) Chuang" writes: >> >> > The current implementation treats emulated memory devices, such as >> > CXL1.1 type3 memory, as normal DRAM when they are emulated as normal memory >> > (E820_TYPE_RAM). However, these emulated devices have different >> > characteristics than traditional DRAM, making it important to >> > distinguish them. Thus, we modify the tiered memory initialization process >> > to introduce a delay specifically for CPUless NUMA nodes. This delay >> > ensures that the memory tier initialization for these nodes is deferred >> > until HMAT information is obtained during the boot process. Finally, >> > demotion tables are recalculated at the end. >> > >> > * late_initcall(memory_tier_late_init); >> > Some device drivers may have initialized memory tiers between >> > `memory_tier_init()` and `memory_tier_late_init()`, potentially bringing >> > online memory nodes and configuring memory tiers. They should be excluded >> > in the late init. >> > >> > * Handle cases where there is no HMAT when creating memory tiers >> > There is a scenario where a CPUless node does not provide HMAT information. >> > If no HMAT is specified, it falls back to using the default DRAM tier. >> > >> > * Introduce another new lock `default_dram_perf_lock` for adist calculation >> > In the current implementation, iterating through CPUlist nodes requires >> > holding the `memory_tier_lock`. However, `mt_calc_adistance()` will end up >> > trying to acquire the same lock, leading to a potential deadlock. >> > Therefore, we propose introducing a standalone `default_dram_perf_lock` to >> > protect `default_dram_perf_*`. This approach not only avoids deadlock >> > but also prevents holding a large lock simultaneously. >> > >> > * Upgrade `set_node_memory_tier` to support additional cases, including >> > default DRAM, late CPUless, and hot-plugged initializations. 
>> > To cover hot-plugged memory nodes, `mt_calc_adistance()` and >> > `mt_find_alloc_memory_type()` are moved into `set_node_memory_tier()` to >> > handle cases where memtype is not initialized and where HMAT information is >> > available. >> > >> > * Introduce `default_memory_types` for those memory types that are not >> > initialized by device drivers. >> > Because late initialized memory and default DRAM memory need to be managed, >> > a default memory type is created for storing all memory types that are >> > not initialized by device drivers and as a fallback. >> > >> > Signed-off-by: Ho-Ren (Jack) Chuang >> > Signed-off-by: Hao Xiang >> > --- >> > mm/memory-tiers.c | 73 --- >> > 1 file changed, 63 insertions(+), 10 deletions(-) >> > >> > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c >> > index 974af10cfdd8..9396330fa162 100644 >> > --- a/mm/memory-tiers.c >> > +++ b/mm/memory-tiers.c >> > @@ -36,6 +36,11 @@ struct node_memory_type_map { >> > >> > static DEFINE_MUTEX(memory_tier_lock); >> > static LIST_HEAD(memory_tiers); >> > +/* >> > + * The list is used to store all memory types that are not created >> > + * by a device driver. >> > + */ >> > +static LIST_HEAD(default_memory_types); >> > static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; >> > struct memory_dev_type *default_dram_type; >> > >> > @@ -108,6 +113,7 @@ static struct demotion_nodes *node_demotion >> > __read_mostly; >> > >> > static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); >> > >> > +static DEFINE_MUTEX(default_dram_perf_lock); >> >> Better to add comments about what is protected by this lock. >> > > Thank you. I will add a comment like this: > + /* The lock is used to protect `default_dram_perf*` info and nid. */ > +static DEFINE_MUTEX(default_dram_perf_lock); > > I also found an error path was not handled and > found the lock could be put closer to what it protects. > I will have them fixed in V5. 
> >> > static bool default_dram_perf_error; >> > static struct access_coordinate default_dram_perf; >> > static int default_dram_perf_ref_nid = NUMA_NO_NODE; >> > @@ -505,7 +511,8 @@ static inline void __init_node_memory_type(int node, >> > struct memory_dev_type *mem >> > static struct memory_tier *set_node_memory_tier(int node) >> > { >> > struct memory_tier *memtier; >> > - struct memory_dev_type *memtype; >> > + struct memory_dev_type *mtype; >> >> mtype may be referenced without initialization now below. >> > > Good catch! Thank you. > > Please check below. > I may have found a potential NULL pointer dereference. > >> > + int adist = MEMTIER_ADISTANCE_DRAM; >> > pg_data_t *pgdat = NODE_DATA(node); >> > >> > >> > @@ -514,11 +521,20 @@ static struct memory_tier *set_node_memory_tier(int >> > node) >> > if (!node_state(node, N_MEMORY)) >> > return ERR_PTR(-EINVAL); >> > >> > - __init_node_memory_type(node, default_dram_type); >> > + mt_calc_adistance(node, &adi
Re: [PATCH] misc/pca955*: Move models under hw/gpio
On Mon, 2024-03-25 at 14:48 +0100, Cédric Le Goater wrote: > The PCA9552 and PCA9554 devices are both I2C GPIO controllers and the > PCA9552 also can drive LEDs. Do all the necessary adjustments to move > the models under hw/gpio. > > Cc: Glenn Miles > Signed-off-by: Cédric Le Goater Acked-by: Andrew Jeffery
Re: [External] : Re: [PATCH v4 2/2] vhost: Perform memory section dirty scans once per iteration
On 3/24/2024 11:13 PM, Jason Wang wrote: On Sat, Mar 23, 2024 at 5:14 AM Si-Wei Liu wrote: On 3/21/2024 10:08 PM, Jason Wang wrote: On Fri, Mar 22, 2024 at 5:43 AM Si-Wei Liu wrote: On 3/20/2024 8:56 PM, Jason Wang wrote: On Thu, Mar 21, 2024 at 5:03 AM Si-Wei Liu wrote: On 3/19/2024 8:27 PM, Jason Wang wrote: On Tue, Mar 19, 2024 at 6:16 AM Si-Wei Liu wrote: On 3/17/2024 8:22 PM, Jason Wang wrote: On Sat, Mar 16, 2024 at 2:45 AM Si-Wei Liu wrote: On 3/14/2024 9:03 PM, Jason Wang wrote: On Fri, Mar 15, 2024 at 5:39 AM Si-Wei Liu wrote: On setups with one or more virtio-net devices with vhost on, dirty tracking iteration increases in cost with the number of queues that are set up, e.g. on idle guest migration the following is observed with virtio-net with vhost=on: 48 queues -> 78.11% [.] vhost_dev_sync_region.isra.13 8 queues -> 40.50% [.] vhost_dev_sync_region.isra.13 1 queue -> 6.89% [.] vhost_dev_sync_region.isra.13 2 devices, 1 queue -> 18.60% [.] vhost_dev_sync_region.isra.14 With high memory rates the symptom is lack of convergence as soon as there is a vhost device with a sufficiently high number of queues, or a sufficient number of vhost devices. On every migration iteration (every 100 msecs) it will redundantly query the *shared log* once for every queue configured with vhost in the guest. For the virtqueue data this is necessary, but not for the memory sections, which are the same. So essentially we end up scanning the dirty log too often. To fix that, select a vhost device responsible for scanning the log with regard to memory-section dirty tracking. It is selected when we enable the logger (during migration) and cleared when we disable the logger. If the vhost logger device goes away for some reason, the logger will be re-selected from the rest of the vhost devices.
After making the mem-section logger a singleton instance, a constant cost of 7%-9% (like the 1 queue report) will be seen, no matter how many queues or how many vhost devices are configured: 48 queues -> 8.71% [.] vhost_dev_sync_region.isra.13 2 devices, 8 queues -> 7.97% [.] vhost_dev_sync_region.isra.14 Co-developed-by: Joao Martins Signed-off-by: Joao Martins Signed-off-by: Si-Wei Liu --- v3 -> v4: - add comment to clarify effect on cache locality and performance v2 -> v3: - add after-fix benchmark to commit log - rename vhost_log_dev_enabled to vhost_dev_should_log - remove unneeded comparisons for backend_type - use QLIST array instead of single flat list to store vhost logger devices - simplify logger election logic --- hw/virtio/vhost.c | 67 ++- include/hw/virtio/vhost.h | 1 + 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c index 612f4db..58522f1 100644 --- a/hw/virtio/vhost.c +++ b/hw/virtio/vhost.c @@ -45,6 +45,7 @@ static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX]; static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX]; +static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX]; /* Memslots used by backends that support private memslots (without an fd). */ static unsigned int used_memslots; @@ -149,6 +150,47 @@ bool vhost_dev_has_iommu(struct vhost_dev *dev) } } +static inline bool vhost_dev_should_log(struct vhost_dev *dev) +{ +assert(dev->vhost_ops); +assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE); +assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX); + +return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]); A dumb question, why not simply check dev->log == vhost_log_shm[dev->vhost_ops->backend_type] Because we are not sure if the logger comes from vhost_log_shm[] or vhost_log[]. Don't want to complicate the check here by calling into vhost_dev_log_is_shared() every time .log_sync() is called.
It has very low overhead, doesn't it? Whether this has low overhead will depend on the specific backend's implementation of .vhost_requires_shm_log(), which the common vhost layer should not make assumptions about, nor rely on the current implementation of. static bool vhost_dev_log_is_shared(struct vhost_dev *dev) { return dev->vhost_ops->vhost_requires_shm_log && dev->vhost_ops->vhost_requires_shm_log(dev); } For example, if I understand the code correctly, the log type won't be changed during runtime, so we can end up with a boolean to record that instead of querying the ops? Right now the log type won't change during runtime, but I am not sure if this may prohibit a future revisit to allow changing it at runtime, We can be bothered when we have such a request then. then there'll be complex code involved to maintain the state. Other than this, I think it's insufficient to just check the shm log vs. normal log. The l
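The election scheme debated in this thread boils down to: every vhost device with logging enabled sits on a per-backend-type list, only the list head scans the shared memory-section log, and removing the head promotes the next device. A simplified standalone sketch with a plain singly-linked list (illustrative names, not the QLIST-based patch code):

```c
#include <assert.h>
#include <stddef.h>

/* Toy stand-in for struct vhost_dev. The patch keeps one list per
 * backend type; a single list suffices to show the election logic. */
typedef struct ToyVhostDev {
    struct ToyVhostDev *next;
} ToyVhostDev;

/* The head of the list is the device elected to scan the shared log. */
static ToyVhostDev *logger_list;

/* Called when a device enables logging: append at the tail, so the
 * first device to enable logging stays the logger until it goes away. */
static void logger_join(ToyVhostDev *dev)
{
    ToyVhostDev **p = &logger_list;
    while (*p) {
        p = &(*p)->next;
    }
    dev->next = NULL;
    *p = dev;
}

/* Called when a device disables logging or is removed. Unlinking the
 * head implicitly re-elects the next device in the list. */
static void logger_leave(ToyVhostDev *dev)
{
    ToyVhostDev **p = &logger_list;
    while (*p && *p != dev) {
        p = &(*p)->next;
    }
    if (*p) {
        *p = dev->next;
    }
}

/* Only the elected head scans the memory sections; all other devices
 * still sync their own virtqueue data, just not the shared sections. */
static int dev_should_log(const ToyVhostDev *dev)
{
    return dev == logger_list;
}
```

With three devices joined, only the first reports should-log; removing it promotes the second, mirroring the "re-selected from the rest of vhost devices" behavior in the commit message.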
[PATCH] target/hppa: Fix diag instructions to set/restore shadow registers
The 32-bit 7300LC CPU and the 64-bit PCX-W 8500 CPU use different diag instructions to save or restore the CPU registers to/from the shadow registers. Implement those per-CPU architecture diag instructions to fix those parts of the HP ODE testcases (L2DIAG and WDIAG, section 1) which test the shadow registers. Signed-off-by: Helge Deller diff --git a/target/hppa/helper.h b/target/hppa/helper.h index 8fd7ba65d8..2c5d58bec9 100644 --- a/target/hppa/helper.h +++ b/target/hppa/helper.h @@ -86,6 +86,7 @@ DEF_HELPER_FLAGS_0(read_interval_timer, TCG_CALL_NO_RWG, tl) #ifndef CONFIG_USER_ONLY DEF_HELPER_1(halt, noreturn, env) DEF_HELPER_1(reset, noreturn, env) +DEF_HELPER_1(putshadowregs, void, env) DEF_HELPER_1(getshadowregs, void, env) DEF_HELPER_1(rfi, void, env) DEF_HELPER_1(rfi_r, void, env) diff --git a/target/hppa/sys_helper.c b/target/hppa/sys_helper.c index 4a31748342..3727f4ce8b 100644 --- a/target/hppa/sys_helper.c +++ b/target/hppa/sys_helper.c @@ -95,6 +95,17 @@ void HELPER(rfi)(CPUHPPAState *env) cpu_hppa_put_psw(env, env->cr[CR_IPSW]); } +void HELPER(putshadowregs)(CPUHPPAState *env) +{ +env->shadow[0] = env->gr[1]; +env->shadow[1] = env->gr[8]; +env->shadow[2] = env->gr[9]; +env->shadow[3] = env->gr[16]; +env->shadow[4] = env->gr[17]; +env->shadow[5] = env->gr[24]; +env->shadow[6] = env->gr[25]; +} + void HELPER(getshadowregs)(CPUHPPAState *env) { env->gr[1] = env->shadow[0]; diff --git a/target/hppa/translate.c b/target/hppa/translate.c index 99c5c4cbca..40f1cbe7ed 100644 --- a/target/hppa/translate.c +++ b/target/hppa/translate.c @@ -4533,6 +4533,18 @@ static bool trans_diag(DisasContext *ctx, arg_diag *a) gen_helper_diag_console_output(tcg_env); return nullify_end(ctx); } +if ((ctx->is_pa20 && a->i == 0x701840) || +(!ctx->is_pa20 && a->i == 0x1a40)) { +/* save shadow registers */ +nullify_over(ctx); +gen_helper_putshadowregs(tcg_env); +return nullify_end(ctx); +} +if ((ctx->is_pa20 && a->i == 0x781840) || +(!ctx->is_pa20 && a->i == 0x1a00)) { +/* restore 
shadow registers */ +return trans_getshadowregs(ctx, NULL); +} #endif qemu_log_mask(LOG_UNIMP, "DIAG opcode 0x%04x ignored\n", a->i); return true;
Re: [RFC PATCH 07/12] hw/arm/smmu-common: Support nested translation
Hi Julien, On Mon, Mar 25, 2024 at 02:20:07PM +, Julien Grall wrote: > Hi Mostafa, > > On 25/03/2024 10:14, Mostafa Saleh wrote: > > @@ -524,7 +551,7 @@ static int smmu_ptw_64_s2(SMMUTransCfg *cfg, > > tlbe->entry.translated_addr = gpa; > > tlbe->entry.iova = ipa & ~mask; > > tlbe->entry.addr_mask = mask; > > -tlbe->entry.perm = s2ap; > > +tlbe->parent_perm = tlbe->entry.perm = s2ap; > > tlbe->level = level; > > tlbe->granule = granule_sz; > > return 0; > > @@ -537,6 +564,35 @@ error: > > return -EINVAL; > > } > > +/* Combine 2 TLB entries and return in tlbe. */ > > +static void combine_tlb(SMMUTLBEntry *tlbe, SMMUTLBEntry *tlbe_s2, > > +dma_addr_t iova, SMMUTransCfg *cfg) > > +{ > > +if (cfg->stage == SMMU_NESTED) { > > + > > +/* > > + * tg and level are used from stage-1, while the addr mask can > > be > With the current approach, I can't boot a guest if I create a dummy stage-1 > using 512GB mapping and a stage-2 using 2MB mapping. It looks like this is > because the level will be used during the TLB lookup. Agh, I guess that case isn't common with Linux. I was able to reproduce it with a hacked Linux driver, and the issue happens in smmu_iotlb_lookup() because it assumes the cached entry has a mask matching level and granularity, which is not correct with nesting and I missed it, and fixing the mask is not enough here. Looking at the mask of the found entry is not good either: if there is a disparity between stage-1 and stage-2 levels, we always miss in the TLB even for the same address. > > I managed to solve the issue by using the max level of the two stages. I > think we may need to do a minimum for the granule. > Just fixing the granularity and level will always miss in the TLB if they are different, as granularity is used in lookup. I guess one way is to fall back to stage-2 granularity in lookup if stage-1 lookup fails; I will have another look and see if there is a better solution for v2.
But for now, as you mentioned (we also need to update the IOVA to match the mask), this should at least work: diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c index ef5edfe4dc..ac2dc3efeb 100644 --- a/hw/arm/smmu-common.c +++ b/hw/arm/smmu-common.c @@ -572,21 +572,13 @@ static void combine_tlb(SMMUTLBEntry *tlbe, SMMUTLBEntry *tlbe_s2, dma_addr_t iova, SMMUTransCfg *cfg) { if (cfg->stage == SMMU_NESTED) { - -/* - * tg and level are used from stage-1, while the addr mask can be - * smaller in case stage-2 size(based on granule and level) was - * smaller than stage-1. - * That should have no impact on: - * - lookup: as iova is properly aligned with the stage-1 level and - * granule. - * - Invalidation: as it uses the entry mask. - */ tlbe->entry.addr_mask = MIN(tlbe->entry.addr_mask, tlbe_s2->entry.addr_mask); tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2, tlbe->entry.translated_addr); - +tlbe->granule = MIN(tlbe->granule, tlbe_s2->granule); +tlbe->level = MAX(tlbe->level, tlbe_s2->level); +tlbe->entry.iova = iova & ~tlbe->entry.addr_mask; /* parent_perm has s2 perm while perm has s1 perm. */ tlbe->parent_perm = tlbe_s2->entry.perm; > > > + * smaller in case stage-2 size(based on granule and level) was > > + * smaller than stage-1. > > + * That should have no impact on: > > + * - lookup: as iova is properly aligned with the stage-1 > > level and > > + * granule. > > + * - Invalidation: as it uses the entry mask. > > + */ > > +tlbe->entry.addr_mask = MIN(tlbe->entry.addr_mask, > > +tlbe_s2->entry.addr_mask); > > +tlbe->entry.translated_addr = CACHED_ENTRY_TO_ADDR(tlbe_s2, > > + tlbe->entry.translated_addr); > > + > > +/* parent_perm has s2 perm while perm has s1 perm. */ > > +tlbe->parent_perm = tlbe_s2->entry.perm; > > +return; > > +} > > + > > +/* That was not nested, use the s2. */ > > +memcpy(tlbe, tlbe_s2, sizeof(*tlbe)); > > +} > > Cheers, > > -- > Julien Grall Thanks, Mostafa
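Stripped to its arithmetic, the fix under discussion says the combined entry can only describe the smaller of the two stages' block sizes, and the cached IOVA must be re-aligned to that size. A standalone sketch of just that merge step (hypothetical ToyTlbEntry type, not the QEMU structures; the real code also merges permissions, granule, and level as the diff shows):

```c
#include <assert.h>
#include <stdint.h>

/* Minimal view of a cached translation: the IOVA the entry covers and
 * its address mask (block size minus one). */
typedef struct {
    uint64_t iova;
    uint64_t addr_mask;
} ToyTlbEntry;

/* Merge a stage-1 entry (s1, modified in place) with a stage-2 entry:
 * the combined entry can only cover the smaller block, so take the
 * smaller mask and re-align the IOVA to it. */
static void toy_combine(ToyTlbEntry *s1, const ToyTlbEntry *s2, uint64_t iova)
{
    s1->addr_mask = s1->addr_mask < s2->addr_mask ? s1->addr_mask
                                                  : s2->addr_mask;
    s1->iova = iova & ~s1->addr_mask;
}
```

For the failing case in the thread, a 512 GiB stage-1 block combined with a 2 MiB stage-2 block yields a 2 MiB combined entry aligned to the 2 MiB boundary of the translated IOVA.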
Re: [PATCH v2] target/hppa: Fix unit carry conditions
On 3/25/24 20:02, Richard Henderson wrote: Split do_unit_cond to do_unit_zero_cond to only handle conditions versus zero. These are the only ones that are legal for UXOR. Simplify trans_uxor accordingly. Rename do_unit to do_unit_addsub, since xor has been split. Properly compute carry-out bits for add and subtract, mirroring the code in do_add and do_sub. Signed-off-by: Richard Henderson this patch does not break test #55 (uaddcm) any longer, and with the other two patches test #58 (uaddcm & dcor) is OK as well. So, for the whole series: Reviewed-by: Helge Deller Tested-by: Helge Deller Thanks! Helge --- v2: Cut and paste error between 64- and 32-bit paths. Shift 32-bit carry down 1 bit like 64-bit carry; tradeoff is shift vs needing a 64-bit constant for the mask. Don't use of TCG_COND_TST{NE,EQ}, as this will limit backports of the actual bug fix. We can convert the port to test conditions en masse during the next devel cycle. --- target/hppa/translate.c | 218 +--- 1 file changed, 113 insertions(+), 105 deletions(-) diff --git a/target/hppa/translate.c b/target/hppa/translate.c index 3fc3e7754c..99c5c4cbca 100644 --- a/target/hppa/translate.c +++ b/target/hppa/translate.c @@ -936,98 +936,44 @@ static DisasCond do_sed_cond(DisasContext *ctx, unsigned orig, bool d, return do_log_cond(ctx, c * 2 + f, d, res); } -/* Similar, but for unit conditions. */ - -static DisasCond do_unit_cond(unsigned cf, bool d, TCGv_i64 res, - TCGv_i64 in1, TCGv_i64 in2) +/* Similar, but for unit zero conditions. */ +static DisasCond do_unit_zero_cond(unsigned cf, bool d, TCGv_i64 res) { -DisasCond cond; -TCGv_i64 tmp, cb = NULL; +TCGv_i64 tmp; uint64_t d_repl = d ? 0x00010001ull : 1; - -if (cf & 8) { -/* Since we want to test lots of carry-out bits all at once, do not - * do our normal thing and compute carry-in of bit B+1 since that - * leaves us with carry bits spread across two words. 
- */ -cb = tcg_temp_new_i64(); -tmp = tcg_temp_new_i64(); -tcg_gen_or_i64(cb, in1, in2); -tcg_gen_and_i64(tmp, in1, in2); -tcg_gen_andc_i64(cb, cb, res); -tcg_gen_or_i64(cb, cb, tmp); -} +uint64_t ones = 0, sgns = 0; switch (cf >> 1) { -case 0: /* never / TR */ -cond = cond_make_f(); -break; - case 1: /* SBW / NBW */ if (d) { -tmp = tcg_temp_new_i64(); -tcg_gen_subi_i64(tmp, res, d_repl * 0x0001u); -tcg_gen_andc_i64(tmp, tmp, res); -tcg_gen_andi_i64(tmp, tmp, d_repl * 0x8000u); -cond = cond_make_0(TCG_COND_NE, tmp); -} else { -/* undefined */ -cond = cond_make_f(); +ones = d_repl; +sgns = d_repl << 31; } break; - case 2: /* SBZ / NBZ */ -/* See hasless(v,1) from - * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord - */ -tmp = tcg_temp_new_i64(); -tcg_gen_subi_i64(tmp, res, d_repl * 0x01010101u); -tcg_gen_andc_i64(tmp, tmp, res); -tcg_gen_andi_i64(tmp, tmp, d_repl * 0x80808080u); -cond = cond_make_0(TCG_COND_NE, tmp); +ones = d_repl * 0x01010101u; +sgns = ones << 7; break; - case 3: /* SHZ / NHZ */ -tmp = tcg_temp_new_i64(); -tcg_gen_subi_i64(tmp, res, d_repl * 0x00010001u); -tcg_gen_andc_i64(tmp, tmp, res); -tcg_gen_andi_i64(tmp, tmp, d_repl * 0x80008000u); -cond = cond_make_0(TCG_COND_NE, tmp); +ones = d_repl * 0x00010001u; +sgns = ones << 15; break; - -case 4: /* SDC / NDC */ -tcg_gen_andi_i64(cb, cb, d_repl * 0xu); -cond = cond_make_0(TCG_COND_NE, cb); -break; - -case 5: /* SWC / NWC */ -if (d) { -tcg_gen_andi_i64(cb, cb, d_repl * 0x8000u); -cond = cond_make_0(TCG_COND_NE, cb); -} else { -/* undefined */ -cond = cond_make_f(); -} -break; - -case 6: /* SBC / NBC */ -tcg_gen_andi_i64(cb, cb, d_repl * 0x80808080u); -cond = cond_make_0(TCG_COND_NE, cb); -break; - -case 7: /* SHC / NHC */ -tcg_gen_andi_i64(cb, cb, d_repl * 0x80008000u); -cond = cond_make_0(TCG_COND_NE, cb); -break; - -default: -g_assert_not_reached(); } -if (cf & 1) { -cond.c = tcg_invert_cond(cond.c); +if (ones == 0) { +/* Undefined, or 0/1 (never/always). */ +return cf & 1 ? 
cond_make_t() : cond_make_f(); } -return cond; +/* + * See hasless(v,1) from + * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord + */ +tmp = tcg_temp_new_i64(); +tcg_gen_subi_i64(tmp, res, ones); +tcg_gen_andc_i64(tmp, tmp,
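The hasless(v,1) trick referenced in the patch detects a zero byte in a word without a loop: subtracting 1 from every byte borrows exactly where a byte was zero, and masking with ~v and the per-byte sign bits isolates those borrows. A self-contained 32-bit illustration of the SBZ/NBZ and SHZ/NHZ masks used above (the 64-bit variant just replicates the masks via d_repl):

```c
#include <assert.h>
#include <stdint.h>

/* Nonzero iff some byte of v is zero; the SBZ/NBZ masks in the patch
 * (ones = 0x01010101, sgns = ones << 7). See
 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord */
static uint32_t has_zero_byte(uint32_t v)
{
    return (v - 0x01010101u) & ~v & 0x80808080u;
}

/* Same idea at halfword granularity; the SHZ/NHZ masks
 * (ones = 0x00010001, sgns = ones << 15). */
static uint32_t has_zero_half(uint32_t v)
{
    return (v - 0x00010001u) & ~v & 0x80008000u;
}
```

The subtraction sets a byte's high bit when that byte was zero or above 0x80; ANDing with ~v discards the above-0x80 false candidates, so only true zero bytes survive the sign mask.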
Re: [RFC 0/8] virtio,vhost: Add VIRTIO_F_IN_ORDER support
On Mon, Mar 25, 2024 at 5:52 PM Jonah Palmer wrote: > > > > On 3/22/24 7:18 AM, Eugenio Perez Martin wrote: > > On Thu, Mar 21, 2024 at 4:57 PM Jonah Palmer > > wrote: > >> > >> The goal of these patches is to add support to a variety of virtio and > >> vhost devices for the VIRTIO_F_IN_ORDER transport feature. This feature > >> indicates that all buffers are used by the device in the same order in > >> which they were made available by the driver. > >> > >> These patches attempt to implement a generalized, non-device-specific > >> solution to support this feature. > >> > >> The core feature behind this solution is a buffer mechanism in the form > >> of GLib's GHashTable. The decision behind using a hash table was to > >> leverage its ability for quick lookup, insertion, and removal > >> operations. Given that our keys are simply numbers of an ordered > >> sequence, a hash table seemed like the best choice for a buffer > >> mechanism. > >> > >> - > >> > >> The strategy behind this implementation is as follows: > >> > >> We know that buffers that are popped from the available ring and enqueued > >> for further processing will always be done in the same order in which they > >> were made available by the driver. Given this, we can note their order > >> by assigning the resulting VirtQueueElement a key. This key is a number > >> in a sequence that represents the order in which they were popped from > >> the available ring, relative to the other VirtQueueElements. > >> > >> For example, given 3 "elements" that were popped from the available > >> ring, we assign a key value to them which represents their order (elem0 > >> is popped first, then elem1, then lastly elem2): > >> > >> elem2 -- elem1 -- elem0 ---> Enqueue for processing > >> (key: 2)(key: 1)(key: 0) > >> > >> Then these elements are enqueued for further processing by the host.
> >> > >> While most devices will return these completed elements in the same > >> order in which they were enqueued, some devices may not (e.g. > >> virtio-blk). To guarantee that these elements are put on the used ring > >> in the same order in which they were enqueued, we can use a buffering > >> mechanism that keeps track of the next expected sequence number of an > >> element. > >> > >> In other words, if the completed element does not have a key value that > >> matches the next expected sequence number, then we know this element is > >> not in-order and we must stash it away in a hash table until an order > >> can be made. The element's key value is used as the key for placing it > >> in the hash table. > >> > >> If the completed element has a key value that matches the next expected > >> sequence number, then we know this element is in-order and we can push > >> it on the used ring. Then we increment the next expected sequence number > >> and check if the hash table contains an element at this key location. > >> > >> If so, we retrieve this element, push it to the used ring, delete the > >> key-value pair from the hash table, increment the next expected sequence > >> number, and check the hash table again for an element at this new key > >> location. This process is repeated until we're unable to find an element > >> in the hash table to continue the order. > >> > >> So, for example, say the 3 elements we enqueued were completed in the > >> following order: elem1, elem2, elem0. The next expected sequence number > >> is 0: > >> > >> exp-seq-num = 0: > >> > >> elem1 --> elem1.key == exp-seq-num ? --> No, stash it > >> (key: 1) | > >> | > >> v > >> > >> |key: 1 - elem1| > >> > >> - > >> exp-seq-num = 0: > >> > >> elem2 --> elem2.key == exp-seq-num ? --> No, stash it > >> (key: 2) | > >> | > >> v > >> > >> |key: 1 - elem1| > >> |--| > >> |key: 2 - elem2| > >> > >> - > >> exp-seq-num = 0: > >> > >> elem0 --> elem0.key == exp-seq-num ? 
--> Yes, push to used ring > >> (key: 0) > >> > >> exp-seq-num = 1: > >> > >> lookup(table, exp-seq-num) != NULL ? --> Yes, push to used ring, > >> remove elem from table > >>
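The stash-and-flush loop walked through above can be sketched as follows (a Python sketch for brevity; a plain dict stands in for the GHashTable, and all names are illustrative rather than QEMU's actual identifiers):

```python
class InOrderBuffer:
    """Puts completed elements back into the order they were popped."""

    def __init__(self):
        self.next_pop_key = 0    # key assigned at virtqueue_pop time
        self.next_used_key = 0   # next expected sequence number
        self.stash = {}          # out-of-order completions, keyed by sequence number
        self.used_ring = []      # elements pushed to the used ring, in order

    def pop(self, elem):
        """Assign the element its in-order key when it is popped."""
        key = self.next_pop_key
        self.next_pop_key += 1
        return key

    def complete(self, key, elem):
        """Handle a completion that may arrive out of order."""
        if key != self.next_used_key:
            # Not in order: stash it under its sequence-number key.
            self.stash[key] = elem
            return
        # In order: push it, then drain any stashed elements that
        # now continue the sequence.
        self.used_ring.append(elem)
        self.next_used_key += 1
        while self.next_used_key in self.stash:
            self.used_ring.append(self.stash.pop(self.next_used_key))
            self.next_used_key += 1
```

Replaying the example completion order elem1, elem2, elem0 yields a used ring of elem0, elem1, elem2, with the stash empty afterwards.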
Re: [PATCH v3 0/4] fix two edge cases related to stream block jobs
On Fri, Mar 22, 2024 at 10:50:05AM +0100, Fiona Ebner wrote: > Changes in v3: > * Also deal with edge case in bdrv_next_cleanup(). Haven't run > into an actual issue there, but at least the caller in > migration/block.c uses bdrv_nb_sectors() which, while not a > coroutine wrapper itself (it's written manually), may call > bdrv_refresh_total_sectors(), which is a generated coroutine > wrapper, so AFAIU, the block graph can change during that call. > And even without that, it's just better to be more consistent > with bdrv_next(). > > Changes in v2: > * Ran into another issue while writing the IO test Stefan wanted > to have (good call :)), so include a fix for that and add the > test. I didn't notice during manual testing, because I hadn't > used a scripted QMP 'quit', so there was no race. > > Fiona Ebner (3): > block-backend: fix edge case in bdrv_next() where BDS associated to BB > changes > block-backend: fix edge case in bdrv_next_cleanup() where BDS > associated to BB changes > iotests: add test for stream job with an unaligned prefetch read > > Stefan Reiter (1): > block/io: accept NULL qiov in bdrv_pad_request > > block/block-backend.c | 18 ++-- > block/io.c| 31 --- > .../tests/stream-unaligned-prefetch | 86 +++ > .../tests/stream-unaligned-prefetch.out | 5 ++ > 4 files changed, 117 insertions(+), 23 deletions(-) > create mode 100755 tests/qemu-iotests/tests/stream-unaligned-prefetch > create mode 100644 tests/qemu-iotests/tests/stream-unaligned-prefetch.out Looks good to me. I will wait until Thursday before merging in case Hanna, Vladimir, or Kevin have comments. Thanks! Stefan
Re: [PATCH v3 4/4] iotests: add test for stream job with an unaligned prefetch read
On Fri, Mar 22, 2024 at 10:50:09AM +0100, Fiona Ebner wrote: > Previously, bdrv_pad_request() could not deal with a NULL qiov when > a read needed to be aligned. During prefetch, a stream job will pass a > NULL qiov. Add a test case to cover this scenario. > > By accident, also covers a previous race during shutdown, where block > graph changes during iteration in bdrv_flush_all() could lead to > unreferencing the wrong block driver state and an assertion failure > later. > > Signed-off-by: Fiona Ebner > --- > > No changes in v3. > New in v2. > > .../tests/stream-unaligned-prefetch | 86 +++ > .../tests/stream-unaligned-prefetch.out | 5 ++ > 2 files changed, 91 insertions(+) > create mode 100755 tests/qemu-iotests/tests/stream-unaligned-prefetch > create mode 100644 tests/qemu-iotests/tests/stream-unaligned-prefetch.out Reviewed-by: Stefan Hajnoczi
Re: [PATCH v3 3/4] block-backend: fix edge case in bdrv_next_cleanup() where BDS associated to BB changes
On Fri, Mar 22, 2024 at 10:50:08AM +0100, Fiona Ebner wrote: > Same rationale as for commit "block-backend: fix edge case in > bdrv_next() where BDS associated to BB changes". The block graph might > change between the bdrv_next() call and the bdrv_next_cleanup() call, > so it could be that the associated BDS is not the same that was > referenced previously anymore. Instead, rely on bdrv_next() to set > it->bs to the BDS it referenced and unreference that one in any case. > > Signed-off-by: Fiona Ebner > --- > > New in v3. > > block/block-backend.c | 11 --- > 1 file changed, 4 insertions(+), 7 deletions(-) Reviewed-by: Stefan Hajnoczi
Re: [PATCH v3 2/4] block-backend: fix edge case in bdrv_next() where BDS associated to BB changes
On Fri, Mar 22, 2024 at 10:50:07AM +0100, Fiona Ebner wrote: > The old_bs variable in bdrv_next() is currently determined by looking > at the old block backend. However, if the block graph changes before > the next bdrv_next() call, it might be that the associated BDS is not > the same that was referenced previously. In that case, the wrong BDS > is unreferenced, leading to an assertion failure later: > > > bdrv_unref: Assertion `bs->refcnt > 0' failed. > > In particular, this can happen in the context of bdrv_flush_all(), > when polling for bdrv_co_flush() in the generated co-wrapper leads to > a graph change (for example with a stream block job [0]). > > A racy reproducer: > > > #!/bin/bash > > rm -f /tmp/backing.qcow2 > > rm -f /tmp/top.qcow2 > > ./qemu-img create /tmp/backing.qcow2 -f qcow2 64M > > ./qemu-io -c "write -P42 0x0 0x1" /tmp/backing.qcow2 > > ./qemu-img create /tmp/top.qcow2 -f qcow2 64M -b /tmp/backing.qcow2 -F qcow2 > > ./qemu-system-x86_64 --qmp stdio \ > > --blockdev > > qcow2,node-name=node0,file.driver=file,file.filename=/tmp/top.qcow2 \ > > < > {"execute": "qmp_capabilities"} > > {"execute": "block-stream", "arguments": { "job-id": "stream0", "device": > > "node0" } } > > {"execute": "quit"} > > EOF > > [0]: > > > #0 bdrv_replace_child_tran (child=..., new_bs=..., tran=...) > > #1 bdrv_replace_node_noperm (from=..., to=..., auto_skip=..., tran=..., > > errp=...) > > #2 bdrv_replace_node_common (from=..., to=..., auto_skip=..., > > detach_subchain=..., errp=...) > > #3 bdrv_drop_filter (bs=..., errp=...) > > #4 bdrv_cor_filter_drop (cor_filter_bs=...) > > #5 stream_prepare (job=...) > > #6 job_prepare_locked (job=...) > > #7 job_txn_apply_locked (fn=..., job=...) > > #8 job_do_finalize_locked (job=...) > > #9 job_exit (opaque=...) > > #10 aio_bh_poll (ctx=...) > > #11 aio_poll (ctx=..., blocking=...) > > #12 bdrv_poll_co (s=...) > > #13 bdrv_flush (bs=...) > > #14 bdrv_flush_all () > > #15 do_vm_stop (state=..., send_stop=...) 
> > #16 vm_shutdown () > > Signed-off-by: Fiona Ebner > --- > > No changes in v3. > New in v2. > > block/block-backend.c | 7 +++ > 1 file changed, 3 insertions(+), 4 deletions(-) Reviewed-by: Stefan Hajnoczi
Re: [PATCH v3 1/4] block/io: accept NULL qiov in bdrv_pad_request
On Fri, Mar 22, 2024 at 10:50:06AM +0100, Fiona Ebner wrote: > From: Stefan Reiter > > Some operations, e.g. block-stream, perform reads while discarding the > results (only copy-on-read matters). In this case, they will pass NULL > as the target QEMUIOVector, which will however trip bdrv_pad_request, > since it wants to extend its passed vector. In particular, this is the > case for the blk_co_preadv() call in stream_populate(). > > If there is no qiov, no operation can be done with it, but the bytes > and offset still need to be updated, so the subsequent aligned read > will actually be aligned and not run into an assertion failure. > > In particular, this can happen when the request alignment of the top > node is larger than the allocated part of the bottom node, in which > case padding becomes necessary. For example: > > > ./qemu-img create /tmp/backing.qcow2 -f qcow2 64M -o cluster_size=32768 > > ./qemu-io -c "write -P42 0x0 0x1" /tmp/backing.qcow2 > > ./qemu-img create /tmp/top.qcow2 -f qcow2 64M -b /tmp/backing.qcow2 -F qcow2 > > ./qemu-system-x86_64 --qmp stdio \ > > --blockdev > > qcow2,node-name=node0,file.driver=file,file.filename=/tmp/top.qcow2 \ > > < > {"execute": "qmp_capabilities"} > > {"execute": "blockdev-add", "arguments": { "driver": "compress", "file": > > "node0", "node-name": "node1" } } > > {"execute": "block-stream", "arguments": { "job-id": "stream0", "device": > > "node1" } } > > EOF > > Originally-by: Stefan Reiter > Signed-off-by: Thomas Lamprecht > [FE: do update bytes and offset in any case > add reproducer to commit message] > Signed-off-by: Fiona Ebner > --- > > No changes in v3. > No changes in v2. > > block/io.c | 31 +++ > 1 file changed, 19 insertions(+), 12 deletions(-) Reviewed-by: Stefan Hajnoczi
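The bookkeeping that must still happen when the qiov is NULL — widening the offset and byte count to alignment boundaries so the subsequent read is aligned — amounts to the following (a Python sketch of the arithmetic only; QEMU's bdrv_pad_request additionally manipulates the I/O vector when one is present):

```python
def pad_request(offset, nbytes, align):
    """Widen the range [offset, offset + nbytes) to `align` boundaries.

    Even with no I/O vector to extend, the offset and byte count must
    be updated so the aligned read does not trip an assertion later.
    Returns the (aligned_offset, aligned_bytes) pair.
    """
    head = offset % align
    aligned_offset = offset - head                     # round start down
    end = offset + nbytes
    aligned_end = ((end + align - 1) // align) * align  # round end up
    return aligned_offset, aligned_end - aligned_offset
```

For instance, a 1-byte read at offset 1 with a 512-byte request alignment becomes a 512-byte read at offset 0.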
Re: [RFC 4/8] virtio: Implement in-order handling for virtio devices
On Mon, Mar 25, 2024 at 6:35 PM Jonah Palmer wrote: > > > > On 3/22/24 6:46 AM, Eugenio Perez Martin wrote: > > On Thu, Mar 21, 2024 at 4:57 PM Jonah Palmer > > wrote: > >> > >> Implements in-order handling for most virtio devices using the > >> VIRTIO_F_IN_ORDER transport feature, specifically those who call > >> virtqueue_push to push their used elements onto the used ring. > >> > >> The logic behind this implementation is as follows: > >> > >> 1.) virtqueue_pop always enqueues VirtQueueElements in-order. > >> > >> virtqueue_pop always retrieves one or more buffer descriptors in-order > >> from the available ring and converts them into a VirtQueueElement. This > >> means that the order in which VirtQueueElements are enqueued are > >> in-order by default. > >> > >> By virtue, as VirtQueueElements are created, we can assign a sequential > >> key value to them. This preserves the order of buffers that have been > >> made available to the device by the driver. > >> > >> As VirtQueueElements are assigned a key value, the current sequence > >> number is incremented. > >> > >> 2.) Requests can be completed out-of-order. > >> > >> While most devices complete requests in the same order that they were > >> enqueued by default, some devices don't (e.g. virtio-blk). The goal of > >> this out-of-order handling is to reduce the impact of devices that > >> process elements in-order by default while also guaranteeing compliance > >> with the VIRTIO_F_IN_ORDER feature. > >> > >> Below is the logic behind handling completed requests (which may or may > >> not be in-order). > >> > >> 3.) Does the incoming used VirtQueueElement preserve the correct order? > >> > >> In other words, is the sequence number (key) assigned to the > >> VirtQueueElement the expected number that would preserve the original > >> order? > >> > >> 3a.) > >> If it does... immediately push the used element onto the used ring. 
> >> Then increment the next expected sequence number and check to see if > >> any previous out-of-order VirtQueueElements stored on the hash table > >> has a key that matches this next expected sequence number. > >> > >> For each VirtQueueElement found on the hash table with a matching key: > >> push the element on the used ring, remove the key-value pair from the > >> hash table, and then increment the next expected sequence number. Repeat > >> this process until we're unable to find an element with a matching key. > >> > >> Note that if the device uses batching (e.g. virtio-net), then we skip > >> the virtqueue_flush call and let the device call it themselves. > >> > >> 3b.) > >> If it does not... stash the VirtQueueElement, along with relevant data, > >> as a InOrderVQElement on the hash table. The key used is the order_key > >> that was assigned when the VirtQueueElement was created. > >> > >> Signed-off-by: Jonah Palmer > >> --- > >> hw/virtio/virtio.c | 70 -- > >> include/hw/virtio/virtio.h | 8 + > >> 2 files changed, 76 insertions(+), 2 deletions(-) > >> > >> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c > >> index 40124545d6..40e4377f1e 100644 > >> --- a/hw/virtio/virtio.c > >> +++ b/hw/virtio/virtio.c > >> @@ -992,12 +992,56 @@ void virtqueue_flush(VirtQueue *vq, unsigned int > >> count) > >> } > >> } > >> > >> +void virtqueue_order_element(VirtQueue *vq, const VirtQueueElement *elem, > >> + unsigned int len, unsigned int idx, > >> + unsigned int count) > >> +{ > >> +InOrderVQElement *in_order_elem; > >> + > >> +if (elem->order_key == vq->current_order_idx) { > >> +/* Element is in-order, push to used ring */ > >> +virtqueue_fill(vq, elem, len, idx); > >> + > >> +/* Batching? Don't flush */ > >> +if (count) { > >> +virtqueue_flush(vq, count); > > > > The "count" parameter is the number of heads used, but here you're > > only using one head (elem). Same with the other virtqueue_flush in the > > function. > > > > True. 
This acts more as a flag than an actual count since, unless we're > batching (which in the current setup, the device would explicitly call > virtqueue_flush separately), this value will be either 0 or 1. > If possible, I think it is better to keep consistent with the current API: fill+flush for batching, push for a single shot. > > Also, this function sometimes replaces virtqueue_fill and other > > replaces virtqueue_fill + virtqueue_flush (both examples in patch > > 6/8). I have the impression the series would be simpler if > > virtqueue_order_element is a static function just handling the > > virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER) path of > > virtqueue_fill, so the caller does not need to know if the in_order > > feature is on or off. > > > > Originally I wanted this function to replace virtqueue_fill + > virtqueue_flush but after looking at virtio_net_receive_rcu and > vhost_svq_flush, where multiple virtqueue_
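The API convention Eugenio is pointing at — fill+flush for batching, push as the single-shot form built from them — can be paraphrased like this (a deliberately simplified Python sketch, not QEMU's implementation; the `vq` fields are illustrative):

```python
from types import SimpleNamespace

def virtqueue_fill(vq, elem, length, idx):
    """Record a used element at slot idx without publishing it yet."""
    vq.pending[idx] = (elem, length)

def virtqueue_flush(vq, count):
    """Publish `count` previously filled elements to the used ring at once."""
    for i in range(count):
        vq.used.append(vq.pending.pop(i))

def virtqueue_push(vq, elem, length):
    """Single-shot form: fill one element and flush it immediately."""
    virtqueue_fill(vq, elem, length, 0)
    virtqueue_flush(vq, 1)

def new_vq():
    return SimpleNamespace(pending={}, used=[])
```

A batching device (virtio-net, SVQ) calls fill repeatedly and flushes once; everything else just calls push — which is why overloading flush's `count` parameter as a flag breaks the symmetry.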
target/hppa: be,n nullifying first insn at branch target?
Hi Richard, one of the last issue i'm seeing with 64bit HP-UX 11.11 is a crash of ld64 while building (linking) the kernel. The program is killed due to a unaligned access. I've wrote a logfile, and it looks like be,n is nullifying the first instruction at the branch target: IN: 0x9d4a416d4fc: cmpiclr,<> c,r31,r0 0x9d4a416d500: addi,tr 13,r0,ret1 0x9d4a416d504: ldi 0,ret1 0x9d4a416d508: ldi 0,ret0 0x9d4a416d50c: ldsid (rp),r31 0x9d4a416d510: mtsp r31,sr0 0x9d4a416d514: be,n 0(sr0,rp) Trace 0: 0x7efd7f9d75c0 [9d4a404/09d4a416d4fc/00040306/ff00] IA_F 09d4a416d4ff IA_B 09d4a416d503 IIR 4afc3ff9 PSW 00ff0004ff1f CB -C---RQPDI GR00 GR01 09d4a400 GR02 00107573 GR03 003c79b8 GR04 00419f50 GR05 00410a30 GR06 7a0005c8 GR07 00419f50 GR08 004122f8 GR09 000b GR10 001db1b8 GR11 001c81f8 GR12 001c81f8 GR13 001c81f8 GR14 GR15 001dbf18 GR16 GR17 0001 GR18 001d5278 GR19 001c5000 GR20 00416a40 GR21 006a GR22 0016d4e8 GR23 004a GR24 029c GR25 GR26 00419f50 GR27 001e65f8 GR28 0001 GR29 001db1b8 GR30 7a029510 GR31 000c SR00 09d4a400 SR01 SR02 SR03 SR04 09d4a400 SR05 09d4a400 SR06 01eea400 SR07 01eea400 IN: 0x9d4a4107570: ldw 0(r4),r19 0x9d4a4107574: ldw 58(r19),r22 0x9d4a4107578: ldo 0(ret1),r7 0x9d4a410757c: ldo 0(r4),r26 0x9d4a4107580: b,l 0x9d4a41074d8,r31 0x9d4a4107584: ldo 0(r31),rp Trace 0: 0x7efd7f9d77c0 [9d4a404/09d4a4107570/00240306/ff00] IA_F 09d4a4107573 IA_B 09d4a4107577 IIR 4afc3ff9 PSW 0024001f CB --N--C---RQPDI GR00 GR01 09d4a400 GR02 00107573 GR03 003c79b8 GR04 00419f50 GR05 00410a30 GR06 7a0005c8 GR07 00419f50 GR08 004122f8 GR09 000b GR10 001db1b8 GR11 001c81f8 GR12 001c81f8 GR13 001c81f8 GR14 GR15 001dbf18 GR16 GR17 0001 GR18 001d5278 GR19 001c5000 GR20 00416a40 GR21 006a GR22 0016d4e8 GR23 004a GR24 029c GR25 GR26 00419f50 GR27 001e65f8 GR28 GR29 0013 GR30 7a029510 GR31 09d4a400 SR00 09d4a400 SR01 SR02 SR03 SR04 09d4a400 SR05 09d4a400 SR06 01eea400 SR07 01eea400 Trace 0: 0x7efd7f9adb80 [9d4a404/09d4a41074d8/00040306/ff00] IA_F 09d4a41074db IA_B 09d4a41074df IIR 
4afc3ff9 PSW 0004001f CB -C---RQPDI GR00 GR01 09d4a400 GR02 0010758b GR03 003c79b8 GR04 00419f50 GR05 00410a30 GR06 7a0005c8 GR07 0013 GR08 004122f8 GR09 000b GR10 001db1b8 GR11 001c81f8 GR12 001c81f8 GR13 001c81f8 GR14 GR15 001dbf18 GR16 GR17 0001 GR18 001d5278 GR19 001c5000 GR20 00416a40 GR21 006a GR22 2400 GR23 004a GR24 029c GR25 GR26 00419f50 GR27 001e65f8 GR28 GR29 0013 GR30 7a029510 GR31 0010758b SR00 09d4a400 SR01 SR02 SR03 SR04 09d4a400 SR05 09d4a400 SR06 01eea400 SR07 01eea400 The problem is the 0x1c5000 value in r19, which is the value of an old instruction. At 0x9d4a416d514 is a be,n and at 09d4a4107570 the N bit is set. First i thought it might be just a display issue with the log, but the following change to confirm the issue makes the kernel linking succeed: diff --git a/target/hppa/translate.c b/target/hppa/translate.c index 7546a5f5a2..56c68a7cbe 100644 --- a/target/hppa/translate.c +++ b/target/hppa/translate.c @@ -3847,7 +3849,7 @@ static bool trans_be(DisasContext *ctx, arg_be *a) copy_iaoq_entry(ctx, cpu_gr[31], ctx->iaoq_n, ctx->iaoq_n_var); tcg_gen_mov_i64(cpu_sr[0], cpu_iasq_b); } -if (a->n && use_nullify_skip(ctx)) { +if (0 && a->n && use_nullify_skip(ctx)) { copy_iaoq_entry(ctx, cpu_iaoq_f, -1, tmp); tcg_gen_addi_i64(tmp, tmp, 4); copy_iaoq_entry(ctx, cpu_iaoq_b, -1, tmp); So i think the problem is caused by this optimization. Does this ring a bell? Debugging this is rather hard, alone the logfile above is 6GB in size... Thanks, Sven
[PATCH v2 2/4] linux-user: Fix shmat() strace
The indices of arguments passed to print_shmat() are all off-by-1, because arg1 is the ipc() command. Fix them. New output for linux-shmat-maps test: 3501769 shmat(4784214,0x0080,SHM_RND) = 0 Fixes: 9f7c97324c27 ("linux-user: Add strace for shmat") Reviewed-by: Richard Henderson Signed-off-by: Ilya Leoshkevich --- linux-user/strace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux-user/strace.c b/linux-user/strace.c index 660f942f599..54169096aa4 100644 --- a/linux-user/strace.c +++ b/linux-user/strace.c @@ -701,7 +701,7 @@ print_ipc(CPUArchState *cpu_env, const struct syscallname *name, break; case IPCOP_shmat: print_shmat(cpu_env, &(const struct syscallname){ .name = "shmat" }, -arg1, arg4, arg2, 0, 0, 0); +arg2, arg5, arg3, 0, 0, 0); break; default: qemu_log(("%s(" -- 2.44.0
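The off-by-one exists because ipc(2) is a multiplexer: its first argument selects the operation, so the operation's own arguments start one slot later than in a direct syscall. A sketch of the decoding (illustrative constant and names, not QEMU's):

```python
IPCOP_SHMAT = 21  # illustrative value for the ipc() shmat sub-command

def decode_ipc(args):
    """args mirrors ipc(call, first, second, third, ptr).

    The sub-command consumes args[0], so for shmat the shmid is in
    `first`, the flags in `second`, and the address in `ptr` -- each
    one slot later than a direct shmat(shmid, shmaddr, shmflg) call
    would place them. Using the direct-call indices here is exactly
    the off-by-one the patch fixes.
    """
    call, first, second, third, ptr = args
    if call == IPCOP_SHMAT:
        shmid, shmflg, shmaddr = first, second, ptr
        return f"shmat({shmid},{hex(shmaddr)},{shmflg})"
    raise NotImplementedError(call)
```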
[PATCH v2 4/4] tests/tcg: Test shmat(NULL)
Add a small test to prevent regressions. Reviewed-by: Richard Henderson Signed-off-by: Ilya Leoshkevich --- tests/tcg/multiarch/linux/linux-shmat-null.c | 38 1 file changed, 38 insertions(+) create mode 100644 tests/tcg/multiarch/linux/linux-shmat-null.c diff --git a/tests/tcg/multiarch/linux/linux-shmat-null.c b/tests/tcg/multiarch/linux/linux-shmat-null.c new file mode 100644 index 000..94eaaec371a --- /dev/null +++ b/tests/tcg/multiarch/linux/linux-shmat-null.c @@ -0,0 +1,38 @@ +/* + * Test shmat(NULL). + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ +#include <assert.h> +#include <stdlib.h> +#include <sys/ipc.h> +#include <sys/shm.h> + +int main(void) +{ +int shmid; +char *p; +int err; + +/* Create, attach and initialize shared memory. */ +shmid = shmget(IPC_PRIVATE, 1, IPC_CREAT | 0600); +assert(shmid != -1); +p = shmat(shmid, NULL, 0); +assert(p != (void *)-1); +*p = 42; + +/* Reattach, check that the value is still there. */ +err = shmdt(p); +assert(err == 0); +p = shmat(shmid, NULL, 0); +assert(p != (void *)-1); +assert(*p == 42); + +/* Detach. */ +err = shmdt(p); +assert(err == 0); +err = shmctl(shmid, IPC_RMID, NULL); +assert(err == 0); + +return EXIT_SUCCESS; +} -- 2.44.0
[PATCH v2 0/4] linux-user: Fix shmat(NULL) for h != g
v1: https://lore.kernel.org/qemu-devel/20240325153313.526888-1-...@linux.ibm.com/ v1 -> v2: Remove an unnecessary ifdef, add R-Bs (Richard). Hi, I noticed that while shmat() now works with /proc/self/maps, shmat(NULL) got broken. This series fixes that along with two related strace issues, and adds a test. Best regards, Ilya Ilya Leoshkevich (4): linux-user: Fix semctl() strace linux-user: Fix shmat() strace linux-user: Fix shmat(NULL) for h != g tests/tcg: Test shmat(NULL) linux-user/mmap.c| 2 +- linux-user/strace.c | 10 ++ tests/tcg/multiarch/linux/linux-shmat-null.c | 38 3 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 tests/tcg/multiarch/linux/linux-shmat-null.c -- 2.44.0
[PATCH v2 1/4] linux-user: Fix semctl() strace
The indices of arguments used with semctl() are all off-by-1, because arg1 is the ipc() command. Fix them. While at it, reuse print_semctl(). New output (for a small test program): 3540333 semctl(999,888,SEM_INFO,0x7fe5051ee9a0) = -1 errno=14 (Bad address) Fixes: 7ccfb2eb5f9d ("Fix warnings that would be caused by gcc flag -Wwrite-strings") Reviewed-by: Richard Henderson Signed-off-by: Ilya Leoshkevich --- linux-user/strace.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/linux-user/strace.c b/linux-user/strace.c index 9934e2208e2..660f942f599 100644 --- a/linux-user/strace.c +++ b/linux-user/strace.c @@ -657,7 +657,6 @@ print_newselect(CPUArchState *cpu_env, const struct syscallname *name, } #endif -#ifdef TARGET_NR_semctl static void print_semctl(CPUArchState *cpu_env, const struct syscallname *name, abi_long arg1, abi_long arg2, abi_long arg3, @@ -668,7 +667,6 @@ print_semctl(CPUArchState *cpu_env, const struct syscallname *name, print_ipc_cmd(arg3); qemu_log(",0x" TARGET_ABI_FMT_lx ")", arg4); } -#endif static void print_shmat(CPUArchState *cpu_env, const struct syscallname *name, @@ -698,10 +696,8 @@ print_ipc(CPUArchState *cpu_env, const struct syscallname *name, { switch(arg1) { case IPCOP_semctl: -qemu_log("semctl(" TARGET_ABI_FMT_ld "," TARGET_ABI_FMT_ld ",", - arg1, arg2); -print_ipc_cmd(arg3); -qemu_log(",0x" TARGET_ABI_FMT_lx ")", arg4); +print_semctl(cpu_env, &(const struct syscallname){ .name = "semctl" }, + arg2, arg3, arg4, arg5, 0, 0); break; case IPCOP_shmat: print_shmat(cpu_env, &(const struct syscallname){ .name = "shmat" }, -- 2.44.0
[PATCH v2 3/4] linux-user: Fix shmat(NULL) for h != g
In the h != g && shmaddr == NULL && !reserved_va case, target_shmat() incorrectly mmap()s the initial anonymous range with MAP_FIXED_NOREPLACE, even though the earlier mmap_find_vma() has already reserved the respective address range. Fix by using MAP_FIXED when "mapped", which is set after mmap_find_vma(), is true. Fixes: 78bc8ed9a8f0 ("linux-user: Rewrite target_shmat") Reviewed-by: Richard Henderson Signed-off-by: Ilya Leoshkevich --- linux-user/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linux-user/mmap.c b/linux-user/mmap.c index e88faf1ab3d..681b6db1b67 100644 --- a/linux-user/mmap.c +++ b/linux-user/mmap.c @@ -1358,7 +1358,7 @@ abi_ulong target_shmat(CPUArchState *cpu_env, int shmid, if (h_len != t_len) { int mmap_p = PROT_READ | (shmflg & SHM_RDONLY ? 0 : PROT_WRITE); int mmap_f = MAP_PRIVATE | MAP_ANONYMOUS - | (reserved_va || (shmflg & SHM_REMAP) + | (reserved_va || mapped || (shmflg & SHM_REMAP) ? MAP_FIXED : MAP_FIXED_NOREPLACE); test = mmap(want, m_len, mmap_p, mmap_f, -1, 0); -- 2.44.0
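The fix is a one-token widening of the predicate that chooses between the two fixed-mapping flavors. Isolated from target_shmat(), the patched decision looks like this (a Python sketch mirroring the condition in the diff):

```python
MAP_FIXED = "MAP_FIXED"                      # clobber any existing mapping
MAP_FIXED_NOREPLACE = "MAP_FIXED_NOREPLACE"  # fail with EEXIST if one exists

def fixed_flag(reserved_va, mapped, shm_remap):
    """Pick the mmap() fixed-mapping flag for the anonymous range.

    If the address range is already occupied on purpose -- by the
    guest's reserved_va region, by the reservation mmap_find_vma()
    just made (mapped=True, the case this patch adds), or because the
    caller asked for SHM_REMAP -- then mapping over it is intended and
    MAP_FIXED is required; MAP_FIXED_NOREPLACE would spuriously fail
    on our own reservation.
    """
    if reserved_va or mapped or shm_remap:
        return MAP_FIXED
    return MAP_FIXED_NOREPLACE
```

Before the fix, the `mapped` term was missing, so the shmaddr == NULL path reserved a range with mmap_find_vma() and then refused to map over its own reservation.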
Re: [PATCH v3 7/8] plugins: distinct types for callbacks
On 3/25/24 02:41, Pierrick Bouvier wrote: To prevent errors when writing new types of callbacks or inline operations, we split callbacks data to distinct types. Signed-off-by: Pierrick Bouvier --- include/qemu/plugin.h | 46 ++--- plugins/plugin.h | 2 +- accel/tcg/plugin-gen.c | 58 +--- plugins/core.c | 76 ++ 4 files changed, 98 insertions(+), 84 deletions(-) diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h index bb224b8e4c7..a078229942f 100644 --- a/include/qemu/plugin.h +++ b/include/qemu/plugin.h @@ -73,34 +73,40 @@ enum plugin_dyn_cb_type { PLUGIN_CB_INLINE_STORE_U64, }; +struct qemu_plugin_regular_cb { +union qemu_plugin_cb_sig f; +TCGHelperInfo *info; +void *userp; +enum qemu_plugin_mem_rw rw; +}; + +struct qemu_plugin_inline_cb { +qemu_plugin_u64 entry; +enum qemu_plugin_op op; +uint64_t imm; +enum qemu_plugin_mem_rw rw; +}; Do you still need 'op' anymore here? It seems redundant with 'type'. Otherwise, Reviewed-by: Richard Henderson r~
Re: [PATCH v3 8/8] plugins: extract cpu_index generate
On 3/25/24 02:41, Pierrick Bouvier wrote: Signed-off-by: Pierrick Bouvier --- accel/tcg/plugin-gen.c | 28 +--- 1 file changed, 13 insertions(+), 15 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v3 5/8] plugins: conditional callbacks
On 3/25/24 02:41, Pierrick Bouvier wrote: Extend plugins API to support callback called with a given criteria (evaluated inline). Added functions: - qemu_plugin_register_vcpu_tb_exec_cond_cb - qemu_plugin_register_vcpu_insn_exec_cond_cb They expect as parameter a condition, a qemu_plugin_u64_t (op1) and an immediate (op2). Callback is called if op1|cond| op2 is true. Signed-off-by: Pierrick Bouvier --- include/qemu/plugin.h| 8 include/qemu/qemu-plugin.h | 76 plugins/plugin.h | 8 accel/tcg/plugin-gen.c | 48 +++ plugins/api.c| 39 ++ plugins/core.c | 32 +++ plugins/qemu-plugins.symbols | 2 + 7 files changed, 213 insertions(+) Reviewed-by: Richard Henderson r~
Re: [RFC 1/8] virtio: Define InOrderVQElement
On Mon, Mar 25, 2024 at 6:08 PM Jonah Palmer wrote: > > > > On 3/22/24 5:45 AM, Eugenio Perez Martin wrote: > > On Thu, Mar 21, 2024 at 4:57 PM Jonah Palmer > > wrote: > >> > >> Define the InOrderVQElement structure for the VIRTIO_F_IN_ORDER > >> transport feature implementation. > >> > >> The InOrderVQElement structure is used to encapsulate out-of-order > >> VirtQueueElement data that was processed by the host. This data > >> includes: > >> - The processed VirtQueueElement (elem) > >> - Length of data (len) > >> - VirtQueueElement array index (idx) > >> - Number of processed VirtQueueElements (count) > >> > >> InOrderVQElements will be stored in a buffering mechanism until an > >> order can be achieved. > >> > >> Signed-off-by: Jonah Palmer > >> --- > >> include/hw/virtio/virtio.h | 7 +++ > >> 1 file changed, 7 insertions(+) > >> > >> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h > >> index b3c74a1bca..c8aa435a5e 100644 > >> --- a/include/hw/virtio/virtio.h > >> +++ b/include/hw/virtio/virtio.h > >> @@ -77,6 +77,13 @@ typedef struct VirtQueueElement > >> struct iovec *out_sg; > >> } VirtQueueElement; > >> > >> +typedef struct InOrderVQElement { > >> +const VirtQueueElement *elem; > > > > Some subsystems allocate space for extra elements after > > VirtQueueElement, like VirtIOBlockReq. You can request virtqueue_pop > > to allocate this extra space by its second argument. Would it work for > > this? > > > > I don't see why not. Although this may not be necessary due to me > missing a key aspect mentioned in your comment below. > > >> +unsigned int len; > >> +unsigned int idx; > >> +unsigned int count; > > > > Now I don't get why these fields cannot be obtained from elem->(len, > > index, ndescs) ? > > > > Interesting. I didn't realize that these values are equivalent to a > VirtQueueElement's len, index, and ndescs fields. > > Is this always true? 
Else I would've expected, for example, > virtqueue_push to not need the 'unsigned int len' parameter if this > information is already included via. the VirtQueueElement being passed in. > The code uses "len" to store the written length values of each used descriptor between virtqueue_fill and virtqueue_flush. But not all devices use these separately, only the ones that batches: virtio-net and SVQ. A smarter / less simpler implementation of virtqueue_push could certainly avoid storing elem->len. But the performance gain is probably tiny, and the code complexity grows. > >> +} InOrderVQElement; > >> + > >> #define VIRTIO_QUEUE_MAX 1024 > >> > >> #define VIRTIO_NO_VECTOR 0x > >> -- > >> 2.39.3 > >> > > >
Re: [PATCH v3 4/8] tests/plugin/inline: add test for STORE_U64 inline op
On 3/25/24 02:41, Pierrick Bouvier wrote: Signed-off-by: Pierrick Bouvier --- tests/plugin/inline.c | 41 + 1 file changed, 37 insertions(+), 4 deletions(-) Reviewed-by: Richard Henderson r~
Re: [PATCH v3 3/8] plugins: add new inline op STORE_U64
On 3/25/24 02:41, Pierrick Bouvier wrote: @@ -152,6 +152,18 @@ static void gen_inline_add_u64_cb(struct qemu_plugin_dyn_cb *cb) tcg_temp_free_ptr(ptr); } +static void gen_inline_store_u64_cb(struct qemu_plugin_dyn_cb *cb) +{ +TCGv_ptr ptr = gen_plugin_u64_ptr(cb->inline_insn.entry); +TCGv_i64 val = tcg_temp_ebb_new_i64(); + +tcg_gen_movi_i64(val, cb->inline_insn.imm); TCGv_i64 val = tcg_constant_i64(cb->inline_insn.imm); With that, Reviewed-by: Richard Henderson r~
Re: [PATCH v3 2/8] plugins: extract generate ptr for qemu_plugin_u64
On 3/25/24 02:41, Pierrick Bouvier wrote: Signed-off-by: Pierrick Bouvier --- accel/tcg/plugin-gen.c | 27 ++- 1 file changed, 18 insertions(+), 9 deletions(-) Reviewed-by: Richard Henderson r~
[PATCH v6 00/12] Enabling DCD emulation support in Qemu
From: Fan Ni A git tree of his series can be found here (with one extra commit on top for printing out accepted/pending extent list): https://github.com/moking/qemu/tree/dcd-v6 v5->v6: 1. Picked up tags; 2. Renamed start_region_id to start_rid; (Jonathan) 3. For get extent list mailbox command, add logic to adjust returned extent count based on output payload size; (Jonathan) 4. Use Range to detect extent comparison and overlaps; (Jonathan) 5. Renamed extents_pending_to_add to extents_pending; (Jonathan) 6. Updated the commit log of the qmp interface patch by renaming "dpa" to offset to align with the code. (Gregory) 7. For DC extent add response and release mailbox command, we use a 2 pass approach. The first pass is to detect any potential errors, and the second pass to update the in-device data structure; 8. For QMP interface for add/release DC extents, use 2 pass approach with the first pass detecting any faulty input and second pass filling the event log. Note, based on sswg discussion, we disallow release extents which has DPA range not accepted by the host yet; 9. We enforce the in-order process of the pending list for DC extent release mailbox command, and the head of pending list is handled accordingly. 10. The last patch from v5 has been removed from this series. Note: we do not drop the DC changes in build_dvsecs which was suggested by Jonathan, the reason is that during testing, we found in the current kernel code, when checking whether the media is ready (in cxl_await_media_ready), the devsec range registers are checked, for dcd device, if we leave dvsec range registers unset, the device cannot be put into "ready" state, which will cause the device inactive. The related code is below, https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git/tree/drivers/cxl/core/pci.c?h=fixes&id=d206a76d7d2726f3b096037f2079ce0bd3ba329b#n195 Compared to v5[1], PATCH 8-9 and PATCH 11-12 are almost re-coded, so need more care when review. 
The code is tested with similar setup and has passed similar tests as listed in the cover letter of v5. Also, the code passes similar tests with the latest DCD kernel patchset[2]. [1] Qemu DCD patches v5: https://lore.kernel.org/linux-cxl/20240304194331.1586191-1-nifan@gmail.com/T/#t [2] DCD kernel patches: https://lore.kernel.org/linux-cxl/20240324-dcd-type2-upstream-v1-0-b7b00d623...@intel.com/T/#m11c571e21c4fe17c7d04ec5c2c7bc7cbf2cd07e3 Fan Ni (12): hw/cxl/cxl-mailbox-utils: Add dc_event_log_size field to output payload of identify memory device command hw/cxl/cxl-mailbox-utils: Add dynamic capacity region representative and mailbox command support include/hw/cxl/cxl_device: Rename mem_size as static_mem_size for type3 memory devices hw/mem/cxl_type3: Add support to create DC regions to type3 memory devices hw/mem/cxl-type3: Refactor ct3_build_cdat_entries_for_mr to take mr size instead of mr as argument hw/mem/cxl_type3: Add host backend and address space handling for DC regions hw/mem/cxl_type3: Add DC extent list representative and get DC extent list mailbox support hw/cxl/cxl-mailbox-utils: Add mailbox commands to support add/release dynamic capacity response hw/cxl/events: Add qmp interfaces to add/release dynamic capacity extents hw/mem/cxl_type3: Add dpa range validation for accesses to DC regions hw/cxl/cxl-mailbox-utils: Add superset extent release mailbox support hw/mem/cxl_type3: Allow to release extent superset in QMP interface hw/cxl/cxl-mailbox-utils.c | 644 +++- hw/mem/cxl_type3.c | 580 +--- hw/mem/cxl_type3_stubs.c| 14 + include/hw/cxl/cxl_device.h | 67 +++- include/hw/cxl/cxl_events.h | 18 + qapi/cxl.json | 61 +++- 6 files changed, 1334 insertions(+), 50 deletions(-) -- 2.43.0
[PATCH v6 07/12] hw/mem/cxl_type3: Add DC extent list representative and get DC extent list mailbox support
From: Fan Ni Add dynamic capacity extent list representative to the definition of CXLType3Dev and implement get DC extent list mailbox command per CXL.spec.3.1:.8.2.9.9.9.2. Signed-off-by: Fan Ni --- hw/cxl/cxl-mailbox-utils.c | 75 - hw/mem/cxl_type3.c | 1 + include/hw/cxl/cxl_device.h | 22 +++ 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 831cef0567..30ef46a036 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -84,6 +84,7 @@ enum { #define CLEAR_POISON 0x2 DCD_CONFIG = 0x48, #define GET_DC_CONFIG 0x0 +#define GET_DYN_CAP_EXT_LIST 0x1 PHYSICAL_SWITCH = 0x51, #define IDENTIFY_SWITCH_DEVICE 0x0 #define GET_PHYSICAL_PORT_STATE 0x1 @@ -1322,7 +1323,8 @@ static CXLRetCode cmd_dcd_get_dyn_cap_config(const struct cxl_cmd *cmd, * to use. */ stl_le_p(&extra_out->num_extents_supported, CXL_NUM_EXTENTS_SUPPORTED); -stl_le_p(&extra_out->num_extents_available, CXL_NUM_EXTENTS_SUPPORTED); +stl_le_p(&extra_out->num_extents_available, CXL_NUM_EXTENTS_SUPPORTED - + ct3d->dc.total_extent_count); stl_le_p(&extra_out->num_tags_supported, CXL_NUM_TAGS_SUPPORTED); stl_le_p(&extra_out->num_tags_available, CXL_NUM_TAGS_SUPPORTED); @@ -1330,6 +1332,74 @@ static CXLRetCode cmd_dcd_get_dyn_cap_config(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +/* + * CXL r3.1 section 8.2.9.9.9.2: + * Get Dynamic Capacity Extent List (Opcode 4801h) + */ +static CXLRetCode cmd_dcd_get_dyn_cap_ext_list(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ +CXLType3Dev *ct3d = CXL_TYPE3(cci->d); +struct { +uint32_t extent_cnt; +uint32_t start_extent_id; +} QEMU_PACKED *in = (void *)payload_in; +struct { +uint32_t count; +uint32_t total_extents; +uint32_t generation_num; +uint8_t rsvd[4]; +CXLDCExtentRaw records[]; +} QEMU_PACKED *out = (void *)payload_out; +uint32_t start_extent_id = in->start_extent_id; +CXLDCExtentList 
*extent_list = &ct3d->dc.extents; +uint16_t record_count = 0, i = 0, record_done = 0; +uint16_t out_pl_len, size; +CXLDCExtent *ent; + +if (start_extent_id > ct3d->dc.total_extent_count) { +return CXL_MBOX_INVALID_INPUT; +} + +record_count = MIN(in->extent_cnt, + ct3d->dc.total_extent_count - start_extent_id); +size = CXL_MAILBOX_MAX_PAYLOAD_SIZE - sizeof(*out); +if (size / sizeof(out->records[0]) < record_count) { +record_count = size / sizeof(out->records[0]); +} +out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]); + +stl_le_p(&out->count, record_count); +stl_le_p(&out->total_extents, ct3d->dc.total_extent_count); +stl_le_p(&out->generation_num, ct3d->dc.ext_list_gen_seq); + +if (record_count > 0) { +CXLDCExtentRaw *out_rec = &out->records[record_done]; + +QTAILQ_FOREACH(ent, extent_list, node) { +if (i++ < start_extent_id) { +continue; +} +stq_le_p(&out_rec->start_dpa, ent->start_dpa); +stq_le_p(&out_rec->len, ent->len); +memcpy(&out_rec->tag, ent->tag, 0x10); +stw_le_p(&out_rec->shared_seq, ent->shared_seq); + +record_done++; +if (record_done == record_count) { +break; +} +} +} + +*len_out = out_pl_len; +return CXL_MBOX_SUCCESS; +} + #define IMMEDIATE_CONFIG_CHANGE (1 << 1) #define IMMEDIATE_DATA_CHANGE (1 << 2) #define IMMEDIATE_POLICY_CHANGE (1 << 3) @@ -1377,6 +1447,9 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { static const struct cxl_cmd cxl_cmd_set_dcd[256][256] = { [DCD_CONFIG][GET_DC_CONFIG] = { "DCD_GET_DC_CONFIG", cmd_dcd_get_dyn_cap_config, 2, 0 }, +[DCD_CONFIG][GET_DYN_CAP_EXT_LIST] = { +"DCD_GET_DYNAMIC_CAPACITY_EXTENT_LIST", cmd_dcd_get_dyn_cap_ext_list, +8, 0 }, }; static const struct cxl_cmd cxl_cmd_set_sw[256][256] = { diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 75ea9b20e1..5be3c904ba 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -673,6 +673,7 @@ static bool cxl_create_dc_regions(CXLType3Dev *ct3d, Error **errp) }; ct3d->dc.total_capacity += region->len; } +QTAILQ_INIT(&ct3d->dc.extents); 
return true; } diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index c2c3df0d2a..6aec6ac983 100644 --- a/include
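The clamping of the returned record count in `cmd_dcd_get_dyn_cap_ext_list` above can be sketched in isolation. This is an illustrative model, not QEMU's code: the payload-size and record-size constants are assumed values (`CXLDCExtentRaw` is 8 + 8 + 16 + 2 + reserved bytes in the patch, taken here as 40), and the helper name is invented. The count is limited both by the extents remaining after `start_extent_id` and by the space left in the mailbox payload after the fixed header:

```c
#include <assert.h>
#include <stdint.h>

#define MAILBOX_MAX_PAYLOAD 2048u /* assumed; the real value is device-dependent */
#define EXT_LIST_HDR_SIZE   16u   /* count + total_extents + generation_num + rsvd[4] */
#define EXT_RECORD_SIZE     40u   /* start_dpa + len + tag[0x10] + shared_seq + rsvd */

/* Clamp the number of extent records one Get Extent List call may return. */
static uint32_t ext_list_record_count(uint32_t requested,
                                      uint32_t total_extents,
                                      uint32_t start_extent_id)
{
    uint32_t remaining = total_extents - start_extent_id;
    uint32_t count = requested < remaining ? requested : remaining;
    uint32_t fit = (MAILBOX_MAX_PAYLOAD - EXT_LIST_HDR_SIZE) / EXT_RECORD_SIZE;

    return count < fit ? count : fit;
}
```

The host is expected to iterate, advancing `start_extent_id` by the returned count, until it has read `total_extents` records with a stable generation number.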
[PATCH v6 02/12] hw/cxl/cxl-mailbox-utils: Add dynamic capacity region representative and mailbox command support
From: Fan Ni Per cxl spec r3.1, add dynamic capacity region representative based on Table 8-165 and extend the cxl type3 device definition to include dc region information. Also, based on info in 8.2.9.9.9.1, add 'Get Dynamic Capacity Configuration' mailbox support. Note: we store region decode length as byte-wise length on the device, which should be divided by 256 * MiB before being returned to the host for "Get Dynamic Capacity Configuration" mailbox command per specification. Reviewed-by: Jonathan Cameron Signed-off-by: Fan Ni --- hw/cxl/cxl-mailbox-utils.c | 96 + include/hw/cxl/cxl_device.h | 16 +++ 2 files changed, 112 insertions(+) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index ba1d9901df..49c7944d93 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -22,6 +22,8 @@ #define CXL_CAPACITY_MULTIPLIER (256 * MiB) #define CXL_DC_EVENT_LOG_SIZE 8 +#define CXL_NUM_EXTENTS_SUPPORTED 512 +#define CXL_NUM_TAGS_SUPPORTED 0 /* * How to add a new command, example. The command set FOO, with cmd BAR. 
@@ -80,6 +82,8 @@ enum { #define GET_POISON_LIST0x0 #define INJECT_POISON 0x1 #define CLEAR_POISON 0x2 +DCD_CONFIG = 0x48, +#define GET_DC_CONFIG 0x0 PHYSICAL_SWITCH = 0x51, #define IDENTIFY_SWITCH_DEVICE 0x0 #define GET_PHYSICAL_PORT_STATE 0x1 @@ -1238,6 +1242,88 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +/* + * CXL r3.1 section 8.2.9.9.9.1: Get Dynamic Capacity Configuration + * (Opcode: 4800h) + */ +static CXLRetCode cmd_dcd_get_dyn_cap_config(const struct cxl_cmd *cmd, + uint8_t *payload_in, + size_t len_in, + uint8_t *payload_out, + size_t *len_out, + CXLCCI *cci) +{ +CXLType3Dev *ct3d = CXL_TYPE3(cci->d); +struct { +uint8_t region_cnt; +uint8_t start_rid; +} QEMU_PACKED *in = (void *)payload_in; +struct { +uint8_t num_regions; +uint8_t regions_returned; +uint8_t rsvd1[6]; +struct { +uint64_t base; +uint64_t decode_len; +uint64_t region_len; +uint64_t block_size; +uint32_t dsmadhandle; +uint8_t flags; +uint8_t rsvd2[3]; +} QEMU_PACKED records[]; +} QEMU_PACKED *out = (void *)payload_out; +struct { +uint32_t num_extents_supported; +uint32_t num_extents_available; +uint32_t num_tags_supported; +uint32_t num_tags_available; +} QEMU_PACKED *extra_out; +uint16_t record_count; +uint16_t i; +uint16_t out_pl_len; +uint8_t start_rid; + +start_rid = in->start_rid; +if (start_rid >= ct3d->dc.num_regions) { +return CXL_MBOX_INVALID_INPUT; +} + +record_count = MIN(ct3d->dc.num_regions - in->start_rid, in->region_cnt); + +out_pl_len = sizeof(*out) + record_count * sizeof(out->records[0]); +extra_out = (void *)(payload_out + out_pl_len); +out_pl_len += sizeof(*extra_out); +assert(out_pl_len <= CXL_MAILBOX_MAX_PAYLOAD_SIZE); + +out->num_regions = ct3d->dc.num_regions; +out->regions_returned = record_count; +for (i = 0; i < record_count; i++) { +stq_le_p(&out->records[i].base, + ct3d->dc.regions[start_rid + i].base); +stq_le_p(&out->records[i].decode_len, + ct3d->dc.regions[start_rid + i].decode_len / + 
CXL_CAPACITY_MULTIPLIER); +stq_le_p(&out->records[i].region_len, + ct3d->dc.regions[start_rid + i].len); +stq_le_p(&out->records[i].block_size, + ct3d->dc.regions[start_rid + i].block_size); +stl_le_p(&out->records[i].dsmadhandle, + ct3d->dc.regions[start_rid + i].dsmadhandle); +out->records[i].flags = ct3d->dc.regions[start_rid + i].flags; +} +/* + * TODO: Assign values once extents and tags are introduced + * to use. + */ +stl_le_p(&extra_out->num_extents_supported, CXL_NUM_EXTENTS_SUPPORTED); +stl_le_p(&extra_out->num_extents_available, CXL_NUM_EXTENTS_SUPPORTED); +stl_le_p(&extra_out->num_tags_supported, CXL_NUM_TAGS_SUPPORTED); +stl_le_p(&extra_out->num_tags_available, CXL_NUM_TAGS_SUPPORTED); + +*len_out = out_pl_len; +return CXL_MBOX_SUCCESS; +} + #define IMMEDIATE_CONFIG_CHANGE (1 << 1) #define IMMEDIATE_DATA_CHANGE (1 << 2) #define IMMEDIATE_POLICY_CHANGE (1 << 3) @@ -1282,6 +1368,11 @@ static const struct cxl_cmd cxl_cmd_set[256][256] = { cmd_media_clear_poison, 72, 0 }, }; +static const struct cxl_cmd cxl_cmd_set_dcd[256][256] = { +[DCD_CONFIG][GET_DC_CONFI
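The note in the commit message about the region decode length deserves a concrete check: the device stores `decode_len` byte-wise, but the "Get Dynamic Capacity Configuration" output reports it in units of 256 MiB. A minimal sketch of that conversion (helper name is illustrative):

```c
#include <assert.h>
#include <stdint.h>

#define MiB (1024ull * 1024ull)
#define CXL_CAPACITY_MULTIPLIER (256 * MiB)

/* decode_len is stored in bytes on the device; the mailbox payload
 * reports it in 256 MiB multiples, so divide before returning. */
static uint64_t decode_len_to_mbox_units(uint64_t decode_len_bytes)
{
    return decode_len_bytes / CXL_CAPACITY_MULTIPLIER;
}
```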
[PATCH v6 10/12] hw/mem/cxl_type3: Add dpa range validation for accesses to DC regions
From: Fan Ni All dpa ranges in the DC regions are invalid to access until an extent covering the range has been added. Add a bitmap for each region to record whether a DC block in the region has been backed by DC extent. For the bitmap, a bit in the bitmap represents a DC block. When a DC extent is added, all the bits of the blocks in the extent will be set, which will be cleared when the extent is released. Reviewed-by: Jonathan Cameron Signed-off-by: Fan Ni --- hw/cxl/cxl-mailbox-utils.c | 6 +++ hw/mem/cxl_type3.c | 76 + include/hw/cxl/cxl_device.h | 7 3 files changed, 89 insertions(+) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 7094e007b9..a0d2239176 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -1620,6 +1620,7 @@ static CXLRetCode cmd_dcd_add_dyn_cap_rsp(const struct cxl_cmd *cmd, cxl_insert_extent_to_extent_list(extent_list, dpa, len, NULL, 0); ct3d->dc.total_extent_count += 1; +ct3_set_region_block_backed(ct3d, dpa, len); ent = QTAILQ_FIRST(&ct3d->dc.extents_pending); cxl_remove_extent_from_extent_list(&ct3d->dc.extents_pending, ent); @@ -1798,18 +1799,23 @@ static CXLRetCode cmd_dcd_release_dyn_cap(const struct cxl_cmd *cmd, cxl_remove_extent_from_extent_list(extent_list, ent); ct3d->dc.total_extent_count -= 1; +ct3_clear_region_block_backed(ct3d, ent_start_dpa, + ent_len); if (len1) { cxl_insert_extent_to_extent_list(extent_list, ent_start_dpa, len1, NULL, 0); ct3d->dc.total_extent_count += 1; +ct3_set_region_block_backed(ct3d, ent_start_dpa, +len1); } if (len2) { cxl_insert_extent_to_extent_list(extent_list, dpa + len, len2, NULL, 0); ct3d->dc.total_extent_count += 1; +ct3_set_region_block_backed(ct3d, dpa + len, len2); } len -= len_done; diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 74cb64e843..2628a6f50f 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -672,6 +672,7 @@ static bool cxl_create_dc_regions(CXLType3Dev *ct3d, Error **errp) .flags = 0, }; 
ct3d->dc.total_capacity += region->len; +region->blk_bitmap = bitmap_new(region->len / region->block_size); } QTAILQ_INIT(&ct3d->dc.extents); QTAILQ_INIT(&ct3d->dc.extents_pending); @@ -682,6 +683,8 @@ static bool cxl_create_dc_regions(CXLType3Dev *ct3d, Error **errp) static void cxl_destroy_dc_regions(CXLType3Dev *ct3d) { CXLDCExtent *ent, *ent_next; +int i; +CXLDCRegion *region; QTAILQ_FOREACH_SAFE(ent, &ct3d->dc.extents, node, ent_next) { cxl_remove_extent_from_extent_list(&ct3d->dc.extents, ent); @@ -690,6 +693,11 @@ static void cxl_destroy_dc_regions(CXLType3Dev *ct3d) cxl_remove_extent_from_extent_list(&ct3d->dc.extents_pending, ent); } + +for (i = 0; i < ct3d->dc.num_regions; i++) { +region = &ct3d->dc.regions[i]; +g_free(region->blk_bitmap); +} } static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) @@ -921,6 +929,70 @@ static void ct3_exit(PCIDevice *pci_dev) } } +/* + * Mark the DPA range [dpa, dap + len - 1] to be backed and accessible. This + * happens when a DC extent is added and accepted by the host. + */ +void ct3_set_region_block_backed(CXLType3Dev *ct3d, uint64_t dpa, + uint64_t len) +{ +CXLDCRegion *region; + +region = cxl_find_dc_region(ct3d, dpa, len); +if (!region) { +return; +} + +bitmap_set(region->blk_bitmap, (dpa - region->base) / region->block_size, + len / region->block_size); +} + +/* + * Check whether the DPA range [dpa, dpa + len - 1] is backed with DC extents. + * Used when validating read/write to dc regions + */ +bool ct3_test_region_block_backed(CXLType3Dev *ct3d, uint64_t dpa, + uint64_t len) +{ +CXLDCRegion *region; +uint64_t nbits; +long nr; + +region = cxl_find_dc_region(ct3d, dpa, len); +if (!region) { +return false; +} + +nr = (dpa - region->base) / region->block_size; +nbits = DIV_ROUND_UP(len, region->block_size); +/* + * if bits between [dpa, dpa + len) are all 1s, meaning the DPA range is + * backed with DC extents, return true;
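The DPA-to-block-bit mapping used by `ct3_set_region_block_backed` and `ct3_test_region_block_backed` can be modeled without QEMU's bitmap API. This sketch assumes a hypothetical single region (2 GiB, 2 MiB blocks, base 0) and uses one byte per block for clarity in place of `bitmap_set`/`find_next_bit`:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical region layout: base 0, 2 GiB capacity, 2 MiB block size. */
#define REGION_BASE 0ull
#define BLOCK_SIZE  (2ull * 1024 * 1024)
#define NUM_BLOCKS  1024u

static uint8_t blk_bitmap[NUM_BLOCKS]; /* one byte per block, for clarity */

/* Mark [dpa, dpa + len) backed, as when a DC extent is accepted. */
static void region_set_backed(uint64_t dpa, uint64_t len)
{
    uint64_t first = (dpa - REGION_BASE) / BLOCK_SIZE;
    uint64_t nblks = len / BLOCK_SIZE; /* extents are block-aligned */

    for (uint64_t i = 0; i < nblks; i++) {
        blk_bitmap[first + i] = 1;
    }
}

/* Accesses are valid only if every covered block is backed. */
static bool region_test_backed(uint64_t dpa, uint64_t len)
{
    uint64_t first = (dpa - REGION_BASE) / BLOCK_SIZE;
    uint64_t nblks = (len + BLOCK_SIZE - 1) / BLOCK_SIZE; /* DIV_ROUND_UP */

    for (uint64_t i = 0; i < nblks; i++) {
        if (!blk_bitmap[first + i]) {
            return false;
        }
    }
    return true;
}
```

The real code does the same arithmetic but over a packed `unsigned long` bitmap per region, so set/test cost is proportional to words, not blocks.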
[PATCH v6 12/12] hw/mem/cxl_type3: Allow to release extent superset in QMP interface
From: Fan Ni

Before this change, the QMP interface used for adding/releasing DC
extents only allowed releasing an extent whose DPA range was contained
by a single accepted extent in the device. With this change, we relax
that constraint: as long as the DPA range of the extent is covered by
accepted extents, the release is allowed.

Signed-off-by: Fan Ni
---
 hw/mem/cxl_type3.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c
index 2628a6f50f..62c2022477 100644
--- a/hw/mem/cxl_type3.c
+++ b/hw/mem/cxl_type3.c
@@ -1935,8 +1935,7 @@ static void qmp_cxl_process_dynamic_capacity(const char *path, CxlEventLog log,
                        "cannot release extent with pending DPA range");
             return;
         }
-        if (!cxl_extents_contains_dpa_range(&dcd->dc.extents,
-                                            dpa, len)) {
+        if (!ct3_test_region_block_backed(dcd, dpa, len)) {
             error_setg(errp,
                        "cannot release extent with non-existing DPA range");
             return;
-- 
2.43.0
[PATCH v6 11/12] hw/cxl/cxl-mailbox-utils: Add superset extent release mailbox support
From: Fan Ni With the change, we extend the extent release mailbox command processing to allow more flexible release. As long as the DPA range of the extent to release is covered by accepted extent(s) in the device, the release can be performed. Signed-off-by: Fan Ni --- hw/cxl/cxl-mailbox-utils.c | 41 ++ 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index a0d2239176..3b7949c364 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -1674,6 +1674,12 @@ static CXLRetCode cxl_dc_extent_release_dry_run(CXLType3Dev *ct3d, dpa = in->updated_entries[i].start_dpa; len = in->updated_entries[i].len; +/* Check if the DPA range is not fully backed with valid extents */ +if (!ct3_test_region_block_backed(ct3d, dpa, len)) { +ret = CXL_MBOX_INVALID_PA; +goto free_and_exit; +} +/* After this point, extent overflow is the only error can happen */ while (len > 0) { QTAILQ_FOREACH(ent, &tmp_list, node) { range_init_nofail(&range, ent->start_dpa, ent->len); @@ -1713,25 +1719,27 @@ static CXLRetCode cxl_dc_extent_release_dry_run(CXLType3Dev *ct3d, goto free_and_exit; } } else { -/* - * TODO: we reject the attempt to remove an extent - * that overlaps with multiple extents in the device - * for now, we will allow it once superset release - * support is added. 
- */ -ret = CXL_MBOX_INVALID_PA; -goto free_and_exit; +len1 = dpa - ent_start_dpa; +len2 = 0; +len_done = ent_len - len1 - len2; + +cxl_remove_extent_from_extent_list(&tmp_list, ent); +cnt_delta--; +if (len1) { +cxl_insert_extent_to_extent_list(&tmp_list, + ent_start_dpa, + len1, NULL, 0); +cnt_delta++; +} } len -= len_done; -/* len == 0 here until superset release is added */ +if (len) { +dpa = ent_start_dpa + ent_len; +} break; } } -if (len) { -ret = CXL_MBOX_INVALID_PA; -goto free_and_exit; -} } } free_and_exit: @@ -1819,10 +1827,9 @@ static CXLRetCode cmd_dcd_release_dyn_cap(const struct cxl_cmd *cmd, } len -= len_done; -/* - * len will always be 0 until superset release is add. - * TODO: superset release will be added. - */ +if (len > 0) { +dpa = ent_start_dpa + ent_len; +} break; } } -- 2.43.0
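The `len1`/`len2`/`len_done` arithmetic that appears in both the dry run and the commit pass above is the heart of superset release: releasing `[dpa, dpa + len)` out of an accepted extent `[ent_start, ent_start + ent_len)` may leave a leading remainder (`len1`), a trailing remainder (`len2`), or neither when the release range runs past the extent's end. A standalone sketch (names are illustrative, not QEMU's):

```c
#include <assert.h>
#include <stdint.h>

/* Compute the split of one accepted extent hit by a release request.
 * len_done is how much of the request this extent satisfies; the caller
 * continues with len -= len_done and dpa advanced past the extent. */
static void split_extent(uint64_t ent_start, uint64_t ent_len,
                         uint64_t dpa, uint64_t len,
                         uint64_t *len1, uint64_t *len2, uint64_t *len_done)
{
    uint64_t rel_end = dpa + len;
    uint64_t ent_end = ent_start + ent_len;

    *len1 = dpa > ent_start ? dpa - ent_start : 0;      /* keep head */
    *len2 = rel_end < ent_end ? ent_end - rel_end : 0;  /* keep tail */
    *len_done = ent_len - *len1 - *len2;
}
```

In the superset case (`len2 == 0` and `len` still nonzero afterwards), the loop in the patch moves `dpa` to `ent_start_dpa + ent_len` and keeps consuming the next overlapping extent.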
[PATCH v6 03/12] include/hw/cxl/cxl_device: Rename mem_size as static_mem_size for type3 memory devices
From: Fan Ni Rename mem_size as static_mem_size for type3 memdev to cover static RAM and pmem capacity, preparing for the introduction of dynamic capacity to support dynamic capacity devices. Reviewed-by: Jonathan Cameron Signed-off-by: Fan Ni --- hw/cxl/cxl-mailbox-utils.c | 4 ++-- hw/mem/cxl_type3.c | 8 include/hw/cxl/cxl_device.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 49c7944d93..0f2ad58a14 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -803,7 +803,7 @@ static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd, snprintf(id->fw_revision, 0x10, "BWFW VERSION %02d", 0); stq_le_p(&id->total_capacity, - cxl_dstate->mem_size / CXL_CAPACITY_MULTIPLIER); + cxl_dstate->static_mem_size / CXL_CAPACITY_MULTIPLIER); stq_le_p(&id->persistent_capacity, cxl_dstate->pmem_size / CXL_CAPACITY_MULTIPLIER); stq_le_p(&id->volatile_capacity, @@ -1179,7 +1179,7 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, struct clear_poison_pl *in = (void *)payload_in; dpa = ldq_le_p(&in->dpa); -if (dpa + CXL_CACHE_LINE_SIZE > cxl_dstate->mem_size) { +if (dpa + CXL_CACHE_LINE_SIZE > cxl_dstate->static_mem_size) { return CXL_MBOX_INVALID_PA; } diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index b679dfae1c..0836e9135b 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -608,7 +608,7 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) } address_space_init(&ct3d->hostvmem_as, vmr, v_name); ct3d->cxl_dstate.vmem_size = memory_region_size(vmr); -ct3d->cxl_dstate.mem_size += memory_region_size(vmr); +ct3d->cxl_dstate.static_mem_size += memory_region_size(vmr); g_free(v_name); } @@ -631,7 +631,7 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) } address_space_init(&ct3d->hostpmem_as, pmr, p_name); ct3d->cxl_dstate.pmem_size = memory_region_size(pmr); -ct3d->cxl_dstate.mem_size += memory_region_size(pmr); 
+ct3d->cxl_dstate.static_mem_size += memory_region_size(pmr); g_free(p_name); } @@ -837,7 +837,7 @@ static int cxl_type3_hpa_to_as_and_dpa(CXLType3Dev *ct3d, return -EINVAL; } -if (*dpa_offset > ct3d->cxl_dstate.mem_size) { +if (*dpa_offset > ct3d->cxl_dstate.static_mem_size) { return -EINVAL; } @@ -1010,7 +1010,7 @@ static bool set_cacheline(CXLType3Dev *ct3d, uint64_t dpa_offset, uint8_t *data) return false; } -if (dpa_offset + CXL_CACHE_LINE_SIZE > ct3d->cxl_dstate.mem_size) { +if (dpa_offset + CXL_CACHE_LINE_SIZE > ct3d->cxl_dstate.static_mem_size) { return false; } diff --git a/include/hw/cxl/cxl_device.h b/include/hw/cxl/cxl_device.h index e839370266..f7f56b44e3 100644 --- a/include/hw/cxl/cxl_device.h +++ b/include/hw/cxl/cxl_device.h @@ -234,7 +234,7 @@ typedef struct cxl_device_state { } timestamp; /* memory region size, HDM */ -uint64_t mem_size; +uint64_t static_mem_size; uint64_t pmem_size; uint64_t vmem_size; -- 2.43.0
[PATCH v6 04/12] hw/mem/cxl_type3: Add support to create DC regions to type3 memory devices
From: Fan Ni With the change, when setting up memory for type3 memory device, we can create DC regions. A property 'num-dc-regions' is added to ct3_props to allow users to pass the number of DC regions to create. To make it easier, other region parameters like region base, length, and block size are hard coded. If needed, these parameters can be added easily. With the change, we can create DC regions with proper kernel side support like below: region=$(cat /sys/bus/cxl/devices/decoder0.0/create_dc_region) echo $region > /sys/bus/cxl/devices/decoder0.0/create_dc_region echo 256 > /sys/bus/cxl/devices/$region/interleave_granularity echo 1 > /sys/bus/cxl/devices/$region/interleave_ways echo "dc0" >/sys/bus/cxl/devices/decoder2.0/mode echo 0x4000 >/sys/bus/cxl/devices/decoder2.0/dpa_size echo 0x4000 > /sys/bus/cxl/devices/$region/size echo "decoder2.0" > /sys/bus/cxl/devices/$region/target0 echo 1 > /sys/bus/cxl/devices/$region/commit echo $region > /sys/bus/cxl/drivers/cxl_region/bind Reviewed-by: Jonathan Cameron Signed-off-by: Fan Ni --- hw/mem/cxl_type3.c | 47 ++ 1 file changed, 47 insertions(+) diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 0836e9135b..c83d6f8004 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -30,6 +30,7 @@ #include "hw/pci/msix.h" #define DWORD_BYTE 4 +#define CXL_CAPACITY_MULTIPLIER (256 * MiB) /* Default CDAT entries for a memory region */ enum { @@ -567,6 +568,46 @@ static void ct3d_reg_write(void *opaque, hwaddr offset, uint64_t value, } } +/* + * TODO: dc region configuration will be updated once host backend and address + * space support is added for DCD. 
+ */ +static bool cxl_create_dc_regions(CXLType3Dev *ct3d, Error **errp) +{ +int i; +uint64_t region_base = 0; +uint64_t region_len = 2 * GiB; +uint64_t decode_len = 2 * GiB; +uint64_t blk_size = 2 * MiB; +CXLDCRegion *region; +MemoryRegion *mr; + +if (ct3d->hostvmem) { +mr = host_memory_backend_get_memory(ct3d->hostvmem); +region_base += memory_region_size(mr); +} +if (ct3d->hostpmem) { +mr = host_memory_backend_get_memory(ct3d->hostpmem); +region_base += memory_region_size(mr); +} +assert(region_base % CXL_CAPACITY_MULTIPLIER == 0); + +for (i = 0, region = &ct3d->dc.regions[0]; + i < ct3d->dc.num_regions; + i++, region++, region_base += region_len) { +*region = (CXLDCRegion) { +.base = region_base, +.decode_len = decode_len, +.len = region_len, +.block_size = blk_size, +/* dsmad_handle set when creating CDAT table entries */ +.flags = 0, +}; +} + +return true; +} + static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) { DeviceState *ds = DEVICE(ct3d); @@ -635,6 +676,11 @@ static bool cxl_setup_memory(CXLType3Dev *ct3d, Error **errp) g_free(p_name); } +if (!cxl_create_dc_regions(ct3d, errp)) { +error_setg(errp, "setup DC regions failed"); +return false; +} + return true; } @@ -930,6 +976,7 @@ static Property ct3_props[] = { HostMemoryBackend *), DEFINE_PROP_UINT64("sn", CXLType3Dev, sn, UI64_NULL), DEFINE_PROP_STRING("cdat", CXLType3Dev, cxl_cstate.cdat.filename), +DEFINE_PROP_UINT8("num-dc-regions", CXLType3Dev, dc.num_regions, 0), DEFINE_PROP_END_OF_LIST(), }; -- 2.43.0
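The hard-coded layout created by `cxl_create_dc_regions` above places the DC regions back-to-back immediately after the static (volatile plus persistent) capacity. A minimal sketch of that base computation, with an invented helper name:

```c
#include <assert.h>
#include <stdint.h>

/* DC region i starts after the static capacity plus all preceding
 * regions; in the patch every region has the same fixed length. */
static uint64_t dc_region_base(uint64_t static_capacity,
                               uint64_t region_len, unsigned int rid)
{
    return static_capacity + (uint64_t)rid * region_len;
}
```

The `assert(region_base % CXL_CAPACITY_MULTIPLIER == 0)` in the patch then only requires the static capacity (and the 2 GiB region length) to be 256 MiB aligned.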
[PATCH v6 06/12] hw/mem/cxl_type3: Add host backend and address space handling for DC regions
From: Fan Ni Add (file/memory backed) host backend, all the dynamic capacity regions will share a single, large enough host backend. Set up address space for DC regions to support read/write operations to dynamic capacity for DCD. With the change, following supports are added: 1. Add a new property to type3 device "volatile-dc-memdev" to point to host memory backend for dynamic capacity. Currently, all dc regions share one host backend. 2. Add namespace for dynamic capacity for read/write support; 3. Create cdat entries for each dynamic capacity region; Signed-off-by: Fan Ni --- hw/cxl/cxl-mailbox-utils.c | 16 ++- hw/mem/cxl_type3.c | 187 +--- include/hw/cxl/cxl_device.h | 8 ++ 3 files changed, 172 insertions(+), 39 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 0f2ad58a14..831cef0567 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -622,7 +622,8 @@ static CXLRetCode cmd_firmware_update_get_info(const struct cxl_cmd *cmd, size_t *len_out, CXLCCI *cci) { -CXLDeviceState *cxl_dstate = &CXL_TYPE3(cci->d)->cxl_dstate; +CXLType3Dev *ct3d = CXL_TYPE3(cci->d); +CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate; struct { uint8_t slots_supported; uint8_t slot_info; @@ -636,7 +637,8 @@ static CXLRetCode cmd_firmware_update_get_info(const struct cxl_cmd *cmd, QEMU_BUILD_BUG_ON(sizeof(*fw_info) != 0x50); if ((cxl_dstate->vmem_size < CXL_CAPACITY_MULTIPLIER) || -(cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER)) { +(cxl_dstate->pmem_size < CXL_CAPACITY_MULTIPLIER) || +(ct3d->dc.total_capacity < CXL_CAPACITY_MULTIPLIER)) { return CXL_MBOX_INTERNAL_ERROR; } @@ -793,7 +795,8 @@ static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd, CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate; if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) || -(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER))) { +(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER)) || 
+(!QEMU_IS_ALIGNED(ct3d->dc.total_capacity, CXL_CAPACITY_MULTIPLIER))) { return CXL_MBOX_INTERNAL_ERROR; } @@ -835,9 +838,11 @@ static CXLRetCode cmd_ccls_get_partition_info(const struct cxl_cmd *cmd, uint64_t next_pmem; } QEMU_PACKED *part_info = (void *)payload_out; QEMU_BUILD_BUG_ON(sizeof(*part_info) != 0x20); +CXLType3Dev *ct3d = container_of(cxl_dstate, CXLType3Dev, cxl_dstate); if ((!QEMU_IS_ALIGNED(cxl_dstate->vmem_size, CXL_CAPACITY_MULTIPLIER)) || -(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER))) { +(!QEMU_IS_ALIGNED(cxl_dstate->pmem_size, CXL_CAPACITY_MULTIPLIER)) || +(!QEMU_IS_ALIGNED(ct3d->dc.total_capacity, CXL_CAPACITY_MULTIPLIER))) { return CXL_MBOX_INTERNAL_ERROR; } @@ -1179,7 +1184,8 @@ static CXLRetCode cmd_media_clear_poison(const struct cxl_cmd *cmd, struct clear_poison_pl *in = (void *)payload_in; dpa = ldq_le_p(&in->dpa); -if (dpa + CXL_CACHE_LINE_SIZE > cxl_dstate->static_mem_size) { +if (dpa + CXL_CACHE_LINE_SIZE > cxl_dstate->static_mem_size + +ct3d->dc.total_capacity) { return CXL_MBOX_INVALID_PA; } diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index a9e8bdc436..75ea9b20e1 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -45,7 +45,8 @@ enum { static void ct3_build_cdat_entries_for_mr(CDATSubHeader **cdat_table, int dsmad_handle, uint64_t size, - bool is_pmem, uint64_t dpa_base) + bool is_pmem, bool is_dynamic, + uint64_t dpa_base) { CDATDsmas *dsmas; CDATDslbis *dslbis0; @@ -61,7 +62,8 @@ static void ct3_build_cdat_entries_for_mr(CDATSubHeader **cdat_table, .length = sizeof(*dsmas), }, .DSMADhandle = dsmad_handle, -.flags = is_pmem ? CDAT_DSMAS_FLAG_NV : 0, +.flags = (is_pmem ? CDAT_DSMAS_FLAG_NV : 0) | + (is_dynamic ? 
CDAT_DSMAS_FLAG_DYNAMIC_CAP : 0), .DPA_base = dpa_base, .DPA_length = size, }; @@ -149,12 +151,13 @@ static int ct3_build_cdat_table(CDATSubHeader ***cdat_table, void *priv) g_autofree CDATSubHeader **table = NULL; CXLType3Dev *ct3d = priv; MemoryRegion *volatile_mr = NULL, *nonvolatile_mr = NULL; +MemoryRegion *dc_mr = NULL; uint64_t vmr_size = 0, pmr_size = 0; int dsmad_handle = 0; int cur_ent = 0; int len = 0; -if (!ct3d->hostpmem && !ct3d->hostvmem) { +if (!ct3d->hostpmem && !ct3d->hostvmem && !ct3d->dc.num_regions) { return 0; } @@ -176,2
[PATCH v6 05/12] hw/mem/cxl-type3: Refactor ct3_build_cdat_entries_for_mr to take mr size instead of mr as argument
From: Fan Ni The function ct3_build_cdat_entries_for_mr only uses size of the passed memory region argument, refactor the function definition to make the passed arguments more specific. Reviewed-by: Jonathan Cameron Signed-off-by: Fan Ni --- hw/mem/cxl_type3.c | 15 +-- 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index c83d6f8004..a9e8bdc436 100644 --- a/hw/mem/cxl_type3.c +++ b/hw/mem/cxl_type3.c @@ -44,7 +44,7 @@ enum { }; static void ct3_build_cdat_entries_for_mr(CDATSubHeader **cdat_table, - int dsmad_handle, MemoryRegion *mr, + int dsmad_handle, uint64_t size, bool is_pmem, uint64_t dpa_base) { CDATDsmas *dsmas; @@ -63,7 +63,7 @@ static void ct3_build_cdat_entries_for_mr(CDATSubHeader **cdat_table, .DSMADhandle = dsmad_handle, .flags = is_pmem ? CDAT_DSMAS_FLAG_NV : 0, .DPA_base = dpa_base, -.DPA_length = memory_region_size(mr), +.DPA_length = size, }; /* For now, no memory side cache, plausiblish numbers */ @@ -132,7 +132,7 @@ static void ct3_build_cdat_entries_for_mr(CDATSubHeader **cdat_table, */ .EFI_memory_type_attr = is_pmem ? 
2 : 1, .DPA_offset = 0, -.DPA_length = memory_region_size(mr), +.DPA_length = size, }; /* Header always at start of structure */ @@ -149,6 +149,7 @@ static int ct3_build_cdat_table(CDATSubHeader ***cdat_table, void *priv) g_autofree CDATSubHeader **table = NULL; CXLType3Dev *ct3d = priv; MemoryRegion *volatile_mr = NULL, *nonvolatile_mr = NULL; +uint64_t vmr_size = 0, pmr_size = 0; int dsmad_handle = 0; int cur_ent = 0; int len = 0; @@ -163,6 +164,7 @@ static int ct3_build_cdat_table(CDATSubHeader ***cdat_table, void *priv) return -EINVAL; } len += CT3_CDAT_NUM_ENTRIES; +vmr_size = memory_region_size(volatile_mr); } if (ct3d->hostpmem) { @@ -171,21 +173,22 @@ static int ct3_build_cdat_table(CDATSubHeader ***cdat_table, void *priv) return -EINVAL; } len += CT3_CDAT_NUM_ENTRIES; +pmr_size = memory_region_size(nonvolatile_mr); } table = g_malloc0(len * sizeof(*table)); /* Now fill them in */ if (volatile_mr) { -ct3_build_cdat_entries_for_mr(table, dsmad_handle++, volatile_mr, +ct3_build_cdat_entries_for_mr(table, dsmad_handle++, vmr_size, false, 0); cur_ent = CT3_CDAT_NUM_ENTRIES; } if (nonvolatile_mr) { -uint64_t base = volatile_mr ? memory_region_size(volatile_mr) : 0; +uint64_t base = vmr_size; ct3_build_cdat_entries_for_mr(&(table[cur_ent]), dsmad_handle++, - nonvolatile_mr, true, base); + pmr_size, true, base); cur_ent += CT3_CDAT_NUM_ENTRIES; } assert(len == cur_ent); -- 2.43.0
[PATCH v6 09/12] hw/cxl/events: Add qmp interfaces to add/release dynamic capacity extents
From: Fan Ni To simulate FM functionalities for initiating Dynamic Capacity Add (Opcode 5604h) and Dynamic Capacity Release (Opcode 5605h) as in CXL spec r3.1 7.6.7.6.5 and 7.6.7.6.6, we implemented two QMP interfaces to issue add/release dynamic capacity extents requests. With the change, we allow to release an extent only when its DPA range is contained by a single accepted extent in the device. That is to say, extent superset release is not supported yet. 1. Add dynamic capacity extents: For example, the command to add two continuous extents (each 128MiB long) to region 0 (starting at DPA offset 0) looks like below: { "execute": "qmp_capabilities" } { "execute": "cxl-add-dynamic-capacity", "arguments": { "path": "/machine/peripheral/cxl-dcd0", "region-id": 0, "extents": [ { "offset": 0, "len": 134217728 }, { "offset": 134217728, "len": 134217728 } ] } } 2. Release dynamic capacity extents: For example, the command to release an extent of size 128MiB from region 0 (DPA offset 128MiB) looks like below: { "execute": "cxl-release-dynamic-capacity", "arguments": { "path": "/machine/peripheral/cxl-dcd0", "region-id": 0, "extents": [ { "offset": 134217728, "len": 134217728 } ] } } Signed-off-by: Fan Ni --- hw/cxl/cxl-mailbox-utils.c | 26 ++-- hw/mem/cxl_type3.c | 252 +++- hw/mem/cxl_type3_stubs.c| 14 ++ include/hw/cxl/cxl_device.h | 8 ++ include/hw/cxl/cxl_events.h | 18 +++ qapi/cxl.json | 61 - 6 files changed, 367 insertions(+), 12 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index a9eca516c8..7094e007b9 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -1407,7 +1407,7 @@ static CXLRetCode cmd_dcd_get_dyn_cap_ext_list(const struct cxl_cmd *cmd, * Check whether any bit between addr[nr, nr+size) is set, * return true if any bit is set, otherwise return false */ -static bool test_any_bits_set(const unsigned long *addr, unsigned long nr, +bool test_any_bits_set(const unsigned long *addr, unsigned long nr, 
unsigned long size) { unsigned long res = find_next_bit(addr, size + nr, nr); @@ -1446,7 +1446,7 @@ CXLDCRegion *cxl_find_dc_region(CXLType3Dev *ct3d, uint64_t dpa, uint64_t len) return NULL; } -static void cxl_insert_extent_to_extent_list(CXLDCExtentList *list, +void cxl_insert_extent_to_extent_list(CXLDCExtentList *list, uint64_t dpa, uint64_t len, uint8_t *tag, @@ -1552,10 +1552,11 @@ static CXLRetCode cxl_dcd_add_dyn_cap_rsp_dry_run(CXLType3Dev *ct3d, range_init_nofail(&range1, dpa, len); -/* - * TODO: once the pending extent list is added, check against - * the list will be added here. - */ +/* host-accepted DPA range must be contained by pending extent */ +if (!cxl_extents_contains_dpa_range(&ct3d->dc.extents_pending, +dpa, len)) { +return CXL_MBOX_INVALID_PA; +} /* to-be-added range should not overlap with range already accepted */ QTAILQ_FOREACH(ent, &ct3d->dc.extents, node) { @@ -1585,9 +1586,13 @@ static CXLRetCode cmd_dcd_add_dyn_cap_rsp(const struct cxl_cmd *cmd, CXLDCExtentList *extent_list = &ct3d->dc.extents; uint32_t i; uint64_t dpa, len; +CXLDCExtent *ent; CXLRetCode ret; if (in->num_entries_updated == 0) { +/* Always remove the first pending extent when response received. */ +ent = QTAILQ_FIRST(&ct3d->dc.extents_pending); +cxl_remove_extent_from_extent_list(&ct3d->dc.extents_pending, ent); return CXL_MBOX_SUCCESS; } @@ -1604,6 +1609,8 @@ static CXLRetCode cmd_dcd_add_dyn_cap_rsp(const struct cxl_cmd *cmd, ret = cxl_dcd_add_dyn_cap_rsp_dry_run(ct3d, in); if (ret != CXL_MBOX_SUCCESS) { +ent = QTAILQ_FIRST(&ct3d->dc.extents_pending); +cxl_remove_extent_from_extent_list(&ct3d->dc.extents_pending, ent); return ret; } @@ -1613,10 +1620,9 @@ static CXLRetCode cmd_dcd_add_dyn_cap_rsp(const struct cxl_cmd *cmd, cxl_insert_extent_to_extent_list(extent_list, dpa, len, NULL, 0); ct3d->dc.total_extent_count += 1; -/* - * TODO: we will add a pending extent list based on event log record - * and process the list according here. 
- */ + +ent = QTAILQ_FIRST(&ct3d->dc.extents_pending); +cxl_remove_extent_from_extent_list(&ct3d->dc.extents_pending, ent); } return CXL_MBOX_SUCCESS; diff --git a/hw/mem/cxl_type3.c b/hw/mem/cxl_type3.c index 951bd79a82..74cb64e843 100644 --- a/hw/mem/cxl_t
[PATCH v6 01/12] hw/cxl/cxl-mailbox-utils: Add dc_event_log_size field to output payload of identify memory device command
From: Fan Ni

Based on CXL spec r3.1 Table 8-127 (Identify Memory Device Output
Payload), dynamic capacity event log size should be part of the output
of the Identify command. Add dc_event_log_size to the output payload
for the host to get the info.

Reviewed-by: Jonathan Cameron
Signed-off-by: Fan Ni
---
 hw/cxl/cxl-mailbox-utils.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 4bcd727f4c..ba1d9901df 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -21,6 +21,7 @@
 #include "sysemu/hostmem.h"
 
 #define CXL_CAPACITY_MULTIPLIER   (256 * MiB)
+#define CXL_DC_EVENT_LOG_SIZE 8
 
 /*
  * How to add a new command, example. The command set FOO, with cmd BAR.
@@ -780,8 +781,9 @@ static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd,
         uint16_t inject_poison_limit;
         uint8_t poison_caps;
         uint8_t qos_telemetry_caps;
+        uint16_t dc_event_log_size;
     } QEMU_PACKED *id;
-    QEMU_BUILD_BUG_ON(sizeof(*id) != 0x43);
+    QEMU_BUILD_BUG_ON(sizeof(*id) != 0x45);
 
     CXLType3Dev *ct3d = CXL_TYPE3(cci->d);
     CXLType3Class *cvc = CXL_TYPE3_GET_CLASS(ct3d);
     CXLDeviceState *cxl_dstate = &ct3d->cxl_dstate;
@@ -807,6 +809,7 @@ static CXLRetCode cmd_identify_memory_device(const struct cxl_cmd *cmd,
     st24_le_p(id->poison_list_max_mer, 256);
     /* No limit - so limited by main poison record limit */
     stw_le_p(&id->inject_poison_limit, 0);
+    stw_le_p(&id->dc_event_log_size, CXL_DC_EVENT_LOG_SIZE);
 
     *len_out = sizeof(*id);
     return CXL_MBOX_SUCCESS;
-- 
2.43.0
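The `QEMU_BUILD_BUG_ON` change from 0x43 to 0x45 follows directly from the layout: appending one packed `uint16_t` grows the payload by exactly two bytes. A reduced model (the field list is abbreviated to a byte blob, since only the sizes matter here; `__attribute__((packed))` stands in for `QEMU_PACKED`):

```c
#include <assert.h>
#include <stdint.h>

/* Identify Memory Device output payload before the patch: 0x43 bytes. */
struct __attribute__((packed)) id_payload_old {
    uint8_t fields[0x43];
};

/* After the patch: the appended 2-byte field makes it 0x45 bytes. */
struct __attribute__((packed)) id_payload_new {
    uint8_t fields[0x43];
    uint16_t dc_event_log_size;
};
```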
[PATCH v6 08/12] hw/cxl/cxl-mailbox-utils: Add mailbox commands to support add/release dynamic capacity response
From: Fan Ni Per CXL spec 3.1, two mailbox commands are implemented: Add Dynamic Capacity Response (Opcode 4802h) 8.2.9.9.9.3, and Release Dynamic Capacity (Opcode 4803h) 8.2.9.9.9.4. For the process of the above two commands, we use two-pass approach. Pass 1: Check whether the input payload is valid or not; if not, skip Pass 2 and return mailbox process error. Pass 2: Do the real work--add or release extents, respectively. Signed-off-by: Fan Ni --- hw/cxl/cxl-mailbox-utils.c | 433 +++- hw/mem/cxl_type3.c | 11 + include/hw/cxl/cxl_device.h | 4 + 3 files changed, 444 insertions(+), 4 deletions(-) diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c index 30ef46a036..a9eca516c8 100644 --- a/hw/cxl/cxl-mailbox-utils.c +++ b/hw/cxl/cxl-mailbox-utils.c @@ -19,6 +19,7 @@ #include "qemu/units.h" #include "qemu/uuid.h" #include "sysemu/hostmem.h" +#include "qemu/range.h" #define CXL_CAPACITY_MULTIPLIER (256 * MiB) #define CXL_DC_EVENT_LOG_SIZE 8 @@ -85,6 +86,8 @@ enum { DCD_CONFIG = 0x48, #define GET_DC_CONFIG 0x0 #define GET_DYN_CAP_EXT_LIST 0x1 +#define ADD_DYN_CAP_RSP0x2 +#define RELEASE_DYN_CAP0x3 PHYSICAL_SWITCH = 0x51, #define IDENTIFY_SWITCH_DEVICE 0x0 #define GET_PHYSICAL_PORT_STATE 0x1 @@ -1400,6 +1403,422 @@ static CXLRetCode cmd_dcd_get_dyn_cap_ext_list(const struct cxl_cmd *cmd, return CXL_MBOX_SUCCESS; } +/* + * Check whether any bit between addr[nr, nr+size) is set, + * return true if any bit is set, otherwise return false + */ +static bool test_any_bits_set(const unsigned long *addr, unsigned long nr, + unsigned long size) +{ +unsigned long res = find_next_bit(addr, size + nr, nr); + +return res < nr + size; +} + +CXLDCRegion *cxl_find_dc_region(CXLType3Dev *ct3d, uint64_t dpa, uint64_t len) +{ +int i; +CXLDCRegion *region = &ct3d->dc.regions[0]; + +if (dpa < region->base || +dpa >= region->base + ct3d->dc.total_capacity) { +return NULL; +} + +/* + * CXL r3.1 section 9.13.3: Dynamic Capacity Device (DCD) + * + * Regions are used in 
increasing-DPA order, with Region 0 being used for + * the lowest DPA of Dynamic Capacity and Region 7 for the highest DPA. + * So check from the last region to find where the dpa belongs. Extents that + * cross multiple regions are not allowed. + */ +for (i = ct3d->dc.num_regions - 1; i >= 0; i--) { +region = &ct3d->dc.regions[i]; +if (dpa >= region->base) { +if (dpa + len > region->base + region->len) { +return NULL; +} +return region; +} +} + +return NULL; +} + +static void cxl_insert_extent_to_extent_list(CXLDCExtentList *list, + uint64_t dpa, + uint64_t len, + uint8_t *tag, + uint16_t shared_seq) +{ +CXLDCExtent *extent; + +extent = g_new0(CXLDCExtent, 1); +extent->start_dpa = dpa; +extent->len = len; +if (tag) { +memcpy(extent->tag, tag, 0x10); +} +extent->shared_seq = shared_seq; + +QTAILQ_INSERT_TAIL(list, extent, node); +} + +void cxl_remove_extent_from_extent_list(CXLDCExtentList *list, +CXLDCExtent *extent) +{ +QTAILQ_REMOVE(list, extent, node); +g_free(extent); +} + +/* + * CXL r3.1 Table 8-168: Add Dynamic Capacity Response Input Payload + * CXL r3.1 Table 8-170: Release Dynamic Capacity Input Payload + */ +typedef struct CXLUpdateDCExtentListInPl { +uint32_t num_entries_updated; +uint8_t flags; +uint8_t rsvd[3]; +/* CXL r3.1 Table 8-169: Updated Extent */ +struct { +uint64_t start_dpa; +uint64_t len; +uint8_t rsvd[8]; +} QEMU_PACKED updated_entries[]; +} QEMU_PACKED CXLUpdateDCExtentListInPl; + +/* + * For the extents in the extent list to operate, check whether they are valid + * 1. The extent should be in the range of a valid DC region; + * 2. The extent should not cross multiple regions; + * 3. The start DPA and the length of the extent should align with the block + * size of the region; + * 4. The address range of multiple extents in the list should not overlap. 
+ */ +static CXLRetCode cxl_detect_malformed_extent_list(CXLType3Dev *ct3d, +const CXLUpdateDCExtentListInPl *in) +{ +uint64_t min_block_size = UINT64_MAX; +CXLDCRegion *region = &ct3d->dc.regions[0]; +CXLDCRegion *lastregion = &ct3d->dc.regions[ct3d->dc.num_regions - 1]; +g_autofree unsigned long *blk_bitmap = NULL; +uint64_t dpa, len; +uint32_t i; + +for (i = 0; i < ct3d->dc.num_regions; i++) { +region = &ct3d->dc.regions[i]; +min_block_size = MIN(min_block_s
[PATCH v2] target/hppa: Fix unit carry conditions
Split do_unit_cond to do_unit_zero_cond to only handle conditions versus zero. These are the only ones that are legal for UXOR. Simplify trans_uxor accordingly. Rename do_unit to do_unit_addsub, since xor has been split. Properly compute carry-out bits for add and subtract, mirroring the code in do_add and do_sub. Signed-off-by: Richard Henderson --- v2: Cut and paste error between 64- and 32-bit paths. Shift 32-bit carry down 1 bit like 64-bit carry; tradeoff is shift vs needing a 64-bit constant for the mask. Don't use of TCG_COND_TST{NE,EQ}, as this will limit backports of the actual bug fix. We can convert the port to test conditions en masse during the next devel cycle. --- target/hppa/translate.c | 218 +--- 1 file changed, 113 insertions(+), 105 deletions(-) diff --git a/target/hppa/translate.c b/target/hppa/translate.c index 3fc3e7754c..99c5c4cbca 100644 --- a/target/hppa/translate.c +++ b/target/hppa/translate.c @@ -936,98 +936,44 @@ static DisasCond do_sed_cond(DisasContext *ctx, unsigned orig, bool d, return do_log_cond(ctx, c * 2 + f, d, res); } -/* Similar, but for unit conditions. */ - -static DisasCond do_unit_cond(unsigned cf, bool d, TCGv_i64 res, - TCGv_i64 in1, TCGv_i64 in2) +/* Similar, but for unit zero conditions. */ +static DisasCond do_unit_zero_cond(unsigned cf, bool d, TCGv_i64 res) { -DisasCond cond; -TCGv_i64 tmp, cb = NULL; +TCGv_i64 tmp; uint64_t d_repl = d ? 0x00010001ull : 1; - -if (cf & 8) { -/* Since we want to test lots of carry-out bits all at once, do not - * do our normal thing and compute carry-in of bit B+1 since that - * leaves us with carry bits spread across two words. 
- */ -cb = tcg_temp_new_i64(); -tmp = tcg_temp_new_i64(); -tcg_gen_or_i64(cb, in1, in2); -tcg_gen_and_i64(tmp, in1, in2); -tcg_gen_andc_i64(cb, cb, res); -tcg_gen_or_i64(cb, cb, tmp); -} +uint64_t ones = 0, sgns = 0; switch (cf >> 1) { -case 0: /* never / TR */ -cond = cond_make_f(); -break; - case 1: /* SBW / NBW */ if (d) { -tmp = tcg_temp_new_i64(); -tcg_gen_subi_i64(tmp, res, d_repl * 0x0001u); -tcg_gen_andc_i64(tmp, tmp, res); -tcg_gen_andi_i64(tmp, tmp, d_repl * 0x8000u); -cond = cond_make_0(TCG_COND_NE, tmp); -} else { -/* undefined */ -cond = cond_make_f(); +ones = d_repl; +sgns = d_repl << 31; } break; - case 2: /* SBZ / NBZ */ -/* See hasless(v,1) from - * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord - */ -tmp = tcg_temp_new_i64(); -tcg_gen_subi_i64(tmp, res, d_repl * 0x01010101u); -tcg_gen_andc_i64(tmp, tmp, res); -tcg_gen_andi_i64(tmp, tmp, d_repl * 0x80808080u); -cond = cond_make_0(TCG_COND_NE, tmp); +ones = d_repl * 0x01010101u; +sgns = ones << 7; break; - case 3: /* SHZ / NHZ */ -tmp = tcg_temp_new_i64(); -tcg_gen_subi_i64(tmp, res, d_repl * 0x00010001u); -tcg_gen_andc_i64(tmp, tmp, res); -tcg_gen_andi_i64(tmp, tmp, d_repl * 0x80008000u); -cond = cond_make_0(TCG_COND_NE, tmp); +ones = d_repl * 0x00010001u; +sgns = ones << 15; break; - -case 4: /* SDC / NDC */ -tcg_gen_andi_i64(cb, cb, d_repl * 0xu); -cond = cond_make_0(TCG_COND_NE, cb); -break; - -case 5: /* SWC / NWC */ -if (d) { -tcg_gen_andi_i64(cb, cb, d_repl * 0x8000u); -cond = cond_make_0(TCG_COND_NE, cb); -} else { -/* undefined */ -cond = cond_make_f(); -} -break; - -case 6: /* SBC / NBC */ -tcg_gen_andi_i64(cb, cb, d_repl * 0x80808080u); -cond = cond_make_0(TCG_COND_NE, cb); -break; - -case 7: /* SHC / NHC */ -tcg_gen_andi_i64(cb, cb, d_repl * 0x80008000u); -cond = cond_make_0(TCG_COND_NE, cb); -break; - -default: -g_assert_not_reached(); } -if (cf & 1) { -cond.c = tcg_invert_cond(cond.c); +if (ones == 0) { +/* Undefined, or 0/1 (never/always). */ +return cf & 1 ? 
cond_make_t() : cond_make_f(); } -return cond; +/* + * See hasless(v,1) from + * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord + */ +tmp = tcg_temp_new_i64(); +tcg_gen_subi_i64(tmp, res, ones); +tcg_gen_andc_i64(tmp, tmp, res); +tcg_gen_andi_i64(tmp, tmp, sgns); + +return cond_make_0_tmp(cf & 1 ? TCG_COND_EQ : TCG_COND_NE, tmp); } static TCGv_i64 get_carry(DisasContext *ctx, bool d, @@ -1330,34 +1276,86 @@ static bool do_log_reg(DisasContext *ctx, arg_rrr_cf_d *a, return nullify_end(ctx);
Re: [RFC v2 4/5] target/arm: Enable feature ARM_FEATURE_EL2 if EL2 is supported
Hi Peter, On 3/5/24 17:49, Peter Maydell wrote: > On Fri, 9 Feb 2024 at 16:00, Eric Auger wrote: >> From: Haibo Xu >> >> KVM_CAP_ARM_EL2 must be supported by the cpu to enable ARM_FEATURE_EL2. >> In case the host does support NV, expose the feature. >> >> Signed-off-by: Haibo Xu >> Signed-off-by: Miguel Luis >> Signed-off-by: Eric Auger >> >> --- >> >> v1 -> v2: >> - remove isar_feature_aa64_aa32_el2 modif in target/arm/cpu.h >> [Richard] and use el2_supported in kvm_arch_init_vcpu >> --- >> target/arm/kvm.c | 16 >> 1 file changed, 16 insertions(+) >> >> diff --git a/target/arm/kvm.c b/target/arm/kvm.c >> index 0996866afe..a08bc68a3f 100644 >> --- a/target/arm/kvm.c >> +++ b/target/arm/kvm.c >> @@ -238,6 +238,7 @@ static bool >> kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) >> */ >> int fdarray[3]; >> bool sve_supported; >> +bool el2_supported; >> bool pmu_supported = false; >> uint64_t features = 0; >> int err; >> @@ -268,6 +269,14 @@ static bool >> kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) >> init.features[0] |= 1 << KVM_ARM_VCPU_SVE; >> } >> >> +/* >> + * Ask for EL2 if supported. >> + */ >> +el2_supported = kvm_arm_el2_supported(); >> +if (el2_supported) { >> +init.features[0] |= 1 << KVM_ARM_VCPU_HAS_EL2; >> +} >> + >> /* >> * Ask for Pointer Authentication if supported, so that we get >> * the unsanitized field values for AA64ISAR1_EL1. 
>> @@ -449,6 +458,10 @@ static bool >> kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf) >> features |= 1ULL << ARM_FEATURE_PMU; >> features |= 1ULL << ARM_FEATURE_GENERIC_TIMER; >> >> +if (el2_supported) { >> +features |= 1ULL << ARM_FEATURE_EL2; >> +} >> + >> ahcf->features = features; >> >> return true; >> @@ -1912,6 +1925,9 @@ int kvm_arch_init_vcpu(CPUState *cs) >> cpu->kvm_init_features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS | >>1 << KVM_ARM_VCPU_PTRAUTH_GENERIC); >> } >> +if (kvm_arm_el2_supported()) { >> +cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_HAS_EL2; >> +} >> >> /* Do KVM_ARM_VCPU_INIT ioctl */ >> ret = kvm_arm_vcpu_init(cpu); > Am I reading this right that if the kernel supports FEAT_NV > then we will always ask for a vCPU with that feature? > Is that a good idea, or should we arrange to only do it if > the user uses the 'virtualization=on' option to -M virt ? > (Or does that happen already in some way I'm not seeing?) yes you're right, if the host supports it, the feature is currently set on the vcpu. I am not totaly clear under which conditions the features shall be instantiated in the scratch VM and when the host passthrough model shall be altered by machine option. Thanks Eric > > thanks > -- PMM >
Re: [PATCH 4/4] tests/tcg: Test shmat(NULL)
On 3/25/24 05:07, Ilya Leoshkevich wrote: Add a small test to prevent regressions. Signed-off-by: Ilya Leoshkevich --- tests/tcg/multiarch/linux/linux-shmat-null.c | 38 1 file changed, 38 insertions(+) create mode 100644 tests/tcg/multiarch/linux/linux-shmat-null.c Reviewed-by: Richard Henderson r~
Re: [PATCH 3/4] linux-user: Fix shmat(NULL) for h != g
On 3/25/24 05:07, Ilya Leoshkevich wrote: In the h != g && shmaddr == NULL && !reserved_va case, target_shmat() incorrectly mmap()s the initial anonymous range with MAP_FIXED_NOREPLACE, even though the earlier mmap_find_vma() has already reserved the respective address range. Fix by using MAP_FIXED when "mapped", which is set after mmap_find_vma(), is true. Fixes: 78bc8ed9a8f0 ("linux-user: Rewrite target_shmat") Signed-off-by: Ilya Leoshkevich --- linux-user/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Reviewed-by: Richard Henderson r~
Re: [PATCH 1/4] linux-user: Fix semctl() strace
On 3/25/24 05:07, Ilya Leoshkevich wrote: The indices of arguments used with semctl() are all off-by-1, because arg1 is the ipc() command. Fix them. While at it, reuse print_semctl(). New output (for a small test program): 3540333 semctl(999,888,SEM_INFO,0x7fe5051ee9a0) = -1 errno=14 (Bad address) Fixes: 7ccfb2eb5f9d ("Fix warnings that would be caused by gcc flag -Wwrite-strings") Signed-off-by: Ilya Leoshkevich --- linux-user/strace.c | 8 +++- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/linux-user/strace.c b/linux-user/strace.c index 9934e2208e2..9be71af4016 100644 --- a/linux-user/strace.c +++ b/linux-user/strace.c @@ -657,7 +657,7 @@ print_newselect(CPUArchState *cpu_env, const struct syscallname *name, } #endif -#ifdef TARGET_NR_semctl +#if defined(TARGET_NR_semctl) || defined(TARGET_NR_ipc) static void print_semctl(CPUArchState *cpu_env, const struct syscallname *name, abi_long arg1, abi_long arg2, abi_long arg3, You can remove this ifdef, because one of the two is always defined. Otherwise, Reviewed-by: Richard Henderson r~ @@ -698,10 +698,8 @@ print_ipc(CPUArchState *cpu_env, const struct syscallname *name, { switch(arg1) { case IPCOP_semctl: -qemu_log("semctl(" TARGET_ABI_FMT_ld "," TARGET_ABI_FMT_ld ",", - arg1, arg2); -print_ipc_cmd(arg3); -qemu_log(",0x" TARGET_ABI_FMT_lx ")", arg4); +print_semctl(cpu_env, &(const struct syscallname){ .name = "semctl" }, + arg2, arg3, arg4, arg5, 0, 0); break; case IPCOP_shmat: print_shmat(cpu_env, &(const struct syscallname){ .name = "shmat" },
Re: [PATCH 2/4] linux-user: Fix shmat() strace
On 3/25/24 05:07, Ilya Leoshkevich wrote: The indices of arguments passed to print_shmat() are all off-by-1, because arg1 is the ipc() command. Fix them. New output for linux-shmat-maps test: 3501769 shmat(4784214,0x0080,SHM_RND) = 0 Fixes: 9f7c97324c27 ("linux-user: Add strace for shmat") Signed-off-by: Ilya Leoshkevich --- linux-user/strace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Oops, Reviewed-by: Richard Henderson r~
Re: [PATCH for-9.0] target/arm: take HSTR traps of cp15 accesses to EL2, not EL1
On 3/25/24 03:31, Peter Maydell wrote: The HSTR_EL2 register allows the hypervisor to trap AArch32 EL1 and EL0 accesses to cp15 registers. We incorrectly implemented this so they trap to EL1 when we detect the need for a HSTR trap at code generation time. (The check in access_check_cp_reg() which we do at runtime to catch traps from EL0 is correctly routing them to EL2.) Use the correct target EL when generating the code to take the trap. Cc:qemu-sta...@nongnu.org Resolves:https://gitlab.com/qemu-project/qemu/-/issues/2226 Fixes: 049edada5e93df ("target/arm: Make HSTR_EL2 traps take priority over UNDEF-at-EL1") Signed-off-by: Peter Maydell --- target/arm/tcg/translate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Reviewed-by: Richard Henderson r~
Re: [PATCH] hw/s390x: Include missing 'cpu.h' header
On Fri, 2024-03-22 at 17:28 +0100, Philippe Mathieu-Daudé wrote: > "cpu.h" is implicitly included. Include it explicitly to > avoid the following error when refactoring headers: > > hw/s390x/s390-stattrib.c:86:40: error: use of undeclared identifier > 'TARGET_PAGE_SIZE' > len = sac->peek_stattr(sas, addr / TARGET_PAGE_SIZE, buflen, > vals); > ^ > hw/s390x/s390-stattrib.c:94:58: error: use of undeclared identifier > 'TARGET_PAGE_MASK' > addr / TARGET_PAGE_SIZE, len, addr & > ~TARGET_PAGE_MASK); > ^ > hw/s390x/s390-stattrib.c:224:40: error: use of undeclared > identifier 'TARGET_PAGE_BITS' > qemu_put_be64(f, (start_gfn << TARGET_PAGE_BITS) | > STATTR_FLAG_MORE); > ^ > In file included from hw/s390x/s390-virtio-ccw.c:17: > hw/s390x/s390-virtio-hcall.h:22:27: error: unknown type name > 'CPUS390XState' > int s390_virtio_hypercall(CPUS390XState *env); > ^ > > Signed-off-by: Philippe Mathieu-Daudé > --- > hw/s390x/s390-virtio-hcall.h | 2 ++ > hw/s390x/s390-stattrib.c | 1 + > 2 files changed, 3 insertions(+) These aren't the only implicit users of cpu.h in hw/s390x/ but if this solves one problem, then that's good. Acked-by: Eric Farman
Re: [PATCH trivial for-9.0] hw/i386/fw_cfg.c: fix non-legacy smbios build
25.03.2024 18:20, Igor Mammedov wrote On Mon, 25 Mar 2024 16:09:20 +0300 Michael Tokarev wrote: When building qemu with smbios but not legacy mode (eg minimal microvm build), link fails with: hw/i386/fw_cfg.c:74: undefined reference to `smbios_get_table_legacy' This is because fw_cfg interface can call this function if CONFIG_SMBIOS is defined. Made this code block to depend on CONFIG_SMBIOS_LEGACY. stub supposedly should have handled that what configure options do you use to build 'minimal microvm'? This is a custom build, not only configure options but also custom devices.mak: https://salsa.debian.org/qemu-team/qemu/-/blob/master/debian/microvm-devices.mak == cut == # see configs/devices/i386-softmmu/default.mak # for additional devices which can be disabled # CONFIG_PCI_DEVICES=n # we can't disable all machine types (boards) as of 6.1 # since the resulting binary fails to link #CONFIG_ISAPC=y #CONFIG_I440FX=y CONFIG_Q35=y CONFIG_MICROVM=y CONFIG_VIRTIO_BLK=y CONFIG_VIRTIO_SERIAL=y CONFIG_VIRTIO_INPUT=y CONFIG_VIRTIO_INPUT_HOST=y CONFIG_VHOST_USER_INPUT=y CONFIG_VIRTIO_NET=y CONFIG_VIRTIO_SCSI=y CONFIG_VIRTIO_RNG=y CONFIG_VIRTIO_CRYPTO=y CONFIG_VIRTIO_BALLOON=y CONFIG_VIRTIO_MEM=y CONFIG_VIRTIO_PMEM=y CONFIG_VIRTIO_GPU=y CONFIG_VHOST_USER_GPU=y == cut == Relevant configure options: https://salsa.debian.org/qemu-team/qemu/-/blob/master/debian/rules#L293-308 ../../configure ${common_configure_opts} \ --extra-cflags="${extra-cflags} -DCONFIG_MICROVM_DEFAULT=1" \ --without-default-features \ --target-list=x86_64-softmmu --enable-kvm --disable-tcg \ --enable-pixman --enable-vnc \ --enable-attr \ --enable-coroutine-pool \ --audio-drv-list="" \ --without-default-devices \ --with-devices-x86_64=microvm \ --enable-vhost-kernel --enable-vhost-net \ --enable-vhost-vdpa \ --enable-vhost-user --enable-vhost-user-blk-server \ --enable-vhost-crypto \ I dunno how relevant these are, - it come from ubuntu and I haven't looked there for a long time. 
The idea was to have a build especially for microvm with minimal footprint, as light as possible, for fastest startup time etc. Enabling (selecting) CONFIG_SMBIOS does not help since it is already enabled by something, but not SMBIOS_LEGACY (which should not be enabled in this case). I still think it is better to avoid pcmc->smbios_legacy_mode variable (field) entirely. Thanks, /mjt Fixes: b42b0e4daaa5 "smbios: build legacy mode code only for 'pc' machine" Signed-off-by: Michael Tokarev --- hw/i386/fw_cfg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c index d802d2787f..d5e78a9183 100644 --- a/hw/i386/fw_cfg.c +++ b/hw/i386/fw_cfg.c @@ -70,6 +70,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg, /* tell smbios about cpuid version and features */ smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]); +#ifdef CONFIG_SMBIOS_LEGACY if (pcmc->smbios_legacy_mode) { smbios_tables = smbios_get_table_legacy(&smbios_tables_len, &error_fatal); @@ -77,6 +78,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg, smbios_tables, smbios_tables_len); return; } +#endif /* build the array of physical mem area from e820 table */ mem_array = g_malloc0(sizeof(*mem_array) * e820_get_num_entries());
Re: [PATCH v3 2/2] Implement SSH commands in QEMU GA for Windows
On 22/3/24 18:46, aidan_le...@selinc.com wrote: From: Aidan Leuck Signed-off-by: Aidan Leuck --- qga/commands-windows-ssh.c | 791 + Huge file, I'm skipping it. qga/commands-windows-ssh.h | 26 ++ qga/meson.build| 5 +- qga/qapi-schema.json | 17 +- 4 files changed, 828 insertions(+), 11 deletions(-) create mode 100644 qga/commands-windows-ssh.c create mode 100644 qga/commands-windows-ssh.h diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json index 9554b566a7..a64a6d91cf 100644 --- a/qga/qapi-schema.json +++ b/qga/qapi-schema.json @@ -1562,9 +1562,8 @@ { 'struct': 'GuestAuthorizedKeys', 'data': { 'keys': ['str'] - }, - 'if': 'CONFIG_POSIX' } - For Windows you have to check the CONFIG_WIN32 definition, so you want: 'if': { 'any': [ 'CONFIG_POSIX', 'CONFIG_WIN32' ] }, + } +} ## # @guest-ssh-get-authorized-keys: @@ -1580,8 +1579,8 @@ ## { 'command': 'guest-ssh-get-authorized-keys', 'data': { 'username': 'str' }, - 'returns': 'GuestAuthorizedKeys', - 'if': 'CONFIG_POSIX' } + 'returns': 'GuestAuthorizedKeys' +} ## # @guest-ssh-add-authorized-keys: @@ -1599,8 +1598,8 @@ # Since: 5.2 ## { 'command': 'guest-ssh-add-authorized-keys', - 'data': { 'username': 'str', 'keys': ['str'], '*reset': 'bool' }, - 'if': 'CONFIG_POSIX' } + 'data': { 'username': 'str', 'keys': ['str'], '*reset': 'bool' } +} ## # @guest-ssh-remove-authorized-keys: @@ -1617,8 +1616,8 @@ # Since: 5.2 ## { 'command': 'guest-ssh-remove-authorized-keys', - 'data': { 'username': 'str', 'keys': ['str'] }, - 'if': 'CONFIG_POSIX' } + 'data': { 'username': 'str', 'keys': ['str'] } +} ## # @GuestDiskStats:
Re: [RFC PATCH 00/12] SMMUv3 nested translation support
On 25.03.2024 at 11:13, Mostafa Saleh wrote: Currently, QEMU supports emulating either stage-1 or stage-2 SMMUs but not nested instances. This patch series adds support for nested translation in SMMUv3, this is controlled by property “arm-smmuv3.stage=nested”, and advertised to guests as (IDR0.S1P == 1 && IDR0.S2P == 1) From pure curiosity I applied the series, enabled the 'nested' stage in sbsa-ref, and ran (S)BSA ACS tests. Two more tests passed: the ones which check whether the SMMU supports both stage 1 and stage 2 at the same time. The fun part? Those tests only check SMMU registers.

Re: [RFC v2 2/5] hw/arm: Allow setting KVM vGIC maintenance IRQ
Hi Peter, On 3/5/24 17:46, Peter Maydell wrote: > On Fri, 9 Feb 2024 at 16:00, Eric Auger wrote: >> From: Haibo Xu >> >> Allow virt arm machine to set the intid for the KVM GIC maintenance >> interrupt. >> >> Signed-off-by: Haibo Xu >> Signed-off-by: Miguel Luis >> Signed-off-by: Eric Auger >> >> --- >> v1 -> v2: >> - [Miguel] replaced the has_virt_extensions by the maintenance irq >> intid property. [Eric] restored kvm_device_check_attr and >> kvm_device_access standard usage and conditionally call those >> if the prop is set Please forgive me for the delay. > This seems reasonable, but it's not the same way we opted to > handle telling the kernel the IRQ number for the PMU interrupt > (where we use kvm_arm_pmu_set_irq()). I guess we have to do > it this way because it's a device attr so we need to set it > in gic realize, though? This cannot follow the same pattern as kvm_arm_pmu_set_irq() because the maintenance irq must be set between the GICv3 KVM device creation and the KVM_DEV_ARM_VGIC_CTRL_INIT. The GICv3 realize function calls both, so I cannot set the maintenance irq after realize; it would fail with -EBUSY. Hope this helps. Thanks Eric > > By the way, does the kernel automatically complain and fail > if we try to enable nested-virt with a GICv2 or with a > userspace GIC, or do we need to catch and produce error > messages for those (invalid) combinations ourselves? > > thanks > -- PMM >
Re: [PATCH v3 1/2] Refactor common functions between POSIX and Windows implementation
Hi Aidan, On 22/3/24 18:46, aidan_le...@selinc.com wrote: From: Aidan Leuck Signed-off-by: Aidan Leuck --- qga/commands-posix-ssh.c | 47 + qga/commands-ssh-core.c | 57 qga/commands-ssh-core.h | 8 ++ qga/meson.build | 1 + 4 files changed, 67 insertions(+), 46 deletions(-) create mode 100644 qga/commands-ssh-core.c create mode 100644 qga/commands-ssh-core.h We already have: - commands-common.h - commands-posix-ssh.c what about using the same pattern? - commands-common-ssh.c diff --git a/qga/commands-posix-ssh.c b/qga/commands-posix-ssh.c index 236f80de44..9a71b109f9 100644 --- a/qga/commands-posix-ssh.c +++ b/qga/commands-posix-ssh.c @@ -9,6 +9,7 @@ #include #include +#include "commands-ssh-core.h" #include "qapi/error.h" #include "qga-qapi-commands.h" @@ -80,37 +81,6 @@ mkdir_for_user(const char *path, const struct passwd *p, return true; } -static bool -check_openssh_pub_key(const char *key, Error **errp) -{ -/* simple sanity-check, we may want more? */ -if (!key || key[0] == '#' || strchr(key, '\n')) { -error_setg(errp, "invalid OpenSSH public key: '%s'", key); -return false; -} - -return true; -} - -static bool -check_openssh_pub_keys(strList *keys, size_t *nkeys, Error **errp) -{ -size_t n = 0; -strList *k; - -for (k = keys; k != NULL; k = k->next) { -if (!check_openssh_pub_key(k->value, errp)) { -return false; -} -n++; -} - -if (nkeys) { -*nkeys = n; -} -return true; -} - static bool write_authkeys(const char *path, const GStrv keys, const struct passwd *p, Error **errp) @@ -139,21 +109,6 @@ write_authkeys(const char *path, const GStrv keys, return true; } -static GStrv -read_authkeys(const char *path, Error **errp) -{ -g_autoptr(GError) err = NULL; -g_autofree char *contents = NULL; - -if (!g_file_get_contents(path, &contents, NULL, &err)) { -error_setg(errp, "failed to read '%s': %s", path, err->message); -return NULL; -} - -return g_strsplit(contents, "\n", -1); - -} - void qmp_guest_ssh_add_authorized_keys(const char *username, strList *keys, bool 
has_reset, bool reset, diff --git a/qga/commands-ssh-core.c b/qga/commands-ssh-core.c new file mode 100644 index 00..f165c4a337 --- /dev/null +++ b/qga/commands-ssh-core.c @@ -0,0 +1,57 @@ +/* + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include +#include +#include "qapi/error.h" +#include "commands-ssh-core.h" + +GStrv read_authkeys(const char *path, Error **errp) +{ +g_autoptr(GError) err = NULL; +g_autofree char *contents = NULL; + +if (!g_file_get_contents(path, &contents, NULL, &err)) +{ Please keep QEMU style while moving the code, see https://www.qemu.org/docs/master/devel/submitting-a-patch.html#use-the-qemu-coding-style which explain how to run scripts/checkpatch.pl before posting your patches. Otherwise the refactor LGTM. Regards, Phil. +error_setg(errp, "failed to read '%s': %s", path, err->message); +return NULL; +} + +return g_strsplit(contents, "\n", -1); +}
Re: [PATCH v1] qapi: document parameters of query-cpu-model-* QAPI commands
David Hildenbrand writes: >>> # >>> # Usually, a CPU model is compared against the maximum possible CPU >>> # model of a certain configuration (e.g. the "host" model for KVM). >>> @@ -154,7 +155,14 @@ >>> # Some architectures may not support comparing CPU models. s390x >>> # supports comparing CPU models. >>> # >>> -# Returns: a CpuModelBaselineInfo >>> +# @modela: description of the first CPU model to compare, referred to as >>> +# "model A" in CpuModelCompareResult >>> +# >>> +# @modelb: description of the second CPU model to compare, referred to as >>> +# "model B" in CpuModelCompareResult >>> +# >>> +# Returns: a CpuModelCompareInfo, describing how both CPU models >> >> Scratch the comma? > > Agreed to both. Do you want to fixup when applying or do you prefer a v2? Happy to fix it up myself, thanks! > Thanks! > >> Reviewed-by: Markus Armbruster
Re: [RFC 4/8] virtio: Implement in-order handling for virtio devices
On 3/22/24 6:46 AM, Eugenio Perez Martin wrote: On Thu, Mar 21, 2024 at 4:57 PM Jonah Palmer wrote: Implements in-order handling for most virtio devices using the VIRTIO_F_IN_ORDER transport feature, specifically those who call virtqueue_push to push their used elements onto the used ring. The logic behind this implementation is as follows: 1.) virtqueue_pop always enqueues VirtQueueElements in-order. virtqueue_pop always retrieves one or more buffer descriptors in-order from the available ring and converts them into a VirtQueueElement. This means that the order in which VirtQueueElements are enqueued are in-order by default. By virtue, as VirtQueueElements are created, we can assign a sequential key value to them. This preserves the order of buffers that have been made available to the device by the driver. As VirtQueueElements are assigned a key value, the current sequence number is incremented. 2.) Requests can be completed out-of-order. While most devices complete requests in the same order that they were enqueued by default, some devices don't (e.g. virtio-blk). The goal of this out-of-order handling is to reduce the impact of devices that process elements in-order by default while also guaranteeing compliance with the VIRTIO_F_IN_ORDER feature. Below is the logic behind handling completed requests (which may or may not be in-order). 3.) Does the incoming used VirtQueueElement preserve the correct order? In other words, is the sequence number (key) assigned to the VirtQueueElement the expected number that would preserve the original order? 3a.) If it does... immediately push the used element onto the used ring. Then increment the next expected sequence number and check to see if any previous out-of-order VirtQueueElements stored on the hash table has a key that matches this next expected sequence number. 
For each VirtQueueElement found on the hash table with a matching key: push the element on the used ring, remove the key-value pair from the hash table, and then increment the next expected sequence number. Repeat this process until we're unable to find an element with a matching key. Note that if the device uses batching (e.g. virtio-net), then we skip the virtqueue_flush call and let the device call it themselves. 3b.) If it does not... stash the VirtQueueElement, along with relevant data, as a InOrderVQElement on the hash table. The key used is the order_key that was assigned when the VirtQueueElement was created. Signed-off-by: Jonah Palmer --- hw/virtio/virtio.c | 70 -- include/hw/virtio/virtio.h | 8 + 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 40124545d6..40e4377f1e 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -992,12 +992,56 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count) } } +void virtqueue_order_element(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len, unsigned int idx, + unsigned int count) +{ +InOrderVQElement *in_order_elem; + +if (elem->order_key == vq->current_order_idx) { +/* Element is in-order, push to used ring */ +virtqueue_fill(vq, elem, len, idx); + +/* Batching? Don't flush */ +if (count) { +virtqueue_flush(vq, count); The "count" parameter is the number of heads used, but here you're only using one head (elem). Same with the other virtqueue_flush in the function. True. This acts more as a flag than an actual count since, unless we're batching (which in the current setup, the device would explicitly call virtqueue_flush separately), this value will be either 0 or 1. Also, this function sometimes replaces virtqueue_fill and other replaces virtqueue_fill + virtqueue_flush (both examples in patch 6/8). 
I have the impression the series would be simpler if virtqueue_order_element is a static function just handling the virtio_vdev_has_feature(vq->vdev, VIRTIO_F_IN_ORDER) path of virtqueue_fill, so the caller does not need to know if the in_order feature is on or off. Originally I wanted this function to replace virtqueue_fill + virtqueue_flush but after looking at virtio_net_receive_rcu and vhost_svq_flush, where multiple virtqueue_fill's can be called before a single virtqueue_flush, I added this 'if (count)' conditional to handle both cases. I did consider virtqueue_order_element just handling the virtqueue_fill path but then I wasn't sure how to handle calling virtqueue_flush when retrieving out-of-order data from the hash table. For example, devices that call virtqueue_push would call virtqueue_fill and then virtqueue_flush afterwards. In the scenario where, say, elem1 was found out of order and put into the hash table, and then elem0 comes along. For elem0 we'd call virtqueue_fill and then we should call virtqueue_flush to keep the order going. Then we'd f
Re: [PATCH] meson: Fix MESONINTROSPECT parsing
25.03.2024 20:23, Peter Maydell : On Sat, 12 Aug 2023 at 13:10, Akihiko Odaki wrote: ... The problem is that Meson uses a different logic for escaping arguments in MESONINTROSPECT on Windows. I'll wait till Meson maintainers figure out how MESONINTROSPECT should be used. For details, see: https://github.com/mesonbuild/meson/pull/12115#issuecomment-1675863266 Am I correct in understanding from https://github.com/mesonbuild/meson/pull/12807 that the eventual resolution that Meson upstream decided was to restore the behaviour that regardless of platform the right way to split the file is shlex.split()? If so, then I think we should resurrect and apply this patch, since at the moment configuring QEMU fails if, for instance, the build tree directory name has a '~' character in it. Yes, meson upstream went for shlex.split() finally. However, this change is pretty recent (Feb 11); if we apply it, we have to upgrade meson too, which is not a good idea at this stage during RCs. The whole issue is very minor though. I think so far it only happened on debian due to its usage of the tilde (~) char in 9.0.0~rc0 (instead of a minus sign), because in debian, a tilde has special meaning in version strings: it sorts before anything else, even before 9.0.0 in this case. I don't think this issue is a problem in practice in any other context. So for now, I'll apply this patch to qemu in debian to let the RCs build; it won't be needed after the actual release, and we can rethink this for 9.1 or a later version. Thanks, /mjt
Re: [PATCH] meson: Fix MESONINTROSPECT parsing
On Sat, 12 Aug 2023 at 13:10, Akihiko Odaki wrote: > > On 2023/08/12 15:15, Akihiko Odaki wrote: > > The arguments in MESONINTROSPECT are quoted with shlex.quote() so it > > must be parsed with shlex.split(). > > > > Fixes: cf60ccc330 ("cutils: Introduce bundle mechanism") > > Reported-by: Michael Tokarev > > Signed-off-by: Akihiko Odaki > > --- > > scripts/symlink-install-tree.py | 3 ++- > > 1 file changed, 2 insertions(+), 1 deletion(-) > > > > diff --git a/scripts/symlink-install-tree.py > > b/scripts/symlink-install-tree.py > > index 8ed97e3c94..b72563895c 100644 > > --- a/scripts/symlink-install-tree.py > > +++ b/scripts/symlink-install-tree.py > > @@ -4,6 +4,7 @@ > > import errno > > import json > > import os > > +import shlex > > import subprocess > > import sys > > > > @@ -14,7 +15,7 @@ def destdir_join(d1: str, d2: str) -> str: > > return str(PurePath(d1, *PurePath(d2).parts[1:])) > > > > introspect = os.environ.get('MESONINTROSPECT') > > -out = subprocess.run([*introspect.split(' '), '--installed'], > > +out = subprocess.run([*shlex.split(introspect), '--installed'], > >stdout=subprocess.PIPE, check=True).stdout > > for source, dest in json.loads(out).items(): > > bundle_dest = destdir_join('qemu-bundle', dest) > > Please do NOT merge this. This will break Windows builds. I'm putting > this patch on hold. > > The problem is that Meson uses a different logic for escaping arguments > in MESONINTROSPECT on Windows. I'll wait till Meson maintainers figure > out how MESONINTROSPECT should be used. For details, see: > https://github.com/mesonbuild/meson/pull/12115#issuecomment-1675863266 Am I correct in understanding from https://github.com/mesonbuild/meson/pull/12807 that the eventual resolution that Meson upstream decided was to restore the behaviour that regardless of platform the right way to split the file is shlex.split()? 
If so, then I think we should resurrect and apply this patch, since at the moment configuring QEMU fails if, for instance, the build tree directory name has a '~' character in it. thanks -- PMM
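A minimal illustration of the difference the patch makes, using a hypothetical MESONINTROSPECT value: Meson quotes arguments with shlex.quote(), so a path containing a character like '~' arrives quoted, and splitting it with str.split(' ') leaves the quote characters embedded in the argument:

```python
import shlex

# Hypothetical MESONINTROSPECT value; '~' forces shlex.quote() to quote the path
introspect = "/usr/bin/meson introspect '/build/qemu-9.0.0~rc0/meson-private'"

naive = introspect.split(' ')      # keeps the literal quote characters
proper = shlex.split(introspect)   # unquotes the way a POSIX shell would

print(naive[2])   # '/build/qemu-9.0.0~rc0/meson-private' wrapped in stray quotes
print(proper[2])  # /build/qemu-9.0.0~rc0/meson-private
```

With the naive split, subprocess.run() receives an argument with literal quote characters in it, and meson introspect fails to find the build directory — which matches the reported configure failure on trees with '~' in the name.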
Re: [RFC 1/8] virtio: Define InOrderVQElement
On 3/22/24 5:45 AM, Eugenio Perez Martin wrote: On Thu, Mar 21, 2024 at 4:57 PM Jonah Palmer wrote: Define the InOrderVQElement structure for the VIRTIO_F_IN_ORDER transport feature implementation. The InOrderVQElement structure is used to encapsulate out-of-order VirtQueueElement data that was processed by the host. This data includes: - The processed VirtQueueElement (elem) - Length of data (len) - VirtQueueElement array index (idx) - Number of processed VirtQueueElements (count) InOrderVQElements will be stored in a buffering mechanism until an order can be achieved. Signed-off-by: Jonah Palmer --- include/hw/virtio/virtio.h | 7 +++ 1 file changed, 7 insertions(+) diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index b3c74a1bca..c8aa435a5e 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -77,6 +77,13 @@ typedef struct VirtQueueElement struct iovec *out_sg; } VirtQueueElement; +typedef struct InOrderVQElement { +const VirtQueueElement *elem; Some subsystems allocate space for extra elements after VirtQueueElement, like VirtIOBlockReq. You can request virtqueue_pop to allocate this extra space by its second argument. Would it work for this? I don't see why not. Although this may not be necessary due to me missing a key aspect mentioned in your comment below. +unsigned int len; +unsigned int idx; +unsigned int count; Now I don't get why these fields cannot be obtained from elem->(len, index, ndescs) ? Interesting. I didn't realize that these values are equivalent to a VirtQueueElement's len, index, and ndescs fields. Is this always true? Else I would've expected, for example, virtqueue_push to not need the 'unsigned int len' parameter if this information is already included via the VirtQueueElement being passed in. +} InOrderVQElement; + #define VIRTIO_QUEUE_MAX 1024 #define VIRTIO_NO_VECTOR 0x -- 2.39.3
Re: how do the iotests pick a machine model to run on ?
On 19/01/2024 17.18, Peter Maydell wrote: On Fri, 19 Jan 2024 at 15:26, Peter Maydell wrote: (Also, we should probably put an entry for sh4 in machine_map, because the default board type (shix) is about to be deprecated, and the r2d board type is thus a better choice.) The good news is if we add r2d to the machine_map, then only 3 iotests fail: 191 -- not sure exactly what's going on. QEMU complains "machine type does not support if=ide,bus=0,unit=1". Side note: the test harness seems to throw away the stderr from QEMU with this error message, leaving the test failure log rather uninformative. I had to run everything under strace to get hold of it. 203 -- this wants a machine type that can be migrated; sh4 CPUs don't support migration, so the test fails because the 'migrate' command returns the error {"error": {"class": "GenericError", "desc": "State blocked by non-migratable device 'cpu'"}} 267 -- similarly, wants a machine that supports snapshots, so fails when the loadvm/savevm get the error Error: State blocked by non-migratable device 'cpu' How should a test indicate "I need a machine type that supports migration" ? We could maybe add a flag to the machine_map to indicate whether the machine is capable of migration or not. In the latter case, we could skip all tests that are in the "migration" group ? Thomas
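One possible shape for the flag Thomas suggests — pairing each machine_map fallback with a migration-capability flag so migration-group tests can be skipped automatically. This is a hypothetical sketch; the real iotests machine_map is a different structure and the key names here are invented:

```python
# Hypothetical machine_map extension: (machine type, capabilities) per arch.
# 'migration': False would let the harness skip tests in the migration group.
machine_map = {
    'sh4': ('r2d',  {'migration': False}),   # sh4 CPUs don't support migration
    'arm': ('virt', {'migration': True}),
}

def pick_machine(arch):
    """Return the fallback machine and whether migration tests may run."""
    machine, caps = machine_map.get(arch, (None, {'migration': True}))
    return machine, caps['migration']

print(pick_machine('sh4'))  # ('r2d', False) -> skip 203/267-style tests
```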
Re: [RFC 0/8] virtio,vhost: Add VIRTIO_F_IN_ORDER support
On 3/22/24 7:18 AM, Eugenio Perez Martin wrote: On Thu, Mar 21, 2024 at 4:57 PM Jonah Palmer wrote: The goal of these patches is to add support to a variety of virtio and vhost devices for the VIRTIO_F_IN_ORDER transport feature. This feature indicates that all buffers are used by the device in the same order in which they were made available by the driver. These patches attempt to implement a generalized, non-device-specific solution to support this feature. The core feature behind this solution is a buffer mechanism in the form of GLib's GHashTable. The decision behind using a hash table was to leverage its ability for quick lookup, insertion, and removal operations. Given that our keys are simply numbers of an ordered sequence, a hash table seemed like the best choice for a buffer mechanism. - The strategy behind this implementation is as follows: We know that buffers that are popped from the available ring and enqueued for further processing will always be done in the same order in which they were made available by the driver. Given this, we can note their order by assigning the resulting VirtQueueElement a key. This key is a number in a sequence that represents the order in which they were popped from the available ring, relative to the other VirtQueueElements. For example, given 3 "elements" that were popped from the available ring, we assign a key value to them which represents their order (elem0 is popped first, then elem1, then lastly elem2): elem2 -- elem1 -- elem0 ---> Enqueue for processing (key: 2)(key: 1)(key: 0) Then these elements are enqueued for further processing by the host. While most devices will return these completed elements in the same order in which they were enqueued, some devices may not (e.g. virtio-blk). To guarantee that these elements are put on the used ring in the same order in which they were enqueued, we can use a buffering mechanism that keeps track of the next expected sequence number of an element.
In other words, if the completed element does not have a key value that matches the next expected sequence number, then we know this element is not in-order and we must stash it away in a hash table until an order can be made. The element's key value is used as the key for placing it in the hash table. If the completed element has a key value that matches the next expected sequence number, then we know this element is in-order and we can push it on the used ring. Then we increment the next expected sequence number and check if the hash table contains an element at this key location. If so, we retrieve this element, push it to the used ring, delete the key-value pair from the hash table, increment the next expected sequence number, and check the hash table again for an element at this new key location. This process is repeated until we're unable to find an element in the hash table to continue the order. So, for example, say the 3 elements we enqueued were completed in the following order: elem1, elem2, elem0. The next expected sequence number is 0:
exp-seq-num = 0: elem1 (key: 1) --> elem1.key == exp-seq-num ? --> No, stash it; table: {1: elem1}
exp-seq-num = 0: elem2 (key: 2) --> elem2.key == exp-seq-num ? --> No, stash it; table: {1: elem1, 2: elem2}
exp-seq-num = 0: elem0 (key: 0) --> elem0.key == exp-seq-num ? --> Yes, push to used ring
exp-seq-num = 1: lookup(table, exp-seq-num) != NULL ? --> Yes, push to used ring, remove elem from table; table: {2: elem2}
exp-seq-num = 2: lookup(table, exp-seq-num) != NULL ? --> Yes, push to used ring, remove elem from table; table: {}
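The stash-and-drain logic traced above can be sketched in a few lines (illustrative Python, not the QEMU GHashTable implementation; names are invented):

```python
def flush_in_order(completions, exp):
    """Reorder (key, elem) completions onto the used ring.

    completions: iterable of (key, elem) in device-completion order
    exp: next expected sequence number
    """
    table = {}       # stand-in for the GHashTable buffer
    used_ring = []
    for key, elem in completions:
        if key != exp:
            table[key] = elem          # out of order: stash it
            continue
        used_ring.append(elem)         # in order: push to used ring
        exp += 1
        while exp in table:            # drain any stashed successors
            used_ring.append(table.pop(exp))
            exp += 1
    return used_ring

# Completed as elem1, elem2, elem0 -- used ring still comes out in order
print(flush_in_order([(1, 'elem1'), (2, 'elem2'), (0, 'elem0')], 0))
# ['elem0', 'elem1', 'elem2']
```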
Re: [PULL v2 0/7] target-arm queue
On Mon, 25 Mar 2024 at 14:19, Peter Maydell wrote: > > > v2: added a missing #include qemu/error-report.h which only causes > build failure in some configs, not all. > > The following changes since commit 853546f8128476eefb701d4a55b2781bb3a46faa: > > Merge tag 'pull-loongarch-20240322' of https://gitlab.com/gaosong/qemu into > staging (2024-03-22 10:59:57 +) > > are available in the Git repository at: > > https://git.linaro.org/people/pmaydell/qemu-arm.git > tags/pull-target-arm-20240325-1 > > for you to fetch changes up to fe3e38390126c2202292911c49d46fc7ee4a163a: > > tests/qtest/libqtest.c: Check for g_setenv() failure (2024-03-25 14:17:07 > +) > > > target-arm queue: > * Fixes for seven minor coverity issues > Applied, thanks. Please update the changelog at https://wiki.qemu.org/ChangeLog/9.0 for any user-visible changes. -- PMM
Re: [PULL 0/9] Patches for QEMU 9.0-rc1
On Mon, 25 Mar 2024 at 14:12, Thomas Huth wrote: > > The following changes since commit 853546f8128476eefb701d4a55b2781bb3a46faa: > > Merge tag 'pull-loongarch-20240322' of https://gitlab.com/gaosong/qemu into > staging (2024-03-22 10:59:57 +) > > are available in the Git repository at: > > https://gitlab.com/thuth/qemu.git tags/pull-request-2024-03-25 > > for you to fetch changes up to f9b29c636442e917a56a725d774ea99be3b28111: > > tests/tcg/s390x: Test TEST AND SET (2024-03-25 15:05:59 +0100) > > > * Fix timeouts in Travis-CI jobs > * Mark devices with user_creatable = false that can crash QEMU otherwise > * Fix s390x TEST-AND-SET TCG instruction emulation > * Move pc955* devices to hw/gpio/ > Applied, thanks. Please update the changelog at https://wiki.qemu.org/ChangeLog/9.0 for any user-visible changes. -- PMM
Re: [PULL 0/3] Migration 20240322 patches
On Fri, 22 Mar 2024 at 16:15, wrote: > > From: Peter Xu > > The following changes since commit 853546f8128476eefb701d4a55b2781bb3a46faa: > > Merge tag 'pull-loongarch-20240322' of https://gitlab.com/gaosong/qemu into > staging (2024-03-22 10:59:57 +) > > are available in the Git repository at: > > https://gitlab.com/peterx/qemu.git tags/migration-20240322-pull-request > > for you to fetch changes up to 8fa1a21c6edc2bf7de85984944848ab9ac49e937: > > migration/multifd: Fix clearing of mapped-ram zero pages (2024-03-22 > 12:12:08 -0400) > > > Migration pull for 9.0-rc1 > > - Fabiano's patch to revert fd: support on mapped-ram > - Peter's fix on postcopy regression on unnecessary dirty syncs > - Fabiano's fix on mapped-ram rare corrupt on zero page handling > > Applied, thanks. Please update the changelog at https://wiki.qemu.org/ChangeLog/9.0 for any user-visible changes. -- PMM
Re: [PATCH] qapi/block-core: improve Qcow2OverlapCheckFlags documentation
On 25.03.24 16:04, Markus Armbruster wrote: Vladimir Sementsov-Ogievskiy writes: Most of fields have no description at all. Let's fix that. Still, no reason to place here more detailed descriptions of what these structures are, as we have public Qcow2 format specification. Signed-off-by: Vladimir Sementsov-Ogievskiy --- qapi/block-core.json | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 1874f880a8..b9fb994a66 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3403,14 +3403,31 @@ # @Qcow2OverlapCheckFlags: # # Structure of flags for each metadata structure. Setting a field to -# 'true' makes qemu guard that structure against unintended -# overwriting. The default value is chosen according to the template -# given. +# 'true' makes qemu guard that Qcow2 format structure against Mind if I use the occasion to correct the spelling of QEMU? No problem, thanks for fixing -- Best regards, Vladimir
Re: [PATCH v1] qapi: document parameters of query-cpu-model-* QAPI commands
# # Usually, a CPU model is compared against the maximum possible CPU # model of a certain configuration (e.g. the "host" model for KVM). @@ -154,7 +155,14 @@ # Some architectures may not support comparing CPU models. s390x # supports comparing CPU models. # -# Returns: a CpuModelBaselineInfo +# @modela: description of the first CPU model to compare, referred to as +# "model A" in CpuModelCompareResult +# +# @modelb: description of the second CPU model to compare, referred to as +# "model B" in CpuModelCompareResult +# +# Returns: a CpuModelCompareInfo, describing how both CPU models Scratch the comma? Agreed to both. Do you want to fixup when applying or do you prefer a v2? Thanks! Reviewed-by: Markus Armbruster -- Cheers, David / dhildenb
Re: [PATCH trivial for-9.0] hw/i386/fw_cfg.c: fix non-legacy smbios build
On Mon, 25 Mar 2024 14:40:21 +0100 Philippe Mathieu-Daudé wrote: > Hi Michael, > > On 25/3/24 14:09, Michael Tokarev wrote: > > When building qemu with smbios but not legacy mode (eg minimal microvm > > build), > > link fails with: > > > >hw/i386/fw_cfg.c:74: undefined reference to `smbios_get_table_legacy' > > > > This is because fw_cfg interface can call this function if CONFIG_SMBIOS > > is defined. Made this code block to depend on CONFIG_SMBIOS_LEGACY. > > > > Fixes: b42b0e4daaa5 "smbios: build legacy mode code only for 'pc' machine" > > Signed-off-by: Michael Tokarev > > --- > > hw/i386/fw_cfg.c | 2 ++ > > 1 file changed, 2 insertions(+) > > > > diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c > > index d802d2787f..d5e78a9183 100644 > > --- a/hw/i386/fw_cfg.c > > +++ b/hw/i386/fw_cfg.c > > @@ -70,6 +70,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState > > *fw_cfg, > > /* tell smbios about cpuid version and features */ > > smbios_set_cpuid(cpu->env.cpuid_version, > > cpu->env.features[FEAT_1_EDX]); > > > > +#ifdef CONFIG_SMBIOS_LEGACY > > if (pcmc->smbios_legacy_mode) { > > But then having pcmc->smbios_legacy_mode == true without > CONFIG_SMBIOS_LEGACY would be a bug. IMHO what we want is: > > -- >8 -- > diff --git a/hw/smbios/smbios_legacy_stub.c b/hw/smbios/smbios_legacy_stub.c > index f29b15316c..7d593dca98 100644 > --- a/hw/smbios/smbios_legacy_stub.c > +++ b/hw/smbios/smbios_legacy_stub.c > @@ -13,3 +13,8 @@ > void smbios_add_usr_blob_size(size_t size) > { > } > + > +uint8_t *smbios_get_table_legacy(size_t *length, Error **errp) > +{ > +g_assert_not_reached(); > +} Agreed Philippe, shall I post a patch or will you do this? 
> --- > > > smbios_tables = smbios_get_table_legacy(&smbios_tables_len, > > &error_fatal); > > @@ -77,6 +78,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState > > *fw_cfg, > >smbios_tables, smbios_tables_len); > > return; > > } > > +#endif > > > > /* build the array of physical mem area from e820 table */ > > mem_array = g_malloc0(sizeof(*mem_array) * e820_get_num_entries()); >
Re: [PATCH v1] coroutine: avoid inserting duplicate coroutine to co_queue_wakeup
I've seen (and agree with) Stefan's reply that a more thorough audit is needed, but am still providing a preliminary review based on what I see here. On Mon, Mar 25, 2024 at 05:18:50PM +0800, zhuyangyang via wrote: > If g_main_loop_run()/aio_poll() is called in the coroutine context, > the pending coroutine may be woken up repeatedly, and the co_queue_wakeup > may be disordered. coroutine context should not be doing anything that can block; you may have uncovered a bigger bug that we need to address. > > When the poll() syscall exited in g_main_loop_run()/aio_poll(), it means > some listened events is completed. Therefore, the completion callback > function is dispatched. > > If this callback function needs to invoke aio_co_enter(), it will only > wake up the coroutine (because we are already in coroutine context), > which may cause that the data on this listening event_fd/socket_fd > is not read/cleared. When the next poll () exits, it will be woken up again > and inserted into the wakeup queue again. > > For example, if TLS is enabled in NBD, the server will call g_main_loop_run() > in the coroutine, and repeatedly wake up the io_read event on a socket. 
> The call stack is as follows: > > aio_co_enter() > aio_co_wake() > qio_channel_restart_read() > aio_dispatch_handler() > aio_dispatch_handlers() > aio_dispatch() > aio_ctx_dispatch() > g_main_context_dispatch() > g_main_loop_run() > nbd_negotiate_handle_starttls() > nbd_negotiate_options() > nbd_negotiate() > nbd_co_client_start() > coroutine_trampoline() > > Signed-off-by: zhuyangyang > --- > util/async.c | 13 - > 1 file changed, 12 insertions(+), 1 deletion(-) > > diff --git a/util/async.c b/util/async.c > index 0467890052..25fc1e6083 100644 > --- a/util/async.c > +++ b/util/async.c > @@ -705,7 +705,18 @@ void aio_co_enter(AioContext *ctx, Coroutine *co) > if (qemu_in_coroutine()) { > Coroutine *self = qemu_coroutine_self(); > assert(self != co); > -QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next); > +/* > + * If the Coroutine *co is already in the co_queue_wakeup, this > + * repeated insertion will causes the loss of other queue element s/causes/cause/ > + * or infinite loop. > + * For examplex: s/examplex/example/ > + * Head->a->b->c->NULL, after insert_tail(head, b) => > Head->a->b->NULL > + * Head->a-b>->NULL, after insert_tail(head, b) => Head->a->b->b... s/b>->/b->/ > + */ > +if (!co->co_queue_next.sqe_next && > +self->co_queue_wakeup.sqh_last != &co->co_queue_next.sqe_next) { > +QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next); > +} > } else { > qemu_aio_coroutine_enter(ctx, co); > } Intuitively, attacking the symptoms (avoiding bogus list insertion when it is already on the list) makes sense; but I want to make sure we attack the root cause. -- Eric Blake, Principal Software Engineer Red Hat, Inc. Virtualization: qemu.org | libguestfs.org
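The list corruption the patch's comment describes can be reproduced with a small model of a singly linked tail queue. This is an illustrative sketch, not QEMU's QSIMPLEQ macros: inserting a node that is already the tail makes it point at itself, so the next wakeup loop never terminates:

```python
class Node:
    def __init__(self, name):
        self.name = name
        self.next = None

class SimpleQ:
    """Toy model of a singly linked queue with a tail pointer."""
    def __init__(self):
        self.first = None
        self.last = None   # tail node, or None when empty

    def insert_tail(self, node):
        # Like QSIMPLEQ_INSERT_TAIL, there is no check that the node
        # is not already on the queue.
        node.next = None
        if self.last is None:
            self.first = node
        else:
            self.last.next = node
        self.last = node

def walk(q, limit=6):
    out, n = [], q.first
    while n is not None and len(out) < limit:
        out.append(n.name)
        n = n.next
    return out

a, b = Node('a'), Node('b')
q = SimpleQ()
q.insert_tail(a)
q.insert_tail(b)
q.insert_tail(b)   # duplicate wakeup: b re-inserted while still queued
print(walk(q))     # ['a', 'b', 'b', 'b', 'b', 'b'] -- b now links to itself
```

Re-inserting a node from the middle of the queue instead silently drops its successors (node.next is overwritten with None), which is the "loss of other queue element" case in the comment.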
Re: [PATCH] qapi/block-core: improve Qcow2OverlapCheckFlags documentation
Squashing in diff --git a/qapi/pragma.json b/qapi/pragma.json index 99e4052ab3..9e28de1721 100644 --- a/qapi/pragma.json +++ b/qapi/pragma.json @@ -72,7 +72,6 @@ 'QCryptoAkCipherKeyType', 'QCryptodevBackendServiceType', 'QKeyCode', -'Qcow2OverlapCheckFlags', 'RbdAuthMode', 'RbdImageEncryptionFormat', 'String',
Re: [PATCH v1] coroutine: avoid inserting duplicate coroutine to co_queue_wakeup
On Mon, Mar 25, 2024 at 05:18:50PM +0800, zhuyangyang wrote: > If g_main_loop_run()/aio_poll() is called in the coroutine context, > the pending coroutine may be woken up repeatedly, and the co_queue_wakeup > may be disordered. aio_poll() must not be called from coroutine context: bool no_coroutine_fn aio_poll(AioContext *ctx, bool blocking); ^^^ Coroutines are not supposed to block. Instead, they should yield. > When the poll() syscall exited in g_main_loop_run()/aio_poll(), it means > some listened events is completed. Therefore, the completion callback > function is dispatched. > > If this callback function needs to invoke aio_co_enter(), it will only > wake up the coroutine (because we are already in coroutine context), > which may cause that the data on this listening event_fd/socket_fd > is not read/cleared. When the next poll () exits, it will be woken up again > and inserted into the wakeup queue again. > > For example, if TLS is enabled in NBD, the server will call g_main_loop_run() > in the coroutine, and repeatedly wake up the io_read event on a socket. > The call stack is as follows: > > aio_co_enter() > aio_co_wake() > qio_channel_restart_read() > aio_dispatch_handler() > aio_dispatch_handlers() > aio_dispatch() > aio_ctx_dispatch() > g_main_context_dispatch() > g_main_loop_run() > nbd_negotiate_handle_starttls() This code does not look like it was designed to run in coroutine context. Two options: 1. Don't run it in coroutine context (e.g. use a BH instead). This avoids blocking the coroutine but calling g_main_loop_run() is still ugly, in my opinion. 2. Get rid of data.loop and use coroutine APIs instead: while (!data.complete) { qemu_coroutine_yield(); } and update nbd_tls_handshake() to call aio_co_wake(data->co) instead of g_main_loop_quit(data->loop). This requires auditing the code to check whether the event loop might invoke something that interferes with nbd_negotiate_handle_starttls(). 
Typically this means monitor commands or fd activity that could change the state of this connection while it is yielded. This is where the real work is but hopefully it will not be that hard to figure out. > nbd_negotiate_options() > nbd_negotiate() > nbd_co_client_start() > coroutine_trampoline() > > Signed-off-by: zhuyangyang > --- > util/async.c | 13 - > 1 file changed, 12 insertions(+), 1 deletion(-) > > diff --git a/util/async.c > b/util/async.c > index 0467890052..25fc1e6083 100644 > --- a/util/async.c > +++ b/util/async.c > @@ -705,7 +705,18 @@ void aio_co_enter(AioContext *ctx, Coroutine *co) > if (qemu_in_coroutine()) { > Coroutine *self = qemu_coroutine_self(); > assert(self != co); > -QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next); > +/* > + * If the Coroutine *co is already in the co_queue_wakeup, this > + * repeated insertion will causes the loss of other queue element > + * or infinite loop. > + * For examplex: > + * Head->a->b->c->NULL, after insert_tail(head, b) => > Head->a->b->NULL > + * Head->a-b>->NULL, after insert_tail(head, b) => Head->a->b->b... > + */ > +if (!co->co_queue_next.sqe_next && > +self->co_queue_wakeup.sqh_last != &co->co_queue_next.sqe_next) { > +QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next); > +} > } else { > qemu_aio_coroutine_enter(ctx, co); > } > -- > 2.33.0
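Stefan's second option — yield until the handshake callback wakes the coroutine, instead of nesting g_main_loop_run() — can be modeled with a Python generator. This is a rough sketch only: qemu_coroutine_yield() is modeled by `yield`, aio_co_wake() by the event loop calling next() after the callback fires, and all names in `state` are invented:

```python
def starttls_coroutine(state):
    """Model of the suggested fix: yield until the TLS handshake
    callback marks completion, rather than running a nested loop."""
    while not state['complete']:
        yield                          # qemu_coroutine_yield()
    state['log'].append('handshake done: %s' % state['result'])

state = {'complete': False, 'result': None, 'log': []}
co = starttls_coroutine(state)
next(co)                    # coroutine starts, yields waiting for callback
next(co)                    # event loop re-enters; still incomplete, yields
state['complete'] = True    # nbd_tls_handshake() callback fires...
state['result'] = 'ok'      # ...and would call aio_co_wake(data->co)
try:
    next(co)                # woken: loop exits, coroutine finishes
except StopIteration:
    pass
print(state['log'])         # ['handshake done: ok']
```

The audit Stefan describes corresponds to making sure nothing else mutates `state` (the connection) between the yields.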
Re: [PATCH] qapi: document leftover members in qapi/stats.json
Squashing in diff --git a/qapi/pragma.json b/qapi/pragma.json index 1a302981c1..99e4052ab3 100644 --- a/qapi/pragma.json +++ b/qapi/pragma.json @@ -75,8 +75,6 @@ 'Qcow2OverlapCheckFlags', 'RbdAuthMode', 'RbdImageEncryptionFormat', -'StatsFilter', -'StatsValue', 'String', 'StringWrapper', 'SysEmuTarget', @@ -91,8 +89,7 @@ 'query-cpu-model-comparison', 'query-cpu-model-expansion', 'query-rocker', -'query-rocker-ports', -'query-stats-schemas' ], +'query-rocker-ports' ], # Externally visible types whose member names may use uppercase 'member-name-exceptions': [ # visible in: 'ACPISlotType', # query-acpi-ospm-status
Re: [PATCH] qapi: document leftover members in qapi/run-state.json
Squashing in diff --git a/qapi/pragma.json b/qapi/pragma.json index 92715d22b3..1a302981c1 100644 --- a/qapi/pragma.json +++ b/qapi/pragma.json @@ -57,7 +57,6 @@ 'DummyForceArrays', 'DummyVirtioForceArrays', 'GrabToggleKeys', -'GuestPanicInformationHyperV', 'HotKeyMod', 'ImageInfoSpecificKind', 'InputAxis', @@ -93,8 +92,7 @@ 'query-cpu-model-expansion', 'query-rocker', 'query-rocker-ports', -'query-stats-schemas', -'watchdog-set-action' ], +'query-stats-schemas' ], # Externally visible types whose member names may use uppercase 'member-name-exceptions': [ # visible in: 'ACPISlotType', # query-acpi-ospm-status
[PATCH] tests/qemu-iotests: Test 157 and 227 require virtio-blk
Tests 157 and 227 use the virtio-blk device, so we have to mark these tests accordingly to be skipped if this devices is not available (e.g. when running the tests with qemu-system-avr only). Signed-off-by: Thomas Huth --- tests/qemu-iotests/157 | 2 ++ tests/qemu-iotests/227 | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/qemu-iotests/157 b/tests/qemu-iotests/157 index 0dc9ba68d2..aa2ebbfb4b 100755 --- a/tests/qemu-iotests/157 +++ b/tests/qemu-iotests/157 @@ -40,6 +40,8 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt generic _supported_proto file +_require_devices virtio-blk + do_run_qemu() { ( diff --git a/tests/qemu-iotests/227 b/tests/qemu-iotests/227 index 7e45a47ac6..eddaad679e 100755 --- a/tests/qemu-iotests/227 +++ b/tests/qemu-iotests/227 @@ -40,6 +40,8 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt generic _supported_proto file +_require_devices virtio-blk + do_run_qemu() { echo Testing: "$@" -- 2.44.0
Re: [PATCH] qapi: document InputMultiTouchType
Squashing in diff --git a/qapi/pragma.json b/qapi/pragma.json index 6929ab776e..92715d22b3 100644 --- a/qapi/pragma.json +++ b/qapi/pragma.json @@ -62,8 +62,6 @@ 'ImageInfoSpecificKind', 'InputAxis', 'InputButton', -'InputMultiTouchEvent', -'InputMultiTouchType', 'IscsiHeaderDigest', 'IscsiTransport', 'JSONType',
Re: [PATCH v1] qapi: document parameters of query-cpu-model-* QAPI commands
David Hildenbrand writes: > Let's document the parameters of these commands, so we can remove them > from the "documentation-exceptions" list. > > While at it, extend the "Returns:" documentation as well, fixing a wrong > use of CpuModelBaselineInfo vs. CpuModelCompareInfo for > query-cpu-model-comparison. > > Cc: Markus Armbruster > Cc: Eric Blake > Cc: Eduardo Habkost > Cc: Marcel Apfelbaum > Cc: "Philippe Mathieu-Daudé" > Cc: Yanan Wang > Signed-off-by: David Hildenbrand > --- > qapi/machine-target.json | 46 +++- > qapi/pragma.json | 3 --- > 2 files changed, 31 insertions(+), 18 deletions(-) > > diff --git a/qapi/machine-target.json b/qapi/machine-target.json > index 519adf3220..7883616cce 100644 > --- a/qapi/machine-target.json > +++ b/qapi/machine-target.json > @@ -124,11 +124,12 @@ > ## > # @query-cpu-model-comparison: > # > -# Compares two CPU models, returning how they compare in a specific > -# configuration. The results indicates how both models compare > -# regarding runnability. This result can be used by tooling to make > -# decisions if a certain CPU model will run in a certain configuration > -# or if a compatible CPU model has to be created by baselining. > +# Compares two CPU models, @modela and @modelb, returning how they > +# compare in a specific configuration. The results indicates how > +# both models compare regarding runnability. This result can be > +# used by tooling to make decisions if a certain CPU model will > +# run in a certain configuration or if a compatible CPU model has > +# to be created by baselining. > # > # Usually, a CPU model is compared against the maximum possible CPU > # model of a certain configuration (e.g. the "host" model for KVM). > @@ -154,7 +155,14 @@ > # Some architectures may not support comparing CPU models. s390x > # supports comparing CPU models. 
> # > -# Returns: a CpuModelBaselineInfo > +# @modela: description of the first CPU model to compare, referred to as > +# "model A" in CpuModelCompareResult > +# > +# @modelb: description of the second CPU model to compare, referred to as > +# "model B" in CpuModelCompareResult > +# > +# Returns: a CpuModelCompareInfo, describing how both CPU models Scratch the comma? > +# compare > # > # Errors: > # - if comparing CPU models is not supported > @@ -175,9 +183,9 @@ > ## > # @query-cpu-model-baseline: > # > -# Baseline two CPU models, creating a compatible third model. The > -# created model will always be a static, migration-safe CPU model (see > -# "static" CPU model expansion for details). > +# Baseline two CPU models, @modela and @modelb, creating a compatible > +# third model. The created model will always be a static, > +# migration-safe CPU model (see "static" CPU model expansion for details). > # > # This interface can be used by tooling to create a compatible CPU > # model out two CPU models. The created CPU model will be identical > @@ -204,7 +212,11 @@ > # Some architectures may not support baselining CPU models. s390x > # supports baselining CPU models. > # > -# Returns: a CpuModelBaselineInfo > +# @modela: description of the first CPU model to baseline > +# > +# @modelb: description of the second CPU model to baseline > +# > +# Returns: a CpuModelBaselineInfo, describing the baselined CPU model Scratch the comma? > # > # Errors: > # - if baselining CPU models is not supported > @@ -243,10 +255,10 @@ > ## > # @query-cpu-model-expansion: > # > -# Expands a given CPU model (or a combination of CPU model + > -# additional options) to different granularities, allowing tooling to > -# get an understanding what a specific CPU model looks like in QEMU > -# under a certain configuration. 
> +# Expands a given CPU model, @model, (or a combination of CPU model + > +# additional options) to different granularities, specified by > +# @type, allowing tooling to get an understanding what a specific > +# CPU model looks like in QEMU under a certain configuration. > # > # This interface can be used to query the "host" CPU model. > # > @@ -269,7 +281,11 @@ > # Some architectures may not support all expansion types. s390x > # supports "full" and "static". Arm only supports "full". > # > -# Returns: a CpuModelExpansionInfo > +# @model: description of the CPU model to expand > +# > +# @type: expansion type, specifying how to expand the CPU model > +# > +# Returns: a CpuModelExpansionInfo, describing the expanded CPU model > # > # Errors: > # - if expanding CPU models is not supported > diff --git a/qapi/pragma.json b/qapi/pragma.json > index 6929ab776e..0d82bc1a03 100644 > --- a/qapi/pragma.json > +++ b/qapi/pragma.json > @@ -90,9 +90,6 @@ > 'XDbgBlockGraph', > 'YankInstanceType', > 'blockdev-reopen', > -'query-cpu-model-baseline', > -'query-cpu-model-comparison', > -'query-cpu-model-expansion', > 'query-rocker', > 'query-
[PATCH 0/4] linux-user: Fix shmat(NULL) for h != g
Hi, I noticed that while shmat() now works with /proc/self/maps, shmat(NULL) got broken. This series fixes that along with two related strace issues, and adds a test. Best regards, Ilya Ilya Leoshkevich (4): linux-user: Fix semctl() strace linux-user: Fix shmat() strace linux-user: Fix shmat(NULL) for h != g tests/tcg: Check that shmat(NULL) linux-user/mmap.c| 2 +- linux-user/strace.c | 10 +++--- tests/tcg/multiarch/linux/linux-shmat-null.c | 38 3 files changed, 43 insertions(+), 7 deletions(-) create mode 100644 tests/tcg/multiarch/linux/linux-shmat-null.c -- 2.44.0
[PATCH 2/4] linux-user: Fix shmat() strace
The indices of arguments passed to print_shmat() are all off-by-1,
because arg1 is the ipc() command. Fix them.

New output for linux-shmat-maps test:

  3501769 shmat(4784214,0x0080,SHM_RND) = 0

Fixes: 9f7c97324c27 ("linux-user: Add strace for shmat")
Signed-off-by: Ilya Leoshkevich
---
 linux-user/strace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linux-user/strace.c b/linux-user/strace.c
index 9be71af4016..3b4ccd9fa04 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -703,7 +703,7 @@ print_ipc(CPUArchState *cpu_env, const struct syscallname *name,
         break;
     case IPCOP_shmat:
         print_shmat(cpu_env, &(const struct syscallname){ .name = "shmat" },
-                    arg1, arg4, arg2, 0, 0, 0);
+                    arg2, arg5, arg3, 0, 0, 0);
         break;
     default:
         qemu_log(("%s("
--
2.44.0
[PATCH 1/4] linux-user: Fix semctl() strace
The indices of arguments used with semctl() are all off-by-1, because
arg1 is the ipc() command. Fix them. While at it, reuse print_semctl().

New output (for a small test program):

  3540333 semctl(999,888,SEM_INFO,0x7fe5051ee9a0) = -1 errno=14 (Bad address)

Fixes: 7ccfb2eb5f9d ("Fix warnings that would be caused by gcc flag -Wwrite-strings")
Signed-off-by: Ilya Leoshkevich
---
 linux-user/strace.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/linux-user/strace.c b/linux-user/strace.c
index 9934e2208e2..9be71af4016 100644
--- a/linux-user/strace.c
+++ b/linux-user/strace.c
@@ -657,7 +657,7 @@ print_newselect(CPUArchState *cpu_env, const struct syscallname *name,
 }
 #endif

-#ifdef TARGET_NR_semctl
+#if defined(TARGET_NR_semctl) || defined(TARGET_NR_ipc)
 static void
 print_semctl(CPUArchState *cpu_env, const struct syscallname *name,
              abi_long arg1, abi_long arg2, abi_long arg3,
@@ -698,10 +698,8 @@ print_ipc(CPUArchState *cpu_env, const struct syscallname *name,
 {
     switch(arg1) {
     case IPCOP_semctl:
-        qemu_log("semctl(" TARGET_ABI_FMT_ld "," TARGET_ABI_FMT_ld ",",
-                 arg1, arg2);
-        print_ipc_cmd(arg3);
-        qemu_log(",0x" TARGET_ABI_FMT_lx ")", arg4);
+        print_semctl(cpu_env, &(const struct syscallname){ .name = "semctl" },
+                     arg2, arg3, arg4, arg5, 0, 0);
         break;
     case IPCOP_shmat:
         print_shmat(cpu_env, &(const struct syscallname){ .name = "shmat" },
--
2.44.0
[PATCH 4/4] tests/tcg: Test shmat(NULL)
Add a small test to prevent regressions.

Signed-off-by: Ilya Leoshkevich
---
 tests/tcg/multiarch/linux/linux-shmat-null.c | 38 ++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 tests/tcg/multiarch/linux/linux-shmat-null.c

diff --git a/tests/tcg/multiarch/linux/linux-shmat-null.c b/tests/tcg/multiarch/linux/linux-shmat-null.c
new file mode 100644
index 000..94eaaec371a
--- /dev/null
+++ b/tests/tcg/multiarch/linux/linux-shmat-null.c
@@ -0,0 +1,38 @@
+/*
+ * Test shmat(NULL).
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+
+int main(void)
+{
+    int shmid;
+    char *p;
+    int err;
+
+    /* Create, attach and initialize shared memory. */
+    shmid = shmget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
+    assert(shmid != -1);
+    p = shmat(shmid, NULL, 0);
+    assert(p != (void *)-1);
+    *p = 42;
+
+    /* Reattach, check that the value is still there. */
+    err = shmdt(p);
+    assert(err == 0);
+    p = shmat(shmid, NULL, 0);
+    assert(p != (void *)-1);
+    assert(*p == 42);
+
+    /* Detach. */
+    err = shmdt(p);
+    assert(err == 0);
+    err = shmctl(shmid, IPC_RMID, NULL);
+    assert(err == 0);
+
+    return EXIT_SUCCESS;
+}
--
2.44.0
[PATCH 3/4] linux-user: Fix shmat(NULL) for h != g
In the h != g && shmaddr == NULL && !reserved_va case, target_shmat()
incorrectly mmap()s the initial anonymous range with
MAP_FIXED_NOREPLACE, even though the earlier mmap_find_vma() has
already reserved the respective address range.

Fix by using MAP_FIXED when "mapped", which is set after
mmap_find_vma(), is true.

Fixes: 78bc8ed9a8f0 ("linux-user: Rewrite target_shmat")
Signed-off-by: Ilya Leoshkevich
---
 linux-user/mmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linux-user/mmap.c b/linux-user/mmap.c
index e88faf1ab3d..681b6db1b67 100644
--- a/linux-user/mmap.c
+++ b/linux-user/mmap.c
@@ -1358,7 +1358,7 @@ abi_ulong target_shmat(CPUArchState *cpu_env, int shmid,
     if (h_len != t_len) {
         int mmap_p = PROT_READ | (shmflg & SHM_RDONLY ? 0 : PROT_WRITE);
         int mmap_f = MAP_PRIVATE | MAP_ANONYMOUS
-                   | (reserved_va || (shmflg & SHM_REMAP)
+                   | (reserved_va || mapped || (shmflg & SHM_REMAP)
                      ? MAP_FIXED : MAP_FIXED_NOREPLACE);
         test = mmap(want, m_len, mmap_p, mmap_f, -1, 0);
--
2.44.0
Re: [PATCH trivial for-9.0] hw/i386/fw_cfg.c: fix non-legacy smbios build
On Mon, 25 Mar 2024 16:09:20 +0300
Michael Tokarev wrote:

> When building qemu with smbios but not legacy mode (eg minimal microvm
> build), link fails with:
>
>   hw/i386/fw_cfg.c:74: undefined reference to `smbios_get_table_legacy'
>
> This is because the fw_cfg interface can call this function if
> CONFIG_SMBIOS is defined. Make this code block depend on
> CONFIG_SMBIOS_LEGACY.
>
> Fixes: b42b0e4daaa5 "smbios: build legacy mode code only for 'pc' machine"
> Signed-off-by: Michael Tokarev

hmm, it looks like MICROVM doesn't select SMBIOS nor FW_CFG_DMA, which
looks broken to me. Does the following help:

diff --git a/hw/i386/Kconfig b/hw/i386/Kconfig
index a6ee052f9a..54c77b5bcc 100644
--- a/hw/i386/Kconfig
+++ b/hw/i386/Kconfig
@@ -119,6 +119,8 @@ config MICROVM
     select PCI_EXPRESS_GENERIC_BRIDGE
     select USB_XHCI_SYSBUS
     select I8254
+    select SMBIOS
+    select FW_CFG_DMA

 config X86_IOMMU
     bool

> ---
>  hw/i386/fw_cfg.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
> index d802d2787f..d5e78a9183 100644
> --- a/hw/i386/fw_cfg.c
> +++ b/hw/i386/fw_cfg.c
> @@ -70,6 +70,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
>      /* tell smbios about cpuid version and features */
>      smbios_set_cpuid(cpu->env.cpuid_version, cpu->env.features[FEAT_1_EDX]);
>
> +#ifdef CONFIG_SMBIOS_LEGACY
>      if (pcmc->smbios_legacy_mode) {
>          smbios_tables = smbios_get_table_legacy(&smbios_tables_len,
>                                                  &error_fatal);
> @@ -77,6 +78,7 @@ void fw_cfg_build_smbios(PCMachineState *pcms, FWCfgState *fw_cfg,
>          smbios_tables, smbios_tables_len);
>          return;
>      }
> +#endif
>
>      /* build the array of physical mem area from e820 table */
>      mem_array = g_malloc0(sizeof(*mem_array) * e820_get_num_entries());
[PATCH-for-9.0 v3 2/3] hw/misc/stm32l4x5_rcc: Inline clock_update() in clock_mux_update()
Trivial inlining in preliminary patch to make the next one easier to
review.

Signed-off-by: Philippe Mathieu-Daudé
---
 hw/misc/stm32l4x5_rcc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/misc/stm32l4x5_rcc.c b/hw/misc/stm32l4x5_rcc.c
index bc2d63528b..49b90afdf0 100644
--- a/hw/misc/stm32l4x5_rcc.c
+++ b/hw/misc/stm32l4x5_rcc.c
@@ -48,6 +48,8 @@ static void clock_mux_update(RccClockMuxState *mux, bool bypass_source)
     uint64_t src_freq;
     Clock *current_source = mux->srcs[mux->src];
     uint32_t freq_multiplier = 0;
+    bool clk_changed = false;
+
     /*
      * To avoid rounding errors, we use the clock period instead of the
      * frequency.
@@ -60,7 +62,10 @@ static void clock_mux_update(RccClockMuxState *mux, bool bypass_source)
     }

     clock_set_mul_div(mux->out, freq_multiplier, mux->multiplier);
-    clock_update(mux->out, clock_get(current_source));
+    clk_changed |= clock_set(mux->out, clock_get(current_source));
+    if (clk_changed) {
+        clock_propagate(mux->out);
+    }

     src_freq = clock_get_hz(current_source);
     /* TODO: can we simply detect if the config changed so that we reduce
      * log spam ? */
--
2.41.0
[PATCH-for-9.0 v3 1/3] hw/clock: Let clock_set_mul_div() return a boolean value
Let clock_set_mul_div() return a boolean value indicating whether the
clock has been updated or not, similarly to clock_set(). Return early
when clock_set_mul_div() is called with the same mul/div values the
clock already has.

Acked-by: Luc Michel
Signed-off-by: Philippe Mathieu-Daudé
---
 docs/devel/clocks.rst | 4 ++++
 include/hw/clock.h    | 4 +++-
 hw/core/clock.c       | 8 +++++++-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/docs/devel/clocks.rst b/docs/devel/clocks.rst
index c4d14bde04..b2d1148cdb 100644
--- a/docs/devel/clocks.rst
+++ b/docs/devel/clocks.rst
@@ -279,6 +279,10 @@ You can change the multiplier and divider of a clock at runtime, so you
 can use this to model clock controller devices which have
 guest-programmable frequency multipliers or dividers.

+Similarly to ``clock_set()``, ``clock_set_mul_div()`` returns ``true`` if
+the clock state was modified; that is, if the multiplier or the divider
+or both were changed by the call.
+
 Note that ``clock_set_mul_div()`` does not automatically call
 ``clock_propagate()``. If you make a runtime change to the multiplier
 or divider you must call clock_propagate() yourself.
diff --git a/include/hw/clock.h b/include/hw/clock.h
index bb12117f67..eb58599131 100644
--- a/include/hw/clock.h
+++ b/include/hw/clock.h
@@ -357,6 +357,8 @@ char *clock_display_freq(Clock *clk);
  * @multiplier: multiplier value
  * @divider: divider value
  *
+ * @return: true if the clock is changed.
+ *
  * By default, a Clock's children will all run with the same period
  * as their parent. This function allows you to adjust the multiplier
  * and divider used to derive the child clock frequency.
@@ -374,6 +376,6 @@ char *clock_display_freq(Clock *clk);
  * Note that this function does not call clock_propagate(); the
  * caller should do that if necessary.
  */
-void clock_set_mul_div(Clock *clk, uint32_t multiplier, uint32_t divider);
+bool clock_set_mul_div(Clock *clk, uint32_t multiplier, uint32_t divider);

 #endif /* QEMU_HW_CLOCK_H */
diff --git a/hw/core/clock.c b/hw/core/clock.c
index d82e44cd1a..a19c7db7df 100644
--- a/hw/core/clock.c
+++ b/hw/core/clock.c
@@ -143,14 +143,20 @@ char *clock_display_freq(Clock *clk)
     return freq_to_str(clock_get_hz(clk));
 }

-void clock_set_mul_div(Clock *clk, uint32_t multiplier, uint32_t divider)
+bool clock_set_mul_div(Clock *clk, uint32_t multiplier, uint32_t divider)
 {
     assert(divider != 0);

+    if (clk->multiplier == multiplier && clk->divider == divider) {
+        return false;
+    }
+
     trace_clock_set_mul_div(CLOCK_PATH(clk), clk->multiplier, multiplier,
                             clk->divider, divider);
     clk->multiplier = multiplier;
     clk->divider = divider;
+
+    return true;
 }

 static void clock_initfn(Object *obj)
--
2.41.0
[PATCH-for-9.0 v3 0/3] hw/clock: Propagate clock changes when STM32L4X5 MUX is updated
Since v2:
- Simpler approach

Since v1:
- Rework API to only propagate when both clock_set and
  clock_set_mul_div modified the clock params (Peter & Luc).
- Use that in zynq_slcr.

Per https://www.qemu.org/docs/master/devel/clocks.html#clock-multiplier-and-divider-settings:

  Note that clock_set_mul_div() does not automatically call
  clock_propagate(). If you make a runtime change to the multiplier or
  divider you must call clock_propagate() yourself.

Fix what we forgot to do in recent commit ec7d83acbd
("hw/misc/stm32l4x5_rcc: Add an internal clock multiplexer object").

Arnaud Minier (1):
  hw/misc/stm32l4x5_rcc: Propagate period when enabling a clock

Philippe Mathieu-Daudé (2):
  hw/clock: Let clock_set_mul_div() return a boolean value
  hw/misc/stm32l4x5_rcc: Inline clock_update() in clock_mux_update()

 docs/devel/clocks.rst   | 4 ++++
 include/hw/clock.h      | 4 +++-
 hw/core/clock.c         | 8 +++++++-
 hw/misc/stm32l4x5_rcc.c | 9 +++++++--
 4 files changed, 21 insertions(+), 4 deletions(-)

--
2.41.0