date:20211209

[PATCH v11 02/77] target/riscv: Use FIELD_EX32() to extract wd field

2021-12-09 Thread frank . chang

From: Frank Chang 

Signed-off-by: Frank Chang 
Reviewed-by: Richard Henderson 
Reviewed-by: Alistair Francis 
---
 target/riscv/vector_helper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 12c31aa4b4..70f589813e 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -98,7 +98,7 @@ static inline uint32_t vext_lmul(uint32_t desc)
 
 static uint32_t vext_wd(uint32_t desc)
 {
-return (simd_data(desc) >> 11) & 0x1;
+return FIELD_EX32(simd_data(desc), VDATA, WD);
 }
 
 /*
-- 
2.31.1

[PATCH v11 01/77] target/riscv: drop vector 0.7.1 and add 1.0 support

2021-12-09 Thread frank . chang

From: Frank Chang 

Signed-off-by: Frank Chang 
Reviewed-by: Richard Henderson 
Reviewed-by: Alistair Francis 
Reviewed-by: Bin Meng 
---
 target/riscv/cpu.c | 16 
 target/riscv/cpu.h |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 9835829588..728092f78c 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -523,7 +523,7 @@ static void riscv_cpu_realize(DeviceState *dev, Error 
**errp)
 ext |= RVH;
 }
 if (cpu->cfg.ext_v) {
-int vext_version = VEXT_VERSION_0_07_1;
+int vext_version = VEXT_VERSION_1_00_0;
 ext |= RVV;
 if (!is_power_of_2(cpu->cfg.vlen)) {
 error_setg(errp,
@@ -548,8 +548,8 @@ static void riscv_cpu_realize(DeviceState *dev, Error 
**errp)
 return;
 }
 if (cpu->cfg.vext_spec) {
-if (!g_strcmp0(cpu->cfg.vext_spec, "v0.7.1")) {
-vext_version = VEXT_VERSION_0_07_1;
+if (!g_strcmp0(cpu->cfg.vext_spec, "v1.0")) {
+vext_version = VEXT_VERSION_1_00_0;
 } else {
 error_setg(errp,
"Unsupported vector spec version '%s'",
@@ -558,7 +558,7 @@ static void riscv_cpu_realize(DeviceState *dev, Error 
**errp)
 }
 } else {
 qemu_log("vector version is not specified, "
-"use the default value v0.7.1\n");
+ "use the default value v1.0\n");
 }
 set_vext_version(env, vext_version);
 }
@@ -626,6 +626,7 @@ static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("c", RISCVCPU, cfg.ext_c, true),
 DEFINE_PROP_BOOL("s", RISCVCPU, cfg.ext_s, true),
 DEFINE_PROP_BOOL("u", RISCVCPU, cfg.ext_u, true),
+DEFINE_PROP_BOOL("v", RISCVCPU, cfg.ext_v, false),
 DEFINE_PROP_BOOL("Counters", RISCVCPU, cfg.ext_counters, true),
 DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
 DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
@@ -635,6 +636,9 @@ static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true),
 
 DEFINE_PROP_STRING("priv_spec", RISCVCPU, cfg.priv_spec),
+DEFINE_PROP_STRING("vext_spec", RISCVCPU, cfg.vext_spec),
+DEFINE_PROP_UINT16("vlen", RISCVCPU, cfg.vlen, 128),
+DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64),
 
 /* These are experimental so mark with 'x-' */
 DEFINE_PROP_BOOL("x-zba", RISCVCPU, cfg.ext_zba, false),
@@ -643,10 +647,6 @@ static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("x-zbs", RISCVCPU, cfg.ext_zbs, false),
 DEFINE_PROP_BOOL("x-h", RISCVCPU, cfg.ext_h, false),
 DEFINE_PROP_BOOL("x-j", RISCVCPU, cfg.ext_j, false),
-DEFINE_PROP_BOOL("x-v", RISCVCPU, cfg.ext_v, false),
-DEFINE_PROP_STRING("vext_spec", RISCVCPU, cfg.vext_spec),
-DEFINE_PROP_UINT16("vlen", RISCVCPU, cfg.vlen, 128),
-DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64),
 /* ePMP 0.9.3 */
 DEFINE_PROP_BOOL("x-epmp", RISCVCPU, cfg.epmp, false),
 
diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index ef677f9092..5ea2004ae0 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -81,7 +81,7 @@ enum {
 #define PRIV_VERSION_1_10_0 0x00011000
 #define PRIV_VERSION_1_11_0 0x00011100
 
-#define VEXT_VERSION_0_07_1 0x0701
+#define VEXT_VERSION_1_00_0 0x0001
 
 enum {
 TRANSLATE_SUCCESS,
-- 
2.31.1

Re: [PATCH 3/4] Move CONFIG_XFS handling to meson.build

2021-12-09 Thread Thomas Huth


On 02/11/2021 12.34, Paolo Bonzini wrote:

On 28/10/21 20:59, Thomas Huth wrote:

Checking for xfsctl() can be done more easily in meson.build. Also,
this is not a "real" feature like the other features that we provide
with the "--enable-xxx" and "--disable-xxx" switches for the
configure script, since this does not influence lots of code (it's
only about one call to xfsctl() in file-posix.c), so people don't
gain much with the ability to disable this with "--disable-xfsctl".
Let's rather treat this like the other cc.has_function() checks in
meson.build, i.e. don't add a new option for this in meson_options.txt.

Signed-off-by: Thomas Huth 


I think we should just use ioctl and copy the relevant definitions from Linux:

struct dioattr {
     u32   d_mem;  /* data buffer memory alignment */
     u32   d_miniosz;  /* min xfer size    */
     u32   d_maxiosz;  /* max xfer size    */
};

#define XFS_IOC_DIOINFO    _IOR ('X', 30, struct dioattr)


I've now had a closer look at this idea, but it's getting messy: We'd 
additionally also need the platform_test_xfs_fd() function that is called 
from file-posix.c ... sure it's not big, but the XFS header stuff is 
licensed as LGPL, so it feels wrong to copy this over into file-posix.c that 
has a MIT license. Of course, it could be rewritten, or put into a separate 
file ... but that is already way more cumbersome for such a small benefit. 
So I think I prefer to rather keep my patch in the current shape that has a 
way nicer diffstat with way less risk of messing things up here.


 Thomas

[PATCH v6 8/8] target/riscv: zfh: add Zfhmin cpu property

2021-12-09 Thread frank . chang

From: Frank Chang 

Signed-off-by: Frank Chang 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index 0f808a5bee..9835829588 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -630,6 +630,7 @@ static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
 DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
 DEFINE_PROP_BOOL("Zfh", RISCVCPU, cfg.ext_zfh, false),
+DEFINE_PROP_BOOL("Zfhmin", RISCVCPU, cfg.ext_zfhmin, false),
 DEFINE_PROP_BOOL("mmu", RISCVCPU, cfg.mmu, true),
 DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true),
 
-- 
2.31.1

[PATCH v6 4/8] target/riscv: zfh: half-precision floating-point compare

2021-12-09 Thread frank . chang

From: Kito Cheng 

Signed-off-by: Kito Cheng 
Signed-off-by: Chih-Min Chao 
Signed-off-by: Frank Chang 
Reviewed-by: Richard Henderson 
Reviewed-by: Alistair Francis 
---
 target/riscv/fpu_helper.c | 21 +
 target/riscv/helper.h |  3 ++
 target/riscv/insn32.decode|  3 ++
 target/riscv/insn_trans/trans_rvzfh.c.inc | 37 +++
 4 files changed, 64 insertions(+)

diff --git a/target/riscv/fpu_helper.c b/target/riscv/fpu_helper.c
index 2ed9b03193..ec2009ee65 100644
--- a/target/riscv/fpu_helper.c
+++ b/target/riscv/fpu_helper.c
@@ -461,6 +461,27 @@ uint64_t helper_fsqrt_h(CPURISCVState *env, uint64_t rs1)
 return nanbox_h(float16_sqrt(frs1, >fp_status));
 }
 
+target_ulong helper_fle_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return float16_le(frs1, frs2, >fp_status);
+}
+
+target_ulong helper_flt_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return float16_lt(frs1, frs2, >fp_status);
+}
+
+target_ulong helper_feq_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return float16_eq_quiet(frs1, frs2, >fp_status);
+}
+
 target_ulong helper_fcvt_w_h(CPURISCVState *env, uint64_t rs1)
 {
 float16 frs1 = check_nanbox_h(rs1);
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index b50672d168..9c89521d4a 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -74,6 +74,9 @@ DEF_HELPER_FLAGS_3(fdiv_h, TCG_CALL_NO_RWG, i64, env, i64, 
i64)
 DEF_HELPER_FLAGS_3(fmin_h, TCG_CALL_NO_RWG, i64, env, i64, i64)
 DEF_HELPER_FLAGS_3(fmax_h, TCG_CALL_NO_RWG, i64, env, i64, i64)
 DEF_HELPER_FLAGS_2(fsqrt_h, TCG_CALL_NO_RWG, i64, env, i64)
+DEF_HELPER_FLAGS_3(fle_h, TCG_CALL_NO_RWG, tl, env, i64, i64)
+DEF_HELPER_FLAGS_3(flt_h, TCG_CALL_NO_RWG, tl, env, i64, i64)
+DEF_HELPER_FLAGS_3(feq_h, TCG_CALL_NO_RWG, tl, env, i64, i64)
 DEF_HELPER_FLAGS_2(fcvt_s_h, TCG_CALL_NO_RWG, i64, env, i64)
 DEF_HELPER_FLAGS_2(fcvt_h_s, TCG_CALL_NO_RWG, i64, env, i64)
 DEF_HELPER_FLAGS_2(fcvt_d_h, TCG_CALL_NO_RWG, i64, env, i64)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index ba40f3e7f8..3906c9fb20 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -751,6 +751,9 @@ fcvt_d_h   011  00010 . ... . 1010011 @r2_rm
 fcvt_w_h   1100010  0 . ... . 1010011 @r2_rm
 fcvt_wu_h  1100010  1 . ... . 1010011 @r2_rm
 fmv_x_h1110010  0 . 000 . 1010011 @r2
+feq_h  1010010  . . 010 . 1010011 @r
+flt_h  1010010  . . 001 . 1010011 @r
+fle_h  1010010  . . 000 . 1010011 @r
 fcvt_h_w   1101010  0 . ... . 1010011 @r2_rm
 fcvt_h_wu  1101010  1 . ... . 1010011 @r2_rm
 fmv_h_x010  0 . 000 . 1010011 @r2
diff --git a/target/riscv/insn_trans/trans_rvzfh.c.inc 
b/target/riscv/insn_trans/trans_rvzfh.c.inc
index d125025766..8d0959a667 100644
--- a/target/riscv/insn_trans/trans_rvzfh.c.inc
+++ b/target/riscv/insn_trans/trans_rvzfh.c.inc
@@ -335,6 +335,43 @@ static bool trans_fcvt_h_d(DisasContext *ctx, arg_fcvt_h_d 
*a)
 return true;
 }
 
+static bool trans_feq_h(DisasContext *ctx, arg_feq_h *a)
+{
+REQUIRE_FPU;
+REQUIRE_ZFH(ctx);
+
+TCGv dest = dest_gpr(ctx, a->rd);
+
+gen_helper_feq_h(dest, cpu_env, cpu_fpr[a->rs1], cpu_fpr[a->rs2]);
+gen_set_gpr(ctx, a->rd, dest);
+return true;
+}
+
+static bool trans_flt_h(DisasContext *ctx, arg_flt_h *a)
+{
+REQUIRE_FPU;
+REQUIRE_ZFH(ctx);
+
+TCGv dest = dest_gpr(ctx, a->rd);
+
+gen_helper_flt_h(dest, cpu_env, cpu_fpr[a->rs1], cpu_fpr[a->rs2]);
+gen_set_gpr(ctx, a->rd, dest);
+
+return true;
+}
+
+static bool trans_fle_h(DisasContext *ctx, arg_fle_h *a)
+{
+REQUIRE_FPU;
+REQUIRE_ZFH(ctx);
+
+TCGv dest = dest_gpr(ctx, a->rd);
+
+gen_helper_fle_h(dest, cpu_env, cpu_fpr[a->rs1], cpu_fpr[a->rs2]);
+gen_set_gpr(ctx, a->rd, dest);
+return true;
+}
+
 static bool trans_fcvt_w_h(DisasContext *ctx, arg_fcvt_w_h *a)
 {
 REQUIRE_FPU;
-- 
2.31.1

[PATCH v6 6/8] target/riscv: zfh: add Zfh cpu property

2021-12-09 Thread frank . chang

From: Frank Chang 

Signed-off-by: Frank Chang 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index f812998123..0f808a5bee 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -629,6 +629,7 @@ static Property riscv_cpu_properties[] = {
 DEFINE_PROP_BOOL("Counters", RISCVCPU, cfg.ext_counters, true),
 DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
 DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
+DEFINE_PROP_BOOL("Zfh", RISCVCPU, cfg.ext_zfh, false),
 DEFINE_PROP_BOOL("mmu", RISCVCPU, cfg.mmu, true),
 DEFINE_PROP_BOOL("pmp", RISCVCPU, cfg.pmp, true),
 
-- 
2.31.1

[PATCH v6 7/8] target/riscv: zfh: implement zfhmin extension

2021-12-09 Thread frank . chang

From: Frank Chang 

Zfhmin extension is a subset of Zfh extension, consisting only of data
transfer and conversion instructions.

If enabled, only the following instructions from Zfh extension are
included:
  * flh, fsh, fmv.x.h, fmv.h.x, fcvt.s.h, fcvt.h.s
  * If D extension is present: fcvt.d.h, fcvt.h.d

Signed-off-by: Frank Chang 
Reviewed-by: Richard Henderson 
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.h|  1 +
 target/riscv/insn_trans/trans_rvzfh.c.inc | 22 ++
 target/riscv/translate.c  |  2 ++
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 570c49f365..ef677f9092 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -313,6 +313,7 @@ struct RISCVCPU {
 bool ext_ifencei;
 bool ext_icsr;
 bool ext_zfh;
+bool ext_zfhmin;
 
 char *priv_spec;
 char *user_spec;
diff --git a/target/riscv/insn_trans/trans_rvzfh.c.inc 
b/target/riscv/insn_trans/trans_rvzfh.c.inc
index 0549e25fb4..5a7cac8958 100644
--- a/target/riscv/insn_trans/trans_rvzfh.c.inc
+++ b/target/riscv/insn_trans/trans_rvzfh.c.inc
@@ -22,13 +22,19 @@
 } \
 } while (0)
 
+#define REQUIRE_ZFH_OR_ZFHMIN(ctx) do {   \
+if (!(ctx->ext_zfh || ctx->ext_zfhmin)) { \
+return false; \
+} \
+} while (0)
+
 static bool trans_flh(DisasContext *ctx, arg_flh *a)
 {
 TCGv_i64 dest;
 TCGv t0;
 
 REQUIRE_FPU;
-REQUIRE_ZFH(ctx);
+REQUIRE_ZFH_OR_ZFHMIN(ctx);
 
 t0 = get_gpr(ctx, a->rs1, EXT_NONE);
 if (a->imm) {
@@ -50,7 +56,7 @@ static bool trans_fsh(DisasContext *ctx, arg_fsh *a)
 TCGv t0;
 
 REQUIRE_FPU;
-REQUIRE_ZFH(ctx);
+REQUIRE_ZFH_OR_ZFHMIN(ctx);
 
 t0 = get_gpr(ctx, a->rs1, EXT_NONE);
 if (a->imm) {
@@ -283,7 +289,7 @@ static bool trans_fmax_h(DisasContext *ctx, arg_fmax_h *a)
 static bool trans_fcvt_s_h(DisasContext *ctx, arg_fcvt_s_h *a)
 {
 REQUIRE_FPU;
-REQUIRE_ZFH(ctx);
+REQUIRE_ZFH_OR_ZFHMIN(ctx);
 
 gen_set_rm(ctx, a->rm);
 gen_helper_fcvt_s_h(cpu_fpr[a->rd], cpu_env, cpu_fpr[a->rs1]);
@@ -296,7 +302,7 @@ static bool trans_fcvt_s_h(DisasContext *ctx, arg_fcvt_s_h 
*a)
 static bool trans_fcvt_d_h(DisasContext *ctx, arg_fcvt_d_h *a)
 {
 REQUIRE_FPU;
-REQUIRE_ZFH(ctx);
+REQUIRE_ZFH_OR_ZFHMIN(ctx);
 REQUIRE_EXT(ctx, RVD);
 
 gen_set_rm(ctx, a->rm);
@@ -311,7 +317,7 @@ static bool trans_fcvt_d_h(DisasContext *ctx, arg_fcvt_d_h 
*a)
 static bool trans_fcvt_h_s(DisasContext *ctx, arg_fcvt_h_s *a)
 {
 REQUIRE_FPU;
-REQUIRE_ZFH(ctx);
+REQUIRE_ZFH_OR_ZFHMIN(ctx);
 
 gen_set_rm(ctx, a->rm);
 gen_helper_fcvt_h_s(cpu_fpr[a->rd], cpu_env, cpu_fpr[a->rs1]);
@@ -324,7 +330,7 @@ static bool trans_fcvt_h_s(DisasContext *ctx, arg_fcvt_h_s 
*a)
 static bool trans_fcvt_h_d(DisasContext *ctx, arg_fcvt_h_d *a)
 {
 REQUIRE_FPU;
-REQUIRE_ZFH(ctx);
+REQUIRE_ZFH_OR_ZFHMIN(ctx);
 REQUIRE_EXT(ctx, RVD);
 
 gen_set_rm(ctx, a->rm);
@@ -441,7 +447,7 @@ static bool trans_fcvt_h_wu(DisasContext *ctx, 
arg_fcvt_h_wu *a)
 static bool trans_fmv_x_h(DisasContext *ctx, arg_fmv_x_h *a)
 {
 REQUIRE_FPU;
-REQUIRE_ZFH(ctx);
+REQUIRE_ZFH_OR_ZFHMIN(ctx);
 
 TCGv dest = dest_gpr(ctx, a->rd);
 
@@ -461,7 +467,7 @@ static bool trans_fmv_x_h(DisasContext *ctx, arg_fmv_x_h *a)
 static bool trans_fmv_h_x(DisasContext *ctx, arg_fmv_h_x *a)
 {
 REQUIRE_FPU;
-REQUIRE_ZFH(ctx);
+REQUIRE_ZFH_OR_ZFHMIN(ctx);
 
 TCGv t0 = get_gpr(ctx, a->rs1, EXT_ZERO);
 
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index 93f9ec0c8b..d445954dc7 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -74,6 +74,7 @@ typedef struct DisasContext {
 bool virt_enabled;
 bool ext_ifencei;
 bool ext_zfh;
+bool ext_zfhmin;
 bool hlsx;
 /* vector extension */
 bool vill;
@@ -644,6 +645,7 @@ static void riscv_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cs)
 ctx->frm = -1;  /* unknown rounding mode */
 ctx->ext_ifencei = cpu->cfg.ext_ifencei;
 ctx->ext_zfh = cpu->cfg.ext_zfh;
+ctx->ext_zfhmin = cpu->cfg.ext_zfhmin;
 ctx->vlen = cpu->cfg.vlen;
 ctx->mstatus_hs_fs = FIELD_EX32(tb_flags, TB_FLAGS, MSTATUS_HS_FS);
 ctx->hlsx = FIELD_EX32(tb_flags, TB_FLAGS, HLSX);
-- 
2.31.1

[PATCH v6 1/8] target/riscv: zfh: half-precision load and store

2021-12-09 Thread frank . chang

From: Kito Cheng 

Signed-off-by: Kito Cheng 
Signed-off-by: Chih-Min Chao 
Signed-off-by: Frank Chang 
Reviewed-by: Richard Henderson 
---
 target/riscv/cpu.h|  1 +
 target/riscv/insn32.decode|  4 ++
 target/riscv/insn_trans/trans_rvzfh.c.inc | 65 +++
 target/riscv/translate.c  |  8 +++
 4 files changed, 78 insertions(+)
 create mode 100644 target/riscv/insn_trans/trans_rvzfh.c.inc

diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 0760c0af93..570c49f365 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -312,6 +312,7 @@ struct RISCVCPU {
 bool ext_counters;
 bool ext_ifencei;
 bool ext_icsr;
+bool ext_zfh;
 
 char *priv_spec;
 char *user_spec;
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 2f251dac1b..b36a3d8dbf 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -726,3 +726,7 @@ binv   0110100 .. 001 . 0110011 @r
 binvi  01101. ... 001 . 0010011 @sh
 bset   0010100 .. 001 . 0110011 @r
 bseti  00101. ... 001 . 0010011 @sh
+
+# *** RV32 Zfh Extension ***
+flh   . 001 . 111 @i
+fsh...  . . 001 . 0100111 @s
diff --git a/target/riscv/insn_trans/trans_rvzfh.c.inc 
b/target/riscv/insn_trans/trans_rvzfh.c.inc
new file mode 100644
index 00..dad1d703d7
--- /dev/null
+++ b/target/riscv/insn_trans/trans_rvzfh.c.inc
@@ -0,0 +1,65 @@
+/*
+ * RISC-V translation routines for the RV64Zfh Standard Extension.
+ *
+ * Copyright (c) 2020 Chih-Min Chao, chihmin.c...@sifive.com
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2 or later, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define REQUIRE_ZFH(ctx) do { \
+if (!ctx->ext_zfh) {  \
+return false; \
+} \
+} while (0)
+
+static bool trans_flh(DisasContext *ctx, arg_flh *a)
+{
+TCGv_i64 dest;
+TCGv t0;
+
+REQUIRE_FPU;
+REQUIRE_ZFH(ctx);
+
+t0 = get_gpr(ctx, a->rs1, EXT_NONE);
+if (a->imm) {
+TCGv temp = temp_new(ctx);
+tcg_gen_addi_tl(temp, t0, a->imm);
+t0 = temp;
+}
+
+dest = cpu_fpr[a->rd];
+tcg_gen_qemu_ld_i64(dest, t0, ctx->mem_idx, MO_TEUW);
+gen_nanbox_h(dest, dest);
+
+mark_fs_dirty(ctx);
+return true;
+}
+
+static bool trans_fsh(DisasContext *ctx, arg_fsh *a)
+{
+TCGv t0;
+
+REQUIRE_FPU;
+REQUIRE_ZFH(ctx);
+
+t0 = get_gpr(ctx, a->rs1, EXT_NONE);
+if (a->imm) {
+TCGv temp = tcg_temp_new();
+tcg_gen_addi_tl(temp, t0, a->imm);
+t0 = temp;
+}
+
+tcg_gen_qemu_st_i64(cpu_fpr[a->rs2], t0, ctx->mem_idx, MO_TEUW);
+
+return true;
+}
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index 1d57bc97b5..bea87b31b5 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -73,6 +73,7 @@ typedef struct DisasContext {
 RISCVMXL ol;
 bool virt_enabled;
 bool ext_ifencei;
+bool ext_zfh;
 bool hlsx;
 /* vector extension */
 bool vill;
@@ -134,6 +135,11 @@ static void gen_nanbox_s(TCGv_i64 out, TCGv_i64 in)
 tcg_gen_ori_i64(out, in, MAKE_64BIT_MASK(32, 32));
 }
 
+static void gen_nanbox_h(TCGv_i64 out, TCGv_i64 in)
+{
+tcg_gen_ori_i64(out, in, MAKE_64BIT_MASK(16, 48));
+}
+
 /*
  * A narrow n-bit operation, where n < FLEN, checks that input operands
  * are correctly Nan-boxed, i.e., all upper FLEN - n bits are 1.
@@ -574,6 +580,7 @@ static uint32_t opcode_at(DisasContextBase *dcbase, 
target_ulong pc)
 #include "insn_trans/trans_rvh.c.inc"
 #include "insn_trans/trans_rvv.c.inc"
 #include "insn_trans/trans_rvb.c.inc"
+#include "insn_trans/trans_rvzfh.c.inc"
 #include "insn_trans/trans_privileged.c.inc"
 
 /* Include the auto-generated decoder for 16 bit insn */
@@ -626,6 +633,7 @@ static void riscv_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cs)
 ctx->misa_ext = env->misa_ext;
 ctx->frm = -1;  /* unknown rounding mode */
 ctx->ext_ifencei = cpu->cfg.ext_ifencei;
+ctx->ext_zfh = cpu->cfg.ext_zfh;
 ctx->vlen = cpu->cfg.vlen;
 ctx->mstatus_hs_fs = FIELD_EX32(tb_flags, TB_FLAGS, MSTATUS_HS_FS);
 ctx->hlsx = FIELD_EX32(tb_flags, TB_FLAGS, HLSX);
-- 
2.31.1

[PATCH v6 2/8] target/riscv: zfh: half-precision computational

2021-12-09 Thread frank . chang

From: Kito Cheng 

Signed-off-by: Kito Cheng 
Signed-off-by: Chih-Min Chao 
Signed-off-by: Frank Chang 
Reviewed-by: Richard Henderson 
Reviewed-by: Alistair Francis 
---
 target/riscv/fpu_helper.c |  86 +++
 target/riscv/helper.h |  13 +++
 target/riscv/insn32.decode|  11 ++
 target/riscv/insn_trans/trans_rvzfh.c.inc | 129 ++
 target/riscv/internals.h  |  16 +++
 5 files changed, 255 insertions(+)

diff --git a/target/riscv/fpu_helper.c b/target/riscv/fpu_helper.c
index d62f470900..20bb89ad14 100644
--- a/target/riscv/fpu_helper.c
+++ b/target/riscv/fpu_helper.c
@@ -81,6 +81,15 @@ void helper_set_rounding_mode(CPURISCVState *env, uint32_t 
rm)
 set_float_rounding_mode(softrm, >fp_status);
 }
 
+static uint64_t do_fmadd_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2,
+   uint64_t rs3, int flags)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+float16 frs3 = check_nanbox_h(rs3);
+return nanbox_h(float16_muladd(frs1, frs2, frs3, flags, >fp_status));
+}
+
 static uint64_t do_fmadd_s(CPURISCVState *env, uint64_t rs1, uint64_t rs2,
uint64_t rs3, int flags)
 {
@@ -102,6 +111,12 @@ uint64_t helper_fmadd_d(CPURISCVState *env, uint64_t frs1, 
uint64_t frs2,
 return float64_muladd(frs1, frs2, frs3, 0, >fp_status);
 }
 
+uint64_t helper_fmadd_h(CPURISCVState *env, uint64_t frs1, uint64_t frs2,
+uint64_t frs3)
+{
+return do_fmadd_h(env, frs1, frs2, frs3, 0);
+}
+
 uint64_t helper_fmsub_s(CPURISCVState *env, uint64_t frs1, uint64_t frs2,
 uint64_t frs3)
 {
@@ -115,6 +130,12 @@ uint64_t helper_fmsub_d(CPURISCVState *env, uint64_t frs1, 
uint64_t frs2,
   >fp_status);
 }
 
+uint64_t helper_fmsub_h(CPURISCVState *env, uint64_t frs1, uint64_t frs2,
+uint64_t frs3)
+{
+return do_fmadd_h(env, frs1, frs2, frs3, float_muladd_negate_c);
+}
+
 uint64_t helper_fnmsub_s(CPURISCVState *env, uint64_t frs1, uint64_t frs2,
  uint64_t frs3)
 {
@@ -128,6 +149,12 @@ uint64_t helper_fnmsub_d(CPURISCVState *env, uint64_t 
frs1, uint64_t frs2,
   >fp_status);
 }
 
+uint64_t helper_fnmsub_h(CPURISCVState *env, uint64_t frs1, uint64_t frs2,
+ uint64_t frs3)
+{
+return do_fmadd_h(env, frs1, frs2, frs3, float_muladd_negate_product);
+}
+
 uint64_t helper_fnmadd_s(CPURISCVState *env, uint64_t frs1, uint64_t frs2,
  uint64_t frs3)
 {
@@ -142,6 +169,13 @@ uint64_t helper_fnmadd_d(CPURISCVState *env, uint64_t 
frs1, uint64_t frs2,
   float_muladd_negate_product, >fp_status);
 }
 
+uint64_t helper_fnmadd_h(CPURISCVState *env, uint64_t frs1, uint64_t frs2,
+ uint64_t frs3)
+{
+return do_fmadd_h(env, frs1, frs2, frs3,
+  float_muladd_negate_c | float_muladd_negate_product);
+}
+
 uint64_t helper_fadd_s(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
 {
 float32 frs1 = check_nanbox_s(rs1);
@@ -374,3 +408,55 @@ target_ulong helper_fclass_d(uint64_t frs1)
 {
 return fclass_d(frs1);
 }
+
+uint64_t helper_fadd_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return nanbox_h(float16_add(frs1, frs2, >fp_status));
+}
+
+uint64_t helper_fsub_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return nanbox_h(float16_sub(frs1, frs2, >fp_status));
+}
+
+uint64_t helper_fmul_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return nanbox_h(float16_mul(frs1, frs2, >fp_status));
+}
+
+uint64_t helper_fdiv_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return nanbox_h(float16_div(frs1, frs2, >fp_status));
+}
+
+uint64_t helper_fmin_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return nanbox_h(env->priv_ver < PRIV_VERSION_1_11_0 ?
+float16_minnum(frs1, frs2, >fp_status) :
+float16_minimum_number(frs1, frs2, >fp_status));
+}
+
+uint64_t helper_fmax_h(CPURISCVState *env, uint64_t rs1, uint64_t rs2)
+{
+float16 frs1 = check_nanbox_h(rs1);
+float16 frs2 = check_nanbox_h(rs2);
+return nanbox_h(env->priv_ver < PRIV_VERSION_1_11_0 ?
+float16_maxnum(frs1, frs2, >fp_status) :
+float16_maximum_number(frs1, frs2, >fp_status));
+}
+
+uint64_t helper_fsqrt_h(CPURISCVState *env, uint64_t rs1)
+{
+float16 frs1 = check_nanbox_h(rs1);
+return

[PATCH v6 5/8] target/riscv: zfh: half-precision floating-point classify

2021-12-09 Thread frank . chang

From: Kito Cheng 

Signed-off-by: Kito Cheng 
Signed-off-by: Chih-Min Chao 
Signed-off-by: Frank Chang 
Reviewed-by: Richard Henderson 
Reviewed-by: Alistair Francis 
---
 target/riscv/fpu_helper.c |  6 ++
 target/riscv/helper.h |  1 +
 target/riscv/insn32.decode|  1 +
 target/riscv/insn_trans/trans_rvzfh.c.inc | 12 
 4 files changed, 20 insertions(+)

diff --git a/target/riscv/fpu_helper.c b/target/riscv/fpu_helper.c
index ec2009ee65..388e23ca67 100644
--- a/target/riscv/fpu_helper.c
+++ b/target/riscv/fpu_helper.c
@@ -482,6 +482,12 @@ target_ulong helper_feq_h(CPURISCVState *env, uint64_t 
rs1, uint64_t rs2)
 return float16_eq_quiet(frs1, frs2, >fp_status);
 }
 
+target_ulong helper_fclass_h(uint64_t rs1)
+{
+float16 frs1 = check_nanbox_h(rs1);
+return fclass_h(frs1);
+}
+
 target_ulong helper_fcvt_w_h(CPURISCVState *env, uint64_t rs1)
 {
 float16 frs1 = check_nanbox_h(rs1);
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 9c89521d4a..d25cf725c5 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -89,6 +89,7 @@ DEF_HELPER_FLAGS_2(fcvt_h_w, TCG_CALL_NO_RWG, i64, env, tl)
 DEF_HELPER_FLAGS_2(fcvt_h_wu, TCG_CALL_NO_RWG, i64, env, tl)
 DEF_HELPER_FLAGS_2(fcvt_h_l, TCG_CALL_NO_RWG, i64, env, tl)
 DEF_HELPER_FLAGS_2(fcvt_h_lu, TCG_CALL_NO_RWG, i64, env, tl)
+DEF_HELPER_FLAGS_1(fclass_h, TCG_CALL_NO_RWG_SE, tl, i64)
 
 /* Special functions */
 DEF_HELPER_2(csrr, tl, env, int)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 3906c9fb20..6c4cde216b 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -754,6 +754,7 @@ fmv_x_h1110010  0 . 000 . 1010011 @r2
 feq_h  1010010  . . 010 . 1010011 @r
 flt_h  1010010  . . 001 . 1010011 @r
 fle_h  1010010  . . 000 . 1010011 @r
+fclass_h   1110010  0 . 001 . 1010011 @r2
 fcvt_h_w   1101010  0 . ... . 1010011 @r2_rm
 fcvt_h_wu  1101010  1 . ... . 1010011 @r2_rm
 fmv_h_x010  0 . 000 . 1010011 @r2
diff --git a/target/riscv/insn_trans/trans_rvzfh.c.inc 
b/target/riscv/insn_trans/trans_rvzfh.c.inc
index 8d0959a667..0549e25fb4 100644
--- a/target/riscv/insn_trans/trans_rvzfh.c.inc
+++ b/target/riscv/insn_trans/trans_rvzfh.c.inc
@@ -372,6 +372,18 @@ static bool trans_fle_h(DisasContext *ctx, arg_fle_h *a)
 return true;
 }
 
+static bool trans_fclass_h(DisasContext *ctx, arg_fclass_h *a)
+{
+REQUIRE_FPU;
+REQUIRE_ZFH(ctx);
+
+TCGv dest = dest_gpr(ctx, a->rd);
+
+gen_helper_fclass_h(dest, cpu_fpr[a->rs1]);
+gen_set_gpr(ctx, a->rd, dest);
+return true;
+}
+
 static bool trans_fcvt_w_h(DisasContext *ctx, arg_fcvt_w_h *a)
 {
 REQUIRE_FPU;
-- 
2.31.1

[PATCH v6 3/8] target/riscv: zfh: half-precision convert and move

2021-12-09 Thread frank . chang

From: Kito Cheng 

Signed-off-by: Kito Cheng 
Signed-off-by: Chih-Min Chao 
Signed-off-by: Frank Chang 
Reviewed-by: Richard Henderson 
Acked-by: Alistair Francis 
---
 target/riscv/fpu_helper.c |  67 +
 target/riscv/helper.h |  12 +
 target/riscv/insn32.decode|  19 ++
 target/riscv/insn_trans/trans_rvzfh.c.inc | 288 ++
 target/riscv/translate.c  |  10 +
 5 files changed, 396 insertions(+)

diff --git a/target/riscv/fpu_helper.c b/target/riscv/fpu_helper.c
index 20bb89ad14..2ed9b03193 100644
--- a/target/riscv/fpu_helper.c
+++ b/target/riscv/fpu_helper.c
@@ -460,3 +460,70 @@ uint64_t helper_fsqrt_h(CPURISCVState *env, uint64_t rs1)
 float16 frs1 = check_nanbox_h(rs1);
 return nanbox_h(float16_sqrt(frs1, >fp_status));
 }
+
+target_ulong helper_fcvt_w_h(CPURISCVState *env, uint64_t rs1)
+{
+float16 frs1 = check_nanbox_h(rs1);
+return float16_to_int32(frs1, >fp_status);
+}
+
+target_ulong helper_fcvt_wu_h(CPURISCVState *env, uint64_t rs1)
+{
+float16 frs1 = check_nanbox_h(rs1);
+return (int32_t)float16_to_uint32(frs1, >fp_status);
+}
+
+target_ulong helper_fcvt_l_h(CPURISCVState *env, uint64_t rs1)
+{
+float16 frs1 = check_nanbox_h(rs1);
+return float16_to_int64(frs1, >fp_status);
+}
+
+target_ulong helper_fcvt_lu_h(CPURISCVState *env, uint64_t rs1)
+{
+float16 frs1 = check_nanbox_h(rs1);
+return float16_to_uint64(frs1, >fp_status);
+}
+
+uint64_t helper_fcvt_h_w(CPURISCVState *env, target_ulong rs1)
+{
+return nanbox_h(int32_to_float16((int32_t)rs1, >fp_status));
+}
+
+uint64_t helper_fcvt_h_wu(CPURISCVState *env, target_ulong rs1)
+{
+return nanbox_h(uint32_to_float16((uint32_t)rs1, >fp_status));
+}
+
+uint64_t helper_fcvt_h_l(CPURISCVState *env, target_ulong rs1)
+{
+return nanbox_h(int64_to_float16(rs1, >fp_status));
+}
+
+uint64_t helper_fcvt_h_lu(CPURISCVState *env, target_ulong rs1)
+{
+return nanbox_h(uint64_to_float16(rs1, >fp_status));
+}
+
+uint64_t helper_fcvt_h_s(CPURISCVState *env, uint64_t rs1)
+{
+float32 frs1 = check_nanbox_s(rs1);
+return nanbox_h(float32_to_float16(frs1, true, >fp_status));
+}
+
+uint64_t helper_fcvt_s_h(CPURISCVState *env, uint64_t rs1)
+{
+float16 frs1 = check_nanbox_h(rs1);
+return nanbox_s(float16_to_float32(frs1, true, >fp_status));
+}
+
+uint64_t helper_fcvt_h_d(CPURISCVState *env, uint64_t rs1)
+{
+return nanbox_h(float64_to_float16(rs1, true, >fp_status));
+}
+
+uint64_t helper_fcvt_d_h(CPURISCVState *env, uint64_t rs1)
+{
+float16 frs1 = check_nanbox_h(rs1);
+return float16_to_float64(frs1, true, >fp_status);
+}
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index c6c0323faf..b50672d168 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -74,6 +74,18 @@ DEF_HELPER_FLAGS_3(fdiv_h, TCG_CALL_NO_RWG, i64, env, i64, 
i64)
 DEF_HELPER_FLAGS_3(fmin_h, TCG_CALL_NO_RWG, i64, env, i64, i64)
 DEF_HELPER_FLAGS_3(fmax_h, TCG_CALL_NO_RWG, i64, env, i64, i64)
 DEF_HELPER_FLAGS_2(fsqrt_h, TCG_CALL_NO_RWG, i64, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_s_h, TCG_CALL_NO_RWG, i64, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_h_s, TCG_CALL_NO_RWG, i64, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_d_h, TCG_CALL_NO_RWG, i64, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_h_d, TCG_CALL_NO_RWG, i64, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_w_h, TCG_CALL_NO_RWG, tl, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_wu_h, TCG_CALL_NO_RWG, tl, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_l_h, TCG_CALL_NO_RWG, tl, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_lu_h, TCG_CALL_NO_RWG, tl, env, i64)
+DEF_HELPER_FLAGS_2(fcvt_h_w, TCG_CALL_NO_RWG, i64, env, tl)
+DEF_HELPER_FLAGS_2(fcvt_h_wu, TCG_CALL_NO_RWG, i64, env, tl)
+DEF_HELPER_FLAGS_2(fcvt_h_l, TCG_CALL_NO_RWG, i64, env, tl)
+DEF_HELPER_FLAGS_2(fcvt_h_lu, TCG_CALL_NO_RWG, i64, env, tl)
 
 /* Special functions */
 DEF_HELPER_2(csrr, tl, env, int)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 66c231a301..ba40f3e7f8 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -739,5 +739,24 @@ fsub_h     0000110  ..... ..... ... ..... 1010011 @r_rm
 fmul_h     0001010  ..... ..... ... ..... 1010011 @r_rm
 fdiv_h     0001110  ..... ..... ... ..... 1010011 @r_rm
 fsqrt_h    0101110  00000 ..... ... ..... 1010011 @r2_rm
+fsgnj_h    0010010  ..... ..... 000 ..... 1010011 @r
+fsgnjn_h   0010010  ..... ..... 001 ..... 1010011 @r
+fsgnjx_h   0010010  ..... ..... 010 ..... 1010011 @r
 fmin_h     0010110  ..... ..... 000 ..... 1010011 @r
 fmax_h     0010110  ..... ..... 001 ..... 1010011 @r
+fcvt_h_s   0100010  00000 ..... ... ..... 1010011 @r2_rm
+fcvt_s_h   0100000  00010 ..... ... ..... 1010011 @r2_rm
+fcvt_h_d   0100010  00001 ..... ... ..... 1010011 @r2_rm
+fcvt_d_h   0100001  00010 ..... ... ..... 1010011 @r2_rm
+fcvt_w_h   1100010  00000 ..... ... ..... 1010011 @r2_rm
+fcvt_wu_h  1100010  00001 ..... ... ..... 1010011 @r2_rm
+fmv_x_h    1110010  00000 ..... 000 ..... 1010011 @r2

[PATCH v6 0/8] target/riscv: support Zfh, Zfhmin extension v0.1

2021-12-09 Thread frank . chang

From: Frank Chang 

Zfh - Half width floating point
Zfhmin - Subset of half width floating point

Zfh, Zfhmin v0.1 is now in public review period and is required by
RVV extension:
https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/63gDCinXTwE/m/871Wm9XIBQAJ

Zfh, Zfhmin can be enabled with -cpu option: Zfh=true and Zfhmin=true
respectively.

The port is available at:
https://github.com/sifive/qemu/tree/zfh-upstream-v6

Note: This patchset depends on another patchset listed in Based-on
  section below so it is not able to be built unless the patchset
  is applied.

Changelog:

v6:
  * Rebase on riscv-to-apply.next.

v5:
  * Rebase on riscv-to-apply.next.

v4:
  * Split Zfh, Zfhmin cpu properties related changes into individual
patches.

v3:
  * Use the renamed softfloat min/max APIs: *_minimum_number()
and *_maximum_number().
  * Pick softfloat min/max APIs based on CPU privilege spec version.
  * Add braces for if statements in REQUIRE_ZFH() and
REQUIRE_ZFH_OR_ZFHMIN().
  * Rearrange the positions of Zfh and Zfhmin cpu properties.

v2:
  * Use {get,dest}_gpr APIs.
  * Add Zfhmin extension.

Based-on: <20211021160847.2748577-1-frank.ch...@sifive.com>

Frank Chang (3):
  target/riscv: zfh: add Zfh cpu property
  target/riscv: zfh: implement zfhmin extension
  target/riscv: zfh: add Zfhmin cpu property

Kito Cheng (5):
  target/riscv: zfh: half-precision load and store
  target/riscv: zfh: half-precision computational
  target/riscv: zfh: half-precision convert and move
  target/riscv: zfh: half-precision floating-point compare
  target/riscv: zfh: half-precision floating-point classify

 target/riscv/cpu.c|   2 +
 target/riscv/cpu.h|   2 +
 target/riscv/fpu_helper.c | 180 
 target/riscv/helper.h |  29 ++
 target/riscv/insn32.decode|  38 ++
 target/riscv/insn_trans/trans_rvzfh.c.inc | 537 ++
 target/riscv/internals.h  |  16 +
 target/riscv/translate.c  |  20 +
 8 files changed, 824 insertions(+)
 create mode 100644 target/riscv/insn_trans/trans_rvzfh.c.inc

--
2.31.1

Re: [PATCH v10 00/77] support vector extension v1.0

2021-12-09 Thread Frank Chang

On Wed, Dec 8, 2021 at 2:40 PM Alistair Francis 
wrote:

> On Mon, Nov 29, 2021 at 1:04 PM  wrote:
> >
> > From: Frank Chang 
> >
> > This patchset implements the vector extension v1.0 for RISC-V on QEMU.
> >
> > RVV v1.0 spec is now frozen for public review:
> > https://github.com/riscv/riscv-v-spec/releases/tag/v1.0
> >
> > The port is available here:
> > https://github.com/sifive/qemu/tree/rvv-1.0-upstream-v10
> >
> > RVV v1.0 can be enabled with -cpu option: v=true and specify vext_spec
> > option to v1.0 (i.e. vext_spec=v1.0)
> >
> > Note: This patchset depends on other patchsets listed in Based-on
> >   section below so it is not able to be built unless those patchsets
> >   are applied.
>
> I think this is all reviewed now. Once your other patch sets are
> merged just re-send this and I can apply it.
>
> Alistair
>

Hi Alistair,

Thanks for the review.
I think <20211021160847.2748577-1-frank.ch...@sifive.com> has already been
applied to your riscv-to-apply.next branch.
I'll rebase <20211021162956.2772656-1-frank.ch...@sifive.com> and also this
patchset as well then resend them.

Regards,
Frank Chang


>
> >
> > Changelog:
> >
> > v10
> >   * Add ELEN checks for widening and narrowing instructions.
> >
> > v9
> >   * Remove explicitly set mstatus.SD patches as mstatus.SD is now
> > set in add_status_sd().
> >   * Rebase on riscv-to-apply.next branch.
> >
> > v8
> >   * Use {get,dest}_gpr APIs.
> >   * remove vector AMO instructions.
> >   * rename vpopc.m to vcpop.m.
> >   * rename vle1.v and vse1.v to vlm.v and vsm.v.
> >   * rename vmandnot.mm and vmornot.mm to vmandn.mm and vmorn.mm.
> >
> > v7
> >   * remove hardcoded GDB vector registers list.
> >   * add vsetivli instruction.
> >   * add vle1.v and vse1.v instructions.
> >
> > v6
> >   * add vector floating-point reciprocal estimate instruction.
> >   * add vector floating-point reciprocal square-root estimate
> instruction.
> >   * update check rules for segment register groups, each segment register
> > group has to follow overlap rules.
> >   * update viota.m instruction check rules.
> >
> > v5
> >   * refactor RVV v1.0 check functions.
> > (Thanks to Richard Henderson's bitwise tricks.)
> >   * relax RV_VLEN_MAX to 1024-bits.
> >   * implement vstart CSR's behaviors.
> >   * trigger illegal instruction exception if frm is not valid for
> > vector floating-point instructions.
> >   * rebase on riscv-to-apply.next.
> >
> > v4
> >   * remove explicit float flmul variable in DisasContext.
> >   * replace floating-point calculations with shift operations to
> > improve performance.
> >   * relax RV_VLEN_MAX to 512-bits.
> >
> > v3
> >   * apply nan-box helpers from Richard Henderson.
> >   * remove fp16 api changes as they are sent independently in another
> > patchset by Chih-Min Chao.
> >   * remove all tail elements clear functions as tail elements can
> > retain unchanged for either VTA set to undisturbed or agnostic.
> >   * add fp16 nan-box check generator function.
> >   * add floating-point rounding mode enum.
> >   * replace flmul arithmetic with shifts to avoid floating-point
> > conversions.
> >   * add Zvqmac extension.
> >   * replace gdbstub vector register xml files with dynamic generator.
> >   * bumped to RVV v1.0.
> >   * RVV v1.0 related changes:
> > * add vlre.v and vsr.v vector whole register
> >   load/store instructions
> > * add vrgatherei16 instruction.
> > * rearranged bits in vtype to make vlmul bits into a contiguous
> >   field.
> >
> > v2
> >   * drop v0.7.1 support.
> >   * replace invisible return check macros with functions.
> >   * move mark_vs_dirty() to translators.
> >   * add SSTATUS_VS flag for s-mode.
> >   * nan-box scalar fp register for floating-point operations.
> >   * add gdbstub files for vector registers to allow system-mode
> > debugging with GDB.
> >
> > Based-on: <20211021160847.2748577-1-frank.ch...@sifive.com>
> > Based-on: <20211021162956.2772656-1-frank.ch...@sifive.com>
> >
> > Frank Chang (72):
> >   target/riscv: drop vector 0.7.1 and add 1.0 support
> >   target/riscv: Use FIELD_EX32() to extract wd field
> >   target/riscv: rvv-1.0: set mstatus.SD bit if mstatus.VS is dirty
> >   target/riscv: rvv-1.0: introduce writable misa.v field
> >   target/riscv: rvv-1.0: add translation-time vector context status
> >   target/riscv: rvv-1.0: remove rvv related codes from fcsr registers
> >   target/riscv: rvv-1.0: check MSTATUS_VS when accessing vector csr
> > registers
> >   target/riscv: rvv-1.0: remove MLEN calculations
> >   target/riscv: rvv-1.0: add fractional LMUL
> >   target/riscv: rvv-1.0: add VMA and VTA
> >   target/riscv: rvv-1.0: update check functions
> >   target/riscv: introduce more imm value modes in translator functions
> >   target/riscv: rvv:1.0: add translation-time nan-box helper function
> >   target/riscv: rvv-1.0: remove amo operations instructions
> >   target/riscv: rvv-1.0: configure instructions

Re: [PATCH 7/7] hw/riscv: Use error_fatal for SoC realisation

2021-12-09 Thread Markus Armbruster

Alistair Francis  writes:

> From: Alistair Francis 
>
> When realising the SoC use error_fatal instead of error_abort as the
> process can fail and report useful information to the user.
>
> Currently a user can see this:
>
>$ ../qemu/bld/qemu-system-riscv64 -M sifive_u -S -monitor stdio -display 
> none -drive if=pflash
> QEMU 6.1.93 monitor - type 'help' for more information
> (qemu) Unexpected error in sifive_u_otp_realize() at 
> ../hw/misc/sifive_u_otp.c:229:
> qemu-system-riscv64: OTP drive size < 16K
> Aborted (core dumped)
>
> Which this patch addresses
>
> Signed-off-by: Alistair Francis 
> Reported-by: Markus Armbruster 

Reviewed-by: Markus Armbruster

[PATCH 2/2] target/riscv: Implement the stval/mtval illegal instruction

2021-12-09 Thread Alistair Francis

From: Alistair Francis 

The stval and mtval registers can optionally contain the faulting
instruction on an illegal instruction exception. This patch adds support
for setting the stval and mtval registers based on the CPU feature.

Signed-off-by: Alistair Francis 
---
 target/riscv/cpu.h|  2 ++
 target/riscv/cpu_helper.c | 25 +++--
 target/riscv/translate.c  |  3 +++
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
index 0760c0af93..3a163b57ed 100644
--- a/target/riscv/cpu.h
+++ b/target/riscv/cpu.h
@@ -127,6 +127,8 @@ struct CPURISCVState {
 target_ulong frm;
 
 target_ulong badaddr;
+uint32_t bins;
+
 target_ulong guest_phys_fault_addr;
 
 target_ulong priv_ver;
diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index 9eeed38c7e..cb5833a6d2 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -975,7 +975,6 @@ void riscv_cpu_do_interrupt(CPUState *cs)
 bool async = !!(cs->exception_index & RISCV_EXCP_INT_FLAG);
 target_ulong cause = cs->exception_index & RISCV_EXCP_INT_MASK;
 target_ulong deleg = async ? env->mideleg : env->medeleg;
-bool write_tval = false;
 target_ulong tval = 0;
 target_ulong htval = 0;
 target_ulong mtval2 = 0;
@@ -1004,9 +1003,11 @@ void riscv_cpu_do_interrupt(CPUState *cs)
 case RISCV_EXCP_INST_PAGE_FAULT:
 case RISCV_EXCP_LOAD_PAGE_FAULT:
 case RISCV_EXCP_STORE_PAGE_FAULT:
-write_tval  = true;
 tval = env->badaddr;
 break;
+case RISCV_EXCP_ILLEGAL_INST:
+tval = env->bins;
+break;
 default:
 break;
 }
@@ -1041,17 +1042,7 @@ void riscv_cpu_do_interrupt(CPUState *cs)
 if (riscv_has_ext(env, RVH)) {
 target_ulong hdeleg = async ? env->hideleg : env->hedeleg;
 
-if (env->two_stage_lookup && write_tval) {
-/*
- * If we are writing a guest virtual address to stval, set
- * this to 1. If we are trapping to VS we will set this to 0
- * later.
- */
-env->hstatus = set_field(env->hstatus, HSTATUS_GVA, 1);
-} else {
-/* For other HS-mode traps, we set this to 0. */
-env->hstatus = set_field(env->hstatus, HSTATUS_GVA, 0);
-}
+env->hstatus = set_field(env->hstatus, HSTATUS_GVA, 0);
 
 if (riscv_cpu_virt_enabled(env) && ((hdeleg >> cause) & 1)) {
 /* Trap to VS mode */
@@ -1063,7 +1054,6 @@ void riscv_cpu_do_interrupt(CPUState *cs)
 cause == IRQ_VS_EXT) {
 cause = cause - 1;
 }
-env->hstatus = set_field(env->hstatus, HSTATUS_GVA, 0);
 } else if (riscv_cpu_virt_enabled(env)) {
 /* Trap into HS mode, from virt */
 riscv_cpu_swap_hypervisor_regs(env);
@@ -1071,6 +1061,13 @@ void riscv_cpu_do_interrupt(CPUState *cs)
  env->priv);
 env->hstatus = set_field(env->hstatus, HSTATUS_SPV,
  riscv_cpu_virt_enabled(env));
+if (tval) {
+/*
+ * If we are writing a guest virtual address to stval, set
+ * this to 1.
+ */
+env->hstatus = set_field(env->hstatus, HSTATUS_GVA, 1);
+}
 
 htval = env->guest_phys_fault_addr;
 
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index 24251bc8cc..921ca06bf9 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -167,6 +167,9 @@ static void generate_exception_mtval(DisasContext *ctx, int 
excp)
 
 static void gen_exception_illegal(DisasContext *ctx)
 {
+tcg_gen_st_i32(tcg_constant_i32(ctx->opcode), cpu_env,
+   offsetof(CPURISCVState, bins));
+
 generate_exception(ctx, RISCV_EXCP_ILLEGAL_INST);
 }
 
-- 
2.31.1

[PATCH 1/2] target/riscv: Set the opcode in DisasContext

2021-12-09 Thread Alistair Francis

From: Alistair Francis 

Signed-off-by: Alistair Francis 
---
 target/riscv/translate.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index 1d57bc97b5..24251bc8cc 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -586,6 +586,7 @@ static void decode_opc(CPURISCVState *env, DisasContext 
*ctx, uint16_t opcode)
 if (!has_ext(ctx, RVC)) {
 gen_exception_illegal(ctx);
 } else {
+ctx->opcode = opcode;
 ctx->pc_succ_insn = ctx->base.pc_next + 2;
 if (!decode_insn16(ctx, opcode)) {
 gen_exception_illegal(ctx);
@@ -596,6 +597,7 @@ static void decode_opc(CPURISCVState *env, DisasContext 
*ctx, uint16_t opcode)
 opcode32 = deposit32(opcode32, 16, 16,
  translator_lduw(env, &ctx->base,
  ctx->base.pc_next + 2));
+ctx->opcode = opcode32;
 ctx->pc_succ_insn = ctx->base.pc_next + 4;
 if (!decode_insn32(ctx, opcode32)) {
 gen_exception_illegal(ctx);
-- 
2.31.1

[PATCH 0/2] RISC-V: Populate mtval and stval

2021-12-09 Thread Alistair Francis

From: Alistair Francis 

Populate mtval and stval when taking an illegal instruction exception.

The RISC-V spec states that "The stval register can optionally also be
used to return the faulting instruction bits on an illegal instruction
exception...". In this case we are always writing the value on an
illegal instruction.

This doesn't match all CPUs (some CPUs won't write the data), but in
QEMU let's just populate the value on illegal instructions. This won't
break any guest software, but will provide more information to guests.

*** BLURB HERE ***

Alistair Francis (2):
  target/riscv: Set the opcode in DisasContext
  target/riscv: Implement the stval/mtval illegal instruction

 target/riscv/cpu.h|  2 ++
 target/riscv/cpu_helper.c | 25 +++--
 target/riscv/translate.c  |  5 +
 3 files changed, 18 insertions(+), 14 deletions(-)

-- 
2.31.1

Re: [PATCH v7 2/7] net/vmnet: add vmnet backends to qapi/net

2021-12-09 Thread Markus Armbruster

Vladislav Yaroshchuk  writes:

> Create separate netdevs for each vmnet operating mode:
> - vmnet-host
> - vmnet-shared
> - vmnet-bridged
>
> Signed-off-by: Vladislav Yaroshchuk 
> ---

[...]

> diff --git a/qapi/net.json b/qapi/net.json
> index 7fab2e7cd8..8ed7bf0c04 100644
> --- a/qapi/net.json
> +++ b/qapi/net.json
> @@ -452,6 +452,122 @@
>  '*vhostdev': 'str',
>  '*queues':   'int' } }
>  
> +##
> +# @NetdevVmnetHostOptions:
> +#
> +# vmnet (host mode) network backend.
> +#
> +# Allows the vmnet interface to communicate with other vmnet
> +# interfaces that are in host mode and also with the native host.

We don't say "native host" elsewhere, just "host".  Let's drop
"native".

> +#
> +# @start-address: The starting IPv4 address to use for the interface.
> +# Must be in the private IP range (RFC 1918). Must be
> +# specified along with @end-address and @subnet-mask.
> +# This address is used as the gateway address. The
> +# subsequent address up to and including end-address are
> +# placed in the DHCP pool.
> +#
> +# @end-address: The DHCP IPv4 range end address to use for the
> +#   interface. Must be in the private IP range (RFC 1918).
> +#   Must be specified along with @start-address and
> +#   @subnet-mask.
> +#
> +# @subnet-mask: The IPv4 subnet mask to use on the interface. Must
> +#   be specified along with @start-address and @end-address.
> +#
> +# @isolated: Enable isolation for this interface. Interface isolation
> +#ensures that vmnet interface is not able to communicate
> +#with any other vmnet interfaces. Only communication with
> +#host is allowed.
> +#
> +# @net-uuid: The identifier (UUID) to uniquely identify the isolated
> +#network vmnet interface should be added to. If
> +#set, no DHCP service is provided for this interface and
> +#network communication is allowed only with other interfaces
> +#added to this network identified by the UUID.
> +#
> +# Since: 7.0
> +##
> +{ 'struct': 'NetdevVmnetHostOptions',
> +  'data': {
> +'*start-address':   'str',
> +'*end-address': 'str',
> +'*subnet-mask': 'str',
> +'*isolated':'bool',
> +'*net-uuid':'str'
> +  },

Unusual formatting.  The common one is

   '*net-uuid':'str' },

Let's stick to it.

> +  'if': 'CONFIG_VMNET' }
> +
> +##
> +# @NetdevVmnetSharedOptions:
> +#
> +# vmnet (shared mode) network backend.
> +#
> +# Allows traffic originating from the vmnet interface to reach the
> +# Internet through a network address translator (NAT).
> +# The vmnet interface can communicate with the native host and with

Drop "native".

> +# other shared mode interfaces on the same subnet. If no DHCP
> +# settings, subnet mask and IPv6 prefix specified, the interface can
> +# communicate with any of other interfaces in shared mode.
> +#
> +# @start-address: The starting IPv4 address to use for the interface.
> +# Must be in the private IP range (RFC 1918). Must be
> +# specified along with @end-address and @subnet-mask.
> +# This address is used as the gateway address. The
> +# subsequent address up to and including end-address are
> +# placed in the DHCP pool.
> +#
> +# @end-address: The DHCP IPv4 range end address to use for the
> +#   interface. Must be in the private IP range (RFC 1918).
> +#   Must be specified along with @start-address and @subnet-mask.
> +#
> +# @subnet-mask: The IPv4 subnet mask to use on the interface. Must
> +#be specified along with @start-address and @end-address.
> +#
> +# @isolated: Enable isolation for this interface. Interface isolation
> +#ensures that vmnet interface is not able to communicate
> +#with any other vmnet interfaces. Only communication with
> +#host is allowed.
> +#
> +# @nat66-prefix: The IPv6 prefix to use into guest network. Must be a
> +#unique local address i.e. start with fd00::/8 and have
> +#length of 64.
> +#
> +# Since: 7.0
> +##
> +{ 'struct': 'NetdevVmnetSharedOptions',
> +  'data': {
> +'*start-address':'str',
> +'*end-address':  'str',
> +'*subnet-mask':  'str',
> +'*isolated': 'bool',
> +'*nat66-prefix': 'str'
> +  },

Unusual formatting again.

> +  'if': 'CONFIG_VMNET' }
> +
> +##
> +# @NetdevVmnetBridgedOptions:
> +#
> +# vmnet (bridged mode) network backend.
> +#
> +# Bridges the vmnet interface with a physical network interface.
> +#
> +# @ifname: The name of the physical interface to be bridged.
> +#
> +# @isolated: Enable isolation for this interface. Interface isolation
> +#ensures that vmnet interface is not able to communicate
> +#with any other vmnet interfaces. Only

[PATCH] COLO: Move some trace code behind qemu_mutex_unlock_iothread()

2021-12-09 Thread Rao, Lei

Signed-off-by: Lei Rao 
---
 migration/colo.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/migration/colo.c b/migration/colo.c
index 2415325262..3ccacb29c8 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -683,8 +683,8 @@ static void 
colo_incoming_process_checkpoint(MigrationIncomingState *mis,
 
 qemu_mutex_lock_iothread();
 vm_stop_force_state(RUN_STATE_COLO);
-trace_colo_vm_state_change("run", "stop");
 qemu_mutex_unlock_iothread();
+trace_colo_vm_state_change("run", "stop");
 
 /* FIXME: This is unnecessary for periodic checkpoint mode */
 colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
@@ -786,8 +786,8 @@ static void 
colo_incoming_process_checkpoint(MigrationIncomingState *mis,
 
 vmstate_loading = false;
 vm_start();
-trace_colo_vm_state_change("stop", "run");
 qemu_mutex_unlock_iothread();
+trace_colo_vm_state_change("stop", "run");
 
 if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
 return;
@@ -870,8 +870,8 @@ void *colo_process_incoming_thread(void *opaque)
 abort();
 #endif
 vm_start();
-trace_colo_vm_state_change("stop", "run");
 qemu_mutex_unlock_iothread();
+trace_colo_vm_state_change("stop", "run");
 
 colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
   &local_err);
-- 
2.32.0

Re: [PATCH 1/7] hw/intc: sifive_plic: Add a reset function

2021-12-09 Thread Alistair Francis

On Wed, Dec 8, 2021 at 10:00 PM Philippe Mathieu-Daudé  wrote:
>
> Hi Alistair,
>
> On 12/8/21 07:42, Alistair Francis wrote:
> > From: Alistair Francis 
> >
> > Signed-off-by: Alistair Francis 
> > ---
> >  hw/intc/sifive_plic.c | 12 
> >  1 file changed, 12 insertions(+)
> >
> > diff --git a/hw/intc/sifive_plic.c b/hw/intc/sifive_plic.c
> > index 877e76877c..35f097799a 100644
> > --- a/hw/intc/sifive_plic.c
> > +++ b/hw/intc/sifive_plic.c
> > @@ -355,6 +355,17 @@ static const MemoryRegionOps sifive_plic_ops = {
> >  }
> >  };
> >
> > +static void sifive_plic_reset(DeviceState *dev)
> > +{
> > +SiFivePLICState *s = SIFIVE_PLIC(dev);
> > +
> > +memset(s->source_priority, 0, sizeof(uint32_t) * s->num_sources);
> > +memset(s->target_priority, 0, sizeof(uint32_t) * s->num_addrs);
> > +memset(s->pending, 0, sizeof(uint32_t) * s->bitfield_words);
> > +memset(s->claimed, 0, sizeof(uint32_t) * s->bitfield_words);
> > +memset(s->enable, 0, sizeof(uint32_t) * s->num_enables);
>
> Looking at sifive_plic_realize():
>
> - Should we reset the external IRQs in a default state?

Good point, I'll add that.

> - Shouldn't riscv_cpu_claim_interrupts() be called at reset?

I don't think so. riscv_cpu_claim_interrupts is a once and done call.

Alistair

>
> Note: parse_hart_config() name is slightly confusing since
> beside parsing, it also allocates addr_config. Maybe consider
> renaming?

Re: [PATCH v6 09/18] target/riscv: accessors to registers upper part and 128-bit load/store

2021-12-09 Thread Alistair Francis

On Mon, Nov 29, 2021 at 12:03 AM Frédéric Pétrot
 wrote:
>
> Get function to retrieve the 64 top bits of a register, stored in the gprh
> field of the cpu state. Set function that writes the 128-bit value at once.
> The access to the gprh field can not be protected at compile time to make
> sure it is accessed only in the 128-bit version of the processor because we
> have no way to indicate that the misa_mxl_max field is const.
>
> The 128-bit ISA adds ldu, lq and sq. We provide support for these
> instructions. Note that (a) we compute only 64-bit addresses to actually
> access memory, cowardly utilizing the existing address translation mechanism
> of QEMU, and (b) we assume for now little-endian memory accesses.
>
> Signed-off-by: Frédéric Pétrot 
> Co-authored-by: Fabien Portas 

Reviewed-by: Alistair Francis 

Alistair

> ---
>  target/riscv/insn16.decode  |  27 ++-
>  target/riscv/insn32.decode  |   5 ++
>  target/riscv/translate.c|  41 ++
>  target/riscv/insn_trans/trans_rvi.c.inc | 100 ++--
>  4 files changed, 163 insertions(+), 10 deletions(-)
>
> diff --git a/target/riscv/insn16.decode b/target/riscv/insn16.decode
> index 2e9212663c..02c8f61b48 100644
> --- a/target/riscv/insn16.decode
> +++ b/target/riscv/insn16.decode
> @@ -25,14 +25,17 @@
>  # Immediates:
>  %imm_ci12:s1 2:5
>  %nzuimm_ciw7:4 11:2 5:1 6:1   !function=ex_shift_2
> +%uimm_cl_q 10:1 5:2 11:2  !function=ex_shift_4
>  %uimm_cl_d 5:2 10:3   !function=ex_shift_3
>  %uimm_cl_w 5:1 10:3 6:1   !function=ex_shift_2
>  %imm_cb12:s1 5:2 2:1 10:2 3:2 !function=ex_shift_1
>  %imm_cj12:s1 8:1 9:2 6:1 7:1 2:1 11:1 3:3 !function=ex_shift_1
>
>  %shimm_6bit   12:1 2:5   !function=ex_rvc_shifti
> +%uimm_6bit_lq 2:4 12:1 6:1   !function=ex_shift_4
>  %uimm_6bit_ld 2:3 12:1 5:2   !function=ex_shift_3
>  %uimm_6bit_lw 2:2 12:1 4:3   !function=ex_shift_2
> +%uimm_6bit_sq 7:4 11:2   !function=ex_shift_4
>  %uimm_6bit_sd 7:3 10:3   !function=ex_shift_3
>  %uimm_6bit_sw 7:2 9:4!function=ex_shift_2
>
> @@ -54,16 +57,20 @@
>  # Formats 16:
>  @cr  . .  ..   rs2=%rs2_5   rs1=%rd %rd
>  @ci... . . .  ..   imm=%imm_ci  rs1=%rd %rd
> +@cl_q  ... . .  . ..   imm=%uimm_cl_q   rs1=%rs1_3  
> rd=%rs2_3
>  @cl_d  ... ... ... .. ... ..   imm=%uimm_cl_d   rs1=%rs1_3  
> rd=%rs2_3
>  @cl_w  ... ... ... .. ... ..   imm=%uimm_cl_w   rs1=%rs1_3  
> rd=%rs2_3
>  @cs_2  ... ... ... .. ... ..   rs2=%rs2_3   rs1=%rs1_3  
> rd=%rs1_3
> +@cs_q  ... ... ... .. ... ..   imm=%uimm_cl_q   rs1=%rs1_3  
> rs2=%rs2_3
>  @cs_d  ... ... ... .. ... ..   imm=%uimm_cl_d   rs1=%rs1_3  
> rs2=%rs2_3
>  @cs_w  ... ... ... .. ... ..   imm=%uimm_cl_w   rs1=%rs1_3  
> rs2=%rs2_3
>  @cj...... ..   imm=%imm_cj
>  @cb_z  ... ... ... .. ... ..   imm=%imm_cb  rs1=%rs1_3  rs2=0
>
> +@c_lqsp... . .  . ..   imm=%uimm_6bit_lq rs1=2 %rd
>  @c_ldsp... . .  . ..   imm=%uimm_6bit_ld rs1=2 %rd
>  @c_lwsp... . .  . ..   imm=%uimm_6bit_lw rs1=2 %rd
> +@c_sqsp... . .  . ..   imm=%uimm_6bit_sq rs1=2 rs2=%rs2_5
>  @c_sdsp... . .  . ..   imm=%uimm_6bit_sd rs1=2 rs2=%rs2_5
>  @c_swsp... . .  . ..   imm=%uimm_6bit_sw rs1=2 rs2=%rs2_5
>  @c_li  ... . .  . ..   imm=%imm_ci rs1=0 %rd
> @@ -87,9 +94,15 @@
>illegal 000  000 000 00 --- 00
>addi000  ... ... .. ... 00 @c_addi4spn
>  }
> -fld   001  ... ... .. ... 00 @cl_d
> +{
> +  lq  001  ... ... .. ... 00 @cl_q
> +  fld 001  ... ... .. ... 00 @cl_d
> +}
>  lw010  ... ... .. ... 00 @cl_w
> -fsd   101  ... ... .. ... 00 @cs_d
> +{
> +  sq  101  ... ... .. ... 00 @cs_q
> +  fsd 101  ... ... .. ... 00 @cs_d
> +}
>  sw110  ... ... .. ... 00 @cs_w
>
>  # *** RV32C and RV64C specific Standard Extension (Quadrant 0) ***
> @@ -132,7 +145,10 @@ addw  100 1 11 ... 01 ... 01 @cs_2
>
>  # *** RV32/64C Standard Extension (Quadrant 2) ***
>  slli  000 .  .  . 10 @c_shift2
> -fld   001 .  .  . 10 @c_ldsp
> +{
> +  lq  001  ... ... .. ... 10 @c_lqsp
> +  fld 001 .  .  . 10 @c_ldsp
> +}
>  {
>illegal 010 -  0  - 10 # c.lwsp, RES rd=0
>lw  010 .  .  . 10 @c_lwsp
> @@ -147,7 +163,10 @@ fld   001 .  .  . 10 @c_ldsp
>jalr100 1  .  0 10 @c_jalr rd=1  # C.JALR
>add 100 1  .  . 10 @cr
>  }
> -fsd   101   ..  . 10 @c_sdsp
> +{
> +  sq  101  ... ... .. ... 10 @c_sqsp
> +

[PATCH v2 2/4] intel_iommu: Support IR-only mode without DMA translation

2021-12-09 Thread David Woodhouse

From: David Woodhouse 

By setting none of the SAGAW bits we can indicate to a guest that DMA
translation isn't supported. Tested by booting Windows 10, as well as
Linux guests with the fix at https://git.kernel.org/torvalds/c/c40c10

Signed-off-by: David Woodhouse 
Acked-by: Claudio Fontana 
---
 hw/i386/intel_iommu.c | 14 ++
 include/hw/i386/intel_iommu.h |  1 +
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index f584449d8d..9a3cb2b789 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2202,7 +2202,7 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s)
 uint32_t changed = status ^ val;
 
 trace_vtd_reg_write_gcmd(status, val);
-if (changed & VTD_GCMD_TE) {
+if ((changed & VTD_GCMD_TE) && s->dma_translation) {
 /* Translation enable/disable */
 vtd_handle_gcmd_te(s, val & VTD_GCMD_TE);
 }
@@ -3100,6 +3100,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
 DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
+DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, 
true),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -3605,12 +3606,17 @@ static void vtd_init(IntelIOMMUState *s)
 s->next_frcd_reg = 0;
 s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
  VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
- VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
+ VTD_CAP_MGAW(s->aw_bits);
 if (s->dma_drain) {
 s->cap |= VTD_CAP_DRAIN;
 }
-if (s->aw_bits == VTD_HOST_AW_48BIT) {
-s->cap |= VTD_CAP_SAGAW_48bit;
+if (s->dma_translation) {
+if (s->aw_bits >= VTD_HOST_AW_39BIT) {
+s->cap |= VTD_CAP_SAGAW_39bit;
+}
+if (s->aw_bits >= VTD_HOST_AW_48BIT) {
+s->cap |= VTD_CAP_SAGAW_48bit;
+}
 }
 s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;
 
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 41783ee46d..42d6a6a636 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -266,6 +266,7 @@ struct IntelIOMMUState {
 bool buggy_eim; /* Force buggy EIM unless eim=off */
 uint8_t aw_bits;/* Host/IOVA address width (in bits) */
 bool dma_drain; /* Whether DMA r/w draining enabled */
+bool dma_translation;   /* Whether DMA translation supported */
 
 /*
  * Protects IOMMU states in general.  Currently it protects the
-- 
2.31.1

[PATCH v2 1/4] target/i386: Fix sanity check on max APIC ID / X2APIC enablement

2021-12-09 Thread David Woodhouse

The check on x86ms->apic_id_limit in pc_machine_done() had two problems.

Firstly, we need KVM to support the X2APIC API in order to allow IRQ
delivery to APICs >= 255. So we need to call/check kvm_enable_x2apic(),
which was done elsewhere in *some* cases but not all.

Secondly, microvm needs the same check. So move it from pc_machine_done()
to x86_cpus_init() where it will work for both.

The check in kvm_cpu_instance_init() is now redundant and can be dropped.

Signed-off-by: David Woodhouse 
Acked-by: Claudio Fontana 
---
 hw/i386/pc.c  |  8 
 hw/i386/x86.c | 16 
 target/i386/kvm/kvm-cpu.c |  2 +-
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index a2ef40ecbc..9959f93216 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -736,14 +736,6 @@ void pc_machine_done(Notifier *notifier, void *data)
 /* update FW_CFG_NB_CPUS to account for -device added CPUs */
 fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
 }
-
-
-if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
-!kvm_irqchip_in_kernel()) {
-error_report("current -smp configuration requires kernel "
- "irqchip support.");
-exit(EXIT_FAILURE);
-}
 }
 
 void pc_guest_info_init(PCMachineState *pcms)
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index b84840a1bb..f64639b873 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -39,6 +39,7 @@
 #include "sysemu/replay.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/cpu-timers.h"
+#include "sysemu/xen.h"
 #include "trace.h"
 
 #include "hw/i386/x86.h"
@@ -136,6 +137,21 @@ void x86_cpus_init(X86MachineState *x86ms, int 
default_cpu_version)
  */
 x86ms->apic_id_limit = x86_cpu_apic_id_from_index(x86ms,
   ms->smp.max_cpus - 1) + 
1;
+
+/*
+ * Can we support APIC ID 255 or higher?
+ *
+ * Under Xen: yes.
+ * With userspace emulated lapic: no
+ * With KVM's in-kernel lapic: only if X2APIC API is enabled.
+ */
+if (x86ms->apic_id_limit > 255 && !xen_enabled() &&
+(!kvm_irqchip_in_kernel() || !kvm_enable_x2apic())) {
+error_report("current -smp configuration requires kernel "
+ "irqchip and X2APIC API support.");
+exit(EXIT_FAILURE);
+}
+
 possible_cpus = mc->possible_cpu_arch_ids(ms);
 for (i = 0; i < ms->smp.cpus; i++) {
 x86_cpu_new(x86ms, possible_cpus->cpus[i].arch_id, &error_fatal);
diff --git a/target/i386/kvm/kvm-cpu.c b/target/i386/kvm/kvm-cpu.c
index d95028018e..c60cb2dafb 100644
--- a/target/i386/kvm/kvm-cpu.c
+++ b/target/i386/kvm/kvm-cpu.c
@@ -165,7 +165,7 @@ static void kvm_cpu_instance_init(CPUState *cs)
 /* only applies to builtin_x86_defs cpus */
 if (!kvm_irqchip_in_kernel()) {
 x86_cpu_change_kvm_default("x2apic", "off");
-} else if (kvm_irqchip_is_split() && kvm_enable_x2apic()) {
+} else if (kvm_irqchip_is_split()) {
 x86_cpu_change_kvm_default("kvm-msi-ext-dest-id", "on");
 }
 
-- 
2.31.1

[PATCH v2 4/4] intel_iommu: Fix irqchip / X2APIC configuration checks

2021-12-09 Thread David Woodhouse

We don't need to check kvm_enable_x2apic(). It's perfectly OK to support
interrupt remapping even if we can't address CPUs above 254. Kind of
pointless, but still functional.

The check on kvm_enable_x2apic() needs to happen *anyway* in order to
allow CPUs above 254 even without an IOMMU, so allow that to happen
elsewhere.

However, we do require the *split* irqchip in order to rewrite I/OAPIC
destinations. So fix that check while we're here.

Signed-off-by: David Woodhouse 
Reviewed-by: Peter Xu 
Acked-by: Jason Wang 
---
 hw/i386/intel_iommu.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index bd288d45bb..0d1c72f08e 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3760,15 +3760,10 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
   ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
 }
 if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) {
-if (!kvm_irqchip_in_kernel()) {
+if (!kvm_irqchip_is_split()) {
 error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split");
 return false;
 }
-if (!kvm_enable_x2apic()) {
-error_setg(errp, "eim=on requires support on the KVM side"
- "(X2APIC_API, first shipped in v4.7)");
-return false;
-}
 }
 
 /* Currently only address widths supported are 39 and 48 bits */
-- 
2.31.1

[PATCH v2 3/4] intel_iommu: Only allow interrupt remapping to be enabled if it's supported

2021-12-09 Thread David Woodhouse

From: David Woodhouse 

We should probably check if we were meant to be exposing IR, before
letting the guest turn the IRE bit on.

Signed-off-by: David Woodhouse 
Reviewed-by: Peter Xu 
Acked-by: Jason Wang 
---
 hw/i386/intel_iommu.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 9a3cb2b789..bd288d45bb 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2197,6 +2197,7 @@ static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool 
en)
 /* Handle write to Global Command Register */
 static void vtd_handle_gcmd_write(IntelIOMMUState *s)
 {
+X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
 uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG);
 uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG);
 uint32_t changed = status ^ val;
@@ -2218,7 +2219,8 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s)
 /* Set/update the interrupt remapping root-table pointer */
 vtd_handle_gcmd_sirtp(s);
 }
-if (changed & VTD_GCMD_IRE) {
+if ((changed & VTD_GCMD_IRE) &&
+x86_iommu_ir_supported(x86_iommu)) {
 /* Interrupt remap enable/disable */
 vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE);
 }
-- 
2.31.1

Re: Redesign of QEMU startup & initial configuration

2021-12-09 Thread Daniel P . Berrangé

On Thu, Dec 09, 2021 at 09:01:24PM +0100, Mark Burton wrote:
> I’ll take the liberty to cut one part (I agree with much of what you say 
> elsewhere)
> 
> > On 9 Dec 2021, at 20:11, Daniel P. Berrangé  wrote:
> > 
> > As illustrated earlier, I'd really like us to consider being a bit
> > more adventurous on the CLI side. I'm convinced that a CLI for
> > directly configurable hardware is doomed to be horrible no matter
> > what, if you try to directly expose all QAPI configuration
> > flexibility. Whether key/value, JSON, whatever, it will become
> > unmanageable on the CLI because VM hardware config is inherently
> > complicated.
> > 
> 
> I absolutely agree, but reach a slightly different conclusion
> 
> > Thus my thought that config files or QMP should be the only two
> > places where the full power of QAPI config is exposed. Use CLI
> > as just a way to interact with config files in a simple way
> > with templates.
> 
> I would countenance that we choose only one place to ‘support’ an interface. 
> Either “Yet Another Hardware Configuration Language” or QAPI. Rather than 
> re-inventing that wheel I would simply suggest that we leave that to the 
> relevant ‘user’ community (libvirt, whatever), who have specific requirements 
> and/or existing solutions. Leaving QEMU itself to focus on improving QAPI 
> (and migrating the CLI). 

Yes, indeed, the logical extension of my idea is that the 'simple'
CLI + templating thing doesn't actually have to be in the main QEMU
binary at all. We could in fact ship a bare '/usr/bin/qemu' which
does the config file templating and spawns whatever full QEMU
binary (/usr/bin/qemu-system-blah) does the real VM.  The key is
just that we have something simple for users, who don't want a
full mgmt layer and like the historical QEMU simple configs.


Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH 1/1] uas: add stream number sanity checks.

2021-12-09 Thread Guenter Roeck

On Wed, Aug 18, 2021 at 02:05:05PM +0200, Gerd Hoffmann wrote:
> The device uses the guest-supplied stream number unchecked, which can
> lead to guest-triggered out-of-bounds access to the UASDevice->data3 and
> UASDevice->status3 fields.  Add the missing checks.
> 
> Fixes: CVE-2021-3713
> Signed-off-by: Gerd Hoffmann 
> Reported-by: Chen Zhe 
> Reported-by: Tan Jingguo 
> Reviewed-by: Philippe Mathieu-Daudé 
> ---
>  hw/usb/dev-uas.c | 11 +++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/hw/usb/dev-uas.c b/hw/usb/dev-uas.c
> index 263056231c79..f6309a5ebfdc 100644
> --- a/hw/usb/dev-uas.c
> +++ b/hw/usb/dev-uas.c
> @@ -840,6 +840,9 @@ static void usb_uas_handle_data(USBDevice *dev, USBPacket 
> *p)
>  }
>  break;
>  case UAS_PIPE_ID_STATUS:
> +if (p->stream > UAS_MAX_STREAMS) {
> +goto err_stream;
> +}
>  if (p->stream) {
>  QTAILQ_FOREACH(st, >results, next) {
>  if (st->stream == p->stream) {
> @@ -867,6 +870,9 @@ static void usb_uas_handle_data(USBDevice *dev, USBPacket 
> *p)
>  break;
>  case UAS_PIPE_ID_DATA_IN:
>  case UAS_PIPE_ID_DATA_OUT:
> +if (p->stream > UAS_MAX_STREAMS) {
> +goto err_stream;
> +}
>  if (p->stream) {
>  req = usb_uas_find_request(uas, p->stream);
>  } else {
> @@ -902,6 +908,11 @@ static void usb_uas_handle_data(USBDevice *dev, 
> USBPacket *p)
>  p->status = USB_RET_STALL;
>  break;
>  }
> +
> +err_stream:
> +error_report("%s: invalid stream %d", __func__, p->stream);
> +p->status = USB_RET_STALL;
> +return;

How is this supposed to work ? It results in messages such as the following.

qemu-system-sparc64: usb_uas_handle_data: invalid stream 1
qemu-system-sparc64: usb_uas_handle_data: invalid stream 1

It also sets the status unconditionally to USB_RET_STALL,
and UAS is simply broken after this patch is applied because
the error handling code is executed literally for each call
of usb_uas_handle_data().

Guenter

Re: Redesign of QEMU startup & initial configuration

2021-12-09 Thread Mark Burton

I’ll take the liberty to cut one part (I agree with much of what you say 
elsewhere)

> On 9 Dec 2021, at 20:11, Daniel P. Berrangé  wrote:
> 
> As illustrated earlier, I'd really like us to consider being a bit
> more adventurous on the CLI side. I'm convinced that a CLI for
> directly configurable hardware is doomed to be horrible no matter
> what, if you try to directly expose all QAPI configuration
> flexibility. Whether key/value, JSON, whatever, it will become
> unmanageable on the CLI because VM hardware config is inherently
> complicated.
> 

I absolutely agree, but reach a slightly different conclusion

> Thus my thought that config files or QMP should be the only two
> places where the full power of QAPI config is exposed. Use CLI
> as just a way to interact with config files in a simple way
> with templates.

I would countenance that we choose only one place to ‘support’ an interface. 
Either “Yet Another Hardware Configuration Language” or QAPI. Rather than 
re-inventing that wheel I would simply suggest that we leave that to the 
relevant ‘user’ community (libvirt, whatever), who have specific requirements 
and/or existing solutions. Leaving QEMU itself to focus on improving QAPI (and 
migrating the CLI). 

Cheers
Mark.

Re: [PATCH v2] target/i386: Use assert() to sanity-check b1 in SSE decode

2021-12-09 Thread Peter Maydell

Gave up pinging for i386 maintainers; will take this via target-arm.next.

thanks
-- PMM


On Mon, 15 Nov 2021 at 14:38, Peter Maydell  wrote:
>
> Ping^4. Who is collecting target/i386 patches these days ?
>
> -- PMM
>
> On Mon, 1 Nov 2021 at 16:18, Peter Maydell  wrote:
> >
> > Ping^3, now 2 months after patch posted and reviewed...
> >
> > -- PMM
> >
> > On Mon, 27 Sept 2021 at 11:03, Peter Maydell  
> > wrote:
> > >
> > > Ping^2 !
> > >
> > > thanks
> > > -- PMM
> > >
> > > On Mon, 13 Sept 2021 at 13:34, Peter Maydell  
> > > wrote:
> > > >
> > > > Ping? (this has been reviewed)
> > > >
> > > > thanks
> > > > -- PMM
> > > >
> > > > On Wed, 1 Sept 2021 at 15:10, Peter Maydell  
> > > > wrote:
> > > > >
> > > > > In the SSE decode function gen_sse(), we combine a byte
> > > > > 'b' and a value 'b1' which can be [0..3], and switch on them:
> > > > >b |= (b1 << 8);
> > > > >switch (b) {
> > > > >...
> > > > >default:
> > > > >unknown_op:
> > > > >gen_unknown_opcode(env, s);
> > > > >return;
> > > > >}
> > > > >
> > > > > In three cases inside this switch, we were then also checking for
> > > > >  "if (b1 >= 2) { goto unknown_op; }".
> > > > > However, this can never happen, because the 'case' values in each 
> > > > > place
> > > > > are 0x0nn or 0x1nn and the switch will have directed the b1 == (2, 3)
> > > > > cases to the default already.
> > > > >
> > > > > This check was added in commit c045af25a52e9 in 2010; the added code
> > > > > was unnecessary then as well, and was apparently intended only to
> > > > > ensure that we never accidentally ended up indexing off the end
> > > > > of an sse_op_table with only 2 entries as a result of future bugs
> > > > > in the decode logic.
> > > > >
> > > > > Change the checks to assert() instead, and make sure they're always
> > > > > immediately before the array access they are protecting.
> > > > >
> > > > > Fixes: Coverity CID 1460207
> > > > > Signed-off-by: Peter Maydell 
> > > > > ---
> > > > > v1->v2: use assert() rather than just deleting the if()s
> > > > >
> > > > >  target/i386/tcg/translate.c | 12 +++-
> > > > >  1 file changed, 3 insertions(+), 9 deletions(-)
> > > > >
> > > > > diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
> > > > > index aacb605eee4..a4fee5e445d 100644
> > > > > --- a/target/i386/tcg/translate.c
> > > > > +++ b/target/i386/tcg/translate.c
> > > > > @@ -3521,9 +3521,6 @@ static void gen_sse(CPUX86State *env, 
> > > > > DisasContext *s, int b,
> > > > >  case 0x171: /* shift xmm, im */
> > > > >  case 0x172:
> > > > >  case 0x173:
> > > > > -if (b1 >= 2) {
> > > > > -goto unknown_op;
> > > > > -}
> > > > >  val = x86_ldub_code(env, s);
> > > > >  if (is_xmm) {
> > > > >  tcg_gen_movi_tl(s->T0, val);
> > > > > @@ -3542,6 +3539,7 @@ static void gen_sse(CPUX86State *env, 
> > > > > DisasContext *s, int b,
> > > > >  offsetof(CPUX86State, 
> > > > > mmx_t0.MMX_L(1)));
> > > > >  op1_offset = offsetof(CPUX86State,mmx_t0);
> > > > >  }
> > > > > +assert(b1 < 2);
> > > > >  sse_fn_epp = sse_op_table2[((b - 1) & 3) * 8 +
> > > > > (((modrm >> 3)) & 7)][b1];
> > > > >  if (!sse_fn_epp) {
> > > > > @@ -3772,10 +3770,8 @@ static void gen_sse(CPUX86State *env, 
> > > > > DisasContext *s, int b,
> > > > >  rm = modrm & 7;
> > > > >  reg = ((modrm >> 3) & 7) | REX_R(s);
> > > > >  mod = (modrm >> 6) & 3;
> > > > > -if (b1 >= 2) {
> > > > > -goto unknown_op;
> > > > > -}
> > > > >
> > > > > +assert(b1 < 2);
> > > > >  sse_fn_epp = sse_op_table6[b].op[b1];
> > > > >  if (!sse_fn_epp) {
> > > > >  goto unknown_op;
> > > > > @@ -4202,10 +4198,8 @@ static void gen_sse(CPUX86State *env, 
> > > > > DisasContext *s, int b,
> > > > >  rm = modrm & 7;
> > > > >  reg = ((modrm >> 3) & 7) | REX_R(s);
> > > > >  mod = (modrm >> 6) & 3;
> > > > > -if (b1 >= 2) {
> > > > > -goto unknown_op;
> > > > > -}
> > > > >
> > > > > +assert(b1 < 2);
> > > > >  sse_fn_eppi = sse_op_table7[b].op[b1];
> > > > >  if (!sse_fn_eppi) {
> > > > >  goto unknown_op;
> > > > > --
> > > > > 2.20.1

Re: [PATCH v2 for-7.0] scripts: Explain the difference between linux-headers and standard-headers

2021-12-09 Thread Thomas Huth


On 09/12/2021 20.45, Peter Maydell wrote:

If you don't know it, it's hard to figure out the difference between
the linux-headers folder and the include/standard-headers folder.
So let's add a short explanation to clarify the difference.

Suggested-by: Thomas Huth 
Signed-off-by: Peter Maydell 
---
v1 of this was from Thomas; I suggested some expanded wording
and since that made the patch pretty much entirely my text
Thomas suggested I send this under my name.
---
  scripts/update-linux-headers.sh | 16 
  1 file changed, 16 insertions(+)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index fea4d6eb655..d23851e1d3b 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -9,6 +9,22 @@
  #
  # This work is licensed under the terms of the GNU GPL version 2.
  # See the COPYING file in the top-level directory.
+#
+# The script will copy the headers into two target folders:
+#
+# - linux-headers/ for files that are required for compiling on a
+#   Linux host.  Generally we have these so we can use kernel structs
+#   and defines that are more recent than the headers that might be
+#   in /usr/include/linux on the host system.  Usually this script
+#   can do simple file copies for these headers.
+#
+# - include/standard-headers/ for files that are used for guest
+#   device emulation and are required on all hosts.  For instance, we
+#   get our definitions of the virtio structures from the Linux
+#   kernel headers, but we need those definitions regardless of which
+#   host OS we are building on.  This script has to be careful to
+#   sanitize the headers to remove any use of Linux-specifics such as
+#   types like "__u64".  This work is done in the cp_portable function.


Thanks!

Reviewed-by: Thomas Huth

[PATCH v2 for-7.0] scripts: Explain the difference between linux-headers and standard-headers

2021-12-09 Thread Peter Maydell

If you don't know it, it's hard to figure out the difference between
the linux-headers folder and the include/standard-headers folder.
So let's add a short explanation to clarify the difference.

Suggested-by: Thomas Huth 
Signed-off-by: Peter Maydell 
---
v1 of this was from Thomas; I suggested some expanded wording
and since that made the patch pretty much entirely my text
Thomas suggested I send this under my name.
---
 scripts/update-linux-headers.sh | 16 
 1 file changed, 16 insertions(+)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index fea4d6eb655..d23851e1d3b 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -9,6 +9,22 @@
 #
 # This work is licensed under the terms of the GNU GPL version 2.
 # See the COPYING file in the top-level directory.
+#
+# The script will copy the headers into two target folders:
+#
+# - linux-headers/ for files that are required for compiling on a
+#   Linux host.  Generally we have these so we can use kernel structs
+#   and defines that are more recent than the headers that might be
+#   in /usr/include/linux on the host system.  Usually this script
+#   can do simple file copies for these headers.
+#
+# - include/standard-headers/ for files that are used for guest
+#   device emulation and are required on all hosts.  For instance, we
+#   get our definitions of the virtio structures from the Linux
+#   kernel headers, but we need those definitions regardless of which
+#   host OS we are building on.  This script has to be careful to
+#   sanitize the headers to remove any use of Linux-specifics such as
+#   types like "__u64".  This work is done in the cp_portable function.
 
 tmpdir=$(mktemp -d)
 linux="$1"
-- 
2.25.1

Re: [PATCH] target/ppc: powerpc_excp: Guard ALIGNMENT interrupt with CONFIG_TCG

2021-12-09 Thread Fabiano Rosas

Fabiano Rosas  writes:

> Cédric Le Goater  writes:
>
>> Richard,
>>
>> On 12/9/21 16:05, Fabiano Rosas wrote:
>>> Cédric Le Goater  writes:
>>> 
 On 12/9/21 00:06, Fabiano Rosas wrote:
> We cannot have TCG code in powerpc_excp because the function is called
> from kvm-only code via ppc_cpu_do_interrupt:
>
>../target/ppc/excp_helper.c:463:29: error: implicit declaration of
>function ‘cpu_ldl_code’ [-Werror=implicit-function-declaration]
>
> Fortunately, the Alignment interrupt is not among the ones dispatched
> from kvm-only code, so we can keep it out of the disable-tcg build for
> now.
>
> Fixes: 336e91f853 ("target/ppc: Move SPR_DSISR setting to powerpc_excp")
> Signed-off-by: Fabiano Rosas 
>
> ---
>
> Perhaps we could make powerpc_excp TCG only and have a separate
> function that only knows the two interrupts that we use with KVM
> (Program, Machine check). But for now this fix will do, I think.
> ---
>target/ppc/excp_helper.c | 2 ++
>1 file changed, 2 insertions(+)
>
> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
> index 17607adbe4..dcf22440cc 100644
> --- a/target/ppc/excp_helper.c
> +++ b/target/ppc/excp_helper.c
> @@ -453,6 +453,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
> excp_model, int excp)
>}
>break;
>}
> +#ifdef CONFIG_TCG
>case POWERPC_EXCP_ALIGN: /* Alignment exception
>   */
>/*
> * Get rS/rD and rA from faulting opcode.
> @@ -464,6 +465,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
> excp_model, int excp)
>env->spr[SPR_DSISR] |= (insn & 0x03FF) >> 16;
>}
>break;
> +#endif
>case POWERPC_EXCP_PROGRAM:   /* Program exception  
>   */
>switch (env->error_code & ~0xF) {
>case POWERPC_EXCP_FP:
>

 Shouldn't we move that code under ppc_cpu_do_unaligned_access ?
>>> 
>>> Well, it came from there initially. We could revert 336e91f853 and that
>>> would fix the issue as well.
>>
>> What would you prefer ?
>
> Well none of this interfere with the work I'm doing, so it really makes
> no difference. I guess reverting the patch is cleaner than having an
> ifdef loose in the middle of the code. I'll send a v2 with the revert.
>

Ah I missed that you were talking to Richard! That first line got kind of
hidden.

I already sent a v2, but as I said, I have no preference either
way. Let's hear from Richard.

Sorry for the confusion =)

>>
>> Thanks,
>>
>> C.

Re: Redesign of QEMU startup & initial configuration

2021-12-09 Thread Daniel P . Berrangé

On Thu, Dec 02, 2021 at 07:57:38AM +0100, Markus Armbruster wrote:
> = Motivation =
> 
> QEMU startup and initial configuration were designed many years ago for
> a much, much simpler QEMU.  They have since changed beyond recognition
> to adapt to new needs.  There was no real redesign.  Adaption to new
> needs has become more and more difficult.  A recent example for
> "difficult" is Damien's "[RFC PATCH v3 0/5] QMP support for
> cold-plugging devices".
> 
> I think it's time for a redesign.
> 
> 
> = What users want for initial configuration =
> 
> 1. QMP only
> 
>Management applications need to use QMP for monitoring anyway.  They
>may want to use it for initial configuration, too.  Libvirt does.

Essentially, as soon as you need to deal with hotplug, you need QMP/QAPI.
As soon as you need QMP/QAPI, it is horrible to also need to use something
that isn't JSON for the coldplug configuration approach, as you've doubled
the number of things to write + test.

>They still need to bootstrap a QMP monitor, and for that, CLI is fine
>as long as it's simple and stable.

Since you mentioned 'simple', allow me to go down a slight
tangent...

Incidentally, I often wish we didn't use -chardevs anywhere near
as much as we do. Chardev was OK when we used it for backing
simple guest devices that just had a single host endpoint that
was considered permanently connected.

Everywhere we've used it for things that really want socket
semantics we've created so much pain. The things we've done
with chardevs for vhostuser in particular horrify me. I'm
glad the block layer resisted the temptation and just
directly used the SocketAddress QAPI type with QIOChannel
objects.

I would say the monitor would be better without chardevs
too. Having to create multiple monitor instances so that
you can have multiple clients is insane. The complexity
of course is the need for 'mux' with HMP. If it were not
for that, we could use QIOChannel and SocketAddress for
the monitor code.

Maybe if we one day get HMP fully separated from QMP such
that it is independent code, we can simplify internally.
Meanwhile, I think we should consider the QMP CLI at least
to only use SocketAddress for config, and secretly turn
it into a chardev internally. 

> 2. CLI and configuration files
> 
>Human users want a CLI and configuration files.
> 
>CLI is good for quick tweaks, and to explore.
> 
>For more permanent, non-trivial configuration, configuration files
>are more suitable, because they are easier to read, edit, and
>document than long command lines.

And you can start doing things like "includes" or templating
to make the config more manageable / scalable.

It is also nice to not have human and machines in completely
separate worlds. Humans will often look at what machines do and
then try to replicate that. eg people often ask to see the libvirt
QEMU config and then want to run that directly themselves, or use
libvirt CLI passthrough to add on features that libvirt does not
natively support yet.

As libvirt makes more & more use of QAPI, we are increasing the
divide between what machines and humans target.

I'd note that Kubernetes doesn't bother with a human CLI at
all, and just expects everyone/everything to use JSON/YAML
config files. So there's no divide between what syntax humans
and machines use - humans just send the config via the CLI
tool which uploads it to the REST API that machines use
directly. 

> = What we have for initial configuration =
> 
> Half of 1. and half of 2., satisfying nobody's needs.
> 
> Management applications need to do a lot of non-trivial initial
> configuration with the CLI.
> 
> Human users struggle with inconsistent syntax, insufficiently expressive
> configuration files, and huge command lines.

The QEMU CLI was nice because you could historically do simple
stuff really simply eg

  qemu-system-x86_64 mydisk.img

or

  qemu-system-x86_64 -hda mydisk.img -cdrom mydistro.iso

The challenge is how CLI usage evolves as you need to finesse
the config you're using. Using this simple CLI approach, our
defaults still largely give you a VM from 1995. If people
want a modern guest setup, the simple syntax quickly stops
being simple.

You very easily get to a point where passing stuff on the
CLI gets out of control. IMHO if the CLI is over 100 characters
long, a config file is the way to go. Given that I really
wonder whether direct configuration of hardware on the CLI
is worthwhile at all, and users should instead /always/ be
using a config file, albeit indirectly.

This doesn't mean simple things become harder.

I'm thinking of a config file that supports a standard
template language supporting variable substitution,
loops, conditionals. The CLI then does not need to
represent anything related to hardware config schemas
at all. Instead it just needs to take the name of a
config file and ability to set variables.  We could
then ship  some standard configs for the simple cases,
which

Re: [PATCH] audio: Add sndio backend

2021-12-09 Thread Volker Rümelin


Hi Brad,

I tested the sndio backend on my Linux system and I found a bug in the 
sndio backend. The problem is that the function audio_run() can call the 
function sndio_enable_out() to disable audio playback.


In the sndio_poll_event() function, audio_run() is called, which removes 
the poll handlers when the playback stream is stopped by calling 
sndio_enable_out(). Next, sndio_poll_wait() is called in 
sndio_poll_event(), which can reinstall the poll handlers of the stopped 
stream again. After a subsequent call to sndio_fini_out(), the pindex 
pointer of the still installed poll handlers points to a memory area 
that has already been freed. This can lead to a segmentation fault or a 
QEMU lockup because of a blocking read.


I suggest using a flag to prevent sndio_poll_event() from reinstalling 
the poll handlers.



+typedef struct SndioVoice {
+union {
+HWVoiceOut out;
+HWVoiceIn in;
+} hw;
+struct sio_par par;
+struct sio_hdl *hdl;
+struct pollfd *pfds;
+struct pollindex {
+struct SndioVoice *self;
+int index;
+} *pindexes;
+unsigned char *buf;
+size_t buf_size;
+size_t sndio_pos;
+size_t qemu_pos;
+unsigned int mode;
+unsigned int nfds;


+    bool enabled;


+} SndioVoice;



+/*
+ * call-back called when one of the descriptors
+ * became readable or writable
+ */
+static void sndio_poll_event(SndioVoice *self, int index, int event)
+{
+int revents;
+
+/*
+ * ensure we're not called twice this cycle
+ */
+sndio_poll_clear(self);
+
+/*
+ * make self->pfds[] look as we're returning from poll syscal,
+ * this is how sio_revents expects events to be.
+ */
+self->pfds[index].revents = event;
+
+/*
+ * tell sndio to handle events and return whether we can read or
+ * write without blocking.
+ */
+revents = sio_revents(self->hdl, self->pfds);
+if (self->mode == SIO_PLAY) {
+if (revents & POLLOUT) {
+sndio_write(self);
+}
+
+if (self->qemu_pos < self->buf_size) {
+audio_run(self->hw.out.s, "sndio_out");
+}
+} else {
+if (revents & POLLIN) {
+sndio_read(self);
+}
+
+if (self->qemu_pos < self->sndio_pos) {
+audio_run(self->hw.in.s, "sndio_in");
+}
+}
+
+sndio_poll_wait(self);


-    sndio_poll_wait(self);
+    if (self->enabled) {
+    sndio_poll_wait(self);
+    }


+}



+/*
+ * return a buffer where data to play can be stored
+ */
+static size_t sndio_put_buffer_out(HWVoiceOut *hw, void *buf, size_t size)
+{
+SndioVoice *self = (SndioVoice *) hw;
+
+self->qemu_pos += size;
+sndio_poll_wait(self);
+return size;
+}



+/*
+ * discard the given amount of recorded data
+ */
+static void sndio_put_buffer_in(HWVoiceIn *hw, void *buf, size_t size)
+{
+SndioVoice *self = (SndioVoice *) hw;
+
+self->qemu_pos += size;
+if (self->qemu_pos == self->buf_size) {
+self->qemu_pos = 0;
+self->sndio_pos = 0;
+}
+sndio_poll_wait(self);
+}


It's not necessary to guard sndio_poll_wait() in sndio_put_buffer_out() 
and sndio_put_buffer_in() because audio_run() will never call those 
functions for a disabled stream.



+static void sndio_enable(SndioVoice *self, bool enable)
+{
+if (enable) {
+sio_start(self->hdl);


+    self->enabled = true;


+sndio_poll_wait(self);
+} else {


+    self->enabled = false;


+sndio_poll_clear(self);
+sio_stop(self->hdl);
+}
+}


With best regards,
Volker

Re: [PATCH] scripts: Explain the difference between linux-headers and standard-headers

2021-12-09 Thread Thomas Huth


On 09/12/2021 18.44, Peter Maydell wrote:

On Thu, 9 Dec 2021 at 17:34, Thomas Huth  wrote:


If you don't know it, it's hard to figure out the difference between
the linux-headers folder and the include/standard-headers folder.
So let's add a short explanation to clarify the difference.

Signed-off-by: Thomas Huth 
---
  scripts/update-linux-headers.sh | 7 +++
  1 file changed, 7 insertions(+)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index fea4d6eb65..96c7daef94 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -9,6 +9,13 @@
  #
  # This work is licensed under the terms of the GNU GPL version 2.
  # See the COPYING file in the top-level directory.
+#
+# The script will copy the headers into two target folder:


"folders"


+#
+# - linux-headers/ for file that are required for compiling on a Linux host


"files"


+#
+# - include/standard-headers/ for files that are used for guest device 
emulation
+#



We could expand on this a little if you like, eg:

- linux-headers/ for files that are required for compiling on a Linux host.
   Generally we have these so we can use kernel structs and defines that
   are more recent than the headers that might be in /usr/include/linux
   on the host system. Usually this script can do simple file copies
   for these headers.

- include/standard-headers/ for files that are used for guest device emulation
   and are required on all hosts. For instance, we get our definitions of the
   virtio structures from the Linux kernel headers, but we need those
   definitions regardless of which host OS we are building on. This script
   has to be careful to sanitize the headers to remove any use of 
Linux-specifics
   such as types like "__u64". This work is done in the cp_portable function.


Sounds like a very good idea! Could you please send this as a patch (since 
this wouldn't be my own words anymore)?


 Thomas

[PATCH v10 07/10] ACPI ERST: create ACPI ERST table for pc/x86 machines

2021-12-09 Thread Eric DeVolder

This change exposes ACPI ERST support for x86 guests.

Signed-off-by: Eric DeVolder 
Reviewed-by: Ani Sinha 
---
 hw/i386/acpi-build.c   | 15 +++
 hw/i386/acpi-microvm.c | 15 +++
 include/hw/acpi/erst.h |  5 +
 3 files changed, 35 insertions(+)

diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index a99c6e4..55bca28 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -43,6 +43,7 @@
 #include "sysemu/tpm.h"
 #include "hw/acpi/tpm.h"
 #include "hw/acpi/vmgenid.h"
+#include "hw/acpi/erst.h"
 #include "sysemu/tpm_backend.h"
 #include "hw/rtc/mc146818rtc_regs.h"
 #include "migration/vmstate.h"
@@ -74,6 +75,8 @@
 #include "hw/acpi/hmat.h"
 #include "hw/acpi/viot.h"
 
+#include CONFIG_DEVICES
+
 /* These are used to size the ACPI tables for -M pc-i440fx-1.7 and
  * -M pc-i440fx-2.0.  Even if the actual amount of AML generated grows
  * a little bit, there should be plenty of free space since the DSDT
@@ -2566,6 +2569,18 @@ void acpi_build(AcpiBuildTables *tables, MachineState 
*machine)
 ACPI_DEVICE_IF(x86ms->acpi_dev), x86ms->oem_id,
 x86ms->oem_table_id);
 
+#ifdef CONFIG_ACPI_ERST
+{
+Object *erst_dev;
+erst_dev = find_erst_dev();
+if (erst_dev) {
+acpi_add_table(table_offsets, tables_blob);
+build_erst(tables_blob, tables->linker, erst_dev,
+   x86ms->oem_id, x86ms->oem_table_id);
+}
+}
+#endif
+
 vmgenid_dev = find_vmgenid_dev();
 if (vmgenid_dev) {
 acpi_add_table(table_offsets, tables_blob);
diff --git a/hw/i386/acpi-microvm.c b/hw/i386/acpi-microvm.c
index 196d318..68ca7e7 100644
--- a/hw/i386/acpi-microvm.c
+++ b/hw/i386/acpi-microvm.c
@@ -30,6 +30,7 @@
 #include "hw/acpi/bios-linker-loader.h"
 #include "hw/acpi/generic_event_device.h"
 #include "hw/acpi/utils.h"
+#include "hw/acpi/erst.h"
 #include "hw/i386/fw_cfg.h"
 #include "hw/i386/microvm.h"
 #include "hw/pci/pci.h"
@@ -40,6 +41,8 @@
 #include "acpi-common.h"
 #include "acpi-microvm.h"
 
+#include CONFIG_DEVICES
+
 static void acpi_dsdt_add_virtio(Aml *scope,
  MicrovmMachineState *mms)
 {
@@ -207,6 +210,18 @@ static void acpi_build_microvm(AcpiBuildTables *tables,
 ACPI_DEVICE_IF(x86ms->acpi_dev), x86ms->oem_id,
 x86ms->oem_table_id);
 
+#ifdef CONFIG_ACPI_ERST
+{
+Object *erst_dev;
+erst_dev = find_erst_dev();
+if (erst_dev) {
+acpi_add_table(table_offsets, tables_blob);
+build_erst(tables_blob, tables->linker, erst_dev,
+   x86ms->oem_id, x86ms->oem_table_id);
+}
+}
+#endif
+
 xsdt = tables_blob->len;
 build_xsdt(tables_blob, tables->linker, table_offsets, x86ms->oem_id,
x86ms->oem_table_id);
diff --git a/include/hw/acpi/erst.h b/include/hw/acpi/erst.h
index 9d63717..b747fe7 100644
--- a/include/hw/acpi/erst.h
+++ b/include/hw/acpi/erst.h
@@ -16,4 +16,9 @@ void build_erst(GArray *table_data, BIOSLinker *linker, 
Object *erst_dev,
 
 #define TYPE_ACPI_ERST "acpi-erst"
 
+/* returns NULL unless there is exactly one device */
+static inline Object *find_erst_dev(void)
+{
+return object_resolve_path_type("", TYPE_ACPI_ERST, NULL);
+}
 #endif
-- 
1.8.3.1

[PATCH v10 10/10] ACPI ERST: step 6 of bios-tables-test.c

2021-12-09 Thread Eric DeVolder

Following the guidelines in tests/qtest/bios-tables-test.c, this
is step 6.

Below is the disassembly of tests/data/acpi/pc/ERST.acpierst.

 /*
  * Intel ACPI Component Architecture
  * AML/ASL+ Disassembler version 20180508 (64-bit version)
  * Copyright (c) 2000 - 2018 Intel Corporation
  *
  * Disassembly of tests/data/acpi/pc/ERST.acpierst, Thu Dec  2 13:32:07 2021
  *
  * ACPI Data Table [ERST]
  *
  * Format: [HexOffset DecimalOffset ByteLength]  FieldName : FieldValue
  */

 [000h    4]Signature : "ERST"[Error Record 
Serialization Table]
 [004h 0004   4] Table Length : 0390
 [008h 0008   1] Revision : 01
 [009h 0009   1] Checksum : D6
 [00Ah 0010   6]   Oem ID : "BOCHS "
 [010h 0016   8] Oem Table ID : "BXPC"
 [018h 0024   4] Oem Revision : 0001
 [01Ch 0028   4]  Asl Compiler ID : "BXPC"
 [020h 0032   4]Asl Compiler Revision : 0001

 [024h 0036   4]  Serialization Header Length : 0030
 [028h 0040   4] Reserved : 
 [02Ch 0044   4]  Instruction Entry Count : 001B

 [030h 0048   1]   Action : 00 [Begin Write Operation]
 [031h 0049   1]  Instruction : 03 [Write Register Value]
 [032h 0050   1]Flags (decoded below) : 00
   Preserve Register Bits : 0
 [033h 0051   1] Reserved : 00

 [034h 0052  12]  Register Region : [Generic Address Structure]
 [034h 0052   1] Space ID : 00 [SystemMemory]
 [035h 0053   1]Bit Width : 20
 [036h 0054   1]   Bit Offset : 00
 [037h 0055   1] Encoded Access Width : 03 [DWord Access:32]
 [038h 0056   8]  Address : FEBF3000

 [040h 0064   8]Value : 
 [048h 0072   8] Mask : 00FF

 [050h 0080   1]   Action : 01 [Begin Read Operation]
 [051h 0081   1]  Instruction : 03 [Write Register Value]
 [052h 0082   1]Flags (decoded below) : 00
   Preserve Register Bits : 0
 [053h 0083   1] Reserved : 00

 [054h 0084  12]  Register Region : [Generic Address Structure]
 [054h 0084   1] Space ID : 00 [SystemMemory]
 [055h 0085   1]Bit Width : 20
 [056h 0086   1]   Bit Offset : 00
 [057h 0087   1] Encoded Access Width : 03 [DWord Access:32]
 [058h 0088   8]  Address : FEBF3000

 [060h 0096   8]Value : 0001
 [068h 0104   8] Mask : 00FF

 [070h 0112   1]   Action : 02 [Begin Clear Operation]
 [071h 0113   1]  Instruction : 03 [Write Register Value]
 [072h 0114   1]Flags (decoded below) : 00
   Preserve Register Bits : 0
 [073h 0115   1] Reserved : 00

 [074h 0116  12]  Register Region : [Generic Address Structure]
 [074h 0116   1] Space ID : 00 [SystemMemory]
 [075h 0117   1]Bit Width : 20
 [076h 0118   1]   Bit Offset : 00
 [077h 0119   1] Encoded Access Width : 03 [DWord Access:32]
 [078h 0120   8]  Address : FEBF3000

 [080h 0128   8]Value : 0002
 [088h 0136   8] Mask : 00FF

 [090h 0144   1]   Action : 03 [End Operation]
 [091h 0145   1]  Instruction : 03 [Write Register Value]
 [092h 0146   1]Flags (decoded below) : 00
   Preserve Register Bits : 0
 [093h 0147   1] Reserved : 00

 [094h 0148  12]  Register Region : [Generic Address Structure]
 [094h 0148   1] Space ID : 00 [SystemMemory]
 [095h 0149   1]Bit Width : 20
 [096h 0150   1]   Bit Offset : 00
 [097h 0151   1] Encoded Access Width : 03 [DWord Access:32]
 [098h 0152   8]  Address : FEBF3000

 [0A0h 0160   8]Value : 0003
 [0A8h 0168   8] Mask : 00FF

 [0B0h 0176   1]   Action : 04 [Set Record Offset]
 [0B1h 0177   1]  Instruction : 02 [Write Register]
 [0B2h 0178   1]Flags (decoded below) : 00
   Preserve Register Bits : 0
 [0B3h 0179   1] Reserved : 00

 [0B4h 0180  12]  Register Region : [Generic Address Structure]
 [0B4h 0180   1] Space ID : 00 [SystemMemory]
 [0B5h 0181   1]Bit Width : 20
 [0B6h 0182   1]   Bit Offset : 00
 [0B7h 0183   1]

[PATCH v10 06/10] ACPI ERST: build the ACPI ERST table

2021-12-09 Thread Eric DeVolder

This builds the ACPI ERST table to inform OSPM how to communicate
with the acpi-erst device.

Signed-off-by: Eric DeVolder 
---
 hw/acpi/erst.c | 241 +
 1 file changed, 241 insertions(+)

diff --git a/hw/acpi/erst.c b/hw/acpi/erst.c
index 81f5435..753425a 100644
--- a/hw/acpi/erst.c
+++ b/hw/acpi/erst.c
@@ -711,6 +711,247 @@ static const MemoryRegionOps erst_reg_ops = {
 .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
+
+/***/
+/***/
+
+/* ACPI 4.0: Table 17-19 Serialization Instructions */
+#define INST_READ_REGISTER 0x00
+#define INST_READ_REGISTER_VALUE   0x01
+#define INST_WRITE_REGISTER0x02
+#define INST_WRITE_REGISTER_VALUE  0x03
+#define INST_NOOP  0x04
+#define INST_LOAD_VAR1 0x05
+#define INST_LOAD_VAR2 0x06
+#define INST_STORE_VAR10x07
+#define INST_ADD   0x08
+#define INST_SUBTRACT  0x09
+#define INST_ADD_VALUE 0x0A
+#define INST_SUBTRACT_VALUE0x0B
+#define INST_STALL 0x0C
+#define INST_STALL_WHILE_TRUE  0x0D
+#define INST_SKIP_NEXT_INSTRUCTION_IF_TRUE 0x0E
+#define INST_GOTO  0x0F
+#define INST_SET_SRC_ADDRESS_BASE  0x10
+#define INST_SET_DST_ADDRESS_BASE  0x11
+#define INST_MOVE_DATA 0x12
+
+/* ACPI 4.0: 17.4.1.2 Serialization Instruction Entries */
+static void build_serialization_instruction_entry(GArray *table_data,
+uint8_t serialization_action,
+uint8_t instruction,
+uint8_t flags,
+uint8_t register_bit_width,
+uint64_t register_address,
+uint64_t value,
+uint64_t mask)
+{
+/* ACPI 4.0: Table 17-18 Serialization Instruction Entry */
+struct AcpiGenericAddress gas;
+
+/* Serialization Action */
+build_append_int_noprefix(table_data, serialization_action, 1);
+/* Instruction */
+build_append_int_noprefix(table_data, instruction , 1);
+/* Flags */
+build_append_int_noprefix(table_data, flags   , 1);
+/* Reserved */
+build_append_int_noprefix(table_data, 0   , 1);
+/* Register Region */
+gas.space_id = AML_SYSTEM_MEMORY;
+gas.bit_width = register_bit_width;
+gas.bit_offset = 0;
+switch (register_bit_width) {
+case 8:
+gas.access_width = 1;
+break;
+case 16:
+gas.access_width = 2;
+break;
+case 32:
+gas.access_width = 3;
+break;
+case 64:
+gas.access_width = 4;
+break;
+default:
+gas.access_width = 0;
+break;
+}
+gas.address = register_address;
+build_append_gas_from_struct(table_data, );
+/* Value */
+build_append_int_noprefix(table_data, value  , 8);
+/* Mask */
+build_append_int_noprefix(table_data, mask   , 8);
+}
+
+/* ACPI 4.0: 17.4.1 Serialization Action Table */
+void build_erst(GArray *table_data, BIOSLinker *linker, Object *erst_dev,
+const char *oem_id, const char *oem_table_id)
+{
+GArray *table_instruction_data;
+unsigned action;
+pcibus_t bar0, bar1;
+AcpiTable table = { .sig = "ERST", .rev = 1, .oem_id = oem_id,
+.oem_table_id = oem_table_id };
+
+bar0 = (pcibus_t)pci_get_bar_addr(PCI_DEVICE(erst_dev), 0);
+trace_acpi_erst_pci_bar_0(bar0);
+bar1 = (pcibus_t)pci_get_bar_addr(PCI_DEVICE(erst_dev), 1);
+trace_acpi_erst_pci_bar_1(bar1);
+
+#define MASK8  0x00FFUL
+#define MASK16 0xUL
+#define MASK32 0xUL
+#define MASK64 0xUL
+
+/*
+ * Serialization Action Table
+ * The serialization action table must be generated first
+ * so that its size can be known in order to populate the
+ * Instruction Entry Count field.
+ */
+table_instruction_data = g_array_new(FALSE, FALSE, sizeof(char));
+
+/* Serialization Instruction Entries */
+action = ACTION_BEGIN_WRITE_OPERATION;
+build_serialization_instruction_entry(table_instruction_data,
+action, INST_WRITE_REGISTER_VALUE, 0, 32,
+bar0 + ERST_ACTION_OFFSET, action, MASK8);
+
+action = ACTION_BEGIN_READ_OPERATION;
+build_serialization_instruction_entry(table_instruction_data,
+action, INST_WRITE_REGISTER_VALUE, 0, 32,
+bar0 + ERST_ACTION_OFFSET, action, MASK8);
+
+action = ACTION_BEGIN_CLEAR_OPERATION;
+build_serialization_instruction_entry(table_instruction_data,
+action, INST_WRITE_REGISTER_VALUE, 0, 32,
+bar0 + ERST_ACTION_OFFSET, action, MASK8);
+
+action = ACTION_END_OPERATION;
+build_serialization_instruction_entry(table_instruction_data,
+action, INST_WRITE_REGISTER_VALUE, 0,

[PATCH v10 02/10] ACPI ERST: specification for ERST support

2021-12-09 Thread Eric DeVolder

Information on the implementation of the ACPI ERST support.

Signed-off-by: Eric DeVolder 
Acked-by: Ani Sinha 
---
 docs/specs/acpi_erst.rst | 200 +++
 1 file changed, 200 insertions(+)
 create mode 100644 docs/specs/acpi_erst.rst

diff --git a/docs/specs/acpi_erst.rst b/docs/specs/acpi_erst.rst
new file mode 100644
index 000..a8a9d22
--- /dev/null
+++ b/docs/specs/acpi_erst.rst
@@ -0,0 +1,200 @@
+ACPI ERST DEVICE
+
+
+The ACPI ERST device is utilized to support the ACPI Error Record
+Serialization Table, ERST, functionality. This feature is designed for
+storing error records in persistent storage for future reference
+and/or debugging.
+
+The ACPI specification[1], in Chapter "ACPI Platform Error Interfaces
+(APEI)", and specifically subsection "Error Serialization", outlines a
+method for storing error records into persistent storage.
+
+The format of error records is described in the UEFI specification[2],
+in Appendix N "Common Platform Error Record".
+
+While the ACPI specification allows for an NVRAM "mode" (see
+GET_ERROR_LOG_ADDRESS_RANGE_ATTRIBUTES) where non-volatile RAM is
+directly exposed for direct access by the OS/guest, this device
+implements the non-NVRAM "mode". This non-NVRAM "mode" is what is
+implemented by most BIOS (since flash memory requires programming
+operations in order to update its contents). Furthermore, as of the
+time of this writing, Linux only supports the non-NVRAM "mode".
+
+
+Background/Motivation
+-
+
+Linux uses the persistent storage filesystem, pstore, to record
+information (e.g. dmesg tail) upon panics and shutdowns.  Pstore is
+independent of, and runs before, kdump.  In certain scenarios (i.e.
+hosts/guests with root filesystems on NFS/iSCSI where networking
+software and/or hardware fails, and thus kdump fails), pstore may
+contain information available for post-mortem debugging.
+
+Two common storage backends for the pstore filesystem are ACPI ERST
+and UEFI. Most BIOS implement ACPI ERST. UEFI is not utilized in all
+guests. With QEMU supporting ACPI ERST, it becomes a viable pstore
+storage backend for virtual machines (as it is now for bare metal
+machines).
+
+Enabling support for ACPI ERST facilitates a consistent method to
+capture kernel panic information in a wide range of guests: from
+resource-constrained microvms to very large guests, and in particular,
+in direct-boot environments (which would lack UEFI run-time services).
+
+Note that Microsoft Windows also utilizes the ACPI ERST for certain
+crash information, if available[3].
+
+
+Configuration|Usage
+---
+
+To use ACPI ERST, a memory-backend-file object and acpi-erst device
+can be created, for example:
+
+ qemu ...
+ -object 
memory-backend-file,id=erstnvram,mem-path=acpi-erst.backing,size=0x1,share=on
 \
+ -device acpi-erst,memdev=erstnvram
+
+For proper operation, the ACPI ERST device needs a memory-backend-file
+object with the following parameters:
+
+ - id: The id of the memory-backend-file object is used to associate
+   this memory with the acpi-erst device.
+ - size: The size of the ACPI ERST backing storage. This parameter is
+   required.
+ - mem-path: The location of the ACPI ERST backing storage file. This
+   parameter is also required.
+ - share: The share=on parameter is required so that updates to the
+   ERST backing store are written to the file.
+
+and ERST device:
+
+ - memdev: Is the object id of the memory-backend-file.
+ - record_size: Specifies the size of the records (or slots) in the
+   backend storage. Must be a power of two value greater than or
+   equal to 4096 (PAGE_SIZE).
+
+
+PCI Interface
+-
+
+The ERST device is a PCI device with two BARs, one for accessing the
+programming registers, and the other for accessing the record exchange
+buffer.
+
+BAR0 contains the programming interface consisting of ACTION and VALUE
+64-bit registers.  All ERST actions/operations/side effects happen on
+the write to the ACTION, by design. Any data needed by the action must
+be placed into VALUE prior to writing ACTION.  Reading the VALUE
+simply returns the register contents, which can be updated by a
+previous ACTION.
+
+BAR1 contains the 8KiB record exchange buffer, which is the
+implemented maximum record size.
+
+
+Backend Storage Format
+--
+
+The backend storage is divided into fixed size "slots", 8KiB in
+length, with each slot storing a single record.  Not all slots need to
+be occupied, and they need not be occupied in a contiguous fashion.
+The ability to clear/erase specific records allows for the formation
+of unoccupied slots.
+
+Slot 0 contains a backend storage header that identifies the contents
+as ERST and also facilitates efficient access to the records.
+Depending upon the size of the backend storage, additional slots will
+be designated to be a part of the slot 0 header. For example, at 8KiB,
+the slot 0 header can

[PATCH v10 09/10] ACPI ERST: bios-tables-test testcase

2021-12-09 Thread Eric DeVolder

This change implements the test suite checks for the ERST table.

Signed-off-by: Eric DeVolder 
Reviewed-by: Ani Sinha 
---
 tests/qtest/bios-tables-test.c | 56 ++
 1 file changed, 56 insertions(+)

diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c
index 2588741..2f073e6 100644
--- a/tests/qtest/bios-tables-test.c
+++ b/tests/qtest/bios-tables-test.c
@@ -1446,6 +1446,57 @@ static void test_acpi_piix4_tcg_acpi_hmat(void)
 test_acpi_tcg_acpi_hmat(MACHINE_PC);
 }
 
+static void test_acpi_erst(const char *machine)
+{
+gchar *tmp_path = g_dir_make_tmp("qemu-test-erst.XX", NULL);
+gchar *params;
+test_data data;
+
+memset(, 0, sizeof(data));
+data.machine = machine;
+data.variant = ".acpierst";
+params = g_strdup_printf(
+" -object memory-backend-file,id=erstnvram,"
+"mem-path=%s,size=0x1,share=on"
+" -device acpi-erst,memdev=erstnvram", tmp_path);
+test_acpi_one(params, );
+free_test_data();
+g_free(params);
+g_assert(g_rmdir(tmp_path) == 0);
+g_free(tmp_path);
+}
+
+static void test_acpi_piix4_acpi_erst(void)
+{
+test_acpi_erst(MACHINE_PC);
+}
+
+static void test_acpi_q35_acpi_erst(void)
+{
+test_acpi_erst(MACHINE_Q35);
+}
+
+static void test_acpi_microvm_acpi_erst(void)
+{
+gchar *tmp_path = g_dir_make_tmp("qemu-test-erst.XX", NULL);
+gchar *params;
+test_data data;
+
+test_acpi_microvm_prepare();
+data.variant = ".pcie";
+data.tcg_only = true; /* need constant host-phys-bits */
+params = g_strdup_printf(" -machine microvm,"
+"acpi=on,ioapic2=off,rtc=off,pcie=on"
+" -object memory-backend-file,id=erstnvram,"
+   "mem-path=%s,size=0x1,share=on"
+" -device acpi-erst,memdev=erstnvram", tmp_path);
+test_acpi_one(params, );
+g_free(params);
+g_assert(g_rmdir(tmp_path) == 0);
+g_free(tmp_path);
+free_test_data();
+}
+
 static void test_acpi_virt_tcg(void)
 {
 test_data data = {
@@ -1624,6 +1675,8 @@ int main(int argc, char *argv[])
 qtest_add_func("acpi/q35/dimmpxm", test_acpi_q35_tcg_dimm_pxm);
 qtest_add_func("acpi/piix4/acpihmat", test_acpi_piix4_tcg_acpi_hmat);
 qtest_add_func("acpi/q35/acpihmat", test_acpi_q35_tcg_acpi_hmat);
+qtest_add_func("acpi/piix4/acpierst", test_acpi_piix4_acpi_erst);
+qtest_add_func("acpi/q35/acpierst", test_acpi_q35_acpi_erst);
 qtest_add_func("acpi/microvm", test_acpi_microvm_tcg);
 qtest_add_func("acpi/microvm/usb", test_acpi_microvm_usb_tcg);
 qtest_add_func("acpi/microvm/rtc", test_acpi_microvm_rtc_tcg);
@@ -1639,6 +1692,9 @@ int main(int argc, char *argv[])
 qtest_add_func("acpi/q35/kvm/xapic", test_acpi_q35_kvm_xapic);
 qtest_add_func("acpi/q35/kvm/dmar", test_acpi_q35_kvm_dmar);
 }
+if (strcmp(arch, "x86_64") == 0) {
+qtest_add_func("acpi/microvm/acpierst", 
test_acpi_microvm_acpi_erst);
+}
 } else if (strcmp(arch, "aarch64") == 0) {
 if (has_tcg) {
 qtest_add_func("acpi/virt", test_acpi_virt_tcg);
-- 
1.8.3.1

[PATCH v10 04/10] ACPI ERST: header file for ERST

2021-12-09 Thread Eric DeVolder

This change introduces the public definitions for ACPI ERST.

Signed-off-by: Eric DeVolder 
Reviewed-by: Ani Sinha 
---
 include/hw/acpi/erst.h | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 include/hw/acpi/erst.h

diff --git a/include/hw/acpi/erst.h b/include/hw/acpi/erst.h
new file mode 100644
index 000..9d63717
--- /dev/null
+++ b/include/hw/acpi/erst.h
@@ -0,0 +1,19 @@
+/*
+ * ACPI Error Record Serialization Table, ERST, Implementation
+ *
+ * ACPI ERST introduced in ACPI 4.0, June 16, 2009.
+ * ACPI Platform Error Interfaces : Error Serialization
+ *
+ * Copyright (c) 2021 Oracle and/or its affiliates.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+#ifndef HW_ACPI_ERST_H
+#define HW_ACPI_ERST_H
+
+void build_erst(GArray *table_data, BIOSLinker *linker, Object *erst_dev,
+const char *oem_id, const char *oem_table_id);
+
+#define TYPE_ACPI_ERST "acpi-erst"
+
+#endif
-- 
1.8.3.1

[PATCH v10 08/10] ACPI ERST: qtest for ERST

2021-12-09 Thread Eric DeVolder

This change provides a qtest that locates and then does a simple
interrogation of the ERST feature within the guest.

Signed-off-by: Eric DeVolder 
---
 tests/qtest/erst-test.c | 167 
 tests/qtest/meson.build |   2 +
 2 files changed, 169 insertions(+)
 create mode 100644 tests/qtest/erst-test.c

diff --git a/tests/qtest/erst-test.c b/tests/qtest/erst-test.c
new file mode 100644
index 000..370c119
--- /dev/null
+++ b/tests/qtest/erst-test.c
@@ -0,0 +1,167 @@
+/*
+ * QTest testcase for acpi-erst
+ *
+ * Copyright (c) 2021 Oracle
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include 
+#include "libqos/libqos-pc.h"
+#include "libqos/libqtest.h"
+#include "qemu-common.h"
+
+static void save_fn(QPCIDevice *dev, int devfn, void *data)
+{
+QPCIDevice **pdev = (QPCIDevice **) data;
+
+*pdev = dev;
+}
+
+static QPCIDevice *get_device(QPCIBus *pcibus)
+{
+QPCIDevice *dev;
+
+dev = NULL;
+qpci_device_foreach(pcibus, 0x1b36, 0x0012, save_fn, );
+g_assert(dev != NULL);
+
+return dev;
+}
+
+typedef struct _ERSTState {
+QOSState *qs;
+QPCIBar reg_bar, mem_bar;
+uint64_t reg_barsize, mem_barsize;
+QPCIDevice *dev;
+} ERSTState;
+
+#define ACTION 0
+#define VALUE 8
+
+static const char *reg2str(unsigned reg)
+{
+switch (reg) {
+case 0:
+return "ACTION";
+case 8:
+return "VALUE";
+default:
+return NULL;
+}
+}
+
+static inline uint32_t in_reg32(ERSTState *s, unsigned reg)
+{
+const char *name = reg2str(reg);
+uint32_t res;
+
+res = qpci_io_readl(s->dev, s->reg_bar, reg);
+g_test_message("*%s -> %08x", name, res);
+
+return res;
+}
+
+static inline uint64_t in_reg64(ERSTState *s, unsigned reg)
+{
+const char *name = reg2str(reg);
+uint64_t res;
+
+res = qpci_io_readq(s->dev, s->reg_bar, reg);
+g_test_message("*%s -> %016llx", name, (unsigned long long)res);
+
+return res;
+}
+
+static inline void out_reg32(ERSTState *s, unsigned reg, uint32_t v)
+{
+const char *name = reg2str(reg);
+
+g_test_message("%08x -> *%s", v, name);
+qpci_io_writel(s->dev, s->reg_bar, reg, v);
+}
+
+static inline void out_reg64(ERSTState *s, unsigned reg, uint64_t v)
+{
+const char *name = reg2str(reg);
+
+g_test_message("%016llx -> *%s", (unsigned long long)v, name);
+qpci_io_writeq(s->dev, s->reg_bar, reg, v);
+}
+
+static void cleanup_vm(ERSTState *s)
+{
+g_free(s->dev);
+qtest_shutdown(s->qs);
+}
+
+static void setup_vm_cmd(ERSTState *s, const char *cmd)
+{
+const char *arch = qtest_get_arch();
+
+if (strcmp(arch, "i386") == 0 || strcmp(arch, "x86_64") == 0) {
+s->qs = qtest_pc_boot(cmd);
+} else {
+g_printerr("erst-test tests are only available on x86\n");
+exit(EXIT_FAILURE);
+}
+s->dev = get_device(s->qs->pcibus);
+
+s->reg_bar = qpci_iomap(s->dev, 0, >reg_barsize);
+g_assert_cmpuint(s->reg_barsize, ==, 16);
+
+s->mem_bar = qpci_iomap(s->dev, 1, >mem_barsize);
+g_assert_cmpuint(s->mem_barsize, ==, 0x2000);
+
+qpci_device_enable(s->dev);
+}
+
+static void test_acpi_erst_basic(void)
+{
+ERSTState state;
+uint64_t log_address_range;
+uint64_t log_address_length;
+uint32_t log_address_attr;
+
+setup_vm_cmd(,
+"-object memory-backend-file,"
+"mem-path=acpi-erst.XX,"
+"size=64K,"
+"share=on,"
+"id=nvram "
+"-device acpi-erst,"
+"memdev=nvram");
+
+out_reg32(, ACTION, 0xD);
+log_address_range = in_reg64(, VALUE);
+out_reg32(, ACTION, 0xE);
+log_address_length = in_reg64(, VALUE);
+out_reg32(, ACTION, 0xF);
+log_address_attr = in_reg32(, VALUE);
+
+/* Check log_address_range is not 0, ~0 or base */
+g_assert_cmpuint(log_address_range, !=,  0ULL);
+g_assert_cmpuint(log_address_range, !=, ~0ULL);
+g_assert_cmpuint(log_address_range, !=, state.reg_bar.addr);
+g_assert_cmpuint(log_address_range, ==, state.mem_bar.addr);
+
+/* Check log_address_length is bar1_size */
+g_assert_cmpuint(log_address_length, ==, state.mem_barsize);
+
+/* Check log_address_attr is 0 */
+g_assert_cmpuint(log_address_attr, ==, 0);
+
+cleanup_vm();
+}
+
+int main(int argc, char **argv)
+{
+int ret;
+
+g_test_init(, , NULL);
+qtest_add_func("/acpi-erst/basic", test_acpi_erst_basic);
+ret = g_test_run();
+return ret;
+}
diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build
index c9d8458..4b01c22 100644
--- a/tests/qtest/meson.build
+++ b/tests/qtest/meson.build
@@ -68,6 +68,7 @@ qtests_i386 = \
   (config_all_devices.has_key('CONFIG_RTL8139_PCI') ? ['rtl8139-test'] : []) + 
 \
   (config_all_devices.has_key('CONFIG_E1000E_PCI_EXPRESS') ? 
['fuzz-e1000e-test'] : []) +   \

[PATCH v10 03/10] ACPI ERST: PCI device_id for ERST

2021-12-09 Thread Eric DeVolder

This change reserves the PCI device_id for the new ACPI ERST
device.

Signed-off-by: Eric DeVolder 
Acked-by: Igor Mammedov 
Acked-by: Ani Sinha 
---
 include/hw/pci/pci.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index e7cdf2d..d3734b9 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -108,6 +108,7 @@ extern bool pci_available;
 #define PCI_DEVICE_ID_REDHAT_MDPY0x000f
 #define PCI_DEVICE_ID_REDHAT_NVME0x0010
 #define PCI_DEVICE_ID_REDHAT_PVPANIC 0x0011
+#define PCI_DEVICE_ID_REDHAT_ACPI_ERST   0x0012
 #define PCI_DEVICE_ID_REDHAT_QXL 0x0100
 
 #define FMT_PCIBUS  PRIx64
-- 
1.8.3.1

[PATCH v10 05/10] ACPI ERST: support for ACPI ERST feature

2021-12-09 Thread Eric DeVolder

This implements a PCI device for ACPI ERST. This implements the
non-NVRAM "mode" of operation for ERST as it is supported by
Linux and Windows.

Signed-off-by: Eric DeVolder 
---
 hw/acpi/Kconfig  |   6 +
 hw/acpi/erst.c   | 846 +++
 hw/acpi/meson.build  |   1 +
 hw/acpi/trace-events |  15 +
 4 files changed, 868 insertions(+)
 create mode 100644 hw/acpi/erst.c

diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
index 622b0b5..19caebd 100644
--- a/hw/acpi/Kconfig
+++ b/hw/acpi/Kconfig
@@ -10,6 +10,7 @@ config ACPI_X86
 select ACPI_HMAT
 select ACPI_PIIX4
 select ACPI_PCIHP
+select ACPI_ERST
 
 config ACPI_X86_ICH
 bool
@@ -60,3 +61,8 @@ config ACPI_HW_REDUCED
 select ACPI
 select ACPI_MEMORY_HOTPLUG
 select ACPI_NVDIMM
+
+config ACPI_ERST
+bool
+default y
+depends on ACPI && PCI
diff --git a/hw/acpi/erst.c b/hw/acpi/erst.c
new file mode 100644
index 000..81f5435
--- /dev/null
+++ b/hw/acpi/erst.c
@@ -0,0 +1,846 @@
+/*
+ * ACPI Error Record Serialization Table, ERST, Implementation
+ *
+ * ACPI ERST introduced in ACPI 4.0, June 16, 2009.
+ * ACPI Platform Error Interfaces : Error Serialization
+ *
+ * Copyright (c) 2021 Oracle and/or its affiliates.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include 
+#include 
+#include 
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/qdev-core.h"
+#include "exec/memory.h"
+#include "qom/object.h"
+#include "hw/pci/pci.h"
+#include "qom/object_interfaces.h"
+#include "qemu/error-report.h"
+#include "migration/vmstate.h"
+#include "hw/qdev-properties.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/acpi-defs.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/bios-linker-loader.h"
+#include "exec/address-spaces.h"
+#include "sysemu/hostmem.h"
+#include "hw/acpi/erst.h"
+#include "trace.h"
+
+/* ACPI 4.0: Table 17-16 Serialization Actions */
+#define ACTION_BEGIN_WRITE_OPERATION 0x0
+#define ACTION_BEGIN_READ_OPERATION  0x1
+#define ACTION_BEGIN_CLEAR_OPERATION 0x2
+#define ACTION_END_OPERATION 0x3
+#define ACTION_SET_RECORD_OFFSET 0x4
+#define ACTION_EXECUTE_OPERATION 0x5
+#define ACTION_CHECK_BUSY_STATUS 0x6
+#define ACTION_GET_COMMAND_STATUS0x7
+#define ACTION_GET_RECORD_IDENTIFIER 0x8
+#define ACTION_SET_RECORD_IDENTIFIER 0x9
+#define ACTION_GET_RECORD_COUNT  0xA
+#define ACTION_BEGIN_DUMMY_WRITE_OPERATION   0xB
+#define ACTION_RESERVED  0xC
+#define ACTION_GET_ERROR_LOG_ADDRESS_RANGE   0xD
+#define ACTION_GET_ERROR_LOG_ADDRESS_LENGTH  0xE
+#define ACTION_GET_ERROR_LOG_ADDRESS_RANGE_ATTRIBUTES 0xF
+#define ACTION_GET_EXECUTE_OPERATION_TIMINGS 0x10 /* ACPI 6.3 */
+
+/* ACPI 4.0: Table 17-17 Command Status Definitions */
+#define STATUS_SUCCESS0x00
+#define STATUS_NOT_ENOUGH_SPACE   0x01
+#define STATUS_HARDWARE_NOT_AVAILABLE 0x02
+#define STATUS_FAILED 0x03
+#define STATUS_RECORD_STORE_EMPTY 0x04
+#define STATUS_RECORD_NOT_FOUND   0x05
+
+
+/* UEFI 2.1: Appendix N Common Platform Error Record */
+#define UEFI_CPER_RECORD_MIN_SIZE 128U
+#define UEFI_CPER_RECORD_LENGTH_OFFSET 20U
+#define UEFI_CPER_RECORD_ID_OFFSET 96U
+#define IS_UEFI_CPER_RECORD(ptr) \
+(((ptr)[0] == 'C') && \
+ ((ptr)[1] == 'P') && \
+ ((ptr)[2] == 'E') && \
+ ((ptr)[3] == 'R'))
+
+/*
+ * NOTE that when accessing CPER fields within a record, memcpy()
+ * is utilized to avoid a possible misaligned access on the host.
+ */
+
+/*
+ * This implementation is an ACTION (cmd) and VALUE (data)
+ * interface consisting of just two 64-bit registers.
+ */
+#define ERST_REG_SIZE (16UL)
+#define ERST_ACTION_OFFSET (0UL) /* action (cmd) */
+#define ERST_VALUE_OFFSET  (8UL) /* argument/value (data) */
+
+/*
+ * ERST_RECORD_SIZE is the buffer size for exchanging ERST
+ * record contents. Thus, it defines the maximum record size.
+ * As this is mapped through a PCI BAR, it must be a power of
+ * two and larger than UEFI_CPER_RECORD_MIN_SIZE.
+ * The backing storage is divided into fixed size "slots",
+ * each ERST_RECORD_SIZE in length, and each "slot"
+ * storing a single record. No attempt at optimizing storage
+ * through compression, compaction, etc is attempted.
+ * NOTE that slot 0 is reserved for the backing storage header.
+ * Depending upon the size of the backing storage, additional
+ * slots will be part of the slot 0 header in order to account
+ * for a record_id for each available remaining slot.
+ */
+/* 8KiB records, not too small, not too big */
+#define ERST_RECORD_SIZE (8192UL)
+
+#define ACPI_ERST_MEMDEV_PROP "memdev"
+#define ACPI_ERST_RECORD_SIZE_PROP "record_size"
+
+/*
+ * From the ACPI ERST spec sections:
+ * A record id of all 0s is used to indicate 'unspecified' record id.
+ * A record id of all 1s is used to indicate empty or end.
+ */
+#define

[PATCH v10 01/10] ACPI ERST: bios-tables-test.c steps 1 and 2

2021-12-09 Thread Eric DeVolder

Following the guidelines in tests/qtest/bios-tables-test.c, this
change adds empty placeholder files per step 1 for the new ERST
table, and lists the resulting changed files in
bios-tables-test-allowed-diff.h per step 2 so that the test ignores them.

Signed-off-by: Eric DeVolder 
Acked-by: Igor Mammedov 
---
 tests/data/acpi/microvm/ERST.pcie   | 0
 tests/data/acpi/pc/DSDT.acpierst| 0
 tests/data/acpi/pc/ERST.acpierst| 0
 tests/data/acpi/q35/DSDT.acpierst   | 0
 tests/data/acpi/q35/ERST.acpierst   | 0
 tests/qtest/bios-tables-test-allowed-diff.h | 5 +
 6 files changed, 5 insertions(+)
 create mode 100644 tests/data/acpi/microvm/ERST.pcie
 create mode 100644 tests/data/acpi/pc/DSDT.acpierst
 create mode 100644 tests/data/acpi/pc/ERST.acpierst
 create mode 100644 tests/data/acpi/q35/DSDT.acpierst
 create mode 100644 tests/data/acpi/q35/ERST.acpierst

diff --git a/tests/data/acpi/microvm/ERST.pcie 
b/tests/data/acpi/microvm/ERST.pcie
new file mode 100644
index 000..e69de29
diff --git a/tests/data/acpi/pc/DSDT.acpierst b/tests/data/acpi/pc/DSDT.acpierst
new file mode 100644
index 000..e69de29
diff --git a/tests/data/acpi/pc/ERST.acpierst b/tests/data/acpi/pc/ERST.acpierst
new file mode 100644
index 000..e69de29
diff --git a/tests/data/acpi/q35/DSDT.acpierst 
b/tests/data/acpi/q35/DSDT.acpierst
new file mode 100644
index 000..e69de29
diff --git a/tests/data/acpi/q35/ERST.acpierst 
b/tests/data/acpi/q35/ERST.acpierst
new file mode 100644
index 000..e69de29
diff --git a/tests/qtest/bios-tables-test-allowed-diff.h 
b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523..603db07 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,6 @@
 /* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/pc/DSDT.acpierst",
+"tests/data/acpi/pc/ERST.acpierst",
+"tests/data/acpi/q35/DSDT.acpierst",
+"tests/data/acpi/q35/ERST.acpierst",
+"tests/data/acpi/microvm/ERST.pcie",
-- 
1.8.3.1

[PATCH v10 00/10] acpi: Error Record Serialization Table, ERST, support for QEMU

2021-12-09 Thread Eric DeVolder

This patchset introduces support for the ACPI Error Record
Serialization Table, ERST.

For background and implementation information, please see
docs/specs/acpi_erst.rst, which is patch 2/10.

Suggested-by: Konrad Wilk 
Signed-off-by: Eric DeVolder 

---
v10: 9dec2021
 - Addressed additional feedback from Ani Sinha

v9: 2dec2021
 - Addressed feedback from Ani Sinha

v8: 15oct2021
 - Added Kconfig option for ERST, per Ani Sinha
 - Fixed patch ordering, per Ani

v7: 7oct2021
 - style improvements, per Igor
 - use of endian accessors for storage header, per Igor
 - a number of optimizations and improvements, per Igor
 - updated spec for header, per Igor
 - updated spec for rst format, per Michael Tsirkin
 - updated spec for new record_size parameter
   Due to changes in the spec, I am not carrying the
   Acked-by from Ani Sinha.
 - changes for and testing of migration to systems with
   differing ERST_RECORD_SIZE

v6: 5aug2021
 - Fixed compile warning/error, per Michael Tsirkin
 - Fixed mingw32 build error, per Michael
 - Converted exchange buffer to MemoryBackend, per Igor
 - Migrated test to PCI, per Igor
 - Significantly reduced amount of copying, per Igor
 - Corrections/enhancements to acpi_erst.txt, per Igor
 - Many misc/other small items, per Igor

v5: 30jun2021
 - Create docs/specs/acpi_erst.txt, per Igor
 - Separate PCI BARs for registers and memory, per Igor
 - Convert debugging to use trace infrastructure, per Igor
 - Various other fixups, per Igor

v4: 11jun2021
 - Converted to a PCI device, per Igor.
 - Updated qtest.
 - Rearranged patches, per Igor.

v3: 28may2021
 - Converted to using a TYPE_MEMORY_BACKEND_FILE object rather than
   internal array with explicit file operations, per Igor.
 - Changed the way the qdev and base address are handled, allowing
   ERST to be disabled at run-time. Also aligns better with other
   existing code.

v2: 8feb2021
 - Added qtest/smoke test per Paolo Bonzini
 - Split patch into smaller chunks, per Igor Mammedov
 - Did away with use of ACPI packed structures, per Igor Mammedov

v1: 26oct2020
 - initial post

---

Eric DeVolder (10):
  ACPI ERST: bios-tables-test.c steps 1 and 2
  ACPI ERST: specification for ERST support
  ACPI ERST: PCI device_id for ERST
  ACPI ERST: header file for ERST
  ACPI ERST: support for ACPI ERST feature
  ACPI ERST: build the ACPI ERST table
  ACPI ERST: create ACPI ERST table for pc/x86 machines
  ACPI ERST: qtest for ERST
  ACPI ERST: bios-tables-test testcase
  ACPI ERST: step 6 of bios-tables-test.c

 docs/specs/acpi_erst.rst  |  200 +++
 hw/acpi/Kconfig   |6 +
 hw/acpi/erst.c| 1087 +
 hw/acpi/meson.build   |1 +
 hw/acpi/trace-events  |   15 +
 hw/i386/acpi-build.c  |   15 +
 hw/i386/acpi-microvm.c|   15 +
 include/hw/acpi/erst.h|   24 +
 include/hw/pci/pci.h  |1 +
 tests/data/acpi/microvm/ERST.pcie |  Bin 0 -> 912 bytes
 tests/data/acpi/pc/DSDT.acpierst  |  Bin 0 -> 5969 bytes
 tests/data/acpi/pc/ERST.acpierst  |  Bin 0 -> 912 bytes
 tests/data/acpi/q35/DSDT.acpierst |  Bin 0 -> 8306 bytes
 tests/data/acpi/q35/ERST.acpierst |  Bin 0 -> 912 bytes
 tests/qtest/bios-tables-test.c|   56 ++
 tests/qtest/erst-test.c   |  167 ++
 tests/qtest/meson.build   |2 +
 17 files changed, 1589 insertions(+)
 create mode 100644 docs/specs/acpi_erst.rst
 create mode 100644 hw/acpi/erst.c
 create mode 100644 include/hw/acpi/erst.h
 create mode 100644 tests/data/acpi/microvm/ERST.pcie
 create mode 100644 tests/data/acpi/pc/DSDT.acpierst
 create mode 100644 tests/data/acpi/pc/ERST.acpierst
 create mode 100644 tests/data/acpi/q35/DSDT.acpierst
 create mode 100644 tests/data/acpi/q35/ERST.acpierst
 create mode 100644 tests/qtest/erst-test.c

-- 
1.8.3.1

Re: [PATCH v9 05/10] ACPI ERST: support for ACPI ERST feature

2021-12-09 Thread Eric DeVolder




On 12/9/21 00:31, Ani Sinha wrote:

On Wed, Dec 8, 2021 at 10:08 PM Eric DeVolder  wrote:




On 12/6/21 02:14, Ani Sinha wrote:

On Fri, Dec 3, 2021 at 12:39 AM Eric DeVolder  wrote:


This implements a PCI device for ACPI ERST. This implements the
non-NVRAM "mode" of operation for ERST as it is supported by
Linux and Windows.


OK sent some more comments. It will take another pass for me to fully
review this.



Hi Ani, thank you for reviewing. I have incorporated your feedback thus far.
I have v10 ready to go but not sure if your review of v9 is completed yet?


I completed scanning this patch. Don't hold your breath. I review
things when I find gaps in other work and can't promise timely
reviews.
You can send a v10 once you have addressed my last set of comments.



Thanks Ani! I understand on the reviews. I have incorporated all feedback and 
posted v10.

Thank you!
eric

Re: [PATCH v9 10/10] ACPI ERST: step 6 of bios-tables-test.c

2021-12-09 Thread Eric DeVolder


Ani, thanks!
eric

On 12/9/21 03:29, Ani Sinha wrote:

On Fri, Dec 3, 2021 at 12:39 AM Eric DeVolder  wrote:


Following the guidelines in tests/qtest/bios-tables-test.c, this
is step 6.

Below is the disassembly of tests/data/acpi/pc/ERST.acpierst.

[...]


Note that the contents of tests/data/q35/ERST.acpierst and
tests/data/microvm/ERST.pcie are the same except for differences
due to assigned base address.

Files tests/data/pc/DSDT.acpierst and tests/data/acpi/q35/DSDT.acpierst
are new files (and are included as a result of 'make check' process).
Rather than provide the entire content, I am providing the differences
between pc/DSDT and pc/DSDT.acpierst, and the difference between
q35/DSDT and q35/DSDT.acpierst, with an explanation to follow.

diff pc/DSDT pc/DSDT.acpierst:
  @@ -5,13 +5,13 @@
*
* Disassembling to symbolic ASL+ operators
*
  - * Disassembly of tests/data/acpi/pc/DSDT, Thu Dec  2 10:10:13 2021
  + * Disassembly of tests/data/acpi/pc/DSDT.acpierst, Thu Dec  2 12:59:36 2021
*
* Original Table Header:
* Signature"DSDT"
  - * Length   0x1772 (6002)
  + * Length   0x1751 (5969)
* Revision 0x01  32-bit table (V1), no 64-bit math support
  - * Checksum 0x9E
  + * Checksum 0x95
* OEM ID   "BOCHS "
* OEM Table ID "BXPC"
* OEM Revision 0x0001 (1)
  @@ -964,16 +964,11 @@ DefinitionBlock ("", "DSDT", 1, "BOCHS "

   Device (S18)
   {
  -Name (_SUN, 0x03)  // _SUN: Slot User Number
   Name (_ADR, 0x0003)  // _ADR: Address
  -Method (_EJ0, 1, NotSerialized)  // _EJx: Eject Device
  -{
  -PCEJ (BSEL, _SUN)
  -}
  -
  +Name (ASUN, 0x03)
   Method (_DSM, 4, Serialized)  // _DSM: Device-Specific Method
   {
  -Return (PDSM (Arg0, Arg1, Arg2, Arg3, BSEL, _SUN))
  +Return (PDSM (Arg0, Arg1, Arg2, Arg3, BSEL, ASUN))
   }
   }

  @@ -1399,11 +1394,6 @@ DefinitionBlock ("", "DSDT", 1, "BOCHS "

   Method (DVNT, 2, NotSerialized)
   {
  -If ((Arg0 & 0x08))
  -{
  -Notify (S18, Arg1)
  -}
  -
   If ((Arg0 & 0x10))
   {
   Notify (S20, Arg1)

diff q35/DSDT and q35/DSDT.acpierst:
  @@ -5,13 +5,13 @@
*
* Disassembling to symbolic ASL+ operators
*
  - * Disassembly of tests/data/acpi/q35/DSDT, Thu Dec  2 10:10:13 2021
  + * Disassembly of tests/data/acpi/q35/DSDT.acpierst, Thu Dec  2 12:59:36 2021
*
* Original Table Header:
* Signature"DSDT"
  - * Length   0x2061 (8289)
  + * Length   0x2072 (8306)
* Revision 0x01  32-bit table (V1), no 64-bit math support
  - * Checksum 0xFA
  + * Checksum 0x9A
* OEM ID   "BOCHS "
* OEM Table ID "BXPC"
* OEM Revision 0x0001 (1)
  @@ -3278,6 +3278,11 @@ DefinitionBlock ("", "DSDT", 1, "BOCHS "
   }
   }

  +Device (S10)
  +{
  +Name (_ADR, 0x0002)  // _ADR: Address
  +}
  +
   Method (PCNT, 0, NotSerialized)
   {
   }

For both pc and q35, there is but a small difference between this
DSDT.acpierst and the corresponding DSDT. In both cases, the changes
occur under the hierarchy:

 Scope (\_SB)
 {
 Scope (PCI0)
 {

which leads me to believe that the change to the DSDT was needed
due to the introduction of the ERST PCI device.


I have convinced myself of the changes we see in the DSDT tables.
On i440fx side, we are adding a non-hotpluggable pci device on slot 3.
So the changes we see are basically replacing an empty hotpluggable
slot on the pci root port with a non-hotpluggable device.
On q35, bsel on pcie root bus is not set (it's not a hotpluggable bus),
so the change basically adds the address enumeration for the device.


Excellent! I've included the insight into commit message.



Signed-off-by: Eric DeVolder 


Acked-by: Ani Sinha 


---
  tests/data/acpi/microvm/ERST.pcie   | Bin 0 -> 912 bytes
  tests/data/acpi/pc/DSDT.acpierst| Bin 0 -> 5969 bytes
  tests/data/acpi/pc/ERST.acpierst| Bin 0 -> 912 bytes
  tests/data/acpi/q35/DSDT.acpierst   | Bin 0 -> 8306 bytes
  tests/data/acpi/q35/ERST.acpierst   | Bin 0 -> 912 bytes
  tests/qtest/bios-tables-test-allowed-diff.h |   5 -
  6 files changed, 5 deletions(-)

diff --git a/tests/data/acpi/microvm/ERST.pcie 
b/tests/data/acpi/microvm/ERST.pcie
index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d9a2b3211ab5893a50751ad52be3782579e367f2
 100644
GIT

Re: [RFC PATCH v2 0/5] virtio: early detect 'modern' virtio

2021-12-09 Thread Michael S. Tsirkin

On Thu, Dec 09, 2021 at 02:29:25PM +0100, Halil Pasic wrote:
> On Wed, 8 Dec 2021 13:56:19 -0500
> "Michael S. Tsirkin"  wrote:
> 
> > On Fri, Nov 12, 2021 at 03:57:44PM +0100, Halil Pasic wrote:
> > > This is an early RFC for a transport specific early detection of
> > > modern virtio, which is most relevant for transitional devices on big
> > > endian platforms, when drivers access the config space before
> > > FEATURES_OK is set.
> > > 
> > > The most important part that is missing here is fixing all the problems
> > > that arise in the situation described in the previous paragraph, when
> > > the config is managed by a vhost device (and thus outside QEMU). This
> > > series tackles this problem only for virtio_net+vhost as an example. If
> > > this approach is deemed good, we need to do something very similar for
> > > every single affected device.
> > > 
> > > This series was only lightly tested. The vhost stuff is entirely
> > > untested, unfortunately I don't have a working setup where this
> > > handling would be needed (because the config space is handled in the
> > > device). DPDK is not supported on s390x so at the moment I can't test
> > > DPDK based setups.   
> > 
> > So this looks sane to me. Cornelia requested some name tweaks and we
> > need to add vhost-user things and more devices, but otherwise we are
> > good.
> 
> Thanks for your feedback! There were several points where I could
> not reach agreement with Cornelia. From your response I reckon that:
> 
> 1) I should rename virtio_force_modern() to virtio_indicate_modern()
> (per maintainer request).
> 2) Keep the call to virtio_set_features()?
> 
> Is that right?
> 
> Regards,
> Halil

that's my take, yes.

-- 
MST

Re: [PATCH v9 05/10] ACPI ERST: support for ACPI ERST feature

2021-12-09 Thread Eric DeVolder


Ani, inline responses below. eric

On 12/9/21 00:29, Ani Sinha wrote:

On Fri, Dec 3, 2021 at 12:39 AM Eric DeVolder  wrote:


This implements a PCI device for ACPI ERST. This implements the
non-NVRAM "mode" of operation for ERST as it is supported by
Linux and Windows.


Few more comments on this patch ...



Signed-off-by: Eric DeVolder 
---
  hw/acpi/Kconfig  |   6 +
  hw/acpi/erst.c   | 836 +++
  hw/acpi/meson.build  |   1 +
  hw/acpi/trace-events |  15 +
  4 files changed, 858 insertions(+)
  create mode 100644 hw/acpi/erst.c

diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig
index 622b0b5..19caebd 100644
--- a/hw/acpi/Kconfig
+++ b/hw/acpi/Kconfig
@@ -10,6 +10,7 @@ config ACPI_X86
  select ACPI_HMAT
  select ACPI_PIIX4
  select ACPI_PCIHP
+select ACPI_ERST

  config ACPI_X86_ICH
  bool
@@ -60,3 +61,8 @@ config ACPI_HW_REDUCED
  select ACPI
  select ACPI_MEMORY_HOTPLUG
  select ACPI_NVDIMM
+
+config ACPI_ERST
+bool
+default y
+depends on ACPI && PCI
diff --git a/hw/acpi/erst.c b/hw/acpi/erst.c
new file mode 100644
index 000..4304f55
--- /dev/null
+++ b/hw/acpi/erst.c
@@ -0,0 +1,836 @@
+/*
+ * ACPI Error Record Serialization Table, ERST, Implementation
+ *
+ * ACPI ERST introduced in ACPI 4.0, June 16, 2009.
+ * ACPI Platform Error Interfaces : Error Serialization
+ *
+ * Copyright (c) 2021 Oracle and/or its affiliates.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include 
+#include 
+#include 
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/qdev-core.h"
+#include "exec/memory.h"
+#include "qom/object.h"
+#include "hw/pci/pci.h"
+#include "qom/object_interfaces.h"
+#include "qemu/error-report.h"
+#include "migration/vmstate.h"
+#include "hw/qdev-properties.h"
+#include "hw/acpi/acpi.h"
+#include "hw/acpi/acpi-defs.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/acpi/bios-linker-loader.h"
+#include "exec/address-spaces.h"
+#include "sysemu/hostmem.h"
+#include "hw/acpi/erst.h"
+#include "trace.h"
+
+/* ACPI 4.0: Table 17-16 Serialization Actions */
+#define ACTION_BEGIN_WRITE_OPERATION 0x0
+#define ACTION_BEGIN_READ_OPERATION  0x1
+#define ACTION_BEGIN_CLEAR_OPERATION 0x2
+#define ACTION_END_OPERATION 0x3
+#define ACTION_SET_RECORD_OFFSET 0x4
+#define ACTION_EXECUTE_OPERATION 0x5
+#define ACTION_CHECK_BUSY_STATUS 0x6
+#define ACTION_GET_COMMAND_STATUS0x7
+#define ACTION_GET_RECORD_IDENTIFIER 0x8
+#define ACTION_SET_RECORD_IDENTIFIER 0x9
+#define ACTION_GET_RECORD_COUNT  0xA
+#define ACTION_BEGIN_DUMMY_WRITE_OPERATION   0xB
+#define ACTION_RESERVED  0xC
+#define ACTION_GET_ERROR_LOG_ADDRESS_RANGE   0xD
+#define ACTION_GET_ERROR_LOG_ADDRESS_LENGTH  0xE
+#define ACTION_GET_ERROR_LOG_ADDRESS_RANGE_ATTRIBUTES 0xF
+#define ACTION_GET_EXECUTE_OPERATION_TIMINGS 0x10
+
+/* ACPI 4.0: Table 17-17 Command Status Definitions */
+#define STATUS_SUCCESS0x00
+#define STATUS_NOT_ENOUGH_SPACE   0x01
+#define STATUS_HARDWARE_NOT_AVAILABLE 0x02
+#define STATUS_FAILED 0x03
+#define STATUS_RECORD_STORE_EMPTY 0x04
+#define STATUS_RECORD_NOT_FOUND   0x05
+
+
+/* UEFI 2.1: Appendix N Common Platform Error Record */
+#define UEFI_CPER_RECORD_MIN_SIZE 128U
+#define UEFI_CPER_RECORD_LENGTH_OFFSET 20U
+#define UEFI_CPER_RECORD_ID_OFFSET 96U
+#define IS_UEFI_CPER_RECORD(ptr) \
+(((ptr)[0] == 'C') && \
+ ((ptr)[1] == 'P') && \
+ ((ptr)[2] == 'E') && \
+ ((ptr)[3] == 'R'))
+
+/*
+ * NOTE that when accessing CPER fields within a record, memcpy()
+ * is utilized to avoid a possible misaligned access on the host.
+ */
+
+/*
+ * This implementation is an ACTION (cmd) and VALUE (data)
+ * interface consisting of just two 64-bit registers.
+ */
+#define ERST_REG_SIZE (16UL)
+#define ERST_ACTION_OFFSET (0UL) /* action (cmd) */
+#define ERST_VALUE_OFFSET  (8UL) /* argument/value (data) */
+
+/*
+ * ERST_RECORD_SIZE is the buffer size for exchanging ERST
+ * record contents. Thus, it defines the maximum record size.
+ * As this is mapped through a PCI BAR, it must be a power of
+ * two and larger than UEFI_CPER_RECORD_MIN_SIZE.
+ * The backing storage is divided into fixed size "slots",
+ * each ERST_RECORD_SIZE in length, and each "slot"
+ * storing a single record. No attempt at optimizing storage
+ * through compression, compaction, etc is attempted.
+ * NOTE that slot 0 is reserved for the backing storage header.
+ * Depending upon the size of the backing storage, additional
+ * slots will be part of the slot 0 header in order to account
+ * for a record_id for each available remaining slot.
+ */
+/* 8KiB records, not too small, not too big */
+#define ERST_RECORD_SIZE (8192UL)
+
+#define ACPI_ERST_MEMDEV_PROP "memdev"
+#define ACPI_ERST_RECORD_SIZE_PROP "record_size"
+
+/*
+ * From the ACPI ERST spec sections:
+

Re: [PATCH] scripts: Explain the difference between linux-headers and standard-headers

2021-12-09 Thread Peter Maydell

On Thu, 9 Dec 2021 at 17:34, Thomas Huth  wrote:
>
> If you don't know it, it's hard to figure out the difference between
> the linux-headers folder and the include/standard-headers folder.
> So let's add a short explanation to clarify the difference.
>
> Signed-off-by: Thomas Huth 
> ---
>  scripts/update-linux-headers.sh | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
> index fea4d6eb65..96c7daef94 100755
> --- a/scripts/update-linux-headers.sh
> +++ b/scripts/update-linux-headers.sh
> @@ -9,6 +9,13 @@
>  #
>  # This work is licensed under the terms of the GNU GPL version 2.
>  # See the COPYING file in the top-level directory.
> +#
> +# The script will copy the headers into two target folder:

"folders"

> +#
> +# - linux-headers/ for file that are required for compiling on a Linux host

"files"

> +#
> +# - include/standard-headers/ for files that are used for guest device 
> emulation
> +#


We could expand on this a little if you like, eg:

- linux-headers/ for files that are required for compiling on a Linux host.
  Generally we have these so we can use kernel structs and defines that
  are more recent than the headers that might be in /usr/include/linux
  on the host system. Usually this script can do simple file copies
  for these headers.

- include/standard-headers/ for files that are used for guest device emulation
  and are required on all hosts. For instance, we get our definitions of the
  virtio structures from the Linux kernel headers, but we need those
  definitions regardless of which host OS we are building on. This script
  has to be careful to sanitize the headers to remove any use of Linux-specifics
  such as types like "__u64". This work is done in the cp_portable function.

-- PMM

[PATCH v2] Revert "target/ppc: Move SPR_DSISR setting to powerpc_excp"

2021-12-09 Thread Fabiano Rosas

This reverts commit 336e91f85332dda0ede4c1d15b87a19a0fb898a2.

It breaks the --disable-tcg build:

 ../target/ppc/excp_helper.c:463:29: error: implicit declaration of
 function ‘cpu_ldl_code’ [-Werror=implicit-function-declaration]

We should not have TCG code in powerpc_excp because some kvm-only
routines use it indirectly to dispatch interrupts. See
kvm_handle_debug, spapr_mce_req_event and
spapr_do_system_reset_on_cpu.

We can re-introduce the change once we have split the interrupt
injection code between KVM and TCG.

Signed-off-by: Fabiano Rosas 
---
 target/ppc/excp_helper.c | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 17607adbe4..1c8b373078 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -454,15 +454,13 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp)
 break;
 }
 case POWERPC_EXCP_ALIGN: /* Alignment exception  */
+/* Get rS/rD and rA from faulting opcode */
 /*
- * Get rS/rD and rA from faulting opcode.
- * Note: We will only invoke ALIGN for atomic operations,
- * so all instructions are X-form.
+ * Note: the opcode fields will not be set properly for a
+ * direct store load/store, but nobody cares as nobody
+ * actually uses direct store segments.
  */
-{
-uint32_t insn = cpu_ldl_code(env, env->nip);
-env->spr[SPR_DSISR] |= (insn & 0x03FF) >> 16;
-}
+env->spr[SPR_DSISR] |= (env->error_code & 0x03FF) >> 16;
 break;
 case POWERPC_EXCP_PROGRAM:   /* Program exception*/
 switch (env->error_code & ~0xF) {
@@ -1461,6 +1459,11 @@ void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr 
vaddr,
  int mmu_idx, uintptr_t retaddr)
 {
 CPUPPCState *env = cs->env_ptr;
+uint32_t insn;
+
+/* Restore state and reload the insn we executed, for filling in DSISR.  */
+cpu_restore_state(cs, retaddr, true);
+insn = cpu_ldl_code(env, env->nip);
 
 switch (env->mmu_model) {
 case POWERPC_MMU_SOFT_4xx:
@@ -1477,8 +1480,8 @@ void ppc_cpu_do_unaligned_access(CPUState *cs, vaddr 
vaddr,
 }
 
 cs->exception_index = POWERPC_EXCP_ALIGN;
-env->error_code = 0;
-cpu_loop_exit_restore(cs, retaddr);
+env->error_code = insn & 0x03FF;
+cpu_loop_exit(cs);
 }
 #endif /* CONFIG_TCG */
 #endif /* !CONFIG_USER_ONLY */
-- 
2.33.1

[PATCH] scripts: Explain the difference between linux-headers and standard-headers

2021-12-09 Thread Thomas Huth

If you don't know it, it's hard to figure out the difference between
the linux-headers folder and the include/standard-headers folder.
So let's add a short explanation to clarify the difference.

Signed-off-by: Thomas Huth 
---
 scripts/update-linux-headers.sh | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index fea4d6eb65..96c7daef94 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -9,6 +9,13 @@
 #
 # This work is licensed under the terms of the GNU GPL version 2.
 # See the COPYING file in the top-level directory.
+#
+# The script will copy the headers into two target folder:
+#
+# - linux-headers/ for file that are required for compiling on a Linux host
+#
+# - include/standard-headers/ for files that are used for guest device 
emulation
+#
 
 tmpdir=$(mktemp -d)
 linux="$1"
-- 
2.27.0

Re: [PATCH] target/ppc: powerpc_excp: Guard ALIGNMENT interrupt with CONFIG_TCG

2021-12-09 Thread Fabiano Rosas

Cédric Le Goater  writes:

> Richard,
>
> On 12/9/21 16:05, Fabiano Rosas wrote:
>> Cédric Le Goater  writes:
>> 
>>> On 12/9/21 00:06, Fabiano Rosas wrote:
 We cannot have TCG code in powerpc_excp because the function is called
 from kvm-only code via ppc_cpu_do_interrupt:

../target/ppc/excp_helper.c:463:29: error: implicit declaration of
function ‘cpu_ldl_code’ [-Werror=implicit-function-declaration]

 Fortunately, the Alignment interrupt is not among the ones dispatched
 from kvm-only code, so we can keep it out of the disable-tcg build for
 now.

 Fixes: 336e91f853 ("target/ppc: Move SPR_DSISR setting to powerpc_excp")
 Signed-off-by: Fabiano Rosas 

 ---

 Perhaps we could make powerpc_excp TCG only and have a separate
 function that only knows the two interrupts that we use with KVM
 (Program, Machine check). But for now this fix will do, I think.
 ---
target/ppc/excp_helper.c | 2 ++
1 file changed, 2 insertions(+)

 diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
 index 17607adbe4..dcf22440cc 100644
 --- a/target/ppc/excp_helper.c
 +++ b/target/ppc/excp_helper.c
 @@ -453,6 +453,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
 excp_model, int excp)
}
break;
}
 +#ifdef CONFIG_TCG
case POWERPC_EXCP_ALIGN: /* Alignment exception 
  */
/*
 * Get rS/rD and rA from faulting opcode.
 @@ -464,6 +465,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
 excp_model, int excp)
env->spr[SPR_DSISR] |= (insn & 0x03FF) >> 16;
}
break;
 +#endif
case POWERPC_EXCP_PROGRAM:   /* Program exception   
  */
switch (env->error_code & ~0xF) {
case POWERPC_EXCP_FP:

>>>
>>> Shouldn't we move that code under ppc_cpu_do_unaligned_access ?
>> 
>> Well, it came from there initially. We could revert 336e91f853 and that
>> would fix the issue as well.
>
> What would you prefer ?

Well none of this interferes with the work I'm doing, so it really makes
no difference. I guess reverting the patch is cleaner than having an
ifdef loose in the middle of the code. I'll send a v2 with the revert.

>
> Thanks,
>
> C.

Re: [PULL 0/1] Block patches

2021-12-09 Thread Richard Henderson


On 12/9/21 8:34 AM, Stefan Hajnoczi wrote:

I'm not running the release cycle this time around, but: it's
already rc4, pull requests by this point need a clear justification
in the cover letter for why they're really release critical.


It's late, this isn't a show-stopper (block/nvme.c is not widely used).
Let's leave it for the next release cycle and -stable.


Good.

Unless you want to re-issue with Cc: qemu-stable included in the patch, this can be the 
first PR of the next devel cycle, since it's already here.  :-)



r~

Re: [RFC] block-backend: prevent dangling BDS pointer in blk_drain()

2021-12-09 Thread Vladimir Sementsov-Ogievskiy


09.12.2021 19:32, Stefan Hajnoczi wrote:

On Thu, Dec 09, 2021 at 04:45:13PM +0100, Hanna Reitz wrote:

On 09.12.21 15:23, Stefan Hajnoczi wrote:

The BlockBackend root child can change during bdrv_drained_begin() when
aio_poll() is invoked. In fact the BlockDriverState can reach refcnt 0
and blk_drain() is left with a dangling BDS pointer.

One example is scsi_device_purge_requests(), which calls blk_drain() to
wait for in-flight requests to cancel. If the backup blockjob is active,
then the BlockBackend root child is a temporary filter BDS owned by the
blockjob. The blockjob can complete during bdrv_drained_begin() and the
last reference to the BDS is released when the temporary filter node is
removed. This results in a use-after-free when blk_drain() calls
bdrv_drained_end(bs) on the dangling pointer.

The general problem is that a function and its callers must not assume
that bs is still valid across aio_poll(). Explicitly hold a reference to
bs in blk_drain() to avoid the dangling pointer.

Signed-off-by: Stefan Hajnoczi 
---
I found that BDS nodes are sometimes deleted with bs->quiesce_counter >
0 (at least when running "make check"), so it is currently not possible
to put the bdrv_ref/unref() calls in bdrv_do_drained_begin() and
bdrv_do_drained_end() because they will be unbalanced. That would have
been a more general solution than only fixing blk_drain().


Deleting nodes that have a `quiesce_counter > 0` doesn’t seem wrong to me –
deleting only depends on strong references, and so I’d expect that anything
that increases the quiesce_counter also has a strong reference to the node
if the former wants the latter to stay around.

I suppose we could make it so that both the quiesce_counter and the refcnt
need to be 0 before a BDS is deleted (and then deletion can happen both from
bdrv_unref() and drained_end), but I don’t know whether that’s really
necessary.  I’d rather leave it to the caller to ensure they keep a strong
reference throughout the drain.

The question is, how often do we have a situation like this, where we take a
weak reference for draining, because we assume there’s a strong reference
backing us up (namely the one through blk->root), but that strong reference
then can go away due to draining...


Any suggestions for a better fix?


The fix makes sense to me.


Okay. My concern was that this is a whole class of bugs and my patch
only fixes blk_drain(). I have audited the code some more in the
meantime.

bdrv_insert_node() may be unsafe in the case where bs is a temporary
filter node that is unref'd during bdrv_drained_begin():

   BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
  int flags, Error **errp)
   {
   ERRP_GUARD();
   int ret;
   BlockDriverState *new_node_bs = NULL;
   const char *drvname, *node_name;
   BlockDriver *drv;
   
   drvname = qdict_get_try_str(options, "driver");

   if (!drvname) {
   error_setg(errp, "driver is not specified");
   goto fail;
   }
   
   drv = bdrv_find_format(drvname);

   if (!drv) {
   error_setg(errp, "Unknown driver: '%s'", drvname);
   goto fail;
   }
   
   node_name = qdict_get_try_str(options, "node-name");
   
   new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,

   errp);
   options = NULL; /* bdrv_new_open_driver() eats options */
   if (!new_node_bs) {
   error_prepend(errp, "Could not create node: ");
   goto fail;
   }
   
   bdrv_drained_begin(bs);

   ^^^ <--- bs can be dangling pointer
   ret = bdrv_replace_node(bs, new_node_bs, errp);
   bdrv_drained_end(bs);

The fix isn't as simple as blk_drain() because we don't want to insert
the new node before the now-deleted node. I think the correct way to
insert a node is against BdrvChild, not BlockDriverState. That way we
can be sure the new node will be inserted into a graph that is reachable
via BdrvChild (e.g. BlockBackend) instead of a detached BDS.

bdrv_set_aio_context_ignore() and blk_io_limits_disable() need to ref bs
like blk_drain() in this patch.

There are some other bdrv_drained_begin() calls that I'm assuming are
safe because they are during creation/deletion so I think we have strong
references there or nothing else knows about our BDS yet.

Do you agree with extending this patch series to cover the functions I
mentioned above?


I'm not sure.

First, we can't support "any graph change" during some graph changing operation.

Actually, when we do some specific graph change operation, we should forbid any 
other graph change operations, they should wait. Possibly, by adding strong 
references everywhere, we can avoid crashes. But what about the logic? If we do 
several graph changing operations simultaneously, the result is absolutely 
unpredictable, it's not what user wants.

The problem is

Re: [PULL 0/1] Block patches

2021-12-09 Thread Stefan Hajnoczi

On Thu, Dec 09, 2021 at 03:46:29PM +, Peter Maydell wrote:
> On Thu, 9 Dec 2021 at 15:21, Stefan Hajnoczi  wrote:
> >
> > The following changes since commit a3607def89f9cd68c1b994e1030527df33aa91d0:
> >
> >   Update version for v6.2.0-rc4 release (2021-12-07 17:51:38 -0800)
> >
> > are available in the Git repository at:
> >
> >   https://gitlab.com/stefanha/qemu.git tags/block-pull-request
> >
> > for you to fetch changes up to cf4fbc3030c974fff726756a7ceef8386cdf500b:
> >
> >   block/nvme: fix infinite loop in nvme_free_req_queue_cb() (2021-12-09 
> > 09:19:49 +)
> >
> > 
> > Pull request
> >
> > An infinite loop fix for the userspace NVMe driver.
> >
> > 
> 
> I'm not running the release cycle this time around, but: it's
> already rc4, pull requests by this point need a clear justification
> in the cover letter for why they're really release critical.

It's late, this isn't a show-stopper (block/nvme.c is not widely used).
Let's leave it for the next release cycle and -stable.

Stefan


signature.asc
Description: PGP signature

Re: [PATCH] mirror: Avoid assertion failed in mirror_run

2021-12-09 Thread Vladimir Sementsov-Ogievskiy


08.12.2021 12:52, wang.y...@zte.com.cn wrote:

[CC-ing qemu-block, Vladimir, Kevin, and John – when sending patches,
please look into the MAINTAINERS file or use the
scripts/get_maintainer.pl script to find out who to CC on them.  It’s
very easy to overlook patches on qemu-devel :/]

On 07.12.21 11:56, Yi Wang wrote:

From: Long YunJian 

when blockcommit from active leaf node, sometimes, we get assertion failed with
"mirror_run: Assertion `QLIST_EMPTY(&bs->tracked_requests)' failed" messages.
According to the core file, we find bs->tracked_requests has IO request,
so assertion failed.
(gdb) bt
#0  0x7f410df707cf in raise () from /lib64/libc.so.6
#1  0x7f410df5ac05 in abort () from /lib64/libc.so.6
#2  0x7f410df5aad9 in __assert_fail_base.cold.0 () from /lib64/libc.so.6
#3  0x7f410df68db6 in __assert_fail () from /lib64/libc.so.6
#4  0x556915635371 in mirror_run (job=0x556916ff8600, errp=) 
at block/mirror.c:1092
#5  0x5569155e6c53 in job_co_entry (opaque=0x556916ff8600) at job.c:904
#6  0x5569156d9483 in coroutine_trampoline (i0=, i1=) at util/coroutine-ucontext.c:115
(gdb) p s->mirror_top_bs->backing->bs->tracked_requests
$12 = {lh_first = 0x7f3f07bfb8b0}
(gdb) p s->mirror_top_bs->backing->bs->tracked_requests->lh_first
$13 = (struct BdrvTrackedRequest *) 0x7f3f07bfb8b0

Actually, before executing assert(QLIST_EMPTY(&bs->tracked_requests)),
it will execute mirror_flush(s). It may handle new I/O requests and maybe
pending I/O during this flush. Just like in the bdrv_close function, where
bdrv_drain(bs) is followed by bdrv_flush(bs), we should add a bdrv_drain call
to handle pending I/O after mirror_flush.


Oh.  How is that happening, though?  I would have expected that flushing
the target BB (and associated BDS) only flushes requests to the OS and
lower layers, but the source node (which is `bs`) should (in the case of
commit) always be above the target, so I wouldn’t have expected it to
get any new requests due to this flush.

Do you have a reproducer for this?


As far as I know, flush may do some writes, and then in the qcow2_co_pwritev 
function,
if others already hold the "s->lock" lock, qemu_co_mutex_lock(&s->lock) will go to 
qemu_coroutine_yield,
and do some other things. Maybe it will handle new I/O now.


No, they must not, as we are in a drained section. All possible producers of 
new io requests should be aware of it and should not create new requests. 
Still, the history knows bugs, when requests were created during drained 
section, look at cf3129323f900ef5ddbccbe8 commit.

So, if in drained section (after bdrv_drain_begin() call returned) we see 
something in bs->tracked_requests - that's probably a deeper bug, and we 
shouldn't try to mask it by additional bdrv_drain(). bdrv_drain() inside a drained 
section for same bs should be a no-op.

Could you investigate a bit more? The simplest thing to do is to look at this 
tracked request coroutine, it may help to catch the source of this request. To 
do this, you can use scripts/qemu-gdb.py's coroutine command that shows 
backtrace for coroutine. Unfortunately it doesn't work for coredumps, only for 
alive process.

So, you'll need:

1. start your vm
2. attach with gdb to qemu process, and in gdb do "source 
/path/to/qemu/scripts/qemu-gdb.py"
3. do the reproduce
4. In gdb, run command "qemu coroutine COROUTINE_POINTER". And COROUTINE_POINTER you'll find 
inside s->mirror_top_bs->backing->bs->tracked_requests->lh_first, it is its .co field.

It should print back-trace of the coroutine.

Another approach could be try to set a breakpoint on adding an element to tracked_requests with 
a condition that bs->quiesce_counter > 0  (which is, as I understand, a kind of 
"drain counter" actually)

--
Best regards,
Vladimir

Re: [RFC] block-backend: prevent dangling BDS pointer in blk_drain()

2021-12-09 Thread Stefan Hajnoczi

On Thu, Dec 09, 2021 at 04:45:13PM +0100, Hanna Reitz wrote:
> On 09.12.21 15:23, Stefan Hajnoczi wrote:
> > The BlockBackend root child can change during bdrv_drained_begin() when
> > aio_poll() is invoked. In fact the BlockDriverState can reach refcnt 0
> > and blk_drain() is left with a dangling BDS pointer.
> > 
> > One example is scsi_device_purge_requests(), which calls blk_drain() to
> > wait for in-flight requests to cancel. If the backup blockjob is active,
> > then the BlockBackend root child is a temporary filter BDS owned by the
> > blockjob. The blockjob can complete during bdrv_drained_begin() and the
> > last reference to the BDS is released when the temporary filter node is
> > removed. This results in a use-after-free when blk_drain() calls
> > bdrv_drained_end(bs) on the dangling pointer.
> > 
> > The general problem is that a function and its callers must not assume
> > that bs is still valid across aio_poll(). Explicitly hold a reference to
> > bs in blk_drain() to avoid the dangling pointer.
> > 
> > Signed-off-by: Stefan Hajnoczi 
> > ---
> > I found that BDS nodes are sometimes deleted with bs->quiesce_counter >
> > 0 (at least when running "make check"), so it is currently not possible
> > to put the bdrv_ref/unref() calls in bdrv_do_drained_begin() and
> > bdrv_do_drained_end() because they will be unbalanced. That would have
> > been a more general solution than only fixing blk_drain().
> 
> Deleting nodes that have a `quiesce_counter > 0` doesn’t seem wrong to me –
> deleting only depends on strong references, and so I’d expect that anything
> that increases the quiesce_counter also has a strong reference to the node
> if the former wants the latter to stay around.
> 
> I suppose we could make it so that both the quiesce_counter and the refcnt
> need to be 0 before a BDS is deleted (and then deletion can happen both from
> bdrv_unref() and drained_end), but I don’t know whether that’s really
> necessary.  I’d rather leave it to the caller to ensure they keep a strong
> reference throughout the drain.
> 
> The question is, how often do we have a situation like this, where we take a
> weak reference for draining, because we assume there’s a strong reference
> backing us up (namely the one through blk->root), but that strong reference
> then can go away due to draining...
> 
> > Any suggestions for a better fix?
> 
> The fix makes sense to me.

Okay. My concern was that this is a whole class of bugs and my patch
only fixes blk_drain(). I have audited the code some more in the
meantime.

bdrv_insert_node() may be unsafe in the case where bs is a temporary
filter node that is unref'd during bdrv_drained_begin():

  BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *options,
 int flags, Error **errp)
  {
  ERRP_GUARD();
  int ret;
  BlockDriverState *new_node_bs = NULL;
  const char *drvname, *node_name;
  BlockDriver *drv;
  
  drvname = qdict_get_try_str(options, "driver");
  if (!drvname) {
  error_setg(errp, "driver is not specified");
  goto fail;
  }
  
  drv = bdrv_find_format(drvname);
  if (!drv) {
  error_setg(errp, "Unknown driver: '%s'", drvname);
  goto fail;
  }
  
  node_name = qdict_get_try_str(options, "node-name");
  
  new_node_bs = bdrv_new_open_driver_opts(drv, node_name, options, flags,
  errp);
  options = NULL; /* bdrv_new_open_driver() eats options */
  if (!new_node_bs) {
  error_prepend(errp, "Could not create node: ");
  goto fail;
  }
  
  bdrv_drained_begin(bs);
  ^^^ <--- bs can be dangling pointer
  ret = bdrv_replace_node(bs, new_node_bs, errp);
  bdrv_drained_end(bs);

The fix isn't as simple as blk_drain() because we don't want to insert
the new node before the now-deleted node. I think the correct way to
insert a node is against BdrvChild, not BlockDriverState. That way we
can be sure the new node will be inserted into a graph that is reachable
via BdrvChild (e.g. BlockBackend) instead of a detached BDS.

bdrv_set_aio_context_ignore() and blk_io_limits_disable() need to ref bs
like blk_drain() in this patch.

There are some other bdrv_drained_begin() calls that I'm assuming are
safe because they are during creation/deletion so I think we have strong
references there or nothing else knows about our BDS yet.

Do you agree with extending this patch series to cover the functions I
mentioned above?

> One alternative that comes to my mind is to instead re-fetch `bs =
> blk_bs(blk);` after the AIO_WAIT_WHILE() loop.  But that might be wrong,
> because if the node attached to the BB changed (i.e. isn’t `bs`, and isn’t
> `NULL`), then we’d end the drain on the wrong node.

Yes.

Stefan


signature.asc
Description: PGP signature

Re: QEMU 6.2.0 and rhbz#1999878

2021-12-09 Thread Eduardo Lima

Thanks all, I saw the patch has been merged and is part of rc4. I'm
removing it from the fedora package.

On Fri, Dec 3, 2021 at 9:09 PM Richard Henderson <
richard.hender...@linaro.org> wrote:

> On 12/3/21 2:00 PM, Richard Henderson wrote:
> >> Oh I see, it was indeed replaced by Richard Henderson's patch:
> >>
> >>
> https://src.fedoraproject.org/rpms/qemu/blob/rawhide/f/0001-tcg-arm-Reduce-vector-alignment-requirement-for-NEON.patch
> >>
> >>
> >>> At the moment I kept it as part of 6.2.0 build, which I am just about
> to push
> >>> to rawhide. It builds locally, and I am only waiting for the
> scratch-build to
> >>> finish.
> >>
> >> Yes looks like we need to keep it, and get it upstream too.
> >
> > Whoops.  That dropped through the cracks.
> > I'll queue that now-ish.
>
>
> https://patchew.org/QEMU/20210912174925.200132-1-richard.hender...@linaro.org/
>
> Ah right, I was supposed to test your kernel and never got there.
> Plus it never got any r-b's.
>
> Rebase was smooth and regression testing went ok on cortex-a57 host.
>
>
> r~
>
>

Re: [RFC] block-backend: prevent dangling BDS pointer in blk_drain()

2021-12-09 Thread Vladimir Sementsov-Ogievskiy


09.12.2021 18:45, Hanna Reitz wrote:

On 09.12.21 15:23, Stefan Hajnoczi wrote:

The BlockBackend root child can change during bdrv_drained_begin() when
aio_poll() is invoked. In fact the BlockDriverState can reach refcnt 0
and blk_drain() is left with a dangling BDS pointer.

One example is scsi_device_purge_requests(), which calls blk_drain() to
wait for in-flight requests to cancel. If the backup blockjob is active,
then the BlockBackend root child is a temporary filter BDS owned by the
blockjob. The blockjob can complete during bdrv_drained_begin() and the
last reference to the BDS is released when the temporary filter node is
removed. This results in a use-after-free when blk_drain() calls
bdrv_drained_end(bs) on the dangling pointer.

The general problem is that a function and its callers must not assume
that bs is still valid across aio_poll(). Explicitly hold a reference to
bs in blk_drain() to avoid the dangling pointer.

Signed-off-by: Stefan Hajnoczi 
---
I found that BDS nodes are sometimes deleted with bs->quiesce_counter >
0 (at least when running "make check"), so it is currently not possible
to put the bdrv_ref/unref() calls in bdrv_do_drained_begin() and
bdrv_do_drained_end() because they will be unbalanced. That would have
been a more general solution than only fixing blk_drain().


Deleting nodes that have a `quiesce_counter > 0` doesn’t seem wrong to me – 
deleting only depends on strong references, and so I’d expect that anything that 
increases the quiesce_counter also has a strong reference to the node if the 
former wants the latter to stay around.

I suppose we could make it so that both the quiesce_counter and the refcnt need 
to be 0 before a BDS is deleted (and then deletion can happen both from 
bdrv_unref() and drained_end), but I don’t know whether that’s really 
necessary.  I’d rather leave it to the caller to ensure they keep a strong 
reference throughout the drain.


Agree. Better to keep the ref-count behavior obvious.



The question is, how often do we have a situation like this, where we take a weak 
reference for draining, because we assume there’s a strong reference backing us up 
(namely the one through blk->root), but that strong reference then can go away 
due to draining...


Any suggestions for a better fix?


The fix makes sense to me.

One alternative that comes to my mind is to instead re-fetch `bs = 
blk_bs(blk);` after the AIO_WAIT_WHILE() loop.  But that might be wrong, 
because if the node attached to the BB changed (i.e. isn’t `bs`, and isn’t 
`NULL`), then we’d end the drain on the wrong node.

So I think your fix is the right one.



I agree.

Interesting how many code paths that care to take a strong reference are 
actually prepared to the fact that block-graph may change, and this bs may be 
in some other place, with changed permissions, children and parents :/

Is modifying the graph during drain safe? Hmm, we probably always do graph 
modification in a drained section on purpose :) As I understand, all the logic 
about quiesce_counter is to support exactly this. And the only logic that seems 
correct is to finish drain on same node where it was started.

Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir

Re: [PULL 04/13] target/rx: TCG helpers

2021-12-09 Thread Peter Maydell

On Tue, 17 Mar 2020 at 16:43, Philippe Mathieu-Daudé  wrote:
>
> From: Yoshinori Sato 
>
> Reviewed-by: Richard Henderson 
> Tested-by: Philippe Mathieu-Daudé 
> Signed-off-by: Yoshinori Sato 
> Signed-off-by: Richard Henderson 
> [PMD: Removed tlb_fill, extracted from patch of Yoshinori Sato
>  'Convert to CPUClass::tlb_fill']
> Signed-off-by: Philippe Mathieu-Daudé 
> Message-Id: <20200224141923.82118-6-ys...@users.sourceforge.jp>
> Acked-by: Richard Henderson 
> Signed-off-by: Philippe Mathieu-Daudé 

Somewhat late, but I've just noticed a bug in the helper_set_fpsw()
function introduced in this patch. The function has changed a little
since but the bug is still there in the version in git:

> +void helper_set_fpsw(CPURXState *env, uint32_t val)
> +{
> +static const int roundmode[] = {
> +float_round_nearest_even,
> +float_round_to_zero,
> +float_round_up,
> +float_round_down,
> +};
> +uint32_t fpsw = env->fpsw;
> +fpsw |= 0x7f03;
> +val &= ~0x8000;
> +fpsw &= val;
> +FIELD_DP32(fpsw, FPSW, FS, FIELD_EX32(fpsw, FPSW, FLAGS) != 0);

FIELD_DP32() does not update its first argument, it merely reads
it. It returns the new value with the field change applied, so
you need to use it like this:

fpsw = FIELD_DP32(fpsw, ...);

Would somebody like to write a patch ?

(I noticed this because I just made the same mistake in some new
code I was writing, so I did a quick grep of the codebase to see
if there were any instances of it already present. I think the macro
magic used in the definitions of FIELD_DP* to provide a compile error
if you pass a value that's bigger than the target field has the
unfortunate side effect of suppressing the compiler warning that the
whole statement has no effect.)

thanks
-- PMM

Re: [RFC] vhost-vdpa-net: add vhost-vdpa-net host device support

2021-12-09 Thread Stefano Garzarella


On Thu, Dec 09, 2021 at 09:16:58AM +0000, Stefan Hajnoczi wrote:

On Wed, Dec 08, 2021 at 01:20:10PM +0800, Longpeng(Mike) wrote:

From: Longpeng 

Hi guys,

This patch introduces vhost-vdpa-net device, which is inspired
by vhost-user-blk and the proposal of vhost-vdpa-blk device [1].

I've tested this patch on Huawei's offload card:
./x86_64-softmmu/qemu-system-x86_64 \
-device vhost-vdpa-net-pci,vdpa-dev=/dev/vhost-vdpa-0

For virtio hardware offloading, the most important requirement for us
is to support live migration between offloading cards from different
vendors, the combination of netdev and virtio-net seems too heavy, we
prefer a lightweight way.

Maybe we could support both in the future ? Such as:

* Lightweight
 Net: vhost-vdpa-net
 Storage: vhost-vdpa-blk

* Heavy but more powerful
 Net: netdev + virtio-net + vhost-vdpa
 Storage: bdrv + virtio-blk + vhost-vdpa

[1] https://www.mail-archive.com/qemu-devel@nongnu.org/msg797569.html


Stefano presented a plan for vdpa-blk at KVM Forum 2021:
https://kvmforum2021.sched.com/event/ke3a/vdpa-blk-unified-hardware-and-software-offload-for-virtio-blk-stefano-garzarella-red-hat

It's closer to today's virtio-net + vhost-net approach than the
vhost-vdpa-blk device you have mentioned. The idea is to treat vDPA as
an offload feature rather than a completely separate code path that
needs to be maintained and tested. That way QEMU's block layer features
and live migration work with vDPA devices and re-use the virtio-blk
code. The key functionality that has not been implemented yet is a "fast
path" mechanism that allows the QEMU virtio-blk device's virtqueue to be
offloaded to vDPA.

The unified vdpa-blk architecture should deliver the same performance
as the vhost-vdpa-blk device you mentioned but with more features, so I
wonder what aspects of the vhost-vdpa-blk idea are important to you?

QEMU already has vhost-user-blk, which takes a similar approach as the
vhost-vdpa-blk device you are proposing. I'm not against the
vhost-vdpa-blk approach in principle, but would like to understand your
requirements and see if there is a way to collaborate on one vdpa-blk
implementation instead of dividing our efforts between two.


While waiting for the aspects that Stefan asked about, let me add some details about the 
plan for vdpa-blk.


Currently I'm working on the in-kernel software device. In the next 
months I hope to start working on the QEMU part. Anyway that part could 
go in parallel with the in-kernel device, so if you are interested we 
can collaborate.


Having only the unified vdpa-blk architecture would allow us to simplify 
the management layers and avoid duplicate code, but it takes more time 
to develop compared to vhost-vdpa-blk. So if vdpa-blk support in QEMU is 
urgent, I could understand the need to add vhost-vdpa-blk now.


Let me know if you want more details about the unified vdpa-blk 
architecture.


Thanks,
Stefano

Re: RFC: x86 memory map, where to put CXL ranges?

2021-12-09 Thread Jonathan Cameron via

On Thu, 09 Dec 2021 14:19:59 +0000
Alex Bennée  wrote:

> Jonathan Cameron  writes:
> 
> > Hi All,
> >
> > For CXL emulation we require a couple of types of memory range that
> > are then provided to the OS via the CEDT ACPI table.
> >
> > 1) CXL Host Bridge Structures point to CXL Host Bridge Component Registers.
> > Small regions for each CXL Host bridge that are mapped into the memory 
> > space.
> > 64k each.  In theory we may have a huge number of these but in reality I
> > think 16 will do for any reasonable system.
> >
> > 2) CXL Fixed Memory Window Structures (CFMWS)
> > Large PA space ranges (multiple TB) to which various CXL devices can be 
> > assigned
> > and their address decoders appropriately programmed.
> > Each such CFMWS will have particular characteristics such as interleaving 
> > across
> > multiple host bridges.  They can potentially be huge but are a system
> > characteristic.  For emulation purposes it won't matter if they move around
> > dependent on what else the machine has configured. So I'd like to
> > just configure their size rather than fully specify them at the command line
> > and possibly clash on PA space with something else.  Alternatively could
> > leave them as fully specified at the command line (address and size) and 
> > just
> > error out if they hit memory already in use for something else.
> >
> > Now unfortunately there are no systems out there yet that we can just
> > copy the memory map from...
> >
> > Coming from an Arm background I have only a vague idea of how this should be
> > done for x86 so apologies if it is a stupid question.
> >
> > My current approach is to put these above device_memory and moving
> > the pci hole up appropriately.  
> 
> Which board model would we be talking about here? virt? Or maybe we need
> a new one?

Initially at least the plan is virt because all the acpi table building support
is there which makes things nice and easy.  It's not very invasive and there
has to be a machine level cxl=on to enable it so shouldn't be a maintenance 
problem
and doesn't justify new board model.

> 
> If it's virt I would look at extended_memmap which floats above the
> configured RAM size and means less shuffling around of the relatively
> crowded lower address space.

I'll give that a go.

There are two types of regions to find space for.
1) Root complex control registers.  Those can go in the memory map.  Currently
I have them fitted in a gap in the low memory map but I'll move them to the 
extended
one as it doesn't matter if they are at a fixed address or not.
2) Fixed memory windows described in CEDT/CFMWS entries.
These are less obvious.  They are PA ranges with associated interleaving
(across host bridges) and performance properties (related to platform QoS
control etc). They act as PA ranges into which we can assign particular devices.

For an initial patch set I plan to support no interleaving and just have
one such region per host bridge (currently pxb instance).
For these regions, my current choice is to put them above device_memory.
So at the command line you can specify a set of region sizes and we
then lay them out appropriately and describe those in CFMWS entries.
Ben's earlier code had these fully specified as base + size, but I don't
think there is any advantage in doing that as the info is presented to
the OS etc anyway so it doesn't matter if it changes. If it is
figured out at runtime we will have less problem with accidental clashes.

> 
> I have no idea about how this is handled on x86 though.

I'll go with the show the code for that :)

Thanks,

Jonathan

> 
> > Is that the right choice?
> >
> > On Arm I currently have the Host Bridge Structures low down in the MemMap 
> > and the CFMWS
> > can go above the device memory.  Comments on that also welcome.
> >
> > In Ben's RFC the host bridge component register location was marked as a 
> > TODO
> > and a arbitrary address used in the meantime so time to figure out how to 
> > clean
> > that up.
> >
> > Thanks,
> >
> > Jonathan  
> 
>

Re: [PULL 0/1] Block patches

2021-12-09 Thread Peter Maydell

On Thu, 9 Dec 2021 at 15:21, Stefan Hajnoczi  wrote:
>
> The following changes since commit a3607def89f9cd68c1b994e1030527df33aa91d0:
>
>   Update version for v6.2.0-rc4 release (2021-12-07 17:51:38 -0800)
>
> are available in the Git repository at:
>
>   https://gitlab.com/stefanha/qemu.git tags/block-pull-request
>
> for you to fetch changes up to cf4fbc3030c974fff726756a7ceef8386cdf500b:
>
>   block/nvme: fix infinite loop in nvme_free_req_queue_cb() (2021-12-09 
> 09:19:49 +0000)
>
> 
> Pull request
>
> An infinite loop fix for the userspace NVMe driver.
>
> 

I'm not running the release cycle this time around, but: it's
already rc4, pull requests by this point need a clear justification
in the cover letter for why they're really release critical.

-- PMM

Re: [RFC] block-backend: prevent dangling BDS pointer in blk_drain()

2021-12-09 Thread Hanna Reitz


On 09.12.21 15:23, Stefan Hajnoczi wrote:

The BlockBackend root child can change during bdrv_drained_begin() when
aio_poll() is invoked. In fact the BlockDriverState can reach refcnt 0
and blk_drain() is left with a dangling BDS pointer.

One example is scsi_device_purge_requests(), which calls blk_drain() to
wait for in-flight requests to cancel. If the backup blockjob is active,
then the BlockBackend root child is a temporary filter BDS owned by the
blockjob. The blockjob can complete during bdrv_drained_begin() and the
last reference to the BDS is released when the temporary filter node is
removed. This results in a use-after-free when blk_drain() calls
bdrv_drained_end(bs) on the dangling pointer.

The general problem is that a function and its callers must not assume
that bs is still valid across aio_poll(). Explicitly hold a reference to
bs in blk_drain() to avoid the dangling pointer.

Signed-off-by: Stefan Hajnoczi 
---
I found that BDS nodes are sometimes deleted with bs->quiesce_counter >
0 (at least when running "make check"), so it is currently not possible
to put the bdrv_ref/unref() calls in bdrv_do_drained_begin() and
bdrv_do_drained_end() because they will be unbalanced. That would have
been a more general solution than only fixing blk_drain().


Deleting nodes that have a `quiesce_counter > 0` doesn’t seem wrong to 
me – deleting only depends on strong references, and so I’d expect that 
anything that increases the quiesce_counter also has a strong reference 
to the node if the former wants the latter to stay around.


I suppose we could make it so that both the quiesce_counter and the 
refcnt need to be 0 before a BDS is deleted (and then deletion can 
happen both from bdrv_unref() and drained_end), but I don’t know whether 
that’s really necessary.  I’d rather leave it to the caller to ensure 
they keep a strong reference throughout the drain.


The question is, how often do we have a situation like this, where we 
take a weak reference for draining, because we assume there’s a strong 
reference backing us up (namely the one through blk->root), but that 
strong reference then can go away due to draining...



Any suggestions for a better fix?


The fix makes sense to me.

One alternative that comes to my mind is to instead re-fetch `bs = 
blk_bs(blk);` after the AIO_WAIT_WHILE() loop.  But that might be wrong, 
because if the node attached to the BB changed (i.e. isn’t `bs`, and 
isn’t `NULL`), then we’d end the drain on the wrong node.


So I think your fix is the right one.

Hanna


I think it's likely that more "dangling pointer across aio_poll()"
problems exist :(.

Here is the (hacky) reproducer:

   build/qemu-system-x86_64 \
  -name 'avocado-vt-vm1'  \
  -sandbox on  \
  -machine q35,memory-backend=mem-machine_mem \
  -device 
pcie-root-port,id=pcie-root-port-0,multifunction=on,bus=pcie.0,addr=0x1,chassis=1
 \
  -device 
pcie-pci-bridge,id=pcie-pci-bridge-0,addr=0x0,bus=pcie-root-port-0  \
  -nodefaults \
  -device VGA,bus=pcie.0,addr=0x2 \
  -m 1024 \
  -object memory-backend-ram,size=1024M,id=mem-machine_mem  \
  -smp 10,maxcpus=10,cores=5,threads=1,dies=1,sockets=2  \
  -cpu 'Cascadelake-Server-noTSX',+kvm_pv_unhalt \
  -chardev 
socket,wait=off,server=on,id=qmp_id_qmpmonitor1,path=/tmp/qmp.sock  \
  -mon chardev=qmp_id_qmpmonitor1,mode=control \
  -chardev 
socket,wait=off,server=on,id=qmp_id_catch_monitor,path=/tmp/catch_monitor.sock  
\
  -mon chardev=qmp_id_catch_monitor,mode=control \
  -device pvpanic,ioport=0x505,id=idgKHYrQ \
  -chardev 
socket,wait=off,server=on,id=chardev_serial0,path=/tmp/serial.sock \
  -device isa-serial,id=serial0,chardev=chardev_serial0  \
  -chardev 
socket,id=seabioslog_id_2020-012521-TNCkxDmn,path=/tmp/seabios.sock,server=on,wait=off
 \
  -device 
isa-debugcon,chardev=seabioslog_id_2020-012521-TNCkxDmn,iobase=0x402 \
  -device 
pcie-root-port,id=pcie-root-port-2,port=0x2,addr=0x1.0x2,bus=pcie.0,chassis=3 \
  -device virtio-scsi-pci,id=virtio_scsi_pci0,bus=pcie-root-port-2,addr=0x0 
\
  -blockdev 
node-name=file_image1,driver=file,auto-read-only=on,discard=unmap,aio=threads,filename=test.img,cache.direct=on,cache.no-flush=off
 \
  -blockdev 
node-name=drive_image1,driver=raw,read-only=off,cache.direct=on,cache.no-flush=off,file=file_image1
 \
  -device scsi-hd,id=image1,drive=drive_image1,write-cache=on \
  -blockdev 
node-name=file_src1,driver=file,auto-read-only=on,discard=unmap,aio=threads,filename=sr1.qcow2,cache.direct=on,cache.no-flush=off
 \
  -blockdev 
node-name=drive_src1,driver=qcow2,read-only=off,cache.direct=on,cache.no-flush=off,file=file_src1
 \
  -device scsi-hd,id=src1,drive=drive_src1,write-cache=on \
  -device 
pcie-root-port,id=pcie-root-port-3,port=0x3,addr=0x1.0x3,bus=pcie.0,chassis=4 \
  -device 
virtio-net-pci,mac=9a:11:64:b0:5d:a8,id=idxnEEYY,netdev=idBjpylo,bus=pcie-root-port-3,addr=0x0
  \

Re: [PATCH v2] Move the libssh setup from configure to meson.build

2021-12-09 Thread Richard W.M. Jones

On Thu, Dec 09, 2021 at 04:08:24PM +0100, Thomas Huth wrote:
> On 09/12/2021 15.55, Richard W.M. Jones wrote:
> >On Thu, Dec 09, 2021 at 03:48:01PM +0100, Thomas Huth wrote:
> >>It's easier to do this in meson.build now.
> >>
> >>Signed-off-by: Thomas Huth 
> >>---
> >>  v2: Added the missing "config_host_data.set('CONFIG_LIBSSH', 
> >> libssh.found())"
> >>
> >>  configure | 27 ---
> >>  meson.build   | 13 +
> >>  meson_options.txt |  2 ++
> >>  scripts/meson-buildoptions.sh |  3 +++
> >>  4 files changed, 14 insertions(+), 31 deletions(-)
> >>
> >>diff --git a/configure b/configure
> >>index 48c21775f3..bb99a40ed0 100755
> >>--- a/configure
> >>+++ b/configure
> >>@@ -344,7 +344,6 @@ debug_stack_usage="no"
> >>  crypto_afalg="no"
> >>  tls_priority="NORMAL"
> >>  tpm="$default_feature"
> >>-libssh="$default_feature"
> >>  live_block_migration=${default_feature:-yes}
> >>  numa="$default_feature"
> >>  replication=${default_feature:-yes}
> >>@@ -1078,10 +1077,6 @@ for opt do
> >>;;
> >>--enable-tpm) tpm="yes"
> >>;;
> >>-  --disable-libssh) libssh="no"
> >>-  ;;
> >>-  --enable-libssh) libssh="yes"
> >>-  ;;
> >>--disable-live-block-migration) live_block_migration="no"
> >>;;
> >>--enable-live-block-migration) live_block_migration="yes"
> >>@@ -1448,7 +1443,6 @@ cat << EOF
> >>live-block-migration   Block migration in the main migration stream
> >>coroutine-pool  coroutine freelist (better performance)
> >>tpm TPM support
> >>-  libssh  ssh block device support
> >>numalibnuma support
> >>avx2AVX2 optimization support
> >>avx512f AVX512F optimization support
> >>@@ -2561,21 +2555,6 @@ if test "$modules" = yes; then
> >>  fi
> >>  fi
> >>-##
> >>-# libssh probe
> >>-if test "$libssh" != "no" ; then
> >>-  if $pkg_config --exists "libssh >= 0.8.7"; then
> >>-libssh_cflags=$($pkg_config libssh --cflags)
> >>-libssh_libs=$($pkg_config libssh --libs)
> >>-libssh=yes
> >>-  else
> >>-if test "$libssh" = "yes" ; then
> >>-  error_exit "libssh required for --enable-libssh"
> >>-fi
> >>-libssh=no
> >>-  fi
> >>-fi
> >>-
> >>  ##
> >>  # TPM emulation is only on POSIX
> >>@@ -3636,12 +3615,6 @@ if test "$cmpxchg128" = "yes" ; then
> >>echo "CONFIG_CMPXCHG128=y" >> $config_host_mak
> >>  fi
> >>-if test "$libssh" = "yes" ; then
> >>-  echo "CONFIG_LIBSSH=y" >> $config_host_mak
> >>-  echo "LIBSSH_CFLAGS=$libssh_cflags" >> $config_host_mak
> >>-  echo "LIBSSH_LIBS=$libssh_libs" >> $config_host_mak
> >>-fi
> >>-
> >>  if test "$live_block_migration" = "yes" ; then
> >>echo "CONFIG_LIVE_BLOCK_MIGRATION=y" >> $config_host_mak
> >>  fi
> >>diff --git a/meson.build b/meson.build
> >>index 96de1a6ef9..ae67ca28ab 100644
> >>--- a/meson.build
> >>+++ b/meson.build
> >>@@ -874,11 +874,15 @@ if not get_option('glusterfs').auto() or have_block
> >>  ''', dependencies: glusterfs)
> >>endif
> >>  endif
> >>+
> >>  libssh = not_found
> >>-if 'CONFIG_LIBSSH' in config_host
> >>-  libssh = declare_dependency(compile_args: 
> >>config_host['LIBSSH_CFLAGS'].split(),
> >>-  link_args: 
> >>config_host['LIBSSH_LIBS'].split())
> >>+if not get_option('libssh').auto() or have_block
> >>+  libssh = dependency('libssh', version: '>=0.8.7',
> >>+method: 'pkg-config',
> >>+required: get_option('libssh'),
> >>+kwargs: static_kwargs)
> >>  endif
> >>+
> >>  libbzip2 = not_found
> >>  if not get_option('bzip2').auto() or have_block
> >>libbzip2 = cc.find_library('bz2', has_headers: ['bzlib.h'],
> >>@@ -1451,6 +1455,7 @@ config_host_data.set('CONFIG_EBPF', libbpf.found())
> >>  config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
> >>  config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
> >>  config_host_data.set('CONFIG_LIBNFS', libnfs.found())
> >>+config_host_data.set('CONFIG_LIBSSH', libssh.found())
> >>  config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
> >>  config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
> >>  config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
> >>@@ -3430,7 +3435,7 @@ endif
> >>  summary_info += {'seccomp support':   seccomp}
> >>  summary_info += {'GlusterFS support': glusterfs}
> >>  summary_info += {'TPM support':   config_host.has_key('CONFIG_TPM')}
> >>-summary_info += {'libssh support':config_host.has_key('CONFIG_LIBSSH')}
> >>+summary_info += {'libssh support':libssh}
> >>  summary_info += {'lzo support':   lzo}
> >>  summary_info += {'snappy support':snappy}
> >>  summary_info += {'bzip2 support': libbzip2}
> >>diff --git a/meson_options.txt b/meson_options.txt
> >>index e392323732..4114bfcaa4 100644
> >>--- a/meson_options.txt
> >>+++

[PATCH] vvfat: Fix vvfat_write() for writes before the root directory

2021-12-09 Thread Kevin Wolf

The calculation in sector2cluster() is done relative to the offset of
the root directory. Any writes to blocks before the start of the root
directory (in particular, writes to the FAT) result in negative values,
which are not handled correctly in vvfat_write().

This changes sector2cluster() to return a signed value, and makes sure
that vvfat_write() doesn't try to find mappings for negative cluster
number. It clarifies the code in vvfat_write() to make it more obvious
that the cluster numbers can be negative.

Signed-off-by: Kevin Wolf 
---
 block/vvfat.c | 30 ++
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index 9deb552e0e..99a3bc2568 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -882,7 +882,7 @@ static int read_directory(BDRVVVFATState* s, int 
mapping_index)
 return 0;
 }
 
-static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num)
+static inline int32_t sector2cluster(BDRVVVFATState* s,off_t sector_num)
 {
 return (sector_num - s->offset_to_root_dir) / s->sectors_per_cluster;
 }
@@ -2971,6 +2971,7 @@ static int vvfat_write(BlockDriverState *bs, int64_t 
sector_num,
 {
 BDRVVVFATState *s = bs->opaque;
 int i, ret;
+int first_cluster, last_cluster;
 
 DLOG(checkpoint());
 
@@ -2989,9 +2990,20 @@ DLOG(checkpoint());
 if (sector_num < s->offset_to_fat)
 return -1;
 
-for (i = sector2cluster(s, sector_num);
-i <= sector2cluster(s, sector_num + nb_sectors - 1);) {
-mapping_t* mapping = find_mapping_for_cluster(s, i);
+/*
+ * Values will be negative for writes to the FAT, which is located before
+ * the root directory.
+ */
+first_cluster = sector2cluster(s, sector_num);
+last_cluster = sector2cluster(s, sector_num + nb_sectors - 1);
+
+for (i = first_cluster; i <= last_cluster;) {
+mapping_t *mapping = NULL;
+
+if (i >= 0) {
+mapping = find_mapping_for_cluster(s, i);
+}
+
 if (mapping) {
 if (mapping->read_only) {
 fprintf(stderr, "Tried to write to write-protected file %s\n",
@@ -3031,8 +3043,9 @@ DLOG(checkpoint());
 }
 }
 i = mapping->end;
-} else
+} else {
 i++;
+}
 }
 
 /*
@@ -3046,10 +3059,11 @@ DLOG(fprintf(stderr, "Write to qcow backend: %d + 
%d\n", (int)sector_num, nb_sec
 return ret;
 }
 
-for (i = sector2cluster(s, sector_num);
-i <= sector2cluster(s, sector_num + nb_sectors - 1); i++)
-if (i >= 0)
+for (i = first_cluster; i <= last_cluster; i++) {
+if (i >= 0) {
 s->used_clusters[i] |= USED_ALLOCATED;
+}
+}
 
 DLOG(checkpoint());
 /* TODO: add timeout */
-- 
2.31.1

Re: [PATCH] target/ppc: powerpc_excp: Guard ALIGNMENT interrupt with CONFIG_TCG

2021-12-09 Thread Fabiano Rosas

Philippe Mathieu-Daudé  writes:

> On 12/9/21 00:06, Fabiano Rosas wrote:
>> We cannot have TCG code in powerpc_excp because the function is called
>> from kvm-only code via ppc_cpu_do_interrupt:
>> 
>>  ../target/ppc/excp_helper.c:463:29: error: implicit declaration of
>>  function ‘cpu_ldl_code’ [-Werror=implicit-function-declaration]
>> 
>> Fortunately, the Alignment interrupt is not among the ones dispatched
>> from kvm-only code, so we can keep it out of the disable-tcg build for
>> now.
>> 
>> Fixes: 336e91f853 ("target/ppc: Move SPR_DSISR setting to powerpc_excp")
>> Signed-off-by: Fabiano Rosas 
>> 
>> ---
>> 
>> Perhaps we could make powerpc_excp TCG only and have a separate
>> function that only knows the two interrupts that we use with KVM
>> (Program, Machine check). But for now this fix will do, I think.
>
> If KVM only uses 2 exception vectors, you could guard the
> enum in target/ppc/cpu.h using #ifdef'ry. While making the
> include uglier, it will help to catch vector misuse at
> compile time.

Yes, good point.

I just noticed that we also use System Reset with KVM. The other two are
kvm-only, but this one is in code shared with TCG, so it will need a bit
more work to disentangle. But should still be doable.

>> ---
>>  target/ppc/excp_helper.c | 2 ++
>>  1 file changed, 2 insertions(+)

[PATCH] vvfat: Fix size of temporary qcow file

2021-12-09 Thread Kevin Wolf

The size of the qcow file was calculated so that only the FAT partition
would fit on it, but not the whole disk. However, offsets relative to
the whole disk are used to access it, so increase its size to be large
enough for that.

Signed-off-by: Kevin Wolf 
---
 block/vvfat.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index 05e78e3c27..9deb552e0e 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -1230,6 +1230,7 @@ static int vvfat_open(BlockDriverState *bs, QDict 
*options, int flags,
  dirname, cyls, heads, secs));
 
 s->sector_count = cyls * heads * secs - s->offset_to_bootsector;
+bs->total_sectors = cyls * heads * secs;
 
 if (qemu_opt_get_bool(opts, "rw", false)) {
 if (!bdrv_is_read_only(bs)) {
@@ -1250,8 +1251,6 @@ static int vvfat_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 }
 
-bs->total_sectors = cyls * heads * secs;
-
 if (init_directories(s, dirname, heads, secs, errp)) {
 ret = -EIO;
 goto fail;
@@ -3137,8 +3136,8 @@ static int enable_write_target(BlockDriverState *bs, 
Error **errp)
 }
 
 opts = qemu_opts_create(bdrv_qcow->create_opts, NULL, 0, _abort);
-qemu_opt_set_number(opts, BLOCK_OPT_SIZE, s->sector_count * 512,
-_abort);
+qemu_opt_set_number(opts, BLOCK_OPT_SIZE,
+bs->total_sectors * BDRV_SECTOR_SIZE, _abort);
 qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, "fat:", _abort);
 
 ret = bdrv_create(bdrv_qcow, s->qcow_filename, opts, errp);
-- 
2.31.1

[PULL 0/1] Block patches

2021-12-09 Thread Stefan Hajnoczi

The following changes since commit a3607def89f9cd68c1b994e1030527df33aa91d0:

  Update version for v6.2.0-rc4 release (2021-12-07 17:51:38 -0800)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to cf4fbc3030c974fff726756a7ceef8386cdf500b:

  block/nvme: fix infinite loop in nvme_free_req_queue_cb() (2021-12-09 
09:19:49 +)


Pull request

An infinite loop fix for the userspace NVMe driver.



Stefan Hajnoczi (1):
  block/nvme: fix infinite loop in nvme_free_req_queue_cb()

 block/nvme.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

-- 
2.33.1

[PULL 1/1] block/nvme: fix infinite loop in nvme_free_req_queue_cb()

2021-12-09 Thread Stefan Hajnoczi

When the request free list is exhausted the coroutine waits on
q->free_req_queue for the next free request. Whenever a request is
completed a BH is scheduled to invoke nvme_free_req_queue_cb() and wake
up waiting coroutines.

1. nvme_get_free_req() waits for a free request:

while (q->free_req_head == -1) {
...
trace_nvme_free_req_queue_wait(q->s, q->index);
qemu_co_queue_wait(&q->free_req_queue, &q->lock);
...
}

2. nvme_free_req_queue_cb() wakes up the coroutine:

while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
   ^--- infinite loop when free_req_head == -1
}

nvme_free_req_queue_cb() and the coroutine form an infinite loop when
q->free_req_head == -1. Fix this by checking q->free_req_head in
nvme_free_req_queue_cb(). If the free request list is exhausted, don't
wake waiting coroutines. Eventually an in-flight request will complete
and the BH will be scheduled again, guaranteeing forward progress.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Philippe Mathieu-Daudé 
Message-id: 20211208152246.244585-1-stefa...@redhat.com
Signed-off-by: Stefan Hajnoczi 
---
 block/nvme.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index e4f336d79c..fa360b9b3c 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -206,8 +206,9 @@ static void nvme_free_req_queue_cb(void *opaque)
 NVMeQueuePair *q = opaque;
 
 qemu_mutex_lock(>lock);
-while (qemu_co_enter_next(>free_req_queue, >lock)) {
-/* Retry all pending requests */
+while (q->free_req_head != -1 &&
+   qemu_co_enter_next(>free_req_queue, >lock)) {
+/* Retry waiting requests */
 }
 qemu_mutex_unlock(>lock);
 }
-- 
2.33.1

Re: [PATCH] target/ppc: powerpc_excp: Guard ALIGNMENT interrupt with CONFIG_TCG

2021-12-09 Thread Cédric Le Goater


Richard,

On 12/9/21 16:05, Fabiano Rosas wrote:

Cédric Le Goater  writes:


On 12/9/21 00:06, Fabiano Rosas wrote:

We cannot have TCG code in powerpc_excp because the function is called
from kvm-only code via ppc_cpu_do_interrupt:

   ../target/ppc/excp_helper.c:463:29: error: implicit declaration of
   function ‘cpu_ldl_code’ [-Werror=implicit-function-declaration]

Fortunately, the Alignment interrupt is not among the ones dispatched
from kvm-only code, so we can keep it out of the disable-tcg build for
now.

Fixes: 336e91f853 ("target/ppc: Move SPR_DSISR setting to powerpc_excp")
Signed-off-by: Fabiano Rosas 

---

Perhaps we could make powerpc_excp TCG only and have a separate
function that only knows the two interrupts that we use with KVM
(Program, Machine check). But for now this fix will do, I think.
---
   target/ppc/excp_helper.c | 2 ++
   1 file changed, 2 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 17607adbe4..dcf22440cc 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -453,6 +453,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp)
   }
   break;
   }
+#ifdef CONFIG_TCG
   case POWERPC_EXCP_ALIGN: /* Alignment exception  
*/
   /*
* Get rS/rD and rA from faulting opcode.
@@ -464,6 +465,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp)
   env->spr[SPR_DSISR] |= (insn & 0x03FF) >> 16;
   }
   break;
+#endif
   case POWERPC_EXCP_PROGRAM:   /* Program exception
*/
   switch (env->error_code & ~0xF) {
   case POWERPC_EXCP_FP:



Shouldn't we move that code under ppc_cpu_do_unaligned_access ?


Well, it came from there initially. We could revert 336e91f853 and that
would fix the issue as well.


What would you prefer ?

Thanks,

C.

Re: [PATCH v2] Move the libssh setup from configure to meson.build

2021-12-09 Thread Thomas Huth


On 09/12/2021 15.55, Richard W.M. Jones wrote:

On Thu, Dec 09, 2021 at 03:48:01PM +0100, Thomas Huth wrote:

It's easier to do this in meson.build now.

Signed-off-by: Thomas Huth 
---
  v2: Added the missing "config_host_data.set('CONFIG_LIBSSH', libssh.found())"

  configure | 27 ---
  meson.build   | 13 +
  meson_options.txt |  2 ++
  scripts/meson-buildoptions.sh |  3 +++
  4 files changed, 14 insertions(+), 31 deletions(-)

diff --git a/configure b/configure
index 48c21775f3..bb99a40ed0 100755
--- a/configure
+++ b/configure
@@ -344,7 +344,6 @@ debug_stack_usage="no"
  crypto_afalg="no"
  tls_priority="NORMAL"
  tpm="$default_feature"
-libssh="$default_feature"
  live_block_migration=${default_feature:-yes}
  numa="$default_feature"
  replication=${default_feature:-yes}
@@ -1078,10 +1077,6 @@ for opt do
;;
--enable-tpm) tpm="yes"
;;
-  --disable-libssh) libssh="no"
-  ;;
-  --enable-libssh) libssh="yes"
-  ;;
--disable-live-block-migration) live_block_migration="no"
;;
--enable-live-block-migration) live_block_migration="yes"
@@ -1448,7 +1443,6 @@ cat << EOF
live-block-migration   Block migration in the main migration stream
coroutine-pool  coroutine freelist (better performance)
tpm TPM support
-  libssh  ssh block device support
numalibnuma support
avx2AVX2 optimization support
avx512f AVX512F optimization support
@@ -2561,21 +2555,6 @@ if test "$modules" = yes; then
  fi
  fi
  
-##

-# libssh probe
-if test "$libssh" != "no" ; then
-  if $pkg_config --exists "libssh >= 0.8.7"; then
-libssh_cflags=$($pkg_config libssh --cflags)
-libssh_libs=$($pkg_config libssh --libs)
-libssh=yes
-  else
-if test "$libssh" = "yes" ; then
-  error_exit "libssh required for --enable-libssh"
-fi
-libssh=no
-  fi
-fi
-
  ##
  # TPM emulation is only on POSIX
  
@@ -3636,12 +3615,6 @@ if test "$cmpxchg128" = "yes" ; then

echo "CONFIG_CMPXCHG128=y" >> $config_host_mak
  fi
  
-if test "$libssh" = "yes" ; then

-  echo "CONFIG_LIBSSH=y" >> $config_host_mak
-  echo "LIBSSH_CFLAGS=$libssh_cflags" >> $config_host_mak
-  echo "LIBSSH_LIBS=$libssh_libs" >> $config_host_mak
-fi
-
  if test "$live_block_migration" = "yes" ; then
echo "CONFIG_LIVE_BLOCK_MIGRATION=y" >> $config_host_mak
  fi
diff --git a/meson.build b/meson.build
index 96de1a6ef9..ae67ca28ab 100644
--- a/meson.build
+++ b/meson.build
@@ -874,11 +874,15 @@ if not get_option('glusterfs').auto() or have_block
  ''', dependencies: glusterfs)
endif
  endif
+
  libssh = not_found
-if 'CONFIG_LIBSSH' in config_host
-  libssh = declare_dependency(compile_args: 
config_host['LIBSSH_CFLAGS'].split(),
-  link_args: config_host['LIBSSH_LIBS'].split())
+if not get_option('libssh').auto() or have_block
+  libssh = dependency('libssh', version: '>=0.8.7',
+method: 'pkg-config',
+required: get_option('libssh'),
+kwargs: static_kwargs)
  endif
+
  libbzip2 = not_found
  if not get_option('bzip2').auto() or have_block
libbzip2 = cc.find_library('bz2', has_headers: ['bzlib.h'],
@@ -1451,6 +1455,7 @@ config_host_data.set('CONFIG_EBPF', libbpf.found())
  config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
  config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
  config_host_data.set('CONFIG_LIBNFS', libnfs.found())
+config_host_data.set('CONFIG_LIBSSH', libssh.found())
  config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
  config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
  config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
@@ -3430,7 +3435,7 @@ endif
  summary_info += {'seccomp support':   seccomp}
  summary_info += {'GlusterFS support': glusterfs}
  summary_info += {'TPM support':   config_host.has_key('CONFIG_TPM')}
-summary_info += {'libssh support':config_host.has_key('CONFIG_LIBSSH')}
+summary_info += {'libssh support':libssh}
  summary_info += {'lzo support':   lzo}
  summary_info += {'snappy support':snappy}
  summary_info += {'bzip2 support': libbzip2}
diff --git a/meson_options.txt b/meson_options.txt
index e392323732..4114bfcaa4 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -105,6 +105,8 @@ option('libdaxctl', type : 'feature', value : 'auto',
 description: 'libdaxctl support')
  option('libpmem', type : 'feature', value : 'auto',
 description: 'libpmem support')
+option('libssh', type : 'feature', value : 'auto',
+   description: 'ssh block device support')
  option('libudev', type : 'feature', value : 'auto',
 description: 'Use libudev to enumerate host devices')
  option('libusb', type : 'feature', value : 'auto',
diff --git a/scripts/meson-buildoptions.sh

Re: [PATCH] target/ppc: powerpc_excp: Guard ALIGNMENT interrupt with CONFIG_TCG

2021-12-09 Thread Fabiano Rosas

Cédric Le Goater  writes:

> On 12/9/21 00:06, Fabiano Rosas wrote:
>> We cannot have TCG code in powerpc_excp because the function is called
>> from kvm-only code via ppc_cpu_do_interrupt:
>> 
>>   ../target/ppc/excp_helper.c:463:29: error: implicit declaration of
>>   function ‘cpu_ldl_code’ [-Werror=implicit-function-declaration]
>> 
>> Fortunately, the Alignment interrupt is not among the ones dispatched
>> from kvm-only code, so we can keep it out of the disable-tcg build for
>> now.
>> 
>> Fixes: 336e91f853 ("target/ppc: Move SPR_DSISR setting to powerpc_excp")
>> Signed-off-by: Fabiano Rosas 
>> 
>> ---
>> 
>> Perhaps we could make powerpc_excp TCG only and have a separate
>> function that only knows the two interrupts that we use with KVM
>> (Program, Machine check). But for now this fix will do, I think.
>> ---
>>   target/ppc/excp_helper.c | 2 ++
>>   1 file changed, 2 insertions(+)
>> 
>> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
>> index 17607adbe4..dcf22440cc 100644
>> --- a/target/ppc/excp_helper.c
>> +++ b/target/ppc/excp_helper.c
>> @@ -453,6 +453,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
>> excp_model, int excp)
>>   }
>>   break;
>>   }
>> +#ifdef CONFIG_TCG
>>   case POWERPC_EXCP_ALIGN: /* Alignment exception
>>   */
>>   /*
>>* Get rS/rD and rA from faulting opcode.
>> @@ -464,6 +465,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
>> excp_model, int excp)
>>   env->spr[SPR_DSISR] |= (insn & 0x03FF) >> 16;
>>   }
>>   break;
>> +#endif
>>   case POWERPC_EXCP_PROGRAM:   /* Program exception  
>>   */
>>   switch (env->error_code & ~0xF) {
>>   case POWERPC_EXCP_FP:
>> 
>
> Shouldn't we move that code under ppc_cpu_do_unaligned_access ?

Well, it came from there initially. We could revert 336e91f853 and that
would fix the issue as well.

>
> Thanks,
>
> C.

Re: [PATCH v4 0/5] s390x: CPU Topology

2021-12-09 Thread Pierre Morel


Hi,

This series is updated by a v5 series with documentation and numa 
extensions.
Some changes have been made in some of the patches contained in this 
series too.


Regards,
Pierre

On 11/17/21 17:48, Pierre Morel wrote:

Hi,

This series is a first part of the implementation of CPU topology
for S390 greatly reduced from the first spin.

In particular, we reduced the scope to the S390x specificities, removing
all code touching to SMP or NUMA, with the goal to:
- facilitate review and acceptance
- leave the SMP part, currently actively discussed in mainline, for later
- be able despite the reduction of code to handle CPU topology for S390
   using the current S390 topology provided by QEMU with cores and sockets
   only.

To use these patches, you will need the Linux series version 4.
You find it there:
https://lkml.org/lkml/2021/9/16/576

Currently this code is for KVM only, I have no idea if it is interesting
to provide a TCG patch. If so, it will be done in another series.

A short introduction


CPU Topology is described in the S390 POP with essentially the description
of two instructions:

PTF Perform Topology function used to poll for topology change
 and used to set the polarization but this part is not part of this item.

STSI Store System Information and the SYSIB 15.1.x providing the Topology
 configuration.

S390 Topology is a 6-level hierarchical topology with up to 5 levels
 of containers. The last topology level specifies the CPU cores.

 This patch series only uses the two lower levels sockets and cores.
 
 To get the information on the topology, S390 provides the STSI

 instruction, which stores a structure providing the list of the
 containers used in the Machine topology: the SYSIB.
 A selector within the STSI instruction allows choosing how many topology
 levels will be provided in the SYSIB.

 Using the Topology List Entries (TLE) provided inside the SYSIB,
 the Linux kernel is able to compute the information about the cache
 distance between two cores and can use this information to take
 scheduling decisions.

Note:
-
  Z15 reports 3 levels of containers, drawers, book, sockets as
  Container-TLEs above the core description inside CPU-TLEs.

The Topology can be seen at several places inside zLinux:
 - sysfs: /sys/devices/system/cpu/cpuX/topology
 - procfs: /proc/sysinfo and /proc/cpuinfo
 - lscpu -e : gives topology information

The different Topology levels have names:
 - Node - Drawer - Book - sockets or physical package - core

Threads:
 Multithreading, is not part of the topology as described by the
 SYSIB 15.1.x

The interest of the guest to know the CPU topology is obviously to be
able to optimise the load balancing and the migration of threads.
KVM will have the same interest concerning vCPUs scheduling and cache
optimisation.


The design
==

1) To be ready for hotplug, I chose an Object oriented design
of the topology containers:
- A node is a bridge on the SYSBUS and defines a "node bus"
- A drawer is hotplugged on the "node bus"
- A book on the "drawer bus"
- A socket on the "book bus"
- And the CPU Topology List Entry (CPU-TLE) sits on the socket bus.
These objects will be enhanced with the cache information when
NUMA is implemented.

This also allows for easy retrieval when building the different SYSIB
for Store Topology System Information (STSI)

2) Perform Topology Function (PTF) instruction is made available to the
guest with a new KVM capability and intercepted in QEMU, allowing the
guest to poll for topology changes.


Features and TBD list
=

- There is no direct match between IDs shown by:
 - lscpu (unrelated numbered list),
 - SYSIB 15.1.x (topology ID)

- The CPU number, left column of lscpu, is used to reference a CPU
 by Linux tools
 While the CPU address is used by QEMU for hotplug.

- Effect of -smp parsing on the topology with an example:
 -smp 9,sockets=4,cores=4,maxcpus=16

 We have 4 sockets, each holding 4 cores, so that we have a maximum
 of 16 CPUs, 9 of which are active on boot. (Should be obvious)

# lscpu -e
CPU NODE DRAWER BOOK SOCKET CORE L1d:L1i:L2d:L2i ONLINE CONFIGURED POLARIZATION 
ADDRESS
   00  00  00 0:0:0:0yes yeshorizontal  
 0
   10  00  01 1:1:1:1yes yeshorizontal  
 1
   20  00  02 2:2:2:2yes yeshorizontal  
 2
   30  00  03 3:3:3:3yes yeshorizontal  
 3
   40  00  14 4:4:4:4yes yeshorizontal  
 4
   50  00  15 5:5:5:5yes yeshorizontal  
 5
   60  00  16 6:6:6:6yes yeshorizontal  
 6
   70  00  17 7:7:7:7yes yeshorizontal  
 7
   80  00  28

[PATCH 3/8] vhost-user-video: boiler plate code for vhost-user-video device

2021-12-09 Thread Peter Griffin

Signed-off-by: Peter Griffin 
---
 hw/display/Kconfig   |   5 +
 hw/display/meson.build   |   3 +
 hw/display/vhost-user-video.c| 386 +++
 include/hw/virtio/vhost-user-video.h |  41 +++
 4 files changed, 435 insertions(+)
 create mode 100644 hw/display/vhost-user-video.c
 create mode 100644 include/hw/virtio/vhost-user-video.h

diff --git a/hw/display/Kconfig b/hw/display/Kconfig
index a2306b67d8..186163b015 100644
--- a/hw/display/Kconfig
+++ b/hw/display/Kconfig
@@ -118,6 +118,11 @@ config VHOST_USER_VGA
 default y
 depends on VIRTIO_VGA && VHOST_USER_GPU
 
+config VHOST_USER_VIDEO
+bool
+default y
+depends on VIRTIO && VHOST_USER
+
 config DPCD
 bool
 select AUX
diff --git a/hw/display/meson.build b/hw/display/meson.build
index 861c43ff98..48284528cf 100644
--- a/hw/display/meson.build
+++ b/hw/display/meson.build
@@ -37,6 +37,9 @@ softmmu_ss.add(when: 'CONFIG_MACFB', if_true: 
files('macfb.c'))
 softmmu_ss.add(when: 'CONFIG_NEXTCUBE', if_true: files('next-fb.c'))
 
 specific_ss.add(when: 'CONFIG_VGA', if_true: files('vga.c'))
+specific_ss.add(when: 'CONFIG_VHOST_USER_VIDEO', if_true: 
files('vhost-user-video.c'))
+specific_ss.add(when: ['CONFIG_VHOST_USER_VIDEO', 'CONFIG_VIRTIO_PCI' ],
+  if_true: files('vhost-user-video-pci.c'))
 
 if config_all_devices.has_key('CONFIG_QXL')
   qxl_ss = ss.source_set()
diff --git a/hw/display/vhost-user-video.c b/hw/display/vhost-user-video.c
new file mode 100644
index 00..506e350365
--- /dev/null
+++ b/hw/display/vhost-user-video.c
@@ -0,0 +1,386 @@
+/*
+ * Vhost-user VIDEO virtio device
+ *
+ * This is the boilerplate for instantiating a vhost-user device
+ * implementing a virtio-video device.
+ *
+ * The virtio video decoder and encoder devices are virtual devices that
+ * support encoding and decoding respectively.
+ *
+ * The actual back-end for this driver is the vhost-user-video daemon.
+ * The code here just connects up the device in QEMU and allows it to
+ * be instantiated.
+ *
+ * Copyright (c) 2021 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/vhost-user-video.h"
+#include "qemu/error-report.h"
+
+/* currently there is no VIDEO enc/dec defined in Linux virtio_ids.h */
+#define VIRTIO_ID_VIDEO_ENC 30
+#define VIRTIO_ID_VIDEO_DEC 31
+#define MAX_CAPS_LEN 4096
+
+static void vhost_user_video_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+VHostUserVIDEO *video = VHOST_USER_VIDEO(vdev);
+struct virtio_video_config *vconfig = (struct virtio_video_config *)config;
+int ret;
+Error *local_err = NULL;
+
+memset(config, 0, sizeof(struct virtio_video_config));
+
+ret = vhost_dev_get_config(>vhost_dev, config,
+   sizeof(struct virtio_video_config), _err);
+if (ret) {
+error_report("vhost-user-video: get device config space failed");
+
+/*TODO vhost_dev_get_config() fails so for now lets just set it here */
+vconfig = (struct virtio_video_config *)config;
+vconfig->version = 0;
+vconfig->max_caps_length = MAX_CAPS_LEN;
+vconfig->max_resp_length = MAX_CAPS_LEN;
+return;
+}
+}
+
+static void vhost_user_video_start(VirtIODevice *vdev)
+{
+VHostUserVIDEO *video = VHOST_USER_VIDEO(vdev);
+BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+int ret;
+int i;
+
+if (!k->set_guest_notifiers) {
+error_report("binding does not support guest notifiers");
+return;
+}
+
+ret = vhost_dev_enable_notifiers(>vhost_dev, vdev);
+if (ret < 0) {
+error_report("Error enabling host notifiers: %d", -ret);
+return;
+}
+
+ret = k->set_guest_notifiers(qbus->parent, video->vhost_dev.nvqs, true);
+if (ret < 0) {
+error_report("Error binding guest notifier: %d", -ret);
+goto err_host_notifiers;
+}
+
+video->vhost_dev.acked_features = vdev->guest_features;
+
+ret = vhost_dev_start(>vhost_dev, vdev);
+if (ret < 0) {
+error_report("Error starting vhost-user-video: %d", -ret);
+goto err_guest_notifiers;
+}
+
+/*
+ * guest_notifier_mask/pending not used yet, so just unmask
+ * everything here.  virtio-pci will do the right thing by
+ * enabling/disabling irqfd.
+ */
+for (i = 0; i < video->vhost_dev.nvqs; i++) {
+vhost_virtqueue_mask(>vhost_dev, vdev, i, false);
+}
+
+return;
+
+err_guest_notifiers:
+k->set_guest_notifiers(qbus->parent, video->vhost_dev.nvqs, false);
+err_host_notifiers:
+vhost_dev_disable_notifiers(>vhost_dev, vdev);
+}
+
+static void vhost_user_video_stop(VirtIODevice *vdev)
+{
+VHostUserVIDEO *video = VHOST_USER_VIDEO(vdev);
+BusState *qbus =

[PATCH 6/8] virtio_video: Add Fast Walsh-Hadamard Transform format

2021-12-09 Thread Peter Griffin

The vicodec (Virtual Codec) test driver in Linux implements
FWHT. FWHT was designed to be fast and simple and to have
characteristics of other video codecs and therefore face similar
issues [1].

[1] https://en.wikipedia.org/wiki/Fast_Walsh%E2%80%93Hadamard_transform

Signed-off-by: Peter Griffin 
---
 include/standard-headers/linux/virtio_video.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/standard-headers/linux/virtio_video.h 
b/include/standard-headers/linux/virtio_video.h
index 16b5f642a9..3b517d50c4 100644
--- a/include/standard-headers/linux/virtio_video.h
+++ b/include/standard-headers/linux/virtio_video.h
@@ -75,6 +75,7 @@ enum virtio_video_format {
VIRTIO_VIDEO_FORMAT_HEVC, /* HEVC aka H.265*/
VIRTIO_VIDEO_FORMAT_VP8, /* VP8 */
VIRTIO_VIDEO_FORMAT_VP9, /* VP9 */
+   VIRTIO_VIDEO_FORMAT_FWHT, /* FWHT used by vicodec */
VIRTIO_VIDEO_FORMAT_CODED_MAX = VIRTIO_VIDEO_FORMAT_VP9,
 };
 
-- 
2.25.1

[PATCH 8/8] tools/vhost-user-video: Add initial vhost-user-video vmm

2021-12-09 Thread Peter Griffin

This vmm translates from virtio-video v3 protocol and writes
to a v4l2 mem2mem stateful decoder/encoder device [1]. v3 was
chosen as that is what the virtio-video Linux frontend driver
implements.

This allows for testing with the v4l2 vicodec test codec [2]
module in the Linux kernel, and is intended to also be used
with Arm SoCs that implement v4l2 stateful decoder/encoder
drivers.

The advantage of developing & testing with vicodec is that
it allows quick development on a purely virtual setup with
qemu and a host Linux kernel. Also it allows ci systems like
lkft, kernelci to easily test the virtio interface.

Currently conversion from virtio-video to v4l2 stateless m2m
codec driver or VAAPI drivers is considered out of scope, as
is emulation of a decoder device using something like ffmpeg,
although this could be added in the future.

Note some virtio & v4l2 helpers were based off virtio-video
Linux frontend driver and yavta utility, both GPL v2.

Example host commands
 modprobe vicodec
 vhost-user-video --v4l2-device=/dev/video3 -v --socket-path=video.sock

Run Qemu with
 -device vhost-user-video-pci,chardev=video,id=video

Guest decoder
 v4l2-ctl -d0 -x width=640,height=480 -v width=640,height=480,pixelformat=YU12
   --stream-mmap --stream-out-mmap --stream-from jelly_640_480-420P.fwht
   --stream-to out-jelly-640-480.YU12

[1] https://www.kernel.org/doc/html/latest/userspace-api/media/
v4l/dev-decoder.html

[2] https://lwn.net/Articles/760650/

Signed-off-by: Peter Griffin 
---
 tools/vhost-user-video/50-qemu-rpmb.json.in   |5 +
 tools/vhost-user-video/main.c | 1680 
 tools/vhost-user-video/meson.build|   10 +
 tools/vhost-user-video/v4l2_backend.c | 1777 +
 tools/vhost-user-video/v4l2_backend.h |   99 +
 tools/vhost-user-video/virtio_video_helpers.c |  462 +
 tools/vhost-user-video/virtio_video_helpers.h |  166 ++
 tools/vhost-user-video/vuvideo.h  |   43 +
 8 files changed, 4242 insertions(+)
 create mode 100644 tools/vhost-user-video/50-qemu-rpmb.json.in
 create mode 100644 tools/vhost-user-video/main.c
 create mode 100644 tools/vhost-user-video/meson.build
 create mode 100644 tools/vhost-user-video/v4l2_backend.c
 create mode 100644 tools/vhost-user-video/v4l2_backend.h
 create mode 100644 tools/vhost-user-video/virtio_video_helpers.c
 create mode 100644 tools/vhost-user-video/virtio_video_helpers.h
 create mode 100644 tools/vhost-user-video/vuvideo.h

diff --git a/tools/vhost-user-video/50-qemu-rpmb.json.in 
b/tools/vhost-user-video/50-qemu-rpmb.json.in
new file mode 100644
index 00..2b033cda56
--- /dev/null
+++ b/tools/vhost-user-video/50-qemu-rpmb.json.in
@@ -0,0 +1,5 @@
+{
+  "description": "QEMU vhost-user-rpmb",
+  "type": "block",
+  "binary": "@libexecdir@/vhost-user-rpmb"
+}
diff --git a/tools/vhost-user-video/main.c b/tools/vhost-user-video/main.c
new file mode 100644
index 00..a944efadb6
--- /dev/null
+++ b/tools/vhost-user-video/main.c
@@ -0,0 +1,1680 @@
+/*
+ * VIRTIO Video Emulation via vhost-user
+ *
+ * Copyright (c) 2021 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#define G_LOG_DOMAIN "vhost-user-video"
+#define G_LOG_USE_STRUCTURED 1
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "libvhost-user-glib.h"
+#include "libvhost-user.h"
+#include "standard-headers/linux/virtio_video.h"
+
+#include "qemu/compiler.h"
+#include "qemu/iov.h"
+
+#include "vuvideo.h"
+#include "v4l2_backend.h"
+#include "virtio_video_helpers.h"
+
+#ifndef container_of
+#define container_of(ptr, type, member) ({  \
+const typeof(((type *) 0)->member) * __mptr = (ptr); \
+(type *) ((char *) __mptr - offsetof(type, member)); })
+#endif
+
+static gchar *socket_path;
+static gchar *v4l2_path;
+static gint socket_fd = -1;
+static gboolean print_cap;
+static gboolean verbose;
+static gboolean debug;
+
+static GOptionEntry options[] = {
+{ "socket-path", 0, 0, G_OPTION_ARG_FILENAME, _path,
+  "Location of vhost-user Unix domain socket, "
+  "incompatible with --fd", "PATH" },
+{ "v4l2-device", 0, 0, G_OPTION_ARG_FILENAME, _path,
+  "Location of v4l2 device node", "PATH" },
+{ "fd", 0, 0, G_OPTION_ARG_INT, _fd,
+  "Specify the fd of the backend, "
+  "incompatible with --socket-path", "FD" },
+{ "print-capabilities", 0, 0, G_OPTION_ARG_NONE, _cap,
+  "Output to stdout the backend capabilities "
+  "in JSON format and exit", NULL},
+{ "verbose", 'v', 0, G_OPTION_ARG_NONE, ,
+  "Be more verbose in output", NULL},
+{ "debug", 0, 0, G_OPTION_ARG_NONE, ,
+  "Include debug output", NULL},
+{ NULL }
+};
+
+enum {
+VHOST_USER_VIDEO_MAX_QUEUES = 2,
+};
+
+/* taken from util/iov.c */
+size_t video_iov_size(const struct iovec *iov, const

[PATCH 7/8] hw/display: add vhost-user-video-pci

2021-12-09 Thread Peter Griffin

Add boiler plate code for vhost-user-video-pci.

Example
-device vhost-user-video-pci,chardev=video,id=video
-chardev socket,path=video.sock,id=video

Signed-off-by: Peter Griffin 
---
 hw/display/vhost-user-video-pci.c | 82 +++
 1 file changed, 82 insertions(+)
 create mode 100644 hw/display/vhost-user-video-pci.c

diff --git a/hw/display/vhost-user-video-pci.c 
b/hw/display/vhost-user-video-pci.c
new file mode 100644
index 00..ceeaad2742
--- /dev/null
+++ b/hw/display/vhost-user-video-pci.c
@@ -0,0 +1,82 @@
+/*
+ * Vhost-user VIDEO virtio device PCI glue
+ *
+ * Copyright (c) 2021 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-video.h"
+#include "hw/virtio/virtio-pci.h"
+
+struct VHostUserVIDEOPCI {
+VirtIOPCIProxy parent_obj;
+VHostUserVIDEO vdev;
+};
+
+typedef struct VHostUserVIDEOPCI VHostUserVIDEOPCI;
+
+#define TYPE_VHOST_USER_VIDEO_PCI "vhost-user-video-pci-base"
+
+#define VHOST_USER_VIDEO_PCI(obj) \
+OBJECT_CHECK(VHostUserVIDEOPCI, (obj), TYPE_VHOST_USER_VIDEO_PCI)
+
+static Property vuvideo_pci_properties[] = {
+DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags,
+VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+   DEV_NVECTORS_UNSPECIFIED),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vuvideo_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VHostUserVIDEOPCI *dev = VHOST_USER_VIDEO_PCI(vpci_dev);
+DeviceState *vdev = DEVICE(&dev->vdev);
+
+if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+vpci_dev->nvectors = 1;
+}
+
+qdev_set_parent_bus(vdev, BUS(&vpci_dev->bus), errp);
+object_property_set_bool(OBJECT(vdev), "realized", true, errp);
+}
+
+static void vuvideo_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+k->realize = vuvideo_pci_realize;
+set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+device_class_set_props(dc, vuvideo_pci_properties);
+pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */
+pcidev_k->revision = 0x00;
+pcidev_k->class_id = PCI_CLASS_STORAGE_OTHER;
+}
+
+static void vuvideo_pci_instance_init(Object *obj)
+{
+VHostUserVIDEOPCI *dev = VHOST_USER_VIDEO_PCI(obj);
+
+virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+TYPE_VHOST_USER_VIDEO);
+}
+
+static const VirtioPCIDeviceTypeInfo vuvideo_pci_info = {
+.base_name = TYPE_VHOST_USER_VIDEO_PCI,
+.non_transitional_name = "vhost-user-video-pci",
+.instance_size = sizeof(VHostUserVIDEOPCI),
+.instance_init = vuvideo_pci_instance_init,
+.class_init= vuvideo_pci_class_init,
+};
+
+static void vuvideo_pci_register(void)
+{
+virtio_pci_types_register(&vuvideo_pci_info);
+}
+
+type_init(vuvideo_pci_register);
-- 
2.25.1

[PATCH 4/8] vhost-user-video: add meson subdir build logic

2021-12-09 Thread Peter Griffin

Signed-off-by: Peter Griffin 
---
 tools/meson.build | 9 +
 1 file changed, 9 insertions(+)

diff --git a/tools/meson.build b/tools/meson.build
index 3e5a0abfa2..3314b5efc5 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -24,3 +24,12 @@ endif
 if have_virtiofsd
   subdir('virtiofsd')
 endif
+
+have_virtiovideo = (have_system and
+have_tools and
+'CONFIG_LINUX' in config_host)
+
+if have_virtiovideo
+  subdir('vhost-user-video')
+endif
+
-- 
2.25.1

[PATCH 5/8] standard-headers: Add virtio_video.h

2021-12-09 Thread Peter Griffin

Signed-off-by: Peter Griffin 
---
 include/standard-headers/linux/virtio_video.h | 483 ++
 1 file changed, 483 insertions(+)
 create mode 100644 include/standard-headers/linux/virtio_video.h

diff --git a/include/standard-headers/linux/virtio_video.h 
b/include/standard-headers/linux/virtio_video.h
new file mode 100644
index 00..16b5f642a9
--- /dev/null
+++ b/include/standard-headers/linux/virtio_video.h
@@ -0,0 +1,483 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*
+ * Virtio Video Device
+ *
+ * This header is BSD licensed so anyone can use the definitions
+ * to implement compatible drivers/servers:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of IBM nor the names of its contributors
+ *may be used to endorse or promote products derived from this software
+ *without specific prior written permission.
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+ * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Copyright (C) 2019 OpenSynergy GmbH.
+ */
+
+#ifndef _UAPI_LINUX_VIRTIO_VIDEO_H
+#define _UAPI_LINUX_VIRTIO_VIDEO_H
+
+#include 
+#include 
+
+/*
+ * Feature bits
+ */
+
+/* Guest pages can be used for video buffers. */
+#define VIRTIO_VIDEO_F_RESOURCE_GUEST_PAGES 0
+/*
+ * The host can process buffers even if they are non-contiguous memory such as
+ * scatter-gather lists.
+ */
+#define VIRTIO_VIDEO_F_RESOURCE_NON_CONTIG 1
+/* Objects exported by another virtio device can be used for video buffers */
+#define VIRTIO_VIDEO_F_RESOURCE_VIRTIO_OBJECT 2
+
+/*
+ * Image formats
+ */
+
+enum virtio_video_format {
+   /* Raw formats */
+   VIRTIO_VIDEO_FORMAT_RAW_MIN = 1,
+   VIRTIO_VIDEO_FORMAT_ARGB = VIRTIO_VIDEO_FORMAT_RAW_MIN,
+   VIRTIO_VIDEO_FORMAT_BGRA,
+   VIRTIO_VIDEO_FORMAT_NV12, /* 12  Y/CbCr 4:2:0  */
+   VIRTIO_VIDEO_FORMAT_YUV420, /* 12  YUV 4:2:0 */
+   VIRTIO_VIDEO_FORMAT_YVU420, /* 12  YVU 4:2:0 */
+   VIRTIO_VIDEO_FORMAT_RAW_MAX = VIRTIO_VIDEO_FORMAT_YVU420,
+
+   /* Coded formats */
+   VIRTIO_VIDEO_FORMAT_CODED_MIN = 0x1000,
+   VIRTIO_VIDEO_FORMAT_MPEG2 =
+   VIRTIO_VIDEO_FORMAT_CODED_MIN, /* MPEG-2 Part 2 */
+   VIRTIO_VIDEO_FORMAT_MPEG4, /* MPEG-4 Part 2 */
+   VIRTIO_VIDEO_FORMAT_H264, /* H.264 */
+   VIRTIO_VIDEO_FORMAT_HEVC, /* HEVC aka H.265*/
+   VIRTIO_VIDEO_FORMAT_VP8, /* VP8 */
+   VIRTIO_VIDEO_FORMAT_VP9, /* VP9 */
+   VIRTIO_VIDEO_FORMAT_CODED_MAX = VIRTIO_VIDEO_FORMAT_VP9,
+};
+
+enum virtio_video_profile {
+   /* H.264 */
+   VIRTIO_VIDEO_PROFILE_H264_MIN = 0x100,
+   VIRTIO_VIDEO_PROFILE_H264_BASELINE = VIRTIO_VIDEO_PROFILE_H264_MIN,
+   VIRTIO_VIDEO_PROFILE_H264_MAIN,
+   VIRTIO_VIDEO_PROFILE_H264_EXTENDED,
+   VIRTIO_VIDEO_PROFILE_H264_HIGH,
+   VIRTIO_VIDEO_PROFILE_H264_HIGH10PROFILE,
+   VIRTIO_VIDEO_PROFILE_H264_HIGH422PROFILE,
+   VIRTIO_VIDEO_PROFILE_H264_HIGH444PREDICTIVEPROFILE,
+   VIRTIO_VIDEO_PROFILE_H264_SCALABLEBASELINE,
+   VIRTIO_VIDEO_PROFILE_H264_SCALABLEHIGH,
+   VIRTIO_VIDEO_PROFILE_H264_STEREOHIGH,
+   VIRTIO_VIDEO_PROFILE_H264_MULTIVIEWHIGH,
+   VIRTIO_VIDEO_PROFILE_H264_MAX = VIRTIO_VIDEO_PROFILE_H264_MULTIVIEWHIGH,
+
+   /* HEVC */
+   VIRTIO_VIDEO_PROFILE_HEVC_MIN = 0x200,
+   VIRTIO_VIDEO_PROFILE_HEVC_MAIN = VIRTIO_VIDEO_PROFILE_HEVC_MIN,
+   VIRTIO_VIDEO_PROFILE_HEVC_MAIN10,
+   VIRTIO_VIDEO_PROFILE_HEVC_MAIN_STILL_PICTURE,
+   VIRTIO_VIDEO_PROFILE_HEVC_MAX =
+   VIRTIO_VIDEO_PROFILE_HEVC_MAIN_STILL_PICTURE,
+
+   /* VP8 */
+   VIRTIO_VIDEO_PROFILE_VP8_MIN = 0x300,
+   VIRTIO_VIDEO_PROFILE_VP8_PROFILE0 = VIRTIO_VIDEO_PROFILE_VP8_MIN,
+   VIRTIO_VIDEO_PROFILE_VP8_PROFILE1,
+   VIRTIO_VIDEO_PROFILE_VP8_PROFILE2,
+

[PATCH 0/8] virtio: Add vhost-user based Video decode

2021-12-09 Thread Peter Griffin

This series adds support for virtio-video decoder devices in Qemu
and also provides a vhost-user-video vmm implementation.

The vhost-user-video vmm currently parses virtio-video v3 protocol
(as that is what the Linux frontend driver implements).
It then converts that to a v4l2 mem2mem stateful decoder device.
Currently this has been tested using v4l2 vicodec test driver in Linux
[1] but it is intended to be used with Arm SoCs which often implement
v4l2 stateful decoders/encoders drivers for their video accelerators.

The primary goal so far has been to allow continuing development
of virtio-video Linux frontend driver and testing with Qemu. Using
vicodec on the host allows a purely virtual dev env, and allows for
ci integration in the future by kernelci etc.

This series also adds the virtio_video.h header and adds the
FWHT format which is used by vicodec driver.

I have tested this VMM using v4l2-ctl from v4l2 utils in the guest
to do a video decode to a file. This can then be validated using ffplay.
The v4l2-compliance tool has also been run in the guest; it stresses the
interface and issues lots of syscall-level tests.

See the README.md for example commands on how to configure the guest kernel
and do a video decode using Qemu and vicodec with this VMM.

Linux virtio-video frontend driver code:
https://github.com/petegriffin/linux/commits/v5.10-virtio-video-latest

Qemu vmm code:
https://github.com/petegriffin/qemu/tree/vhost-virtio-video-master-v1

This is part of a wider initiative by Linaro called
"project Stratos" for which you can find information here:

  https://collaborate.linaro.org/display/STR/Stratos+Home

Applies cleanly to git://git.qemu.org/qemu.git master(a3607def89).

Thanks,

Peter.

[1] https://lwn.net/Articles/760650/

Peter Griffin (8):
  vhost-user-video: Add a README.md with cheat sheet of commands
  MAINTAINERS: Add virtio-video section
  vhost-user-video: boiler plate code for vhost-user-video device
  vhost-user-video: add meson subdir build logic
  standard-headers: Add virtio_video.h
  virtio_video: Add Fast Walsh-Hadamard Transform format
  hw/display: add vhost-user-video-pci
  tools/vhost-user-video: Add initial vhost-user-video vmm

 MAINTAINERS   |8 +
 hw/display/Kconfig|5 +
 hw/display/meson.build|3 +
 hw/display/vhost-user-video-pci.c |   82 +
 hw/display/vhost-user-video.c |  386 
 include/hw/virtio/vhost-user-video.h  |   41 +
 include/standard-headers/linux/virtio_video.h |  484 +
 tools/meson.build |9 +
 tools/vhost-user-video/50-qemu-rpmb.json.in   |5 +
 tools/vhost-user-video/README.md  |   98 +
 tools/vhost-user-video/main.c | 1680 
 tools/vhost-user-video/meson.build|   10 +
 tools/vhost-user-video/v4l2_backend.c | 1777 +
 tools/vhost-user-video/v4l2_backend.h |   99 +
 tools/vhost-user-video/virtio_video_helpers.c |  462 +
 tools/vhost-user-video/virtio_video_helpers.h |  166 ++
 tools/vhost-user-video/vuvideo.h  |   43 +
 17 files changed, 5358 insertions(+)
 create mode 100644 hw/display/vhost-user-video-pci.c
 create mode 100644 hw/display/vhost-user-video.c
 create mode 100644 include/hw/virtio/vhost-user-video.h
 create mode 100644 include/standard-headers/linux/virtio_video.h
 create mode 100644 tools/vhost-user-video/50-qemu-rpmb.json.in
 create mode 100644 tools/vhost-user-video/README.md
 create mode 100644 tools/vhost-user-video/main.c
 create mode 100644 tools/vhost-user-video/meson.build
 create mode 100644 tools/vhost-user-video/v4l2_backend.c
 create mode 100644 tools/vhost-user-video/v4l2_backend.h
 create mode 100644 tools/vhost-user-video/virtio_video_helpers.c
 create mode 100644 tools/vhost-user-video/virtio_video_helpers.h
 create mode 100644 tools/vhost-user-video/vuvideo.h

-- 
2.25.1

[PATCH 1/8] vhost-user-video: Add a README.md with cheat sheet of commands

2021-12-09 Thread Peter Griffin

Signed-off-by: Peter Griffin 
---
 tools/vhost-user-video/README.md | 98 
 1 file changed, 98 insertions(+)
 create mode 100644 tools/vhost-user-video/README.md

diff --git a/tools/vhost-user-video/README.md b/tools/vhost-user-video/README.md
new file mode 100644
index 00..c55e0a7b68
--- /dev/null
+++ b/tools/vhost-user-video/README.md
@@ -0,0 +1,98 @@
+# Overview vhost-user-video
+
+This vmm translates from virtio-video v3 protocol and writes
+to a v4l2 mem2mem stateful decoder/encoder device [1]. v3 was
+chosen as that is what the virtio-video Linux frontend driver
+currently implements.
+
+The primary goal so far is to enable development of virtio-video
+frontend driver using purely open source software. Using vicodec
+v4l2 stateful decoder on the host for testing then allows a pure
+virtual environment for development and testing.
+
+Currently the vmm only supports v4l2 stateful devices, and the
+intention is it will be used with Arm SoCs that implement stateful
+decode/encode devices such as Qcom Venus, RPi, MediaTek etc.
+
+A Qemu + vicodec setup for virtio-video should also allow for
+CI systems like kernelci, lkft to test the virtio-video interface
+easily.
+
+Currently support for VAAPI or decoding via libavcodec or similar
+libraries is not implemented, but this could be added in the future.
+
+Some example commands are provided below on how to run the daemon
+and achieve a video decode using vicodec and a link to some test
+content.
+
+[1] https://www.kernel.org/doc/html/latest/userspace-api/media/
+v4l/dev-decoder.html
+
+[2] https://lwn.net/Articles/760650/
+
+# Guest Linux kernel modules
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MEDIA_TEST_SUPPORT=y
+CONFIG_V4L_TEST_DRIVERS=y
+CONFIG_VIRTIO_VIDEO=y
+CONFIG_GDB_SCRIPTS=y
+CONFIG_DRM_VIRTIO_GPU=y
+
+# Host kernel modules
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MEDIA_TEST_SUPPORT=y
+CONFIG_V4L_TEST_DRIVERS=y
+CONFIG_VIDEO_VICODEC=y
+
+# Run vhost-user-video daemon with vicodec
+# (video3 typically is the stateful video)
+vhost-user-video --socket-path=/tmp/video.sock --v4l2-device=/dev/video3
+
+# Qemu command for virtio-video device
+
+-device vhost-user-video-pci,chardev=video,id=video
+-chardev socket,path=/tmp/video.sock,id=video
+
+# Example v4l2-ctl decode command
+wget https://people.linaro.org/~peter.griffin/jelly_640_480-420P.fwht
+
+v4l2-ctl -d0 -x width=640,height=480 -v width=640,height=480,pixelformat=YU12
+--stream-mmap --stream-out-mmap --stream-from jelly_640_480-420P.fwht
+--stream-to out-jelly-640-480.YU12
+
+# Play the raw decoded video with ffplay or mplayer
+ffplay -loglevel warning -v info -f rawvideo -pixel_format  yuv420p
+  -video_size "640x480" ./out-jelly-640-480.YU12
+
+mplayer -demuxer rawvideo -rawvideo
+  format=i420:w=640:h=480:fps=25 out-jelly-640-480.YU12
+
+# Enable v4l2 debug in virtio-video frontend driver
+echo 0x1f > /sys/class/video4linux/video0/dev_debug
+
+# Enable v4l2 debug in vicodec backend driver
+echo 0x1f > /sys/class/video4linux/video3/dev_debug
+
+# optee-build system qemu virtio-video command
+make QEMU_VIRTFS_ENABLE=y QEMU_USERNET_ENABLE=y CFG_TA_ASLR=n
+QEMU_VHOSTUSER_MEM=y QEMU_VIRTVIDEO_ENABLE=y SSH_PORT_FW=y run-only
+
+Current status
+* Tested with v4l2-ctl from v4l2-utils and vicodec stateful decoder driver
+* v4l2-compliance - reports
+Total: 43, Succeeded: 37, Failed: 6, Warnings: 0
+
+Known Issues
+* 6 v4l2-compliance failures remaining
+* v4l2-ctl 0fps misleading output
+* v4l2-ctl sometimes reports - 0 != 
+* Encoder not tested yet
+
+TODOs
+* Test with a "real" stateful decoder & codec
+  (e.g. Qcom Venus or RPi).
+* Test more v4l2 userspaces in the guest
+
+Future potential features
+* Emulation using libavcodec or similar library
+* Support for VAAPI, OpenMax or v4l2 stateless devices
-- 
2.25.1

[PATCH 2/8] MAINTAINERS: Add virtio-video section

2021-12-09 Thread Peter Griffin

Add myself as maintainer of the virtio-video files added
in this series.

Signed-off-by: Peter Griffin 
---
 MAINTAINERS | 8 
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 7543eb4d59..43c53aded8 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2012,6 +2012,14 @@ F: hw/virtio/vhost-user-rng-pci.c
 F: include/hw/virtio/vhost-user-rng.h
 F: tools/vhost-user-rng/*
 
+virtio-video
+M: Peter Griffin 
+S: Supported
+F: hw/display/vhost-user-video.c
+F: hw/display/vhost-user-video-pci.c
+F: include/hw/virtio/vhost-user-video.h
+F: tools/vhost-user-video/*
+
 virtio-crypto
 M: Gonglei 
 S: Supported
-- 
2.25.1

Re: [PATCH v2] Move the libssh setup from configure to meson.build

2021-12-09 Thread Richard W.M. Jones

On Thu, Dec 09, 2021 at 03:48:01PM +0100, Thomas Huth wrote:
> It's easier to do this in meson.build now.
> 
> Signed-off-by: Thomas Huth 
> ---
>  v2: Added the missing "config_host_data.set('CONFIG_LIBSSH', libssh.found())"
> 
>  configure | 27 ---
>  meson.build   | 13 +
>  meson_options.txt |  2 ++
>  scripts/meson-buildoptions.sh |  3 +++
>  4 files changed, 14 insertions(+), 31 deletions(-)
> 
> diff --git a/configure b/configure
> index 48c21775f3..bb99a40ed0 100755
> --- a/configure
> +++ b/configure
> @@ -344,7 +344,6 @@ debug_stack_usage="no"
>  crypto_afalg="no"
>  tls_priority="NORMAL"
>  tpm="$default_feature"
> -libssh="$default_feature"
>  live_block_migration=${default_feature:-yes}
>  numa="$default_feature"
>  replication=${default_feature:-yes}
> @@ -1078,10 +1077,6 @@ for opt do
>;;
>--enable-tpm) tpm="yes"
>;;
> -  --disable-libssh) libssh="no"
> -  ;;
> -  --enable-libssh) libssh="yes"
> -  ;;
>--disable-live-block-migration) live_block_migration="no"
>;;
>--enable-live-block-migration) live_block_migration="yes"
> @@ -1448,7 +1443,6 @@ cat << EOF
>live-block-migration   Block migration in the main migration stream
>coroutine-pool  coroutine freelist (better performance)
>tpm TPM support
> -  libssh  ssh block device support
>numalibnuma support
>avx2AVX2 optimization support
>avx512f AVX512F optimization support
> @@ -2561,21 +2555,6 @@ if test "$modules" = yes; then
>  fi
>  fi
>  
> -##
> -# libssh probe
> -if test "$libssh" != "no" ; then
> -  if $pkg_config --exists "libssh >= 0.8.7"; then
> -libssh_cflags=$($pkg_config libssh --cflags)
> -libssh_libs=$($pkg_config libssh --libs)
> -libssh=yes
> -  else
> -if test "$libssh" = "yes" ; then
> -  error_exit "libssh required for --enable-libssh"
> -fi
> -libssh=no
> -  fi
> -fi
> -
>  ##
>  # TPM emulation is only on POSIX
>  
> @@ -3636,12 +3615,6 @@ if test "$cmpxchg128" = "yes" ; then
>echo "CONFIG_CMPXCHG128=y" >> $config_host_mak
>  fi
>  
> -if test "$libssh" = "yes" ; then
> -  echo "CONFIG_LIBSSH=y" >> $config_host_mak
> -  echo "LIBSSH_CFLAGS=$libssh_cflags" >> $config_host_mak
> -  echo "LIBSSH_LIBS=$libssh_libs" >> $config_host_mak
> -fi
> -
>  if test "$live_block_migration" = "yes" ; then
>echo "CONFIG_LIVE_BLOCK_MIGRATION=y" >> $config_host_mak
>  fi
> diff --git a/meson.build b/meson.build
> index 96de1a6ef9..ae67ca28ab 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -874,11 +874,15 @@ if not get_option('glusterfs').auto() or have_block
>  ''', dependencies: glusterfs)
>endif
>  endif
> +
>  libssh = not_found
> -if 'CONFIG_LIBSSH' in config_host
> -  libssh = declare_dependency(compile_args: 
> config_host['LIBSSH_CFLAGS'].split(),
> -  link_args: config_host['LIBSSH_LIBS'].split())
> +if not get_option('libssh').auto() or have_block
> +  libssh = dependency('libssh', version: '>=0.8.7',
> +method: 'pkg-config',
> +required: get_option('libssh'),
> +kwargs: static_kwargs)
>  endif
> +
>  libbzip2 = not_found
>  if not get_option('bzip2').auto() or have_block
>libbzip2 = cc.find_library('bz2', has_headers: ['bzlib.h'],
> @@ -1451,6 +1455,7 @@ config_host_data.set('CONFIG_EBPF', libbpf.found())
>  config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
>  config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
>  config_host_data.set('CONFIG_LIBNFS', libnfs.found())
> +config_host_data.set('CONFIG_LIBSSH', libssh.found())
>  config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
>  config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
>  config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
> @@ -3430,7 +3435,7 @@ endif
>  summary_info += {'seccomp support':   seccomp}
>  summary_info += {'GlusterFS support': glusterfs}
>  summary_info += {'TPM support':   config_host.has_key('CONFIG_TPM')}
> -summary_info += {'libssh support':config_host.has_key('CONFIG_LIBSSH')}
> +summary_info += {'libssh support':libssh}
>  summary_info += {'lzo support':   lzo}
>  summary_info += {'snappy support':snappy}
>  summary_info += {'bzip2 support': libbzip2}
> diff --git a/meson_options.txt b/meson_options.txt
> index e392323732..4114bfcaa4 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -105,6 +105,8 @@ option('libdaxctl', type : 'feature', value : 'auto',
> description: 'libdaxctl support')
>  option('libpmem', type : 'feature', value : 'auto',
> description: 'libpmem support')
> +option('libssh', type : 'feature', value : 'auto',
> +   description: 'ssh block device support')
>  option('libudev', type : 'feature', value : 'auto',
>

[PATCH v2] Move the libssh setup from configure to meson.build

2021-12-09 Thread Thomas Huth

It's easier to do this in meson.build now.

Signed-off-by: Thomas Huth 
---
 v2: Added the missing "config_host_data.set('CONFIG_LIBSSH', libssh.found())"

 configure | 27 ---
 meson.build   | 13 +
 meson_options.txt |  2 ++
 scripts/meson-buildoptions.sh |  3 +++
 4 files changed, 14 insertions(+), 31 deletions(-)

diff --git a/configure b/configure
index 48c21775f3..bb99a40ed0 100755
--- a/configure
+++ b/configure
@@ -344,7 +344,6 @@ debug_stack_usage="no"
 crypto_afalg="no"
 tls_priority="NORMAL"
 tpm="$default_feature"
-libssh="$default_feature"
 live_block_migration=${default_feature:-yes}
 numa="$default_feature"
 replication=${default_feature:-yes}
@@ -1078,10 +1077,6 @@ for opt do
   ;;
   --enable-tpm) tpm="yes"
   ;;
-  --disable-libssh) libssh="no"
-  ;;
-  --enable-libssh) libssh="yes"
-  ;;
   --disable-live-block-migration) live_block_migration="no"
   ;;
   --enable-live-block-migration) live_block_migration="yes"
@@ -1448,7 +1443,6 @@ cat << EOF
   live-block-migration   Block migration in the main migration stream
   coroutine-pool  coroutine freelist (better performance)
   tpm TPM support
-  libssh  ssh block device support
   numalibnuma support
   avx2AVX2 optimization support
   avx512f AVX512F optimization support
@@ -2561,21 +2555,6 @@ if test "$modules" = yes; then
 fi
 fi
 
-##
-# libssh probe
-if test "$libssh" != "no" ; then
-  if $pkg_config --exists "libssh >= 0.8.7"; then
-libssh_cflags=$($pkg_config libssh --cflags)
-libssh_libs=$($pkg_config libssh --libs)
-libssh=yes
-  else
-if test "$libssh" = "yes" ; then
-  error_exit "libssh required for --enable-libssh"
-fi
-libssh=no
-  fi
-fi
-
 ##
 # TPM emulation is only on POSIX
 
@@ -3636,12 +3615,6 @@ if test "$cmpxchg128" = "yes" ; then
   echo "CONFIG_CMPXCHG128=y" >> $config_host_mak
 fi
 
-if test "$libssh" = "yes" ; then
-  echo "CONFIG_LIBSSH=y" >> $config_host_mak
-  echo "LIBSSH_CFLAGS=$libssh_cflags" >> $config_host_mak
-  echo "LIBSSH_LIBS=$libssh_libs" >> $config_host_mak
-fi
-
 if test "$live_block_migration" = "yes" ; then
   echo "CONFIG_LIVE_BLOCK_MIGRATION=y" >> $config_host_mak
 fi
diff --git a/meson.build b/meson.build
index 96de1a6ef9..ae67ca28ab 100644
--- a/meson.build
+++ b/meson.build
@@ -874,11 +874,15 @@ if not get_option('glusterfs').auto() or have_block
 ''', dependencies: glusterfs)
   endif
 endif
+
 libssh = not_found
-if 'CONFIG_LIBSSH' in config_host
-  libssh = declare_dependency(compile_args: 
config_host['LIBSSH_CFLAGS'].split(),
-  link_args: config_host['LIBSSH_LIBS'].split())
+if not get_option('libssh').auto() or have_block
+  libssh = dependency('libssh', version: '>=0.8.7',
+method: 'pkg-config',
+required: get_option('libssh'),
+kwargs: static_kwargs)
 endif
+
 libbzip2 = not_found
 if not get_option('bzip2').auto() or have_block
   libbzip2 = cc.find_library('bz2', has_headers: ['bzlib.h'],
@@ -1451,6 +1455,7 @@ config_host_data.set('CONFIG_EBPF', libbpf.found())
 config_host_data.set('CONFIG_LIBDAXCTL', libdaxctl.found())
 config_host_data.set('CONFIG_LIBISCSI', libiscsi.found())
 config_host_data.set('CONFIG_LIBNFS', libnfs.found())
+config_host_data.set('CONFIG_LIBSSH', libssh.found())
 config_host_data.set('CONFIG_LINUX_AIO', libaio.found())
 config_host_data.set('CONFIG_LINUX_IO_URING', linux_io_uring.found())
 config_host_data.set('CONFIG_LIBPMEM', libpmem.found())
@@ -3430,7 +3435,7 @@ endif
 summary_info += {'seccomp support':   seccomp}
 summary_info += {'GlusterFS support': glusterfs}
 summary_info += {'TPM support':   config_host.has_key('CONFIG_TPM')}
-summary_info += {'libssh support':config_host.has_key('CONFIG_LIBSSH')}
+summary_info += {'libssh support':libssh}
 summary_info += {'lzo support':   lzo}
 summary_info += {'snappy support':snappy}
 summary_info += {'bzip2 support': libbzip2}
diff --git a/meson_options.txt b/meson_options.txt
index e392323732..4114bfcaa4 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -105,6 +105,8 @@ option('libdaxctl', type : 'feature', value : 'auto',
description: 'libdaxctl support')
 option('libpmem', type : 'feature', value : 'auto',
description: 'libpmem support')
+option('libssh', type : 'feature', value : 'auto',
+   description: 'ssh block device support')
 option('libudev', type : 'feature', value : 'auto',
description: 'Use libudev to enumerate host devices')
 option('libusb', type : 'feature', value : 'auto',
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 7a17ff4218..ae8f18edc2 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -53,6 +53,7 @@ meson_options_help() {

Re: LTP test perf_event_open02.c: possible rounding issue on aarch64 KVM

2021-12-09 Thread James Clark



On 09/12/2021 12:20, Petr Vorel wrote:
> Hi,
> 
> I have problem with LTP test perf_event_open02.c [1] on QEMU using KVM on
> openSUSE aarch64 kernel 5.15.5-1-default (not much different from stable 
> kernel
> from kernel.org):
> 
> # /opt/ltp/testcases/bin/perf_event_open02
> ...
> perf_event_open02.c:104: TINFO: bench_work estimated loops = 8083 in 500 ms
> perf_event_open02.c:149: TINFO: [0] value:2425293761 time_enabled:749092800 
> time_running:749092800
> perf_event_open02.c:149: TINFO: [1] value:2425287027 time_enabled:749141475 
> time_running:749141475
> perf_event_open02.c:149: TINFO: [2] value:2433046583 time_enabled:757346300 
> time_running:757346300
> perf_event_open02.c:149: TINFO: [3] value:2432771537 time_enabled:753369300 
> time_running:753369300
> perf_event_open02.c:149: TINFO: [4] value:2432551620 time_enabled:753784075 
> time_running:753784075
> perf_event_open02.c:149: TINFO: [5] value:2432386104 time_enabled:753481750 
> time_running:753481750
> perf_event_open02.c:149: TINFO: [6] value:2095086137 time_enabled:768866050 
> time_running:660021525
> perf_event_open02.c:308: TINFO: nhw: 6, overall task clock: 4098138525
> perf_event_open02.c:309: TINFO: hw sum: 116450294745, task clock sum: 
> 24589636350
> perf_event_open02.c:321: TINFO: ratio: 6.000196
> perf_event_open02.c:323: TFAIL: test failed (ratio was greater than 6)
> ...
> 
> The test tries to assert the precision of hardware counters (using struct
> perf_event_attr hw_event.type = PERF_TYPE_HARDWARE), but sometimes it fails 
> with
> slight overrun. We suppose that this is a rounding error, but it'd be nice to
> get this confirmed from kernel developers.
> 

I don't believe this is a rounding error because the test uses long longs for 
the totals and then
has a single division into a double. A difference of 0.000196 is too big to be 
explained by
rounding from a single division.

There is at least one other fix (f4bf9ba01802) in the commit history that 
involves
very close but failing values, and the fix wasn't to change the tolerance. So 
I'd say there is
probably a bug in the test, or you've found a real bug.

> Related kernel setup (or you need to know something else)
> grep PERF_EVENTS config-5.15.5-1-default # aarch64
> CONFIG_HAVE_PERF_EVENTS=y
> CONFIG_PERF_EVENTS=y
> CONFIG_HW_PERF_EVENTS=y
> 
> Test is running inside testing framework with this setup:
> qemu-system-aarch64 -device virtio-gpu-pci -only-migratable -chardev 
> ringbuf,id=serial0,logfile=serial0,logappend=on -serial chardev:serial0 
> -audiodev none,id=snd0 -device intel-hda -device hda-output,audiodev=snd0 -m 
> 2048 -machine virt,gic-version=host -cpu host -mem-prealloc -mem-path 
> /dev/hugepages/ -netdev user,id=qanet0 -device 
> virtio-net,netdev=qanet0,mac=52:54:00:12:34:56 -object 
> rng-random,filename=/dev/urandom,id=rng0 -device virtio-rng-pci,rng=rng0 
> -boot menu=on,splash-time=5000 -device nec-usb-xhci -device usb-tablet 
> -device usb-kbd -smp 2 -enable-kvm -no-shutdown -vnc :97,share=force-shared 
> -device virtio-serial -chardev 
> pipe,id=virtio_console,path=virtio_console,logfile=virtio_console.log,logappend=on
>  -device 
> virtconsole,chardev=virtio_console,name=org.openqa.console.virtio_console 
> -chardev 
> pipe,id=virtio_console1,path=virtio_console1,logfile=virtio_console1.log,logappend=on
>  -device 
> virtconsole,chardev=virtio_console1,name=org.openqa.console.virtio_console1 
> -chardev 
> socket,path=qmp_socket,server=on,wait=off,id=qmp_socket,logfile=qmp_socket.log,logappend=on
>  -qmp chardev:qmp_socket -S -device virtio-scsi-pci,id=scsi0 -blockdev 
> driver=file,node-name=hd0-overlay0-file,filename=/var/lib/openqa/pool/7/raid/hd0-overlay0,cache.no-flush=on
>  -blockdev 
> driver=qcow2,node-name=hd0-overlay0,file=hd0-overlay0-file,cache.no-flush=on 
> -device 
> virtio-blk-device,id=hd0-device,drive=hd0-overlay0,bootindex=0,serial=hd0 
> -blockdev 
> driver=file,node-name=cd0-overlay0-file,filename=/var/lib/openqa/pool/7/raid/cd0-overlay0,cache.no-flush=on
>  -blockdev 
> driver=qcow2,node-name=cd0-overlay0,file=cd0-overlay0-file,cache.no-flush=on 
> -device scsi-cd,id=cd0-device,drive=cd0-overlay0,serial=cd0 -drive 
> id=pflash-code-overlay0,if=pflash,file=/var/lib/openqa/pool/7/raid/pflash-code-overlay0,unit=0,readonly=on
>  -drive 
> id=pflash-vars-overlay0,if=pflash,file=/var/lib/openqa/pool/7/raid/pflash-vars-overlay0,unit=1
> 
> Running the same OS and kernel (aarch64 JeOS Tumbleweed 20211202) on RPI it's 
> working:
> perf_event_open02.c:104: TINFO: bench_work estimated loops = 3601 in 500 ms
> perf_event_open02.c:149: TINFO: [0] value:1080601748 time_enabled:480527015 
> time_running:480527015
> perf_event_open02.c:149: TINFO: [1] value:1080599535 time_enabled:480540573 
> time_running:480540573
> perf_event_open02.c:149: TINFO: [2] value:1080592770 time_enabled:480533868 
> time_running:480533868
> perf_event_open02.c:149: TINFO: [3] value:1080607121 time_enabled:480571573 
> time_running:480571573
>

Re: [PATCH] target/ppc: powerpc_excp: Guard ALIGNMENT interrupt with CONFIG_TCG

2021-12-09 Thread Cédric Le Goater


On 12/9/21 00:06, Fabiano Rosas wrote:

We cannot have TCG code in powerpc_excp because the function is called
from kvm-only code via ppc_cpu_do_interrupt:

  ../target/ppc/excp_helper.c:463:29: error: implicit declaration of
  function ‘cpu_ldl_code’ [-Werror=implicit-function-declaration]

Fortunately, the Alignment interrupt is not among the ones dispatched
from kvm-only code, so we can keep it out of the disable-tcg build for
now.

Fixes: 336e91f853 ("target/ppc: Move SPR_DSISR setting to powerpc_excp")
Signed-off-by: Fabiano Rosas 

---

Perhaps we could make powerpc_excp TCG only and have a separate
function that only knows the two interrupts that we use with KVM
(Program, Machine check). But for now this fix will do, I think.
---
  target/ppc/excp_helper.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index 17607adbe4..dcf22440cc 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -453,6 +453,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp)
  }
  break;
  }
+#ifdef CONFIG_TCG
  case POWERPC_EXCP_ALIGN: /* Alignment exception  
*/
  /*
   * Get rS/rD and rA from faulting opcode.
@@ -464,6 +465,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp)
  env->spr[SPR_DSISR] |= (insn & 0x03FF0000) >> 16;
  }
  break;
+#endif
  case POWERPC_EXCP_PROGRAM:   /* Program exception
*/
  switch (env->error_code & ~0xF) {
  case POWERPC_EXCP_FP:



Shouldn't we move that code under ppc_cpu_do_unaligned_access ?

Thanks,

C.

Re: [PATCH 0/3] iotests: multiprocessing!!

2021-12-09 Thread Hanna Reitz


On 03.12.21 13:22, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

Finally, I can not stand it any longer. So, I'm happy to present
multiprocessing support for iotests test runner.


Thanks, looks great!

Applied to my block-next branch:

https://gitlab.com/hreitz/qemu/-/commits/block-next

Hanna

Re: [RFC PATCH v2 32/44] tdx: add kvm_tdx_enabled() accessor for later use

2021-12-09 Thread Xiaoyao Li


On 7/23/2021 1:53 AM, Connor Kuehl wrote:

On 7/7/21 7:55 PM, isaku.yamah...@gmail.com wrote:

From: Isaku Yamahata 

Signed-off-by: Isaku Yamahata 
---
  include/sysemu/tdx.h  | 1 +
  target/i386/kvm/kvm.c | 5 +
  2 files changed, 6 insertions(+)

diff --git a/include/sysemu/tdx.h b/include/sysemu/tdx.h
index 70eb01348f..f3eced10f9 100644
--- a/include/sysemu/tdx.h
+++ b/include/sysemu/tdx.h
@@ -6,6 +6,7 @@
  #include "hw/i386/pc.h"
  bool kvm_has_tdx(KVMState *s);
+bool kvm_tdx_enabled(void);
  int tdx_system_firmware_init(PCMachineState *pcms, MemoryRegion 
*rom_memory);

  #endif
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index af6b5f350e..76c3ea9fac 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -152,6 +152,11 @@ int kvm_set_vm_type(MachineState *ms, int kvm_type)
  return -ENOTSUP;
  }
+bool kvm_tdx_enabled(void)
+{
+    return vm_type == KVM_X86_TDX_VM;
+}
+


Is this the whole story? Does this guarantee that the VM QEMU is
responsible to bring up is a successfully initialized TD?


No, it just means a TDX guest is requested.


 From my reading of the series as it unfolded, this looks like the
function proves that KVM can support TDs and that the user requested
a TDX kvm-type, not that we have a fully-formed TD.


Yes, you are right. We referenced what sev_enabled() and sev_es_enabled() do.

If the name is misleading, would it look better to name it is_tdx_vm()?


Is it possible to associate this with a more verifiable metric that
the TD has been or will be created successfully? I.e., once the VM
has successfully called the TDX INIT ioctl or has finalized setup?

My question mainly comes from a later patch in the series, where the
"query-tdx-capabilities" and "query-tdx" QMP commands are added.

Forgive me if I am misinterpreting the semantics of each of these
commands:


what you understood is correct.


"query-tdx-capabilities" sounds like it answers the question of
"can it run a TD?"

and "query-tdx" sounds like it answers the question of "is it a TD?"

Is the assumption with "query-tdx" that anything that's gone wrong
with developing a TD will have resulted in the QEMU process exiting
and therefore if we get to a point where we can run "query-tdx" then
we know the TD was successfully formed?

Re: RFC: x86 memory map, where to put CXL ranges?

2021-12-09 Thread Alex Bennée



Jonathan Cameron  writes:

> Hi All,
>
> For CXL emulation we require a couple of types of memory range that
> are then provided to the OS via the CEDT ACPI table.
>
> 1) CXL Host Bridge Structures point to CXL Host Bridge Component Registers.
> Small regions for each CXL Host bridge that are mapped into the memory space.
> 64k each.  In theory we may have a huge number of these but in reality I
> think 16 will do for any reasonable system.
>
> 2) CXL Fixed Memory Window Structures (CFMWS)
> Large PA space ranges (multiple TB) to which various CXL devices can be 
> assigned
> and their address decoders appropriately programmed.
> Each such CFMWS will have particular characteristics such as interleaving 
> across
> multiple host bridges.  They can potentially be huge but are a system
> characteristic.  For emulation purposes it won't matter if they move around
> dependent on what else the machine has configured. So I'd like to
> just configure their size rather than fully specify them at the command line
> and possibly clash on PA space with something else.  Alternatively we could
> leave them as fully specified at the command line (address and size) and just
> error out if they hit memory already in use for something else.
>
> Now unfortunately there are no systems out there yet that we can just
> copy the memory map from...
>
> Coming from an Arm background I have only a vague idea of how this should be
> done for x86 so apologies if it is a stupid question.
>
> My current approach is to put these above device_memory and moving
> the pci hole up appropriately.

Which board model would we be talking about here? virt? Or maybe we need
a new one?

If it's virt I would look at extended_memmap which floats above the
configured RAM size and means less shuffling around of the relatively
crowded lower address space.

I have no idea about how this is handled on x86 though.

> Is that the right choice?
>
> On Arm I currently have the Host Bridge Structures low down in the MemMap and 
> the CFMWS
> can go above the device memory.  Comments on that also welcome.
>
> In Ben's RFC the host bridge component register location was marked as a TODO
> and an arbitrary address used in the meantime so time to figure out how to 
> clean
> that up.
>
> Thanks,
>
> Jonathan


-- 
Alex Bennée

[RFC] block-backend: prevent dangling BDS pointer in blk_drain()

2021-12-09 Thread Stefan Hajnoczi

The BlockBackend root child can change during bdrv_drained_begin() when
aio_poll() is invoked. In fact the BlockDriverState can reach refcnt 0
and blk_drain() is left with a dangling BDS pointer.

One example is scsi_device_purge_requests(), which calls blk_drain() to
wait for in-flight requests to cancel. If the backup blockjob is active,
then the BlockBackend root child is a temporary filter BDS owned by the
blockjob. The blockjob can complete during bdrv_drained_begin() and the
last reference to the BDS is released when the temporary filter node is
removed. This results in a use-after-free when blk_drain() calls
bdrv_drained_end(bs) on the dangling pointer.

The general problem is that a function and its callers must not assume
that bs is still valid across aio_poll(). Explicitly hold a reference to
bs in blk_drain() to avoid the dangling pointer.

Signed-off-by: Stefan Hajnoczi 
---
I found that BDS nodes are sometimes deleted with bs->quiesce_counter >
0 (at least when running "make check"), so it is currently not possible
to put the bdrv_ref/unref() calls in bdrv_do_drained_begin() and
bdrv_do_drained_end() because they will be unbalanced. That would have
been a more general solution than only fixing blk_drain().

Any suggestions for a better fix?

I think it's likely that more "dangling pointer across aio_poll()"
problems exist :(.

Here is the (hacky) reproducer:

  build/qemu-system-x86_64 \
 -name 'avocado-vt-vm1'  \
 -sandbox on  \
 -machine q35,memory-backend=mem-machine_mem \
 -device 
pcie-root-port,id=pcie-root-port-0,multifunction=on,bus=pcie.0,addr=0x1,chassis=1
 \
 -device pcie-pci-bridge,id=pcie-pci-bridge-0,addr=0x0,bus=pcie-root-port-0 
 \
 -nodefaults \
 -device VGA,bus=pcie.0,addr=0x2 \
 -m 1024 \
 -object memory-backend-ram,size=1024M,id=mem-machine_mem  \
 -smp 10,maxcpus=10,cores=5,threads=1,dies=1,sockets=2  \
 -cpu 'Cascadelake-Server-noTSX',+kvm_pv_unhalt \
 -chardev 
socket,wait=off,server=on,id=qmp_id_qmpmonitor1,path=/tmp/qmp.sock  \
 -mon chardev=qmp_id_qmpmonitor1,mode=control \
 -chardev 
socket,wait=off,server=on,id=qmp_id_catch_monitor,path=/tmp/catch_monitor.sock  
\
 -mon chardev=qmp_id_catch_monitor,mode=control \
 -device pvpanic,ioport=0x505,id=idgKHYrQ \
 -chardev 
socket,wait=off,server=on,id=chardev_serial0,path=/tmp/serial.sock \
 -device isa-serial,id=serial0,chardev=chardev_serial0  \
 -chardev 
socket,id=seabioslog_id_2020-012521-TNCkxDmn,path=/tmp/seabios.sock,server=on,wait=off
 \
 -device 
isa-debugcon,chardev=seabioslog_id_2020-012521-TNCkxDmn,iobase=0x402 \
 -device 
pcie-root-port,id=pcie-root-port-2,port=0x2,addr=0x1.0x2,bus=pcie.0,chassis=3 \
 -device virtio-scsi-pci,id=virtio_scsi_pci0,bus=pcie-root-port-2,addr=0x0 \
 -blockdev 
node-name=file_image1,driver=file,auto-read-only=on,discard=unmap,aio=threads,filename=test.img,cache.direct=on,cache.no-flush=off
 \
 -blockdev 
node-name=drive_image1,driver=raw,read-only=off,cache.direct=on,cache.no-flush=off,file=file_image1
 \
 -device scsi-hd,id=image1,drive=drive_image1,write-cache=on \
 -blockdev 
node-name=file_src1,driver=file,auto-read-only=on,discard=unmap,aio=threads,filename=sr1.qcow2,cache.direct=on,cache.no-flush=off
 \
 -blockdev 
node-name=drive_src1,driver=qcow2,read-only=off,cache.direct=on,cache.no-flush=off,file=file_src1
 \
 -device scsi-hd,id=src1,drive=drive_src1,write-cache=on \
 -device 
pcie-root-port,id=pcie-root-port-3,port=0x3,addr=0x1.0x3,bus=pcie.0,chassis=4 \
 -device 
virtio-net-pci,mac=9a:11:64:b0:5d:a8,id=idxnEEYY,netdev=idBjpylo,bus=pcie-root-port-3,addr=0x0
  \
 -netdev user,id=idBjpylo  \
 -vnc :0  \
 -rtc base=utc,clock=host,driftfix=slew  \
 -boot menu=off,order=cdn,once=c,strict=off \
 -enable-kvm \
 -device 
pcie-root-port,id=pcie_extra_root_port_0,multifunction=on,bus=pcie.0,addr=0x3,chassis=5
 &

  sleep 8 # delay for VM startup and socket creation

  nc -U /tmp/qmp.sock <

[PATCH v5 11/12] s390x: topology: implementing numa for the s390x topology

2021-12-09 Thread Pierre Morel

S390x CPU Topology allows a non-uniform distribution of the CPUs
inside the topology containers: sockets, books and drawers.

We use numa to place the CPU inside the right topology container
and report the non uniform topology to the guest.

Note that s390x needs CPU0 to belong to the topology and consequently
all topology must include CPU0.

We accept a partial QEMU numa definition, in that case undefined CPUs
are added to free slots in the topology starting with slot 0 and going
up.

Signed-off-by: Pierre Morel 
---
 hw/core/machine.c  | 18 ++
 hw/s390x/s390-virtio-ccw.c | 68 ++
 2 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 0059070309..d65a91c607 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -684,6 +684,16 @@ void machine_set_cpu_numa_node(MachineState *machine,
 return;
 }
 
+if (props->has_book_id && !slot->props.has_book_id) {
+error_setg(errp, "book-id is not supported");
+return;
+}
+
+if (props->has_drawer_id && !slot->props.has_drawer_id) {
+error_setg(errp, "drawer-id is not supported");
+return;
+}
+
 /* skip slots with explicit mismatch */
 if (props->has_thread_id && props->thread_id != slot->props.thread_id) 
{
 continue;
@@ -701,6 +711,14 @@ void machine_set_cpu_numa_node(MachineState *machine,
 continue;
 }
 
+if (props->has_book_id && props->book_id != slot->props.book_id) {
+continue;
+}
+
+if (props->has_drawer_id && props->drawer_id != slot->props.drawer_id) 
{
+continue;
+}
+
 /* reject assignment if slot is already assigned, for compatibility
  * of legacy cpu_index mapping with SPAPR core based mapping do not
  * error out if cpu thread and matched core have the same node-id */
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index cd27b4c3af..dcd6a1cf19 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -84,14 +84,34 @@ out:
 static void s390_init_cpus(MachineState *machine)
 {
 MachineClass *mc = MACHINE_GET_CLASS(machine);
-int i;
+CPUArchId *slot;
+int i, n = 0;
 
 /* initialize possible_cpus */
 mc->possible_cpu_arch_ids(machine);
 
 s390_topology_setup(machine);
-for (i = 0; i < machine->smp.cpus; i++) {
+
+/* For NUMA configuration create defined nodes */
+if (machine->numa_state->num_nodes) {
+for (i = 0; i < machine->smp.max_cpus; i++) {
+slot = &machine->possible_cpus->cpus[i];
+if (slot->arch_id != -1 && n < machine->smp.cpus) {
+s390x_new_cpu(machine->cpu_type, i, &error_fatal);
+n++;
+}
+}
+}
+
+/* create all remaining CPUs */
+for (i = 0; n < machine->smp.cpus && i < machine->smp.max_cpus; i++) {
+slot = &machine->possible_cpus->cpus[i];
+/* For NUMA configuration skip defined nodes */
+if (machine->numa_state->num_nodes && slot->arch_id != -1) {
+continue;
+}
 s390x_new_cpu(machine->cpu_type, i, &error_fatal);
+n++;
 }
 }
 
@@ -274,6 +294,11 @@ static void ccw_init(MachineState *machine)
 /* register hypercalls */
 virtio_ccw_register_hcalls();
 
+/* CPU0 must exist on S390x */
+if (!s390_cpu_addr2state(0)) {
+error_printf("Core_id 0 must be defined in the CPU configuration\n");
+exit(1);
+}
 s390_enable_css_support(s390_cpu_addr2state(0));
 
 ret = css_create_css_image(VIRTUAL_CSSID, true);
@@ -306,6 +331,7 @@ static void s390_cpu_plug(HotplugHandler *hotplug_dev,
 
 g_assert(!ms->possible_cpus->cpus[cpu->env.core_id].cpu);
 ms->possible_cpus->cpus[cpu->env.core_id].cpu = OBJECT(dev);
+ms->possible_cpus->cpus[cpu->env.core_id].arch_id = cpu->env.core_id;
 
 s390_topology_new_cpu(cpu->env.core_id);
 
@@ -579,7 +605,9 @@ static CpuInstanceProperties 
s390_cpu_index_to_props(MachineState *ms,
 static const CPUArchIdList *s390_possible_cpu_arch_ids(MachineState *ms)
 {
 int i;
+int drawer_id, book_id, socket_id;
 unsigned int max_cpus = ms->smp.max_cpus;
+CPUArchId *slot;
 
 if (ms->possible_cpus) {
 g_assert(ms->possible_cpus && ms->possible_cpus->len == max_cpus);
@@ -590,11 +618,25 @@ static const CPUArchIdList 
*s390_possible_cpu_arch_ids(MachineState *ms)
   sizeof(CPUArchId) * max_cpus);
 ms->possible_cpus->len = max_cpus;
 for (i = 0; i < ms->possible_cpus->len; i++) {
-ms->possible_cpus->cpus[i].type = ms->cpu_type;
-ms->possible_cpus->cpus[i].vcpus_count = 1;
-ms->possible_cpus->cpus[i].arch_id = i;
-ms->possible_cpus->cpus[i].props.has_core_id = true;
-ms->possible_cpus->cpus[i].props.core_id = i;
+slot =

Re: [PATCH] virtio-blk: Fix clean up of host notifiers for single MR transaction.

2021-12-09 Thread Ani Sinha

On Thu, Dec 2, 2021 at 10:34 PM Mark Mielke  wrote:
>
> Sorry... I missed copy maintainers and qemu-stable. This should be
> considered a regression.
>
> -- Forwarded message -
> From: Mark Mielke 
> Date: Thu, Dec 2, 2021 at 11:26 AM
> Subject: [PATCH] virtio-blk: Fix clean up of host notifiers for single
> MR transaction.
> To: 
>
>
> The code that introduced "virtio-blk: Configure all host notifiers in
> a single MR transaction" introduced a second loop variable to perform
> cleanup in second loop, but mistakenly still refers to the first
> loop variable within the second loop body.
>
> Fixes: d0267da61489 ("virtio-blk: Configure all host notifiers in a
> single MR transaction")
> Signed-off-by: Mark Mielke 

Reviewed-by: Ani Sinha 

> ---
>  hw/block/dataplane/virtio-blk.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index 252c3a7a23..ee5a5352dc 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -222,7 +222,7 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
>  memory_region_transaction_commit();
>
>  while (j--) {
> -virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
> +virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), j);
>  }
>  goto fail_host_notifiers;
>  }
> --
> 2.33.1
>
> --
> Mark Mielke 
>

[PATCH v5 12/12] s390: Topology: documentation

2021-12-09 Thread Pierre Morel

The use of the S390x CPU topology is explained in a new documentation
file.

Signed-off-by: Pierre Morel 
---
 docs/system/s390x/numa-cpu-topology.rst | 273 
 1 file changed, 273 insertions(+)
 create mode 100644 docs/system/s390x/numa-cpu-topology.rst

diff --git a/docs/system/s390x/numa-cpu-topology.rst 
b/docs/system/s390x/numa-cpu-topology.rst
new file mode 100644
index 00..9ae15f792f
--- /dev/null
+++ b/docs/system/s390x/numa-cpu-topology.rst
@@ -0,0 +1,273 @@
+NUMA CPU Topology on S390x
+==========================
+
+IBM S390 provides a complex CPU architecture with several cache levels.
+Using NUMA with the CPU topology is a way to let the guest optimize its
+accesses to the main memory.
+
+The QEMU smp parameter for S390x allows to specify 4 NUMA levels:
+core, socket, drawer and book and these levels are available for
+the numa parameter too.
+
+
+Prerequisites
+-------------
+
+To take advantage of the CPU topology, KVM must support the
+Perform Topology Function and the Store System Information instructions
+as indicated by the Perform CPU Topology facility (stfle bit 11).
+
+If those requirements are met, the capability ``KVM_CAP_S390_CPU_TOPOLOGY``
+will indicate that KVM can support CPU Topology on that LPAR.
+
+
+Using CPU Topology in QEMU for S390x
+------------------------------------
+
+
+QEMU -smp parameter
+~~~~~~~~~~~~~~~~~~~
+
+With -smp QEMU provides the user with the possibility to define
+a Topology based on ::
+
+  -smp [[cpus=]n][,maxcpus=maxcpus][,drawers=drawers][,books=books] \
+   [,sockets=sockets][,cores=cores]
+
+The topology reported to the guest in this situation will provide
+n cpus of a maximum of maxcpus cpus, filling the topology levels one by one
+starting with CPU0 being the first CPU on drawer[0] book[0] socket[0].
+
+For example ``-smp 5,books=2,sockets=2,cores=2`` will provide ::
+
+  drawer[0]--+--book[0]--+--socket[0]--+--core[0]-CPU0
+ |   | |
+ |   | +--core[1]-CPU1
+ |   |
+ |   +--socket[1]--+--core[0]-CPU2
+ | |
+ | +--core[1]-CPU3
+ |
+ +--book[1]--+--socket[0]--+--core[0]-CPU4
+
+
+Note that the thread parameter can not be defined on S390 as it
+has no representation on the CPU topology.
+
+
+QEMU -numa parameter
+~~~~~~~~~~~~~~~~~~~~
+
+With -numa QEMU provides the user with the possibility to define
+the Topology in a non uniform way ::
+
+  -smp [[cpus=]n][,maxcpus=maxcpus][,drawers=drawers][,books=books] \
+   [,sockets=sockets][,cores=cores]
+  -numa 
node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=initiator]
+  -numa cpu,node-id=node[,drawer-id=x][,book-id=x][,socket-id=x][,core-id=y]
+
+The topology reported to the guest in this situation will provide
+n cpus of a maximum of maxcpus cpus, and the topology entries will be
+
+- if there are fewer cpus than specified by the -numa arguments
+  the topology will be built by filling the numa definitions
+  starting with the lowest node.
+
+- if there are more cpus than specified by the -numa argument
+  the numa specification will first be fulfilled and the remaining
+  CPUs will be assigned to unassigned slots starting with the
+  core 0 on socket 0.
+
+- a CPU declared with -device does not count inside the ncpus parameter
+  of the -smp argument and will be added on the topology based on
+  its core ID.
+
+For example  ::
+
+  -smp 3,drawers=8,books=2,sockets=2,cores=2,maxcpus=64
+  -object memory-backend-ram,id=mem0,size=10G
+  -numa node,nodeid=0,memdev=mem0
+  -numa node,nodeid=1
+  -numa node,nodeid=2
+  -numa cpu,node-id=0,drawer-id=0
+  -numa cpu,node-id=1,socket-id=9
+  -device host-s390x-cpu,core-id=19
+
+Will provide the following topology ::
+
+  drawer[0]--+--book[0]--+--socket[0]--+--core[0]-CPU0
+ | |
+ | +--core[1]-CPU1
+ |
+ +--socket[1]--+--core[0]-CPU2
+
+  drawer[2]--+--book[0]--+--socket[1]--+--core[1]-CPU19
+
+
+S390 NUMA specificity
+---------------------
+
+Heterogeneous Memory Attributes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The S390 topology implementation does not use ACPI HMAT to specify the
+cache size and bandwidth between nodes.
+
+Memory device
+~~~~~~~~~~~~~
+
+When using NUMA, S390 needs a memory device to be associated with
+the node definitions. As we do not use HMAT, it makes little sense
+to assign memory to each node and one should assign all memory to
+a node without CPU and use other nodes to define the CPU Topology.
+
+Example ::
+
+  -object memory-backend-ram,id=mem0,size=10G
+  -numa node,nodeid=0,memdev=mem0
+
+
+CPUs
+~~~~
+
+In the S390 topology we do not use threads and the first topology
+level is the core.
+The number of threads can not be defined for S390 and is always equal to 1.
+

[PATCH v5 09/12] s390: topology: Adding drawers to CPU topology

2021-12-09 Thread Pierre Morel

S390 CPU topology may have up to 5 topology containers.
The first container above the cores is level 2, the sockets,
and level 3, containing the sockets, is the books.

We introduce here the drawers; a drawer is the level containing books.

Let's add drawers, level4, containers to the CPU topology.

Signed-off-by: Pierre Morel 
---
 hw/core/machine-smp.c  | 28 +---
 hw/core/machine.c  |  2 ++
 hw/s390x/s390-virtio-ccw.c |  1 +
 include/hw/boards.h|  4 
 qapi/machine.json  |  7 ++-
 softmmu/vl.c   |  3 +++
 6 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/hw/core/machine-smp.c b/hw/core/machine-smp.c
index 4848f546cf..b80a7785b4 100644
--- a/hw/core/machine-smp.c
+++ b/hw/core/machine-smp.c
@@ -31,6 +31,10 @@ static char *cpu_hierarchy_to_string(MachineState *ms)
 MachineClass *mc = MACHINE_GET_CLASS(ms);
 GString *s = g_string_new(NULL);
 
+if (mc->smp_props.drawers_supported) {
+g_string_append_printf(s, " * drawers (%u)", ms->smp.drawers);
+}
+
 if (mc->smp_props.books_supported) {
 g_string_append_printf(s, " * books (%u)", ms->smp.books);
 }
@@ -71,6 +75,7 @@ void smp_parse(MachineState *ms, SMPConfiguration *config, 
Error **errp)
 {
 MachineClass *mc = MACHINE_GET_CLASS(ms);
 unsigned cpus= config->has_cpus ? config->cpus : 0;
+unsigned drawers = config->has_drawers ? config->drawers : 0;
 unsigned books   = config->has_books ? config->books : 0;
 unsigned sockets = config->has_sockets ? config->sockets : 0;
 unsigned dies= config->has_dies ? config->dies : 0;
@@ -83,6 +88,7 @@ void smp_parse(MachineState *ms, SMPConfiguration *config, 
Error **errp)
  * explicit configuration like "cpus=0" is not allowed.
  */
 if ((config->has_cpus && config->cpus == 0) ||
+(config->has_drawers && config->drawers == 0) ||
 (config->has_books && config->books == 0) ||
 (config->has_sockets && config->sockets == 0) ||
 (config->has_dies && config->dies == 0) ||
@@ -111,6 +117,13 @@ void smp_parse(MachineState *ms, SMPConfiguration *config, 
Error **errp)
 
 books = books > 0 ? books : 1;
 
+if (!mc->smp_props.drawers_supported && drawers > 1) {
+error_setg(errp, "drawers not supported by this machine's CPU 
topology");
+return;
+}
+
+drawers = drawers > 0 ? drawers : 1;
+
 /* compute missing values based on the provided ones */
 if (cpus == 0 && maxcpus == 0) {
 sockets = sockets > 0 ? sockets : 1;
@@ -124,33 +137,34 @@ void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 if (sockets == 0) {
 cores = cores > 0 ? cores : 1;
 threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (books * dies * cores * threads);
+sockets = maxcpus / (drawers * books * dies * cores * threads);
 } else if (cores == 0) {
 threads = threads > 0 ? threads : 1;
-cores = maxcpus / (books * sockets * dies * threads);
+cores = maxcpus / (drawers * books * sockets * dies * threads);
 }
 } else {
 /* prefer cores over sockets since 6.2 */
 if (cores == 0) {
 sockets = sockets > 0 ? sockets : 1;
 threads = threads > 0 ? threads : 1;
-cores = maxcpus / (books * sockets * dies * threads);
+cores = maxcpus / (drawers * books * sockets * dies * threads);
 } else if (sockets == 0) {
 threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (books * dies * cores * threads);
+sockets = maxcpus / (drawers * books * dies * cores * threads);
 }
 }
 
 /* try to calculate omitted threads at last */
 if (threads == 0) {
-threads = maxcpus / (books * sockets * dies * cores);
+threads = maxcpus / (drawers * books * sockets * dies * cores);
 }
 }
 
-maxcpus = maxcpus > 0 ? maxcpus : books * sockets * dies * cores * threads;
+maxcpus = maxcpus > 0 ? maxcpus : drawers * books * sockets * dies * cores 
* threads;
 cpus = cpus > 0 ? cpus : maxcpus;
 
 ms->smp.cpus = cpus;
+ms->smp.drawers = drawers;
 ms->smp.books = books;
 ms->smp.sockets = sockets;
 ms->smp.dies = dies;
@@ -159,7 +173,7 @@ void smp_parse(MachineState *ms, SMPConfiguration *config, 
Error **errp)
 ms->smp.max_cpus = maxcpus;
 
 /* sanity-check of the computed topology */
-if (books * sockets * dies * cores * threads != maxcpus) {
+if (drawers * books * sockets * dies * cores * threads != maxcpus) {
 g_autofree char *topo_msg = cpu_hierarchy_to_string(ms);
 error_setg(errp, "Invalid CPU topology: "
"product of the hierarchy must match maxcpus: "
diff --git a/hw/core/machine.c

[PATCH v5 06/12] s390x: kvm: topology: interception of PTF instruction

2021-12-09 Thread Pierre Morel

When the host supports the CPU topology facility, the PTF
instruction with function code 2 is interpreted by the SIE,
provided that the userland hypervisor activates the interpretation
by using the KVM_CAP_S390_CPU_TOPOLOGY KVM extension.

The PTF instructions with function code 0 and 1 are intercepted
and must be emulated by the userland hypervisor.

Signed-off-by: Pierre Morel 
---
 hw/s390x/s390-virtio-ccw.c | 50 ++
 include/hw/s390x/s390-virtio-ccw.h |  6 
 target/s390x/kvm/kvm.c | 15 +
 3 files changed, 71 insertions(+)

diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 8b624c2e0c..6218352e68 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -408,6 +408,56 @@ static void s390_pv_prepare_reset(S390CcwMachineState *ms)
 s390_pv_prep_reset();
 }
 
+/*
+ * s390_handle_ptf:
+ *
+ * @register 1: contains the function code
+ *
+ * Function codes 0 and 1 handle the CPU polarization.
+ * We assume an horizontal topology, the only one supported currently
+ * by Linux, consequently we answer to function code 0, requesting
+ * horizontal polarization that it is already the current polarization
+ * and reject vertical polarization request without further explanation.
+ *
+ * Function code 2 is handling topology changes and is interpreted
+ * by the SIE.
+ */
+int s390_handle_ptf(S390CPU *cpu, uint8_t r1, uintptr_t ra)
+{
+CPUS390XState *env = &cpu->env;
+uint64_t reg = env->regs[r1];
+uint8_t fc = reg & S390_TOPO_FC_MASK;
+
+if (!s390_has_feat(S390_FEAT_CONFIGURATION_TOPOLOGY)) {
+s390_program_interrupt(env, PGM_OPERATION, ra);
+return 0;
+}
+
+if (env->psw.mask & PSW_MASK_PSTATE) {
+s390_program_interrupt(env, PGM_PRIVILEGED, ra);
+return 0;
+}
+
+if (reg & ~S390_TOPO_FC_MASK) {
+s390_program_interrupt(env, PGM_SPECIFICATION, ra);
+return 0;
+}
+
+switch (fc) {
+case 0:/* Horizontal polarization is already set */
+env->regs[r1] |= S390_PTF_REASON_DONE;
+return 2;
+case 1:/* Vertical polarization is not supported */
+env->regs[r1] |= S390_PTF_REASON_NONE;
+return 2;
+default:
+/* Note that fc == 2 is interpreted by the SIE */
+s390_program_interrupt(env, PGM_SPECIFICATION, ra);
+}
+
+return 0;
+}
+
 static void s390_machine_reset(MachineState *machine)
 {
 S390CcwMachineState *ms = S390_CCW_MACHINE(machine);
diff --git a/include/hw/s390x/s390-virtio-ccw.h 
b/include/hw/s390x/s390-virtio-ccw.h
index 3331990e02..ac4b4a92e7 100644
--- a/include/hw/s390x/s390-virtio-ccw.h
+++ b/include/hw/s390x/s390-virtio-ccw.h
@@ -30,6 +30,12 @@ struct S390CcwMachineState {
 uint8_t loadparm[8];
 };
 
+#define S390_PTF_REASON_NONE (0x00 << 8)
+#define S390_PTF_REASON_DONE (0x01 << 8)
+#define S390_PTF_REASON_BUSY (0x02 << 8)
+#define S390_TOPO_FC_MASK 0xffUL
+int s390_handle_ptf(S390CPU *cpu, uint8_t r1, uintptr_t ra);
+
 struct S390CcwMachineClass {
 /*< private >*/
 MachineClass parent_class;
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index 6ffc697b51..42eda0faee 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -98,6 +98,7 @@
 
 #define PRIV_B9_EQBS0x9c
 #define PRIV_B9_CLP 0xa0
+#define PRIV_B9_PTF 0xa2
 #define PRIV_B9_PCISTG  0xd0
 #define PRIV_B9_PCILG   0xd2
 #define PRIV_B9_RPCIT   0xd3
@@ -363,6 +364,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
 kvm_vm_enable_cap(s, KVM_CAP_S390_USER_SIGP, 0);
 kvm_vm_enable_cap(s, KVM_CAP_S390_VECTOR_REGISTERS, 0);
 kvm_vm_enable_cap(s, KVM_CAP_S390_USER_STSI, 0);
+kvm_vm_enable_cap(s, KVM_CAP_S390_CPU_TOPOLOGY, 0);
 if (ri_allowed()) {
 if (kvm_vm_enable_cap(s, KVM_CAP_S390_RI, 0) == 0) {
 cap_ri = 1;
@@ -1454,6 +1456,16 @@ static int kvm_mpcifc_service_call(S390CPU *cpu, struct 
kvm_run *run)
 }
 }
 
+static int kvm_handle_ptf(S390CPU *cpu, struct kvm_run *run)
+{
+uint8_t r1 = (run->s390_sieic.ipb >> 20) & 0x0f;
+int ret;
+
+ret = s390_handle_ptf(cpu, r1, RA_IGNORED);
+setcc(cpu, ret);
+return 0;
+}
+
 static int handle_b9(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
 {
 int r = 0;
@@ -1471,6 +1483,9 @@ static int handle_b9(S390CPU *cpu, struct kvm_run *run, 
uint8_t ipa1)
 case PRIV_B9_RPCIT:
 r = kvm_rpcit_service_call(cpu, run);
 break;
+case PRIV_B9_PTF:
+r = kvm_handle_ptf(cpu, run);
+break;
 case PRIV_B9_EQBS:
 /* just inject exception */
 r = -1;
-- 
2.27.0

[PATCH v5 07/12] s390: topology: Adding books to CPU topology

2021-12-09 Thread Pierre Morel

S390 CPU topology may have up to 5 topology containers.
The first container above the cores is level 2, the sockets.
We introduce here the books, book is the level containing sockets.

Let's add books, level3, containers to the CPU topology.

Signed-off-by: Pierre Morel 
---
 hw/core/machine-smp.c  | 28 +---
 hw/core/machine.c  |  2 ++
 hw/s390x/s390-virtio-ccw.c |  1 +
 include/hw/boards.h|  4 
 qapi/machine.json  |  7 ++-
 softmmu/vl.c   |  3 +++
 6 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/hw/core/machine-smp.c b/hw/core/machine-smp.c
index 116a0cbbfa..4848f546cf 100644
--- a/hw/core/machine-smp.c
+++ b/hw/core/machine-smp.c
@@ -31,6 +31,10 @@ static char *cpu_hierarchy_to_string(MachineState *ms)
 MachineClass *mc = MACHINE_GET_CLASS(ms);
 GString *s = g_string_new(NULL);
 
+if (mc->smp_props.books_supported) {
+g_string_append_printf(s, " * books (%u)", ms->smp.books);
+}
+
 g_string_append_printf(s, "sockets (%u)", ms->smp.sockets);
 
 if (mc->smp_props.dies_supported) {
@@ -67,6 +71,7 @@ void smp_parse(MachineState *ms, SMPConfiguration *config, 
Error **errp)
 {
 MachineClass *mc = MACHINE_GET_CLASS(ms);
 unsigned cpus= config->has_cpus ? config->cpus : 0;
+unsigned books   = config->has_books ? config->books : 0;
 unsigned sockets = config->has_sockets ? config->sockets : 0;
 unsigned dies= config->has_dies ? config->dies : 0;
 unsigned cores   = config->has_cores ? config->cores : 0;
@@ -78,6 +83,7 @@ void smp_parse(MachineState *ms, SMPConfiguration *config, 
Error **errp)
  * explicit configuration like "cpus=0" is not allowed.
  */
 if ((config->has_cpus && config->cpus == 0) ||
+(config->has_books && config->books == 0) ||
 (config->has_sockets && config->sockets == 0) ||
 (config->has_dies && config->dies == 0) ||
 (config->has_cores && config->cores == 0) ||
@@ -98,6 +104,13 @@ void smp_parse(MachineState *ms, SMPConfiguration *config, 
Error **errp)
 
 dies = dies > 0 ? dies : 1;
 
+if (!mc->smp_props.books_supported && books > 1) {
+error_setg(errp, "books not supported by this machine's CPU topology");
+return;
+}
+
+books = books > 0 ? books : 1;
+
 /* compute missing values based on the provided ones */
 if (cpus == 0 && maxcpus == 0) {
 sockets = sockets > 0 ? sockets : 1;
@@ -111,33 +124,34 @@ void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 if (sockets == 0) {
 cores = cores > 0 ? cores : 1;
 threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (dies * cores * threads);
+sockets = maxcpus / (books * dies * cores * threads);
 } else if (cores == 0) {
 threads = threads > 0 ? threads : 1;
-cores = maxcpus / (sockets * dies * threads);
+cores = maxcpus / (books * sockets * dies * threads);
 }
 } else {
 /* prefer cores over sockets since 6.2 */
 if (cores == 0) {
 sockets = sockets > 0 ? sockets : 1;
 threads = threads > 0 ? threads : 1;
-cores = maxcpus / (sockets * dies * threads);
+cores = maxcpus / (books * sockets * dies * threads);
 } else if (sockets == 0) {
 threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (dies * cores * threads);
+sockets = maxcpus / (books * dies * cores * threads);
 }
 }
 
 /* try to calculate omitted threads at last */
 if (threads == 0) {
-threads = maxcpus / (sockets * dies * cores);
+threads = maxcpus / (books * sockets * dies * cores);
 }
 }
 
-maxcpus = maxcpus > 0 ? maxcpus : sockets * dies * cores * threads;
+maxcpus = maxcpus > 0 ? maxcpus : books * sockets * dies * cores * threads;
 cpus = cpus > 0 ? cpus : maxcpus;
 
 ms->smp.cpus = cpus;
+ms->smp.books = books;
 ms->smp.sockets = sockets;
 ms->smp.dies = dies;
 ms->smp.cores = cores;
@@ -145,7 +159,7 @@ void smp_parse(MachineState *ms, SMPConfiguration *config, 
Error **errp)
 ms->smp.max_cpus = maxcpus;
 
 /* sanity-check of the computed topology */
-if (sockets * dies * cores * threads != maxcpus) {
+if (books * sockets * dies * cores * threads != maxcpus) {
 g_autofree char *topo_msg = cpu_hierarchy_to_string(ms);
 error_setg(errp, "Invalid CPU topology: "
"product of the hierarchy must match maxcpus: "
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 53a99abc56..d98c9105f7 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -740,6 +740,7 @@ static void machine_get_smp(Object *obj, Visitor *v, const 
char *name,
 MachineState *ms =

[PATCH v5 10/12] s390: topology: Adding drawers to STSI

2021-12-09 Thread Pierre Morel

Let's add STSI support for the container level 4, drawers,
and provide the information back to the guest.

Signed-off-by: Pierre Morel 
---
 hw/s390x/cpu-topology.c | 137 +---
 include/hw/s390x/cpu-topology.h |  19 -
 include/hw/s390x/sclp.h |   2 +-
 target/s390x/cpu_topology.c |  40 --
 4 files changed, 176 insertions(+), 22 deletions(-)

diff --git a/hw/s390x/cpu-topology.c b/hw/s390x/cpu-topology.c
index 43eff650d9..f1048ba648 100644
--- a/hw/s390x/cpu-topology.c
+++ b/hw/s390x/cpu-topology.c
@@ -81,6 +81,26 @@ static S390TopologyBook *s390_create_book(S390TopologyDrawer 
*drawer, int id)
 return book;
 }
 
+static S390TopologyDrawer *s390_create_drawer(S390TopologyNode *node, int id)
+{
+DeviceState *dev;
+S390TopologyDrawer *drawer;
+const MachineState *ms = MACHINE(qdev_get_machine());
+
+if (node->bus->num_children >= ms->smp.drawers) {
+return NULL;
+}
+
+dev = qdev_new(TYPE_S390_TOPOLOGY_DRAWER);
+qdev_realize_and_unref(dev, node->bus, &error_fatal);
+
+drawer = S390_TOPOLOGY_DRAWER(dev);
+drawer->drawer_id = id;
+node->cnt++;
+
+return drawer;
+}
+
 /*
  * s390_get_cores:
  * @socket: the socket to search into
@@ -130,6 +150,31 @@ static S390TopologySocket 
*s390_get_socket(S390TopologyBook *book,
 return s390_create_socket(book, socket_id);
 }
 
+/*
+ * s390_get_drawer:
+ * @node: The node to search into
+ * @drawer_id: the identifier of the drawer to search for
+ *
+ * Returns a pointer to the S390TopologyDrawer structure within the node
+ * having the specified drawer_id.
+ * First search whether the node already contains the S390TopologyDrawer
+ * structure and, if not, create one with this drawer_id.
+ */
+static S390TopologyDrawer *s390_get_drawer(S390TopologyNode *node,
+   int drawer_id)
+{
+S390TopologyDrawer *drawer;
+BusChild *kid;
+
+QTAILQ_FOREACH(kid, &node->bus->children, sibling) {
+drawer = S390_TOPOLOGY_DRAWER(kid->child);
+if (drawer->drawer_id == drawer_id) {
+return drawer;
+}
+}
+return s390_create_drawer(node, drawer_id);
+}
+
 /*
  * s390_get_book:
  * @drawer: The drawer to search into
@@ -169,6 +214,7 @@ static S390TopologyBook *s390_get_book(S390TopologyDrawer 
*drawer,
 void s390_topology_new_cpu(int core_id)
 {
 const MachineState *ms = MACHINE(qdev_get_machine());
+S390TopologyNode *node;
 S390TopologyDrawer *drawer;
 S390TopologyBook *book;
 S390TopologySocket *socket;
@@ -176,13 +222,16 @@ void s390_topology_new_cpu(int core_id)
 int origin, bit;
 int nb_cores_per_socket;
 int nb_cores_per_book;
+int nb_cores_per_drawer;
 
-drawer = s390_get_topology();
+node = s390_get_topology();
 
 /* Cores for the S390 topology are cores and threads of the QEMU topology 
*/
 nb_cores_per_socket = ms->smp.cores * ms->smp.threads;
 nb_cores_per_book = ms->smp.sockets * nb_cores_per_socket;
+nb_cores_per_drawer = ms->smp.books * nb_cores_per_book;
 
+drawer = s390_get_drawer(node, core_id / nb_cores_per_drawer);
 book = s390_get_book(drawer, core_id / nb_cores_per_book);
 socket = s390_get_socket(book, core_id / nb_cores_per_socket);
 
@@ -216,23 +265,23 @@ void s390_topology_setup(MachineState *ms)
 DeviceState *dev;
 
 /* Create BOOK bridge device */
-dev = qdev_new(TYPE_S390_TOPOLOGY_DRAWER);
+dev = qdev_new(TYPE_S390_TOPOLOGY_NODE);
 object_property_add_child(qdev_get_machine(),
-  TYPE_S390_TOPOLOGY_DRAWER, OBJECT(dev));
+  TYPE_S390_TOPOLOGY_NODE, OBJECT(dev));
 sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), &error_fatal);
 }
 
-S390TopologyDrawer *s390_get_topology(void)
+S390TopologyNode *s390_get_topology(void)
 {
-static S390TopologyDrawer *drawer;
+static S390TopologyNode *node;
 
-if (!drawer) {
-drawer = 
S390_TOPOLOGY_DRAWER(object_resolve_path(TYPE_S390_TOPOLOGY_DRAWER,
-  NULL));
-assert(drawer != NULL);
+if (!node) {
+node = S390_TOPOLOGY_NODE(object_resolve_path(TYPE_S390_TOPOLOGY_NODE,
+  NULL));
+assert(node != NULL);
 }
 
-return drawer;
+return node;
 }
 
 /* --- CORES Definitions --- */
@@ -455,6 +504,7 @@ static void drawer_class_init(ObjectClass *oc, void *data)
 
 hc->unplug = qdev_simple_device_unplug_cb;
 set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+dc->bus_type = TYPE_S390_TOPOLOGY_NODE_BUS;
 dc->realize = s390_drawer_device_realize;
 device_class_set_props(dc, s390_topology_drawer_properties);
 dc->desc = "topology drawer";
@@ -462,7 +512,7 @@ static void drawer_class_init(ObjectClass *oc, void *data)
 
 static const TypeInfo drawer_info = {
 .name  = TYPE_S390_TOPOLOGY_DRAWER,
-.parent=

1 2 >

1 - 100 of 133 matches

Mail list logo