date:20210927

Now that we have a common structure, SMPCompatProps, for storing
SMP compatibility properties, we can also move smp_prefer_sockets
into it for cleaner code.

No functional change intended.

Signed-off-by: Yanan Wang 
Acked-by: David Gibson 
Reviewed-by: Andrew Jones 
---
 hw/arm/virt.c  | 2 +-
 hw/core/machine.c  | 2 +-
 hw/i386/pc_piix.c  | 2 +-
 hw/i386/pc_q35.c   | 2 +-
 hw/ppc/spapr.c | 2 +-
 hw/s390x/s390-virtio-ccw.c | 2 +-
 include/hw/boards.h| 3 ++-
 7 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 8c13deb5db..7170aaacd5 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2815,7 +2815,7 @@ static void virt_machine_6_1_options(MachineClass *mc)
 
 virt_machine_6_2_options(mc);
 compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
-mc->smp_prefer_sockets = true;
+mc->smp_props.prefer_sockets = true;
 
 /* qemu ITS was introduced with 6.2 */
 vmc->no_tcg_its = true;
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 23f77201eb..e2a48aa18c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -817,7 +817,7 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 } else {
 maxcpus = maxcpus > 0 ? maxcpus : cpus;
 
-if (mc->smp_prefer_sockets) {
+if (mc->smp_props.prefer_sockets) {
 /* prefer sockets over cores before 6.2 */
 if (sockets == 0) {
 cores = cores > 0 ? cores : 1;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 077644ee9c..5efb6f1949 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -431,7 +431,7 @@ static void pc_i440fx_6_1_machine_options(MachineClass *m)
 m->is_default = false;
 compat_props_add(m->compat_props, hw_compat_6_1, hw_compat_6_1_len);
 compat_props_add(m->compat_props, pc_compat_6_1, pc_compat_6_1_len);
-m->smp_prefer_sockets = true;
+m->smp_props.prefer_sockets = true;
 }
 
 DEFINE_I440FX_MACHINE(v6_1, "pc-i440fx-6.1", NULL,
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 2d97c0ab3e..9eae40e32c 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -371,7 +371,7 @@ static void pc_q35_6_1_machine_options(MachineClass *m)
 m->alias = NULL;
 compat_props_add(m->compat_props, hw_compat_6_1, hw_compat_6_1_len);
 compat_props_add(m->compat_props, pc_compat_6_1, pc_compat_6_1_len);
-m->smp_prefer_sockets = true;
+m->smp_props.prefer_sockets = true;
 }
 
 DEFINE_Q35_MACHINE(v6_1, "pc-q35-6.1", NULL,
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index a481fade51..efdea43c0d 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -4702,7 +4702,7 @@ static void spapr_machine_6_1_class_options(MachineClass 
*mc)
 {
 spapr_machine_6_2_class_options(mc);
 compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
-mc->smp_prefer_sockets = true;
+mc->smp_props.prefer_sockets = true;
 }
 
 DEFINE_SPAPR_MACHINE(6_1, "6.1", false);
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 5401c985cf..653587ea62 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -814,7 +814,7 @@ static void ccw_machine_6_1_class_options(MachineClass *mc)
 {
 ccw_machine_6_2_class_options(mc);
 compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
-mc->smp_prefer_sockets = true;
+mc->smp_props.prefer_sockets = true;
 }
 DEFINE_CCW_MACHINE(6_1, "6.1", false);
 
diff --git a/include/hw/boards.h b/include/hw/boards.h
index fa284e01e9..5adbcbb99b 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -110,9 +110,11 @@ typedef struct {
 
 /**
  * SMPCompatProps:
+ * @prefer_sockets - whether sockets are preferred over cores in smp parsing
  * @dies_supported - whether dies are supported by the machine
  */
 typedef struct {
+bool prefer_sockets;
 bool dies_supported;
 } SMPCompatProps;
 
@@ -250,7 +252,6 @@ struct MachineClass {
 bool nvdimm_supported;
 bool numa_mem_supported;
 bool auto_enable_numa;
-bool smp_prefer_sockets;
 SMPCompatProps smp_props;
 const char *default_ram_id;
 
-- 
2.19.1

[PATCH v11 04/14] machine: Set the value of cpus to match maxcpus if it's omitted

Currently we calculate the omitted cpus directly from the given
incomplete collection of parameters. As a result, cmdlines like:
  -smp maxcpus=16
  -smp sockets=2,maxcpus=16
  -smp sockets=2,dies=2,maxcpus=16
  -smp sockets=2,cores=4,maxcpus=16
do not work. We should set the value of cpus to match maxcpus
if it's omitted, which will make the above configs start to work.

So the calculation logic of cpus/maxcpus after this patch will be:
When both maxcpus and cpus are omitted, maxcpus will be calculated
from the given parameters and cpus will be set equal to maxcpus.
When only one of maxcpus and cpus is given then the omitted one
will be set to its counterpart's value. Both maxcpus and cpus may
be specified, but maxcpus must be equal to or greater than cpus.

Note: the change in this patch won't affect any existing working
cmdlines, but it allows more incomplete configs to be valid.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
 hw/core/machine.c | 29 -
 hw/i386/pc.c  | 29 -
 qemu-options.hx   | 11 ---
 3 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 56bd3033a5..fe935cb4a3 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -760,25 +760,28 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 }
 
 /* compute missing values, prefer sockets over cores over threads */
-maxcpus = maxcpus > 0 ? maxcpus : cpus;
-
-if (cpus == 0) {
+if (cpus == 0 && maxcpus == 0) {
 sockets = sockets > 0 ? sockets : 1;
 cores = cores > 0 ? cores : 1;
 threads = threads > 0 ? threads : 1;
-cpus = sockets * cores * threads;
+} else {
 maxcpus = maxcpus > 0 ? maxcpus : cpus;
-} else if (sockets == 0) {
-cores = cores > 0 ? cores : 1;
-threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (cores * threads);
-} else if (cores == 0) {
-threads = threads > 0 ? threads : 1;
-cores = maxcpus / (sockets * threads);
-} else if (threads == 0) {
-threads = maxcpus / (sockets * cores);
+
+if (sockets == 0) {
+cores = cores > 0 ? cores : 1;
+threads = threads > 0 ? threads : 1;
+sockets = maxcpus / (cores * threads);
+} else if (cores == 0) {
+threads = threads > 0 ? threads : 1;
+cores = maxcpus / (sockets * threads);
+} else if (threads == 0) {
+threads = maxcpus / (sockets * cores);
+}
 }
 
+maxcpus = maxcpus > 0 ? maxcpus : sockets * cores * threads;
+cpus = cpus > 0 ? cpus : maxcpus;
+
 if (sockets * cores * threads < cpus) {
 error_setg(errp, "cpu topology: "
"sockets (%u) * cores (%u) * threads (%u) < "
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 87c06d3991..d9382b7d57 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -727,25 +727,28 @@ static void pc_smp_parse(MachineState *ms, 
SMPConfiguration *config, Error **err
 dies = dies > 0 ? dies : 1;
 
 /* compute missing values, prefer sockets over cores over threads */
-maxcpus = maxcpus > 0 ? maxcpus : cpus;
-
-if (cpus == 0) {
+if (cpus == 0 && maxcpus == 0) {
 sockets = sockets > 0 ? sockets : 1;
 cores = cores > 0 ? cores : 1;
 threads = threads > 0 ? threads : 1;
-cpus = sockets * dies * cores * threads;
+} else {
 maxcpus = maxcpus > 0 ? maxcpus : cpus;
-} else if (sockets == 0) {
-cores = cores > 0 ? cores : 1;
-threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (dies * cores * threads);
-} else if (cores == 0) {
-threads = threads > 0 ? threads : 1;
-cores = maxcpus / (sockets * dies * threads);
-} else if (threads == 0) {
-threads = maxcpus / (sockets * dies * cores);
+
+if (sockets == 0) {
+cores = cores > 0 ? cores : 1;
+threads = threads > 0 ? threads : 1;
+sockets = maxcpus / (dies * cores * threads);
+} else if (cores == 0) {
+threads = threads > 0 ? threads : 1;
+cores = maxcpus / (sockets * dies * threads);
+} else if (threads == 0) {
+threads = maxcpus / (sockets * dies * cores);
+}
 }
 
+maxcpus = maxcpus > 0 ? maxcpus : sockets * dies * cores * threads;
+cpus = cpus > 0 ? cpus : maxcpus;
+
 if (sockets * dies * cores * threads < cpus) {
 error_setg(errp, "cpu topology: "
"sockets (%u) * dies (%u) * cores (%u) * threads (%u) < "
diff --git a/qemu-options.hx b/qemu-options.hx
index 91d859aa29..9d71a661bb 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -214,9 +214,14 @@ SRST
 Simulate a SMP system with '\ ``n``\ ' CPUs initially present on
 the machine type board. On boards supporting CPU hotplug, the optional
 '\ ``maxcpus``\ '

[PATCH v11 09/14] machine: Use ms instead of global current_machine in sanity-check

In the sanity check of smp_cpus and max_cpus against mc in function
machine_set_smp(), we currently use ms->smp.max_cpus for the check
but current_machine->smp.max_cpus in the error message.
Fix this by uniformly using the local ms.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
Reviewed-by: Pankaj Gupta 
Reviewed-by: Cornelia Huck 
---
 hw/core/machine.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 0df597f99c..1ad5dac3e8 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -881,7 +881,7 @@ static void machine_set_smp(Object *obj, Visitor *v, const 
char *name,
 } else if (ms->smp.max_cpus > mc->max_cpus) {
 error_setg(errp, "Invalid SMP CPUs %d. The max CPUs "
"supported by machine '%s' is %d",
-   current_machine->smp.max_cpus,
+   ms->smp.max_cpus,
mc->name, mc->max_cpus);
 }
 
-- 
2.19.1

[PATCH v11 14/14] machine: Put all sanity-check in the generic SMP parser

Put both the sanity check of the input SMP configuration and the
sanity check of the output SMP configuration in the generic parser.
machine_set_smp() then becomes cleaner, and all the invalid scenarios
can be tested just by calling the parser.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
Reviewed-by: Pankaj Gupta 
---
 hw/core/machine.c | 63 +++
 1 file changed, 31 insertions(+), 32 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index e2a48aa18c..637acd8d42 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -798,6 +798,20 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 unsigned threads = config->has_threads ? config->threads : 0;
 unsigned maxcpus = config->has_maxcpus ? config->maxcpus : 0;
 
+/*
+ * Specified CPU topology parameters must be greater than zero,
+ * explicit configuration like "cpus=0" is not allowed.
+ */
+if ((config->has_cpus && config->cpus == 0) ||
+(config->has_sockets && config->sockets == 0) ||
+(config->has_dies && config->dies == 0) ||
+(config->has_cores && config->cores == 0) ||
+(config->has_threads && config->threads == 0) ||
+(config->has_maxcpus && config->maxcpus == 0)) {
+warn_report("Invalid CPU topology deprecated: "
+"CPU topology parameters must be greater than zero");
+}
+
 /*
  * If not supported by the machine, a topology parameter must be
  * omitted or specified equal to 1.
@@ -873,6 +887,22 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
topo_msg, maxcpus, cpus);
 return;
 }
+
+if (ms->smp.cpus < mc->min_cpus) {
+error_setg(errp, "Invalid SMP CPUs %d. The min CPUs "
+   "supported by machine '%s' is %d",
+   ms->smp.cpus,
+   mc->name, mc->min_cpus);
+return;
+}
+
+if (ms->smp.max_cpus > mc->max_cpus) {
+error_setg(errp, "Invalid SMP CPUs %d. The max CPUs "
+   "supported by machine '%s' is %d",
+   ms->smp.max_cpus,
+   mc->name, mc->max_cpus);
+return;
+}
 }
 
 static void machine_get_smp(Object *obj, Visitor *v, const char *name,
@@ -895,7 +925,6 @@ static void machine_get_smp(Object *obj, Visitor *v, const 
char *name,
 static void machine_set_smp(Object *obj, Visitor *v, const char *name,
 void *opaque, Error **errp)
 {
-MachineClass *mc = MACHINE_GET_CLASS(obj);
 MachineState *ms = MACHINE(obj);
 SMPConfiguration *config;
 ERRP_GUARD();
@@ -904,40 +933,10 @@ static void machine_set_smp(Object *obj, Visitor *v, 
const char *name,
 return;
 }
 
-/*
- * Specified CPU topology parameters must be greater than zero,
- * explicit configuration like "cpus=0" is not allowed.
- */
-if ((config->has_cpus && config->cpus == 0) ||
-(config->has_sockets && config->sockets == 0) ||
-(config->has_dies && config->dies == 0) ||
-(config->has_cores && config->cores == 0) ||
-(config->has_threads && config->threads == 0) ||
-(config->has_maxcpus && config->maxcpus == 0)) {
-warn_report("Invalid CPU topology deprecated: "
-"CPU topology parameters must be greater than zero");
-}
-
 smp_parse(ms, config, errp);
 if (*errp) {
-goto out_free;
-}
-
-/* sanity-check smp_cpus and max_cpus against mc */
-if (ms->smp.cpus < mc->min_cpus) {
-error_setg(errp, "Invalid SMP CPUs %d. The min CPUs "
-   "supported by machine '%s' is %d",
-   ms->smp.cpus,
-   mc->name, mc->min_cpus);
-} else if (ms->smp.max_cpus > mc->max_cpus) {
-error_setg(errp, "Invalid SMP CPUs %d. The max CPUs "
-   "supported by machine '%s' is %d",
-   ms->smp.max_cpus,
-   mc->name, mc->max_cpus);
+qapi_free_SMPConfiguration(config);
 }
-
-out_free:
-qapi_free_SMPConfiguration(config);
 }
 
 static void machine_class_init(ObjectClass *oc, void *data)
-- 
2.19.1

[PATCH v11 06/14] qtest/numa-test: Use detailed -smp CLIs in pc_dynamic_cpu_cfg

Since commit 80d7835749 (qemu-options: rewrite help for -smp options),
the preference of sockets/cores in -smp parsing is considered liable
to change, and actually we are going to change it in a coming commit.
So it'll be more stable to use detailed -smp CLIs in testing if we
have a strong dependency on the parsing results.

pc_dynamic_cpu_cfg currently assumes/needs that there will be 2 CPU
sockets with "-smp 2". To avoid breaking the test because of the parsing
logic change, now explicitly use "-smp 2,sockets=2".

Cc: Paolo Bonzini 
Cc: Igor Mammedov 
Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
 tests/qtest/numa-test.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/qtest/numa-test.c b/tests/qtest/numa-test.c
index c677cd63c4..fd7a2e80a0 100644
--- a/tests/qtest/numa-test.c
+++ b/tests/qtest/numa-test.c
@@ -265,7 +265,8 @@ static void pc_dynamic_cpu_cfg(const void *data)
 QTestState *qs;
 g_autofree char *cli = NULL;
 
-cli = make_cli(data, "-nodefaults --preconfig -machine smp.cpus=2");
+cli = make_cli(data, "-nodefaults --preconfig "
+ "-machine smp.cpus=2,smp.sockets=2");
 qs = qtest_init(cli);
 
 /* create 2 numa nodes */
-- 
2.19.1

[PATCH v11 03/14] machine: Uniformly use maxcpus to calculate the omitted parameters

We are currently using maxcpus to calculate the omitted sockets
but using cpus to calculate the omitted cores/threads. This makes
cmdlines like:
  -smp cpus=8,maxcpus=16
  -smp cpus=8,cores=4,maxcpus=16
  -smp cpus=8,threads=2,maxcpus=16
work fine but the ones like:
  -smp cpus=8,sockets=2,maxcpus=16
  -smp cpus=8,sockets=2,cores=4,maxcpus=16
  -smp cpus=8,sockets=2,threads=2,maxcpus=16
break the sanity check.

Since a valid config requires that the product "sockets * cores
* threads" be equal to maxcpus, we should uniformly use maxcpus
to calculate their omitted values.

Also the if-branch of "cpus == 0 || sockets == 0" was split into two
branches of "cpus == 0" and "sockets == 0" so that we can clearly read
that we are parsing the configuration with a preference on cpus over
sockets over cores over threads.

Note: the change in this patch won't affect any existing working cmdlines,
but it improves consistency and allows more incomplete configs to be valid.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
Reviewed-by: Pankaj Gupta 
---
 hw/core/machine.c | 30 +++---
 hw/i386/pc.c  | 30 +++---
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index cf9cf53911..56bd3033a5 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -760,24 +760,26 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 }
 
 /* compute missing values, prefer sockets over cores over threads */
-if (cpus == 0 || sockets == 0) {
+maxcpus = maxcpus > 0 ? maxcpus : cpus;
+
+if (cpus == 0) {
+sockets = sockets > 0 ? sockets : 1;
 cores = cores > 0 ? cores : 1;
 threads = threads > 0 ? threads : 1;
-if (cpus == 0) {
-sockets = sockets > 0 ? sockets : 1;
-cpus = cores * threads * sockets;
-} else {
-maxcpus = maxcpus > 0 ? maxcpus : cpus;
-sockets = maxcpus / (cores * threads);
-}
+cpus = sockets * cores * threads;
+maxcpus = maxcpus > 0 ? maxcpus : cpus;
+} else if (sockets == 0) {
+cores = cores > 0 ? cores : 1;
+threads = threads > 0 ? threads : 1;
+sockets = maxcpus / (cores * threads);
 } else if (cores == 0) {
 threads = threads > 0 ? threads : 1;
-cores = cpus / (sockets * threads);
-cores = cores > 0 ? cores : 1;
+cores = maxcpus / (sockets * threads);
 } else if (threads == 0) {
-threads = cpus / (cores * sockets);
-threads = threads > 0 ? threads : 1;
-} else if (sockets * cores * threads < cpus) {
+threads = maxcpus / (sockets * cores);
+}
+
+if (sockets * cores * threads < cpus) {
 error_setg(errp, "cpu topology: "
"sockets (%u) * cores (%u) * threads (%u) < "
"smp_cpus (%u)",
@@ -785,8 +787,6 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 return;
 }
 
-maxcpus = maxcpus > 0 ? maxcpus : cpus;
-
 if (maxcpus < cpus) {
 error_setg(errp, "maxcpus must be equal to or greater than smp");
 return;
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 93dc322a97..87c06d3991 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -727,24 +727,26 @@ static void pc_smp_parse(MachineState *ms, 
SMPConfiguration *config, Error **err
 dies = dies > 0 ? dies : 1;
 
 /* compute missing values, prefer sockets over cores over threads */
-if (cpus == 0 || sockets == 0) {
+maxcpus = maxcpus > 0 ? maxcpus : cpus;
+
+if (cpus == 0) {
+sockets = sockets > 0 ? sockets : 1;
 cores = cores > 0 ? cores : 1;
 threads = threads > 0 ? threads : 1;
-if (cpus == 0) {
-sockets = sockets > 0 ? sockets : 1;
-cpus = cores * threads * dies * sockets;
-} else {
-maxcpus = maxcpus > 0 ? maxcpus : cpus;
-sockets = maxcpus / (dies * cores * threads);
-}
+cpus = sockets * dies * cores * threads;
+maxcpus = maxcpus > 0 ? maxcpus : cpus;
+} else if (sockets == 0) {
+cores = cores > 0 ? cores : 1;
+threads = threads > 0 ? threads : 1;
+sockets = maxcpus / (dies * cores * threads);
 } else if (cores == 0) {
 threads = threads > 0 ? threads : 1;
-cores = cpus / (sockets * dies * threads);
-cores = cores > 0 ? cores : 1;
+cores = maxcpus / (sockets * dies * threads);
 } else if (threads == 0) {
-threads = cpus / (cores * dies * sockets);
-threads = threads > 0 ? threads : 1;
-} else if (sockets * dies * cores * threads < cpus) {
+threads = maxcpus / (sockets * dies * cores);
+}
+
+if (sockets * dies * cores * threads < cpus) {
 error_setg(errp, "cpu topology: "
"sockets (%u) * dies (%u) * cores (%u) * threads (%u) < "

[PATCH v11 11/14] machine: Make smp_parse generic enough for all arches

Currently the only difference between smp_parse and pc_smp_parse
is the support for the dies parameter and the related error reporting.
With some arch compat variables like "bool dies_supported", we can
make smp_parse generic enough for all arches, and the PC-specific
one can be removed.

Making smp_parse() generic enough reduces code duplication and
eases code maintenance, and also allows extending the topology
with more arch-specific members (e.g., clusters) in the future.

Suggested-by: Andrew Jones 
Suggested-by: Daniel P. Berrange 
Signed-off-by: Yanan Wang 
---
 hw/core/machine.c   | 91 +++--
 hw/i386/pc.c| 84 +
 include/hw/boards.h |  9 +
 3 files changed, 81 insertions(+), 103 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index a21fcd7700..f5dadcbd78 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -746,20 +746,69 @@ void machine_set_cpu_numa_node(MachineState *machine,
 }
 }
 
+/*
+ * Report information of a machine's supported CPU topology hierarchy.
+ * Topology members will be ordered from the largest to the smallest
+ * in the string.
+ */
+static char *cpu_hierarchy_to_string(MachineState *ms)
+{
+MachineClass *mc = MACHINE_GET_CLASS(ms);
+GString *s = g_string_new(NULL);
+
+g_string_append_printf(s, "sockets (%u)", ms->smp.sockets);
+
+if (mc->smp_props.dies_supported) {
+g_string_append_printf(s, " * dies (%u)", ms->smp.dies);
+}
+
+g_string_append_printf(s, " * cores (%u)", ms->smp.cores);
+g_string_append_printf(s, " * threads (%u)", ms->smp.threads);
+
+return g_string_free(s, false);
+}
+
+/*
+ * smp_parse - Generic function used to parse the given SMP configuration
+ *
+ * Any missing parameter in "cpus/maxcpus/sockets/cores/threads" will be
+ * automatically computed based on the provided ones.
+ *
+ * In the calculation of omitted sockets/cores/threads: we prefer sockets
+ * over cores over threads before 6.2, while preferring cores over sockets
+ * over threads since 6.2.
+ *
+ * In the calculation of cpus/maxcpus: When both maxcpus and cpus are omitted,
+ * maxcpus will be computed from the given parameters and cpus will be set
+ * equal to maxcpus. When only one of maxcpus and cpus is given then the
+ * omitted one will be set to its given counterpart's value. Both maxcpus and
+ * cpus may be specified, but maxcpus must be equal to or greater than cpus.
+ *
+ * For compatibility, apart from the parameters that will be computed, newly
+ * introduced topology members which are likely to be target specific should
+ * be directly set as 1 if they are omitted (e.g. dies for PC since 4.1).
+ */
 static void smp_parse(MachineState *ms, SMPConfiguration *config, Error **errp)
 {
 MachineClass *mc = MACHINE_GET_CLASS(ms);
 unsigned cpus= config->has_cpus ? config->cpus : 0;
 unsigned sockets = config->has_sockets ? config->sockets : 0;
+unsigned dies= config->has_dies ? config->dies : 0;
 unsigned cores   = config->has_cores ? config->cores : 0;
 unsigned threads = config->has_threads ? config->threads : 0;
 unsigned maxcpus = config->has_maxcpus ? config->maxcpus : 0;
 
-if (config->has_dies && config->dies > 1) {
+/*
+ * If not supported by the machine, a topology parameter must be
+ * omitted or specified equal to 1.
+ */
+if (!mc->smp_props.dies_supported && dies > 1) {
 error_setg(errp, "dies not supported by this machine's CPU topology");
 return;
 }
 
+dies = dies > 0 ? dies : 1;
+
 /* compute missing values based on the provided ones */
 if (cpus == 0 && maxcpus == 0) {
 sockets = sockets > 0 ? sockets : 1;
@@ -773,55 +822,57 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 if (sockets == 0) {
 cores = cores > 0 ? cores : 1;
 threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (cores * threads);
+sockets = maxcpus / (dies * cores * threads);
 } else if (cores == 0) {
 threads = threads > 0 ? threads : 1;
-cores = maxcpus / (sockets * threads);
+cores = maxcpus / (sockets * dies * threads);
 }
 } else {
 /* prefer cores over sockets since 6.2 */
 if (cores == 0) {
 sockets = sockets > 0 ? sockets : 1;
 threads = threads > 0 ? threads : 1;
-cores = maxcpus / (sockets * threads);
+cores = maxcpus / (sockets * dies * threads);
 } else if (sockets == 0) {
 threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (cores * threads);
+sockets = maxcpus / (dies * cores * threads);
 }
 }
 
 /* try to calculate omitted threads at last */
 if (threads == 0)

[PATCH v11 12/14] machine: Remove smp_parse callback from MachineClass

Now we have a generic smp parser for all arches, and there will
not be any other arch specific ones, so let's remove the callback
from MachineClass and call the parser directly.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
 hw/core/machine.c   | 3 +--
 include/hw/boards.h | 5 -
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index f5dadcbd78..23f77201eb 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -918,7 +918,7 @@ static void machine_set_smp(Object *obj, Visitor *v, const 
char *name,
 "CPU topology parameters must be greater than zero");
 }
 
-mc->smp_parse(ms, config, errp);
+smp_parse(ms, config, errp);
 if (*errp) {
 goto out_free;
 }
@@ -947,7 +947,6 @@ static void machine_class_init(ObjectClass *oc, void *data)
 /* Default 128 MB as guest ram size */
 mc->default_ram_size = 128 * MiB;
 mc->rom_file_has_mr = true;
-mc->smp_parse = smp_parse;
 
 /* numa node memory size aligned on 8MB by default.
  * On Linux, each node's border has to be 8MB aligned
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 72a23e4e0f..fa284e01e9 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -177,10 +177,6 @@ typedef struct {
  *kvm-type may be NULL if it is not needed.
  * @numa_mem_supported:
  *true if '--numa node.mem' option is supported and false otherwise
- * @smp_parse:
- *The function pointer to hook different machine specific functions for
- *parsing "smp-opts" from QemuOpts to MachineState::CpuTopology and more
- *machine specific topology fields, such as smp_dies for PCMachine.
  * @hotplug_allowed:
  *If the hook is provided, then it'll be called for each device
  *hotplug to check whether the device hotplug is allowed.  Return
@@ -217,7 +213,6 @@ struct MachineClass {
 void (*reset)(MachineState *state);
 void (*wakeup)(MachineState *state);
 int (*kvm_type)(MachineState *machine, const char *arg);
-void (*smp_parse)(MachineState *ms, SMPConfiguration *config, Error 
**errp);
 
 BlockInterfaceType block_default_type;
 int units_per_default_bus;
-- 
2.19.1

[PATCH v11 08/14] machine: Prefer cores over sockets in smp parsing since 6.2

In the real SMP hardware topology world, it's much more likely that
we have high cores-per-socket counts and few sockets in total. However,
the current preference for sockets over cores in smp parsing results
in a virtual cpu topology with low cores-per-socket counts and a
large number of sockets, which is just the opposite of the real world.

Given that it is better to make the virtual cpu topology be more
reflective of the real world and also for the sake of compatibility,
we start to prefer cores over sockets over threads in smp parsing
since machine type 6.2 for different arches.

In this patch, a boolean "smp_prefer_sockets" is added, and we only
enable the old preference on older machines and enable the new one
since type 6.2 for all arches by using the machine compat mechanism.

Suggested-by: Daniel P. Berrange 
Signed-off-by: Yanan Wang 
Acked-by: David Gibson 
Acked-by: Cornelia Huck 
Reviewed-by: Andrew Jones 
Reviewed-by: Pankaj Gupta 
---
 hw/arm/virt.c  |  1 +
 hw/core/machine.c  | 35 ++-
 hw/i386/pc.c   | 35 ++-
 hw/i386/pc_piix.c  |  1 +
 hw/i386/pc_q35.c   |  1 +
 hw/ppc/spapr.c |  1 +
 hw/s390x/s390-virtio-ccw.c |  1 +
 include/hw/boards.h|  1 +
 qemu-options.hx|  3 ++-
 9 files changed, 60 insertions(+), 19 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 1d59f0e59f..8c13deb5db 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -2815,6 +2815,7 @@ static void virt_machine_6_1_options(MachineClass *mc)
 
 virt_machine_6_2_options(mc);
 compat_props_add(mc->compat_props, hw_compat_6_1, hw_compat_6_1_len);
+mc->smp_prefer_sockets = true;
 
 /* qemu ITS was introduced with 6.2 */
 vmc->no_tcg_its = true;
diff --git a/hw/core/machine.c b/hw/core/machine.c
index f1b30b3101..0df597f99c 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -748,6 +748,7 @@ void machine_set_cpu_numa_node(MachineState *machine,
 
 static void smp_parse(MachineState *ms, SMPConfiguration *config, Error **errp)
 {
+MachineClass *mc = MACHINE_GET_CLASS(ms);
 unsigned cpus= config->has_cpus ? config->cpus : 0;
 unsigned sockets = config->has_sockets ? config->sockets : 0;
 unsigned cores   = config->has_cores ? config->cores : 0;
@@ -759,7 +760,7 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 return;
 }
 
-/* compute missing values, prefer sockets over cores over threads */
+/* compute missing values based on the provided ones */
 if (cpus == 0 && maxcpus == 0) {
 sockets = sockets > 0 ? sockets : 1;
 cores = cores > 0 ? cores : 1;
@@ -767,14 +768,30 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 } else {
 maxcpus = maxcpus > 0 ? maxcpus : cpus;
 
-if (sockets == 0) {
-cores = cores > 0 ? cores : 1;
-threads = threads > 0 ? threads : 1;
-sockets = maxcpus / (cores * threads);
-} else if (cores == 0) {
-threads = threads > 0 ? threads : 1;
-cores = maxcpus / (sockets * threads);
-} else if (threads == 0) {
+if (mc->smp_prefer_sockets) {
+/* prefer sockets over cores before 6.2 */
+if (sockets == 0) {
+cores = cores > 0 ? cores : 1;
+threads = threads > 0 ? threads : 1;
+sockets = maxcpus / (cores * threads);
+} else if (cores == 0) {
+threads = threads > 0 ? threads : 1;
+cores = maxcpus / (sockets * threads);
+}
+} else {
+/* prefer cores over sockets since 6.2 */
+if (cores == 0) {
+sockets = sockets > 0 ? sockets : 1;
+threads = threads > 0 ? threads : 1;
+cores = maxcpus / (sockets * threads);
+} else if (sockets == 0) {
+threads = threads > 0 ? threads : 1;
+sockets = maxcpus / (cores * threads);
+}
+}
+
+/* try to calculate omitted threads at last */
+if (threads == 0) {
 threads = maxcpus / (sockets * cores);
 }
 }
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index a37eef8057..447114e57a 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -716,6 +716,7 @@ void pc_acpi_smi_interrupt(void *opaque, int irq, int level)
  */
 static void pc_smp_parse(MachineState *ms, SMPConfiguration *config, Error 
**errp)
 {
+MachineClass *mc = MACHINE_GET_CLASS(ms);
 unsigned cpus= config->has_cpus ? config->cpus : 0;
 unsigned sockets = config->has_sockets ? config->sockets : 0;
 unsigned dies= config->has_dies ? config->dies : 0;
@@ -726,7 +727,7 @@ static void pc_smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **err
 /* directly default dies to 1 if it's omitted */
 dies = dies >

[PATCH v11 07/14] qtest/numa-test: Use detailed -smp CLIs in test_def_cpu_split

Since commit 80d7835749 (qemu-options: rewrite help for -smp options),
the preference of sockets/cores in -smp parsing is considered liable
to change, and actually we are going to change it in a coming commit.
So it'll be more stable to use detailed -smp CLIs in the testcases
that have a strong dependency on the parsing results.

Currently, test_def_cpu_split uses "-smp 8" and will get 8 CPU sockets
based on the current parsing rule. But if we change to prefer cores over
sockets, we will get one CPU socket with 8 cores, and this testcase
will not get the expected numa set by default on x86_64 (OK on aarch64).

So now explicitly use "-smp 8,sockets=8" to avoid being affected by the
parsing logic change.

Cc: Paolo Bonzini 
Cc: Igor Mammedov 
Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
 tests/qtest/numa-test.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/qtest/numa-test.c b/tests/qtest/numa-test.c
index fd7a2e80a0..90bf68a5b3 100644
--- a/tests/qtest/numa-test.c
+++ b/tests/qtest/numa-test.c
@@ -42,7 +42,8 @@ static void test_def_cpu_split(const void *data)
 g_autofree char *s = NULL;
 g_autofree char *cli = NULL;
 
-cli = make_cli(data, "-machine smp.cpus=8 -numa node,memdev=ram -numa 
node");
+cli = make_cli(data, "-machine smp.cpus=8,smp.sockets=8 "
+ "-numa node,memdev=ram -numa node");
 qts = qtest_init(cli);
 
 s = qtest_hmp(qts, "info numa");
-- 
2.19.1

[PATCH v11 02/14] machine: Minor refactor/fix for the smp parsers

To pave the way for the functional improvement in later patches,
do some refactoring/cleanup of the smp parsers, including using
local maxcpus instead of ms->smp.max_cpus in the calculation,
defaulting dies to 0 initially like other members, and cleaning up
the sanity check for dies.

We actually also fix a hidden defect by avoiding directly using
the provided *zero value* in the calculation, which could cause
a segmentation fault (e.g. using dies=0 in the calculation).

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
 hw/core/machine.c | 18 ++
 hw/i386/pc.c  | 23 ++-
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 711e83db54..cf9cf53911 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -752,8 +752,9 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 unsigned sockets = config->has_sockets ? config->sockets : 0;
 unsigned cores   = config->has_cores ? config->cores : 0;
 unsigned threads = config->has_threads ? config->threads : 0;
+unsigned maxcpus = config->has_maxcpus ? config->maxcpus : 0;
 
-if (config->has_dies && config->dies != 0 && config->dies != 1) {
+if (config->has_dies && config->dies > 1) {
 error_setg(errp, "dies not supported by this machine's CPU topology");
 return;
 }
@@ -766,8 +767,8 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 sockets = sockets > 0 ? sockets : 1;
 cpus = cores * threads * sockets;
 } else {
-ms->smp.max_cpus = config->has_maxcpus ? config->maxcpus : cpus;
-sockets = ms->smp.max_cpus / (cores * threads);
+maxcpus = maxcpus > 0 ? maxcpus : cpus;
+sockets = maxcpus / (cores * threads);
 }
 } else if (cores == 0) {
 threads = threads > 0 ? threads : 1;
@@ -784,26 +785,27 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 return;
 }
 
-ms->smp.max_cpus = config->has_maxcpus ? config->maxcpus : cpus;
+maxcpus = maxcpus > 0 ? maxcpus : cpus;
 
-if (ms->smp.max_cpus < cpus) {
+if (maxcpus < cpus) {
 error_setg(errp, "maxcpus must be equal to or greater than smp");
 return;
 }
 
-if (sockets * cores * threads != ms->smp.max_cpus) {
+if (sockets * cores * threads != maxcpus) {
 error_setg(errp, "Invalid CPU topology: "
"sockets (%u) * cores (%u) * threads (%u) "
"!= maxcpus (%u)",
sockets, cores, threads,
-   ms->smp.max_cpus);
+   maxcpus);
 return;
 }
 
 ms->smp.cpus = cpus;
+ms->smp.sockets = sockets;
 ms->smp.cores = cores;
 ms->smp.threads = threads;
-ms->smp.sockets = sockets;
+ms->smp.max_cpus = maxcpus;
 }
 
 static void machine_get_smp(Object *obj, Visitor *v, const char *name,
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 557d49c9f8..93dc322a97 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -718,9 +718,13 @@ static void pc_smp_parse(MachineState *ms, 
SMPConfiguration *config, Error **err
 {
 unsigned cpus= config->has_cpus ? config->cpus : 0;
 unsigned sockets = config->has_sockets ? config->sockets : 0;
-unsigned dies= config->has_dies ? config->dies : 1;
+unsigned dies= config->has_dies ? config->dies : 0;
 unsigned cores   = config->has_cores ? config->cores : 0;
 unsigned threads = config->has_threads ? config->threads : 0;
+unsigned maxcpus = config->has_maxcpus ? config->maxcpus : 0;
+
+/* directly default dies to 1 if it's omitted */
+dies = dies > 0 ? dies : 1;
 
 /* compute missing values, prefer sockets over cores over threads */
 if (cpus == 0 || sockets == 0) {
@@ -730,8 +734,8 @@ static void pc_smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **err
 sockets = sockets > 0 ? sockets : 1;
 cpus = cores * threads * dies * sockets;
 } else {
-ms->smp.max_cpus = config->has_maxcpus ? config->maxcpus : cpus;
-sockets = ms->smp.max_cpus / (cores * threads * dies);
+maxcpus = maxcpus > 0 ? maxcpus : cpus;
+sockets = maxcpus / (dies * cores * threads);
 }
 } else if (cores == 0) {
 threads = threads > 0 ? threads : 1;
@@ -748,27 +752,28 @@ static void pc_smp_parse(MachineState *ms, 
SMPConfiguration *config, Error **err
 return;
 }
 
-ms->smp.max_cpus = config->has_maxcpus ? config->maxcpus : cpus;
+maxcpus = maxcpus > 0 ? maxcpus : cpus;
 
-if (ms->smp.max_cpus < cpus) {
+if (maxcpus < cpus) {
 error_setg(errp, "maxcpus must be equal to or greater than smp");
 return;
 }
 
-if (sockets * dies * cores * threads != ms->smp.max_cpus) {
+if (sockets * dies * cores * threads != maxcpus) {

[PATCH v11 10/14] machine: Tweak the order of topology members in struct CpuTopology

Now that all the possible topology parameters are integrated in struct
CpuTopology, tweak the order of topology members to be "cpus/sockets/
dies/cores/threads/maxcpus" for readability and consistency. We also
tweak the comment by adding explanation of dies parameter.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
Reviewed-by: Pankaj Gupta 
---
 hw/core/machine.c   | 8 
 include/hw/boards.h | 7 ---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 1ad5dac3e8..a21fcd7700 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -829,11 +829,11 @@ static void machine_get_smp(Object *obj, Visitor *v, 
const char *name,
 {
 MachineState *ms = MACHINE(obj);
 SMPConfiguration *config = &(SMPConfiguration){
-.has_cores = true, .cores = ms->smp.cores,
+.has_cpus = true, .cpus = ms->smp.cpus,
 .has_sockets = true, .sockets = ms->smp.sockets,
 .has_dies = true, .dies = ms->smp.dies,
+.has_cores = true, .cores = ms->smp.cores,
 .has_threads = true, .threads = ms->smp.threads,
-.has_cpus = true, .cpus = ms->smp.cpus,
 .has_maxcpus = true, .maxcpus = ms->smp.max_cpus,
 };
 if (!visit_type_SMPConfiguration(v, name, , _abort)) {
@@ -1060,10 +1060,10 @@ static void machine_initfn(Object *obj)
 /* default to mc->default_cpus */
 ms->smp.cpus = mc->default_cpus;
 ms->smp.max_cpus = mc->default_cpus;
-ms->smp.cores = 1;
+ms->smp.sockets = 1;
 ms->smp.dies = 1;
+ms->smp.cores = 1;
 ms->smp.threads = 1;
-ms->smp.sockets = 1;
 }
 
 static void machine_finalize(Object *obj)
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 2ae039b74f..2a1bba86c0 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -275,17 +275,18 @@ typedef struct DeviceMemoryState {
 /**
  * CpuTopology:
  * @cpus: the number of present logical processors on the machine
- * @cores: the number of cores in one package
- * @threads: the number of threads in one core
  * @sockets: the number of sockets on the machine
+ * @dies: the number of dies in one socket
+ * @cores: the number of cores in one die
+ * @threads: the number of threads in one core
  * @max_cpus: the maximum number of logical processors on the machine
  */
 typedef struct CpuTopology {
 unsigned int cpus;
+unsigned int sockets;
 unsigned int dies;
 unsigned int cores;
 unsigned int threads;
-unsigned int sockets;
 unsigned int max_cpus;
 } CpuTopology;
 
-- 
2.19.1

[PATCH v11 00/14] machine: smp parsing fixes and improvement

Hi,

This is a new version (v11) with some updates in patch 11/14 suggested
by Daniel. Please have another look, thanks!

Summary of v11:
1) Specifying a CPU topology parameter as zero was implicitly allowed
but undocumented before, while now it's explicitly deprecated.

2) Refactor/fixes of the smp parsers.

3) For consistency, maxcpus is now uniformly used to calculate the
omitted topology members.

4) Improve the error reporting of the parsers.

5) It's also suggested that we should start to prefer cores over sockets
over threads on the newer machine types, which will make the computed
virtual topology more reflective of the real hardware. Related discussion
can be found in [1].
[1] https://lore.kernel.org/qemu-devel/ynigink00ynni...@redhat.com/

6) In order to reduce code duplication and ease the code maintenance,
smp_parse() is converted into a generic enough parser for all arches,
so that the arch-specific ones (e.g. pc_smp_parse) can be removed.
It's also convenient to introduce more topology members to the generic
parser in the future. Related discussions can be found in [2] and [3].
[2] 
https://lore.kernel.org/qemu-devel/20210630115602.txmvmfe2jrzu7...@gator.home/
[3] https://lore.kernel.org/qemu-devel/ypfn83pkbt7f9...@redhat.com/

Changelogs:

v10->v11:
- only update patch 11/14
  use GString APIs to build the cpu topology hierarchy string (Daniel)
  refine the comments of smp_parse()
- v10: 
https://lore.kernel.org/qemu-devel/20210926084541.17352-1-wangyana...@huawei.com/

v9->v10:
- rebased on latest upstream commit 11a1199846.
  there is no change of the patches in v10, except minor update
  in 08/14 to resolve merge conflict with master.
- To make this series more acceptable, drop the last two patches
  about SMP unit test, since the scalability of the test is not
  optimally designed after rethinking of it. So I will resend the
  test related patches separately after refining them.
- v9: 
https://lore.kernel.org/qemu-devel/20210910073025.16480-1-wangyana...@huawei.com/

Yanan Wang (14):
  machine: Deprecate "parameter=0" SMP configurations
  machine: Minor refactor/fix for the smp parsers
  machine: Uniformly use maxcpus to calculate the omitted parameters
  machine: Set the value of cpus to match maxcpus if it's omitted
  machine: Improve the error reporting of smp parsing
  qtest/numa-test: Use detailed -smp CLIs in pc_dynamic_cpu_cfg
  qtest/numa-test: Use detailed -smp CLIs in test_def_cpu_split
  machine: Prefer cores over sockets in smp parsing since 6.2
  machine: Use ms instead of global current_machine in sanity-check
  machine: Tweak the order of topology members in struct CpuTopology
  machine: Make smp_parse generic enough for all arches
  machine: Remove smp_parse callback from MachineClass
  machine: Move smp_prefer_sockets to struct SMPCompatProps
  machine: Put all sanity-check in the generic SMP parser

 docs/about/deprecated.rst  |  15 +++
 hw/arm/virt.c  |   1 +
 hw/core/machine.c  | 195 ++---
 hw/i386/pc.c   |  63 +---
 hw/i386/pc_piix.c  |   1 +
 hw/i386/pc_q35.c   |   1 +
 hw/ppc/spapr.c |   1 +
 hw/s390x/s390-virtio-ccw.c |   1 +
 include/hw/boards.h|  23 +++--
 qapi/machine.json  |   2 +-
 qemu-options.hx|  24 +++--
 tests/qtest/numa-test.c|   6 +-
 12 files changed, 195 insertions(+), 138 deletions(-)

--
2.19.1

[PATCH v11 05/14] machine: Improve the error reporting of smp parsing

We have two requirements for a valid SMP configuration:
the product of "sockets * cores * threads" must represent all the
possible cpus, i.e., max_cpus, and it must also cover the initially
present cpus, i.e., smp_cpus.

So we only need to ensure 1) "sockets * cores * threads == maxcpus"
first and then ensure 2) "maxcpus >= cpus". With the sanity checks in
a reasonable order, we can simplify the error reporting code.
When reporting an error message we also report the exact value of
each topology member so that users can easily see what's going on.

Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
Reviewed-by: Pankaj Gupta 
---
 hw/core/machine.c | 22 +-
 hw/i386/pc.c  | 24 ++--
 2 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index fe935cb4a3..f1b30b3101 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -782,25 +782,21 @@ static void smp_parse(MachineState *ms, SMPConfiguration 
*config, Error **errp)
 maxcpus = maxcpus > 0 ? maxcpus : sockets * cores * threads;
 cpus = cpus > 0 ? cpus : maxcpus;
 
-if (sockets * cores * threads < cpus) {
-error_setg(errp, "cpu topology: "
-   "sockets (%u) * cores (%u) * threads (%u) < "
-   "smp_cpus (%u)",
-   sockets, cores, threads, cpus);
+if (sockets * cores * threads != maxcpus) {
+error_setg(errp, "Invalid CPU topology: "
+   "product of the hierarchy must match maxcpus: "
+   "sockets (%u) * cores (%u) * threads (%u) "
+   "!= maxcpus (%u)",
+   sockets, cores, threads, maxcpus);
 return;
 }
 
 if (maxcpus < cpus) {
-error_setg(errp, "maxcpus must be equal to or greater than smp");
-return;
-}
-
-if (sockets * cores * threads != maxcpus) {
 error_setg(errp, "Invalid CPU topology: "
+   "maxcpus must be equal to or greater than smp: "
"sockets (%u) * cores (%u) * threads (%u) "
-   "!= maxcpus (%u)",
-   sockets, cores, threads,
-   maxcpus);
+   "== maxcpus (%u) < smp_cpus (%u)",
+   sockets, cores, threads, maxcpus, cpus);
 return;
 }
 
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index d9382b7d57..a37eef8057 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -749,25 +749,21 @@ static void pc_smp_parse(MachineState *ms, 
SMPConfiguration *config, Error **err
 maxcpus = maxcpus > 0 ? maxcpus : sockets * dies * cores * threads;
 cpus = cpus > 0 ? cpus : maxcpus;
 
-if (sockets * dies * cores * threads < cpus) {
-error_setg(errp, "cpu topology: "
-   "sockets (%u) * dies (%u) * cores (%u) * threads (%u) < "
-   "smp_cpus (%u)",
-   sockets, dies, cores, threads, cpus);
+if (sockets * dies * cores * threads != maxcpus) {
+error_setg(errp, "Invalid CPU topology: "
+   "product of the hierarchy must match maxcpus: "
+   "sockets (%u) * dies (%u) * cores (%u) * threads (%u) "
+   "!= maxcpus (%u)",
+   sockets, dies, cores, threads, maxcpus);
 return;
 }
 
 if (maxcpus < cpus) {
-error_setg(errp, "maxcpus must be equal to or greater than smp");
-return;
-}
-
-if (sockets * dies * cores * threads != maxcpus) {
-error_setg(errp, "Invalid CPU topology deprecated: "
+error_setg(errp, "Invalid CPU topology: "
+   "maxcpus must be equal to or greater than smp: "
"sockets (%u) * dies (%u) * cores (%u) * threads (%u) "
-   "!= maxcpus (%u)",
-   sockets, dies, cores, threads,
-   maxcpus);
+   "== maxcpus (%u) < smp_cpus (%u)",
+   sockets, dies, cores, threads, maxcpus, cpus);
 return;
 }
 
-- 
2.19.1

[PATCH v11 01/14] machine: Deprecate "parameter=0" SMP configurations

In the SMP configuration, we should either provide a topology
parameter with a reasonable value (greater than zero) or just
omit it and QEMU will compute the missing value.

Users shouldn't provide a configuration with any of its parameters
specified as zero (e.g. -smp 8,sockets=0), which could possibly
cause unexpected results in the -smp parsing. So we deprecate
this kind of configuration since 6.2 by adding an explicit
sanity check.

Signed-off-by: Yanan Wang 
Reviewed-by: Cornelia Huck 
---
 docs/about/deprecated.rst | 15 +++
 hw/core/machine.c | 14 ++
 qapi/machine.json |  2 +-
 qemu-options.hx   | 12 +++-
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 3c2be84d80..97415dbecd 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -160,6 +160,21 @@ Use ``-display sdl`` instead.
 
 Use ``-display curses`` instead.
 
+``-smp`` ("parameter=0" SMP configurations) (since 6.2)
+'''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+Specified CPU topology parameters must be greater than zero.
+
+In the SMP configuration, users should either provide a CPU topology
+parameter with a reasonable value (greater than zero) or just omit it
+and QEMU will compute the missing value.
+
+However, historically it was implicitly allowed for users to provide
+a parameter with zero value, which is meaningless and could also possibly
+cause unexpected results in the -smp parsing. So support for this kind of
+configurations (e.g. -smp 8,sockets=0) is deprecated since 6.2 and will
+be removed in the near future, users have to ensure that all the topology
+members described with -smp are greater than zero.
 
 Plugin argument passing through ``arg=`` (since 6.1)
 
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 067f42b528..711e83db54 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -835,6 +835,20 @@ static void machine_set_smp(Object *obj, Visitor *v, const 
char *name,
 return;
 }
 
+/*
+ * Specified CPU topology parameters must be greater than zero,
+ * explicit configuration like "cpus=0" is not allowed.
+ */
+if ((config->has_cpus && config->cpus == 0) ||
+(config->has_sockets && config->sockets == 0) ||
+(config->has_dies && config->dies == 0) ||
+(config->has_cores && config->cores == 0) ||
+(config->has_threads && config->threads == 0) ||
+(config->has_maxcpus && config->maxcpus == 0)) {
+warn_report("Invalid CPU topology deprecated: "
+"CPU topology parameters must be greater than zero");
+}
+
 mc->smp_parse(ms, config, errp);
 if (*errp) {
 goto out_free;
diff --git a/qapi/machine.json b/qapi/machine.json
index 32d47f4e35..227e75d8af 100644
--- a/qapi/machine.json
+++ b/qapi/machine.json
@@ -1331,7 +1331,7 @@
 #
 # @dies: number of dies per socket in the CPU topology
 #
-# @cores: number of cores per thread in the CPU topology
+# @cores: number of cores per die in the CPU topology
 #
 # @threads: number of threads per core in the CPU topology
 #
diff --git a/qemu-options.hx b/qemu-options.hx
index 8f603cc7e6..91d859aa29 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -227,11 +227,13 @@ SRST
 of computing the CPU maximum count.
 
 Either the initial CPU count, or at least one of the topology parameters
-must be specified. Values for any omitted parameters will be computed
-from those which are given. Historically preference was given to the
-coarsest topology parameters when computing missing values (ie sockets
-preferred over cores, which were preferred over threads), however, this
-behaviour is considered liable to change.
+must be specified. The specified parameters must be greater than zero,
+explicit configuration like "cpus=0" is not allowed. Values for any
+omitted parameters will be computed from those which are given.
+Historically preference was given to the coarsest topology parameters
+when computing missing values (ie sockets preferred over cores, which
+were preferred over threads), however, this behaviour is considered
+liable to change.
 ERST
 
 DEF("numa", HAS_ARG, QEMU_OPTION_numa,
-- 
2.19.1

[PATCH 0/1] hw: aspeed_gpio: Fix GPIO array indexing

From: Peter Delevoryas 

Hey everyone,

I think there might be a bug in aspeed_gpio_update, where it's selecting
a GPIO IRQ to update. The indexing that maps from GPIO pin to IRQ leads
to an out-of-bounds array access and a segfault after that.

tl;dr

There are 8 rows of 32 pins (8 * 32 == 256 total) on the AST2500, but
not all of them are active: only 228 pins are actually active on the
AST2500.

The GPIO IRQ array has length 228, but we index it using a matrix
indexing scheme like [row][column], and end up out-of-bounds for
high-numbered pins.

I fixed this by converting the IRQ array to a matrix, where some
of the entries are uninitialized (zero). This retains the matrix
indexing scheme, which I think is easy to understand.

Notes on reproducing:

I was testing booting Facebook's OpenBMC platform "YosemiteV2" (fby2)
and hit a segfault:

  qemu-system-arm -machine ast2500-evb \
  -drive file=fby2.mtd,format=raw,if=mtd \
  -serial stdio -display none
  ...
  Setup Caching for Bridge IC info..done.
  Setup Front Panel Daemon..done.
  Setup fan speed...
  FAN CONFIG : Single Rotor FAN
  Unexpected 4 Servers config! Run FSC 4 TLs Config as default config
  Setting Zone 0 speed to 70%
  Setting Zone 1 speed to 70%
  ok: run: fscd: (pid 1726) 0s
  done.
  Powering fru 1 to ON state...
  Segmentation fault (core dumped)

In gdb:

  Thread 3 "qemu-system-arm" received signal SIGSEGV, Segmentation fault.
  [Switching to Thread 0x720ee700 (LWP 1840353)]
  qemu_set_irq (irq=0x, level=1) at ../hw/core/irq.c:45
  45  irq->handler(irq->opaque, irq->n, level);
  (gdb) p irq
  $1 = (qemu_irq) 0x
  (gdb) up
  #1  0x558e36f5 in aspeed_gpio_update (s=0x77ecffb0, 
regs=0x77ed0c94, value=128) at ../hw/gpio/aspeed_gpio.c:287
  287 qemu_set_irq(s->gpios[offset], !!(new & mask));
  (gdb) p s->gpios
  $2 = {0x0 }
  (gdb) p offset
  $3 = 231
  (gdb) p set
  $5 = 7
  (gdb) p gpio
  $4 = 7

With my fix, I can boot the fby2 platform. The image I was using is here:

https://github.com/peterdelevoryas/openbmc/releases/tag/fby2.debug.mtd

Peter Delevoryas (1):
  hw: aspeed_gpio: Fix GPIO array indexing

 hw/gpio/aspeed_gpio.c | 72 ++-
 include/hw/gpio/aspeed_gpio.h |  5 +--
 2 files changed, 31 insertions(+), 46 deletions(-)

-- 
2.30.2

[PATCH 1/1] hw: aspeed_gpio: Fix GPIO array indexing

From: Peter Delevoryas 

The gpio array is declared as a dense array:

  qemu_irq gpios[ASPEED_GPIO_NR_PINS];

(AST2500 has 228, AST2400 has 216, AST2600 has 208)

However, this array is used like a matrix of GPIO sets
(e.g. gpio[NR_SETS][NR_PINS_PER_SET] = gpio[8][32])

  size_t offset = set * GPIOS_PER_SET + gpio;
  qemu_set_irq(s->gpios[offset], !!(new & mask));

This can result in an out-of-bounds access to "s->gpios" because the
gpio sets do _not_ have the same length. Some of the groups (e.g.
GPIOAB) only have 4 pins. 228 != 8 * 32 == 256.

To fix this, I converted the gpio array from dense to sparse, to match
both the hardware layout and this existing indexing code.

Fixes: 4b7f956862dc2db4c5c ("hw/gpio: Add basic Aspeed GPIO model for AST2400 
and AST2500")
Signed-off-by: Peter Delevoryas 
---
 hw/gpio/aspeed_gpio.c | 72 ++-
 include/hw/gpio/aspeed_gpio.h |  5 +--
 2 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/hw/gpio/aspeed_gpio.c b/hw/gpio/aspeed_gpio.c
index dfa6d6cb40..f04d4a454c 100644
--- a/hw/gpio/aspeed_gpio.c
+++ b/hw/gpio/aspeed_gpio.c
@@ -16,11 +16,7 @@
 #include "hw/irq.h"
 #include "migration/vmstate.h"
 
-#define GPIOS_PER_REG 32
-#define GPIOS_PER_SET GPIOS_PER_REG
-#define GPIO_PIN_GAP_SIZE 4
 #define GPIOS_PER_GROUP 8
-#define GPIO_GROUP_SHIFT 3
 
 /* GPIO Source Types */
 #define ASPEED_CMD_SRC_MASK 0x01010101
@@ -259,7 +255,7 @@ static void aspeed_gpio_update(AspeedGPIOState *s, GPIOSets 
*regs,
 
 diff = old ^ new;
 if (diff) {
-for (gpio = 0; gpio < GPIOS_PER_REG; gpio++) {
+for (gpio = 0; gpio < ASPEED_GPIOS_PER_SET; gpio++) {
 uint32_t mask = 1 << gpio;
 
 /* If the gpio needs to be updated... */
@@ -283,8 +279,7 @@ static void aspeed_gpio_update(AspeedGPIOState *s, GPIOSets 
*regs,
 if (direction & mask) {
 /* ...trigger the line-state IRQ */
 ptrdiff_t set = aspeed_gpio_set_idx(s, regs);
-size_t offset = set * GPIOS_PER_SET + gpio;
-qemu_set_irq(s->gpios[offset], !!(new & mask));
+qemu_set_irq(s->gpios[set][gpio], !!(new & mask));
 } else {
 /* ...otherwise if we meet the line's current IRQ policy... */
 if (aspeed_evaluate_irq(regs, old & mask, gpio)) {
@@ -297,21 +292,6 @@ static void aspeed_gpio_update(AspeedGPIOState *s, 
GPIOSets *regs,
 qemu_set_irq(s->irq, !!(s->pending));
 }
 
-static uint32_t aspeed_adjust_pin(AspeedGPIOState *s, uint32_t pin)
-{
-AspeedGPIOClass *agc = ASPEED_GPIO_GET_CLASS(s);
-/*
- * The 2500 has a 4 pin gap in group AB and the 2400 has a 4 pin
- * gap in group Y (and only four pins in AB but this is the last group so
- * it doesn't matter).
- */
-if (agc->gap && pin >= agc->gap) {
-pin += GPIO_PIN_GAP_SIZE;
-}
-
-return pin;
-}
-
 static bool aspeed_gpio_get_pin_level(AspeedGPIOState *s, uint32_t set_idx,
   uint32_t pin)
 {
@@ -367,7 +347,7 @@ static uint32_t update_value_control_source(GPIOSets *regs, 
uint32_t old_value,
 uint32_t new_value = 0;
 
 /* for each group in set */
-for (i = 0; i < GPIOS_PER_REG; i += GPIOS_PER_GROUP) {
+for (i = 0; i < ASPEED_GPIOS_PER_SET; i += GPIOS_PER_GROUP) {
 cmd_source = extract32(regs->cmd_source_0, i, 1)
 | (extract32(regs->cmd_source_1, i, 1) << 1);
 
@@ -637,7 +617,7 @@ static void aspeed_gpio_write(void *opaque, hwaddr offset, 
uint64_t data,
  *   bidirectional  |   1   |   1|  data
  *   input only |   1   |   0|   0
  *   output only|   0   |   1|   1
- *   no pin / gap   |   0   |   0|   0
+ *   no pin |   0   |   0|   0
  *
  *  which is captured by:
  *  data = ( data | ~input) & output;
@@ -836,14 +816,20 @@ static void aspeed_gpio_realize(DeviceState *dev, Error 
**errp)
 AspeedGPIOState *s = ASPEED_GPIO(dev);
 SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 AspeedGPIOClass *agc = ASPEED_GPIO_GET_CLASS(s);
-int pin;
 
 /* Interrupt parent line */
 sysbus_init_irq(sbd, >irq);
 
 /* Individual GPIOs */
-for (pin = 0; pin < agc->nr_gpio_pins; pin++) {
-sysbus_init_irq(sbd, >gpios[pin]);
+for (int i = 0; i < ASPEED_GPIO_MAX_NR_SETS; i++) {
+const GPIOSetProperties *props = >props[i];
+uint32_t skip = ~(props->input | props->output);
+for (int j = 0; j < ASPEED_GPIOS_PER_SET; j++) {
+if (skip >> j & 1) {
+continue;
+}
+sysbus_init_irq(sbd, >gpios[i][j]);
+}
 }
 
 memory_region_init_io(>iomem, OBJECT(s), _gpio_ops, s,
@@ -856,20 +842,22 @@ static void aspeed_gpio_init(Object *obj)
 {
 AspeedGPIOState *s = ASPEED_GPIO(obj);
 AspeedGPIOClass *agc =

[PATCH 1/1] hw: aspeed_gpio: Fix pin I/O type declarations

From: Peter Delevoryas 

Some of the pin declarations in the Aspeed GPIO module were incorrect,
probably because of confusion over which bits in the input and output
uint32_t's correspond to which groups in the label array. Since the
uint32_t literals are in big endian, it's sort of the opposite of what
would be intuitive. The least significant bit in ast2500_set_props[6]
corresponds to GPIOY0, not GPIOAB7.

GPIOxx indicates input and output capabilities, GPIxx indicates only
input, GPOxx indicates only output.

AST2500:
- Previously had GPIW0..GPIW7 and GPIX0..GPIX7, that's correct.
- Previously had GPIOY0..GPIOY3, should have been GPIOY0..GPIOY7.
- Previously had GPIOAB0..GPIOAB3 and GPIAB4..GPIAB7, should only have
  been GPIOAB0..GPIOAB3.

AST2600:
- GPIOT0..GPIOT7 should have been GPIT0..GPIT7.
- GPIOU0..GPIOU7 should have been GPIU0..GPIU7.
- GPIW0..GPIW7 should have been GPIOW0..GPIOW7.
- GPIOY0..GPIOY7 and GPIOZ0..GPIOZ7 were disabled.

Fixes: 4b7f956862dc2db4c5c ("hw/gpio: Add basic Aspeed GPIO model for AST2400 
and AST2500")
Fixes: 36d737ee82b2972167e ("hw/gpio: Add in AST2600 specific implementation")
Signed-off-by: Peter Delevoryas 
---
 hw/gpio/aspeed_gpio.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/gpio/aspeed_gpio.c b/hw/gpio/aspeed_gpio.c
index dfa6d6cb40..33a40a624a 100644
--- a/hw/gpio/aspeed_gpio.c
+++ b/hw/gpio/aspeed_gpio.c
@@ -796,7 +796,7 @@ static const GPIOSetProperties ast2500_set_props[] = {
 [3] = {0x,  0x,  {"M", "N", "O", "P"} },
 [4] = {0x,  0x,  {"Q", "R", "S", "T"} },
 [5] = {0x,  0x,  {"U", "V", "W", "X"} },
-[6] = {0xff0f,  0x0f0f,  {"Y", "Z", "AA", "AB"} },
+[6] = {0x0fff,  0x0fff,  {"Y", "Z", "AA", "AB"} },
 [7] = {0x00ff,  0x00ff,  {"AC"} },
 };
 
@@ -805,9 +805,9 @@ static GPIOSetProperties ast2600_3_3v_set_props[] = {
 [1] = {0x,  0x,  {"E", "F", "G", "H"} },
 [2] = {0x,  0x,  {"I", "J", "K", "L"} },
 [3] = {0x,  0x,  {"M", "N", "O", "P"} },
-[4] = {0x,  0x,  {"Q", "R", "S", "T"} },
-[5] = {0x,  0x,  {"U", "V", "W", "X"} },
-[6] = {0x,  0x0fff,  {"Y", "Z", "", ""} },
+[4] = {0x,  0x00ff,  {"Q", "R", "S", "T"} },
+[5] = {0x,  0xff00,  {"U", "V", "W", "X"} },
+[6] = {0x,  0x,  {"Y", "Z"} },
 };
 
 static GPIOSetProperties ast2600_1_8v_set_props[] = {
-- 
2.30.2

[PATCH 0/1] hw: aspeed_gpio: Fix pin I/O type declarations

From: Peter Delevoryas 

In the Aspeed chips, the GPIO pins are mostly labeled in groups of 8,
but some of the groups only have 4 elements. Also, most pins have input
and output capabilities, but some are strictly input or strictly output
pins. We have some arrays that describe the I/O capabilities of each pin
for each chip.

A few of the declarations for the AST2500 and AST2600 in aspeed_gpio.c
don't seem to match the datasheet, probably because of confusion over
the association between big-endian uint32 literals (right-to-left) and
the array of group labels (left-to-right). I checked the AST2400 too,
but it was correct and didn't need any changes.

We might want to consider replacing the u32's with u8's and putting them
together into a u32 programmatically, or perhaps implicitly performing a
big-endian to little-endian conversion, something to make it easier to
read.

Peter Delevoryas (1):
  hw: aspeed_gpio: Fix pin I/O type declarations

 hw/gpio/aspeed_gpio.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

-- 
2.30.2

[PATCH v4 23/25] linux-user/sparc: Implement setup_sigtramp

Create and record the two signal trampolines.
Use them when the guest does not use SA_RESTORER.

Cc: Mark Cave-Ayland 
Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/sparc/target_signal.h |  4 
 linux-user/sparc/signal.c| 40 +---
 2 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/linux-user/sparc/target_signal.h b/linux-user/sparc/target_signal.h
index 34f9a12519..e661ddd6ab 100644
--- a/linux-user/sparc/target_signal.h
+++ b/linux-user/sparc/target_signal.h
@@ -69,6 +69,10 @@ typedef struct target_sigaltstack {
 
 #ifdef TARGET_ABI32
 #define TARGET_ARCH_HAS_SETUP_FRAME
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+#else
+/* For sparc64, use of KA_RESTORER is mandatory. */
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 0
 #endif
 
 /* bit-flags */
diff --git a/linux-user/sparc/signal.c b/linux-user/sparc/signal.c
index 3bc023d281..23e1e761de 100644
--- a/linux-user/sparc/signal.c
+++ b/linux-user/sparc/signal.c
@@ -242,6 +242,12 @@ static void restore_fpu(struct target_siginfo_fpu *fpu, 
CPUSPARCState *env)
 }
 
 #ifdef TARGET_ARCH_HAS_SETUP_FRAME
+static void install_sigtramp(uint32_t *tramp, int syscall)
+{
+__put_user(0x82102000u + syscall, [0]); /* mov syscall, %g1 */
+__put_user(0x91d02010u, [1]);   /* t 0x10 */
+}
+
 void setup_frame(int sig, struct target_sigaction *ka,
  target_sigset_t *set, CPUSPARCState *env)
 {
@@ -291,13 +297,9 @@ void setup_frame(int sig, struct target_sigaction *ka,
 if (ka->ka_restorer) {
 env->regwptr[WREG_O7] = ka->ka_restorer;
 } else {
-env->regwptr[WREG_O7] = sf_addr +
-offsetof(struct target_signal_frame, insns) - 2 * 4;
-
-/* mov __NR_sigreturn, %g1 */
-__put_user(0x821020d8u, >insns[0]);
-/* t 0x10 */
-__put_user(0x91d02010u, >insns[1]);
+/* Not used, but retain for ABI compatibility. */
+install_sigtramp(sf->insns, TARGET_NR_sigreturn);
+env->regwptr[WREG_O7] = default_sigreturn;
 }
 unlock_user(sf, sf_addr, sf_size);
 }
@@ -358,13 +360,9 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 if (ka->ka_restorer) {
 env->regwptr[WREG_O7] = ka->ka_restorer;
 } else {
-env->regwptr[WREG_O7] =
-sf_addr + offsetof(struct target_rt_signal_frame, insns) - 2 * 4;
-
-/* mov __NR_rt_sigreturn, %g1 */
-__put_user(0x82102065u, >insns[0]);
-/* t 0x10 */
-__put_user(0x91d02010u, >insns[1]);
+/* Not used, but retain for ABI compatibility. */
+install_sigtramp(sf->insns, TARGET_NR_rt_sigreturn);
+env->regwptr[WREG_O7] = default_rt_sigreturn;
 }
 #else
 env->regwptr[WREG_O7] = ka->ka_restorer;
@@ -775,4 +773,18 @@ do_sigsegv:
 unlock_user_struct(ucp, ucp_addr, 1);
 force_sig(TARGET_SIGSEGV);
 }
+#else
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 2 * 8, 0);
+assert(tramp != NULL);
+
+default_sigreturn = sigtramp_page;
+install_sigtramp(tramp, TARGET_NR_sigreturn);
+
+default_rt_sigreturn = sigtramp_page + 8;
+install_sigtramp(tramp + 2, TARGET_NR_rt_sigreturn);
+
+unlock_user(tramp, sigtramp_page, 2 * 8);
+}
 #endif
-- 
2.25.1

[PATCH v4 22/25] linux-user/sh4: Implement setup_sigtramp

Create and record the two signal trampolines.
Use them when the guest does not use SA_RESTORER.

Cc: Yoshinori Sato 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/sh4/target_signal.h |  2 ++
 linux-user/sh4/signal.c| 40 +++---
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/linux-user/sh4/target_signal.h b/linux-user/sh4/target_signal.h
index d7309b7136..04069cba66 100644
--- a/linux-user/sh4/target_signal.h
+++ b/linux-user/sh4/target_signal.h
@@ -22,4 +22,6 @@ typedef struct target_sigaltstack {
 #include "../generic/signal.h"
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* SH4_TARGET_SIGNAL_H */
diff --git a/linux-user/sh4/signal.c b/linux-user/sh4/signal.c
index d70d744bef..faa869fb19 100644
--- a/linux-user/sh4/signal.c
+++ b/linux-user/sh4/signal.c
@@ -52,7 +52,6 @@ struct target_sigframe
 {
 struct target_sigcontext sc;
 target_ulong extramask[TARGET_NSIG_WORDS-1];
-uint16_t retcode[3];
 };
 
 
@@ -68,7 +67,6 @@ struct target_rt_sigframe
 {
 struct target_siginfo info;
 struct target_ucontext uc;
-uint16_t retcode[3];
 };
 
 
@@ -190,15 +188,9 @@ void setup_frame(int sig, struct target_sigaction *ka,
 /* Set up to return from userspace.  If provided, use a stub
already in userspace.  */
 if (ka->sa_flags & TARGET_SA_RESTORER) {
-regs->pr = (unsigned long) ka->sa_restorer;
+regs->pr = ka->sa_restorer;
 } else {
-/* Generate return code (system call to sigreturn) */
-abi_ulong retcode_addr = frame_addr +
- offsetof(struct target_sigframe, retcode);
-__put_user(MOVW(2), >retcode[0]);
-__put_user(TRAP_NOARG, >retcode[1]);
-__put_user((TARGET_NR_sigreturn), >retcode[2]);
-regs->pr = (unsigned long) retcode_addr;
+regs->pr = default_sigreturn;
 }
 
 /* Set up registers for signal handler */
@@ -248,15 +240,9 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 /* Set up to return from userspace.  If provided, use a stub
already in userspace.  */
 if (ka->sa_flags & TARGET_SA_RESTORER) {
-regs->pr = (unsigned long) ka->sa_restorer;
+regs->pr = ka->sa_restorer;
 } else {
-/* Generate return code (system call to sigreturn) */
-abi_ulong retcode_addr = frame_addr +
- offsetof(struct target_rt_sigframe, retcode);
-__put_user(MOVW(2), &frame->retcode[0]);
-__put_user(TRAP_NOARG, &frame->retcode[1]);
-__put_user((TARGET_NR_rt_sigreturn), &frame->retcode[2]);
-regs->pr = (unsigned long) retcode_addr;
+regs->pr = default_rt_sigreturn;
 }
 
 /* Set up registers for signal handler */
@@ -334,3 +320,21 @@ badframe:
 force_sig(TARGET_SIGSEGV);
 return -TARGET_QEMU_ESIGRETURN;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint16_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 2 * 6, 0);
+assert(tramp != NULL);
+
+default_sigreturn = sigtramp_page;
+__put_user(MOVW(2), &tramp[0]);
+__put_user(TRAP_NOARG, &tramp[1]);
+__put_user(TARGET_NR_sigreturn, &tramp[2]);
+
+default_rt_sigreturn = sigtramp_page + 6;
+__put_user(MOVW(2), &tramp[3]);
+__put_user(TRAP_NOARG, &tramp[4]);
+__put_user(TARGET_NR_rt_sigreturn, &tramp[5]);
+
+unlock_user(tramp, sigtramp_page, 2 * 6);
+}
-- 
2.25.1

[PATCH v4 24/25] linux-user/xtensa: Implement setup_sigtramp

Create and record the rt signal trampoline.
Use it when the guest does not use SA_RESTORER.

Reviewed-by: Max Filippov 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/xtensa/target_signal.h |  2 ++
 linux-user/xtensa/signal.c| 56 ---
 2 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/linux-user/xtensa/target_signal.h 
b/linux-user/xtensa/target_signal.h
index c60bf656f6..1c7ee73154 100644
--- a/linux-user/xtensa/target_signal.h
+++ b/linux-user/xtensa/target_signal.h
@@ -20,4 +20,6 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif
diff --git a/linux-user/xtensa/signal.c b/linux-user/xtensa/signal.c
index 7a3bfb92ca..81572a5fc7 100644
--- a/linux-user/xtensa/signal.c
+++ b/linux-user/xtensa/signal.c
@@ -128,6 +128,29 @@ static int setup_sigcontext(struct target_rt_sigframe 
*frame,
 return 1;
 }
 
+static void install_sigtramp(uint8_t *tramp)
+{
+#ifdef TARGET_WORDS_BIGENDIAN
+/* Generate instruction:  MOVI a2, __NR_rt_sigreturn */
+__put_user(0x22, &tramp[0]);
+__put_user(0x0a, &tramp[1]);
+__put_user(TARGET_NR_rt_sigreturn, &tramp[2]);
+/* Generate instruction:  SYSCALL */
+__put_user(0x00, &tramp[3]);
+__put_user(0x05, &tramp[4]);
+__put_user(0x00, &tramp[5]);
+#else
+/* Generate instruction:  MOVI a2, __NR_rt_sigreturn */
+__put_user(0x22, &tramp[0]);
+__put_user(0xa0, &tramp[1]);
+__put_user(TARGET_NR_rt_sigreturn, &tramp[2]);
+/* Generate instruction:  SYSCALL */
+__put_user(0x00, &tramp[3]);
+__put_user(0x50, &tramp[4]);
+__put_user(0x00, &tramp[5]);
+#endif
+}
+
 void setup_rt_frame(int sig, struct target_sigaction *ka,
 target_siginfo_t *info,
 target_sigset_t *set, CPUXtensaState *env)
@@ -164,26 +187,9 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 if (ka->sa_flags & TARGET_SA_RESTORER) {
 ra = ka->sa_restorer;
 } else {
-ra = frame_addr + offsetof(struct target_rt_sigframe, retcode);
-#ifdef TARGET_WORDS_BIGENDIAN
-/* Generate instruction:  MOVI a2, __NR_rt_sigreturn */
-__put_user(0x22, &frame->retcode[0]);
-__put_user(0x0a, &frame->retcode[1]);
-__put_user(TARGET_NR_rt_sigreturn, &frame->retcode[2]);
-/* Generate instruction:  SYSCALL */
-__put_user(0x00, &frame->retcode[3]);
-__put_user(0x05, &frame->retcode[4]);
-__put_user(0x00, &frame->retcode[5]);
-#else
-/* Generate instruction:  MOVI a2, __NR_rt_sigreturn */
-__put_user(0x22, &frame->retcode[0]);
-__put_user(0xa0, &frame->retcode[1]);
-__put_user(TARGET_NR_rt_sigreturn, &frame->retcode[2]);
-/* Generate instruction:  SYSCALL */
-__put_user(0x00, &frame->retcode[3]);
-__put_user(0x50, &frame->retcode[4]);
-__put_user(0x00, &frame->retcode[5]);
-#endif
+/* Not used, but retain for ABI compatibility. */
+install_sigtramp(frame->retcode);
+ra = default_rt_sigreturn;
 }
 memset(env->regs, 0, sizeof(env->regs));
 env->pc = ka->_sa_handler;
@@ -264,3 +270,13 @@ badframe:
 force_sig(TARGET_SIGSEGV);
 return -TARGET_QEMU_ESIGRETURN;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint8_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 6, 0);
+assert(tramp != NULL);
+
+default_rt_sigreturn = sigtramp_page;
+install_sigtramp(tramp);
+unlock_user(tramp, sigtramp_page, 6);
+}
-- 
2.25.1

[PATCH v4 19/25] linux-user/ppc: Implement setup_sigtramp

Create and record the two signal trampolines.

Cc: qemu-...@nongnu.org
Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/ppc/target_signal.h |  2 ++
 linux-user/ppc/signal.c| 34 ++
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/linux-user/ppc/target_signal.h b/linux-user/ppc/target_signal.h
index 72fcdd9bfa..82184ab8f2 100644
--- a/linux-user/ppc/target_signal.h
+++ b/linux-user/ppc/target_signal.h
@@ -24,4 +24,6 @@ typedef struct target_sigaltstack {
 #if !defined(TARGET_PPC64)
 #define TARGET_ARCH_HAS_SETUP_FRAME
 #endif
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* PPC_TARGET_SIGNAL_H */
diff --git a/linux-user/ppc/signal.c b/linux-user/ppc/signal.c
index 77f37b9f01..c37744c8fc 100644
--- a/linux-user/ppc/signal.c
+++ b/linux-user/ppc/signal.c
@@ -203,9 +203,6 @@ struct target_func_ptr {
 
 #endif
 
-/* We use the mc_pad field for the signal return trampoline.  */
-#define tramp mc_pad
-
 /* See arch/powerpc/kernel/signal.c.  */
 static target_ulong get_sigframe(struct target_sigaction *ka,
  CPUPPCState *env,
@@ -436,12 +433,7 @@ void setup_frame(int sig, struct target_sigaction *ka,
 /* Save user regs.  */
 save_user_regs(env, &frame->mctx);
 
-/* Construct the trampoline code on the stack. */
-encode_trampoline(TARGET_NR_sigreturn, (uint32_t *)&frame->mctx.tramp);
-
-/* The kernel checks for the presence of a VDSO here.  We don't
-   emulate a vdso, so use a sigreturn system call.  */
-env->lr = (target_ulong) h2g(frame->mctx.tramp);
+env->lr = default_sigreturn;
 
 /* Turn off all fp exceptions.  */
 env->fpscr = 0;
@@ -477,7 +469,6 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 target_sigset_t *set, CPUPPCState *env)
 {
 struct target_rt_sigframe *rt_sf;
-uint32_t *trampptr = 0;
 struct target_mcontext *mctx = 0;
 target_ulong rt_sf_addr, newsp = 0;
 int i, err = 0;
@@ -507,22 +498,17 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 
 #if defined(TARGET_PPC64)
 mctx = &rt_sf->uc.tuc_sigcontext.mcontext;
-trampptr = &rt_sf->trampoline[0];
 
 sc = &rt_sf->uc.tuc_sigcontext;
 __put_user(h2g(mctx), &sc->regs);
 __put_user(sig, &sc->signal);
 #else
 mctx = &rt_sf->uc.tuc_mcontext;
-trampptr = (uint32_t *)&rt_sf->uc.tuc_mcontext.tramp;
 #endif
 
 save_user_regs(env, mctx);
-encode_trampoline(TARGET_NR_rt_sigreturn, trampptr);
 
-/* The kernel checks for the presence of a VDSO here.  We don't
-   emulate a vdso, so use a sigreturn system call.  */
-env->lr = (target_ulong) h2g(trampptr);
+env->lr = default_rt_sigreturn;
 
 /* Turn off all fp exceptions.  */
 env->fpscr = 0;
@@ -720,3 +706,19 @@ abi_long do_swapcontext(CPUArchState *env, abi_ulong 
uold_ctx,
 
 return 0;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 2 * 8, 0);
+assert(tramp != NULL);
+
+#ifdef TARGET_ARCH_HAS_SETUP_FRAME
+default_sigreturn = sigtramp_page;
+encode_trampoline(TARGET_NR_sigreturn, tramp + 0);
+#endif
+
+default_rt_sigreturn = sigtramp_page + 8;
+encode_trampoline(TARGET_NR_rt_sigreturn, tramp + 2);
+
+unlock_user(tramp, sigtramp_page, 2 * 8);
+}
-- 
2.25.1

[PATCH v4 25/25] linux-user: Remove default for TARGET_ARCH_HAS_SIGTRAMP_PAGE

All targets now define TARGET_ARCH_HAS_SIGTRAMP_PAGE.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/elfload.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 459a26ef1d..2404d482ba 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -30,10 +30,6 @@
 #undef ELF_ARCH
 #endif
 
-#ifndef TARGET_ARCH_HAS_SIGTRAMP_PAGE
-#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 0
-#endif
-
 #define ELF_OSABI   ELFOSABI_SYSV
 
 /* from personality.h */
-- 
2.25.1

[PATCH v4 20/25] linux-user/riscv: Implement setup_sigtramp

Create and record the rt signal trampoline.

This fixes a bug wrt libgcc fallback unwinding.  It expects
the stack pointer to point to the siginfo_t, whereas we had
inexplicably placed our private signal trampoline at the start
of the signal frame instead of the end.  Now moot because we
have removed it from the stack frame entirely.

Reviewed-by: Alistair Francis 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/riscv/target_signal.h |  2 ++
 linux-user/riscv/signal.c| 22 +-
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/linux-user/riscv/target_signal.h b/linux-user/riscv/target_signal.h
index f113ba9a55..3e36fddc9d 100644
--- a/linux-user/riscv/target_signal.h
+++ b/linux-user/riscv/target_signal.h
@@ -15,4 +15,6 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* RISCV_TARGET_SIGNAL_H */
diff --git a/linux-user/riscv/signal.c b/linux-user/riscv/signal.c
index f7f33bc90a..a0f9542ce3 100644
--- a/linux-user/riscv/signal.c
+++ b/linux-user/riscv/signal.c
@@ -47,7 +47,6 @@ struct target_ucontext {
 };
 
 struct target_rt_sigframe {
-uint32_t tramp[2]; /* not in kernel, which uses VDSO instead */
 struct target_siginfo info;
 struct target_ucontext uc;
 };
@@ -105,12 +104,6 @@ static void setup_ucontext(struct target_ucontext *uc,
 setup_sigcontext(&uc->uc_mcontext, env);
 }
 
-static inline void install_sigtramp(uint32_t *tramp)
-{
-__put_user(0x08b00893, tramp + 0);  /* li a7, 139 = __NR_rt_sigreturn */
-__put_user(0x00000073, tramp + 1);  /* ecall */
-}
-
 void setup_rt_frame(int sig, struct target_sigaction *ka,
 target_siginfo_t *info,
 target_sigset_t *set, CPURISCVState *env)
@@ -127,14 +120,13 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 
 setup_ucontext(&frame->uc, env, set);
 tswap_siginfo(&frame->info, info);
-install_sigtramp(frame->tramp);
 
 env->pc = ka->_sa_handler;
 env->gpr[xSP] = frame_addr;
 env->gpr[xA0] = sig;
 env->gpr[xA1] = frame_addr + offsetof(struct target_rt_sigframe, info);
 env->gpr[xA2] = frame_addr + offsetof(struct target_rt_sigframe, uc);
-env->gpr[xRA] = frame_addr + offsetof(struct target_rt_sigframe, tramp);
+env->gpr[xRA] = default_rt_sigreturn;
 
 return;
 
@@ -203,3 +195,15 @@ badframe:
 force_sig(TARGET_SIGSEGV);
 return 0;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 8, 0);
+assert(tramp != NULL);
+
+__put_user(0x08b00893, tramp + 0);  /* li a7, 139 = __NR_rt_sigreturn */
+__put_user(0x00000073, tramp + 1);  /* ecall */
+
+default_rt_sigreturn = sigtramp_page;
+unlock_user(tramp, sigtramp_page, 8);
+}
-- 
2.25.1

[PATCH v4 15/25] linux-user/mips: Implement setup_sigtramp

Create and record the two signal trampolines.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/mips/target_signal.h   |  1 +
 linux-user/mips64/target_signal.h |  2 ++
 linux-user/mips/signal.c  | 34 ++-
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/linux-user/mips/target_signal.h b/linux-user/mips/target_signal.h
index d521765f6b..780a4ddf29 100644
--- a/linux-user/mips/target_signal.h
+++ b/linux-user/mips/target_signal.h
@@ -73,6 +73,7 @@ typedef struct target_sigaltstack {
 /* compare linux/arch/mips/kernel/signal.c:setup_frame() */
 #define TARGET_ARCH_HAS_SETUP_FRAME
 #endif
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
 
 /* bit-flags */
 #define TARGET_SS_AUTODISARM (1U << 31) /* disable sas during sighandling */
diff --git a/linux-user/mips64/target_signal.h 
b/linux-user/mips64/target_signal.h
index d857c55e4c..275e9b7f9a 100644
--- a/linux-user/mips64/target_signal.h
+++ b/linux-user/mips64/target_signal.h
@@ -76,4 +76,6 @@ typedef struct target_sigaltstack {
 /* compare linux/arch/mips/kernel/signal.c:setup_frame() */
 #define TARGET_ARCH_HAS_SETUP_FRAME
 #endif
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* MIPS64_TARGET_SIGNAL_H */
diff --git a/linux-user/mips/signal.c b/linux-user/mips/signal.c
index 64072779b9..8f79e405ec 100644
--- a/linux-user/mips/signal.c
+++ b/linux-user/mips/signal.c
@@ -209,8 +209,6 @@ void setup_frame(int sig, struct target_sigaction * ka,
 goto give_sigsegv;
 }
 
-install_sigtramp(frame->sf_code, TARGET_NR_sigreturn);
-
 setup_sigcontext(regs, &frame->sf_sc);
 
 for(i = 0; i < TARGET_NSIG_WORDS; i++) {
@@ -231,7 +229,7 @@ void setup_frame(int sig, struct target_sigaction * ka,
 regs->active_tc.gpr[ 5] = 0;
 regs->active_tc.gpr[ 6] = frame_addr + offsetof(struct sigframe, sf_sc);
 regs->active_tc.gpr[29] = frame_addr;
-regs->active_tc.gpr[31] = frame_addr + offsetof(struct sigframe, sf_code);
+regs->active_tc.gpr[31] = default_sigreturn;
 /* The original kernel code sets CP0_EPC to the handler
 * since it returns to userland using eret
 * we cannot do this here, and we must set PC directly */
@@ -305,8 +303,6 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 goto give_sigsegv;
 }
 
-install_sigtramp(frame->rs_code, TARGET_NR_rt_sigreturn);
-
 tswap_siginfo(&frame->rs_info, info);
 
 __put_user(0, &frame->rs_uc.tuc_flags);
@@ -335,11 +331,13 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 env->active_tc.gpr[ 6] = frame_addr
  + offsetof(struct target_rt_sigframe, rs_uc);
 env->active_tc.gpr[29] = frame_addr;
-env->active_tc.gpr[31] = frame_addr
- + offsetof(struct target_rt_sigframe, rs_code);
-/* The original kernel code sets CP0_EPC to the handler
-* since it returns to userland using eret
-* we cannot do this here, and we must set PC directly */
+env->active_tc.gpr[31] = default_rt_sigreturn;
+
+/*
+ * The original kernel code sets CP0_EPC to the handler
+ * since it returns to userland using eret
+ * we cannot do this here, and we must set PC directly
+ */
 env->active_tc.PC = env->active_tc.gpr[25] = ka->_sa_handler;
 mips_set_hflags_isa_mode_from_pc(env);
 unlock_user_struct(frame, frame_addr, 1);
@@ -379,3 +377,19 @@ badframe:
 force_sig(TARGET_SIGSEGV);
 return -TARGET_QEMU_ESIGRETURN;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 2 * 8, 0);
+assert(tramp != NULL);
+
+#ifdef TARGET_ARCH_HAS_SETUP_FRAME
+default_sigreturn = sigtramp_page;
+install_sigtramp(tramp, TARGET_NR_sigreturn);
+#endif
+
+default_rt_sigreturn = sigtramp_page + 8;
+install_sigtramp(tramp + 2, TARGET_NR_rt_sigreturn);
+
+unlock_user(tramp, sigtramp_page, 2 * 8);
+}
-- 
2.25.1

[PATCH v4 21/25] linux-user/s390x: Implement setup_sigtramp

Create and record the two signal trampolines.
Use them when the guest does not use SA_RESTORER.

Cc: qemu-s3...@nongnu.org
Tested-by: Alex Bennée 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/s390x/target_signal.h |  2 ++
 linux-user/s390x/signal.c| 24 
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/linux-user/s390x/target_signal.h b/linux-user/s390x/target_signal.h
index bbfc464d44..64f5f42201 100644
--- a/linux-user/s390x/target_signal.h
+++ b/linux-user/s390x/target_signal.h
@@ -19,4 +19,6 @@ typedef struct target_sigaltstack {
 #include "../generic/signal.h"
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* S390X_TARGET_SIGNAL_H */
diff --git a/linux-user/s390x/signal.c b/linux-user/s390x/signal.c
index 80f34086d7..676b948147 100644
--- a/linux-user/s390x/signal.c
+++ b/linux-user/s390x/signal.c
@@ -68,7 +68,6 @@ typedef struct {
 target_sigregs sregs;
 int signo;
 target_sigregs_ext sregs_ext;
-uint16_t retcode;
 } sigframe;
 
 #define TARGET_UC_VXRS 2
@@ -85,7 +84,6 @@ struct target_ucontext {
 
 typedef struct {
 uint8_t callee_used_stack[__SIGNAL_FRAMESIZE];
-uint16_t retcode;
 struct target_siginfo info;
 struct target_ucontext uc;
 } rt_sigframe;
@@ -209,9 +207,7 @@ void setup_frame(int sig, struct target_sigaction *ka,
 if (ka->sa_flags & TARGET_SA_RESTORER) {
 restorer = ka->sa_restorer;
 } else {
-restorer = frame_addr + offsetof(sigframe, retcode);
-__put_user(S390_SYSCALL_OPCODE | TARGET_NR_sigreturn,
-   &frame->retcode);
+restorer = default_sigreturn;
 }
 
 /* Set up registers for signal handler */
@@ -262,9 +258,7 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 if (ka->sa_flags & TARGET_SA_RESTORER) {
 restorer = ka->sa_restorer;
 } else {
-restorer = frame_addr + offsetof(typeof(*frame), retcode);
-__put_user(S390_SYSCALL_OPCODE | TARGET_NR_rt_sigreturn,
-   &frame->retcode);
+restorer = default_rt_sigreturn;
 }
 
 /* Create siginfo on the signal stack. */
@@ -405,3 +399,17 @@ long do_rt_sigreturn(CPUS390XState *env)
 unlock_user_struct(frame, frame_addr, 0);
 return -TARGET_QEMU_ESIGRETURN;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint16_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 2 + 2, 0);
+assert(tramp != NULL);
+
+default_sigreturn = sigtramp_page;
+__put_user(S390_SYSCALL_OPCODE | TARGET_NR_sigreturn, &tramp[0]);
+
+default_rt_sigreturn = sigtramp_page + 2;
+__put_user(S390_SYSCALL_OPCODE | TARGET_NR_rt_sigreturn, &tramp[1]);
+
+unlock_user(tramp, sigtramp_page, 2 + 2);
+}
-- 
2.25.1

[PATCH v4 18/25] linux-user/ppc: Simplify encode_trampoline

The sigret parameter is never 0, and even if it was the encoding
of the LI instruction would still work.

Reported-by: Peter Maydell 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/ppc/signal.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/linux-user/ppc/signal.c b/linux-user/ppc/signal.c
index e4d0dfa3bf..77f37b9f01 100644
--- a/linux-user/ppc/signal.c
+++ b/linux-user/ppc/signal.c
@@ -309,10 +309,8 @@ static void save_user_regs(CPUPPCState *env, struct 
target_mcontext *frame)
 static void encode_trampoline(int sigret, uint32_t *tramp)
 {
 /* Set up the sigreturn trampoline: li r0,sigret; sc.  */
-if (sigret) {
-__put_user(0x38000000 | sigret, &tramp[0]);
-__put_user(0x44000002, &tramp[1]);
-}
+__put_user(0x38000000 | sigret, &tramp[0]);
+__put_user(0x44000002, &tramp[1]);
 }
 
 static void restore_user_regs(CPUPPCState *env,
-- 
2.25.1

[PATCH v4 17/25] linux-user/openrisc: Implement setup_sigtramp

Create and record the rt signal trampoline.

Reviewed-by: Stafford Horne 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/openrisc/target_signal.h |  2 ++
 linux-user/openrisc/signal.c| 22 ++
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/linux-user/openrisc/target_signal.h 
b/linux-user/openrisc/target_signal.h
index 8283eaf544..077ec3d5e8 100644
--- a/linux-user/openrisc/target_signal.h
+++ b/linux-user/openrisc/target_signal.h
@@ -26,4 +26,6 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* OPENRISC_TARGET_SIGNAL_H */
diff --git a/linux-user/openrisc/signal.c b/linux-user/openrisc/signal.c
index ca2532bf50..be8b68784a 100644
--- a/linux-user/openrisc/signal.c
+++ b/linux-user/openrisc/signal.c
@@ -38,7 +38,6 @@ typedef struct target_ucontext {
 typedef struct target_rt_sigframe {
 struct target_siginfo info;
 target_ucontext uc;
-uint32_t retcode[4];  /* trampoline code */
 } target_rt_sigframe;
 
 static void restore_sigcontext(CPUOpenRISCState *env, target_sigcontext *sc)
@@ -116,14 +115,8 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 __put_user(set->sig[i], &frame->uc.tuc_sigmask.sig[i]);
 }
 
-/* This is l.ori r11,r0,__NR_sigreturn; l.sys 1; l.nop; l.nop */
-__put_user(0xa9600000 | TARGET_NR_rt_sigreturn, frame->retcode + 0);
-__put_user(0x20000001, frame->retcode + 1);
-__put_user(0x15000000, frame->retcode + 2);
-__put_user(0x15000000, frame->retcode + 3);
-
 /* Set up registers for signal handler */
-cpu_set_gpr(env, 9, frame_addr + offsetof(target_rt_sigframe, retcode));
+cpu_set_gpr(env, 9, default_rt_sigreturn);
 cpu_set_gpr(env, 3, sig);
 cpu_set_gpr(env, 4, frame_addr + offsetof(target_rt_sigframe, info));
 cpu_set_gpr(env, 5, frame_addr + offsetof(target_rt_sigframe, uc));
@@ -169,3 +162,16 @@ long do_rt_sigreturn(CPUOpenRISCState *env)
 force_sig(TARGET_SIGSEGV);
 return 0;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 8, 0);
+assert(tramp != NULL);
+
+/* This is l.ori r11,r0,__NR_sigreturn; l.sys 1 */
+__put_user(0xa9600000 | TARGET_NR_rt_sigreturn, tramp + 0);
+__put_user(0x20000001, tramp + 1);
+
+default_rt_sigreturn = sigtramp_page;
+unlock_user(tramp, sigtramp_page, 8);
+}
-- 
2.25.1

[PATCH v4 13/25] linux-user/microblaze: Implement setup_sigtramp

Create and record the rt signal trampoline.

Cc: Edgar E. Iglesias 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/microblaze/target_signal.h |  2 ++
 linux-user/microblaze/signal.c| 24 +---
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/linux-user/microblaze/target_signal.h 
b/linux-user/microblaze/target_signal.h
index 1c326296de..e8b510f6b1 100644
--- a/linux-user/microblaze/target_signal.h
+++ b/linux-user/microblaze/target_signal.h
@@ -21,4 +21,6 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* MICROBLAZE_TARGET_SIGNAL_H */
diff --git a/linux-user/microblaze/signal.c b/linux-user/microblaze/signal.c
index b822679d18..8ebb6a1b7d 100644
--- a/linux-user/microblaze/signal.c
+++ b/linux-user/microblaze/signal.c
@@ -161,17 +161,11 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 
 /* Kernel does not use SA_RESTORER. */
 
-/* addi r12, r0, __NR_sigreturn */
-__put_user(0x31800000U | TARGET_NR_rt_sigreturn, frame->tramp + 0);
-/* brki r14, 0x8 */
-__put_user(0xb9cc0008U, frame->tramp + 1);
-
 /*
  * Return from sighandler will jump to the tramp.
  * Negative 8 offset because return is rtsd r15, 8
  */
-env->regs[15] =
-frame_addr + offsetof(struct target_rt_sigframe, tramp) - 8;
+env->regs[15] = default_rt_sigreturn - 8;
 
 /* Set up registers for signal handler */
 env->regs[1] = frame_addr;
@@ -220,3 +214,19 @@ long do_rt_sigreturn(CPUMBState *env)
 force_sig(TARGET_SIGSEGV);
 return -TARGET_QEMU_ESIGRETURN;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 8, 0);
+assert(tramp != NULL);
+
+/*
+ * addi r12, r0, __NR_rt_sigreturn
+ * brki r14, 0x8
+ */
+__put_user(0x31800000U | TARGET_NR_rt_sigreturn, tramp);
+__put_user(0xb9cc0008U, tramp + 1);
+
+default_rt_sigreturn = sigtramp_page;
+unlock_user(tramp, sigtramp_page, 8);
+}
-- 
2.25.1

[PATCH v4 14/25] linux-user/mips: Tidy install_sigtramp

The return value is constant 0, and unused as well -- change to void.
Drop inline marker.  Change tramp type to uint32_t* for clarity.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/mips/signal.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/linux-user/mips/signal.c b/linux-user/mips/signal.c
index d174b3453c..64072779b9 100644
--- a/linux-user/mips/signal.c
+++ b/linux-user/mips/signal.c
@@ -87,10 +87,8 @@ struct target_rt_sigframe {
 };
 
 /* Install trampoline to jump back from signal handler */
-static inline int install_sigtramp(unsigned int *tramp,   unsigned int syscall)
+static void install_sigtramp(uint32_t *tramp, unsigned int syscall)
 {
-int err = 0;
-
 /*
  * Set up the return code ...
  *
@@ -100,7 +98,6 @@ static inline int install_sigtramp(unsigned int *tramp,   
unsigned int syscall)
 
 __put_user(0x24020000 + syscall, tramp + 0);
 __put_user(0x0000000c  , tramp + 1);
-return err;
 }
 
 static inline void setup_sigcontext(CPUMIPSState *regs,
-- 
2.25.1

[PATCH v4 07/25] linux-user/cris: Implement setup_sigtramp

Split out setup_sigreturn so that we can continue to
initialize the words on the stack, as documented.
However, use the off-stack trampoline.

Cc: Edgar E. Iglesias 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/cris/target_signal.h |  2 ++
 linux-user/cris/signal.c| 29 +
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/linux-user/cris/target_signal.h b/linux-user/cris/target_signal.h
index 495a142896..83a5155507 100644
--- a/linux-user/cris/target_signal.h
+++ b/linux-user/cris/target_signal.h
@@ -22,4 +22,6 @@ typedef struct target_sigaltstack {
 #include "../generic/signal.h"
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* CRIS_TARGET_SIGNAL_H */
diff --git a/linux-user/cris/signal.c b/linux-user/cris/signal.c
index 2c39bdf727..7f6aca934e 100644
--- a/linux-user/cris/signal.c
+++ b/linux-user/cris/signal.c
@@ -97,6 +97,14 @@ static abi_ulong get_sigframe(CPUCRISState *env, int 
framesize)
 return sp - framesize;
 }
 
+static void setup_sigreturn(uint16_t *retcode)
+{
+/* This is movu.w __NR_sigreturn, r9; break 13; */
+__put_user(0x9c5f, retcode + 0);
+__put_user(TARGET_NR_sigreturn, retcode + 1);
+__put_user(0xe93d, retcode + 2);
+}
+
 void setup_frame(int sig, struct target_sigaction *ka,
  target_sigset_t *set, CPUCRISState *env)
 {
@@ -112,14 +120,8 @@ void setup_frame(int sig, struct target_sigaction *ka,
 /*
  * The CRIS signal return trampoline. A real linux/CRIS kernel doesn't
  * use this trampoline anymore but it sets it up for GDB.
- * In QEMU, using the trampoline simplifies things a bit so we use it.
- *
- * This is movu.w __NR_sigreturn, r9; break 13;
  */
-__put_user(0x9c5f, frame->retcode+0);
-__put_user(TARGET_NR_sigreturn,
-   frame->retcode + 1);
-__put_user(0xe93d, frame->retcode + 2);
+setup_sigreturn(frame->retcode);
 
 /* Save the mask.  */
 __put_user(set->sig[0], &frame->sc.oldmask);
@@ -135,7 +137,7 @@ void setup_frame(int sig, struct target_sigaction *ka,
 env->regs[10] = sig;
 env->pc = (unsigned long) ka->_sa_handler;
 /* Link SRP so the guest returns through the trampoline.  */
-env->pregs[PR_SRP] = frame_addr + offsetof(typeof(*frame), retcode);
+env->pregs[PR_SRP] = default_sigreturn;
 
 unlock_user_struct(frame, frame_addr, 1);
 return;
@@ -187,3 +189,14 @@ long do_rt_sigreturn(CPUCRISState *env)
 qemu_log_mask(LOG_UNIMP, "do_rt_sigreturn: not implemented\n");
 return -TARGET_ENOSYS;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint16_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 6, 0);
+assert(tramp != NULL);
+
+default_sigreturn = sigtramp_page;
+setup_sigreturn(tramp);
+
+unlock_user(tramp, sigtramp_page, 6);
+}
-- 
2.25.1

[PATCH v4 16/25] linux-user/nios2: Document non-use of setup_sigtramp

Signed-off-by: Richard Henderson 
---
 linux-user/nios2/target_signal.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/linux-user/nios2/target_signal.h b/linux-user/nios2/target_signal.h
index aebf749f12..fe266c4c51 100644
--- a/linux-user/nios2/target_signal.h
+++ b/linux-user/nios2/target_signal.h
@@ -19,4 +19,7 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+/* Nios2 uses a fixed address on the kuser page for sigreturn. */
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 0
+
 #endif /* NIOS2_TARGET_SIGNAL_H */
-- 
2.25.1

[PATCH v4 09/25] linux-user/hppa: Document non-use of setup_sigtramp

We cannot use a raw sigtramp page for hppa,
but must wait for full vdso support.

Reviewed-by: Alex Bennée 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/hppa/target_signal.h | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/linux-user/hppa/target_signal.h b/linux-user/hppa/target_signal.h
index 7f525362e9..d558119ee7 100644
--- a/linux-user/hppa/target_signal.h
+++ b/linux-user/hppa/target_signal.h
@@ -71,4 +71,18 @@ typedef struct target_sigaltstack {
 /* mask for all SS_xxx flags */
 #define TARGET_SS_FLAG_BITS  TARGET_SS_AUTODISARM
 
+/*
+ * We cannot use a bare sigtramp page for hppa-linux.
+ *
+ * Unlike other guests where we use the instructions at PC to validate
+ * an offset from SP, the hppa libgcc signal frame fallback unwinding uses
+ * the PC address itself to find the frame.  This is due to the fact that
+ * the hppa grows the stack upward, and the frame is of unknown size.
+ *
+ * TODO: We should be able to use a VDSO to address this, by providing
+ * proper unwind info for the sigtramp code, at which point the fallback
+ * unwinder will not be used.
+ */
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 0
+
 #endif /* HPPA_TARGET_SIGNAL_H */
-- 
2.25.1

[PATCH v4 01/25] linux-user: Add infrastructure for a signal trampoline page

Allocate a page to hold the signal trampoline(s).
Invoke a guest-specific hook to fill in the contents
of the page before marking it read-execute again.

Reviewed-by: Max Filippov 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/signal-common.h |  6 ++
 linux-user/elfload.c   | 18 ++
 linux-user/signal.c|  3 +++
 3 files changed, 27 insertions(+)

diff --git a/linux-user/signal-common.h b/linux-user/signal-common.h
index 79511becb4..7457f8025c 100644
--- a/linux-user/signal-common.h
+++ b/linux-user/signal-common.h
@@ -20,6 +20,12 @@
 #ifndef SIGNAL_COMMON_H
 #define SIGNAL_COMMON_H
 
+/* Fallback addresses into sigtramp page. */
+extern abi_ulong default_sigreturn;
+extern abi_ulong default_rt_sigreturn;
+
+void setup_sigtramp(abi_ulong tramp_page);
+
 int on_sig_stack(unsigned long sp);
 int sas_ss_flags(unsigned long sp);
 abi_ulong target_sigsp(abi_ulong sp, struct target_sigaction *ka);
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 5f9e2141ad..459a26ef1d 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -7,6 +7,7 @@
 
 #include "qemu.h"
 #include "user-internals.h"
+#include "signal-common.h"
 #include "loader.h"
 #include "user-mmap.h"
 #include "disas/disas.h"
@@ -17,6 +18,7 @@
 #include "qemu/units.h"
 #include "qemu/selfmap.h"
 #include "qapi/error.h"
+#include "target_signal.h"
 
 #ifdef _ARCH_PPC64
 #undef ARCH_DLINFO
@@ -28,6 +30,10 @@
 #undef ELF_ARCH
 #endif
 
+#ifndef TARGET_ARCH_HAS_SIGTRAMP_PAGE
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 0
+#endif
+
 #define ELF_OSABI   ELFOSABI_SYSV
 
 /* from personality.h */
@@ -3249,6 +3255,18 @@ int load_elf_binary(struct linux_binprm *bprm, struct 
image_info *info)
 #endif
 }
 
+/*
+ * TODO: load a vdso, which would also contain the signal trampolines.
+ * Otherwise, allocate a private page to hold them.
+ */
+if (TARGET_ARCH_HAS_SIGTRAMP_PAGE) {
+abi_ulong tramp_page = target_mmap(0, TARGET_PAGE_SIZE,
+   PROT_READ | PROT_WRITE,
+   MAP_PRIVATE | MAP_ANON, -1, 0);
+setup_sigtramp(tramp_page);
+target_mprotect(tramp_page, TARGET_PAGE_SIZE, PROT_READ | PROT_EXEC);
+}
+
 bprm->p = create_elf_tables(bprm->p, bprm->argc, bprm->envc, &elf_ex,
 info, (elf_interpreter ? &interp_info : NULL));
 info->start_stack = bprm->p;
diff --git a/linux-user/signal.c b/linux-user/signal.c
index 2038216455..14d8fdfde1 100644
--- a/linux-user/signal.c
+++ b/linux-user/signal.c
@@ -35,6 +35,9 @@ static struct target_sigaction sigact_table[TARGET_NSIG];
 static void host_signal_handler(int host_signum, siginfo_t *info,
 void *puc);
 
+/* Fallback addresses into sigtramp page. */
+abi_ulong default_sigreturn;
+abi_ulong default_rt_sigreturn;
 
 /*
  * System includes define _NSIG as SIGRTMAX + 1,
-- 
2.25.1

[PATCH v4 12/25] linux-user/m68k: Implement setup_sigtramp

Create and record the two signal trampolines.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/m68k/target_signal.h |  2 ++
 linux-user/m68k/signal.c| 47 +++--
 2 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/linux-user/m68k/target_signal.h b/linux-user/m68k/target_signal.h
index d096544ef8..94157bf1f4 100644
--- a/linux-user/m68k/target_signal.h
+++ b/linux-user/m68k/target_signal.h
@@ -22,4 +22,6 @@ typedef struct target_sigaltstack {
 #include "../generic/signal.h"
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* M68K_TARGET_SIGNAL_H */
diff --git a/linux-user/m68k/signal.c b/linux-user/m68k/signal.c
index 4f8eb6f727..ec33482e14 100644
--- a/linux-user/m68k/signal.c
+++ b/linux-user/m68k/signal.c
@@ -39,7 +39,6 @@ struct target_sigframe
 int sig;
 int code;
 abi_ulong psc;
-char retcode[8];
 abi_ulong extramask[TARGET_NSIG_WORDS-1];
 struct target_sigcontext sc;
 };
@@ -76,7 +75,6 @@ struct target_rt_sigframe
 int sig;
 abi_ulong pinfo;
 abi_ulong puc;
-char retcode[8];
 struct target_siginfo info;
 struct target_ucontext uc;
 };
@@ -130,7 +128,6 @@ void setup_frame(int sig, struct target_sigaction *ka,
 {
 struct target_sigframe *frame;
 abi_ulong frame_addr;
-abi_ulong retcode_addr;
 abi_ulong sc_addr;
 int i;
 
@@ -152,16 +149,7 @@ void setup_frame(int sig, struct target_sigaction *ka,
 }
 
 /* Set up to return from userspace.  */
-
-retcode_addr = frame_addr + offsetof(struct target_sigframe, retcode);
-__put_user(retcode_addr, >pretcode);
-
-/* moveq #,d0; trap #0 */
-
-__put_user(0x70004e40 + (TARGET_NR_sigreturn << 16),
-   (uint32_t *)(frame->retcode));
-
-/* Set up to return from userspace */
+__put_user(default_sigreturn, &frame->pretcode);
 
 env->aregs[7] = frame_addr;
 env->pc = ka->_sa_handler;
@@ -288,7 +276,6 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 {
 struct target_rt_sigframe *frame;
 abi_ulong frame_addr;
-abi_ulong retcode_addr;
 abi_ulong info_addr;
 abi_ulong uc_addr;
 int err = 0;
@@ -325,17 +312,7 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 }
 
 /* Set up to return from userspace.  */
-
-retcode_addr = frame_addr + offsetof(struct target_sigframe, retcode);
-__put_user(retcode_addr, >pretcode);
-
-/* moveq #,d0; notb d0; trap #0 */
-
-__put_user(0x70004600 + ((TARGET_NR_rt_sigreturn ^ 0xff) << 16),
-   (uint32_t *)(frame->retcode + 0));
-__put_user(0x4e40, (uint16_t *)(frame->retcode + 4));
-
-/* Set up to return from userspace */
+__put_user(default_rt_sigreturn, &frame->pretcode);
 
 env->aregs[7] = frame_addr;
 env->pc = ka->_sa_handler;
@@ -411,3 +388,23 @@ badframe:
 force_sig(TARGET_SIGSEGV);
 return -TARGET_QEMU_ESIGRETURN;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+void *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 4 + 6, 0);
+assert(tramp != NULL);
+
+default_sigreturn = sigtramp_page;
+
+/* moveq #,d0; trap #0 */
+__put_user(0x70004e40 + (TARGET_NR_sigreturn << 16), (uint32_t *)tramp);
+
+default_rt_sigreturn = sigtramp_page + 4;
+
+/* moveq #,d0; notb d0; trap #0 */
+__put_user(0x70004600 + ((TARGET_NR_rt_sigreturn ^ 0xff) << 16),
+   (uint32_t *)(tramp + 4));
+__put_user(0x4e40, (uint16_t *)(tramp + 8));
+
+unlock_user(tramp, sigtramp_page, 4 + 6);
+}
-- 
2.25.1

[PATCH v4 11/25] linux-user/x86_64: Raise SIGSEGV if SA_RESTORER not set

This has been a fixme for some time.  The effect of
returning -EFAULT from the kernel code is to raise SIGSEGV.

Reviewed-by: Peter Maydell 
Signed-off-by: Richard Henderson 
---
 linux-user/i386/signal.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/linux-user/i386/signal.c b/linux-user/i386/signal.c
index b38b5f108e..433efa3d69 100644
--- a/linux-user/i386/signal.c
+++ b/linux-user/i386/signal.c
@@ -421,19 +421,18 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 
 /* Set up to return from userspace.  If provided, use a stub
already in userspace.  */
-#ifndef TARGET_X86_64
 if (ka->sa_flags & TARGET_SA_RESTORER) {
 __put_user(ka->sa_restorer, >pretcode);
 } else {
+#ifdef TARGET_X86_64
+/* For x86_64, SA_RESTORER is required ABI.  */
+goto give_sigsegv;
+#else
 /* This is no longer used, but is retained for ABI compatibility. */
 install_rt_sigtramp(frame->retcode);
 __put_user(default_rt_sigreturn, >pretcode);
-}
-#else
-/* XXX: Would be slightly better to return -EFAULT here if test fails
-   assert(ka->sa_flags & TARGET_SA_RESTORER); */
-__put_user(ka->sa_restorer, >pretcode);
 #endif
+}
 
 /* Set up registers for signal handler */
 env->regs[R_ESP] = frame_addr;
-- 
2.25.1

[PATCH v4 08/25] linux-user/hexagon: Implement setup_sigtramp

Continue to initialize the words on the stack, as documented.
However, use the off-stack trampoline.

Reviewed-by: Taylor Simpson 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/hexagon/target_signal.h |  2 ++
 linux-user/hexagon/signal.c| 19 +--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/linux-user/hexagon/target_signal.h 
b/linux-user/hexagon/target_signal.h
index 345cf1cbb8..9e0223d322 100644
--- a/linux-user/hexagon/target_signal.h
+++ b/linux-user/hexagon/target_signal.h
@@ -31,4 +31,6 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* TARGET_SIGNAL_H */
diff --git a/linux-user/hexagon/signal.c b/linux-user/hexagon/signal.c
index c7f0bf6b92..74e61739a0 100644
--- a/linux-user/hexagon/signal.c
+++ b/linux-user/hexagon/signal.c
@@ -162,6 +162,11 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 
 setup_ucontext(>uc, env, set);
 tswap_siginfo(>info, info);
+/*
+ * The on-stack signal trampoline is no longer executed;
+ * however, the libgcc signal frame unwinding code checks
+ * for the presence of these two numeric magic values.
+ */
 install_sigtramp(frame->tramp);
 
 env->gpr[HEX_REG_PC] = ka->_sa_handler;
@@ -171,8 +176,7 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 frame_addr + offsetof(struct target_rt_sigframe, info);
 env->gpr[HEX_REG_R02] =
 frame_addr + offsetof(struct target_rt_sigframe, uc);
-env->gpr[HEX_REG_LR] =
-frame_addr + offsetof(struct target_rt_sigframe, tramp);
+env->gpr[HEX_REG_LR] = default_rt_sigreturn;
 
 return;
 
@@ -271,3 +275,14 @@ badframe:
 force_sig(TARGET_SIGSEGV);
 return 0;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 4 * 2, 0);
+assert(tramp != NULL);
+
+default_rt_sigreturn = sigtramp_page;
+install_sigtramp(tramp);
+
+unlock_user(tramp, sigtramp_page, 4 * 2);
+}
-- 
2.25.1

[PATCH v4 10/25] linux-user/i386: Implement setup_sigtramp

Create and record the two signal trampolines.
Use them when the guest does not use SA_RESTORER.
Note that x86_64 does not use this code.

Signed-off-by: Richard Henderson 
---
 linux-user/i386/target_signal.h   |  2 ++
 linux-user/x86_64/target_signal.h |  3 ++
 linux-user/i386/signal.c  | 56 +--
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/linux-user/i386/target_signal.h b/linux-user/i386/target_signal.h
index 50361af874..64d09f2e75 100644
--- a/linux-user/i386/target_signal.h
+++ b/linux-user/i386/target_signal.h
@@ -22,4 +22,6 @@ typedef struct target_sigaltstack {
 #include "../generic/signal.h"
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* I386_TARGET_SIGNAL_H */
diff --git a/linux-user/x86_64/target_signal.h 
b/linux-user/x86_64/target_signal.h
index 4ea74f20dd..4673c5a886 100644
--- a/linux-user/x86_64/target_signal.h
+++ b/linux-user/x86_64/target_signal.h
@@ -21,4 +21,7 @@ typedef struct target_sigaltstack {
 
 #include "../generic/signal.h"
 
+/* For x86_64, use of SA_RESTORER is mandatory. */
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 0
+
 #endif /* X86_64_TARGET_SIGNAL_H */
diff --git a/linux-user/i386/signal.c b/linux-user/i386/signal.c
index 3b4b55fc0a..b38b5f108e 100644
--- a/linux-user/i386/signal.c
+++ b/linux-user/i386/signal.c
@@ -310,6 +310,22 @@ get_sigframe(struct target_sigaction *ka, CPUX86State 
*env, size_t frame_size)
 }
 
 #ifndef TARGET_X86_64
+static void install_sigtramp(void *tramp)
+{
+/* This is popl %eax ; movl $syscall,%eax ; int $0x80 */
+__put_user(0xb858, (uint16_t *)(tramp + 0));
+__put_user(TARGET_NR_sigreturn, (int32_t *)(tramp + 2));
+__put_user(0x80cd, (uint16_t *)(tramp + 6));
+}
+
+static void install_rt_sigtramp(void *tramp)
+{
+/* This is movl $syscall,%eax ; int $0x80 */
+__put_user(0xb8, (uint8_t *)(tramp + 0));
+__put_user(TARGET_NR_rt_sigreturn, (int32_t *)(tramp + 1));
+__put_user(0x80cd, (uint16_t *)(tramp + 5));
+}
+
 /* compare linux/arch/i386/kernel/signal.c:setup_frame() */
 void setup_frame(int sig, struct target_sigaction *ka,
  target_sigset_t *set, CPUX86State *env)
@@ -338,16 +354,9 @@ void setup_frame(int sig, struct target_sigaction *ka,
 if (ka->sa_flags & TARGET_SA_RESTORER) {
 __put_user(ka->sa_restorer, >pretcode);
 } else {
-uint16_t val16;
-abi_ulong retcode_addr;
-retcode_addr = frame_addr + offsetof(struct sigframe, retcode);
-__put_user(retcode_addr, >pretcode);
-/* This is popl %eax ; movl $,%eax ; int $0x80 */
-val16 = 0xb858;
-__put_user(val16, (uint16_t *)(frame->retcode+0));
-__put_user(TARGET_NR_sigreturn, (int *)(frame->retcode+2));
-val16 = 0x80cd;
-__put_user(val16, (uint16_t *)(frame->retcode+6));
+/* This is no longer used, but is retained for ABI compatibility. */
+install_sigtramp(frame->retcode);
+__put_user(default_sigreturn, >pretcode);
 }
 
 /* Set up registers for signal handler */
@@ -416,14 +425,9 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 if (ka->sa_flags & TARGET_SA_RESTORER) {
 __put_user(ka->sa_restorer, >pretcode);
 } else {
-uint16_t val16;
-addr = frame_addr + offsetof(struct rt_sigframe, retcode);
-__put_user(addr, >pretcode);
-/* This is movl $,%eax ; int $0x80 */
-__put_user(0xb8, (char *)(frame->retcode+0));
-__put_user(TARGET_NR_rt_sigreturn, (int *)(frame->retcode+1));
-val16 = 0x80cd;
-__put_user(val16, (uint16_t *)(frame->retcode+5));
+/* This is no longer used, but is retained for ABI compatibility. */
+install_rt_sigtramp(frame->retcode);
+__put_user(default_rt_sigreturn, >pretcode);
 }
 #else
 /* XXX: Would be slightly better to return -EFAULT here if test fails
@@ -592,3 +596,19 @@ badframe:
 force_sig(TARGET_SIGSEGV);
 return -TARGET_QEMU_ESIGRETURN;
 }
+
+#ifndef TARGET_X86_64
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint16_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 2 * 8, 0);
+assert(tramp != NULL);
+
+default_sigreturn = sigtramp_page;
+install_sigtramp(tramp);
+
+default_rt_sigreturn = sigtramp_page + 8;
+install_rt_sigtramp(tramp + 8);
+
+unlock_user(tramp, sigtramp_page, 2 * 8);
+}
+#endif
-- 
2.25.1

[PATCH v4 02/25] linux-user/aarch64: Implement setup_sigtramp

Create and record the rt signal trampoline.
Use it when the guest does not use SA_RESTORER.

Reviewed-by: Peter Maydell 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/aarch64/target_signal.h |  2 ++
 linux-user/aarch64/signal.c| 34 ++
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/linux-user/aarch64/target_signal.h 
b/linux-user/aarch64/target_signal.h
index 18013e1b23..7580d99403 100644
--- a/linux-user/aarch64/target_signal.h
+++ b/linux-user/aarch64/target_signal.h
@@ -25,4 +25,6 @@ typedef struct target_sigaltstack {
 #define TARGET_SEGV_MTESERR  9  /* Synchronous ARM MTE exception */
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* AARCH64_TARGET_SIGNAL_H */
diff --git a/linux-user/aarch64/signal.c b/linux-user/aarch64/signal.c
index 49025648cb..29c52db3f1 100644
--- a/linux-user/aarch64/signal.c
+++ b/linux-user/aarch64/signal.c
@@ -109,7 +109,6 @@ struct target_rt_sigframe {
 struct target_rt_frame_record {
 uint64_t fp;
 uint64_t lr;
-uint32_t tramp[2];
 };
 
 static void target_setup_general_frame(struct target_rt_sigframe *sf,
@@ -461,9 +460,9 @@ static void target_setup_frame(int usig, struct 
target_sigaction *ka,
 layout.total_size = MAX(layout.total_size,
 sizeof(struct target_rt_sigframe));
 
-/* Reserve space for the return code.  On a real system this would
- * be within the VDSO.  So, despite the name this is not a "real"
- * record within the frame.
+/*
+ * Reserve space for the standard frame unwind pair: fp, lr.
+ * Despite the name this is not a "real" record within the frame.
  */
 fr_ofs = layout.total_size;
 layout.total_size += sizeof(struct target_rt_frame_record);
@@ -496,15 +495,7 @@ static void target_setup_frame(int usig, struct 
target_sigaction *ka,
 if (ka->sa_flags & TARGET_SA_RESTORER) {
 return_addr = ka->sa_restorer;
 } else {
-/*
- * mov x8,#__NR_rt_sigreturn; svc #0
- * Since these are instructions they need to be put as little-endian
- * regardless of target default or current CPU endianness.
- */
-__put_user_e(0xd2801168, >tramp[0], le);
-__put_user_e(0xd401, >tramp[1], le);
-return_addr = frame_addr + fr_ofs
-+ offsetof(struct target_rt_frame_record, tramp);
+return_addr = default_rt_sigreturn;
 }
 env->xregs[0] = usig;
 env->xregs[29] = frame_addr + fr_ofs;
@@ -577,3 +568,20 @@ long do_sigreturn(CPUARMState *env)
 {
 return do_rt_sigreturn(env);
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 8, 0);
+assert(tramp != NULL);
+
+/*
+ * mov x8,#__NR_rt_sigreturn; svc #0
+ * Since these are instructions they need to be put as little-endian
+ * regardless of target default or current CPU endianness.
+ */
+__put_user_e(0xd2801168, &tramp[0], le);
+__put_user_e(0xd4000001, &tramp[1], le);
+
+default_rt_sigreturn = sigtramp_page;
+unlock_user(tramp, sigtramp_page, 8);
+}
-- 
2.25.1

[PATCH v4 00/25] linux-user: Move signal trampolines to new page

Changes for v4:
  * Drop nios2 changes -- I still haven't built a full toolchain.
  * Clean up arm changes.
  * Minor i386 pointer type changes.

Changes for v3:
  * Drop vdsos, reinstate setup_sigtramp for all targets.
  * Incorporate nios2 kuser page emulation, which contains
the sigtramp for that target.

Changes for v2:
  * Add vdsos for aarch64, arm, i386, riscv.
  * Drop setup_sigtramp for any target with a vdso.
  * Drop arm v1 signal support.
  * Simplify ppc encode_trampoline.


r~


Richard Henderson (25):
  linux-user: Add infrastructure for a signal trampoline page
  linux-user/aarch64: Implement setup_sigtramp
  linux-user/arm: Drop v1 signal frames
  linux-user/arm: Drop "_v2" from symbols in signal.c
  linux-user/arm: Implement setup_sigtramp
  linux-user/alpha: Implement setup_sigtramp
  linux-user/cris: Implement setup_sigtramp
  linux-user/hexagon: Implement setup_sigtramp
  linux-user/hppa: Document non-use of setup_sigtramp
  linux-user/i386: Implement setup_sigtramp
  linux-user/x86_64: Raise SIGSEGV if SA_RESTORER not set
  linux-user/m68k: Implement setup_sigtramp
  linux-user/microblaze: Implement setup_sigtramp
  linux-user/mips: Tidy install_sigtramp
  linux-user/mips: Implement setup_sigtramp
  linux-user/nios2: Document non-use of setup_sigtramp
  linux-user/openrisc: Implement setup_sigtramp
  linux-user/ppc: Simplify encode_trampoline
  linux-user/ppc: Implement setup_sigtramp
  linux-user/riscv: Implement setup_sigtramp
  linux-user/s390x: Implement setup_sigtramp
  linux-user/sh4: Implement setup_sigtramp
  linux-user/sparc: Implement setup_sigtramp
  linux-user/xtensa: Implement setup_sigtramp
  linux-user: Remove default for TARGET_ARCH_HAS_SIGTRAMP_PAGE

 linux-user/aarch64/target_signal.h|   2 +
 linux-user/alpha/target_signal.h  |   1 +
 linux-user/arm/target_signal.h|   2 +
 linux-user/cris/target_signal.h   |   2 +
 linux-user/hexagon/target_signal.h|   2 +
 linux-user/hppa/target_signal.h   |  14 +
 linux-user/i386/target_signal.h   |   2 +
 linux-user/m68k/target_signal.h   |   2 +
 linux-user/microblaze/target_signal.h |   2 +
 linux-user/mips/target_signal.h   |   1 +
 linux-user/mips64/target_signal.h |   2 +
 linux-user/nios2/target_signal.h  |   3 +
 linux-user/openrisc/target_signal.h   |   2 +
 linux-user/ppc/target_signal.h|   2 +
 linux-user/riscv/target_signal.h  |   2 +
 linux-user/s390x/target_signal.h  |   2 +
 linux-user/sh4/target_signal.h|   2 +
 linux-user/signal-common.h|   6 +
 linux-user/sparc/target_signal.h  |   4 +
 linux-user/x86_64/target_signal.h |   3 +
 linux-user/xtensa/target_signal.h |   2 +
 linux-user/aarch64/signal.c   |  34 +-
 linux-user/alpha/signal.c |  34 +-
 linux-user/arm/signal.c   | 583 +-
 linux-user/cris/signal.c  |  29 +-
 linux-user/elfload.c  |  14 +
 linux-user/hexagon/signal.c   |  19 +-
 linux-user/i386/signal.c  |  65 ++-
 linux-user/m68k/signal.c  |  47 +--
 linux-user/microblaze/signal.c|  24 +-
 linux-user/mips/signal.c  |  39 +-
 linux-user/openrisc/signal.c  |  22 +-
 linux-user/ppc/signal.c   |  40 +-
 linux-user/riscv/signal.c |  22 +-
 linux-user/s390x/signal.c |  24 +-
 linux-user/sh4/signal.c   |  40 +-
 linux-user/signal.c   |   3 +
 linux-user/sparc/signal.c |  40 +-
 linux-user/xtensa/signal.c|  56 ++-
 39 files changed, 603 insertions(+), 592 deletions(-)

-- 
2.25.1

[PATCH v4 05/25] linux-user/arm: Implement setup_sigtramp

Mirror what the kernel does in arch/arm/kernel/signal.h,
using the old sigframe struct in the rt sigframe struct.

Update the trampoline code to match the kernel: this uses
sp-relative accesses rather than pc-relative.

Copy the code into frame->retcode from the trampoline page.
This minimises the different cases wrt arm vs thumb vs fdpic.

Signed-off-by: Richard Henderson 
---
 linux-user/arm/target_signal.h |   2 +
 linux-user/arm/signal.c| 184 -
 2 files changed, 115 insertions(+), 71 deletions(-)

diff --git a/linux-user/arm/target_signal.h b/linux-user/arm/target_signal.h
index 0998dd6dfa..1e7fb0cecb 100644
--- a/linux-user/arm/target_signal.h
+++ b/linux-user/arm/target_signal.h
@@ -22,4 +22,6 @@ typedef struct target_sigaltstack {
 #include "../generic/signal.h"
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
+
 #endif /* ARM_TARGET_SIGNAL_H */
diff --git a/linux-user/arm/signal.c b/linux-user/arm/signal.c
index ed7d1d80bb..67a3d1428b 100644
--- a/linux-user/arm/signal.c
+++ b/linux-user/arm/signal.c
@@ -99,43 +99,21 @@ struct sigframe
 struct rt_sigframe
 {
 struct target_siginfo info;
-struct target_ucontext uc;
-abi_ulong retcode[4];
+struct sigframe sig;
 };
 
+static abi_ptr sigreturn_fdpic_tramp;
+
 /*
- * For ARM syscalls, we encode the syscall number into the instruction.
+ * Up to 3 words of 'retcode' in the sigframe are code,
+ * with retcode[3] being used by fdpic for the function descriptor.
+ * This code is not actually executed, but is retained for ABI compat.
+ *
+ * We will create a table of 8 retcode variants in the sigtramp page.
+ * Let each table entry use 3 words.
  */
-#define SWI_SYS_SIGRETURN   (0xef00|(TARGET_NR_sigreturn + 
ARM_SYSCALL_BASE))
-#define SWI_SYS_RT_SIGRETURN(0xef00|(TARGET_NR_rt_sigreturn + 
ARM_SYSCALL_BASE))
-
-/*
- * For Thumb syscalls, we pass the syscall number via r7.  We therefore
- * need two 16-bit instructions.
- */
-#define SWI_THUMB_SIGRETURN (0xdf00 << 16 | 0x2700 | (TARGET_NR_sigreturn))
-#define SWI_THUMB_RT_SIGRETURN  (0xdf00 << 16 | 0x2700 | 
(TARGET_NR_rt_sigreturn))
-
-static const abi_ulong retcodes[4] = {
-SWI_SYS_SIGRETURN,  SWI_THUMB_SIGRETURN,
-SWI_SYS_RT_SIGRETURN,   SWI_THUMB_RT_SIGRETURN
-};
-
-/*
- * Stub needed to make sure the FD register (r9) contains the right
- * value.
- */
-static const unsigned long sigreturn_fdpic_codes[3] = {
-0xe59fc004, /* ldr r12, [pc, #4] to read function descriptor */
-0xe59c9004, /* ldr r9, [r12, #4] to setup GOT */
-0xe59cf000  /* ldr pc, [r12] to jump into restorer */
-};
-
-static const unsigned long sigreturn_fdpic_thumb_codes[3] = {
-0xc008f8df, /* ldr r12, [pc, #8] to read function descriptor */
-0x9004f8dc, /* ldr r9, [r12, #4] to setup GOT */
-0xf000f8dc  /* ldr pc, [r12] to jump into restorer */
-};
+#define RETCODE_WORDS  3
+#define RETCODE_BYTES  (RETCODE_WORDS * 4)
 
 static inline int valid_user_regs(CPUARMState *regs)
 {
@@ -183,15 +161,15 @@ get_sigframe(struct target_sigaction *ka, CPUARMState 
*regs, int framesize)
 }
 
 static int
-setup_return(CPUARMState *env, struct target_sigaction *ka,
- abi_ulong *rc, abi_ulong frame_addr, int usig, abi_ulong rc_addr)
+setup_return(CPUARMState *env, struct target_sigaction *ka, int usig,
+ struct sigframe *frame, abi_ulong sp_addr)
 {
 abi_ulong handler = 0;
 abi_ulong handler_fdpic_GOT = 0;
 abi_ulong retcode;
-
-int thumb;
+int thumb, retcode_idx;
 int is_fdpic = info_is_fdpic(((TaskState *)thread_cpu->opaque)->info);
+bool copy_retcode;
 
 if (is_fdpic) {
 /* In FDPIC mode, ka->_sa_handler points to a function
@@ -208,6 +186,7 @@ setup_return(CPUARMState *env, struct target_sigaction *ka,
 }
 
 thumb = handler & 1;
+retcode_idx = thumb + (ka->sa_flags & TARGET_SA_SIGINFO ? 2 : 0);
 
 uint32_t cpsr = cpsr_read(env);
 
@@ -225,44 +204,34 @@ setup_return(CPUARMState *env, struct target_sigaction 
*ka,
 
 if (ka->sa_flags & TARGET_SA_RESTORER) {
 if (is_fdpic) {
-/* For FDPIC we ensure that the restorer is called with a
- * correct r9 value.  For that we need to write code on
- * the stack that sets r9 and jumps back to restorer
- * value.
- */
-if (thumb) {
-__put_user(sigreturn_fdpic_thumb_codes[0], rc);
-__put_user(sigreturn_fdpic_thumb_codes[1], rc + 1);
-__put_user(sigreturn_fdpic_thumb_codes[2], rc + 2);
-__put_user((abi_ulong)ka->sa_restorer, rc + 3);
-} else {
-__put_user(sigreturn_fdpic_codes[0], rc);
-__put_user(sigreturn_fdpic_codes[1], rc + 1);
-__put_user(sigreturn_fdpic_codes[2], rc + 2);
-__put_user((abi_ulong)ka->sa_restorer, rc + 3);
-}
-
-retcode =

[PATCH v4 03/25] linux-user/arm: Drop v1 signal frames

Version 2 signal frames are used from 2.6.12 and since cbc14e6f286,
we have set UNAME_MINIMUM_RELEASE to 2.6.32.

Suggested-by: Peter Maydell 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/arm/signal.c | 220 +---
 1 file changed, 4 insertions(+), 216 deletions(-)

diff --git a/linux-user/arm/signal.c b/linux-user/arm/signal.c
index ed144f9455..d0940bab47 100644
--- a/linux-user/arm/signal.c
+++ b/linux-user/arm/signal.c
@@ -46,14 +46,6 @@ struct target_sigcontext {
 abi_ulong fault_address;
 };
 
-struct target_ucontext_v1 {
-abi_ulong tuc_flags;
-abi_ulong tuc_link;
-target_stack_t tuc_stack;
-struct target_sigcontext tuc_mcontext;
-target_sigset_t  tuc_sigmask;   /* mask last for extensibility */
-};
-
 struct target_ucontext_v2 {
 abi_ulong tuc_flags;
 abi_ulong tuc_link;
@@ -98,28 +90,12 @@ struct target_iwmmxt_sigframe {
 #define TARGET_VFP_MAGIC 0x56465001
 #define TARGET_IWMMXT_MAGIC 0x12ef842a
 
-struct sigframe_v1
-{
-struct target_sigcontext sc;
-abi_ulong extramask[TARGET_NSIG_WORDS-1];
-abi_ulong retcode[4];
-};
-
 struct sigframe_v2
 {
 struct target_ucontext_v2 uc;
 abi_ulong retcode[4];
 };
 
-struct rt_sigframe_v1
-{
-abi_ulong pinfo;
-abi_ulong puc;
-struct target_siginfo info;
-struct target_ucontext_v1 uc;
-abi_ulong retcode[4];
-};
-
 struct rt_sigframe_v2
 {
 struct target_siginfo info;
@@ -363,37 +339,6 @@ static void setup_sigframe_v2(struct target_ucontext_v2 
*uc,
 }
 }
 
-/* compare linux/arch/arm/kernel/signal.c:setup_frame() */
-static void setup_frame_v1(int usig, struct target_sigaction *ka,
-   target_sigset_t *set, CPUARMState *regs)
-{
-struct sigframe_v1 *frame;
-abi_ulong frame_addr = get_sigframe(ka, regs, sizeof(*frame));
-int i;
-
-trace_user_setup_frame(regs, frame_addr);
-if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
-goto sigsegv;
-}
-
-setup_sigcontext(>sc, regs, set->sig[0]);
-
-for(i = 1; i < TARGET_NSIG_WORDS; i++) {
-__put_user(set->sig[i], >extramask[i - 1]);
-}
-
-if (setup_return(regs, ka, frame->retcode, frame_addr, usig,
- frame_addr + offsetof(struct sigframe_v1, retcode))) {
-goto sigsegv;
-}
-
-unlock_user_struct(frame, frame_addr, 1);
-return;
-sigsegv:
-unlock_user_struct(frame, frame_addr, 1);
-force_sigsegv(usig);
-}
-
 static void setup_frame_v2(int usig, struct target_sigaction *ka,
target_sigset_t *set, CPUARMState *regs)
 {
@@ -422,60 +367,7 @@ sigsegv:
 void setup_frame(int usig, struct target_sigaction *ka,
  target_sigset_t *set, CPUARMState *regs)
 {
-if (get_osversion() >= 0x020612) {
-setup_frame_v2(usig, ka, set, regs);
-} else {
-setup_frame_v1(usig, ka, set, regs);
-}
-}
-
-/* compare linux/arch/arm/kernel/signal.c:setup_rt_frame() */
-static void setup_rt_frame_v1(int usig, struct target_sigaction *ka,
-  target_siginfo_t *info,
-  target_sigset_t *set, CPUARMState *env)
-{
-struct rt_sigframe_v1 *frame;
-abi_ulong frame_addr = get_sigframe(ka, env, sizeof(*frame));
-struct target_sigaltstack stack;
-int i;
-abi_ulong info_addr, uc_addr;
-
-trace_user_setup_rt_frame(env, frame_addr);
-if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
-goto sigsegv;
-}
-
-info_addr = frame_addr + offsetof(struct rt_sigframe_v1, info);
-__put_user(info_addr, >pinfo);
-uc_addr = frame_addr + offsetof(struct rt_sigframe_v1, uc);
-__put_user(uc_addr, >puc);
-tswap_siginfo(>info, info);
-
-/* Clear all the bits of the ucontext we don't use.  */
-memset(>uc, 0, offsetof(struct target_ucontext_v1, tuc_mcontext));
-
-memset(, 0, sizeof(stack));
-target_save_altstack(, env);
-memcpy(>uc.tuc_stack, , sizeof(stack));
-
-setup_sigcontext(>uc.tuc_mcontext, env, set->sig[0]);
-for(i = 0; i < TARGET_NSIG_WORDS; i++) {
-__put_user(set->sig[i], >uc.tuc_sigmask.sig[i]);
-}
-
-if (setup_return(env, ka, frame->retcode, frame_addr, usig,
- frame_addr + offsetof(struct rt_sigframe_v1, retcode))) {
-goto sigsegv;
-}
-
-env->regs[1] = info_addr;
-env->regs[2] = uc_addr;
-
-unlock_user_struct(frame, frame_addr, 1);
-return;
-sigsegv:
-unlock_user_struct(frame, frame_addr, 1);
-force_sigsegv(usig);
+setup_frame_v2(usig, ka, set, regs);
 }
 
 static void setup_rt_frame_v2(int usig, struct target_sigaction *ka,
@@ -516,11 +408,7 @@ void setup_rt_frame(int usig, struct target_sigaction *ka,
 target_siginfo_t *info,
 target_sigset_t *set, CPUARMState *env)
 {
-if (get_osversion() >= 0x020612) {
-setup_rt_frame_v2(usig, ka, info, set,

[PATCH v4 04/25] linux-user/arm: Drop "_v2" from symbols in signal.c

Since we no longer support "v1", there's no need to distinguish "v2".

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/arm/signal.c | 155 +---
 1 file changed, 65 insertions(+), 90 deletions(-)

diff --git a/linux-user/arm/signal.c b/linux-user/arm/signal.c
index d0940bab47..ed7d1d80bb 100644
--- a/linux-user/arm/signal.c
+++ b/linux-user/arm/signal.c
@@ -46,7 +46,7 @@ struct target_sigcontext {
 abi_ulong fault_address;
 };
 
-struct target_ucontext_v2 {
+struct target_ucontext {
 abi_ulong tuc_flags;
 abi_ulong tuc_link;
 target_stack_t tuc_stack;
@@ -90,16 +90,16 @@ struct target_iwmmxt_sigframe {
 #define TARGET_VFP_MAGIC 0x56465001
 #define TARGET_IWMMXT_MAGIC 0x12ef842a
 
-struct sigframe_v2
+struct sigframe
 {
-struct target_ucontext_v2 uc;
+struct target_ucontext uc;
 abi_ulong retcode[4];
 };
 
-struct rt_sigframe_v2
+struct rt_sigframe
 {
 struct target_siginfo info;
-struct target_ucontext_v2 uc;
+struct target_ucontext uc;
 abi_ulong retcode[4];
 };
 
@@ -270,7 +270,7 @@ setup_return(CPUARMState *env, struct target_sigaction *ka,
 return 0;
 }
 
-static abi_ulong *setup_sigframe_v2_vfp(abi_ulong *regspace, CPUARMState *env)
+static abi_ulong *setup_sigframe_vfp(abi_ulong *regspace, CPUARMState *env)
 {
 int i;
 struct target_vfp_sigframe *vfpframe;
@@ -287,8 +287,7 @@ static abi_ulong *setup_sigframe_v2_vfp(abi_ulong 
*regspace, CPUARMState *env)
 return (abi_ulong*)(vfpframe+1);
 }
 
-static abi_ulong *setup_sigframe_v2_iwmmxt(abi_ulong *regspace,
-   CPUARMState *env)
+static abi_ulong *setup_sigframe_iwmmxt(abi_ulong *regspace, CPUARMState *env)
 {
 int i;
 struct target_iwmmxt_sigframe *iwmmxtframe;
@@ -307,15 +306,15 @@ static abi_ulong *setup_sigframe_v2_iwmmxt(abi_ulong 
*regspace,
 return (abi_ulong*)(iwmmxtframe+1);
 }
 
-static void setup_sigframe_v2(struct target_ucontext_v2 *uc,
-  target_sigset_t *set, CPUARMState *env)
+static void setup_sigframe(struct target_ucontext *uc,
+   target_sigset_t *set, CPUARMState *env)
 {
 struct target_sigaltstack stack;
 int i;
 abi_ulong *regspace;
 
 /* Clear all the bits of the ucontext we don't use.  */
-memset(uc, 0, offsetof(struct target_ucontext_v2, tuc_mcontext));
+memset(uc, 0, offsetof(struct target_ucontext, tuc_mcontext));
 
 memset(, 0, sizeof(stack));
 target_save_altstack(, env);
@@ -325,10 +324,10 @@ static void setup_sigframe_v2(struct target_ucontext_v2 
*uc,
 /* Save coprocessor signal frame.  */
 regspace = uc->tuc_regspace;
 if (cpu_isar_feature(aa32_vfp_simd, env_archcpu(env))) {
-regspace = setup_sigframe_v2_vfp(regspace, env);
+regspace = setup_sigframe_vfp(regspace, env);
 }
 if (arm_feature(env, ARM_FEATURE_IWMMXT)) {
-regspace = setup_sigframe_v2_iwmmxt(regspace, env);
+regspace = setup_sigframe_iwmmxt(regspace, env);
 }
 
 /* Write terminating magic word */
@@ -339,10 +338,10 @@ static void setup_sigframe_v2(struct target_ucontext_v2 
*uc,
 }
 }
 
-static void setup_frame_v2(int usig, struct target_sigaction *ka,
-   target_sigset_t *set, CPUARMState *regs)
+void setup_frame(int usig, struct target_sigaction *ka,
+ target_sigset_t *set, CPUARMState *regs)
 {
-struct sigframe_v2 *frame;
+struct sigframe *frame;
 abi_ulong frame_addr = get_sigframe(ka, regs, sizeof(*frame));
 
 trace_user_setup_frame(regs, frame_addr);
@@ -350,10 +349,10 @@ static void setup_frame_v2(int usig, struct 
target_sigaction *ka,
 goto sigsegv;
 }
 
-setup_sigframe_v2(>uc, set, regs);
+setup_sigframe(>uc, set, regs);
 
 if (setup_return(regs, ka, frame->retcode, frame_addr, usig,
- frame_addr + offsetof(struct sigframe_v2, retcode))) {
+ frame_addr + offsetof(struct sigframe, retcode))) {
 goto sigsegv;
 }
 
@@ -364,51 +363,38 @@ sigsegv:
 force_sigsegv(usig);
 }
 
-void setup_frame(int usig, struct target_sigaction *ka,
- target_sigset_t *set, CPUARMState *regs)
-{
-setup_frame_v2(usig, ka, set, regs);
-}
-
-static void setup_rt_frame_v2(int usig, struct target_sigaction *ka,
-  target_siginfo_t *info,
-  target_sigset_t *set, CPUARMState *env)
-{
-struct rt_sigframe_v2 *frame;
-abi_ulong frame_addr = get_sigframe(ka, env, sizeof(*frame));
-abi_ulong info_addr, uc_addr;
-
-trace_user_setup_rt_frame(env, frame_addr);
-if (!lock_user_struct(VERIFY_WRITE, frame, frame_addr, 0)) {
-goto sigsegv;
-}
-
-info_addr = frame_addr + offsetof(struct rt_sigframe_v2, info);
-uc_addr = frame_addr + offsetof(struct rt_sigframe_v2, uc);
-tswap_siginfo(>info, info);
-
-

[PATCH v4 06/25] linux-user/alpha: Implement setup_sigtramp

Create and record the two signal trampolines.
Use them when the guest does not use ka_restorer.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 linux-user/alpha/target_signal.h |  1 +
 linux-user/alpha/signal.c| 34 +++-
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/linux-user/alpha/target_signal.h b/linux-user/alpha/target_signal.h
index 250642913e..0b6a39de65 100644
--- a/linux-user/alpha/target_signal.h
+++ b/linux-user/alpha/target_signal.h
@@ -93,6 +93,7 @@ typedef struct target_sigaltstack {
 
 #define TARGET_ARCH_HAS_SETUP_FRAME
 #define TARGET_ARCH_HAS_KA_RESTORER
+#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 1
 
 /* bit-flags */
 #define TARGET_SS_AUTODISARM (1U << 31) /* disable sas during sighandling */
diff --git a/linux-user/alpha/signal.c b/linux-user/alpha/signal.c
index 3a820f616b..bbe3dd175a 100644
--- a/linux-user/alpha/signal.c
+++ b/linux-user/alpha/signal.c
@@ -55,13 +55,11 @@ struct target_ucontext {
 
 struct target_sigframe {
 struct target_sigcontext sc;
-unsigned int retcode[3];
 };
 
 struct target_rt_sigframe {
 target_siginfo_t info;
 struct target_ucontext uc;
-unsigned int retcode[3];
 };
 
 #define INSN_MOV_R30_R160x47fe0410
@@ -142,12 +140,7 @@ void setup_frame(int sig, struct target_sigaction *ka,
 if (ka->ka_restorer) {
 r26 = ka->ka_restorer;
 } else {
-__put_user(INSN_MOV_R30_R16, >retcode[0]);
-__put_user(INSN_LDI_R0 + TARGET_NR_sigreturn,
-   >retcode[1]);
-__put_user(INSN_CALLSYS, >retcode[2]);
-/* imb() */
-r26 = frame_addr + offsetof(struct target_sigframe, retcode);
+r26 = default_sigreturn;
 }
 
 unlock_user_struct(frame, frame_addr, 1);
@@ -196,12 +189,7 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 if (ka->ka_restorer) {
 r26 = ka->ka_restorer;
 } else {
-__put_user(INSN_MOV_R30_R16, >retcode[0]);
-__put_user(INSN_LDI_R0 + TARGET_NR_rt_sigreturn,
-   >retcode[1]);
-__put_user(INSN_CALLSYS, >retcode[2]);
-/* imb(); */
-r26 = frame_addr + offsetof(struct target_rt_sigframe, retcode);
+r26 = default_rt_sigreturn;
 }
 
 if (err) {
@@ -269,3 +257,21 @@ badframe:
 force_sig(TARGET_SIGSEGV);
 return -TARGET_QEMU_ESIGRETURN;
 }
+
+void setup_sigtramp(abi_ulong sigtramp_page)
+{
+uint32_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 6 * 4, 0);
+assert(tramp != NULL);
+
+default_sigreturn = sigtramp_page;
+__put_user(INSN_MOV_R30_R16, [0]);
+__put_user(INSN_LDI_R0 + TARGET_NR_sigreturn, [1]);
+__put_user(INSN_CALLSYS, [2]);
+
+default_rt_sigreturn = sigtramp_page + 3 * 4;
+__put_user(INSN_MOV_R30_R16, [3]);
+__put_user(INSN_LDI_R0 + TARGET_NR_rt_sigreturn, [4]);
+__put_user(INSN_CALLSYS, [5]);
+
+unlock_user(tramp, sigtramp_page, 6 * 4);
+}
-- 
2.25.1

Re: [PATCH v3 10/27] linux-user/i386: Implement setup_sigtramp


On 9/24/21 2:01 PM, Philippe Mathieu-Daudé wrote:

+static void install_sigtramp(void *tramp)
+{
+    /* This is popl %eax ; movl $syscall,%eax ; int $0x80 */
+    __put_user(0xb858, (uint16_t *)(tramp + 0));
+    __put_user(TARGET_NR_sigreturn, (int *)(tramp + 2));


I know this is mostly code movement, but using uint32_t would
make it easier to read.


I'll give you int32_t here, since the value is signed.


+    __put_user(TARGET_NR_rt_sigreturn, (int *)(tramp + 1));


and uint32_t.


Likewise.


+    uint16_t *tramp = lock_user(VERIFY_WRITE, sigtramp_page, 2 * 8, 0);


Shouldn't this be 8 + 7?


Does it really matter if we write 15 or 16 bytes of this page?


r~

Re: [PATCH v10 11/14] machine: Make smp_parse generic enough for all arches

2021-09-27 Thread wangyanan (Y)




On 2021/9/27 18:12, Daniel P. Berrangé wrote:

On Sun, Sep 26, 2021 at 04:45:38PM +0800, Yanan Wang wrote:

Currently the only difference between smp_parse and pc_smp_parse
is the support of dies parameter and the related error reporting.
With some arch compat variables like "bool dies_supported", we can
make smp_parse generic enough for all arches and the PC specific
one can be removed.

Making smp_parse() generic enough can reduce code duplication and
ease the code maintenance, and also allows extending the topology
with more arch specific members (e.g., clusters) in the future.

Suggested-by: Andrew Jones 
Signed-off-by: Yanan Wang 
Reviewed-by: Andrew Jones 
---
  hw/core/machine.c   | 110 
  hw/i386/pc.c|  84 +
  include/hw/boards.h |   9 
  3 files changed, 100 insertions(+), 103 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index a21fcd7700..4b5c943f8e 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -15,6 +15,7 @@
  #include "qapi/qmp/qerror.h"
  #include "sysemu/replay.h"
  #include "qemu/units.h"
+#include "qemu/cutils.h"
  #include "hw/boards.h"
  #include "hw/loader.h"
  #include "qapi/error.h"
@@ -746,20 +747,87 @@ void machine_set_cpu_numa_node(MachineState *machine,
  }
  }
  
+static char *cpu_topology_hierarchy(MachineState *ms)

+{
+MachineClass *mc = MACHINE_GET_CLASS(ms);
+SMPCompatProps *smp_props = >smp_props;
+char topo_msg[256] = "";
+
+/*
+ * Topology members should be ordered from the largest to the smallest.
+ * Concept of sockets/cores/threads is supported by default and will be
+ * reported in the hierarchy. Unsupported members will not be reported.
+ */
+g_autofree char *sockets_msg = g_strdup_printf(
+" * sockets (%u)", ms->smp.sockets);
+pstrcat(topo_msg, sizeof(topo_msg), sockets_msg);
+
+if (smp_props->dies_supported) {
+g_autofree char *dies_msg = g_strdup_printf(
+" * dies (%u)", ms->smp.dies);
+pstrcat(topo_msg, sizeof(topo_msg), dies_msg);
+}
+
+g_autofree char *cores_msg = g_strdup_printf(
+" * cores (%u)", ms->smp.cores);
+pstrcat(topo_msg, sizeof(topo_msg), cores_msg);
+
+g_autofree char *threads_msg = g_strdup_printf(
+" * threads (%u)", ms->smp.threads);
+pstrcat(topo_msg, sizeof(topo_msg), threads_msg);
+
+return g_strdup_printf("%s", topo_msg + 3);
+}

Mixing g_strdup_printf + pstrcat + fixed buffer is quite
unpleasant. This method is begging to use 'GString' APIs
for formatting.


Indeed, GString seems to be the ideal choice for this. It's
my bad not thinking about usage of it at the beginning.
I will update this patch in v11. Thanks for the suggestion.

Thanks,
Yanan

Re: Fw: [EXTERNAL] Re: [RFC PATCH 00/13] Add support for Mirror VM.

2021-09-27 Thread Ashish Kalra

On Tue, Aug 24, 2021 at 06:00:51PM -0400, Tobin Feldman-Fitzthum wrote:
> On Mon, Aug 16, 2021 at 04:15:46PM +0200, Paolo Bonzini wrote:
> 
> > Hi,
> >
> > first of all, thanks for posting this work and starting the discussion.
> >
> > However, I am not sure if the in-guest migration helper vCPUs should use
> > the existing KVM support code.  For example, they probably can just
> > always work with host CPUID (copied directly from
> > KVM_GET_SUPPORTED_CPUID), and they do not need to interface with QEMU's
> > MMIO logic.  They would just sit on a "HLT" instruction and communicate
> > with the main migration loop using some kind of standardized ring buffer
> > protocol; the migration loop then executes KVM_RUN in order to start the
> > processing of pages, and expects a KVM_EXIT_HLT when the VM has nothing
> > to do or requires processing on the host.
> > The migration helper can then also use its own address space, for
> > example operating directly on ram_addr_t values with the helper running
> > at very high virtual addresses.  Migration code can use a
> > RAMBlockNotifier to invoke KVM_SET_USER_MEMORY_REGION on the mirror VM
> > (and never enable dirty memory logging on the mirror VM, too, which has
> > better performance).
> >
> > With this implementation, the number of mirror vCPUs does not even have
> > to be indicated on the command line.  The VM and its vCPUs can simply be
> > created when migration starts.  In the SEV-ES case, the guest can even
> > provide the VMSA that starts the migration helper.
> 

This also depends on the mirror VM and its vCPUs being launched and
measured independently on the target side. If the MH VM is measured on the
source and then migrated to the target then it cannot be simply created
only when the migration starts, in that case it is launched, measured at
the source, migrated to the target and it remains suspended till the
migration code activates it.

Thanks,
Ashish

> It might make sense to tweak the mirror support code so that it is more
> closely tied to migration and the migration handler. On the other hand,
> the usage of a mirror VM might be more general than just migration. In
> some ways the mirror offers similar functionality to the VMPL in SNP,
> providing a way to run non-workload code inside the enclave. This
> potentially has uses beyond migration. If this is the case, maybe we do
> want to keep the mirror more general.
> 
> It's also worth noting that the SMP interface that Ashish is using to
> specify the mirror might come in handy if we ever want to have more than
> one vCPU in the mirror. For instance we might want to use multiple MH
> vCPUs to increase throughput.
> 
> -Tobin
> 
> > The disadvantage is that, as you point out, in the future some of the
> > infrastructure you introduce might be useful for VMPL0 operation on
> > SEV-SNP.  My proposal above might require some code duplication.
> > However, it might even be that VMPL0 operation works best with a model
> > more similar to my sketch of the migration helper; it's really too early
> > to say.
> >
> > Paolo

Re: [PATCH v3 02/15] target/ppc: add user write access control for PMU SPRs

2021-09-27 Thread Daniel Henrique Barboza





On 9/27/21 02:08, David Gibson wrote:

On Thu, Sep 23, 2021 at 11:39:14AM -0300, Daniel Henrique Barboza wrote:



On 9/6/21 22:38, David Gibson wrote:

On Fri, Sep 03, 2021 at 05:31:03PM -0300, Daniel Henrique Barboza wrote:

The PMU needs to enable writing of its uregs to userspace, otherwise
Perf applications will not be able to set up the counters correctly. This
patch enables user space writing of all PMU uregs.

MMCR0 is a special case because its userspace writing access is controlled
by MMCR0_PMCC bits. There are 4 configurations available (0b00, 0b01,
0b10 and 0b11) but for our purposes here we're handling only
MMCR0_PMCC = 0b00. In this case, if userspace tries to write MMCR0, a
hypervisor emulation assistance interrupt occurs.

This is being done by adding HFLAGS_PMCCCLEAR to hflags. This flag
indicates if MMCR0_PMCC is cleared (0b00), and a new 'pmcc_clear' flag in
DisasContext allows us to use it in spr_write_MMCR0_ureg().

Signed-off-by: Daniel Henrique Barboza 
---
   target/ppc/cpu.h |  1 +
   target/ppc/cpu_init.c| 18 +++---
   target/ppc/helper_regs.c |  3 +++
   target/ppc/spr_tcg.h |  3 ++-
   target/ppc/translate.c   | 53 +++-
   5 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index f68bb8d8aa..8dfbb62022 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -616,6 +616,7 @@ enum {
   HFLAGS_SE = 10,  /* MSR_SE -- from elsewhere on embedded ppc */
   HFLAGS_FP = 13,  /* MSR_FP */
   HFLAGS_PR = 14,  /* MSR_PR */
+HFLAGS_PMCCCLEAR = 15, /* PMU MMCR0 PMCC equal to 0b00 */
   HFLAGS_VSX = 23, /* MSR_VSX if cpu has VSX */
   HFLAGS_VR = 25,  /* MSR_VR if cpu has VRE */
diff --git a/target/ppc/cpu_init.c b/target/ppc/cpu_init.c
index 9efc6c2d87..bb5ea04c61 100644
--- a/target/ppc/cpu_init.c
+++ b/target/ppc/cpu_init.c
@@ -6867,7 +6867,7 @@ static void register_book3s_pmu_sup_sprs(CPUPPCState *env)
   static void register_book3s_pmu_user_sprs(CPUPPCState *env)
   {
   spr_register(env, SPR_POWER_UMMCR0, "UMMCR0",
- _read_MMCR0_ureg, SPR_NOACCESS,
+ _read_MMCR0_ureg, _write_MMCR0_ureg,
_read_ureg, _write_ureg,
0x);
   spr_register(env, SPR_POWER_UMMCR1, "UMMCR1",
@@ -6875,31 +6875,31 @@ static void register_book3s_pmu_user_sprs(CPUPPCState 
*env)
_read_ureg, _write_ureg,
0x);
   spr_register(env, SPR_POWER_UMMCRA, "UMMCRA",
- _read_ureg, SPR_NOACCESS,
+ _read_ureg, _write_ureg,
_read_ureg, _write_ureg,
0x);
   spr_register(env, SPR_POWER_UPMC1, "UPMC1",
- _read_ureg, SPR_NOACCESS,
+ _read_ureg, _write_ureg,


Surely this can't be right.  AFAICT spr_write_ureg() will
unconditionally allow full userspace write access.  That can't be
right - otherwise the OS could never safely use the PMU for itself.


My assumption here was that the user mode SPRs (UMMCR* and UPMC*) were created 
to
allow userspace read/write of PMU regs, while the regular regs (MMCR* and PMC*)
are the supermode privileged SPRs that can't be written by userspace. At least 
this
is my understanding from reading commit fd51ff6328e3d98158 that introduced these
userspace PMC regs.


Sure, but my point is that these registers are only userspace
accessible under certain conditions, IIUC.  spr_write_ureg() doesn't
test for those conditions, so it will *always* allow write access.



Got it.

I guess I'll end up biting the bullet and exposing both PMCC bits and adding
proper read/write access controls for the callbacks we need. This is somewhat
out of scope of my original goal with this series, but I guess we'll all be better
off by doing it right now.

I'll add all the read/write ureg functions I'll need in the first patches (the 
PMC
write callback functions are on the patch 14, for instance). That will, 
hopefully,
make it easier to review the rest of the series by going through all the 
access
control and read/write callbacks early on.

Thanks,


Daniel




The reason why these are marked as SPR_NOACCESS is because we didn't bother
writing into them from userspace because we had no PMU logic to work
with.


[snip]

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index b2ead144d1..0babde3131 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -175,6 +175,7 @@ struct DisasContext {
   bool spe_enabled;
   bool tm_enabled;
   bool gtse;
+bool pmcc_clear;
   ppc_spr_t *spr_cb; /* Needed to check rights for mfspr/mtspr */
   int singlestep_enabled;
   uint32_t flags;
@@ -561,7 +562,56 @@ void spr_write_ureg(DisasContext *ctx, int sprn, int gprn)
   {
   gen_store_spr(sprn + 0x10, cpu_gpr[gprn]);
   }
-#endif
+
+void spr_write_MMCR0_ureg(DisasContext *ctx, int sprn, int gprn)



Could you

[PATCH v4 1/3] vhost-user-rng: Add vhost-user-rng implementation

Introduce a random number generator (RNG) backend that communicates
with a vhost-user server to retrieve entropy.  That way other VMMs
that comply with the vhost-user protocol can use the same vhost-user
daemon without having to write yet another RNG driver.

Reviewed-by: Alex Bennée 
Signed-off-by: Mathieu Poirier 
---
 hw/virtio/Kconfig  |   5 +
 hw/virtio/meson.build  |   1 +
 hw/virtio/vhost-user-rng.c | 289 +
 include/hw/virtio/vhost-user-rng.h |  33 
 4 files changed, 328 insertions(+)
 create mode 100644 hw/virtio/vhost-user-rng.c
 create mode 100644 include/hw/virtio/vhost-user-rng.h

diff --git a/hw/virtio/Kconfig b/hw/virtio/Kconfig
index 35ab45e2095c..c144d42f9bd0 100644
--- a/hw/virtio/Kconfig
+++ b/hw/virtio/Kconfig
@@ -63,3 +63,8 @@ config VHOST_USER_I2C
 bool
 default y
 depends on VIRTIO && VHOST_USER
+
+config VHOST_USER_RNG
+bool
+default y
+depends on VIRTIO && VHOST_USER
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index bc352a600911..ae6b2cde1068 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -27,6 +27,7 @@ virtio_ss.add(when: 'CONFIG_VIRTIO_IOMMU', if_true: 
files('virtio-iommu.c'))
 virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: files('virtio-mem.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: 
files('vhost-user-i2c.c'))
 virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_I2C'], if_true: 
files('vhost-user-i2c-pci.c'))
+virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: 
files('vhost-user-rng.c'))
 
 virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: 
files('vhost-vsock-pci.c'))
diff --git a/hw/virtio/vhost-user-rng.c b/hw/virtio/vhost-user-rng.c
new file mode 100644
index ..209ee5bf9acd
--- /dev/null
+++ b/hw/virtio/vhost-user-rng.c
@@ -0,0 +1,289 @@
+/*
+ * Vhost-user RNG virtio device
+ *
+ * Copyright (c) 2021 Mathieu Poirier 
+ *
+ * Implementation seriously tailored on vhost-user-i2c.c
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/vhost-user-rng.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/virtio_ids.h"
+
+static void vu_rng_start(VirtIODevice *vdev)
+{
+VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+int ret;
+int i;
+
+if (!k->set_guest_notifiers) {
+error_report("binding does not support guest notifiers");
+return;
+}
+
+ret = vhost_dev_enable_notifiers(>vhost_dev, vdev);
+if (ret < 0) {
+error_report("Error enabling host notifiers: %d", -ret);
+return;
+}
+
+ret = k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, true);
+if (ret < 0) {
+error_report("Error binding guest notifier: %d", -ret);
+goto err_host_notifiers;
+}
+
+rng->vhost_dev.acked_features = vdev->guest_features;
+ret = vhost_dev_start(>vhost_dev, vdev);
+if (ret < 0) {
+error_report("Error starting vhost-user-rng: %d", -ret);
+goto err_guest_notifiers;
+}
+
+/*
+ * guest_notifier_mask/pending not used yet, so just unmask
+ * everything here. virtio-pci will do the right thing by
+ * enabling/disabling irqfd.
+ */
+for (i = 0; i < rng->vhost_dev.nvqs; i++) {
+vhost_virtqueue_mask(>vhost_dev, vdev, i, false);
+}
+
+return;
+
+err_guest_notifiers:
+k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, false);
+err_host_notifiers:
+vhost_dev_disable_notifiers(>vhost_dev, vdev);
+}
+
+static void vu_rng_stop(VirtIODevice *vdev)
+{
+VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+int ret;
+
+if (!k->set_guest_notifiers) {
+return;
+}
+
+vhost_dev_stop(>vhost_dev, vdev);
+
+ret = k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, false);
+if (ret < 0) {
+error_report("vhost guest notifier cleanup failed: %d", ret);
+return;
+}
+
+vhost_dev_disable_notifiers(>vhost_dev, vdev);
+}
+
+static void vu_rng_set_status(VirtIODevice *vdev, uint8_t status)
+{
+VHostUserRNG *rng = VHOST_USER_RNG(vdev);
+bool should_start = status & VIRTIO_CONFIG_S_DRIVER_OK;
+
+if (!vdev->vm_running) {
+should_start = false;
+}
+
+if (rng->vhost_dev.started == should_start) {
+return;
+}
+
+if (should_start) {
+vu_rng_start(vdev);
+} else {
+vu_rng_stop(vdev);
+}
+}
+
+static uint64_t vu_rng_get_features(VirtIODevice *vdev,
+uint64_t requested_features, Error **errp)
+{
+/* No feature bits used yet */
+

[PATCH v4 0/3] virtio: Add vhost-user based RNG

This set implements a random number generator (RNG) device that follows
the vhost-user protocol. 

The main difference from V3 is the absence of the vhost-user daemon
implementation.  It was dropped to favour the rust implementation
currently being considered under the vhost-device crate[1] of the
rust-vmm repository[1].

Applies cleanly to git://git.qemu.org/qemu.git master(de8ed1055c2c).

Thanks,
Mathieu

[1]. https://github.com/rust-vmm/vhost-device/pull/29

Changes for V4:
1. Addressed merge conflicts
2. Dropped error path after call to g_new0()
3. Changed #define in vhost-user-rng-pci.c 
4. Dropped vhost-user-daemon implementation
5. Modified documentation to reflect the above (4)
6. Added Alex's RB tags.

Mathieu Poirier (3):
  vhost-user-rng: Add vhost-user-rng implementation
  vhost-user-rng-pci: Add vhost-user-rng-pci implementation
  docs: Add documentation for vhost based RNG implementation

 docs/system/devices/vhost-user-rng.rst |  39 
 hw/virtio/Kconfig  |   5 +
 hw/virtio/meson.build  |   2 +
 hw/virtio/vhost-user-rng-pci.c |  79 +++
 hw/virtio/vhost-user-rng.c | 289 +
 include/hw/virtio/vhost-user-rng.h |  33 +++
 6 files changed, 447 insertions(+)
 create mode 100644 docs/system/devices/vhost-user-rng.rst
 create mode 100644 hw/virtio/vhost-user-rng-pci.c
 create mode 100644 hw/virtio/vhost-user-rng.c
 create mode 100644 include/hw/virtio/vhost-user-rng.h

-- 
2.25.1

[PULL 20/20] nbd/server: Add --selinux-label option

From: "Richard W.M. Jones" 

Under SELinux, Unix domain sockets have two labels.  One is on the
disk and can be set with commands such as chcon(1).  There is a
different label stored in memory (called the process label).  This can
only be set by the process creating the socket.  When using SELinux +
SVirt and wanting qemu to be able to connect to a qemu-nbd instance,
you must set both labels correctly first.

For qemu-nbd the options to set the second label are awkward.  You can
create the socket in a wrapper program and then exec into qemu-nbd.
Or you could try something with LD_PRELOAD.

This commit adds the ability to set the label straightforwardly on the
command line, via the new --selinux-label flag.  (The name of the flag
is the same as the equivalent nbdkit option.)

A worked example showing how to use the new option can be found in
this bug: https://bugzilla.redhat.com/show_bug.cgi?id=1984938

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1984938
Signed-off-by: Richard W.M. Jones 
Message-Id: <20210723103303.1731437-2-rjo...@redhat.com>
Reviewed-by: Daniel P. Berrangé 
[eblake: Fail if option is used when not compiled in]
Signed-off-by: Eric Blake 
---
 configure |  8 +++-
 meson.build   | 10 -
 qemu-nbd.c| 39 +++
 meson_options.txt |  3 ++
 tests/docker/dockerfiles/centos8.docker   |  1 +
 tests/docker/dockerfiles/fedora.docker|  1 +
 tests/docker/dockerfiles/opensuse-leap.docker |  1 +
 tests/docker/dockerfiles/ubuntu1804.docker|  1 +
 tests/docker/dockerfiles/ubuntu2004.docker|  1 +
 9 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 1043ccce4f99..b3211a66eeea 100755
--- a/configure
+++ b/configure
@@ -445,6 +445,7 @@ fuse="auto"
 fuse_lseek="auto"
 multiprocess="auto"
 slirp_smbd="$default_feature"
+selinux="auto"

 malloc_trim="auto"
 gio="$default_feature"
@@ -1576,6 +1577,10 @@ for opt do
   ;;
   --disable-slirp-smbd) slirp_smbd=no
   ;;
+  --enable-selinux) selinux="enabled"
+  ;;
+  --disable-selinux) selinux="disabled"
+  ;;
   *)
   echo "ERROR: unknown option $opt"
   echo "Try '$0 --help' for more information"
@@ -1963,6 +1968,7 @@ disabled with --disable-FEATURE, default is enabled if 
available
   multiprocessOut of process device emulation support
   gio libgio support
   slirp-smbd  use smbd (at path --smbd=*) in slirp networking
+  selinux SELinux support in qemu-nbd

 NOTE: The object files are built at the place where configure is launched
 EOF
@@ -5207,7 +5213,7 @@ if test "$skip_meson" = no; then
 -Dattr=$attr -Ddefault_devices=$default_devices 
-Dvirglrenderer=$virglrenderer \
 -Ddocs=$docs -Dsphinx_build=$sphinx_build -Dinstall_blobs=$blobs \
 -Dvhost_user_blk_server=$vhost_user_blk_server 
-Dmultiprocess=$multiprocess \
--Dfuse=$fuse -Dfuse_lseek=$fuse_lseek 
-Dguest_agent_msi=$guest_agent_msi -Dbpf=$bpf\
+-Dselinux=$selinux \
 $(if test "$default_feature" = no; then echo 
"-Dauto_features=disabled"; fi) \
-Dtcg_interpreter=$tcg_interpreter \
 $cross_arg \
diff --git a/meson.build b/meson.build
index 15ef4d3c4187..0ded2ac5eb9d 100644
--- a/meson.build
+++ b/meson.build
@@ -1072,6 +1072,11 @@ keyutils = dependency('libkeyutils', required: false,

 has_gettid = cc.has_function('gettid')

+# libselinux
+selinux = dependency('libselinux',
+ required: get_option('selinux'),
+ method: 'pkg-config', kwargs: static_kwargs)
+
 # Malloc tests

 malloc = []
@@ -1300,6 +1305,7 @@ config_host_data.set('CONFIG_FUSE', fuse.found())
 config_host_data.set('CONFIG_FUSE_LSEEK', fuse_lseek.found())
 config_host_data.set('CONFIG_X11', x11.found())
 config_host_data.set('CONFIG_CFI', get_option('cfi'))
+config_host_data.set('CONFIG_SELINUX', selinux.found())
 config_host_data.set('QEMU_VERSION', '"@0@"'.format(meson.project_version()))
 config_host_data.set('QEMU_VERSION_MAJOR', 
meson.project_version().split('.')[0])
 config_host_data.set('QEMU_VERSION_MINOR', 
meson.project_version().split('.')[1])
@@ -2759,7 +2765,8 @@ if have_tools
   qemu_io = executable('qemu-io', files('qemu-io.c'),
  dependencies: [block, qemuutil], install: true)
   qemu_nbd = executable('qemu-nbd', files('qemu-nbd.c'),
-   dependencies: [blockdev, qemuutil, gnutls], install: true)
+   dependencies: [blockdev, qemuutil, gnutls, selinux],
+   install: true)

   subdir('storage-daemon')
   subdir('contrib/rdmacm-mux')
@@ -3124,6 +3131,7 @@ summary_info += {'libpmem support':   libpmem.found()}
 summary_info += {'libdaxctl support': libdaxctl.found()}
 summary_info += {'libudev':   libudev.found()}
 summary_info += {'FUSE lseek':fuse_lseek.found()}
+summary_info += {'selinux':   selinux.found()}

[PULL 13/20] nbd/server: Allow LIST_META_CONTEXT without STRUCTURED_REPLY

The NBD protocol just relaxed the requirements on
NBD_OPT_LIST_META_CONTEXT:

https://github.com/NetworkBlockDevice/nbd/commit/13a4e33a87

Since listing is not stateful (unlike SET_META_CONTEXT), we don't care
if a client asks for meta contexts without first requesting structured
replies.  Well-behaved clients will still ask for structured reply
first (if for no other reason than for back-compat to older servers),
but that's no reason to avoid this change.

Signed-off-by: Eric Blake 
Message-Id: <20210907173505.1499709-1-ebl...@redhat.com>
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 nbd/server.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nbd/server.c b/nbd/server.c
index 3927f7789dcf..6d03e8a4b436 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -980,7 +980,7 @@ static int nbd_negotiate_meta_queries(NBDClient *client,
 size_t i;
 size_t count = 0;

-if (!client->structured_reply) {
+if (client->opt == NBD_OPT_SET_META_CONTEXT && !client->structured_reply) {
 return nbd_opt_invalid(client, errp,
"request option '%s' when structured reply "
"is not negotiated",
-- 
2.31.1

[PULL 18/20] block/nbd: drop connection_co

From: Vladimir Sementsov-Ogievskiy 

OK, that's a big rewrite of the logic.

Pre-patch we have an always running coroutine - connection_co. It does
reply receiving and reconnecting. And it leads to a lot of difficult
and unobvious code around drained sections and context switch. We also
abuse bs->in_flight counter which is increased for connection_co and
temporarily decreased at points where we want to allow a drained section to
begin. One of these places is in another file: in nbd_read_eof() in
nbd/client.c.

We also cancel reconnect and requests waiting for reconnect on drained
begin which is not correct. And this patch fixes that.

Let's finally drop this always running coroutine and go another way:
do both reconnect and receiving in request coroutines.

The detailed list of changes below (in the sequence of diff hunks).

1. receiving coroutines are woken directly from nbd_channel_error, when
   we change s->state

2. nbd_co_establish_connection_cancel(): we don't have drain_begin now,
   and in nbd_teardown_connection() all requests should already be
   finished (and reconnect is done from request). So
   nbd_co_establish_connection_cancel() is called from
   nbd_cancel_in_flight() (to cancel the request that is doing
   nbd_co_establish_connection()) and from reconnect_delay_timer_cb()
   (previously we didn't need it, as reconnect delay only should cancel
   active requests not the reconnection itself). But now reconnection
   itself is done in the separate thread (we now call
   nbd_client_connection_enable_retry() in nbd_open()), and we need to
   cancel the requests that wait in nbd_co_establish_connection()
   now).

2A. We do receive headers in request coroutine. But we also should
   dispatch replies for other pending requests. So,
   nbd_connection_entry() is turned into nbd_receive_replies(), which
   does reply dispatching while it receives other request headers, and
   returns when it receives the requested header.

3. All old stuff around drained sections and context switch is dropped.
   In details:
   - we don't need to move connection_co to new aio context, as we
 don't have connection_co anymore
   - we don't have a fake "request" of connection_co (extra increasing
 in_flight), so don't care with it in drain_begin/end
   - we don't stop reconnection during drained section anymore. This
 means that drain_begin may wait for a long time (up to
 reconnect_delay). But that's an improvement and more correct
 behavior see below[*]

4. In nbd_teardown_connection() we don't have to wait for
   connection_co, as it is dropped. And cleanup for s->ioc and nbd_yank
   is moved here from removed connection_co.

5. In nbd_co_do_establish_connection() we now should handle
   NBD_CLIENT_CONNECTING_NOWAIT: if new request comes when we are in
   NBD_CLIENT_CONNECTING_NOWAIT, it still should call
   nbd_co_establish_connection() (who knows, maybe the connection was
   already established by another thread in the background). But we
   shouldn't wait: if nbd_co_establish_connection() can't return new
   channel immediately the request should fail (we are in
   NBD_CLIENT_CONNECTING_NOWAIT state).

6. nbd_reconnect_attempt() is simplified: it's now easier to wait for
   other requests in the caller, so here we just assert that fact.
   Also delay time is now initialized here: we can easily detect first
   attempt and start a timer.

7. nbd_co_reconnect_loop() is dropped, we don't need it. Reconnect
   retries are fully handled by the thread (nbd/client-connection.c), delay
   timer we initialize in nbd_reconnect_attempt(), we don't have to
   bother with s->drained and friends. nbd_reconnect_attempt() is now
   called from nbd_co_send_request().

8. nbd_connection_entry is dropped: reconnect is now handled by
   nbd_co_send_request(), receiving reply is now handled by
   nbd_receive_replies(): all handled from request coroutines.

9. So, welcome new nbd_receive_replies() called from request coroutine,
   that receives reply header instead of nbd_connection_entry().
   Like with sending requests, only one coroutine may receive in a
   moment. So we introduce receive_mutex, which is locked around
   nbd_receive_reply(). It also protects some related fields. Still,
   full audit of thread-safety in nbd driver is a separate task.
   New function waits for a reply with specified handle being received
   and works rather simple:

   Under mutex:
 - if current handle is 0, do receive by hand. If another handle
   received - switch to other request coroutine, release mutex and
   yield. Otherwise return success
 - if current handle == requested handle, we are done
 - otherwise, release mutex and yield

10: in nbd_co_send_request() we now do nbd_reconnect_attempt() if
needed. Also waiting in free_sema queue we now wait for one of two
conditions:
- connectED, in_flight < MAX_NBD_REQUESTS (so we can start new one)
- connectING, in_flight == 0, so we can call

[PATCH v4 3/3] docs: Add documentation for vhost based RNG implementation

Add description and example for the vhost-user based RNG implementation.

Signed-off-by: Mathieu Poirier 
---
 docs/system/devices/vhost-user-rng.rst | 39 ++
 1 file changed, 39 insertions(+)
 create mode 100644 docs/system/devices/vhost-user-rng.rst

diff --git a/docs/system/devices/vhost-user-rng.rst 
b/docs/system/devices/vhost-user-rng.rst
new file mode 100644
index ..a145d4105c1a
--- /dev/null
+++ b/docs/system/devices/vhost-user-rng.rst
@@ -0,0 +1,39 @@
+QEMU vhost-user-rng - RNG emulation
+===
+
+Background
+--
+
+What follows builds on the material presented in vhost-user.rst - it should
+be reviewed before moving forward with the content in this file.
+
+Description
+---
+
+The vhost-user-rng device implementation was designed to work with a random
+number generator daemon such as the one found in the vhost-device crate of
+the rust-vmm project available on github [1].
+
+[1]. https://github.com/rust-vmm/vhost-device
+
+Examples
+
+
+The daemon should be started first:
+
+::
+
+  host# vhost-device-rng --socket-path=rng.sock -c 1 -m 512 -p 1000
+
+The QEMU invocation needs to create a chardev socket the device can
+use to communicate as well as share the guests memory over a memfd.
+
+::
+
+  host# qemu-system
\
+  -chardev socket,path=$(PATH)/rng.sock,id=rng0
\
+  -device vhost-user-rng-pci,chardev=rng0  
\
+  -m 4096  
\
+  -object memory-backend-file,id=mem,size=4G,mem-path=/dev/shm,share=on
\
+  -numa node,memdev=mem
\
+  ...
-- 
2.25.1

[PATCH v4 2/3] vhost-user-rng-pci: Add vhost-user-rng-pci implementation

This patch provides a PCI bus interface to the vhost-user-rng backend.

Reviewed-by: Alex Bennée 
Signed-off-by: Mathieu Poirier 
---
 hw/virtio/meson.build  |  1 +
 hw/virtio/vhost-user-rng-pci.c | 79 ++
 2 files changed, 80 insertions(+)
 create mode 100644 hw/virtio/vhost-user-rng-pci.c

diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index ae6b2cde1068..521f7d64a86a 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -28,6 +28,7 @@ virtio_ss.add(when: 'CONFIG_VIRTIO_MEM', if_true: 
files('virtio-mem.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_I2C', if_true: 
files('vhost-user-i2c.c'))
 virtio_ss.add(when: ['CONFIG_VIRTIO_PCI', 'CONFIG_VHOST_USER_I2C'], if_true: 
files('vhost-user-i2c-pci.c'))
 virtio_ss.add(when: 'CONFIG_VHOST_USER_RNG', if_true: 
files('vhost-user-rng.c'))
+virtio_ss.add(when: ['CONFIG_VHOST_USER_RNG', 'CONFIG_VIRTIO_PCI'], if_true: 
files('vhost-user-rng-pci.c'))
 
 virtio_pci_ss = ss.source_set()
 virtio_pci_ss.add(when: 'CONFIG_VHOST_VSOCK', if_true: 
files('vhost-vsock-pci.c'))
diff --git a/hw/virtio/vhost-user-rng-pci.c b/hw/virtio/vhost-user-rng-pci.c
new file mode 100644
index ..c83dc8681385
--- /dev/null
+++ b/hw/virtio/vhost-user-rng-pci.c
@@ -0,0 +1,79 @@
+/*
+ * Vhost-user RNG virtio device PCI glue
+ *
+ * Copyright (c) 2021 Mathieu Poirier 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-rng.h"
+#include "virtio-pci.h"
+
+struct VHostUserRNGPCI {
+VirtIOPCIProxy parent_obj;
+VHostUserRNG vdev;
+};
+
+typedef struct VHostUserRNGPCI VHostUserRNGPCI;
+
+#define TYPE_VHOST_USER_RNG_PCI "vhost-user-rng-pci-base"
+
+DECLARE_INSTANCE_CHECKER(VHostUserRNGPCI, VHOST_USER_RNG_PCI,
+ TYPE_VHOST_USER_RNG_PCI)
+
+static Property vhost_user_rng_pci_properties[] = {
+DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors,
+   DEV_NVECTORS_UNSPECIFIED),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void vhost_user_rng_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
+{
+VHostUserRNGPCI *dev = VHOST_USER_RNG_PCI(vpci_dev);
+DeviceState *vdev = DEVICE(>vdev);
+
+if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) {
+vpci_dev->nvectors = 1;
+}
+
+qdev_realize(vdev, BUS(_dev->bus), errp);
+}
+
+static void vhost_user_rng_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+k->realize = vhost_user_rng_pci_realize;
+set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+device_class_set_props(dc, vhost_user_rng_pci_properties);
+pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */
+pcidev_k->revision = 0x00;
+pcidev_k->class_id = PCI_CLASS_OTHERS;
+}
+
+static void vhost_user_rng_pci_instance_init(Object *obj)
+{
+VHostUserRNGPCI *dev = VHOST_USER_RNG_PCI(obj);
+
+virtio_instance_init_common(obj, >vdev, sizeof(dev->vdev),
+TYPE_VHOST_USER_RNG);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_rng_pci_info = {
+.base_name = TYPE_VHOST_USER_RNG_PCI,
+.non_transitional_name = "vhost-user-rng-pci",
+.instance_size = sizeof(VHostUserRNGPCI),
+.instance_init = vhost_user_rng_pci_instance_init,
+.class_init = vhost_user_rng_pci_class_init,
+};
+
+static void vhost_user_rng_pci_register(void)
+{
+virtio_pci_types_register(_user_rng_pci_info);
+}
+
+type_init(vhost_user_rng_pci_register);
-- 
2.25.1

[PULL 15/20] block/nbd: nbd_channel_error() shutdown channel unconditionally

From: Vladimir Sementsov-Ogievskiy 

Don't rely on connection being totally broken in case of -EIO. Safer
and more correct is to just shut down the channel anyway, since we
change the state and plan on reconnecting.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Message-Id: <20210902103805.25686-2-vsement...@virtuozzo.com>
[eblake: grammar tweaks]
Signed-off-by: Eric Blake 
---
 block/nbd.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index a66b2c282dc3..de59e76378ab 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -129,15 +129,16 @@ static bool nbd_client_connected(BDRVNBDState *s)

 static void nbd_channel_error(BDRVNBDState *s, int ret)
 {
+if (nbd_client_connected(s)) {
+qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+}
+
 if (ret == -EIO) {
 if (nbd_client_connected(s)) {
 s->state = s->reconnect_delay ? NBD_CLIENT_CONNECTING_WAIT :
 NBD_CLIENT_CONNECTING_NOWAIT;
 }
 } else {
-if (nbd_client_connected(s)) {
-qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
-}
 s->state = NBD_CLIENT_QUIT;
 }
 }
-- 
2.31.1

[PULL 14/20] nbd/client-connection: nbd_co_establish_connection(): fix non set errp

From: Vladimir Sementsov-Ogievskiy 

When we don't have a connection and blocking is false, we return NULL
but don't set errp. That's wrong.

We have two paths for calling nbd_co_establish_connection():

1. nbd_open() -> nbd_do_establish_connection() -> ...
  but that will never set blocking=false

2. nbd_reconnect_attempt() -> nbd_co_do_establish_connection() -> ...
  but that uses errp=NULL

So, we are safe with our wrong errp policy in
nbd_co_establish_connection(). Still let's fix it.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20210906190654.183421-2-vsement...@virtuozzo.com>
Reviewed-by: Eric Blake 
Signed-off-by: Eric Blake 
---
 nbd/client-connection.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nbd/client-connection.c b/nbd/client-connection.c
index 7123b1e18900..695f85575414 100644
--- a/nbd/client-connection.c
+++ b/nbd/client-connection.c
@@ -318,6 +318,7 @@ nbd_co_establish_connection(NBDClientConnection *conn, 
NBDExportInfo *info,
 }

 if (!blocking) {
+error_setg(errp, "No connection at the moment");
 return NULL;
 }

-- 
2.31.1

[PULL 11/20] block: use int64_t instead of int in driver discard handlers

From: Vladimir Sementsov-Ogievskiy 

We are generally moving to int64_t for both offset and bytes parameters
on all io paths.

Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.

We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).

So, convert driver discard handlers bytes parameter to int64_t.

The only caller of all updated functions is bdrv_co_pdiscard in
block/io.c. It is already prepared to work with 64bit requests, but
passes at most min(bs->bl.max_pdiscard, INT_MAX) to the driver.

Let's look at all updated functions:

blkdebug: all calculations are still OK, thanks to
  bdrv_check_qiov_request().
  both rule_check and bdrv_co_pdiscard are 64bit

blklogwrites: pass to blk_log_writes_co_log which is 64bit

blkreplay, copy-on-read, filter-compress: pass to bdrv_co_pdiscard, OK

copy-before-write: pass to bdrv_co_pdiscard which is 64bit and to
  cbw_do_copy_before_write which is 64bit

file-posix: one handler calls raw_account_discard(), which is 64bit, and
  both handlers call raw_do_pdiscard(). Update raw_do_pdiscard, which passes
  to RawPosixAIOData::aio_nbytes, which is 64bit (and calls
  raw_account_discard())

gluster: somehow, third argument of glfs_discard_async is size_t.
  Let's set max_pdiscard accordingly.

iscsi: iscsi_allocmap_set_invalid is 64bit,
  !is_byte_request_lun_aligned is 64bit.
  list.num is uint32_t. Let's clarify max_pdiscard and
  pdiscard_alignment.

mirror_top: pass to bdrv_mirror_top_do_write() which is
  64bit

nbd: protocol limitation. max_pdiscard is already set strictly enough,
  keep it as is for now.

nvme: buf.nlb is uint32_t and we do shift. So, add corresponding limits
  to nvme_refresh_limits().

preallocate: pass to bdrv_co_pdiscard() which is 64bit.

rbd: pass to qemu_rbd_start_co() which is 64bit.

qcow2: calculations are still OK, thanks to bdrv_check_qiov_request(),
  qcow2_cluster_discard() is 64bit.

raw-format: raw_adjust_offset() is 64bit, bdrv_co_pdiscard too.

throttle: pass to bdrv_co_pdiscard() which is 64bit and to
  throttle_group_co_io_limits_intercept() which is 64bit as well.

test-block-iothread: bytes argument is unused

Great! Now all drivers are prepared to handle 64bit discard requests,
or else have explicit max_pdiscard limits.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20210903102807.27127-11-vsement...@virtuozzo.com>
Reviewed-by: Eric Blake 
Signed-off-by: Eric Blake 
---
 include/block/block_int.h|  2 +-
 block/blkdebug.c |  2 +-
 block/blklogwrites.c |  4 ++--
 block/blkreplay.c|  2 +-
 block/copy-before-write.c|  2 +-
 block/copy-on-read.c |  2 +-
 block/file-posix.c   |  7 ---
 block/filter-compress.c  |  2 +-
 block/gluster.c  |  7 +--
 block/iscsi.c| 16 +++-
 block/mirror.c   |  2 +-
 block/nbd.c  |  6 --
 block/nvme.c | 14 +-
 block/preallocate.c  |  2 +-
 block/qcow2.c|  2 +-
 block/raw-format.c   |  2 +-
 block/rbd.c  |  4 ++--
 block/throttle.c |  2 +-
 tests/unit/test-block-iothread.c |  2 +-
 block/trace-events   |  4 ++--
 20 files changed, 55 insertions(+), 31 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 9b4e0748bc86..ffe86068d4d5 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -303,7 +303,7 @@ struct BlockDriver {
 int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
 int64_t offset, int64_t bytes, BdrvRequestFlags flags);
 int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
-int64_t offset, int bytes);
+int64_t offset, int64_t bytes);

 /* Map [offset, offset + nbytes) range onto a child of @bs to copy from,
  * and invoke bdrv_co_copy_range_from(child, ...), or invoke
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 742b4a3834d8..bbf294870308 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -717,7 +717,7 @@ static int coroutine_fn 
blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
 }

 static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
- int64_t offset, int bytes)
+ int64_t offset, int64_t bytes)
 {
 uint32_t align = bs->bl.pdiscard_alignment;
 int err;
diff --git a/block/blklogwrites.c b/block/blklogwrites.c
index d7ae64c22d81..f7a251e91f9e 100644
--- a/block/blklogwrites.c
+++ b/block/blklogwrites.c
@@ -484,9 +484,9 @@ static int coroutine_fn 
blk_log_writes_co_flush_to_disk(BlockDriverState *bs)
 }

 static int coroutine_fn
-blk_log_writes_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)

[PULL 12/20] block/io: allow 64bit discard requests

From: Vladimir Sementsov-Ogievskiy 

Now that all drivers are updated by the previous commit, we can drop
the last limiter on pdiscard path: INT_MAX in bdrv_co_pdiscard().

Now everything is prepared for implementing incredibly cool and fast
big-discard requests in NBD and qcow2. And any other driver which wants
it of course.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Message-Id: <20210903102807.27127-12-vsement...@virtuozzo.com>
Signed-off-by: Eric Blake 
---
 block/io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/io.c b/block/io.c
index 3846e2ed961b..18d345a87af3 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3104,7 +3104,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, 
int64_t offset,
 goto out;
 }

-max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
+max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, 
INT64_MAX),
align);
 assert(max_pdiscard >= bs->bl.request_alignment);

-- 
2.31.1

[PULL 17/20] block/nbd: refactor nbd_recv_coroutines_wake_all()

From: Vladimir Sementsov-Ogievskiy 

Split out nbd_recv_coroutine_wake_one(), as it will be used
separately.
Rename the function and add a possibility to wake only the first found
sleeping coroutine.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20210902103805.25686-4-vsement...@virtuozzo.com>
Reviewed-by: Eric Blake 
[eblake: grammar tweak]
Signed-off-by: Eric Blake 
---
 block/nbd.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 2842e6263fe4..709c2499e33c 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -127,16 +127,24 @@ static bool nbd_client_connected(BDRVNBDState *s)
 return qatomic_load_acquire(>state) == NBD_CLIENT_CONNECTED;
 }

-static void nbd_recv_coroutines_wake_all(BDRVNBDState *s)
+static bool nbd_recv_coroutine_wake_one(NBDClientRequest *req)
+{
+if (req->receiving) {
+req->receiving = false;
+aio_co_wake(req->coroutine);
+return true;
+}
+
+return false;
+}
+
+static void nbd_recv_coroutines_wake(BDRVNBDState *s, bool all)
 {
 int i;

 for (i = 0; i < MAX_NBD_REQUESTS; i++) {
-NBDClientRequest *req = >requests[i];
-
-if (req->coroutine && req->receiving) {
-req->receiving = false;
-aio_co_wake(req->coroutine);
+if (nbd_recv_coroutine_wake_one(>requests[i]) && !all) {
+return;
 }
 }
 }
@@ -415,7 +423,7 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState 
*s)

 while (s->in_flight > 0) {
 qemu_co_mutex_unlock(>send_mutex);
-nbd_recv_coroutines_wake_all(s);
+nbd_recv_coroutines_wake(s, true);
 s->wait_in_flight = true;
 qemu_coroutine_yield();
 s->wait_in_flight = false;
@@ -558,7 +566,7 @@ static coroutine_fn void nbd_connection_entry(void *opaque)
 }

 qemu_co_queue_restart_all(>free_sema);
-nbd_recv_coroutines_wake_all(s);
+nbd_recv_coroutines_wake(s, true);
 bdrv_dec_in_flight(s->bs);

 s->connection_co = NULL;
@@ -1035,7 +1043,7 @@ static coroutine_fn int nbd_co_receive_one_chunk(
 if (s->connection_co && !s->wait_in_flight) {
 /*
  * We must check s->wait_in_flight, because we may entered by
- * nbd_recv_coroutines_wake_all(), in this case we should not
+ * nbd_recv_coroutines_wake(), in this case we should not
  * wake connection_co here, it will woken by last request.
  */
 aio_co_wake(s->connection_co);
-- 
2.31.1

[PULL 07/20] block: make BlockLimits::max_pwrite_zeroes 64bit

From: Vladimir Sementsov-Ogievskiy 

We are going to support 64 bit write-zeroes requests. Now update the
limit variable. It's absolutely safe. The variable is set in some
drivers, and used in bdrv_co_do_pwrite_zeroes().

Update also max_write_zeroes variable in bdrv_co_do_pwrite_zeroes(), so
that bdrv_co_do_pwrite_zeroes() is now prepared to 64bit requests. The
remaining logic including num, offset and bytes variables is already
supporting 64bit requests.

So the only thing that prevents 64 bit requests is limiting
max_write_zeroes variable to INT_MAX in bdrv_co_do_pwrite_zeroes().
We'll drop this limitation after updating all block drivers.

Ah, we also have bdrv_check_request32() in bdrv_co_pwritev_part(). It
will be modified to do bdrv_check_request() for write-zeroes path.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20210903102807.27127-7-vsement...@virtuozzo.com>
Reviewed-by: Eric Blake 
Signed-off-by: Eric Blake 
---
 include/block/block_int.h | 9 +
 block/io.c| 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 5536f49bc67c..24958acd33f4 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -686,10 +686,11 @@ typedef struct BlockLimits {
  * that is set. May be 0 if bl.request_alignment is good enough */
 uint32_t pdiscard_alignment;

-/* Maximum number of bytes that can zeroized at once (since it is
- * signed, it must be < 2G, if set). Must be multiple of
- * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */
-int32_t max_pwrite_zeroes;
+/*
+ * Maximum number of bytes that can zeroized at once. Must be multiple of
+ * pwrite_zeroes_alignment. 0 means no limit.
+ */
+int64_t max_pwrite_zeroes;

 /* Optimal alignment for write zeroes requests in bytes. A power
  * of 2 is best but not mandatory.  Must be a multiple of
diff --git a/block/io.c b/block/io.c
index aa6f7b075e78..0090224603f5 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1869,7 +1869,7 @@ static int coroutine_fn 
bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 int head = 0;
 int tail = 0;

-int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
+int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
 int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
 bs->bl.request_alignment);
 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
-- 
2.31.1

[PULL 08/20] block: use int64_t instead of int in driver write_zeroes handlers

From: Vladimir Sementsov-Ogievskiy 

We are generally moving to int64_t for both offset and bytes parameters
on all io paths.

Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.

We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).

So, convert driver write_zeroes handlers bytes parameter to int64_t.

The only caller of all updated functions is bdrv_co_do_pwrite_zeroes().

bdrv_co_do_pwrite_zeroes() itself is of course OK with widening of
callee parameter type. Also, bdrv_co_do_pwrite_zeroes()'s
max_write_zeroes is limited to INT_MAX. So, updated functions all are
safe, they will not get "bytes" larger than before.

Still, let's look through all updated functions, and add assertions to
the ones which are actually unprepared to values larger than INT_MAX.
For these drivers also set explicit max_pwrite_zeroes limit.

Let's go:

blkdebug: calculations can't overflow, thanks to
  bdrv_check_qiov_request() in generic layer. rule_check() and
  bdrv_co_pwrite_zeroes() both have 64bit argument.

blklogwrites: pass to blk_log_writes_co_log() with 64bit argument.

blkreplay, copy-on-read, filter-compress: pass to
  bdrv_co_pwrite_zeroes() which is OK

copy-before-write: Calls cbw_do_copy_before_write() and
  bdrv_co_pwrite_zeroes, both have 64bit argument.

file-posix: both handlers call raw_do_pwrite_zeroes, which is updated.
  In raw_do_pwrite_zeroes() calculations are OK due to
  bdrv_check_qiov_request(), bytes go to RawPosixAIOData::aio_nbytes
  which is uint64_t.
  Check also where that uint64_t gets handed:
  handle_aiocb_write_zeroes_block() passes a uint64_t[2] to
  ioctl(BLKZEROOUT), handle_aiocb_write_zeroes() calls do_fallocate()
  which takes off_t (and we compile to always have 64-bit off_t), as
  does handle_aiocb_write_zeroes_unmap. All look safe.

gluster: bytes go to GlusterAIOCB::size which is int64_t and to
  glfs_zerofill_async works with off_t.

iscsi: Aha, here we deal with iscsi_writesame16_task() that has
  uint32_t num_blocks argument and iscsi_writesame10_task() that has
  uint16_t argument. Make comments, add assertions and clarify
  max_pwrite_zeroes calculation.
  iscsi_allocmap_() functions already have int64_t argument
  is_byte_request_lun_aligned is simple to update, do it.

mirror_top: pass to bdrv_mirror_top_do_write which has uint64_t
  argument

nbd: Aha, here we have protocol limitation, and NBDRequest::len is
  uint32_t. max_pwrite_zeroes is cleanly set to 32bit value, so we are
  OK for now.

nvme: Again, protocol limitation. And no inherent limit for
  write-zeroes at all. But from code that calculates cdw12 it's obvious
  that we do have limit and alignment. Let's clarify it. Also,
  obviously the code is not prepared to handle bytes=0. Let's handle
  this case too.
  trace events already 64bit

preallocate: pass to handle_write() and bdrv_co_pwrite_zeroes(), both
  64bit.

rbd: pass to qemu_rbd_start_co() which is 64bit.

qcow2: offset + bytes and alignment still work fine (thanks to
  bdrv_check_qiov_request()), so tail calculation is OK
  qcow2_subcluster_zeroize() has 64bit argument, should be OK
  trace events updated

qed: qed_co_request wants int nb_sectors. Also in code we have size_t
  used for request length which may be 32bit. So, let's just keep
  INT_MAX as a limit (aligning it down to pwrite_zeroes_alignment) and
  don't care.

raw-format: Is OK. raw_adjust_offset and bdrv_co_pwrite_zeroes are both
  64bit.

throttle: Both throttle_group_co_io_limits_intercept() and
  bdrv_co_pwrite_zeroes() are 64bit.

vmdk: pass to vmdk_pwritev which is 64bit

quorum: pass to quorum_co_pwritev() which is 64bit

Hooray!

At this point all block drivers are prepared to support 64bit
write-zero requests, or have explicitly set max_pwrite_zeroes.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20210903102807.27127-8-vsement...@virtuozzo.com>
Reviewed-by: Eric Blake 
[eblake: use <= rather than < in assertions relying on max_pwrite_zeroes]
Signed-off-by: Eric Blake 
---
 include/block/block_int.h |  2 +-
 block/blkdebug.c  |  2 +-
 block/blklogwrites.c  |  4 ++--
 block/blkreplay.c |  2 +-
 block/copy-before-write.c |  2 +-
 block/copy-on-read.c  |  2 +-
 block/file-posix.c|  6 +++---
 block/filter-compress.c   |  2 +-
 block/gluster.c   |  6 +++---
 block/iscsi.c | 30 --
 block/mirror.c|  2 +-
 block/nbd.c   |  6 --
 block/nvme.c  | 24 +---
 block/preallocate.c   |  2 +-
 block/qcow2.c |  2 +-
 block/qed.c   |  9 -
 block/quorum.c|  2 +-
 block/raw-format.c|  2 +-
 block/rbd.c   |  4 ++--
 block/throttle.c  |  2 +-
 block/vmdk.c  |  2 +-
 block/trace-events|  4 ++--
 22 files

[PULL 05/20] block: use int64_t instead of uint64_t in driver write handlers

From: Vladimir Sementsov-Ogievskiy 

We are generally moving to int64_t for both offset and bytes parameters
on all io paths.

Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.

We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).

So, convert driver write handlers parameters which are already 64bit to
signed type.

While being here, convert also flags parameter to be BdrvRequestFlags.

Now let's consider all callers. Simple

  git grep '\->bdrv_\(aio\|co\)_pwritev\(_part\)\?'

shows that there are three callers of the driver function:

 bdrv_driver_pwritev() and bdrv_driver_pwritev_compressed() in
 block/io.c, both pass int64_t, checked by bdrv_check_qiov_request() to
 be non-negative.

 qcow2_save_vmstate() does bdrv_check_qiov_request().

Still, the functions may be called directly, not only by drv->...
Let's check:

git grep '\.bdrv_\(aio\|co\)_pwritev\(_part\)\?\s*=' | \
awk '{print $4}' | sed 's/,//' | sed 's/&//' | sort | uniq | \
while read func; do git grep "$func(" | \
grep -v "$func(BlockDriverState"; done

shows several callers:

qcow2:
  qcow2_co_truncate() write at most up to @offset, which is checked in
generic qcow2_co_truncate() by bdrv_check_request().
  qcow2_co_pwritev_compressed_task() pass the request (or part of the
request) that already went through normal write path, so it should
be OK

qcow:
  qcow_co_pwritev_compressed() pass int64_t, it's updated by this patch

quorum:
  quorum_co_pwrite_zeroes() pass int64_t and int - OK

throttle:
  throttle_co_pwritev_compressed() pass int64_t, it's updated by this
  patch

vmdk:
  vmdk_co_pwritev_compressed() pass int64_t, it's updated by this
  patch

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20210903102807.27127-5-vsement...@virtuozzo.com>
Reviewed-by: Eric Blake 
Signed-off-by: Eric Blake 
---
 include/block/block_int.h| 16 
 block/io.c   |  6 --
 block/blkdebug.c |  4 ++--
 block/blklogwrites.c |  4 ++--
 block/blkreplay.c|  2 +-
 block/blkverify.c|  4 ++--
 block/copy-before-write.c|  7 ---
 block/copy-on-read.c | 11 ++-
 block/crypto.c   |  4 ++--
 block/file-posix.c   |  6 +++---
 block/file-win32.c   |  4 ++--
 block/filter-compress.c  |  7 ---
 block/mirror.c   |  2 +-
 block/nbd.c  |  5 +++--
 block/nfs.c  |  6 +++---
 block/null.c |  9 +
 block/nvme.c |  5 +++--
 block/preallocate.c  |  6 +++---
 block/qcow.c | 10 +-
 block/qcow2.c|  6 +++---
 block/quorum.c   |  5 +++--
 block/raw-format.c   |  8 
 block/rbd.c  |  6 +++---
 block/throttle.c |  9 +
 block/vdi.c  |  4 ++--
 block/vmdk.c |  8 
 block/vpc.c  |  4 ++--
 block/vvfat.c|  4 ++--
 tests/unit/test-block-iothread.c |  4 ++--
 block/trace-events   |  2 +-
 30 files changed, 94 insertions(+), 84 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 9b1a276fa1c9..2cf5f1722a7f 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -238,8 +238,8 @@ struct BlockDriver {
 int64_t offset, int64_t bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
-uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
-BlockCompletionFunc *cb, void *opaque);
+int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
 BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
@@ -288,10 +288,11 @@ struct BlockDriver {
  * The buffer in @qiov may point directly to guest memory.
  */
 int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
-uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
+int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+BdrvRequestFlags flags);
 int coroutine_fn (*bdrv_co_pwritev_part)(BlockDriverState *bs,
-uint64_t offset, uint64_t bytes,
-QEMUIOVector *qiov, size_t qiov_offset, int flags);
+int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
+BdrvRequestFlags flags);

 /*
  * Efficiently zero a region of the disk image.  Typically an image format
@@ -438,10 +439,9 @@ struct BlockDriver

[PULL 16/20] block/nbd: move nbd_recv_coroutines_wake_all() up

From: Vladimir Sementsov-Ogievskiy 

We are going to use it in nbd_channel_error(), so move it up. Note
that we are also going to refactor and rename
nbd_recv_coroutines_wake_all() in the future anyway, so keeping it where it
is and making a forward declaration doesn't make real sense.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Message-Id: <20210902103805.25686-3-vsement...@virtuozzo.com>
Signed-off-by: Eric Blake 
---
 block/nbd.c | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index de59e76378ab..2842e6263fe4 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -127,22 +127,6 @@ static bool nbd_client_connected(BDRVNBDState *s)
 return qatomic_load_acquire(>state) == NBD_CLIENT_CONNECTED;
 }

-static void nbd_channel_error(BDRVNBDState *s, int ret)
-{
-if (nbd_client_connected(s)) {
-qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
-}
-
-if (ret == -EIO) {
-if (nbd_client_connected(s)) {
-s->state = s->reconnect_delay ? NBD_CLIENT_CONNECTING_WAIT :
-NBD_CLIENT_CONNECTING_NOWAIT;
-}
-} else {
-s->state = NBD_CLIENT_QUIT;
-}
-}
-
 static void nbd_recv_coroutines_wake_all(BDRVNBDState *s)
 {
 int i;
@@ -157,6 +141,22 @@ static void nbd_recv_coroutines_wake_all(BDRVNBDState *s)
 }
 }

+static void nbd_channel_error(BDRVNBDState *s, int ret)
+{
+if (nbd_client_connected(s)) {
+qio_channel_shutdown(s->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+}
+
+if (ret == -EIO) {
+if (nbd_client_connected(s)) {
+s->state = s->reconnect_delay ? NBD_CLIENT_CONNECTING_WAIT :
+NBD_CLIENT_CONNECTING_NOWAIT;
+}
+} else {
+s->state = NBD_CLIENT_QUIT;
+}
+}
+
 static void reconnect_delay_timer_del(BDRVNBDState *s)
 {
 if (s->reconnect_delay_timer) {
-- 
2.31.1

[PULL 04/20] block: use int64_t instead of uint64_t in driver read handlers

From: Vladimir Sementsov-Ogievskiy 

We are generally moving to int64_t for both offset and bytes parameters
on all io paths.

Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.

We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).

So, convert driver read handlers parameters which are already 64bit to
signed type.

While being here, convert also flags parameter to be BdrvRequestFlags.

Now let's consider all callers. Simple

  git grep '\->bdrv_\(aio\|co\)_preadv\(_part\)\?'

shows that there are three callers of the driver function:

 bdrv_driver_preadv() in block/io.c, passes int64_t, checked by
   bdrv_check_qiov_request() to be non-negative.

 qcow2_load_vmstate() does bdrv_check_qiov_request().

 do_perform_cow_read() has uint64_t argument. And a lot of things in
 qcow2 driver are uint64_t, so converting it is big job. But we must
 not work with requests that don't satisfy bdrv_check_qiov_request(),
 so let's just assert it here.

Still, the functions may be called directly, not only by drv->...
Let's check:

git grep '\.bdrv_\(aio\|co\)_preadv\(_part\)\?\s*=' | \
awk '{print $4}' | sed 's/,//' | sed 's/&//' | sort | uniq | \
while read func; do git grep "$func(" | \
grep -v "$func(BlockDriverState"; done

The only one such caller:

QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, &data, 1);
...
ret = bdrv_replace_test_co_preadv(bs, 0, 1, &qiov, 0);

in tests/unit/test-bdrv-drain.c, and it's OK obviously.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20210903102807.27127-4-vsement...@virtuozzo.com>
Reviewed-by: Eric Blake 
[eblake: fix typos]
Signed-off-by: Eric Blake 
---
 include/block/block_int.h| 11 ++-
 block/blkdebug.c |  4 ++--
 block/blklogwrites.c |  4 ++--
 block/blkreplay.c|  2 +-
 block/blkverify.c|  4 ++--
 block/bochs.c|  4 ++--
 block/cloop.c|  4 ++--
 block/commit.c   |  2 +-
 block/copy-before-write.c|  4 ++--
 block/copy-on-read.c |  4 ++--
 block/crypto.c   |  4 ++--
 block/curl.c |  3 ++-
 block/dmg.c  |  4 ++--
 block/file-posix.c   |  6 +++---
 block/file-win32.c   |  4 ++--
 block/filter-compress.c  |  4 ++--
 block/mirror.c   |  2 +-
 block/nbd.c  |  5 +++--
 block/nfs.c  |  6 +++---
 block/null.c |  9 +
 block/nvme.c |  5 +++--
 block/preallocate.c  |  4 ++--
 block/qcow.c |  6 +++---
 block/qcow2-cluster.c| 14 +-
 block/qcow2.c|  5 +++--
 block/quorum.c   |  4 ++--
 block/raw-format.c   | 20 ++--
 block/rbd.c  |  6 +++---
 block/throttle.c |  5 +++--
 block/vdi.c  |  4 ++--
 block/vmdk.c |  4 ++--
 block/vpc.c  |  4 ++--
 block/vvfat.c|  4 ++--
 tests/unit/test-bdrv-drain.c | 16 +---
 tests/unit/test-block-iothread.c | 19 ++-
 35 files changed, 120 insertions(+), 90 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index ed60495938a6..9b1a276fa1c9 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -235,8 +235,8 @@ struct BlockDriver {

 /* aio */
 BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
-uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
-BlockCompletionFunc *cb, void *opaque);
+int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
 BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
 uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags,
 BlockCompletionFunc *cb, void *opaque);
@@ -265,10 +265,11 @@ struct BlockDriver {
  * The buffer in @qiov may point directly to guest memory.
  */
 int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
-uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
+int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+BdrvRequestFlags flags);
 int coroutine_fn (*bdrv_co_preadv_part)(BlockDriverState *bs,
-uint64_t offset, uint64_t bytes,
-QEMUIOVector *qiov, size_t qiov_offset, int flags);
+int64_t offset, int64_t bytes,
+QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
 int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
 /**
diff --git a/block/blkdebug.c b/block/blkdebug.c
index

[PULL 10/20] block: make BlockLimits::max_pdiscard 64bit

From: Vladimir Sementsov-Ogievskiy 

We are going to support 64 bit discard requests. Now update the
limit variable. It's absolutely safe. The variable is set in some
drivers, and used in bdrv_co_pdiscard().

Update also max_pdiscard variable in bdrv_co_pdiscard(), so that
bdrv_co_pdiscard() is now prepared for 64bit requests. The remaining
logic including num, offset and bytes variables is already
supporting 64bit requests.

So the only thing that prevents 64 bit requests is limiting
max_pdiscard variable to INT_MAX in bdrv_co_pdiscard().
We'll drop this limitation after updating all block drivers.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Message-Id: <20210903102807.27127-10-vsement...@virtuozzo.com>
Signed-off-by: Eric Blake 
---
 include/block/block_int.h | 11 ++-
 block/io.c|  3 ++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index d518703e3e59..9b4e0748bc86 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -674,11 +674,12 @@ typedef struct BlockLimits {
  * otherwise. */
 uint32_t request_alignment;

-/* Maximum number of bytes that can be discarded at once (since it
- * is signed, it must be < 2G, if set). Must be multiple of
- * pdiscard_alignment, but need not be power of 2. May be 0 if no
- * inherent 32-bit limit */
-int32_t max_pdiscard;
+/*
+ * Maximum number of bytes that can be discarded at once. Must be multiple
+ * of pdiscard_alignment, but need not be power of 2. May be 0 if no
+ * inherent 64-bit limit.
+ */
+int64_t max_pdiscard;

 /* Optimal alignment for discard requests in bytes. A power of 2
  * is best but not mandatory.  Must be a multiple of
diff --git a/block/io.c b/block/io.c
index e40462742ea1..3846e2ed961b 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3056,7 +3056,8 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, 
int64_t offset,
   int64_t bytes)
 {
 BdrvTrackedRequest req;
-int max_pdiscard, ret;
+int ret;
+int64_t max_pdiscard;
 int head, tail, align;
 BlockDriverState *bs = child->bs;

-- 
2.31.1

[PULL 19/20] block/nbd: check that received handle is valid

From: Vladimir Sementsov-Ogievskiy 

If we don't have an active request that is waiting for this handle to be
received, we should report an error.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Message-Id: <20210902103805.25686-6-vsement...@virtuozzo.com>
Reviewed-by: Eric Blake 
Signed-off-by: Eric Blake 
---
 block/nbd.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/block/nbd.c b/block/nbd.c
index 8ff6daf43d46..5ef462db1b7f 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -58,6 +58,7 @@ typedef struct {
 Coroutine *coroutine;
 uint64_t offset;/* original offset of the request */
 bool receiving; /* sleeping in the yield in nbd_receive_replies */
+bool reply_possible;/* reply header not yet received */
 } NBDClientRequest;

 typedef enum NBDClientState {
@@ -415,14 +416,7 @@ static coroutine_fn int nbd_receive_replies(BDRVNBDState 
*s, uint64_t handle)
 return 0;
 }
 ind2 = HANDLE_TO_INDEX(s, s->reply.handle);
-if (ind2 >= MAX_NBD_REQUESTS || !s->requests[ind2].coroutine) {
-/*
- * We only check that ind2 request exists. But don't check
- * whether it is now waiting for the reply header or
- * not. We can't just check s->requests[ind2].receiving:
- * ind2 request may wait in trying to lock
- * receive_mutex. So that's a TODO.
- */
+if (ind2 >= MAX_NBD_REQUESTS || !s->requests[ind2].reply_possible) {
 nbd_channel_error(s, -EINVAL);
 return -EINVAL;
 }
@@ -468,6 +462,7 @@ static int nbd_co_send_request(BlockDriverState *bs,
 s->requests[i].coroutine = qemu_coroutine_self();
 s->requests[i].offset = request->from;
 s->requests[i].receiving = false;
+s->requests[i].reply_possible = true;

 request->handle = INDEX_TO_HANDLE(s, i);

-- 
2.31.1

[PULL 06/20] block: use int64_t instead of uint64_t in copy_range driver handlers

From: Vladimir Sementsov-Ogievskiy 

We are generally moving to int64_t for both offset and bytes parameters
on all io paths.

Main motivation is realization of 64-bit write_zeroes operation for
fast zeroing large disk chunks, up to the whole disk.

We chose signed type, to be consistent with off_t (which is signed) and
with possibility for signed return type (where negative value means
error).

So, convert driver copy_range handlers parameters which are already
64bit to signed type.

Now let's consider all callers. Simple

  git grep '\->bdrv_co_copy_range'

shows the only caller:

  bdrv_co_copy_range_internal(), which does bdrv_check_request32(),
  so everything is OK.

Still, the functions may be called directly, not only by drv->...
Let's check:

git grep '\.bdrv_co_copy_range_\(from\|to\)\s*=' | \
awk '{print $4}' | sed 's/,//' | sed 's/&//' | sort | uniq | \
while read func; do git grep "$func(" | \
grep -v "$func(BlockDriverState"; done

shows no more callers. So, we are done.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Message-Id: <20210903102807.27127-6-vsement...@virtuozzo.com>
Signed-off-by: Eric Blake 
---
 include/block/block_int.h | 12 ++--
 block/file-posix.c| 10 +-
 block/iscsi.c | 12 ++--
 block/qcow2.c | 12 ++--
 block/raw-format.c| 16 
 5 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 2cf5f1722a7f..5536f49bc67c 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -314,10 +314,10 @@ struct BlockDriver {
  */
 int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs,
 BdrvChild *src,
-uint64_t offset,
+int64_t offset,
 BdrvChild *dst,
-uint64_t dst_offset,
-uint64_t bytes,
+int64_t dst_offset,
+int64_t bytes,
 BdrvRequestFlags read_flags,
 BdrvRequestFlags write_flags);

@@ -331,10 +331,10 @@ struct BlockDriver {
  */
 int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs,
   BdrvChild *src,
-  uint64_t src_offset,
+  int64_t src_offset,
   BdrvChild *dst,
-  uint64_t dst_offset,
-  uint64_t bytes,
+  int64_t dst_offset,
+  int64_t bytes,
   BdrvRequestFlags read_flags,
   BdrvRequestFlags write_flags);

diff --git a/block/file-posix.c b/block/file-posix.c
index 994f1c26ca7b..ed71e8d2dfee 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -3203,8 +3203,8 @@ static void raw_abort_perm_update(BlockDriverState *bs)
 }

 static int coroutine_fn raw_co_copy_range_from(
-BlockDriverState *bs, BdrvChild *src, uint64_t src_offset,
-BdrvChild *dst, uint64_t dst_offset, uint64_t bytes,
+BlockDriverState *bs, BdrvChild *src, int64_t src_offset,
+BdrvChild *dst, int64_t dst_offset, int64_t bytes,
 BdrvRequestFlags read_flags, BdrvRequestFlags write_flags)
 {
 return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
@@ -3213,10 +3213,10 @@ static int coroutine_fn raw_co_copy_range_from(

 static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
  BdrvChild *src,
- uint64_t src_offset,
+ int64_t src_offset,
  BdrvChild *dst,
- uint64_t dst_offset,
- uint64_t bytes,
+ int64_t dst_offset,
+ int64_t bytes,
  BdrvRequestFlags read_flags,
  BdrvRequestFlags write_flags)
 {
diff --git a/block/iscsi.c b/block/iscsi.c
index 852384086b61..01fdd1775f12 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2169,10 +2169,10 @@ static void coroutine_fn 
iscsi_co_invalidate_cache(BlockDriverState *bs,

 static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState

[PULL 02/20] block/io: bring request check to bdrv_co_(read, write)v_vmstate

From: Vladimir Sementsov-Ogievskiy 

Only qcow2 driver supports vmstate.
In qcow2 these requests go through .bdrv_co_p{read,write}v_part
handlers.

So, let's do our basic check for the request on vmstate generic
handlers.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Message-Id: <20210903102807.27127-2-vsement...@virtuozzo.com>
Signed-off-by: Eric Blake 
---
 block/io.c | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index 99ee182ca449..58602f84dbf0 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2810,7 +2810,12 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos)
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *child_bs = bdrv_primary_bs(bs);
-int ret = -ENOTSUP;
+int ret;
+
+ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
+if (ret < 0) {
+return ret;
+}

 if (!drv) {
 return -ENOMEDIUM;
@@ -2822,6 +2827,8 @@ bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos)
 ret = drv->bdrv_load_vmstate(bs, qiov, pos);
 } else if (child_bs) {
 ret = bdrv_co_readv_vmstate(child_bs, qiov, pos);
+} else {
+ret = -ENOTSUP;
 }

 bdrv_dec_in_flight(bs);
@@ -2834,7 +2841,12 @@ bdrv_co_writev_vmstate(BlockDriverState *bs, 
QEMUIOVector *qiov, int64_t pos)
 {
 BlockDriver *drv = bs->drv;
 BlockDriverState *child_bs = bdrv_primary_bs(bs);
-int ret = -ENOTSUP;
+int ret;
+
+ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
+if (ret < 0) {
+return ret;
+}

 if (!drv) {
 return -ENOMEDIUM;
@@ -2846,6 +2858,8 @@ bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos)
 ret = drv->bdrv_save_vmstate(bs, qiov, pos);
 } else if (child_bs) {
 ret = bdrv_co_writev_vmstate(child_bs, qiov, pos);
+} else {
+ret = -ENOTSUP;
 }

 bdrv_dec_in_flight(bs);
-- 
2.31.1

[PULL 00/20] NBD patches through 2021-09-27

The following changes since commit 9b03a1178204598055f23f24e438fdddb5935df9:

  Merge remote-tracking branch 
'remotes/vivier2/tags/trivial-branch-for-6.2-pull-request' into staging 
(2021-09-27 11:08:36 +0100)

are available in the Git repository at:

  https://repo.or.cz/qemu/ericb.git tags/pull-nbd-2021-09-27

for you to fetch changes up to 3cb015ad05c7c1e07e0deb356cd20e6cd765c0ea:

  nbd/server: Add --selinux-label option (2021-09-27 16:16:28 -0500)


nbd patches for 2021-09-27

- Richard W.M. Jones: Add --selinux-label option to qemu-nbd
- Vladimir Sementsov-Ogievskiy: Rework coroutines of qemu NBD client
  to improve reconnect support
- Eric Blake: Relax server in regards to NBD_OPT_LIST_META_CONTEXT
- Vladimir Sementsov-Ogievskiy: Plumb up 64-bit bulk-zeroing support
  in block layer, in preparation for future NBD spec extensions
- Nir Soffer: Default to writeback cache in qemu-nbd


Eric Blake (1):
  nbd/server: Allow LIST_META_CONTEXT without STRUCTURED_REPLY

Nir Soffer (1):
  qemu-nbd: Change default cache mode to writeback

Richard W.M. Jones (1):
  nbd/server: Add --selinux-label option

Vladimir Sementsov-Ogievskiy (17):
  block/io: bring request check to bdrv_co_(read,write)v_vmstate
  qcow2: check request on vmstate save/load path
  block: use int64_t instead of uint64_t in driver read handlers
  block: use int64_t instead of uint64_t in driver write handlers
  block: use int64_t instead of uint64_t in copy_range driver handlers
  block: make BlockLimits::max_pwrite_zeroes 64bit
  block: use int64_t instead of int in driver write_zeroes handlers
  block/io: allow 64bit write-zeroes requests
  block: make BlockLimits::max_pdiscard 64bit
  block: use int64_t instead of int in driver discard handlers
  block/io: allow 64bit discard requests
  nbd/client-connection: nbd_co_establish_connection(): fix non set errp
  block/nbd: nbd_channel_error() shutdown channel unconditionally
  block/nbd: move nbd_recv_coroutines_wake_all() up
  block/nbd: refactor nbd_recv_coroutines_wake_all()
  block/nbd: drop connection_co
  block/nbd: check that received handle is valid

 docs/tools/qemu-nbd.rst   |   6 +-
 configure |   8 +-
 meson.build   |  10 +-
 include/block/block_int.h |  66 ++--
 block/io.c|  44 ++-
 block/blkdebug.c  |  12 +-
 block/blklogwrites.c  |  16 +-
 block/blkreplay.c |   8 +-
 block/blkverify.c |   8 +-
 block/bochs.c |   4 +-
 block/cloop.c |   4 +-
 block/commit.c|   2 +-
 block/copy-before-write.c |  15 +-
 block/copy-on-read.c  |  19 +-
 block/crypto.c|   8 +-
 block/curl.c  |   3 +-
 block/dmg.c   |   4 +-
 block/file-posix.c|  35 +-
 block/file-win32.c|   8 +-
 block/filter-compress.c   |  15 +-
 block/gluster.c   |  13 +-
 block/iscsi.c |  58 ++--
 block/mirror.c|   8 +-
 block/nbd.c   | 443 --
 block/nfs.c   |  12 +-
 block/null.c  |  18 +-
 block/nvme.c  |  48 ++-
 block/preallocate.c   |  14 +-
 block/qcow.c  |  16 +-
 block/qcow2-cluster.c |  14 +-
 block/qcow2.c |  70 ++--
 block/qed.c   |   9 +-
 block/quorum.c|  11 +-
 block/raw-format.c|  36 +--
 block/rbd.c   |  20 +-
 block/throttle.c  |  18 +-
 block/vdi.c   |   8 +-
 block/vmdk.c  |  14 +-
 block/vpc.c   |   8 +-
 block/vvfat.c |   8 +-
 nbd/client-connection.c   |   1 +
 nbd/client.c  |   2 -
 nbd/server.c  |   2 +-
 qemu-nbd.c|  45 ++-
 tests/unit/test-bdrv-drain.c  |  16 +-
 tests/unit/test-block-iothread.c  |  21 +-
 block/trace-events|  10 +-

[PULL 09/20] block/io: allow 64bit write-zeroes requests

From: Vladimir Sementsov-Ogievskiy 

Now that all drivers are updated by previous commit, we can drop two
last limiters on write-zeroes path: INT_MAX in
bdrv_co_do_pwrite_zeroes() and bdrv_check_request32() in
bdrv_co_pwritev_part().

Now everything is prepared for implementing incredibly cool and fast
big-write-zeroes in NBD and qcow2. And any other driver which wants it
of course.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Message-Id: <20210903102807.27127-9-vsement...@virtuozzo.com>
Signed-off-by: Eric Blake 
---
 block/io.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index 0090224603f5..e40462742ea1 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1869,7 +1869,8 @@ static int coroutine_fn 
bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
 int head = 0;
 int tail = 0;

-int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
+int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes,
+INT64_MAX);
 int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
 bs->bl.request_alignment);
 int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
@@ -2248,7 +2249,11 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
 return -ENOMEDIUM;
 }

-ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
+if (flags & BDRV_REQ_ZERO_WRITE) {
+ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
+} else {
+ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
+}
 if (ret < 0) {
 return ret;
 }
-- 
2.31.1

[PULL 01/20] qemu-nbd: Change default cache mode to writeback

From: Nir Soffer 

Both qemu and qemu-img use writeback cache mode by default, which is
already documented in qemu(1). qemu-nbd uses writethrough cache mode by
default, and the default cache mode is not documented.

According to the qemu-nbd(8):

   --cache=CACHE
  The  cache  mode  to be used with the file.  See the
  documentation of the emulator's -drive cache=... option for
  allowed values.

qemu(1) says:

The default mode is cache=writeback.

So users have no reason to assume that qemu-nbd is using writethrough
cache mode. The only hint is the painfully slow writing when using the
defaults.

Looking in git history, it seems that qemu used writethrough in the past
to support broken guests that did not flush data properly, or could not
flush due to limitations in qemu. But qemu-nbd clients can use
NBD_CMD_FLUSH to flush data, so using writethrough does not help anyone.

Change the default cache mode to writeback, and document the default and
available values properly in the online help and manual.

With this change converting image via qemu-nbd is 3.5 times faster.

$ qemu-img create dst.img 50g
$ qemu-nbd -t -f raw -k /tmp/nbd.sock dst.img

Before this change:

$ hyperfine -r3 "./qemu-img convert -p -f raw -O raw -T none -W 
fedora34.img nbd+unix:///?socket=/tmp/nbd.sock"
Benchmark #1: ./qemu-img convert -p -f raw -O raw -T none -W fedora34.img 
nbd+unix:///?socket=/tmp/nbd.sock
  Time (mean ± σ): 83.639 s ±  5.970 s[User: 2.733 s, System: 6.112 
s]
  Range (min … max):   76.749 s … 87.245 s3 runs

After this change:

$ hyperfine -r3 "./qemu-img convert -p -f raw -O raw -T none -W 
fedora34.img nbd+unix:///?socket=/tmp/nbd.sock"
Benchmark #1: ./qemu-img convert -p -f raw -O raw -T none -W fedora34.img 
nbd+unix:///?socket=/tmp/nbd.sock
  Time (mean ± σ): 23.522 s ±  0.433 s[User: 2.083 s, System: 5.475 
s]
  Range (min … max):   23.234 s … 24.019 s3 runs

Users can avoid the issue by using --cache=writeback[1] but the defaults
should give good performance for the common use case.

[1] https://bugzilla.redhat.com/1990656

Signed-off-by: Nir Soffer 
Message-Id: <20210813205519.50518-1-nsof...@redhat.com>
Reviewed-by: Eric Blake 
CC: qemu-sta...@nongnu.org
Signed-off-by: Eric Blake 
---
 docs/tools/qemu-nbd.rst | 6 --
 qemu-nbd.c  | 6 --
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst
index e39a9f4b1a67..56e54cd44114 100644
--- a/docs/tools/qemu-nbd.rst
+++ b/docs/tools/qemu-nbd.rst
@@ -99,8 +99,10 @@ driver options if ``--image-opts`` is specified.

 .. option:: --cache=CACHE

-  The cache mode to be used with the file.  See the documentation of
-  the emulator's ``-drive cache=...`` option for allowed values.
+  The cache mode to be used with the file. Valid values are:
+  ``none``, ``writeback`` (the default), ``writethrough``,
+  ``directsync`` and ``unsafe``. See the documentation of
+  the emulator's ``-drive cache=...`` option for more info.

 .. option:: -n, --nocache

diff --git a/qemu-nbd.c b/qemu-nbd.c
index 65ebec598f88..9d895ba24b1e 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -135,7 +135,9 @@ static void usage(const char *name)
 "'snapshot.id=[ID],snapshot.name=[NAME]', or\n"
 "'[ID_OR_NAME]'\n"
 "  -n, --nocache disable host cache\n"
-"  --cache=MODE  set cache mode (none, writeback, ...)\n"
+"  --cache=MODE  set cache mode used to access the disk image, 
the\n"
+"valid options are: 'none', 'writeback' 
(default),\n"
+"'writethrough', 'directsync' and 'unsafe'\n"
 "  --aio=MODEset AIO mode (native, io_uring or threads)\n"
 "  --discard=MODEset discard mode (ignore, unmap)\n"
 "  --detect-zeroes=MODE  set detect-zeroes mode (off, on, unmap)\n"
@@ -552,7 +554,7 @@ int main(int argc, char **argv)
 bool alloc_depth = false;
 const char *tlscredsid = NULL;
 bool imageOpts = false;
-bool writethrough = true;
+bool writethrough = false; /* Client will flush as needed. */
 bool fork_process = false;
 bool list = false;
 int old_stderr = -1;
-- 
2.31.1

[PULL 03/20] qcow2: check request on vmstate save/load path

From: Vladimir Sementsov-Ogievskiy 

We modify the request by adding an offset to vmstate. Let's check the
modified request. It will help us to safely move .bdrv_co_preadv_part
and .bdrv_co_pwritev_part to int64_t type of offset and bytes.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
Message-Id: <20210903102807.27127-3-vsement...@virtuozzo.com>
Signed-off-by: Eric Blake 
---
 include/block/block_int.h |  3 +++
 block/io.c|  6 +++---
 block/qcow2.c | 43 +--
 3 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 5451f89b8df9..ed60495938a6 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -94,6 +94,9 @@ typedef struct BdrvTrackedRequest {
 struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;

+int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
+QEMUIOVector *qiov, size_t qiov_offset,
+Error **errp);
 int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp);

 struct BlockDriver {
diff --git a/block/io.c b/block/io.c
index 58602f84dbf0..a4f124f75577 100644
--- a/block/io.c
+++ b/block/io.c
@@ -956,9 +956,9 @@ bool coroutine_fn 
bdrv_make_request_serialising(BdrvTrackedRequest *req,
 return waited;
 }

-static int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
-   QEMUIOVector *qiov, size_t qiov_offset,
-   Error **errp)
+int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
+QEMUIOVector *qiov, size_t qiov_offset,
+Error **errp)
 {
 /*
  * Check generic offset/bytes correctness
diff --git a/block/qcow2.c b/block/qcow2.c
index 02f9f3e63679..1c3cf7f91d86 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -5227,24 +5227,55 @@ static int qcow2_has_zero_init(BlockDriverState *bs)
 }
 }

+/*
+ * Check the request to vmstate. On success return
+ *  qcow2_vm_state_offset(bs) + @pos
+ */
+static int64_t qcow2_check_vmstate_request(BlockDriverState *bs,
+   QEMUIOVector *qiov, int64_t pos)
+{
+BDRVQcow2State *s = bs->opaque;
+int64_t vmstate_offset = qcow2_vm_state_offset(s);
+int ret;
+
+/* Incoming requests must be OK */
+bdrv_check_qiov_request(pos, qiov->size, qiov, 0, &error_abort);
+
+if (INT64_MAX - pos < vmstate_offset) {
+return -EIO;
+}
+
+pos += vmstate_offset;
+ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
+if (ret < 0) {
+return ret;
+}
+
+return pos;
+}
+
 static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
   int64_t pos)
 {
-BDRVQcow2State *s = bs->opaque;
+int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
+if (offset < 0) {
+return offset;
+}

 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
-return bs->drv->bdrv_co_pwritev_part(bs, qcow2_vm_state_offset(s) + pos,
- qiov->size, qiov, 0, 0);
+return bs->drv->bdrv_co_pwritev_part(bs, offset, qiov->size, qiov, 0, 0);
 }

 static int qcow2_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
   int64_t pos)
 {
-BDRVQcow2State *s = bs->opaque;
+int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
+if (offset < 0) {
+return offset;
+}

 BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
-return bs->drv->bdrv_co_preadv_part(bs, qcow2_vm_state_offset(s) + pos,
-qiov->size, qiov, 0, 0);
+return bs->drv->bdrv_co_preadv_part(bs, offset, qiov->size, qiov, 0, 0);
 }

 /*
-- 
2.31.1

Re: [PATCH v11 00/16] target/riscv: Update QEmu for Zb[abcs] 1.0.0

2021-09-27 Thread Vineet Gupta





On 9/27/21 1:23 PM, Jim Wilson wrote:
On Mon, Sep 27, 2021 at 1:01 PM Vineet Gupta wrote:


So I obviously forgot to get the equivalent binutils branch, but the
only rvb branch on sifive fork feels dated


https://github.com/riscv-collab/riscv-binutils-gdb/tree/riscv-binutils-2.35-rvb




That is the right branch to use with the gcc that you are using.  This 
stuff hasn't been actively maintained so we have old gcc and binutils 
release versions.


We are in the process of putting stuff upstream now.



Thx Jim. Guess we'd have to wait for dust to settle, as this instance of 
binutils can't seem to grok sh1add.uw spit out by rvb-shNadd-03.c


-Vineet

Re: [PATCH v2] nbd/server: Add --selinux-label option

2021-09-27 Thread Richard W.M. Jones

On Mon, Sep 27, 2021 at 04:18:34PM -0500, Eric Blake wrote:
> On Fri, Jul 23, 2021 at 11:33:03AM +0100, Richard W.M. Jones wrote:
> > Under SELinux, Unix domain sockets have two labels.  One is on the
> > disk and can be set with commands such as chcon(1).  There is a
> > different label stored in memory (called the process label).  This can
> > only be set by the process creating the socket.  When using SELinux +
> > SVirt and wanting qemu to be able to connect to a qemu-nbd instance,
> > you must set both labels correctly first.
> > 
> > For qemu-nbd the options to set the second label are awkward.  You can
> > create the socket in a wrapper program and then exec into qemu-nbd.
> > Or you could try something with LD_PRELOAD.
> > 
> > This commit adds the ability to set the label straightforwardly on the
> > command line, via the new --selinux-label flag.  (The name of the flag
> > is the same as the equivalent nbdkit option.)
> > 
> > A worked example showing how to use the new option can be found in
> > this bug: https://bugzilla.redhat.com/show_bug.cgi?id=1984938
> > 
> > Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1984938
> > Signed-off-by: Richard W.M. Jones 
> > ---
> 
> I'm making one tweak to your patch before sending the pull request:
> 
> > +++ b/qemu-nbd.c
> > @@ -64,6 +68,7 @@
> >  #define QEMU_NBD_OPT_FORK  263
> >  #define QEMU_NBD_OPT_TLSAUTHZ  264
> >  #define QEMU_NBD_OPT_PID_FILE  265
> > +#define QEMU_NBD_OPT_SELINUX_LABEL 266
> >  
> >  #define MBR_SIZE 512
> >  
> > @@ -116,6 +121,9 @@ static void usage(const char *name)
> >  "  --forkfork off the server process and exit the 
> > parent\n"
> >  "once the server is running\n"
> >  "  --pid-file=PATH   store the server's process ID in the given 
> > file\n"
> > +#ifdef CONFIG_SELINUX
> > +"  --selinux-label=LABEL set SELinux process label on listening 
> > socket\n"
> > +#endif
> 
> The new option is only conditionally advertised under --help (qemu-nbd
> lacks a stable machine-parseable output, so scraping --help output
> will have to do for now)...
> 
> >  #if HAVE_NBD_DEVICE
> >  "\n"
> >  "Kernel NBD client support:\n"
> > @@ -532,6 +540,8 @@ int main(int argc, char **argv)
> >  { "trace", required_argument, NULL, 'T' },
> >  { "fork", no_argument, NULL, QEMU_NBD_OPT_FORK },
> >  { "pid-file", required_argument, NULL, QEMU_NBD_OPT_PID_FILE },
> > +{ "selinux-label", required_argument, NULL,
> > +  QEMU_NBD_OPT_SELINUX_LABEL },
> 
> ...but is unconditionally supported as a long option even when support
> was not compiled in...
> 
> >  { NULL, 0, NULL, 0 }
> >  };
> >  int ch;
> > @@ -558,6 +568,7 @@ int main(int argc, char **argv)
> >  int old_stderr = -1;
> >  unsigned socket_activation;
> >  const char *pid_file_name = NULL;
> > +const char *selinux_label = NULL;
> >  BlockExportOptions *export_opts;
> >  
> >  #ifdef CONFIG_POSIX
> > @@ -747,6 +758,9 @@ int main(int argc, char **argv)
> >  case QEMU_NBD_OPT_PID_FILE:
> >  pid_file_name = optarg;
> >  break;
> > +case QEMU_NBD_OPT_SELINUX_LABEL:
> > +selinux_label = optarg;
> > +break;
> >  }
> >  }
> >  
> > @@ -938,6 +952,16 @@ int main(int argc, char **argv)
> >  } else {
> >  backlog = MIN(shared, SOMAXCONN);
> >  }
> > +if (sockpath && selinux_label) {
> > +#ifdef CONFIG_SELINUX
> > +if (setsockcreatecon_raw(selinux_label) == -1) {
> > +error_report("Cannot set SELinux socket create context "
> > + "to %s: %s",
> > + selinux_label, strerror(errno));
> > +exit(EXIT_FAILURE);
> > +}
> > +#endif
> 
> ...but here we silently ignore it if support is not compiled in.
> Better is to issue an error message about using an unsupported option,
> so I'll squash this in:
> 
> diff --git i/qemu-nbd.c w/qemu-nbd.c
> index 5dc82c419255..94f8ec07c064 100644
> --- i/qemu-nbd.c
> +++ w/qemu-nbd.c
> @@ -962,6 +962,9 @@ int main(int argc, char **argv)
>   selinux_label, strerror(errno));
>  exit(EXIT_FAILURE);
>  }
> +#else
> +error_report("SELinux support not enabled in this binary");
> +exit(EXIT_FAILURE);
>  #endif
>  }
>  saddr = nbd_build_socket_address(sockpath, bindto, port);
> @@ -978,6 +981,9 @@ int main(int argc, char **argv)
>   strerror(errno));
>  exit(EXIT_FAILURE);
>  }
> +#else
> +error_report("SELinux support not enabled in this binary");
> +exit(EXIT_FAILURE);
>  #endif
>  }
>  } else {
> 

Good idea, thanks.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming

Re: [PATCH v2] nbd/server: Add --selinux-label option

On Fri, Jul 23, 2021 at 11:33:03AM +0100, Richard W.M. Jones wrote:
> Under SELinux, Unix domain sockets have two labels.  One is on the
> disk and can be set with commands such as chcon(1).  There is a
> different label stored in memory (called the process label).  This can
> only be set by the process creating the socket.  When using SELinux +
> SVirt and wanting qemu to be able to connect to a qemu-nbd instance,
> you must set both labels correctly first.
> 
> For qemu-nbd the options to set the second label are awkward.  You can
> create the socket in a wrapper program and then exec into qemu-nbd.
> Or you could try something with LD_PRELOAD.
> 
> This commit adds the ability to set the label straightforwardly on the
> command line, via the new --selinux-label flag.  (The name of the flag
> is the same as the equivalent nbdkit option.)
> 
> A worked example showing how to use the new option can be found in
> this bug: https://bugzilla.redhat.com/show_bug.cgi?id=1984938
> 
> Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1984938
> Signed-off-by: Richard W.M. Jones 
> ---

I'm making one tweak to your patch before sending the pull request:

> +++ b/qemu-nbd.c
> @@ -64,6 +68,7 @@
>  #define QEMU_NBD_OPT_FORK  263
>  #define QEMU_NBD_OPT_TLSAUTHZ  264
>  #define QEMU_NBD_OPT_PID_FILE  265
> +#define QEMU_NBD_OPT_SELINUX_LABEL 266
>  
>  #define MBR_SIZE 512
>  
> @@ -116,6 +121,9 @@ static void usage(const char *name)
>  "  --forkfork off the server process and exit the 
> parent\n"
>  "once the server is running\n"
>  "  --pid-file=PATH   store the server's process ID in the given 
> file\n"
> +#ifdef CONFIG_SELINUX
> +"  --selinux-label=LABEL set SELinux process label on listening socket\n"
> +#endif

The new option is only conditionally advertised under --help (qemu-nbd
lacks a stable machine-parseable output, so scraping --help output
will have to do for now)...

>  #if HAVE_NBD_DEVICE
>  "\n"
>  "Kernel NBD client support:\n"
> @@ -532,6 +540,8 @@ int main(int argc, char **argv)
>  { "trace", required_argument, NULL, 'T' },
>  { "fork", no_argument, NULL, QEMU_NBD_OPT_FORK },
>  { "pid-file", required_argument, NULL, QEMU_NBD_OPT_PID_FILE },
> +{ "selinux-label", required_argument, NULL,
> +  QEMU_NBD_OPT_SELINUX_LABEL },

...but is unconditionally supported as a long option even when support
was not compiled in...

>  { NULL, 0, NULL, 0 }
>  };
>  int ch;
> @@ -558,6 +568,7 @@ int main(int argc, char **argv)
>  int old_stderr = -1;
>  unsigned socket_activation;
>  const char *pid_file_name = NULL;
> +const char *selinux_label = NULL;
>  BlockExportOptions *export_opts;
>  
>  #ifdef CONFIG_POSIX
> @@ -747,6 +758,9 @@ int main(int argc, char **argv)
>  case QEMU_NBD_OPT_PID_FILE:
>  pid_file_name = optarg;
>  break;
> +case QEMU_NBD_OPT_SELINUX_LABEL:
> +selinux_label = optarg;
> +break;
>  }
>  }
>  
> @@ -938,6 +952,16 @@ int main(int argc, char **argv)
>  } else {
>  backlog = MIN(shared, SOMAXCONN);
>  }
> +if (sockpath && selinux_label) {
> +#ifdef CONFIG_SELINUX
> +if (setsockcreatecon_raw(selinux_label) == -1) {
> +error_report("Cannot set SELinux socket create context "
> + "to %s: %s",
> + selinux_label, strerror(errno));
> +exit(EXIT_FAILURE);
> +}
> +#endif

...but here we silently ignore it if support is not compiled in.
Better is to issue an error message about using an unsupported option,
so I'll squash this in:

diff --git i/qemu-nbd.c w/qemu-nbd.c
index 5dc82c419255..94f8ec07c064 100644
--- i/qemu-nbd.c
+++ w/qemu-nbd.c
@@ -962,6 +962,9 @@ int main(int argc, char **argv)
  selinux_label, strerror(errno));
 exit(EXIT_FAILURE);
 }
+#else
+error_report("SELinux support not enabled in this binary");
+exit(EXIT_FAILURE);
 #endif
 }
 saddr = nbd_build_socket_address(sockpath, bindto, port);
@@ -978,6 +981,9 @@ int main(int argc, char **argv)
  strerror(errno));
 exit(EXIT_FAILURE);
 }
+#else
+error_report("SELinux support not enabled in this binary");
+exit(EXIT_FAILURE);
 #endif
 }
 } else {


-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org

Re: [PATCH v11 00/16] target/riscv: Update QEmu for Zb[abcs] 1.0.0

2021-09-27 Thread Jim Wilson

On Mon, Sep 27, 2021 at 1:01 PM Vineet Gupta  wrote:

> So I obviously forgot to get the equivalent binutils branch, but the
> only rvb branch on sifive fork feels dated
>
>
> https://github.com/riscv-collab/riscv-binutils-gdb/tree/riscv-binutils-2.35-rvb

That is the right branch to use with the gcc that you are using.  This
stuff hasn't been actively maintained so we have old gcc and binutils
release versions.

We are in the process of putting stuff upstream now.

Jim

Re: [PATCH] hw/arm/virt: Allow additions to the generated device tree

2021-09-27 Thread Simon Glass

Hi Peter,

On Mon, 27 Sept 2021 at 10:50, Peter Maydell  wrote:
>
> On Mon, 27 Sept 2021 at 17:04, Simon Glass  wrote:
> > On Mon, 27 Sept 2021 at 09:46, Peter Maydell  
> > wrote:
>
> > > My take is that this is u-boot doing weird custom things with
> > > the DTB that aren't "describe the hardware". You should be able
> > > to boot u-boot by putting those custom DTB extra things in a
> > > separate blob and having u-boot combine that with the
> > > actual DTB when it starts.
> >
> > Well this is how U-Boot works. Since it doesn't have a user-space
> > program to provide configuration / policy, nor a command line to
> > provide parameters (except with sandbox[1]), device tree is what it
> > uses. All of its driver model and configuration comes from there. The
> > 'describe the hardware' thing has been discussed to death but U-Boot
> > needs board- and arch-specific policy information about the hardware
> > so it can actually boot successfully on real systems.
> >
> > It has been like this since U-Boot started using device tree, some 9
> > years ago! I can't imagine it changing.
>
> > As to a separate blob, isn't that what I am suggesting with this
> > patch? QEMU doesn't support passing two separate dtb blobs to U-Boot,
> > nor is there an API for that.
>
> You're suggesting "QEMU should have machinery for taking two
> blobs and combining them and passing one blob to the guest".
> I'm suggesting "the guest should combine them" (and the second
> blob could be provided via several different existing mechanisms
> that amount to 'QEMU provides some ways to load data into guest
> ROM or RAM'), because as far as I know no other guest has this
> "combine two different bits of dtb for me" requirement.

I think you are misunderstanding my patch and that may be the problem here.

Where QEMU is provided with a dtb (-dtb) it uses that and passes it
on. This is absolutely fine and I have tested that this works well
with U-Boot. No issues.

Where QEMU creates its own dtb on the fly the -dtb parameter is
actually ignored and there is no way to adjust what QEMU passes on,
without recompiling QEMU. It is quite inflexible, actually. Even just
creating a new device for development purposes is not possible. That
is the problem I am trying to solve.

There is certainly no intent to combine two bits of dtb with my patch.
We could easily do that externally to QEMU.

The only current working option is to just pass the U-Boot dtb through
and not use QEMU's on-the-fly-generated dtb at all. But I am assuming
there is a reason why QEMU generates that dtb, so that would not be
desirable?

>
> > Even if we did that it would require
> > code very early in U-Boot to process, which would make it infeasible
> > for anything other than QEMU. Ideally QEMU should work the same way as
> > other boards.
>
> Well, real hardware doesn't provide device tree blobs of any
> form to u-boot, right? u-boot is just compiled into flash, or
> perhaps launched from some other boot ROM, as I understand it.
> Where does it get its dtb from then ?

The dtb is compiled as part of the U-Boot build, but exists as a
separate file. The mechanism for providing the dtb to U-Boot at
runtime is somewhat board-specific and we are working on standardising
it more. On the rpi for example, it is provided by a FAT file system
and first-stage firmware loads that and passes it along to U-Boot.
Some systems use TF-A which does a similar thing. Some use U-Boot as a
first-stage loader in which case SPL may select a DTB (out of many
built by the build) to pass to U-Boot proper.

There is also a tool called binman which packages the firmware as it
is getting quite complicated:

https://u-boot.readthedocs.io/en/latest/develop/package/binman.html

>
> > As a related point, I am looking at how we pass things between
> > firmware components.  If we wanted to pass in some initial state in
> > some sort of blob, is it possible to set that up in memory (along with
> > the binary) for when QEMU starts emulating? The code and RAM might be
> > quite a long way apart so using a single image would involve a lot of
> > zeroes.
>
> The generic-loader is quite good for this sort of thing:
> https://qemu-project.gitlab.io/qemu/system/generic-loader.html
> You can load raw data files to specific addresses; or you can
> load ELF files (which can have multiple segments which get loaded
> as the ELF header specifies). You can specify -device generic-loader,...
> as many times as you need to to load multiple blobs.

OK great, thank you, that looks very useful.

One more question...other than dtb, does QEMU typically add support
for data structures needed by particular projects or groups of
projects? It looks like dtb was supported for ARM Linux originally? I
am looking at supporting bloblist as a way of communicating
information between firmware (basically a simple way of packaging
multiple blobs).

https://github.com/ni/u-boot/blob/master/doc/README.bloblist

Regards,
Simon

Re: [PATCH v11 00/16] target/riscv: Update QEmu for Zb[abcs] 1.0.0

2021-09-27 Thread Vineet Gupta

Hi,

On 9/11/21 7:00 AM, Philipp Tomsich wrote:

The Zb[abcs] extensions have complete public review and are nearing
ratifications. These individual extensions are one part of what was
previously thought of as the "BitManip" (B) extension, leaving the
final details of future Zb* extensions open as they will undergo
further public discourse.

This series updates the earlier support for the B extension by
- removing those instructions that are not included in Zb[abcs]
- splitting this into 4 separate extensions that can be independently
enabled: Zba (addressing), Zbb (basic bit-manip), Zbc (carryless
multiplication), Zbs (single-bit operations)
- updating to the 1.0.0 version (e.g. w-forms of rev8 and Zbs
instructions are not included in Zb[abcs])

For the latest version of the public review specification
(incorporating some editorial fixes and corrections from the review
period), refer to:

https://github.com/riscv/riscv-bitmanip/releases/download/1.0.0/bitmanip-1.0.0-31-g2af7256.pdf

I was curious to try these out. Challenge was not qemu but stuff built
to run on this qemu.

At LPC last week Jim/Kito suggested I use the gcc branch @
https://github.com/riscv-collab/riscv-gcc/tree/riscv-gcc-10.2.0-rvb

With that I get

$ riscv64-unknown-elf-gcc
~/gnu/gcc/gcc/testsuite/gcc.target/riscv/rvb-zbs-bclr.c -c --save-temps
-march=rv64gc_zbb_zbs -O2

Assembler messages:
Error: -march=rv64imafdc_zbb_zbs: unknown prefixed ISA extension `zbs'

So I obviously forgot to get the equivalent binutils branch, but the
only rvb branch on sifive fork feels dated

https://github.com/riscv-collab/riscv-binutils-gdb/tree/riscv-binutils-2.35-rvb

Can someone point me to the right binutils repo/branch to pair with gcc
changes above.

Thx,
-Vineet

Changes in v11:
- Swaps out the EXT_ZERO to EXT_NONE, as no extension is to be performed.
- Fix typos in commit message.

Changes in v10:
- New patch
- New patch, fixing regressions discovered with x264_r.
- New patch, fixing correctness for clzw called on a register with undefined
(as in: not properly sign-extended) upper bits.
- Retested with CF3 and SPEC2017 (size=test, size=ref); addressing new
regressions (due to bugs in gen_clzw) from testing with SPEC2017 using
different optimization levels
- Split off gen_add_uw() fix into a separate patch, as requested.

Changes in v9:
- Retested with CF3 and SPEC2017 (size=test only).
- Rebased to 8880cc4362.
- Update gen_add_uw() to use a temporary instead of messing with
arg1 (fixes a regression after rebase on CF3 and SPEC2017).
- Rebased to 8880cc4362.
- Picked up Alistair's Reviewed-by, after patman had failed to catch
it for v8.
- Rebased to 8880cc4362.
- Fixes a whitespace-at-the-end-of-line warning for the rev8 comment
in insn32.decode
- Rebased to 8880cc4362.

Changes in v8:
- Optimize orc.b further by reordering the shift/and, updating the
comment to reflect that we put the truth-value into the LSB, and
putting the (now only) constant in a temporary
- Fold the final bitwise-not into the second and, using an andc.

Changes in v7:
- Free TCG temporary in gen_orc_b().

Changes in v6:
- Move gen_clmulh to trans_rvb.c.inc, as per Richard H's request.
- Fixed orc.b (now passes SPEC w/ optimized string functions) by
adding the missing final negation.

Changes in v5:
- Introduce gen_clmulh (as suggested by Richard H) and use to simplify
trans_clmulh().

Changes in v4:
- Drop rewrite of slli.uw (to match formal specification), as it would
remove an optimization.
- Change orc.b to implementation suggested by Richard Henderson
- reorder trans_rev8* functions to be sequential
- rename rev8 to rev8_32 in decoder
- Renamed RV32 variant to zext_h_32.
- Reordered trans_zext_h_{32,64} to be next to each other.

Changes in v3:
- Split off removal of 'x-b' property and 'ext_b' field into a separate
patch to ensure bisectability.
- The changes to the Zba instructions (i.e. the REQUIRE_ZBA macro
and its use for qualifying the Zba instructions) are moved into
a separate commit.
- Remove the W-form instructions from Zbs in a separate commit.
- Remove shift-one instructions in a separate commit.
- The changes to the Zbs instructions (i.e. the REQUIRE_ZBS macro and
its use for qualifying the Zba instructions) are moved into a
separate commit.
- This adds the Zbc instructions as a separate commit.
- Uses a helper for clmul/clmulr instead of inlining the calculation of
the result (addressing a comment from Richard Henderson).
- The changes to the Zbb instructions (i.e. use the REQUIRE_ZBB macro)
are now in a separate commit.
- Moved orc.b and gorc/gorci changes into separate commit.
- Using the simpler orc.b implementation suggested by Richard Henderson
- Moved the REQUIRE_32BIT macro into a separate commit.
- rev8-addition & grevi*-removal moved to a separate commit
- Moved zext.h-addition & pack*-removal to a separate commit.
- Removing RVB moved into a separate commit at

[PULL 26/32] python/aqmp: add LineProtocol tests

Tests a real connect, a real accept, and really sending and receiving a
message over a UNIX socket.

Brings coverage of protocol.py up to ~93%.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-27-js...@redhat.com
Signed-off-by: John Snow 
---
 python/tests/protocol.py | 48 
 1 file changed, 48 insertions(+)

diff --git a/python/tests/protocol.py b/python/tests/protocol.py
index f0682d29ce5..5cd7938be35 100644
--- a/python/tests/protocol.py
+++ b/python/tests/protocol.py
@@ -78,6 +78,25 @@ async def simulate_disconnect(self) -> None:
 self._schedule_disconnect()
 
 
+class LineProtocol(AsyncProtocol[str]):
+def __init__(self, name=None):
+super().__init__(name)
+self.rx_history = []
+
+async def _do_recv(self) -> str:
+raw = await self._readline()
+msg = raw.decode()
+self.rx_history.append(msg)
+return msg
+
+def _do_send(self, msg: str) -> None:
+assert self._writer is not None
+self._writer.write(msg.encode() + b'\n')
+
+async def send_msg(self, msg: str) -> None:
+await self._outgoing.put(msg)
+
+
 def run_as_task(coro, allow_cancellation=False):
 """
 Run a given coroutine as a task.
@@ -533,3 +552,32 @@ async def testConnectRequireDisconnecting(self):
  " Call disconnect() to return to IDLE state."),
 accept=False,
 )
+
+
+class SimpleSession(TestBase):
+
+def setUp(self):
+super().setUp()
+self.server = LineProtocol(type(self).__name__ + '-server')
+
+async def _asyncSetUp(self):
+await super()._asyncSetUp()
+await self._watch_runstates(*self.GOOD_CONNECTION_STATES)
+
+async def _asyncTearDown(self):
+await self.proto.disconnect()
+try:
+await self.server.disconnect()
+except EOFError:
+pass
+await super()._asyncTearDown()
+
+@TestBase.async_test
+async def testSmoke(self):
+with TemporaryDirectory(suffix='.aqmp') as tmpdir:
+sock = os.path.join(tmpdir, type(self.proto).__name__ + ".sock")
+server_task = create_task(self.server.accept(sock))
+
+# give the server a chance to start listening [...]
+await asyncio.sleep(0)
+await self.proto.connect(sock)
-- 
2.31.1

[PULL 09/32] python/aqmp: add AsyncProtocol.accept() method

It's a little messier than connect, because it wasn't designed to accept
*precisely one* connection. Such is life.

Signed-off-by: John Snow 
Reviewed-by: Eric Blake 
Message-id: 20210915162955.333025-10-js...@redhat.com
Signed-off-by: John Snow 
---
 python/qemu/aqmp/protocol.py | 89 ++--
 1 file changed, 85 insertions(+), 4 deletions(-)

diff --git a/python/qemu/aqmp/protocol.py b/python/qemu/aqmp/protocol.py
index 1dfd12895dc..62c26ede5a4 100644
--- a/python/qemu/aqmp/protocol.py
+++ b/python/qemu/aqmp/protocol.py
@@ -243,6 +243,24 @@ async def runstate_changed(self) -> Runstate:
 await self._runstate_event.wait()
 return self.runstate
 
+@upper_half
+@require(Runstate.IDLE)
+async def accept(self, address: Union[str, Tuple[str, int]],
+ ssl: Optional[SSLContext] = None) -> None:
+"""
+Accept a connection and begin processing message queues.
+
+If this call fails, `runstate` is guaranteed to be set back to `IDLE`.
+
+:param address:
+Address to listen to; UNIX socket path or TCP address/port.
+:param ssl: SSL context to use, if any.
+
+:raise StateError: When the `Runstate` is not `IDLE`.
+:raise ConnectError: If a connection could not be accepted.
+"""
+await self._new_session(address, ssl, accept=True)
+
 @upper_half
 @require(Runstate.IDLE)
 async def connect(self, address: Union[str, Tuple[str, int]],
@@ -308,7 +326,8 @@ def _set_state(self, state: Runstate) -> None:
 @upper_half
 async def _new_session(self,
address: Union[str, Tuple[str, int]],
-   ssl: Optional[SSLContext] = None) -> None:
+   ssl: Optional[SSLContext] = None,
+   accept: bool = False) -> None:
 """
 Establish a new connection and initialize the session.
 
@@ -317,9 +336,10 @@ async def _new_session(self,
 to be set back to `IDLE`.
 
 :param address:
-Address to connect to;
+Address to connect to/listen on;
 UNIX socket path or TCP address/port.
 :param ssl: SSL context to use, if any.
+:param accept: Accept a connection instead of connecting when `True`.
 
 :raise ConnectError:
 When a connection or session cannot be established.
@@ -333,7 +353,7 @@ async def _new_session(self,
 
 try:
 phase = "connection"
-await self._establish_connection(address, ssl)
+await self._establish_connection(address, ssl, accept)
 
 phase = "session"
 await self._establish_session()
@@ -367,6 +387,7 @@ async def _establish_connection(
 self,
 address: Union[str, Tuple[str, int]],
 ssl: Optional[SSLContext] = None,
+accept: bool = False
 ) -> None:
 """
 Establish a new connection.
@@ -375,6 +396,7 @@ async def _establish_connection(
 Address to connect to/listen on;
 UNIX socket path or TCP address/port.
 :param ssl: SSL context to use, if any.
+:param accept: Accept a connection instead of connecting when `True`.
 """
 assert self.runstate == Runstate.IDLE
 self._set_state(Runstate.CONNECTING)
@@ -384,7 +406,66 @@ async def _establish_connection(
 # otherwise yield.
 await asyncio.sleep(0)
 
-await self._do_connect(address, ssl)
+if accept:
+await self._do_accept(address, ssl)
+else:
+await self._do_connect(address, ssl)
+
+@upper_half
+async def _do_accept(self, address: Union[str, Tuple[str, int]],
+ ssl: Optional[SSLContext] = None) -> None:
+"""
+Acting as the transport server, accept a single connection.
+
+:param address:
+Address to listen on; UNIX socket path or TCP address/port.
+:param ssl: SSL context to use, if any.
+
+:raise OSError: For stream-related errors.
+"""
+self.logger.debug("Awaiting connection on %s ...", address)
+connected = asyncio.Event()
+server: Optional[asyncio.AbstractServer] = None
+
+async def _client_connected_cb(reader: asyncio.StreamReader,
+   writer: asyncio.StreamWriter) -> None:
+"""Used to accept a single incoming connection, see below."""
+nonlocal server
+nonlocal connected
+
+# A connection has been accepted; stop listening for new ones.
+assert server is not None
+server.close()
+await server.wait_closed()
+server = None
+
+# Register this client as being connected
+self._reader, self._writer = (reader, writer)
+
+# Signal back: We've accepted a client!
+

[PULL 23/32] python/aqmp: add scary message

Add a warning whenever AQMP is used to steer people gently away from
using it for the time-being.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-24-js...@redhat.com
Signed-off-by: John Snow 
---
 python/qemu/aqmp/__init__.py | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/python/qemu/aqmp/__init__.py b/python/qemu/aqmp/__init__.py
index 4b7df53e006..ab1782999cf 100644
--- a/python/qemu/aqmp/__init__.py
+++ b/python/qemu/aqmp/__init__.py
@@ -21,6 +21,8 @@
 # This work is licensed under the terms of the GNU GPL, version 2.  See
 # the COPYING file in the top-level directory.
 
+import warnings
+
 from .error import AQMPError
 from .events import EventListener
 from .message import Message
@@ -28,6 +30,18 @@
 from .qmp_client import ExecInterruptedError, ExecuteError, QMPClient
 
 
+_WMSG = """
+
+The Asynchronous QMP library is currently in development and its API
+should be considered highly fluid and subject to change. It should
+not be used by any other scripts checked into the QEMU tree.
+
+Proceed with caution!
+"""
+
+warnings.warn(_WMSG, FutureWarning)
+
+
 # The order of these fields impact the Sphinx documentation order.
 __all__ = (
 # Classes, most to least important
-- 
2.31.1

[PULL 32/32] python/aqmp-tui: Add syntax highlighting

From: G S Niteesh Babu 

Add syntax highlighting for the incoming and outgoing QMP messages.
This is achieved using the pygments module which was added in a
previous commit.

The current implementation is a really simple one which doesn't
allow for any configuration. In future this has to be improved
to allow for easier theme config using an external config of
some sort.

Signed-off-by: G S Niteesh Babu 
Message-Id: <20210823220746.28295-6-niteesh...@gmail.com>
Signed-off-by: John Snow 
---
 python/qemu/aqmp/aqmp_tui.py | 36 ++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/python/qemu/aqmp/aqmp_tui.py b/python/qemu/aqmp/aqmp_tui.py
index ac533541d2f..a2929f771cf 100644
--- a/python/qemu/aqmp/aqmp_tui.py
+++ b/python/qemu/aqmp/aqmp_tui.py
@@ -30,6 +30,8 @@
 cast,
 )
 
+from pygments import lexers
+from pygments import token as Token
 import urwid
 import urwid_readline
 
@@ -45,6 +47,22 @@
 UPDATE_MSG: str = 'UPDATE_MSG'
 
 
+palette = [
+(Token.Punctuation, '', '', '', 'h15,bold', 'g7'),
+(Token.Text, '', '', '', '', 'g7'),
+(Token.Name.Tag, '', '', '', 'bold,#f88', 'g7'),
+(Token.Literal.Number.Integer, '', '', '', '#fa0', 'g7'),
+(Token.Literal.String.Double, '', '', '', '#6f6', 'g7'),
+(Token.Keyword.Constant, '', '', '', '#6af', 'g7'),
+('DEBUG', '', '', '', '#ddf', 'g7'),
+('INFO', '', '', '', 'g100', 'g7'),
+('WARNING', '', '', '', '#ff6', 'g7'),
+('ERROR', '', '', '', '#a00', 'g7'),
+('CRITICAL', '', '', '', '#a00', 'g7'),
+('background', '', 'black', '', '', 'g7'),
+]
+
+
 def format_json(msg: str) -> str:
 """
 Formats valid/invalid multi-line JSON message into a single-line message.
@@ -353,6 +371,9 @@ def run(self, debug: bool = False) -> None:
 :param debug:
 Enables/Disables asyncio event loop debugging
 """
+screen = urwid.raw_display.Screen()
+screen.set_terminal_properties(256)
+
 self.aloop = asyncio.get_event_loop()
 self.aloop.set_debug(debug)
 
@@ -364,6 +385,8 @@ def run(self, debug: bool = False) -> None:
 event_loop = urwid.AsyncioEventLoop(loop=self.aloop)
 main_loop = urwid.MainLoop(urwid.AttrMap(self.window, 'background'),
unhandled_input=self.unhandled_input,
+   screen=screen,
+   palette=palette,
handle_mouse=True,
event_loop=event_loop)
 
@@ -487,7 +510,8 @@ def __init__(self, parent: App) -> None:
 self.history = urwid.SimpleFocusListWalker([])
 super().__init__(self.history)
 
-def add_to_history(self, history: str) -> None:
+def add_to_history(self,
+   history: Union[str, List[Tuple[str, str]]]) -> None:
 """
 Appends a message to the list and set the focus to the last appended
 message.
@@ -531,10 +555,18 @@ def cb_add_to_history(self, msg: str, level: 
Optional[str] = None) -> None:
 
 :param msg:
 The message to be appended to the history box.
+:param level:
+The log level of the message, if it is a log message.
 """
+formatted = []
 if level:
 msg = f'[{level}]: {msg}'
-self.history.add_to_history(msg)
+formatted.append((level, msg))
+else:
+lexer = lexers.JsonLexer()  # pylint: disable=no-member
+for token in lexer.get_tokens(msg):
+formatted.append(token)
+self.history.add_to_history(formatted)
 
 
 class Window(urwid.Frame):
-- 
2.31.1

[PULL 28/32] python: Add dependencies for AQMP TUI

From: G S Niteesh Babu 

Added dependencies for the upcoming AQMP TUI under the optional
'tui' group.

The same dependencies have also been added under the devel group
since no workaround has been found for optional groups to imply
other optional groups.

Signed-off-by: G S Niteesh Babu 
Message-Id: <20210823220746.28295-2-niteesh...@gmail.com>
Signed-off-by: John Snow 
---
 python/Pipfile.lock | 12 
 python/setup.cfg|  8 
 2 files changed, 20 insertions(+)

diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index 457f5c3fe87..da7a4ee164c 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -289,6 +289,18 @@
 "markers": "python_version < '3.8'",
 "version": "==3.10.0.0"
 },
+"urwid": {
+"hashes": [
+
"sha256:588bee9c1cb208d0906a9f73c613d2bd32c3ed3702012f51efe318a3f2127eae"
+],
+"version": "==2.1.2"
+},
+"urwid-readline": {
+"hashes": [
+
"sha256:018020cbc864bb5ed87be17dc26b069eae2755cb29f3a9c569aac3bded1efaf4"
+],
+"version": "==0.13"
+},
 "virtualenv": {
 "hashes": [
 
"sha256:14fdf849f80dbb29a4eb6caa9875d476ee2a5cf76a5f5415fa2f1606010ab467",
diff --git a/python/setup.cfg b/python/setup.cfg
index 4fd0c68a5b7..efcda23c48d 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -44,11 +44,18 @@ devel =
 mypy >= 0.770
 pylint >= 2.8.0
 tox >= 3.18.0
+urwid >= 2.1.2
+urwid-readline >= 0.13
 
 # Provides qom-fuse functionality
 fuse =
 fusepy >= 2.0.4
 
+# AQMP TUI dependencies
+tui =
+urwid >= 2.1.2
+urwid-readline >= 0.13
+
 [options.entry_points]
 console_scripts =
 qom = qemu.qmp.qom:main
@@ -138,6 +145,7 @@ allowlist_externals = make
 deps =
 .[devel]
 .[fuse]  # Workaround to trigger tox venv rebuild
+.[tui]   # Workaround to trigger tox venv rebuild
 commands =
 make check
 
-- 
2.31.1

[PULL 21/32] python/aqmp: add _raw() execution interface

This is added in anticipation of wanting it for a synchronous wrapper
for the iotest interface. Normally, execute() and execute_msg() both
raise QMP errors in the form of Python exceptions.

Many iotests expect the entire reply as-is. To reduce churn there, add a
private execution interface that will ease transition churn. However, I
do not wish to encourage its use, so it will remain a private interface.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-22-js...@redhat.com
Signed-off-by: John Snow 
---
 python/qemu/aqmp/qmp_client.py | 51 ++
 1 file changed, 51 insertions(+)

diff --git a/python/qemu/aqmp/qmp_client.py b/python/qemu/aqmp/qmp_client.py
index 879348feaaa..82e9dab124c 100644
--- a/python/qemu/aqmp/qmp_client.py
+++ b/python/qemu/aqmp/qmp_client.py
@@ -484,6 +484,57 @@ async def _execute(self, msg: Message, assign_id: bool = 
True) -> Message:
 exec_id = await self._issue(msg)
 return await self._reply(exec_id)
 
+@upper_half
+@require(Runstate.RUNNING)
+async def _raw(
+self,
+msg: Union[Message, Mapping[str, object], bytes],
+assign_id: bool = True,
+) -> Message:
+"""
+Issue a raw `Message` to the QMP server and await a reply.
+
+:param msg:
+A Message to send to the server. It may be a `Message`, any
+Mapping (including Dict), or raw bytes.
+:param assign_id:
+Assign an arbitrary execution ID to this message. If
+`False`, the existing id must either be absent (and no other
+such pending execution may omit an ID) or a string. If it is
+a string, it must not start with '__aqmp#' and no other such
+pending execution may currently be using that ID.
+
+:return: Execution reply from the server.
+
+:raise ExecInterruptedError:
+When the reply could not be retrieved because the connection
+was lost, or some other problem.
+:raise TypeError:
+When assign_id is `False`, an ID is given, and it is not a string.
+:raise ValueError:
+When assign_id is `False`, but the ID is not usable;
+Either because it starts with '__aqmp#' or it is already in-use.
+"""
+# 1. convert generic Mapping or bytes to a QMP Message
+# 2. copy Message objects so that we assign an ID only to the copy.
+msg = Message(msg)
+
+exec_id = msg.get('id')
+if not assign_id and 'id' in msg:
+if not isinstance(exec_id, str):
+raise TypeError(f"ID ('{exec_id}') must be a string.")
+if exec_id.startswith('__aqmp#'):
+raise ValueError(
+f"ID ('{exec_id}') must not start with '__aqmp#'."
+)
+
+if not assign_id and exec_id in self._pending:
+raise ValueError(
+f"ID '{exec_id}' is in-use and cannot be used."
+)
+
+return await self._execute(msg, assign_id=assign_id)
+
 @upper_half
 @require(Runstate.RUNNING)
 async def execute_msg(self, msg: Message) -> object:
-- 
2.31.1

[PULL 29/32] python/aqmp-tui: Add AQMP TUI

From: G S Niteesh Babu 

Added AQMP TUI.

Implements the following basic features:
1) Command transmission/reception.
2) Shows events asynchronously.
3) Shows server status in the bottom status bar.
4) Automatic retries on disconnects and error conditions.

Also added type annotations and necessary pylint/mypy configurations.

Signed-off-by: G S Niteesh Babu 
Message-Id: <20210823220746.28295-3-niteesh...@gmail.com>
Signed-off-by: John Snow 
---
 python/qemu/aqmp/aqmp_tui.py | 620 +++
 python/setup.cfg |  13 +-
 2 files changed, 632 insertions(+), 1 deletion(-)
 create mode 100644 python/qemu/aqmp/aqmp_tui.py

diff --git a/python/qemu/aqmp/aqmp_tui.py b/python/qemu/aqmp/aqmp_tui.py
new file mode 100644
index 000..ac533541d2f
--- /dev/null
+++ b/python/qemu/aqmp/aqmp_tui.py
@@ -0,0 +1,620 @@
+# Copyright (c) 2021
+#
+# Authors:
+#  Niteesh Babu G S 
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later.  See the COPYING file in the top-level directory.
+"""
+AQMP TUI
+
+AQMP TUI is an asynchronous interface built on top the of the AQMP library.
+It is the successor of QMP-shell and is bought-in as a replacement for it.
+
+Example Usage: aqmp-tui 
+Full Usage: aqmp-tui --help
+"""
+
+import argparse
+import asyncio
+import json
+import logging
+from logging import Handler, LogRecord
+import signal
+from typing import (
+List,
+Optional,
+Tuple,
+Type,
+Union,
+cast,
+)
+
+import urwid
+import urwid_readline
+
+from ..qmp import QEMUMonitorProtocol, QMPBadPortError
+from .error import ProtocolError
+from .message import DeserializationError, Message, UnexpectedTypeError
+from .protocol import ConnectError, Runstate
+from .qmp_client import ExecInterruptedError, QMPClient
+from .util import create_task, pretty_traceback
+
+
+# The name of the signal that is used to update the history list
+UPDATE_MSG: str = 'UPDATE_MSG'
+
+
+def format_json(msg: str) -> str:
+"""
+Formats valid/invalid multi-line JSON message into a single-line message.
+
+Formatting is first tried using the standard json module. If that fails
+due to an decoding error then a simple string manipulation is done to
+achieve a single line JSON string.
+
+Converting into single line is more asthetically pleasing when looking
+along with error messages.
+
+Eg:
+Input:
+  [ 1,
+true,
+3 ]
+The above input is not a valid QMP message and produces the following error
+"QMP message is not a JSON object."
+When displaying this in TUI in multiline mode we get
+
+[ 1,
+  true,
+  3 ]: QMP message is not a JSON object.
+
+whereas in singleline mode we get the following
+
+[1, true, 3]: QMP message is not a JSON object.
+
+The single line mode is more asthetically pleasing.
+
+:param msg:
+The message to formatted into single line.
+
+:return: Formatted singleline message.
+"""
+try:
+msg = json.loads(msg)
+return str(json.dumps(msg))
+except json.decoder.JSONDecodeError:
+msg = msg.replace('\n', '')
+words = msg.split(' ')
+words = list(filter(None, words))
+return ' '.join(words)
+
+
+def has_handler_type(logger: logging.Logger,
+ handler_type: Type[Handler]) -> bool:
+"""
+The Logger class has no interface to check if a certain type of handler is
+installed or not. So we provide an interface to do so.
+
+:param logger:
+Logger object
+:param handler_type:
+The type of the handler to be checked.
+
+:return: returns True if handler of type `handler_type`.
+"""
+for handler in logger.handlers:
+if isinstance(handler, handler_type):
+return True
+return False
+
+
+class App(QMPClient):
+"""
+Implements the AQMP TUI.
+
+Initializes the widgets and starts the urwid event loop.
+
+:param address:
+Address of the server to connect to.
+:param num_retries:
+The number of times to retry before stopping to reconnect.
+:param retry_delay:
+The delay(sec) before each retry
+"""
+def __init__(self, address: Union[str, Tuple[str, int]], num_retries: int,
+ retry_delay: Optional[int]) -> None:
+urwid.register_signal(type(self), UPDATE_MSG)
+self.window = Window(self)
+self.address = address
+self.aloop: Optional[asyncio.AbstractEventLoop] = None
+self.num_retries = num_retries
+self.retry_delay = retry_delay if retry_delay else 2
+self.retry: bool = False
+self.exiting: bool = False
+super().__init__()
+
+def add_to_history(self, msg: str, level: Optional[str] = None) -> None:
+"""
+Appends the msg to the history list.
+
+:param msg:
+The raw message to be appended in string type.
+"""
+

[PULL 22/32] python/aqmp: add asyncio_run compatibility wrapper

As a convenience. It isn't used by the library itself, but it is used by
the test suite. It will also come in handy for users of the library
still on Python 3.6.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-23-js...@redhat.com
Signed-off-by: John Snow 
---
 python/qemu/aqmp/util.py | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/python/qemu/aqmp/util.py b/python/qemu/aqmp/util.py
index 52a15321889..eaa5fc7d5f9 100644
--- a/python/qemu/aqmp/util.py
+++ b/python/qemu/aqmp/util.py
@@ -147,6 +147,25 @@ async def wait_closed(writer: asyncio.StreamWriter) -> 
None:
 await asyncio.sleep(0)
 
 
+def asyncio_run(coro: Coroutine[Any, Any, T], *, debug: bool = False) -> T:
+"""
+Python 3.6-compatible `asyncio.run` wrapper.
+
+:param coro: A coroutine to execute now.
+:return: The return value from the coroutine.
+"""
+if sys.version_info >= (3, 7):
+return asyncio.run(coro, debug=debug)
+
+# Python 3.6
+loop = asyncio.get_event_loop()
+loop.set_debug(debug)
+ret = loop.run_until_complete(coro)
+loop.close()
+
+return ret
+
+
 # 
 # Section: Logging & Debugging
 # 
-- 
2.31.1

[PULL 19/32] python/aqmp: Add message routing to QMP protocol

Add the ability to handle and route messages in qmp_protocol.py. The
interface for actually sending anything still isn't added until next
commit.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-20-js...@redhat.com
Signed-off-by: John Snow 
---
 python/qemu/aqmp/qmp_client.py | 122 -
 1 file changed, 120 insertions(+), 2 deletions(-)

diff --git a/python/qemu/aqmp/qmp_client.py b/python/qemu/aqmp/qmp_client.py
index 000ff59c7a7..fa0cc7c5ae5 100644
--- a/python/qemu/aqmp/qmp_client.py
+++ b/python/qemu/aqmp/qmp_client.py
@@ -7,15 +7,19 @@
 accept an incoming connection from that server.
 """
 
+# The import workarounds here are fixed in the next commit.
+import asyncio  # pylint: disable=unused-import # noqa
 import logging
 from typing import (
 Dict,
 List,
 Mapping,
 Optional,
+Union,
+cast,
 )
 
-from .error import ProtocolError
+from .error import AQMPError, ProtocolError
 from .events import Events
 from .message import Message
 from .models import Greeting
@@ -61,6 +65,53 @@ class NegotiationError(_WrappedProtocolError):
 """
 
 
+class ExecInterruptedError(AQMPError):
+"""
+Exception raised when an RPC is interrupted.
+
+This error is raised when an execute() statement could not be
+completed.  This can occur because the connection itself was
+terminated before a reply was received.
+
+The true cause of the interruption will be available via `disconnect()`.
+"""
+
+
+class _MsgProtocolError(ProtocolError):
+"""
+Abstract error class for protocol errors that have a `Message` object.
+
+This Exception class is used for protocol errors where the `Message`
+was mechanically understood, but was found to be inappropriate or
+malformed.
+
+:param error_message: Human-readable string describing the error.
+:param msg: The QMP `Message` that caused the error.
+"""
+def __init__(self, error_message: str, msg: Message):
+super().__init__(error_message)
+#: The received `Message` that caused the error.
+self.msg: Message = msg
+
+def __str__(self) -> str:
+return "\n".join([
+super().__str__(),
+f"  Message was: {str(self.msg)}\n",
+])
+
+
+class ServerParseError(_MsgProtocolError):
+"""
+The Server sent a `Message` indicating parsing failure.
+
+i.e. A reply has arrived from the server, but it is missing the "ID"
+field, indicating a parsing error.
+
+:param error_message: Human-readable string describing the error.
+:param msg: The QMP `Message` that caused the error.
+"""
+
+
 class QMPClient(AsyncProtocol[Message], Events):
 """
 Implements a QMP client connection.
@@ -106,6 +157,9 @@ async def run(self, address='/tmp/qemu.socket'):
 # Read buffer limit; large enough to accept query-qmp-schema
 _limit = (256 * 1024)
 
+# Type alias for pending execute() result items
+_PendingT = Union[Message, ExecInterruptedError]
+
 def __init__(self, name: Optional[str] = None) -> None:
 super().__init__(name)
 Events.__init__(self)
@@ -120,6 +174,12 @@ def __init__(self, name: Optional[str] = None) -> None:
 # Cached Greeting, if one was awaited.
 self._greeting: Optional[Greeting] = None
 
+# Incoming RPC reply messages.
+self._pending: Dict[
+Union[str, None],
+'asyncio.Queue[QMPClient._PendingT]'
+] = {}
+
 @upper_half
 async def _establish_session(self) -> None:
 """
@@ -132,6 +192,9 @@ async def _establish_session(self) -> None:
 :raise EOFError: When the server unexpectedly hangs up.
 :raise OSError: For underlying stream errors.
 """
+self._greeting = None
+self._pending = {}
+
 if self.await_greeting or self.negotiate:
 self._greeting = await self._get_greeting()
 
@@ -203,10 +266,33 @@ async def _negotiate(self) -> None:
 self.logger.debug("%s:\n%s\n", emsg, pretty_traceback())
 raise
 
+@bottom_half
+async def _bh_disconnect(self) -> None:
+try:
+await super()._bh_disconnect()
+finally:
+if self._pending:
+self.logger.debug("Cancelling pending executions")
+keys = self._pending.keys()
+for key in keys:
+self.logger.debug("Cancelling execution '%s'", key)
+self._pending[key].put_nowait(
+ExecInterruptedError("Disconnected")
+)
+
+self.logger.debug("QMP Disconnected.")
+
+@upper_half
+def _cleanup(self) -> None:
+super()._cleanup()
+assert not self._pending
+
 @bottom_half
 async def _on_message(self, msg: Message) -> None:
 """
 Add an incoming message to the appropriate queue/handler.
+
+:raise ServerParseError: When Message indicates server parse failure.

[PULL 27/32] python/aqmp: Add Coverage.py support

I'm not exposing this via the Makefile help, it's not likely to be
useful to passersby. Switch the avocado runner to the 'legacy' runner
for now, as the new runner seems to obscure coverage reports, again.

Usage is to enter your venv of choice and then:
`make check-coverage && xdg-open htmlcov/index.html`.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-28-js...@redhat.com
Signed-off-by: John Snow 
---
 python/.gitignore  |  5 +
 python/Makefile|  9 +
 python/avocado.cfg |  3 +++
 python/setup.cfg   | 10 ++
 4 files changed, 27 insertions(+)

diff --git a/python/.gitignore b/python/.gitignore
index c8b0e67fe6c..904f324bb11 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -15,3 +15,8 @@ qemu.egg-info/
 .venv/
 .tox/
 .dev-venv/
+
+# Coverage.py reports
+.coverage
+.coverage.*
+htmlcov/
diff --git a/python/Makefile b/python/Makefile
index fe27a3e12ee..33343113625 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -92,6 +92,13 @@ check:
 check-tox:
@tox $(QEMU_TOX_EXTRA_ARGS)
 
+.PHONY: check-coverage
+check-coverage:
+   @coverage run -m avocado --config avocado.cfg run tests/*.py
+   @coverage combine
+   @coverage html
+   @coverage report
+
 .PHONY: clean
 clean:
python3 setup.py clean --all
@@ -100,3 +107,5 @@ clean:
 .PHONY: distclean
 distclean: clean
rm -rf qemu.egg-info/ .venv/ .tox/ $(QEMU_VENV_DIR) dist/
+   rm -f .coverage .coverage.*
+   rm -rf htmlcov/
diff --git a/python/avocado.cfg b/python/avocado.cfg
index 10dc6fb6054..c7722e7ecd3 100644
--- a/python/avocado.cfg
+++ b/python/avocado.cfg
@@ -1,3 +1,6 @@
+[run]
+test_runner = runner
+
 [simpletests]
 # Don't show stdout/stderr in the test *summary*
 status.failure_fields = ['status']
diff --git a/python/setup.cfg b/python/setup.cfg
index 8481fa7c923..4fd0c68a5b7 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -140,3 +140,13 @@ deps =
 .[fuse]  # Workaround to trigger tox venv rebuild
 commands =
 make check
+
+# Coverage.py [https://coverage.readthedocs.io/en/latest/] is a tool for
+# measuring code coverage of Python programs. It monitors your program,
+# noting which parts of the code have been executed, then analyzes the
+# source to identify code that could have been executed but was not.
+
+[coverage:run]
+concurrency = multiprocessing
+source = qemu/
+parallel = true
-- 
2.31.1

Re: [PATCH v6 12/40] accel/nvmm: Implement AccelOpsClass::has_work()

2021-09-27 Thread Kamil Rytarowski

On 24.09.2021 11:38, Philippe Mathieu-Daudé wrote:
> Since there is no specific NVMM handling for cpu_has_work() in
> cpu_thread_is_idle(), implement NVMM has_work() handler as a
> simple 'return false' code.
>
> Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Kamil Rytarowski 
> ---
>  target/i386/nvmm/nvmm-accel-ops.c | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/target/i386/nvmm/nvmm-accel-ops.c 
> b/target/i386/nvmm/nvmm-accel-ops.c
> index f788f75289f..36296f79ff8 100644
> --- a/target/i386/nvmm/nvmm-accel-ops.c
> +++ b/target/i386/nvmm/nvmm-accel-ops.c
> @@ -83,6 +83,11 @@ static void nvmm_kick_vcpu_thread(CPUState *cpu)
>  cpus_kick_thread(cpu);
>  }
>
> +static bool nvmm_cpu_has_work(CPUState *cpu)
> +{
> +return false;
> +}
> +
>  static void nvmm_accel_ops_class_init(ObjectClass *oc, void *data)
>  {
>  AccelOpsClass *ops = ACCEL_OPS_CLASS(oc);
> @@ -94,6 +99,7 @@ static void nvmm_accel_ops_class_init(ObjectClass *oc, void 
> *data)
>  ops->synchronize_post_init = nvmm_cpu_synchronize_post_init;
>  ops->synchronize_state = nvmm_cpu_synchronize_state;
>  ops->synchronize_pre_loadvm = nvmm_cpu_synchronize_pre_loadvm;
> +ops->has_work = nvmm_cpu_has_work;
>  }
>
>  static const TypeInfo nvmm_accel_ops_type = {
>

[PULL 20/32] python/aqmp: add execute() interfaces

Add execute() and execute_msg().

_execute() is split into _issue() and _reply() halves so that
hypothetical subclasses of QMP that want to support different execution
paradigms can do so.

I anticipate a synchronous interface may have need of separating the
send/reply phases. However, I do not wish to expose that interface here
and want to actively discourage it, so they remain private interfaces.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-21-js...@redhat.com
Signed-off-by: John Snow 
---
 python/qemu/aqmp/__init__.py   |   4 +-
 python/qemu/aqmp/qmp_client.py | 202 +++--
 2 files changed, 198 insertions(+), 8 deletions(-)

diff --git a/python/qemu/aqmp/__init__.py b/python/qemu/aqmp/__init__.py
index d975c752eaa..4b7df53e006 100644
--- a/python/qemu/aqmp/__init__.py
+++ b/python/qemu/aqmp/__init__.py
@@ -25,7 +25,7 @@
 from .events import EventListener
 from .message import Message
 from .protocol import ConnectError, Runstate, StateError
-from .qmp_client import QMPClient
+from .qmp_client import ExecInterruptedError, ExecuteError, QMPClient
 
 
 # The order of these fields impact the Sphinx documentation order.
@@ -40,4 +40,6 @@
 'AQMPError',
 'StateError',
 'ConnectError',
+'ExecuteError',
+'ExecInterruptedError',
 )
diff --git a/python/qemu/aqmp/qmp_client.py b/python/qemu/aqmp/qmp_client.py
index fa0cc7c5ae5..879348feaaa 100644
--- a/python/qemu/aqmp/qmp_client.py
+++ b/python/qemu/aqmp/qmp_client.py
@@ -7,8 +7,7 @@
 accept an incoming connection from that server.
 """
 
-# The import workarounds here are fixed in the next commit.
-import asyncio  # pylint: disable=unused-import # noqa
+import asyncio
 import logging
 from typing import (
 Dict,
@@ -22,8 +21,8 @@
 from .error import AQMPError, ProtocolError
 from .events import Events
 from .message import Message
-from .models import Greeting
-from .protocol import AsyncProtocol
+from .models import ErrorResponse, Greeting
+from .protocol import AsyncProtocol, Runstate, require
 from .util import (
 bottom_half,
 exception_summary,
@@ -65,11 +64,32 @@ class NegotiationError(_WrappedProtocolError):
 """
 
 
+class ExecuteError(AQMPError):
+"""
+Exception raised by `QMPClient.execute()` on RPC failure.
+
+:param error_response: The RPC error response object.
+:param sent: The sent RPC message that caused the failure.
+:param received: The raw RPC error reply received.
+"""
+def __init__(self, error_response: ErrorResponse,
+ sent: Message, received: Message):
+super().__init__(error_response.error.desc)
+#: The sent `Message` that caused the failure
+self.sent: Message = sent
+#: The received `Message` that indicated failure
+self.received: Message = received
+#: The parsed error response
+self.error: ErrorResponse = error_response
+#: The QMP error class
+self.error_class: str = error_response.error.class_
+
+
 class ExecInterruptedError(AQMPError):
 """
-Exception raised when an RPC is interrupted.
+Exception raised by `execute()` (et al) when an RPC is interrupted.
 
-This error is raised when an execute() statement could not be
+This error is raised when an `execute()` statement could not be
 completed.  This can occur because the connection itself was
 terminated before a reply was received.
 
@@ -112,6 +132,27 @@ class ServerParseError(_MsgProtocolError):
 """
 
 
+class BadReplyError(_MsgProtocolError):
+"""
+An execution reply was successfully routed, but not understood.
+
+If a QMP message is received with an 'id' field to allow it to be
+routed, but is otherwise malformed, this exception will be raised.
+
+A reply message is malformed if it is missing either the 'return' or
+'error' keys, or if the 'error' value has missing keys or members of
+the wrong type.
+
+:param error_message: Human-readable string describing the error.
+:param msg: The malformed reply that was received.
+:param sent: The message that was sent that prompted the error.
+"""
+def __init__(self, error_message: str, msg: Message, sent: Message):
+super().__init__(error_message, msg)
+#: The sent `Message` that caused the failure
+self.sent = sent
+
+
 class QMPClient(AsyncProtocol[Message], Events):
 """
 Implements a QMP client connection.
@@ -174,6 +215,9 @@ def __init__(self, name: Optional[str] = None) -> None:
 # Cached Greeting, if one was awaited.
 self._greeting: Optional[Greeting] = None
 
+# Command ID counter
+self._execute_id = 0
+
 # Incoming RPC reply messages.
 self._pending: Dict[
 Union[str, None],
@@ -363,12 +407,135 @@ def _do_send(self, msg: Message) -> None:
 assert self._writer is not None
 self._writer.write(bytes(msg))
 
+@upper_half
+def _get_exec_id(self) ->

[PULL 18/32] python/pylint: disable no-member check

mypy handles this better -- but we only need the workaround because
pylint under Python 3.6 does not understand that a MutableMapping really
does have a .get() method attached.

We could remove this again once 3.7 is our minimum.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-19-js...@redhat.com
Signed-off-by: John Snow 
---
 python/setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/setup.cfg b/python/setup.cfg
index 70957ab7e4d..e6407e05826 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -90,6 +90,7 @@ ignore_missing_imports = True
 # --disable=W".
 disable=consider-using-f-string,
 too-many-function-args,  # mypy handles this with less false positives.
+no-member,  # mypy also handles this better.
 
 [pylint.basic]
 # Good variable names which should always be accepted, separated by a comma.
-- 
2.31.1

[PULL 25/32] python/aqmp: add AsyncProtocol unit tests

This tests most of protocol.py -- from a hacked-up Coverage.py run, it's
at about 86%. There are a few error cases that aren't very well tested
yet; they're hard to induce artificially so far. I'm working on it.

Signed-off-by: John Snow 
Message-id: 20210915162955.333025-26-js...@redhat.com
Signed-off-by: John Snow 
---
 python/tests/protocol.py | 535 +++
 1 file changed, 535 insertions(+)
 create mode 100644 python/tests/protocol.py

diff --git a/python/tests/protocol.py b/python/tests/protocol.py
new file mode 100644
index 000..f0682d29ce5
--- /dev/null
+++ b/python/tests/protocol.py
@@ -0,0 +1,535 @@
+import asyncio
+from contextlib import contextmanager
+import os
+import socket
+from tempfile import TemporaryDirectory
+
+import avocado
+
+from qemu.aqmp import ConnectError, Runstate
+from qemu.aqmp.protocol import AsyncProtocol, StateError
+from qemu.aqmp.util import asyncio_run, create_task
+
+
+class NullProtocol(AsyncProtocol[None]):
+"""
+NullProtocol is a test mockup of an AsyncProtocol implementation.
+
+It adds a fake_session instance variable that enables a code path
+that bypasses the actual connection logic, but still allows the
+reader/writers to start.
+
+Because the message type is defined as None, an asyncio.Event named
+'trigger_input' is created that prohibits the reader from
+incessantly being able to yield None; this event can be poked to
+simulate an incoming message.
+
+For testing symmetry with do_recv, an interface is added to "send" a
+Null message.
+
+For testing purposes, a "simulate_disconnection" method is also
+added which allows us to trigger a bottom half disconnect without
+injecting any real errors into the reader/writer loops; in essence
+it performs exactly half of what disconnect() normally does.
+"""
+def __init__(self, name=None):
+self.fake_session = False
+self.trigger_input: asyncio.Event
+super().__init__(name)
+
+async def _establish_session(self):
+self.trigger_input = asyncio.Event()
+await super()._establish_session()
+
+async def _do_accept(self, address, ssl=None):
+if not self.fake_session:
+await super()._do_accept(address, ssl)
+
+async def _do_connect(self, address, ssl=None):
+if not self.fake_session:
+await super()._do_connect(address, ssl)
+
+async def _do_recv(self) -> None:
+await self.trigger_input.wait()
+self.trigger_input.clear()
+
+def _do_send(self, msg: None) -> None:
+pass
+
+async def send_msg(self) -> None:
+await self._outgoing.put(None)
+
+async def simulate_disconnect(self) -> None:
+"""
+Simulates a bottom-half disconnect.
+
+This method schedules a disconnection but does not wait for it
+to complete. This is used to put the loop into the DISCONNECTING
+state without fully quiescing it back to IDLE. This is normally
+something you cannot coax AsyncProtocol to do on purpose, but it
+will be similar to what happens with an unhandled Exception in
+the reader/writer.
+
+Under normal circumstances, the library design requires you to
+await on disconnect(), which awaits the disconnect task and
+returns bottom half errors as a pre-condition to allowing the
+loop to return back to IDLE.
+"""
+self._schedule_disconnect()
+
+
+def run_as_task(coro, allow_cancellation=False):
+"""
+Run a given coroutine as a task.
+
+Optionally, wrap it in a try..except block that allows this
+coroutine to be canceled gracefully.
+"""
+async def _runner():
+try:
+await coro
+except asyncio.CancelledError:
+if allow_cancellation:
+return
+raise
+return create_task(_runner())
+
+
+@contextmanager
+def jammed_socket():
+"""
+Opens up a random unused TCP port on localhost, then jams it.
+"""
+socks = []
+
+try:
+sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+sock.bind(('127.0.0.1', 0))
+sock.listen(1)
+address = sock.getsockname()
+
+socks.append(sock)
+
+# I don't *fully* understand why, but it takes *two* un-accepted
+# connections to start jamming the socket.
+for _ in range(2):
+sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+sock.connect(address)
+socks.append(sock)
+
+yield address
+
+finally:
+for sock in socks:
+sock.close()
+
+
+class Smoke(avocado.Test):
+
+def setUp(self):
+self.proto = NullProtocol()
+
+def test__repr__(self):
+self.assertEqual(
+repr(self.proto),
+"<NullProtocol runstate=IDLE>"
+)
+
+def testRunstate(self):
+

[PULL 31/32] python: add optional pygments dependency

From: G S Niteesh Babu 

Added pygments as an optional dependency for the AQMP TUI.
This is required for the upcoming syntax highlighting feature
in the AQMP TUI.
The dependency has also been added to the devel optional group.

Added mypy 'ignore_missing_imports' for pygments since it does
not have any type stubs.

Signed-off-by: G S Niteesh Babu 
Message-Id: <20210823220746.28295-5-niteesh...@gmail.com>
Signed-off-by: John Snow 
---
 python/Pipfile.lock | 8 
 python/setup.cfg| 5 +
 2 files changed, 13 insertions(+)

diff --git a/python/Pipfile.lock b/python/Pipfile.lock
index da7a4ee164c..d2a7dbd88be 100644
--- a/python/Pipfile.lock
+++ b/python/Pipfile.lock
@@ -200,6 +200,14 @@
 ],
 "version": "==2.0.0"
 },
+"pygments": {
+"hashes": [
+
"sha256:a18f47b506a429f6f4b9df81bb02beab9ca21d0a5fee38ed15aef65f0545519f",
+
"sha256:d66e804411278594d764fc69ec36ec13d9ae9147193a1740cd34d272ca383b8e"
+],
+"markers": "python_version >= '3.5'",
+"version": "==2.9.0"
+},
 "pylint": {
 "hashes": [
 
"sha256:082a6d461b54f90eea49ca90fff4ee8b6e45e8029e5dbd72f6107ef84f3779c0",
diff --git a/python/setup.cfg b/python/setup.cfg
index eefa9613f18..417e937839b 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -46,6 +46,7 @@ devel =
 tox >= 3.18.0
 urwid >= 2.1.2
 urwid-readline >= 0.13
+Pygments >= 2.9.0
 
 # Provides qom-fuse functionality
 fuse =
@@ -55,6 +56,7 @@ fuse =
 tui =
 urwid >= 2.1.2
 urwid-readline >= 0.13
+Pygments >= 2.9.0
 
 [options.entry_points]
 console_scripts =
@@ -97,6 +99,9 @@ ignore_missing_imports = True
 [mypy-urwid_readline]
 ignore_missing_imports = True
 
+[mypy-pygments]
+ignore_missing_imports = True
+
 [pylint.messages control]
 # Disable the message, report, category or checker with the given id(s). You
 # can either give multiple identifiers separated by comma (,) or put this
-- 
2.31.1

[PULL 13/32] python/aqmp: add QMP Message format

The Message class is here primarily to serve as a solid type to use for
mypy static typing for unambiguous annotation and documentation.

We can also stuff JSON serialization and deserialization into this class
itself so it can be re-used even outside this infrastructure.

Signed-off-by: John Snow 
Reviewed-by: Eric Blake 
Message-id: 20210915162955.333025-14-js...@redhat.com
Signed-off-by: John Snow 
---
 python/qemu/aqmp/__init__.py |   4 +-
 python/qemu/aqmp/message.py  | 209 +++
 2 files changed, 212 insertions(+), 1 deletion(-)
 create mode 100644 python/qemu/aqmp/message.py

diff --git a/python/qemu/aqmp/__init__.py b/python/qemu/aqmp/__init__.py
index 88ead4c0238..96fff1e5f3e 100644
--- a/python/qemu/aqmp/__init__.py
+++ b/python/qemu/aqmp/__init__.py
@@ -22,12 +22,14 @@
 # the COPYING file in the top-level directory.
 
 from .error import AQMPError
+from .message import Message
 from .protocol import ConnectError, Runstate, StateError
 
 
 # The order of these fields impact the Sphinx documentation order.
 __all__ = (
-# Classes
+# Classes, most to least important
+'Message',
 'Runstate',
 
 # Exceptions, most generic to most explicit
diff --git a/python/qemu/aqmp/message.py b/python/qemu/aqmp/message.py
new file mode 100644
index 000..f76ccc90746
--- /dev/null
+++ b/python/qemu/aqmp/message.py
@@ -0,0 +1,209 @@
+"""
+QMP Message Format
+
+This module provides the `Message` class, which represents a single QMP
+message sent to or from the server.
+"""
+
+import json
+from json import JSONDecodeError
+from typing import (
+Dict,
+Iterator,
+Mapping,
+MutableMapping,
+Optional,
+Union,
+)
+
+from .error import ProtocolError
+
+
+class Message(MutableMapping[str, object]):
+"""
+Represents a single QMP protocol message.
+
+QMP uses JSON objects as its basic communicative unit; so this
+Python object is a :py:obj:`~collections.abc.MutableMapping`. It may
+be instantiated from either another mapping (like a `dict`), or from
+raw `bytes` that still need to be deserialized.
+
+Once instantiated, it may be treated like any other MutableMapping::
+
+>>> msg = Message(b'{"hello": "world"}')
+>>> assert msg['hello'] == 'world'
+>>> msg['id'] = 'foobar'
+>>> print(msg)
+{
+  "hello": "world",
+  "id": "foobar"
+}
+
+It can be converted to `bytes`::
+
+>>> msg = Message({"hello": "world"})
+>>> print(bytes(msg))
+b'{"hello":"world"}'
+
+Or back into a garden-variety `dict`::
+
+   >>> dict(msg)
+   {'hello': 'world'}
+
+
+:param value: Initial value, if any.
+:param eager:
+When `True`, attempt to serialize or deserialize the initial value
+immediately, so that conversion exceptions are raised during
+the call to ``__init__()``.
+"""
+# pylint: disable=too-many-ancestors
+
+def __init__(self,
+ value: Union[bytes, Mapping[str, object]] = b'{}', *,
+ eager: bool = True):
+self._data: Optional[bytes] = None
+self._obj: Optional[Dict[str, object]] = None
+
+if isinstance(value, bytes):
+self._data = value
+if eager:
+self._obj = self._deserialize(self._data)
+else:
+self._obj = dict(value)
+if eager:
+self._data = self._serialize(self._obj)
+
+# Methods necessary to implement the MutableMapping interface, see:
+# 
https://docs.python.org/3/library/collections.abc.html#collections.abc.MutableMapping
+
+# We get pop, popitem, clear, update, setdefault, __contains__,
+# keys, items, values, get, __eq__ and __ne__ for free.
+
+def __getitem__(self, key: str) -> object:
+return self._object[key]
+
+def __setitem__(self, key: str, value: object) -> None:
+self._object[key] = value
+self._data = None
+
+def __delitem__(self, key: str) -> None:
+del self._object[key]
+self._data = None
+
+def __iter__(self) -> Iterator[str]:
+return iter(self._object)
+
+def __len__(self) -> int:
+return len(self._object)
+
+# Dunder methods not related to MutableMapping:
+
+def __repr__(self) -> str:
+if self._obj is not None:
+return f"Message({self._object!r})"
+return f"Message({bytes(self)!r})"
+
+def __str__(self) -> str:
+"""Pretty-printed representation of this QMP message."""
+return json.dumps(self._object, indent=2)
+
+def __bytes__(self) -> bytes:
+"""bytes representing this QMP message."""
+if self._data is None:
+self._data = self._serialize(self._obj or {})
+return self._data
+
+# Conversion Methods
+
+@property
+def _object(self) -> Dict[str, object]:
+"""
+A `dict` representing this QMP message.
+
+

[PULL 11/32] python/aqmp: add _cb_inbound and _cb_outbound logging hooks

Add hooks designed to log/filter incoming/outgoing messages. The primary
intent for these is to be able to support iotests which may want to log
messages with specific filters for reproducible output.

Another use is for plugging into Urwid frameworks; all messages in/out
can be automatically added to a rendering list for the purposes of a
qmp-shell like tool.

Signed-off-by: John Snow 
Reviewed-by: Eric Blake 
Message-id: 20210915162955.333025-12-js...@redhat.com
Signed-off-by: John Snow 
---
 python/qemu/aqmp/protocol.py | 50 +---
 1 file changed, 46 insertions(+), 4 deletions(-)

diff --git a/python/qemu/aqmp/protocol.py b/python/qemu/aqmp/protocol.py
index 2ef19e96932..80c2004737e 100644
--- a/python/qemu/aqmp/protocol.py
+++ b/python/qemu/aqmp/protocol.py
@@ -177,6 +177,11 @@ class AsyncProtocol(Generic[T]):
  can be written after the super() call.
  - `_on_message`:
  Actions to be performed when a message is received.
+ - `_cb_outbound`:
+ Logging/Filtering hook for all outbound messages.
+ - `_cb_inbound`:
+ Logging/Filtering hook for all inbound messages.
+ This hook runs *before* `_on_message()`.
 
 :param name:
 Name used for logging messages, if any. By default, messages
@@ -752,6 +757,43 @@ async def _bh_recv_message(self) -> None:
 # Section: Message I/O
 # 
 
+@upper_half
+@bottom_half
+def _cb_outbound(self, msg: T) -> T:
+"""
+Callback: outbound message hook.
+
+This is intended for subclasses to be able to add arbitrary
+hooks to filter or manipulate outgoing messages. The base
+implementation does nothing but log the message without any
+manipulation of the message.
+
+:param msg: raw outbound message
+:return: final outbound message
+"""
+self.logger.debug("--> %s", str(msg))
+return msg
+
+@upper_half
+@bottom_half
+def _cb_inbound(self, msg: T) -> T:
+"""
+Callback: inbound message hook.
+
+This is intended for subclasses to be able to add arbitrary
+hooks to filter or manipulate incoming messages. The base
+implementation does nothing but log the message without any
+manipulation of the message.
+
+This method does not "handle" incoming messages; it is a filter.
+The actual "endpoint" for incoming messages is `_on_message()`.
+
+:param msg: raw inbound message
+:return: processed inbound message
+"""
+self.logger.debug("<-- %s", str(msg))
+return msg
+
 @upper_half
 @bottom_half
 async def _do_recv(self) -> T:
@@ -780,8 +822,8 @@ async def _recv(self) -> T:
 
 :return: A single (filtered, processed) protocol message.
 """
-# A forthcoming commit makes this method less trivial.
-return await self._do_recv()
+message = await self._do_recv()
+return self._cb_inbound(message)
 
 @upper_half
 @bottom_half
@@ -811,7 +853,7 @@ async def _send(self, msg: T) -> None:
 
 :raise OSError: For problems with the underlying stream.
 """
-# A forthcoming commit makes this method less trivial.
+msg = self._cb_outbound(msg)
 self._do_send(msg)
 
 @bottom_half
@@ -826,6 +868,6 @@ async def _on_message(self, msg: T) -> None:
 directly cause the loop to halt, so logic may be best-kept
 to a minimum if at all possible.
 
-:param msg: The incoming message
+:param msg: The incoming message, already logged/filtered.
 """
 # Nothing to do in the abstract case.
-- 
2.31.1

[PULL 30/32] python: Add entry point for aqmp-tui