Re: [PATCH v5 09/13] hw/cxl/events: Add qmp interfaces to add/release dynamic capacity extents

2024-04-24 Thread Markus Armbruster
fan  writes:

> On Wed, Apr 24, 2024 at 03:09:52PM +0200, Markus Armbruster wrote:
>> nifan@gmail.com writes:
>> 
>> > From: Fan Ni 
>> >
>> > Since fabric manager emulation is not supported yet, the change implements
>> > the functions to add/release dynamic capacity extents as QMP interfaces.
>> 
>> Will fabric manager emulation obsolete these commands?
>
> If fabric manager emulation supports commands for dynamic capacity
> extent add/release in the future, it is possible we will not need
> these commands. But that seems unlikely to happen soon, and we need
> the QMP commands for end-to-end testing with kernel DCD support.

I asked because if the commands are temporary testing aids, they should
probably be declared unstable.  Even if they are permanent testing aids,
unstable might be the right choice.  This is for the CXL maintainers to
decide.

What does "unstable" mean?  docs/devel/qapi-code-gen.rst: "Interfaces so
marked may be withdrawn or changed incompatibly in future releases."

Management applications need stable interfaces.  Libvirt developers
generally refuse to touch anything in QMP that's declared unstable.

Human users and their ad hoc scripts appreciate stability, but they
don't need it nearly as much as management applications do.

A stability promise increases the maintenance burden.  By how much is
unclear.  In other words, by promising stability, the maintainers take
on risk.  Are the CXL maintainers happy to accept the risk here?

>> > Note: we skip any FM issued extent release request if the exact extent
>> > does not exist in the extent list of the device. We will loosen the
>> > restriction later once we have partial release support in the kernel.
>> >
>> > 1. Add dynamic capacity extents:
>> >
>> > For example, the command to add two continuous extents (each 128MiB long)
>> > to region 0 (starting at DPA offset 0) looks like below:
>> >
>> > { "execute": "qmp_capabilities" }
>> >
>> > { "execute": "cxl-add-dynamic-capacity",
>> >   "arguments": {
>> >   "path": "/machine/peripheral/cxl-dcd0",
>> >   "region-id": 0,
>> >   "extents": [
>> >   {
>> >   "dpa": 0,
>> >   "len": 134217728
>> >   },
>> >   {
>> >   "dpa": 134217728,
>> >   "len": 134217728
>> >   }
>> >   ]
>> >   }
>> > }
>> >
>> > 2. Release dynamic capacity extents:
>> >
>> > For example, the command to release an extent of size 128MiB from region 0
>> > (DPA offset 128MiB) look like below:
>> >
>> > { "execute": "cxl-release-dynamic-capacity",
>> >   "arguments": {
>> >   "path": "/machine/peripheral/cxl-dcd0",
>> >   "region-id": 0,
>> >   "extents": [
>> >   {
>> >   "dpa": 134217728,
>> >   "len": 134217728
>> >   }
>> >   ]
>> >   }
>> > }
>> >
>> > Signed-off-by: Fan Ni 
>> 
>> [...]
>> 
>> > diff --git a/qapi/cxl.json b/qapi/cxl.json
>> > index 8cc4c72fa9..2645004666 100644
>> > --- a/qapi/cxl.json
>> > +++ b/qapi/cxl.json
>> > @@ -19,13 +19,16 @@
>> >  #
>> >  # @fatal: Fatal Event Log
>> >  #
>> > +# @dyncap: Dynamic Capacity Event Log
>> > +#
>> >  # Since: 8.1
>> >  ##
>> >  { 'enum': 'CxlEventLog',
>> >'data': ['informational',
>> > 'warning',
>> > 'failure',
>> > -   'fatal']
>> > +   'fatal',
>> > +   'dyncap']
>> 
>> We tend to avoid abbreviations in QMP identifiers: dynamic-capacity.
>
> FYI. This has been removed to avoid the potential side effect in the
> latest post.
> v7: https://lore.kernel.org/linux-cxl/ziafyub6fc9nr...@memverge.com/T/#t
>
>> 
>> >   }
>> >  
>> >  ##
>> > @@ -361,3 +364,59 @@
>> >  ##
>> >  {'command': 'cxl-inject-correctable-error',
>> >   'data': {'path': 'str', 'type': 'CxlCorErrorType'}}
>> > +
>> > +##
>> > +# @CXLDCExtentRecord:
>> 
>> Such traffic jams of capital letters are hard to read.
>> 
>> What does DC mean?
>
> Dynamic capacity

Suggest CxlDynamicCapacityExtent.

>> > +#
>> > +# Record of a single extent to add/release
>> > +#
>> > +# @offset: offset to the start of the region where the extent is to
>> > be operated on
>> 
>> Blank line here, please
>> 
>> > +# @len: length of the extent
>> > +#
>> > +# Since: 9.0
>> > +##
>> > +{ 'struct': 'CXLDCExtentRecord',
>> > +  'data': {
>> > +  'offset':'uint64',
>> > +  'len': 'uint64'
>> > +  }
>> > +}
>> > +
>> > +##
>> > +# @cxl-add-dynamic-capacity:
>> > +#
>> > +# Command to start add dynamic capacity extents flow. The device will
>> 
>> I think we're missing an article here.  Is it "a flow" or "the flow"?
>> 
>> > +# have to acknowledged the acceptance of the extents before they are 
>> > usable.
>> 
>> to acknowledge
>
> It should be "to be acknowledged". 
>
>> 
>> docs/devel/qapi-code-gen.rst:
>> 
>> For legibility, wrap text paragraphs so every line is at most 70
>> characters long.
>> 
>> Separate sentences with two spaces.
>
> Thanks. Will fix.
>> 
>> > +#
>> > +# @path: CXL DCD canonical QOM path
>> 
>> What is a CXL DCD?  Is it a device?
>
> 

Re: [PULL 00/17] CI job updates, header cleanups and other misc patches

2024-04-24 Thread Thomas Huth

On 24/04/2024 18.21, Richard Henderson wrote:

On 4/24/24 00:57, Thomas Huth wrote:

The following changes since commit 13b1e9667737132440f4d500c31cb69320c6b15a:

   Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into 
staging (2024-04-23 17:35:57 -0700)


are available in the Git repository at:

   https://gitlab.com/thuth/qemu.git tags/pull-request-2024-04-24

for you to fetch changes up to 8f29bab03ea22694a127ee33edeb4ce99eeb124e:

   target/s390x: Remove KVM stubs in cpu_models.h (2024-04-24 09:45:02 +0200)


* Update OpenBSD CI image to 7.5
* Update/remove Ubuntu 20.04 CI jobs
* Update CentOS 8 CI jobs to CentOS 9
* Some clean-ups and improvements to travis.yml
* Minor test fixes
* s390x header clean-ups
* Doc updates


This introduces a failure in the migration-compat-x86_64 job:

https://gitlab.com/qemu-project/qemu/-/jobs/6707154868


It wasn't failing for me:

 https://gitlab.com/thuth/qemu/-/jobs/6702058896

And according to the diffstat of my pull request, it's only touching test 
files, docs, and s390x stuff, so I somehow fail to see how it could 
influence x86 migration at first glance. It also looks like the job is 
running on opensuse, and not on CentOS or Ubuntu, so it should likely not be 
influenced by the changes in this PR.


Could you please hit the re-run button of that job? If it then passes, we're 
likely facing an intermittent failure that might have been introduced 
earlier already...


 Thanks,
  Thomas




Re: [PULL 12/17] tests: Update our CI to use CentOS Stream 9 instead of 8

2024-04-24 Thread Thomas Huth

On 24/04/2024 18.19, Richard Henderson wrote:

On 4/24/24 00:57, Thomas Huth wrote:

RHEL 9 (and thus also its derivatives) has been available for two
years now, so according to QEMU's support policy, we can drop active
support for the previous major version 8 now.

Another reason for doing this is that CentOS Stream 8 will go EOL soon:

https://blog.centos.org/2023/04/end-dates-are-coming-for-centos-stream-8-and-centos-linux-7/

   "After May 31, 2024, CentOS Stream 8 will be archived
    and no further updates will be provided."

Thus upgrade our CentOS Stream container to major version 9 now.

Reviewed-by: Daniel P. Berrangé
Message-ID:<20240418101056.302103-5-th...@redhat.com>
Signed-off-by: Thomas Huth
---
  .gitlab-ci.d/buildtest.yml    | 16 -
  .gitlab-ci.d/container-core.yml   |  4 +--
  .../{centos8.docker => centos9.docker}    | 34 +++
  tests/lcitool/mappings.yml    | 20 ---
  tests/lcitool/refresh |  2 +-
  tests/vm/centos   |  4 +--
  6 files changed, 26 insertions(+), 54 deletions(-)
  rename tests/docker/dockerfiles/{centos8.docker => centos9.docker} (82%)


This has missed a bit, since the centos-stream-8-x86_64 job still exists, 
but now fails.


https://gitlab.com/qemu-project/qemu/-/jobs/6707154779


It's not this patch, it's rather the "ci: move external build environment 
setups to CentOS Stream 9" patch that is missing an update to 
.gitlab-ci.d/custom-runners/centos-stream-8-x86_64.yml ... however, blindly 
updating the 8s in that file to 9s likely also doesn't work since there are 
runner tags involved here.
So what's the right way to update that custom runner to CentOS 9? Paolo, 
Alex, Cleber, do you know?


 Thomas




[PATCH v2 0/7] s390x/cpu_models: Misc cleanup on returned error code and local @err variables

2024-04-24 Thread Zhao Liu
Hi list,

This series is a follow-up to Thomas' suggestion in the previous
ERRP_GUARD() cleanup[1]. Based on Thomas' thoughts, I tried to clean
up as many of the other related places as possible (in s390x/cpu_models.c).

[1]: 
https://lore.kernel.org/qemu-devel/6e7eff95-cfd3-46f9-9937-7597b2e4f...@redhat.com/

Regards,
Zhao
---
v1: 
https://lore.kernel.org/qemu-devel/20240419065712.1225038-1-zhao1@linux.intel.com/
Changes since v1:
 * Picked up Philippe's stubs cleanup and rebased on it.
 * Fixed typos.

---
Philippe Mathieu-Daudé (1):
  target/s390x: Remove KVM stubs in cpu_models.h

Zhao Liu (6):
  target/s390x/cpu_model: Make check_compatibility() return boolean
  target/s390x/cpu_model: Drop local @err in s390_realize_cpu_model()
  target/s390x/cpu_models: Make kvm_s390_get_host_cpu_model() return
boolean
  target/s390x/cpu_models: Drop local @err in get_max_cpu_model()
  target/s390x/cpu_models: Make kvm_s390_apply_cpu_model() return
boolean
  target/s390x/cpu_models_sysemu: Drop local @err in apply_cpu_model()

 target/s390x/cpu_models.c| 25 ++---
 target/s390x/cpu_models.h| 19 ++-
 target/s390x/cpu_models_sysemu.c |  5 +
 target/s390x/kvm/kvm.c   | 28 +++-
 4 files changed, 28 insertions(+), 49 deletions(-)

-- 
2.34.1




[PATCH v2 4/7] target/s390x/cpu_models: Make kvm_s390_get_host_cpu_model() return boolean

2024-04-24 Thread Zhao Liu
As error.h suggests, the best practice for a callee is to return
a value indicating success or failure.

So make kvm_s390_get_host_cpu_model() return a boolean and check the
returned value in get_max_cpu_model() instead of accessing @err.
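
The convention from qapi/error.h that this series applies can be shown
with a tiny self-contained sketch; the Error type, error_setg() and
query_host_model() below are simplified stand-ins for illustration,
not QEMU's real implementation:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

/* Simplified stand-in for QEMU's opaque Error object. */
typedef struct Error { const char *msg; } Error;

void error_setg(Error **errp, const char *msg)
{
    if (errp) {
        Error *err = malloc(sizeof(*err));
        err->msg = msg;
        *errp = err;
    }
}

/*
 * Callee follows the convention: set *errp AND return false on
 * failure, so callers can test the return value instead of having
 * to declare a local Error just to see whether the call failed.
 */
bool query_host_model(int fd, Error **errp)
{
    if (fd < 0) {
        error_setg(errp, "bad file descriptor");
        return false;
    }
    return true;
}
```

A caller can then write `if (!query_host_model(fd, errp)) { return NULL; }`
with no local @err and no error_propagate() call at all.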

Signed-off-by: Zhao Liu 
---
 target/s390x/cpu_models.c |  9 -
 target/s390x/cpu_models.h |  2 +-
 target/s390x/kvm/kvm.c| 13 +++--
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index 052540a866ac..a0e4acb707d7 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -560,16 +560,15 @@ S390CPUModel *get_max_cpu_model(Error **errp)
 }
 
 if (kvm_enabled()) {
-kvm_s390_get_host_cpu_model(&max_model, &err);
+if (!kvm_s390_get_host_cpu_model(&max_model, &err)) {
+error_propagate(errp, err);
+return NULL;
+}
 } else {
 max_model.def = s390_find_cpu_def(QEMU_MAX_CPU_TYPE, QEMU_MAX_CPU_GEN,
   QEMU_MAX_CPU_EC_GA, NULL);
 bitmap_copy(max_model.features, qemu_max_cpu_feat, S390_FEAT_MAX);
 }
-if (err) {
-error_propagate(errp, err);
-return NULL;
-}
 cached = true;
 return &max_model;
 }
diff --git a/target/s390x/cpu_models.h b/target/s390x/cpu_models.h
index a89c2a15ab54..c14aff6c10eb 100644
--- a/target/s390x/cpu_models.h
+++ b/target/s390x/cpu_models.h
@@ -115,7 +115,7 @@ S390CPUDef const *s390_find_cpu_def(uint16_t type, uint8_t 
gen, uint8_t ec_ga,
 S390FeatBitmap features);
 
 bool kvm_s390_cpu_models_supported(void);
-void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp);
+bool kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp);
 void kvm_s390_apply_cpu_model(const S390CPUModel *model,  Error **errp);
 
 #endif /* TARGET_S390X_CPU_MODELS_H */
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index 4dcd757cdcc3..2c3e05cae3ad 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -2375,7 +2375,7 @@ bool kvm_s390_cpu_models_supported(void)
  KVM_S390_VM_CPU_MACHINE_SUBFUNC);
 }
 
-void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp)
+bool kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp)
 {
 struct kvm_s390_vm_cpu_machine prop = {};
 struct kvm_device_attr attr = {
@@ -2390,14 +2390,14 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, 
Error **errp)
 
 if (!kvm_s390_cpu_models_supported()) {
 error_setg(errp, "KVM doesn't support CPU models");
-return;
+return false;
 }
 
 /* query the basic cpu model properties */
 rc = kvm_vm_ioctl(kvm_state, KVM_GET_DEVICE_ATTR, &attr);
 if (rc) {
 error_setg(errp, "KVM: Error querying host CPU model: %d", rc);
-return;
+return false;
 }
 
 cpu_type = cpuid_type(prop.cpuid);
@@ -2420,13 +2420,13 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, 
Error **errp)
 rc = query_cpu_feat(model->features);
 if (rc) {
 error_setg(errp, "KVM: Error querying CPU features: %d", rc);
-return;
+return false;
 }
 /* get supported cpu subfunctions indicated via query / test bit */
 rc = query_cpu_subfunc(model->features);
 if (rc) {
 error_setg(errp, "KVM: Error querying CPU subfunctions: %d", rc);
-return;
+return false;
 }
 
 /* PTFF subfunctions might be indicated although kernel support missing */
@@ -2482,7 +2482,7 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, 
Error **errp)
 }
 if (!model->def) {
 error_setg(errp, "KVM: host CPU model could not be identified");
-return;
+return false;
 }
 /* for now, we can only provide the AP feature with HW support */
 if (ap_available()) {
@@ -2506,6 +2506,7 @@ void kvm_s390_get_host_cpu_model(S390CPUModel *model, 
Error **errp)
 /* strip of features that are not part of the maximum model */
 bitmap_and(model->features, model->features, model->def->full_feat,
S390_FEAT_MAX);
+return true;
 }
 
 static int configure_uv_feat_guest(const S390FeatBitmap features)
-- 
2.34.1




[PATCH v2 6/7] target/s390x/cpu_models: Make kvm_s390_apply_cpu_model() return boolean

2024-04-24 Thread Zhao Liu
As error.h suggests, the best practice for a callee is to return
a value indicating success or failure.

So make kvm_s390_apply_cpu_model() return a boolean and check the
returned value in apply_cpu_model() instead of accessing @err.

Signed-off-by: Zhao Liu 
Reviewed-by: Thomas Huth 
---
 target/s390x/cpu_models.h|  2 +-
 target/s390x/cpu_models_sysemu.c |  3 +--
 target/s390x/kvm/kvm.c   | 15 ---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/target/s390x/cpu_models.h b/target/s390x/cpu_models.h
index c14aff6c10eb..71d4bc2dd4a2 100644
--- a/target/s390x/cpu_models.h
+++ b/target/s390x/cpu_models.h
@@ -116,6 +116,6 @@ S390CPUDef const *s390_find_cpu_def(uint16_t type, uint8_t 
gen, uint8_t ec_ga,
 
 bool kvm_s390_cpu_models_supported(void);
 bool kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp);
-void kvm_s390_apply_cpu_model(const S390CPUModel *model,  Error **errp);
+bool kvm_s390_apply_cpu_model(const S390CPUModel *model,  Error **errp);
 
 #endif /* TARGET_S390X_CPU_MODELS_H */
diff --git a/target/s390x/cpu_models_sysemu.c b/target/s390x/cpu_models_sysemu.c
index 2d99218069cb..bf855c659d5e 100644
--- a/target/s390x/cpu_models_sysemu.c
+++ b/target/s390x/cpu_models_sysemu.c
@@ -405,8 +405,7 @@ void apply_cpu_model(const S390CPUModel *model, Error 
**errp)
 }
 
 if (kvm_enabled()) {
-kvm_s390_apply_cpu_model(model, &err);
-if (err) {
+if (!kvm_s390_apply_cpu_model(model, &err)) {
 error_propagate(errp, err);
 return;
 }
diff --git a/target/s390x/kvm/kvm.c b/target/s390x/kvm/kvm.c
index 2c3e05cae3ad..1b494ecc2076 100644
--- a/target/s390x/kvm/kvm.c
+++ b/target/s390x/kvm/kvm.c
@@ -2543,7 +2543,7 @@ static void kvm_s390_configure_apie(bool interpret)
 }
 }
 
-void kvm_s390_apply_cpu_model(const S390CPUModel *model, Error **errp)
+bool kvm_s390_apply_cpu_model(const S390CPUModel *model, Error **errp)
 {
 struct kvm_s390_vm_cpu_processor prop  = {
 .fac_list = { 0 },
@@ -2560,11 +2560,11 @@ void kvm_s390_apply_cpu_model(const S390CPUModel 
*model, Error **errp)
 if (kvm_s390_cmma_available()) {
 kvm_s390_enable_cmma();
 }
-return;
+return true;
 }
 if (!kvm_s390_cpu_models_supported()) {
 error_setg(errp, "KVM doesn't support CPU models");
-return;
+return false;
 }
 prop.cpuid = s390_cpuid_from_cpu_model(model);
 prop.ibc = s390_ibc_from_cpu_model(model);
@@ -2574,19 +2574,19 @@ void kvm_s390_apply_cpu_model(const S390CPUModel 
*model, Error **errp)
 rc = kvm_vm_ioctl(kvm_state, KVM_SET_DEVICE_ATTR, &attr);
 if (rc) {
 error_setg(errp, "KVM: Error configuring the CPU model: %d", rc);
-return;
+return false;
 }
 /* configure cpu features indicated e.g. via SCLP */
 rc = configure_cpu_feat(model->features);
 if (rc) {
 error_setg(errp, "KVM: Error configuring CPU features: %d", rc);
-return;
+return false;
 }
 /* configure cpu subfunctions indicated via query / test bit */
 rc = configure_cpu_subfunc(model->features);
 if (rc) {
 error_setg(errp, "KVM: Error configuring CPU subfunctions: %d", rc);
-return;
+return false;
 }
 /* enable CMM via CMMA */
 if (test_bit(S390_FEAT_CMM, model->features)) {
@@ -2601,8 +2601,9 @@ void kvm_s390_apply_cpu_model(const S390CPUModel *model, 
Error **errp)
 rc = configure_uv_feat_guest(model->features);
 if (rc) {
 error_setg(errp, "KVM: Error configuring CPU UV features %d", rc);
-return;
+return false;
 }
+return true;
 }
 
 void kvm_s390_restart_interrupt(S390CPU *cpu)
-- 
2.34.1




[PATCH v2 1/7] target/s390x/cpu_model: Make check_compatibility() return boolean

2024-04-24 Thread Zhao Liu
As error.h suggests, the best practice for a callee is to return
a value indicating success or failure.

With a returned boolean, there's no need to check @err.

Suggested-by: Thomas Huth 
Signed-off-by: Zhao Liu 
Reviewed-by: Thomas Huth 
---
 target/s390x/cpu_models.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index 8ed3bb6a27b3..8cb47d905fb4 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -510,7 +510,7 @@ static void check_compat_model_failed(Error **errp,
 return;
 }
 
-static void check_compatibility(const S390CPUModel *max_model,
+static bool check_compatibility(const S390CPUModel *max_model,
 const S390CPUModel *model, Error **errp)
 {
 ERRP_GUARD();
@@ -518,11 +518,11 @@ static void check_compatibility(const S390CPUModel 
*max_model,
 
 if (model->def->gen > max_model->def->gen) {
 check_compat_model_failed(errp, max_model, "Selected CPU generation is 
too new");
-return;
+return false;
 } else if (model->def->gen == max_model->def->gen &&
model->def->ec_ga > max_model->def->ec_ga) {
 check_compat_model_failed(errp, max_model, "Selected CPU GA level is 
too new");
-return;
+return false;
 }
 
 #ifndef CONFIG_USER_ONLY
@@ -530,14 +530,14 @@ static void check_compatibility(const S390CPUModel 
*max_model,
 error_setg(errp, "The unpack facility is not compatible with "
"the --only-migratable option. You must remove either "
"the 'unpack' facility or the --only-migratable option");
-return;
+return false;
 }
 #endif
 
 /* detect the missing features to properly report them */
 bitmap_andnot(missing, model->features, max_model->features, 
S390_FEAT_MAX);
 if (bitmap_empty(missing, S390_FEAT_MAX)) {
-return;
+return true;
 }
 
 error_setg(errp, " ");
@@ -546,6 +546,7 @@ static void check_compatibility(const S390CPUModel 
*max_model,
   "available in the current configuration: ");
 error_append_hint(errp,
   "Consider a different accelerator, QEMU, or kernel 
version\n");
+return false;
 }
 
 S390CPUModel *get_max_cpu_model(Error **errp)
@@ -605,8 +606,7 @@ void s390_realize_cpu_model(CPUState *cs, Error **errp)
 cpu->model->cpu_ver = max_model->cpu_ver;
 
 check_consistency(cpu->model);
-check_compatibility(max_model, cpu->model, &err);
-if (err) {
+if (!check_compatibility(max_model, cpu->model, &err)) {
 error_propagate(errp, err);
 return;
 }
-- 
2.34.1




[PATCH v2 7/7] target/s390x/cpu_models_sysemu: Drop local @err in apply_cpu_model()

2024-04-24 Thread Zhao Liu
Use @errp to fetch error information directly and drop the local
variable @err.

Signed-off-by: Zhao Liu 
Reviewed-by: Thomas Huth 
---
 target/s390x/cpu_models_sysemu.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/target/s390x/cpu_models_sysemu.c b/target/s390x/cpu_models_sysemu.c
index bf855c659d5e..15be729c3d48 100644
--- a/target/s390x/cpu_models_sysemu.c
+++ b/target/s390x/cpu_models_sysemu.c
@@ -389,7 +389,6 @@ CpuModelBaselineInfo 
*qmp_query_cpu_model_baseline(CpuModelInfo *infoa,
 
 void apply_cpu_model(const S390CPUModel *model, Error **errp)
 {
-Error *err = NULL;
 static S390CPUModel applied_model;
 static bool applied;
 
@@ -405,8 +404,7 @@ void apply_cpu_model(const S390CPUModel *model, Error 
**errp)
 }
 
 if (kvm_enabled()) {
-if (!kvm_s390_apply_cpu_model(model, &err)) {
-error_propagate(errp, err);
+if (!kvm_s390_apply_cpu_model(model, errp)) {
 return;
 }
 }
-- 
2.34.1




[PATCH v2 5/7] target/s390x/cpu_models: Drop local @err in get_max_cpu_model()

2024-04-24 Thread Zhao Liu
Use @errp to fetch error information directly and drop the local
variable @err.
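
The shape of this cleanup can be shown in a self-contained sketch;
the Error type, error_setg(), error_propagate() and the caller/callee
functions below are simplified stand-ins for illustration, not QEMU's
real implementation:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

/* Simplified stand-ins for QEMU's error API. */
typedef struct Error { const char *msg; } Error;

void error_setg(Error **errp, const char *msg)
{
    if (errp) {
        Error *e = malloc(sizeof(*e));
        e->msg = msg;
        *errp = e;
    }
}

void error_propagate(Error **dst_errp, Error *local_err)
{
    if (dst_errp && !*dst_errp) {
        *dst_errp = local_err;
    } else {
        free(local_err);
    }
}

/* Callee already returns a boolean alongside setting *errp. */
bool callee(bool ok, Error **errp)
{
    if (!ok) {
        error_setg(errp, "callee failed");
        return false;
    }
    return true;
}

/* Before the cleanup: a local @err exists only to be forwarded. */
bool caller_old(bool ok, Error **errp)
{
    Error *err = NULL;
    if (!callee(ok, &err)) {
        error_propagate(errp, err);
        return false;
    }
    return true;
}

/* After the cleanup: hand @errp straight to the callee; @err is gone. */
bool caller_new(bool ok, Error **errp)
{
    return callee(ok, errp);
}
```

The two callers behave identically; the second is possible only because
the callee's boolean return already signals failure, so the caller never
needs to inspect a local Error to find out.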

Signed-off-by: Zhao Liu 
Reviewed-by: Thomas Huth 
---
 target/s390x/cpu_models.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index a0e4acb707d7..aae452cfd3fc 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -551,7 +551,6 @@ static bool check_compatibility(const S390CPUModel 
*max_model,
 
 S390CPUModel *get_max_cpu_model(Error **errp)
 {
-Error *err = NULL;
 static S390CPUModel max_model;
 static bool cached;
 
@@ -560,8 +559,7 @@ S390CPUModel *get_max_cpu_model(Error **errp)
 }
 
 if (kvm_enabled()) {
-if (!kvm_s390_get_host_cpu_model(&max_model, &err)) {
-error_propagate(errp, err);
+if (!kvm_s390_get_host_cpu_model(&max_model, errp)) {
 return NULL;
 }
 } else {
-- 
2.34.1




[PATCH v2 2/7] target/s390x/cpu_model: Drop local @err in s390_realize_cpu_model()

2024-04-24 Thread Zhao Liu
Use @errp to fetch error information directly and drop the local
variable @err.

Suggested-by: Thomas Huth 
Signed-off-by: Zhao Liu 
Reviewed-by: Thomas Huth 
---
 target/s390x/cpu_models.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index 8cb47d905fb4..052540a866ac 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -577,7 +577,6 @@ S390CPUModel *get_max_cpu_model(Error **errp)
 void s390_realize_cpu_model(CPUState *cs, Error **errp)
 {
 ERRP_GUARD();
-Error *err = NULL;
 S390CPUClass *xcc = S390_CPU_GET_CLASS(cs);
 S390CPU *cpu = S390_CPU(cs);
 const S390CPUModel *max_model;
@@ -606,8 +605,7 @@ void s390_realize_cpu_model(CPUState *cs, Error **errp)
 cpu->model->cpu_ver = max_model->cpu_ver;
 
 check_consistency(cpu->model);
-if (!check_compatibility(max_model, cpu->model, &err)) {
-error_propagate(errp, err);
+if (!check_compatibility(max_model, cpu->model, errp)) {
 return;
 }
 
-- 
2.34.1




[PATCH v2 3/7] target/s390x: Remove KVM stubs in cpu_models.h

2024-04-24 Thread Zhao Liu
From: Philippe Mathieu-Daudé 

Since the calls are elided when KVM is not available,
we can remove the stubs (which are never compiled).

Inspired-by: Thomas Huth 
Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Zhao Liu 
Signed-off-by: Zhao Liu 
---
 target/s390x/cpu_models.h | 15 ---
 1 file changed, 15 deletions(-)

diff --git a/target/s390x/cpu_models.h b/target/s390x/cpu_models.h
index d7b89129891a..a89c2a15ab54 100644
--- a/target/s390x/cpu_models.h
+++ b/target/s390x/cpu_models.h
@@ -114,23 +114,8 @@ static inline uint64_t s390_cpuid_from_cpu_model(const 
S390CPUModel *model)
 S390CPUDef const *s390_find_cpu_def(uint16_t type, uint8_t gen, uint8_t ec_ga,
 S390FeatBitmap features);
 
-#ifdef CONFIG_KVM
 bool kvm_s390_cpu_models_supported(void);
 void kvm_s390_get_host_cpu_model(S390CPUModel *model, Error **errp);
 void kvm_s390_apply_cpu_model(const S390CPUModel *model,  Error **errp);
-#else
-static inline void kvm_s390_get_host_cpu_model(S390CPUModel *model,
-   Error **errp)
-{
-}
-static inline void kvm_s390_apply_cpu_model(const S390CPUModel *model,
-Error **errp)
-{
-}
-static inline bool kvm_s390_cpu_models_supported(void)
-{
-return false;
-}
-#endif
 
 #endif /* TARGET_S390X_CPU_MODELS_H */
-- 
2.34.1




Re: [PULL 0/9] tcg + linux-user patch queue

2024-04-24 Thread Richard Henderson

On 4/24/24 15:51, Richard Henderson wrote:

The following changes since commit 85b597413d4370cb168f711192eaef2eb70535ac:

   Merge tag 'housekeeping-20240424' of https://github.com/philmd/qemu into 
staging (2024-04-24 11:49:57 -0700)

are available in the Git repository at:

   https://gitlab.com/rth7680/qemu.git  tags/pull-tcg-20240424

for you to fetch changes up to 0815c228bd1e0c24ac064ce299807b32f8e05d83:

   target/m68k: Support semihosting on non-ColdFire targets (2024-04-24 
15:46:00 -0700)


meson: Make DEBUG_REMAP a meson option
target/m68k: Support semihosting on non-ColdFire targets
linux-user: do_setsockopt cleanups
linux-user: Add FITRIM ioctl


Applied, thanks.  Please update https://wiki.qemu.org/ChangeLog/9.1 as 
appropriate.


r~




[PATCH v4 09/14] migration/multifd: Prepare to introduce DSA acceleration on the multifd path.

2024-04-24 Thread Hao Xiang
1. Refactor multifd_send_thread function.
2. Introduce the batch task structure in MultiFDSendParams.

Signed-off-by: Hao Xiang 
---
 include/qemu/dsa.h  | 51 +++--
 migration/multifd.c |  5 +
 migration/multifd.h |  2 ++
 util/dsa.c  | 51 ++---
 4 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
index e002652879..0c36e93016 100644
--- a/include/qemu/dsa.h
+++ b/include/qemu/dsa.h
@@ -2,6 +2,7 @@
 #define QEMU_DSA_H
 
 #include "qemu/error-report.h"
+#include "exec/cpu-common.h"
 #include "qemu/thread.h"
 #include "qemu/queue.h"
 
@@ -42,6 +43,21 @@ typedef struct dsa_batch_task {
 QSIMPLEQ_ENTRY(dsa_batch_task) entry;
 } dsa_batch_task;
 
+#endif
+
+struct batch_task {
+#ifdef CONFIG_DSA_OPT
+/* Address of each page in pages */
+ram_addr_t *addr;
+/* Zero page checking results */
+bool *results;
+/* Batch task DSA specific implementation */
+struct dsa_batch_task *dsa_batch;
+#endif
+};
+
+#ifdef CONFIG_DSA_OPT
+
 /**
  * @brief Initializes DSA devices.
  *
@@ -74,7 +90,7 @@ void dsa_cleanup(void);
 bool dsa_is_running(void);
 
 /**
- * @brief Initializes a buffer zero batch task.
+ * @brief Initializes a buffer zero DSA batch task.
  *
  * @param task A pointer to the batch task to initialize.
  * @param results A pointer to an array of zero page checking results.
@@ -102,9 +118,26 @@ void buffer_zero_batch_task_destroy(struct dsa_batch_task 
*task);
  * @return Zero if successful, otherwise non-zero.
  */
 int
-buffer_is_zero_dsa_batch_async(struct dsa_batch_task *batch_task,
+buffer_is_zero_dsa_batch_async(struct batch_task *batch_task,
const void **buf, size_t count, size_t len);
 
+/**
+ * @brief Initializes a general buffer zero batch task.
+ *
+ * @param batch_size The number of zero page checking tasks in the batch.
+ * @return A pointer to the general batch task initialized.
+ */
+struct batch_task *
+batch_task_init(int batch_size);
+
+/**
+ * @brief Destroys a general buffer zero batch task.
+ *
+ * @param task A pointer to the general batch task to destroy.
+ */
+void
+batch_task_destroy(struct batch_task *task);
+
 #else
 
 static inline bool dsa_is_running(void)
@@ -128,6 +161,20 @@ static inline void dsa_stop(void) {}
 
 static inline void dsa_cleanup(void) {}
 
+static inline int
+buffer_is_zero_dsa_batch_async(struct batch_task *batch_task,
+   const void **buf, size_t count, size_t len)
+{
+exit(1);
+}
+
+static inline struct batch_task *batch_task_init(int batch_size)
+{
+return NULL;
+}
+
+static inline void batch_task_destroy(struct batch_task *task) {}
+
 #endif
 
 #endif
diff --git a/migration/multifd.c b/migration/multifd.c
index f317bff077..cfd3a92f6c 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -13,6 +13,8 @@
 #include "qemu/osdep.h"
 #include "qemu/cutils.h"
 #include "qemu/rcu.h"
+#include "qemu/dsa.h"
+#include "qemu/memalign.h"
 #include "exec/target_page.h"
 #include "sysemu/sysemu.h"
 #include "exec/ramblock.h"
@@ -780,6 +782,8 @@ static bool multifd_send_cleanup_channel(MultiFDSendParams 
*p, Error **errp)
 p->name = NULL;
 multifd_pages_clear(p->pages);
 p->pages = NULL;
+batch_task_destroy(p->batch_task);
+p->batch_task = NULL;
 p->packet_len = 0;
 g_free(p->packet);
 p->packet = NULL;
@@ -1172,6 +1176,7 @@ bool multifd_send_setup(void)
 qemu_sem_init(&p->sem_sync, 0);
 p->id = i;
 p->pages = multifd_pages_init(page_count);
+p->batch_task = batch_task_init(page_count);
 
 if (use_packets) {
 p->packet_len = sizeof(MultiFDPacket_t)
diff --git a/migration/multifd.h b/migration/multifd.h
index c9d9b09239..16e27db5e9 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -135,6 +135,8 @@ typedef struct {
  * pending_job != 0 -> multifd_channel can use it.
  */
 MultiFDPages_t *pages;
+/* Zero page checking batch task */
+struct batch_task *batch_task;
 
 /* thread local variables. No locking required */
 
diff --git a/util/dsa.c b/util/dsa.c
index 5a2bf33651..4f695e58af 100644
--- a/util/dsa.c
+++ b/util/dsa.c
@@ -802,7 +802,7 @@ buffer_zero_task_init_int(struct dsa_hw_desc *descriptor,
 }
 
 /**
- * @brief Initializes a buffer zero batch task.
+ * @brief Initializes a buffer zero DSA batch task.
  *
  * @param task A pointer to the batch task to initialize.
  * @param results A pointer to an array of zero page checking results.
@@ -1107,29 +1107,64 @@ void dsa_cleanup(void)
  * @return Zero if successful, otherwise non-zero.
  */
 int
-buffer_is_zero_dsa_batch_async(struct dsa_batch_task *batch_task,
+buffer_is_zero_dsa_batch_async(struct batch_task *batch_task,
const void **buf, size_t count, size_t len)
 {
-if (count <= 0 || count > batch_task->batch_size) {
+struct 

[PATCH v4 07/14] util/dsa: Implement DSA task asynchronous submission and wait for completion.

2024-04-24 Thread Hao Xiang
* Add a DSA task completion callback.
* DSA completion thread will call the task's completion callback
on every task/batch task completion.
* DSA submission path to wait for completion.
* Implement CPU fallback if DSA is not able to complete the task.

Signed-off-by: Hao Xiang 
Signed-off-by: Bryan Zhang 
---
 include/qemu/dsa.h |  14 +
 util/dsa.c | 147 -
 2 files changed, 158 insertions(+), 3 deletions(-)

diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
index 645e6fc367..e002652879 100644
--- a/include/qemu/dsa.h
+++ b/include/qemu/dsa.h
@@ -91,6 +91,20 @@ buffer_zero_batch_task_init(struct dsa_batch_task *task,
  */
 void buffer_zero_batch_task_destroy(struct dsa_batch_task *task);
 
+/**
+ * @brief Performs buffer zero comparison on a DSA batch task asynchronously.
+ *
+ * @param batch_task A pointer to the batch task.
+ * @param buf An array of memory buffers.
+ * @param count The number of buffers in the array.
+ * @param len The buffer length.
+ *
+ * @return Zero if successful, otherwise non-zero.
+ */
+int
+buffer_is_zero_dsa_batch_async(struct dsa_batch_task *batch_task,
+   const void **buf, size_t count, size_t len);
+
 #else
 
 static inline bool dsa_is_running(void)
diff --git a/util/dsa.c b/util/dsa.c
index 9db4cfcf1d..5a2bf33651 100644
--- a/util/dsa.c
+++ b/util/dsa.c
@@ -473,6 +473,57 @@ poll_completion(struct dsa_completion_record *completion,
 return 0;
 }
 
+/**
+ * @brief Helper function to use CPU to complete a single
+ *zero page checking task.
+ *
+ * @param completion A pointer to a DSA task completion record.
+ * @param descriptor A pointer to a DSA task descriptor.
+ * @param result A pointer to the result of a zero page checking.
+ */
+static void
+task_cpu_fallback_int(struct dsa_completion_record *completion,
+  struct dsa_hw_desc *descriptor, bool *result)
+{
+const uint8_t *buf;
+size_t len;
+
+if (completion->status == DSA_COMP_SUCCESS) {
+return;
+}
+
+/*
+ * DSA was able to partially complete the operation. Check the
+ * result. If we already know this is not a zero page, we can
+ * return now.
+ */
+if (completion->bytes_completed != 0 && completion->result != 0) {
+*result = false;
+return;
+}
+
+/* Let's fallback to use CPU to complete it. */
+buf = (const uint8_t *)descriptor->src_addr;
+len = descriptor->xfer_size;
+*result = buffer_is_zero(buf + completion->bytes_completed,
+ len - completion->bytes_completed);
+}
+
+/**
+ * @brief Use CPU to complete a single zero page checking task.
+ *
+ * @param task A pointer to the task.
+ */
+static void
+task_cpu_fallback(struct dsa_batch_task *task)
+{
+assert(task->task_type == DSA_TASK);
+
+task_cpu_fallback_int(&task->completions[0],
+  &task->descriptors[0],
+  &task->results[0]);
+}
+
 /**
  * @brief Complete a single DSA task in the batch task.
  *
@@ -574,6 +625,47 @@ exit:
 return ret;
 }
 
+/**
+ * @brief Use CPU to complete the zero page checking batch task.
+ *
+ * @param batch_task A pointer to the batch task.
+ */
+static void
+batch_task_cpu_fallback(struct dsa_batch_task *batch_task)
+{
+assert(batch_task->task_type == DSA_BATCH_TASK);
+
+struct dsa_completion_record *batch_completion =
+&batch_task->batch_completion;
+struct dsa_completion_record *completion;
+uint8_t status;
+bool *results = batch_task->results;
+uint32_t count = batch_task->batch_descriptor.desc_count;
+
+/* DSA is able to complete the entire batch task. */
+if (batch_completion->status == DSA_COMP_SUCCESS) {
+assert(count == batch_completion->bytes_completed);
+return;
+}
+
+/*
+ * DSA encounters some error and is not able to complete
+ * the entire batch task. Use CPU fallback.
+ */
+for (int i = 0; i < count; i++) {
+
+completion = &batch_task->completions[i];
+status = completion->status;
+
+assert(status == DSA_COMP_SUCCESS ||
+status == DSA_COMP_PAGE_FAULT_NOBOF);
+
+task_cpu_fallback_int(completion,
+  &batch_task->descriptors[i],
+  &results[i]);
+}
+}
+
 /**
  * @brief Handles an asynchronous DSA batch task completion.
  *
@@ -861,7 +953,6 @@ buffer_zero_batch_task_set(struct dsa_batch_task *batch_task,
  *
  * @return int Zero if successful, otherwise an appropriate error code.
  */
-__attribute__((unused))
 static int
 buffer_zero_dsa_async(struct dsa_batch_task *task,
   const void *buf, size_t len)
@@ -880,7 +971,6 @@ buffer_zero_dsa_async(struct dsa_batch_task *task,
  * @param count The number of buffers.
  * @param len The buffer length.
  */
-__attribute__((unused))
 static int
 buffer_zero_dsa_batch_async(struct dsa_batch_task *batch_task,
 const 

[PATCH v4 10/14] migration/multifd: Enable DSA offloading in multifd sender path.

2024-04-24 Thread Hao Xiang
Multifd sender path gets an array of pages queued by the migration
thread. It performs zero page checking on every page in the array.
The pages are classified as either a zero page or a normal page. This
change uses Intel DSA to offload the zero page checking from CPU to
the DSA accelerator. The sender thread submits a batch of pages to DSA
hardware and waits for the DSA completion thread to signal for work
completion.
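For illustration, the sort that moves normal pages before zero pages can be sketched as a standalone two-pointer partition. This is not the patch's code: `is_zero[]` here stands in for the per-page results from `buffer_is_zero()` or the DSA completion records, and the names are made up.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/*
 * Partition offsets[] so that non-zero (normal) pages come first and
 * zero pages come last; returns the number of normal pages, i.e. the
 * value stored in pages->normal_num. Swaps results alongside offsets
 * so both arrays stay consistent.
 */
size_t partition_pages(uint64_t *offsets, bool *is_zero, size_t n)
{
    size_t i = 0;
    size_t j = n - 1;           /* unused when n == 0 */

    if (n == 0) {
        return 0;
    }
    while (i <= j && j < n) {   /* j < n guards size_t wraparound */
        if (!is_zero[i]) {
            i++;                /* normal page: keep on the left */
            continue;
        }
        /* zero page: swap it (and its result) to the right end */
        uint64_t t = offsets[i]; offsets[i] = offsets[j]; offsets[j] = t;
        bool b = is_zero[i]; is_zero[i] = is_zero[j]; is_zero[j] = b;
        j--;
    }
    return i;                   /* pages [0, i) are normal */
}
```

The same loop shape works whether the zero-page verdict comes from the CPU or from DSA; only the source of `is_zero[]` changes.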

Signed-off-by: Hao Xiang 
---
 migration/multifd-zero-page.c | 99 +--
 migration/multifd.c   | 27 +-
 migration/multifd.h   |  1 +
 3 files changed, 120 insertions(+), 7 deletions(-)

diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
index e1b8370f88..4f426289e4 100644
--- a/migration/multifd-zero-page.c
+++ b/migration/multifd-zero-page.c
@@ -37,25 +37,83 @@ static void swap_page_offset(ram_addr_t *pages_offset, int a, int b)
 }
 
 /**
- * multifd_send_zero_page_detect: Perform zero page detection on all pages.
+ * zero_page_detect_cpu: Perform zero page detection using CPU.
  *
  * Sorts normal pages before zero pages in p->pages->offset and updates
  * p->pages->normal_num.
  *
  * @param p A pointer to the send params.
  */
-void multifd_send_zero_page_detect(MultiFDSendParams *p)
+static void zero_page_detect_cpu(MultiFDSendParams *p)
 {
 MultiFDPages_t *pages = p->pages;
 RAMBlock *rb = pages->block;
 int i = 0;
 int j = pages->num - 1;
 
-if (!multifd_zero_page_enabled()) {
-pages->normal_num = pages->num;
+/*
+ * Sort the page offset array by moving all normal pages to
+ * the left and all zero pages to the right of the array.
+ */
+while (i <= j) {
+uint64_t offset = pages->offset[i];
+
+if (!buffer_is_zero(rb->host + offset, p->page_size)) {
+i++;
+continue;
+}
+
+swap_page_offset(pages->offset, i, j);
+ram_release_page(rb->idstr, offset);
+j--;
+}
+
+pages->normal_num = i;
+}
+
+
+#ifdef CONFIG_DSA_OPT
+
+static void swap_result(bool *results, int a, int b)
+{
+bool temp;
+
+if (a == b) {
 return;
 }
 
+temp = results[a];
+results[a] = results[b];
+results[b] = temp;
+}
+
+/**
+ * zero_page_detect_dsa: Perform zero page detection using
+ * Intel Data Streaming Accelerator (DSA).
+ *
+ * Sorts normal pages before zero pages in p->pages->offset and updates
+ * p->pages->normal_num.
+ *
+ * @param p A pointer to the send params.
+ */
+static void zero_page_detect_dsa(MultiFDSendParams *p)
+{
+MultiFDPages_t *pages = p->pages;
+RAMBlock *rb = pages->block;
+bool *results = p->batch_task->results;
+
+for (int i = 0; i < p->pages->num; i++) {
+p->batch_task->addr[i] = (ram_addr_t)(rb->host + p->pages->offset[i]);
+}
+
+buffer_is_zero_dsa_batch_async(p->batch_task,
+   (const void **)p->batch_task->addr,
+   p->pages->num,
+   p->page_size);
+
+int i = 0;
+int j = pages->num - 1;
+
 /*
  * Sort the page offset array by moving all normal pages to
  * the left and all zero pages to the right of the array.
@@ -63,11 +121,12 @@ void multifd_send_zero_page_detect(MultiFDSendParams *p)
 while (i <= j) {
 uint64_t offset = pages->offset[i];
 
-if (!buffer_is_zero(rb->host + offset, p->page_size)) {
+if (!results[i]) {
 i++;
 continue;
 }
 
+swap_result(results, i, j);
 swap_page_offset(pages->offset, i, j);
 ram_release_page(rb->idstr, offset);
 j--;
@@ -76,6 +135,15 @@ void multifd_send_zero_page_detect(MultiFDSendParams *p)
 pages->normal_num = i;
 }
 
+#else
+
+static void zero_page_detect_dsa(MultiFDSendParams *p)
+{
+exit(1);
+}
+
+#endif
+
 void multifd_recv_zero_page_process(MultiFDRecvParams *p)
 {
 for (int i = 0; i < p->zero_num; i++) {
@@ -87,3 +155,24 @@ void multifd_recv_zero_page_process(MultiFDRecvParams *p)
 }
 }
 }
+
+/**
+ * multifd_send_zero_page_detect: Perform zero page detection on all pages.
+ *
+ * @param p A pointer to the send params.
+ */
+void multifd_send_zero_page_detect(MultiFDSendParams *p)
+{
+MultiFDPages_t *pages = p->pages;
+
+if (!multifd_zero_page_enabled()) {
+pages->normal_num = pages->num;
+return;
+}
+
+if (dsa_is_running()) {
+zero_page_detect_dsa(p);
+} else {
+zero_page_detect_cpu(p);
+}
+}
diff --git a/migration/multifd.c b/migration/multifd.c
index cfd3a92f6c..7316643d0a 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -818,6 +818,8 @@ void multifd_send_shutdown(void)
 
 multifd_send_terminate_threads();
 
+dsa_cleanup();
+
 for (i = 0; i < migrate_multifd_channels(); i++) {
 MultiFDSendParams *p = &multifd_send_state->params[i];
 Error *local_err = NULL;
@@ 

[PATCH v4 04/14] util/dsa: Implement DSA task enqueue and dequeue.

2024-04-24 Thread Hao Xiang
* Use a safe thread queue for DSA task enqueue/dequeue.
* Implement DSA task submission.
* Implement DSA batch task submission.
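A minimal pthreads sketch of the thread-safe queue with the 0->1 notify scheme this patch uses (signal the waiter only on the empty-to-non-empty transition). The names are illustrative, not the patch's; the real code uses QemuMutex/QemuCond and QSIMPLEQ.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next; };

struct task_queue {
    pthread_mutex_t lock;
    pthread_cond_t cond;
    struct node *head, *tail;
    bool running;
};

/* Enqueue; signal the consumer only on the 0->1 transition. */
int queue_push(struct task_queue *q, struct node *n)
{
    pthread_mutex_lock(&q->lock);
    if (!q->running) {
        pthread_mutex_unlock(&q->lock);
        return -1;              /* queue already stopped */
    }
    bool was_empty = (q->head == NULL);
    n->next = NULL;
    if (q->tail) {
        q->tail->next = n;
    } else {
        q->head = n;
    }
    q->tail = n;
    if (was_empty) {
        pthread_cond_signal(&q->cond);
    }
    pthread_mutex_unlock(&q->lock);
    return 0;
}

/* Dequeue; block until a task arrives or the queue is stopped. */
struct node *queue_pop(struct task_queue *q)
{
    struct node *n = NULL;
    pthread_mutex_lock(&q->lock);
    while (q->running && q->head == NULL) {
        pthread_cond_wait(&q->cond, &q->lock);
    }
    if (q->head) {
        n = q->head;
        q->head = n->next;
        if (!q->head) {
            q->tail = NULL;
        }
    }
    pthread_mutex_unlock(&q->lock);
    return n;
}

/* Stop the queue and wake any blocked consumer. */
void queue_stop(struct task_queue *q)
{
    pthread_mutex_lock(&q->lock);
    q->running = false;
    pthread_cond_broadcast(&q->cond);
    pthread_mutex_unlock(&q->lock);
}
```

Multiple sender threads can call the push side concurrently; the single completion thread blocks in the pop side when no task is outstanding.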

Signed-off-by: Hao Xiang 
---
 include/qemu/dsa.h |  28 +++
 util/dsa.c | 201 +
 2 files changed, 229 insertions(+)

diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
index f15c05ee85..37cae8d9d2 100644
--- a/include/qemu/dsa.h
+++ b/include/qemu/dsa.h
@@ -13,6 +13,34 @@
 #include <linux/idxd.h>
 #include "x86intrin.h"
 
+typedef enum DsaTaskType {
+DSA_TASK = 0,
+DSA_BATCH_TASK
+} DsaTaskType;
+
+typedef enum DsaTaskStatus {
+DSA_TASK_READY = 0,
+DSA_TASK_PROCESSING,
+DSA_TASK_COMPLETION
+} DsaTaskStatus;
+
+typedef void (*dsa_completion_fn)(void *);
+
+typedef struct dsa_batch_task {
+struct dsa_hw_desc batch_descriptor;
+struct dsa_hw_desc *descriptors;
+struct dsa_completion_record batch_completion __attribute__((aligned(32)));
+struct dsa_completion_record *completions;
+struct dsa_device_group *group;
+struct dsa_device *device;
+dsa_completion_fn completion_callback;
+QemuSemaphore sem_task_complete;
+DsaTaskType task_type;
+DsaTaskStatus status;
+int batch_size;
+QSIMPLEQ_ENTRY(dsa_batch_task) entry;
+} dsa_batch_task;
+
 /**
  * @brief Initializes DSA devices.
  *
diff --git a/util/dsa.c b/util/dsa.c
index 05bbf8e31a..75739a1af6 100644
--- a/util/dsa.c
+++ b/util/dsa.c
@@ -244,6 +244,205 @@ dsa_device_group_get_next_device(struct dsa_device_group *group)
 return &group->dsa_devices[current];
 }
 
+/**
+ * @brief Empties out the DSA task queue.
+ *
+ * @param group A pointer to the DSA device group.
+ */
+static void
+dsa_empty_task_queue(struct dsa_device_group *group)
+{
+qemu_mutex_lock(&group->task_queue_lock);
+dsa_task_queue *task_queue = &group->task_queue;
+while (!QSIMPLEQ_EMPTY(task_queue)) {
+QSIMPLEQ_REMOVE_HEAD(task_queue, entry);
+}
+qemu_mutex_unlock(&group->task_queue_lock);
+}
+
+/**
+ * @brief Adds a task to the DSA task queue.
+ *
+ * @param group A pointer to the DSA device group.
+ * @param task A pointer to the DSA task to enqueue.
+ *
+ * @return int Zero if successful, otherwise a proper error code.
+ */
+static int
+dsa_task_enqueue(struct dsa_device_group *group,
+ struct dsa_batch_task *task)
+{
+dsa_task_queue *task_queue = &group->task_queue;
+QemuMutex *task_queue_lock = &group->task_queue_lock;
+QemuCond *task_queue_cond = &group->task_queue_cond;
+
+bool notify = false;
+
+qemu_mutex_lock(task_queue_lock);
+
+if (!group->running) {
+error_report("DSA: Tried to queue task to stopped device queue.");
+qemu_mutex_unlock(task_queue_lock);
+return -1;
+}
+
+/* The queue is empty. This enqueue operation is a 0->1 transition. */
+if (QSIMPLEQ_EMPTY(task_queue)) {
+notify = true;
+}
+
+QSIMPLEQ_INSERT_TAIL(task_queue, task, entry);
+
+/* We need to notify the waiter for 0->1 transitions. */
+if (notify) {
+qemu_cond_signal(task_queue_cond);
+}
+
+qemu_mutex_unlock(task_queue_lock);
+
+return 0;
+}
+
+/**
+ * @brief Takes a DSA task out of the task queue.
+ *
+ * @param group A pointer to the DSA device group.
+ * @return dsa_batch_task* The DSA task being dequeued.
+ */
+__attribute__((unused))
+static struct dsa_batch_task *
+dsa_task_dequeue(struct dsa_device_group *group)
+{
+struct dsa_batch_task *task = NULL;
+dsa_task_queue *task_queue = &group->task_queue;
+QemuMutex *task_queue_lock = &group->task_queue_lock;
+QemuCond *task_queue_cond = &group->task_queue_cond;
+
+qemu_mutex_lock(task_queue_lock);
+
+while (true) {
+if (!group->running) {
+goto exit;
+}
+task = QSIMPLEQ_FIRST(task_queue);
+if (task != NULL) {
+break;
+}
+qemu_cond_wait(task_queue_cond, task_queue_lock);
+}
+
+QSIMPLEQ_REMOVE_HEAD(task_queue, entry);
+
+exit:
+qemu_mutex_unlock(task_queue_lock);
+return task;
+}
+
+/**
+ * @brief Submits a DSA work item to the device work queue.
+ *
+ * @param wq A pointer to the DSA work queue's device memory.
+ * @param descriptor A pointer to the DSA work item descriptor.
+ *
+ * @return Zero if successful, non-zero otherwise.
+ */
+static int
+submit_wi_int(void *wq, struct dsa_hw_desc *descriptor)
+{
+uint64_t retry = 0;
+
+_mm_sfence();
+
+while (true) {
+if (_enqcmd(wq, descriptor) == 0) {
+break;
+}
+retry++;
+if (retry > max_retry_count) {
+error_report("Submit work retry %lu times.", retry);
+return -1;
+}
+}
+
+return 0;
+}
+
+/**
+ * @brief Synchronously submits a DSA work item to the
+ *device work queue.
+ *
+ * @param wq A pointer to the DSA work queue's device memory.
+ * @param descriptor A pointer to the DSA work item descriptor.
+ *
+ * @return int Zero if successful, non-zero otherwise.
+ */

[PATCH v4 03/14] util/dsa: Implement DSA device start and stop logic.

2024-04-24 Thread Hao Xiang
* DSA device open and close.
* DSA group contains multiple DSA devices.
* DSA group configure/start/stop/clean.

Signed-off-by: Hao Xiang 
Signed-off-by: Bryan Zhang 
---
 include/qemu/dsa.h |  72 +++
 util/dsa.c | 316 +
 util/meson.build   |   1 +
 3 files changed, 389 insertions(+)
 create mode 100644 include/qemu/dsa.h
 create mode 100644 util/dsa.c

diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
new file mode 100644
index 00..f15c05ee85
--- /dev/null
+++ b/include/qemu/dsa.h
@@ -0,0 +1,72 @@
+#ifndef QEMU_DSA_H
+#define QEMU_DSA_H
+
+#include "qemu/error-report.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+
+#ifdef CONFIG_DSA_OPT
+
+#pragma GCC push_options
+#pragma GCC target("enqcmd")
+
+#include <linux/idxd.h>
+#include "x86intrin.h"
+
+/**
+ * @brief Initializes DSA devices.
+ *
+ * @param dsa_parameter A list of DSA device paths from the migration parameter.
+ *
+ * @return int Zero if successful, otherwise non-zero.
+ */
+int dsa_init(const char *dsa_parameter);
+
+/**
+ * @brief Start logic to enable using DSA.
+ */
+void dsa_start(void);
+
+/**
+ * @brief Stop the device group and the completion thread.
+ */
+void dsa_stop(void);
+
+/**
+ * @brief Clean up system resources created for DSA offloading.
+ */
+void dsa_cleanup(void);
+
+/**
+ * @brief Check if DSA is running.
+ *
+ * @return True if DSA is running, otherwise false.
+ */
+bool dsa_is_running(void);
+
+#else
+
+static inline bool dsa_is_running(void)
+{
+return false;
+}
+
+static inline int dsa_init(const char *dsa_parameter)
+{
+if (dsa_parameter != NULL && strlen(dsa_parameter) != 0) {
+error_report("DSA not supported.");
+return -1;
+}
+
+return 0;
+}
+
+static inline void dsa_start(void) {}
+
+static inline void dsa_stop(void) {}
+
+static inline void dsa_cleanup(void) {}
+
+#endif
+
+#endif
diff --git a/util/dsa.c b/util/dsa.c
new file mode 100644
index 00..05bbf8e31a
--- /dev/null
+++ b/util/dsa.c
@@ -0,0 +1,316 @@
+/*
+ * Use Intel Data Streaming Accelerator to offload certain background
+ * operations.
+ *
+ * Copyright (c) 2023 Hao Xiang 
+ *Bryan Zhang 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/queue.h"
+#include "qemu/memalign.h"
+#include "qemu/lockable.h"
+#include "qemu/cutils.h"
+#include "qemu/dsa.h"
+#include "qemu/bswap.h"
+#include "qemu/error-report.h"
+#include "qemu/rcu.h"
+
+#ifdef CONFIG_DSA_OPT
+
+#pragma GCC push_options
+#pragma GCC target("enqcmd")
+
+#include <linux/idxd.h>
+#include "x86intrin.h"
+
+#define DSA_WQ_SIZE 4096
+#define MAX_DSA_DEVICES 16
+
+typedef QSIMPLEQ_HEAD(dsa_task_queue, dsa_batch_task) dsa_task_queue;
+
+struct dsa_device {
+void *work_queue;
+};
+
+struct dsa_device_group {
+struct dsa_device *dsa_devices;
+int num_dsa_devices;
+/* The index of the next DSA device to be used. */
+uint32_t device_allocator_index;
+bool running;
+QemuMutex task_queue_lock;
+QemuCond task_queue_cond;
+dsa_task_queue task_queue;
+};
+
+uint64_t max_retry_count;
+static struct dsa_device_group dsa_group;
+
+
+/**
+ * @brief This function opens a DSA device's work queue and
+ *maps the DSA device memory into the current process.
+ *
+ * @param dsa_wq_path A pointer to the DSA device work queue's file path.
+ * @return A pointer to the mapped memory, or MAP_FAILED on failure.
+ */
+static void *
+map_dsa_device(const char *dsa_wq_path)
+{
+void *dsa_device;
+int fd;
+
+fd = open(dsa_wq_path, O_RDWR);
+if (fd < 0) {
+error_report("Open %s failed with errno = %d.",
+dsa_wq_path, errno);
+return MAP_FAILED;
+}
+dsa_device = mmap(NULL, DSA_WQ_SIZE, PROT_WRITE,
+  MAP_SHARED | MAP_POPULATE, fd, 0);
+close(fd);
+if (dsa_device == MAP_FAILED) {
+error_report("mmap failed with errno = %d.", 

[PATCH v4 00/14] Use Intel DSA accelerator to offload zero page checking in multifd live migration.

2024-04-24 Thread Hao Xiang
v4
* Rebase on top of 85b597413d4370cb168f711192eaef2eb70535ac.
* A separate "multifd zero page checking" patchset was split from this
patchset's v3 and got merged into master. v4 re-applied the rest of all
commits on top of that patchset, re-factored and re-tested.
https://lore.kernel.org/all/20240311180015.3359271-1-hao.xi...@linux.dev/
* There was some feedback from v3 that I likely overlooked.
 
v3
* Rebase on top of 7425b6277f12e82952cede1f531bfc689bf77fb1.
* Fix error/warning from checkpatch.pl
* Fix use-after-free bug when multifd-dsa-accel option is not set.
* Handle error from dsa_init and correctly propagate the error.
* Remove unnecessary call to dsa_stop.
* Detect availability of DSA feature at compile time.
* Implement a generic batch_task structure and a DSA specific one dsa_batch_task.
* Remove all exit() calls and propagate errors correctly.
* Use bytes instead of page count to configure multifd-packet-size option.

v2
* Rebase on top of 3e01f1147a16ca566694b97eafc941d62fa1e8d8.
* Leave Juan's changes in their original form instead of squashing them.
* Add a new commit to refactor the multifd_send_thread function to prepare for introducing the DSA offload functionality.
* Use page count to configure multifd-packet-size option.
* Don't use the FLAKY flag in DSA tests.
* Test if the DSA integration test is set up correctly and skip the test if not.
* Fixed broken link in the previous patch cover.

* Background:

I posted an RFC about DSA offloading in QEMU:
https://patchew.org/QEMU/20230529182001.2232069-1-hao.xi...@bytedance.com/

This patchset implements the DSA offloading on zero page checking in
multifd live migration code path.

* Overview:

Intel Data Streaming Accelerator(DSA) is introduced in Intel's 4th generation
Xeon server, aka Sapphire Rapids.
https://cdrdv2-public.intel.com/671116/341204-intel-data-streaming-accelerator-spec.pdf
https://www.intel.com/content/www/us/en/content-details/759709/intel-data-streaming-accelerator-user-guide.html
One of the things DSA can do is to offload memory comparison workload from
CPU to DSA accelerator hardware. This patchset implements a solution to offload
QEMU's zero page checking from CPU to DSA accelerator hardware. We gain
two benefits from this change:
1. Reduces CPU usage in multifd live migration workflow across all use
cases.
2. Reduces migration total time in some use cases. 

* Design:

These are the logical steps to perform DSA offloading:
1. Configure DSA accelerators and create user space openable DSA work
queues via the idxd driver.
2. Map DSA's work queue into a user space address space.
3. Fill an in-memory task descriptor to describe the memory operation.
4. Use dedicated CPU instruction _enqcmd to queue a task descriptor to
the work queue.
5. Poll the task descriptor's completion status field until the task
completes.
6. Check return status.
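The polling in steps 5 and 6 can be sketched in isolation. This is an illustrative sketch, not the patch's code: on real hardware the load targets the completion record the device writes to, the loop body pairs the load with `_mm_pause()`, and `max_polls` is a made-up bound rather than a value from the series.

```c
#include <stdint.h>

enum { COMP_NONE = 0, COMP_SUCCESS = 1 };

/*
 * Busy-poll a completion record's status byte until the device writes
 * a result or we give up. Returns 0 once a status is visible (the
 * caller then checks whether it is SUCCESS or an error), -1 on timeout.
 */
int poll_completion_status(volatile uint8_t *status, uint64_t max_polls)
{
    for (uint64_t i = 0; i < max_polls; i++) {
        if (*status != COMP_NONE) {
            return 0;           /* step 6: caller inspects the status */
        }
        /* _mm_pause(); -- on x86, be polite to the sibling hyperthread */
    }
    return -1;                  /* timed out waiting for the accelerator */
}
```

The `volatile` qualifier is what forces a fresh load of the device-written byte on every iteration.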

The memory operation is now totally done by the accelerator hardware but
the new workflow introduces overheads: the extra cost of the CPU preparing
and submitting the task descriptors, and the extra cost of the CPU polling
for completion. The design is centered on minimizing these two overheads.

1. In order to reduce the overhead on task preparation and submission,
we use batch descriptors. A batch descriptor will contain N individual
zero page checking tasks where the default N is 128 (default packet size
/ page size) and we can increase N by setting the packet size via a new
migration option.
2. The multifd sender threads prepares and submits batch tasks to DSA
hardware and it waits on a synchronization object for task completion.
Whenever a DSA task is submitted, the task structure is added to a
thread safe queue. It's safe to have multiple multifd sender threads to
submit tasks concurrently.
3. Multiple DSA hardware devices can be used. During multifd initialization,
every sender thread will be assigned a DSA device to work with. We
use a round-robin scheme to evenly distribute the work across all used
DSA devices.
4. Use a dedicated thread dsa_completion to perform busy polling for all
DSA task completions. The thread keeps dequeuing DSA tasks from the
thread safe queue and blocks when there is no outstanding DSA task.
When polling for completion of a DSA task, the thread uses the CPU
instruction _mm_pause between iterations of the busy loop to save some
CPU power and free core resources for the sibling hyperthread.
5. DSA accelerator can encounter errors. The most popular error is a
page fault. We have tested letting the device handle page faults, but
performance is bad. Right now, if DSA hits a page fault, we fall back to
use CPU to complete the rest of the work. The CPU fallback is done in
the multifd sender thread.
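The CPU fallback on a page fault only needs to re-check the part of the page the device did not finish. A hedged sketch of that logic (illustrative names; `buf_is_zero` stands in for util/bufferiszero's `buffer_is_zero()`, and the parameters mirror the fields of a DSA completion record):

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Stand-in for buffer_is_zero(). */
bool buf_is_zero(const uint8_t *buf, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        if (buf[i]) {
            return false;
        }
    }
    return true;
}

/*
 * CPU fallback for one zero-page check. If DSA faulted after checking
 * bytes_done bytes, only re-check the remainder on the CPU -- unless
 * the partial result already proved the page non-zero, in which case
 * no CPU work is needed at all.
 */
bool zero_check_fallback(const uint8_t *page, size_t len,
                         size_t bytes_done, bool partial_nonzero)
{
    if (bytes_done != 0 && partial_nonzero) {
        return false;           /* already known to be a normal page */
    }
    return buf_is_zero(page + bytes_done, len - bytes_done);
}
```

Skipping the already-checked prefix is what keeps the fallback cheap when the fault happens late in the page.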
6. Added a new migration option multifd-dsa-accel to set the DSA device
path. If set, the multifd workflow will leverage the DSA devices for
offloading.
7. Added a new migration option multifd-normal-page-ratio to make
multifd live migration easier to test. Setting a 

[PATCH v4 11/14] migration/multifd: Add migration option set packet size.

2024-04-24 Thread Hao Xiang
The current multifd packet size is 128 * 4KiB. This change adds
an option to set the packet size. Both sender and receiver need
to set the same packet size for things to work.
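The validation the new option must pass can be sketched as a small predicate (mirroring the shape of the migrate_params_check hunk below; the macro names here are illustrative, not the patch's):

```c
#include <stdbool.h>
#include <stdint.h>

#define MULTIFD_PACKET_SIZE_MIN (128 * 4 * 1024)   /* default, 524288 bytes */
#define MULTIFD_PACKET_SIZE_MAX (1023 * 4 * 1024)  /* 4190208 bytes */

/*
 * A packet size is accepted when it lies in [default, max] and is a
 * multiple of the guest's page size.
 */
bool multifd_packet_size_valid(uint64_t size, uint64_t page_size)
{
    return size >= MULTIFD_PACKET_SIZE_MIN &&
           size <= MULTIFD_PACKET_SIZE_MAX &&
           size % page_size == 0;
}
```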

Signed-off-by: Hao Xiang 
---
 migration/options.c | 36 
 migration/options.h |  1 +
 qapi/migration.json | 21 ++---
 3 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/migration/options.c b/migration/options.c
index dc8642df81..a9deb079eb 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -79,6 +79,12 @@
 #define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS 5
 #define DEFAULT_MIGRATE_ANNOUNCE_STEP 100
 
+/*
+ * Parameter for multifd packet size.
+ */
+#define DEFAULT_MIGRATE_MULTIFD_PACKET_SIZE (128 * 4 * 1024)
+#define MAX_MIGRATE_MULTIFD_PACKET_SIZE (1023 * 4 * 1024)
+
 #define DEFINE_PROP_MIG_CAP(name, x) \
 DEFINE_PROP_BOOL(name, MigrationState, capabilities[x], false)
 
@@ -184,6 +190,9 @@ Property migration_properties[] = {
ZERO_PAGE_DETECTION_MULTIFD),
 DEFINE_PROP_STRING("multifd-dsa-accel", MigrationState,
parameters.multifd_dsa_accel),
+DEFINE_PROP_SIZE("multifd-packet-size", MigrationState,
+ parameters.multifd_packet_size,
+ DEFAULT_MIGRATE_MULTIFD_PACKET_SIZE),
 
 /* Migration capabilities */
 DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -879,6 +888,13 @@ int migrate_multifd_channels(void)
 return s->parameters.multifd_channels;
 }
 
+uint64_t migrate_multifd_packet_size(void)
+{
+MigrationState *s = migrate_get_current();
+
+return s->parameters.multifd_packet_size;
+}
+
 MultiFDCompression migrate_multifd_compression(void)
 {
 MigrationState *s = migrate_get_current();
@@ -1031,6 +1047,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
 params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
 params->has_block_incremental = true;
 params->block_incremental = s->parameters.block_incremental;
+params->has_multifd_packet_size = true;
+params->multifd_packet_size = s->parameters.multifd_packet_size;
 params->has_multifd_channels = true;
 params->multifd_channels = s->parameters.multifd_channels;
 params->has_multifd_compression = true;
@@ -1094,6 +1112,7 @@ void migrate_params_init(MigrationParameters *params)
 params->has_downtime_limit = true;
 params->has_x_checkpoint_delay = true;
 params->has_block_incremental = true;
+params->has_multifd_packet_size = true;
 params->has_multifd_channels = true;
 params->has_multifd_compression = true;
 params->has_multifd_zlib_level = true;
@@ -1195,6 +1214,17 @@ bool migrate_params_check(MigrationParameters *params, Error **errp)
 
 /* x_checkpoint_delay is now always positive */
 
+if (params->has_multifd_packet_size &&
+((params->multifd_packet_size < DEFAULT_MIGRATE_MULTIFD_PACKET_SIZE) ||
+(params->multifd_packet_size >  MAX_MIGRATE_MULTIFD_PACKET_SIZE) ||
+(params->multifd_packet_size % qemu_target_page_size() != 0))) {
+error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+"multifd_packet_size",
+"a value between 524288 and 4190208, "
+"must be a multiple of guest VM's page size.");
+return false;
+}
+
 if (params->has_multifd_channels && (params->multifd_channels < 1)) {
 error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
"multifd_channels",
@@ -1374,6 +1404,9 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
 if (params->has_block_incremental) {
 dest->block_incremental = params->block_incremental;
 }
+if (params->has_multifd_packet_size) {
+dest->multifd_packet_size = params->multifd_packet_size;
+}
 if (params->has_multifd_channels) {
 dest->multifd_channels = params->multifd_channels;
 }
@@ -1524,6 +1557,9 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
 " use blockdev-mirror with NBD instead");
 s->parameters.block_incremental = params->block_incremental;
 }
+if (params->has_multifd_packet_size) {
+s->parameters.multifd_packet_size = params->multifd_packet_size;
+}
 if (params->has_multifd_channels) {
 s->parameters.multifd_channels = params->multifd_channels;
 }
diff --git a/migration/options.h b/migration/options.h
index 1cb3393be9..23995e6608 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -92,6 +92,7 @@ const char *migrate_tls_hostname(void);
 uint64_t migrate_xbzrle_cache_size(void);
 ZeroPageDetection migrate_zero_page_detection(void);
 const char *migrate_multifd_dsa_accel(void);
+uint64_t migrate_multifd_packet_size(void);
 
 /* parameters setters */
 
diff --git a/qapi/migration.json b/qapi/migration.json
index 934fa8839e..39d609c394 

[PATCH v4 02/14] util/dsa: Add dependency idxd.

2024-04-24 Thread Hao Xiang
Idxd is the device driver for DSA (Intel Data Streaming
Accelerator). The driver is fully functioning since Linux
kernel 5.19. This change adds the driver's header file used
for userspace development.

Signed-off-by: Hao Xiang 
---
 linux-headers/linux/idxd.h | 356 +
 1 file changed, 356 insertions(+)
 create mode 100644 linux-headers/linux/idxd.h

diff --git a/linux-headers/linux/idxd.h b/linux-headers/linux/idxd.h
new file mode 100644
index 00..1d553bedbd
--- /dev/null
+++ b/linux-headers/linux/idxd.h
@@ -0,0 +1,356 @@
+/* SPDX-License-Identifier: LGPL-2.1 WITH Linux-syscall-note */
+/* Copyright(c) 2019 Intel Corporation. All rights rsvd. */
+#ifndef _USR_IDXD_H_
+#define _USR_IDXD_H_
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+/* Driver command error status */
+enum idxd_scmd_stat {
+   IDXD_SCMD_DEV_ENABLED = 0x80000010,
+   IDXD_SCMD_DEV_NOT_ENABLED = 0x80000020,
+   IDXD_SCMD_WQ_ENABLED = 0x80000021,
+   IDXD_SCMD_DEV_DMA_ERR = 0x80020000,
+   IDXD_SCMD_WQ_NO_GRP = 0x80030000,
+   IDXD_SCMD_WQ_NO_NAME = 0x80040000,
+   IDXD_SCMD_WQ_NO_SVM = 0x80050000,
+   IDXD_SCMD_WQ_NO_THRESH = 0x80060000,
+   IDXD_SCMD_WQ_PORTAL_ERR = 0x80070000,
+   IDXD_SCMD_WQ_RES_ALLOC_ERR = 0x80080000,
+   IDXD_SCMD_PERCPU_ERR = 0x80090000,
+   IDXD_SCMD_DMA_CHAN_ERR = 0x800a0000,
+   IDXD_SCMD_CDEV_ERR = 0x800b0000,
+   IDXD_SCMD_WQ_NO_SWQ_SUPPORT = 0x800c0000,
+   IDXD_SCMD_WQ_NONE_CONFIGURED = 0x800d0000,
+   IDXD_SCMD_WQ_NO_SIZE = 0x800e0000,
+   IDXD_SCMD_WQ_NO_PRIV = 0x800f0000,
+   IDXD_SCMD_WQ_IRQ_ERR = 0x80100000,
+   IDXD_SCMD_WQ_USER_NO_IOMMU = 0x80110000,
+};
+
+#define IDXD_SCMD_SOFTERR_MASK 0x80000000
+#define IDXD_SCMD_SOFTERR_SHIFT 16
+
+/* Descriptor flags */
+#define IDXD_OP_FLAG_FENCE 0x0001
+#define IDXD_OP_FLAG_BOF   0x0002
+#define IDXD_OP_FLAG_CRAV  0x0004
+#define IDXD_OP_FLAG_RCR   0x0008
+#define IDXD_OP_FLAG_RCI   0x0010
+#define IDXD_OP_FLAG_CRSTS 0x0020
+#define IDXD_OP_FLAG_CR0x0080
+#define IDXD_OP_FLAG_CC0x0100
+#define IDXD_OP_FLAG_ADDR1_TCS 0x0200
+#define IDXD_OP_FLAG_ADDR2_TCS 0x0400
+#define IDXD_OP_FLAG_ADDR3_TCS 0x0800
+#define IDXD_OP_FLAG_CR_TCS0x1000
+#define IDXD_OP_FLAG_STORD 0x2000
+#define IDXD_OP_FLAG_DRDBK 0x4000
+#define IDXD_OP_FLAG_DSTS  0x8000
+
+/* IAX */
+#define IDXD_OP_FLAG_RD_SRC2_AECS  0x01
+#define IDXD_OP_FLAG_RD_SRC2_2ND   0x02
+#define IDXD_OP_FLAG_WR_SRC2_AECS_COMP 0x04
+#define IDXD_OP_FLAG_WR_SRC2_AECS_OVFL 0x08
+#define IDXD_OP_FLAG_SRC2_STS  0x10
+#define IDXD_OP_FLAG_CRC_RFC3720   0x20
+
+/* Opcode */
+enum dsa_opcode {
+   DSA_OPCODE_NOOP = 0,
+   DSA_OPCODE_BATCH,
+   DSA_OPCODE_DRAIN,
+   DSA_OPCODE_MEMMOVE,
+   DSA_OPCODE_MEMFILL,
+   DSA_OPCODE_COMPARE,
+   DSA_OPCODE_COMPVAL,
+   DSA_OPCODE_CR_DELTA,
+   DSA_OPCODE_AP_DELTA,
+   DSA_OPCODE_DUALCAST,
+   DSA_OPCODE_CRCGEN = 0x10,
+   DSA_OPCODE_COPY_CRC,
+   DSA_OPCODE_DIF_CHECK,
+   DSA_OPCODE_DIF_INS,
+   DSA_OPCODE_DIF_STRP,
+   DSA_OPCODE_DIF_UPDT,
+   DSA_OPCODE_CFLUSH = 0x20,
+};
+
+enum iax_opcode {
+   IAX_OPCODE_NOOP = 0,
+   IAX_OPCODE_DRAIN = 2,
+   IAX_OPCODE_MEMMOVE,
+   IAX_OPCODE_DECOMPRESS = 0x42,
+   IAX_OPCODE_COMPRESS,
+   IAX_OPCODE_CRC64,
+   IAX_OPCODE_ZERO_DECOMP_32 = 0x48,
+   IAX_OPCODE_ZERO_DECOMP_16,
+   IAX_OPCODE_ZERO_COMP_32 = 0x4c,
+   IAX_OPCODE_ZERO_COMP_16,
+   IAX_OPCODE_SCAN = 0x50,
+   IAX_OPCODE_SET_MEMBER,
+   IAX_OPCODE_EXTRACT,
+   IAX_OPCODE_SELECT,
+   IAX_OPCODE_RLE_BURST,
+   IAX_OPCODE_FIND_UNIQUE,
+   IAX_OPCODE_EXPAND,
+};
+
+/* Completion record status */
+enum dsa_completion_status {
+   DSA_COMP_NONE = 0,
+   DSA_COMP_SUCCESS,
+   DSA_COMP_SUCCESS_PRED,
+   DSA_COMP_PAGE_FAULT_NOBOF,
+   DSA_COMP_PAGE_FAULT_IR,
+   DSA_COMP_BATCH_FAIL,
+   DSA_COMP_BATCH_PAGE_FAULT,
+   DSA_COMP_DR_OFFSET_NOINC,
+   DSA_COMP_DR_OFFSET_ERANGE,
+   DSA_COMP_DIF_ERR,
+   DSA_COMP_BAD_OPCODE = 0x10,
+   DSA_COMP_INVALID_FLAGS,
+   DSA_COMP_NOZERO_RESERVE,
+   DSA_COMP_XFER_ERANGE,
+   DSA_COMP_DESC_CNT_ERANGE,
+   DSA_COMP_DR_ERANGE,
+   DSA_COMP_OVERLAP_BUFFERS,
+   DSA_COMP_DCAST_ERR,
+   DSA_COMP_DESCLIST_ALIGN,
+   DSA_COMP_INT_HANDLE_INVAL,
+   DSA_COMP_CRA_XLAT,
+   DSA_COMP_CRA_ALIGN,
+   DSA_COMP_ADDR_ALIGN,
+   DSA_COMP_PRIV_BAD,
+   DSA_COMP_TRAFFIC_CLASS_CONF,
+   DSA_COMP_PFAULT_RDBA,
+   DSA_COMP_HW_ERR1,
+   DSA_COMP_HW_ERR_DRB,
+   DSA_COMP_TRANSLATION_FAIL,
+};
+
+enum iax_completion_status {
+   IAX_COMP_NONE = 0,
+   IAX_COMP_SUCCESS,
+   IAX_COMP_PAGE_FAULT_IR = 0x04,
+   IAX_COMP_ANALYTICS_ERROR = 0x0a,
+   

[PATCH v4 08/14] migration/multifd: Add new migration option for multifd DSA offloading.

2024-04-24 Thread Hao Xiang
Intel DSA offloading is an optional feature that turns on if the
proper hardware and software stack is available. To turn on
DSA offloading in multifd live migration:

multifd-dsa-accel="[dsa_dev_path1] [dsa_dev_path2] ... [dsa_dev_pathX]"

This feature is turned off by default.

Signed-off-by: Hao Xiang 
---
 migration/migration-hmp-cmds.c |  8 
 migration/options.c| 30 ++
 migration/options.h|  1 +
 qapi/migration.json| 26 +++---
 4 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 7e96ae6ffd..7e9bb278c9 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -358,6 +358,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
 monitor_printf(mon, "%s: '%s'\n",
 MigrationParameter_str(MIGRATION_PARAMETER_TLS_AUTHZ),
 params->tls_authz);
+monitor_printf(mon, "%s: '%s'\n",
+MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_DSA_ACCEL),
+params->multifd_dsa_accel);
 
 if (params->has_block_bitmap_mapping) {
 const BitmapMigrationNodeAliasList *bmnal;
@@ -622,6 +625,11 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
 p->has_block_incremental = true;
 visit_type_bool(v, param, >block_incremental, );
 break;
+case MIGRATION_PARAMETER_MULTIFD_DSA_ACCEL:
+p->multifd_dsa_accel = g_new0(StrOrNull, 1);
+p->multifd_dsa_accel->type = QTYPE_QSTRING;
+visit_type_str(v, param, >multifd_dsa_accel->u.s, );
+break;
 case MIGRATION_PARAMETER_MULTIFD_CHANNELS:
 p->has_multifd_channels = true;
 visit_type_uint8(v, param, &p->multifd_channels, &err);
diff --git a/migration/options.c b/migration/options.c
index 239f5ecfb4..dc8642df81 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -182,6 +182,8 @@ Property migration_properties[] = {
 DEFINE_PROP_ZERO_PAGE_DETECTION("zero-page-detection", MigrationState,
parameters.zero_page_detection,
ZERO_PAGE_DETECTION_MULTIFD),
+DEFINE_PROP_STRING("multifd-dsa-accel", MigrationState,
+   parameters.multifd_dsa_accel),
 
 /* Migration capabilities */
 DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -920,6 +922,13 @@ const char *migrate_tls_creds(void)
 return s->parameters.tls_creds;
 }
 
+const char *migrate_multifd_dsa_accel(void)
+{
+MigrationState *s = migrate_get_current();
+
+return s->parameters.multifd_dsa_accel;
+}
+
 const char *migrate_tls_hostname(void)
 {
 MigrationState *s = migrate_get_current();
@@ -1060,6 +1069,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
 params->mode = s->parameters.mode;
 params->has_zero_page_detection = true;
 params->zero_page_detection = s->parameters.zero_page_detection;
+params->multifd_dsa_accel = g_strdup(s->parameters.multifd_dsa_accel ?
+ s->parameters.multifd_dsa_accel : "");
 
 return params;
 }
@@ -1068,6 +1079,7 @@ void migrate_params_init(MigrationParameters *params)
 {
 params->tls_hostname = g_strdup("");
 params->tls_creds = g_strdup("");
+params->multifd_dsa_accel = g_strdup("");
 
 /* Set has_* up only for parameter checks */
 params->has_compress_level = true;
@@ -1416,6 +1428,11 @@ static void migrate_params_test_apply(MigrateSetParameters *params,
 if (params->has_zero_page_detection) {
 dest->zero_page_detection = params->zero_page_detection;
 }
+
+if (params->multifd_dsa_accel) {
+assert(params->multifd_dsa_accel->type == QTYPE_QSTRING);
+dest->multifd_dsa_accel = params->multifd_dsa_accel->u.s;
+}
 }
 
 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
@@ -1570,6 +1587,13 @@ static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
 if (params->has_zero_page_detection) {
 s->parameters.zero_page_detection = params->zero_page_detection;
 }
+
+if (params->multifd_dsa_accel) {
+g_free(s->parameters.multifd_dsa_accel);
+assert(params->multifd_dsa_accel->type == QTYPE_QSTRING);
+s->parameters.multifd_dsa_accel =
+g_strdup(params->multifd_dsa_accel->u.s);
+}
 }
 
 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
@@ -1595,6 +1619,12 @@ void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
 params->tls_authz->type = QTYPE_QSTRING;
 params->tls_authz->u.s = strdup("");
 }
+if (params->multifd_dsa_accel
+&& params->multifd_dsa_accel->type == QTYPE_QNULL) {
+qobject_unref(params->multifd_dsa_accel->u.n);
+params->multifd_dsa_accel->type = QTYPE_QSTRING;
+

[PATCH v4 12/14] migration/multifd: Enable set packet size migration option.

2024-04-24 Thread Hao Xiang
During live migration, if the latency between sender and receiver
is high and the bandwidth is also high (a long, fat pipe), using a bigger
packet size can help reduce the total migration time. In addition, Intel
DSA offloading performs better with large batch tasks. Providing an
option to set the packet size is useful for performance tuning.

Set the option:
migrate_set_parameter multifd-packet-size 4190208
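The QMP equivalent would presumably look like the following (a hypothetical session sketch; the parameter name follows this patch):

```json
{ "execute": "migrate-set-parameters",
  "arguments": { "multifd-packet-size": 4190208 } }
```

Note that 4190208 bytes is 1023 pages of 4 KiB, so the payload stays a whole number of pages.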

Signed-off-by: Hao Xiang 
---
 migration/migration-hmp-cmds.c | 7 +++
 migration/multifd-zlib.c   | 6 --
 migration/multifd-zstd.c   | 6 --
 migration/multifd.c| 6 --
 migration/multifd.h| 3 ---
 5 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 7e9bb278c9..053ad0283a 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -338,6 +338,9 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict *qdict)
 monitor_printf(mon, "%s: %s\n",
 MigrationParameter_str(MIGRATION_PARAMETER_BLOCK_INCREMENTAL),
 params->block_incremental ? "on" : "off");
+monitor_printf(mon, "%s: %" PRIu64 "\n",
+MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_PACKET_SIZE),
+params->multifd_packet_size);
 monitor_printf(mon, "%s: %u\n",
 MigrationParameter_str(MIGRATION_PARAMETER_MULTIFD_CHANNELS),
 params->multifd_channels);
@@ -630,6 +633,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict)
 p->multifd_dsa_accel->type = QTYPE_QSTRING;
 visit_type_str(v, param, &p->multifd_dsa_accel->u.s, &err);
 break;
+case MIGRATION_PARAMETER_MULTIFD_PACKET_SIZE:
+p->has_multifd_packet_size = true;
+visit_type_size(v, param, &p->multifd_packet_size, &err);
+break;
 case MIGRATION_PARAMETER_MULTIFD_CHANNELS:
 p->has_multifd_channels = true;
 visit_type_uint8(v, param, &p->multifd_channels, &err);
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 737a9645d2..2880d35841 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -49,6 +49,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
 struct zlib_data *z = g_new0(struct zlib_data, 1);
 z_stream *zs = &z->zs;
 const char *err_msg;
+uint64_t multifd_packet_size = migrate_multifd_packet_size();
 
 zs->zalloc = Z_NULL;
 zs->zfree = Z_NULL;
@@ -58,7 +59,7 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
 goto err_free_z;
 }
 /* This is the maximum size of the compressed buffer */
-z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
+z->zbuff_len = compressBound(multifd_packet_size);
 z->zbuff = g_try_malloc(z->zbuff_len);
 if (!z->zbuff) {
 err_msg = "out of memory for zbuff";
@@ -193,6 +194,7 @@ out:
  */
 static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp)
 {
+uint64_t multifd_packet_size = migrate_multifd_packet_size();
 struct zlib_data *z = g_new0(struct zlib_data, 1);
 z_stream *zs = &z->zs;
 
@@ -207,7 +209,7 @@ static int zlib_recv_setup(MultiFDRecvParams *p, Error **errp)
 return -1;
 }
 /* To be safe, we reserve twice the size of the packet */
-z->zbuff_len = MULTIFD_PACKET_SIZE * 2;
+z->zbuff_len = multifd_packet_size * 2;
 z->zbuff = g_try_malloc(z->zbuff_len);
 if (!z->zbuff) {
 inflateEnd(zs);
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index 256858df0a..edc738afbb 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -49,6 +49,7 @@ struct zstd_data {
  */
 static int zstd_send_setup(MultiFDSendParams *p, Error **errp)
 {
+uint64_t multifd_packet_size = migrate_multifd_packet_size();
 struct zstd_data *z = g_new0(struct zstd_data, 1);
 int res;
 
@@ -69,7 +70,7 @@ static int zstd_send_setup(MultiFDSendParams *p, Error **errp)
 return -1;
 }
 /* This is the maximum size of the compressed buffer */
-z->zbuff_len = ZSTD_compressBound(MULTIFD_PACKET_SIZE);
+z->zbuff_len = ZSTD_compressBound(multifd_packet_size);
 z->zbuff = g_try_malloc(z->zbuff_len);
 if (!z->zbuff) {
 ZSTD_freeCStream(z->zcs);
@@ -182,6 +183,7 @@ out:
  */
 static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp)
 {
+uint64_t multifd_packet_size = migrate_multifd_packet_size();
 struct zstd_data *z = g_new0(struct zstd_data, 1);
 int ret;
 
@@ -203,7 +205,7 @@ static int zstd_recv_setup(MultiFDRecvParams *p, Error **errp)
 }
 
 /* To be safe, we reserve twice the size of the packet */
-z->zbuff_len = MULTIFD_PACKET_SIZE * 2;
+z->zbuff_len = multifd_packet_size * 2;
 z->zbuff = g_try_malloc(z->zbuff_len);
 if (!z->zbuff) {
 ZSTD_freeDStream(z->zds);
diff --git a/migration/multifd.c b/migration/multifd.c
index 7316643d0a..2796646087 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c

[PATCH v4 14/14] migration/multifd: Add integration tests for multifd with Intel DSA offloading.

2024-04-24 Thread Hao Xiang
* Add test case to start and complete multifd live migration with DSA
offloading enabled.
* Add test case to start and cancel multifd live migration with DSA
offloading enabled.
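The tests assume a DSA work queue is already configured at /dev/dsa/wq4.0. For context, a typical setup with the accel-config tool looks roughly like the following; the device/queue names and attribute values here are illustrative assumptions, not part of this patch, and depend on the system:

```shell
# Illustrative DSA setup: bind engine and a shared user work queue
# on device dsa4, then enable both (values are examples only).
accel-config config-engine dsa4/engine4.0 --group-id=0
accel-config config-wq dsa4/wq4.0 --group-id=0 --type=user \
    --priority=10 --wq-size=128 --mode=shared --name=qemu-dsa
accel-config enable-device dsa4
accel-config enable-wq dsa4/wq4.0
```

If the work queue cannot be opened, test_dsa_setup() below returns -1 and the DSA tests are skipped.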

Signed-off-by: Bryan Zhang 
Signed-off-by: Hao Xiang 
---
 tests/qtest/migration-test.c | 77 +++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 5d6d8cd634..354c5f26f8 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -616,6 +616,12 @@ typedef struct {
 bool suspend_me;
 } MigrateStart;
 
+/*
+ * Separate steps are required to configure and enable the DSA device.
+ * This test assumes that the configuration is done already.
+ */
+static const char *dsa_dev_path = "/dev/dsa/wq4.0";
+
 /*
  * A hook that runs after the src and dst QEMUs have been
  * created, but before the migration is started. This can
@@ -3025,7 +3031,7 @@ static void test_multifd_tcp_tls_x509_reject_anon_client(void)
  *
  *  And see that it works
  */
-static void test_multifd_tcp_cancel(void)
+static void test_multifd_tcp_cancel_common(bool use_dsa)
 {
 MigrateStart args = {
 .hide_stderr = true,
@@ -3045,6 +3051,10 @@ static void test_multifd_tcp_cancel(void)
 migrate_set_capability(from, "multifd", true);
 migrate_set_capability(to, "multifd", true);
 
+if (use_dsa) {
+migrate_set_parameter_str(from, "multifd-dsa-accel", dsa_dev_path);
+}
+
 /* Start incoming migration from the 1st socket */
 migrate_incoming_qmp(to, "tcp:127.0.0.1:0", "{}");
 
@@ -3094,6 +3104,48 @@ static void test_multifd_tcp_cancel(void)
 test_migrate_end(from, to2, true);
 }
 
+/*
+ * This test does:
+ *  source   target
+ *   migrate_incoming
+ * migrate
+ * migrate_cancel
+ *   launch another target
+ * migrate
+ *
+ *  And see that it works
+ */
+static void test_multifd_tcp_cancel(void)
+{
+test_multifd_tcp_cancel_common(false);
+}
+
+#ifdef CONFIG_DSA_OPT
+
+static void *test_migrate_precopy_tcp_multifd_start_dsa(QTestState *from,
+QTestState *to)
+{
+migrate_set_parameter_str(from, "multifd-dsa-accel", dsa_dev_path);
+return test_migrate_precopy_tcp_multifd_start_common(from, to, "none");
+}
+
+static void test_multifd_tcp_zero_page_dsa(void)
+{
+MigrateCommon args = {
+.listen_uri = "defer",
+.start_hook = test_migrate_precopy_tcp_multifd_start_dsa,
+};
+
+test_precopy_common(&args);
+}
+
+static void test_multifd_tcp_cancel_dsa(void)
+{
+test_multifd_tcp_cancel_common(true);
+}
+
+#endif
+
 static void calc_dirty_rate(QTestState *who, uint64_t calc_time)
 {
 qtest_qmp_assert_success(who,
@@ -3518,6 +3570,19 @@ static bool kvm_dirty_ring_supported(void)
 #endif
 }
 
+#ifdef CONFIG_DSA_OPT
+static int test_dsa_setup(void)
+{
+int fd;
+fd = open(dsa_dev_path, O_RDWR);
+if (fd < 0) {
+return -1;
+}
+close(fd);
+return 0;
+}
+#endif
+
 int main(int argc, char **argv)
 {
 bool has_kvm, has_tcg;
@@ -3752,6 +3817,16 @@ int main(int argc, char **argv)
test_multifd_tcp_zero_page_legacy);
 migration_test_add("/migration/multifd/tcp/plain/zero-page/none",
test_multifd_tcp_no_zero_page);
+
+#ifdef CONFIG_DSA_OPT
+if (g_str_equal(arch, "x86_64") && test_dsa_setup() == 0) {
+migration_test_add("/migration/multifd/tcp/plain/zero-page/dsa",
+   test_multifd_tcp_zero_page_dsa);
+migration_test_add("/migration/multifd/tcp/plain/cancel/dsa",
+   test_multifd_tcp_cancel_dsa);
+}
+#endif
+
 migration_test_add("/migration/multifd/tcp/plain/cancel",
test_multifd_tcp_cancel);
 migration_test_add("/migration/multifd/tcp/plain/zlib",
-- 
2.30.2




[PATCH v4 05/14] util/dsa: Implement DSA task asynchronous completion thread model.

2024-04-24 Thread Hao Xiang
* Create a dedicated thread for DSA task completion.
* The DSA completion thread runs a loop and polls for completed tasks.
* Start and stop the DSA completion thread when the DSA device starts and stops.

A user-space application can submit tasks directly to the Intel DSA
accelerator by writing to DSA's device memory (mapped into user space).
Once a task is submitted, the device starts processing it and writes
the completion status back to the task. A user-space application can
poll the task's completion status to check for completion. This change
uses a dedicated thread to perform DSA task completion checking.
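The polling described above can be sketched as follows — a simplified Python model of the bounded busy-wait loop (status codes and the retry bound are illustrative placeholders, not the driver's actual values):

```python
# Simplified model of DSA completion polling: spin on the status byte
# until the device reports a terminal state or the retry budget runs out.
COMP_NONE = 0        # device has not written a status yet
COMP_SUCCESS = 1     # terminal: operation finished successfully

def poll_completion(read_status, max_retry=100):
    """Poll read_status() until a terminal status appears.

    Returns 0 on success, 1 on a device error or retry exhaustion.
    """
    retry = 0
    while True:
        status = read_status()
        if status == COMP_SUCCESS:
            return 0            # completed
        if status != COMP_NONE:
            return 1            # device reported a failure status
        retry += 1
        if retry > max_retry:
            return 1            # gave up waiting
        # the real code executes _mm_pause() here to ease the busy loop

# Example: the status becomes SUCCESS after three empty polls.
statuses = iter([COMP_NONE, COMP_NONE, COMP_NONE, COMP_SUCCESS])
result = poll_completion(lambda: next(statuses))
```

The dedicated completion thread in this patch runs this kind of loop over queued batch tasks instead of blocking the submitting thread.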

Signed-off-by: Hao Xiang 
---
 include/qemu/dsa.h |   1 +
 util/dsa.c | 274 -
 2 files changed, 274 insertions(+), 1 deletion(-)

diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
index 37cae8d9d2..2513192a2b 100644
--- a/include/qemu/dsa.h
+++ b/include/qemu/dsa.h
@@ -38,6 +38,7 @@ typedef struct dsa_batch_task {
 DsaTaskType task_type;
 DsaTaskStatus status;
 int batch_size;
+bool *results;
 QSIMPLEQ_ENTRY(dsa_batch_task) entry;
 } dsa_batch_task;
 
diff --git a/util/dsa.c b/util/dsa.c
index 75739a1af6..003c4f47d9 100644
--- a/util/dsa.c
+++ b/util/dsa.c
@@ -44,6 +44,7 @@
 
 #define DSA_WQ_SIZE 4096
 #define MAX_DSA_DEVICES 16
+#define DSA_COMPLETION_THREAD "dsa_completion"
 
 typedef QSIMPLEQ_HEAD(dsa_task_queue, dsa_batch_task) dsa_task_queue;
 
@@ -62,8 +63,18 @@ struct dsa_device_group {
 dsa_task_queue task_queue;
 };
 
+struct dsa_completion_thread {
+bool stopping;
+bool running;
+QemuThread thread;
+int thread_id;
+QemuSemaphore sem_init_done;
+struct dsa_device_group *group;
+};
+
 uint64_t max_retry_count;
 static struct dsa_device_group dsa_group;
+static struct dsa_completion_thread completion_thread;
 
 
 /**
@@ -443,6 +454,265 @@ submit_batch_wi_async(struct dsa_batch_task *batch_task)
 return dsa_task_enqueue(device_group, batch_task);
 }
 
+/**
+ * @brief Poll for the DSA work item completion.
+ *
+ * @param completion A pointer to the DSA work item completion record.
+ * @param opcode The DSA opcode.
+ *
+ * @return Zero if successful, non-zero otherwise.
+ */
+static int
+poll_completion(struct dsa_completion_record *completion,
+enum dsa_opcode opcode)
+{
+uint8_t status;
+uint64_t retry = 0;
+
+while (true) {
+/* The DSA operation completes successfully or fails. */
+status = completion->status;
+if (status == DSA_COMP_SUCCESS ||
+status == DSA_COMP_PAGE_FAULT_NOBOF ||
+status == DSA_COMP_BATCH_PAGE_FAULT ||
+status == DSA_COMP_BATCH_FAIL) {
+break;
+} else if (status != DSA_COMP_NONE) {
+error_report("DSA opcode %d failed with status = %d.",
+opcode, status);
+return 1;
+}
+retry++;
+if (retry > max_retry_count) {
+error_report("DSA wait for completion retry %lu times.", retry);
+return 1;
+}
+_mm_pause();
+}
+
+return 0;
+}
+
+/**
+ * @brief Complete a single DSA task in the batch task.
+ *
+ * @param task A pointer to the batch task structure.
+ *
+ * @return Zero if successful, otherwise non-zero.
+ */
+static int
+poll_task_completion(struct dsa_batch_task *task)
+{
+assert(task->task_type == DSA_TASK);
+
+struct dsa_completion_record *completion = &task->completions[0];
+uint8_t status;
+int ret;
+
+ret = poll_completion(completion, task->descriptors[0].opcode);
+if (ret != 0) {
+goto exit;
+}
+
+status = completion->status;
+if (status == DSA_COMP_SUCCESS) {
+task->results[0] = (completion->result == 0);
+goto exit;
+}
+
+assert(status == DSA_COMP_PAGE_FAULT_NOBOF);
+
+exit:
+return ret;
+}
+
+/**
+ * @brief Poll a batch task status until it completes. If DSA task doesn't
+ *complete properly, use CPU to complete the task.
+ *
+ * @param batch_task A pointer to the DSA batch task.
+ *
+ * @return Zero if successful, otherwise non-zero.
+ */
+static int
+poll_batch_task_completion(struct dsa_batch_task *batch_task)
+{
+struct dsa_completion_record *batch_completion =
+&batch_task->batch_completion;
+struct dsa_completion_record *completion;
+uint8_t batch_status;
+uint8_t status;
+bool *results = batch_task->results;
+uint32_t count = batch_task->batch_descriptor.desc_count;
+int ret;
+
+ret = poll_completion(batch_completion,
+  batch_task->batch_descriptor.opcode);
+if (ret != 0) {
+goto exit;
+}
+
+batch_status = batch_completion->status;
+
+if (batch_status == DSA_COMP_SUCCESS) {
+if (batch_completion->bytes_completed == count) {
+/*
+ * Let's skip checking each descriptor's completion status
+ * if the batch descriptor says all succeeded.
+ */
+  

[PATCH v4 13/14] util/dsa: Add unit test coverage for Intel DSA task submission and completion.

2024-04-24 Thread Hao Xiang
* Test DSA start and stop path.
* Test DSA configure and cleanup path.
* Test DSA task submission and completion path.

Signed-off-by: Bryan Zhang 
Signed-off-by: Hao Xiang 
---
 tests/unit/meson.build |   6 +
 tests/unit/test-dsa.c  | 499 +
 2 files changed, 505 insertions(+)
 create mode 100644 tests/unit/test-dsa.c

diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index 26c109c968..1d4d48898b 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -49,6 +49,12 @@ tests = {
   'test-interval-tree': [],
 }
 
+if config_host_data.get('CONFIG_DSA_OPT')
+  tests += {
+'test-dsa': [],
+  }
+endif
+
 if have_system or have_tools
   tests += {
 'test-qmp-event': [testqapi],
diff --git a/tests/unit/test-dsa.c b/tests/unit/test-dsa.c
new file mode 100644
index 00..0f2092767d
--- /dev/null
+++ b/tests/unit/test-dsa.c
@@ -0,0 +1,499 @@
+/*
+ * Test DSA functions.
+ *
+ * Copyright (c) 2023 Hao Xiang 
+ * Copyright (c) 2023 Bryan Zhang 
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <https://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+#include "qemu/cutils.h"
+#include "qemu/memalign.h"
+#include "qemu/dsa.h"
+
+/*
+ * TODO Communicate that DSA must be configured to support this batch size.
+ * TODO Alternatively, poke the DSA device to figure out batch size.
+ */
+#define batch_size 128
+#define page_size 4096
+
+#define oversized_batch_size (batch_size + 1)
+#define num_devices 2
+#define max_buffer_size (64 * 1024)
+
+/* TODO Make these not-hardcoded. */
+static const char *path1 = "/dev/dsa/wq4.0";
+static const char *path2 = "/dev/dsa/wq4.0 /dev/dsa/wq4.1";
+
+static struct batch_task *task;
+
+/* A helper for running a single task and checking for correctness. */
+static void do_single_task(void)
+{
+task = batch_task_init(batch_size);
+char buf[page_size];
+char *ptr = buf;
+
+buffer_is_zero_dsa_batch_async(task,
+   (const void **)&ptr,
+   1,
+   page_size);
+g_assert(task->results[0] == buffer_is_zero(buf, page_size));
+
+batch_task_destroy(task);
+}
+
+static void test_single_zero(void)
+{
+g_assert(!dsa_init(path1));
+dsa_start();
+
+task = batch_task_init(batch_size);
+
+char buf[page_size];
+char *ptr = buf;
+
+memset(buf, 0x0, page_size);
+buffer_is_zero_dsa_batch_async(task,
+   (const void **)&ptr,
+   1, page_size);
+g_assert(task->results[0]);
+
+batch_task_destroy(task);
+
+dsa_cleanup();
+}
+
+static void test_single_zero_async(void)
+{
+test_single_zero();
+}
+
+static void test_single_nonzero(void)
+{
+g_assert(!dsa_init(path1));
+dsa_start();
+
+task = batch_task_init(batch_size);
+
+char buf[page_size];
+char *ptr = buf;
+
+memset(buf, 0x1, page_size);
+buffer_is_zero_dsa_batch_async(task,
+   (const void **)&ptr,
+   1, page_size);
+g_assert(!task->results[0]);
+
+batch_task_destroy(task);
+
+dsa_cleanup();
+}
+
+static void test_single_nonzero_async(void)
+{
+test_single_nonzero();
+}
+
+/* count == 0 should return quickly without calling into DSA. */
+static void test_zero_count_async(void)
+{
+char buf[page_size];
+buffer_is_zero_dsa_batch_async(task,
+ (const void **)&buf,
+ 0,
+ page_size);
+}
+
+static void test_null_task_async(void)
+{
+if (g_test_subprocess()) {
+g_assert(!dsa_init(path1));
+
+char buf[page_size * batch_size];
+char *addrs[batch_size];
+for (int i = 0; i < batch_size; i++) {
+addrs[i] = buf + (page_size * i);
+}
+
+buffer_is_zero_dsa_batch_async(NULL, (const void **)addrs,
+  batch_size,
+  page_size);
+} else {
+g_test_trap_subprocess(NULL, 0, 0);
+g_test_trap_assert_failed();
+}
+}
+
+static void test_oversized_batch(void)
+{
+g_assert(!dsa_init(path1));
+dsa_start();
+
+task = batch_task_init(batch_size);
+
+char buf[page_size * oversized_batch_size];
+char 

[PATCH v4 01/14] meson: Introduce new instruction set enqcmd to the build system.

2024-04-24 Thread Hao Xiang
Enable the ENQCMD instruction set in the build.

Signed-off-by: Hao Xiang 
---
 meson.build   | 14 ++
 meson_options.txt |  2 ++
 scripts/meson-buildoptions.sh |  3 +++
 3 files changed, 19 insertions(+)

diff --git a/meson.build b/meson.build
index 95cee7046e..9e008ddc34 100644
--- a/meson.build
+++ b/meson.build
@@ -2824,6 +2824,20 @@ config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
 int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512BW not available').allowed())
 
+config_host_data.set('CONFIG_DSA_OPT', get_option('enqcmd') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable ENQCMD') \
+  .require(cc.links('''
+#include 
+#include 
+#include 
+static int __attribute__((target("enqcmd"))) bar(void *a) {
+  uint64_t dst[8] = { 0 };
+  uint64_t src[8] = { 0 };
+  return _enqcmd(dst, src);
+}
+int main(int argc, char *argv[]) { return bar(argv[argc - 1]); }
+  '''), error_message: 'ENQCMD not available').allowed())
+
 # For both AArch64 and AArch32, detect if builtins are available.
 config_host_data.set('CONFIG_ARM_AES_BUILTIN', cc.compiles('''
 #include 
diff --git a/meson_options.txt b/meson_options.txt
index b5c0bad9e7..63c1bf815b 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -121,6 +121,8 @@ option('avx512f', type: 'feature', value: 'disabled',
description: 'AVX512F optimizations')
 option('avx512bw', type: 'feature', value: 'auto',
description: 'AVX512BW optimizations')
+option('enqcmd', type: 'feature', value: 'disabled',
+   description: 'ENQCMD optimizations')
 option('keyring', type: 'feature', value: 'auto',
description: 'Linux keyring support')
 option('libkeyutils', type: 'feature', value: 'auto',
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 5ace33f167..2cdfc84455 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -93,6 +93,7 @@ meson_options_help() {
   printf "%s\n" '  avx2AVX2 optimizations'
   printf "%s\n" '  avx512bwAVX512BW optimizations'
   printf "%s\n" '  avx512f AVX512F optimizations'
+  printf "%s\n" '  enqcmd  ENQCMD optimizations'
   printf "%s\n" '  blkio   libblkio block device driver'
   printf "%s\n" '  bochs   bochs image format support'
   printf "%s\n" '  bpf eBPF support'
@@ -239,6 +240,8 @@ _meson_option_parse() {
 --disable-avx512bw) printf "%s" -Davx512bw=disabled ;;
 --enable-avx512f) printf "%s" -Davx512f=enabled ;;
 --disable-avx512f) printf "%s" -Davx512f=disabled ;;
+--enable-enqcmd) printf "%s" -Denqcmd=enabled ;;
+--disable-enqcmd) printf "%s" -Denqcmd=disabled ;;
 --enable-gcov) printf "%s" -Db_coverage=true ;;
 --disable-gcov) printf "%s" -Db_coverage=false ;;
 --enable-lto) printf "%s" -Db_lto=true ;;
-- 
2.30.2




[PATCH v4 06/14] util/dsa: Implement zero page checking in DSA task.

2024-04-24 Thread Hao Xiang
Create a DSA task with operation code DSA_OPCODE_COMPVAL.
Here we create two types of DSA tasks: a single DSA task and
a batch DSA task. A batch DSA task reduces task submission overhead
and hence should be the default option. However, due to the way the DSA
hardware works, a DSA batch task must contain at least two individual
tasks. There are times we need to submit a single task, and hence
single DSA task submission is also required.
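The dispatch rule above — batch by default, but fall back to a single task because the hardware requires at least two descriptors per batch — can be sketched as follows (an illustrative model, not the patch's actual C code):

```python
MIN_BATCH = 2  # DSA hardware requires >= 2 descriptors in a batch task

def choose_submission(num_pages):
    """Pick the submission mode for a zero-page check of num_pages pages."""
    if num_pages <= 0:
        raise ValueError("nothing to submit")
    if num_pages < MIN_BATCH:
        return "single"   # one COMPVAL descriptor submitted directly
    return "batch"        # one batch descriptor covering all pages

# A 128-page request takes the batch path; a lone page cannot.
mode_many = choose_submission(128)
mode_one = choose_submission(1)
```

This mirrors why the patch keeps both submit_wi_async() (single descriptor) and submit_batch_wi_async() (batch descriptor) below.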

Signed-off-by: Hao Xiang 
Signed-off-by: Bryan Zhang 
---
 include/qemu/dsa.h |  18 
 util/dsa.c | 247 +
 2 files changed, 244 insertions(+), 21 deletions(-)

diff --git a/include/qemu/dsa.h b/include/qemu/dsa.h
index 2513192a2b..645e6fc367 100644
--- a/include/qemu/dsa.h
+++ b/include/qemu/dsa.h
@@ -73,6 +73,24 @@ void dsa_cleanup(void);
  */
 bool dsa_is_running(void);
 
+/**
+ * @brief Initializes a buffer zero batch task.
+ *
+ * @param task A pointer to the batch task to initialize.
+ * @param results A pointer to an array of zero page checking results.
+ * @param batch_size The number of DSA tasks in the batch.
+ */
+void
+buffer_zero_batch_task_init(struct dsa_batch_task *task,
+bool *results, int batch_size);
+
+/**
+ * @brief Performs the proper cleanup on a DSA batch task.
+ *
+ * @param task A pointer to the batch task to cleanup.
+ */
+void buffer_zero_batch_task_destroy(struct dsa_batch_task *task);
+
 #else
 
 static inline bool dsa_is_running(void)
diff --git a/util/dsa.c b/util/dsa.c
index 003c4f47d9..9db4cfcf1d 100644
--- a/util/dsa.c
+++ b/util/dsa.c
@@ -76,6 +76,7 @@ uint64_t max_retry_count;
 static struct dsa_device_group dsa_group;
 static struct dsa_completion_thread completion_thread;
 
+static void buffer_zero_dsa_completion(void *context);
 
 /**
  * @brief This function opens a DSA device's work queue and
@@ -207,7 +208,6 @@ dsa_device_group_start(struct dsa_device_group *group)
  *
  * @param group A pointer to the DSA device group.
  */
-__attribute__((unused))
 static void
 dsa_device_group_stop(struct dsa_device_group *group)
 {
@@ -243,7 +243,6 @@ dsa_device_group_cleanup(struct dsa_device_group *group)
  * @return struct dsa_device* A pointer to the next available DSA device
  * in the group.
  */
-__attribute__((unused))
 static struct dsa_device *
 dsa_device_group_get_next_device(struct dsa_device_group *group)
 {
@@ -320,7 +319,6 @@ dsa_task_enqueue(struct dsa_device_group *group,
  * @param group A pointer to the DSA device group.
  * @return dsa_batch_task* The DSA task being dequeued.
  */
-__attribute__((unused))
 static struct dsa_batch_task *
 dsa_task_dequeue(struct dsa_device_group *group)
 {
@@ -378,22 +376,6 @@ submit_wi_int(void *wq, struct dsa_hw_desc *descriptor)
 return 0;
 }
 
-/**
- * @brief Synchronously submits a DSA work item to the
- *device work queue.
- *
- * @param wq A pointer to the DSA worjk queue's device memory.
- * @param descriptor A pointer to the DSA work item descriptor.
- *
- * @return int Zero if successful, non-zero otherwise.
- */
-__attribute__((unused))
-static int
-submit_wi(void *wq, struct dsa_hw_desc *descriptor)
-{
-return submit_wi_int(wq, descriptor);
-}
-
 /**
  * @brief Asynchronously submits a DSA work item to the
  *device work queue.
@@ -402,7 +384,6 @@ submit_wi(void *wq, struct dsa_hw_desc *descriptor)
  *
  * @return int Zero if successful, non-zero otherwise.
  */
-__attribute__((unused))
 static int
 submit_wi_async(struct dsa_batch_task *task)
 {
@@ -431,7 +412,6 @@ submit_wi_async(struct dsa_batch_task *task)
  *
  * @return int Zero if successful, non-zero otherwise.
  */
-__attribute__((unused))
 static int
 submit_batch_wi_async(struct dsa_batch_task *batch_task)
 {
@@ -713,6 +693,231 @@ static void dsa_completion_thread_stop(void *opaque)
 qemu_sem_destroy(&thread_context->sem_init_done);
 }
 
+/**
+ * @brief Initializes a buffer zero comparison DSA task.
+ *
+ * @param descriptor A pointer to the DSA task descriptor.
+ * @param completion A pointer to the DSA task completion record.
+ */
+static void
+buffer_zero_task_init_int(struct dsa_hw_desc *descriptor,
+  struct dsa_completion_record *completion)
+{
+descriptor->opcode = DSA_OPCODE_COMPVAL;
+descriptor->flags = IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CRAV;
+descriptor->comp_pattern = (uint64_t)0;
+descriptor->completion_addr = (uint64_t)completion;
+}
+
+/**
+ * @brief Initializes a buffer zero batch task.
+ *
+ * @param task A pointer to the batch task to initialize.
+ * @param results A pointer to an array of zero page checking results.
+ * @param batch_size The number of DSA tasks in the batch.
+ */
+void
+buffer_zero_batch_task_init(struct dsa_batch_task *task,
+bool *results, int batch_size)
+{
+int descriptors_size = sizeof(*task->descriptors) * batch_size;
+memset(task, 0, sizeof(*task));
+
+task->descriptors =
+(struct 

[PATCH v12 4/6] ui/console: Use qemu_dmabuf_set_..() helpers instead

2024-04-24 Thread dongwon . kim
From: Dongwon Kim 

This commit updates all occurrences where these fields were
set directly to use the helper functions instead.

v7: removed the "dpy_gl_" prefix from all helpers

v8: Introduction of helpers was removed as those were already added
by the previous commit

Suggested-by: Marc-André Lureau 
Cc: Philippe Mathieu-Daudé 
Cc: Daniel P. Berrangé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 ui/egl-helpers.c | 16 +---
 ui/gtk-egl.c |  4 ++--
 ui/gtk-gl-area.c |  4 ++--
 ui/gtk.c |  6 +++---
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/ui/egl-helpers.c b/ui/egl-helpers.c
index 3f96e63d25..99b2ebbe23 100644
--- a/ui/egl-helpers.c
+++ b/ui/egl-helpers.c
@@ -348,8 +348,8 @@ void egl_dmabuf_import_texture(QemuDmaBuf *dmabuf)
 return;
 }
 
-glGenTextures(1, &dmabuf->texture);
-texture = qemu_dmabuf_get_texture(dmabuf);
+glGenTextures(1, &texture);
+qemu_dmabuf_set_texture(dmabuf, texture);
 glBindTexture(GL_TEXTURE_2D, texture);
 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
@@ -368,7 +368,7 @@ void egl_dmabuf_release_texture(QemuDmaBuf *dmabuf)
 }
 
 glDeleteTextures(1, );
-dmabuf->texture = 0;
+qemu_dmabuf_set_texture(dmabuf, 0);
 }
 
 void egl_dmabuf_create_sync(QemuDmaBuf *dmabuf)
@@ -382,7 +382,7 @@ void egl_dmabuf_create_sync(QemuDmaBuf *dmabuf)
 sync = eglCreateSyncKHR(qemu_egl_display,
 EGL_SYNC_NATIVE_FENCE_ANDROID, NULL);
 if (sync != EGL_NO_SYNC_KHR) {
-dmabuf->sync = sync;
+qemu_dmabuf_set_sync(dmabuf, sync);
 }
 }
 }
@@ -390,12 +390,14 @@ void egl_dmabuf_create_sync(QemuDmaBuf *dmabuf)
 void egl_dmabuf_create_fence(QemuDmaBuf *dmabuf)
 {
 void *sync = qemu_dmabuf_get_sync(dmabuf);
+int fence_fd;
 
 if (sync) {
-dmabuf->fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
-  sync);
+fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
+  sync);
+qemu_dmabuf_set_fence_fd(dmabuf, fence_fd);
 eglDestroySyncKHR(qemu_egl_display, sync);
-dmabuf->sync = NULL;
+qemu_dmabuf_set_sync(dmabuf, NULL);
 }
 }
 
diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
index 7a45daefa1..ec0bf45482 100644
--- a/ui/gtk-egl.c
+++ b/ui/gtk-egl.c
@@ -87,7 +87,7 @@ void gd_egl_draw(VirtualConsole *vc)
 if (!qemu_dmabuf_get_draw_submitted(dmabuf)) {
 return;
 } else {
-dmabuf->draw_submitted = false;
+qemu_dmabuf_set_draw_submitted(dmabuf, false);
 }
 }
 #endif
@@ -381,7 +381,7 @@ void gd_egl_flush(DisplayChangeListener *dcl,
 if (vc->gfx.guest_fb.dmabuf &&
 !qemu_dmabuf_get_draw_submitted(vc->gfx.guest_fb.dmabuf)) {
 graphic_hw_gl_block(vc->gfx.dcl.con, true);
-vc->gfx.guest_fb.dmabuf->draw_submitted = true;
+qemu_dmabuf_set_draw_submitted(vc->gfx.guest_fb.dmabuf, true);
 gtk_egl_set_scanout_mode(vc, true);
 gtk_widget_queue_draw_area(area, x, y, w, h);
 return;
diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c
index 2d70280803..9a3f3d0d71 100644
--- a/ui/gtk-gl-area.c
+++ b/ui/gtk-gl-area.c
@@ -63,7 +63,7 @@ void gd_gl_area_draw(VirtualConsole *vc)
 if (!qemu_dmabuf_get_draw_submitted(dmabuf)) {
 return;
 } else {
-dmabuf->draw_submitted = false;
+qemu_dmabuf_set_draw_submitted(dmabuf, false);
 }
 }
 #endif
@@ -291,7 +291,7 @@ void gd_gl_area_scanout_flush(DisplayChangeListener *dcl,
 if (vc->gfx.guest_fb.dmabuf &&
 !qemu_dmabuf_get_draw_submitted(vc->gfx.guest_fb.dmabuf)) {
 graphic_hw_gl_block(vc->gfx.dcl.con, true);
-vc->gfx.guest_fb.dmabuf->draw_submitted = true;
+qemu_dmabuf_set_draw_submitted(vc->gfx.guest_fb.dmabuf, true);
 gtk_gl_area_set_scanout_mode(vc, true);
 }
 gtk_gl_area_queue_render(GTK_GL_AREA(vc->gfx.drawing_area));
diff --git a/ui/gtk.c b/ui/gtk.c
index 237c913b26..3a6832eb1b 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -598,11 +598,11 @@ void gd_hw_gl_flushed(void *vcon)
 QemuDmaBuf *dmabuf = vc->gfx.guest_fb.dmabuf;
 int fence_fd;
 
-if (dmabuf->fence_fd >= 0) {
-fence_fd = qemu_dmabuf_get_fence_fd(dmabuf);
+fence_fd = qemu_dmabuf_get_fence_fd(dmabuf);
+if (fence_fd >= 0) {
 qemu_set_fd_handler(fence_fd, NULL, NULL, NULL);
 close(fence_fd);
-dmabuf->fence_fd = -1;
+qemu_dmabuf_set_fence_fd(dmabuf, -1);
 graphic_hw_gl_block(vc->gfx.dcl.con, false);
 }
 }
-- 
2.34.1




[PATCH v12 2/6] ui/console: new dmabuf.h and dmabuf.c for QemuDmaBuf struct and helpers

2024-04-24 Thread dongwon . kim
From: Dongwon Kim 

New header and source files are added containing the QemuDmaBuf struct
definition and newly introduced helpers for creating/freeing the struct
and accessing its data.
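The shape of the new accessor-based API can be sketched as below. This is a reduced, illustrative model: the real QemuDmaBuf has many more fields (fd, stride, fourcc, modifier, ...), the real qemu_dmabuf_new() takes the full set of parameters, and the real implementation lives in ui/dmabuf.c; only the new()/get()/set()/free() pattern introduced by the patch is shown here.

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Reduced model of the struct; the real one carries many more fields. */
typedef struct QemuDmaBuf {
    uint32_t width;
    uint32_t height;
    int fence_fd;
    bool draw_submitted;
} QemuDmaBuf;

/* Simplified constructor; the real helper takes all field values. */
QemuDmaBuf *qemu_dmabuf_new(uint32_t width, uint32_t height)
{
    QemuDmaBuf *dmabuf = calloc(1, sizeof(*dmabuf));
    dmabuf->width = width;
    dmabuf->height = height;
    dmabuf->fence_fd = -1;   /* no fence attached yet */
    return dmabuf;
}

/* Getters replace direct reads of dmabuf->field ... */
uint32_t qemu_dmabuf_get_width(QemuDmaBuf *dmabuf)  { return dmabuf->width; }
uint32_t qemu_dmabuf_get_height(QemuDmaBuf *dmabuf) { return dmabuf->height; }

bool qemu_dmabuf_get_draw_submitted(QemuDmaBuf *dmabuf)
{
    return dmabuf->draw_submitted;
}

/* ... and setters replace direct writes. */
void qemu_dmabuf_set_draw_submitted(QemuDmaBuf *dmabuf, bool draw_submitted)
{
    dmabuf->draw_submitted = draw_submitted;
}

void qemu_dmabuf_free(QemuDmaBuf *dmabuf)
{
    if (!dmabuf) {
        return;
    }
    free(dmabuf);
}
```

With this in place, callers such as ui/gtk.c go through the helpers only, which is what later patches in the series rely on when the struct definition becomes private.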

v10: Change the license type for both dmabuf.h and dmabuf.c from MIT to
 GPL to be in line with QEMU's default license

v11: -- Added new helpers, qemu_dmabuf_close for closing dmabuf->fd,
qemu_dmabuf_dup_fd for duplicating dmabuf->fd
(Daniel P. Berrangé )

 -- Let qemu_dmabuf_free call qemu_dmabuf_close before freeing
the struct to make sure the fd is closed.
(Daniel P. Berrangé )

v12: Not closing fd in qemu_dmabuf_free because there are cases where the
 fd should still be available even after the struct is destroyed
 (e.g. virtio-gpu: res->dmabuf_fd).

Suggested-by: Marc-André Lureau 
Cc: Philippe Mathieu-Daudé 
Cc: Daniel P. Berrangé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 include/ui/console.h |  20 +
 include/ui/dmabuf.h  |  66 ++
 ui/dmabuf.c  | 210 +++
 ui/meson.build   |   1 +
 4 files changed, 278 insertions(+), 19 deletions(-)
 create mode 100644 include/ui/dmabuf.h
 create mode 100644 ui/dmabuf.c

diff --git a/include/ui/console.h b/include/ui/console.h
index 0bc7a00ac0..a208a68b88 100644
--- a/include/ui/console.h
+++ b/include/ui/console.h
@@ -7,6 +7,7 @@
 #include "qapi/qapi-types-ui.h"
 #include "ui/input.h"
 #include "ui/surface.h"
+#include "ui/dmabuf.h"
 
 #define TYPE_QEMU_CONSOLE "qemu-console"
 OBJECT_DECLARE_TYPE(QemuConsole, QemuConsoleClass, QEMU_CONSOLE)
@@ -185,25 +186,6 @@ struct QEMUGLParams {
 int minor_ver;
 };
 
-typedef struct QemuDmaBuf {
-int   fd;
-uint32_t  width;
-uint32_t  height;
-uint32_t  stride;
-uint32_t  fourcc;
-uint64_t  modifier;
-uint32_t  texture;
-uint32_t  x;
-uint32_t  y;
-uint32_t  backing_width;
-uint32_t  backing_height;
-bool  y0_top;
-void  *sync;
-int   fence_fd;
-bool  allow_fences;
-bool  draw_submitted;
-} QemuDmaBuf;
-
 enum display_scanout {
 SCANOUT_NONE,
 SCANOUT_SURFACE,
diff --git a/include/ui/dmabuf.h b/include/ui/dmabuf.h
new file mode 100644
index 00..4198cdf85a
--- /dev/null
+++ b/include/ui/dmabuf.h
@@ -0,0 +1,66 @@
+/*
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * QemuDmaBuf struct and helpers used for accessing its data
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef DMABUF_H
+#define DMABUF_H
+
+typedef struct QemuDmaBuf {
+int   fd;
+uint32_t  width;
+uint32_t  height;
+uint32_t  stride;
+uint32_t  fourcc;
+uint64_t  modifier;
+uint32_t  texture;
+uint32_t  x;
+uint32_t  y;
+uint32_t  backing_width;
+uint32_t  backing_height;
+bool  y0_top;
+void  *sync;
+int   fence_fd;
+bool  allow_fences;
+bool  draw_submitted;
+} QemuDmaBuf;
+
+QemuDmaBuf *qemu_dmabuf_new(uint32_t width, uint32_t height,
+uint32_t stride, uint32_t x,
+uint32_t y, uint32_t backing_width,
+uint32_t backing_height, uint32_t fourcc,
+uint64_t modifier, int dmabuf_fd,
+bool allow_fences, bool y0_top);
+void qemu_dmabuf_free(QemuDmaBuf *dmabuf);
+
+G_DEFINE_AUTOPTR_CLEANUP_FUNC(QemuDmaBuf, qemu_dmabuf_free);
+
+int qemu_dmabuf_get_fd(QemuDmaBuf *dmabuf);
+int qemu_dmabuf_dup_fd(QemuDmaBuf *dmabuf);
+void qemu_dmabuf_close(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_width(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_height(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_stride(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_fourcc(QemuDmaBuf *dmabuf);
+uint64_t qemu_dmabuf_get_modifier(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_texture(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_x(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_y(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_backing_width(QemuDmaBuf *dmabuf);
+uint32_t qemu_dmabuf_get_backing_height(QemuDmaBuf *dmabuf);
+bool qemu_dmabuf_get_y0_top(QemuDmaBuf *dmabuf);
+void *qemu_dmabuf_get_sync(QemuDmaBuf *dmabuf);
+int32_t qemu_dmabuf_get_fence_fd(QemuDmaBuf *dmabuf);
+bool qemu_dmabuf_get_allow_fences(QemuDmaBuf *dmabuf);
+bool qemu_dmabuf_get_draw_submitted(QemuDmaBuf *dmabuf);
+void qemu_dmabuf_set_texture(QemuDmaBuf *dmabuf, uint32_t texture);
+void qemu_dmabuf_set_fence_fd(QemuDmaBuf *dmabuf, int32_t fence_fd);
+void qemu_dmabuf_set_sync(QemuDmaBuf *dmabuf, void *sync);
+void qemu_dmabuf_set_draw_submitted(QemuDmaBuf *dmabuf, bool draw_submitted);
+void qemu_dmabuf_set_fd(QemuDmaBuf *dmabuf, int32_t fd);
+
+#endif
diff --git a/ui/dmabuf.c b/ui/dmabuf.c
new file mode 100644
index 00..e047d5ca26
--- /dev/null
+++ b/ui/dmabuf.c
@@ -0,0 +1,210 @@
+/*
+ * 

[PATCH v12 5/6] ui/console: Use qemu_dmabuf_new() and free() helpers instead

2024-04-24 Thread dongwon . kim
From: Dongwon Kim 

This commit introduces utility functions for the creation and deallocation
of QemuDmaBuf instances. Additionally, it updates all relevant sections
of the codebase to utilize these new utility functions.

v7: remove prefix, "dpy_gl_" from all helpers
qemu_dmabuf_free() returns without doing anything if input is null
(Daniel P. Berrangé )
call G_DEFINE_AUTOPTR_CLEANUP_FUNC for qemu_dmabuf_free()
(Daniel P. Berrangé )

v8: Introduction of helpers was removed as those were already added
by the previous commit

v9: set dmabuf->allow_fences to 'true' when dmabuf is created in
virtio_gpu_create_dmabuf()/virtio-gpu-udmabuf.c

removed unnecessary spaces that were accidentally added in the patch,
'ui/console: Use qemu_dmabuf_new() a...'

v11: Calling qemu_dmabuf_close was removed as closing dmabuf->fd will be
 done in qemu_dmabuf_free anyway.
 (Daniel P. Berrangé )

v12: --- Calling qemu_dmabuf_close separately as qemu_dmabuf_free doesn't
 do it.

 --- 'dmabuf' is now heap-allocated, so it should be freed at the end of
 dbus_scanout_texture

Suggested-by: Marc-André Lureau 
Cc: Philippe Mathieu-Daudé 
Cc: Daniel P. Berrangé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 include/hw/vfio/vfio-common.h   |  2 +-
 include/hw/virtio/virtio-gpu.h  |  4 ++--
 hw/display/vhost-user-gpu.c | 21 +++--
 hw/display/virtio-gpu-udmabuf.c | 24 +---
 hw/vfio/display.c   | 26 --
 ui/dbus-listener.c  | 29 +
 6 files changed, 48 insertions(+), 58 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..d66e27db02 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -148,7 +148,7 @@ typedef struct VFIOGroup {
 } VFIOGroup;
 
 typedef struct VFIODMABuf {
-QemuDmaBuf buf;
+QemuDmaBuf *buf;
 uint32_t pos_x, pos_y, pos_updates;
 uint32_t hot_x, hot_y, hot_updates;
 int dmabuf_id;
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index ed44cdad6b..56d6e821bf 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -169,7 +169,7 @@ struct VirtIOGPUBaseClass {
 DEFINE_PROP_UINT32("yres", _state, _conf.yres, 800)
 
 typedef struct VGPUDMABuf {
-QemuDmaBuf buf;
+QemuDmaBuf *buf;
 uint32_t scanout_id;
 QTAILQ_ENTRY(VGPUDMABuf) next;
 } VGPUDMABuf;
@@ -238,7 +238,7 @@ struct VhostUserGPU {
 VhostUserBackend *vhost;
 int vhost_gpu_fd; /* closed by the chardev */
 CharBackend vhost_chr;
-QemuDmaBuf dmabuf[VIRTIO_GPU_MAX_SCANOUTS];
+QemuDmaBuf *dmabuf[VIRTIO_GPU_MAX_SCANOUTS];
 bool backend_blocked;
 };
 
diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
index 454e5afcff..62e7b4376b 100644
--- a/hw/display/vhost-user-gpu.c
+++ b/hw/display/vhost-user-gpu.c
@@ -249,6 +249,7 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
VhostUserGpuMsg *msg)
 case VHOST_USER_GPU_DMABUF_SCANOUT: {
 VhostUserGpuDMABUFScanout *m = >payload.dmabuf_scanout;
 int fd = qemu_chr_fe_get_msgfd(>vhost_chr);
+uint64_t modifier = 0;
 QemuDmaBuf *dmabuf;
 
 if (m->scanout_id >= g->parent_obj.conf.max_outputs) {
@@ -261,27 +262,27 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
VhostUserGpuMsg *msg)
 
 g->parent_obj.enable = 1;
 con = g->parent_obj.scanout[m->scanout_id].con;
-dmabuf = >dmabuf[m->scanout_id];
+dmabuf = g->dmabuf[m->scanout_id];
 qemu_dmabuf_close(dmabuf);
 dpy_gl_release_dmabuf(con, dmabuf);
+g_clear_pointer(, qemu_dmabuf_free);
 if (fd == -1) {
 dpy_gl_scanout_disable(con);
 break;
 }
-*dmabuf = (QemuDmaBuf) {
-.fd = fd,
-.width = m->fd_width,
-.height = m->fd_height,
-.stride = m->fd_stride,
-.fourcc = m->fd_drm_fourcc,
-.y0_top = m->fd_flags & VIRTIO_GPU_RESOURCE_FLAG_Y_0_TOP,
-};
 if (msg->request == VHOST_USER_GPU_DMABUF_SCANOUT2) {
 VhostUserGpuDMABUFScanout2 *m2 = >payload.dmabuf_scanout2;
-dmabuf->modifier = m2->modifier;
+modifier = m2->modifier;
 }
 
+dmabuf = qemu_dmabuf_new(m->fd_width, m->fd_height,
+ m->fd_stride, 0, 0, 0, 0,
+ m->fd_drm_fourcc, modifier,
+ fd, false, m->fd_flags &
+ VIRTIO_GPU_RESOURCE_FLAG_Y_0_TOP);
+
 dpy_gl_scanout_dmabuf(con, dmabuf);
+g->dmabuf[m->scanout_id] = dmabuf;
 break;
 }
 case VHOST_USER_GPU_DMABUF_UPDATE: {
diff --git a/hw/display/virtio-gpu-udmabuf.c b/hw/display/virtio-gpu-udmabuf.c
index c90eba281e..c02ec6d37d 100644
--- a/hw/display/virtio-gpu-udmabuf.c
+++ 

[PATCH v12 6/6] ui/console: move QemuDmaBuf struct def to dmabuf.c

2024-04-24 Thread dongwon . kim
From: Dongwon Kim 

To complete the privatization of QemuDmaBuf, the struct definition is
moved to dmabuf.c.
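The resulting layout is the classic C opaque-pointer pattern: the header exposes only a forward-declared typedef, so callers can hold and pass a QemuDmaBuf* but cannot reach its fields, while the .c file holds the full definition. A minimal sketch of the pattern (field set reduced for illustration, demo function names are hypothetical):

```c
#include <stdint.h>
#include <stdlib.h>

/* What the public header exposes: only an opaque typedef.
 * Callers cannot write dmabuf->width; they must use the helpers. */
typedef struct QemuDmaBuf QemuDmaBuf;

/* What the .c file contains: the full definition, invisible to users
 * of the header. (Reduced here; the real struct has many more fields.) */
struct QemuDmaBuf {
    int fd;
    uint32_t width;
};

QemuDmaBuf *qemu_dmabuf_new_demo(uint32_t width, int fd)
{
    QemuDmaBuf *dmabuf = calloc(1, sizeof(*dmabuf));
    dmabuf->width = width;
    dmabuf->fd = fd;
    return dmabuf;
}

uint32_t qemu_dmabuf_get_width_demo(QemuDmaBuf *dmabuf)
{
    return dmabuf->width;
}

void qemu_dmabuf_free_demo(QemuDmaBuf *dmabuf)
{
    free(dmabuf);
}
```

The benefit is that the struct layout can change without recompiling or breaking any code outside dmabuf.c, and accidental direct field access becomes a compile error.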

Suggested-by: Marc-André Lureau 
Cc: Philippe Mathieu-Daudé 
Cc: Daniel P. Berrangé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 include/ui/dmabuf.h | 19 +--
 ui/dmabuf.c | 19 +++
 2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/include/ui/dmabuf.h b/include/ui/dmabuf.h
index 4198cdf85a..dc74ba895a 100644
--- a/include/ui/dmabuf.h
+++ b/include/ui/dmabuf.h
@@ -10,24 +10,7 @@
 #ifndef DMABUF_H
 #define DMABUF_H
 
-typedef struct QemuDmaBuf {
-int   fd;
-uint32_t  width;
-uint32_t  height;
-uint32_t  stride;
-uint32_t  fourcc;
-uint64_t  modifier;
-uint32_t  texture;
-uint32_t  x;
-uint32_t  y;
-uint32_t  backing_width;
-uint32_t  backing_height;
-bool  y0_top;
-void  *sync;
-int   fence_fd;
-bool  allow_fences;
-bool  draw_submitted;
-} QemuDmaBuf;
+typedef struct QemuDmaBuf QemuDmaBuf;
 
 QemuDmaBuf *qemu_dmabuf_new(uint32_t width, uint32_t height,
 uint32_t stride, uint32_t x,
diff --git a/ui/dmabuf.c b/ui/dmabuf.c
index e047d5ca26..df7a09703f 100644
--- a/ui/dmabuf.c
+++ b/ui/dmabuf.c
@@ -10,6 +10,25 @@
 #include "qemu/osdep.h"
 #include "ui/dmabuf.h"
 
+struct QemuDmaBuf {
+int   fd;
+uint32_t  width;
+uint32_t  height;
+uint32_t  stride;
+uint32_t  fourcc;
+uint64_t  modifier;
+uint32_t  texture;
+uint32_t  x;
+uint32_t  y;
+uint32_t  backing_width;
+uint32_t  backing_height;
+bool  y0_top;
+void  *sync;
+int   fence_fd;
+bool  allow_fences;
+bool  draw_submitted;
+};
+
 QemuDmaBuf *qemu_dmabuf_new(uint32_t width, uint32_t height,
 uint32_t stride, uint32_t x,
 uint32_t y, uint32_t backing_width,
-- 
2.34.1




[PATCH v12 0/6] ui/console: Private QemuDmaBuf struct

2024-04-24 Thread dongwon . kim
From: Dongwon Kim 

This series makes the QemuDmaBuf struct and its contained data private
to improve encapsulation. It accomplishes this by introducing helper
functions for allocating and deallocating the struct and for accessing
its individual fields, and by replacing all direct field references
with calls to these helpers throughout the codebase.

This change was made based on a suggestion from Marc-André Lureau


(Resubmitting the same patch series with this new cover letter)

v6: fixed some typos in patch -
ui/console: Introduce dpy_gl_qemu_dmabuf_get_..() helpers)

v7: included minor fix (ui/gtk: Check if fence_fd is equal to or greater than 0)
(Marc-André Lureau )

migrated all helpers and QemuDmaBuf struct into dmabuf.c and their 
prototypes
to dmabuf.h for better encapsulation (ui/dmabuf: New dmabuf.c and 
dmabuf.h..)
(Daniel P. Berrangé  and
 Marc-André Lureau )

removed 'dpy_gl' from all helpers' names
Defined autoptr clean up function for QemuDmaBuf*
(Daniel P. Berrangé )

Minor corrections

v8: Introduce new dmabuf.c and dmabuf.h and all helper functions in the second
patch in the series (ui/console: new dmabuf.h and dmabuf.c for QemuDma)
(Philippe Mathieu-Daudé )

v9: set dmabuf->allow_fences true when it is created in virtio-gpu-udmabuf

removed unnecessary spaces that were added in the patch,
'ui/console: Use qemu_dmabuf_new() a...'

v10: Change the license type for both dmabuf.h and dmabuf.c from MIT to
 GPL to be in line with QEMU's default license
 (Daniel P. Berrangé )

v11: New helpers added - qemu_dmabuf_dup_fd, qemu_dmabuf_close for duplicating
 and closing dmabuf->fd. And use them in places where applicable.
 (Daniel P. Berrangé )

 qemu_dmabuf_free helper now closes dmabuf->fd before freeing the struct to
 prevent any potential leakage (this eliminates the need for
 qemu_dmabuf_close in several places, as the close is done anyway).
 (Daniel P. Berrangé )

v12: --- qemu_dmabuf_free does not include qemu_dmabuf_close as there are cases
 where fd still needs to be used even after QemuDmaBuf struct is
 destroyed (virtio-gpu: res->dmabuf_fd)

 --- 'dmabuf' is now allocated space so it should be freed at the end of
 dbus_scanout_texture

Dongwon Kim (6):
  ui/gtk: Check if fence_fd is equal to or greater than 0
  ui/console: new dmabuf.h and dmabuf.c for QemuDmaBuf struct and
helpers
  ui/console: Use qemu_dmabuf_get_..() helpers instead
  ui/console: Use qemu_dmabuf_set_..() helpers instead
  ui/console: Use qemu_dmabuf_new() and free() helpers instead
  ui/console: move QemuDmaBuf struct def to dmabuf.c

 include/hw/vfio/vfio-common.h   |   2 +-
 include/hw/virtio/virtio-gpu.h  |   4 +-
 include/ui/console.h|  20 +--
 include/ui/dmabuf.h |  49 +++
 hw/display/vhost-user-gpu.c |  26 ++--
 hw/display/virtio-gpu-udmabuf.c |  27 ++--
 hw/vfio/display.c   |  32 ++---
 ui/console.c|   4 +-
 ui/dbus-console.c   |   9 +-
 ui/dbus-listener.c  |  72 +-
 ui/dmabuf.c | 229 
 ui/egl-headless.c   |  23 +++-
 ui/egl-helpers.c|  59 
 ui/gtk-egl.c|  52 +---
 ui/gtk-gl-area.c|  41 --
 ui/gtk.c|  12 +-
 ui/spice-display.c  |  50 ---
 ui/meson.build  |   1 +
 18 files changed, 519 insertions(+), 193 deletions(-)
 create mode 100644 include/ui/dmabuf.h
 create mode 100644 ui/dmabuf.c

-- 
2.34.1




[PATCH v12 3/6] ui/console: Use qemu_dmabuf_get_..() helpers instead

2024-04-24 Thread dongwon . kim
From: Dongwon Kim 

This commit updates all instances where fields within the QemuDmaBuf
struct are directly accessed, replacing them with calls to the helper
functions introduced in the previous commit.

v6: fix typos in helper names in ui/spice-display.c

v7: removed prefix, "dpy_gl_" from all helpers

v8: Introduction of helpers was removed as those were already added
by the previous commit

v11: -- Use new qemu_dmabuf_close() instead of close(qemu_dmabuf_get_fd()).
(Daniel P. Berrangé )
 -- Use new qemu_dmabuf_dup_fd() instead of dup(qemu_dmabuf_get_fd()).
(Daniel P. Berrangé )

Suggested-by: Marc-André Lureau 
Reviewed-by: Marc-André Lureau 
Cc: Philippe Mathieu-Daudé 
Cc: Daniel P. Berrangé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 hw/display/vhost-user-gpu.c |  5 +---
 hw/display/virtio-gpu-udmabuf.c |  7 +++--
 hw/vfio/display.c   | 12 +---
 ui/console.c|  4 +--
 ui/dbus-console.c   |  9 --
 ui/dbus-listener.c  | 43 +---
 ui/egl-headless.c   | 23 ++-
 ui/egl-helpers.c| 47 ++-
 ui/gtk-egl.c| 48 ---
 ui/gtk-gl-area.c| 37 
 ui/gtk.c|  6 ++--
 ui/spice-display.c  | 50 +++--
 12 files changed, 181 insertions(+), 110 deletions(-)

diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
index 709c8a02a1..454e5afcff 100644
--- a/hw/display/vhost-user-gpu.c
+++ b/hw/display/vhost-user-gpu.c
@@ -262,10 +262,7 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
VhostUserGpuMsg *msg)
 g->parent_obj.enable = 1;
 con = g->parent_obj.scanout[m->scanout_id].con;
 dmabuf = >dmabuf[m->scanout_id];
-if (dmabuf->fd >= 0) {
-close(dmabuf->fd);
-dmabuf->fd = -1;
-}
+qemu_dmabuf_close(dmabuf);
 dpy_gl_release_dmabuf(con, dmabuf);
 if (fd == -1) {
 dpy_gl_scanout_disable(con);
diff --git a/hw/display/virtio-gpu-udmabuf.c b/hw/display/virtio-gpu-udmabuf.c
index d51184d658..c90eba281e 100644
--- a/hw/display/virtio-gpu-udmabuf.c
+++ b/hw/display/virtio-gpu-udmabuf.c
@@ -206,6 +206,7 @@ int virtio_gpu_update_dmabuf(VirtIOGPU *g,
 {
 struct virtio_gpu_scanout *scanout = >parent_obj.scanout[scanout_id];
 VGPUDMABuf *new_primary, *old_primary = NULL;
+uint32_t width, height;
 
 new_primary = virtio_gpu_create_dmabuf(g, scanout_id, res, fb, r);
 if (!new_primary) {
@@ -216,10 +217,10 @@ int virtio_gpu_update_dmabuf(VirtIOGPU *g,
 old_primary = g->dmabuf.primary[scanout_id];
 }
 
+width = qemu_dmabuf_get_width(_primary->buf);
+height = qemu_dmabuf_get_height(_primary->buf);
 g->dmabuf.primary[scanout_id] = new_primary;
-qemu_console_resize(scanout->con,
-new_primary->buf.width,
-new_primary->buf.height);
+qemu_console_resize(scanout->con, width, height);
 dpy_gl_scanout_dmabuf(scanout->con, _primary->buf);
 
 if (old_primary) {
diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index 1aa440c663..7784502b53 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -260,8 +260,9 @@ static VFIODMABuf *vfio_display_get_dmabuf(VFIOPCIDevice 
*vdev,
 static void vfio_display_free_one_dmabuf(VFIODisplay *dpy, VFIODMABuf *dmabuf)
 {
 QTAILQ_REMOVE(>dmabuf.bufs, dmabuf, next);
+
+qemu_dmabuf_close(>buf);
 dpy_gl_release_dmabuf(dpy->con, >buf);
-close(dmabuf->buf.fd);
 g_free(dmabuf);
 }
 
@@ -286,6 +287,7 @@ static void vfio_display_dmabuf_update(void *opaque)
 VFIOPCIDevice *vdev = opaque;
 VFIODisplay *dpy = vdev->dpy;
 VFIODMABuf *primary, *cursor;
+uint32_t width, height;
 bool free_bufs = false, new_cursor = false;
 
 primary = vfio_display_get_dmabuf(vdev, DRM_PLANE_TYPE_PRIMARY);
@@ -296,10 +298,12 @@ static void vfio_display_dmabuf_update(void *opaque)
 return;
 }
 
+width = qemu_dmabuf_get_width(>buf);
+height = qemu_dmabuf_get_height(>buf);
+
 if (dpy->dmabuf.primary != primary) {
 dpy->dmabuf.primary = primary;
-qemu_console_resize(dpy->con,
-primary->buf.width, primary->buf.height);
+qemu_console_resize(dpy->con, width, height);
 dpy_gl_scanout_dmabuf(dpy->con, >buf);
 free_bufs = true;
 }
@@ -328,7 +332,7 @@ static void vfio_display_dmabuf_update(void *opaque)
 cursor->pos_updates = 0;
 }
 
-dpy_gl_update(dpy->con, 0, 0, primary->buf.width, primary->buf.height);
+dpy_gl_update(dpy->con, 0, 0, width, height);
 
 if (free_bufs) {
 vfio_display_free_dmabufs(vdev);
diff --git a/ui/console.c b/ui/console.c
index 43226c5c14..1b2cd0c736 100644
--- a/ui/console.c
+++ b/ui/console.c
@@ -1459,7 +1459,7 @@ int 

[PATCH v12 1/6] ui/gtk: Check if fence_fd is equal to or greater than 0

2024-04-24 Thread dongwon . kim
From: Dongwon Kim 

'fence_fd' always needs to be validated before being referenced,
and the passing condition should include '== 0', as 0 is a valid
value for a file descriptor.
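Why 0 must pass the check can be demonstrated directly: POSIX open() returns the lowest unused descriptor, so if descriptor 0 is free, a freshly created fence fd can legitimately be 0, and a 'fd > 0' test would wrongly reject it. A small sketch (fence_fd_valid is a hypothetical helper illustrating the corrected condition):

```c
#include <fcntl.h>
#include <stdbool.h>
#include <unistd.h>

/* The condition the patch fixes: a descriptor is valid iff it is >= 0. */
static bool fence_fd_valid(int fd)
{
    return fd >= 0;
}

/* POSIX allocates the lowest unused descriptor, so after closing fd 0
 * the next open() returns 0 -- a value that 'fd > 0' would reject. */
static int open_lowest_fd(void)
{
    close(0);
    return open("/dev/null", O_RDONLY);
}
```

The same reasoning applies to the dup()'d fence fds handled elsewhere in this series.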

Suggested-by: Marc-André Lureau 
Reviewed-by: Daniel P. Berrangé 
Cc: Philippe Mathieu-Daudé 
Cc: Daniel P. Berrangé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 ui/gtk-egl.c |  2 +-
 ui/gtk-gl-area.c |  2 +-
 ui/gtk.c | 10 ++
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
index 3af5ac5bcf..955234429d 100644
--- a/ui/gtk-egl.c
+++ b/ui/gtk-egl.c
@@ -99,7 +99,7 @@ void gd_egl_draw(VirtualConsole *vc)
 #ifdef CONFIG_GBM
 if (dmabuf) {
 egl_dmabuf_create_fence(dmabuf);
-if (dmabuf->fence_fd > 0) {
+if (dmabuf->fence_fd >= 0) {
 qemu_set_fd_handler(dmabuf->fence_fd, gd_hw_gl_flushed, NULL, 
vc);
 return;
 }
diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c
index 52dcac161e..7fffd0544e 100644
--- a/ui/gtk-gl-area.c
+++ b/ui/gtk-gl-area.c
@@ -86,7 +86,7 @@ void gd_gl_area_draw(VirtualConsole *vc)
 #ifdef CONFIG_GBM
 if (dmabuf) {
 egl_dmabuf_create_fence(dmabuf);
-if (dmabuf->fence_fd > 0) {
+if (dmabuf->fence_fd >= 0) {
 qemu_set_fd_handler(dmabuf->fence_fd, gd_hw_gl_flushed, NULL, 
vc);
 return;
 }
diff --git a/ui/gtk.c b/ui/gtk.c
index 810d7fc796..7819a86321 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -597,10 +597,12 @@ void gd_hw_gl_flushed(void *vcon)
 VirtualConsole *vc = vcon;
 QemuDmaBuf *dmabuf = vc->gfx.guest_fb.dmabuf;
 
-qemu_set_fd_handler(dmabuf->fence_fd, NULL, NULL, NULL);
-close(dmabuf->fence_fd);
-dmabuf->fence_fd = -1;
-graphic_hw_gl_block(vc->gfx.dcl.con, false);
+if (dmabuf->fence_fd >= 0) {
+qemu_set_fd_handler(dmabuf->fence_fd, NULL, NULL, NULL);
+close(dmabuf->fence_fd);
+dmabuf->fence_fd = -1;
+graphic_hw_gl_block(vc->gfx.dcl.con, false);
+}
 }
 
 /** DisplayState Callbacks (opengl version) **/
-- 
2.34.1




[PATCH 07/45] target/hppa: Add install_iaq_entries

2024-04-24 Thread Richard Henderson
Instead of two separate copy_iaoq_entry calls, use one call to update
both IAQ_Front and IAQ_Back.  Simplify with an argument combination
that automatically handles a simple increment from Front to Back.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 64 +
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index dfdcb3e23c..cad33e7aa6 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -616,6 +616,23 @@ static void copy_iaoq_entry(DisasContext *ctx, TCGv_i64 
dest,
 }
 }
 
+static void install_iaq_entries(DisasContext *ctx, uint64_t bi, TCGv_i64 bv,
+uint64_t ni, TCGv_i64 nv)
+{
+copy_iaoq_entry(ctx, cpu_iaoq_f, bi, bv);
+
+/* Allow ni variable, with nv null, to indicate a trivial advance. */
+if (ni != -1 || nv) {
+copy_iaoq_entry(ctx, cpu_iaoq_b, ni, nv);
+} else if (bi != -1) {
+copy_iaoq_entry(ctx, cpu_iaoq_b, bi + 4, NULL);
+} else {
+tcg_gen_addi_i64(cpu_iaoq_b, cpu_iaoq_f, 4);
+tcg_gen_andi_i64(cpu_iaoq_b, cpu_iaoq_b,
+ gva_offset_mask(ctx->tb_flags));
+}
+}
+
 static inline uint64_t iaoq_dest(DisasContext *ctx, int64_t disp)
 {
 return ctx->iaoq_f + disp + 8;
@@ -628,8 +645,7 @@ static void gen_excp_1(int exception)
 
 static void gen_excp(DisasContext *ctx, int exception)
 {
-copy_iaoq_entry(ctx, cpu_iaoq_f, ctx->iaoq_f, cpu_iaoq_f);
-copy_iaoq_entry(ctx, cpu_iaoq_b, ctx->iaoq_b, cpu_iaoq_b);
+install_iaq_entries(ctx, ctx->iaoq_f, cpu_iaoq_f, ctx->iaoq_b, cpu_iaoq_b);
 nullify_save(ctx);
 gen_excp_1(exception);
 ctx->base.is_jmp = DISAS_NORETURN;
@@ -683,12 +699,10 @@ static void gen_goto_tb(DisasContext *ctx, int which,
 {
 if (use_goto_tb(ctx, b, n)) {
 tcg_gen_goto_tb(which);
-copy_iaoq_entry(ctx, cpu_iaoq_f, b, NULL);
-copy_iaoq_entry(ctx, cpu_iaoq_b, n, NULL);
+install_iaq_entries(ctx, b, NULL, n, NULL);
 tcg_gen_exit_tb(ctx->base.tb, which);
 } else {
-copy_iaoq_entry(ctx, cpu_iaoq_f, b, cpu_iaoq_b);
-copy_iaoq_entry(ctx, cpu_iaoq_b, n, ctx->iaoq_n_var);
+install_iaq_entries(ctx, b, cpu_iaoq_b, n, ctx->iaoq_n_var);
 tcg_gen_lookup_and_goto_ptr();
 }
 }
@@ -1882,9 +1896,7 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 }
 if (is_n) {
 if (use_nullify_skip(ctx)) {
-copy_iaoq_entry(ctx, cpu_iaoq_f, -1, next);
-tcg_gen_addi_i64(next, next, 4);
-copy_iaoq_entry(ctx, cpu_iaoq_b, -1, next);
+install_iaq_entries(ctx, -1, next, -1, NULL);
 nullify_set(ctx, 0);
 ctx->base.is_jmp = DISAS_IAQ_N_UPDATED;
 return true;
@@ -1899,14 +1911,10 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 nullify_over(ctx);
 
 if (is_n && use_nullify_skip(ctx)) {
-copy_iaoq_entry(ctx, cpu_iaoq_f, -1, dest);
-next = tcg_temp_new_i64();
-tcg_gen_addi_i64(next, dest, 4);
-copy_iaoq_entry(ctx, cpu_iaoq_b, -1, next);
+install_iaq_entries(ctx, -1, dest, -1, NULL);
 nullify_set(ctx, 0);
 } else {
-copy_iaoq_entry(ctx, cpu_iaoq_f, ctx->iaoq_b, cpu_iaoq_b);
-copy_iaoq_entry(ctx, cpu_iaoq_b, -1, dest);
+install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, -1, dest);
 nullify_set(ctx, is_n);
 }
 if (link != 0) {
@@ -1997,9 +2005,7 @@ static void do_page_zero(DisasContext *ctx)
 tcg_gen_st_i64(cpu_gr[26], tcg_env, offsetof(CPUHPPAState, cr[27]));
 tmp = tcg_temp_new_i64();
 tcg_gen_ori_i64(tmp, cpu_gr[31], 3);
-copy_iaoq_entry(ctx, cpu_iaoq_f, -1, tmp);
-tcg_gen_addi_i64(tmp, tmp, 4);
-copy_iaoq_entry(ctx, cpu_iaoq_b, -1, tmp);
+install_iaq_entries(ctx, -1, tmp, -1, NULL);
 ctx->base.is_jmp = DISAS_IAQ_N_UPDATED;
 break;
 
@@ -2743,8 +2749,8 @@ static bool trans_or(DisasContext *ctx, arg_rrr_cf_d *a)
 nullify_over(ctx);
 
 /* Advance the instruction queue.  */
-copy_iaoq_entry(ctx, cpu_iaoq_f, ctx->iaoq_b, cpu_iaoq_b);
-copy_iaoq_entry(ctx, cpu_iaoq_b, ctx->iaoq_n, ctx->iaoq_n_var);
+install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b,
+ctx->iaoq_n, ctx->iaoq_n_var);
 nullify_set(ctx, 0);
 
 /* Tell the qemu main loop to halt until this cpu has work.  */
@@ -3897,18 +3903,15 @@ static bool trans_be(DisasContext *ctx, arg_be *a)
 tcg_gen_mov_i64(cpu_sr[0], cpu_iasq_b);
 }
 if (a->n && use_nullify_skip(ctx)) {
-copy_iaoq_entry(ctx, cpu_iaoq_f, -1, tmp);
-tcg_gen_addi_i64(tmp, tmp, 4);
-copy_iaoq_entry(ctx, cpu_iaoq_b, -1, tmp);
+install_iaq_entries(ctx, -1, tmp, -1, NULL);
 

[PATCH 36/45] target/hppa: Manage PSW_X and PSW_B in translator

2024-04-24 Thread Richard Henderson
PSW_X is cleared after every instruction, and only set by RFI.
PSW_B is cleared after every non-branch, or branch not taken,
and only set by taken branches.  We can clear both bits with a
single store, at most once per TB.  Taken branches set PSW_B,
at most once per TB.

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.c   | 10 ++---
 target/hppa/translate.c | 50 +
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index 003af63e20..5f0df0697a 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -50,7 +50,7 @@ static vaddr hppa_cpu_get_pc(CPUState *cs)
 void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
   uint64_t *pcsbase, uint32_t *pflags)
 {
-uint32_t flags = env->psw_n * PSW_N;
+uint32_t flags = 0;
 uint64_t cs_base = 0;
 
 /*
@@ -80,11 +80,14 @@ void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
 cs_base |= env->iaoq_b & ~TARGET_PAGE_MASK;
 }
 
+/* ??? E, T, H, L bits need to be here, when implemented.  */
+flags |= env->psw_n * PSW_N;
+flags |= env->psw_xb;
+flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P);
+
 #ifdef CONFIG_USER_ONLY
 flags |= TB_FLAG_UNALIGN * !env_cpu(env)->prctl_unalign_sigbus;
 #else
-/* ??? E, T, H, L, B bits need to be here, when implemented.  */
-flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P);
 if ((env->sr[4] == env->sr[5])
 & (env->sr[4] == env->sr[6])
 & (env->sr[4] == env->sr[7])) {
@@ -103,6 +106,7 @@ static void hppa_cpu_synchronize_from_tb(CPUState *cs,
 
 /* IAQ is always up-to-date before goto_tb. */
 cpu->env.psw_n = (tb->flags & PSW_N) != 0;
+cpu->env.psw_xb = tb->flags & (PSW_X | PSW_B);
 }
 
 static void hppa_restore_state_to_opc(CPUState *cs,
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index fb5bc12986..a49cf09518 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -83,7 +83,9 @@ typedef struct DisasContext {
 uint32_t tb_flags;
 int mmu_idx;
 int privilege;
+uint32_t psw_xb;
 bool psw_n_nonzero;
+bool psw_b_next;
 bool is_pa20;
 bool insn_start_updated;
 
@@ -262,6 +264,7 @@ static TCGv_i64 cpu_psw_n;
 static TCGv_i64 cpu_psw_v;
 static TCGv_i64 cpu_psw_cb;
 static TCGv_i64 cpu_psw_cb_msb;
+static TCGv_i32 cpu_psw_xb;
 
 void hppa_translate_init(void)
 {
@@ -314,6 +317,9 @@ void hppa_translate_init(void)
 *v->var = tcg_global_mem_new(tcg_env, v->ofs, v->name);
 }
 
+cpu_psw_xb = tcg_global_mem_new_i32(tcg_env,
+offsetof(CPUHPPAState, psw_xb),
+"psw_xb");
 cpu_iasq_f = tcg_global_mem_new_i64(tcg_env,
 offsetof(CPUHPPAState, iasq_f),
 "iasq_f");
@@ -508,6 +514,25 @@ static void load_spr(DisasContext *ctx, TCGv_i64 dest, 
unsigned reg)
 #endif
 }
 
+/*
+ * Write a value to psw_xb, bearing in mind the known value.
+ * To be used just before exiting the TB, so do not update the known value.
+ */
+static void store_psw_xb(DisasContext *ctx, uint32_t xb)
+{
+tcg_debug_assert(xb == 0 || xb == PSW_B);
+if (ctx->psw_xb != xb) {
+tcg_gen_movi_i32(cpu_psw_xb, xb);
+}
+}
+
+/* Write a value to psw_xb, and update the known value. */
+static void set_psw_xb(DisasContext *ctx, uint32_t xb)
+{
+store_psw_xb(ctx, xb);
+ctx->psw_xb = xb;
+}
+
 /* Skip over the implementation of an insn that has been nullified.
Use this when the insn is too complex for a conditional move.  */
 static void nullify_over(DisasContext *ctx)
@@ -575,6 +600,8 @@ static bool nullify_end(DisasContext *ctx)
 /* For NEXT, NORETURN, STALE, we can easily continue (or exit).
For UPDATED, we cannot update on the nullified path.  */
 assert(status != DISAS_IAQ_N_UPDATED);
+/* Taken branches are handled manually. */
+assert(!ctx->psw_b_next);
 
 if (likely(null_lab == NULL)) {
 /* The current insn wasn't conditional or handled the condition
@@ -1841,6 +1868,7 @@ static bool do_dbranch(DisasContext *ctx, int64_t disp,
 if (is_n) {
 if (use_nullify_skip(ctx)) {
 nullify_set(ctx, 0);
+store_psw_xb(ctx, 0);
 gen_goto_tb(ctx, 0, >iaq_j, NULL);
 ctx->base.is_jmp = DISAS_NORETURN;
 return true;
@@ -1848,20 +1876,24 @@ static bool do_dbranch(DisasContext *ctx, int64_t disp,
 ctx->null_cond.c = TCG_COND_ALWAYS;
 }
 ctx->iaq_n = >iaq_j;
+ctx->psw_b_next = true;
 } else {
 nullify_over(ctx);
 
 install_link(ctx, link, false);
 if (is_n && use_nullify_skip(ctx)) {
 nullify_set(ctx, 0);
+store_psw_xb(ctx, 0);
 gen_goto_tb(ctx, 0, >iaq_j, NULL);
 } else {
 

[PATCH 29/45] target/hppa: Use delay_excp for conditional traps

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/helper.h |  1 -
 target/hppa/int_helper.c |  2 +-
 target/hppa/op_helper.c  |  7 ---
 target/hppa/translate.c  | 41 ++--
 4 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/target/hppa/helper.h b/target/hppa/helper.h
index 5900fd70bc..3d0d143aed 100644
--- a/target/hppa/helper.h
+++ b/target/hppa/helper.h
@@ -1,6 +1,5 @@
 DEF_HELPER_2(excp, noreturn, env, int)
 DEF_HELPER_FLAGS_2(tsv, TCG_CALL_NO_WG, void, env, tl)
-DEF_HELPER_FLAGS_2(tcond, TCG_CALL_NO_WG, void, env, tl)
 
 DEF_HELPER_FLAGS_3(stby_b, TCG_CALL_NO_WG, void, env, tl, tl)
 DEF_HELPER_FLAGS_3(stby_b_parallel, TCG_CALL_NO_WG, void, env, tl, tl)
diff --git a/target/hppa/int_helper.c b/target/hppa/int_helper.c
index a667ee380d..1aa3e88ef1 100644
--- a/target/hppa/int_helper.c
+++ b/target/hppa/int_helper.c
@@ -134,13 +134,13 @@ void hppa_cpu_do_interrupt(CPUState *cs)
 switch (i) {
 case EXCP_ILL:
 case EXCP_BREAK:
+case EXCP_COND:
 case EXCP_PRIV_REG:
 case EXCP_PRIV_OPR:
 /* IIR set via translate.c.  */
 break;
 
 case EXCP_OVERFLOW:
-case EXCP_COND:
 case EXCP_ASSIST:
 case EXCP_DTLB_MISS:
 case EXCP_NA_ITLB_MISS:
diff --git a/target/hppa/op_helper.c b/target/hppa/op_helper.c
index 6cf49f33b7..a8b69fd481 100644
--- a/target/hppa/op_helper.c
+++ b/target/hppa/op_helper.c
@@ -49,13 +49,6 @@ void HELPER(tsv)(CPUHPPAState *env, target_ulong cond)
 }
 }
 
-void HELPER(tcond)(CPUHPPAState *env, target_ulong cond)
-{
-if (unlikely(cond)) {
-hppa_dynamic_excp(env, EXCP_COND, GETPC());
-}
-}
-
 static void atomic_store_mask32(CPUHPPAState *env, target_ulong addr,
 uint32_t val, uint32_t mask, uintptr_t ra)
 {
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 7a92901e18..080a52e5e4 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1115,6 +1115,25 @@ static TCGv_i64 do_sub_sv(DisasContext *ctx, TCGv_i64 
res,
 return sv;
 }
 
+static void gen_tc(DisasContext *ctx, DisasCond *cond)
+{
+DisasDelayException *e;
+
+switch (cond->c) {
+case TCG_COND_NEVER:
+break;
+case TCG_COND_ALWAYS:
+gen_excp_iir(ctx, EXCP_COND);
+break;
+default:
+e = delay_excp(ctx, EXCP_COND);
+tcg_gen_brcond_i64(cond->c, cond->a0, cond->a1, e->lab);
+/* In the non-trap path, the condition is known false. */
+*cond = cond_make_f();
+break;
+}
+}
+
 static void do_add(DisasContext *ctx, unsigned rt, TCGv_i64 orig_in1,
TCGv_i64 in2, unsigned shift, bool is_l,
bool is_tsv, bool is_tc, bool is_c, unsigned cf, bool d)
@@ -1173,9 +1192,7 @@ static void do_add(DisasContext *ctx, unsigned rt, TCGv_i64 orig_in1,
 /* Emit any conditional trap before any writeback.  */
 cond = do_cond(ctx, cf, d, dest, uv, sv);
 if (is_tc) {
-tmp = tcg_temp_new_i64();
-tcg_gen_setcond_i64(cond.c, tmp, cond.a0, cond.a1);
-gen_helper_tcond(tcg_env, tmp);
+gen_tc(ctx, );
 }
 
 /* Write back the result.  */
@@ -1194,6 +1211,10 @@ static bool do_add_reg(DisasContext *ctx, arg_rrr_cf_d_sh *a,
 {
 TCGv_i64 tcg_r1, tcg_r2;
 
+if (unlikely(is_tc && a->cf == 1)) {
+/* Unconditional trap on condition. */
+return gen_excp_iir(ctx, EXCP_COND);
+}
 if (a->cf) {
 nullify_over(ctx);
 }
@@ -1209,6 +1230,10 @@ static bool do_add_imm(DisasContext *ctx, arg_rri_cf *a,
 {
 TCGv_i64 tcg_im, tcg_r2;
 
+if (unlikely(is_tc && a->cf == 1)) {
+/* Unconditional trap on condition. */
+return gen_excp_iir(ctx, EXCP_COND);
+}
 if (a->cf) {
 nullify_over(ctx);
 }
@@ -1223,7 +1248,7 @@ static void do_sub(DisasContext *ctx, unsigned rt, TCGv_i64 in1,
TCGv_i64 in2, bool is_tsv, bool is_b,
bool is_tc, unsigned cf, bool d)
 {
-TCGv_i64 dest, sv, cb, cb_msb, tmp;
+TCGv_i64 dest, sv, cb, cb_msb;
 unsigned c = cf >> 1;
 DisasCond cond;
 
@@ -1271,9 +1296,7 @@ static void do_sub(DisasContext *ctx, unsigned rt, TCGv_i64 in1,
 
 /* Emit any conditional trap before any writeback.  */
 if (is_tc) {
-tmp = tcg_temp_new_i64();
-tcg_gen_setcond_i64(cond.c, tmp, cond.a0, cond.a1);
-gen_helper_tcond(tcg_env, tmp);
+gen_tc(ctx, );
 }
 
 /* Write back the result.  */
@@ -1439,9 +1462,7 @@ static void do_unit_addsub(DisasContext *ctx, unsigned rt, TCGv_i64 in1,
 }
 
 if (is_tc) {
-TCGv_i64 tmp = tcg_temp_new_i64();
-tcg_gen_setcond_i64(cond.c, tmp, cond.a0, cond.a1);
-gen_helper_tcond(tcg_env, tmp);
+gen_tc(ctx, );
 }
 save_gpr(ctx, rt, dest);
 
-- 
2.34.1




[PATCH 38/45] target/hppa: Implement PSW_X

2024-04-24 Thread Richard Henderson
Use PAGE_WRITE_INV to temporarily enable write permission
for a given page, driven by PSW_X being set.

Signed-off-by: Richard Henderson 
---
 target/hppa/mem_helper.c | 46 +++-
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c
index 84785b5a5c..5eca5e8a1e 100644
--- a/target/hppa/mem_helper.c
+++ b/target/hppa/mem_helper.c
@@ -295,30 +295,38 @@ int hppa_get_physical_address(CPUHPPAState *env, vaddr addr, int mmu_idx,
 goto egress;
 }
 
-/* In reverse priority order, check for conditions which raise faults.
-   As we go, remove PROT bits that cover the condition we want to check.
-   In this way, the resulting PROT will force a re-check of the
-   architectural TLB entry for the next access.  */
-if (unlikely(!ent->d)) {
-if (type & PAGE_WRITE) {
-/* The D bit is not set -- TLB Dirty Bit Fault.  */
-ret = EXCP_TLB_DIRTY;
-}
-prot &= PAGE_READ | PAGE_EXEC;
-}
-if (unlikely(ent->b)) {
-if (type & PAGE_WRITE) {
-/* The B bit is set -- Data Memory Break Fault.  */
-ret = EXCP_DMB;
-}
-prot &= PAGE_READ | PAGE_EXEC;
-}
+/*
+ * In priority order, check for conditions which raise faults.
+ * Remove PROT bits that cover the condition we want to check,
+ * so that the resulting PROT will force a re-check of the
+ * architectural TLB entry for the next access.
+ */
 if (unlikely(ent->t)) {
+prot &= PAGE_EXEC;
 if (!(type & PAGE_EXEC)) {
 /* The T bit is set -- Page Reference Fault.  */
 ret = EXCP_PAGE_REF;
 }
-prot &= PAGE_EXEC;
+} else if (!ent->d) {
+prot &= PAGE_READ | PAGE_EXEC;
+if (type & PAGE_WRITE) {
+/* The D bit is not set -- TLB Dirty Bit Fault.  */
+ret = EXCP_TLB_DIRTY;
+}
+} else if (unlikely(ent->b)) {
+prot &= PAGE_READ | PAGE_EXEC;
+if (type & PAGE_WRITE) {
+/*
+ * The B bit is set -- Data Memory Break Fault.
+ * Except when PSW_X is set, allow this single access to succeed.
+ * The write bit will be invalidated for subsequent accesses.
+ */
+if (env->psw_xb & PSW_X) {
+prot |= PAGE_WRITE_INV;
+} else {
+ret = EXCP_DMB;
+}
+}
 }
 
  egress:
-- 
2.34.1
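The reordered fault checks in the patch above can be sketched in isolation. The following is an illustrative model only, not QEMU's code: the constants, the TlbEntry struct, and the check_faults helper are invented for the example; only the priority order (T first, then D, then B) and the PSW_X single-access escape via PAGE_WRITE_INV mirror the patch.

```c
#include <stdbool.h>

/* Illustrative values only -- not QEMU's PAGE_* or EXCP_* constants. */
enum { PAGE_READ = 1, PAGE_WRITE = 2, PAGE_EXEC = 4, PAGE_WRITE_INV = 8 };
enum { EXCP_NONE = 0, EXCP_PAGE_REF, EXCP_TLB_DIRTY, EXCP_DMB };

typedef struct {
    bool t, d, b;               /* TLB entry T, D and B bits */
} TlbEntry;

/*
 * Check fault conditions in priority order, as in the patch:
 * T first, then D, then B.  Each branch strips the PROT bits that
 * cover its condition, so the next access re-checks the TLB entry.
 * With psw_x set, one write to a B-bit page is allowed through via
 * PAGE_WRITE_INV instead of raising EXCP_DMB.
 */
static int check_faults(const TlbEntry *ent, int type, bool psw_x, int *prot)
{
    if (ent->t) {
        *prot &= PAGE_EXEC;
        if (!(type & PAGE_EXEC)) {
            return EXCP_PAGE_REF;
        }
    } else if (!ent->d) {
        *prot &= PAGE_READ | PAGE_EXEC;
        if (type & PAGE_WRITE) {
            return EXCP_TLB_DIRTY;
        }
    } else if (ent->b) {
        *prot &= PAGE_READ | PAGE_EXEC;
        if (type & PAGE_WRITE) {
            if (psw_x) {
                *prot |= PAGE_WRITE_INV;   /* allow this single access */
            } else {
                return EXCP_DMB;
            }
        }
    }
    return EXCP_NONE;
}
```

Note how the else-if chain also encodes the priority change of the patch: once the T case is taken, the D and B cases are never examined.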




[PATCH 45/45] target/hppa: Log cpu state on return-from-interrupt

2024-04-24 Thread Richard Henderson
Inverse of the logging on taking an interrupt.

Signed-off-by: Richard Henderson 
---
 target/hppa/sys_helper.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/target/hppa/sys_helper.c b/target/hppa/sys_helper.c
index 22d6c89964..9b43b556fd 100644
--- a/target/hppa/sys_helper.c
+++ b/target/hppa/sys_helper.c
@@ -18,6 +18,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/log.h"
 #include "cpu.h"
 #include "exec/exec-all.h"
 #include "exec/helper-proto.h"
@@ -93,6 +94,17 @@ void HELPER(rfi)(CPUHPPAState *env)
 env->iaoq_b = env->cr_back[1];
 env->iasq_f = (env->cr[CR_IIASQ] << 32) & ~(env->iaoq_f & mask);
 env->iasq_b = (env->cr_back[0] << 32) & ~(env->iaoq_b & mask);
+
+if (qemu_loglevel_mask(CPU_LOG_INT)) {
+FILE *logfile = qemu_log_trylock();
+if (logfile) {
+CPUState *cs = env_cpu(env);
+
+fprintf(logfile, "RFI: cpu %d\n", cs->cpu_index);
+hppa_cpu_dump_state(cs, logfile, 0);
+qemu_log_unlock(logfile);
+}
+}
 }
 
 static void getshadowregs(CPUHPPAState *env)
-- 
2.34.1
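The trylock pattern in the patch above keeps the multi-line record consistent with respect to other log writers, and degrades to a no-op when logging is off. A minimal standalone sketch follows, with qemu_log_trylock/qemu_log_unlock modeled by plain FILE* stubs; log_file, log_trylock, log_unlock, and log_rfi are names invented for the example.

```c
#include <stdio.h>

/* Stand-in for the global log stream; the real qemu_log_trylock()
 * returns NULL when logging is disabled or the lock is unavailable. */
static FILE *log_file;

static FILE *log_trylock(void)
{
    return log_file;            /* NULL means: skip logging entirely */
}

static void log_unlock(FILE *f)
{
    fflush(f);                  /* release point; flush for the example */
}

/* Mirror of the RFI hook: take the log, emit one record, release. */
static void log_rfi(int cpu_index)
{
    FILE *f = log_trylock();
    if (f) {
        fprintf(f, "RFI: cpu %d\n", cpu_index);
        /* the CPU state dump would follow here, under the same lock */
        log_unlock(f);
    }
}
```

The early NULL check is what makes the hook safe to leave in a hot path: when logging is disabled there is no formatting work at all.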




[PATCH 28/45] target/hppa: Introduce DisasDelayException

2024-04-24 Thread Richard Henderson
Allow an exception to be emitted at the end of the TranslationBlock,
leaving only the conditional branch inline.  Use it for simple
exception instructions like break, which happen to be nullified.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 60 +
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 5714e2ad25..7a92901e18 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -50,6 +50,17 @@ typedef struct DisasIAQE {
 int64_t disp;
 } DisasIAQE;
 
+typedef struct DisasDelayException {
+struct DisasDelayException *next;
+TCGLabel *lab;
+uint32_t insn;
+bool set_iir;
+int8_t set_n;
+uint8_t excp;
+/* Saved state at parent insn. */
+DisasIAQE iaq_f, iaq_b;
+} DisasDelayException;
+
 typedef struct DisasContext {
 DisasContextBase base;
 CPUState *cs;
@@ -65,6 +76,7 @@ typedef struct DisasContext {
 DisasCond null_cond;
 TCGLabel *null_lab;
 
+DisasDelayException *delay_excp_list;
 TCGv_i64 zero;
 
 uint32_t insn;
@@ -682,13 +694,38 @@ static void gen_excp(DisasContext *ctx, int exception)
 ctx->base.is_jmp = DISAS_NORETURN;
 }
 
+static DisasDelayException *delay_excp(DisasContext *ctx, uint8_t excp)
+{
+DisasDelayException *e = tcg_malloc(sizeof(DisasDelayException));
+
+memset(e, 0, sizeof(*e));
+e->next = ctx->delay_excp_list;
+ctx->delay_excp_list = e;
+
+e->lab = gen_new_label();
+e->insn = ctx->insn;
+e->set_iir = true;
+e->set_n = ctx->psw_n_nonzero ? 0 : -1;
+e->excp = excp;
+e->iaq_f = ctx->iaq_f;
+e->iaq_b = ctx->iaq_b;
+
+return e;
+}
+
 static bool gen_excp_iir(DisasContext *ctx, int exc)
 {
-nullify_over(ctx);
-tcg_gen_st_i64(tcg_constant_i64(ctx->insn),
-   tcg_env, offsetof(CPUHPPAState, cr[CR_IIR]));
-gen_excp(ctx, exc);
-return nullify_end(ctx);
+if (ctx->null_cond.c == TCG_COND_NEVER) {
+tcg_gen_st_i64(tcg_constant_i64(ctx->insn),
+   tcg_env, offsetof(CPUHPPAState, cr[CR_IIR]));
+gen_excp(ctx, exc);
+} else {
+DisasDelayException *e = delay_excp(ctx, exc);
+tcg_gen_brcond_i64(tcg_invert_cond(ctx->null_cond.c),
+   ctx->null_cond.a0, ctx->null_cond.a1, e->lab);
+ctx->null_cond = cond_make_f();
+}
+return true;
 }
 
 static bool gen_illegal(DisasContext *ctx)
@@ -4695,6 +4732,19 @@ static void hppa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
 default:
 g_assert_not_reached();
 }
+
+for (DisasDelayException *e = ctx->delay_excp_list; e ; e = e->next) {
+gen_set_label(e->lab);
+if (e->set_n >= 0) {
+tcg_gen_movi_i64(cpu_psw_n, e->set_n);
+}
+if (e->set_iir) {
+tcg_gen_st_i64(tcg_constant_i64(e->insn), tcg_env,
+   offsetof(CPUHPPAState, cr[CR_IIR]));
+}
+install_iaq_entries(ctx, >iaq_f, >iaq_b);
+gen_excp_1(e->excp);
+}
 }
 
 static void hppa_tr_disas_log(const DisasContextBase *dcbase,
-- 
2.34.1
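The DisasDelayException pattern above is a general translator technique: record out-of-line work on a singly linked list while emitting the fast path, then walk the list once at the end of the block. The sketch below models only that list discipline; the Ctx struct, field names, and flush_delayed are invented stand-ins, and plain malloc replaces tcg_malloc.

```c
#include <stdlib.h>
#include <string.h>

/* Node for one deferred exception path; illustrative, not QEMU's type. */
typedef struct DelayedExcp {
    struct DelayedExcp *next;
    int label;                  /* stands in for the TCGLabel branched to */
    int excp;                   /* exception to raise out of line */
} DelayedExcp;

typedef struct {
    DelayedExcp *delay_excp_list;
    int next_label;
} Ctx;

/* Fast path: allocate a node, push it onto the per-block list. */
static DelayedExcp *delay_excp(Ctx *ctx, int excp)
{
    DelayedExcp *e = malloc(sizeof(*e));

    memset(e, 0, sizeof(*e));
    e->next = ctx->delay_excp_list;
    ctx->delay_excp_list = e;
    e->label = ctx->next_label++;
    e->excp = excp;
    return e;
}

/* Block end: visit every deferred node once; returns the count. */
static int flush_delayed(Ctx *ctx)
{
    int count = 0;

    for (DelayedExcp *e = ctx->delay_excp_list; e; e = e->next) {
        count++;    /* here the label would be set and the excp emitted */
    }
    return count;
}
```

Pushing at the head makes insertion O(1); emission order of the out-of-line stubs does not matter because each is only reachable through its own label.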




[PATCH 14/45] target/hppa: Add space argument to do_ibranch

2024-04-24 Thread Richard Henderson
This allows unification of BE, BLR, BV, BVE with a common helper.
Since we can now track the space with IAQ_Next, we can let the
TranslationBlock continue across the delay slot with BE, BVE.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 76 ++---
 1 file changed, 26 insertions(+), 50 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 6b3b298678..2ddaefde21 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1912,8 +1912,8 @@ static bool do_cbranch(DisasContext *ctx, int64_t disp, bool is_n,
 
 /* Emit an unconditional branch to an indirect target.  This handles
nullification of the branch itself.  */
-static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
-   unsigned link, bool is_n)
+static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest, TCGv_i64 dspc,
+   unsigned link, bool with_sr0, bool is_n)
 {
 TCGv_i64 next;
 
@@ -1921,10 +1921,10 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 next = tcg_temp_new_i64();
 tcg_gen_mov_i64(next, dest);
 
-install_link(ctx, link, false);
+install_link(ctx, link, with_sr0);
 if (is_n) {
 if (use_nullify_skip(ctx)) {
-install_iaq_entries(ctx, -1, next, NULL, -1, NULL, NULL);
+install_iaq_entries(ctx, -1, next, dspc, -1, NULL, NULL);
 nullify_set(ctx, 0);
 ctx->base.is_jmp = DISAS_IAQ_N_UPDATED;
 return true;
@@ -1933,6 +1933,7 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 }
 ctx->iaoq_n = -1;
 ctx->iaoq_n_var = next;
+ctx->iasq_n = dspc;
 return true;
 }
 
@@ -1941,13 +1942,13 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 next = tcg_temp_new_i64();
 tcg_gen_mov_i64(next, dest);
 
-install_link(ctx, link, false);
+install_link(ctx, link, with_sr0);
 if (is_n && use_nullify_skip(ctx)) {
-install_iaq_entries(ctx, -1, next, NULL, -1, NULL, NULL);
+install_iaq_entries(ctx, -1, next, dspc, -1, NULL, NULL);
 nullify_set(ctx, 0);
 } else {
 install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, ctx->iasq_b,
--1, next, NULL);
+-1, next, dspc);
 nullify_set(ctx, is_n);
 }
 
@@ -3914,33 +3915,18 @@ static bool trans_depi_sar(DisasContext *ctx, arg_depi_sar *a)
 
 static bool trans_be(DisasContext *ctx, arg_be *a)
 {
-TCGv_i64 tmp;
+TCGv_i64 dest = tcg_temp_new_i64();
+TCGv_i64 space = NULL;
 
-tmp = tcg_temp_new_i64();
-tcg_gen_addi_i64(tmp, load_gpr(ctx, a->b), a->disp);
-tmp = do_ibranch_priv(ctx, tmp);
+tcg_gen_addi_i64(dest, load_gpr(ctx, a->b), a->disp);
+dest = do_ibranch_priv(ctx, dest);
 
-#ifdef CONFIG_USER_ONLY
-return do_ibranch(ctx, tmp, a->l, a->n);
-#else
-TCGv_i64 new_spc = tcg_temp_new_i64();
-
-nullify_over(ctx);
-
-load_spr(ctx, new_spc, a->sp);
-install_link(ctx, a->l, true);
-if (a->n && use_nullify_skip(ctx)) {
-install_iaq_entries(ctx, -1, tmp, new_spc, -1, NULL, new_spc);
-nullify_set(ctx, 0);
-} else {
-install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, ctx->iasq_b,
--1, tmp, new_spc);
-nullify_set(ctx, a->n);
-}
-tcg_gen_lookup_and_goto_ptr();
-ctx->base.is_jmp = DISAS_NORETURN;
-return nullify_end(ctx);
+#ifndef CONFIG_USER_ONLY
+space = tcg_temp_new_i64();
+load_spr(ctx, space, a->sp);
 #endif
+
+return do_ibranch(ctx, dest, space, a->l, true, a->n);
 }
 
 static bool trans_bl(DisasContext *ctx, arg_bl *a)
@@ -4009,7 +3995,7 @@ static bool trans_blr(DisasContext *ctx, arg_blr *a)
 tcg_gen_shli_i64(tmp, load_gpr(ctx, a->x), 3);
 tcg_gen_addi_i64(tmp, tmp, ctx->iaoq_f + 8);
 /* The computation here never changes privilege level.  */
-return do_ibranch(ctx, tmp, a->l, a->n);
+return do_ibranch(ctx, tmp, NULL, a->l, false, a->n);
 } else {
 /* BLR R0,RX is a good way to load PC+8 into RX.  */
 return do_dbranch(ctx, 0, a->l, a->n);
@@ -4028,30 +4014,20 @@ static bool trans_bv(DisasContext *ctx, arg_bv *a)
 tcg_gen_add_i64(dest, dest, load_gpr(ctx, a->b));
 }
 dest = do_ibranch_priv(ctx, dest);
-return do_ibranch(ctx, dest, 0, a->n);
+return do_ibranch(ctx, dest, NULL, 0, false, a->n);
 }
 
 static bool trans_bve(DisasContext *ctx, arg_bve *a)
 {
-TCGv_i64 dest;
+TCGv_i64 b = load_gpr(ctx, a->b);
+TCGv_i64 dest = do_ibranch_priv(ctx, b);
+TCGv_i64 space = NULL;
 
-#ifdef CONFIG_USER_ONLY
-dest = do_ibranch_priv(ctx, load_gpr(ctx, a->b));
-return do_ibranch(ctx, dest, a->l, a->n);
-#else
-nullify_over(ctx);
-dest = tcg_temp_new_i64();
-tcg_gen_mov_i64(dest, load_gpr(ctx, a->b));
-dest = 

[PATCH 05/45] target/hppa: Allow prior nullification in do_ibranch

2024-04-24 Thread Richard Henderson
Simplify the function by not attempting a conditional move
on the branch destination -- just use nullify_over normally.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 73 +++--
 1 file changed, 20 insertions(+), 53 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index cbf78a4007..ceba7a98e5 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1870,17 +1870,15 @@ static bool do_cbranch(DisasContext *ctx, int64_t disp, bool is_n,
 static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
unsigned link, bool is_n)
 {
-TCGv_i64 a0, a1, next, tmp;
-TCGCond c;
+TCGv_i64 next;
 
-assert(ctx->null_lab == NULL);
+if (ctx->null_cond.c == TCG_COND_NEVER && ctx->null_lab == NULL) {
+next = tcg_temp_new_i64();
+tcg_gen_mov_i64(next, dest);
 
-if (ctx->null_cond.c == TCG_COND_NEVER) {
 if (link != 0) {
 copy_iaoq_entry(ctx, cpu_gr[link], ctx->iaoq_n, ctx->iaoq_n_var);
 }
-next = tcg_temp_new_i64();
-tcg_gen_mov_i64(next, dest);
 if (is_n) {
 if (use_nullify_skip(ctx)) {
 copy_iaoq_entry(ctx, cpu_iaoq_f, -1, next);
@@ -1894,60 +1892,29 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 }
 ctx->iaoq_n = -1;
 ctx->iaoq_n_var = next;
-} else if (is_n && use_nullify_skip(ctx)) {
-/* The (conditional) branch, B, nullifies the next insn, N,
-   and we're allowed to skip execution N (no single-step or
-   tracepoint in effect).  Since the goto_ptr that we must use
-   for the indirect branch consumes no special resources, we
-   can (conditionally) skip B and continue execution.  */
-/* The use_nullify_skip test implies we have a known control path.  */
-tcg_debug_assert(ctx->iaoq_b != -1);
-tcg_debug_assert(ctx->iaoq_n != -1);
+return true;
+}
 
-/* We do have to handle the non-local temporary, DEST, before
-   branching.  Since IOAQ_F is not really live at this point, we
-   can simply store DEST optimistically.  Similarly with IAOQ_B.  */
+nullify_over(ctx);
+
+if (is_n && use_nullify_skip(ctx)) {
 copy_iaoq_entry(ctx, cpu_iaoq_f, -1, dest);
 next = tcg_temp_new_i64();
 tcg_gen_addi_i64(next, dest, 4);
 copy_iaoq_entry(ctx, cpu_iaoq_b, -1, next);
-
-nullify_over(ctx);
-if (link != 0) {
-copy_iaoq_entry(ctx, cpu_gr[link], ctx->iaoq_n, ctx->iaoq_n_var);
-}
-tcg_gen_lookup_and_goto_ptr();
-return nullify_end(ctx);
+nullify_set(ctx, 0);
 } else {
-c = ctx->null_cond.c;
-a0 = ctx->null_cond.a0;
-a1 = ctx->null_cond.a1;
-
-tmp = tcg_temp_new_i64();
-next = tcg_temp_new_i64();
-
-copy_iaoq_entry(ctx, tmp, ctx->iaoq_n, ctx->iaoq_n_var);
-tcg_gen_movcond_i64(c, next, a0, a1, tmp, dest);
-ctx->iaoq_n = -1;
-ctx->iaoq_n_var = next;
-
-if (link != 0) {
-tcg_gen_movcond_i64(c, cpu_gr[link], a0, a1, cpu_gr[link], tmp);
-}
-
-if (is_n) {
-/* The branch nullifies the next insn, which means the state of N
-   after the branch is the inverse of the state of N that applied
-   to the branch.  */
-tcg_gen_setcond_i64(tcg_invert_cond(c), cpu_psw_n, a0, a1);
-cond_free(>null_cond);
-ctx->null_cond = cond_make_n();
-ctx->psw_n_nonzero = true;
-} else {
-cond_free(>null_cond);
-}
+copy_iaoq_entry(ctx, cpu_iaoq_f, ctx->iaoq_b, cpu_iaoq_b);
+copy_iaoq_entry(ctx, cpu_iaoq_b, -1, dest);
+nullify_set(ctx, is_n);
 }
-return true;
+if (link != 0) {
+copy_iaoq_entry(ctx, cpu_gr[link], ctx->iaoq_n, ctx->iaoq_n_var);
+}
+
+tcg_gen_lookup_and_goto_ptr();
+ctx->base.is_jmp = DISAS_NORETURN;
+return nullify_end(ctx);
 }
 
 /* Implement
-- 
2.34.1




[PATCH 35/45] target/hppa: Split PSW X and B into their own field

2024-04-24 Thread Richard Henderson
Generally, both of these bits are cleared at the end of each
instruction.  By separating these, we will be able to clear
both with a single insn, instead of 2 or 3.

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.h| 3 ++-
 target/hppa/helper.c | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 66cae795bd..629299653d 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -214,7 +214,8 @@ typedef struct CPUArchState {
 uint64_t fr[32];
 uint64_t sr[8];  /* stored shifted into place for gva */
 
-target_ulong psw;/* All psw bits except the following:  */
+uint32_t psw;/* All psw bits except the following:  */
+uint32_t psw_xb; /* X and B, in their normal positions */
 target_ulong psw_n;  /* boolean */
 target_long psw_v;   /* in most significant bit */
 
diff --git a/target/hppa/helper.c b/target/hppa/helper.c
index 7d22c248fb..b79ddd8184 100644
--- a/target/hppa/helper.c
+++ b/target/hppa/helper.c
@@ -54,7 +54,7 @@ target_ulong cpu_hppa_get_psw(CPUHPPAState *env)
 
 psw |= env->psw_n * PSW_N;
 psw |= (env->psw_v < 0) * PSW_V;
-psw |= env->psw;
+psw |= env->psw | env->psw_xb;
 
 return psw;
 }
@@ -76,8 +76,8 @@ void cpu_hppa_put_psw(CPUHPPAState *env, target_ulong psw)
 }
 psw &= ~reserved;
 
-env->psw = psw & (uint32_t)~(PSW_N | PSW_V | PSW_CB);
-
+env->psw = psw & (uint32_t)~(PSW_B | PSW_N | PSW_V | PSW_X | PSW_CB);
+env->psw_xb = psw & (PSW_X | PSW_B);
 env->psw_n = (psw / PSW_N) & 1;
 env->psw_v = -((psw / PSW_V) & 1);
 
-- 
2.34.1
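The split in the patch above is a round-trippable repacking: X and B stay in their architectural positions inside psw_xb, so get/put need no shifting and the end-of-insn path can clear both with one store. A self-contained sketch of that invariant, with illustrative bit positions and a PswState struct invented for the example (only the pack/unpack shape follows the patch):

```c
#include <stdint.h>

/* Illustrative bit values only -- not the architectural PSW layout. */
enum { PSW_X = 1u << 4, PSW_B = 1u << 5, PSW_V = 1u << 17, PSW_N = 1u << 21 };

typedef struct {
    uint32_t psw;       /* everything except the split-out bits */
    uint32_t psw_xb;    /* X and B together, in their normal positions */
    uint32_t psw_n;     /* boolean */
    int32_t  psw_v;     /* V carried in the sign bit */
} PswState;

/* Split an architectural PSW into the per-field representation. */
static void put_psw(PswState *s, uint32_t psw)
{
    s->psw = psw & ~(PSW_B | PSW_N | PSW_V | PSW_X);
    s->psw_xb = psw & (PSW_X | PSW_B);
    s->psw_n = (psw / PSW_N) & 1;
    s->psw_v = -(int32_t)((psw / PSW_V) & 1);
}

/* Reassemble the architectural view; no shifts needed for X and B
 * because they were stored in place. */
static uint32_t get_psw(const PswState *s)
{
    return s->psw | s->psw_xb | (s->psw_n * PSW_N) | ((s->psw_v < 0) * PSW_V);
}
```

With this layout, `s.psw_xb = 0;` is the single-insn clear the commit message describes.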




[PATCH 43/45] target/hppa: Implement PSW_H, PSW_L

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.c   |  4 +--
 target/hppa/translate.c | 68 +
 2 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index 42c413211a..5adbe0fe9c 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -76,10 +76,10 @@ void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
 cs_base |= env->iaoq_b & ~TARGET_PAGE_MASK;
 }
 
-/* ??? E, H, L bits need to be here, when implemented.  */
+/* ??? E bits need to be here, when implemented.  */
 flags |= env->psw_n * PSW_N;
 flags |= env->psw_xb;
-flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P | PSW_T);
+flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_H | PSW_L | PSW_P | PSW_T);
 
 #ifdef CONFIG_USER_ONLY
 flags |= TB_FLAG_UNALIGN * !env_cpu(env)->prctl_unalign_sigbus;
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 7ad7aa675d..4126995604 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -55,6 +55,7 @@ typedef struct DisasDelayException {
 TCGLabel *lab;
 uint32_t insn;
 bool set_iir;
+bool set_b;
 int8_t set_n;
 uint8_t excp;
 /* Saved state at parent insn. */
@@ -744,6 +745,7 @@ static DisasDelayException *delay_excp(DisasContext *ctx, uint8_t excp)
 e->insn = ctx->insn;
 e->set_iir = true;
 e->set_n = ctx->psw_n_nonzero ? 0 : -1;
+e->set_b = false;
 e->excp = excp;
 e->iaq_f = ctx->iaq_f;
 e->iaq_b = ctx->iaq_b;
@@ -1872,6 +1874,54 @@ static bool do_fop_dedd(DisasContext *ctx, unsigned rt,
 return nullify_end(ctx);
 }
 
+/*
+ * Since B,GATE can only increase priv, and other indirect branches can
+ * only decrease priv, we only need to test in one direction.
+ * If maybe_priv == 0, no priv is possible with the current insn;
+ * if maybe_priv < 0, priv might increase, otherwise priv might decrease.
+ */
+static void do_priv_branch_trap(DisasContext *ctx, int maybe_priv,
+DisasIAQE *next, bool n)
+{
+DisasDelayException *e;
+uint32_t psw_bit, excp;
+TCGv_i64 new_priv;
+TCGCond cond;
+
+if (likely(maybe_priv == 0)) {
+return;
+}
+if (maybe_priv < 0) {
+psw_bit = PSW_H;
+excp = EXCP_HPT;
+cond = TCG_COND_LTU;
+} else {
+psw_bit = PSW_L;
+excp = EXCP_LPT;
+cond = TCG_COND_GTU;
+}
+if (likely(!(ctx->tb_flags & psw_bit))) {
+return;
+}
+
+e = tcg_malloc(sizeof(DisasDelayException));
+memset(e, 0, sizeof(*e));
+e->next = ctx->delay_excp_list;
+ctx->delay_excp_list = e;
+
+e->lab = gen_new_label();
+e->set_n = n ? 1 : ctx->psw_n_nonzero ? 0 : -1;
+e->set_b = ctx->psw_xb != PSW_B;
+e->excp = excp;
+e->iaq_f = ctx->iaq_b;
+e->iaq_b = *next;
+
+new_priv = tcg_temp_new_i64();
+copy_iaoq_entry(ctx, new_priv, next);
+tcg_gen_andi_i64(new_priv, new_priv, 3);
+tcg_gen_brcondi_i64(cond, new_priv, ctx->privilege, e->lab);
+}
+
 static bool do_taken_branch_trap(DisasContext *ctx, DisasIAQE *next, bool n)
 {
 if (unlikely(ctx->tb_flags & PSW_T)) {
@@ -2009,10 +2059,12 @@ static bool do_cbranch(DisasContext *ctx, int64_t disp, bool is_n,
  * This handles nullification of the branch itself.
  */
 static bool do_ibranch(DisasContext *ctx, unsigned link,
-   bool with_sr0, bool is_n)
+   bool with_sr0, bool is_n, int maybe_priv)
 {
 if (ctx->null_cond.c == TCG_COND_NEVER && ctx->null_lab == NULL) {
 install_link(ctx, link, with_sr0);
+
+do_priv_branch_trap(ctx, maybe_priv, >iaq_j, is_n);
 if (do_taken_branch_trap(ctx, >iaq_j, is_n)) {
 return true;
 }
@@ -2033,6 +2085,7 @@ static bool do_ibranch(DisasContext *ctx, unsigned link,
 nullify_over(ctx);
 install_link(ctx, link, with_sr0);
 
+do_priv_branch_trap(ctx, maybe_priv, >iaq_j, is_n);
 if (!do_taken_branch_trap(ctx, >iaq_j, is_n)) {
 if (is_n && use_nullify_skip(ctx)) {
 install_iaq_entries(ctx, >iaq_j, NULL);
@@ -3993,7 +4046,7 @@ static bool trans_be(DisasContext *ctx, arg_be *a)
 tcg_gen_addi_i64(ctx->iaq_j.base, load_gpr(ctx, a->b), a->disp);
 ctx->iaq_j.base = do_ibranch_priv(ctx, ctx->iaq_j.base);
 
-return do_ibranch(ctx, a->l, true, a->n);
+return do_ibranch(ctx, a->l, true, a->n, ctx->privilege == 3 ? 0 : 1);
 }
 
 static bool trans_bl(DisasContext *ctx, arg_bl *a)
@@ -4042,7 +4095,7 @@ static bool trans_b_gate(DisasContext *ctx, arg_b_gate *a)
 }
 
 if (indirect) {
-return do_ibranch(ctx, 0, false, a->n);
+return do_ibranch(ctx, 0, false, a->n, -1);
 }
 return do_dbranch(ctx, disp, 0, a->n);
 }
@@ -4060,7 +4113,7 @@ static bool trans_blr(DisasContext *ctx, arg_blr *a)
 tcg_gen_add_i64(t0, t0, t1);
 
 ctx->iaq_j = iaqe_next_absv(ctx, t0);
-return do_ibranch(ctx, 

[PATCH 11/45] target/hppa: Simplify TB end

2024-04-24 Thread Richard Henderson
Minimize the amount of code in hppa_tr_translate_insn that advances the
insn queue to the next insn.  Move the goto_tb path to hppa_tr_tb_stop.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 109 +---
 1 file changed, 57 insertions(+), 52 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 6a73b1d409..138250b550 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -4698,54 +4698,31 @@ static void hppa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
 }
 }
 
-/* Advance the insn queue.  Note that this check also detects
-   a priority change within the instruction queue.  */
-if (ret == DISAS_NEXT && ctx->iaoq_b != ctx->iaoq_f + 4) {
-if (use_goto_tb(ctx, ctx->iaoq_b, ctx->iaoq_n)
-&& (ctx->null_cond.c == TCG_COND_NEVER
-|| ctx->null_cond.c == TCG_COND_ALWAYS)) {
-nullify_set(ctx, ctx->null_cond.c == TCG_COND_ALWAYS);
-gen_goto_tb(ctx, 0, ctx->iaoq_b, ctx->iaoq_n);
-ctx->base.is_jmp = ret = DISAS_NORETURN;
-} else {
-ctx->base.is_jmp = ret = DISAS_IAQ_N_STALE;
-}
+/* If the TranslationBlock must end, do so. */
+ctx->base.pc_next += 4;
+if (ret != DISAS_NEXT) {
+return;
 }
+/* Note this also detects a priority change. */
+if (ctx->iaoq_b != ctx->iaoq_f + 4) {
+ctx->base.is_jmp = DISAS_IAQ_N_STALE;
+return;
+}
+
+/*
+ * Advance the insn queue.
+ * The only exit now is DISAS_TOO_MANY from the translator loop.
+ */
 ctx->iaoq_f = ctx->iaoq_b;
 ctx->iaoq_b = ctx->iaoq_n;
-ctx->base.pc_next += 4;
-
-switch (ret) {
-case DISAS_NORETURN:
-case DISAS_IAQ_N_UPDATED:
-break;
-
-case DISAS_NEXT:
-case DISAS_IAQ_N_STALE:
-case DISAS_IAQ_N_STALE_EXIT:
-if (ctx->iaoq_f == -1) {
-install_iaq_entries(ctx, -1, cpu_iaoq_b,
-ctx->iaoq_n, ctx->iaoq_n_var);
-#ifndef CONFIG_USER_ONLY
-tcg_gen_mov_i64(cpu_iasq_f, cpu_iasq_b);
-#endif
-nullify_save(ctx);
-ctx->base.is_jmp = (ret == DISAS_IAQ_N_STALE_EXIT
-? DISAS_EXIT
-: DISAS_IAQ_N_UPDATED);
-} else if (ctx->iaoq_b == -1) {
-if (ctx->iaoq_n_var) {
-copy_iaoq_entry(ctx, cpu_iaoq_b, -1, ctx->iaoq_n_var);
-} else {
-tcg_gen_addi_i64(cpu_iaoq_b, cpu_iaoq_b, 4);
-tcg_gen_andi_i64(cpu_iaoq_b, cpu_iaoq_b,
- gva_offset_mask(ctx->tb_flags));
-}
+if (ctx->iaoq_b == -1) {
+if (ctx->iaoq_n_var) {
+copy_iaoq_entry(ctx, cpu_iaoq_b, -1, ctx->iaoq_n_var);
+} else {
+tcg_gen_addi_i64(cpu_iaoq_b, cpu_iaoq_b, 4);
+tcg_gen_andi_i64(cpu_iaoq_b, cpu_iaoq_b,
+ gva_offset_mask(ctx->tb_flags));
 }
-break;
-
-default:
-g_assert_not_reached();
 }
 }
 
@@ -4753,23 +4730,51 @@ static void hppa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
 {
 DisasContext *ctx = container_of(dcbase, DisasContext, base);
 DisasJumpType is_jmp = ctx->base.is_jmp;
+uint64_t fi, bi;
+TCGv_i64 fv, bv;
+TCGv_i64 fs;
+
+/* Assume the insn queue has not been advanced. */
+fi = ctx->iaoq_b;
+fv = cpu_iaoq_b;
+fs = fi == -1 ? cpu_iasq_b : NULL;
+bi = ctx->iaoq_n;
+bv = ctx->iaoq_n_var;
 
 switch (is_jmp) {
 case DISAS_NORETURN:
 break;
 case DISAS_TOO_MANY:
-case DISAS_IAQ_N_STALE:
-case DISAS_IAQ_N_STALE_EXIT:
-install_iaq_entries(ctx, ctx->iaoq_f, cpu_iaoq_f,
-ctx->iaoq_b, cpu_iaoq_b);
-nullify_save(ctx);
+/* The insn queue has not been advanced. */
+bi = fi;
+bv = fv;
+fi = ctx->iaoq_f;
+fv = NULL;
+fs = NULL;
 /* FALLTHRU */
-case DISAS_IAQ_N_UPDATED:
-if (is_jmp != DISAS_IAQ_N_STALE_EXIT) {
-tcg_gen_lookup_and_goto_ptr();
+case DISAS_IAQ_N_STALE:
+if (use_goto_tb(ctx, fi, bi)
+&& (ctx->null_cond.c == TCG_COND_NEVER
+|| ctx->null_cond.c == TCG_COND_ALWAYS)) {
+nullify_set(ctx, ctx->null_cond.c == TCG_COND_ALWAYS);
+gen_goto_tb(ctx, 0, fi, bi);
 break;
 }
 /* FALLTHRU */
+case DISAS_IAQ_N_STALE_EXIT:
+install_iaq_entries(ctx, fi, fv, bi, bv);
+if (fs) {
+tcg_gen_mov_i64(cpu_iasq_f, fs);
+}
+nullify_save(ctx);
+if (is_jmp == DISAS_IAQ_N_STALE_EXIT) {
+tcg_gen_exit_tb(NULL, 0);
+break;
+}
+/* FALLTHRU */
+case DISAS_IAQ_N_UPDATED:
+tcg_gen_lookup_and_goto_ptr();
+break;
 case DISAS_EXIT:
 

[PATCH 41/45] target/hppa: Implement CF_PCREL

2024-04-24 Thread Richard Henderson
Now that the groundwork has been laid, enabling CF_PCREL within the
translator proper is a simple matter of updating copy_iaoq_entry
and install_iaq_entries.

We also need to modify the unwind info, since we no longer have
absolute addresses to install.

As expected, this reduces the runtime overhead of compilation when
running a Linux kernel with address space randomization enabled.

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.c   | 19 ++--
 target/hppa/translate.c | 68 -
 2 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index 5f0df0697a..b3f3f070d3 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -62,10 +62,6 @@ void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
 *pc = hppa_cpu_get_pc(env_cpu(env));
 flags |= (env->iaoq_f & 3) << TB_FLAG_PRIV_SHIFT;
 
-if (hppa_is_pa20(env)) {
-cs_base = env->iaoq_f & MAKE_64BIT_MASK(32, 32);
-}
-
 /*
  * The only really interesting case is if IAQ_Back is on the same page
  * as IAQ_Front, so that we can use goto_tb between the blocks.  In all
@@ -113,19 +109,19 @@ static void hppa_restore_state_to_opc(CPUState *cs,
   const TranslationBlock *tb,
   const uint64_t *data)
 {
-HPPACPU *cpu = HPPA_CPU(cs);
+CPUHPPAState *env = cpu_env(cs);
 
-cpu->env.iaoq_f = data[0];
-if (data[1] != (target_ulong)-1) {
-cpu->env.iaoq_b = data[1];
+env->iaoq_f = (env->iaoq_f & TARGET_PAGE_MASK) | data[0];
+if (data[1] != INT32_MIN) {
+env->iaoq_b = env->iaoq_f + data[1];
 }
-cpu->env.unwind_breg = data[2];
+env->unwind_breg = data[2];
 /*
  * Since we were executing the instruction at IAOQ_F, and took some
  * sort of action that provoked the cpu_restore_state, we can infer
  * that the instruction was not nullified.
  */
-cpu->env.psw_n = 0;
+env->psw_n = 0;
 }
 
 static bool hppa_cpu_has_work(CPUState *cs)
@@ -191,6 +187,9 @@ static void hppa_cpu_realizefn(DeviceState *dev, Error **errp)
 hppa_ptlbe(>env);
 }
 #endif
+
+/* Use pc-relative instructions always to simplify the translator. */
+cs->tcg_cflags |= CF_PCREL;
 }
 
 static void hppa_cpu_initfn(Object *obj)
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 3ae196490a..b2cc81c685 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -46,7 +46,7 @@ typedef struct DisasIAQE {
 TCGv_i64 space;
 /* IAOQ base; may be null for relative address. */
 TCGv_i64 base;
-/* IAOQ addend; if base is null, relative to ctx->iaoq_first. */
+/* IAOQ addend; if base is null, relative to cpu_iaoq_f. */
 int64_t disp;
 } DisasIAQE;
 
@@ -663,11 +663,7 @@ static DisasIAQE iaqe_next_absv(DisasContext *ctx, TCGv_i64 var)
 static void copy_iaoq_entry(DisasContext *ctx, TCGv_i64 dest,
 const DisasIAQE *src)
 {
-if (src->base == NULL) {
-tcg_gen_movi_i64(dest, ctx->iaoq_first + src->disp);
-} else {
-tcg_gen_addi_i64(dest, src->base, src->disp);
-}
+tcg_gen_addi_i64(dest, src->base ? : cpu_iaoq_f, src->disp);
 }
 
 static void install_iaq_entries(DisasContext *ctx, const DisasIAQE *f,
@@ -679,8 +675,28 @@ static void install_iaq_entries(DisasContext *ctx, const DisasIAQE *f,
 b_next = iaqe_incr(f, 4);
 b = _next;
 }
-copy_iaoq_entry(ctx, cpu_iaoq_f, f);
-copy_iaoq_entry(ctx, cpu_iaoq_b, b);
+
+/*
+ * There is an edge case
+ *bv   r0(rN)
+ *b,l  disp,r0
+ * for which F will use cpu_iaoq_b (from the indirect branch),
+ * and B will use cpu_iaoq_f (from the direct branch).
+ * In this case we need an extra temporary.
+ */
+if (f->base != cpu_iaoq_b) {
+copy_iaoq_entry(ctx, cpu_iaoq_b, b);
+copy_iaoq_entry(ctx, cpu_iaoq_f, f);
+} else if (f->base == b->base) {
+copy_iaoq_entry(ctx, cpu_iaoq_f, f);
+tcg_gen_addi_i64(cpu_iaoq_b, cpu_iaoq_f, b->disp - f->disp);
+} else {
+TCGv_i64 tmp = tcg_temp_new_i64();
+copy_iaoq_entry(ctx, tmp, b);
+copy_iaoq_entry(ctx, cpu_iaoq_f, f);
+tcg_gen_mov_i64(cpu_iaoq_b, tmp);
+}
+
 if (f->space) {
 tcg_gen_mov_i64(cpu_iasq_f, f->space);
 }
@@ -3978,9 +3994,8 @@ static bool trans_b_gate(DisasContext *ctx, arg_b_gate *a)
 /* Adjust the dest offset for the privilege change from the PTE. */
 TCGv_i64 off = tcg_temp_new_i64();
 
-gen_helper_b_gate_priv(off, tcg_env,
-   tcg_constant_i64(ctx->iaoq_first
-+ ctx->iaq_f.disp));
+copy_iaoq_entry(ctx, off, >iaq_f);
+gen_helper_b_gate_priv(off, tcg_env, off);
 
 ctx->iaq_j.base = off;
 ctx->iaq_j.disp = disp + 8;
@@ -4601,7 +4616,7 @@ static bool 

[PATCH 04/45] target/hppa: Pass displacement to do_dbranch

2024-04-24 Thread Richard Henderson
Pass a displacement instead of an absolute value.

In trans_be, remove the user-only do_dbranch case.  The branch we are
attempting to optimize is to the zero page, which is perforce on a
different page than the code currently executing, which means that
we will *not* use a goto_tb.  Use a plain indirect branch instead,
which is what we got out of the attempted direct branch anyway.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 33 +
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index cb874e1c1e..cbf78a4007 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1765,9 +1765,11 @@ static bool do_fop_dedd(DisasContext *ctx, unsigned rt,
 
 /* Emit an unconditional branch to a direct target, which may or may not
have already had nullification handled.  */
-static bool do_dbranch(DisasContext *ctx, uint64_t dest,
+static bool do_dbranch(DisasContext *ctx, int64_t disp,
unsigned link, bool is_n)
 {
+uint64_t dest = iaoq_dest(ctx, disp);
+
 if (ctx->null_cond.c == TCG_COND_NEVER && ctx->null_lab == NULL) {
 if (link != 0) {
 copy_iaoq_entry(ctx, cpu_gr[link], ctx->iaoq_n, ctx->iaoq_n_var);
@@ -1814,10 +1816,7 @@ static bool do_cbranch(DisasContext *ctx, int64_t disp, bool is_n,
 
 /* Handle TRUE and NEVER as direct branches.  */
 if (c == TCG_COND_ALWAYS) {
-return do_dbranch(ctx, dest, 0, is_n && disp >= 0);
-}
-if (c == TCG_COND_NEVER) {
-return do_dbranch(ctx, ctx->iaoq_n, 0, is_n && disp < 0);
+return do_dbranch(ctx, disp, 0, is_n && disp >= 0);
 }
 
 taken = gen_new_label();
@@ -3913,22 +3912,6 @@ static bool trans_be(DisasContext *ctx, arg_be *a)
 {
 TCGv_i64 tmp;
 
-#ifdef CONFIG_USER_ONLY
-/* ??? It seems like there should be a good way of using
-   "be disp(sr2, r0)", the canonical gateway entry mechanism
-   to our advantage.  But that appears to be inconvenient to
-   manage along side branch delay slots.  Therefore we handle
-   entry into the gateway page via absolute address.  */
-/* Since we don't implement spaces, just branch.  Do notice the special
-   case of "be disp(*,r0)" using a direct branch to disp, so that we can
-   goto_tb to the TB containing the syscall.  */
-if (a->b == 0) {
-return do_dbranch(ctx, a->disp, a->l, a->n);
-}
-#else
-nullify_over(ctx);
-#endif
-
 tmp = tcg_temp_new_i64();
 tcg_gen_addi_i64(tmp, load_gpr(ctx, a->b), a->disp);
 tmp = do_ibranch_priv(ctx, tmp);
@@ -3938,6 +3921,8 @@ static bool trans_be(DisasContext *ctx, arg_be *a)
 #else
 TCGv_i64 new_spc = tcg_temp_new_i64();
 
+nullify_over(ctx);
+
 load_spr(ctx, new_spc, a->sp);
 if (a->l) {
 copy_iaoq_entry(ctx, cpu_gr[31], ctx->iaoq_n, ctx->iaoq_n_var);
@@ -3967,7 +3952,7 @@ static bool trans_be(DisasContext *ctx, arg_be *a)
 
 static bool trans_bl(DisasContext *ctx, arg_bl *a)
 {
-return do_dbranch(ctx, iaoq_dest(ctx, a->disp), a->l, a->n);
+return do_dbranch(ctx, a->disp, a->l, a->n);
 }
 
 static bool trans_b_gate(DisasContext *ctx, arg_b_gate *a)
@@ -4021,7 +4006,7 @@ static bool trans_b_gate(DisasContext *ctx, arg_b_gate *a)
 save_gpr(ctx, a->l, tmp);
 }
 
-return do_dbranch(ctx, dest, 0, a->n);
+return do_dbranch(ctx, dest - iaoq_dest(ctx, 0), 0, a->n);
 }
 
 static bool trans_blr(DisasContext *ctx, arg_blr *a)
@@ -4034,7 +4019,7 @@ static bool trans_blr(DisasContext *ctx, arg_blr *a)
 return do_ibranch(ctx, tmp, a->l, a->n);
 } else {
 /* BLR R0,RX is a good way to load PC+8 into RX.  */
-return do_dbranch(ctx, ctx->iaoq_f + 8, a->l, a->n);
+return do_dbranch(ctx, 0, a->l, a->n);
 }
 }
 
-- 
2.34.1




[PATCH 10/45] target/hppa: Skip nullified insns in unconditional dbranch path

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index ac181180a6..6a73b1d409 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1804,11 +1804,17 @@ static bool do_dbranch(DisasContext *ctx, int64_t disp,
 
 if (ctx->null_cond.c == TCG_COND_NEVER && ctx->null_lab == NULL) {
 install_link(ctx, link, false);
-ctx->iaoq_n = dest;
-ctx->iaoq_n_var = NULL;
 if (is_n) {
+if (use_nullify_skip(ctx)) {
+nullify_set(ctx, 0);
+gen_goto_tb(ctx, 0, dest, dest + 4);
+ctx->base.is_jmp = DISAS_NORETURN;
+return true;
+}
 ctx->null_cond.c = TCG_COND_ALWAYS;
 }
+ctx->iaoq_n = dest;
+ctx->iaoq_n_var = NULL;
 } else {
 nullify_over(ctx);
 
-- 
2.34.1




[PATCH 08/45] target/hppa: Add install_link

2024-04-24 Thread Richard Henderson
Add a common routine for writing the return address.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 54 +++--
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index cad33e7aa6..195a0e7e79 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -633,6 +633,23 @@ static void install_iaq_entries(DisasContext *ctx, uint64_t bi, TCGv_i64 bv,
 }
 }
 
+static void install_link(DisasContext *ctx, unsigned link, bool with_sr0)
+{
+tcg_debug_assert(ctx->null_cond.c == TCG_COND_NEVER);
+if (link) {
+if (ctx->iaoq_b == -1) {
+tcg_gen_addi_i64(cpu_gr[link], cpu_iaoq_b, 4);
+} else {
+tcg_gen_movi_i64(cpu_gr[link], ctx->iaoq_b + 4);
+}
+#ifndef CONFIG_USER_ONLY
+if (with_sr0) {
+tcg_gen_mov_i64(cpu_sr[0], cpu_iasq_b);
+}
+#endif
+}
+}
+
 static inline uint64_t iaoq_dest(DisasContext *ctx, int64_t disp)
 {
 return ctx->iaoq_f + disp + 8;
@@ -1786,9 +1803,7 @@ static bool do_dbranch(DisasContext *ctx, int64_t disp,
 uint64_t dest = iaoq_dest(ctx, disp);
 
 if (ctx->null_cond.c == TCG_COND_NEVER && ctx->null_lab == NULL) {
-if (link != 0) {
-copy_iaoq_entry(ctx, cpu_gr[link], ctx->iaoq_n, ctx->iaoq_n_var);
-}
+install_link(ctx, link, false);
 ctx->iaoq_n = dest;
 if (is_n) {
 ctx->null_cond.c = TCG_COND_ALWAYS;
@@ -1796,10 +1811,7 @@ static bool do_dbranch(DisasContext *ctx, int64_t disp,
 } else {
 nullify_over(ctx);
 
-if (link != 0) {
-copy_iaoq_entry(ctx, cpu_gr[link], ctx->iaoq_n, ctx->iaoq_n_var);
-}
-
+install_link(ctx, link, false);
 if (is_n && use_nullify_skip(ctx)) {
 nullify_set(ctx, 0);
 gen_goto_tb(ctx, 0, dest, dest + 4);
@@ -1891,9 +1903,7 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 next = tcg_temp_new_i64();
 tcg_gen_mov_i64(next, dest);
 
-if (link != 0) {
-copy_iaoq_entry(ctx, cpu_gr[link], ctx->iaoq_n, ctx->iaoq_n_var);
-}
+install_link(ctx, link, false);
 if (is_n) {
 if (use_nullify_skip(ctx)) {
 install_iaq_entries(ctx, -1, next, -1, NULL);
@@ -1910,16 +1920,17 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 
 nullify_over(ctx);
 
+next = tcg_temp_new_i64();
+tcg_gen_mov_i64(next, dest);
+
+install_link(ctx, link, false);
 if (is_n && use_nullify_skip(ctx)) {
-install_iaq_entries(ctx, -1, dest, -1, NULL);
+install_iaq_entries(ctx, -1, next, -1, NULL);
 nullify_set(ctx, 0);
 } else {
-install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, -1, dest);
+install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, -1, next);
 nullify_set(ctx, is_n);
 }
-if (link != 0) {
-copy_iaoq_entry(ctx, cpu_gr[link], ctx->iaoq_n, ctx->iaoq_n_var);
-}
 
 tcg_gen_lookup_and_goto_ptr();
 ctx->base.is_jmp = DISAS_NORETURN;
@@ -3898,10 +3909,7 @@ static bool trans_be(DisasContext *ctx, arg_be *a)
 nullify_over(ctx);
 
 load_spr(ctx, new_spc, a->sp);
-if (a->l) {
-copy_iaoq_entry(ctx, cpu_gr[31], ctx->iaoq_n, ctx->iaoq_n_var);
-tcg_gen_mov_i64(cpu_sr[0], cpu_iasq_b);
-}
+install_link(ctx, a->l, true);
 if (a->n && use_nullify_skip(ctx)) {
 install_iaq_entries(ctx, -1, tmp, -1, NULL);
 tcg_gen_mov_i64(cpu_iasq_f, new_spc);
@@ -4018,16 +4026,16 @@ static bool trans_bve(DisasContext *ctx, arg_bve *a)
 return do_ibranch(ctx, dest, a->l, a->n);
 #else
 nullify_over(ctx);
-dest = do_ibranch_priv(ctx, load_gpr(ctx, a->b));
+dest = tcg_temp_new_i64();
+tcg_gen_mov_i64(dest, load_gpr(ctx, a->b));
+dest = do_ibranch_priv(ctx, dest);
 
+install_link(ctx, a->l, false);
 install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, -1, dest);
 if (ctx->iaoq_b == -1) {
 tcg_gen_mov_i64(cpu_iasq_f, cpu_iasq_b);
 }
 tcg_gen_mov_i64(cpu_iasq_b, space_select(ctx, 0, dest));
-if (a->l) {
-copy_iaoq_entry(ctx, cpu_gr[a->l], ctx->iaoq_n, ctx->iaoq_n_var);
-}
 nullify_set(ctx, a->n);
 tcg_gen_lookup_and_goto_ptr();
 ctx->base.is_jmp = DISAS_NORETURN;
-- 
2.34.1




[PATCH 15/45] target/hppa: Use umax in do_ibranch_priv

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 2ddaefde21..7e01c21141 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1980,7 +1980,7 @@ static TCGv_i64 do_ibranch_priv(DisasContext *ctx, TCGv_i64 offset)
 dest = tcg_temp_new_i64();
 tcg_gen_andi_i64(dest, offset, -4);
 tcg_gen_ori_i64(dest, dest, ctx->privilege);
-tcg_gen_movcond_i64(TCG_COND_GTU, dest, dest, offset, dest, offset);
+tcg_gen_umax_i64(dest, dest, offset);
 break;
 }
 return dest;
-- 
2.34.1




[PATCH 37/45] target/hppa: Implement PSW_B

2024-04-24 Thread Richard Henderson
PSW_B causes B,GATE to trap as an illegal instruction, removing
the sequential execution test that was merely an approximation.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index a49cf09518..a4200742bd 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -2060,11 +2060,8 @@ static void do_page_zero(DisasContext *ctx)
 g_assert_not_reached();
 }
 
-/* Check that we didn't arrive here via some means that allowed
-   non-sequential instruction execution.  Normally the PSW[B] bit
-   detects this by disallowing the B,GATE instruction to execute
-   under such conditions.  */
-if (iaqe_variable(&ctx->iaq_b) || ctx->iaq_b.disp != 4) {
+/* If PSW[B] is set, the B,GATE insn would trap. */
+if (ctx->psw_xb & PSW_B) {
 goto do_sigill;
 }
 
@@ -3963,23 +3960,13 @@ static bool trans_b_gate(DisasContext *ctx, arg_b_gate *a)
 {
 int64_t disp = a->disp;
 
-nullify_over(ctx);
-
-/* Make sure the caller hasn't done something weird with the queue.
- * ??? This is not quite the same as the PSW[B] bit, which would be
- * expensive to track.  Real hardware will trap for
- *b  gateway
- *b  gateway+4  (in delay slot of first branch)
- * However, checking for a non-sequential instruction queue *will*
- * diagnose the security hole
- *b  gateway
- *b  evil
- * in which instructions at evil would run with increased privs.
- */
-if (iaqe_variable(&ctx->iaq_b) || ctx->iaq_b.disp != ctx->iaq_f.disp + 4) {
+/* Trap if PSW[B] is set. */
+if (ctx->psw_xb & PSW_B) {
 return gen_illegal(ctx);
 }
 
+nullify_over(ctx);
+
 #ifndef CONFIG_USER_ONLY
 if (ctx->tb_flags & PSW_C) {
 int type = hppa_artype_for_page(cpu_env(ctx->cs), ctx->base.pc_next);
-- 
2.34.1




[PATCH 26/45] target/hppa: Use TCG_COND_TST* in trans_ftest

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 22 ++
 1 file changed, 6 insertions(+), 16 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 4b9092b1cf..b1311e7688 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -4309,6 +4309,8 @@ static bool trans_fcmp_d(DisasContext *ctx, arg_fclass2 *a)
 
 static bool trans_ftest(DisasContext *ctx, arg_ftest *a)
 {
+TCGCond tc = TCG_COND_TSTNE;
+uint32_t mask;
 TCGv_i64 t;
 
 nullify_over(ctx);
@@ -4317,21 +4319,18 @@ static bool trans_ftest(DisasContext *ctx, arg_ftest *a)
 tcg_gen_ld32u_i64(t, tcg_env, offsetof(CPUHPPAState, fr0_shadow));
 
 if (a->y == 1) {
-int mask;
-bool inv = false;
-
 switch (a->c) {
 case 0: /* simple */
 mask = R_FPSR_C_MASK;
 break;
 case 2: /* rej */
-inv = true;
+tc = TCG_COND_TSTEQ;
 /* fallthru */
 case 1: /* acc */
 mask = R_FPSR_C_MASK | R_FPSR_CQ_MASK;
 break;
 case 6: /* rej8 */
-inv = true;
+tc = TCG_COND_TSTEQ;
 /* fallthru */
 case 5: /* acc8 */
 mask = R_FPSR_C_MASK | R_FPSR_CQ0_6_MASK;
@@ -4349,21 +4348,12 @@ static bool trans_ftest(DisasContext *ctx, arg_ftest *a)
 gen_illegal(ctx);
 return true;
 }
-if (inv) {
-TCGv_i64 c = tcg_constant_i64(mask);
-tcg_gen_or_i64(t, t, c);
-ctx->null_cond = cond_make_tt(TCG_COND_EQ, t, c);
-} else {
-tcg_gen_andi_i64(t, t, mask);
-ctx->null_cond = cond_make_ti(TCG_COND_EQ, t, 0);
-}
 } else {
 unsigned cbit = (a->y ^ 1) - 1;
-
-tcg_gen_extract_i64(t, t, R_FPSR_CA0_SHIFT - cbit, 1);
-ctx->null_cond = cond_make_ti(TCG_COND_NE, t, 0);
+mask = R_FPSR_CA0_MASK >> cbit;
 }
 
+ctx->null_cond = cond_make_ti(tc, t, mask);
 return nullify_end(ctx);
 }
 
-- 
2.34.1




[PATCH 25/45] target/hppa: Use registerfields.h for FPSR

2024-04-24 Thread Richard Henderson
Define all of the context dependent field definitions.
Use FIELD_EX32 and FIELD_DP32 with named fields instead
of extract32 and deposit32 with raw constants.

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.h| 25 +
 target/hppa/fpu_helper.c | 26 +-
 target/hppa/translate.c  | 18 --
 3 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 01dc8781a5..c0da9e9af6 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -24,6 +24,7 @@
 #include "exec/cpu-defs.h"
 #include "qemu/cpu-float.h"
 #include "qemu/interval-tree.h"
+#include "hw/registerfields.h"
 
 /* PA-RISC 1.x processors have a strong memory model.  */
 /* ??? While we do not yet implement PA-RISC 2.0, those processors have
@@ -158,6 +159,30 @@
 #define CR_IPSW  22
 #define CR_EIRR  23
 
+FIELD(FPSR, ENA_I, 0, 1)
+FIELD(FPSR, ENA_U, 1, 1)
+FIELD(FPSR, ENA_O, 2, 1)
+FIELD(FPSR, ENA_Z, 3, 1)
+FIELD(FPSR, ENA_V, 4, 1)
+FIELD(FPSR, ENABLES, 0, 5)
+FIELD(FPSR, D, 5, 1)
+FIELD(FPSR, T, 6, 1)
+FIELD(FPSR, RM, 9, 2)
+FIELD(FPSR, CQ, 11, 11)
+FIELD(FPSR, CQ0_6, 15, 7)
+FIELD(FPSR, CQ0_4, 17, 5)
+FIELD(FPSR, CQ0_2, 19, 3)
+FIELD(FPSR, CQ0, 21, 1)
+FIELD(FPSR, CA, 15, 7)
+FIELD(FPSR, CA0, 21, 1)
+FIELD(FPSR, C, 26, 1)
+FIELD(FPSR, FLG_I, 27, 1)
+FIELD(FPSR, FLG_U, 28, 1)
+FIELD(FPSR, FLG_O, 29, 1)
+FIELD(FPSR, FLG_Z, 30, 1)
+FIELD(FPSR, FLG_V, 31, 1)
+FIELD(FPSR, FLAGS, 27, 5)
+
 typedef struct HPPATLBEntry {
 union {
 IntervalTreeNode itree;
diff --git a/target/hppa/fpu_helper.c b/target/hppa/fpu_helper.c
index 576f283b04..deaed2b65d 100644
--- a/target/hppa/fpu_helper.c
+++ b/target/hppa/fpu_helper.c
@@ -30,7 +30,7 @@ void HELPER(loaded_fr0)(CPUHPPAState *env)
 
 env->fr0_shadow = shadow;
 
-switch (extract32(shadow, 9, 2)) {
+switch (FIELD_EX32(shadow, FPSR, RM)) {
 default:
 rm = float_round_nearest_even;
 break;
@@ -46,7 +46,7 @@ void HELPER(loaded_fr0)(CPUHPPAState *env)
 }
 set_float_rounding_mode(rm, &env->fp_status);
 
-d = extract32(shadow, 5, 1);
+d = FIELD_EX32(shadow, FPSR, D);
 set_flush_to_zero(d, &env->fp_status);
 set_flush_inputs_to_zero(d, &env->fp_status);
 }
@@ -57,7 +57,7 @@ void cpu_hppa_loaded_fr0(CPUHPPAState *env)
 }
 
 #define CONVERT_BIT(X, SRC, DST)\
-((SRC) > (DST)  \
+((unsigned)(SRC) > (unsigned)(DST)  \
  ? (X) / ((SRC) / (DST)) & (DST)\
  : ((X) & (SRC)) * ((DST) / (SRC)))
 
@@ -73,12 +73,12 @@ static void update_fr0_op(CPUHPPAState *env, uintptr_t ra)
 }
 set_float_exception_flags(0, &env->fp_status);
 
-hard_exp |= CONVERT_BIT(soft_exp, float_flag_inexact,   1u << 0);
-hard_exp |= CONVERT_BIT(soft_exp, float_flag_underflow, 1u << 1);
-hard_exp |= CONVERT_BIT(soft_exp, float_flag_overflow,  1u << 2);
-hard_exp |= CONVERT_BIT(soft_exp, float_flag_divbyzero, 1u << 3);
-hard_exp |= CONVERT_BIT(soft_exp, float_flag_invalid,   1u << 4);
-shadow |= hard_exp << (32 - 5);
+hard_exp |= CONVERT_BIT(soft_exp, float_flag_inexact,   R_FPSR_ENA_I_MASK);
+hard_exp |= CONVERT_BIT(soft_exp, float_flag_underflow, R_FPSR_ENA_U_MASK);
+hard_exp |= CONVERT_BIT(soft_exp, float_flag_overflow,  R_FPSR_ENA_O_MASK);
+hard_exp |= CONVERT_BIT(soft_exp, float_flag_divbyzero, R_FPSR_ENA_Z_MASK);
+hard_exp |= CONVERT_BIT(soft_exp, float_flag_invalid,   R_FPSR_ENA_V_MASK);
+shadow |= hard_exp << (R_FPSR_FLAGS_SHIFT - R_FPSR_ENABLES_SHIFT);
 env->fr0_shadow = shadow;
 env->fr[0] = (uint64_t)shadow << 32;
 
@@ -378,15 +378,15 @@ static void update_fr0_cmp(CPUHPPAState *env, uint32_t y,
 if (y) {
 /* targeted comparison */
 /* set fpsr[ca[y - 1]] to current compare */
-shadow = deposit32(shadow, 21 - (y - 1), 1, c);
+shadow = deposit32(shadow, R_FPSR_CA0_SHIFT - (y - 1), 1, c);
 } else {
 /* queued comparison */
 /* shift cq right by one place */
-shadow = deposit32(shadow, 11, 10, extract32(shadow, 12, 10));
+shadow = (shadow & ~R_FPSR_CQ_MASK) | ((shadow >> 1) & R_FPSR_CQ_MASK);
 /* move fpsr[c] to fpsr[cq[0]] */
-shadow = deposit32(shadow, 21, 1, extract32(shadow, 26, 1));
+shadow = FIELD_DP32(shadow, FPSR, CQ0, FIELD_EX32(shadow, FPSR, C));
 /* set fpsr[c] to current compare */
-shadow = deposit32(shadow, 26, 1, c);
+shadow = FIELD_DP32(shadow, FPSR, C, c);
 }
 
 env->fr0_shadow = shadow;
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index c996eb9823..4b9092b1cf 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -4322,29 +4322,28 @@ static bool trans_ftest(DisasContext *ctx, arg_ftest *a)
 
 switch (a->c) {
 case 0: /* simple */
-tcg_gen_andi_i64(t, t, 0x400);
-ctx->null_cond = cond_make_ti(TCG_COND_NE, t, 0);
-goto done;
+

[PATCH 01/45] target/hppa: Move cpu_get_tb_cpu_state out of line

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.h | 43 ++-
 target/hppa/cpu.c | 42 ++
 2 files changed, 44 insertions(+), 41 deletions(-)

diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index a072d0bb63..01dc8781a5 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -320,47 +320,8 @@ hwaddr hppa_abs_to_phys_pa2_w1(vaddr addr);
 #define TB_FLAG_PRIV_SHIFT  8
 #define TB_FLAG_UNALIGN 0x400
 
-static inline void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
-uint64_t *cs_base, uint32_t *pflags)
-{
-uint32_t flags = env->psw_n * PSW_N;
-
-/* TB lookup assumes that PC contains the complete virtual address.
-   If we leave space+offset separate, we'll get ITLB misses to an
-   incomplete virtual address.  This also means that we must separate
-   out current cpu privilege from the low bits of IAOQ_F.  */
-#ifdef CONFIG_USER_ONLY
-*pc = env->iaoq_f & -4;
-*cs_base = env->iaoq_b & -4;
-flags |= TB_FLAG_UNALIGN * !env_cpu(env)->prctl_unalign_sigbus;
-#else
-/* ??? E, T, H, L, B bits need to be here, when implemented.  */
-flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P);
-flags |= (env->iaoq_f & 3) << TB_FLAG_PRIV_SHIFT;
-
-*pc = hppa_form_gva_psw(env->psw, (env->psw & PSW_C ? env->iasq_f : 0),
-env->iaoq_f & -4);
-*cs_base = env->iasq_f;
-
-/* Insert a difference between IAOQ_B and IAOQ_F within the otherwise zero
-   low 32-bits of CS_BASE.  This will succeed for all direct branches,
-   which is the primary case we care about -- using goto_tb within a page.
-   Failure is indicated by a zero difference.  */
-if (env->iasq_f == env->iasq_b) {
-target_long diff = env->iaoq_b - env->iaoq_f;
-if (diff == (int32_t)diff) {
-*cs_base |= (uint32_t)diff;
-}
-}
-if ((env->sr[4] == env->sr[5])
-& (env->sr[4] == env->sr[6])
-& (env->sr[4] == env->sr[7])) {
-flags |= TB_FLAG_SR_SAME;
-}
-#endif
-
-*pflags = flags;
-}
+void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
+  uint64_t *cs_base, uint32_t *pflags);
 
 target_ulong cpu_hppa_get_psw(CPUHPPAState *env);
 void cpu_hppa_put_psw(CPUHPPAState *env, target_ulong);
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index 3831cb6db2..1d5f5086bf 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -43,6 +43,48 @@ static vaddr hppa_cpu_get_pc(CPUState *cs)
 return cpu->env.iaoq_f;
 }
 
+void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
+  uint64_t *cs_base, uint32_t *pflags)
+{
+uint32_t flags = env->psw_n * PSW_N;
+
+/* TB lookup assumes that PC contains the complete virtual address.
+   If we leave space+offset separate, we'll get ITLB misses to an
+   incomplete virtual address.  This also means that we must separate
+   out current cpu privilege from the low bits of IAOQ_F.  */
+#ifdef CONFIG_USER_ONLY
+*pc = env->iaoq_f & -4;
+*cs_base = env->iaoq_b & -4;
+flags |= TB_FLAG_UNALIGN * !env_cpu(env)->prctl_unalign_sigbus;
+#else
+/* ??? E, T, H, L, B bits need to be here, when implemented.  */
+flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P);
+flags |= (env->iaoq_f & 3) << TB_FLAG_PRIV_SHIFT;
+
+*pc = hppa_form_gva_psw(env->psw, (env->psw & PSW_C ? env->iasq_f : 0),
+env->iaoq_f & -4);
+*cs_base = env->iasq_f;
+
+/* Insert a difference between IAOQ_B and IAOQ_F within the otherwise zero
+   low 32-bits of CS_BASE.  This will succeed for all direct branches,
+   which is the primary case we care about -- using goto_tb within a page.
+   Failure is indicated by a zero difference.  */
+if (env->iasq_f == env->iasq_b) {
+target_long diff = env->iaoq_b - env->iaoq_f;
+if (diff == (int32_t)diff) {
+*cs_base |= (uint32_t)diff;
+}
+}
+if ((env->sr[4] == env->sr[5])
+& (env->sr[4] == env->sr[6])
+& (env->sr[4] == env->sr[7])) {
+flags |= TB_FLAG_SR_SAME;
+}
+#endif
+
+*pflags = flags;
+}
+
 static void hppa_cpu_synchronize_from_tb(CPUState *cs,
  const TranslationBlock *tb)
 {
-- 
2.34.1




[PATCH 30/45] target/hppa: Use delay_excp for conditional trap on overflow

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/helper.h |  1 -
 target/hppa/int_helper.c |  2 +-
 target/hppa/op_helper.c  |  7 ---
 target/hppa/translate.c  | 21 +
 4 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/target/hppa/helper.h b/target/hppa/helper.h
index 3d0d143aed..c12b48a04a 100644
--- a/target/hppa/helper.h
+++ b/target/hppa/helper.h
@@ -1,5 +1,4 @@
 DEF_HELPER_2(excp, noreturn, env, int)
-DEF_HELPER_FLAGS_2(tsv, TCG_CALL_NO_WG, void, env, tl)
 
 DEF_HELPER_FLAGS_3(stby_b, TCG_CALL_NO_WG, void, env, tl, tl)
 DEF_HELPER_FLAGS_3(stby_b_parallel, TCG_CALL_NO_WG, void, env, tl, tl)
diff --git a/target/hppa/int_helper.c b/target/hppa/int_helper.c
index 1aa3e88ef1..97e5f0b9a7 100644
--- a/target/hppa/int_helper.c
+++ b/target/hppa/int_helper.c
@@ -134,13 +134,13 @@ void hppa_cpu_do_interrupt(CPUState *cs)
 switch (i) {
 case EXCP_ILL:
 case EXCP_BREAK:
+case EXCP_OVERFLOW:
 case EXCP_COND:
 case EXCP_PRIV_REG:
 case EXCP_PRIV_OPR:
 /* IIR set via translate.c.  */
 break;
 
-case EXCP_OVERFLOW:
 case EXCP_ASSIST:
 case EXCP_DTLB_MISS:
 case EXCP_NA_ITLB_MISS:
diff --git a/target/hppa/op_helper.c b/target/hppa/op_helper.c
index a8b69fd481..66cad78a57 100644
--- a/target/hppa/op_helper.c
+++ b/target/hppa/op_helper.c
@@ -42,13 +42,6 @@ G_NORETURN void hppa_dynamic_excp(CPUHPPAState *env, int excp, uintptr_t ra)
 cpu_loop_exit_restore(cs, ra);
 }
 
-void HELPER(tsv)(CPUHPPAState *env, target_ulong cond)
-{
-if (unlikely((target_long)cond < 0)) {
-hppa_dynamic_excp(env, EXCP_OVERFLOW, GETPC());
-}
-}
-
 static void atomic_store_mask32(CPUHPPAState *env, target_ulong addr,
 uint32_t val, uint32_t mask, uintptr_t ra)
 {
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 080a52e5e4..5b0304d0d5 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1134,6 +1134,17 @@ static void gen_tc(DisasContext *ctx, DisasCond *cond)
 }
 }
 
+static void gen_tsv(DisasContext *ctx, TCGv_i64 *sv, bool d)
+{
+DisasCond cond = do_cond(ctx, /* SV */ 12, d, NULL, NULL, *sv);
+DisasDelayException *e = delay_excp(ctx, EXCP_OVERFLOW);
+
+tcg_gen_brcond_i64(cond.c, cond.a0, cond.a1, e->lab);
+
+/* In the non-trap path, V is known zero. */
+*sv = tcg_constant_i64(0);
+}
+
 static void do_add(DisasContext *ctx, unsigned rt, TCGv_i64 orig_in1,
TCGv_i64 in2, unsigned shift, bool is_l,
bool is_tsv, bool is_tc, bool is_c, unsigned cf, bool d)
@@ -1176,10 +1187,7 @@ static void do_add(DisasContext *ctx, unsigned rt, TCGv_i64 orig_in1,
 if (is_tsv || cond_need_sv(c)) {
 sv = do_add_sv(ctx, dest, in1, in2, orig_in1, shift, d);
 if (is_tsv) {
-if (!d) {
-tcg_gen_ext32s_i64(sv, sv);
-}
-gen_helper_tsv(tcg_env, sv);
+gen_tsv(ctx, &sv, d);
 }
 }
 
@@ -1280,10 +1288,7 @@ static bool do_sub(DisasContext *ctx, unsigned rt, TCGv_i64 in1,
 if (is_tsv || cond_need_sv(c)) {
 sv = do_sub_sv(ctx, dest, in1, in2);
 if (is_tsv) {
-if (!d) {
-tcg_gen_ext32s_i64(sv, sv);
-}
-gen_helper_tsv(tcg_env, sv);
+gen_tsv(ctx, &sv, d);
 }
 }
 
-- 
2.34.1




[PATCH 44/45] target/hppa: Log cpu state at interrupt

2024-04-24 Thread Richard Henderson
This contains all of the information logged before, plus more.

Signed-off-by: Richard Henderson 
---
 target/hppa/int_helper.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/target/hppa/int_helper.c b/target/hppa/int_helper.c
index b82f32fd12..391f32f27d 100644
--- a/target/hppa/int_helper.c
+++ b/target/hppa/int_helper.c
@@ -241,21 +241,22 @@ void hppa_cpu_do_interrupt(CPUState *cs)
 [EXCP_SYSCALL_LWS]   = "syscall-lws",
 [EXCP_TOC]   = "TOC (transfer of control)",
 };
-static int count;
-const char *name = NULL;
-char unknown[16];
 
-if (i >= 0 && i < ARRAY_SIZE(names)) {
-name = names[i];
+FILE *logfile = qemu_log_trylock();
+if (logfile) {
+const char *name = NULL;
+
+if (i >= 0 && i < ARRAY_SIZE(names)) {
+name = names[i];
+}
+if (name) {
+fprintf(logfile, "INT: cpu %d %s\n", cs->cpu_index, name);
+} else {
+fprintf(logfile, "INT: cpu %d unknown %d\n", cs->cpu_index, i);
+}
+hppa_cpu_dump_state(cs, logfile, 0);
+qemu_log_unlock(logfile);
 }
-if (!name) {
-snprintf(unknown, sizeof(unknown), "unknown %d", i);
-name = unknown;
-}
-qemu_log("INT %6d: %s @ " TARGET_FMT_lx ":" TARGET_FMT_lx
- " for " TARGET_FMT_lx ":" TARGET_FMT_lx "\n",
- ++count, name, env->cr[CR_IIASQ], env->cr[CR_IIAOQ],
- env->cr[CR_ISR], env->cr[CR_IOR]);
 }
 cs->exception_index = -1;
 }
-- 
2.34.1




[PATCH 42/45] target/hppa: Implement PSW_T

2024-04-24 Thread Richard Henderson
PSW_T enables a trap on taken branches, at the very end of the
execution of the branch instruction.

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.c   |  4 +--
 target/hppa/translate.c | 55 +++--
 2 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index b3f3f070d3..42c413211a 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -76,10 +76,10 @@ void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
 cs_base |= env->iaoq_b & ~TARGET_PAGE_MASK;
 }
 
-/* ??? E, T, H, L bits need to be here, when implemented.  */
+/* ??? E, H, L bits need to be here, when implemented.  */
 flags |= env->psw_n * PSW_N;
 flags |= env->psw_xb;
-flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P);
+flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P | PSW_T);
 
 #ifdef CONFIG_USER_ONLY
 flags |= TB_FLAG_UNALIGN * !env_cpu(env)->prctl_unalign_sigbus;
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index b2cc81c685..7ad7aa675d 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1872,6 +1872,23 @@ static bool do_fop_dedd(DisasContext *ctx, unsigned rt,
 return nullify_end(ctx);
 }
 
+static bool do_taken_branch_trap(DisasContext *ctx, DisasIAQE *next, bool n)
+{
+if (unlikely(ctx->tb_flags & PSW_T)) {
+/*
+ * The X, B and N bits are updated, and the instruction queue
+ * is advanced before the trap is recognized.
+ */
+nullify_set(ctx, n);
+store_psw_xb(ctx, PSW_B);
+install_iaq_entries(ctx, &ctx->iaq_b, next);
+gen_excp_1(EXCP_TB);
+ctx->base.is_jmp = DISAS_NORETURN;
+return true;
+}
+return false;
+}
+
 /* Emit an unconditional branch to a direct target, which may or may not
have already had nullification handled.  */
 static bool do_dbranch(DisasContext *ctx, int64_t disp,
@@ -1881,6 +1898,9 @@ static bool do_dbranch(DisasContext *ctx, int64_t disp,
 
 if (ctx->null_cond.c == TCG_COND_NEVER && ctx->null_lab == NULL) {
 install_link(ctx, link, false);
+if (do_taken_branch_trap(ctx, &ctx->iaq_j, is_n)) {
+return true;
+}
 if (is_n) {
 if (use_nullify_skip(ctx)) {
 nullify_set(ctx, 0);
@@ -1897,7 +1917,9 @@ static bool do_dbranch(DisasContext *ctx, int64_t disp,
 nullify_over(ctx);
 
 install_link(ctx, link, false);
-if (is_n && use_nullify_skip(ctx)) {
+if (do_taken_branch_trap(ctx, &ctx->iaq_j, is_n)) {
+/* done */
+} else if (is_n && use_nullify_skip(ctx)) {
 nullify_set(ctx, 0);
 store_psw_xb(ctx, 0);
+gen_goto_tb(ctx, 0, &ctx->iaq_j, NULL);
@@ -1959,7 +1981,9 @@ static bool do_cbranch(DisasContext *ctx, int64_t disp, bool is_n,
 n = is_n && disp >= 0;
 
 next = iaqe_branchi(ctx, disp);
-if (n && use_nullify_skip(ctx)) {
+if (do_taken_branch_trap(ctx, &next, is_n)) {
+/* done */
+} else if (n && use_nullify_skip(ctx)) {
 nullify_set(ctx, 0);
 store_psw_xb(ctx, 0);
+gen_goto_tb(ctx, 1, &next, NULL);
@@ -1989,6 +2013,9 @@ static bool do_ibranch(DisasContext *ctx, unsigned link,
 {
 if (ctx->null_cond.c == TCG_COND_NEVER && ctx->null_lab == NULL) {
 install_link(ctx, link, with_sr0);
+if (do_taken_branch_trap(ctx, &ctx->iaq_j, is_n)) {
+return true;
+}
 if (is_n) {
 if (use_nullify_skip(ctx)) {
+install_iaq_entries(ctx, &ctx->iaq_j, NULL);
@@ -2004,20 +2031,22 @@ static bool do_ibranch(DisasContext *ctx, unsigned link,
 }
 
 nullify_over(ctx);
-
 install_link(ctx, link, with_sr0);
-if (is_n && use_nullify_skip(ctx)) {
-install_iaq_entries(ctx, &ctx->iaq_j, NULL);
-nullify_set(ctx, 0);
-store_psw_xb(ctx, 0);
-} else {
-install_iaq_entries(ctx, &ctx->iaq_b, &ctx->iaq_j);
-nullify_set(ctx, is_n);
-store_psw_xb(ctx, PSW_B);
+
+if (!do_taken_branch_trap(ctx, &ctx->iaq_j, is_n)) {
+if (is_n && use_nullify_skip(ctx)) {
+install_iaq_entries(ctx, &ctx->iaq_j, NULL);
+nullify_set(ctx, 0);
+store_psw_xb(ctx, 0);
+} else {
+install_iaq_entries(ctx, &ctx->iaq_b, &ctx->iaq_j);
+nullify_set(ctx, is_n);
+store_psw_xb(ctx, PSW_B);
+}
+tcg_gen_lookup_and_goto_ptr();
+ctx->base.is_jmp = DISAS_NORETURN;
 }
 
-tcg_gen_lookup_and_goto_ptr();
-ctx->base.is_jmp = DISAS_NORETURN;
 return nullify_end(ctx);
 }
 
-- 
2.34.1




[PATCH 23/45] target/hppa: Use TCG_COND_TST* in do_unit_addsub

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index de510fddb1..38697ddfbd 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1418,8 +1418,8 @@ static void do_unit_addsub(DisasContext *ctx, unsigned rt, TCGv_i64 in1,
 tcg_gen_shri_i64(cb, cb, 1);
 }
 
-tcg_gen_andi_i64(cb, cb, test_cb);
-cond = cond_make_ti(cf & 1 ? TCG_COND_EQ : TCG_COND_NE, cb, 0);
+cond = cond_make_ti(cf & 1 ? TCG_COND_TSTEQ : TCG_COND_TSTNE,
+cb, test_cb);
 }
 
 if (is_tc) {
-- 
2.34.1




[PATCH 32/45] target/hppa: Store full iaoq_f and page bits of iaoq_d in TB

2024-04-24 Thread Richard Henderson
In preparation for CF_PCREL, store the iaoq_f in 3 parts: high
bits in cs_base, middle bits in pc, and low bits in priv.
For iaoq_b, set a bit if either the space or the page differs;
otherwise store the page offset.

Install iaq entries before goto_tb. The change to not record
the full direct branch difference in TB means that we have to
store at least iaoq_b before goto_tb.  But since we'll need
both updated before goto_tb for CF_PCREL, do that now.

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.h   |  2 ++
 target/hppa/cpu.c   | 72 ++---
 target/hppa/translate.c | 29 +
 3 files changed, 48 insertions(+), 55 deletions(-)

diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 4514bc63dc..66cae795bd 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -347,6 +347,8 @@ hwaddr hppa_abs_to_phys_pa2_w1(vaddr addr);
 #define TB_FLAG_SR_SAME PSW_I
 #define TB_FLAG_PRIV_SHIFT  8
 #define TB_FLAG_UNALIGN 0x400
+#define CS_BASE_DIFFPAGE(1 << 12)
+#define CS_BASE_DIFFSPACE   (1 << 13)
 
 void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
   uint64_t *cs_base, uint32_t *pflags);
diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index 8c8c6181de..003af63e20 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -48,36 +48,43 @@ static vaddr hppa_cpu_get_pc(CPUState *cs)
 }
 
 void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
-  uint64_t *cs_base, uint32_t *pflags)
+  uint64_t *pcsbase, uint32_t *pflags)
 {
 uint32_t flags = env->psw_n * PSW_N;
+uint64_t cs_base = 0;
+
+/*
+ * TB lookup assumes that PC contains the complete virtual address.
+ * If we leave space+offset separate, we'll get ITLB misses to an
+ * incomplete virtual address.  This also means that we must separate
+ * out current cpu privilege from the low bits of IAOQ_F.
+ */
+*pc = hppa_cpu_get_pc(env_cpu(env));
+flags |= (env->iaoq_f & 3) << TB_FLAG_PRIV_SHIFT;
+
+if (hppa_is_pa20(env)) {
+cs_base = env->iaoq_f & MAKE_64BIT_MASK(32, 32);
+}
+
+/*
+ * The only really interesting case is if IAQ_Back is on the same page
+ * as IAQ_Front, so that we can use goto_tb between the blocks.  In all
+ * other cases, we'll be ending the TranslationBlock with one insn and
+ * not linking between them.
+ */
+if (env->iasq_f != env->iasq_b) {
+cs_base |= CS_BASE_DIFFSPACE;
+} else if ((env->iaoq_f ^ env->iaoq_b) & TARGET_PAGE_MASK) {
+cs_base |= CS_BASE_DIFFPAGE;
+} else {
+cs_base |= env->iaoq_b & ~TARGET_PAGE_MASK;
+}
 
-/* TB lookup assumes that PC contains the complete virtual address.
-   If we leave space+offset separate, we'll get ITLB misses to an
-   incomplete virtual address.  This also means that we must separate
-   out current cpu privilege from the low bits of IAOQ_F.  */
 #ifdef CONFIG_USER_ONLY
-*pc = env->iaoq_f & -4;
-*cs_base = env->iaoq_b & -4;
 flags |= TB_FLAG_UNALIGN * !env_cpu(env)->prctl_unalign_sigbus;
 #else
 /* ??? E, T, H, L, B bits need to be here, when implemented.  */
 flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P);
-flags |= (env->iaoq_f & 3) << TB_FLAG_PRIV_SHIFT;
-
-*pc = hppa_cpu_get_pc(env_cpu(env));
-*cs_base = env->iasq_f;
-
-/* Insert a difference between IAOQ_B and IAOQ_F within the otherwise zero
-   low 32-bits of CS_BASE.  This will succeed for all direct branches,
-   which is the primary case we care about -- using goto_tb within a page.
-   Failure is indicated by a zero difference.  */
-if (env->iasq_f == env->iasq_b) {
-target_long diff = env->iaoq_b - env->iaoq_f;
-if (diff == (int32_t)diff) {
-*cs_base |= (uint32_t)diff;
-}
-}
 if ((env->sr[4] == env->sr[5])
 & (env->sr[4] == env->sr[6])
 & (env->sr[4] == env->sr[7])) {
@@ -85,6 +92,7 @@ void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
 }
 #endif
 
+*pcsbase = cs_base;
 *pflags = flags;
 }
 
@@ -93,25 +101,7 @@ static void hppa_cpu_synchronize_from_tb(CPUState *cs,
 {
 HPPACPU *cpu = HPPA_CPU(cs);
 
-tcg_debug_assert(!(cs->tcg_cflags & CF_PCREL));
-
-#ifdef CONFIG_USER_ONLY
-cpu->env.iaoq_f = tb->pc | PRIV_USER;
-cpu->env.iaoq_b = tb->cs_base | PRIV_USER;
-#else
-/* Recover the IAOQ values from the GVA + PRIV.  */
-uint32_t priv = (tb->flags >> TB_FLAG_PRIV_SHIFT) & 3;
-target_ulong cs_base = tb->cs_base;
-target_ulong iasq_f = cs_base & ~0xffffffffull;
-int32_t diff = cs_base;
-
-cpu->env.iasq_f = iasq_f;
-cpu->env.iaoq_f = (tb->pc & ~iasq_f) + priv;
-if (diff) {
-cpu->env.iaoq_b = cpu->env.iaoq_f + diff;
-}
-#endif
-
+/* IAQ is always up-to-date before goto_tb. */
 cpu->env.psw_n = (tb->flags & PSW_N) != 0;
 }
 
diff --git a/target/hppa/translate.c 

[PATCH 22/45] target/hppa: Use TCG_COND_TST* in do_unit_zero_cond

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 1e772bef4d..de510fddb1 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1004,9 +1004,8 @@ static DisasCond do_unit_zero_cond(unsigned cf, bool d, 
TCGv_i64 res)
 tmp = tcg_temp_new_i64();
 tcg_gen_subi_i64(tmp, res, ones);
 tcg_gen_andc_i64(tmp, tmp, res);
-tcg_gen_andi_i64(tmp, tmp, sgns);
 
-return cond_make_ti(cf & 1 ? TCG_COND_EQ : TCG_COND_NE, tmp, 0);
+return cond_make_ti(cf & 1 ? TCG_COND_TSTEQ : TCG_COND_TSTNE, tmp, sgns);
 }
 
 static TCGv_i64 get_carry(DisasContext *ctx, bool d,
-- 
2.34.1




[PATCH 06/45] target/hppa: Use CF_BP_PAGE instead of cpu_breakpoint_test

2024-04-24 Thread Richard Henderson
The generic tcg driver will have already checked for breakpoints.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index ceba7a98e5..dfdcb3e23c 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -673,8 +673,9 @@ static bool use_goto_tb(DisasContext *ctx, uint64_t bofs, 
uint64_t nofs)
executing a TB that merely branches to the next TB.  */
 static bool use_nullify_skip(DisasContext *ctx)
 {
-return (((ctx->iaoq_b ^ ctx->iaoq_f) & TARGET_PAGE_MASK) == 0
-&& !cpu_breakpoint_test(ctx->cs, ctx->iaoq_b, BP_ANY));
+return (!(tb_cflags(ctx->base.tb) & CF_BP_PAGE)
+&& ctx->iaoq_b != -1
+&& is_same_page(&ctx->base, ctx->iaoq_b));
 }
 
 static void gen_goto_tb(DisasContext *ctx, int which,
-- 
2.34.1




[PATCH 40/45] target/hppa: Adjust priv for B,GATE at runtime

2024-04-24 Thread Richard Henderson
Do not compile in the priv change based on the first
translation; look up the PTE at execution time.

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.h|  1 -
 target/hppa/helper.h |  1 +
 target/hppa/mem_helper.c | 34 +++---
 target/hppa/translate.c  | 36 +++-
 4 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 5f3e99cdc4..8523f22452 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -386,7 +386,6 @@ void hppa_cpu_do_transaction_failed(CPUState *cs, hwaddr 
physaddr,
 extern const MemoryRegionOps hppa_io_eir_ops;
 extern const VMStateDescription vmstate_hppa_cpu;
 void hppa_cpu_alarm_timer(void *);
-int hppa_artype_for_page(CPUHPPAState *env, target_ulong vaddr);
 #endif
 G_NORETURN void hppa_dynamic_excp(CPUHPPAState *env, int excp, uintptr_t ra);
 
diff --git a/target/hppa/helper.h b/target/hppa/helper.h
index c12b48a04a..de411923d9 100644
--- a/target/hppa/helper.h
+++ b/target/hppa/helper.h
@@ -86,6 +86,7 @@ DEF_HELPER_1(halt, noreturn, env)
 DEF_HELPER_1(reset, noreturn, env)
 DEF_HELPER_1(rfi, void, env)
 DEF_HELPER_1(rfi_r, void, env)
+DEF_HELPER_FLAGS_2(b_gate_priv, TCG_CALL_NO_WG, i64, env, i64)
 DEF_HELPER_FLAGS_2(write_interval_timer, TCG_CALL_NO_RWG, void, env, tl)
 DEF_HELPER_FLAGS_2(write_eirr, TCG_CALL_NO_RWG, void, env, tl)
 DEF_HELPER_FLAGS_2(swap_system_mask, TCG_CALL_NO_RWG, tl, env, tl)
diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c
index 3ef9e80064..6756d36dae 100644
--- a/target/hppa/mem_helper.c
+++ b/target/hppa/mem_helper.c
@@ -690,13 +690,6 @@ target_ulong HELPER(lpa)(CPUHPPAState *env, target_ulong 
addr)
 return phys;
 }
 
-/* Return the ar_type of the TLB at VADDR, or -1.  */
-int hppa_artype_for_page(CPUHPPAState *env, target_ulong vaddr)
-{
-HPPATLBEntry *ent = hppa_find_tlb(env, vaddr);
-return ent ? ent->ar_type : -1;
-}
-
 /*
  * diag_btlb() emulates the PDC PDC_BLOCK_TLB firmware call to
  * allow operating systems to modify the Block TLB (BTLB) entries.
@@ -792,3 +785,30 @@ void HELPER(diag_btlb)(CPUHPPAState *env)
 break;
 }
 }
+
+uint64_t HELPER(b_gate_priv)(CPUHPPAState *env, uint64_t iaoq_f)
+{
+uint64_t gva = hppa_form_gva(env, env->iasq_f, iaoq_f);
+HPPATLBEntry *ent = hppa_find_tlb(env, gva);
+
+if (ent == NULL) {
+raise_exception_with_ior(env, EXCP_ITLB_MISS, GETPC(), gva, false);
+}
+
+/*
+ * There should be no need to check page permissions, as that will
+ * already have been done by tb_lookup via get_page_addr_code.
+ * All we need at this point is to check the ar_type.
+ *
+ * No change for non-gateway pages or for priv decrease.
+ */
+if (ent->ar_type & 4) {
+int old_priv = iaoq_f & 3;
+int new_priv = ent->ar_type & 3;
+
+if (new_priv < old_priv) {
+iaoq_f = (iaoq_f & -4) | new_priv;
+}
+}
+return iaoq_f;
+}
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index a4200742bd..3ae196490a 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -3959,6 +3959,7 @@ static bool trans_bl(DisasContext *ctx, arg_bl *a)
 static bool trans_b_gate(DisasContext *ctx, arg_b_gate *a)
 {
 int64_t disp = a->disp;
+bool indirect = false;
 
 /* Trap if PSW[B] is set. */
 if (ctx->psw_xb & PSW_B) {
@@ -3968,24 +3969,22 @@ static bool trans_b_gate(DisasContext *ctx, arg_b_gate 
*a)
 nullify_over(ctx);
 
 #ifndef CONFIG_USER_ONLY
-if (ctx->tb_flags & PSW_C) {
-int type = hppa_artype_for_page(cpu_env(ctx->cs), ctx->base.pc_next);
-/* If we could not find a TLB entry, then we need to generate an
-   ITLB miss exception so the kernel will provide it.
-   The resulting TLB fill operation will invalidate this TB and
-   we will re-translate, at which point we *will* be able to find
-   the TLB entry and determine if this is in fact a gateway page.  */
-if (type < 0) {
-gen_excp(ctx, EXCP_ITLB_MISS);
-return true;
-}
-/* No change for non-gateway pages or for priv decrease.  */
-if (type >= 4 && type - 4 < ctx->privilege) {
-disp -= ctx->privilege;
-disp += type - 4;
-}
+if (ctx->privilege == 0) {
+/* Privilege cannot decrease. */
+} else if (!(ctx->tb_flags & PSW_C)) {
+/* With paging disabled, priv becomes 0. */
+disp -= ctx->privilege;
 } else {
-disp -= ctx->privilege;  /* priv = 0 */
+/* Adjust the dest offset for the privilege change from the PTE. */
+TCGv_i64 off = tcg_temp_new_i64();
+
+gen_helper_b_gate_priv(off, tcg_env,
+   tcg_constant_i64(ctx->iaoq_first
++ ctx->iaq_f.disp));
+
+ctx->iaq_j.base = off;
+ctx->iaq_j.disp = disp + 8;
+

[PATCH 18/45] target/hppa: Use displacements in DisasIAQE

2024-04-24 Thread Richard Henderson
This is a first step in enabling CF_PCREL, but for now
we regenerate the absolute address before writeback.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 43 ++---
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 9d3bffb688..dd3921dbf9 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -44,9 +44,9 @@ typedef struct DisasCond {
 typedef struct DisasIAQE {
 /* IASQ; may be null for no change from TB. */
 TCGv_i64 space;
-/* IAOQ base; may be null for immediate absolute address. */
+/* IAOQ base; may be null for relative address. */
 TCGv_i64 base;
-/* IAOQ addend; absolute immedate address if base is null. */
+/* IAOQ addend; if base is null, relative to ctx->iaoq_first. */
 int64_t disp;
 } DisasIAQE;
 
@@ -59,6 +59,9 @@ typedef struct DisasContext {
 /* IAQ_Next, for jumps, otherwise null for simple advance. */
 DisasIAQE iaq_j, *iaq_n;
 
+/* IAOQ_Front at entry to TB. */
+uint64_t iaoq_first;
+
 DisasCond null_cond;
 TCGLabel *null_lab;
 
@@ -639,7 +642,7 @@ static void copy_iaoq_entry(DisasContext *ctx, TCGv_i64 
dest,
 uint64_t mask = gva_offset_mask(ctx->tb_flags);
 
 if (src->base == NULL) {
-tcg_gen_movi_i64(dest, src->disp & mask);
+tcg_gen_movi_i64(dest, (ctx->iaoq_first + src->disp) & mask);
 } else if (src->disp == 0) {
 tcg_gen_andi_i64(dest, src->base, mask);
 } else {
@@ -671,12 +674,8 @@ static void install_link(DisasContext *ctx, unsigned link, 
bool with_sr0)
 {
 tcg_debug_assert(ctx->null_cond.c == TCG_COND_NEVER);
 if (link) {
-if (ctx->iaq_b.base) {
-tcg_gen_addi_i64(cpu_gr[link], ctx->iaq_b.base,
- ctx->iaq_b.disp + 4);
-} else {
-tcg_gen_movi_i64(cpu_gr[link], ctx->iaq_b.disp + 4);
-}
+DisasIAQE next = iaqe_incr(&ctx->iaq_b, 4);
+copy_iaoq_entry(ctx, cpu_gr[link], &next);
 #ifndef CONFIG_USER_ONLY
 if (with_sr0) {
 tcg_gen_mov_i64(cpu_sr[0], cpu_iasq_b);
@@ -729,7 +728,7 @@ static bool use_goto_tb(DisasContext *ctx, const DisasIAQE 
*f,
 {
 return (!iaqe_variable(f) &&
 (b == NULL || !iaqe_variable(b)) &&
-translator_use_goto_tb(&ctx->base, f->disp));
+translator_use_goto_tb(&ctx->base, ctx->iaoq_first + f->disp));
 }
 
 /* If the next insn is to be nullified, and it's on the same page,
@@ -740,7 +739,8 @@ static bool use_nullify_skip(DisasContext *ctx)
 {
 return (!(tb_cflags(ctx->base.tb) & CF_BP_PAGE)
&& !iaqe_variable(&ctx->iaq_b)
-&& is_same_page(&ctx->base, ctx->iaq_b.disp));
+&& (((ctx->iaoq_first + ctx->iaq_b.disp) ^ ctx->iaoq_first)
+& TARGET_PAGE_MASK) == 0);
 }
 
 static void gen_goto_tb(DisasContext *ctx, int which,
@@ -2003,6 +2003,8 @@ static TCGv_i64 do_ibranch_priv(DisasContext *ctx, 
TCGv_i64 offset)
aforementioned BE.  */
 static void do_page_zero(DisasContext *ctx)
 {
+assert(ctx->iaq_f.disp == 0);
+
 /* If by some means we get here with PSW[N]=1, that implies that
the B,GATE instruction would be skipped, and we'd fault on the
next insn within the privileged page.  */
@@ -2022,11 +2024,11 @@ static void do_page_zero(DisasContext *ctx)
non-sequential instruction execution.  Normally the PSW[B] bit
detects this by disallowing the B,GATE instruction to execute
under such conditions.  */
-if (iaqe_variable(&ctx->iaq_b) || ctx->iaq_b.disp != ctx->iaq_f.disp + 4) {
+if (iaqe_variable(&ctx->iaq_b) || ctx->iaq_b.disp != 4) {
 goto do_sigill;
 }
 
-switch (ctx->iaq_f.disp & -4) {
+switch (ctx->base.pc_first) {
 case 0x00: /* Null pointer call */
 gen_excp_1(EXCP_IMP);
 ctx->base.is_jmp = DISAS_NORETURN;
@@ -4617,8 +4619,8 @@ static void hppa_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cs)
 #ifdef CONFIG_USER_ONLY
 ctx->privilege = MMU_IDX_TO_PRIV(MMU_USER_IDX);
 ctx->mmu_idx = MMU_USER_IDX;
-ctx->iaq_f.disp = ctx->base.pc_first | ctx->privilege;
-ctx->iaq_b.disp = ctx->base.tb->cs_base | ctx->privilege;
+ctx->iaoq_first = ctx->base.pc_first | ctx->privilege;
+ctx->iaq_b.disp = ctx->base.tb->cs_base - ctx->base.pc_first;
 ctx->unalign = (ctx->tb_flags & TB_FLAG_UNALIGN ? MO_UNALN : MO_ALIGN);
 #else
 ctx->privilege = (ctx->tb_flags >> TB_FLAG_PRIV_SHIFT) & 3;
@@ -4631,9 +4633,10 @@ static void hppa_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cs)
uint64_t iasq_f = cs_base & ~0xffffffffull;
 int32_t diff = cs_base;
 
-ctx->iaq_f.disp = (ctx->base.pc_first & ~iasq_f) + ctx->privilege;
+ctx->iaoq_first = (ctx->base.pc_first & ~iasq_f) + ctx->privilege;
+
 if (diff) {
-ctx->iaq_b.disp = ctx->iaq_f.disp + diff;
+ctx->iaq_b.disp = diff;
 } else {
 ctx->iaq_b.base = 

[PATCH 33/45] target/hppa: Do not mask in copy_iaoq_entry

2024-04-24 Thread Richard Henderson
As with loads and stores, code offsets are kept intact until the
full gva is formed.  In qemu, this is in cpu_get_tb_cpu_state.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index cc409ffe13..fb5bc12986 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -636,15 +636,10 @@ static DisasIAQE iaqe_next_absv(DisasContext *ctx, 
TCGv_i64 var)
 static void copy_iaoq_entry(DisasContext *ctx, TCGv_i64 dest,
 const DisasIAQE *src)
 {
-uint64_t mask = gva_offset_mask(ctx->tb_flags);
-
 if (src->base == NULL) {
-tcg_gen_movi_i64(dest, (ctx->iaoq_first + src->disp) & mask);
-} else if (src->disp == 0) {
-tcg_gen_andi_i64(dest, src->base, mask);
+tcg_gen_movi_i64(dest, ctx->iaoq_first + src->disp);
 } else {
 tcg_gen_addi_i64(dest, src->base, src->disp);
-tcg_gen_andi_i64(dest, dest, mask);
 }
 }
 
-- 
2.34.1




[PATCH 21/45] target/hppa: Use TCG_COND_TST* in do_log_cond

2024-04-24 Thread Richard Henderson
We can directly test bits of a 32-bit comparison without
zero or sign-extending an intermediate result.
We can directly test bit 0 for odd/even.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 78 ++---
 1 file changed, 27 insertions(+), 51 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 85941f191f..1e772bef4d 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -913,65 +913,41 @@ static DisasCond do_log_cond(DisasContext *ctx, unsigned 
cf, bool d,
  TCGv_i64 res)
 {
 TCGCond tc;
-bool ext_uns;
+uint64_t imm;
 
-switch (cf) {
-case 0:  /* never */
-case 9:  /* undef, C */
-case 11: /* undef, C & !Z */
-case 12: /* undef, V */
-return cond_make_f();
-
-case 1:  /* true */
-case 8:  /* undef, !C */
-case 10: /* undef, !C | Z */
-case 13: /* undef, !V */
-return cond_make_t();
-
-case 2:  /* == */
-tc = TCG_COND_EQ;
-ext_uns = true;
+switch (cf >> 1) {
+case 0:  /* never / always */
+case 4:  /* undef, C */
+case 5:  /* undef, C & !Z */
+case 6:  /* undef, V */
+return cf & 1 ? cond_make_t() : cond_make_f();
+case 1:  /* == / <> */
+tc = d ? TCG_COND_EQ : TCG_COND_TSTEQ;
+imm = d ? 0 : UINT32_MAX;
 break;
-case 3:  /* <> */
-tc = TCG_COND_NE;
-ext_uns = true;
+case 2:  /* < / >= */
+tc = d ? TCG_COND_LT : TCG_COND_TSTNE;
+imm = d ? 0 : 1ull << 31;
 break;
-case 4:  /* < */
-tc = TCG_COND_LT;
-ext_uns = false;
+case 3:  /* <= / > */
+tc = cf & 1 ? TCG_COND_GT : TCG_COND_LE;
+if (!d) {
+TCGv_i64 tmp = tcg_temp_new_i64();
+tcg_gen_ext32s_i64(tmp, res);
+return cond_make_ti(tc, tmp, 0);
+}
+return cond_make_vi(tc, res, 0);
+case 7: /* OD / EV */
+tc = TCG_COND_TSTNE;
+imm = 1;
 break;
-case 5:  /* >= */
-tc = TCG_COND_GE;
-ext_uns = false;
-break;
-case 6:  /* <= */
-tc = TCG_COND_LE;
-ext_uns = false;
-break;
-case 7:  /* > */
-tc = TCG_COND_GT;
-ext_uns = false;
-break;
-
-case 14: /* OD */
-case 15: /* EV */
-return do_cond(ctx, cf, d, res, NULL, NULL);
-
 default:
 g_assert_not_reached();
 }
-
-if (!d) {
-TCGv_i64 tmp = tcg_temp_new_i64();
-
-if (ext_uns) {
-tcg_gen_ext32u_i64(tmp, res);
-} else {
-tcg_gen_ext32s_i64(tmp, res);
-}
-return cond_make_ti(tc, tmp, 0);
+if (cf & 1) {
+tc = tcg_invert_cond(tc);
 }
-return cond_make_vi(tc, res, 0);
+return cond_make_vi(tc, res, imm);
 }
 
 /* Similar, but for shift/extract/deposit conditions.  */
-- 
2.34.1




[PATCH 34/45] target/hppa: Improve hppa_cpu_dump_state

2024-04-24 Thread Richard Henderson
Print both raw IAQ_Front and IAQ_Back as well as the GVAs.
Print control registers in system mode.
Print floating point registers if CPU_DUMP_FPU.

Signed-off-by: Richard Henderson 
---
 target/hppa/helper.c | 60 +++-
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/target/hppa/helper.c b/target/hppa/helper.c
index 9d217d051c..7d22c248fb 100644
--- a/target/hppa/helper.c
+++ b/target/hppa/helper.c
@@ -102,6 +102,19 @@ void cpu_hppa_put_psw(CPUHPPAState *env, target_ulong psw)
 
 void hppa_cpu_dump_state(CPUState *cs, FILE *f, int flags)
 {
+#ifndef CONFIG_USER_ONLY
+static const char cr_name[32][5] = {
+"RC","CR1",   "CR2",   "CR3",
+"CR4",   "CR5",   "CR6",   "CR7",
+"PID1",  "PID2",  "CCR",   "SAR",
+"PID3",  "PID4",  "IVA",   "EIEM",
+"ITMR",  "ISQF",  "IOQF",  "IIR",
+"ISR",   "IOR",   "IPSW",  "EIRR",
+"TR0",   "TR1",   "TR2",   "TR3",
+"TR4",   "TR5",   "TR6",   "TR7",
+};
+#endif
+
 CPUHPPAState *env = cpu_env(cs);
 target_ulong psw = cpu_hppa_get_psw(env);
 target_ulong psw_cb;
@@ -117,11 +130,12 @@ void hppa_cpu_dump_state(CPUState *cs, FILE *f, int flags)
 m = UINT32_MAX;
 }
 
-qemu_fprintf(f, "IA_F " TARGET_FMT_lx " IA_B " TARGET_FMT_lx
- " IIR %0*" PRIx64 "\n",
+qemu_fprintf(f, "IA_F %08" PRIx64 ":%0*" PRIx64 " (" TARGET_FMT_lx ")\n"
+"IA_B %08" PRIx64 ":%0*" PRIx64 " (" TARGET_FMT_lx ")\n",
+ env->iasq_f >> 32, w, m & env->iaoq_f,
  hppa_form_gva_psw(psw, env->iasq_f, env->iaoq_f),
- hppa_form_gva_psw(psw, env->iasq_b, env->iaoq_b),
- w, m & env->cr[CR_IIR]);
+ env->iasq_b >> 32, w, m & env->iaoq_b,
+ hppa_form_gva_psw(psw, env->iasq_b, env->iaoq_b));
 
 psw_c[0]  = (psw & PSW_W ? 'W' : '-');
 psw_c[1]  = (psw & PSW_E ? 'E' : '-');
@@ -154,12 +168,46 @@ void hppa_cpu_dump_state(CPUState *cs, FILE *f, int flags)
  (i & 3) == 3 ? '\n' : ' ');
 }
 #ifndef CONFIG_USER_ONLY
+for (i = 0; i < 32; i++) {
+qemu_fprintf(f, "%-4s %0*" PRIx64 "%c",
+ cr_name[i], w, m & env->cr[i],
+ (i & 3) == 3 ? '\n' : ' ');
+}
+qemu_fprintf(f, "ISQB %0*" PRIx64 " IOQB %0*" PRIx64 "\n",
+ w, m & env->cr_back[0], w, m & env->cr_back[1]);
 for (i = 0; i < 8; i++) {
 qemu_fprintf(f, "SR%02d %08x%c", i, (uint32_t)(env->sr[i] >> 32),
  (i & 3) == 3 ? '\n' : ' ');
 }
 #endif
- qemu_fprintf(f, "\n");
 
-/* ??? FR */
+if (flags & CPU_DUMP_FPU) {
+static const char rm[4][4] = { "RN", "RZ", "R+", "R-" };
+char flg[6], ena[6];
+uint32_t fpsr = env->fr0_shadow;
+
+flg[0] = (fpsr & R_FPSR_FLG_V_MASK ? 'V' : '-');
+flg[1] = (fpsr & R_FPSR_FLG_Z_MASK ? 'Z' : '-');
+flg[2] = (fpsr & R_FPSR_FLG_O_MASK ? 'O' : '-');
+flg[3] = (fpsr & R_FPSR_FLG_U_MASK ? 'U' : '-');
+flg[4] = (fpsr & R_FPSR_FLG_I_MASK ? 'I' : '-');
+flg[5] = '\0';
+
+ena[0] = (fpsr & R_FPSR_ENA_V_MASK ? 'V' : '-');
+ena[1] = (fpsr & R_FPSR_ENA_Z_MASK ? 'Z' : '-');
+ena[2] = (fpsr & R_FPSR_ENA_O_MASK ? 'O' : '-');
+ena[3] = (fpsr & R_FPSR_ENA_U_MASK ? 'U' : '-');
+ena[4] = (fpsr & R_FPSR_ENA_I_MASK ? 'I' : '-');
+ena[5] = '\0';
+
+qemu_fprintf(f, "FPSR %08x flag%s enable  %s %s\n",
+ fpsr, flg, ena, rm[FIELD_EX32(fpsr, FPSR, RM)]);
+
+for (i = 0; i < 32; i++) {
+qemu_fprintf(f, "FR%02d %016" PRIx64 "%c",
+ i, env->fr[i], (i & 3) == 3 ? '\n' : ' ');
+}
+}
+
+qemu_fprintf(f, "\n");
 }
-- 
2.34.1




[PATCH 19/45] target/hppa: Rename cond_make_* helpers

2024-04-24 Thread Richard Henderson
Use 'v' for a variable that needs copying, 't' for a temp that
doesn't need copying, and 'i' for an immediate, and use this
naming for both arguments of the comparison.  So:

   cond_make_tmp -> cond_make_tt
   cond_make_0_tmp -> cond_make_ti
   cond_make_0 -> cond_make_vi
   cond_make -> cond_make_vv

Pass 0 explicitly, rather than implicitly in the function name.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 52 -
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index dd3921dbf9..a1132c884f 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -344,32 +344,32 @@ static DisasCond cond_make_n(void)
 };
 }
 
-static DisasCond cond_make_tmp(TCGCond c, TCGv_i64 a0, TCGv_i64 a1)
+static DisasCond cond_make_tt(TCGCond c, TCGv_i64 a0, TCGv_i64 a1)
 {
 assert (c != TCG_COND_NEVER && c != TCG_COND_ALWAYS);
 return (DisasCond){ .c = c, .a0 = a0, .a1 = a1 };
 }
 
-static DisasCond cond_make_0_tmp(TCGCond c, TCGv_i64 a0)
+static DisasCond cond_make_ti(TCGCond c, TCGv_i64 a0, uint64_t imm)
 {
-return cond_make_tmp(c, a0, tcg_constant_i64(0));
+return cond_make_tt(c, a0, tcg_constant_i64(imm));
 }
 
-static DisasCond cond_make_0(TCGCond c, TCGv_i64 a0)
+static DisasCond cond_make_vi(TCGCond c, TCGv_i64 a0, uint64_t imm)
 {
 TCGv_i64 tmp = tcg_temp_new_i64();
 tcg_gen_mov_i64(tmp, a0);
-return cond_make_0_tmp(c, tmp);
+return cond_make_ti(c, tmp, imm);
 }
 
-static DisasCond cond_make(TCGCond c, TCGv_i64 a0, TCGv_i64 a1)
+static DisasCond cond_make_vv(TCGCond c, TCGv_i64 a0, TCGv_i64 a1)
 {
 TCGv_i64 t0 = tcg_temp_new_i64();
 TCGv_i64 t1 = tcg_temp_new_i64();
 
 tcg_gen_mov_i64(t0, a0);
 tcg_gen_mov_i64(t1, a1);
-return cond_make_tmp(c, t0, t1);
+return cond_make_tt(c, t0, t1);
 }
 
 static void cond_free(DisasCond *cond)
@@ -787,7 +787,7 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, 
bool d,
 tcg_gen_ext32u_i64(tmp, res);
 res = tmp;
 }
-cond = cond_make_0(TCG_COND_EQ, res);
+cond = cond_make_vi(TCG_COND_EQ, res, 0);
 break;
 case 2: /* < / >=(N ^ V / !(N ^ V) */
 tmp = tcg_temp_new_i64();
@@ -795,7 +795,7 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, 
bool d,
 if (!d) {
 tcg_gen_ext32s_i64(tmp, tmp);
 }
-cond = cond_make_0_tmp(TCG_COND_LT, tmp);
+cond = cond_make_ti(TCG_COND_LT, tmp, 0);
 break;
 case 3: /* <= / >(N ^ V) | Z / !((N ^ V) | Z) */
 /*
@@ -817,10 +817,10 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, 
bool d,
 tcg_gen_sari_i64(tmp, tmp, 63);
 tcg_gen_and_i64(tmp, tmp, res);
 }
-cond = cond_make_0_tmp(TCG_COND_EQ, tmp);
+cond = cond_make_ti(TCG_COND_EQ, tmp, 0);
 break;
 case 4: /* NUV / UV  (!UV / UV) */
-cond = cond_make_0(TCG_COND_EQ, uv);
+cond = cond_make_vi(TCG_COND_EQ, uv, 0);
 break;
 case 5: /* ZNV / VNZ (!UV | Z / UV & !Z) */
 tmp = tcg_temp_new_i64();
@@ -828,7 +828,7 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, 
bool d,
 if (!d) {
 tcg_gen_ext32u_i64(tmp, tmp);
 }
-cond = cond_make_0_tmp(TCG_COND_EQ, tmp);
+cond = cond_make_ti(TCG_COND_EQ, tmp, 0);
 break;
 case 6: /* SV / NSV  (V / !V) */
 if (!d) {
@@ -836,12 +836,12 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, 
bool d,
 tcg_gen_ext32s_i64(tmp, sv);
 sv = tmp;
 }
-cond = cond_make_0(TCG_COND_LT, sv);
+cond = cond_make_ti(TCG_COND_LT, sv, 0);
 break;
 case 7: /* OD / EV */
 tmp = tcg_temp_new_i64();
 tcg_gen_andi_i64(tmp, res, 1);
-cond = cond_make_0_tmp(TCG_COND_NE, tmp);
+cond = cond_make_ti(TCG_COND_NE, tmp, 0);
 break;
 default:
 g_assert_not_reached();
@@ -903,9 +903,9 @@ static DisasCond do_sub_cond(DisasContext *ctx, unsigned 
cf, bool d,
 tcg_gen_ext32s_i64(t1, in1);
 tcg_gen_ext32s_i64(t2, in2);
 }
-return cond_make_tmp(tc, t1, t2);
+return cond_make_tt(tc, t1, t2);
 }
-return cond_make(tc, in1, in2);
+return cond_make_vv(tc, in1, in2);
 }
 
 /*
@@ -977,9 +977,9 @@ static DisasCond do_log_cond(DisasContext *ctx, unsigned 
cf, bool d,
 } else {
 tcg_gen_ext32s_i64(tmp, res);
 }
-return cond_make_0_tmp(tc, tmp);
+return cond_make_ti(tc, tmp, 0);
 }
-return cond_make_0(tc, res);
+return cond_make_vi(tc, res, 0);
 }
 
 /* Similar, but for shift/extract/deposit conditions.  */
@@ -1038,7 +1038,7 @@ static DisasCond do_unit_zero_cond(unsigned cf, bool d, 
TCGv_i64 res)
 tcg_gen_andc_i64(tmp, tmp, res);
 tcg_gen_andi_i64(tmp, 

[PATCH 24/45] target/hppa: Use TCG_COND_TST* in trans_bb_imm

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 38697ddfbd..c996eb9823 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -3514,18 +3514,12 @@ static bool trans_bb_sar(DisasContext *ctx, arg_bb_sar 
*a)
 
 static bool trans_bb_imm(DisasContext *ctx, arg_bb_imm *a)
 {
-TCGv_i64 tmp, tcg_r;
 DisasCond cond;
-int p;
+int p = a->p | (a->d ? 0 : 32);
 
 nullify_over(ctx);
-
-tmp = tcg_temp_new_i64();
-tcg_r = load_gpr(ctx, a->r);
-p = a->p | (a->d ? 0 : 32);
-tcg_gen_shli_i64(tmp, tcg_r, p);
-
-cond = cond_make_ti(a->c ? TCG_COND_GE : TCG_COND_LT, tmp, 0);
+cond = cond_make_vi(a->c ? TCG_COND_TSTEQ : TCG_COND_TSTNE,
+load_gpr(ctx, a->r), 1ull << (63 - p));
return do_cbranch(ctx, a->disp, a->n, &cond);
 }
 
-- 
2.34.1




[PATCH 27/45] target/hppa: Remove cond_free

2024-04-24 Thread Richard Henderson
Now that we do not need to free tcg temporaries, the only
thing cond_free does is reset the condition to never.
Instead, simply write a new condition over the old, which
may be cond_make_f() for the never condition.

The do_*_cond functions do the right thing with c or cf == 0,
so there's no need for a special case anymore.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 102 +++-
 1 file changed, 27 insertions(+), 75 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index b1311e7688..5714e2ad25 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -372,21 +372,6 @@ static DisasCond cond_make_vv(TCGCond c, TCGv_i64 a0, 
TCGv_i64 a1)
 return cond_make_tt(c, t0, t1);
 }
 
-static void cond_free(DisasCond *cond)
-{
-switch (cond->c) {
-default:
-cond->a0 = NULL;
-cond->a1 = NULL;
-/* fallthru */
-case TCG_COND_ALWAYS:
-cond->c = TCG_COND_NEVER;
-break;
-case TCG_COND_NEVER:
-break;
-}
-}
-
 static TCGv_i64 load_gpr(DisasContext *ctx, unsigned reg)
 {
 if (reg == 0) {
@@ -536,7 +521,7 @@ static void nullify_over(DisasContext *ctx)
 
 tcg_gen_brcond_i64(ctx->null_cond.c, ctx->null_cond.a0,
ctx->null_cond.a1, ctx->null_lab);
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 }
 }
 
@@ -554,7 +539,7 @@ static void nullify_save(DisasContext *ctx)
 ctx->null_cond.a0, ctx->null_cond.a1);
 ctx->psw_n_nonzero = true;
 }
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 }
 
 /* Set a PSW[N] to X.  The intention is that this is used immediately
@@ -1164,7 +1149,6 @@ static void do_add(DisasContext *ctx, unsigned rt, 
TCGv_i64 orig_in1,
 save_gpr(ctx, rt, dest);
 
 /* Install the new nullification.  */
-cond_free(&ctx->null_cond);
 ctx->null_cond = cond;
 }
 
@@ -1261,7 +1245,6 @@ static void do_sub(DisasContext *ctx, unsigned rt, 
TCGv_i64 in1,
 save_gpr(ctx, rt, dest);
 
 /* Install the new nullification.  */
-cond_free(&ctx->null_cond);
 ctx->null_cond = cond;
 }
 
@@ -1316,7 +1299,6 @@ static void do_cmpclr(DisasContext *ctx, unsigned rt, 
TCGv_i64 in1,
 save_gpr(ctx, rt, dest);
 
 /* Install the new nullification.  */
-cond_free(&ctx->null_cond);
 ctx->null_cond = cond;
 }
 
@@ -1331,10 +1313,7 @@ static void do_log(DisasContext *ctx, unsigned rt, 
TCGv_i64 in1,
 save_gpr(ctx, rt, dest);
 
 /* Install the new nullification.  */
-cond_free(&ctx->null_cond);
-if (cf) {
-ctx->null_cond = do_log_cond(ctx, cf, d, dest);
-}
+ctx->null_cond = do_log_cond(ctx, cf, d, dest);
 }
 
 static bool do_log_reg(DisasContext *ctx, arg_rrr_cf_d *a,
@@ -1429,7 +1408,6 @@ static void do_unit_addsub(DisasContext *ctx, unsigned 
rt, TCGv_i64 in1,
 }
 save_gpr(ctx, rt, dest);
 
-cond_free(&ctx->null_cond);
 ctx->null_cond = cond;
 }
 
@@ -1852,7 +1830,6 @@ static bool do_cbranch(DisasContext *ctx, int64_t disp, 
bool is_n,
 
 taken = gen_new_label();
 tcg_gen_brcond_i64(c, cond->a0, cond->a1, taken);
-cond_free(cond);
 
 /* Not taken: Condition not satisfied; nullify on backward branches. */
 n = is_n && disp < 0;
@@ -2034,7 +2011,7 @@ static void do_page_zero(DisasContext *ctx)
 
 static bool trans_nop(DisasContext *ctx, arg_nop *a)
 {
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 return true;
 }
 
@@ -2048,7 +2025,7 @@ static bool trans_sync(DisasContext *ctx, arg_sync *a)
 /* No point in nullifying the memory barrier.  */
 tcg_gen_mb(TCG_BAR_SC | TCG_MO_ALL);
 
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 return true;
 }
 
@@ -2060,7 +2037,7 @@ static bool trans_mfia(DisasContext *ctx, arg_mfia *a)
 tcg_gen_andi_i64(dest, dest, -4);
 
 save_gpr(ctx, a->t, dest);
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 return true;
 }
 
@@ -2075,7 +2052,7 @@ static bool trans_mfsp(DisasContext *ctx, arg_mfsp *a)
 
 save_gpr(ctx, rt, t0);
 
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 return true;
 }
 
@@ -2120,7 +2097,7 @@ static bool trans_mfctl(DisasContext *ctx, arg_mfctl *a)
 save_gpr(ctx, rt, tmp);
 
  done:
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 return true;
 }
 
@@ -2160,7 +2137,7 @@ static bool trans_mtctl(DisasContext *ctx, arg_mtctl *a)
 tcg_gen_andi_i64(tmp, reg, ctx->is_pa20 ? 63 : 31);
 save_or_nullify(ctx, cpu_sar, tmp);
 
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 return true;
 }
 
@@ -2234,7 +2211,7 @@ static bool trans_mtsarcm(DisasContext *ctx, arg_mtsarcm 
*a)
 tcg_gen_andi_i64(tmp, tmp, ctx->is_pa20 ? 63 : 31);
 save_or_nullify(ctx, cpu_sar, tmp);
 
-cond_free(&ctx->null_cond);
+ctx->null_cond = cond_make_f();
 return true;

[PATCH 39/45] target/hppa: Drop tlb_entry return from hppa_get_physical_address

2024-04-24 Thread Richard Henderson
The return-by-reference is never used.

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.h|  3 +--
 target/hppa/int_helper.c |  2 +-
 target/hppa/mem_helper.c | 19 ---
 target/hppa/op_helper.c  |  3 +--
 4 files changed, 7 insertions(+), 20 deletions(-)

diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index 629299653d..5f3e99cdc4 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -377,8 +377,7 @@ bool hppa_cpu_tlb_fill(CPUState *cs, vaddr address, int 
size,
 void hppa_cpu_do_interrupt(CPUState *cpu);
 bool hppa_cpu_exec_interrupt(CPUState *cpu, int int_req);
 int hppa_get_physical_address(CPUHPPAState *env, vaddr addr, int mmu_idx,
-  int type, hwaddr *pphys, int *pprot,
-  HPPATLBEntry **tlb_entry);
+  int type, hwaddr *pphys, int *pprot);
 void hppa_cpu_do_transaction_failed(CPUState *cs, hwaddr physaddr,
  vaddr addr, unsigned size,
  MMUAccessType access_type,
diff --git a/target/hppa/int_helper.c b/target/hppa/int_helper.c
index 97e5f0b9a7..b82f32fd12 100644
--- a/target/hppa/int_helper.c
+++ b/target/hppa/int_helper.c
@@ -167,7 +167,7 @@ void hppa_cpu_do_interrupt(CPUState *cs)
 
 vaddr = hppa_form_gva_psw(old_psw, env->iasq_f, vaddr);
 t = hppa_get_physical_address(env, vaddr, MMU_KERNEL_IDX,
-  0, &paddr, &prot, NULL);
+  0, &paddr, &prot);
 if (t >= 0) {
 /* We can't re-load the instruction.  */
 env->cr[CR_IIR] = 0;
diff --git a/target/hppa/mem_helper.c b/target/hppa/mem_helper.c
index 5eca5e8a1e..3ef9e80064 100644
--- a/target/hppa/mem_helper.c
+++ b/target/hppa/mem_helper.c
@@ -196,18 +196,13 @@ static int match_prot_id64(CPUHPPAState *env, uint32_t 
access_id)
 }
 
 int hppa_get_physical_address(CPUHPPAState *env, vaddr addr, int mmu_idx,
-  int type, hwaddr *pphys, int *pprot,
-  HPPATLBEntry **tlb_entry)
+  int type, hwaddr *pphys, int *pprot)
 {
 hwaddr phys;
 int prot, r_prot, w_prot, x_prot, priv;
 HPPATLBEntry *ent;
 int ret = -1;
 
-if (tlb_entry) {
-*tlb_entry = NULL;
-}
-
 /* Virtual translation disabled.  Map absolute to physical.  */
 if (MMU_IDX_MMU_DISABLED(mmu_idx)) {
 switch (mmu_idx) {
@@ -237,10 +232,6 @@ int hppa_get_physical_address(CPUHPPAState *env, vaddr 
addr, int mmu_idx,
 goto egress;
 }
 
-if (tlb_entry) {
-*tlb_entry = ent;
-}
-
 /* We now know the physical address.  */
 phys = ent->pa + (addr - ent->itree.start);
 
@@ -349,7 +340,7 @@ hwaddr hppa_cpu_get_phys_page_debug(CPUState *cs, vaddr addr)
cpu->env.psw & PSW_W ? MMU_ABS_W_IDX : MMU_ABS_IDX);
 
 excp = hppa_get_physical_address(&cpu->env, addr, mmu_idx, 0,
-                                 &phys, &prot, NULL);
+                                 &phys, &prot);
 
 /* Since we're translating for debugging, the only error that is a
hard error is no translation at all.  Otherwise, while a real cpu
@@ -431,7 +422,6 @@ bool hppa_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
 {
 HPPACPU *cpu = HPPA_CPU(cs);
CPUHPPAState *env = &cpu->env;
-HPPATLBEntry *ent;
 int prot, excp, a_prot;
 hwaddr phys;
 
@@ -447,8 +437,7 @@ bool hppa_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
 break;
 }
 
-excp = hppa_get_physical_address(env, addr, mmu_idx,
-                                 a_prot, &phys, &prot, &ent);
+excp = hppa_get_physical_address(env, addr, mmu_idx, a_prot, &phys, &prot);
 if (unlikely(excp >= 0)) {
 if (probe) {
 return false;
@@ -689,7 +678,7 @@ target_ulong HELPER(lpa)(CPUHPPAState *env, target_ulong addr)
 int prot, excp;
 
 excp = hppa_get_physical_address(env, addr, MMU_KERNEL_IDX, 0,
-                                 &phys, &prot, NULL);
+                                 &phys, &prot);
 if (excp >= 0) {
 if (excp == EXCP_DTLB_MISS) {
 excp = EXCP_NA_DTLB_MISS;
diff --git a/target/hppa/op_helper.c b/target/hppa/op_helper.c
index 66cad78a57..7f79196fff 100644
--- a/target/hppa/op_helper.c
+++ b/target/hppa/op_helper.c
@@ -334,8 +334,7 @@ target_ulong HELPER(probe)(CPUHPPAState *env, target_ulong addr,
 }
 
 mmu_idx = PRIV_P_TO_MMU_IDX(level, env->psw & PSW_P);
-excp = hppa_get_physical_address(env, addr, mmu_idx, 0, &phys,
-                                 &prot, NULL);
+excp = hppa_get_physical_address(env, addr, mmu_idx, 0, &phys, &prot);
 if (excp >= 0) {
 cpu_restore_state(env_cpu(env), GETPC());
 hppa_set_ior_and_isr(env, addr, MMU_IDX_MMU_DISABLED(mmu_idx));
-- 
2.34.1




[PATCH 31/45] linux-user/hppa: Force all code addresses to PRIV_USER

2024-04-24 Thread Richard Henderson
The kernel does this along the return path to user mode.

Signed-off-by: Richard Henderson 
---
 linux-user/hppa/target_cpu.h |  4 ++--
 target/hppa/cpu.h|  3 +++
 linux-user/elfload.c |  4 ++--
 linux-user/hppa/cpu_loop.c   | 14 +++---
 linux-user/hppa/signal.c |  6 --
 target/hppa/cpu.c|  7 +--
 target/hppa/gdbstub.c|  6 ++
 target/hppa/translate.c  |  4 ++--
 8 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/linux-user/hppa/target_cpu.h b/linux-user/hppa/target_cpu.h
index aacf3e9e02..4b84422a90 100644
--- a/linux-user/hppa/target_cpu.h
+++ b/linux-user/hppa/target_cpu.h
@@ -28,8 +28,8 @@ static inline void cpu_clone_regs_child(CPUHPPAState *env, target_ulong newsp,
 /* Indicate child in return value.  */
 env->gr[28] = 0;
 /* Return from the syscall.  */
-env->iaoq_f = env->gr[31];
-env->iaoq_b = env->gr[31] + 4;
+env->iaoq_f = env->gr[31] | PRIV_USER;
+env->iaoq_b = env->iaoq_f + 4;
 }
 
 static inline void cpu_clone_regs_parent(CPUHPPAState *env, unsigned flags)
diff --git a/target/hppa/cpu.h b/target/hppa/cpu.h
index c0da9e9af6..4514bc63dc 100644
--- a/target/hppa/cpu.h
+++ b/target/hppa/cpu.h
@@ -48,6 +48,9 @@
 #define MMU_IDX_TO_P(MIDX)  (((MIDX) - MMU_KERNEL_IDX) & 1)
 #define PRIV_P_TO_MMU_IDX(PRIV, P)  ((PRIV) * 2 + !!(P) + MMU_KERNEL_IDX)
 
+#define PRIV_KERNEL   0
+#define PRIV_USER 3
+
 #define TARGET_INSN_START_EXTRA_WORDS 2
 
 /* No need to flush MMU_ABS*_IDX  */
diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 60cf55b36c..b551cbcb03 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -1963,8 +1963,8 @@ static inline void init_thread(struct target_pt_regs *regs,
 static inline void init_thread(struct target_pt_regs *regs,
struct image_info *infop)
 {
-regs->iaoq[0] = infop->entry;
-regs->iaoq[1] = infop->entry + 4;
+regs->iaoq[0] = infop->entry | PRIV_USER;
+regs->iaoq[1] = regs->iaoq[0] + 4;
 regs->gr[23] = 0;
 regs->gr[24] = infop->argv;
 regs->gr[25] = infop->argc;
diff --git a/linux-user/hppa/cpu_loop.c b/linux-user/hppa/cpu_loop.c
index d5232f37fe..bc093b8fe8 100644
--- a/linux-user/hppa/cpu_loop.c
+++ b/linux-user/hppa/cpu_loop.c
@@ -129,8 +129,8 @@ void cpu_loop(CPUHPPAState *env)
 default:
 env->gr[28] = ret;
 /* We arrived here by faking the gateway page.  Return.  */
-env->iaoq_f = env->gr[31];
-env->iaoq_b = env->gr[31] + 4;
+env->iaoq_f = env->gr[31] | PRIV_USER;
+env->iaoq_b = env->iaoq_f + 4;
 break;
 case -QEMU_ERESTARTSYS:
 case -QEMU_ESIGRETURN:
@@ -140,8 +140,8 @@ void cpu_loop(CPUHPPAState *env)
 case EXCP_SYSCALL_LWS:
 env->gr[21] = hppa_lws(env);
 /* We arrived here by faking the gateway page.  Return.  */
-env->iaoq_f = env->gr[31];
-env->iaoq_b = env->gr[31] + 4;
+env->iaoq_f = env->gr[31] | PRIV_USER;
+env->iaoq_b = env->iaoq_f + 4;
 break;
 case EXCP_IMP:
 force_sig_fault(TARGET_SIGSEGV, TARGET_SEGV_MAPERR, env->iaoq_f);
@@ -152,9 +152,9 @@ void cpu_loop(CPUHPPAState *env)
 case EXCP_PRIV_OPR:
 /* check for glibc ABORT_INSTRUCTION "iitlbp %r0,(%sr0, %r0)" */
 if (env->cr[CR_IIR] == 0x0400) {
-   force_sig_fault(TARGET_SIGILL, TARGET_ILL_ILLOPC, env->iaoq_f);
+force_sig_fault(TARGET_SIGILL, TARGET_ILL_ILLOPC, env->iaoq_f);
 } else {
-   force_sig_fault(TARGET_SIGILL, TARGET_ILL_PRVOPC, env->iaoq_f);
+force_sig_fault(TARGET_SIGILL, TARGET_ILL_PRVOPC, env->iaoq_f);
 }
 break;
 case EXCP_PRIV_REG:
@@ -170,7 +170,7 @@ void cpu_loop(CPUHPPAState *env)
 force_sig_fault(TARGET_SIGFPE, 0, env->iaoq_f);
 break;
 case EXCP_BREAK:
-force_sig_fault(TARGET_SIGTRAP, TARGET_TRAP_BRKPT, env->iaoq_f & ~3);
+force_sig_fault(TARGET_SIGTRAP, TARGET_TRAP_BRKPT, env->iaoq_f);
 break;
 case EXCP_DEBUG:
 force_sig_fault(TARGET_SIGTRAP, TARGET_TRAP_BRKPT, env->iaoq_f);
diff --git a/linux-user/hppa/signal.c b/linux-user/hppa/signal.c
index 682ba25922..f6f094c960 100644
--- a/linux-user/hppa/signal.c
+++ b/linux-user/hppa/signal.c
@@ -101,7 +103,9 @@ static void restore_sigcontext(CPUArchState *env, struct target_sigcontext *sc)
 cpu_hppa_loaded_fr0(env);
 
 __get_user(env->iaoq_f, &sc->sc_iaoq[0]);
+env->iaoq_f |= PRIV_USER;
 __get_user(env->iaoq_b, &sc->sc_iaoq[1]);
+env->iaoq_b |= PRIV_USER;
 __get_user(env->cr[CR_SAR], &sc->sc_sar);
 }
 
@@ -162,8 +164,8 @@ void setup_rt_frame(int sig, struct target_sigaction *ka,
 unlock_user(fdesc, haddr, 0);
 haddr = dest;

[PATCH 12/45] target/hppa: Add IASQ entries to DisasContext

2024-04-24 Thread Richard Henderson
Add a variable to track space changes to the IAQ.  So far, no such changes
are introduced, but the new checks against ctx->iasq_b may eliminate an
unnecessary copy to cpu_iasq_f with e.g. BLR.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 138250b550..43a74dafcf 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -49,6 +49,13 @@ typedef struct DisasContext {
 uint64_t iaoq_b;
 uint64_t iaoq_n;
 TCGv_i64 iaoq_n_var;
+/*
+ * Null when IASQ_Back unchanged from IASQ_Front,
+ * or cpu_iasq_b, when IASQ_Back has been changed.
+ */
+TCGv_i64 iasq_b;
+/* Null when IASQ_Next unchanged from IASQ_Back, or set by branch. */
+TCGv_i64 iasq_n;
 
 DisasCond null_cond;
 TCGLabel *null_lab;
@@ -3915,12 +3922,12 @@ static bool trans_be(DisasContext *ctx, arg_be *a)
 if (a->n && use_nullify_skip(ctx)) {
 install_iaq_entries(ctx, -1, tmp, -1, NULL);
 tcg_gen_mov_i64(cpu_iasq_f, new_spc);
-tcg_gen_mov_i64(cpu_iasq_b, cpu_iasq_f);
+tcg_gen_mov_i64(cpu_iasq_b, new_spc);
 nullify_set(ctx, 0);
 } else {
 install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, -1, tmp);
-if (ctx->iaoq_b == -1) {
-tcg_gen_mov_i64(cpu_iasq_f, cpu_iasq_b);
+if (ctx->iasq_b) {
+tcg_gen_mov_i64(cpu_iasq_f, ctx->iasq_b);
 }
 tcg_gen_mov_i64(cpu_iasq_b, new_spc);
 nullify_set(ctx, a->n);
@@ -4034,8 +4041,8 @@ static bool trans_bve(DisasContext *ctx, arg_bve *a)
 
 install_link(ctx, a->l, false);
 install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, -1, dest);
-if (ctx->iaoq_b == -1) {
-tcg_gen_mov_i64(cpu_iasq_f, cpu_iasq_b);
+if (ctx->iasq_b) {
+tcg_gen_mov_i64(cpu_iasq_f, ctx->iasq_b);
 }
 tcg_gen_mov_i64(cpu_iasq_b, space_select(ctx, 0, dest));
 nullify_set(ctx, a->n);
@@ -4616,6 +4623,7 @@ static void hppa_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
 ctx->mmu_idx = MMU_USER_IDX;
 ctx->iaoq_f = ctx->base.pc_first | ctx->privilege;
 ctx->iaoq_b = ctx->base.tb->cs_base | ctx->privilege;
+ctx->iasq_b = NULL;
 ctx->unalign = (ctx->tb_flags & TB_FLAG_UNALIGN ? MO_UNALN : MO_ALIGN);
 #else
 ctx->privilege = (ctx->tb_flags >> TB_FLAG_PRIV_SHIFT) & 3;
@@ -4630,6 +4638,7 @@ static void hppa_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
 
 ctx->iaoq_f = (ctx->base.pc_first & ~iasq_f) + ctx->privilege;
 ctx->iaoq_b = (diff ? ctx->iaoq_f + diff : -1);
+ctx->iasq_b = (diff ? NULL : cpu_iasq_b);
 #endif
 
 ctx->zero = tcg_constant_i64(0);
@@ -4682,6 +4691,7 @@ static void hppa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
 
 /* Set up the IA queue for the next insn.
This will be overwritten by a branch.  */
+ctx->iasq_n = NULL;
 ctx->iaoq_n_var = NULL;
 ctx->iaoq_n = ctx->iaoq_b == -1 ? -1 : ctx->iaoq_b + 4;
 
@@ -4704,7 +4714,7 @@ static void hppa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
 return;
 }
 /* Note this also detects a priority change. */
-if (ctx->iaoq_b != ctx->iaoq_f + 4) {
+if (ctx->iaoq_b != ctx->iaoq_f + 4 || ctx->iasq_b) {
 ctx->base.is_jmp = DISAS_IAQ_N_STALE;
 return;
 }
@@ -4724,6 +4734,10 @@ static void hppa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
  gva_offset_mask(ctx->tb_flags));
 }
 }
+if (ctx->iasq_n) {
+tcg_gen_mov_i64(cpu_iasq_b, ctx->iasq_n);
+ctx->iasq_b = cpu_iasq_b;
+}
 }
 
 static void hppa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
@@ -4732,14 +4746,15 @@ static void hppa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
 DisasJumpType is_jmp = ctx->base.is_jmp;
 uint64_t fi, bi;
 TCGv_i64 fv, bv;
-TCGv_i64 fs;
+TCGv_i64 fs, bs;
 
 /* Assume the insn queue has not been advanced. */
 fi = ctx->iaoq_b;
 fv = cpu_iaoq_b;
-fs = fi == -1 ? cpu_iasq_b : NULL;
+fs = ctx->iasq_b;
 bi = ctx->iaoq_n;
 bv = ctx->iaoq_n_var;
+bs = ctx->iasq_n;
 
 switch (is_jmp) {
 case DISAS_NORETURN:
@@ -4748,12 +4763,15 @@ static void hppa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
 /* The insn queue has not been advanced. */
 bi = fi;
 bv = fv;
+bs = fs;
 fi = ctx->iaoq_f;
 fv = NULL;
 fs = NULL;
 /* FALLTHRU */
 case DISAS_IAQ_N_STALE:
-if (use_goto_tb(ctx, fi, bi)
+if (fs == NULL
+&& bs == NULL
+&& use_goto_tb(ctx, fi, bi)
 && (ctx->null_cond.c == TCG_COND_NEVER
 || ctx->null_cond.c == TCG_COND_ALWAYS)) {
 nullify_set(ctx, ctx->null_cond.c == TCG_COND_ALWAYS);
@@ -4766,6 

[PATCH 09/45] target/hppa: Delay computation of IAQ_Next

2024-04-24 Thread Richard Henderson
We no longer have to allocate a temp and perform an
addition before translation of the rest of the insn.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 26 ++
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 195a0e7e79..ac181180a6 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1805,6 +1805,7 @@ static bool do_dbranch(DisasContext *ctx, int64_t disp,
 if (ctx->null_cond.c == TCG_COND_NEVER && ctx->null_lab == NULL) {
 install_link(ctx, link, false);
 ctx->iaoq_n = dest;
+ctx->iaoq_n_var = NULL;
 if (is_n) {
 ctx->null_cond.c = TCG_COND_ALWAYS;
 }
@@ -1861,11 +1862,6 @@ static bool do_cbranch(DisasContext *ctx, int64_t disp, bool is_n,
 ctx->null_lab = NULL;
 }
 nullify_set(ctx, n);
-if (ctx->iaoq_n == -1) {
-/* The temporary iaoq_n_var died at the branch above.
-   Regenerate it here instead of saving it.  */
-tcg_gen_addi_i64(ctx->iaoq_n_var, cpu_iaoq_b, 4);
-}
 gen_goto_tb(ctx, 0, ctx->iaoq_b, ctx->iaoq_n);
 }
 
@@ -4629,8 +4625,6 @@ static void hppa_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cs)
 ctx->iaoq_f = (ctx->base.pc_first & ~iasq_f) + ctx->privilege;
 ctx->iaoq_b = (diff ? ctx->iaoq_f + diff : -1);
 #endif
-ctx->iaoq_n = -1;
-ctx->iaoq_n_var = NULL;
 
 ctx->zero = tcg_constant_i64(0);
 
@@ -4682,14 +4676,8 @@ static void hppa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
 
 /* Set up the IA queue for the next insn.
This will be overwritten by a branch.  */
-if (ctx->iaoq_b == -1) {
-ctx->iaoq_n = -1;
-ctx->iaoq_n_var = tcg_temp_new_i64();
-tcg_gen_addi_i64(ctx->iaoq_n_var, cpu_iaoq_b, 4);
-} else {
-ctx->iaoq_n = ctx->iaoq_b + 4;
-ctx->iaoq_n_var = NULL;
-}
+ctx->iaoq_n_var = NULL;
+ctx->iaoq_n = ctx->iaoq_b == -1 ? -1 : ctx->iaoq_b + 4;
 
 if (unlikely(ctx->null_cond.c == TCG_COND_ALWAYS)) {
 ctx->null_cond.c = TCG_COND_NEVER;
@@ -4740,7 +4728,13 @@ static void hppa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
 ? DISAS_EXIT
 : DISAS_IAQ_N_UPDATED);
 } else if (ctx->iaoq_b == -1) {
-copy_iaoq_entry(ctx, cpu_iaoq_b, -1, ctx->iaoq_n_var);
+if (ctx->iaoq_n_var) {
+copy_iaoq_entry(ctx, cpu_iaoq_b, -1, ctx->iaoq_n_var);
+} else {
+tcg_gen_addi_i64(cpu_iaoq_b, cpu_iaoq_b, 4);
+tcg_gen_andi_i64(cpu_iaoq_b, cpu_iaoq_b,
+ gva_offset_mask(ctx->tb_flags));
+}
 }
 break;
 
-- 
2.34.1




[PATCH 17/45] target/hppa: Introduce and use DisasIAQE for branch management

2024-04-24 Thread Richard Henderson
Wrap offset and space together in one structure, ensuring
that they're copied together as required.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 378 +---
 1 file changed, 198 insertions(+), 180 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index dd5193cb6a..9d3bffb688 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -41,21 +41,23 @@ typedef struct DisasCond {
 TCGv_i64 a0, a1;
 } DisasCond;
 
+typedef struct DisasIAQE {
+/* IASQ; may be null for no change from TB. */
+TCGv_i64 space;
+/* IAOQ base; may be null for immediate absolute address. */
+TCGv_i64 base;
+/* IAOQ addend; absolute immedate address if base is null. */
+int64_t disp;
+} DisasIAQE;
+
 typedef struct DisasContext {
 DisasContextBase base;
 CPUState *cs;
 
-uint64_t iaoq_f;
-uint64_t iaoq_b;
-uint64_t iaoq_n;
-TCGv_i64 iaoq_n_var;
-/*
- * Null when IASQ_Back unchanged from IASQ_Front,
- * or cpu_iasq_b, when IASQ_Back has been changed.
- */
-TCGv_i64 iasq_b;
-/* Null when IASQ_Next unchanged from IASQ_Back, or set by branch. */
-TCGv_i64 iasq_n;
+/* IAQ_Front, IAQ_Back. */
+DisasIAQE iaq_f, iaq_b;
+/* IAQ_Next, for jumps, otherwise null for simple advance. */
+DisasIAQE iaq_j, *iaq_n;
 
 DisasCond null_cond;
 TCGLabel *null_lab;
@@ -601,49 +603,67 @@ static bool nullify_end(DisasContext *ctx)
 return true;
 }
 
+static bool iaqe_variable(const DisasIAQE *e)
+{
+return e->base || e->space;
+}
+
+static DisasIAQE iaqe_incr(const DisasIAQE *e, int64_t disp)
+{
+return (DisasIAQE){
+.space = e->space,
+.base = e->base,
+.disp = e->disp + disp,
+};
+}
+
+static DisasIAQE iaqe_branchi(DisasContext *ctx, int64_t disp)
+{
+return (DisasIAQE){
+.space = ctx->iaq_b.space,
+.disp = ctx->iaq_f.disp + 8 + disp,
+};
+}
+
+static DisasIAQE iaqe_next_absv(DisasContext *ctx, TCGv_i64 var)
+{
+return (DisasIAQE){
+.space = ctx->iaq_b.space,
+.base = var,
+};
+}
+
 static void copy_iaoq_entry(DisasContext *ctx, TCGv_i64 dest,
-uint64_t ival, TCGv_i64 vval)
+const DisasIAQE *src)
 {
 uint64_t mask = gva_offset_mask(ctx->tb_flags);
 
-if (ival != -1) {
-tcg_gen_movi_i64(dest, ival & mask);
-return;
-}
-tcg_debug_assert(vval != NULL);
-
-/*
- * We know that the IAOQ is already properly masked.
- * This optimization is primarily for "iaoq_f = iaoq_b".
- */
-if (vval == cpu_iaoq_f || vval == cpu_iaoq_b) {
-tcg_gen_mov_i64(dest, vval);
+if (src->base == NULL) {
+tcg_gen_movi_i64(dest, src->disp & mask);
+} else if (src->disp == 0) {
+tcg_gen_andi_i64(dest, src->base, mask);
 } else {
-tcg_gen_andi_i64(dest, vval, mask);
+tcg_gen_addi_i64(dest, src->base, src->disp);
+tcg_gen_andi_i64(dest, dest, mask);
 }
 }
 
-static void install_iaq_entries(DisasContext *ctx,
-uint64_t bi, TCGv_i64 bv, TCGv_i64 bs,
-uint64_t ni, TCGv_i64 nv, TCGv_i64 ns)
+static void install_iaq_entries(DisasContext *ctx, const DisasIAQE *f,
+const DisasIAQE *b)
 {
-copy_iaoq_entry(ctx, cpu_iaoq_f, bi, bv);
+DisasIAQE b_next;
 
-/* Allow ni variable, with nv null, to indicate a trivial advance. */
-if (ni != -1 || nv) {
-copy_iaoq_entry(ctx, cpu_iaoq_b, ni, nv);
-} else if (bi != -1) {
-copy_iaoq_entry(ctx, cpu_iaoq_b, bi + 4, NULL);
-} else {
-tcg_gen_addi_i64(cpu_iaoq_b, cpu_iaoq_f, 4);
-tcg_gen_andi_i64(cpu_iaoq_b, cpu_iaoq_b,
- gva_offset_mask(ctx->tb_flags));
+if (b == NULL) {
+b_next = iaqe_incr(f, 4);
+b = &b_next;
 }
-if (bs) {
-tcg_gen_mov_i64(cpu_iasq_f, bs);
+copy_iaoq_entry(ctx, cpu_iaoq_f, f);
+copy_iaoq_entry(ctx, cpu_iaoq_b, b);
+if (f->space) {
+tcg_gen_mov_i64(cpu_iasq_f, f->space);
 }
-if (ns || bs) {
-tcg_gen_mov_i64(cpu_iasq_b, ns ? ns : bs);
+if (b->space || f->space) {
+tcg_gen_mov_i64(cpu_iasq_b, b->space ? : f->space);
 }
 }
 
@@ -651,10 +671,11 @@ static void install_link(DisasContext *ctx, unsigned link, bool with_sr0)
 {
 tcg_debug_assert(ctx->null_cond.c == TCG_COND_NEVER);
 if (link) {
-if (ctx->iaoq_b == -1) {
-tcg_gen_addi_i64(cpu_gr[link], cpu_iaoq_b, 4);
+if (ctx->iaq_b.base) {
+tcg_gen_addi_i64(cpu_gr[link], ctx->iaq_b.base,
+ ctx->iaq_b.disp + 4);
 } else {
-tcg_gen_movi_i64(cpu_gr[link], ctx->iaoq_b + 4);
+tcg_gen_movi_i64(cpu_gr[link], ctx->iaq_b.disp + 4);
 }
 #ifndef CONFIG_USER_ONLY
 if (with_sr0) {
@@ -664,11 

[PATCH 03/45] target/hppa: Move constant destination check into use_goto_tb

2024-04-24 Thread Richard Henderson
Share this check between gen_goto_tb and hppa_tr_translate_insn.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 42fa480950..cb874e1c1e 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -661,9 +661,10 @@ static bool gen_illegal(DisasContext *ctx)
 } while (0)
 #endif
 
-static bool use_goto_tb(DisasContext *ctx, uint64_t dest)
+static bool use_goto_tb(DisasContext *ctx, uint64_t bofs, uint64_t nofs)
 {
-return translator_use_goto_tb(>base, dest);
+return (bofs != -1 && nofs != -1 &&
+translator_use_goto_tb(>base, bofs));
 }
 
 /* If the next insn is to be nullified, and it's on the same page,
@@ -677,16 +678,16 @@ static bool use_nullify_skip(DisasContext *ctx)
 }
 
 static void gen_goto_tb(DisasContext *ctx, int which,
-uint64_t f, uint64_t b)
+uint64_t b, uint64_t n)
 {
-if (f != -1 && b != -1 && use_goto_tb(ctx, f)) {
+if (use_goto_tb(ctx, b, n)) {
 tcg_gen_goto_tb(which);
-copy_iaoq_entry(ctx, cpu_iaoq_f, f, NULL);
-copy_iaoq_entry(ctx, cpu_iaoq_b, b, NULL);
+copy_iaoq_entry(ctx, cpu_iaoq_f, b, NULL);
+copy_iaoq_entry(ctx, cpu_iaoq_b, n, NULL);
 tcg_gen_exit_tb(ctx->base.tb, which);
 } else {
-copy_iaoq_entry(ctx, cpu_iaoq_f, f, cpu_iaoq_b);
-copy_iaoq_entry(ctx, cpu_iaoq_b, b, ctx->iaoq_n_var);
+copy_iaoq_entry(ctx, cpu_iaoq_f, b, cpu_iaoq_b);
+copy_iaoq_entry(ctx, cpu_iaoq_b, n, ctx->iaoq_n_var);
 tcg_gen_lookup_and_goto_ptr();
 }
 }
@@ -4743,8 +4744,7 @@ static void hppa_tr_translate_insn(DisasContextBase *dcbase, CPUState *cs)
 /* Advance the insn queue.  Note that this check also detects
a priority change within the instruction queue.  */
 if (ret == DISAS_NEXT && ctx->iaoq_b != ctx->iaoq_f + 4) {
-if (ctx->iaoq_b != -1 && ctx->iaoq_n != -1
-&& use_goto_tb(ctx, ctx->iaoq_b)
+if (use_goto_tb(ctx, ctx->iaoq_b, ctx->iaoq_n)
 && (ctx->null_cond.c == TCG_COND_NEVER
 || ctx->null_cond.c == TCG_COND_ALWAYS)) {
 nullify_set(ctx, ctx->null_cond.c == TCG_COND_ALWAYS);
-- 
2.34.1




[PATCH 13/45] target/hppa: Add space arguments to install_iaq_entries

2024-04-24 Thread Richard Henderson
Move space assignments to a central location.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 58 +++--
 1 file changed, 27 insertions(+), 31 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 43a74dafcf..6b3b298678 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -623,8 +623,9 @@ static void copy_iaoq_entry(DisasContext *ctx, TCGv_i64 dest,
 }
 }
 
-static void install_iaq_entries(DisasContext *ctx, uint64_t bi, TCGv_i64 bv,
-uint64_t ni, TCGv_i64 nv)
+static void install_iaq_entries(DisasContext *ctx,
+uint64_t bi, TCGv_i64 bv, TCGv_i64 bs,
+uint64_t ni, TCGv_i64 nv, TCGv_i64 ns)
 {
 copy_iaoq_entry(ctx, cpu_iaoq_f, bi, bv);
 
@@ -638,6 +639,12 @@ static void install_iaq_entries(DisasContext *ctx, uint64_t bi, TCGv_i64 bv,
 tcg_gen_andi_i64(cpu_iaoq_b, cpu_iaoq_b,
  gva_offset_mask(ctx->tb_flags));
 }
+if (bs) {
+tcg_gen_mov_i64(cpu_iasq_f, bs);
+}
+if (ns || bs) {
+tcg_gen_mov_i64(cpu_iasq_b, ns ? ns : bs);
+}
 }
 
 static void install_link(DisasContext *ctx, unsigned link, bool with_sr0)
@@ -669,7 +676,8 @@ static void gen_excp_1(int exception)
 
 static void gen_excp(DisasContext *ctx, int exception)
 {
-install_iaq_entries(ctx, ctx->iaoq_f, cpu_iaoq_f, ctx->iaoq_b, cpu_iaoq_b);
+install_iaq_entries(ctx, ctx->iaoq_f, cpu_iaoq_f, NULL,
+ctx->iaoq_b, cpu_iaoq_b, NULL);
 nullify_save(ctx);
 gen_excp_1(exception);
 ctx->base.is_jmp = DISAS_NORETURN;
@@ -723,10 +731,11 @@ static void gen_goto_tb(DisasContext *ctx, int which,
 {
 if (use_goto_tb(ctx, b, n)) {
 tcg_gen_goto_tb(which);
-install_iaq_entries(ctx, b, NULL, n, NULL);
+install_iaq_entries(ctx, b, NULL, NULL, n, NULL, NULL);
 tcg_gen_exit_tb(ctx->base.tb, which);
 } else {
-install_iaq_entries(ctx, b, cpu_iaoq_b, n, ctx->iaoq_n_var);
+install_iaq_entries(ctx, b, cpu_iaoq_b, ctx->iasq_b,
+n, ctx->iaoq_n_var, ctx->iasq_n);
 tcg_gen_lookup_and_goto_ptr();
 }
 }
@@ -1915,7 +1924,7 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 install_link(ctx, link, false);
 if (is_n) {
 if (use_nullify_skip(ctx)) {
-install_iaq_entries(ctx, -1, next, -1, NULL);
+install_iaq_entries(ctx, -1, next, NULL, -1, NULL, NULL);
 nullify_set(ctx, 0);
 ctx->base.is_jmp = DISAS_IAQ_N_UPDATED;
 return true;
@@ -1934,10 +1943,11 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest,
 
 install_link(ctx, link, false);
 if (is_n && use_nullify_skip(ctx)) {
-install_iaq_entries(ctx, -1, next, -1, NULL);
+install_iaq_entries(ctx, -1, next, NULL, -1, NULL, NULL);
 nullify_set(ctx, 0);
 } else {
-install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, -1, next);
+install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, ctx->iasq_b,
+-1, next, NULL);
 nullify_set(ctx, is_n);
 }
 
@@ -2025,7 +2035,7 @@ static void do_page_zero(DisasContext *ctx)
 tcg_gen_st_i64(cpu_gr[26], tcg_env, offsetof(CPUHPPAState, cr[27]));
 tmp = tcg_temp_new_i64();
 tcg_gen_ori_i64(tmp, cpu_gr[31], 3);
-install_iaq_entries(ctx, -1, tmp, -1, NULL);
+install_iaq_entries(ctx, -1, tmp, NULL, -1, NULL, NULL);
 ctx->base.is_jmp = DISAS_IAQ_N_UPDATED;
 break;
 
@@ -2769,8 +2779,8 @@ static bool trans_or(DisasContext *ctx, arg_rrr_cf_d *a)
 nullify_over(ctx);
 
 /* Advance the instruction queue.  */
-install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b,
-ctx->iaoq_n, ctx->iaoq_n_var);
+install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, ctx->iasq_b,
+ctx->iaoq_n, ctx->iaoq_n_var, ctx->iasq_n);
 nullify_set(ctx, 0);
 
 /* Tell the qemu main loop to halt until this cpu has work.  */
@@ -3920,16 +3930,11 @@ static bool trans_be(DisasContext *ctx, arg_be *a)
 load_spr(ctx, new_spc, a->sp);
 install_link(ctx, a->l, true);
 if (a->n && use_nullify_skip(ctx)) {
-install_iaq_entries(ctx, -1, tmp, -1, NULL);
-tcg_gen_mov_i64(cpu_iasq_f, new_spc);
-tcg_gen_mov_i64(cpu_iasq_b, new_spc);
+install_iaq_entries(ctx, -1, tmp, new_spc, -1, NULL, new_spc);
 nullify_set(ctx, 0);
 } else {
-install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, -1, tmp);
-if (ctx->iasq_b) {
-tcg_gen_mov_i64(cpu_iasq_f, ctx->iasq_b);
-}
-tcg_gen_mov_i64(cpu_iasq_b, new_spc);
+install_iaq_entries(ctx, ctx->iaoq_b, cpu_iaoq_b, ctx->iasq_b,
+  

[PATCH 20/45] target/hppa: Use TCG_COND_TST* in do_cond

2024-04-24 Thread Richard Henderson
We can directly test bits of a 32-bit comparison without
zero or sign-extending an intermediate result.
We can directly test bit 0 for odd/even.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 64 ++---
 1 file changed, 28 insertions(+), 36 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index a1132c884f..85941f191f 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -774,28 +774,36 @@ static bool cond_need_cb(int c)
 static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
  TCGv_i64 res, TCGv_i64 uv, TCGv_i64 sv)
 {
+TCGCond sign_cond, zero_cond;
+uint64_t sign_imm, zero_imm;
 DisasCond cond;
 TCGv_i64 tmp;
 
+if (d) {
+/* 64-bit condition. */
+sign_imm = 0;
+sign_cond = TCG_COND_LT;
+zero_imm = 0;
+zero_cond = TCG_COND_EQ;
+} else {
+/* 32-bit condition. */
+sign_imm = 1ull << 31;
+sign_cond = TCG_COND_TSTNE;
+zero_imm = UINT32_MAX;
+zero_cond = TCG_COND_TSTEQ;
+}
+
 switch (cf >> 1) {
 case 0: /* Never / TR(0 / 1) */
 cond = cond_make_f();
 break;
 case 1: /* = / <>(Z / !Z) */
-if (!d) {
-tmp = tcg_temp_new_i64();
-tcg_gen_ext32u_i64(tmp, res);
-res = tmp;
-}
-cond = cond_make_vi(TCG_COND_EQ, res, 0);
+cond = cond_make_vi(zero_cond, res, zero_imm);
 break;
 case 2: /* < / >=(N ^ V / !(N ^ V) */
 tmp = tcg_temp_new_i64();
 tcg_gen_xor_i64(tmp, res, sv);
-if (!d) {
-tcg_gen_ext32s_i64(tmp, tmp);
-}
-cond = cond_make_ti(TCG_COND_LT, tmp, 0);
+cond = cond_make_ti(sign_cond, tmp, sign_imm);
 break;
 case 3: /* <= / >(N ^ V) | Z / !((N ^ V) | Z) */
 /*
@@ -803,21 +811,15 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
  *   (N ^ V) | Z
  *   ((res < 0) ^ (sv < 0)) | !res
  *   ((res ^ sv) < 0) | !res
- *   (~(res ^ sv) >= 0) | !res
- *   !(~(res ^ sv) >> 31) | !res
- *   !(~(res ^ sv) >> 31 & res)
+ *   ((res ^ sv) < 0 ? 1 : !res)
+ *   !((res ^ sv) < 0 ? 0 : res)
  */
 tmp = tcg_temp_new_i64();
-tcg_gen_eqv_i64(tmp, res, sv);
-if (!d) {
-tcg_gen_sextract_i64(tmp, tmp, 31, 1);
-tcg_gen_and_i64(tmp, tmp, res);
-tcg_gen_ext32u_i64(tmp, tmp);
-} else {
-tcg_gen_sari_i64(tmp, tmp, 63);
-tcg_gen_and_i64(tmp, tmp, res);
-}
-cond = cond_make_ti(TCG_COND_EQ, tmp, 0);
+tcg_gen_xor_i64(tmp, res, sv);
+tcg_gen_movcond_i64(sign_cond, tmp,
+tmp, tcg_constant_i64(sign_imm),
+ctx->zero, res);
+cond = cond_make_ti(zero_cond, tmp, zero_imm);
 break;
 case 4: /* NUV / UV  (!UV / UV) */
 cond = cond_make_vi(TCG_COND_EQ, uv, 0);
@@ -825,23 +827,13 @@ static DisasCond do_cond(DisasContext *ctx, unsigned cf, bool d,
 case 5: /* ZNV / VNZ (!UV | Z / UV & !Z) */
 tmp = tcg_temp_new_i64();
 tcg_gen_movcond_i64(TCG_COND_EQ, tmp, uv, ctx->zero, ctx->zero, res);
-if (!d) {
-tcg_gen_ext32u_i64(tmp, tmp);
-}
-cond = cond_make_ti(TCG_COND_EQ, tmp, 0);
+cond = cond_make_ti(zero_cond, tmp, zero_imm);
 break;
 case 6: /* SV / NSV  (V / !V) */
-if (!d) {
-tmp = tcg_temp_new_i64();
-tcg_gen_ext32s_i64(tmp, sv);
-sv = tmp;
-}
-cond = cond_make_ti(TCG_COND_LT, sv, 0);
+cond = cond_make_vi(sign_cond, sv, sign_imm);
 break;
 case 7: /* OD / EV */
-tmp = tcg_temp_new_i64();
-tcg_gen_andi_i64(tmp, res, 1);
-cond = cond_make_ti(TCG_COND_NE, tmp, 0);
+cond = cond_make_vi(TCG_COND_TSTNE, res, 1);
 break;
 default:
 g_assert_not_reached();
-- 
2.34.1




[PATCH 00/45] target/hppa: Misc improvements

2024-04-24 Thread Richard Henderson
Most of the patches lead up to implementing CF_PCREL.
Along the way there is a grab bag of code updates (TCG_COND_TST*),
bug fixes (space changes during branch-in-branch-delay-slot),
and implementation of features (PSW bits B, X, T, H, L).

Sven reported that PSW L tripped up HP/UX, so possibly there's
something wrong there, but that's right at the end of the patch set.
So I'd like some feedback on the rest leading up to that too.


r~


Richard Henderson (45):
  target/hppa: Move cpu_get_tb_cpu_state out of line
  target/hppa: Use hppa_form_gva_psw in hppa_cpu_get_pc
  target/hppa: Move constant destination check into use_goto_tb
  target/hppa: Pass displacement to do_dbranch
  target/hppa: Allow prior nullification in do_ibranch
  target/hppa: Use CF_BP_PAGE instead of cpu_breakpoint_test
  target/hppa: Add install_iaq_entries
  target/hppa: Add install_link
  target/hppa: Delay computation of IAQ_Next
  target/hppa: Skip nullified insns in unconditional dbranch path
  target/hppa: Simplify TB end
  target/hppa: Add IASQ entries to DisasContext
  target/hppa: Add space arguments to install_iaq_entries
  target/hppa: Add space argument to do_ibranch
  target/hppa: Use umax in do_ibranch_priv
  target/hppa: Always make a copy in do_ibranch_priv
  target/hppa: Introduce and use DisasIAQE for branch management
  target/hppa: Use displacements in DisasIAQE
  target/hppa: Rename cond_make_* helpers
  target/hppa: Use TCG_COND_TST* in do_cond
  target/hppa: Use TCG_COND_TST* in do_log_cond
  target/hppa: Use TCG_COND_TST* in do_unit_zero_cond
  target/hppa: Use TCG_COND_TST* in do_unit_addsub
  target/hppa: Use TCG_COND_TST* in trans_bb_imm
  target/hppa: Use registerfields.h for FPSR
  target/hppa: Use TCG_COND_TST* in trans_ftest
  target/hppa: Remove cond_free
  target/hppa: Introduce DisasDelayException
  target/hppa: Use delay_excp for conditional traps
  target/hppa: Use delay_excp for conditional trap on overflow
  linux-user/hppa: Force all code addresses to PRIV_USER
  target/hppa: Store full iaoq_f and page bits of iaoq_d in TB
  target/hppa: Do not mask in copy_iaoq_entry
  target/hppa: Improve hppa_cpu_dump_state
  target/hppa: Split PSW X and B into their own field
  target/hppa: Manage PSW_X and PSW_B in translator
  target/hppa: Implement PSW_B
  target/hppa: Implement PSW_X
  target/hppa: Drop tlb_entry return from hppa_get_physical_address
  target/hppa: Adjust priv for B,GATE at runtime
  target/hppa: Implement CF_PCREL
  target/hppa: Implement PSW_T
  target/hppa: Implement PSW_H, PSW_L
  target/hppa: Log cpu state at interrupt
  target/hppa: Log cpu state on return-from-interrupt

 linux-user/hppa/target_cpu.h |4 +-
 target/hppa/cpu.h|   80 +--
 target/hppa/helper.h |3 +-
 linux-user/elfload.c |4 +-
 linux-user/hppa/cpu_loop.c   |   14 +-
 linux-user/hppa/signal.c |6 +-
 target/hppa/cpu.c|   92 ++-
 target/hppa/fpu_helper.c |   26 +-
 target/hppa/gdbstub.c|6 +
 target/hppa/helper.c |   66 +-
 target/hppa/int_helper.c |   33 +-
 target/hppa/mem_helper.c |   99 +--
 target/hppa/op_helper.c  |   17 +-
 target/hppa/sys_helper.c |   12 +
 target/hppa/translate.c  | 1232 ++
 15 files changed, 947 insertions(+), 747 deletions(-)

-- 
2.34.1




[PATCH 16/45] target/hppa: Always make a copy in do_ibranch_priv

2024-04-24 Thread Richard Henderson
This simplifies callers, which might otherwise have
to make another copy.

Signed-off-by: Richard Henderson 
---
 target/hppa/translate.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index 7e01c21141..dd5193cb6a 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -1966,18 +1966,17 @@ static bool do_ibranch(DisasContext *ctx, TCGv_i64 dest, TCGv_i64 dspc,
  */
 static TCGv_i64 do_ibranch_priv(DisasContext *ctx, TCGv_i64 offset)
 {
-TCGv_i64 dest;
+TCGv_i64 dest = tcg_temp_new_i64();
 switch (ctx->privilege) {
 case 0:
 /* Privilege 0 is maximum and is allowed to decrease.  */
-return offset;
+tcg_gen_mov_i64(dest, offset);
+break;
 case 3:
 /* Privilege 3 is minimum and is never allowed to increase.  */
-dest = tcg_temp_new_i64();
 tcg_gen_ori_i64(dest, offset, 3);
 break;
 default:
-dest = tcg_temp_new_i64();
 tcg_gen_andi_i64(dest, offset, -4);
 tcg_gen_ori_i64(dest, dest, ctx->privilege);
 tcg_gen_umax_i64(dest, dest, offset);
-- 
2.34.1




[PATCH 02/45] target/hppa: Use hppa_form_gva_psw in hppa_cpu_get_pc

2024-04-24 Thread Richard Henderson
This function is for log_pc(), which needs to produce a
similar result to cpu_get_tb_cpu_state().

Signed-off-by: Richard Henderson 
---
 target/hppa/cpu.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/target/hppa/cpu.c b/target/hppa/cpu.c
index 1d5f5086bf..7315567910 100644
--- a/target/hppa/cpu.c
+++ b/target/hppa/cpu.c
@@ -38,9 +38,10 @@ static void hppa_cpu_set_pc(CPUState *cs, vaddr value)
 
 static vaddr hppa_cpu_get_pc(CPUState *cs)
 {
-HPPACPU *cpu = HPPA_CPU(cs);
+CPUHPPAState *env = cpu_env(cs);
 
-return cpu->env.iaoq_f;
+return hppa_form_gva_psw(env->psw, (env->psw & PSW_C ? env->iasq_f : 0),
+ env->iaoq_f & -4);
 }
 
 void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
@@ -61,8 +62,7 @@ void cpu_get_tb_cpu_state(CPUHPPAState *env, vaddr *pc,
 flags |= env->psw & (PSW_W | PSW_C | PSW_D | PSW_P);
 flags |= (env->iaoq_f & 3) << TB_FLAG_PRIV_SHIFT;
 
-*pc = hppa_form_gva_psw(env->psw, (env->psw & PSW_C ? env->iasq_f : 0),
-env->iaoq_f & -4);
+*pc = hppa_cpu_get_pc(env_cpu(env));
 *cs_base = env->iasq_f;
 
 /* Insert a difference between IAOQ_B and IAOQ_F within the otherwise zero
-- 
2.34.1




[PATCH 2/5] target/alpha: Hoist branch shift to initial decode

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/alpha/translate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index a97cd54f0c..52c2e6248b 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -432,7 +432,7 @@ static bool use_goto_tb(DisasContext *ctx, uint64_t dest)
 
 static DisasJumpType gen_bdirect(DisasContext *ctx, int ra, int32_t disp)
 {
-uint64_t dest = ctx->base.pc_next + (disp << 2);
+uint64_t dest = ctx->base.pc_next + disp;
 
 if (ra != 31) {
 tcg_gen_movi_i64(ctx->ir[ra], ctx->base.pc_next);
@@ -455,7 +455,7 @@ static DisasJumpType gen_bdirect(DisasContext *ctx, int ra, 
int32_t disp)
 static DisasJumpType gen_bcond_internal(DisasContext *ctx, TCGCond cond,
 TCGv cmp, uint64_t imm, int32_t disp)
 {
-uint64_t dest = ctx->base.pc_next + (disp << 2);
+uint64_t dest = ctx->base.pc_next + disp;
 TCGLabel *lab_true = gen_new_label();
 
 if (use_goto_tb(ctx, dest)) {
@@ -1382,7 +1382,7 @@ static DisasJumpType translate_one(DisasContext *ctx, 
uint32_t insn)
 real_islit = islit = extract32(insn, 12, 1);
 lit = extract32(insn, 13, 8);
 
-disp21 = sextract32(insn, 0, 21);
+disp21 = sextract32(insn, 0, 21) * 4;
 disp16 = sextract32(insn, 0, 16);
 disp12 = sextract32(insn, 0, 12);
 
-- 
2.34.1




[PATCH 1/5] target/alpha: Use cpu_env in preference to ALPHA_CPU

2024-04-24 Thread Richard Henderson
ALPHA_CPU has a dynamic object type assert, which is
unnecessary considering that these are all class hooks.

Signed-off-by: Richard Henderson 
---
 target/alpha/cpu.c| 15 ++-
 target/alpha/helper.c |  8 
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index 05f9ee41e9..f98d022671 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -28,25 +28,22 @@
 
 static void alpha_cpu_set_pc(CPUState *cs, vaddr value)
 {
-AlphaCPU *cpu = ALPHA_CPU(cs);
-
-cpu->env.pc = value;
+CPUAlphaState *env = cpu_env(cs);
+env->pc = value;
 }
 
 static vaddr alpha_cpu_get_pc(CPUState *cs)
 {
-AlphaCPU *cpu = ALPHA_CPU(cs);
-
-return cpu->env.pc;
+CPUAlphaState *env = cpu_env(cs);
+return env->pc;
 }
 
 static void alpha_restore_state_to_opc(CPUState *cs,
const TranslationBlock *tb,
const uint64_t *data)
 {
-AlphaCPU *cpu = ALPHA_CPU(cs);
-
-cpu->env.pc = data[0];
+CPUAlphaState *env = cpu_env(cs);
+env->pc = data[0];
 }
 
 static bool alpha_cpu_has_work(CPUState *cs)
diff --git a/target/alpha/helper.c b/target/alpha/helper.c
index d6d4353edd..c5e4958f8b 100644
--- a/target/alpha/helper.c
+++ b/target/alpha/helper.c
@@ -124,7 +124,7 @@ void alpha_cpu_record_sigsegv(CPUState *cs, vaddr address,
   MMUAccessType access_type,
   bool maperr, uintptr_t retaddr)
 {
-AlphaCPU *cpu = ALPHA_CPU(cs);
+CPUAlphaState *env = cpu_env(cs);
 target_ulong mmcsr, cause;
 
 /* Assuming !maperr, infer the missing protection. */
@@ -155,9 +155,9 @@ void alpha_cpu_record_sigsegv(CPUState *cs, vaddr address,
 }
 
 /* Record the arguments that PALcode would give to the kernel. */
-cpu->env.trap_arg0 = address;
-cpu->env.trap_arg1 = mmcsr;
-cpu->env.trap_arg2 = cause;
+env->trap_arg0 = address;
+env->trap_arg1 = mmcsr;
+env->trap_arg2 = cause;
 }
 #else
 /* Returns the OSF/1 entMM failure indication, or -1 on success.  */
-- 
2.34.1




[PATCH 0/5] target/alpha: Implement CF_PCREL

2024-04-24 Thread Richard Henderson
Implement pc-relative tcg code generation.

r~

Richard Henderson (5):
  target/alpha: Use cpu_env in preference to ALPHA_CPU
  target/alpha: Hoist branch shift to initial decode
  target/alpha: Split out gen_goto_tb
  target/alpha: Split out gen_pc_disp
  target/alpha: Implement CF_PCREL

 target/alpha/cpu.c   |  32 ---
 target/alpha/helper.c|   8 +--
 target/alpha/translate.c | 117 +--
 3 files changed, 91 insertions(+), 66 deletions(-)

-- 
2.34.1




[PATCH 5/5] target/alpha: Implement CF_PCREL

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/alpha/cpu.c   | 23 ++-
 target/alpha/translate.c | 29 +
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/target/alpha/cpu.c b/target/alpha/cpu.c
index f98d022671..0e2fbcb397 100644
--- a/target/alpha/cpu.c
+++ b/target/alpha/cpu.c
@@ -38,12 +38,27 @@ static vaddr alpha_cpu_get_pc(CPUState *cs)
 return env->pc;
 }
 
+static void alpha_cpu_synchronize_from_tb(CPUState *cs,
+  const TranslationBlock *tb)
+{
+/* The program counter is always up to date with CF_PCREL. */
+if (!(tb_cflags(tb) & CF_PCREL)) {
+CPUAlphaState *env = cpu_env(cs);
+env->pc = tb->pc;
+}
+}
+
 static void alpha_restore_state_to_opc(CPUState *cs,
const TranslationBlock *tb,
const uint64_t *data)
 {
 CPUAlphaState *env = cpu_env(cs);
-env->pc = data[0];
+
+if (tb_cflags(tb) & CF_PCREL) {
+env->pc = (env->pc & TARGET_PAGE_MASK) | data[0];
+} else {
+env->pc = data[0];
+}
 }
 
 static bool alpha_cpu_has_work(CPUState *cs)
@@ -78,6 +93,11 @@ static void alpha_cpu_realizefn(DeviceState *dev, Error 
**errp)
 AlphaCPUClass *acc = ALPHA_CPU_GET_CLASS(dev);
 Error *local_err = NULL;
 
+#ifndef CONFIG_USER_ONLY
+/* Use pc-relative instructions in system-mode */
+cs->tcg_cflags |= CF_PCREL;
+#endif
+
 cpu_exec_realizefn(cs, &local_err);
 if (local_err != NULL) {
 error_propagate(errp, local_err);
@@ -190,6 +210,7 @@ static const struct SysemuCPUOps alpha_sysemu_ops = {
 
 static const TCGCPUOps alpha_tcg_ops = {
 .initialize = alpha_translate_init,
+.synchronize_from_tb = alpha_cpu_synchronize_from_tb,
 .restore_state_to_opc = alpha_restore_state_to_opc,
 
 #ifdef CONFIG_USER_ONLY
diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index 86402d96d5..db847e7a23 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -54,6 +54,9 @@ struct DisasContext {
 uint32_t tbflags;
 int mem_idx;
 
+/* True if generating pc-relative code.  */
+bool pcrel;
+
 /* implver and amask values for this CPU.  */
 int implver;
 int amask;
@@ -254,7 +257,12 @@ static void st_flag_byte(TCGv val, unsigned shift)
 
 static void gen_pc_disp(DisasContext *ctx, TCGv dest, int32_t disp)
 {
-tcg_gen_movi_i64(dest, ctx->base.pc_next + disp);
+uint64_t addr = ctx->base.pc_next + disp;
+if (ctx->pcrel) {
+tcg_gen_addi_i64(dest, cpu_pc, addr - ctx->base.pc_first);
+} else {
+tcg_gen_movi_i64(dest, addr);
+}
 }
 
 static void gen_excp_1(int exception, int error_code)
@@ -433,8 +441,14 @@ static DisasJumpType gen_store_conditional(DisasContext 
*ctx, int ra, int rb,
 static void gen_goto_tb(DisasContext *ctx, int idx, int32_t disp)
 {
 if (translator_use_goto_tb(&ctx->base, ctx->base.pc_next + disp)) {
-tcg_gen_goto_tb(idx);
-gen_pc_disp(ctx, cpu_pc, disp);
+/* With PCREL, PC must always be up-to-date. */
+if (ctx->pcrel) {
+gen_pc_disp(ctx, cpu_pc, disp);
+tcg_gen_goto_tb(idx);
+} else {
+tcg_gen_goto_tb(idx);
+gen_pc_disp(ctx, cpu_pc, disp);
+}
 tcg_gen_exit_tb(ctx->base.tb, idx);
 } else {
 gen_pc_disp(ctx, cpu_pc, disp);
@@ -2852,6 +2866,7 @@ static void alpha_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cpu)
 
 ctx->tbflags = ctx->base.tb->flags;
 ctx->mem_idx = alpha_env_mmu_index(env);
+ctx->pcrel = ctx->base.tb->cflags & CF_PCREL;
 ctx->implver = env->implver;
 ctx->amask = env->amask;
 
@@ -2887,7 +2902,13 @@ static void alpha_tr_tb_start(DisasContextBase *db, 
CPUState *cpu)
 
 static void alpha_tr_insn_start(DisasContextBase *dcbase, CPUState *cpu)
 {
-tcg_gen_insn_start(dcbase->pc_next);
+DisasContext *ctx = container_of(dcbase, DisasContext, base);
+
+if (ctx->pcrel) {
+tcg_gen_insn_start(dcbase->pc_next & ~TARGET_PAGE_MASK);
+} else {
+tcg_gen_insn_start(dcbase->pc_next);
+}
 }
 
 static void alpha_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
-- 
2.34.1




[PATCH 4/5] target/alpha: Split out gen_pc_disp

2024-04-24 Thread Richard Henderson
Prepare for pcrel by not modifying cpu_pc before use,
in the case of JSR.

Signed-off-by: Richard Henderson 
---
 target/alpha/translate.c | 41 ++--
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index c1a55e5153..86402d96d5 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -252,6 +252,11 @@ static void st_flag_byte(TCGv val, unsigned shift)
 tcg_gen_st8_i64(val, tcg_env, get_flag_ofs(shift));
 }
 
+static void gen_pc_disp(DisasContext *ctx, TCGv dest, int32_t disp)
+{
+tcg_gen_movi_i64(dest, ctx->base.pc_next + disp);
+}
+
 static void gen_excp_1(int exception, int error_code)
 {
 TCGv_i32 tmp1, tmp2;
@@ -263,7 +268,7 @@ static void gen_excp_1(int exception, int error_code)
 
 static DisasJumpType gen_excp(DisasContext *ctx, int exception, int error_code)
 {
-tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
+gen_pc_disp(ctx, cpu_pc, 0);
 gen_excp_1(exception, error_code);
 return DISAS_NORETURN;
 }
@@ -427,14 +432,12 @@ static DisasJumpType gen_store_conditional(DisasContext 
*ctx, int ra, int rb,
 
 static void gen_goto_tb(DisasContext *ctx, int idx, int32_t disp)
 {
-uint64_t dest = ctx->base.pc_next + disp;
-
-if (translator_use_goto_tb(&ctx->base, dest)) {
+if (translator_use_goto_tb(&ctx->base, ctx->base.pc_next + disp)) {
 tcg_gen_goto_tb(idx);
-tcg_gen_movi_i64(cpu_pc, dest);
+gen_pc_disp(ctx, cpu_pc, disp);
 tcg_gen_exit_tb(ctx->base.tb, idx);
 } else {
-tcg_gen_movi_i64(cpu_pc, dest);
+gen_pc_disp(ctx, cpu_pc, disp);
 tcg_gen_lookup_and_goto_ptr();
 }
 }
@@ -442,7 +445,7 @@ static void gen_goto_tb(DisasContext *ctx, int idx, int32_t 
disp)
 static DisasJumpType gen_bdirect(DisasContext *ctx, int ra, int32_t disp)
 {
 if (ra != 31) {
-tcg_gen_movi_i64(ctx->ir[ra], ctx->base.pc_next);
+gen_pc_disp(ctx, ctx->ir[ra], 0);
 }
 
 /* Notice branch-to-next; used to initialize RA with the PC.  */
@@ -1091,7 +1094,7 @@ static DisasJumpType gen_call_pal(DisasContext *ctx, int 
palcode)
 }
 
 /* Allow interrupts to be recognized right away.  */
-tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
+gen_pc_disp(ctx, cpu_pc, 0);
 return DISAS_PC_UPDATED_NOCHAIN;
 
 case 0x36:
@@ -1138,19 +1141,17 @@ static DisasJumpType gen_call_pal(DisasContext *ctx, 
int palcode)
 #else
 {
 TCGv tmp = tcg_temp_new();
-uint64_t exc_addr = ctx->base.pc_next;
-uint64_t entry = ctx->palbr;
+uint64_t entry;
 
+gen_pc_disp(ctx, tmp, 0);
 if (ctx->tbflags & ENV_FLAG_PAL_MODE) {
-exc_addr |= 1;
+tcg_gen_ori_i64(tmp, tmp, 1);
 } else {
-tcg_gen_movi_i64(tmp, 1);
-st_flag_byte(tmp, ENV_FLAG_PAL_SHIFT);
+st_flag_byte(tcg_constant_i64(1), ENV_FLAG_PAL_SHIFT);
 }
-
-tcg_gen_movi_i64(tmp, exc_addr);
 tcg_gen_st_i64(tmp, tcg_env, offsetof(CPUAlphaState, exc_addr));
 
+entry = ctx->palbr;
 entry += (palcode & 0x80
   ? 0x2000 + (palcode - 0x80) * 64
   : 0x1000 + palcode * 64);
@@ -2344,9 +2345,13 @@ static DisasJumpType translate_one(DisasContext *ctx, 
uint32_t insn)
 /* JMP, JSR, RET, JSR_COROUTINE.  These only differ by the branch
prediction stack action, which of course we don't implement.  */
 vb = load_gpr(ctx, rb);
-tcg_gen_andi_i64(cpu_pc, vb, ~3);
 if (ra != 31) {
-tcg_gen_movi_i64(ctx->ir[ra], ctx->base.pc_next);
+tmp = tcg_temp_new();
+tcg_gen_andi_i64(tmp, vb, ~3);
+gen_pc_disp(ctx, ctx->ir[ra], 0);
+tcg_gen_mov_i64(cpu_pc, tmp);
+} else {
+tcg_gen_andi_i64(cpu_pc, vb, ~3);
 }
 ret = DISAS_PC_UPDATED;
 break;
@@ -2908,7 +2913,7 @@ static void alpha_tr_tb_stop(DisasContextBase *dcbase, 
CPUState *cpu)
 gen_goto_tb(ctx, 0, 0);
 break;
 case DISAS_PC_STALE:
-tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
+gen_pc_disp(ctx, cpu_pc, 0);
 /* FALLTHRU */
 case DISAS_PC_UPDATED:
 tcg_gen_lookup_and_goto_ptr();
-- 
2.34.1




[PATCH 3/5] target/alpha: Split out gen_goto_tb

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/alpha/translate.c | 61 ++--
 1 file changed, 21 insertions(+), 40 deletions(-)

diff --git a/target/alpha/translate.c b/target/alpha/translate.c
index 52c2e6248b..c1a55e5153 100644
--- a/target/alpha/translate.c
+++ b/target/alpha/translate.c
@@ -425,60 +425,45 @@ static DisasJumpType gen_store_conditional(DisasContext 
*ctx, int ra, int rb,
 return DISAS_NEXT;
 }
 
-static bool use_goto_tb(DisasContext *ctx, uint64_t dest)
+static void gen_goto_tb(DisasContext *ctx, int idx, int32_t disp)
 {
-return translator_use_goto_tb(&ctx->base, dest);
+uint64_t dest = ctx->base.pc_next + disp;
+
+if (translator_use_goto_tb(&ctx->base, dest)) {
+tcg_gen_goto_tb(idx);
+tcg_gen_movi_i64(cpu_pc, dest);
+tcg_gen_exit_tb(ctx->base.tb, idx);
+} else {
+tcg_gen_movi_i64(cpu_pc, dest);
+tcg_gen_lookup_and_goto_ptr();
+}
 }
 
 static DisasJumpType gen_bdirect(DisasContext *ctx, int ra, int32_t disp)
 {
-uint64_t dest = ctx->base.pc_next + disp;
-
 if (ra != 31) {
 tcg_gen_movi_i64(ctx->ir[ra], ctx->base.pc_next);
 }
 
 /* Notice branch-to-next; used to initialize RA with the PC.  */
 if (disp == 0) {
-return 0;
-} else if (use_goto_tb(ctx, dest)) {
-tcg_gen_goto_tb(0);
-tcg_gen_movi_i64(cpu_pc, dest);
-tcg_gen_exit_tb(ctx->base.tb, 0);
-return DISAS_NORETURN;
-} else {
-tcg_gen_movi_i64(cpu_pc, dest);
-return DISAS_PC_UPDATED;
+return DISAS_NEXT;
 }
+gen_goto_tb(ctx, 0, disp);
+return DISAS_NORETURN;
 }
 
 static DisasJumpType gen_bcond_internal(DisasContext *ctx, TCGCond cond,
 TCGv cmp, uint64_t imm, int32_t disp)
 {
-uint64_t dest = ctx->base.pc_next + disp;
 TCGLabel *lab_true = gen_new_label();
 
-if (use_goto_tb(ctx, dest)) {
-tcg_gen_brcondi_i64(cond, cmp, imm, lab_true);
+tcg_gen_brcondi_i64(cond, cmp, imm, lab_true);
+gen_goto_tb(ctx, 0, 0);
+gen_set_label(lab_true);
+gen_goto_tb(ctx, 1, disp);
 
-tcg_gen_goto_tb(0);
-tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
-tcg_gen_exit_tb(ctx->base.tb, 0);
-
-gen_set_label(lab_true);
-tcg_gen_goto_tb(1);
-tcg_gen_movi_i64(cpu_pc, dest);
-tcg_gen_exit_tb(ctx->base.tb, 1);
-
-return DISAS_NORETURN;
-} else {
-TCGv_i64 i = tcg_constant_i64(imm);
-TCGv_i64 d = tcg_constant_i64(dest);
-TCGv_i64 p = tcg_constant_i64(ctx->base.pc_next);
-
-tcg_gen_movcond_i64(cond, cpu_pc, cmp, i, d, p);
-return DISAS_PC_UPDATED;
-}
+return DISAS_NORETURN;
 }
 
 static DisasJumpType gen_bcond(DisasContext *ctx, TCGCond cond, int ra,
@@ -2920,12 +2905,8 @@ static void alpha_tr_tb_stop(DisasContextBase *dcbase, 
CPUState *cpu)
 case DISAS_NORETURN:
 break;
 case DISAS_TOO_MANY:
-if (use_goto_tb(ctx, ctx->base.pc_next)) {
-tcg_gen_goto_tb(0);
-tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
-tcg_gen_exit_tb(ctx->base.tb, 0);
-}
-/* FALLTHRU */
+gen_goto_tb(ctx, 0, 0);
+break;
 case DISAS_PC_STALE:
 tcg_gen_movi_i64(cpu_pc, ctx->base.pc_next);
 /* FALLTHRU */
-- 
2.34.1




[PATCH v2 13/33] plugins: Use DisasContextBase for qemu_plugin_insn_haddr

2024-04-24 Thread Richard Henderson
We can delay the computation of haddr until the plugin
actually requests it.

Signed-off-by: Richard Henderson 
---
 include/qemu/plugin.h  |  4 
 accel/tcg/plugin-gen.c | 20 
 plugins/api.c  | 25 -
 3 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h
index 03081be543..3db0e75d16 100644
--- a/include/qemu/plugin.h
+++ b/include/qemu/plugin.h
@@ -98,7 +98,6 @@ struct qemu_plugin_dyn_cb {
 /* Internal context for instrumenting an instruction */
 struct qemu_plugin_insn {
 uint64_t vaddr;
-void *haddr;
 GArray *insn_cbs;
 GArray *mem_cbs;
 uint8_t len;
@@ -119,9 +118,6 @@ struct qemu_plugin_tb {
 GPtrArray *insns;
 size_t n;
 uint64_t vaddr;
-uint64_t vaddr2;
-void *haddr1;
-void *haddr2;
 
 /* if set, the TB calls helpers that might access guest memory */
 bool mem_helper;
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index a4656859c6..b036773d3c 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -319,9 +319,6 @@ bool plugin_gen_tb_start(CPUState *cpu, const 
DisasContextBase *db)
 ret = true;
 
 ptb->vaddr = db->pc_first;
-ptb->vaddr2 = -1;
-ptb->haddr1 = db->host_addr[0];
-ptb->haddr2 = NULL;
 ptb->mem_helper = false;
 
 tcg_gen_plugin_cb(PLUGIN_GEN_FROM_TB);
@@ -363,23 +360,6 @@ void plugin_gen_insn_start(CPUState *cpu, const 
DisasContextBase *db)
 pc = db->pc_next;
 insn->vaddr = pc;
 
-/*
- * Detect page crossing to get the new host address.
- * Note that we skip this when haddr1 == NULL, e.g. when we're
- * fetching instructions from a region not backed by RAM.
- */
-if (ptb->haddr1 == NULL) {
-insn->haddr = NULL;
-} else if (is_same_page(db, db->pc_next)) {
-insn->haddr = ptb->haddr1 + pc - ptb->vaddr;
-} else {
-if (ptb->vaddr2 == -1) {
-ptb->vaddr2 = TARGET_PAGE_ALIGN(db->pc_first);
-get_page_addr_code_hostp(cpu_env(cpu), ptb->vaddr2, >haddr2);
-}
-insn->haddr = ptb->haddr2 + pc - ptb->vaddr2;
-}
-
 tcg_gen_plugin_cb(PLUGIN_GEN_FROM_INSN);
 }
 
diff --git a/plugins/api.c b/plugins/api.c
index 39895a1cb1..4b6690c7d6 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -242,7 +242,30 @@ uint64_t qemu_plugin_insn_vaddr(const struct 
qemu_plugin_insn *insn)
 
 void *qemu_plugin_insn_haddr(const struct qemu_plugin_insn *insn)
 {
-return insn->haddr;
+const DisasContextBase *db = tcg_ctx->plugin_db;
+vaddr page0_last = db->pc_first | ~TARGET_PAGE_MASK;
+
+if (db->fake_insn) {
+return NULL;
+}
+
+/*
+ * ??? The return value is not intended for use of host memory,
+ * but as a proxy for address space and physical address.
+ * Thus we are only interested in the first byte and do not
+ * care about spanning pages.
+ */
+if (insn->vaddr <= page0_last) {
+if (db->host_addr[0] == NULL) {
+return NULL;
+}
+return db->host_addr[0] + insn->vaddr - db->pc_first;
+} else {
+if (db->host_addr[1] == NULL) {
+return NULL;
+}
+return db->host_addr[1] + insn->vaddr - (page0_last + 1);
+}
 }
 
 char *qemu_plugin_insn_disas(const struct qemu_plugin_insn *insn)
-- 
2.34.1




[PATCH v2 07/33] accel/tcg: Record when translator_fake_ldb is used

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 include/exec/translator.h | 3 ++-
 accel/tcg/translator.c| 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index 974cc4f9c4..e92dfba035 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -72,8 +72,8 @@ typedef enum DisasJumpType {
  * @num_insns: Number of translated instructions (including current).
  * @max_insns: Maximum number of instructions to be translated in this TB.
  * @singlestep_enabled: "Hardware" single stepping enabled.
- * @saved_can_do_io: Known value of cpu->neg.can_do_io, or -1 for unknown.
  * @plugin_enabled: TCG plugin enabled in this TB.
+ * @fake_insn: True if translator_fake_ldb used.
  * @insn_start: The last op emitted by the insn_start hook,
  *  which is expected to be INDEX_op_insn_start.
  *
@@ -88,6 +88,7 @@ typedef struct DisasContextBase {
 int max_insns;
 bool singlestep_enabled;
 bool plugin_enabled;
+bool fake_insn;
 struct TCGOp *insn_start;
 void *host_addr[2];
 
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index a3c246ea37..6863455ed9 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -129,6 +129,7 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, 
int *max_insns,
 db->max_insns = *max_insns;
 db->singlestep_enabled = cflags & CF_SINGLE_STEP;
 db->insn_start = NULL;
+db->fake_insn = false;
 db->host_addr[0] = host_pc;
 db->host_addr[1] = NULL;
 db->record_start = 0;
@@ -433,6 +434,7 @@ uint64_t translator_ldq(CPUArchState *env, DisasContextBase 
*db, vaddr pc)
 void translator_fake_ldb(DisasContextBase *db, vaddr pc, uint8_t insn8)
 {
 assert(pc >= db->pc_first);
+db->fake_insn = true;
 record_save(db, pc, &insn8, sizeof(insn8));
 plugin_insn_append(pc, &insn8, sizeof(insn8));
 }
-- 
2.34.1




[PATCH v2 18/33] disas: Split disas.c

2024-04-24 Thread Richard Henderson
The routines in disas-common.c are also used from disas-mon.c.
Otherwise the rest of disassembly is only used from tcg.
While we're at it, put host and target code into separate files.

Signed-off-by: Richard Henderson 
---
 disas/disas-internal.h |   4 +
 include/disas/disas.h  |   4 +
 disas/disas-common.c   | 117 ++
 disas/disas-host.c | 129 
 disas/disas-target.c   |  84 ++
 disas/disas.c  | 337 -
 disas/objdump.c|  37 +
 disas/meson.build  |   8 +-
 8 files changed, 381 insertions(+), 339 deletions(-)
 create mode 100644 disas/disas-common.c
 create mode 100644 disas/disas-host.c
 create mode 100644 disas/disas-target.c
 delete mode 100644 disas/disas.c
 create mode 100644 disas/objdump.c

diff --git a/disas/disas-internal.h b/disas/disas-internal.h
index 84a01f126f..ed32e704cc 100644
--- a/disas/disas-internal.h
+++ b/disas/disas-internal.h
@@ -14,8 +14,12 @@ typedef struct CPUDebug {
 CPUState *cpu;
 } CPUDebug;
 
+void disas_initialize_debug(CPUDebug *s);
 void disas_initialize_debug_target(CPUDebug *s, CPUState *cpu);
 int disas_gstring_printf(FILE *stream, const char *fmt, ...)
 G_GNUC_PRINTF(2, 3);
 
+int print_insn_od_host(bfd_vma pc, disassemble_info *info);
+int print_insn_od_target(bfd_vma pc, disassemble_info *info);
+
 #endif
diff --git a/include/disas/disas.h b/include/disas/disas.h
index 176775eff7..54a5e68443 100644
--- a/include/disas/disas.h
+++ b/include/disas/disas.h
@@ -2,13 +2,17 @@
 #define QEMU_DISAS_H
 
 /* Disassemble this for me please... (debugging). */
+#ifdef CONFIG_TCG
 void disas(FILE *out, const void *code, size_t size);
 void target_disas(FILE *out, CPUState *cpu, uint64_t code, size_t size);
+#endif
 
 void monitor_disas(Monitor *mon, CPUState *cpu, uint64_t pc,
int nb_insn, bool is_physical);
 
+#ifdef CONFIG_PLUGIN
 char *plugin_disas(CPUState *cpu, uint64_t addr, size_t size);
+#endif
 
 /* Look up symbol for debugging purpose.  Returns "" if unknown. */
 const char *lookup_symbol(uint64_t orig_addr);
diff --git a/disas/disas-common.c b/disas/disas-common.c
new file mode 100644
index 00..e4118a381f
--- /dev/null
+++ b/disas/disas-common.c
@@ -0,0 +1,117 @@
+/*
+ * Common routines for disassembly.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "disas/disas.h"
+#include "disas/capstone.h"
+#include "hw/core/cpu.h"
+#include "exec/memory.h"
+#include "disas-internal.h"
+
+
+/* Filled in by elfload.c.  Simplistic, but will do for now. */
+struct syminfo *syminfos = NULL;
+
+/*
+ * Get LENGTH bytes from info's buffer, at target address memaddr.
+ * Transfer them to myaddr.
+ */
+static int target_read_memory(bfd_vma memaddr, bfd_byte *myaddr, int length,
+  struct disassemble_info *info)
+{
+CPUDebug *s = container_of(info, CPUDebug, info);
+int r = cpu_memory_rw_debug(s->cpu, memaddr, myaddr, length, 0);
+return r ? EIO : 0;
+}
+
+/*
+ * Print an error message.  We can assume that this is in response to
+ * an error return from {host,target}_read_memory.
+ */
+static void perror_memory(int status, bfd_vma memaddr,
+  struct disassemble_info *info)
+{
+if (status != EIO) {
+/* Can't happen.  */
+info->fprintf_func(info->stream, "Unknown error %d\n", status);
+} else {
+/* Address between memaddr and memaddr + len was out of bounds.  */
+info->fprintf_func(info->stream,
+   "Address 0x%" PRIx64 " is out of bounds.\n",
+   memaddr);
+}
+}
+
+/* Print address in hex. */
+static void print_address(bfd_vma addr, struct disassemble_info *info)
+{
+info->fprintf_func(info->stream, "0x%" PRIx64, addr);
+}
+
+/* Stub prevents some fruitless searching in optabs disassemblers. */
+static int symbol_at_address(bfd_vma addr, struct disassemble_info *info)
+{
+return 1;
+}
+
+void disas_initialize_debug(CPUDebug *s)
+{
+memset(s, 0, sizeof(*s));
+s->info.arch = bfd_arch_unknown;
+s->info.cap_arch = -1;
+s->info.cap_insn_unit = 4;
+s->info.cap_insn_split = 4;
+s->info.memory_error_func = perror_memory;
+s->info.symbol_at_address_func = symbol_at_address;
+}
+
+void disas_initialize_debug_target(CPUDebug *s, CPUState *cpu)
+{
+disas_initialize_debug(s);
+
+s->cpu = cpu;
+s->info.read_memory_func = target_read_memory;
+s->info.print_address_func = print_address;
+if (target_words_bigendian()) {
+s->info.endian = BFD_ENDIAN_BIG;
+} else {
+s->info.endian =  BFD_ENDIAN_LITTLE;
+}
+
+CPUClass *cc = CPU_GET_CLASS(cpu);
+if (cc->disas_set_info) {
+cc->disas_set_info(cpu, &s->info);
+}
+}
+
+int disas_gstring_printf(FILE *stream, const char *fmt, ...)
+{
+/* We abuse the FILE parameter to pass a GString. */
+GString *s = (GString *)stream;
+int 

[PATCH v2 02/33] accel/tcg: Hide in_same_page outside of a target-specific context

2024-04-24 Thread Richard Henderson
While there are other methods that could be used to replace
TARGET_PAGE_MASK, the function is not really required outside
the context of target-specific translation.

This makes the header usable by target independent code.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/exec/translator.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index 51489c181c..c6a9e4b69a 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -232,6 +232,7 @@ translator_ldq_swap(CPUArchState *env, DisasContextBase *db,
  */
 void translator_fake_ldb(uint8_t insn8, vaddr pc);
 
+#ifdef NEED_CPU_H
 /*
  * Return whether addr is on the same page as where disassembly started.
  * Translators can use this to enforce the rule that only single-insn
@@ -241,5 +242,6 @@ static inline bool is_same_page(const DisasContextBase *db, 
vaddr addr)
 {
 return ((addr ^ db->pc_first) & TARGET_PAGE_MASK) == 0;
 }
+#endif
 
 #endif /* EXEC__TRANSLATOR_H */
-- 
2.34.1




[PATCH v2 11/33] plugins: Use translator_st for qemu_plugin_insn_data

2024-04-24 Thread Richard Henderson
Use the bytes that we record for the entire TB, rather than
a per-insn GByteArray.  Record the length of the insn in
plugin_gen_insn_end rather than inferring from the length
of the array.

Signed-off-by: Richard Henderson 
---
 include/qemu/plugin.h  | 14 +-
 accel/tcg/plugin-gen.c |  7 +--
 accel/tcg/translator.c | 26 --
 plugins/api.c  | 12 +++-
 tcg/tcg.c  |  3 +--
 5 files changed, 14 insertions(+), 48 deletions(-)

diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h
index 07b1755990..c32bb97667 100644
--- a/include/qemu/plugin.h
+++ b/include/qemu/plugin.h
@@ -97,11 +97,11 @@ struct qemu_plugin_dyn_cb {
 
 /* Internal context for instrumenting an instruction */
 struct qemu_plugin_insn {
-GByteArray *data;
 uint64_t vaddr;
 void *haddr;
 GArray *insn_cbs;
 GArray *mem_cbs;
+uint8_t len;
 bool calls_helpers;
 
 /* if set, the instruction calls helpers that might access guest memory */
@@ -116,18 +116,6 @@ struct qemu_plugin_scoreboard {
 QLIST_ENTRY(qemu_plugin_scoreboard) entry;
 };
 
-/*
- * qemu_plugin_insn allocate and cleanup functions. We don't expect to
- * cleanup many of these structures. They are reused for each fresh
- * translation.
- */
-
-static inline void qemu_plugin_insn_cleanup_fn(gpointer data)
-{
-struct qemu_plugin_insn *insn = (struct qemu_plugin_insn *) data;
-g_byte_array_free(insn->data, true);
-}
-
 /* Internal context for this TranslationBlock */
 struct qemu_plugin_tb {
 GPtrArray *insns;
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index 94bbad6dc7..be2451be58 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -346,11 +346,9 @@ void plugin_gen_insn_start(CPUState *cpu, const 
DisasContextBase *db)
 ptb->n = n;
 if (n <= ptb->insns->len) {
 insn = g_ptr_array_index(ptb->insns, n - 1);
-g_byte_array_set_size(insn->data, 0);
 } else {
 assert(n - 1 == ptb->insns->len);
 insn = g_new0(struct qemu_plugin_insn, 1);
-insn->data = g_byte_array_sized_new(4);
 g_ptr_array_add(ptb->insns, insn);
 }
 
@@ -389,6 +387,11 @@ void plugin_gen_insn_start(CPUState *cpu, const 
DisasContextBase *db)
 
 void plugin_gen_insn_end(void)
 {
+const DisasContextBase *db = tcg_ctx->plugin_db;
+struct qemu_plugin_insn *pinsn = tcg_ctx->plugin_insn;
+
+pinsn->len = db->fake_insn ? db->record_len : db->pc_next - pinsn->vaddr;
+
 tcg_gen_plugin_cb(PLUGIN_GEN_AFTER_INSN);
 }
 
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 7f63a8085d..df73312f99 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -408,27 +408,6 @@ bool translator_st(const DisasContextBase *db, void *dest,
 return false;
 }
 
-static void plugin_insn_append(vaddr pc, const void *from, size_t size)
-{
-#ifdef CONFIG_PLUGIN
-struct qemu_plugin_insn *insn = tcg_ctx->plugin_insn;
-size_t off;
-
-if (insn == NULL) {
-return;
-}
-off = pc - insn->vaddr;
-if (off < insn->data->len) {
-g_byte_array_set_size(insn->data, off);
-} else if (off > insn->data->len) {
-/* we have an unexpected gap */
-g_assert_not_reached();
-}
-
-insn->data = g_byte_array_append(insn->data, from, size);
-#endif
-}
-
 uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, vaddr pc)
 {
 uint8_t raw;
@@ -437,7 +416,6 @@ uint8_t translator_ldub(CPUArchState *env, DisasContextBase 
*db, vaddr pc)
 raw = cpu_ldub_code(env, pc);
 record_save(db, pc, &raw, sizeof(raw));
 }
-plugin_insn_append(pc, &raw, sizeof(raw));
 return raw;
 }
 
@@ -452,7 +430,6 @@ uint16_t translator_lduw(CPUArchState *env, 
DisasContextBase *db, vaddr pc)
 raw = tswap16(tgt);
 record_save(db, pc, &raw, sizeof(raw));
 }
-plugin_insn_append(pc, &raw, sizeof(raw));
 return tgt;
 }
 
@@ -467,7 +444,6 @@ uint32_t translator_ldl(CPUArchState *env, DisasContextBase 
*db, vaddr pc)
 raw = tswap32(tgt);
 record_save(db, pc, &raw, sizeof(raw));
 }
-plugin_insn_append(pc, &raw, sizeof(raw));
 return tgt;
 }
 
@@ -482,7 +458,6 @@ uint64_t translator_ldq(CPUArchState *env, DisasContextBase 
*db, vaddr pc)
 raw = tswap64(tgt);
 record_save(db, pc, &raw, sizeof(raw));
 }
-plugin_insn_append(pc, &raw, sizeof(raw));
 return tgt;
 }
 
@@ -491,5 +466,4 @@ void translator_fake_ldb(DisasContextBase *db, vaddr pc, 
uint8_t insn8)
 assert(pc >= db->pc_first);
 db->fake_insn = true;
 record_save(db, pc, &insn8, sizeof(insn8));
-plugin_insn_append(pc, &insn8, sizeof(insn8));
 }
diff --git a/plugins/api.c b/plugins/api.c
index 4e9125ea29..7b8b7523b3 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -43,6 +43,7 @@
 #include "exec/exec-all.h"
 #include "exec/gdbstub.h"
 #include "exec/ram_addr.h"
+#include "exec/translator.h"
 #include "disas/disas.h"
 #include "plugin.h"
 #ifndef CONFIG_USER_ONLY

[PATCH v2 06/33] accel/tcg: Record mmio bytes during translation

2024-04-24 Thread Richard Henderson
This will be able to replace plugin_insn_append, and will
be usable for disassembly.

Signed-off-by: Richard Henderson 
---
 include/exec/translator.h | 12 
 accel/tcg/translator.c| 41 +++
 2 files changed, 53 insertions(+)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index 83fe66cba0..974cc4f9c4 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -90,6 +90,18 @@ typedef struct DisasContextBase {
 bool plugin_enabled;
 struct TCGOp *insn_start;
 void *host_addr[2];
+
+/*
+ * Record insn data that we cannot read directly from host memory.
+ * There are only two reasons we cannot use host memory:
+ * (1) We are executing from I/O,
+ * (2) We are executing a synthetic instruction (s390x EX).
+ * In both cases we need record exactly one instruction,
+ * and thus the maximum amount of data we record is limited.
+ */
+int record_start;
+int record_len;
+uint8_t record[32];
 } DisasContextBase;
 
 /**
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index c3f4d0e252..a3c246ea37 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -131,6 +131,8 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
 db->insn_start = NULL;
 db->host_addr[0] = host_pc;
 db->host_addr[1] = NULL;
+db->record_start = 0;
+db->record_len = 0;
 
 ops->init_disas_context(db, cpu);
 tcg_debug_assert(db->is_jmp == DISAS_NEXT);  /* no early exit */
@@ -317,6 +319,39 @@ static bool translator_ld(CPUArchState *env, DisasContextBase *db,
 return true;
 }
 
+static void record_save(DisasContextBase *db, vaddr pc,
+const void *from, int size)
+{
+int offset;
+
+/* Do not record probes before the start of TB. */
+if (pc < db->pc_first) {
+return;
+}
+
+/*
+ * In translator_access, we verified that pc is within 2 pages
+ * of pc_first, thus this will never overflow.
+ */
+offset = pc - db->pc_first;
+
+/*
+ * Either the first or second page may be I/O.  If it is the second,
+ * then the first byte we need to record will be at a non-zero offset.
+ * In either case, we should only need to record a single insn.
+ */
+if (db->record_len == 0) {
+db->record_start = offset;
+db->record_len = size;
+} else {
+assert(offset == db->record_start + db->record_len);
+assert(db->record_len + size <= sizeof(db->record));
+db->record_len += size;
+}
+
+memcpy(db->record + (offset - db->record_start), from, size);
+}
+
 static void plugin_insn_append(vaddr pc, const void *from, size_t size)
 {
 #ifdef CONFIG_PLUGIN
@@ -344,6 +379,7 @@ uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, vaddr pc)
 
 if (!translator_ld(env, db, &raw, pc, sizeof(raw))) {
 raw = cpu_ldub_code(env, pc);
+record_save(db, pc, &raw, sizeof(raw));
 }
 plugin_insn_append(pc, &raw, sizeof(raw));
 return raw;
@@ -358,6 +394,7 @@ uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, vaddr pc)
 } else {
 tgt = cpu_lduw_code(env, pc);
 raw = tswap16(tgt);
+record_save(db, pc, &raw, sizeof(raw));
 }
 plugin_insn_append(pc, &raw, sizeof(raw));
 return tgt;
@@ -372,6 +409,7 @@ uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, vaddr pc)
 } else {
 tgt = cpu_ldl_code(env, pc);
 raw = tswap32(tgt);
+record_save(db, pc, &raw, sizeof(raw));
 }
 plugin_insn_append(pc, &raw, sizeof(raw));
 return tgt;
@@ -386,6 +424,7 @@ uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, vaddr pc)
 } else {
 tgt = cpu_ldq_code(env, pc);
 raw = tswap64(tgt);
+record_save(db, pc, &raw, sizeof(raw));
 }
 plugin_insn_append(pc, &raw, sizeof(raw));
 return tgt;
@@ -393,5 +432,7 @@ uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, vaddr pc)
 
 void translator_fake_ldb(DisasContextBase *db, vaddr pc, uint8_t insn8)
 {
+assert(pc >= db->pc_first);
+record_save(db, pc, &insn8, sizeof(insn8));
 plugin_insn_append(pc, &insn8, sizeof(insn8));
 }
-- 
2.34.1




[PATCH v2 09/33] plugins: Copy memory in qemu_plugin_insn_data

2024-04-24 Thread Richard Henderson
Instead of returning a host pointer, copy the data into
storage provided by the caller.

Signed-off-by: Richard Henderson 
---
 include/qemu/qemu-plugin.h | 15 +++
 contrib/plugins/execlog.c  |  5 +++--
 contrib/plugins/howvec.c   |  4 ++--
 plugins/api.c  |  7 +--
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/include/qemu/qemu-plugin.h b/include/qemu/qemu-plugin.h
index 4fc6c3739b..5f36c2d1ac 100644
--- a/include/qemu/qemu-plugin.h
+++ b/include/qemu/qemu-plugin.h
@@ -61,7 +61,7 @@ typedef uint64_t qemu_plugin_id_t;
 
 extern QEMU_PLUGIN_EXPORT int qemu_plugin_version;
 
-#define QEMU_PLUGIN_VERSION 2
+#define QEMU_PLUGIN_VERSION 3
 
 /**
  * struct qemu_info_t - system information for plugins
@@ -394,17 +394,16 @@ struct qemu_plugin_insn *
 qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx);
 
 /**
- * qemu_plugin_insn_data() - return ptr to instruction data
+ * qemu_plugin_insn_data() - copy instruction data
  * @insn: opaque instruction handle from qemu_plugin_tb_get_insn()
+ * @dest: destination into which data is copied
+ * @len: length of dest
  *
- * Note: data is only valid for duration of callback. See
- * qemu_plugin_insn_size() to calculate size of stream.
- *
- * Returns: pointer to a stream of bytes containing the value of this
- * instructions opcode.
+ * Returns the number of bytes copied, minimum of @len and insn size.
  */
 QEMU_PLUGIN_API
-const void *qemu_plugin_insn_data(const struct qemu_plugin_insn *insn);
+size_t qemu_plugin_insn_data(const struct qemu_plugin_insn *insn,
+ void *dest, size_t len);
 
 /**
  * qemu_plugin_insn_size() - return size of instruction
diff --git a/contrib/plugins/execlog.c b/contrib/plugins/execlog.c
index fab18113d4..371db97eb1 100644
--- a/contrib/plugins/execlog.c
+++ b/contrib/plugins/execlog.c
@@ -258,8 +258,9 @@ static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
NULL);
 }
 } else {
-uint32_t insn_opcode;
-insn_opcode = *((uint32_t *)qemu_plugin_insn_data(insn));
+uint32_t insn_opcode = 0;
+qemu_plugin_insn_data(insn, &insn_opcode, sizeof(insn_opcode));
+
 char *output = g_strdup_printf("0x%"PRIx64", 0x%"PRIx32", \"%s\"",
insn_vaddr, insn_opcode, 
insn_disas);
 
diff --git a/contrib/plugins/howvec.c b/contrib/plugins/howvec.c
index 94bbc53820..9be67f7453 100644
--- a/contrib/plugins/howvec.c
+++ b/contrib/plugins/howvec.c
@@ -252,7 +252,7 @@ static struct qemu_plugin_scoreboard *find_counter(
 {
 int i;
 uint64_t *cnt = NULL;
-uint32_t opcode;
+uint32_t opcode = 0;
 InsnClassExecCount *class = NULL;
 
 /*
@@ -261,7 +261,7 @@ static struct qemu_plugin_scoreboard *find_counter(
  * They would probably benefit from a more tailored plugin.
  * However we can fall back to individual instruction counting.
  */
-opcode = *((uint32_t *)qemu_plugin_insn_data(insn));
+qemu_plugin_insn_data(insn, &opcode, sizeof(opcode));
 
 for (i = 0; !cnt && i < class_table_sz; i++) {
 class = &class_table[i];
diff --git a/plugins/api.c b/plugins/api.c
index 3912c9cc8f..4e9125ea29 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -216,9 +216,12 @@ qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)
  * instruction being translated.
  */
 
-const void *qemu_plugin_insn_data(const struct qemu_plugin_insn *insn)
+size_t qemu_plugin_insn_data(const struct qemu_plugin_insn *insn,
+ void *dest, size_t len)
 {
-return insn->data->data;
+len = MIN(len, insn->data->len);
+memcpy(dest, insn->data->data, len);
+return len;
 }
 
 size_t qemu_plugin_insn_size(const struct qemu_plugin_insn *insn)
-- 
2.34.1




[PATCH v2 29/33] target/riscv: Use translator_ld* for everything

2024-04-24 Thread Richard Henderson
Signed-off-by: Richard Henderson 
---
 target/riscv/translate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index c999e942e1..2c27fd4ce1 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -20,7 +20,6 @@
 #include "qemu/log.h"
 #include "cpu.h"
 #include "tcg/tcg-op.h"
-#include "exec/cpu_ldst.h"
 #include "exec/exec-all.h"
 #include "exec/helper-proto.h"
 #include "exec/helper-gen.h"
@@ -1082,7 +1081,7 @@ static uint32_t opcode_at(DisasContextBase *dcbase, target_ulong pc)
 CPUState *cpu = ctx->cs;
 CPURISCVState *env = cpu_env(cpu);
 
-return cpu_ldl_code(env, pc);
+return translator_ldl(env, &ctx->base, pc);
 }
 
 /* Include insn module translation function */
@@ -1243,7 +1242,8 @@ static void riscv_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
 unsigned page_ofs = ctx->base.pc_next & ~TARGET_PAGE_MASK;
 
 if (page_ofs > TARGET_PAGE_SIZE - MAX_INSN_LEN) {
-uint16_t next_insn = cpu_lduw_code(env, ctx->base.pc_next);
+uint16_t next_insn =
+translator_lduw(env, &ctx->base, ctx->base.pc_next);
 int len = insn_len(next_insn);
 
 if (!is_same_page(>base, ctx->base.pc_next + len - 1)) {
-- 
2.34.1




[PATCH v2 28/33] target/cris: Use cris_fetch in translate_v10.c.inc

2024-04-24 Thread Richard Henderson
Reviewed-by: Edgar E. Iglesias 
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 target/cris/translate.c |  1 -
 target/cris/translate_v10.c.inc | 30 +-
 2 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/target/cris/translate.c b/target/cris/translate.c
index bb2d6612ba..a30c67eb07 100644
--- a/target/cris/translate.c
+++ b/target/cris/translate.c
@@ -29,7 +29,6 @@
 #include "tcg/tcg-op.h"
 #include "exec/helper-proto.h"
 #include "mmu.h"
-#include "exec/cpu_ldst.h"
 #include "exec/translator.h"
 #include "crisv32-decode.h"
 #include "qemu/qemu-print.h"
diff --git a/target/cris/translate_v10.c.inc b/target/cris/translate_v10.c.inc
index 73fc27c15d..c15ff47505 100644
--- a/target/cris/translate_v10.c.inc
+++ b/target/cris/translate_v10.c.inc
@@ -165,20 +165,7 @@ static int dec10_prep_move_m(CPUCRISState *env, DisasContext *dc,
 
 /* Load [$rs] onto T1.  */
 if (is_imm) {
-if (memsize != 4) {
-if (s_ext) {
-if (memsize == 1)
-imm = cpu_ldsb_code(env, dc->pc + 2);
-else
-imm = cpu_ldsw_code(env, dc->pc + 2);
-} else {
-if (memsize == 1)
-imm = cpu_ldub_code(env, dc->pc + 2);
-else
-imm = cpu_lduw_code(env, dc->pc + 2);
-}
-} else
-imm = cpu_ldl_code(env, dc->pc + 2);
+imm = cris_fetch(env, dc, dc->pc + 2, memsize, s_ext);
 
 tcg_gen_movi_tl(dst, imm);
 
@@ -929,10 +916,11 @@ static int dec10_dip(CPUCRISState *env, DisasContext *dc)
 LOG_DIS("dip pc=%x opcode=%d r%d r%d\n",
   dc->pc, dc->opcode, dc->src, dc->dst);
 if (dc->src == 15) {
-imm = cpu_ldl_code(env, dc->pc + 2);
+imm = cris_fetch(env, dc, dc->pc + 2, 4, 0);
 tcg_gen_movi_tl(cpu_PR[PR_PREFIX], imm);
-if (dc->postinc)
+if (dc->postinc) {
 insn_len += 4;
+}
 tcg_gen_addi_tl(cpu_R[15], cpu_R[15], insn_len - 2);
 } else {
 gen_load(dc, cpu_PR[PR_PREFIX], cpu_R[dc->src], 4, 0);
@@ -1095,10 +1083,10 @@ static unsigned int dec10_ind(CPUCRISState *env, DisasContext *dc)
 if (dc->src == 15) {
 LOG_DIS("jump.%d %d r%d r%d direct\n", size,
  dc->opcode, dc->src, dc->dst);
-imm = cpu_ldl_code(env, dc->pc + 2);
-if (dc->mode == CRISV10_MODE_AUTOINC)
+imm = cris_fetch(env, dc, dc->pc + 2, size, 0);
+if (dc->mode == CRISV10_MODE_AUTOINC) {
 insn_len += size;
-
+}
 c = tcg_constant_tl(dc->pc + insn_len);
 t_gen_mov_preg_TN(dc, dc->dst, c);
 dc->jmp_pc = imm;
@@ -1164,7 +1152,7 @@ static unsigned int dec10_ind(CPUCRISState *env, DisasContext *dc)
 case CRISV10_IND_BCC_M:
 
 cris_cc_mask(dc, 0);
-simm = cpu_ldsw_code(env, dc->pc + 2);
+simm = cris_fetch(env, dc, dc->pc + 2, 2, 1);
 simm += 4;
 
 LOG_DIS("bcc_m: b%s %x\n", cc_name(dc->cond), dc->pc + simm);
@@ -1185,7 +1173,7 @@ static unsigned int crisv10_decoder(CPUCRISState *env, DisasContext *dc)
 unsigned int insn_len = 2;
 
 /* Load a halfword onto the instruction register.  */
-dc->ir = cpu_lduw_code(env, dc->pc);
+dc->ir = cris_fetch(env, dc, dc->pc, 2, 0);
 
 /* Now decode it.  */
 dc->opcode   = EXTRACT_FIELD(dc->ir, 6, 9);
-- 
2.34.1




[PATCH v2 00/33] accel/tcg: Improve disassembly for target and plugin

2024-04-24 Thread Richard Henderson
Based-on: 20240424230224.941028-1-richard.hender...@linaro.org
("[PATCH v3 00/20] Rewrite plugin code generation")

Rebase only.
Reviews required for: 
  04-accel-tcg-Reorg-translator_ld.patch
  06-accel-tcg-Record-mmio-bytes-during-translation.patch
  07-accel-tcg-Record-when-translator_fake_ldb-is-used.patch
  08-accel-tcg-Record-DisasContextBase-in-tcg_ctx-for-.patch
  09-plugins-Copy-memory-in-qemu_plugin_insn_data.patch
  10-accel-tcg-Implement-translator_st.patch
  11-plugins-Use-translator_st-for-qemu_plugin_insn_da.patch
  12-plugins-Read-mem_only-directly-from-TB-cflags.patch
  13-plugins-Use-DisasContextBase-for-qemu_plugin_insn.patch
  15-plugins-Merge-alloc_tcg_plugin_context-into-plugi.patch
  18-disas-Split-disas.c.patch
  19-disas-Use-translator_st-to-get-disassembly-data.patch
  21-target-s390x-Fix-translator_fake_ld-length.patch
  22-target-s390x-Disassemble-EXECUTEd-instructions.patch
  23-target-hexagon-Use-translator_ldl-in-pkt_crosses_.patch
  29-target-riscv-Use-translator_ld-for-everything.patch
  32-target-s390x-Use-translator_lduw-in-get_next_pc.patch


r~


Philippe Mathieu-Daudé (1):
  accel/tcg: Remove cpu_ldsb_code / cpu_ldsw_code

Richard Henderson (32):
  accel/tcg: Use vaddr in translator_ld*
  accel/tcg: Hide in_same_page outside of a target-specific context
  accel/tcg: Pass DisasContextBase to translator_fake_ldb
  accel/tcg: Reorg translator_ld*
  accel/tcg: Cap the translation block when we encounter mmio
  accel/tcg: Record mmio bytes during translation
  accel/tcg: Record when translator_fake_ldb is used
  accel/tcg: Record DisasContextBase in tcg_ctx for plugins
  plugins: Copy memory in qemu_plugin_insn_data
  accel/tcg: Implement translator_st
  plugins: Use translator_st for qemu_plugin_insn_data
  plugins: Read mem_only directly from TB cflags
  plugins: Use DisasContextBase for qemu_plugin_insn_haddr
  plugins: Use DisasContextBase for qemu_plugin_tb_vaddr
  plugins: Merge alloc_tcg_plugin_context into plugin_gen_tb_start
  accel/tcg: Provide default implementation of disas_log
  accel/tcg: Return bool from TranslatorOps.disas_log
  disas: Split disas.c
  disas: Use translator_st to get disassembly data
  accel/tcg: Introduce translator_fake_ld
  target/s390x: Fix translator_fake_ld length
  target/s390x: Disassemble EXECUTEd instructions
  target/hexagon: Use translator_ldl in pkt_crosses_page
  target/microblaze: Use translator_ldl
  target/i386: Use translator_ldub for everything
  target/avr: Use translator_lduw
  target/cris: Use translator_ld* in cris_fetch
  target/cris: Use cris_fetch in translate_v10.c.inc
  target/riscv: Use translator_ld* for everything
  target/rx: Use translator_ld*
  target/xtensa: Use translator_ldub in xtensa_insn_len
  target/s390x: Use translator_lduw in get_next_pc

 disas/disas-internal.h   |   4 +
 include/disas/disas.h|   9 +-
 include/exec/cpu_ldst.h  |  10 -
 include/exec/plugin-gen.h|   7 +-
 include/exec/translator.h|  71 +--
 include/qemu/plugin.h|  22 +-
 include/qemu/qemu-plugin.h   |  15 +-
 include/qemu/typedefs.h  |   1 +
 include/tcg/tcg.h|   1 +
 accel/tcg/plugin-gen.c   |  63 +++---
 accel/tcg/translator.c   | 331 +++---
 contrib/plugins/execlog.c|   5 +-
 contrib/plugins/howvec.c |   4 +-
 disas/disas-common.c | 103 ++
 disas/disas-host.c   | 129 
 disas/disas-mon.c|  15 ++
 disas/disas-target.c |  99 +
 disas/disas.c| 337 ---
 disas/objdump.c  |  37 
 plugins/api.c|  57 --
 target/alpha/translate.c |   9 -
 target/arm/tcg/translate-a64.c   |  11 -
 target/arm/tcg/translate.c   |  12 --
 target/avr/translate.c   |  11 +-
 target/cris/translate.c  |  37 +---
 target/hexagon/translate.c   |  11 +-
 target/hppa/translate.c  |  21 +-
 target/i386/tcg/translate.c  |  19 +-
 target/loongarch/tcg/translate.c |   8 -
 target/m68k/translate.c  |   9 -
 target/microblaze/translate.c|  11 +-
 target/mips/tcg/translate.c  |   9 -
 target/openrisc/translate.c  |  11 -
 target/ppc/translate.c   |   9 -
 target/riscv/translate.c |  24 +--
 target/rx/translate.c|  35 ++--
 target/s390x/tcg/translate.c |  26 ++-
 target/sh4/translate.c   |   9 -
 target/sparc/translate.c |   9 -
 target/tricore/translate.c   |   9 -
 target/xtensa/translate.c|  12 +-
 tcg/tcg.c|  12 --
 target/cris/translate_v10.c.inc  |  30 +--
 disas/meson.build|   8 +-
 44 files changed, 818 insertions(+), 864 deletions(-)
 create mode 100644 disas/disas-common.c
 create mode 100644 disas/disas-host.c
 create mode 100644 disas/disas-target.c
 delete mode 100644 disas/disas.c
 create 

[PATCH v2 31/33] target/xtensa: Use translator_ldub in xtensa_insn_len

2024-04-24 Thread Richard Henderson
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 target/xtensa/translate.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/xtensa/translate.c b/target/xtensa/translate.c
index 42109d33ad..75b7bfda4c 100644
--- a/target/xtensa/translate.c
+++ b/target/xtensa/translate.c
@@ -35,7 +35,6 @@
 #include "tcg/tcg-op.h"
 #include "qemu/log.h"
 #include "qemu/qemu-print.h"
-#include "exec/cpu_ldst.h"
 #include "semihosting/semihost.h"
 #include "exec/translator.h"
 
@@ -1118,7 +1117,7 @@ static void disas_xtensa_insn(CPUXtensaState *env, DisasContext *dc)
 
 static inline unsigned xtensa_insn_len(CPUXtensaState *env, DisasContext *dc)
 {
-uint8_t b0 = cpu_ldub_code(env, dc->pc);
+uint8_t b0 = translator_ldub(env, &dc->base, dc->pc);
 return xtensa_op0_insn_len(dc, b0);
 }
 
-- 
2.34.1




[PATCH v2 03/33] accel/tcg: Pass DisasContextBase to translator_fake_ldb

2024-04-24 Thread Richard Henderson
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/exec/translator.h| 5 +++--
 accel/tcg/translator.c   | 2 +-
 target/s390x/tcg/translate.c | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index c6a9e4b69a..83fe66cba0 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -222,15 +222,16 @@ translator_ldq_swap(CPUArchState *env, DisasContextBase *db,
 
 /**
  * translator_fake_ldb - fake instruction load
- * @insn8: byte of instruction
+ * @db: Disassembly context
  * @pc: program counter of instruction
+ * @insn8: byte of instruction
  *
  * This is a special case helper used where the instruction we are
  * about to translate comes from somewhere else (e.g. being
  * re-synthesised for s390x "ex"). It ensures we update other areas of
  * the translator with details of the executed instruction.
  */
-void translator_fake_ldb(uint8_t insn8, vaddr pc);
+void translator_fake_ldb(DisasContextBase *db, vaddr pc, uint8_t insn8);
 
 #ifdef NEED_CPU_H
 /*
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 42fa977e45..92eb77c3a0 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -374,7 +374,7 @@ uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, vaddr pc)
 return ret;
 }
 
-void translator_fake_ldb(uint8_t insn8, vaddr pc)
+void translator_fake_ldb(DisasContextBase *db, vaddr pc, uint8_t insn8)
 {
 plugin_insn_append(pc, &insn8, sizeof(insn8));
 }
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index 90a74ee795..6d7f6e7064 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -6203,7 +6203,7 @@ static const DisasInsn *extract_insn(CPUS390XState *env, DisasContext *s)
 /* Register insn bytes with translator so plugins work. */
 for (int i = 0; i < ilen; i++) {
 uint8_t byte = extract64(insn, 56 - (i * 8), 8);
-translator_fake_ldb(byte, pc + i);
+translator_fake_ldb(&s->base, pc + i, byte);
 }
 op = insn >> 56;
 } else {
-- 
2.34.1




[PATCH v2 17/33] accel/tcg: Return bool from TranslatorOps.disas_log

2024-04-24 Thread Richard Henderson
We have eliminated most uses of this hook.  Reduce
further by allowing the hook to handle only the
special cases, returning false for normal processing.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/exec/translator.h|  2 +-
 accel/tcg/translator.c   |  5 ++---
 target/hppa/translate.c  | 15 ++-
 target/s390x/tcg/translate.c |  8 +++-
 4 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index 3c354a4310..bd76d6446b 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -135,7 +135,7 @@ typedef struct TranslatorOps {
 void (*insn_start)(DisasContextBase *db, CPUState *cpu);
 void (*translate_insn)(DisasContextBase *db, CPUState *cpu);
 void (*tb_stop)(DisasContextBase *db, CPUState *cpu);
-void (*disas_log)(const DisasContextBase *db, CPUState *cpu, FILE *f);
+bool (*disas_log)(const DisasContextBase *db, CPUState *cpu, FILE *f);
 } TranslatorOps;
 
 /**
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index db586b894a..8e8c4e1bf8 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -227,9 +227,8 @@ void translator_loop(CPUState *cpu, TranslationBlock *tb, int *max_insns,
 if (logfile) {
 fprintf(logfile, "\n");
 
-if (ops->disas_log) {
-ops->disas_log(db, cpu, logfile);
-} else {
+if (!ops->disas_log ||
+!ops->disas_log(db, cpu, logfile)) {
 fprintf(logfile, "IN: %s\n", lookup_symbol(db->pc_first));
 target_disas(logfile, cpu, db->pc_first, db->tb->size);
 }
diff --git a/target/hppa/translate.c b/target/hppa/translate.c
index cafba84631..e8a542c039 100644
--- a/target/hppa/translate.c
+++ b/target/hppa/translate.c
@@ -19,7 +19,6 @@
 
 #include "qemu/osdep.h"
 #include "cpu.h"
-#include "disas/disas.h"
 #include "qemu/host-utils.h"
 #include "exec/exec-all.h"
 #include "tcg/tcg-op.h"
@@ -4816,7 +4815,7 @@ static void hppa_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
 }
 
 #ifdef CONFIG_USER_ONLY
-static void hppa_tr_disas_log(const DisasContextBase *dcbase,
+static bool hppa_tr_disas_log(const DisasContextBase *dcbase,
   CPUState *cs, FILE *logfile)
 {
 target_ulong pc = dcbase->pc_first;
@@ -4824,20 +4823,18 @@ static void hppa_tr_disas_log(const DisasContextBase *dcbase,
 switch (pc) {
 case 0x00:
 fprintf(logfile, "IN:\n0x:  (null)\n");
-return;
+return true;
 case 0xb0:
 fprintf(logfile, "IN:\n0x00b0:  light-weight-syscall\n");
-return;
+return true;
 case 0xe0:
 fprintf(logfile, "IN:\n0x00e0:  set-thread-pointer-syscall\n");
-return;
+return true;
 case 0x100:
 fprintf(logfile, "IN:\n0x0100:  syscall\n");
-return;
+return true;
 }
-
-fprintf(logfile, "IN: %s\n", lookup_symbol(pc));
-target_disas(logfile, cs, pc, dcbase->tb->size);
+return false;
 }
 #endif
 
diff --git a/target/s390x/tcg/translate.c b/target/s390x/tcg/translate.c
index 6d7f6e7064..d74939389a 100644
--- a/target/s390x/tcg/translate.c
+++ b/target/s390x/tcg/translate.c
@@ -31,7 +31,6 @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "s390x-internal.h"
-#include "disas/disas.h"
 #include "exec/exec-all.h"
 #include "tcg/tcg-op.h"
 #include "tcg/tcg-op-gvec.h"
@@ -6520,7 +6519,7 @@ static void s390x_tr_tb_stop(DisasContextBase *dcbase, CPUState *cs)
 }
 }
 
-static void s390x_tr_disas_log(const DisasContextBase *dcbase,
+static bool s390x_tr_disas_log(const DisasContextBase *dcbase,
CPUState *cs, FILE *logfile)
 {
 DisasContext *dc = container_of(dcbase, DisasContext, base);
@@ -6528,10 +6527,9 @@ static void s390x_tr_disas_log(const DisasContextBase *dcbase,
 if (unlikely(dc->ex_value)) {
 /* ??? Unfortunately target_disas can't use host memory.  */
 fprintf(logfile, "IN: EXECUTE %016" PRIx64, dc->ex_value);
-} else {
-fprintf(logfile, "IN: %s\n", lookup_symbol(dc->base.pc_first));
-target_disas(logfile, cs, dc->base.pc_first, dc->base.tb->size);
+return true;
 }
+return false;
 }
 
 static const TranslatorOps s390x_tr_ops = {
-- 
2.34.1




[PATCH v2 01/33] accel/tcg: Use vaddr in translator_ld*

2024-04-24 Thread Richard Henderson
Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Richard Henderson 
---
 include/exec/translator.h | 18 +-
 accel/tcg/translator.c| 15 ---
 target/hexagon/translate.c|  1 +
 target/microblaze/translate.c |  1 +
 4 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/include/exec/translator.h b/include/exec/translator.h
index 2c4fb818e7..51489c181c 100644
--- a/include/exec/translator.h
+++ b/include/exec/translator.h
@@ -19,7 +19,7 @@
  */
 
 #include "qemu/bswap.h"
-#include "exec/cpu_ldst.h" /* for abi_ptr */
+#include "exec/vaddr.h"
 
 /**
  * gen_intermediate_code
@@ -182,14 +182,14 @@ bool translator_io_start(DisasContextBase *db);
  * the relevant information at translation time.
  */
 
-uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
-uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
-uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
-uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, abi_ptr pc);
+uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, vaddr pc);
+uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, vaddr pc);
+uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, vaddr pc);
+uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, vaddr pc);
 
 static inline uint16_t
 translator_lduw_swap(CPUArchState *env, DisasContextBase *db,
- abi_ptr pc, bool do_swap)
+ vaddr pc, bool do_swap)
 {
 uint16_t ret = translator_lduw(env, db, pc);
 if (do_swap) {
@@ -200,7 +200,7 @@ translator_lduw_swap(CPUArchState *env, DisasContextBase *db,
 
 static inline uint32_t
 translator_ldl_swap(CPUArchState *env, DisasContextBase *db,
-abi_ptr pc, bool do_swap)
+vaddr pc, bool do_swap)
 {
 uint32_t ret = translator_ldl(env, db, pc);
 if (do_swap) {
@@ -211,7 +211,7 @@ translator_ldl_swap(CPUArchState *env, DisasContextBase *db,
 
 static inline uint64_t
 translator_ldq_swap(CPUArchState *env, DisasContextBase *db,
-abi_ptr pc, bool do_swap)
+vaddr pc, bool do_swap)
 {
 uint64_t ret = translator_ldq(env, db, pc);
 if (do_swap) {
@@ -230,7 +230,7 @@ translator_ldq_swap(CPUArchState *env, DisasContextBase *db,
  * re-synthesised for s390x "ex"). It ensures we update other areas of
  * the translator with details of the executed instruction.
  */
-void translator_fake_ldb(uint8_t insn8, abi_ptr pc);
+void translator_fake_ldb(uint8_t insn8, vaddr pc);
 
 /*
  * Return whether addr is on the same page as where disassembly started.
diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
index 9de0bc34c8..42fa977e45 100644
--- a/accel/tcg/translator.c
+++ b/accel/tcg/translator.c
@@ -13,6 +13,7 @@
 #include "exec/exec-all.h"
 #include "exec/translator.h"
 #include "exec/plugin-gen.h"
+#include "exec/cpu_ldst.h"
 #include "tcg/tcg-op-common.h"
 #include "internal-target.h"
 
@@ -293,11 +294,11 @@ static void *translator_access(CPUArchState *env, DisasContextBase *db,
 return host + (pc - base);
 }
 
-static void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
+static void plugin_insn_append(vaddr pc, const void *from, size_t size)
 {
 #ifdef CONFIG_PLUGIN
 struct qemu_plugin_insn *insn = tcg_ctx->plugin_insn;
-abi_ptr off;
+size_t off;
 
 if (insn == NULL) {
 return;
@@ -314,7 +315,7 @@ static void plugin_insn_append(abi_ptr pc, const void *from, size_t size)
 #endif
 }
 
-uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, vaddr pc)
 {
 uint8_t ret;
 void *p = translator_access(env, db, pc, sizeof(ret));
@@ -328,7 +329,7 @@ uint8_t translator_ldub(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
 return ret;
 }
 
-uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, vaddr pc)
 {
 uint16_t ret, plug;
 void *p = translator_access(env, db, pc, sizeof(ret));
@@ -343,7 +344,7 @@ uint16_t translator_lduw(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
 return ret;
 }
 
-uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, vaddr pc)
 {
 uint32_t ret, plug;
 void *p = translator_access(env, db, pc, sizeof(ret));
@@ -358,7 +359,7 @@ uint32_t translator_ldl(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
 return ret;
 }
 
-uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, abi_ptr pc)
+uint64_t translator_ldq(CPUArchState *env, DisasContextBase *db, vaddr pc)
 {
 uint64_t ret, plug;
 void *p = translator_access(env, db, pc, sizeof(ret));
@@ -373,7 +374,7 @@ uint64_t 
