[Qemu-devel] [PATCH v3 09/15] target/sh4: optimize gen_store_fpr64

2017-05-10 Thread Aurelien Jarno
Using extr and avoiding intermediate temps.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index a4c7a0895b..fe8bff54a6 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -305,13 +305,7 @@ static inline void gen_load_fpr64(TCGv_i64 t, int reg)
 
 static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 {
-TCGv_i32 tmp = tcg_temp_new_i32();
-tcg_gen_extrl_i64_i32(tmp, t);
-tcg_gen_mov_i32(cpu_fregs[reg + 1], tmp);
-tcg_gen_shri_i64(t, t, 32);
-tcg_gen_extrl_i64_i32(tmp, t);
-tcg_gen_mov_i32(cpu_fregs[reg], tmp);
-tcg_temp_free_i32(tmp);
+tcg_gen_extr_i64_i32(cpu_fregs[reg + 1], cpu_fregs[reg], t);
 }
 
 #define B3_0 (ctx->opcode & 0xf)
-- 
2.11.0




[Qemu-devel] [PATCH v3 13/15] target/sh4: movua.l is an SH4-A only instruction

2017-05-10 Thread Aurelien Jarno
At the same time change the comment describing the instruction the same
way than other instruction, so that the code is easier to read and search.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index baed19bdac..4bb9105865 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1501,17 +1501,21 @@ static void _decode_opc(DisasContext * ctx)
 }
 ctx->has_movcal = 1;
return;
-case 0x40a9:
-   /* MOVUA.L @Rm,R0 (Rm) -> R0
-  Load non-boundary-aligned data */
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
-   return;
-case 0x40e9:
-   /* MOVUA.L @Rm+,R0   (Rm) -> R0, Rm + 4 -> Rm
-  Load non-boundary-aligned data */
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
-   tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
-   return;
+case 0x40a9:/* movua.l @Rm,R0 */
+/* Load non-boundary-aligned data */
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+return;
+}
+break;
+case 0x40e9:/* movua.l @Rm+,R0 */
+/* Load non-boundary-aligned data */
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
+return;
+}
+break;
 case 0x0029:   /* movt Rn */
 tcg_gen_mov_i32(REG(B11_8), cpu_sr_t);
return;
-- 
2.11.0




[Qemu-devel] [PATCH v3 11/15] target/sh4: generate fences for SH4

2017-05-10 Thread Aurelien Jarno
synco is a SH4-A only instruction.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 7a504a7f5a..d61b176a7d 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1569,10 +1569,11 @@ static void _decode_opc(DisasContext * ctx)
else
break;
 case 0x00ab:   /* synco */
-   if (ctx->features & SH_FEATURE_SH4A)
-   return;
-   else
-   break;
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+return;
+}
+break;
 case 0x4024:   /* rotcl Rn */
{
TCGv tmp = tcg_temp_new();
-- 
2.11.0




[Qemu-devel] [PATCH v3 12/15] target/sh4: implement tas.b using atomic helper

2017-05-10 Thread Aurelien Jarno
We only emulate UP SH4, however as the tas.b instruction is used in the GNU
libc, this improve linux-user emulation.

Reviewed-by: Richard Henderson 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 19 +++
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index d61b176a7d..baed19bdac 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1634,19 +1634,14 @@ static void _decode_opc(DisasContext * ctx)
tcg_gen_shri_i32(REG(B11_8), REG(B11_8), 16);
return;
 case 0x401b:   /* tas.b @Rn */
-   {
-   TCGv addr, val;
-   addr = tcg_temp_local_new();
-   tcg_gen_mov_i32(addr, REG(B11_8));
-   val = tcg_temp_local_new();
-tcg_gen_qemu_ld_i32(val, addr, ctx->memidx, MO_UB);
+{
+TCGv val = tcg_const_i32(0x80);
+tcg_gen_atomic_fetch_or_i32(val, REG(B11_8), val,
+ctx->memidx, MO_UB);
 tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, val, 0);
-   tcg_gen_ori_i32(val, val, 0x80);
-tcg_gen_qemu_st_i32(val, addr, ctx->memidx, MO_UB);
-   tcg_temp_free(val);
-   tcg_temp_free(addr);
-   }
-   return;
+tcg_temp_free(val);
+}
+return;
 case 0xf00d: /* fsts FPUL,FRn - FPSCR: Nothing */
CHECK_FPU_ENABLED
tcg_gen_mov_i32(cpu_fregs[FREG(B11_8)], cpu_fpul);
-- 
2.11.0




[Qemu-devel] [PATCH v3 06/15] target/sh4: fix BS_EXCP exit

2017-05-10 Thread Aurelien Jarno
In case of exception, there is no need to call tcg_gen_exit_tb as the
exception helper won't return.

Also fix a few cases where BS_BRANCH is called instead of BS_EXCP.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 04bc18bf7c..f608e314b6 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -339,7 +339,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
 tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
 gen_helper_raise_slot_illegal_instruction(cpu_env);  \
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -351,7 +351,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 } else { \
 gen_helper_raise_illegal_instruction(cpu_env);   \
 }\
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -363,7 +363,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 } else { \
 gen_helper_raise_fpu_disable(cpu_env);   \
 }\
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -1289,7 +1289,7 @@ static void _decode_opc(DisasContext * ctx)
imm = tcg_const_i32(B7_0);
 gen_helper_trapa(cpu_env, imm);
tcg_temp_free(imm);
-   ctx->bstate = BS_BRANCH;
+ctx->bstate = BS_EXCP;
}
return;
 case 0xc800:   /* tst #imm,R0 */
@@ -1798,7 +1798,7 @@ static void _decode_opc(DisasContext * ctx)
 } else {
 gen_helper_raise_illegal_instruction(cpu_env);
 }
-ctx->bstate = BS_BRANCH;
+ctx->bstate = BS_EXCP;
 }
 
 static void decode_opc(DisasContext * ctx)
@@ -1867,7 +1867,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 /* We have hit a breakpoint - make sure PC is up-to-date */
 tcg_gen_movi_i32(cpu_pc, ctx.pc);
 gen_helper_debug(cpu_env);
-ctx.bstate = BS_BRANCH;
+ctx.bstate = BS_EXCP;
 /* The address covered by the breakpoint must be included in
[tb->pc, tb->pc + tb->size) in order to for it to be
properly cleared -- thus we increment the PC here so that
@@ -1911,9 +1911,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 gen_goto_tb(&ctx, 0, ctx.pc);
 break;
 case BS_EXCP:
-/* gen_op_interrupt_restart(); */
-tcg_gen_exit_tb(0);
-break;
+/* fall through */
 case BS_BRANCH:
 default:
 break;
-- 
2.11.0




[Qemu-devel] [PATCH v3 05/15] target/sh4: fix BS_STOP exit

2017-05-10 Thread Aurelien Jarno
When stopping the translation because the state has changed, goto_tb
should not be used as it might link TB with different flags.

Reviewed-by: Richard Henderson 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 2e29936ad8..04bc18bf7c 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1901,8 +1901,9 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 } else {
switch (ctx.bstate) {
 case BS_STOP:
-/* gen_op_interrupt_restart(); */
-/* fall through */
+tcg_gen_movi_i32(cpu_pc, ctx.pc);
+tcg_gen_exit_tb(0);
+break;
 case BS_NONE:
 if (ctx.envflags) {
 gen_store_flags(ctx.envflags);
-- 
2.11.0




[Qemu-devel] [PATCH v3 15/15] target/sh4: use cpu_loop_exit_restore

2017-05-10 Thread Aurelien Jarno
Use cpu_loop_exit_restore when using cpu_restore_state and cpu_loop_exit
together.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/op_helper.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
index fa238c660e..528a40ac1d 100644
--- a/target/sh4/op_helper.c
+++ b/target/sh4/op_helper.c
@@ -48,10 +48,7 @@ void tlb_fill(CPUState *cs, target_ulong addr, MMUAccessType 
access_type,
 ret = superh_cpu_handle_mmu_fault(cs, addr, access_type, mmu_idx);
 if (ret) {
 /* now we have a real cpu fault */
-if (retaddr) {
-cpu_restore_state(cs, retaddr);
-}
-cpu_loop_exit(cs);
+cpu_loop_exit_restore(cs, retaddr);
 }
 }
 
@@ -75,10 +72,7 @@ static inline void QEMU_NORETURN raise_exception(CPUSH4State 
*env, int index,
 CPUState *cs = CPU(sh_env_get_cpu(env));
 
 cs->exception_index = index;
-if (retaddr) {
-cpu_restore_state(cs, retaddr);
-}
-cpu_loop_exit(cs);
+cpu_loop_exit_restore(cs, retaddr);
 }
 
 void helper_raise_illegal_instruction(CPUSH4State *env)
-- 
2.11.0




[Qemu-devel] [PATCH v3 10/15] target/sh4: optimize gen_write_sr using extract op

2017-05-10 Thread Aurelien Jarno
This doesn't change the generated code on x86, but optimizes it on most
RISC architectures and makes the code simpler to read.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index fe8bff54a6..7a504a7f5a 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -204,12 +204,9 @@ static void gen_write_sr(TCGv src)
 {
 tcg_gen_andi_i32(cpu_sr, src,
  ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
-tcg_gen_shri_i32(cpu_sr_q, src, SR_Q);
-tcg_gen_andi_i32(cpu_sr_q, cpu_sr_q, 1);
-tcg_gen_shri_i32(cpu_sr_m, src, SR_M);
-tcg_gen_andi_i32(cpu_sr_m, cpu_sr_m, 1);
-tcg_gen_shri_i32(cpu_sr_t, src, SR_T);
-tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
+tcg_gen_extract_i32(cpu_sr_q, src, SR_Q, 1);
+tcg_gen_extract_i32(cpu_sr_m, src, SR_M, 1);
+tcg_gen_extract_i32(cpu_sr_t, src, SR_T, 1);
 }
 
 static inline void gen_save_cpu_state(DisasContext *ctx, bool save_pc)
-- 
2.11.0




[Qemu-devel] [PATCH v3 02/15] target/sh4: get rid of DELAY_SLOT_CLEARME

2017-05-10 Thread Aurelien Jarno
Now that ctx->flags has been split, it becomes clear that
DELAY_SLOT_CLEARME has not impact on the code generation: in both case
ctx->envflags is cleared, either by clearing all the flags, or by
setting it to 0. This is left-over from pre-TCG era.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.h   |  3 +--
 target/sh4/helper.c|  2 --
 target/sh4/translate.c | 17 +
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index cad8989f7e..9445cc779f 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -93,7 +93,6 @@
 #define DELAY_SLOT (1 << 0)
 #define DELAY_SLOT_CONDITIONAL (1 << 1)
 #define DELAY_SLOT_TRUE(1 << 2)
-#define DELAY_SLOT_CLEARME (1 << 3)
 /* The dynamic value of the DELAY_SLOT_TRUE flag determines whether the jump
  * after the delay slot should be taken or not. It is calculated from SR_T.
  *
@@ -384,7 +383,7 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, 
target_ulong *pc,
 *pc = env->pc;
 *cs_base = 0;
 *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-| DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
+| DELAY_SLOT_TRUE))/* Bits  0- 2 */
 | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
 | (env->sr & ((1u << SR_MD) | (1u << SR_RB)))  /* Bits 29-30 */
 | (env->sr & (1u << SR_FD))/* Bit 15 */
diff --git a/target/sh4/helper.c b/target/sh4/helper.c
index 036c5ca56c..71bb49a670 100644
--- a/target/sh4/helper.c
+++ b/target/sh4/helper.c
@@ -170,8 +170,6 @@ void superh_cpu_do_interrupt(CPUState *cs)
/* Clear flags for exception/interrupt routine. */
env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL | DELAY_SLOT_TRUE);
 }
-if (env->flags & DELAY_SLOT_CLEARME)
-env->flags = 0;
 
 if (do_exp) {
 env->expevt = cs->exception_index;
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 6b526a02f2..d84a7a2e6e 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1804,15 +1804,9 @@ static void decode_opc(DisasContext * ctx)
 _decode_opc(ctx);
 
 if (old_flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) {
-if (ctx->envflags & DELAY_SLOT_CLEARME) {
-gen_store_flags(0);
-} else {
-   /* go out of the delay slot */
-uint32_t new_flags = ctx->envflags;
-   new_flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
-   gen_store_flags(new_flags);
-}
-ctx->envflags = 0;
+/* go out of the delay slot */
+ctx->envflags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
+gen_store_flags(ctx->envflags);
 ctx->bstate = BS_BRANCH;
 if (old_flags & DELAY_SLOT_CONDITIONAL) {
gen_delayed_conditional_jump(ctx);
@@ -1840,8 +1834,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 pc_start = tb->pc;
 ctx.pc = pc_start;
 ctx.tbflags = (uint32_t)tb->flags;
-ctx.envflags = tb->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL |
-DELAY_SLOT_CLEARME);
+ctx.envflags = tb->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
 ctx.bstate = BS_NONE;
 ctx.memidx = (ctx.tbflags & (1u << SR_MD)) == 0 ? 1 : 0;
 /* We don't know if the delayed pc came from a dynamic or static branch,
@@ -1908,7 +1901,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 /* fall through */
 case BS_NONE:
 if (ctx.envflags) {
-gen_store_flags(ctx.envflags | DELAY_SLOT_CLEARME);
+gen_store_flags(ctx.envflags);
}
 gen_goto_tb(&ctx, 0, ctx.pc);
 break;
-- 
2.11.0




[Qemu-devel] [PATCH v3 00/15] target/sh4: misc fixes, cleanup and optimizations

2017-05-10 Thread Aurelien Jarno
This patch series try to improve the SH4 target by using the (more or
less) recently introduced TCG features. It also fixes some issues spot
when writting the patches (linking of TB with different flags, SH4-A
specific instructions allowed on SH4) and correctly trap unaligned
accesses.

v3:
- Use extr instead of extrh/extrl in gen_store_fpr64, as suggested by
  Richard Henderson
- Use cpu_loop_exit_restore instead of cpu_loop_exit/cpu_restore_state
  in superh_cpu_do_unaligned_access
- Add a patch to convert the remaining cpu_loop_exit/cpu_restore_state
  to cpu_loop_exit_restore
- Add Reviewed-by entries

v2:
- Add some comments in the struct DisasContext declaration, as suggested
  by Philippe Mathieu-Daudé


Aurelien Jarno (15):
  target/sh4: split ctx->flags into ctx->tbflags and ctx->envflags
  target/sh4: get rid of DELAY_SLOT_CLEARME
  target/sh4: do not include DELAY_SLOT_TRUE in the TB state
  target/sh4: move DELAY_SLOT_TRUE flag into a separate global
  target/sh4: fix BS_STOP exit
  target/sh4: fix BS_EXCP exit
  target/sh4: only save flags state at the end of the TB
  target/sh4: fold ctx->bstate = BS_BRANCH into gen_conditional_jump
  target/sh4: optimize gen_store_fpr64
  target/sh4: optimize gen_write_sr using extract op
  target/sh4: generate fences for SH4
  target/sh4: implement tas.b using atomic helper
  target/sh4: movua.l is an SH4-A only instruction
  target/sh4: trap unaligned accesses
  target/sh4: use cpu_loop_exit_restore

 target/sh4/cpu.c   |   1 +
 target/sh4/cpu.h   |  18 ++-
 target/sh4/helper.c|   4 +-
 target/sh4/op_helper.c |  26 ++--
 target/sh4/translate.c | 322 +++--
 5 files changed, 181 insertions(+), 190 deletions(-)

-- 
2.11.0




[Qemu-devel] [PATCH v3 03/15] target/sh4: do not include DELAY_SLOT_TRUE in the TB state

2017-05-10 Thread Aurelien Jarno
DELAY_SLOT_TRUE is used as a dynamic condition for the branch after the
delay slot instruction. It is not used in code generation, so there is
no need to including in the TB state.

Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index 9445cc779f..da8d15f1b9 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -382,8 +382,7 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, 
target_ulong *pc,
 {
 *pc = env->pc;
 *cs_base = 0;
-*flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-| DELAY_SLOT_TRUE))/* Bits  0- 2 */
+*flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) /* Bits 0-1 
*/
 | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
 | (env->sr & ((1u << SR_MD) | (1u << SR_RB)))  /* Bits 29-30 */
 | (env->sr & (1u << SR_FD))/* Bit 15 */
-- 
2.11.0




Re: [Qemu-devel] [PATCH v3 4/6] target/s390x: Implement LOAD PAIR DISJOINT

2017-05-10 Thread Aurelien Jarno
On 2017-05-10 10:43, Richard Henderson wrote:
> On 05/10/2017 10:13 AM, Éric Bischoff wrote:
> > Le mercredi 10 mai 2017, 12:16:20 Aurelien Jarno a écrit :
> > > > +/* In a parallel context, stop the world and single step.  */
> > > > +if (parallel_cpus) {
> > > > +potential_page_fault(s);
> > > > +gen_helper_exit_atomic(cpu_env);
> > > > +return EXIT_NORETURN;
> > > > +}
> > > 
> > > One small additional comment about this patch I haven't spotted at the
> > > first review. The exit_atomic helper is properly restoring the CPU state
> > > passing the return address to cpu_loop_exit_atomic, so I believe the
> > > potential_page_fault call is not necessary. That said, it doesn't hurt
> > > either.
> > 
> > Merci pour la relecture Aurélien.
> > 
> > Richard, what do we do? We remove the potential_page_fault(s); or not?
> 
> I'm thinking of using gen_exception(EXCP_ATOMIC) instead.
> The unwind associated with the regular helper_exit_atomic
> has more overhead than potential_page_fault().

That was just a remark to optimize the code a bit. That said I think the
current code can go like that, it is not wrong.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v3 4/6] target/s390x: Implement LOAD PAIR DISJOINT

2017-05-10 Thread Aurelien Jarno
On 2017-05-09 11:07, Richard Henderson wrote:
> From: Eric Bischoff 
> 
> Reviewed-by: Aurelien Jarno 
> Signed-off-by: Eric Bischoff 
> Message-Id: <20170228120134.7921-1-ebisch...@suse.com>
> [rth: Combine the two via insn->data; free the address temps.]
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/insn-data.def |  4 +++-
>  target/s390x/translate.c   | 42 ++
>  2 files changed, 45 insertions(+), 1 deletion(-)

[snip]

> diff --git a/target/s390x/translate.c b/target/s390x/translate.c
> index 2b66a4e..8de0177 100644
> --- a/target/s390x/translate.c
> +++ b/target/s390x/translate.c
> @@ -2559,6 +2559,7 @@ static ExitStatus op_lctlg(DisasContext *s, DisasOps *o)
>  tcg_temp_free_i32(r3);
>  return NO_EXIT;
>  }
> +
>  static ExitStatus op_lra(DisasContext *s, DisasOps *o)
>  {
>  check_privileged(s);
> @@ -2759,6 +2760,31 @@ static ExitStatus op_lm64(DisasContext *s, DisasOps *o)
>  return NO_EXIT;
>  }
>  
> +static ExitStatus op_lpd(DisasContext *s, DisasOps *o)
> +{
> +TCGv_i64 a1, a2;
> +TCGMemOp mop = s->insn->data;
> +
> +/* In a parallel context, stop the world and single step.  */
> +if (parallel_cpus) {
> +potential_page_fault(s);
> +gen_helper_exit_atomic(cpu_env);
> +return EXIT_NORETURN;
> +}

One small additional comment about this patch I haven't spotted at the
first review. The exit_atomic helper is properly restoring the CPU state
passing the return address to cpu_loop_exit_atomic, so I believe the
potential_page_fault call is not necessary. That said, it doesn't hurt
either.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2 14/14] target/sh4: trap unaligned accesses

2017-05-10 Thread Aurelien Jarno
On 2017-05-09 14:13, Richard Henderson wrote:
> On 05/06/2017 04:14 AM, Aurelien Jarno wrote:
> > +void superh_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
> > +MMUAccessType access_type,
> > +int mmu_idx, uintptr_t retaddr)
> > +{
> > +if (retaddr) {
> > +cpu_restore_state(cs, retaddr);
> > +}
> ...
> > +cpu_loop_exit(cs);
> > +}
> > +
> 
> cpu_loop_exit_restore?

Right, I forgot that this function now exit. I'll change that and
convert the other few place where this sequence is used.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2 09/14] target/sh4: optimize gen_store_fpr64

2017-05-09 Thread Aurelien Jarno
On 2017-05-09 14:09, Richard Henderson wrote:
> On 05/06/2017 04:14 AM, Aurelien Jarno wrote:
> > +tcg_gen_extrl_i64_i32(cpu_fregs[reg + 1], t);
> > +tcg_gen_extrh_i64_i32(cpu_fregs[reg], t);
> 
> This is
> 
>   tcg_gen_extr_i64_i32(cpu_fregs[reg + 1], cpu_fregs[reg], t);
> 

Good catch, thanks.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v3 1/6] target/s390x: Implement STORE FACILITIES LIST EXTENDED

2017-05-09 Thread Aurelien Jarno
On 2017-05-09 11:07, Richard Henderson wrote:
> At the same time, improve STORE FACILITIES LIST
> so that we don't hard-code the list for all cpus.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/helper.h  |  2 ++
>  target/s390x/insn-data.def |  2 ++
>  target/s390x/misc_helper.c | 59 
> ++
>  target/s390x/translate.c   | 17 ++---
>  4 files changed, 72 insertions(+), 8 deletions(-)

Reviewed-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2 1/6] target/s390x: Implement STORE FACILITIES LIST EXTENDED

2017-05-09 Thread Aurelien Jarno
On 2017-05-09 07:51, Richard Henderson wrote:
> On 05/09/2017 01:14 AM, Aurelien Jarno wrote:
> > > +/* The maximum bit defined at the moment is 129.  */
> > > +#define MAX_STFL_WORDS  3
> > 
> > Could it be computed from S390_FEAT_MAX? in gen-features.c,
> > S390_FEAT_MAX / 64 + 1 is used.
> 
> No, because the features list in cpu_features_def.h bears no relation to the
> facilities bit index (as seen in the third column of s390_features
> structure).
> 
> > Is there a reason to not use s390_fill_feat_block here?
> 
> Yes.  I used s390_fill_feat_block in the v1 patch and changed this in
> response to review from David Hildenbrand.
> 
> Using s390_fill_feat_block isn't completely trivial.  It stores into a
> big-endian uint8_t array, which means that (for a little-endian host) we
> have to be careful how we copy that data to the guest, lest we inadvertently
> byte swap again.
> 
> Here in v2 I store into a host-endian uint64_t array, which makes the
> ultimate users of this helper more straightforward.

Ok, thanks for the detailed explanations. Then I guess you should fold
the following patch to correctly set the zArch active bit as done in
s390_fill_feat_block:

--- a/target/s390x/misc_helper.c
+++ b/target/s390x/misc_helper.c
@@ -693,6 +693,11 @@ static unsigned do_stfle(CPUS390XState *env, uint64_t 
words[MAX_STFL_WORDS])
 
 memset(words, 0, sizeof(uint64_t) * MAX_STFL_WORDS);
 
+if (test_bit(S390_FEAT_ZARCH, features)) {
+/* z/Architecture is always active if around */
+words[0] |= 1ull << 61;
+}
+
 for (feat = find_first_bit(features, S390_FEAT_MAX);
  feat < S390_FEAT_MAX;
  feat = find_next_bit(features, S390_FEAT_MAX, feat + 1)) {

> > Otherwise it looks fine to me. It's probably a discussion for later
> > patches, but should we also implement a TCG feature mask, like for
> > example on PowerPC? Currently the only allowed CPU for TCG is z900,
> > which makes this code almost useless. And while QEMU implements many
> > features from newer CPU, some of them are missing and we don't want
> > them to appear in the feature list.
> 
> This is complicated by the fact that for a long time, the kernel checked for
> an exact match of facilities present.
> 
> From linux 3.13 arch/s390/kernel/head.S:
> 
> # List of facilities that are required. If not all facilities are present
> # the kernel will crash. Format is number of facility words with bits set,
> # followed by the facility words.
> 
> #if defined(CONFIG_64BIT)
> #if defined(CONFIG_MARCH_ZEC12)
> .long 3, 0xc100efe3, 0xf46ce800, 0x0040
> #elif defined(CONFIG_MARCH_Z196)
> .long 2, 0xc100efe3, 0xf46c
> #elif defined(CONFIG_MARCH_Z10)
> .long 2, 0xc100efe3, 0xf068
> #elif defined(CONFIG_MARCH_Z9_109)
> .long 1, 0xc100efc3
> #elif defined(CONFIG_MARCH_Z990)
> .long 1, 0xc0002000
> #elif defined(CONFIG_MARCH_Z900)
> .long 1, 0xc000
> #endif
> 
> I'm pleased to see this appears to have been dropped at some point; I cannot
> see it present in linux 4.11.  However, this still applies to the kernels
> supplied by the shipping distributions.
> 
> Which means that we cannot mask *anything* from TCG, including the useless
> stuff like hexadecimal floating point, and still have the kernel boot.  So
> in practice a feature mask for TCG will be not only useless but actively
> harmful.

That's true if you want to boot an optimized kernel. That said you might
want to boot a z900 kernel on a more recent machine and still have the
userland check for some facilities using the STFLE instructions. Right
now the choice is limited to the z900 machine. As soon as you try to
use a newer CPU like the z990, the DAT-enhancement facility bit is set
in STFL, and the kernel try to use the idte instruction which is not
emulated by QEMU. OTOH, it's possible to boot a z900 kernel with a -cpu
z990,dateh=off.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



[Qemu-devel] [PATCH RFC] target/s390x: improve SIGP to add SMP support

2017-05-09 Thread Aurelien Jarno
This patch adds *very rough* SMP support to the s390x target, and make
it possible to use MTTCG, when used with the various atomic patches
posted on the mailing list. I haven't done any advanced test, so there
is certainly more atomic issues to fix.

Anyway this patch is nothing more than a way to determine what needs to
be implemented in the SIGNAL PROCESSOR instruction to add SMP support,
and a basis for discussion about how to implement things. It should be
rewritten from scratch before reaching in an acceptable state.

It has been wrote mostly looking at the KVM code. Unfortunately I don't
think it's easy to share code between TCG and KVM because part of KVM
SIGP support is implemented on the host kernel side. Given I don't have
a lot of experience with MTTCG, my main question at this point is how to
determine when it is possible to directly interact with another CPU and
when run_on_cpu should be used instead. Any help or comments are
welcome.

Signed-off-by: Aurelien Jarno 
---
 target/s390x/cpu.h |  2 ++
 target/s390x/misc_helper.c | 64 ++
 2 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index fa244f1238..a4651de0d6 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -1057,6 +1057,7 @@ struct sysib_322 {
 #define SIGP_SET_PREFIX0x0d
 #define SIGP_STORE_STATUS_ADDR 0x0e
 #define SIGP_SET_ARCH  0x12
+#define SIGP_SENSE_RUNNING 0x15
 #define SIGP_STORE_ADTL_STATUS 0x17
 
 /* SIGP condition codes */
@@ -1067,6 +1068,7 @@ struct sysib_322 {
 
 /* SIGP status bits */
 #define SIGP_STAT_EQUIPMENT_CHECK   0x8000UL
+#define SIGP_STAT_NOT_RUNNING   0x0400UL
 #define SIGP_STAT_INCORRECT_STATE   0x0200UL
 #define SIGP_STAT_INVALID_PARAMETER 0x0100UL
 #define SIGP_STAT_EXT_CALL_PENDING  0x0080UL
diff --git a/target/s390x/misc_helper.c b/target/s390x/misc_helper.c
index 9178a3987f..5285c82783 100644
--- a/target/s390x/misc_helper.c
+++ b/target/s390x/misc_helper.c
@@ -516,36 +516,90 @@ uint32_t HELPER(sigp)(CPUS390XState *env, uint64_t 
order_code, uint32_t r1,
 
 /* Remember: Use "R1 or R1 + 1, whichever is the odd-numbered register"
as parameter (input). Status (output) is always R1. */
+uint64_t param = (r1 % 2) ? env->regs[r1] : env->regs[r1 + 1];
+
+S390CPU *dst_cpu = s390_cpu_addr2state(cpu_addr);
+
+if (dst_cpu == NULL) {
+return SIGP_STAT_INVALID_PARAMETER;
+}
+
+qemu_mutex_lock_iothread();
 
 switch (order_code & SIGP_ORDER_MASK) {
 case SIGP_SET_ARCH:
+cc = SIGP_CC_ORDER_CODE_ACCEPTED;
 /* switch arch */
 break;
 case SIGP_SENSE:
 /* enumerate CPU status */
 if (cpu_addr) {
 /* XXX implement when SMP comes */
-return 3;
+cc = SIGP_CC_NOT_OPERATIONAL;
+} else {
+env->regs[r1] &= 0xULL;
+cc = SIGP_CC_STATUS_STORED;
 }
-env->regs[r1] &= 0xULL;
-cc = 1;
+break;
+case SIGP_INITIAL_CPU_RESET:
+S390_CPU_GET_CLASS(dst_cpu)->initial_cpu_reset(CPU(dst_cpu));
+cc = SIGP_CC_ORDER_CODE_ACCEPTED;
 break;
 #if !defined(CONFIG_USER_ONLY)
 case SIGP_RESTART:
+/* the restart irq has to be delivered prior to any other pending irq 
*/
+do_restart_interrupt(&dst_cpu->env);
+s390_cpu_set_state(CPU_STATE_OPERATING, dst_cpu);
+qemu_cpu_kick(CPU(dst_cpu));
+cc = SIGP_CC_ORDER_CODE_ACCEPTED;
+#if 0
 qemu_system_reset_request();
 cpu_loop_exit(CPU(s390_env_get_cpu(env)));
+#endif
 break;
 case SIGP_STOP:
-qemu_system_shutdown_request();
-cpu_loop_exit(CPU(s390_env_get_cpu(env)));
+/* FIXME doesn't work */
+if (s390_cpu_halt(dst_cpu) == 0) {
+qemu_system_shutdown_request();
+}
+cpu_loop_exit(CPU(dst_cpu));
+cc = SIGP_CC_ORDER_CODE_ACCEPTED;
+break;
+case SIGP_SET_PREFIX:
+{
+uint32_t addr = param & 0x7fffe000u;
+if (!address_space_access_valid(&address_space_memory, addr,
+   sizeof(struct LowCore), false)) {
+cc = SIGP_STAT_INVALID_PARAMETER;
+} else if (s390_cpu_get_state(dst_cpu) != CPU_STATE_STOPPED) {
+cc = SIGP_STAT_INCORRECT_STATE;
+} else {
+dst_cpu->env.psa = addr;
+tlb_flush_page(CPU(dst_cpu), 0);
+tlb_flush_page(CPU(dst_cpu), TARGET_PAGE_SIZE);
+cc = SIGP_CC_ORDER_CODE_ACCEPTED;
+}
+}
 break;
 #endif
+case SIGP_SENSE_RUNNING:
+if (s390_cpu_get_state(dst_cpu) == CPU_STATE_OPERATING) {
+cc = SIGP_CC_ORDER_CODE_ACCEPTED;
+} else

[Qemu-devel] [PATCH 3/3] target/s390x: implement serialization in BRANCH CONDITION

2017-05-09 Thread Aurelien Jarno
Signed-off-by: Aurelien Jarno 
---
 target/s390x/translate.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/target/s390x/translate.c b/target/s390x/translate.c
index c58c27f8e9..2f07ce2be9 100644
--- a/target/s390x/translate.c
+++ b/target/s390x/translate.c
@@ -1517,6 +1517,21 @@ static ExitStatus op_bc(DisasContext *s, DisasOps *o)
 int imm = is_imm ? get_field(s->fields, i2) : 0;
 DisasCompare c;
 
+/* BCR with R2 = 0 causes no branching */
+if (have_field(s->fields, r2) && get_field(s->fields, r2) == 0) {
+if (m1 == 14) {
+/* Perform serialization */
+/* FIXME: check for fast-BCR-serialization facility */
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+}
+if (m1 == 15) {
+/* Perform serialization */
+/* FIXME: perform checkpoint-synchronisation */
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+}
+return NO_EXIT;
+}
+
 disas_jcc(s, &c, m1);
 return help_branch(s, &c, is_imm, imm, o->in2);
 }
-- 
2.11.0




[Qemu-devel] [PATCH 2/3] target/s390x: fix SIGNAL PROCESSOR return value

2017-05-09 Thread Aurelien Jarno
The SIGNAL PROCESSOR helper returns its value through the CC register.
set_cc_static should be called just after the helper.

Signed-off-by: Aurelien Jarno 
---
 target/s390x/translate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/target/s390x/translate.c b/target/s390x/translate.c
index 558ff78084..c58c27f8e9 100644
--- a/target/s390x/translate.c
+++ b/target/s390x/translate.c
@@ -3363,6 +3363,7 @@ static ExitStatus op_sigp(DisasContext *s, DisasOps *o)
 check_privileged(s);
 potential_page_fault(s);
 gen_helper_sigp(cc_op, cpu_env, o->in2, r1, o->in1);
+set_cc_static(s);
 tcg_temp_free_i32(r1);
 return NO_EXIT;
 }
-- 
2.11.0




[Qemu-devel] [PATCH 0/3] target/s390x: misc patches

2017-05-09 Thread Aurelien Jarno
Those are just random patches I have written while trying to get a MTTCG
version of qemu/s390x. I just send them to avoid duplicated work.

Aurelien Jarno (3):
  target/s390x: mask the SIGP order_code using SIGP_ORDER_MASK
  target/s390x: fix SIGNAL PROCESSOR return value
  target/s390x: implement serialization in BRANCH CONDITION

 target/s390x/cpu.h |  3 +++
 target/s390x/kvm.c |  2 --
 target/s390x/misc_helper.c |  3 +--
 target/s390x/translate.c   | 16 
 4 files changed, 20 insertions(+), 4 deletions(-)

-- 
2.11.0




[Qemu-devel] [PATCH 1/3] target/s390x: mask the SIGP order_code using SIGP_ORDER_MASK

2017-05-09 Thread Aurelien Jarno
For that move the definition from kvm.c to cpu.h

Signed-off-by: Aurelien Jarno 
---
 target/s390x/cpu.h | 3 +++
 target/s390x/kvm.c | 2 --
 target/s390x/misc_helper.c | 3 +--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
index df823280a5..2471db920d 100644
--- a/target/s390x/cpu.h
+++ b/target/s390x/cpu.h
@@ -1081,6 +1081,9 @@ struct sysib_322 {
 #define SIGP_MODE_Z_ARCH_TRANS_ALL_PSW 1
 #define SIGP_MODE_Z_ARCH_TRANS_CUR_PSW 2
 
+/* SIGP order code mask corresponding to bit positions 56-63 */
+#define SIGP_ORDER_MASK 0x00ff
+
 void load_psw(CPUS390XState *env, uint64_t mask, uint64_t addr);
 int mmu_translate(CPUS390XState *env, target_ulong vaddr, int rw, uint64_t asc,
   target_ulong *raddr, int *flags, bool exc);
diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
index 1a249d8359..fb105429be 100644
--- a/target/s390x/kvm.c
+++ b/target/s390x/kvm.c
@@ -1764,8 +1764,6 @@ static int sigp_set_architecture(S390CPU *cpu, uint32_t 
param,
 return SIGP_CC_ORDER_CODE_ACCEPTED;
 }
 
-#define SIGP_ORDER_MASK 0x00ff
-
 static int handle_sigp(S390CPU *cpu, struct kvm_run *run, uint8_t ipa1)
 {
 CPUS390XState *env = &cpu->env;
diff --git a/target/s390x/misc_helper.c b/target/s390x/misc_helper.c
index 53cb3b8ac6..395f38dea5 100644
--- a/target/s390x/misc_helper.c
+++ b/target/s390x/misc_helper.c
@@ -521,8 +521,7 @@ uint32_t HELPER(sigp)(CPUS390XState *env, uint64_t 
order_code, uint32_t r1,
 
 qemu_mutex_lock_iothread();
 
-/* sigp contains the order code in bit positions 56-63, mask it here. */
-switch (order_code & 0xff) {
+switch (order_code & SIGP_ORDER_MASK) {
 case SIGP_SET_ARCH:
 cc = SIGP_CC_ORDER_CODE_ACCEPTED;
 /* switch arch */
-- 
2.11.0




Re: [Qemu-devel] [PATCH v2 1/6] target/s390x: Implement STORE FACILITIES LIST EXTENDED

2017-05-09 Thread Aurelien Jarno
On 2017-05-08 08:17, Richard Henderson wrote:
> At the same time, improve STORE FACILITIES LIST
> so that we don't hard-code the list for all cpus.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/helper.h  |  2 ++
>  target/s390x/insn-data.def |  2 ++
>  target/s390x/misc_helper.c | 54 
> ++
>  target/s390x/translate.c   | 17 ---
>  4 files changed, 67 insertions(+), 8 deletions(-)
>
> diff --git a/target/s390x/misc_helper.c b/target/s390x/misc_helper.c
> index eca8244..5263726 100644
> --- a/target/s390x/misc_helper.c
> +++ b/target/s390x/misc_helper.c
> @@ -678,3 +678,57 @@ void HELPER(per_ifetch)(CPUS390XState *env, uint64_t 
> addr)
>  }
>  }
>  #endif
> +
> +/* The maximum bit defined at the moment is 129.  */
> +#define MAX_STFL_WORDS  3

Could it be computed from S390_FEAT_MAX? in gen-features.c,
S390_FEAT_MAX / 64 + 1 is used.

> +
> +/* Canonicalize the current cpu's features into the 64-bit words required
> +   by STFLE.  Return the index-1 of the max word that is non-zero.  */
> +static unsigned do_stfle(CPUS390XState *env, uint64_t words[MAX_STFL_WORDS])
> +{
> +S390CPU *cpu = s390_env_get_cpu(env);
> +const unsigned long *features = cpu->model->features;
> +unsigned max_bit = 0;
> +S390Feat feat;
> +
> +memset(words, 0, sizeof(uint64_t) * MAX_STFL_WORDS);
> +
> +for (feat = find_first_bit(features, S390_FEAT_MAX);
> + feat < S390_FEAT_MAX;
> + feat = find_next_bit(features, S390_FEAT_MAX, feat + 1)) {
> +const S390FeatDef *def = s390_feat_def(feat);
> +if (def->type == S390_FEAT_TYPE_STFL) {
> +unsigned bit = def->bit;
> +if (bit > max_bit) {
> +max_bit = bit;
> +}
> +assert(bit / 64 < MAX_STFL_WORDS);
> +words[bit / 64] |= 1ULL << (63 - bit % 64);
> +}
> +}
> +
> +return max_bit / 64;
> +}

Is there a reason to not use s390_fill_feat_block here? I guess max_bit
can be compute using find_last_bit. It's probably a bit less optimal,
but if we care about STFLE performance, we should probably avoid recomputing
the features words each time. Anyway if there is a good reason to not
use s390_fill_feat_block, the zArch active bit should also be handled here.

Otherwise it looks fine to me. It's probably a discussion for later
patches, but should we also implement a TCG feature mask, like for
example on PowerPC? Currently the only allowed CPU for TCG is z900,
which makes this code almost useless. And while QEMU implements many
features from newer CPU, some of them are missing and we don't want
them to appear in the feature list.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2 6/6] target/s390x: Use atomic operations for LOAD AND OP

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 08:17, Richard Henderson wrote:
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/insn-data.def | 20 ++--
>  target/s390x/translate.c   | 78 
> +-
>  2 files changed, 60 insertions(+), 38 deletions(-)

Reviewed-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2 4/6] target/s390x: Implement LOAD PAIR DISJOINT

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 08:17, Richard Henderson wrote:
> From: Eric Bischoff 
> 
> Signed-off-by: Eric Bischoff 
> Message-Id: <20170228120134.7921-1-ebisch...@suse.com>
> [rth: Combine the two via insn->data; free the address temps.]
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/insn-data.def |  4 +++-
>  target/s390x/translate.c   | 42 ++
>  2 files changed, 45 insertions(+), 1 deletion(-)

Reviewed-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2 5/6] target/s390x: Use atomic operations for COMPARE SWAP

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 08:17, Richard Henderson wrote:
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/helper.h |  1 +
>  target/s390x/mem_helper.c | 39 ++
>  target/s390x/translate.c  | 83 
> +--
>  3 files changed, 55 insertions(+), 68 deletions(-)

Reviewed-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2 2/6] target/s390x: Implement LOAD PROGRAM PARAMETER

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 08:17, Richard Henderson wrote:
> From: Miroslav Benes 
> 
> Linux arch/s390/kernel/head(64).S uses LPP instruction if it is
> available in facilities list provided by stfl/stfle instruction.
> This is the case of newer z/System generations and their qemu
> definition.
> 
> The description of LPP is at
> http://www-01.ibm.com/support/docview.wss?uid=isg26fcd1cc32246f4c8852574ce0044734a
> 
> Signed-off-by: Miroslav Benes 
> Message-Id: <20170227085353.20787-1-mbe...@suse.cz>
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/insn-data.def | 2 ++
>  target/s390x/translate.c   | 9 +++++
>  2 files changed, 11 insertions(+)

Reviewed-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2 3/6] target/s390x: Diagnose specification exception for atomics

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 08:17, Richard Henderson wrote:
> All of the interlocked access facility instructions raise a
> specification exception for unaligned accesses.  Do this by
> using the (previously unused) unaligned_access hook.
> 
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/cpu.c|  1 +
>  target/s390x/cpu.h|  3 +++
>  target/s390x/helper.c | 16 
>  3 files changed, 20 insertions(+)
> 
> diff --git a/target/s390x/cpu.c b/target/s390x/cpu.c
> index 066dcd1..a1bf2ba 100644
> --- a/target/s390x/cpu.c
> +++ b/target/s390x/cpu.c
> @@ -430,6 +430,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void 
> *data)
>  cc->write_elf64_note = s390_cpu_write_elf64_note;
>  cc->cpu_exec_interrupt = s390_cpu_exec_interrupt;
>  cc->debug_excp_handler = s390x_cpu_debug_excp_handler;
> +cc->do_unaligned_access = s390x_cpu_do_unaligned_access;
>  #endif
>  cc->disas_set_info = s390_cpu_disas_set_info;
>  
> diff --git a/target/s390x/cpu.h b/target/s390x/cpu.h
> index 058ddad..bbed320 100644
> --- a/target/s390x/cpu.h
> +++ b/target/s390x/cpu.h
> @@ -480,6 +480,9 @@ int s390_cpu_handle_mmu_fault(CPUState *cpu, vaddr 
> address, int rw,
>  
>  #ifndef CONFIG_USER_ONLY
>  void do_restart_interrupt(CPUS390XState *env);
> +void s390x_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
> +   MMUAccessType access_type,
> +   int mmu_idx, uintptr_t retaddr);
>  
>  static inline hwaddr decode_basedisp_s(CPUS390XState *env, uint32_t ipb,
> uint8_t *ar)
> diff --git a/target/s390x/helper.c b/target/s390x/helper.c
> index 68bd2f9..9978490 100644
> --- a/target/s390x/helper.c
> +++ b/target/s390x/helper.c
> @@ -718,4 +718,20 @@ void s390x_cpu_debug_excp_handler(CPUState *cs)
>  cpu_loop_exit_noexc(cs);
>  }
>  }
> +
> +/* Unaligned accesses are only diagnosed with MO_ALIGN.  At the moment,
> +   this is only for the atomic operations, for which we want to raise a
> +   specification exception.  */
> +void s390x_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
> +   MMUAccessType access_type,
> +   int mmu_idx, uintptr_t retaddr)
> +{
> +S390CPU *cpu = S390_CPU(cs);
> +CPUS390XState *env = &cpu->env;
> +
> +if (retaddr) {
> +cpu_restore_state(cs, retaddr);
> +}
> +program_interrupt(env, PGM_SPECIFICATION, ILEN_LATER);
> +}
>  #endif /* CONFIG_USER_ONLY */

Reviewed-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH 05/11] hw/mips: add missing include

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 20:39, Philippe Mathieu-Daudé wrote:
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  include/hw/mips/mips.h | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/include/hw/mips/mips.h b/include/hw/mips/mips.h
> index e0065ce808..16412dc150 100644
> --- a/include/hw/mips/mips.h
> +++ b/include/hw/mips/mips.h
> @@ -6,6 +6,7 @@
>  #define INITRD_PAGE_MASK (~((1 << 16) - 1))
>  
>  #include "exec/memory.h"
> +#include "hw/irq.h"
>  
>  /* gt64xxx.c */
>  PCIBus *gt64120_register(qemu_irq *pic);

Acked-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] Floating point unit bugs

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 18:36, G 3 wrote:
> 
> On May 8, 2017, at 6:13 PM, Aurelien Jarno wrote:
> 
> > On 2017-05-07 17:48, G 3 wrote:
> > > I made a diagnostic program for the floating point unit. It will test
> > > various PowerPC floating point instructions for compatibility with
> > > the
> > > PowerPC G3 processor. It was tested on a PowerPC G3 and G5 system.
> > > The
> > > results of the program in qemu-system-ppc were pretty bad. About
> > > every
> > > instruction tested is not implemented correctly.
> > > 
> > > Here is the download link to the program: 
> > > http://www.mediafire.com/file/6j9tqubvk73lkw1/floating_point_test_program.zip
> > 
> > Some comments on the code.
> > > 
> > > /* Check if everything is alright */
> > > uint32_t fpscr;
> > > asm volatile("mffs f0");
> > > asm volatile("stfd f0, 40(r1)");
> > > asm volatile("lwz %0, 44(r1)" : "=r"(fpscr));
> > > if (fpscr != 0) {
> > > printf("Warning: fpscr not equal to zero: 0x%x\n", fpscr);
> > > }
> > 
> > This is overly complicated and just doesn't compile with recent GCC
> > versions.
> 
> Which version of GCC had the problem? GCC 5.2 and GCC 3.3 seems to work
> fine. GCC 4.0 did not work. Could you send me the error message?

I tried with GCC 4.9. Actually the error message is coming from
binutils:

| main.c:34:5: warning: missing braces around initializer [-Wmissing-braces]
|  "FX", "Floating-point exception summary",
|  ^
| main.c:34:5: warning: (near initialization for 'finfo[0]') [-Wmissing-braces]
| main.c: In function 'print_fpscr_settings':
| main.c:73:26: warning: suggest parentheses around comparison in operand of 
'&' [-Wparentheses]
|  if ((fpscr >> i) & 0x1 == 1) {
|   ^
| main.c: In function 'test_failed':
| main.c:146:5: warning: format '%llx' expects argument of type 'long long 
unsigned int', but argument 2 has type 'uint32_t' [-Wformat=]
|  printf("  actual answer: 0x%" PRIx64 "\n", actual_fpscr);
|  ^
| /tmp/cctHPRx4.s: Assembler messages:
| /tmp/cctHPRx4.s:315: Error: unsupported relocation against f0
| /tmp/cctHPRx4.s:318: Error: unsupported relocation against f0
| /tmp/cctHPRx4.s:318: Error: unsupported relocation against r1
| /tmp/cctHPRx4.s:321: Error: unsupported relocation against r1
| /tmp/cctHPRx4.s:438: Error: unsupported relocation against f0
| /tmp/cctHPRx4.s:441: Error: unsupported relocation against f0
| /tmp/cctHPRx4.s:441: Error: unsupported relocation against r1
| /tmp/cctHPRx4.s:444: Error: unsupported relocation against r1

> > What about something like:
> > 
> > Converter c;
> > asm volatile("mffs %0" : "=f"(c.d));
> > fpscr = (uint32_t)c.i;
> 
> This way does work also.
> 
> > 
> > 
> > > /*
> > >  * The action to take if a test fails
> > >  * Input one: message string
> > >  * Input two: actual fpscr value
> > >  * Input three: expected fpscr value
> > >  * Input four: actual answer
> > >  * Input five: expected answer
> > >  */
> > >  void test_failed(const char *message, uint32_t actual_fpscr,
> > > uint32_t
> > > expected_fpscr,
> > >   uint64_t actual_answer, uint64_t expected_answer)
> > >  {
> > > printf("%s\n", message);
> > > printf("expected answer: 0x%" PRIx64 "\n", expected_answer);
> > > printf("  actual answer: 0x%" PRIx64 "\n", actual_fpscr);
> > 
> > This is wrong. It should be actual_answer instead of actual_fpscr. That
> > is why all the instructions seems totally wrongly implemented.
> 
> Thanks for catching this error. Actually this would only effect the "actual
> answer:" output field. The comparison between expected_answer and
> actual_answer in each individual test is still valid.

Indeed, but I guess it gives "better" results than what it looks when
looking at your mail where the values are totally wrong.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] Floating point unit bugs

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 18:09, G 3 wrote:
> 
> On May 8, 2017, at 5:54 PM, Aurelien Jarno wrote:
> 
> > On 2017-05-07 17:48, G 3 wrote:
> > > I made a diagnostic program for the floating point unit. It will test
> > > various PowerPC floating point instructions for compatibility with
> > > the
> > > PowerPC G3 processor. It was tested on a PowerPC G3 and G5 system.
> > > The
> > > results of the program in qemu-system-ppc were pretty bad. About
> > > every
> > > instruction tested is not implemented correctly.
> > 
> > I don't say that qemu-system-ppc is bug free, but this looks suspicious
> > that about every instruction is buggy.
> 
> I really hope you don't think I'm blaming anyone. I'm only reporting the
> results of the test.
> 
> > Have you tried to run your
> > program on a real G3 or G5 system?
> 
> Yes. I made sure it ran on a real G3 and G5 system without problem before
> testing it on QEMU. I suspect the Motorola designed G4 processor will not be
> compatible. I don't have a working G4 system to verify this unfortunately.
> 
> > 
> > [ snip ]
> > 
> > > 
> > > Here is the full test results after running this program in 
> > > qemu-system-ppc
> > > with a Mac OS 10.4 guest:
> > > 
> > > 
> > > 
> > > 
> > > fadd test failed
> > > expected answer: 0x3ff4
> > >   actual answer: 0x82004024
> > > expected fpscr: 0x82064000
> > >   actual fpscr: 0x82004000
> > 
> > This looks highly suspicious that the actual answer match the expected
> > answer.
> 
> You can use this web page to find the decimal value: 
> http://www6.uniovi.es/~antonio/uned/ieee754/IEEE-754hex64.html
> 
> 0x3ff4 = 1.2002
> 0x82004024 = -4.8529708162167760e-299
> 
> The expected answer and actual answer are very far from each other.

Yes, I made a typo in my comment. I wanted to say that I found very
suspicious that the actual answer match the actual fpscr. See my other
mail for the reason.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] Floating point unit bugs

2017-05-08 Thread Aurelien Jarno
On 2017-05-07 17:48, G 3 wrote:
> I made a diagnostic program for the floating point unit. It will test
> various PowerPC floating point instructions for compatibility with the
> PowerPC G3 processor. It was tested on a PowerPC G3 and G5 system. The
> results of the program in qemu-system-ppc were pretty bad. About every
> instruction tested is not implemented correctly.
> 
> Here is the download link to the program: 
> http://www.mediafire.com/file/6j9tqubvk73lkw1/floating_point_test_program.zip

Some comments on the code.
> 
> /* Check if everything is alright */
> uint32_t fpscr;
> asm volatile("mffs f0");
> asm volatile("stfd f0, 40(r1)");
> asm volatile("lwz %0, 44(r1)" : "=r"(fpscr));
> if (fpscr != 0) {
> printf("Warning: fpscr not equal to zero: 0x%x\n", fpscr);
> }

This is overly complicated and just doesn't compile with recent GCC
versions. What about something like:

Converter c;
asm volatile("mffs %0" : "=f"(c.d));
fpscr = (uint32_t)c.i;


> /*
>  * The action to take if a test fails
>  * Input one: message string
>  * Input two: actual fpscr value
>  * Input three: expected fpscr value
>  * Input four: actual answer
>  * Input five: expected answer
>  */
>  void test_failed(const char *message, uint32_t actual_fpscr, uint32_t
> expected_fpscr,
>   uint64_t actual_answer, uint64_t expected_answer)
>  {
> printf("%s\n", message);
> printf("expected answer: 0x%" PRIx64 "\n", expected_answer);
> printf("  actual answer: 0x%" PRIx64 "\n", actual_fpscr);

This is wrong. It should be actual_answer instead of actual_fpscr. That
is why all the instructions seems totally wrongly implemented. Note that
compiling with -Wall would give you a warning:

| main.c: In function ‘test_failed’:
| main.c:146:5: warning: format ‘%llx’ expects argument of type ‘long long 
unsigned int’, but argument 2 has type ‘uint32_t’ [-Wformat=]
|  printf("  actual answer: 0x%" PRIx64 "\n", actual_fpscr);
|  ^

So I think the cone needs to be improved a bit before we can conclude
anything.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] Floating point unit bugs

2017-05-08 Thread Aurelien Jarno
On 2017-05-07 17:48, G 3 wrote:
> I made a diagnostic program for the floating point unit. It will test
> various PowerPC floating point instructions for compatibility with the
> PowerPC G3 processor. It was tested on a PowerPC G3 and G5 system. The
> results of the program in qemu-system-ppc were pretty bad. About every
> instruction tested is not implemented correctly.

I don't say that qemu-system-ppc is bug free, but this looks suspicious
that about every instruction is buggy. Have you tried to run your
program on a real G3 or G5 system?

[ snip ]

> 
> Here is the full test results after running this program in qemu-system-ppc
> with a Mac OS 10.4 guest:
> 
> 
> 
> 
> fadd test failed
> expected answer: 0x3ff4
>   actual answer: 0x82004024
> expected fpscr: 0x82064000
>   actual fpscr: 0x82004000

This looks highly suspicious that the actual answer match the expected
answer.

> actual FPSCR bits set:
> bit 17: FPRF - Floating-point greater than or positive
> bit 6: XX - Floating-point inexact exception
> bit 0: FX - Floating-point exception summary
> 
> expected FPSCR bits set:
> bit 17: FPRF - Floating-point greater than or positive
> bit 14: FI - Floating-point fraction inexact
> bit 13: FR - Floating-point fraction rounded
> bit 6: XX - Floating-point inexact exception
> bit 0: FX - Floating-point exception summary
> 
> fadds test failed
> expected answer: 0x407024d5
>   actual answer: 0x82004024
> expected fpscr: 0x82064000
>   actual fpscr: 0x82004000

Ditto.

> actual FPSCR bits set:
> bit 17: FPRF - Floating-point greater than or positive
> bit 6: XX - Floating-point inexact exception
> bit 0: FX - Floating-point exception summary
> 
> expected FPSCR bits set:
> bit 17: FPRF - Floating-point greater than or positive
> bit 14: FI - Floating-point fraction inexact
> bit 13: FR - Floating-point fraction rounded
> bit 6: XX - Floating-point inexact exception
> bit 0: FX - Floating-point exception summary
> 
> fsub test passed
> fsubs test passed
> fmul test failed
> expected answer: 0x40365c28f5c28f5c
>   actual answer: 0x82004024
> expected fpscr: 0x82024000
>   actual fpscr: 0x82004000

Ditto.

> actual FPSCR bits set:
> bit 17: FPRF - Floating-point greater than or positive
> bit 6: XX - Floating-point inexact exception
> bit 0: FX - Floating-point exception summary
> 
> expected FPSCR bits set:
> bit 17: FPRF - Floating-point greater than or positive
> bit 14: FI - Floating-point fraction inexact
> bit 6: XX - Floating-point inexact exception
> bit 0: FX - Floating-point exception summary
> 
> fmuls test failed
> expected answer: 0x412135a4a000
>   actual answer: 0x82004024
> expected fpscr: 0x82024000
>   actual fpscr: 0x82004000
> 
> actual FPSCR bits set:
> bit 17: FPRF - Floating-point greater than or positive
> bit 6: XX - Floating-point inexact exception
> bit 0: FX - Floating-point exception summary
> 
> expected FPSCR bits set:
> bit 17: FPRF - Floating-point greater than or positive
> bit 14: FI - Floating-point fraction inexact
> bit 6: XX - Floating-point inexact exception
> bit 0: FX - Floating-point exception summary
> 
> fdiv test failed
> expected answer: 0x40059f38ee13b48b
>   actual answer: 0x82004024
> expected fpscr: 0x82064000
>   actual fpscr: 0x82004000

Ditto.

And so on...

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] Future of SoftFloat use in QEMU

2017-05-08 Thread Aurelien Jarno
On 2017-05-08 17:58, Thomas Huth wrote:
> On 08.05.2017 17:36, Aurelien Jarno wrote:
> > Hi,
> > 
> > On 2017-05-08 15:58, Alex Bennée wrote:
> >> Hi,
> >>
> >> We've got a task coming up to implement half-precision floating point
> >> (FP16) for ARMv8.2. As you know pretty much all our floating point in
> >> QEMU is handled by our internal fork of John R. Hauser's BSD SoftFloat
> >> library. Our current implementation is based on version 2a which doesn't
> >> support FP16.
> >>
> >> As it happens there has been a new release of SoftFloat recently.
> >> Version 3 is a complete re-write which made a number of changes, some
> >> notable ones being:
> >>
> >>   - Complete rewrite, different use license than earlier releases.
> >>   - Renaming most types and functions, upgrading some algorithms
> >> - restructuring the source files, and making SoftFloat into a true 
> >> library.
> >>   - Added functions to convert between floating-point and unsigned 
> >> integers, both 32-bit and 64-bit (uint32_t and uint64_t).
> >>   - Added functions for fused multiply-add, for all supported 
> >> floating-point formats except 80-bit double-extended-precision.
> >>   - Added support for a fifth rounding mode, near_maxMag (round to 
> >> nearest, with ties to maximum magnitude, away from zero).
> >>
> >> And in the most recent release as of February 2017, 3c:
> >>
> >>   - Added optional rounding mode odd (round to odd, also known as jamming).
> >>   - Implemented the common 16-bit “half-precision” floating-point format 
> >> (float16_t)
> >>
> >> See: 
> >> http://www.jhauser.us/arithmetic/SoftFloat-3c/doc/SoftFloat-history.html
> >>
> >> Of course the softfloat in QEMU's tree hasn't been static either. We've
> >> made numerous changes over the years to add and fix various features,
> >> including features that have since been added to the upstream softfloat.
> >> It seems unlikely we could switch to the newer softfloat without risking
> >> breaking something. However if we look at back-porting stuff from the
> >> newer library we essentially get to own our version of softfloat
> >> forever.
> > 
> > There have been many many changes in our forked version of softfloat:
> > qNaN/sNaN, IEEE754-2008 functions, squash input denormal, many floatx80
> > fixes, ...
> 
> Note that we've apparently also got plenty of bugs in our version of
> softloat left, for example:

I don't say that our version of softfloat is bug free, but I would not
blame softfloat without further investigation:

> - https://bugs.launchpad.net/qemu/+bug/645662

qemu/i386 doesn't use softfloat for many instructions, but rather rely
on the float/double type of the host. This is mostly because softfloat
doesn't provide directly usable trigonometric functions. There are also
a few simple ones that might be more easily converted to softfloat.

> - http://lists.nongnu.org/archive/html/qemu-ppc/2017-05/msg00187.html
> 
> ... would be interesting to know whether such issues are fixed with the
> newer version of softfloat...

The exception flags are likely to be a bug in the PowerPC
implementation, as this CPU provides more fine grained exception than
what softfloat provides. The actual wrong answers which match the
expected fpscr value look very suspicious to me.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] Future of SoftFloat use in QEMU

2017-05-08 Thread Aurelien Jarno
Hi,

On 2017-05-08 15:58, Alex Bennée wrote:
> Hi,
> 
> We've got a task coming up to implement half-precision floating point
> (FP16) for ARMv8.2. As you know pretty much all our floating point in
> QEMU is handled by our internal fork of John R. Hauser's BSD SoftFloat
> library. Our current implementation is based on version 2a which doesn't
> support FP16.
> 
> As it happens there has been a new release of SoftFloat recently.
> Version 3 is a complete re-write which made a number of changes, some
> notable ones being:
> 
>   - Complete rewrite, different use license than earlier releases.
>   - Renaming most types and functions, upgrading some algorithms
> - restructuring the source files, and making SoftFloat into a true 
> library.
>   - Added functions to convert between floating-point and unsigned integers, 
> both 32-bit and 64-bit (uint32_t and uint64_t).
>   - Added functions for fused multiply-add, for all supported floating-point 
> formats except 80-bit double-extended-precision.
>   - Added support for a fifth rounding mode, near_maxMag (round to nearest, 
> with ties to maximum magnitude, away from zero).
> 
> And in the most recent release as of February 2017, 3c:
> 
>   - Added optional rounding mode odd (round to odd, also known as jamming).
>   - Implemented the common 16-bit “half-precision” floating-point format 
> (float16_t)
> 
> See: http://www.jhauser.us/arithmetic/SoftFloat-3c/doc/SoftFloat-history.html
> 
> Of course the softfloat in QEMU's tree hasn't been static either. We've
> made numerous changes over the years to add and fix various features,
> including features that have since been added to the upstream softfloat.
> It seems unlikely we could switch to the newer softfloat without risking
> breaking something. However if we look at back-porting stuff from the
> newer library we essentially get to own our version of softfloat
> forever.

There have been many many changes in our forked version of softfloat:
qNaN/sNaN, IEEE754-2008 functions, squash input denormal, many floatx80
fixes, ...

Do we know if all those changes are also in the new softfloat version,
and if it is not the case if we can work with upstream to implement
that? That seems to me like a pre-requisite before trying to get this
new version in QEMU, either as an option or as the only version.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



[Qemu-devel] [PATCH 2/3] scsi-disk: export rotational qdev property

2017-05-06 Thread Aurelien Jarno
Export the rotational qdev property to the block device characteristics
VPD page.

Signed-off-by: Aurelien Jarno 
---
 hw/scsi/scsi-disk.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index a53f058621..21b7e21a23 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -597,6 +597,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, 
uint8_t *outbuf)
 outbuf[buflen++] = 0x83; // device identification
 if (s->qdev.type == TYPE_DISK) {
 outbuf[buflen++] = 0xb0; // block limits
+outbuf[buflen++] = 0xb1; /* block device characteristics */
 outbuf[buflen++] = 0xb2; // thin provisioning
 }
 break;
@@ -739,6 +740,19 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, 
uint8_t *outbuf)
 outbuf[43] = max_io_sectors & 0xff;
 break;
 }
+case 0xb1: /* block device characteristics */
+{
+buflen = 0x40;
+memset(outbuf + 4, 0, buflen - 4);
+
+/* medium rotation rate: 0 = not reported, 1 = non-rotating */
+outbuf[4] = 0;
+outbuf[5] = s->qdev.conf.rotational ? 0 : 1;
+
+/* nominal form factor */
+outbuf[7] = 0; /* not reported */
+break;
+}
 case 0xb2: /* thin provisioning */
 {
 buflen = 8;
-- 
2.11.0




[Qemu-devel] [PATCH 1/3] hw/block: Introduce rotational qdev property

2017-05-06 Thread Aurelien Jarno
The Linux kernel uses different I/O scheduler depending if the block
device is a rotational device or not. Also it uses rotational devices
to add entropy to the random pool.

This patch add a rotational qdev property so that the block device can
be configured as a rotational device or a non-rotational device. Default
to true to not change the default behavior.

Signed-off-by: Aurelien Jarno 
---
 blockdev.c   | 4 
 include/hw/block/block.h | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/blockdev.c b/blockdev.c
index 4d8cdedd54..c914420641 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -4037,6 +4037,10 @@ QemuOptsList qemu_common_drive_opts = {
 .name = BDRV_OPT_READ_ONLY,
 .type = QEMU_OPT_BOOL,
 .help = "open drive file as read-only",
+},{
+.name = "rotational",
+.type = QEMU_OPT_BOOL,
+.help = "rotational drive (off, on)",
 },
 
 THROTTLE_OPTS,
diff --git a/include/hw/block/block.h b/include/hw/block/block.h
index f3f6e8ef02..304a579eca 100644
--- a/include/hw/block/block.h
+++ b/include/hw/block/block.h
@@ -27,6 +27,7 @@ typedef struct BlockConf {
 uint32_t cyls, heads, secs;
 OnOffAuto wce;
 bool share_rw;
+bool rotational;
 BlockdevOnError rerror;
 BlockdevOnError werror;
 } BlockConf;
@@ -56,7 +57,8 @@ static inline unsigned int get_physical_block_exp(BlockConf 
*conf)
_conf.discard_granularity, -1), \
 DEFINE_PROP_ON_OFF_AUTO("write-cache", _state, _conf.wce, \
 ON_OFF_AUTO_AUTO), \
-DEFINE_PROP_BOOL("share-rw", _state, _conf.share_rw, false)
+DEFINE_PROP_BOOL("share-rw", _state, _conf.share_rw, false), \
+DEFINE_PROP_BOOL("rotational", _state, _conf.rotational, true)
 
 #define DEFINE_BLOCK_CHS_PROPERTIES(_state, _conf)  \
 DEFINE_PROP_UINT32("cyls", _state, _conf.cyls, 0),  \
-- 
2.11.0




[Qemu-devel] [PATCH 3/3] ide: export rotational qdev property

2017-05-06 Thread Aurelien Jarno
Export the rotational qdev property in the IDENTIFY request.

Signed-off-by: Aurelien Jarno 
---
 hw/ide/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 0b48b64d3a..1aa76b0d90 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -191,6 +191,9 @@ static void ide_identify(IDEState *s)
 if (dev && dev->conf.discard_granularity) {
 put_le16(p + 169, 1); /* TRIM support */
 }
+if (dev && !dev->conf.rotational) {
+put_le16(p + 217, 1); /* non-rotating device */
+}
 
 ide_identify_size(s);
 s->identify_set = 1;
-- 
2.11.0




Re: [Qemu-devel] [PATCH v6 15/25] tcg/s390: Implement goto_ptr

2017-05-06 Thread Aurelien Jarno
On 2017-05-02 12:22, Richard Henderson wrote:
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390/tcg-target.h |  2 +-
>  tcg/s390/tcg-target.inc.c | 24 +---
>  2 files changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
> index 6b7bcfb..957f0c0 100644
> --- a/tcg/s390/tcg-target.h
> +++ b/tcg/s390/tcg-target.h
> @@ -92,7 +92,7 @@ extern uint64_t s390_facilities;
>  #define TCG_TARGET_HAS_mulsh_i32  0
>  #define TCG_TARGET_HAS_extrl_i64_i32  0
>  #define TCG_TARGET_HAS_extrh_i64_i32  0
> -#define TCG_TARGET_HAS_goto_ptr   0
> +#define TCG_TARGET_HAS_goto_ptr   1
>  
>  #define TCG_TARGET_HAS_div2_i64   1
>  #define TCG_TARGET_HAS_rot_i641
> diff --git a/tcg/s390/tcg-target.inc.c b/tcg/s390/tcg-target.inc.c
> index a679280..5d7083e 100644
> --- a/tcg/s390/tcg-target.inc.c
> +++ b/tcg/s390/tcg-target.inc.c
> @@ -1741,9 +1741,14 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
> opc,
>  
>  switch (opc) {
>  case INDEX_op_exit_tb:
> -/* return value */
> -tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, args[0]);
> -tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
> +/* Reuse the zeroing that exists for goto_ptr.  */
> +a0 = args[0];
> +if (a0 == 0) {
> +tgen_gotoi(s, S390_CC_ALWAYS, s->code_gen_epilogue);
> +} else {
> +tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, a0);
> +tgen_gotoi(s, S390_CC_ALWAYS, tb_ret_addr);
> +}
>  break;
>  
>  case INDEX_op_goto_tb:
> @@ -1767,6 +1772,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
> opc,
>  s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
>  break;
>  
> +case INDEX_op_goto_ptr:
> +tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, args[0]);
> +break;
> +
>  OP_32_64(ld8u):
>  /* ??? LLC (RXY format) is only present with the extended-immediate
> facility, whereas LLGC is always present.  */
> @@ -2241,6 +2250,7 @@ static const TCGTargetOpDef s390_op_defs[] = {
>  { INDEX_op_exit_tb, { } },
>  { INDEX_op_goto_tb, { } },
>  { INDEX_op_br, { } },
> +{ INDEX_op_goto_ptr, { "r" } },
>  
>  { INDEX_op_ld8u_i32, { "r", "r" } },
>  { INDEX_op_ld8s_i32, { "r", "r" } },
> @@ -2439,6 +2449,14 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>  /* br %r3 (go to TB) */
>  tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, tcg_target_call_iarg_regs[1]);
>  
> +/*
> + * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
> + * and fall through to the rest of the epilogue.
> + */
> +s->code_gen_epilogue = s->code_ptr;
> +tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R2, 0);
> +
> +/* TB epilogue */
>  tb_ret_addr = s->code_ptr;
>  
>  /* lmg %r6,%r15,fs+48(%r15) (restore registers) */

Reviewed-by: Aurelien Jarno 
Tested-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v6 18/25] target/s390: Use tcg_gen_lookup_and_goto_ptr

2017-05-06 Thread Aurelien Jarno
On 2017-05-02 12:22, Richard Henderson wrote:
> Signed-off-by: Richard Henderson 
> ---
>  target/s390x/translate.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/target/s390x/translate.c b/target/s390x/translate.c
> index 01c6217..f7c2123 100644
> --- a/target/s390x/translate.c
> +++ b/target/s390x/translate.c
> @@ -608,11 +608,16 @@ static void gen_op_calc_cc(DisasContext *s)
>  set_cc_static(s);
>  }
>  
> -static int use_goto_tb(DisasContext *s, uint64_t dest)
> +static bool use_exit_tb(DisasContext *s)
>  {
> -if (unlikely(s->singlestep_enabled) ||
> -(s->tb->cflags & CF_LAST_IO) ||
> -(s->tb->flags & FLAG_MASK_PER)) {
> +return (s->singlestep_enabled ||
> +(s->tb->cflags & CF_LAST_IO) ||
> +(s->tb->flags & FLAG_MASK_PER));
> +}
> +
> +static bool use_goto_tb(DisasContext *s, uint64_t dest)
> +{
> +if (unlikely(use_exit_tb(s))) {
>  return false;
>  }
>  #ifndef CONFIG_USER_ONLY
> @@ -5426,8 +5431,10 @@ void gen_intermediate_code(CPUS390XState *env, struct 
> TranslationBlock *tb)
>  /* Exit the TB, either by raising a debug exception or by return.  */
>  if (do_debug) {
>  gen_exception(EXCP_DEBUG);
> -} else {
> +} else if (use_exit_tb(&dc)) {
>  tcg_gen_exit_tb(0);
> +    } else {
> +    tcg_gen_lookup_and_goto_ptr(psw_addr);
>  }
>  break;
>  default:

Reviewed-by: Aurelien Jarno 
Tested-by: Aurelien Jarno 


-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v6 13/25] tcg/aarch64: Implement goto_ptr

2017-05-06 Thread Aurelien Jarno
ing that exists for goto_ptr.  */
> +if (a0 == 0) {
> +tcg_out_goto(s, s->code_gen_epilogue);
> +} else {
> +tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
> +tcg_out_goto(s, tb_ret_addr);
> +}
>  break;
>  
>  case INDEX_op_goto_tb:
> @@ -1374,6 +1379,10 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
>  s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
>  break;
>  
> +case INDEX_op_goto_ptr:
> +tcg_out_insn(s, 3207, BR, a0);
> +break;
> +
>  case INDEX_op_br:
>  tcg_out_goto_label(s, arg_label(a0));
>  break;
> @@ -1735,6 +1744,7 @@ static const TCGTargetOpDef aarch64_op_defs[] = {
>  { INDEX_op_exit_tb, { } },
>  { INDEX_op_goto_tb, { } },
>  { INDEX_op_br, { } },
> +{ INDEX_op_goto_ptr, { "r" } },
>  
>  { INDEX_op_ld8u_i32, { "r", "r" } },
>  { INDEX_op_ld8s_i32, { "r", "r" } },
> @@ -1942,6 +1952,14 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>  tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
>  tcg_out_insn(s, 3207, BR, tcg_target_call_iarg_regs[1]);
>  
> +/*
> + * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
> + * and fall through to the rest of the epilogue.
> + */
> +s->code_gen_epilogue = s->code_ptr;
> +tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_X0, 0);
> +
> +/* TB epilogue */
>  tb_ret_addr = s->code_ptr;
>  
>  /* Remove TCG locals stack space.  */

Reviewed-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH] mips: set CP0 Debug DExcCode for SDBBP instruction

2017-05-06 Thread Aurelien Jarno
On 2017-05-02 15:03, Pavel Dovgalyuk wrote:
> From: Pavel Dovgalyuk 
> 
> This patch fixes setting DExcCode field of CP0 Debug register
> when SDBBP instruction is executed. According to EJTAG specification,
> this field must be set to the value 9 (Bp).
> 
> Signed-off-by: Pavel Dovgalyuk 
> ---
>  target/mips/helper.c |2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/target/mips/helper.c b/target/mips/helper.c
> index e359ca3b44..166f0d1243 100644
> --- a/target/mips/helper.c
> +++ b/target/mips/helper.c
> @@ -627,6 +627,8 @@ void mips_cpu_do_interrupt(CPUState *cs)
>  goto set_DEPC;
>  case EXCP_DBp:
>  env->CP0_Debug |= 1 << CP0DB_DBp;
> +/* Setup DExcCode - SDBBP instruction */
> +env->CP0_Debug = (env->CP0_Debug & ~(0x1fULL << CP0DB_DEC)) | 9 << 
> CP0DB_DEC;
>  goto set_DEPC;
>      case EXCP_DDBS:
>  env->CP0_Debug |= 1 << CP0DB_DDBS;
> 

Reviewed-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



[Qemu-devel] [PATCH v2 01/14] target/sh4: split ctx->flags into ctx->tbflags and ctx->envflags

2017-05-06 Thread Aurelien Jarno
There is a confusion (and not only in the SH4 target) between tb->flags,
env->flags and ctx->flags. To avoid it, split ctx->flags into
ctx->tbflags and ctx->envflags. ctx->tbflags stays unchanged during the
whole TB translation, while ctx->envflags evolves and is kept in sync
with env->flags using TCG instructions. ctx->envflags now only contains
the part that of env->flags that is contained in the TB state, i.e. the
DELAY_SLOT* flags.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 161 +
 1 file changed, 82 insertions(+), 79 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index c89a14733f..6b526a02f2 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -37,7 +37,8 @@ typedef struct DisasContext {
 struct TranslationBlock *tb;
 target_ulong pc;
 uint16_t opcode;
-uint32_t flags;
+uint32_t tbflags;/* should stay unmodified during the TB translation */
+uint32_t envflags;   /* should stay in sync with env->flags using TCG ops 
*/
 int bstate;
 int memidx;
 uint32_t delayed_pc;
@@ -49,7 +50,7 @@ typedef struct DisasContext {
 #if defined(CONFIG_USER_ONLY)
 #define IS_USER(ctx) 1
 #else
-#define IS_USER(ctx) (!(ctx->flags & (1u << SR_MD)))
+#define IS_USER(ctx) (!(ctx->tbflags & (1u << SR_MD)))
 #endif
 
 enum {
@@ -317,51 +318,50 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 #define B11_8 ((ctx->opcode >> 8) & 0xf)
 #define B15_12 ((ctx->opcode >> 12) & 0xf)
 
-#define REG(x) ((x) < 8 && (ctx->flags & (1u << SR_MD))\
-&& (ctx->flags & (1u << SR_RB))\
+#define REG(x) ((x) < 8 && (ctx->tbflags & (1u << SR_MD))\
+&& (ctx->tbflags & (1u << SR_RB))\
 ? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
 
-#define ALTREG(x) ((x) < 8 && (!(ctx->flags & (1u << SR_MD))\
-   || !(ctx->flags & (1u << SR_RB)))\
+#define ALTREG(x) ((x) < 8 && (!(ctx->tbflags & (1u << SR_MD))\
+   || !(ctx->tbflags & (1u << SR_RB)))\
? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
 
-#define FREG(x) (ctx->flags & FPSCR_FR ? (x) ^ 0x10 : (x))
+#define FREG(x) (ctx->tbflags & FPSCR_FR ? (x) ^ 0x10 : (x))
 #define XHACK(x) x) & 1 ) << 4) | ((x) & 0xe))
-#define XREG(x) (ctx->flags & FPSCR_FR ? XHACK(x) ^ 0x10 : XHACK(x))
+#define XREG(x) (ctx->tbflags & FPSCR_FR ? XHACK(x) ^ 0x10 : XHACK(x))
 #define DREG(x) FREG(x) /* Assumes lsb of (x) is always 0 */
 
 #define CHECK_NOT_DELAY_SLOT \
-  if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) \
-  {   \
-  tcg_gen_movi_i32(cpu_pc, ctx->pc);  \
-  gen_helper_raise_slot_illegal_instruction(cpu_env); \
-  ctx->bstate = BS_BRANCH;\
-  return; \
-  }
-
-#define CHECK_PRIVILEGED\
-  if (IS_USER(ctx)) {   \
-  tcg_gen_movi_i32(cpu_pc, ctx->pc);\
-  if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
-  gen_helper_raise_slot_illegal_instruction(cpu_env);   \
-  } else {  \
-  gen_helper_raise_illegal_instruction(cpu_env);\
-  } \
-  ctx->bstate = BS_BRANCH;  \
-  return;   \
-  }
-
-#define CHECK_FPU_ENABLED   \
-  if (ctx->flags & (1u << SR_FD)) { \
-  tcg_gen_movi_i32(cpu_pc, ctx->pc);\
-  if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
-  gen_helper_raise_slot_fpu_disable(cpu_env);   \
-  } else {  \
-  gen_helper_raise_fpu_disable(cpu_env);\
-  } \
-  ctx->bstate = BS_BRANCH;  \
-  return;   \
-  }
+if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
+tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
+gen_helper_raise_slot_illegal_instruction(cpu_env);  \
+ctx->bstate = BS_BRANCH; \
+  

[Qemu-devel] [PATCH v2 14/14] target/sh4: trap unaligned accesses

2017-05-06 Thread Aurelien Jarno
SH4 requires that memory accesses are naturally aligned, except for the
SH4-A movua.l instructions which can do unaligned loads.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.c   |  1 +
 target/sh4/cpu.h   |  4 
 target/sh4/op_helper.c | 19 +++
 target/sh4/translate.c |  6 --
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index 9a481c35dc..9da7e1ed38 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -301,6 +301,7 @@ static void superh_cpu_class_init(ObjectClass *oc, void 
*data)
 #ifdef CONFIG_USER_ONLY
 cc->handle_mmu_fault = superh_cpu_handle_mmu_fault;
 #else
+cc->do_unaligned_access = superh_cpu_do_unaligned_access;
 cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
 #endif
 cc->disas_set_info = superh_cpu_disas_set_info;
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index faab3012f9..6c07c6b24b 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -24,6 +24,7 @@
 #include "cpu-qom.h"
 
 #define TARGET_LONG_BITS 32
+#define ALIGNED_ONLY
 
 /* CPU Subtypes */
 #define SH_CPU_SH7750  (1 << 0)
@@ -215,6 +216,9 @@ void superh_cpu_dump_state(CPUState *cpu, FILE *f,
 hwaddr superh_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr);
 int superh_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg);
 int superh_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
+void superh_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
+MMUAccessType access_type,
+int mmu_idx, uintptr_t retaddr);
 
 void sh4_translate_init(void);
 SuperHCPU *cpu_sh4_init(const char *cpu_model);
diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
index 684d3f3758..4abd05667c 100644
--- a/target/sh4/op_helper.c
+++ b/target/sh4/op_helper.c
@@ -24,6 +24,25 @@
 
 #ifndef CONFIG_USER_ONLY
 
+void superh_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
+MMUAccessType access_type,
+int mmu_idx, uintptr_t retaddr)
+{
+if (retaddr) {
+cpu_restore_state(cs, retaddr);
+}
+switch (access_type) {
+case MMU_INST_FETCH:
+case MMU_DATA_LOAD:
+cs->exception_index = 0x0e0;
+break;
+case MMU_DATA_STORE:
+cs->exception_index = 0x100;
+break;
+}
+cpu_loop_exit(cs);
+}
+
 void tlb_fill(CPUState *cs, target_ulong addr, MMUAccessType access_type,
   int mmu_idx, uintptr_t retaddr)
 {
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 29ea506c1d..e4e602021e 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1505,14 +1505,16 @@ static void _decode_opc(DisasContext * ctx)
 case 0x40a9:/* movua.l @Rm,R0 */
 /* Load non-boundary-aligned data */
 if (ctx->features & SH_FEATURE_SH4A) {
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx,
+MO_TEUL | MO_UNALN);
 return;
 }
 break;
 case 0x40e9:/* movua.l @Rm+,R0 */
 /* Load non-boundary-aligned data */
 if (ctx->features & SH_FEATURE_SH4A) {
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx,
+MO_TEUL | MO_UNALN);
 tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
 return;
 }
-- 
2.11.0




[Qemu-devel] [PATCH v2 07/14] target/sh4: only save flags state at the end of the TB

2017-05-06 Thread Aurelien Jarno
There is no need to save flags when entering and exiting the delay slot.
They can be saved only when reaching the end of the TB. If the TB is
interrupted before by an exception, they will be restored using
restore_state_to_opc.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 69 --
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index f608e314b6..8cee7d333f 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -212,6 +212,20 @@ static void gen_write_sr(TCGv src)
 tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
 }
 
+static inline void gen_save_cpu_state(DisasContext *ctx, bool save_pc)
+{
+if (save_pc) {
+tcg_gen_movi_i32(cpu_pc, ctx->pc);
+}
+if (ctx->delayed_pc != (uint32_t) -1) {
+tcg_gen_movi_i32(cpu_delayed_pc, ctx->delayed_pc);
+}
+if ((ctx->tbflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL))
+!= ctx->envflags) {
+tcg_gen_movi_i32(cpu_flags, ctx->envflags);
+}
+}
+
 static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
 {
 if (unlikely(ctx->singlestep_enabled)) {
@@ -246,6 +260,7 @@ static void gen_jump(DisasContext * ctx)
/* Target is not statically known, it comes necessarily from a
   delayed jump as immediate jump are conditinal jumps */
tcg_gen_mov_i32(cpu_pc, cpu_delayed_pc);
+tcg_gen_discard_i32(cpu_delayed_pc);
if (ctx->singlestep_enabled)
 gen_helper_debug(cpu_env);
tcg_gen_exit_tb(0);
@@ -254,21 +269,12 @@ static void gen_jump(DisasContext * ctx)
 }
 }
 
-static inline void gen_branch_slot(uint32_t delayed_pc, int t)
-{
-tcg_gen_movi_i32(cpu_delayed_pc, delayed_pc);
-if (t) {
-tcg_gen_mov_i32(cpu_delayed_cond, cpu_sr_t);
-} else {
-tcg_gen_xori_i32(cpu_delayed_cond, cpu_sr_t, 1);
-}
-}
-
 /* Immediate conditional jump (bt or bf) */
 static void gen_conditional_jump(DisasContext * ctx,
 target_ulong ift, target_ulong ifnott)
 {
 TCGLabel *l1 = gen_new_label();
+gen_save_cpu_state(ctx, false);
 tcg_gen_brcondi_i32(TCG_COND_NE, cpu_sr_t, 0, l1);
 gen_goto_tb(ctx, 0, ifnott);
 gen_set_label(l1);
@@ -291,11 +297,6 @@ static void gen_delayed_conditional_jump(DisasContext * 
ctx)
 gen_jump(ctx);
 }
 
-static inline void gen_store_flags(uint32_t flags)
-{
-tcg_gen_movi_i32(cpu_flags, flags);
-}
-
 static inline void gen_load_fpr64(TCGv_i64 t, int reg)
 {
 tcg_gen_concat_i32_i64(t, cpu_fregs[reg + 1], cpu_fregs[reg]);
@@ -337,7 +338,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 #define CHECK_NOT_DELAY_SLOT \
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
-tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
+gen_save_cpu_state(ctx, true);   \
 gen_helper_raise_slot_illegal_instruction(cpu_env);  \
 ctx->bstate = BS_EXCP;   \
 return;  \
@@ -345,7 +346,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 #define CHECK_PRIVILEGED \
 if (IS_USER(ctx)) {  \
-tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
+gen_save_cpu_state(ctx, true);   \
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
 gen_helper_raise_slot_illegal_instruction(cpu_env);  \
 } else { \
@@ -357,7 +358,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 #define CHECK_FPU_ENABLED\
 if (ctx->tbflags & (1u << SR_FD)) {  \
-tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
+gen_save_cpu_state(ctx, true);   \
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
 gen_helper_raise_slot_fpu_disable(cpu_env);  \
 } else { \
@@ -501,14 +502,12 @@ static void _decode_opc(DisasContext * ctx)
 case 0xa000:   /* bra disp */
CHECK_NOT_DELAY_SLOT
ctx->delayed_pc = ctx->pc + 4 + B11_0s * 2;
-   tcg_gen_movi_i32(cpu_delayed_pc, ctx->delayed_pc);
 ctx->envflags |= DELAY_SLOT;
return;
 case 0xb000:   /* bsr disp */
CHECK_NOT_DELAY_SLOT
tcg_gen_movi_i32(cpu_pr, ctx->pc + 4);
ctx->delayed_pc = ctx->pc + 4 + B11_0s * 2;
-   tcg_gen_movi_i32(cpu_delayed_pc, ctx->delayed_pc);

[Qemu-devel] [PATCH v2 08/14] target/sh4: fold ctx->bstate = BS_BRANCH into gen_conditional_jump

2017-05-06 Thread Aurelien Jarno
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 8cee7d333f..a4c7a0895b 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -279,6 +279,7 @@ static void gen_conditional_jump(DisasContext * ctx,
 gen_goto_tb(ctx, 0, ifnott);
 gen_set_label(l1);
 gen_goto_tb(ctx, 1, ift);
+ctx->bstate = BS_BRANCH;
 }
 
 /* Delayed conditional jump (bt or bf) */
@@ -1158,9 +1159,7 @@ static void _decode_opc(DisasContext * ctx)
return;
 case 0x8b00:   /* bf label */
CHECK_NOT_DELAY_SLOT
-   gen_conditional_jump(ctx, ctx->pc + 2,
-ctx->pc + 4 + B7_0s * 2);
-   ctx->bstate = BS_BRANCH;
+gen_conditional_jump(ctx, ctx->pc + 2, ctx->pc + 4 + B7_0s * 2);
return;
 case 0x8f00:   /* bf/s label */
CHECK_NOT_DELAY_SLOT
@@ -1170,9 +1169,7 @@ static void _decode_opc(DisasContext * ctx)
return;
 case 0x8900:   /* bt label */
CHECK_NOT_DELAY_SLOT
-   gen_conditional_jump(ctx, ctx->pc + 4 + B7_0s * 2,
-ctx->pc + 2);
-   ctx->bstate = BS_BRANCH;
+gen_conditional_jump(ctx, ctx->pc + 4 + B7_0s * 2, ctx->pc + 2);
return;
 case 0x8d00:   /* bt/s label */
CHECK_NOT_DELAY_SLOT
-- 
2.11.0




[Qemu-devel] [PATCH v2 10/14] target/sh4: optimize gen_write_sr using extract op

2017-05-06 Thread Aurelien Jarno
This doesn't change the generated code on x86, but optimizes it on most
RISC architectures and makes the code simpler to read.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index b4e5606098..8c766eed2a 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -204,12 +204,9 @@ static void gen_write_sr(TCGv src)
 {
 tcg_gen_andi_i32(cpu_sr, src,
  ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
-tcg_gen_shri_i32(cpu_sr_q, src, SR_Q);
-tcg_gen_andi_i32(cpu_sr_q, cpu_sr_q, 1);
-tcg_gen_shri_i32(cpu_sr_m, src, SR_M);
-tcg_gen_andi_i32(cpu_sr_m, cpu_sr_m, 1);
-tcg_gen_shri_i32(cpu_sr_t, src, SR_T);
-tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
+tcg_gen_extract_i32(cpu_sr_q, src, SR_Q, 1);
+tcg_gen_extract_i32(cpu_sr_m, src, SR_M, 1);
+tcg_gen_extract_i32(cpu_sr_t, src, SR_T, 1);
 }
 
 static inline void gen_save_cpu_state(DisasContext *ctx, bool save_pc)
-- 
2.11.0




[Qemu-devel] [PATCH v2 13/14] target/sh4: movua.l is an SH4-A only instruction

2017-05-06 Thread Aurelien Jarno
At the same time change the comment describing the instruction the same
way than other instruction, so that the code is easier to read and search.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index d86fd29264..29ea506c1d 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1502,17 +1502,21 @@ static void _decode_opc(DisasContext * ctx)
 }
 ctx->has_movcal = 1;
return;
-case 0x40a9:
-   /* MOVUA.L @Rm,R0 (Rm) -> R0
-  Load non-boundary-aligned data */
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
-   return;
-case 0x40e9:
-   /* MOVUA.L @Rm+,R0   (Rm) -> R0, Rm + 4 -> Rm
-  Load non-boundary-aligned data */
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
-   tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
-   return;
+case 0x40a9:/* movua.l @Rm,R0 */
+/* Load non-boundary-aligned data */
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+return;
+}
+break;
+case 0x40e9:/* movua.l @Rm+,R0 */
+/* Load non-boundary-aligned data */
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
+return;
+}
+break;
 case 0x0029:   /* movt Rn */
 tcg_gen_mov_i32(REG(B11_8), cpu_sr_t);
return;
-- 
2.11.0




[Qemu-devel] [PATCH v2 09/14] target/sh4: optimize gen_store_fpr64

2017-05-06 Thread Aurelien Jarno
Isuing extrh and avoiding intermediate temps.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index a4c7a0895b..b4e5606098 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -305,13 +305,8 @@ static inline void gen_load_fpr64(TCGv_i64 t, int reg)
 
 static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 {
-TCGv_i32 tmp = tcg_temp_new_i32();
-tcg_gen_extrl_i64_i32(tmp, t);
-tcg_gen_mov_i32(cpu_fregs[reg + 1], tmp);
-tcg_gen_shri_i64(t, t, 32);
-tcg_gen_extrl_i64_i32(tmp, t);
-tcg_gen_mov_i32(cpu_fregs[reg], tmp);
-tcg_temp_free_i32(tmp);
+tcg_gen_extrl_i64_i32(cpu_fregs[reg + 1], t);
+tcg_gen_extrh_i64_i32(cpu_fregs[reg], t);
 }
 
 #define B3_0 (ctx->opcode & 0xf)
-- 
2.11.0




[Qemu-devel] [PATCH v2 02/14] target/sh4: get rid of DELAY_SLOT_CLEARME

2017-05-06 Thread Aurelien Jarno
Now that ctx->flags has been split, it becomes clear that
DELAY_SLOT_CLEARME has not impact on the code generation: in both case
ctx->envflags is cleared, either by clearing all the flags, or by
setting it to 0. This is left-over from pre-TCG era.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.h   |  3 +--
 target/sh4/helper.c|  2 --
 target/sh4/translate.c | 17 +
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index cad8989f7e..9445cc779f 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -93,7 +93,6 @@
 #define DELAY_SLOT (1 << 0)
 #define DELAY_SLOT_CONDITIONAL (1 << 1)
 #define DELAY_SLOT_TRUE(1 << 2)
-#define DELAY_SLOT_CLEARME (1 << 3)
 /* The dynamic value of the DELAY_SLOT_TRUE flag determines whether the jump
  * after the delay slot should be taken or not. It is calculated from SR_T.
  *
@@ -384,7 +383,7 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, 
target_ulong *pc,
 *pc = env->pc;
 *cs_base = 0;
 *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-| DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
+| DELAY_SLOT_TRUE))/* Bits  0- 2 */
 | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
 | (env->sr & ((1u << SR_MD) | (1u << SR_RB)))  /* Bits 29-30 */
 | (env->sr & (1u << SR_FD))/* Bit 15 */
diff --git a/target/sh4/helper.c b/target/sh4/helper.c
index 036c5ca56c..71bb49a670 100644
--- a/target/sh4/helper.c
+++ b/target/sh4/helper.c
@@ -170,8 +170,6 @@ void superh_cpu_do_interrupt(CPUState *cs)
/* Clear flags for exception/interrupt routine. */
env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL | DELAY_SLOT_TRUE);
 }
-if (env->flags & DELAY_SLOT_CLEARME)
-env->flags = 0;
 
 if (do_exp) {
 env->expevt = cs->exception_index;
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 6b526a02f2..d84a7a2e6e 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1804,15 +1804,9 @@ static void decode_opc(DisasContext * ctx)
 _decode_opc(ctx);
 
 if (old_flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) {
-if (ctx->envflags & DELAY_SLOT_CLEARME) {
-gen_store_flags(0);
-} else {
-   /* go out of the delay slot */
-uint32_t new_flags = ctx->envflags;
-   new_flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
-   gen_store_flags(new_flags);
-}
-ctx->envflags = 0;
+/* go out of the delay slot */
+ctx->envflags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
+gen_store_flags(ctx->envflags);
 ctx->bstate = BS_BRANCH;
 if (old_flags & DELAY_SLOT_CONDITIONAL) {
gen_delayed_conditional_jump(ctx);
@@ -1840,8 +1834,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 pc_start = tb->pc;
 ctx.pc = pc_start;
 ctx.tbflags = (uint32_t)tb->flags;
-ctx.envflags = tb->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL |
-DELAY_SLOT_CLEARME);
+ctx.envflags = tb->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
 ctx.bstate = BS_NONE;
 ctx.memidx = (ctx.tbflags & (1u << SR_MD)) == 0 ? 1 : 0;
 /* We don't know if the delayed pc came from a dynamic or static branch,
@@ -1908,7 +1901,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 /* fall through */
 case BS_NONE:
 if (ctx.envflags) {
-gen_store_flags(ctx.envflags | DELAY_SLOT_CLEARME);
+gen_store_flags(ctx.envflags);
}
 gen_goto_tb(&ctx, 0, ctx.pc);
 break;
-- 
2.11.0




[Qemu-devel] [PATCH v2 03/14] target/sh4: do not include DELAY_SLOT_TRUE in the TB state

2017-05-06 Thread Aurelien Jarno
DELAY_SLOT_TRUE is used as a dynamic condition for the branch after the
delay slot instruction. It is not used in code generation, so there is
no need to including in the TB state.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index 9445cc779f..da8d15f1b9 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -382,8 +382,7 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, 
target_ulong *pc,
 {
 *pc = env->pc;
 *cs_base = 0;
-*flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-| DELAY_SLOT_TRUE))/* Bits  0- 2 */
+*flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) /* Bits 0-1 
*/
 | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
 | (env->sr & ((1u << SR_MD) | (1u << SR_RB)))  /* Bits 29-30 */
 | (env->sr & (1u << SR_FD))/* Bit 15 */
-- 
2.11.0




[Qemu-devel] [PATCH v2 06/14] target/sh4: fix BS_EXCP exit

2017-05-06 Thread Aurelien Jarno
In case of exception, there is no need to call tcg_gen_exit_tb as the
exception helper won't return.

Also fix a few cases where BS_BRANCH is called instead of BS_EXCP.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 04bc18bf7c..f608e314b6 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -339,7 +339,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
 tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
 gen_helper_raise_slot_illegal_instruction(cpu_env);  \
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -351,7 +351,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 } else { \
 gen_helper_raise_illegal_instruction(cpu_env);   \
 }\
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -363,7 +363,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 } else { \
 gen_helper_raise_fpu_disable(cpu_env);   \
 }\
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -1289,7 +1289,7 @@ static void _decode_opc(DisasContext * ctx)
imm = tcg_const_i32(B7_0);
 gen_helper_trapa(cpu_env, imm);
tcg_temp_free(imm);
-   ctx->bstate = BS_BRANCH;
+ctx->bstate = BS_EXCP;
}
return;
 case 0xc800:   /* tst #imm,R0 */
@@ -1798,7 +1798,7 @@ static void _decode_opc(DisasContext * ctx)
 } else {
 gen_helper_raise_illegal_instruction(cpu_env);
 }
-ctx->bstate = BS_BRANCH;
+ctx->bstate = BS_EXCP;
 }
 
 static void decode_opc(DisasContext * ctx)
@@ -1867,7 +1867,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 /* We have hit a breakpoint - make sure PC is up-to-date */
 tcg_gen_movi_i32(cpu_pc, ctx.pc);
 gen_helper_debug(cpu_env);
-ctx.bstate = BS_BRANCH;
+ctx.bstate = BS_EXCP;
 /* The address covered by the breakpoint must be included in
[tb->pc, tb->pc + tb->size) in order to for it to be
properly cleared -- thus we increment the PC here so that
@@ -1911,9 +1911,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 gen_goto_tb(&ctx, 0, ctx.pc);
 break;
 case BS_EXCP:
-/* gen_op_interrupt_restart(); */
-tcg_gen_exit_tb(0);
-break;
+/* fall through */
 case BS_BRANCH:
 default:
 break;
-- 
2.11.0




[Qemu-devel] [PATCH v2 04/14] target/sh4: move DELAY_SLOT_TRUE flag into a separate global

2017-05-06 Thread Aurelien Jarno
Instead of using one bit of the env flags to store the condition of the
next delay slot, use a separate global. It simplifies reading and
writing the flags variable and also removes some confusion between
ctx->envflags and env->flags.

Note that the global is first transfered to a temp in order to be
able to discard the global before the brcond.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.h   | 10 ++
 target/sh4/helper.c|  2 +-
 target/sh4/translate.c | 22 +-
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index da8d15f1b9..faab3012f9 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -92,13 +92,6 @@
 
 #define DELAY_SLOT (1 << 0)
 #define DELAY_SLOT_CONDITIONAL (1 << 1)
-#define DELAY_SLOT_TRUE(1 << 2)
-/* The dynamic value of the DELAY_SLOT_TRUE flag determines whether the jump
- * after the delay slot should be taken or not. It is calculated from SR_T.
- *
- * It is unclear if it is permitted to modify the SR_T flag in a delay slot.
- * The use of DELAY_SLOT_TRUE flag makes us accept such SR_T modification.
- */
 
 typedef struct tlb_t {
 uint32_t vpn;  /* virtual page number */
@@ -148,7 +141,8 @@ typedef struct CPUSH4State {
 uint32_t sgr;  /* saved global register 15 */
 uint32_t dbr;  /* debug base register */
 uint32_t pc;   /* program counter */
-uint32_t delayed_pc;   /* target of delayed jump */
+uint32_t delayed_pc;/* target of delayed branch */
+uint32_t delayed_cond;  /* condition of delayed branch */
 uint32_t mach; /* multiply and accumulate high */
 uint32_t macl; /* multiply and accumulate low */
 uint32_t pr;   /* procedure register */
diff --git a/target/sh4/helper.c b/target/sh4/helper.c
index 71bb49a670..8f8ce81401 100644
--- a/target/sh4/helper.c
+++ b/target/sh4/helper.c
@@ -168,7 +168,7 @@ void superh_cpu_do_interrupt(CPUState *cs)
 /* Branch instruction should be executed again before delay slot. */
env->spc -= 2;
/* Clear flags for exception/interrupt routine. */
-   env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL | DELAY_SLOT_TRUE);
+env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
 }
 
 if (do_exp) {
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index d84a7a2e6e..2e29936ad8 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -72,7 +72,7 @@ static TCGv cpu_pr, cpu_fpscr, cpu_fpul, cpu_ldst;
 static TCGv cpu_fregs[32];
 
 /* internal register indexes */
-static TCGv cpu_flags, cpu_delayed_pc;
+static TCGv cpu_flags, cpu_delayed_pc, cpu_delayed_cond;
 
 #include "exec/gen-icount.h"
 
@@ -147,6 +147,10 @@ void sh4_translate_init(void)
 cpu_delayed_pc = tcg_global_mem_new_i32(cpu_env,
offsetof(CPUSH4State, delayed_pc),
"_delayed_pc_");
+cpu_delayed_cond = tcg_global_mem_new_i32(cpu_env,
+  offsetof(CPUSH4State,
+   delayed_cond),
+  "_delayed_cond_");
 cpu_ldst = tcg_global_mem_new_i32(cpu_env,
  offsetof(CPUSH4State, ldst), "_ldst_");
 
@@ -252,11 +256,12 @@ static void gen_jump(DisasContext * ctx)
 
 static inline void gen_branch_slot(uint32_t delayed_pc, int t)
 {
-TCGLabel *label = gen_new_label();
 tcg_gen_movi_i32(cpu_delayed_pc, delayed_pc);
-tcg_gen_brcondi_i32(t ? TCG_COND_EQ : TCG_COND_NE, cpu_sr_t, 0, label);
-tcg_gen_ori_i32(cpu_flags, cpu_flags, DELAY_SLOT_TRUE);
-gen_set_label(label);
+if (t) {
+tcg_gen_mov_i32(cpu_delayed_cond, cpu_sr_t);
+} else {
+tcg_gen_xori_i32(cpu_delayed_cond, cpu_sr_t, 1);
+}
 }
 
 /* Immediate conditional jump (bt or bf) */
@@ -278,18 +283,17 @@ static void gen_delayed_conditional_jump(DisasContext * 
ctx)
 
 l1 = gen_new_label();
 ds = tcg_temp_new();
-tcg_gen_andi_i32(ds, cpu_flags, DELAY_SLOT_TRUE);
+tcg_gen_mov_i32(ds, cpu_delayed_cond);
+tcg_gen_discard_i32(cpu_delayed_cond);
 tcg_gen_brcondi_i32(TCG_COND_NE, ds, 0, l1);
 gen_goto_tb(ctx, 1, ctx->pc + 2);
 gen_set_label(l1);
-tcg_gen_andi_i32(cpu_flags, cpu_flags, ~DELAY_SLOT_TRUE);
 gen_jump(ctx);
 }
 
 static inline void gen_store_flags(uint32_t flags)
 {
-tcg_gen_andi_i32(cpu_flags, cpu_flags, DELAY_SLOT_TRUE);
-tcg_gen_ori_i32(cpu_flags, cpu_flags, flags);
+tcg_gen_movi_i32(cpu_flags, flags);
 }
 
 static inline void gen_load_fpr64(TCGv_i64 t, int reg)
-- 
2.11.0




[Qemu-devel] [PATCH v2 12/14] target/sh4: implement tas.b using atomic helper

2017-05-06 Thread Aurelien Jarno
We only emulate UP SH4, however as the tas.b instruction is used in the GNU
libc, this improve linux-user emulation.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 19 +++
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 9fe2e2d4d9..d86fd29264 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1635,19 +1635,14 @@ static void _decode_opc(DisasContext * ctx)
tcg_gen_shri_i32(REG(B11_8), REG(B11_8), 16);
return;
 case 0x401b:   /* tas.b @Rn */
-   {
-   TCGv addr, val;
-   addr = tcg_temp_local_new();
-   tcg_gen_mov_i32(addr, REG(B11_8));
-   val = tcg_temp_local_new();
-tcg_gen_qemu_ld_i32(val, addr, ctx->memidx, MO_UB);
+{
+TCGv val = tcg_const_i32(0x80);
+tcg_gen_atomic_fetch_or_i32(val, REG(B11_8), val,
+ctx->memidx, MO_UB);
 tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, val, 0);
-   tcg_gen_ori_i32(val, val, 0x80);
-tcg_gen_qemu_st_i32(val, addr, ctx->memidx, MO_UB);
-   tcg_temp_free(val);
-   tcg_temp_free(addr);
-   }
-   return;
+tcg_temp_free(val);
+}
+return;
 case 0xf00d: /* fsts FPUL,FRn - FPSCR: Nothing */
CHECK_FPU_ENABLED
tcg_gen_mov_i32(cpu_fregs[FREG(B11_8)], cpu_fpul);
-- 
2.11.0




[Qemu-devel] [PATCH v2 05/14] target/sh4: fix BS_STOP exit

2017-05-06 Thread Aurelien Jarno
When stopping the translation because the state has changed, goto_tb
should not be used as it might link TB with different flags.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 2e29936ad8..04bc18bf7c 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1901,8 +1901,9 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 } else {
switch (ctx.bstate) {
 case BS_STOP:
-/* gen_op_interrupt_restart(); */
-/* fall through */
+tcg_gen_movi_i32(cpu_pc, ctx.pc);
+tcg_gen_exit_tb(0);
+break;
 case BS_NONE:
 if (ctx.envflags) {
 gen_store_flags(ctx.envflags);
-- 
2.11.0




[Qemu-devel] [PATCH v2 11/14] target/sh4: generate fences for SH4

2017-05-06 Thread Aurelien Jarno
synco is a SH4-A only instruction.

Reviewed-by: Philippe Mathieu-Daudé 
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 8c766eed2a..9fe2e2d4d9 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1570,10 +1570,11 @@ static void _decode_opc(DisasContext * ctx)
else
break;
 case 0x00ab:   /* synco */
-   if (ctx->features & SH_FEATURE_SH4A)
-   return;
-   else
-   break;
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+return;
+}
+break;
 case 0x4024:   /* rotcl Rn */
{
TCGv tmp = tcg_temp_new();
-- 
2.11.0




[Qemu-devel] [PATCH v2 00/14] target/sh4: misc fixes, cleanup and optimizations

2017-05-06 Thread Aurelien Jarno
This patch series try to improve the SH4 target by using the (more or
less) recently introduced TCG features. It also fixes some issues spot
when writting the patches (linking of TB with different flags, SH4-A
specific instructions allowed on SH4) and correctly trap unaligned
accesses.

v2:
- Add some comments in the struct DisasContext declaration, as suggested
  by Philippe Mathieu-Daudé
- Add Reviewed-by entries

Aurelien Jarno (14):
  target/sh4: split ctx->flags into ctx->tbflags and ctx->envflags
  target/sh4: get rid of DELAY_SLOT_CLEARME
  target/sh4: do not include DELAY_SLOT_TRUE in the TB state
  target/sh4: move DELAY_SLOT_TRUE flag into a separate global
  target/sh4: fix BS_STOP exit
  target/sh4: fix BS_EXCP exit
  target/sh4: only save flags state at the end of the TB
  target/sh4: fold ctx->bstate = BS_BRANCH into gen_conditional_jump
  target/sh4: optimize gen_store_fpr64
  target/sh4: optimize gen_write_sr using extract op
  target/sh4: generate fences for SH4
  target/sh4: implement tas.b using atomic helper
  target/sh4: movua.l is an SH4-A only instruction
  target/sh4: trap unaligned accesses

 target/sh4/cpu.c   |   1 +
 target/sh4/cpu.h   |  18 ++-
 target/sh4/helper.c|   4 +-
 target/sh4/op_helper.c |  19 +++
 target/sh4/translate.c | 323 -
 5 files changed, 183 insertions(+), 182 deletions(-)

-- 
2.11.0




[Qemu-devel] [PULL 0/1] tcg-mips queue

2017-05-06 Thread Aurelien Jarno
The following changes since commit 12a95f320a36ef66f724a49bb05e4fb553ac5dbe:

  Merge remote-tracking branch 'kwolf/tags/for-upstream' into staging 
(2017-05-04 13:44:32 +0100)

are available in the git repository at:

  git://git.aurel32.net/qemu.git tags/pull-tcg-mips-20170506

for you to fetch changes up to 2f5a5f5774d95baacf86c03aa8a77a2d0390f2b2:

  tcg/mips: fix field extraction opcode (2017-05-06 12:48:53 +0200)


Fix MIPS R2 hosts support



Aurelien Jarno (1):
  tcg/mips: fix field extraction opcode

 tcg/mips/tcg-target.inc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

-- 
2.11.0




[Qemu-devel] [PULL 1/1] tcg/mips: fix field extraction opcode

2017-05-06 Thread Aurelien Jarno
The "msb" argument should correspond to (len - 1).

Signed-off-by: Aurelien Jarno 
---
 tcg/mips/tcg-target.inc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 01ac7b2c81..2a7e1c7f5b 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -2093,11 +2093,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
  args[3] + args[4] - 1, args[3]);
 break;
 case INDEX_op_extract_i32:
-tcg_out_opc_bf(s, OPC_EXT, a0, a1, a2 + args[3] - 1, a2);
+tcg_out_opc_bf(s, OPC_EXT, a0, a1, args[3] - 1, a2);
 break;
 case INDEX_op_extract_i64:
 tcg_out_opc_bf64(s, OPC_DEXT, OPC_DEXTM, OPC_DEXTU, a0, a1,
- a2 + args[3] - 1, a2);
+ args[3] - 1, a2);
 break;
 
 case INDEX_op_brcond_i32:
-- 
2.11.0




Re: [Qemu-devel] [PATCH v5++ 1/3] tcg/mips: implement goto_ptr

2017-05-02 Thread Aurelien Jarno
On 2017-05-02 18:21, Richard Henderson wrote:
> On 04/30/2017 04:52 PM, Aurelien Jarno wrote:
> > +/* jmp to the given host address (could be epilogue) */
> > +tcg_out_opc_reg(s, OPC_JR, 0, a0, 0);
> > +tcg_out_nop(s);
> 
> Any particular reason not to do the zeroing in the delay slot...
> 
> > +s->code_gen_epilogue = s->code_ptr;
> > +tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_V0, TCG_REG_ZERO);
> 
> ... instead of here?

There is no particular reason in the current usage of goto_ptr. It's
just that in the future we might want to use code_gen_epilogue for
other reasons or use the tcg_out_opc_reg to do other things. It's
probably better to have a consistent behaviour across all TCG
targets.

That said if you prefer, I am find sending a v2 with the zeroing moved
to the delay slot.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH 01/14] target/sh4: split ctx->flags into ctx->tbflags and ctx->envflags

2017-05-02 Thread Aurelien Jarno
On 2017-05-02 09:16, Philippe Mathieu-Daudé wrote:
> Hi Aurelien,
> 
> On 05/01/2017 07:10 PM, Aurelien Jarno wrote:
> > There is a confusion (and not only in the SH4 target) between tb->flags,
> > env->flags and ctx->flags. To avoid it, split ctx->flags into
> > ctx->tbflags and ctx->envflags. ctx->tbflags stays unchanged during the
> > whole TB translation, while ctx->envflags evolves and is kept in sync
> > with env->flags using TCG instructions. ctx->envflags now only contains
> > the part that of env->flags that is contained in the TB state, i.e. the
> > DELAY_SLOT* flags.
> 
> I agree with your split, but I'd rather put the 2nd part of your commit
> description as comments in the struct DisasContext declaration, if you mind
> :)

Thanks for the review. I'll do that in the v2, although at some point it
should probably go to some higher level documentation, as it is often a
source of confusion.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



[Qemu-devel] [PATCH 07/14] target/sh4: only save flags state at the end of the TB

2017-05-01 Thread Aurelien Jarno
There is no need to save flags when entering and exiting the delay slot.
They can be saved only when reaching the end of the TB. If the TB is
interrupted before by an exception, they will be restored using
restore_state_to_opc.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 69 --
 1 file changed, 33 insertions(+), 36 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 213eb2a0bb..8c99e4fb21 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -212,6 +212,20 @@ static void gen_write_sr(TCGv src)
 tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
 }
 
+static inline void gen_save_cpu_state(DisasContext *ctx, bool save_pc)
+{
+if (save_pc) {
+tcg_gen_movi_i32(cpu_pc, ctx->pc);
+}
+if (ctx->delayed_pc != (uint32_t) -1) {
+tcg_gen_movi_i32(cpu_delayed_pc, ctx->delayed_pc);
+}
+if ((ctx->tbflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL))
+!= ctx->envflags) {
+tcg_gen_movi_i32(cpu_flags, ctx->envflags);
+}
+}
+
 static inline bool use_goto_tb(DisasContext *ctx, target_ulong dest)
 {
 if (unlikely(ctx->singlestep_enabled)) {
@@ -246,6 +260,7 @@ static void gen_jump(DisasContext * ctx)
/* Target is not statically known, it comes necessarily from a
   delayed jump as immediate jump are conditinal jumps */
tcg_gen_mov_i32(cpu_pc, cpu_delayed_pc);
+tcg_gen_discard_i32(cpu_delayed_pc);
if (ctx->singlestep_enabled)
 gen_helper_debug(cpu_env);
tcg_gen_exit_tb(0);
@@ -254,21 +269,12 @@ static void gen_jump(DisasContext * ctx)
 }
 }
 
-static inline void gen_branch_slot(uint32_t delayed_pc, int t)
-{
-tcg_gen_movi_i32(cpu_delayed_pc, delayed_pc);
-if (t) {
-tcg_gen_mov_i32(cpu_delayed_cond, cpu_sr_t);
-} else {
-tcg_gen_xori_i32(cpu_delayed_cond, cpu_sr_t, 1);
-}
-}
-
 /* Immediate conditional jump (bt or bf) */
 static void gen_conditional_jump(DisasContext * ctx,
 target_ulong ift, target_ulong ifnott)
 {
 TCGLabel *l1 = gen_new_label();
+gen_save_cpu_state(ctx, false);
 tcg_gen_brcondi_i32(TCG_COND_NE, cpu_sr_t, 0, l1);
 gen_goto_tb(ctx, 0, ifnott);
 gen_set_label(l1);
@@ -291,11 +297,6 @@ static void gen_delayed_conditional_jump(DisasContext * 
ctx)
 gen_jump(ctx);
 }
 
-static inline void gen_store_flags(uint32_t flags)
-{
-tcg_gen_movi_i32(cpu_flags, flags);
-}
-
 static inline void gen_load_fpr64(TCGv_i64 t, int reg)
 {
 tcg_gen_concat_i32_i64(t, cpu_fregs[reg + 1], cpu_fregs[reg]);
@@ -337,7 +338,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 #define CHECK_NOT_DELAY_SLOT \
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
-tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
+gen_save_cpu_state(ctx, true);   \
 gen_helper_raise_slot_illegal_instruction(cpu_env);  \
 ctx->bstate = BS_EXCP;   \
 return;  \
@@ -345,7 +346,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 #define CHECK_PRIVILEGED \
 if (IS_USER(ctx)) {  \
-tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
+gen_save_cpu_state(ctx, true);   \
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
 gen_helper_raise_slot_illegal_instruction(cpu_env);  \
 } else { \
@@ -357,7 +358,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 
 #define CHECK_FPU_ENABLED\
 if (ctx->tbflags & (1u << SR_FD)) {  \
-tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
+gen_save_cpu_state(ctx, true);   \
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
 gen_helper_raise_slot_fpu_disable(cpu_env);  \
 } else { \
@@ -501,14 +502,12 @@ static void _decode_opc(DisasContext * ctx)
 case 0xa000:   /* bra disp */
CHECK_NOT_DELAY_SLOT
ctx->delayed_pc = ctx->pc + 4 + B11_0s * 2;
-   tcg_gen_movi_i32(cpu_delayed_pc, ctx->delayed_pc);
 ctx->envflags |= DELAY_SLOT;
return;
 case 0xb000:   /* bsr disp */
CHECK_NOT_DELAY_SLOT
tcg_gen_movi_i32(cpu_pr, ctx->pc + 4);
ctx->delayed_pc = ctx->pc + 4 + B11_0s * 2;
-   tcg_gen_movi_i32(cpu_delayed_pc, ctx->delayed_pc);

[Qemu-devel] [PATCH 14/14] target/sh4: trap unaligned accesses

2017-05-01 Thread Aurelien Jarno
SH4 requires that memory accesses are naturally aligned, except for the
SH4-A movua.l instructions which can do unaligned loads.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.c   |  1 +
 target/sh4/cpu.h   |  4 
 target/sh4/op_helper.c | 19 +++
 target/sh4/translate.c |  6 --
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/target/sh4/cpu.c b/target/sh4/cpu.c
index 9a481c35dc..9da7e1ed38 100644
--- a/target/sh4/cpu.c
+++ b/target/sh4/cpu.c
@@ -301,6 +301,7 @@ static void superh_cpu_class_init(ObjectClass *oc, void 
*data)
 #ifdef CONFIG_USER_ONLY
 cc->handle_mmu_fault = superh_cpu_handle_mmu_fault;
 #else
+cc->do_unaligned_access = superh_cpu_do_unaligned_access;
 cc->get_phys_page_debug = superh_cpu_get_phys_page_debug;
 #endif
 cc->disas_set_info = superh_cpu_disas_set_info;
diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index faab3012f9..6c07c6b24b 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -24,6 +24,7 @@
 #include "cpu-qom.h"
 
 #define TARGET_LONG_BITS 32
+#define ALIGNED_ONLY
 
 /* CPU Subtypes */
 #define SH_CPU_SH7750  (1 << 0)
@@ -215,6 +216,9 @@ void superh_cpu_dump_state(CPUState *cpu, FILE *f,
 hwaddr superh_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr);
 int superh_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg);
 int superh_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
+void superh_cpu_do_unaligned_access(CPUState *cpu, vaddr addr,
+MMUAccessType access_type,
+int mmu_idx, uintptr_t retaddr);
 
 void sh4_translate_init(void);
 SuperHCPU *cpu_sh4_init(const char *cpu_model);
diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
index 684d3f3758..4abd05667c 100644
--- a/target/sh4/op_helper.c
+++ b/target/sh4/op_helper.c
@@ -24,6 +24,25 @@
 
 #ifndef CONFIG_USER_ONLY
 
+void superh_cpu_do_unaligned_access(CPUState *cs, vaddr addr,
+MMUAccessType access_type,
+int mmu_idx, uintptr_t retaddr)
+{
+if (retaddr) {
+cpu_restore_state(cs, retaddr);
+}
+switch (access_type) {
+case MMU_INST_FETCH:
+case MMU_DATA_LOAD:
+cs->exception_index = 0x0e0;
+break;
+case MMU_DATA_STORE:
+cs->exception_index = 0x100;
+break;
+}
+cpu_loop_exit(cs);
+}
+
 void tlb_fill(CPUState *cs, target_ulong addr, MMUAccessType access_type,
   int mmu_idx, uintptr_t retaddr)
 {
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index bc70166602..fe3f73b7c0 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1505,14 +1505,16 @@ static void _decode_opc(DisasContext * ctx)
 case 0x40a9:/* movua.l @Rm,R0 */
 /* Load non-boundary-aligned data */
 if (ctx->features & SH_FEATURE_SH4A) {
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx,
+MO_TEUL | MO_UNALN);
 return;
 }
 break;
 case 0x40e9:/* movua.l @Rm+,R0 */
 /* Load non-boundary-aligned data */
 if (ctx->features & SH_FEATURE_SH4A) {
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx,
+MO_TEUL | MO_UNALN);
 tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
 return;
 }
-- 
2.11.0




[Qemu-devel] [PATCH 04/14] target/sh4: move DELAY_SLOT_TRUE flag into a separate global

2017-05-01 Thread Aurelien Jarno
Instead of using one bit of the env flags to store the condition of the
next delay slot, use a separate global. It simplifies reading and
writing the flags variable and also removes some confusion between
ctx->envflags and env->flags.

Note that the global is first transfered to a temp in order to be
able to discard the global before the brcond.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.h   | 10 ++
 target/sh4/helper.c|  2 +-
 target/sh4/translate.c | 22 +-
 3 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index da8d15f1b9..faab3012f9 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -92,13 +92,6 @@
 
 #define DELAY_SLOT (1 << 0)
 #define DELAY_SLOT_CONDITIONAL (1 << 1)
-#define DELAY_SLOT_TRUE(1 << 2)
-/* The dynamic value of the DELAY_SLOT_TRUE flag determines whether the jump
- * after the delay slot should be taken or not. It is calculated from SR_T.
- *
- * It is unclear if it is permitted to modify the SR_T flag in a delay slot.
- * The use of DELAY_SLOT_TRUE flag makes us accept such SR_T modification.
- */
 
 typedef struct tlb_t {
 uint32_t vpn;  /* virtual page number */
@@ -148,7 +141,8 @@ typedef struct CPUSH4State {
 uint32_t sgr;  /* saved global register 15 */
 uint32_t dbr;  /* debug base register */
 uint32_t pc;   /* program counter */
-uint32_t delayed_pc;   /* target of delayed jump */
+uint32_t delayed_pc;/* target of delayed branch */
+uint32_t delayed_cond;  /* condition of delayed branch */
 uint32_t mach; /* multiply and accumulate high */
 uint32_t macl; /* multiply and accumulate low */
 uint32_t pr;   /* procedure register */
diff --git a/target/sh4/helper.c b/target/sh4/helper.c
index 71bb49a670..8f8ce81401 100644
--- a/target/sh4/helper.c
+++ b/target/sh4/helper.c
@@ -168,7 +168,7 @@ void superh_cpu_do_interrupt(CPUState *cs)
 /* Branch instruction should be executed again before delay slot. */
env->spc -= 2;
/* Clear flags for exception/interrupt routine. */
-   env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL | DELAY_SLOT_TRUE);
+env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
 }
 
 if (do_exp) {
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index d4cdf746dc..df3655cc64 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -72,7 +72,7 @@ static TCGv cpu_pr, cpu_fpscr, cpu_fpul, cpu_ldst;
 static TCGv cpu_fregs[32];
 
 /* internal register indexes */
-static TCGv cpu_flags, cpu_delayed_pc;
+static TCGv cpu_flags, cpu_delayed_pc, cpu_delayed_cond;
 
 #include "exec/gen-icount.h"
 
@@ -147,6 +147,10 @@ void sh4_translate_init(void)
 cpu_delayed_pc = tcg_global_mem_new_i32(cpu_env,
offsetof(CPUSH4State, delayed_pc),
"_delayed_pc_");
+cpu_delayed_cond = tcg_global_mem_new_i32(cpu_env,
+  offsetof(CPUSH4State,
+   delayed_cond),
+  "_delayed_cond_");
 cpu_ldst = tcg_global_mem_new_i32(cpu_env,
  offsetof(CPUSH4State, ldst), "_ldst_");
 
@@ -252,11 +256,12 @@ static void gen_jump(DisasContext * ctx)
 
 static inline void gen_branch_slot(uint32_t delayed_pc, int t)
 {
-TCGLabel *label = gen_new_label();
 tcg_gen_movi_i32(cpu_delayed_pc, delayed_pc);
-tcg_gen_brcondi_i32(t ? TCG_COND_EQ : TCG_COND_NE, cpu_sr_t, 0, label);
-tcg_gen_ori_i32(cpu_flags, cpu_flags, DELAY_SLOT_TRUE);
-gen_set_label(label);
+if (t) {
+tcg_gen_mov_i32(cpu_delayed_cond, cpu_sr_t);
+} else {
+tcg_gen_xori_i32(cpu_delayed_cond, cpu_sr_t, 1);
+}
 }
 
 /* Immediate conditional jump (bt or bf) */
@@ -278,18 +283,17 @@ static void gen_delayed_conditional_jump(DisasContext * 
ctx)
 
 l1 = gen_new_label();
 ds = tcg_temp_new();
-tcg_gen_andi_i32(ds, cpu_flags, DELAY_SLOT_TRUE);
+tcg_gen_mov_i32(ds, cpu_delayed_cond);
+tcg_gen_discard_i32(cpu_delayed_cond);
 tcg_gen_brcondi_i32(TCG_COND_NE, ds, 0, l1);
 gen_goto_tb(ctx, 1, ctx->pc + 2);
 gen_set_label(l1);
-tcg_gen_andi_i32(cpu_flags, cpu_flags, ~DELAY_SLOT_TRUE);
 gen_jump(ctx);
 }
 
 static inline void gen_store_flags(uint32_t flags)
 {
-tcg_gen_andi_i32(cpu_flags, cpu_flags, DELAY_SLOT_TRUE);
-tcg_gen_ori_i32(cpu_flags, cpu_flags, flags);
+tcg_gen_movi_i32(cpu_flags, flags);
 }
 
 static inline void gen_load_fpr64(TCGv_i64 t, int reg)
-- 
2.11.0




[Qemu-devel] [PATCH 13/14] target/sh4: movua.l is an SH4-A only instruction

2017-05-01 Thread Aurelien Jarno
At the same time change the comment describing the instruction the same
way than other instruction, so that the code is easier to read and search.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index a158b0e480..bc70166602 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1502,17 +1502,21 @@ static void _decode_opc(DisasContext * ctx)
 }
 ctx->has_movcal = 1;
return;
-case 0x40a9:
-   /* MOVUA.L @Rm,R0 (Rm) -> R0
-  Load non-boundary-aligned data */
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
-   return;
-case 0x40e9:
-   /* MOVUA.L @Rm+,R0   (Rm) -> R0, Rm + 4 -> Rm
-  Load non-boundary-aligned data */
-tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
-   tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
-   return;
+case 0x40a9:/* movua.l @Rm,R0 */
+/* Load non-boundary-aligned data */
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+return;
+}
+break;
+case 0x40e9:/* movua.l @Rm+,R0 */
+/* Load non-boundary-aligned data */
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_qemu_ld_i32(REG(0), REG(B11_8), ctx->memidx, MO_TEUL);
+tcg_gen_addi_i32(REG(B11_8), REG(B11_8), 4);
+return;
+}
+break;
 case 0x0029:   /* movt Rn */
 tcg_gen_mov_i32(REG(B11_8), cpu_sr_t);
return;
-- 
2.11.0




[Qemu-devel] [PATCH 06/14] target/sh4: fix BS_EXCP exit

2017-05-01 Thread Aurelien Jarno
In case of exception, there is no need to call tcg_gen_exit_tb as the
exception helper won't return.

Also fix a few cases where BS_BRANCH is called instead of BS_EXCP.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 3999da87c7..213eb2a0bb 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -339,7 +339,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
 tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
 gen_helper_raise_slot_illegal_instruction(cpu_env);  \
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -351,7 +351,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 } else { \
 gen_helper_raise_illegal_instruction(cpu_env);   \
 }\
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -363,7 +363,7 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 } else { \
 gen_helper_raise_fpu_disable(cpu_env);   \
 }\
-ctx->bstate = BS_BRANCH; \
+ctx->bstate = BS_EXCP;   \
 return;  \
 }
 
@@ -1289,7 +1289,7 @@ static void _decode_opc(DisasContext * ctx)
imm = tcg_const_i32(B7_0);
 gen_helper_trapa(cpu_env, imm);
tcg_temp_free(imm);
-   ctx->bstate = BS_BRANCH;
+ctx->bstate = BS_EXCP;
}
return;
 case 0xc800:   /* tst #imm,R0 */
@@ -1798,7 +1798,7 @@ static void _decode_opc(DisasContext * ctx)
 } else {
 gen_helper_raise_illegal_instruction(cpu_env);
 }
-ctx->bstate = BS_BRANCH;
+ctx->bstate = BS_EXCP;
 }
 
 static void decode_opc(DisasContext * ctx)
@@ -1867,7 +1867,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 /* We have hit a breakpoint - make sure PC is up-to-date */
 tcg_gen_movi_i32(cpu_pc, ctx.pc);
 gen_helper_debug(cpu_env);
-ctx.bstate = BS_BRANCH;
+ctx.bstate = BS_EXCP;
 /* The address covered by the breakpoint must be included in
[tb->pc, tb->pc + tb->size) in order to for it to be
properly cleared -- thus we increment the PC here so that
@@ -1911,9 +1911,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 gen_goto_tb(&ctx, 0, ctx.pc);
 break;
 case BS_EXCP:
-/* gen_op_interrupt_restart(); */
-tcg_gen_exit_tb(0);
-break;
+/* fall through */
 case BS_BRANCH:
 default:
 break;
-- 
2.11.0




[Qemu-devel] [PATCH 11/14] target/sh4: generate fences for SH4

2017-05-01 Thread Aurelien Jarno
synco is a SH4-A only instruction.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 7de459c9a5..c226be9718 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1570,10 +1570,11 @@ static void _decode_opc(DisasContext * ctx)
else
break;
 case 0x00ab:   /* synco */
-   if (ctx->features & SH_FEATURE_SH4A)
-   return;
-   else
-   break;
+if (ctx->features & SH_FEATURE_SH4A) {
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
+return;
+}
+break;
 case 0x4024:   /* rotcl Rn */
{
TCGv tmp = tcg_temp_new();
-- 
2.11.0




[Qemu-devel] [PATCH 08/14] target/sh4: fold ctx->bstate = BS_BRANCH into gen_conditional_jump

2017-05-01 Thread Aurelien Jarno
Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 8c99e4fb21..a8399239d1 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -279,6 +279,7 @@ static void gen_conditional_jump(DisasContext * ctx,
 gen_goto_tb(ctx, 0, ifnott);
 gen_set_label(l1);
 gen_goto_tb(ctx, 1, ift);
+ctx->bstate = BS_BRANCH;
 }
 
 /* Delayed conditional jump (bt or bf) */
@@ -1158,9 +1159,7 @@ static void _decode_opc(DisasContext * ctx)
return;
 case 0x8b00:   /* bf label */
CHECK_NOT_DELAY_SLOT
-   gen_conditional_jump(ctx, ctx->pc + 2,
-ctx->pc + 4 + B7_0s * 2);
-   ctx->bstate = BS_BRANCH;
+gen_conditional_jump(ctx, ctx->pc + 2, ctx->pc + 4 + B7_0s * 2);
return;
 case 0x8f00:   /* bf/s label */
CHECK_NOT_DELAY_SLOT
@@ -1170,9 +1169,7 @@ static void _decode_opc(DisasContext * ctx)
return;
 case 0x8900:   /* bt label */
CHECK_NOT_DELAY_SLOT
-   gen_conditional_jump(ctx, ctx->pc + 4 + B7_0s * 2,
-ctx->pc + 2);
-   ctx->bstate = BS_BRANCH;
+gen_conditional_jump(ctx, ctx->pc + 4 + B7_0s * 2, ctx->pc + 2);
return;
 case 0x8d00:   /* bt/s label */
CHECK_NOT_DELAY_SLOT
-- 
2.11.0




[Qemu-devel] [PATCH 03/14] target/sh4: do not include DELAY_SLOT_TRUE in the TB state

2017-05-01 Thread Aurelien Jarno
DELAY_SLOT_TRUE is used as a dynamic condition for the branch after the
delay slot instruction. It is not used in code generation, so there is
no need to including in the TB state.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index 9445cc779f..da8d15f1b9 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -382,8 +382,7 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, 
target_ulong *pc,
 {
 *pc = env->pc;
 *cs_base = 0;
-*flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-| DELAY_SLOT_TRUE))/* Bits  0- 2 */
+*flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) /* Bits 0-1 
*/
 | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
 | (env->sr & ((1u << SR_MD) | (1u << SR_RB)))  /* Bits 29-30 */
 | (env->sr & (1u << SR_FD))/* Bit 15 */
-- 
2.11.0




[Qemu-devel] [PATCH 12/14] target/sh4: implement tas.b using atomic helper

2017-05-01 Thread Aurelien Jarno
We only emulate UP SH4, however as the tas.b instruction is used in the GNU
libc, this improve linux-user emulation.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 19 +++
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index c226be9718..a158b0e480 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1635,19 +1635,14 @@ static void _decode_opc(DisasContext * ctx)
tcg_gen_shri_i32(REG(B11_8), REG(B11_8), 16);
return;
 case 0x401b:   /* tas.b @Rn */
-   {
-   TCGv addr, val;
-   addr = tcg_temp_local_new();
-   tcg_gen_mov_i32(addr, REG(B11_8));
-   val = tcg_temp_local_new();
-tcg_gen_qemu_ld_i32(val, addr, ctx->memidx, MO_UB);
+{
+TCGv val = tcg_const_i32(0x80);
+tcg_gen_atomic_fetch_or_i32(val, REG(B11_8), val,
+ctx->memidx, MO_UB);
 tcg_gen_setcondi_i32(TCG_COND_EQ, cpu_sr_t, val, 0);
-   tcg_gen_ori_i32(val, val, 0x80);
-tcg_gen_qemu_st_i32(val, addr, ctx->memidx, MO_UB);
-   tcg_temp_free(val);
-   tcg_temp_free(addr);
-   }
-   return;
+tcg_temp_free(val);
+}
+return;
 case 0xf00d: /* fsts FPUL,FRn - FPSCR: Nothing */
CHECK_FPU_ENABLED
tcg_gen_mov_i32(cpu_fregs[FREG(B11_8)], cpu_fpul);
-- 
2.11.0




[Qemu-devel] [PATCH 05/14] target/sh4: fix BS_STOP exit

2017-05-01 Thread Aurelien Jarno
When stopping the translation because the state has changed, goto_tb
should not be used as it might link TB with different flags.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index df3655cc64..3999da87c7 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1901,8 +1901,9 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 } else {
switch (ctx.bstate) {
 case BS_STOP:
-/* gen_op_interrupt_restart(); */
-/* fall through */
+tcg_gen_movi_i32(cpu_pc, ctx.pc);
+tcg_gen_exit_tb(0);
+break;
 case BS_NONE:
 if (ctx.envflags) {
 gen_store_flags(ctx.envflags);
-- 
2.11.0




[Qemu-devel] [PATCH 02/14] target/sh4: get rid of DELAY_SLOT_CLEARME

2017-05-01 Thread Aurelien Jarno
Now that ctx->flags has been split, it becomes clear that
DELAY_SLOT_CLEARME has not impact on the code generation: in both case
ctx->envflags is cleared, either by clearing all the flags, or by
setting it to 0. This is left-over from pre-TCG era.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/cpu.h   |  3 +--
 target/sh4/helper.c|  2 --
 target/sh4/translate.c | 17 +
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/target/sh4/cpu.h b/target/sh4/cpu.h
index cad8989f7e..9445cc779f 100644
--- a/target/sh4/cpu.h
+++ b/target/sh4/cpu.h
@@ -93,7 +93,6 @@
 #define DELAY_SLOT (1 << 0)
 #define DELAY_SLOT_CONDITIONAL (1 << 1)
 #define DELAY_SLOT_TRUE(1 << 2)
-#define DELAY_SLOT_CLEARME (1 << 3)
 /* The dynamic value of the DELAY_SLOT_TRUE flag determines whether the jump
  * after the delay slot should be taken or not. It is calculated from SR_T.
  *
@@ -384,7 +383,7 @@ static inline void cpu_get_tb_cpu_state(CPUSH4State *env, 
target_ulong *pc,
 *pc = env->pc;
 *cs_base = 0;
 *flags = (env->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL
-| DELAY_SLOT_TRUE | DELAY_SLOT_CLEARME))   /* Bits  0- 3 */
+| DELAY_SLOT_TRUE))/* Bits  0- 2 */
 | (env->fpscr & (FPSCR_FR | FPSCR_SZ | FPSCR_PR))  /* Bits 19-21 */
 | (env->sr & ((1u << SR_MD) | (1u << SR_RB)))  /* Bits 29-30 */
 | (env->sr & (1u << SR_FD))/* Bit 15 */
diff --git a/target/sh4/helper.c b/target/sh4/helper.c
index 036c5ca56c..71bb49a670 100644
--- a/target/sh4/helper.c
+++ b/target/sh4/helper.c
@@ -170,8 +170,6 @@ void superh_cpu_do_interrupt(CPUState *cs)
/* Clear flags for exception/interrupt routine. */
env->flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL | DELAY_SLOT_TRUE);
 }
-if (env->flags & DELAY_SLOT_CLEARME)
-env->flags = 0;
 
 if (do_exp) {
 env->expevt = cs->exception_index;
diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 0b16ff33ea..d4cdf746dc 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -1804,15 +1804,9 @@ static void decode_opc(DisasContext * ctx)
 _decode_opc(ctx);
 
 if (old_flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) {
-if (ctx->envflags & DELAY_SLOT_CLEARME) {
-gen_store_flags(0);
-} else {
-   /* go out of the delay slot */
-uint32_t new_flags = ctx->envflags;
-   new_flags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
-   gen_store_flags(new_flags);
-}
-ctx->envflags = 0;
+/* go out of the delay slot */
+ctx->envflags &= ~(DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
+gen_store_flags(ctx->envflags);
 ctx->bstate = BS_BRANCH;
 if (old_flags & DELAY_SLOT_CONDITIONAL) {
gen_delayed_conditional_jump(ctx);
@@ -1840,8 +1834,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 pc_start = tb->pc;
 ctx.pc = pc_start;
 ctx.tbflags = (uint32_t)tb->flags;
-ctx.envflags = tb->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL |
-DELAY_SLOT_CLEARME);
+ctx.envflags = tb->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL);
 ctx.bstate = BS_NONE;
 ctx.memidx = (ctx.tbflags & (1u << SR_MD)) == 0 ? 1 : 0;
 /* We don't know if the delayed pc came from a dynamic or static branch,
@@ -1908,7 +1901,7 @@ void gen_intermediate_code(CPUSH4State * env, struct 
TranslationBlock *tb)
 /* fall through */
 case BS_NONE:
 if (ctx.envflags) {
-gen_store_flags(ctx.envflags | DELAY_SLOT_CLEARME);
+gen_store_flags(ctx.envflags);
}
 gen_goto_tb(&ctx, 0, ctx.pc);
 break;
-- 
2.11.0




[Qemu-devel] [PATCH 00/14] target/sh4: misc fixes, cleanup and optimizations

2017-05-01 Thread Aurelien Jarno
This patch series try to improve the SH4 target by using the (more or
less) recently introduced TCG features. It also fixes some issues spot
when writting the patches (linking of TB with different flags, SH4-A
specific instructions allowed on SH4) and correctly trap unaligned
accesses.

Aurelien Jarno (14):
  target/sh4: split ctx->flags into ctx->tbflags and ctx->envflags
  target/sh4: get rid of DELAY_SLOT_CLEARME
  target/sh4: do not include DELAY_SLOT_TRUE in the TB state
  target/sh4: move DELAY_SLOT_TRUE flag into a separate global
  target/sh4: fix BS_STOP exit
  target/sh4: fix BS_EXCP exit
  target/sh4: only save flags state at the end of the TB
  target/sh4: fold ctx->bstate = BS_BRANCH into gen_conditional_jump
  target/sh4: optimize gen_store_fpr64
  target/sh4: optimize gen_write_sr using extract op
  target/sh4: generate fences for SH4
  target/sh4: implement tas.b using atomic helper
  target/sh4: movua.l is an SH4-A only instruction
  target/sh4: trap unaligned accesses

 target/sh4/cpu.c   |   1 +
 target/sh4/cpu.h   |  18 ++-
 target/sh4/helper.c|   4 +-
 target/sh4/op_helper.c |  19 +++
 target/sh4/translate.c | 323 -
 5 files changed, 183 insertions(+), 182 deletions(-)

-- 
2.11.0




[Qemu-devel] [PATCH 09/14] target/sh4: optimize gen_store_fpr64

2017-05-01 Thread Aurelien Jarno
Isuing extrh and avoiding intermediate temps.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index a8399239d1..23636eeb4c 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -305,13 +305,8 @@ static inline void gen_load_fpr64(TCGv_i64 t, int reg)
 
 static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 {
-TCGv_i32 tmp = tcg_temp_new_i32();
-tcg_gen_extrl_i64_i32(tmp, t);
-tcg_gen_mov_i32(cpu_fregs[reg + 1], tmp);
-tcg_gen_shri_i64(t, t, 32);
-tcg_gen_extrl_i64_i32(tmp, t);
-tcg_gen_mov_i32(cpu_fregs[reg], tmp);
-tcg_temp_free_i32(tmp);
+tcg_gen_extrl_i64_i32(cpu_fregs[reg + 1], t);
+tcg_gen_extrh_i64_i32(cpu_fregs[reg], t);
 }
 
 #define B3_0 (ctx->opcode & 0xf)
-- 
2.11.0




[Qemu-devel] [PATCH 01/14] target/sh4: split ctx->flags into ctx->tbflags and ctx->envflags

2017-05-01 Thread Aurelien Jarno
There is a confusion (and not only in the SH4 target) between tb->flags,
env->flags and ctx->flags. To avoid it, split ctx->flags into
ctx->tbflags and ctx->envflags. ctx->tbflags stays unchanged during the
whole TB translation, while ctx->envflags evolves and is kept in sync
with env->flags using TCG instructions. ctx->envflags now only contains
the part that of env->flags that is contained in the TB state, i.e. the
DELAY_SLOT* flags.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 161 +
 1 file changed, 82 insertions(+), 79 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index c89a14733f..0b16ff33ea 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -37,7 +37,8 @@ typedef struct DisasContext {
 struct TranslationBlock *tb;
 target_ulong pc;
 uint16_t opcode;
-uint32_t flags;
+uint32_t tbflags;
+uint32_t envflags;
 int bstate;
 int memidx;
 uint32_t delayed_pc;
@@ -49,7 +50,7 @@ typedef struct DisasContext {
 #if defined(CONFIG_USER_ONLY)
 #define IS_USER(ctx) 1
 #else
-#define IS_USER(ctx) (!(ctx->flags & (1u << SR_MD)))
+#define IS_USER(ctx) (!(ctx->tbflags & (1u << SR_MD)))
 #endif
 
 enum {
@@ -317,51 +318,50 @@ static inline void gen_store_fpr64 (TCGv_i64 t, int reg)
 #define B11_8 ((ctx->opcode >> 8) & 0xf)
 #define B15_12 ((ctx->opcode >> 12) & 0xf)
 
-#define REG(x) ((x) < 8 && (ctx->flags & (1u << SR_MD))\
-&& (ctx->flags & (1u << SR_RB))\
+#define REG(x) ((x) < 8 && (ctx->tbflags & (1u << SR_MD))\
+&& (ctx->tbflags & (1u << SR_RB))\
 ? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
 
-#define ALTREG(x) ((x) < 8 && (!(ctx->flags & (1u << SR_MD))\
-   || !(ctx->flags & (1u << SR_RB)))\
+#define ALTREG(x) ((x) < 8 && (!(ctx->tbflags & (1u << SR_MD))\
+   || !(ctx->tbflags & (1u << SR_RB)))\
? (cpu_gregs[x + 16]) : (cpu_gregs[x]))
 
-#define FREG(x) (ctx->flags & FPSCR_FR ? (x) ^ 0x10 : (x))
+#define FREG(x) (ctx->tbflags & FPSCR_FR ? (x) ^ 0x10 : (x))
 #define XHACK(x) x) & 1 ) << 4) | ((x) & 0xe))
-#define XREG(x) (ctx->flags & FPSCR_FR ? XHACK(x) ^ 0x10 : XHACK(x))
+#define XREG(x) (ctx->tbflags & FPSCR_FR ? XHACK(x) ^ 0x10 : XHACK(x))
 #define DREG(x) FREG(x) /* Assumes lsb of (x) is always 0 */
 
 #define CHECK_NOT_DELAY_SLOT \
-  if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) \
-  {   \
-  tcg_gen_movi_i32(cpu_pc, ctx->pc);  \
-  gen_helper_raise_slot_illegal_instruction(cpu_env); \
-  ctx->bstate = BS_BRANCH;\
-  return; \
-  }
-
-#define CHECK_PRIVILEGED\
-  if (IS_USER(ctx)) {   \
-  tcg_gen_movi_i32(cpu_pc, ctx->pc);\
-  if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
-  gen_helper_raise_slot_illegal_instruction(cpu_env);   \
-  } else {  \
-  gen_helper_raise_illegal_instruction(cpu_env);\
-  } \
-  ctx->bstate = BS_BRANCH;  \
-  return;   \
-  }
-
-#define CHECK_FPU_ENABLED   \
-  if (ctx->flags & (1u << SR_FD)) { \
-  tcg_gen_movi_i32(cpu_pc, ctx->pc);\
-  if (ctx->flags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
-  gen_helper_raise_slot_fpu_disable(cpu_env);   \
-  } else {  \
-  gen_helper_raise_fpu_disable(cpu_env);\
-  } \
-  ctx->bstate = BS_BRANCH;  \
-  return;   \
-  }
+if (ctx->envflags & (DELAY_SLOT | DELAY_SLOT_CONDITIONAL)) { \
+tcg_gen_movi_i32(cpu_pc, ctx->pc);   \
+gen_helper_raise_slot_illegal_instruction(cpu_env);  \
+ctx->bstate = BS_BRANCH; \
+return;  \
+}
+
+#define CHECK_PRIV

[Qemu-devel] [PATCH 10/14] target/sh4: optimize gen_write_sr using extract op

2017-05-01 Thread Aurelien Jarno
This doesn't change the generated code on x86, but optimizes it on most
RISC architectures and makes the code simpler to read.

Signed-off-by: Aurelien Jarno 
---
 target/sh4/translate.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/target/sh4/translate.c b/target/sh4/translate.c
index 23636eeb4c..7de459c9a5 100644
--- a/target/sh4/translate.c
+++ b/target/sh4/translate.c
@@ -204,12 +204,9 @@ static void gen_write_sr(TCGv src)
 {
 tcg_gen_andi_i32(cpu_sr, src,
  ~((1u << SR_Q) | (1u << SR_M) | (1u << SR_T)));
-tcg_gen_shri_i32(cpu_sr_q, src, SR_Q);
-tcg_gen_andi_i32(cpu_sr_q, cpu_sr_q, 1);
-tcg_gen_shri_i32(cpu_sr_m, src, SR_M);
-tcg_gen_andi_i32(cpu_sr_m, cpu_sr_m, 1);
-tcg_gen_shri_i32(cpu_sr_t, src, SR_T);
-tcg_gen_andi_i32(cpu_sr_t, cpu_sr_t, 1);
+tcg_gen_extract_i32(cpu_sr_q, src, SR_Q, 1);
+tcg_gen_extract_i32(cpu_sr_m, src, SR_M, 1);
+tcg_gen_extract_i32(cpu_sr_t, src, SR_T, 1);
 }
 
 static inline void gen_save_cpu_state(DisasContext *ctx, bool save_pc)
-- 
2.11.0




[Qemu-devel] [PATCH v5++ 1/3] tcg/mips: implement goto_ptr

2017-04-30 Thread Aurelien Jarno
Signed-off-by: Aurelien Jarno 
---
 tcg/mips/tcg-target.h |  2 +-
 tcg/mips/tcg-target.inc.c | 13 +
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index e3240cfba7..d75cb63ed3 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -130,7 +130,7 @@ extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_muluh_i321
 #define TCG_TARGET_HAS_mulsh_i321
 #define TCG_TARGET_HAS_bswap32_i32  1
-#define TCG_TARGET_HAS_goto_ptr 0
+#define TCG_TARGET_HAS_goto_ptr 1
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_add2_i32 0
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 01ac7b2c81..9e5b9f42da 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -1747,6 +1747,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 tcg_out_nop(s);
 s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
 break;
+case INDEX_op_goto_ptr:
+/* jmp to the given host address (could be epilogue) */
+tcg_out_opc_reg(s, OPC_JR, 0, a0, 0);
+tcg_out_nop(s);
+break;
 case INDEX_op_br:
 tcg_out_brcond(s, TCG_COND_EQ, TCG_REG_ZERO, TCG_REG_ZERO,
arg_label(a0));
@@ -2160,6 +2165,7 @@ static const TCGTargetOpDef mips_op_defs[] = {
 { INDEX_op_exit_tb, { } },
 { INDEX_op_goto_tb, { } },
 { INDEX_op_br, { } },
+{ INDEX_op_goto_ptr, { "r" } },
 
 { INDEX_op_ld8u_i32, { "r", "r" } },
 { INDEX_op_ld8s_i32, { "r", "r" } },
@@ -2451,6 +2457,13 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 /* delay slot */
 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
 
+/*
+ * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
+ * and fall through to the rest of the epilogue.
+ */
+s->code_gen_epilogue = s->code_ptr;
+tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_V0, TCG_REG_ZERO);
+
 /* TB epilogue */
 tb_ret_addr = s->code_ptr;
 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
-- 
2.11.0




[Qemu-devel] [PATCH v5++] TCG cross-tb optimizations

2017-04-30 Thread Aurelien Jarno
Please find patches to support cross-tb optimizations on MIPS hosts
and to implement cross-tb optimizations for MIPS target.

Aurelien Jarno (3):
  tcg/mips: implement goto_ptr
  target/mips: optimize cross-page direct jumps in softmmu
  target/mips: optimize indirect branches

 target/mips/translate.c   |  4 ++--
 tcg/mips/tcg-target.h |  2 +-
 tcg/mips/tcg-target.inc.c | 13 +
 3 files changed, 16 insertions(+), 3 deletions(-)

-- 
2.11.0




[Qemu-devel] [PATCH v5++ 2/3] target/mips: optimize cross-page direct jumps in softmmu

2017-04-30 Thread Aurelien Jarno
Cc: Yongbok Kim 
Signed-off-by: Aurelien Jarno 
---
 target/mips/translate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/mips/translate.c b/target/mips/translate.c
index 3022f349cb..1a7ac07c67 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -4233,7 +4233,7 @@ static inline void gen_goto_tb(DisasContext *ctx, int n, 
target_ulong dest)
 save_cpu_state(ctx, 0);
 gen_helper_raise_exception_debug(cpu_env);
 }
-tcg_gen_exit_tb(0);
+tcg_gen_lookup_and_goto_ptr(cpu_PC);
 }
 }
 
-- 
2.11.0




[Qemu-devel] [PATCH v5++ 3/3] target/mips: optimize indirect branches

2017-04-30 Thread Aurelien Jarno
Cc: Yongbok Kim 
Signed-off-by: Aurelien Jarno 
---
 target/mips/translate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target/mips/translate.c b/target/mips/translate.c
index 1a7ac07c67..559f8fed89 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -10725,7 +10725,7 @@ static void gen_branch(DisasContext *ctx, int 
insn_bytes)
 save_cpu_state(ctx, 0);
 gen_helper_raise_exception_debug(cpu_env);
 }
-tcg_gen_exit_tb(0);
+tcg_gen_lookup_and_goto_ptr(cpu_PC);
 break;
 default:
 fprintf(stderr, "unknown branch 0x%x\n", proc_hflags);
-- 
2.11.0




[Qemu-devel] [PATCH] tcg/mips: fix field extraction opcode

2017-04-30 Thread Aurelien Jarno
The "msb" argument should correspond to (len - 1).

Signed-off-by: Aurelien Jarno 
---
 tcg/mips/tcg-target.inc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 01ac7b2c81..2a7e1c7f5b 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -2093,11 +2093,11 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
  args[3] + args[4] - 1, args[3]);
 break;
 case INDEX_op_extract_i32:
-tcg_out_opc_bf(s, OPC_EXT, a0, a1, a2 + args[3] - 1, a2);
+tcg_out_opc_bf(s, OPC_EXT, a0, a1, args[3] - 1, a2);
 break;
 case INDEX_op_extract_i64:
 tcg_out_opc_bf64(s, OPC_DEXT, OPC_DEXTM, OPC_DEXTU, a0, a1,
- a2 + args[3] - 1, a2);
+ args[3] - 1, a2);
 break;
 
 case INDEX_op_brcond_i32:
-- 
2.11.0




Re: [Qemu-devel] [PATCH v3 06/10] target/arm: optimize indirect branches

2017-04-27 Thread Aurelien Jarno
On 2017-04-26 23:20, Emilio G. Cota wrote:
> On Wed, Apr 26, 2017 at 09:54:07 +0200, Richard Henderson wrote:
> > On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> > >+static bool gen_jr;...
> > > case DISAS_JUMP:
> > >+if (gen_jr) {
> > 
> > Why the variable?  Why not just try the goto_ptr for any DISAS_JUMP?
> 
> We have code that assumes DISAS_JUMP implies "go to exec loop", e.g.:
> case 6: /* isb */
> /* We need to break the TB after this insn to execute
>  * self-modifying code correctly and also to take
>  * any pending interrupts immediately.
>  */
> gen_lookup_tb(s);
> where gen_lookup_tb does:
> 
> /* Force a TB lookup after an instruction that changes the CPU state.  */
> static inline void gen_lookup_tb(DisasContext *s)
> {
> tcg_gen_movi_i32(cpu_R[15], s->pc & ~1);
> s->is_jmp = DISAS_JUMP;
> }

Changing the CPU state should already be taken into account in
lookup_tb_ptr through the cpu_get_tb_cpu_state function. Also interrupts
should be checked at the beginning of each TB.

> Also, the gen_exception_* functions set DISAS_JUMP. I suspect we want to go
> to the exec loop with those as well.

Technically all the code after the one generated by gen_exception_*
function is not executed, including the tcg_gen_exit_tb(0). Therefore
generating code using tcg_gen_exit_tb or or tcg_gen_lookup_and_goto_ptr
should no change the behaviour.

> Testing shows that I'm onto something; if I remove the variable,
> and note that I make sure DISAS_UPDATE is not falling through, I get
> easily reproducible (~1 out of 5) freezes and other instability
> (e.g. RCU lockup warnings) when booting + shutting down debian jessie
> in the guest.

I agree that always calling lookup_tb_ptr might be suboptimal in case we
know for sure that the lookup will fail or that there will be an exit
request at the beginning of the next TB. That said I found strange that
it causes issues, and I wonder if it just expose a bug somewhere.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v4 07/11] target/arm: optimize indirect branches

2017-04-27 Thread Aurelien Jarno
On 2017-04-27 11:42, Richard Henderson wrote:
> On 04/27/2017 11:36 AM, Aurelien Jarno wrote:
> > On 2017-04-26 23:29, Emilio G. Cota wrote:
> > > Speed up indirect branches by jumping to the target if it is valid.
> > > 
> > > Softmmu measurements (see later commit for user-mode results):
> > > 
> > > Note: baseline (i.e. speedup == 1x) is QEMU v2.9.0.
> > > 
> > > - Impact on Boot time
> > > 
> > > | setup  | ARM debian jessie boot+shutdown time | stddev |
> > > |+--+|
> > > | v2.9.0 | 8.84 |   0.07 |
> > > | +cross | 8.85 |   0.03 |
> > > | +jr| 8.83 |   0.06 |
> > > 
> > > -NBench, arm-softmmu (debian jessie guest). 
> > > Host: Intel i7-4790K @ 4.00GHz
> > > 
> > >1.3x 
> > > +-+-+-+
> > > | 
> > > |
> > > |   cross 
> > >     |
> > >   1.25x 
> > > +cross+jr..#++#.+-+
> > > | 
> > >  #  #   |
> > > | +++#  # 
> > >  #  #   |
> > > |  +++  # 
> > >  #  #   |
> > >1.2x 
> > > +-+...*..*..#..#..#.+-+
> > > |    #*  *  # 
> > >  #  #   |
> > > |  *  *  #*  *  # 
> > >  #  # #  #  |
> > >   1.15x 
> > > +-+*..*..#*..*..#..#..#.#..#+-+
> > > |  *  *  #*  *  # 
> > >  #  # #  #  |
> > > |  *  *  #    *  *  # 
> > >  #  # #  #  |
> > > |  *  *  #  #  #  *  *  # 
> > >  #  # #  #  |
> > >1.1x 
> > > +-+*..*..#..#..#..*..*..#..#..#.#..#.#..#...+-+
> > > |  *  *  #  #  #  *  *  # 
> > >  #  # #  # #  # |
> > > |  *  *  #  #  #  *  *  # 
> > >  #  # #  # #  # |
> > >   1.05x 
> > > +-+....*..*..#..#..#..*..*..#..#..#.#..#..+++*..#...+-+
> > > |*  #  *  *  #  #  #  *  *  #  
> > > *  # #  #   +++ |###  *   *  # |
> > > |*+++*  #  *  *  #  #  #  *  *  #  
> > > *+++*  #    #  *###  *  *  #  *   *  # |
> > > | *###  +++  *   *  #  *  *  #  *  #  *  *  #  *  
> > >  *  #  *  *  #  * | *++#  *  *  #  *   *  # |
> > >  1x 
> > > +-++-+*+++*-+#++++#++*+-+*++#+-*++*++#-+*+++*-+#++*++*++#++*+-+*++#+-*++*++#-+*+++*-+#++*++*++#++*+-+*++#+-++-+
> > > | *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *  
> > >  *  #  *  *  #  *   *  #  *  *  #  *   *  # |
> > > | *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *  
> > >  *  #  *  *  #  *   *  #  *  *  #  *   *  # |
> > >   0.95x 
> > > +-+---*###--###--*###--###--*###--###--*###--###--*###--###--*###---+-+
> > > ASSIGNMENT BITFIELD   FOURFP EMULATION   HUFFMAN   LU 
> > > D

Re: [Qemu-devel] [PATCH v4 00/11] TCG optimizations for 2.10

2017-04-27 Thread Aurelien Jarno
On 2017-04-26 23:29, Emilio G. Cota wrote:
> v3 for context: 
> https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04795.html
> 
> Changes from v3:
> 
> - Added reviewed-by tags.
> 
> - Added a couple of suggested-by tags that I forgot to add in v3
>   regarding lookup_and_goto_ptr and i386's implementation of goto_ptr.
> 
> - lookup_tb_ptr
>   + Dropped the unnecessary exit_request check, as suggested by Paolo and
> Richard.
>   + Only get the CPU state if we get a tb from the jmp_cache, as suggested
> by Richard.
>   + Added tb_htable_lookup if we miss in tb_jmp_cache, as suggested by
> Richard. This requires an extra patch to export tb_htable_lookup.
> 
> - goto_ptr: add IMPL(has_goto_ptr), as pointed out by Richard.
> 
> - target/arm: added a comment about gen_jr. See the v3 thread for why
>   it is needed.
> 
> - target/i386: use TCGV_UNUSED instead of (ab)using NULL on a TCGv,
>   as suggested by Richard. Also took his suggestion to simplify
>   the addition of jr + cs_base.
>   To minimize churn I renamed gen_eob_worker to do_gen_eob_worker,
>   which takes the newly added argument.
> 
> I have *not* re-run all experiments, because it takes several hours and
> performance hasn't changed much from v3, as can be seen in these two charts:
> * spec06int user-mode, test input, v2.9.0 baseline: http://imgur.com/ME2eMq1
> * spec06int softmmu, test input, v3 baseline: http://imgur.com/Clolu9Z
> The perf differences are mostly due to adding the htable check. Note that
> its impact is small, since tb_jmp_cache has a %hit rate in the high 90's.
> 
> You can inspect/fetch the changes at:
>   https://github.com/cota/qemu/tree/tcg-opt-v4

Thanks for this patchset. I have tested it with an arm target, but also
with a mips target with and additional patch. I haven't done any precise
benchmark yet. The patch is trivial and only changes 3 lines, but I am
not 100% sure I have done things correctly (see my comment on patch 7).

Tested-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v4 07/11] target/arm: optimize indirect branches

2017-04-27 Thread Aurelien Jarno
TCGv_i32 
> var)
>   */
>  tcg_gen_andi_i32(var, var, s->thumb ? ~1 : ~3);
>  s->is_jmp = DISAS_JUMP;
> +gen_jr = true;
>  }
>  tcg_gen_mov_i32(cpu_R[reg], var);
>  tcg_temp_free_i32(var);
> @@ -893,6 +895,7 @@ static inline void gen_bx_im(DisasContext *s, uint32_t 
> addr)
>  tcg_temp_free_i32(tmp);
>  }
>  tcg_gen_movi_i32(cpu_R[15], addr & ~1);
> +gen_jr = true;
>  }
>  
>  /* Set PC and Thumb state from var.  var is marked as dead.  */
> @@ -902,6 +905,7 @@ static inline void gen_bx(DisasContext *s, TCGv_i32 var)
>  tcg_gen_andi_i32(cpu_R[15], var, ~1);
>  tcg_gen_andi_i32(var, var, 1);
>  store_cpu_field(var, thumb);
> +gen_jr = true;
>  }
>  
>  /* Variant of store_reg which uses branch&exchange logic when storing
> @@ -12034,6 +12038,20 @@ void gen_intermediate_code(CPUARMState *env, 
> TranslationBlock *tb)
>  gen_set_pc_im(dc, dc->pc);
>  /* fall through */
>  case DISAS_JUMP:
> +/*
> + * gen_jr is not set on every DISAS_JUMP because for some of 
> those
> + * we do want to exit to the exec loop.
> + */

What would be the reason for that? IIUC the lookup_tb_ptr helper calls
cpu_get_tb_cpu_state to get the new TB flags go lookup from the current
CPU state. It means it is able for example to handle a transition from
user to privileged mode. Also the exit_req flag or its new equivalent
is tested at the beginning of each TB in case there is an interruption.

It therefore seems to be that we can replace all calls to
tcg_gen_exit_tb by tcg_gen_lookup_and_goto_ptr with the program counter
in argument.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v2] s390x/misc_helper.c: wrap s390_virtio_hypercall in BQL

2017-04-23 Thread Aurelien Jarno
On 2017-04-23 19:38, Philippe Mathieu-Daudé wrote:
> Hi Aurelien!
> 
> Why don't lock inside s390_virtio_hypercall() directly round the diag500
> dispatch call?

s390_virtio_hypercall is shared between TCG and KVM. For KVM the lock is
already done before calling s390_virtio_hypercall in kvm_arch_handle_exit.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



[Qemu-devel] [PATCH] target-s390x: Mask the SIGP order_code to 8bit.

2017-04-23 Thread Aurelien Jarno
From: Philipp Kern 

According to "CPU Signaling and Response", "Signal-Processor Orders",
the order field is bit position 56-63. Without this, the Linux
guest kernel is sometimes unable to stop emulation and enters
an infinite loop of "XXX unknown sigp: 0x0005".

Signed-off-by: Philipp Kern 
Signed-off-by: Aurelien Jarno 
---
 target/s390x/misc_helper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

This patch has been sent by Philipp Kern a lot of time ago, and it seems
has been lost. I am resending it, as it is still useful.

diff --git a/target/s390x/misc_helper.c b/target/s390x/misc_helper.c
index 3bf09ea222..4946b56ab3 100644
--- a/target/s390x/misc_helper.c
+++ b/target/s390x/misc_helper.c
@@ -534,7 +534,7 @@ uint32_t HELPER(sigp)(CPUS390XState *env, uint64_t 
order_code, uint32_t r1,
 /* Remember: Use "R1 or R1 + 1, whichever is the odd-numbered register"
as parameter (input). Status (output) is always R1. */
 
-switch (order_code) {
+switch (order_code & 0xff) {
 case SIGP_SET_ARCH:
 /* switch arch */
 break;
-- 
2.11.0




[Qemu-devel] [PATCH v2] s390x/misc_helper.c: wrap s390_virtio_hypercall in BQL

2017-04-23 Thread Aurelien Jarno
s390_virtio_hypercall can trigger IO events and interrupts, most notably
when using virtio-ccw devices.

Reviewed-by: Alexander Graf 
Signed-off-by: Aurelien Jarno 
---
 target/s390x/misc_helper.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/target/s390x/misc_helper.c b/target/s390x/misc_helper.c
index 4946b56ab3..aec737d707 100644
--- a/target/s390x/misc_helper.c
+++ b/target/s390x/misc_helper.c
@@ -307,7 +307,9 @@ void HELPER(diag)(CPUS390XState *env, uint32_t r1, uint32_t 
r3, uint32_t num)
 switch (num) {
 case 0x500:
 /* KVM hypercall */
+qemu_mutex_lock_iothread();
 r = s390_virtio_hypercall(env);
+qemu_mutex_unlock_iothread();
 break;
 case 0x44:
 /* yield */
-- 
2.11.0




Re: [Qemu-devel] [PATCH] s390x/misc_helper.c: wrap s390_virtio_hypercall in BQL

2017-04-23 Thread Aurelien Jarno
On 2017-04-23 19:19, Alexander Graf wrote:
> 
> 
> > Am 23.04.2017 um 18:08 schrieb Aurelien Jarno :
> > 
> > s390_virtio_hypercall can trigger IO events and interrupts, most notably
> > when using virtio-ccw devices.
> > 
> > Signed-off-by: Aurelien Jarno 
> > ---
> > roms/qemu-palcode  | 2 +-
> > roms/seabios   | 2 +-
> > target/s390x/misc_helper.c | 2 ++
> > 3 files changed, 4 insertions(+), 2 deletions(-)
> > 
> > diff --git a/roms/qemu-palcode b/roms/qemu-palcode
> > index f3c7e44c70..c87a92639b 16
> > --- a/roms/qemu-palcode
> > +++ b/roms/qemu-palcode
> > @@ -1 +1 @@
> > -Subproject commit f3c7e44c70254975df2a00af39701eafbac4d471
> > +Subproject commit c87a92639b28ac42bc8f6c67443543b405dc479b
> > diff --git a/roms/seabios b/roms/seabios
> > index 5f4c7b13cd..e2fc41e24e 16
> > --- a/roms/seabios
> > +++ b/roms/seabios
> > @@ -1 +1 @@
> > -Subproject commit 5f4c7b13cdf9c450eb55645f4362ea58fa61b79b
> > +Subproject commit e2fc41e24ee0ada60fc511d60b15a41b294538be
> 
> I guess you didn't mean to send those hunks?

Indeed that's a mistake sorry, i'll resend a new version.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



[Qemu-devel] [PATCH] s390x/misc_helper.c: wrap s390_virtio_hypercall in BQL

2017-04-23 Thread Aurelien Jarno
s390_virtio_hypercall can trigger IO events and interrupts, most notably
when using virtio-ccw devices.

Signed-off-by: Aurelien Jarno 
---
 roms/qemu-palcode  | 2 +-
 roms/seabios   | 2 +-
 target/s390x/misc_helper.c | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/roms/qemu-palcode b/roms/qemu-palcode
index f3c7e44c70..c87a92639b 16
--- a/roms/qemu-palcode
+++ b/roms/qemu-palcode
@@ -1 +1 @@
-Subproject commit f3c7e44c70254975df2a00af39701eafbac4d471
+Subproject commit c87a92639b28ac42bc8f6c67443543b405dc479b
diff --git a/roms/seabios b/roms/seabios
index 5f4c7b13cd..e2fc41e24e 16
--- a/roms/seabios
+++ b/roms/seabios
@@ -1 +1 @@
-Subproject commit 5f4c7b13cdf9c450eb55645f4362ea58fa61b79b
+Subproject commit e2fc41e24ee0ada60fc511d60b15a41b294538be
diff --git a/target/s390x/misc_helper.c b/target/s390x/misc_helper.c
index 4946b56ab3..aec737d707 100644
--- a/target/s390x/misc_helper.c
+++ b/target/s390x/misc_helper.c
@@ -307,7 +307,9 @@ void HELPER(diag)(CPUS390XState *env, uint32_t r1, uint32_t 
r3, uint32_t num)
 switch (num) {
 case 0x500:
 /* KVM hypercall */
+qemu_mutex_lock_iothread();
 r = s390_virtio_hypercall(env);
+qemu_mutex_unlock_iothread();
 break;
 case 0x44:
 /* yield */
-- 
2.11.0




Re: [Qemu-devel] [PATCH v7 00/13] Improvements for SM501 display controller emulation

2017-04-22 Thread Aurelien Jarno
On 2017-04-21 17:18, BALATON Zoltan wrote:
> v7: Define default values for some variables to avoid an (invalid)
> warning from gcc 6 or 7 as suggested by Aurelien Jarno.

Thanks a lot for this new version, I confirm it now builds fine with GCC
6 or 7.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v6 00/13] Improvements for SM501 display controller emulation

2017-04-21 Thread Aurelien Jarno
On 2017-04-21 12:31, BALATON Zoltan wrote:
> This version just adds some more explanation to patch #7 and
> Reviewed/Acked by tags for the last patches.
> 
> BALATON Zoltan (13):
>   sm501: Fixed code style and a few typos in comments
>   sm501: Use defined constants instead of literal values where available
>   sm501: Add missing arbitration control register
>   sm501: QOMify
>   sm501: Get rid of base address in draw_hwc_line
>   sm501: Add emulation of chip connected via PCI
>   sm501: Fix device endianness
>   sm501: Fix hardware cursor
>   sm501: Misc clean ups
>   sm501: Add support for panel layer
>   sm501: Add some more missing registers
>   sm501: Add vmstate descriptor
>   ppc: Add SM501 device in ppc softmmu targets default configs
> 
>  default-configs/ppc-softmmu.mak|1 +
>  default-configs/ppc64-softmmu.mak  |1 +
>  default-configs/ppcemb-softmmu.mak |1 +
>  hw/display/sm501.c | 1784 
> ++--
>  hw/display/sm501_template.h|   90 +-
>  hw/sh4/r2d.c   |   11 +-
>  include/hw/devices.h   |5 -
>  include/hw/pci/pci_ids.h   |3 +
>  8 files changed, 1150 insertions(+), 746 deletions(-)

I have tested this patchset on an SH4 machine and I haven't found any
regression.

Tested-by: Aurelien Jarno 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v6 08/13] sm501: Fix hardware cursor

2017-04-21 Thread Aurelien Jarno
On 2017-04-21 12:31, BALATON Zoltan wrote:
> Rework HWC handling to simplify it and fix cursor not updating on
> screen as needed. Previously cursor was not updated because checking
> for changes in a line overrode the update flag set for the cursor but
> fixing this is not enough because the cursor should also be updated if
> its shape or location changes. Introduce hwc_invalidate() function to
> handle that similar to other display controller models.
> 
> Signed-off-by: BALATON Zoltan 
> Reviewed-by: Peter Maydell 
> ---
> 
> v3: simplify return expression in get_bpp
> 
>  hw/display/sm501.c  | 169 
> +---
>  hw/display/sm501_template.h |  25 +++
>  2 files changed, 107 insertions(+), 87 deletions(-)
> 
> diff --git a/hw/display/sm501.c b/hw/display/sm501.c
> index a628ef1..dc806a3 100644
> --- a/hw/display/sm501.c
> +++ b/hw/display/sm501.c
> @@ -1339,14 +1362,16 @@ static void sm501_draw_crt(SM501State *s)
>  /* draw each line according to conditions */
>  memory_region_sync_dirty_bitmap(&s->local_mem_region);
>  for (y = 0; y < height; y++) {
> -int update_hwc = draw_hwc_line ? within_hwc_y_range(s, y, 1) : 0;
> -int update = full_update || update_hwc;
> +int update, update_hwc;
>  ram_addr_t page0 = offset;
>  ram_addr_t page1 = offset + width * src_bpp - 1;
>  
> +/* check if hardware cursor is enabled and we're within its range */
> +update_hwc = draw_hwc_line && c_y <= y && y < c_y + SM501_HWC_HEIGHT;
> +update = full_update || update_hwc;
>  /* check dirty flags for each line */
> -update = memory_region_get_dirty(&s->local_mem_region, page0,
> - page1 - page0, DIRTY_MEMORY_VGA);
> +update |= memory_region_get_dirty(&s->local_mem_region, page0,
> +  page1 - page0, DIRTY_MEMORY_VGA);
>  
>  /* draw line and change status */
>  if (update) {
> @@ -1358,7 +1383,7 @@ static void sm501_draw_crt(SM501State *s)
>  
>  /* draw hardware cursor */
>  if (update_hwc) {
> -draw_hwc_line(s, 1, hwc_palette, y - get_hwc_y(s, 1), d, 
> width);
> +draw_hwc_line(d, hwc_src, width, hwc_palette, c_x, y - c_y);
>  }
>  
>  if (y_start < 0) {

The above change causes a warning with at least GCC 6 and GCC 7, which
causes a build failure when using -Werror:

|   CC  sh4-softmmu/hw/display/sm501.o
| /home/aurel32/git/qemu/hw/display/sm501.c: In function ‘sm501_update_display’:
| /home/aurel32/git/qemu/hw/display/sm501.c:1504:17: warning: ‘hwc_src’ may be 
used uninitialized in this function [-Wmaybe-uninitialized]
|  draw_hwc_line(d, hwc_src, width, hwc_palette, c_x, y - c_y);
|  ^~~
| /home/aurel32/git/qemu/hw/display/sm501.c:1488:59: warning: ‘c_y’ may be used 
uninitialized in this function [-Wmaybe-uninitialized]
|  update_hwc = draw_hwc_line && c_y <= y && y < c_y + SM501_HWC_HEIGHT;
|^
| /home/aurel32/git/qemu/hw/display/sm501.c:1504:17: warning: ‘c_x’ may be used 
uninitialized in this function [-Wmaybe-uninitialized]
|  draw_hwc_line(d, hwc_src, width, hwc_palette, c_x, y - c_y);
|  ^~~

GCC fails to notice that hwc_src, c_x and c_y are always defined if
draw_hwc_line is not NULL. This is obviously not a bug in the code, but
it might be a good idea to put a workaround for example by defining
default values for those variables.

Aurelien
-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] Obsolete QEMU host environments

2017-03-15 Thread Aurelien Jarno
On 2017-03-15 07:09, Richard Henderson wrote:
> On 03/15/2017 03:07 AM, Peter Maydell wrote:
> > On 14 March 2017 at 17:54, Thomas Huth  wrote:
> > > Our ia64 host backend in QEMU (tcg/ia64) is still marked as maintained
> > > ... so it's maybe not as dead as you think? Or should we rather get rid
> > > of that soon, too?
> > 
> > I don't actually mind whether we keep tcg/ia64 or drop it.
> > But if we keep it then we must have a machine we can test
> > it on -- that means one I have access to (and which we
> > can otherwise use for central testing), not just one the
> > maintainer might happen to have.
> > 
> > Also, that MAINTAINERS entry was added in 2011, so it's
> > probably about as out of date as the code :-)
> 
> Red Hat's last ia64 machine died a few years ago, so I haven't
> been able to test it for quite some time.
> 
> I don't know if Aurelien can still test on ia64 or not.

I have the same issue, the last QEMU test I have been able to do on an
ia64 machine dates from last summer, so something like 9 months ago. I
don't think anybody has a lot of interest in ia64 anymore, so I guess
it's time to just remove the ia64 host backend.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v5 00/10] tcg mips64 and mips r6 improvements

2016-12-02 Thread Aurelien Jarno
On 2016-12-01 21:51, Jin Guojie wrote:
> Changes in v5:
>   * Update against master(v2.8.0-rc2)
>   * Fix a bug: 64-bit big-endian guests hang on mips64 little-endian
>   hosts, and vice versa. This bug was first introduced in v2 patch,
>   due to obvious misuse of ret/arg registers in tcg_out_bswap64().
> 
> tcg_out_opc_reg(s, OPC_DSBH, ret, 0, arg);
>   - tcg_out_opc_reg(s, OPC_DSHD, ret, 0, arg);
>   + tcg_out_opc_reg(s, OPC_DSHD, ret, 0, ret);
> 
>   * Fix a style problem: checkpatch.pl forbids 'extern' to be used in .c.
> 
>   ERROR: externs should be avoided in .c files
>   #28: FILE: tcg/mips/tcg-target.inc.c:39:
>   +extern int link_error(void);
> 
>   Simply comment the type identifier to pass the check.
> 
>   * Tested successfully on following machines:
>   
> | HOST| qemu-system | Debian ISO  |
> |-|
> | mips 32 le  |i386 |i386 |
> | mips 32 le  |x86_64   |i386 |
> | mips 32 le  |x86_64   |amd64|
> | mips 64 le  |i386 |i386 |
> | mips 64 le  |x86_64   |i386 |
> | mips 64 le  |x86_64   |amd64|
> | mips 64 le  |  mips 64 be |  mips 64 be |
> |-|
> | mips 32 be  |i386 | i386|
> | mips 32 be  |x86_64   | i386|
> | mips 32 be  |x86_64   | amd64   |
> | mips 64 be  |i386 | i386|
> | mips 64 be  |x86_64   | i386|
> | mips 64 be  |x86_64   | amd64   |
> | mips n32 be |386  | i386|
> | mips n32 be |x86_64   | i386|
> | mips n32 be |x86_64   | amd64   |
> 
> (No plan to test MIPS R6 in this patch.)
> 
>   Summary of changes from v4:
> 
>   | tcg-mips: Support 64-bit opcodes   | Fix tcg_out_bswap64() |
>   | tcg-mips: Adjust qemu_ld/st for mips64 | Fix a style problem   |

Thanks for the new version. I just gave it a try on a mips64 le guest,
and I confirm this fixes booting mips64 be and ppc64 guests. I'll try to
do more tests on mips be / mips le / mips64 le over the week-end.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v3 00/11] tcg mips64 and mips r6 improvements

2016-11-30 Thread Aurelien Jarno
On 2016-12-01 02:33, Jin Guojie wrote:
> Thanks for Aurelien's  first test results.
> I submitted v4 patch as a feedback to Richard's and your review comments on 
> v3. 
> Since v4 contains functional code change, should we do this test again on v4?
> Really sorry for this burden.

Oops sorry, I answered the wrong mail. The results *are* against *v4*
not v3.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v4 10/10] tcg-mips: Adjust qemu_ld/st for mips64

2016-11-30 Thread Aurelien Jarno
On 2016-11-30 09:42, Richard Henderson wrote:
> On 11/30/2016 09:22 AM, Aurelien Jarno wrote:
> > On 2016-11-29 14:07, Jin Guojie wrote:
> >> @@ -1312,7 +1340,12 @@ static void tcg_out_qemu_ld_slow_path(TCGContext 
> >> *s, TCGLabelQemuLdst *l)
> >>  reloc_pc16(s->code_ptr, l->raddr);
> >>  tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
> >>  /* delay slot */
> >> -tcg_out_mov(s, TCG_TYPE_REG, v0, TCG_REG_V0);
> >> +if (TCG_TARGET_REG_BITS == 64 && l->type == TCG_TYPE_I32) {
> >> +/* we always sign-extend 32-bit loads */
> >> +tcg_out_opc_sa(s, OPC_SLL, v0, TCG_REG_V0, 0);
> >> +} else {
> >> +tcg_out_opc_reg(s, OPC_OR, v0, TCG_REG_V0, TCG_REG_ZERO);
> >> +}
> > 
> > While thoses are equivalent, I think it would be clearer to keep the
> > original version for the else case (ie tcg_out_mov) instead of replacing
> > it by a OR instruction.
> 
> Did we exclude v0 from the registers that can be used?
> Otherwise we do need to ensure that some insn gets placed in the delay slot.

Hmm, that's a good point. Also note that the original code didn't check
about that, so it was probably not working in some cases. I guess we can
therefore leave the code like that for now.

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v4 10/10] tcg-mips: Adjust qemu_ld/st for mips64

2016-11-30 Thread Aurelien Jarno
On 2016-11-29 14:07, Jin Guojie wrote:
> @@ -1312,7 +1340,12 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, 
> TCGLabelQemuLdst *l)
>  reloc_pc16(s->code_ptr, l->raddr);
>  tcg_out_opc_br(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO);
>  /* delay slot */
> -tcg_out_mov(s, TCG_TYPE_REG, v0, TCG_REG_V0);
> +if (TCG_TARGET_REG_BITS == 64 && l->type == TCG_TYPE_I32) {
> +/* we always sign-extend 32-bit loads */
> +tcg_out_opc_sa(s, OPC_SLL, v0, TCG_REG_V0, 0);
> +} else {
> +tcg_out_opc_reg(s, OPC_OR, v0, TCG_REG_V0, TCG_REG_ZERO);
> +}

While thoses are equivalent, I think it would be clearer to keep the
original version for the else case (ie tcg_out_mov) instead of replacing
it by a OR instruction.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] [PATCH v3 00/11] tcg mips64 and mips r6 improvements

2016-11-30 Thread Aurelien Jarno
On 2016-11-25 11:31, Jin Guojie wrote:
> Changes since v2:
>   * Update against master(v2.8.0-rc1)
>   * Tested on Loongson as mips32r2(el) and mips64r2(el) hosts.
> Loongson only implements little-endian mips32/mips64 ISA.
>   * Fully work for 32-bit and 64-bit guests.
> Fix two bugs:segmentation fault on mips64el with 32-bit guests,
>   blocking when emulating i386 kernel on mips64el.
>   * Fix some minor style problems.
>   * PATCH v2 12~16 are not examined due to the lack of R6 machine. 
> 
> To be tested:
>   * big-endian mips32 and mips64 hosts.
> I have tried running qemu-system-mips on an X86. The speed is awful. 
> The compilation of qemu did not complete over a night until I gave up.
> A better way is needed to do this test.
>   * MIPS R6.

Thanks for this new patch series, there are clearly a lot of
improvements. I haven't been able to test it fully yet, however, here
are my first tests results.

I have successfully tested the following guests on a mips32
big-endian host:
  - Debian amd64 with qemu-system-x86_64
  - Debian i386 with qemu-system-i386
  - Debian i386 with qemu-system-x86_64
It means that the regression I have spotted with the original series is
now gone.

I have successfully tested the following guests on a mips64
little-endian host:
 - Debian amd64 with qemu-system-x86_64
 - Debian armhf wit qemu-system-arm
 - Debian i386 with qemu-system-i386
 - Debian i386 with qemu-system-x86_64
 - Debian mips with qemu-system-mips
 - Debian mips with qemu-system-mips64
 - Debian mipsel with qemu-system-mipsel
 - Debian mipsel with qemu-system-mips64el
 - Debian mipsel/64-bit kernel with qemu-system-mips64el
 - Debian powerpc with qemu-system-ppc
However it seems that 64-bit big-endian guests are not working
correctly. It happens with either qemu-system-mips64el and
qemu-system-ppc64. The later hangs in the SLOF firmware, so it's
probably the easiest way to debug the issue. It would be nice to
get that fixed, that said as it's not a regression, I don't think
we should block merging the mips64 support on that.

I now plan to do more testing on mips32 be hosts and also test mips32
le hosts. Unfortunately I don't have a way to test mips R6 and mips64 be
hosts.

Aurelien 

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net



Re: [Qemu-devel] qemu-system-sh4 vs qemu-system-arm/i386 default behavior

2016-11-30 Thread Aurelien Jarno
On 2016-11-30 08:33, Thomas Huth wrote:
> On 30.11.2016 02:01, Tom Rini wrote:
> > Hey all,
> > 
> > I'm trying to make use of the r2d platform for U-Boot testing via QEMU.
> > After applying a series[1] I can use the kernel.org sh4 toolchain to get
> > a u-boot.bin that runs, mostly.  I say mostly as first of all I have to
> > pass "-monitor null -serial null -serial stdio -nographic" to
> > qemu-system-sh4 and in that order for me to get output from U-Boot on
> > the prompt.  On other platforms such as arm and vexpress or i386 and the
> > 'pc' machine I do not need to do this.  Does anyone have any idea why
> > this might be and where to start poking in the code to fix this?

The reason is that u-boot and the linux kernel do not have the same way
to number the serial port than the physical hardware. Therefore u-boot
and the Linux kernel use the second physical serial port .The question is
whether we should number our ports from the software (or part of the
sofrware) or hardware point of view.

> The "-serial" parameter is related to the serial_hds[] array in the
> code, so you could search for that one.
> 
> The following line in hw/sh4/r2d.c looks somewhat suspicious:
> 
> sm501_init(address_space_mem, 0x1000, SM501_VRAM_SIZE,
>irq[SM501], serial_hds[2]);
> 
> Why is this machine always using serial_hds[2] and not a lower index?
> ... Maybe the maintainer of the board (Magnus) knows the answer here...

The third serial port is provided by the graphic chipset. The first two
serial ports are provided by the SH7750 CPU, see in hw/sh4/sh7750.c.

Aurelien

-- 
Aurelien Jarno  GPG: 4096R/1DDD8C9B
aurel...@aurel32.net http://www.aurel32.net


signature.asc
Description: PGP signature


<    1   2   3   4   5   6   7   8   9   10   >