Re: [Qemu-devel] Re: [PATCH] tcg, tci: Add TCG and interpreter for bytecode (virtual machine)

2009-10-26 Thread Stuart Brady
On Sat, Oct 24, 2009 at 11:23:43AM +0800, TeLeMan wrote:
> On Sat, Oct 24, 2009 at 02:58, Stefan Weil <w...@mail.berlios.de> wrote:
> > Is patch 4 (call handling) needed, or is it an optimization?
> > If it is needed, the tcg disassembler has to be extended as well.
>
> In fact tci has no stack and robber registers and doesn't need
> simulate the CPU work. I am trying to remove tcg_reg_alloc() in
> tcg_reg_alloc_op() & tcg_reg_alloc_call() and access the temporary
> variables directly in tci.

'Doesn't need' doesn't necessarily mean 'is better without', though.
Perhaps it's best for TCI to reflect the behaviour of other TCG targets
where possible?  (You can then compare the code that is generated with
different numbers of registers, and different constraints, etc.)

Cheers,
-- 
Stuart Brady




Re: [Qemu-devel] Re: [PATCH] tcg, tci: Add TCG and interpreter for bytecode (virtual machine)

2009-10-23 Thread Stefan Weil
TeLeMan schrieb:
> Tested i386-softmmu only. Now tci can run windows xp sp2 and its speed
> is about 6 times slower than jit.
> --
> SUN OF A BEACH

Great. Many thanks for the fixes, enhancements and for the testing, too.

Is patch 4 (call handling) needed, or is it an optimization?
If it is needed, the tcg disassembler has to be extended as well.

And did patch 5 (inline) speed up the code? I had expected
that static functions don't need inline, because the compiler
can optimize them anyway.

Regards,
Stefan





Re: [Qemu-devel] Re: [PATCH] tcg, tci: Add TCG and interpreter for bytecode (virtual machine)

2009-10-23 Thread TeLeMan
On Sat, Oct 24, 2009 at 02:58, Stefan Weil <w...@mail.berlios.de> wrote:
> Is patch 4 (call handling) needed, or is it an optimization?
> If it is needed, the tcg disassembler has to be extended as well.
In fact tci has no stack and robber registers and doesn't need
simulate the CPU work. I am trying to remove tcg_reg_alloc() in
tcg_reg_alloc_op() & tcg_reg_alloc_call() and access the temporary
variables directly in tci.

> And did patch 5 (inline) speed up the code? I had expected
> that static functions don't need inline, because the compiler
> can optimize them anyway.
You are right, patch 5 is not needed.




[Qemu-devel] Re: [PATCH] tcg, tci: Add TCG and interpreter for bytecode (virtual machine)

2009-10-22 Thread TeLeMan
Tested i386-softmmu only. Now tci can run windows xp sp2 and its speed
is about 6 times slower than jit.
--
SUN OF A BEACH
Subject: [PATCH 1/5] tci: fix op_sar_iXX and op_ext16s_iXX

---
 tcg/tci.c |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index e467b3a..81c415c 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -206,7 +206,7 @@ static uint16_t tci_read_r16(uint8_t **tb_ptr)
 }
 
 /* Read indexed register (16 bit signed) from bytecode. */
-static uint16_t tci_read_r16s(uint8_t **tb_ptr)
+static int16_t tci_read_r16s(uint8_t **tb_ptr)
 {
 uint16_t value = tci_read_reg16s(**tb_ptr);
 *tb_ptr += 1;
@@ -549,7 +549,7 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
 t0 = *tb_ptr++;
 t1 = tci_read_ri32(tb_ptr);
 t2 = tci_read_ri32(tb_ptr);
-tci_write_reg32(t0, (t1 >> t2) | (t1 & (1UL << 31)));
+tci_write_reg32(t0, ((int32_t)t1 >> t2));
 break;
 #ifdef TCG_TARGET_HAS_rot_i32
 case INDEX_op_rotl_i32:
@@ -794,7 +794,7 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
 t0 = *tb_ptr++;
 t1 = tci_read_ri64(tb_ptr);
 t2 = tci_read_ri64(tb_ptr);
-tci_write_reg64(t0, (t1 >> t2) | (t1 & (1ULL << 63)));
+tci_write_reg64(t0, ((int64_t)t1 >> t2));
 break;
 #ifdef TCG_TARGET_HAS_rot_i64
 case INDEX_op_rotl_i64:
-- 
1.6.3.msysgit.0

Subject: [PATCH 2/5] tci: add bswapXX_i32,div_i32 and rot_i32

---
 tcg/bytecode/tcg-target.c |   24 +++-
 tcg/tci.c |   40 +++-
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/tcg/bytecode/tcg-target.c b/tcg/bytecode/tcg-target.c
index 2bd12b8..aae570f 100644
--- a/tcg/bytecode/tcg-target.c
+++ b/tcg/bytecode/tcg-target.c
@@ -722,6 +722,10 @@ static void tcg_out_op(TCGContext *s, int opc, const 
TCGArg *args,
 case INDEX_op_shl_i32:
 case INDEX_op_shr_i32:
 case INDEX_op_sar_i32:
+#ifdef TCG_TARGET_HAS_rot_i32
+case INDEX_op_rotl_i32:
+case INDEX_op_rotr_i32:
+#endif
 tcg_out_op_t(s, opc);
 tcg_out_r(s, args[0]);
 tcg_out_ri32(s, const_args[1], args[1]);
@@ -816,7 +820,10 @@ static void tcg_out_op(TCGContext *s, int opc, const 
TCGArg *args,
 case INDEX_op_divu_i32:
 case INDEX_op_rem_i32:
 case INDEX_op_remu_i32:
-TODO();
+tcg_out_op_t(s, opc);
+tcg_out_r(s, args[0]);
+tcg_out_ri32(s, const_args[1], args[1]);
+tcg_out_ri32(s, const_args[2], args[2]);
 break;
 #else
 case INDEX_op_div2_i32:
@@ -1002,6 +1009,21 @@ static void tcg_out_op(TCGContext *s, int opc, const 
TCGArg *args,
 break;
 #endif
 #endif /* TCG_TARGET_REG_BITS == 64 */
+#if defined(TCG_TARGET_HAS_bswap32_i32)
+case INDEX_op_bswap32_i32:
+tcg_out_op_t(s, opc);
+tcg_out_r(s, args[0]);
+tcg_out_r(s, args[1]);
+break;
+#endif
+#if defined(TCG_TARGET_HAS_bswap16_i32)
+case INDEX_op_bswap16_i32:
+tcg_dump_ops(s, stderr);
+tcg_out_op_t(s, opc);
+tcg_out_r(s, args[0]);
+tcg_out_r(s, args[1]);
+break;
+#endif
 case INDEX_op_end:
 TODO();
 break;
diff --git a/tcg/tci.c b/tcg/tci.c
index 81c415c..8bb78e3 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -503,11 +503,29 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
 break;
 #ifdef TCG_TARGET_HAS_div_i32
 case INDEX_op_div_i32:
+t0 = *tb_ptr++;
+t1 = tci_read_ri32(tb_ptr);
+t2 = tci_read_ri32(tb_ptr);
+tci_write_reg32(t0, (int32_t)t1 / (int32_t)t2);
+break;
 case INDEX_op_divu_i32:
+t0 = *tb_ptr++;
+t1 = tci_read_ri32(tb_ptr);
+t2 = tci_read_ri32(tb_ptr);
+tci_write_reg32(t0, t1 / t2);
+break;
 case INDEX_op_rem_i32:
+t0 = *tb_ptr++;
+t1 = tci_read_ri32(tb_ptr);
+t2 = tci_read_ri32(tb_ptr);
+tci_write_reg32(t0, (int32_t)t1 % (int32_t)t2);
+break;
 case INDEX_op_remu_i32:
-TODO();
-break;
+t0 = *tb_ptr++;
+t1 = tci_read_ri32(tb_ptr);
+t2 = tci_read_ri32(tb_ptr);
+tci_write_reg32(t0, t1 % t2);
+break;
 #else
 case INDEX_op_div2_i32:
 case INDEX_op_divu2_i32:
@@ -553,8 +571,16 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
 break;
 #ifdef TCG_TARGET_HAS_rot_i32
 case INDEX_op_rotl_i32:
+t0 = *tb_ptr++;
+t1 = tci_read_ri32(tb_ptr);
+t2 = tci_read_ri32(tb_ptr);
+tci_write_reg32(t0, (t1<<t2)|(t1>>(32-t2)));
+break;
 case INDEX_op_rotr_i32:
-TODO();
+t0 = *tb_ptr++;
+t1 = tci_read_ri32(tb_ptr);
+t2 = tci_read_ri32(tb_ptr);
+