Tested with i386-softmmu only. TCI can now run Windows XP SP2; its speed
is about 6 times slower than the JIT.
--
SUN OF A BEACH
Subject: [PATCH 1/5] tci: fix op_sar_iXX and op_ext16s_iXX

---
 tcg/tci.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index e467b3a..81c415c 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -206,7 +206,7 @@ static uint16_t tci_read_r16(uint8_t **tb_ptr)
 }
 
 /* Read indexed register (16 bit signed) from bytecode. */
-static uint16_t tci_read_r16s(uint8_t **tb_ptr)
+static int16_t tci_read_r16s(uint8_t **tb_ptr)
 {
     uint16_t value = tci_read_reg16s(**tb_ptr);
     *tb_ptr += 1;
@@ -549,7 +549,7 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
             t0 = *tb_ptr++;
             t1 = tci_read_ri32(&tb_ptr);
             t2 = tci_read_ri32(&tb_ptr);
-            tci_write_reg32(t0, (t1 >> t2) | (t1 & (1UL << 31)));
+            tci_write_reg32(t0, ((int32_t)t1 >> t2));
             break;
 #ifdef TCG_TARGET_HAS_rot_i32
         case INDEX_op_rotl_i32:
@@ -794,7 +794,7 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
             t0 = *tb_ptr++;
             t1 = tci_read_ri64(&tb_ptr);
             t2 = tci_read_ri64(&tb_ptr);
-            tci_write_reg64(t0, (t1 >> t2) | (t1 & (1ULL << 63)));
+            tci_write_reg64(t0, ((int64_t)t1 >> t2));
             break;
 #ifdef TCG_TARGET_HAS_rot_i64
         case INDEX_op_rotl_i64:
-- 
1.6.3.msysgit.0

Subject: [PATCH 2/5] tci: add bswapXX_i32, div_i32 and rot_i32

---
 tcg/bytecode/tcg-target.c |   24 +++++++++++++++++++++++-
 tcg/tci.c                 |   40 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/tcg/bytecode/tcg-target.c b/tcg/bytecode/tcg-target.c
index 2bd12b8..aae570f 100644
--- a/tcg/bytecode/tcg-target.c
+++ b/tcg/bytecode/tcg-target.c
@@ -722,6 +722,10 @@ static void tcg_out_op(TCGContext *s, int opc, const 
TCGArg *args,
     case INDEX_op_shl_i32:
     case INDEX_op_shr_i32:
     case INDEX_op_sar_i32:
+#ifdef TCG_TARGET_HAS_rot_i32
+    case INDEX_op_rotl_i32:
+    case INDEX_op_rotr_i32:
+#endif
         tcg_out_op_t(s, opc);
         tcg_out_r(s, args[0]);
         tcg_out_ri32(s, const_args[1], args[1]);
@@ -816,7 +820,10 @@ static void tcg_out_op(TCGContext *s, int opc, const 
TCGArg *args,
     case INDEX_op_divu_i32:
     case INDEX_op_rem_i32:
     case INDEX_op_remu_i32:
-        TODO();
+        tcg_out_op_t(s, opc);
+        tcg_out_r(s, args[0]);
+        tcg_out_ri32(s, const_args[1], args[1]);
+        tcg_out_ri32(s, const_args[2], args[2]);
         break;
 #else
     case INDEX_op_div2_i32:
@@ -1002,6 +1009,21 @@ static void tcg_out_op(TCGContext *s, int opc, const 
TCGArg *args,
         break;
 #endif
 #endif /* TCG_TARGET_REG_BITS == 64 */
+#if defined(TCG_TARGET_HAS_bswap32_i32)
+    case INDEX_op_bswap32_i32:
+        tcg_out_op_t(s, opc);
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        break;
+#endif
+#if defined(TCG_TARGET_HAS_bswap16_i32)
+    case INDEX_op_bswap16_i32:
+        tcg_dump_ops(s, stderr);
+        tcg_out_op_t(s, opc);
+        tcg_out_r(s, args[0]);
+        tcg_out_r(s, args[1]);
+        break;
+#endif
     case INDEX_op_end:
         TODO();
         break;
diff --git a/tcg/tci.c b/tcg/tci.c
index 81c415c..8bb78e3 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -503,11 +503,29 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
             break;
 #ifdef TCG_TARGET_HAS_div_i32
         case INDEX_op_div_i32:
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri32(&tb_ptr);
+            t2 = tci_read_ri32(&tb_ptr);
+            tci_write_reg32(t0, (int32_t)t1 / (int32_t)t2);
+            break;
         case INDEX_op_divu_i32:
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri32(&tb_ptr);
+            t2 = tci_read_ri32(&tb_ptr);
+            tci_write_reg32(t0, t1 / t2);
+            break;
         case INDEX_op_rem_i32:
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri32(&tb_ptr);
+            t2 = tci_read_ri32(&tb_ptr);
+            tci_write_reg32(t0, (int32_t)t1 % (int32_t)t2);
+            break;
         case INDEX_op_remu_i32:
-            TODO();
-            break;
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri32(&tb_ptr);
+            t2 = tci_read_ri32(&tb_ptr);
+            tci_write_reg32(t0, t1 % t2);
+            break;
 #else
         case INDEX_op_div2_i32:
         case INDEX_op_divu2_i32:
@@ -553,8 +571,16 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
             break;
 #ifdef TCG_TARGET_HAS_rot_i32
         case INDEX_op_rotl_i32:
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri32(&tb_ptr);
+            t2 = tci_read_ri32(&tb_ptr);
+            tci_write_reg32(t0, (t1<<t2)|(t1>>(32-t2)));
+            break;
         case INDEX_op_rotr_i32:
-            TODO();
+            t0 = *tb_ptr++;
+            t1 = tci_read_ri32(&tb_ptr);
+            t2 = tci_read_ri32(&tb_ptr);
+            tci_write_reg32(t0, (t1>>t2)|(t1<<(32-t2)));
             break;
 #endif
         case INDEX_op_brcond_i32:
@@ -640,12 +666,16 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
 #endif
 #ifdef TCG_TARGET_HAS_bswap16_i32
         case INDEX_op_bswap16_i32:
-            TODO();
+            t0 = *tb_ptr++;
+            t1 = tci_read_r16(&tb_ptr);
+            tci_write_reg32(t0, bswap16(t1));
             break;
 #endif
 #ifdef TCG_TARGET_HAS_bswap32_i32
         case INDEX_op_bswap32_i32:
-            TODO();
+            t0 = *tb_ptr++;
+            t1 = tci_read_r32(&tb_ptr);
+            tci_write_reg32(t0, bswap32(t1));
             break;
 #endif
 #ifdef TCG_TARGET_HAS_not_i32
-- 
1.6.3.msysgit.0

Subject: [PATCH 3/5] tci: support GETPC() for SOFTMMU

---
 dyngen-exec.h |    5 ++++-
 tcg/tci.c     |    7 +++++++
 2 files changed, 11 insertions(+), 1 deletions(-)

diff --git a/dyngen-exec.h b/dyngen-exec.h
index d5620ca..ba213c4 100644
--- a/dyngen-exec.h
+++ b/dyngen-exec.h
@@ -119,7 +119,10 @@ extern int printf(const char *, ...);
 
 /* The return address may point to the start of the next instruction.
    Subtracting one gets us the call instruction itself.  */
-#if defined(__s390__)
+#if defined(CONFIG_TCG_INTERPRETER)
+extern uint8_t * tci_tb_ptr;
+# define GETPC() ((void *)tci_tb_ptr)
+#elif defined(__s390__)
 # define GETPC() ((void*)(((unsigned long)__builtin_return_address(0) & 
0x7fffffffUL) - 1))
 #elif defined(__arm__)
 /* Thumb return addresses have the low bit set, so we need to subtract two.
diff --git a/tcg/tci.c b/tcg/tci.c
index 8bb78e3..0ba605b 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -50,6 +50,10 @@ struct CPUX86State *env;
 #error Target support missing, please fix!
 #endif
 
+#ifdef CONFIG_SOFTMMU
+uint8_t * tci_tb_ptr;
+#endif
+
 static tcg_target_ulong tci_reg[TCG_TARGET_NB_REGS];
 
 static tcg_target_ulong tci_read_reg(uint32_t index)
@@ -380,6 +384,9 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
     tci_reg[TCG_AREG0] = (tcg_target_ulong)env;
 
     for (;;) {
+#ifdef CONFIG_SOFTMMU
+        tci_tb_ptr=tb_ptr;
+#endif
         uint8_t opc = *(uint8_t *)tb_ptr++;
         tcg_target_ulong t0;
         tcg_target_ulong t1;
-- 
1.6.3.msysgit.0

Subject: [PATCH 4/5] tci: new op_call implementation for tci

---
 tcg/bytecode/tcg-target.c |  191 ++++++++++++++++++++++++++++++++++++++++++++-
 tcg/tcg-opc.h             |   25 ++++++
 tcg/tcg.c                 |  139 ++++++++++++++++++++++++++++++++
 tcg/tci.c                 |  156 ++++++++++++++++++++++++++++++++++--
 4 files changed, 500 insertions(+), 11 deletions(-)

diff --git a/tcg/bytecode/tcg-target.c b/tcg/bytecode/tcg-target.c
index aae570f..744b9e6 100644
--- a/tcg/bytecode/tcg-target.c
+++ b/tcg/bytecode/tcg-target.c
@@ -248,6 +248,28 @@ static const TCGTargetOpDef tcg_target_op_defs[] = {
     { INDEX_op_bswap32_i32, { "r", "r" } },
 #endif
 
+    { INDEX_op_call0_r0, { "i"} },
+    { INDEX_op_call1_r0, { "i","ri"} },
+    { INDEX_op_call2_r0, { "i","ri","ri"} },
+    { INDEX_op_call3_r0, { "i","ri","ri","ri"} },
+    { INDEX_op_call4_r0, { "i","ri","ri","ri","ri"} },
+
+    { INDEX_op_call0_r1, { "i","r"} },
+    { INDEX_op_call1_r1, { "i","ri","r"} },
+    { INDEX_op_call2_r1, { "i","ri","ri","r"} },
+    { INDEX_op_call3_r1, { "i","ri","ri","ri","r"} },
+    { INDEX_op_call4_r1, { "i","ri","ri","ri","ri","r"} },
+
+#if TCG_TARGET_REG_BITS == 32
+
+    { INDEX_op_call0_r2, { "i","r","r"} },
+    { INDEX_op_call1_r2, { "i","ri","r","r"} },
+    { INDEX_op_call2_r2, { "i","ri","ri","r","r"} },
+    { INDEX_op_call3_r2, { "i","ri","ri","ri","r","r"} },
+    { INDEX_op_call4_r2, { "i","ri","ri","ri","ri","r","r"} },
+
+#endif
+
     { -1 },
 };
 
@@ -655,6 +677,172 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
     }
 }
 
+static void tcg_out_op_call(TCGContext *s, const TCGArg *args,
+                            const int *const_args)
+{
+    int nb_iargs=args[0]&0x0F;
+    int nb_oargs=args[0]>>4;
+
+    assert(const_args[1]!=0);
+
+    switch(nb_iargs)
+    {
+    case 0:
+        switch(nb_oargs)
+        {
+        case 0:
+            tcg_out_op_t(s, INDEX_op_call0_r0);
+            tcg_out_i(s, args[1]);/*func*/
+            break;
+        case 1:
+            tcg_out_op_t(s, INDEX_op_call0_r1);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_r(s, args[2]);/*r1*/
+            break;
+#if TCG_TARGET_REG_BITS == 32
+        case 2:
+            tcg_out_op_t(s, INDEX_op_call0_r2);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_r(s, args[2]);/*r1*/
+            tcg_out_r(s, args[3]);/*r2*/
+            break;
+#endif
+        default:
+            TODO();
+        }
+        break;
+    case 1:
+        switch(nb_oargs)
+        {
+        case 0:
+            tcg_out_op_t(s, INDEX_op_call1_r0);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg1*/
+            break;
+        case 1:
+            tcg_out_op_t(s, INDEX_op_call1_r1);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg1*/
+            tcg_out_r(s, args[3]);/*r1*/
+            break;
+#if TCG_TARGET_REG_BITS == 32
+        case 2:
+            tcg_out_op_t(s, INDEX_op_call1_r2);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg1*/
+            tcg_out_r(s, args[3]);/*r1*/
+            tcg_out_r(s, args[4]);/*r2*/
+            break;
+#endif
+        default:
+            TODO();
+        }
+        break;
+    case 2:
+        switch(nb_oargs)
+        {
+        case 0:
+            tcg_out_op_t(s, INDEX_op_call2_r0);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg1*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg2*/
+            break;
+        case 1:
+            tcg_out_op_t(s, INDEX_op_call2_r1);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg1*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg2*/
+            tcg_out_r(s, args[4]);/*r1*/
+            break;
+#if TCG_TARGET_REG_BITS == 32
+        case 2:
+            tcg_out_op_t(s, INDEX_op_call2_r2);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg1*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg2*/
+            tcg_out_r(s, args[4]);/*r1*/
+            tcg_out_r(s, args[5]);/*r2*/
+            break;
+#endif
+        default:
+            TODO();
+        }
+        break;
+    case 3:
+        switch(nb_oargs)
+        {
+        case 0:
+            tcg_out_op_t(s, INDEX_op_call3_r0);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[4], args[4]);/*arg1*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg2*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg3*/
+            break;
+        case 1:
+            tcg_out_op_t(s, INDEX_op_call3_r1);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[4], args[4]);/*arg1*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg2*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg3*/
+            tcg_out_r(s, args[5]);/*r1*/
+            break;
+#if TCG_TARGET_REG_BITS == 32
+        case 2:
+            tcg_out_op_t(s, INDEX_op_call3_r2);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[4], args[4]);/*arg1*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg2*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg3*/
+            tcg_out_r(s, args[5]);/*r1*/
+            tcg_out_r(s, args[6]);/*r2*/
+            break;
+#endif
+        default:
+            TODO();
+        }
+        break;
+    case 4:
+        switch(nb_oargs)
+        {
+        case 0:
+            tcg_out_op_t(s, INDEX_op_call4_r0);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[5], args[5]);/*arg1*/
+            tcg_out_ri(s, const_args[4], args[4]);/*arg2*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg3*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg4*/
+            break;
+        case 1:
+            tcg_out_op_t(s, INDEX_op_call4_r1);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[5], args[5]);/*arg1*/
+            tcg_out_ri(s, const_args[4], args[4]);/*arg2*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg3*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg4*/
+            tcg_out_r(s, args[6]);/*r1*/
+            break;
+#if TCG_TARGET_REG_BITS == 32
+        case 2:
+            tcg_out_op_t(s, INDEX_op_call4_r2);
+            tcg_out_i(s, args[1]);/*func*/
+            tcg_out_ri(s, const_args[5], args[5]);/*arg1*/
+            tcg_out_ri(s, const_args[4], args[4]);/*arg2*/
+            tcg_out_ri(s, const_args[3], args[3]);/*arg3*/
+            tcg_out_ri(s, const_args[2], args[2]);/*arg4*/
+            tcg_out_r(s, args[6]);/*r1*/
+            tcg_out_r(s, args[7]);/*r2*/
+            break;
+#endif
+        default:
+            TODO();
+        }
+        break;
+    default:
+        TODO();
+    }
+}
+
+
 static void tcg_out_op(TCGContext *s, int opc, const TCGArg *args,
                        const int *const_args)
 {
@@ -683,8 +871,7 @@ static void tcg_out_op(TCGContext *s, int opc, const TCGArg 
*args,
         tci_out_label(s, args[0]);
         break;
     case INDEX_op_call:
-        tcg_out_op_t(s, opc);
-        tcg_out_ri(s, const_args[0], args[0]);
+        tcg_out_op_call(s,args,const_args);
         break;
     case INDEX_op_jmp:
         TODO();
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index b7f3fd7..070ba39 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -269,4 +269,29 @@ DEF2(qemu_st64, 0, 2, 1, TCG_OPF_CALL_CLOBBER | 
TCG_OPF_SIDE_EFFECTS)
 
 #endif /* TCG_TARGET_REG_BITS != 32 */
 
+#ifdef CONFIG_TCG_INTERPRETER
+
+DEF2(call0_r0, 0, 0, 0, 0)
+DEF2(call1_r0, 0, 1, 0, 0)
+DEF2(call2_r0, 0, 2, 0, 0)
+DEF2(call3_r0, 0, 3, 0, 0)
+DEF2(call4_r0, 0, 4, 0, 0)
+DEF2(call0_r1, 1, 0, 0, 0)
+DEF2(call1_r1, 1, 1, 0, 0)
+DEF2(call2_r1, 1, 2, 0, 0)
+DEF2(call3_r1, 1, 3, 0, 0)
+DEF2(call4_r1, 1, 4, 0, 0)
+
+#if TCG_TARGET_REG_BITS == 32
+
+DEF2(call0_r2, 2, 0, 0, 0)
+DEF2(call1_r2, 2, 1, 0, 0)
+DEF2(call2_r2, 2, 2, 0, 0)
+DEF2(call3_r2, 2, 3, 0, 0)
+DEF2(call4_r2, 2, 4, 0, 0)
+
+#endif
+
+#endif
+
 #undef DEF2
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 2a82f37..20aac38 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1690,6 +1690,142 @@ static void tcg_reg_alloc_op(TCGContext *s,
 #define STACK_DIR(x) (x)
 #endif
 
+#ifdef CONFIG_TCG_INTERPRETER
+
+static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
+                              int opc, const TCGArg *args,
+                              unsigned int dead_iargs)
+{
+    int nb_iargs, nb_oargs, flags, i, reg, nb_params;
+    TCGArg arg,func_arg;
+    TCGTemp *ts;
+    tcg_target_long func_addr;
+    TCGRegSet allocated_regs;
+    const TCGArgConstraint *arg_ct;
+    TCGArg new_args[TCG_MAX_OP_ARGS];
+    int const_args[TCG_MAX_OP_ARGS];
+
+    arg = *args++;
+
+    nb_oargs = arg >> 16;
+    nb_iargs = arg & 0xffff;
+    nb_params = nb_iargs - 1;
+
+    flags = args[nb_oargs + nb_iargs];
+
+    const_args[0]=1;
+    new_args[0]=(nb_oargs<<4)|nb_params;
+
+    /* satisfy input constraints */ 
+    tcg_regset_set(allocated_regs, s->reserved_regs);
+
+    for(i = nb_params; i >= 0; i--) {
+        arg = args[nb_oargs + i];
+        ts = &s->temps[arg];
+        if (ts->val_type == TEMP_VAL_MEM) {
+            reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], 
allocated_regs);
+            tcg_out_ld(s, ts->type, reg, ts->mem_reg, ts->mem_offset);
+            ts->val_type = TEMP_VAL_REG;
+            ts->reg = reg;
+            ts->mem_coherent = 1;
+            s->reg_to_temp[reg] = arg;
+        } else if (ts->val_type == TEMP_VAL_CONST) {
+            /* constant is OK for instruction */
+            const_args[nb_params+1-i] = 1;
+            new_args[nb_params+1-i] = ts->val;
+            goto iarg_end;
+        }
+        assert(ts->val_type == TEMP_VAL_REG);
+        reg = ts->reg;
+        if (tcg_regset_test_reg(tcg_target_available_regs[ts->type], reg)) {
+            /* nothing to do : the constraint is satisfied */
+        } else {
+        allocate_in_reg:
+            /* allocate a new register matching the constraint 
+               and move the temporary register into it */
+            reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], 
allocated_regs);
+            tcg_out_mov(s, reg, ts->reg);
+        }
+        new_args[nb_params+1-i] = reg;
+        const_args[nb_params+1-i] = 0;
+        tcg_regset_set_reg(allocated_regs, reg);
+    iarg_end: ;
+    }
+    
+    /* mark dead temporaries and free the associated registers */
+    for(i = 0; i < nb_iargs; i++) {
+        arg = args[nb_oargs + i];
+        if (IS_DEAD_IARG(i)) {
+            ts = &s->temps[arg];
+            if (!ts->fixed_reg) {
+                if (ts->val_type == TEMP_VAL_REG)
+                    s->reg_to_temp[ts->reg] = -1;
+                ts->val_type = TEMP_VAL_DEAD;
+            }
+        }
+    }
+
+    /* clobber call registers */ 
+    for(reg = 0; reg < TCG_TARGET_NB_REGS; reg++) {
+        if (tcg_regset_test_reg(tcg_target_call_clobber_regs, reg)) {
+            tcg_reg_free(s, reg);
+        }
+    }
+
+    /* store globals and free associated registers (we assume the insn
+    can modify any global. */
+    if (!(flags & TCG_CALL_CONST)) {
+        save_globals(s, allocated_regs);
+    }
+
+    /* satisfy the output constraints */
+    tcg_regset_set(allocated_regs, s->reserved_regs);
+    for(i = 0; i < nb_oargs; i++) {
+        arg = args[i];
+        ts = &s->temps[arg];
+
+        /* if fixed register, we try to use it */
+        reg = ts->reg;
+        if (ts->fixed_reg &&
+            tcg_regset_test_reg(tcg_target_available_regs[ts->type], reg)) {
+                goto oarg_end;
+        }
+        reg = tcg_reg_alloc(s, tcg_target_available_regs[ts->type], 
allocated_regs);
+
+        tcg_regset_set_reg(allocated_regs, reg);
+        /* if a fixed register is used, then a move will be done afterwards */
+        if (!ts->fixed_reg) {
+            if (ts->val_type == TEMP_VAL_REG)
+                s->reg_to_temp[ts->reg] = -1;
+            ts->val_type = TEMP_VAL_REG;
+            ts->reg = reg;
+            /* temp value is modified, so the value kept in memory is
+            potentially not the same */
+            ts->mem_coherent = 0; 
+            s->reg_to_temp[reg] = arg;
+        }
+oarg_end:
+        new_args[i+nb_params+2] = reg;
+    }
+
+
+    /* emit instruction */
+    tcg_out_op(s, opc, new_args, const_args);
+    
+    /* move the outputs in the correct register if needed */
+    for(i = 0; i < nb_oargs; i++) {
+        ts = &s->temps[args[i]];
+        reg = new_args[i+nb_params+2];
+        if (ts->fixed_reg && ts->reg != reg) {
+            tcg_out_mov(s, ts->reg, reg);
+        }
+    }
+
+    return nb_iargs + nb_oargs + def->nb_cargs + 1;
+}
+
+#else
+
 static int tcg_reg_alloc_call(TCGContext *s, const TCGOpDef *def,
                               int opc, const TCGArg *args,
                               unsigned int dead_iargs)
@@ -1868,6 +2004,9 @@ static int tcg_reg_alloc_call(TCGContext *s, const 
TCGOpDef *def,
     return nb_iargs + nb_oargs + def->nb_cargs + 1;
 }
 
+
+#endif
+
 #ifdef CONFIG_PROFILER
 
 static int64_t tcg_table_op_count[NB_OPS];
diff --git a/tcg/tci.c b/tcg/tci.c
index 0ba605b..3e4165b 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -41,8 +41,23 @@
 #define TRACE() ((void)0)
 #endif
 
-typedef tcg_target_ulong (*helper_function)(tcg_target_ulong, tcg_target_ulong,
+typedef tcg_target_ulong (*helper_function0)(void);
+typedef tcg_target_ulong (*helper_function1)(tcg_target_ulong);
+typedef tcg_target_ulong (*helper_function2)(tcg_target_ulong, 
tcg_target_ulong);
+typedef tcg_target_ulong (*helper_function3)(tcg_target_ulong, 
tcg_target_ulong,
+                                            tcg_target_ulong);
+typedef tcg_target_ulong (*helper_function4)(tcg_target_ulong, 
tcg_target_ulong,
                                             tcg_target_ulong, 
tcg_target_ulong);
+#if TCG_TARGET_REG_BITS == 32
+
+typedef uint64_t (*helper_function0_r64)(void);
+typedef uint64_t (*helper_function1_r64)(tcg_target_ulong);
+typedef uint64_t (*helper_function2_r64)(tcg_target_ulong, tcg_target_ulong);
+typedef uint64_t (*helper_function3_r64)(tcg_target_ulong, tcg_target_ulong,
+                                            tcg_target_ulong);
+typedef uint64_t (*helper_function4_r64)(tcg_target_ulong, tcg_target_ulong,
+                                            tcg_target_ulong, 
tcg_target_ulong);
+#endif
 
 #if defined(TARGET_I386)
 struct CPUX86State *env;
@@ -427,15 +442,138 @@ unsigned long tcg_qemu_tb_exec(uint8_t *tb_ptr)
         case INDEX_op_set_label:
             TODO();
             break;
-        case INDEX_op_call:
-            t0 = tci_read_ri(&tb_ptr);
-            t0 = ((helper_function)t0)(tci_read_reg(TCG_REG_R0),
-                                       tci_read_reg(TCG_REG_R1),
-                                       tci_read_reg(TCG_REG_R2),
-                                       tci_read_reg(TCG_REG_R3));
-            // TODO: fix for 32 bit host / 64 bit target.
-            tci_write_reg(TCG_REG_R0, t0);
+        case INDEX_op_call0_r0:
+            t0 = tci_read_i(&tb_ptr);
+            ((helper_function0)t0)();
+            break;
+        case INDEX_op_call1_r0:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            ((helper_function1)t0)(t1);
+            break;
+        case INDEX_op_call2_r0:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            ((helper_function2)t0)(t1,t2);
+            break;
+        case INDEX_op_call3_r0:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            t3 = tci_read_ri(&tb_ptr);
+            ((helper_function3)t0)(t1,t2,t3);
+            break;
+        case INDEX_op_call4_r0:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            t3 = tci_read_ri(&tb_ptr);
+            t4 = tci_read_ri(&tb_ptr);
+            ((helper_function4)t0)(t1,t2,t3,t4);
+            break;
+        case INDEX_op_call0_r1:
+            t0 = tci_read_i(&tb_ptr);
+            t0 = ((helper_function0)t0)();
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,t0);
+
+            break;
+        case INDEX_op_call1_r1:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t0 = ((helper_function1)t0)(t1);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,t0);
+
+            break;
+        case INDEX_op_call2_r1:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            t0 = ((helper_function2)t0)(t1,t2);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,t0);
+
+            break;
+        case INDEX_op_call3_r1:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            t3 = tci_read_ri(&tb_ptr);
+            t0 = ((helper_function3)t0)(t1,t2,t3);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,t0);
+
+            break;
+        case INDEX_op_call4_r1:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            t3 = tci_read_ri(&tb_ptr);
+            t4 = tci_read_ri(&tb_ptr);
+            t0 = ((helper_function4)t0)(t1,t2,t3,t4);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,t0);
+
             break;
+#if TCG_TARGET_REG_BITS == 32
+        case INDEX_op_call0_r2:
+            t0 = tci_read_i(&tb_ptr);
+            u64 = ((helper_function0_r64)t0)();
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,u64);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5, u64>>32); 
+
+            break;
+        case INDEX_op_call1_r2:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            u64 = ((helper_function1_r64)t0)(t1);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,u64);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5, u64>>32); 
+
+            break;
+        case INDEX_op_call2_r2:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            u64 = ((helper_function2_r64)t0)(t1,t2);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,u64);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5, u64>>32); 
+
+            break;
+        case INDEX_op_call3_r2:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            t3 = tci_read_ri(&tb_ptr);
+            u64 = ((helper_function3_r64)t0)(t1,t2,t3);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,u64);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5, u64>>32); 
+
+            break;
+        case INDEX_op_call4_r2:
+            t0 = tci_read_i(&tb_ptr);
+            t1 = tci_read_ri(&tb_ptr);
+            t2 = tci_read_ri(&tb_ptr);
+            t3 = tci_read_ri(&tb_ptr);
+            t4 = tci_read_ri(&tb_ptr);
+            u64 = ((helper_function4_r64)t0)(t1,t2,t3,t4);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5,u64);
+            t5 = *tb_ptr++;
+            tci_write_reg(t5, u64>>32); 
+
+            break;
+#endif
         case INDEX_op_jmp:
         case INDEX_op_br:
             t0 = *(uint64_t *)tb_ptr;
-- 
1.6.3.msysgit.0

Subject: [PATCH 5/5] tci: speed optimization

---
 tcg/tci.c |   66 ++++++++++++++++++++++++++++++------------------------------
 1 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/tcg/tci.c b/tcg/tci.c
index 3e4165b..8628e69 100644
--- a/tcg/tci.c
+++ b/tcg/tci.c
@@ -71,88 +71,88 @@ uint8_t * tci_tb_ptr;
 
 static tcg_target_ulong tci_reg[TCG_TARGET_NB_REGS];
 
-static tcg_target_ulong tci_read_reg(uint32_t index)
+static inline tcg_target_ulong tci_read_reg(uint32_t index)
 {
     assert(index < ARRAY_SIZE(tci_reg));
     return tci_reg[index];
 }
 
-static uint8_t tci_read_reg8(uint32_t index)
+static inline uint8_t tci_read_reg8(uint32_t index)
 {
     return (uint8_t)tci_read_reg(index);
 }
 
-static int8_t tci_read_reg8s(uint32_t index)
+static inline int8_t tci_read_reg8s(uint32_t index)
 {
     return (int8_t)tci_read_reg(index);
 }
 
-static uint16_t tci_read_reg16(uint32_t index)
+static inline uint16_t tci_read_reg16(uint32_t index)
 {
     return (uint16_t)tci_read_reg(index);
 }
 
-static int16_t tci_read_reg16s(uint32_t index)
+static inline int16_t tci_read_reg16s(uint32_t index)
 {
     return (int16_t)tci_read_reg(index);
 }
 
-static uint32_t tci_read_reg32(uint32_t index)
+static inline uint32_t tci_read_reg32(uint32_t index)
 {
     return (uint32_t)tci_read_reg(index);
 }
 
 #if TCG_TARGET_REG_BITS == 64
-static int32_t tci_read_reg32s(uint32_t index)
+static inline int32_t tci_read_reg32s(uint32_t index)
 {
     return (int32_t)tci_read_reg(index);
 }
 
-static uint64_t tci_read_reg64(uint32_t index)
+static inline uint64_t tci_read_reg64(uint32_t index)
 {
     return tci_read_reg(index);
 }
 #endif
 
-static void tci_write_reg(uint32_t index, tcg_target_ulong value)
+static inline void tci_write_reg(uint32_t index, tcg_target_ulong value)
 {
     assert(index < ARRAY_SIZE(tci_reg));
     assert(index != TCG_AREG0);
     tci_reg[index] = value;
 }
 
-static void tci_write_reg8(uint32_t index, uint8_t value)
+static inline void tci_write_reg8(uint32_t index, uint8_t value)
 {
     tci_write_reg(index, value);
 }
 
-static void tci_write_reg8s(uint32_t index, int8_t value)
+static inline void tci_write_reg8s(uint32_t index, int8_t value)
 {
     tci_write_reg(index, value);
 }
 
-static void tci_write_reg16s(uint32_t index, int16_t value)
+static inline void tci_write_reg16s(uint32_t index, int16_t value)
 {
     tci_write_reg(index, value);
 }
 
-static void tci_write_reg16(uint32_t index, uint16_t value)
+static inline void tci_write_reg16(uint32_t index, uint16_t value)
 {
     tci_write_reg(index, value);
 }
 
-static void tci_write_reg32(uint32_t index, uint32_t value)
+static inline void tci_write_reg32(uint32_t index, uint32_t value)
 {
     tci_write_reg(index, value);
 }
 
-static void tci_write_reg32s(uint32_t index, int32_t value)
+static inline void tci_write_reg32s(uint32_t index, int32_t value)
 {
     tci_write_reg(index, value);
 }
 
 #if TCG_TARGET_REG_BITS == 64
-static void tci_write_reg64(uint32_t index, uint64_t value)
+static inline void tci_write_reg64(uint32_t index, uint64_t value)
 {
     tci_write_reg(index, value);
 }
@@ -160,14 +160,14 @@ static void tci_write_reg64(uint32_t index, uint64_t 
value)
 
 #if TCG_TARGET_REG_BITS == 32
 /* Create a 64 bit value from two 32 bit values. */
-static uint64_t tci_uint64(uint32_t high, uint32_t low)
+static inline uint64_t tci_uint64(uint32_t high, uint32_t low)
 {
     return ((uint64_t)high << 32) + low;
 }
 #endif
 
 /* Read constant (native size) from bytecode. */
-static tcg_target_ulong tci_read_i(uint8_t **tb_ptr)
+static inline tcg_target_ulong tci_read_i(uint8_t **tb_ptr)
 {
     tcg_target_ulong value = *(tcg_target_ulong *)(*tb_ptr);
     *tb_ptr += sizeof(tcg_target_ulong);
@@ -175,7 +175,7 @@ static tcg_target_ulong tci_read_i(uint8_t **tb_ptr)
 }
 
 /* Read constant (32 bit) from bytecode. */
-static uint32_t tci_read_i32(uint8_t **tb_ptr)
+static inline uint32_t tci_read_i32(uint8_t **tb_ptr)
 {
     uint32_t value = *(uint32_t *)(*tb_ptr);
     *tb_ptr += 4;
@@ -184,7 +184,7 @@ static uint32_t tci_read_i32(uint8_t **tb_ptr)
 
 #if TCG_TARGET_REG_BITS == 64
 /* Read constant (64 bit) from bytecode. */
-static uint64_t tci_read_i64(uint8_t **tb_ptr)
+static inline uint64_t tci_read_i64(uint8_t **tb_ptr)
 {
     uint64_t value = *(uint64_t *)(*tb_ptr);
     *tb_ptr += 8;
@@ -193,7 +193,7 @@ static uint64_t tci_read_i64(uint8_t **tb_ptr)
 #endif
 
 /* Read indexed register (native size) from bytecode. */
-static tcg_target_ulong tci_read_r(uint8_t **tb_ptr)
+static inline tcg_target_ulong tci_read_r(uint8_t **tb_ptr)
 {
     tcg_target_ulong value = tci_read_reg(**tb_ptr);
     *tb_ptr += 1;
@@ -201,7 +201,7 @@ static tcg_target_ulong tci_read_r(uint8_t **tb_ptr)
 }
 
 /* Read indexed register (8 bit) from bytecode. */
-static uint8_t tci_read_r8(uint8_t **tb_ptr)
+static inline uint8_t tci_read_r8(uint8_t **tb_ptr)
 {
     uint8_t value = tci_read_reg8(**tb_ptr);
     *tb_ptr += 1;
@@ -209,7 +209,7 @@ static uint8_t tci_read_r8(uint8_t **tb_ptr)
 }
 
 /* Read indexed register (8 bit signed) from bytecode. */
-static int8_t tci_read_r8s(uint8_t **tb_ptr)
+static inline int8_t tci_read_r8s(uint8_t **tb_ptr)
 {
     int8_t value = tci_read_reg8s(**tb_ptr);
     *tb_ptr += 1;
@@ -217,7 +217,7 @@ static int8_t tci_read_r8s(uint8_t **tb_ptr)
 }
 
 /* Read indexed register (16 bit) from bytecode. */
-static uint16_t tci_read_r16(uint8_t **tb_ptr)
+static inline uint16_t tci_read_r16(uint8_t **tb_ptr)
 {
     uint16_t value = tci_read_reg16(**tb_ptr);
     *tb_ptr += 1;
@@ -225,7 +225,7 @@ static uint16_t tci_read_r16(uint8_t **tb_ptr)
 }
 
 /* Read indexed register (16 bit signed) from bytecode. */
-static int16_t tci_read_r16s(uint8_t **tb_ptr)
+static inline int16_t tci_read_r16s(uint8_t **tb_ptr)
 {
     uint16_t value = tci_read_reg16s(**tb_ptr);
     *tb_ptr += 1;
@@ -233,7 +233,7 @@ static int16_t tci_read_r16s(uint8_t **tb_ptr)
 }
 
 /* Read indexed register (32 bit) from bytecode. */
-static uint32_t tci_read_r32(uint8_t **tb_ptr)
+static inline uint32_t tci_read_r32(uint8_t **tb_ptr)
 {
     uint32_t value = tci_read_reg32(**tb_ptr);
     *tb_ptr += 1;
@@ -242,7 +242,7 @@ static uint32_t tci_read_r32(uint8_t **tb_ptr)
 
 #if TCG_TARGET_REG_BITS == 64
 /* Read indexed register (32 bit signed) from bytecode. */
-static int32_t tci_read_r32s(uint8_t **tb_ptr)
+static inline int32_t tci_read_r32s(uint8_t **tb_ptr)
 {
     int32_t value = tci_read_reg32s(**tb_ptr);
     *tb_ptr += 1;
@@ -250,7 +250,7 @@ static int32_t tci_read_r32s(uint8_t **tb_ptr)
 }
 
 /* Read indexed register (64 bit) from bytecode. */
-static uint64_t tci_read_r64(uint8_t **tb_ptr)
+static inline uint64_t tci_read_r64(uint8_t **tb_ptr)
 {
     uint64_t value = tci_read_reg64(**tb_ptr);
     *tb_ptr += 1;
@@ -259,7 +259,7 @@ static uint64_t tci_read_r64(uint8_t **tb_ptr)
 #endif
 
 /* Read indexed register or constant (native size) from bytecode. */
-static tcg_target_ulong tci_read_ri(uint8_t **tb_ptr)
+static inline tcg_target_ulong tci_read_ri(uint8_t **tb_ptr)
 {
     bool const_arg;
     tcg_target_ulong value;
@@ -274,7 +274,7 @@ static tcg_target_ulong tci_read_ri(uint8_t **tb_ptr)
 }
 
 /* Read indexed register or constant (32 bit) from bytecode. */
-static uint32_t tci_read_ri32(uint8_t **tb_ptr)
+static inline uint32_t tci_read_ri32(uint8_t **tb_ptr)
 {
     bool const_arg;
     uint32_t value;
@@ -290,7 +290,7 @@ static uint32_t tci_read_ri32(uint8_t **tb_ptr)
 
 #if TCG_TARGET_REG_BITS == 64
 /* Read indexed register or constant (64 bit) from bytecode. */
-static uint64_t tci_read_ri64(uint8_t **tb_ptr)
+static inline uint64_t tci_read_ri64(uint8_t **tb_ptr)
 {
     bool const_arg;
     uint64_t value;
@@ -305,7 +305,7 @@ static uint64_t tci_read_ri64(uint8_t **tb_ptr)
 }
 #endif
 
-static bool tci_compare32(uint32_t u0, uint32_t u1, TCGCond condition)
+static inline bool tci_compare32(uint32_t u0, uint32_t u1, TCGCond condition)
 {
     bool result = false;
     int32_t i0 = u0;
@@ -347,7 +347,7 @@ static bool tci_compare32(uint32_t u0, uint32_t u1, TCGCond 
condition)
     return result;
 }
 
-static bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
+static inline bool tci_compare64(uint64_t u0, uint64_t u1, TCGCond condition)
 {
     bool result = false;
     int64_t i0 = u0;
-- 
1.6.3.msysgit.0

Reply via email to