On 05/10/2015 03:45 PM, Chen Gang wrote:
> +static void gen_cmpltsi(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, int8_t imm8)
> +{
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "cmpltsi r%d, r%d, %d\n",
> +                  rdst, rsrc, imm8);
> +    tcg_gen_setcondi_i64(TCG_COND_LTU, dest_gr(dc, rdst), load_gr(dc, rsrc),
> +                         (int64_t)imm8);
> +}
> +
> +static void gen_cmpltui(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t imm8)
> +{
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "cmpltui r%d, r%d, %d\n",
> +                  rdst, rsrc, imm8);
> +    tcg_gen_setcondi_i64(TCG_COND_LTU,
> +                         dest_gr(dc, rdst), load_gr(dc, rsrc), imm8);
> +}
> +
> +static void gen_cmpeqi(struct DisasContext *dc,
> +                       uint8_t rdst, uint8_t rsrc, uint8_t imm8)
> +{
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "cmpeqi r%d, r%d, %d\n",
> +                  rdst, rsrc, imm8);
> +    tcg_gen_setcondi_i64(TCG_COND_EQ,
> +                         dest_gr(dc, rdst), load_gr(dc, rsrc), imm8);
> +}

Merge these.

> +
> +static void gen_cmp(struct DisasContext *dc,
> +                    uint8_t rdst, uint8_t rsrc, uint8_t rsrcb, TCGCond cond)
> +{
> +    const char *prefix;
> +
> +    switch (cond) {
> +    case TCG_COND_EQ:
> +        prefix = "eq";
> +        break;
> +    case TCG_COND_LE:
> +        prefix = "les";
> +        break;
> +    case TCG_COND_LEU:
> +        prefix = "leu";
> +        break;
> +    case TCG_COND_LT:
> +        prefix = "lts";
> +        break;
> +    case TCG_COND_LTU:
> +        prefix = "ltu";
> +        break;
> +    case TCG_COND_NE:
> +        prefix = "ne";
> +        break;
> +    default:
> +        dc->exception = TILEGX_EXCP_OPCODE_UNKNOWN;
> +        return;
> +    }
> +
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "cmp%s r%d, r%d, r%d\n",
> +                  prefix, rdst, rsrc, rsrcb);

Better to just pass down the opcode name with the TCGCond rather than
trying to recreate it.  Then there's no need for a switch, nor a need
for a confusing TILEGX_EXCP_OPCODE_UNKNOWN path.
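E.g. a single generator taking both the condition and its mnemonic
(a sketch only -- gen_cmpi and its argument order are my invention;
note also that cmpltsi presumably wants TCG_COND_LT, not TCG_COND_LTU,
for a signed comparison):

static void gen_cmpi(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc,
                     int64_t imm, TCGCond cond, const char *mnemonic)
{
    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %" PRId64 "\n",
                  mnemonic, rdst, rsrc, imm);
    tcg_gen_setcondi_i64(cond, dest_gr(dc, rdst), load_gr(dc, rsrc), imm);
}

so the decoder reduces to something like

    gen_cmpi(dc, rdst, rsrc, (int8_t)imm8, TCG_COND_LT, "cmpltsi");
    gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_LTU, "cmpltui");
    gen_cmpi(dc, rdst, rsrc, imm8, TCG_COND_EQ, "cmpeqi");

and gen_cmp likewise takes the full mnemonic instead of the switch.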
> +static void gen_exch(struct DisasContext *dc,
> +                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb, int excp)
> +{
> +    const char *prefix, *width;
> +
> +    switch (excp) {
> +    case TILEGX_EXCP_OPCODE_EXCH4:
> +        prefix = "";
> +        width = "4";
> +        break;
> +    case TILEGX_EXCP_OPCODE_EXCH:
> +        prefix = "";
> +        width = "";
> +        break;
> +    case TILEGX_EXCP_OPCODE_CMPEXCH4:
> +        prefix = "cmp";
> +        width = "4";
> +        break;
> +    case TILEGX_EXCP_OPCODE_CMPEXCH:
> +        prefix = "cmp";
> +        width = "";
> +        break;
> +    default:
> +        dc->exception = TILEGX_EXCP_OPCODE_UNKNOWN;
> +        return;
> +    }

Likewise.

> +static void gen_v1cmpeqi(struct DisasContext *dc,
> +                         uint8_t rdst, uint8_t rsrc, uint8_t imm8)
> +{
> +    int count;
> +    TCGv vdst = dest_gr(dc, rdst);
> +    TCGv tmp = tcg_temp_new_i64();
> +
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v1cmpeqi r%d, r%d, %d\n",
> +                  rdst, rsrc, imm8);
> +
> +    tcg_gen_movi_i64(vdst, 0);
> +
> +    for (count = 0; count < 8; count++) {
> +        tcg_gen_shri_i64(tmp, load_gr(dc, rsrc), (8 - count - 1) * 8);
> +        tcg_gen_andi_i64(tmp, tmp, 0xff);
> +        tcg_gen_setcondi_i64(TCG_COND_EQ, tmp, tmp, imm8);
> +        tcg_gen_or_i64(vdst, vdst, tmp);
> +        tcg_gen_shli_i64(vdst, vdst, 8);

For all of these vector instructions, I would encourage you to use
helpers to extract and insert values.  Extraction has little choice
but to use a shift and a mask, as you use here.  But insertion can use
tcg_gen_deposit_i64.  I think that is a lot easier to reason about than
your construction here, which sequentially shifts vdst.  E.g.

static inline void extract_v1(TCGv out, TCGv in, unsigned byte)
{
    tcg_gen_shri_i64(out, in, byte * 8);
    tcg_gen_ext8u_i64(out, out);
}

static inline void insert_v1(TCGv out, TCGv in, unsigned byte)
{
    tcg_gen_deposit_i64(out, out, in, byte * 8, 8);
}

This loop then becomes

    TCGv vsrc = load_gr(dc, rsrc);
    for (count = 0; count < 8; ++count) {
        extract_v1(tmp, vsrc, count);
        tcg_gen_setcondi_i64(TCG_COND_EQ, tmp, tmp, imm8);
        insert_v1(vdst, tmp, count);
    }

> +static void gen_v1int_l(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> +    int count;
> +    TCGv vdst = dest_gr(dc, rdst);
> +    TCGv tmp = tcg_temp_new_i64();
> +
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v1int_l r%d, r%d, r%d\n",
> +                  rdst, rsrc, rsrcb);
> +
> +    tcg_gen_movi_i64(vdst, 0);
> +    for (count = 0; count < 4; count++) {
> +
> +        tcg_gen_shli_i64(vdst, vdst, 8);
> +
> +        tcg_gen_shri_i64(tmp, load_gr(dc, rsrc), (4 - count - 1) * 8);
> +        tcg_gen_andi_i64(tmp, tmp, 0xff);
> +        tcg_gen_or_i64(vdst, vdst, tmp);
> +        tcg_gen_shli_i64(vdst, vdst, 8);
> +
> +        tcg_gen_shri_i64(tmp, load_gr(dc, rsrcb), (4 - count - 1) * 8);
> +        tcg_gen_andi_i64(tmp, tmp, 0xff);
> +        tcg_gen_or_i64(vdst, vdst, tmp);
> +    }
> +    tcg_temp_free_i64(tmp);

Similarly, this becomes

    TCGv vsrc = load_gr(dc, rsrc);
    TCGv vsrcb = load_gr(dc, rsrcb);
    for (count = 0; count < 4; ++count) {
        extract_v1(tmp, vsrc, count);
        insert_v1(vdst, tmp, count * 2);
        extract_v1(tmp, vsrcb, count);
        insert_v1(vdst, tmp, count * 2 + 1);
    }

> +}
> +
> +/*
> + * Functional Description
> + *
> + *     uint64_t output = 0;
> + *     uint32_t counter;
> + *     for (counter = 0; counter < (WORD_SIZE / 32); counter++)
> + *     {
> + *         bool asel = ((counter & 1) == 1);
> + *         int in_sel = 0 + counter / 2;
> + *         int32_t srca = get4Byte (rf[SrcA], in_sel);
> + *         int32_t srcb = get4Byte (rf[SrcB], in_sel);
> + *         output = set4Byte (output, counter, (asel ? srca : srcb));
> + *     }
> + *     rf[Dest] = output;
> +*/
> +
> +static void gen_v4int_l(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> +    TCGv vdst = dest_gr(dc, rdst);
> +    TCGv tmp = tcg_temp_new_i64();
> +
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v4int_l r%d, r%d, r%d\n",
> +                  rdst, rsrc, rsrcb);
> +
> +    tcg_gen_andi_i64(vdst, load_gr(dc, rsrc), 0xffffffff);
> +    tcg_gen_shli_i64(vdst, vdst, 8);
> +    tcg_gen_andi_i64(tmp, load_gr(dc, rsrcb), 0xffffffff);
> +    tcg_gen_or_i64(vdst, vdst, tmp);

And herein is a bug that I'd hope using the helper functions would
avoid: you shift by 8 instead of 32.  This function simplifies to

    tcg_gen_deposit_i64(vdst, load_gr(dc, rsrcb), load_gr(dc, rsrc), 32, 32);

> +/*
> + * uint64_t mask = 0;
> + * int64_t background = ((rf[SrcA] >> BFEnd) & 1) ? -1ULL : 0ULL;
> + * mask = ((-1ULL) ^ ((-1ULL << ((BFEnd - BFStart) & 63)) << 1));
> + * uint64_t rot_src = (((uint64_t) rf[SrcA]) >> BFStart)
> + *                    | (rf[SrcA] << (64 - BFStart));
> + * rf[Dest] = (rot_src & mask) | (background & ~mask);
> + */
> +static void gen_bfexts(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc,
> +                       uint8_t start, uint8_t end)
> +{
> +    uint64_t mask = (-1ULL) ^ ((-1ULL << ((end - start) & 63)) << 1);
> +    TCGv vldst = tcg_temp_local_new_i64();
> +    TCGv tmp = tcg_temp_local_new_i64();
> +    TCGv pmsk = tcg_const_i64(-1ULL);

Why the local temps?

> +
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "bfexts r%d, r%d, %d, %d\n",
> +                  rdst, rsrc, start, end);
> +
> +    tcg_gen_rotri_i64(vldst, load_gr(dc, rsrc), start);
> +    tcg_gen_andi_i64(vldst, vldst, mask);
> +
> +    tcg_gen_shri_i64(tmp, load_gr(dc, rsrc), end);
> +    tcg_gen_andi_i64(tmp, tmp, 1);
> +    tcg_gen_movcond_i64(TCG_COND_EQ, tmp, tmp, load_zero(dc),
> +                        load_zero(dc), pmsk);

This movcond is equivalent to negation.
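I.e. (tmp here being the 0/1 bit just extracted):

    tcg_gen_shri_i64(tmp, load_gr(dc, rsrc), end);
    tcg_gen_andi_i64(tmp, tmp, 1);
    tcg_gen_neg_i64(tmp, tmp);

maps 0/1 to 0/-1 directly, and pmsk disappears along with the movcond.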
> +/*
> +   Functional Description
> +       uint64_t a = rf[SrcA];
> +       uint64_t b = rf[SrcB];
> +       uint64_t d = rf[Dest];
> +       uint64_t output = 0;
> +       unsigned int counter;
> +       for (counter = 0; counter < (WORD_SIZE / BYTE_SIZE); counter++)
> +       {
> +           int sel = getByte (b, counter) & 0xf;
> +           uint8_t byte = (sel < 8) ? getByte (d, sel) : getByte (a, (sel - 8));
> +           output = setByte (output, counter, byte);
> +       }
> +       rf[Dest] = output;
> + */
> +static void gen_shufflebytes(struct DisasContext *dc,
> +                             uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)

I strongly suggest this be moved to op_helper.c.  It's too big.
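E.g. something along these lines (a sketch only -- the helper name,
flags, and register plumbing are mine, and how dest_gr/load_gr interact
with the written-back temp may need adjusting).  In op_helper.c:

uint64_t helper_shufflebytes(uint64_t dest, uint64_t srca, uint64_t srcb)
{
    uint64_t vdst = 0;
    int count;

    /* Per the functional description above: each output byte selects
       either a byte of rf[Dest] (sel < 8) or a byte of rf[SrcA]
       (sel >= 8), indexed by the low nibble of the corresponding
       byte of rf[SrcB].  */
    for (count = 0; count < 64; count += 8) {
        uint64_t sel = srcb >> count;
        uint64_t src = (sel & 8) ? srca : dest;
        vdst |= ((src >> ((sel & 7) * 8)) & 0xff) << count;
    }
    return vdst;
}

with a declaration in helper.h:

DEF_HELPER_FLAGS_3(shufflebytes, TCG_CALL_NO_RWG_SE, i64, i64, i64, i64)

and the translate.c side reduced to the qemu_log_mask call plus

    gen_helper_shufflebytes(dest_gr(dc, rdst), load_gr(dc, rdst),
                            load_gr(dc, rsrc), load_gr(dc, rsrcb));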
> +/* FIXME: At present, skip unalignment checking */
> +static void gen_ld(struct DisasContext *dc,
> +                   uint8_t rdst, uint8_t rsrc, TCGMemOp ops)

Alignment checks would never be done here anyway.

Again, pass down the opcode string rather than rebuild.


r~