On 9/22/21 11:09 AM, WANG Xuerui wrote:
Following up on my previous message, I suggest:
+static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
+ tcg_target_long val)
+{
+ if (type == TCG_TYPE_I32) {
+ val = (int32_t)val;
+ }
+
+ /* Single-instruction cases. */
+ tcg_target_long low = sextreg(val, 0, 12);
+ if (low == val) {
+ /* val fits in simm12: addi.w rd, zero, val */
+ tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
+ return;
+ }
+ if (0x800 <= val && val <= 0xfff) {
+ /* val fits in uimm12: ori rd, zero, val */
+ tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
+ return;
+ }
+ /* Test for PC-relative values that can be loaded faster. */
+ intptr_t pc_offset = tcg_pcrel_diff(s, (void *)val);
+ if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) ==
0) {
+ tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2);
+ return;
+ }
/* Handle all 32-bit constants. */
if (val == (int32_t)val) {
tcg_out_opc_lu12i(s, rd, val >> 12);
if (low) {
tcg_out_opc_ori(s, rd, rd, val & 0xfff);
}
return;
}
/* Handle pc-relative values requiring 2 instructions. */
intptr_t pc_lo = sextract64(pc_offset, 0, 12);
intptr_t pc_hi = pc_offset - pc_low;
if (pc_hi == (int32_t)pc_hi) {
tcg_out_opc_pcaddu12i(s, rd, pc_hi >> 12);
tcg_out_opc_addi_d(s, rd, rd, pc_lo);
return;
}
/*
* Choose signed low part if bit 13 is also set,
* which gives us a chance of making more zeros.
* Otherwise, let low be unsigned.
*/
if ((val & 0x1800) != 0x1800) {
low = val & 0xfff;
}
val -= low;
tcg_target_long hi20 = sextract64(val, 12, 20);
tcg_target_long hi32 = sextract64(val, 32, 20);
tcg_target_long hi52 = sextract64(val, 52, 12);
/*
* If we can use the sign-extension of a previous
* operation, suppress higher -1.
*/
if (hi32 < 0 && hi52 == -1) {
hi52 = 0;
}
if (hi20 < 0 && hi32 == -1) {
hi32 = 0;
}
/* Initialize RD with the least non-zero component. */
if (hi20) {
tcg_out_opc_lu12i_w(s, rd, hi20 >> 12);
} else if (hi32) {
/* CU32I_D is modify in place, so RD must be initialized. */
if (low < 0) {
tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, low);
} else {
tcg_out_opc_ori(s, rd, TCG_REG_ZERO, low);
}
low = 0;
} else {
tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, hi52);
hi52 = 0;
}
/* Assume that lu12i + ori are fusable */
if (low > 0) {
tcg_out_opc_ori(s, rd, rd, low);
}
/* Set the high 32 bits */
if (hi32) {
tcg_out_opc_cu32i_d(s, rd, hi32);
}
if (hi52) {
tcg_out_opc_cu52i(s, rd, rd, hi52);
}
/*
* Note that any subtraction must come last,
* because cu32i and cu52i overwrite high bits,
* and we have computed them as val - low.
*/
if (low < 0) {
tcg_out_opc_addi_d(s, rd, rd, low);
}
Untested, and all bugs are mine, of course.
Try "qemu-system-ppc64 -D z -d in_asm,op_opt,out_asm".
You should see some masking constants like
---- 000000001daf2898
and_i64 CA,r9,$0x7fffffffffffffff dead: 2 pref=0xffff
cu52i.d rd, zero, 0x800
addi.d rd, rd, -1
---- 000000001db0775c
mov_i64 r26,$0x300000002 sync: 0 dead: 0 1 pref=0xffff
ori rd, zero, 2
cu32i.d rd, 3