Hi Richard,

On 9/24/21 00:50, Richard Henderson wrote:
On 9/22/21 11:09 AM, WANG Xuerui wrote:

Following up on my previous message, I suggest:

+static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
+                         tcg_target_long val)
+{
+    if (type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    /* Single-instruction cases.  */
+    tcg_target_long low = sextreg(val, 0, 12);
+    if (low == val) {
+        /* val fits in simm12: addi.w rd, zero, val */
+        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
+        return;
+    }
+    if (0x800 <= val && val <= 0xfff) {
+        /* val fits in uimm12: ori rd, zero, val */
+        tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
+        return;
+    }

+    /* Test for PC-relative values that can be loaded faster.  */
+    intptr_t pc_offset = tcg_pcrel_diff(s, (void *)val);
+    if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 0) {
+        tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2);
+        return;
+    }

    /* Handle all 32-bit constants. */
    if (val == (int32_t)val) {
        tcg_out_opc_lu12i(s, rd, val >> 12);
        if (low) {
            tcg_out_opc_ori(s, rd, rd, val & 0xfff);
        }
        return;
    }

    /* Handle pc-relative values requiring 2 instructions. */
    intptr_t pc_lo = sextract64(pc_offset, 0, 12);
    intptr_t pc_hi = pc_offset - pc_low;
    if (pc_hi == (int32_t)pc_hi) {
        tcg_out_opc_pcaddu12i(s, rd, pc_hi >> 12);
        tcg_out_opc_addi_d(s, rd, rd, pc_lo);
        return;
    }

    /*
     * Choose signed low part if bit 13 is also set,
     * which gives us a chance of making more zeros.
     * Otherwise, let low be unsigned.
     */
    if ((val & 0x1800) != 0x1800) {
        low = val & 0xfff;
    }
    val -= low;

    tcg_target_long hi20 = sextract64(val, 12, 20);
    tcg_target_long hi32 = sextract64(val, 32, 20);
    tcg_target_long hi52 = sextract64(val, 52, 12);

    /*
     * If we can use the sign-extension of a previous
     * operation, suppress higher -1.
     */
    if (hi32 < 0 && hi52 == -1) {
        hi52 = 0;
    }
    if (hi20 < 0 && hi32 == -1) {
        hi32 = 0;
    }

    /* Initialize RD with the least non-zero component. */
    if (hi20) {
        tcg_out_opc_lu12i_w(s, rd, hi20 >> 12);
    } else if (hi32) {
        /* CU32I_D is modify in place, so RD must be initialized. */
        if (low < 0) {
            tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, low);
        } else {
            tcg_out_opc_ori(s, rd, TCG_REG_ZERO, low);
        }
        low = 0;
    } else {
        tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, hi52);
        hi52 = 0;
    }

    /* Assume that lu12i + ori are fusable */
    if (low > 0) {
        tcg_out_opc_ori(s, rd, rd, low);
    }

    /* Set the high 32 bits */
    if (hi32) {
        tcg_out_opc_cu32i_d(s, rd, hi32);
    }
    if (hi52) {
        tcg_out_opc_cu52i(s, rd, rd, hi52);
    }

    /*
     * Note that any subtraction must come last,
     * because cu32i and cu52i overwrite high bits,
     * and we have computed them as val - low.
     */
    if (low < 0) {
        tcg_out_opc_addi_d(s, rd, rd, low);
    }

Untested, and all bugs are mine, of course.

Try "qemu-system-ppc64 -D z -d in_asm,op_opt,out_asm".
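You should see some masking constants like the ones below, each shown with the instruction sequence the code above is intended to emit for it: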

 ---- 000000001daf2898
 and_i64 CA,r9,$0x7fffffffffffffff        dead: 2  pref=0xffff

  cu52i.d rd, zero, 0x800
  addi.d  rd, rd, -1

 ---- 000000001db0775c
 mov_i64 r26,$0x300000002                 sync: 0  dead: 0 1 pref=0xffff

  ori     rd, zero, 2
  cu32i   rd, 3

Oops, for some reason I only received this at about 8 pm... I will of course take advantage of Saturday to compare the generated code for these cases, and hopefully incorporate some of the ideas you presented here. Thanks for the detailed reply!

r~

Reply via email to