The expansions that we chose in tcg-op.c may be less than optimal. Delay lowering until the optimize pass, so that we have propagated constants and computed known zero/one masks.
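For reference, the final fallback below lowers an unsupported deposit as the usual mask-shift-merge sequence. A minimal C sketch of that identity (illustration only, not part of the patch; the helper name is hypothetical):

    #include <stdint.h>

    /* Insert the low LEN bits of VAL into BASE at bit offset OFS,
     * mirroring the and/shl/and/or fallback in fold_deposit. */
    static inline uint64_t deposit_ref(uint64_t base, uint64_t val,
                                       int ofs, int len)
    {
        uint64_t len_mask = len < 64 ? ((uint64_t)1 << len) - 1 : ~(uint64_t)0;
        uint64_t tmp = (val & len_mask) << ofs;   /* tmp = arg2 & mask; tmp <<= ofs */
        return (base & ~(len_mask << ofs)) | tmp; /* ret = arg1 & ~(mask << ofs); ret |= tmp */
    }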
Signed-off-by: Richard Henderson <[email protected]>
---
 tcg/optimize.c | 194 +++++++++++++++++++++++++++++++++++++++++++------
 tcg/tcg-op.c   | 113 ++--------------------------
 2 files changed, 178 insertions(+), 129 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 801a0a2c68..890c8068fb 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -1652,12 +1652,17 @@ static bool fold_ctpop(OptContext *ctx, TCGOp *op)
 
 static bool fold_deposit(OptContext *ctx, TCGOp *op)
 {
-    TempOptInfo *t1 = arg_info(op->args[1]);
-    TempOptInfo *t2 = arg_info(op->args[2]);
+    TCGArg ret = op->args[0];
+    TCGArg arg1 = op->args[1];
+    TCGArg arg2 = op->args[2];
     int ofs = op->args[3];
     int len = op->args[4];
-    int width = 8 * tcg_type_size(ctx->type);
-    uint64_t z_mask, o_mask, s_mask;
+    TempOptInfo *t1 = arg_info(arg1);
+    TempOptInfo *t2 = arg_info(arg2);
+    int width;
+    uint64_t z_mask, o_mask, s_mask, type_mask, len_mask;
+    TCGOp *op2;
+    bool valid;
 
     if (ti_is_const(t1) && ti_is_const(t2)) {
         return tcg_opt_gen_movi(ctx, op, op->args[0],
@@ -1665,35 +1670,182 @@ static bool fold_deposit(OptContext *ctx, TCGOp *op)
                                 ti_const_val(t2)));
     }
 
-    /* Inserting a value into zero at offset 0. */
-    if (ti_is_const_val(t1, 0) && ofs == 0) {
-        uint64_t mask = MAKE_64BIT_MASK(0, len);
+    width = 8 * tcg_type_size(ctx->type);
+    type_mask = MAKE_64BIT_MASK(0, width);
+    len_mask = MAKE_64BIT_MASK(0, len);
 
+    /* Inserting all-zero into a value. */
+    if ((t2->z_mask & len_mask) == 0) {
         op->opc = INDEX_op_and;
-        op->args[1] = op->args[2];
-        op->args[2] = arg_new_constant(ctx, mask);
+        op->args[2] = arg_new_constant(ctx, ~(len_mask << ofs));
         return fold_and(ctx, op);
     }
 
-    /* Inserting zero into a value. */
-    if (ti_is_const_val(t2, 0)) {
-        uint64_t mask = deposit64(-1, ofs, len, 0);
-
-        op->opc = INDEX_op_and;
-        op->args[2] = arg_new_constant(ctx, mask);
-        return fold_and(ctx, op);
+    /* Inserting all-one into a value. */
+    if ((t2->o_mask & len_mask) == len_mask) {
+        op->opc = INDEX_op_or;
+        op->args[2] = arg_new_constant(ctx, len_mask << ofs);
+        return fold_or(ctx, op);
     }
 
-    /* The s_mask from the top portion of the deposit is still valid. */
-    if (ofs + len == width) {
-        s_mask = t2->s_mask << ofs;
-    } else {
-        s_mask = t1->s_mask & ~MAKE_64BIT_MASK(0, ofs + len);
+    valid = TCG_TARGET_deposit_valid(ctx->type, ofs, len);
+
+    /* Lower invalid deposit of constant as AND + OR. */
+    if (!valid && ti_is_const(t2)) {
+        uint64_t ins_val = (ti_const_val(t2) & len_mask) << ofs;
+
+        op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+        op2->args[0] = ret;
+        op2->args[1] = arg1;
+        op2->args[2] = arg_new_constant(ctx, ~(len_mask << ofs));
+        fold_and(ctx, op2);
+
+        op->opc = INDEX_op_or;
+        op->args[1] = ret;
+        op->args[2] = arg_new_constant(ctx, ins_val);
+        return fold_or(ctx, op);
     }
 
+    /*
+     * Compute result masks before calling other fold_* subroutines
+     * which could modify the masks of our inputs.
+     */
     z_mask = deposit64(t1->z_mask, ofs, len, t2->z_mask);
     o_mask = deposit64(t1->o_mask, ofs, len, t2->o_mask);
+    if (ofs + len < width) {
+        s_mask = t1->s_mask & ~MAKE_64BIT_MASK(0, ofs + len);
+    } else {
+        s_mask = t2->s_mask << ofs;
+    }
+
+    /* Inserting a value into zero. */
+    if (ti_is_const_val(t1, 0)) {
+        uint64_t need_mask;
+
+        /* Always lower deposit into zero at 0 as AND. */
+        if (ofs == 0) {
+            op->opc = INDEX_op_and;
+            op->args[1] = arg2;
+            op->args[2] = arg_new_constant(ctx, len_mask);
+            return fold_and(ctx, op);
+        }
+
+        /*
+         * If the portion of the value outside len that remains after
+         * shifting is zero, we can elide the mask and just shift.
+         */
+        need_mask = t2->z_mask & ~len_mask;
+        need_mask = (need_mask << ofs) & type_mask;
+        if (!need_mask) {
+            op->opc = INDEX_op_shl;
+            op->args[1] = arg2;
+            op->args[2] = arg_new_constant(ctx, ofs);
+            goto done;
+        }
+
+        /* Lower invalid deposit into zero as AND + SHL or SHL + AND. */
+        if (!valid) {
+            if (TCG_TARGET_extract_valid(ctx->type, 0, ofs + len) &&
+                !TCG_TARGET_extract_valid(ctx->type, 0, len)) {
+                op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
+                op2->args[0] = ret;
+                op2->args[1] = arg2;
+                op2->args[2] = arg_new_constant(ctx, ofs);
+
+                op->opc = INDEX_op_extract;
+                op->args[1] = ret;
+                op->args[2] = 0;
+                op->args[3] = ofs + len;
+                goto done;
+            }
+
+            op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+            op2->args[0] = ret;
+            op2->args[1] = arg2;
+            op2->args[2] = arg_new_constant(ctx, len_mask);
+            fold_and(ctx, op2);
+
+            op->opc = INDEX_op_shl;
+            op->args[1] = ret;
+            op->args[2] = arg_new_constant(ctx, ofs);
+            goto done;
+        }
+    }
+
+    /* After special cases, lower invalid deposit. */
+    if (!valid) {
+        TCGArg tmp;
+        bool has_ext2 = tcg_op_supported(INDEX_op_extract2, ctx->type, 0);
+        bool has_rotl = tcg_op_supported(INDEX_op_rotl, ctx->type, 0);
+
+        /*
+         * ret = arg2:arg1 >> len
+         * ret = rotl(ret, len)
+         */
+        if (ofs == 0 && has_ext2 && has_rotl) {
+            op2 = opt_insert_before(ctx, op, INDEX_op_extract2, 4);
+            op2->args[0] = ret;
+            op2->args[1] = arg1;
+            op2->args[2] = arg2;
+            op2->args[3] = len;
+
+            op->opc = INDEX_op_rotl;
+            op->args[1] = ret;
+            op->args[2] = arg_new_constant(ctx, len);
+            goto done;
+        }
+
+        /*
+         * tmp = arg1 << len
+         * ret = arg2:tmp >> len
+         */
+        if (ofs + len == width && has_ext2) {
+            tmp = ret == arg2 ? arg_new_temp(ctx) : ret;
+
+            op2 = opt_insert_before(ctx, op, INDEX_op_shl, 4);
+            op2->args[0] = tmp;
+            op2->args[1] = arg1;
+            op2->args[2] = arg_new_constant(ctx, len);
+
+            op->opc = INDEX_op_extract2;
+            op->args[0] = ret;
+            op->args[1] = tmp;
+            op->args[2] = arg2;
+            op->args[3] = len;
+            goto done;
+        }
+
+        /*
+         * tmp = arg2 & mask
+         * ret = arg1 & ~(mask << ofs)
+         * tmp = tmp << ofs
+         * ret = ret | tmp
+         */
+        tmp = arg_new_temp(ctx);
+
+        op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+        op2->args[0] = tmp;
+        op2->args[1] = arg2;
+        op2->args[2] = arg_new_constant(ctx, len_mask);
+        fold_and(ctx, op2);
+
+        op2 = opt_insert_before(ctx, op, INDEX_op_shl, 3);
+        op2->args[0] = tmp;
+        op2->args[1] = tmp;
+        op2->args[2] = arg_new_constant(ctx, ofs);
+
+        op2 = opt_insert_before(ctx, op, INDEX_op_and, 3);
+        op2->args[0] = ret;
+        op2->args[1] = arg1;
+        op2->args[2] = arg_new_constant(ctx, ~(len_mask << ofs));
+        fold_and(ctx, op2);
+
+        op->opc = INDEX_op_or;
+        op->args[1] = ret;
+        op->args[2] = tmp;
+    }
+
+ done:
     return fold_masks_zos(ctx, op, z_mask, o_mask, s_mask);
 }
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 8d67acc4fc..96f72ba381 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -876,9 +876,6 @@ void tcg_gen_rotri_i32(TCGv_i32 ret, TCGv_i32 arg1, int32_t arg2)
 void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
                          unsigned int ofs, unsigned int len)
 {
-    uint32_t mask;
-    TCGv_i32 t1;
-
     tcg_debug_assert(ofs < 32);
     tcg_debug_assert(len > 0);
     tcg_debug_assert(len <= 32);
@@ -886,39 +883,9 @@ void tcg_gen_deposit_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2,
 
     if (len == 32) {
         tcg_gen_mov_i32(ret, arg2);
-        return;
-    }
-    if (TCG_TARGET_deposit_valid(TCG_TYPE_I32, ofs, len)) {
-        tcg_gen_op5ii_i32(INDEX_op_deposit, ret, arg1, arg2, ofs, len);
-        return;
-    }
-
-    t1 = tcg_temp_ebb_new_i32();
-
-    if (tcg_op_supported(INDEX_op_extract2, TCG_TYPE_I32, 0)) {
-        if (ofs + len == 32) {
-            tcg_gen_shli_i32(t1, arg1, len);
-            tcg_gen_extract2_i32(ret, t1, arg2, len);
-            goto done;
-        }
-        if (ofs == 0) {
-            tcg_gen_extract2_i32(ret, arg1, arg2, len);
-            tcg_gen_rotli_i32(ret, ret, len);
-            goto done;
-        }
-    }
-
-    mask = (1u << len) - 1;
-    if (ofs + len < 32) {
-        tcg_gen_andi_i32(t1, arg2, mask);
-        tcg_gen_shli_i32(t1, t1, ofs);
     } else {
-        tcg_gen_shli_i32(t1, arg2, ofs);
+        tcg_gen_op5ii_i32(INDEX_op_deposit, ret, arg1, arg2, ofs, len);
     }
-    tcg_gen_andi_i32(ret, arg1, ~(mask << ofs));
-    tcg_gen_or_i32(ret, ret, t1);
- done:
-    tcg_temp_free_i32(t1);
 }
 
 void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
@@ -932,28 +899,10 @@ void tcg_gen_deposit_z_i32(TCGv_i32 ret, TCGv_i32 arg,
     if (ofs + len == 32) {
         tcg_gen_shli_i32(ret, arg, ofs);
     } else if (ofs == 0) {
-        tcg_gen_andi_i32(ret, arg, (1u << len) - 1);
-    } else if (TCG_TARGET_deposit_valid(TCG_TYPE_I32, ofs, len)) {
+        tcg_gen_extract_i32(ret, arg, 0, len);
+    } else {
         TCGv_i32 zero = tcg_constant_i32(0);
         tcg_gen_op5ii_i32(INDEX_op_deposit, ret, zero, arg, ofs, len);
-    } else {
-        /*
-         * To help two-operand hosts we prefer to zero-extend first,
-         * which allows ARG to stay live.
-         */
-        if (TCG_TARGET_extract_valid(TCG_TYPE_I32, 0, len)) {
-            tcg_gen_extract_i32(ret, arg, 0, len);
-            tcg_gen_shli_i32(ret, ret, ofs);
-            return;
-        }
-        /* Otherwise prefer zero-extension over AND for code size. */
-        if (TCG_TARGET_extract_valid(TCG_TYPE_I32, 0, ofs + len)) {
-            tcg_gen_shli_i32(ret, arg, ofs);
-            tcg_gen_extract_i32(ret, ret, 0, ofs + len);
-            return;
-        }
-        tcg_gen_andi_i32(ret, arg, (1u << len) - 1);
-        tcg_gen_shli_i32(ret, ret, ofs);
     }
 }
@@ -2148,9 +2097,6 @@ void tcg_gen_rotri_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2)
 void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
                          unsigned int ofs, unsigned int len)
 {
-    uint64_t mask;
-    TCGv_i64 t1;
-
     tcg_debug_assert(ofs < 64);
     tcg_debug_assert(len > 0);
     tcg_debug_assert(len <= 64);
@@ -2158,40 +2104,9 @@ void tcg_gen_deposit_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2,
 
     if (len == 64) {
         tcg_gen_mov_i64(ret, arg2);
-        return;
-    }
-
-    if (TCG_TARGET_deposit_valid(TCG_TYPE_I64, ofs, len)) {
-        tcg_gen_op5ii_i64(INDEX_op_deposit, ret, arg1, arg2, ofs, len);
-        return;
-    }
-
-    t1 = tcg_temp_ebb_new_i64();
-
-    if (tcg_op_supported(INDEX_op_extract2, TCG_TYPE_I64, 0)) {
-        if (ofs + len == 64) {
-            tcg_gen_shli_i64(t1, arg1, len);
-            tcg_gen_extract2_i64(ret, t1, arg2, len);
-            goto done;
-        }
-        if (ofs == 0) {
-            tcg_gen_extract2_i64(ret, arg1, arg2, len);
-            tcg_gen_rotli_i64(ret, ret, len);
-            goto done;
-        }
-    }
-
-    mask = (1ull << len) - 1;
-    if (ofs + len < 64) {
-        tcg_gen_andi_i64(t1, arg2, mask);
-        tcg_gen_shli_i64(t1, t1, ofs);
     } else {
-        tcg_gen_shli_i64(t1, arg2, ofs);
+        tcg_gen_op5ii_i64(INDEX_op_deposit, ret, arg1, arg2, ofs, len);
     }
-    tcg_gen_andi_i64(ret, arg1, ~(mask << ofs));
-    tcg_gen_or_i64(ret, ret, t1);
- done:
-    tcg_temp_free_i64(t1);
 }
 
 void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
@@ -2206,27 +2121,9 @@ void tcg_gen_deposit_z_i64(TCGv_i64 ret, TCGv_i64 arg,
         tcg_gen_shli_i64(ret, arg, ofs);
     } else if (ofs == 0) {
         tcg_gen_andi_i64(ret, arg, (1ull << len) - 1);
-    } else if (TCG_TARGET_deposit_valid(TCG_TYPE_I64, ofs, len)) {
+    } else {
         TCGv_i64 zero = tcg_constant_i64(0);
         tcg_gen_op5ii_i64(INDEX_op_deposit, ret, zero, arg, ofs, len);
-    } else {
-        /*
-         * To help two-operand hosts we prefer to zero-extend first,
-         * which allows ARG to stay live.
-         */
-        if (TCG_TARGET_extract_valid(TCG_TYPE_I64, 0, len)) {
-            tcg_gen_extract_i64(ret, arg, 0, len);
-            tcg_gen_shli_i64(ret, ret, ofs);
-            return;
-        }
-        /* Otherwise prefer zero-extension over AND for code size. */
-        if (TCG_TARGET_extract_valid(TCG_TYPE_I64, 0, ofs + len)) {
-            tcg_gen_shli_i64(ret, arg, ofs);
-            tcg_gen_extract_i64(ret, ret, 0, ofs + len);
-            return;
-        }
-        tcg_gen_andi_i64(ret, arg, (1ull << len) - 1);
-        tcg_gen_shli_i64(ret, ret, ofs);
     }
 }
-- 
2.43.0
