Hi Peter, just two nits. I'm answering from the tablet, so sorry if this
arrives with strange formatting; hope not.

On Thursday, January 23, 2014, Peter Maydell <peter.mayd...@linaro.org> wrote:

> From: Alex Bennée <alex.ben...@linaro.org>
>
> This implements a subset of the AdvSIMD shift operations (namely all the
> non-saturating, non-narrowing ones). The actual shift generation code
> itself is common for both the scalar and vector cases but wrapped with
> either vector element iteration or the fp reg access.
>
> The rounding operations need to take special care to correctly reflect
> the result of adding rounding bits on high bits as the intermediates do
> not truncate.
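
(Not a nit, just an aside for readers: a plain-C model of why the
intermediates must not truncate. Everything below is illustration only,
assuming a compiler with unsigned __int128 such as gcc or clang on a
64-bit host; urshr64 is a made-up name, not QEMU code.)

    #include <stdint.h>
    #include <stdio.h>

    /* Rounding right shift of a 64-bit lane (URSHR-style), done in
     * 128-bit precision so the carry out of the rounding add is kept.
     * The patch gets the same effect for size == 3 with
     * tcg_gen_add2_i64 plus the shri/shli/or sequence. */
    static uint64_t urshr64(uint64_t src, int shift)
    {
        unsigned __int128 tmp = (unsigned __int128)src
                              + ((uint64_t)1 << (shift - 1));
        return (uint64_t)(tmp >> shift);
    }

    int main(void)
    {
        /* 0xffffffffffffffff rounded-shifted right by 1: the rounding
         * add carries into bit 64, so the result is 0x8000000000000000,
         * not 0 as a truncating 64-bit intermediate would give. */
        printf("%016llx\n", (unsigned long long)urshr64(UINT64_MAX, 1));
        return 0;
    }
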
>
> Signed-off-by: Alex Bennée <alex.ben...@linaro.org>
> Reviewed-by: Richard Henderson <r...@twiddle.net>
> ---
>  target-arm/translate-a64.c | 381 ++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 379 insertions(+), 2 deletions(-)
>
> diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
> index 5eabf24..9eb91fc4 100644
> --- a/target-arm/translate-a64.c
> +++ b/target-arm/translate-a64.c
> @@ -5531,15 +5531,220 @@ static void disas_simd_scalar_pairwise(DisasContext *s, uint32_t insn)
>      unsupported_encoding(s, insn);
>  }
>
> +/*
> + * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
> + *
> + * This code handles the common shifting code and is used by both
> + * the vector and scalar code.
> + */
> +static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
> +                                    TCGv_i64 tcg_rnd, bool accumulate,
> +                                    bool is_u, int size, int shift)
> +{
> +    bool extended_result = false;
> +    bool round = !TCGV_IS_UNUSED_I64(tcg_rnd);
> +    int ext_lshift = 0;
> +    TCGv_i64 tcg_src_hi;
> +
> +    if (round && size == 3) {
> +        extended_result = true;
> +        ext_lshift = 64 - shift;
> +        tcg_src_hi = tcg_temp_new_i64();
> +    } else if (shift == 64) {
> +        if (!accumulate && is_u) {
> +            /* result is zero */
> +            tcg_gen_movi_i64(tcg_res, 0);
> +            return;
> +        }
> +    }
> +
> +    /* Deal with the rounding step */
> +    if (round) {
> +        if (extended_result) {
> +            TCGv_i64 tcg_zero = tcg_const_i64(0);
> +            if (!is_u) {
> +                /* take care of sign extending tcg_res */
> +                tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63);
> +                tcg_gen_add2_i64(tcg_src, tcg_src_hi,
> +                                 tcg_src, tcg_src_hi,
> +                                 tcg_rnd, tcg_zero);
> +            } else {
> +                tcg_gen_add2_i64(tcg_src, tcg_src_hi,
> +                                 tcg_src, tcg_zero,
> +                                 tcg_rnd, tcg_zero);
> +            }
> +            tcg_temp_free_i64(tcg_zero);
> +        } else {
> +            tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd);
> +        }
> +    }
> +
> +    /* Now do the shift right */
> +    if (round && extended_result) {
> +        /* extended case, >64 bit precision required */
> +        if (ext_lshift == 0) {
> +            /* special case, only high bits matter */
> +            tcg_gen_mov_i64(tcg_src, tcg_src_hi);
> +        } else {
> +            tcg_gen_shri_i64(tcg_src, tcg_src, shift);
> +            tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift);
> +            tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi);
> +        }
> +    } else {
> +        if (is_u) {
> +            if (shift == 64) {
> +                /* essentially shifting in 64 zeros */
> +                tcg_gen_movi_i64(tcg_src, 0);
> +            } else {
> +                tcg_gen_shri_i64(tcg_src, tcg_src, shift);
> +            }
> +        } else {
> +            if (shift == 64) {
> +                /* effectively extending the sign-bit */
> +                tcg_gen_sari_i64(tcg_src, tcg_src, 63);
> +            } else {
> +                tcg_gen_sari_i64(tcg_src, tcg_src, shift);
> +            }
> +        }
> +    }
> +
> +    if (accumulate) {
> +        tcg_gen_add_i64(tcg_res, tcg_res, tcg_src);
> +    } else {
> +        tcg_gen_mov_i64(tcg_res, tcg_src);
> +    }
> +
> +    if (extended_result) {
> +        tcg_temp_free(tcg_src_hi);



should this be tcg_temp_free_i64()? tcg_src_hi is a TCGv_i64.

> +    }
> +}
> +
> +/* Common SHL/SLI - Shift left with an optional insert */
> +static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
> +                                 bool insert, int shift)
> +{
> +    if (insert) { /* SLI */
> +        tcg_gen_deposit_i64(tcg_res, tcg_res, tcg_src, shift, 64 - shift);
> +    } else { /* SHL */
> +        tcg_gen_shli_i64(tcg_res, tcg_src, shift);
> +    }
> +}
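
(Again just an aside, not a review comment on the patch: my reading of
the deposit call above as plain C, for a 64-bit element. sli64 is a
made-up name for illustration.)

    #include <stdint.h>

    /* SLI keeps the low 'shift' bits of the destination and inserts the
     * left-shifted source above them; SHL simply overwrites the
     * destination with src << shift. */
    static uint64_t sli64(uint64_t res, uint64_t src, int shift)
    {
        uint64_t keep = shift ? ((1ULL << shift) - 1) : 0;
        return (res & keep) | (src << shift);
    }
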
> +
> +/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
> +static void handle_scalar_simd_shri(DisasContext *s,
> +                                    bool is_u, int immh, int immb,
> +                                    int opcode, int rn, int rd)
> +{
> +    const int size = 3;
> +    int immhb = immh << 3 | immb;
> +    int shift = 2 * (8 << size) - immhb;
> +    bool accumulate = false;
> +    bool round = false;
> +    TCGv_i64 tcg_rn;
> +    TCGv_i64 tcg_rd;
> +    TCGv_i64 tcg_round;
> +
> +    if (!extract32(immh, 3, 1)) {
> +        unallocated_encoding(s);
> +        return;
> +    }
> +
> +    switch (opcode) {
> +    case 0x02: /* SSRA / USRA (accumulate) */
> +        accumulate = true;
> +        break;
> +    case 0x04: /* SRSHR / URSHR (rounding) */
> +        round = true;
> +        break;
> +    case 0x06: /* SRSRA / URSRA (accum + rounding) */
> +        accumulate = round = true;
> +        break;
> +    }
> +
> +    if (round) {
> +        uint64_t round_const = 1ULL << (shift - 1);
> +        tcg_round = tcg_const_i64(round_const);
> +    } else {
> +        TCGV_UNUSED_I64(tcg_round);
> +    }
> +
> +    tcg_rn = read_fp_dreg(s, rn);
> +    tcg_rd = accumulate ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
> +
> +    handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
> +                               accumulate, is_u, size, shift);
> +
> +    write_fp_dreg(s, rd, tcg_rd);
> +
> +    tcg_temp_free_i64(tcg_rn);
> +    tcg_temp_free_i64(tcg_rd);
> +    if (round) {
> +        tcg_temp_free_i64(tcg_round);
> +    }
> +}
> +
> +/* SHL/SLI - Scalar shift left */
> +static void handle_scalar_simd_shli(DisasContext *s, bool insert,
> +                                    int immh, int immb, int opcode,
> +                                    int rn, int rd)
> +{
> +    int size = 32 - clz32(immh) - 1;
> +    int immhb = immh << 3 | immb;
> +    int shift = immhb - (8 << size);
> +    TCGv_i64 tcg_rn = new_tmp_a64(s);
> +    TCGv_i64 tcg_rd = new_tmp_a64(s);
> +
> +    if (!extract32(immh, 3, 1)) {
> +        unallocated_encoding(s);
> +        return;
> +    }
> +
> +    tcg_rn = read_fp_dreg(s, rn);
> +    tcg_rd = insert ? read_fp_dreg(s, rd) : tcg_temp_new_i64();
> +
> +    handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
> +
> +    write_fp_dreg(s, rd, tcg_rd);
> +
> +    tcg_temp_free_i64(tcg_rn);
> +    tcg_temp_free_i64(tcg_rd);
> +
> +    return;


no harm, but maybe remove the redundant return?

> +}
> +
>  /* C3.6.9 AdvSIMD scalar shift by immediate
>   *  31 30  29 28         23 22  19 18  16 15    11  10 9    5 4    0
>   * +-----+---+-------------+------+------+--------+---+------+------+
>   * | 0 1 | U | 1 1 1 1 1 0 | immh | immb | opcode | 1 |  Rn  |  Rd  |
>   * +-----+---+-------------+------+------+--------+---+------+------+
> + *
> + * This is the scalar version so it works on fixed size registers
>   */
>  static void disas_simd_scalar_shift_imm(DisasContext *s, uint32_t insn)
>  {
> -    unsupported_encoding(s, insn);
> +    int rd = extract32(insn, 0, 5);
> +    int rn = extract32(insn, 5, 5);
> +    int opcode = extract32(insn, 11, 5);
> +    int immb = extract32(insn, 16, 3);
> +    int immh = extract32(insn, 19, 4);
> +    bool is_u = extract32(insn, 29, 1);
> +
> +    switch (opcode) {
> +    case 0x00: /* SSHR / USHR */
> +    case 0x02: /* SSRA / USRA */
> +    case 0x04: /* SRSHR / URSHR */
> +    case 0x06: /* SRSRA / URSRA */
> +        handle_scalar_simd_shri(s, is_u, immh, immb, opcode, rn, rd);
> +        break;
> +    case 0x0a: /* SHL / SLI */
> +        handle_scalar_simd_shli(s, is_u, immh, immb, opcode, rn, rd);
> +        break;
> +    default:
> +        unsupported_encoding(s, insn);
> +        break;
> +    }
> +
> +    return;


also here.

>  }
>
>  /* C3.6.10 AdvSIMD scalar three different
> @@ -5845,6 +6050,150 @@ static void disas_simd_scalar_indexed(DisasContext *s, uint32_t insn)
>      unsupported_encoding(s, insn);
>  }
>
> +/* SSHR[RA]/USHR[RA] - Vector shift right (optional rounding/accumulate) */
> +static void handle_vec_simd_shri(DisasContext *s, bool is_q, bool is_u,
> +                                 int immh, int immb, int opcode, int rn, int rd)
> +{
> +    int size = 32 - clz32(immh) - 1;
> +    int immhb = immh << 3 | immb;
> +    int shift = 2 * (8 << size) - immhb;
> +    bool accumulate = false;
> +    bool round = false;
> +    int dsize = is_q ? 128 : 64;
> +    int esize = 8 << size;
> +    int elements = dsize/esize;
> +    TCGMemOp memop = size | (is_u ? 0 : MO_SIGN);
> +    TCGv_i64 tcg_rn = new_tmp_a64(s);
> +    TCGv_i64 tcg_rd = new_tmp_a64(s);
> +    TCGv_i64 tcg_round;
> +    int i;
> +
> +    if (extract32(immh, 3, 1) && !is_q) {
> +        unallocated_encoding(s);
> +        return;
> +    }
> +
> +    if (size > 3 && !is_q) {
> +        unallocated_encoding(s);
> +        return;
> +    }
> +
> +    switch (opcode) {
> +    case 0x02: /* SSRA / USRA (accumulate) */
> +        accumulate = true;
> +        break;
> +    case 0x04: /* SRSHR / URSHR (rounding) */
> +        round = true;
> +        break;
> +    case 0x06: /* SRSRA / URSRA (accum + rounding) */
> +        accumulate = round = true;
> +        break;
> +    }
> +
> +    if (round) {
> +        uint64_t round_const = 1ULL << (shift - 1);
> +        tcg_round = tcg_const_i64(round_const);
> +    } else {
> +        TCGV_UNUSED_I64(tcg_round);
> +    }
> +
> +    for (i = 0; i < elements; i++) {
> +        read_vec_element(s, tcg_rn, rn, i, memop);
> +        if (accumulate) {
> +            read_vec_element(s, tcg_rd, rd, i, memop);
> +        }
> +
> +        handle_shri_with_rndacc(tcg_rd, tcg_rn, tcg_round,
> +                                accumulate, is_u, size, shift);
> +
> +        write_vec_element(s, tcg_rd, rd, i, size);
> +    }
> +
> +    if (!is_q) {
> +        clear_vec_high(s, rd);
> +    }
> +
> +    if (round) {
> +        tcg_temp_free_i64(tcg_round);
> +    }
> +}
> +
> +/* SHL/SLI - Vector shift left */
> +static void handle_vec_simd_shli(DisasContext *s, bool is_q, bool insert,
> +                                int immh, int immb, int opcode, int rn, int rd)
> +{
> +    int size = 32 - clz32(immh) - 1;
> +    int immhb = immh << 3 | immb;
> +    int shift = immhb - (8 << size);
> +    int dsize = is_q ? 128 : 64;
> +    int esize = 8 << size;
> +    int elements = dsize/esize;
> +    TCGv_i64 tcg_rn = new_tmp_a64(s);
> +    TCGv_i64 tcg_rd = new_tmp_a64(s);
> +    int i;
> +
> +    if (extract32(immh, 3, 1) && !is_q) {
> +        unallocated_encoding(s);
> +        return;
> +    }
> +
> +    if (size > 3 && !is_q) {
> +        unallocated_encoding(s);
> +        return;
> +    }
> +
> +    for (i = 0; i < elements; i++) {
> +        read_vec_element(s, tcg_rn, rn, i, size);
> +        if (insert) {
> +            read_vec_element(s, tcg_rd, rd, i, size);
> +        }
> +
> +        handle_shli_with_ins(tcg_rd, tcg_rn, insert, shift);
> +
> +        write_vec_element(s, tcg_rd, rd, i, size);
> +    }
> +
> +    if (!is_q) {
> +        clear_vec_high(s, rd);
> +    }
> +
> +    return;


also here.

Ciao

Claudio


> +}
> +
> +/* USHLL/SHLL - Vector shift left with widening */
> +static void handle_vec_simd_wshli(DisasContext *s, bool is_q, bool is_u,
> +                                 int immh, int immb, int opcode, int rn, int rd)
> +{
> +    int size = 32 - clz32(immh) - 1;
> +    int immhb = immh << 3 | immb;
> +    int shift = immhb - (8 << size);
> +    int dsize = 64;
> +    int esize = 8 << size;
> +    int elements = dsize/esize;
> +    TCGv_i64 tcg_rn = new_tmp_a64(s);
> +    TCGv_i64 tcg_rd = new_tmp_a64(s);
> +    int i;
> +
> +    if (size >= 3) {
> +        unallocated_encoding(s);
> +        return;
> +    }
> +
> +    /* For the LL variants the store is larger than the load,
> +     * so if rd == rn we would overwrite parts of our input.
> +     * So load everything right now and use shifts in the main loop.
> +     */
> +    read_vec_element(s, tcg_rn, rn, is_q ? 1 : 0, MO_64);
> +
> +    for (i = 0; i < elements; i++) {
> +        tcg_gen_shri_i64(tcg_rd, tcg_rn, i * esize);
> +        ext_and_shift_reg(tcg_rd, tcg_rd, size | (!is_u << 2), 0);
> +        tcg_gen_shli_i64(tcg_rd, tcg_rd, shift);
> +        write_vec_element(s, tcg_rd, rd, i, size + 1);
> +    }
> +}
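
(One more aside rather than a nit: the rd == rn comment above is the
interesting bit, so here is a plain-C sketch of the same "read the
whole source lane first" idea. Illustration only, hard-coded to a
16-bit to 32-bit USHLL; the names are made up.)

    #include <stdint.h>
    #include <string.h>

    /* Widening left shift where dst may alias src: snapshot the 64-bit
     * source lane first, then extract, widen and shift each element,
     * much as the loop above does with tcg_gen_shri_i64,
     * ext_and_shift_reg and tcg_gen_shli_i64. */
    static void ushll_16to32(uint32_t dst[4], const uint16_t src[4], int shift)
    {
        uint64_t lane;
        memcpy(&lane, src, sizeof(lane));   /* the single 64-bit read */
        for (int i = 0; i < 4; i++) {
            uint32_t e = (uint32_t)((lane >> (i * 16)) & 0xffff);
            dst[i] = e << shift;
        }
    }
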
> +
> +
>  /* C3.6.14
> --
> 1.8.5
>
>
