ppc: Optimize emulation of vupkhpx and vupklpx instructions

Aleksandar Markovic Sat, 19 Oct 2019 13:07:33 -0700

On Thursday, October 17, 2019, Stefan Brankovic <stefan.branko...@rt-rk.com>
wrote:


> 'trans_vupkpx' function implements both vupkhpx and vupklpx instructions,
> with argument 'high' determining which instruction is processed. The
> instructions are implemented in two 'for' loops. The outer 'for' loop repeats
> the unpacking two times, since both doubleword elements of the destination
> register are formed the same way. It also stores the result of every
> iteration in a temporary register, which is later transferred to the
> destination register. The inner 'for' loop does the unpacking of pixels and
> forms the resulting doubleword 32 by 32 bits.
>
> Signed-off-by: Stefan Brankovic <stefan.branko...@rt-rk.com>
> ---
>  target/ppc/helper.h                 |  2 -
>  target/ppc/int_helper.c             | 20 --------
>  target/ppc/translate/vmx-impl.inc.c | 91 ++++++++++++++++++++++++++++++
> ++++++-
>  3 files changed, 89 insertions(+), 24 deletions(-)
>
> diff --git a/target/ppc/helper.h b/target/ppc/helper.h
> index b489b38..fd06b56 100644
> --- a/target/ppc/helper.h
> +++ b/target/ppc/helper.h
> @@ -233,8 +233,6 @@ DEF_HELPER_2(vextsh2d, void, avr, avr)
>  DEF_HELPER_2(vextsw2d, void, avr, avr)
>  DEF_HELPER_2(vnegw, void, avr, avr)
>  DEF_HELPER_2(vnegd, void, avr, avr)
> -DEF_HELPER_2(vupkhpx, void, avr, avr)
> -DEF_HELPER_2(vupklpx, void, avr, avr)
>  DEF_HELPER_2(vupkhsb, void, avr, avr)
>  DEF_HELPER_2(vupkhsh, void, avr, avr)
>  DEF_HELPER_2(vupkhsw, void, avr, avr)
> diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
> index f910c11..9ee667d 100644
> --- a/target/ppc/int_helper.c
> +++ b/target/ppc/int_helper.c
> @@ -1737,26 +1737,6 @@ void helper_vsum4ubs(CPUPPCState *env, ppc_avr_t
> *r, ppc_avr_t *a, ppc_avr_t *b)
>  #define UPKHI 0
>  #define UPKLO 1
>  #endif
> -#define VUPKPX(suffix, hi)                                              \
> -    void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)                \
> -    {                                                                   \
> -        int i;                                                          \
> -        ppc_avr_t result;                                               \
> -                                                                        \
> -        for (i = 0; i < ARRAY_SIZE(r->u32); i++) {                      \
> -            uint16_t e = b->u16[hi ? i : i + 4];                        \
> -            uint8_t a = (e >> 15) ? 0xff : 0;                           \
> -            uint8_t r = (e >> 10) & 0x1f;                               \
> -            uint8_t g = (e >> 5) & 0x1f;                                \
> -            uint8_t b = e & 0x1f;                                       \
> -                                                                        \
> -            result.u32[i] = (a << 24) | (r << 16) | (g << 8) | b;       \
> -        }                                                               \
> -        *r = result;                                                    \
> -    }
> -VUPKPX(lpx, UPKLO)
> -VUPKPX(hpx, UPKHI)
> -#undef VUPKPX
>
>  #define VUPK(suffix, unpacked, packee, hi)                              \
>      void helper_vupk##suffix(ppc_avr_t *r, ppc_avr_t *b)                \
> diff --git a/target/ppc/translate/vmx-impl.inc.c
> b/target/ppc/translate/vmx-impl.inc.c
> index 3550ffa..09d80d6 100644
> --- a/target/ppc/translate/vmx-impl.inc.c
> +++ b/target/ppc/translate/vmx-impl.inc.c
> @@ -1031,6 +1031,95 @@ static void trans_vclzd(DisasContext *ctx)
>      tcg_temp_free_i64(avr);
>  }
>
> +/*
> + * vupkhpx VRT,VRB - Vector Unpack High Pixel
> + * vupklpx VRT,VRB - Vector Unpack Low Pixel
> + *
> + * Unpacks 4 pixels coded in 1-5-5-5 pattern from high/low doubleword element
> + * of source register into contiguous array of bits in the destination register.
> + * Argument 'high' determines if high or low doubleword element of source
> + * register is processed.
> + */
> +static void trans_vupkpx(DisasContext *ctx, int high)
> +{
> +    int VT = rD(ctx->opcode);
> +    int VB = rB(ctx->opcode);
> +    TCGv_i64 tmp = tcg_temp_new_i64();
> +    TCGv_i64 avr = tcg_temp_new_i64();
> +    TCGv_i64 result = tcg_temp_new_i64();
> +    TCGv_i64 result1 = tcg_temp_new_i64();
> +    TCGv_i64 result2 = tcg_temp_new_i64();
> +    int64_t mask1 = 0x1fULL;
> +    int64_t mask2 = 0x1fULL << 8;
> +    int64_t mask3 = 0x1fULL << 16;
> +    int64_t mask4 = 0xffULL << 56;
> +    int i, j;
> +
> +    if (high == 1) {
> +        get_avr64(avr, VB, true);
> +    } else {
> +        get_avr64(avr, VB, false);
> +    }
> +
> +    tcg_gen_movi_i64(result, 0x0ULL);
> +    for (i = 0; i < 2; i++) {
> +        for (j = 0; j < 2; j++) {
> +            tcg_gen_shli_i64(tmp, avr, (j * 16));
> +            tcg_gen_andi_i64(tmp, tmp, mask1 << (j * 32));
> +            tcg_gen_or_i64(result, result, tmp);
> +
> +            tcg_gen_shli_i64(tmp, avr, 3 + (j * 16));
> +            tcg_gen_andi_i64(tmp, tmp, mask2 << (j * 32));
> +            tcg_gen_or_i64(result, result, tmp);
> +
> +            tcg_gen_shli_i64(tmp, avr, 6 + (j * 16));
> +            tcg_gen_andi_i64(tmp, tmp, mask3 << (j * 32));
> +            tcg_gen_or_i64(result, result, tmp);
> +
> +            tcg_gen_shri_i64(tmp, avr, (j * 16));
> +            tcg_gen_ext16s_i64(tmp, tmp);
> +            tcg_gen_andi_i64(tmp, tmp, mask4);
> +            tcg_gen_shri_i64(tmp, tmp, (32 * (1 - j)));
> +            tcg_gen_or_i64(result, result, tmp);
> +        }
> +        if (i == 0) {
> +            tcg_gen_mov_i64(result1, result);
> +            tcg_gen_movi_i64(result, 0x0ULL);
> +            tcg_gen_shri_i64(avr, avr, 32);
> +        }
> +        if (i == 1) {
> +            tcg_gen_mov_i64(result2, result);
> +        }
> +    }
> +
> +    set_avr64(VT, result1, false);
> +    set_avr64(VT, result2, true);
> +
> +    tcg_temp_free_i64(tmp);
> +    tcg_temp_free_i64(avr);
> +    tcg_temp_free_i64(result);
> +    tcg_temp_free_i64(result1);
> +    tcg_temp_free_i64(result2);
> +}
> +
> +static void gen_vupkhpx(DisasContext *ctx)
> +{
> +    if (unlikely(!ctx->altivec_enabled)) {
> +        gen_exception(ctx, POWERPC_EXCP_VPU);
> +        return;
> +    }
> +    trans_vupkpx(ctx, 1);
> +}
> +
> +static void gen_vupklpx(DisasContext *ctx)
> +{
> +    if (unlikely(!ctx->altivec_enabled)) {
> +        gen_exception(ctx, POWERPC_EXCP_VPU);
> +        return;
> +    }
> +    trans_vupkpx(ctx, 0);
> +}
> +
>  GEN_VXFORM(vmuloub, 4, 0);
>  GEN_VXFORM(vmulouh, 4, 1);
>  GEN_VXFORM(vmulouw, 4, 2);
> @@ -1348,8 +1437,6 @@ GEN_VXFORM_NOA(vupkhsw, 7, 25);
>  GEN_VXFORM_NOA(vupklsb, 7, 10);
>  GEN_VXFORM_NOA(vupklsh, 7, 11);
>  GEN_VXFORM_NOA(vupklsw, 7, 27);
> -GEN_VXFORM_NOA(vupkhpx, 7, 13);
> -GEN_VXFORM_NOA(vupklpx, 7, 15);


There is an inconsistency here compared to your previous patches. There should
be lines:

GEN_VXFORM_TRANS(vupkhpx, 7, 13);
GEN_VXFORM_TRANS(vupklpx, 7, 15);

and there should be two new functions trans_vupkhpx() and trans_vupklpx()
defined as thin wrappers around trans_vupkpx(). gen_vupkhpx() and
gen_vupklpx() should be deleted.


>  GEN_VXFORM_NOA_ENV(vrefp, 5, 4);
>  GEN_VXFORM_NOA_ENV(vrsqrtefp, 5, 5);
>  GEN_VXFORM_NOA_ENV(vexptefp, 5, 6);
> --
> 2.7.4
>
>
>

Re: [PATCH v7 3/3] target/ppc: Optimize emulation of vupkhpx and vupklpx instructions

Reply via email to