On Thursday, October 17, 2019, Stefan Brankovic <stefan.branko...@rt-rk.com> wrote:
> Optimize altivec instruction vpkpx (Vector Pack Pixel). > Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register) > into contigous array of bits in the destination register. > > In each iteration of outer loop, the instruction is to be done with > the 6-5-5 pack for 2 pixels of each doubleword element of each > source register. The first thing to be done in outer loop is > choosing which doubleword element of which register is to be used > in current iteration and it is to be placed in avr variable. The > next step is to perform 6-5-5 pack of pixels on avr variable in inner > for loop(2 iterations, 1 for each pixel) and save result in tmp variable. > In the end of outer for loop, the result is merged in variable called > result and saved in appropriate doubleword element of vD if the whole > doubleword is finished(every second iteration). The outer loop has 4 > iterations. > > Check spelling. Use single quotation marks around variable names and other code elements, e.g. avr variable -> variable 'avr' (and several similar instances) > Signed-off-by: Stefan Brankovic <stefan.branko...@rt-rk.com> > --- > target/ppc/helper.h | 1 - > target/ppc/int_helper.c | 21 -------- > target/ppc/translate/vmx-impl.inc.c | 99 ++++++++++++++++++++++++++++++ > ++++++- > 3 files changed, 98 insertions(+), 23 deletions(-) > > diff --git a/target/ppc/helper.h b/target/ppc/helper.h > index 281e54f..b489b38 100644 > --- a/target/ppc/helper.h > +++ b/target/ppc/helper.h > @@ -258,7 +258,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr) > DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr) > DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr) > DEF_HELPER_4(vpkudum, void, env, avr, avr, avr) > -DEF_HELPER_3(vpkpx, void, avr, avr, avr) > DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr) > DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr) > DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr) > diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c > index 
cd00f5e..f910c11 100644 > --- a/target/ppc/int_helper.c > +++ b/target/ppc/int_helper.c > @@ -1262,27 +1262,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, > ppc_avr_t *b) > #else > #define PKBIG 0 > #endif > -void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) > -{ > - int i, j; > - ppc_avr_t result; > -#if defined(HOST_WORDS_BIGENDIAN) > - const ppc_avr_t *x[2] = { a, b }; > -#else > - const ppc_avr_t *x[2] = { b, a }; > -#endif > - > - VECTOR_FOR_INORDER_I(i, u64) { > - VECTOR_FOR_INORDER_I(j, u32) { > - uint32_t e = x[i]->u32[j]; > - > - result.u16[4 * i + j] = (((e >> 9) & 0xfc00) | > - ((e >> 6) & 0x3e0) | > - ((e >> 3) & 0x1f)); > - } > - } > - *r = result; > -} > > #define VPK(suffix, from, to, cvt, dosat) \ > void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \ > diff --git a/target/ppc/translate/vmx-impl.inc.c > b/target/ppc/translate/vmx-impl.inc.c > index a428ef3..3550ffa 100644 > --- a/target/ppc/translate/vmx-impl.inc.c > +++ b/target/ppc/translate/vmx-impl.inc.c > @@ -579,6 +579,103 @@ static void trans_lvsr(DisasContext *ctx) > } > > /* > + * vpkpx VRT,VRA,VRB - Vector Pack Pixel > + * > + * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source > register) > + * into contigous array of bits in the destination register. > + */ > +static void trans_vpkpx(DisasContext *ctx) > +{ > + int VT = rD(ctx->opcode); > + int VA = rA(ctx->opcode); > + int VB = rB(ctx->opcode); > + TCGv_i64 tmp = tcg_temp_new_i64(); > + TCGv_i64 shifted = tcg_temp_new_i64(); > + TCGv_i64 avr = tcg_temp_new_i64(); > + TCGv_i64 result = tcg_temp_new_i64(); > + TCGv_i64 result1 = tcg_temp_new_i64(); > + TCGv_i64 result2 = tcg_temp_new_i64(); 'result2' is not needed, 'result' can be used in the final half instead, all the way up to the copying to the destination. 
> + int64_t mask1 = 0x1fULL; > + int64_t mask2 = 0x1fULL << 5; > + int64_t mask3 = 0x3fULL << 10; > + int i, j; > + /* > + * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword > + * element of each source register. > + */ > + for (i = 0; i < 4; i++) { > + switch (i) { > + case 0: > + /* > + * Get high doubleword of vA to perform 6-5-5 pack of pixels > + * 1 and 2. > + */ > + get_avr64(avr, VA, true); > + tcg_gen_movi_i64(result, 0x0ULL); > + break; > + case 1: > + /* > + * Get low doubleword of vA to perform 6-5-5 pack of pixels > + * 3 and 4. > + */ > + get_avr64(avr, VA, false); > + break; > + case 2: > + /* > + * Get high doubleword of vB to perform 6-5-5 pack of pixels > + * 5 and 6. > + */ > + get_avr64(avr, VB, true); > + tcg_gen_movi_i64(result, 0x0ULL); > + break; > + case 3: > + /* > + * Get low doubleword of vB to perform 6-5-5 pack of pixels > + * 7 and 8. > + */ > + get_avr64(avr, VB, false); > + break; > + } > + /* Perform the packing for 2 pixels(each iteration for 1). */ > + tcg_gen_movi_i64(tmp, 0x0ULL); > + for (j = 0; j < 2; j++) { > + tcg_gen_shri_i64(shifted, avr, (j * 16 + 3)); > + tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16)); > + tcg_gen_or_i64(tmp, tmp, shifted); > + > + tcg_gen_shri_i64(shifted, avr, (j * 16 + 6)); > + tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16)); > + tcg_gen_or_i64(tmp, tmp, shifted); > + > + tcg_gen_shri_i64(shifted, avr, (j * 16 + 9)); > + tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16)); > + tcg_gen_or_i64(tmp, tmp, shifted); > + } > + if ((i == 0) || (i == 2)) { > + tcg_gen_shli_i64(tmp, tmp, 32); > + } > + tcg_gen_or_i64(result, result, tmp); > + if (i == 1) { > + /* Place packed pixels 1:4 to high doubleword of vD. */ > + tcg_gen_mov_i64(result1, result); > + } > + if (i == 3) { > + /* Place packed pixels 5:8 to low doubleword of vD. */ > + tcg_gen_mov_i64(result2, result); > + } If 'result2' is removed, the last tcg movement is not needed... 
> + } > + set_avr64(VT, result1, true); > + set_avr64(VT, result2, false); ... and here 'result' should be used instead of 'result2'. A. + > + tcg_temp_free_i64(tmp); > + tcg_temp_free_i64(shifted); > + tcg_temp_free_i64(avr); > + tcg_temp_free_i64(result); > + tcg_temp_free_i64(result1); > + tcg_temp_free_i64(result2); > +} > + > +/* > * vsl VRT,VRA,VRB - Vector Shift Left > * > * Shifting left 128 bit value of vA by value specified in bits 125-127 > of vB. > @@ -1063,7 +1160,7 @@ GEN_VXFORM_ENV(vpksdus, 7, 21); > GEN_VXFORM_ENV(vpkshss, 7, 6); > GEN_VXFORM_ENV(vpkswss, 7, 7); > GEN_VXFORM_ENV(vpksdss, 7, 23); > -GEN_VXFORM(vpkpx, 7, 12); > +GEN_VXFORM_TRANS(vpkpx, 7, 12); > GEN_VXFORM_ENV(vsum4ubs, 4, 24); > GEN_VXFORM_ENV(vsum4sbs, 4, 28); > GEN_VXFORM_ENV(vsum4shs, 4, 25); > -- > 2.7.4 > > >