On Thursday, October 17, 2019, Stefan Brankovic <stefan.branko...@rt-rk.com> wrote:
> Optimize altivec instruction vpkpx (Vector Pack Pixel). > Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source register) > into contigous array of bits in the destination register. > > In each iteration of outer loop, the instruction is to be done with > the 6-5-5 pack for 2 pixels of each doubleword element of each > source register. The first thing to be done in outer loop is > choosing which doubleword element of which register is to be used > in current iteration and it is to be placed in avr variable. The > next step is to perform 6-5-5 pack of pixels on avr variable in inner > for loop(2 iterations, 1 for each pixel) and save result in tmp variable. > In the end of outer for loop, the result is merged in variable called > result and saved in appropriate doubleword element of vD if the whole > doubleword is finished(every second iteration). The outer loop has 4 > iterations. > > Check spelling. Use single quotation marks around variable names and other code elements, e.g. avr variable -> variable 'avr' (and several similar instances) > Signed-off-by: Stefan Brankovic <stefan.branko...@rt-rk.com> > --- > target/ppc/helper.h | 1 - > target/ppc/int_helper.c | 21 -------- > target/ppc/translate/vmx-impl.inc.c | 99 ++++++++++++++++++++++++++++++ > ++++++- > 3 files changed, 98 insertions(+), 23 deletions(-) > > diff --git a/target/ppc/helper.h b/target/ppc/helper.h > index 281e54f..b489b38 100644 > --- a/target/ppc/helper.h > +++ b/target/ppc/helper.h > @@ -258,7 +258,6 @@ DEF_HELPER_4(vpkudus, void, env, avr, avr, avr) > DEF_HELPER_4(vpkuhum, void, env, avr, avr, avr) > DEF_HELPER_4(vpkuwum, void, env, avr, avr, avr) > DEF_HELPER_4(vpkudum, void, env, avr, avr, avr) > -DEF_HELPER_3(vpkpx, void, avr, avr, avr) > DEF_HELPER_5(vmhaddshs, void, env, avr, avr, avr, avr) > DEF_HELPER_5(vmhraddshs, void, env, avr, avr, avr, avr) > DEF_HELPER_5(vmsumuhm, void, env, avr, avr, avr, avr) > diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c > index 
cd00f5e..f910c11 100644 > --- a/target/ppc/int_helper.c > +++ b/target/ppc/int_helper.c > @@ -1262,27 +1262,6 @@ void helper_vpmsumd(ppc_avr_t *r, ppc_avr_t *a, > ppc_avr_t *b) > #else > #define PKBIG 0 > #endif > -void helper_vpkpx(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) > -{ > - int i, j; > - ppc_avr_t result; > -#if defined(HOST_WORDS_BIGENDIAN) > - const ppc_avr_t *x[2] = { a, b }; > -#else > - const ppc_avr_t *x[2] = { b, a }; > -#endif > - > - VECTOR_FOR_INORDER_I(i, u64) { > - VECTOR_FOR_INORDER_I(j, u32) { > - uint32_t e = x[i]->u32[j]; > - > - result.u16[4 * i + j] = (((e >> 9) & 0xfc00) | > - ((e >> 6) & 0x3e0) | > - ((e >> 3) & 0x1f)); > - } > - } > - *r = result; > -} > > #define VPK(suffix, from, to, cvt, dosat) \ > void helper_vpk##suffix(CPUPPCState *env, ppc_avr_t *r, \ > diff --git a/target/ppc/translate/vmx-impl.inc.c > b/target/ppc/translate/vmx-impl.inc.c > index a428ef3..3550ffa 100644 > --- a/target/ppc/translate/vmx-impl.inc.c > +++ b/target/ppc/translate/vmx-impl.inc.c > @@ -579,6 +579,103 @@ static void trans_lvsr(DisasContext *ctx) > } > > /* > + * vpkpx VRT,VRA,VRB - Vector Pack Pixel > + * > + * Rearranges 8 pixels coded in 6-5-5 pattern (4 from each source > register) > + * into contigous array of bits in the destination register. > + */ > +static void trans_vpkpx(DisasContext *ctx) > +{ > + int VT = rD(ctx->opcode); > + int VA = rA(ctx->opcode); > + int VB = rB(ctx->opcode); > + TCGv_i64 tmp = tcg_temp_new_i64(); > + TCGv_i64 shifted = tcg_temp_new_i64(); > + TCGv_i64 avr = tcg_temp_new_i64(); > + TCGv_i64 result = tcg_temp_new_i64(); > + TCGv_i64 result1 = tcg_temp_new_i64(); > + TCGv_i64 result2 = tcg_temp_new_i64(); 'result2' is not needed, 'result' can be used in the final half instead, all the way up to the copying to the destination. 
> + int64_t mask1 = 0x1fULL; > + int64_t mask2 = 0x1fULL << 5; > + int64_t mask3 = 0x3fULL << 10; > + int i, j; > + /* > + * In each iteration do the 6-5-5 pack for 2 pixels of each doubleword > + * element of each source register. > + */ > + for (i = 0; i < 4; i++) { > + switch (i) { > + case 0: > + /* > + * Get high doubleword of vA to perform 6-5-5 pack of pixels > + * 1 and 2. > + */ > + get_avr64(avr, VA, true); > + tcg_gen_movi_i64(result, 0x0ULL); > + break; > + case 1: > + /* > + * Get low doubleword of vA to perform 6-5-5 pack of pixels > + * 3 and 4. > + */ > + get_avr64(avr, VA, false); > + break; > + case 2: > + /* > + * Get high doubleword of vB to perform 6-5-5 pack of pixels > + * 5 and 6. > + */ > + get_avr64(avr, VB, true); > + tcg_gen_movi_i64(result, 0x0ULL); > + break; > + case 3: > + /* > + * Get low doubleword of vB to perform 6-5-5 pack of pixels > + * 7 and 8. > + */ > + get_avr64(avr, VB, false); > + break; > + } > + /* Perform the packing for 2 pixels(each iteration for 1). */ > + tcg_gen_movi_i64(tmp, 0x0ULL); > + for (j = 0; j < 2; j++) { > + tcg_gen_shri_i64(shifted, avr, (j * 16 + 3)); > + tcg_gen_andi_i64(shifted, shifted, mask1 << (j * 16)); > + tcg_gen_or_i64(tmp, tmp, shifted); > + > + tcg_gen_shri_i64(shifted, avr, (j * 16 + 6)); > + tcg_gen_andi_i64(shifted, shifted, mask2 << (j * 16)); > + tcg_gen_or_i64(tmp, tmp, shifted); > + > + tcg_gen_shri_i64(shifted, avr, (j * 16 + 9)); > + tcg_gen_andi_i64(shifted, shifted, mask3 << (j * 16)); > + tcg_gen_or_i64(tmp, tmp, shifted); > + } > + if ((i == 0) || (i == 2)) { > + tcg_gen_shli_i64(tmp, tmp, 32); > + } > + tcg_gen_or_i64(result, result, tmp); > + if (i == 1) { > + /* Place packed pixels 1:4 to high doubleword of vD. */ > + tcg_gen_mov_i64(result1, result); > + } > + if (i == 3) { > + /* Place packed pixels 5:8 to low doubleword of vD. */ > + tcg_gen_mov_i64(result2, result); > + } If 'result2' is removed, the last tcg movement is not needed... 
> + } > + set_avr64(VT, result1, true); > + set_avr64(VT, result2, false); ... and here 'result' should be used instead of 'result2'. A. + > + tcg_temp_free_i64(tmp); > + tcg_temp_free_i64(shifted); > + tcg_temp_free_i64(avr); > + tcg_temp_free_i64(result); > + tcg_temp_free_i64(result1); > + tcg_temp_free_i64(result2); > +} > + > +/* > * vsl VRT,VRA,VRB - Vector Shift Left > * > * Shifting left 128 bit value of vA by value specified in bits 125-127 > of vB. > @@ -1063,7 +1160,7 @@ GEN_VXFORM_ENV(vpksdus, 7, 21); > GEN_VXFORM_ENV(vpkshss, 7, 6); > GEN_VXFORM_ENV(vpkswss, 7, 7); > GEN_VXFORM_ENV(vpksdss, 7, 23); > -GEN_VXFORM(vpkpx, 7, 12); > +GEN_VXFORM_TRANS(vpkpx, 7, 12); > GEN_VXFORM_ENV(vsum4ubs, 4, 24); > GEN_VXFORM_ENV(vsum4sbs, 4, 28); > GEN_VXFORM_ENV(vsum4shs, 4, 25); > -- > 2.7.4 > > >