On 2016-12-01 11:26:56 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
> 
> This reduces the code size of libavcodec/arm/vp9itxfm_neon.o from
> 15324 to 12388 bytes.
> 
> This gives a small slowdown of a few tens of cycles, up to around
> 150 cycles for the full case of the largest transform, but makes
> it more feasible to add more optimized versions of these transforms.
> 
> Before:                              Cortex A7       A8       A9      A53
> vp9_inv_dct_dct_16x16_sub4_add_neon:    2063.4   1516.0   1719.5   1245.1
> vp9_inv_dct_dct_16x16_sub16_add_neon:   3279.3   2454.5   2525.2   1982.3
> vp9_inv_dct_dct_32x32_sub4_add_neon:   10750.0   7955.4   8525.6   6754.2
> vp9_inv_dct_dct_32x32_sub32_add_neon:  18574.0  17108.4  14216.7  12010.2
> 
> After:
> vp9_inv_dct_dct_16x16_sub4_add_neon:    2060.8   1608.5   1735.7   1262.0
> vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.2   2443.5   2546.1   1999.5
> vp9_inv_dct_dct_32x32_sub4_add_neon:   10682.0   8043.8   8581.3   6810.1
> vp9_inv_dct_dct_32x32_sub32_add_neon:  18522.4  17277.4  14286.7  12087.9
> ---
>  libavcodec/arm/vp9itxfm_neon.S | 43 +++++++++++++++++++++++++-----------------
>  1 file changed, 26 insertions(+), 17 deletions(-)
> 
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> index 5abe435..22e63e5 100644
> --- a/libavcodec/arm/vp9itxfm_neon.S
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -534,7 +534,7 @@ function idct16x16_dc_add_neon
>  endfunc
>  .ltorg
>  
> -.macro idct16
> +function idct16
>          mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  
> d24 = t1a
>          mbutterfly      d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = 
> t3a
>          mbutterfly      d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = 
> t7a
> @@ -580,9 +580,10 @@ endfunc
>          vmov            d4,  d21                         @ d4  = t10a
>          butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 
> = out[11]
>          butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 
> = out[10]
> -.endm
> +        bx              lr
> +endfunc
>  
> -.macro iadst16
> +function iadst16
>          movrel          r12, iadst16_coeffs
>          vld1.16         {q0-q1}, [r12,:128]
>  
> @@ -653,7 +654,8 @@ endfunc
>  
>          vmov            d16, d2
>          vmov            d30, d4
> -.endm
> +        bx              lr
> +endfunc
>  
>  .macro itxfm16_1d_funcs txfm
>  @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> @@ -662,6 +664,8 @@ endfunc
>  @ r1 = slice offset
>  @ r2 = src
>  function \txfm\()16_1d_4x16_pass1_neon
> +        push            {lr}
> +
>          mov             r12, #32
>          vmov.s16        q2, #0
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> @@ -669,7 +673,7 @@ function \txfm\()16_1d_4x16_pass1_neon
>          vst1.16         {d4},  [r2,:64], r12
>  .endr
>  
> -        \txfm\()16
> +        bl              \txfm\()16
>  
>          @ Do four 4x4 transposes. Originally, d16-d31 contain the
>          @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> @@ -682,7 +686,7 @@ function \txfm\()16_1d_4x16_pass1_neon
>  .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
>          vst1.16         {d\i}, [r0,:64]!
>  .endr
> -        bx              lr
> +        pop             {pc}
>  1:
>          @ Special case: For the last input column (r1 == 12),
>          @ which would be stored as the last row in the temp buffer,
> @@ -709,7 +713,7 @@ function \txfm\()16_1d_4x16_pass1_neon
>          vmov            d29, d17
>          vmov            d30, d18
>          vmov            d31, d19
> -        bx              lr
> +        pop             {pc}
>  endfunc
>  
>  @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> @@ -719,6 +723,7 @@ endfunc
>  @ r2 = src (temp buffer)
>  @ r3 = slice offset
>  function \txfm\()16_1d_4x16_pass2_neon
> +        push            {lr}
>          mov             r12, #32
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
>          vld1.16         {d\i}, [r2,:64], r12
> @@ -732,7 +737,7 @@ function \txfm\()16_1d_4x16_pass2_neon
>  
>          add             r3,  r0,  r1
>          lsl             r1,  r1,  #1
> -        \txfm\()16
> +        bl              \txfm\()16
>  
>  .macro load_add_store coef0, coef1, coef2, coef3
>          vrshr.s16       \coef0, \coef0, #6
> @@ -773,7 +778,7 @@ function \txfm\()16_1d_4x16_pass2_neon
>          load_add_store  q12, q13, q14, q15
>  .purgem load_add_store
>  
> -        bx              lr
> +        pop             {pc}
>  endfunc
>  .endm
>  
> @@ -908,7 +913,7 @@ function idct32x32_dc_add_neon
>          bx              lr
>  endfunc
>  
> -.macro idct32_odd
> +function idct32_odd
>          movrel          r12, idct_coeffs
>          add             r12, r12, #32
>          vld1.16         {q0-q1}, [r12,:128]
> @@ -967,7 +972,8 @@ endfunc
>          mbutterfly0     d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 
> = t21a
>          mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 
> = t22
>          mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 
> = t23a
> -.endm
> +        bx              lr
> +endfunc
>  
>  @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
>  @ We don't have register space to do a single pass IDCT of 4x32 though,
> @@ -979,6 +985,8 @@ endfunc
>  @ r1 = unused
>  @ r2 = src
>  function idct32_1d_4x32_pass1_neon
> +        push            {lr}
> +
>          movrel          r12, idct_coeffs
>          vld1.16         {q0-q1}, [r12,:128]
>  
> @@ -992,7 +1000,7 @@ function idct32_1d_4x32_pass1_neon
>          vst1.16         {d4},  [r2,:64], r12
>  .endr
>  
> -        idct16
> +        bl              idct16
>  
>          @ Do four 4x4 transposes. Originally, d16-d31 contain the
>          @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> @@ -1028,7 +1036,7 @@ function idct32_1d_4x32_pass1_neon
>          vst1.16         {d4},  [r2,:64], r12
>  .endr
>  
> -        idct32_odd
> +        bl              idct32_odd
>  
>          transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, 
> d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
>  
> @@ -1054,7 +1062,7 @@ function idct32_1d_4x32_pass1_neon
>          store_rev       29, 25, 21, 17
>          store_rev       28, 24, 20, 16
>  .purgem store_rev
> -        bx              lr
> +        pop             {pc}
>  endfunc
>  .ltorg
>  
> @@ -1065,6 +1073,7 @@ endfunc
>  @ r1 = dst stride
>  @ r2 = src (temp buffer)
>  function idct32_1d_4x32_pass2_neon
> +        push            {lr}
>          movrel          r12, idct_coeffs
>          vld1.16         {q0-q1}, [r12,:128]
>  
> @@ -1075,7 +1084,7 @@ function idct32_1d_4x32_pass2_neon
>  .endr
>          sub             r2,  r2,  r12, lsl #4
>  
> -        idct16
> +        bl              idct16
>  
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          vst1.16         {d\i}, [r2,:64], r12
> @@ -1091,7 +1100,7 @@ function idct32_1d_4x32_pass2_neon
>          sub             r2,  r2,  r12, lsl #4
>          sub             r2,  r2,  #64
>  
> -        idct32_odd
> +        bl              idct32_odd
>  
>          mov             r12, #128
>  .macro load_acc_store a, b, c, d, neg=0
> @@ -1139,7 +1148,7 @@ function idct32_1d_4x32_pass2_neon
>          load_acc_store  24, 25, 26, 27, 1
>          load_acc_store  28, 29, 30, 31, 1
>  .purgem load_acc_store
> -        bx              lr
> +        pop             {pc}
>  endfunc
>  
>  const min_eob_idct_idct_32, align=4

OK. Sorry for the delay.

Janne
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to