This work is sponsored by, and copyright, Google.

This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o from
14740 to 18504 bytes.
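The extra code consists of half/quarter variants of the 16- and 32-point
transforms plus an eob-based dispatch in the pass functions. As a rough
C-level sketch of that dispatch (the function names here are hypothetical;
only the eob thresholds 10 and 38 used for the 16x16 row pass are taken
from the patch itself):

    #include <stdint.h>

    /* Hypothetical stand-ins for the asm routines idct16, idct16_half and
     * idct16_quarter added below; only the thresholds mirror the patch. */
    static void idct16_rows_full(int16_t *coeffs)    { (void)coeffs; /* ... */ }
    static void idct16_rows_half(int16_t *coeffs)    { (void)coeffs; /* ... */ }
    static void idct16_rows_quarter(int16_t *coeffs) { (void)coeffs; /* ... */ }

    /* Sketch of the eob-based dispatch that the 16x16 first pass now does
     * in assembly with cmp/b.le on w3 (which carries the eob). */
    static void idct16_pass1_dispatch(int16_t *coeffs, int eob)
    {
        if (eob <= 10)                  /* only the first 4 input rows need loading */
            idct16_rows_quarter(coeffs);
        else if (eob <= 38)             /* only the first 8 input rows need loading */
            idct16_rows_half(coeffs);
        else
            idct16_rows_full(coeffs);
    }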
Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:    235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:   1051.0
vp9_inv_dct_dct_16x16_sub4_add_neon:   1051.0
vp9_inv_dct_dct_16x16_sub8_add_neon:   1051.0
vp9_inv_dct_dct_16x16_sub12_add_neon:  1390.3
vp9_inv_dct_dct_16x16_sub16_add_neon:  1390.1
vp9_inv_dct_dct_32x32_sub1_add_neon:    556.5
vp9_inv_dct_dct_32x32_sub2_add_neon:   5199.1
vp9_inv_dct_dct_32x32_sub4_add_neon:   5199.9
vp9_inv_dct_dct_32x32_sub8_add_neon:   5196.9
vp9_inv_dct_dct_32x32_sub12_add_neon:  6171.6
vp9_inv_dct_dct_32x32_sub16_add_neon:  6170.9
vp9_inv_dct_dct_32x32_sub20_add_neon:  7147.1
vp9_inv_dct_dct_32x32_sub24_add_neon:  7147.0
vp9_inv_dct_dct_32x32_sub28_add_neon:  8118.8
vp9_inv_dct_dct_32x32_sub32_add_neon:  8125.8

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:    235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:    697.0
vp9_inv_dct_dct_16x16_sub4_add_neon:    697.0
vp9_inv_dct_dct_16x16_sub8_add_neon:    908.0
vp9_inv_dct_dct_16x16_sub12_add_neon:  1399.6
vp9_inv_dct_dct_16x16_sub16_add_neon:  1403.3
vp9_inv_dct_dct_32x32_sub1_add_neon:    554.1
vp9_inv_dct_dct_32x32_sub2_add_neon:   3879.7
vp9_inv_dct_dct_32x32_sub4_add_neon:   3952.2
vp9_inv_dct_dct_32x32_sub8_add_neon:   3948.4
vp9_inv_dct_dct_32x32_sub12_add_neon:  5462.1
vp9_inv_dct_dct_32x32_sub16_add_neon:  5461.7
vp9_inv_dct_dct_32x32_sub20_add_neon:  7169.2
vp9_inv_dct_dct_32x32_sub24_add_neon:  7162.4
vp9_inv_dct_dct_32x32_sub28_add_neon:  8137.4
vp9_inv_dct_dct_32x32_sub32_add_neon:  8136.7

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup in the cases where we
only need to process a small part of the actual input data.
---
If we hadn't made the core transforms standalone functions, the code size
would have ended up at around 28 KB.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 367 +++++++++++++++++++++++++++++++++++--
 1 file changed, 347 insertions(+), 20 deletions(-)
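The dmbutterfly0_h/dmbutterfly_h1/dmbutterfly_h2 macros added below win
because a butterfly with a known-zero input collapses to a plain scaling.
A scalar model of the dmbutterfly0 case, assuming the usual VP9
cospi_16_64 value 11585 for the first idct_coeffs entry (the helper names
are illustrative only, not part of the patch):

    #include <stdint.h>

    /* Per lane, dmbutterfly0 computes out1 = round((in1 + in2) * c >> 14)
     * and out2 = round((in1 - in2) * c >> 14), with c = v0.h[0]. */
    static int16_t mul_round_14(int32_t x)
    {
        return (int16_t)((x * 11585 + (1 << 13)) >> 14); /* 11585 ~ cospi_16_64 */
    }

    /* With in2 == 0 both outputs are identical, so dmbutterfly0_h needs one
     * smull/rshrn pair plus a register copy instead of two of each. */
    static void dmbutterfly0_h_model(int16_t in1, int16_t *out1, int16_t *out2)
    {
        *out1 = mul_round_14(in1);
        *out2 = *out1; /* the trailing "mov out2, out1" in the macro */
    }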
diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index be9643e..bb79348 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -75,6 +75,16 @@ endconst
 .endif
 .endm
 
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
+        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
+        rshrn           \out1\().4h,  \tmp1\().4s, #14
+        rshrn2          \out1\().8h,  \tmp2\().4s, #14
+        mov             \out2\().16b, \out1\().16b
+.endm
+
 // out1,out2 = in1 * coef1 - in2 * coef2
 // out3,out4 = in1 * coef2 + in2 * coef1
 // out are 4 x .4s registers, in are 2 x .8h registers
@@ -104,6 +114,43 @@ endconst
         rshrn2          \inout2\().8h, \tmp4\().4s, #14
 .endm
 
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s,   \inout1\().4h, \coef1
+        smull2          \tmp2\().4s,   \inout1\().8h, \coef1
+        smull           \tmp3\().4s,   \inout1\().4h, \coef2
+        smull2          \tmp4\().4s,   \inout1\().8h, \coef2
+        rshrn           \inout1\().4h, \tmp1\().4s,   #14
+        rshrn2          \inout1\().8h, \tmp2\().4s,   #14
+        rshrn           \inout2\().4h, \tmp3\().4s,   #14
+        rshrn2          \inout2\().8h, \tmp4\().4s,   #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s,   \inout2\().4h, \coef2
+        smull2          \tmp2\().4s,   \inout2\().8h, \coef2
+        smull           \tmp3\().4s,   \inout2\().4h, \coef1
+        smull2          \tmp4\().4s,   \inout2\().8h, \coef1
+        neg             \tmp1\().4s,   \tmp1\().4s
+        neg             \tmp2\().4s,   \tmp2\().4s
+        rshrn           \inout2\().4h, \tmp3\().4s,   #14
+        rshrn2          \inout2\().8h, \tmp4\().4s,   #14
+        rshrn           \inout1\().4h, \tmp1\().4s,   #14
+        rshrn2          \inout1\().8h, \tmp2\().4s,   #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+        smull           \out1\().4s, \in\().4h, \coef
+        smull2          \out2\().4s, \in\().8h, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+        rshrn           \out\().4h, \in1\().4s, \shift
+        rshrn2          \out\().8h, \in2\().4s, \shift
+.endm
+
+
 // out1 = in1 + in2
 // out2 = in1 - in2
 .macro butterfly_8h out1, out2, in1, in2
@@ -463,7 +510,7 @@ function idct16x16_dc_add_neon
         ret
 endfunc
 
-function idct16
+.macro idct16_full
         dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
         dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
         dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
@@ -485,7 +532,10 @@ function idct16
         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
         dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
         dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+.endm
 
+.macro idct16_end
         butterfly_8h    v18, v7,  v4,  v7  // v18 = t0a,  v7  = t7a
         butterfly_8h    v19, v22, v5,  v22 // v19 = t1a,  v22 = t6
         butterfly_8h    v4,  v26, v20, v26 // v4  = t2a,  v26 = t5
@@ -507,6 +557,68 @@ function idct16
         butterfly_8h    v20, v27, v6,  v27 // v20 = out[4], v27 = out[11]
         butterfly_8h    v21, v26, v26, v3  // v21 = out[5], v26 = out[10]
         ret
+.endm
+
+function idct16
+        idct16_full
+endfunc
+
+function idct16_half
+        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
+        dmbutterfly_h1  v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
+        dmbutterfly_h1  v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
+        dmbutterfly_h2  v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
+        dmbutterfly_h1  v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
+        dmbutterfly_h2  v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
+        dmbutterfly_h1  v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+        butterfly_8h    v4,  v28, v16, v28 // v4  = t0,  v28 = t3
+        butterfly_8h    v5,  v20, v24, v20 // v5  = t1,  v20 = t2
+        butterfly_8h    v6,  v26, v18, v26 // v6  = t4,  v26 = t5
+        butterfly_8h    v7,  v22, v30, v22 // v7  = t7,  v22 = t6
+        butterfly_8h    v16, v25, v17, v25 // v16 = t8,  v25 = t9
+        butterfly_8h    v24, v21, v29, v21 // v24 = t11, v21 = t10
+        butterfly_8h    v17, v27, v19, v27 // v17 = t12, v27 = t13
+        butterfly_8h    v29, v23, v31, v23 // v29 = t15, v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_quarter
+        dsmull_h        v24, v25, v19, v1.h[6]
+        dsmull_h        v4,  v5,  v17, v0.h[7]
+        dsmull_h        v7,  v6,  v18, v0.h[4]
+        dsmull_h        v30, v31, v18, v0.h[3]
+        neg             v24.4s,  v24.4s
+        neg             v25.4s,  v25.4s
+        dsmull_h        v29, v28, v17, v1.h[0]
+        dsmull_h        v26, v27, v19, v1.h[5]
+        dsmull_h        v22, v23, v16, v0.h[0]
+        drshrn_h        v24, v24, v25, #14
+        drshrn_h        v16, v4,  v5,  #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v6,  v30, v31, #14
+        drshrn_h        v29, v29, v28, #14
+        drshrn_h        v17, v26, v27, #14
+        drshrn_h        v28, v22, v23, #14
+
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
+        neg             v22.4s,  v22.4s
+        neg             v23.4s,  v23.4s
+        drshrn_h        v27, v20, v21, #14
+        drshrn_h        v21, v22, v23, #14
+        drshrn_h        v23, v18, v19, #14
+        drshrn_h        v25, v30, v31, #14
+        mov             v4.16b,  v28.16b
+        mov             v5.16b,  v28.16b
+        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
+        mov             v20.16b, v28.16b
+        idct16_end
 endfunc
 
 function iadst16
@@ -609,12 +721,37 @@ function \txfm\()16_1d_8x16_pass1_neon
         mov             x14, x30
 
         movi            v2.8h, #0
+
+.ifc \txfm,idct
+        cmp             w3,  #10
+        b.le            3f
+        cmp             w3,  #38
+        b.le            4f
+.endif
+
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         load_clear      \i,  x2,  x9
 .endr
 
         bl              \txfm\()16
 
+.ifc \txfm,idct
+        b               5f
+
+3:
+.irp i, 16, 17, 18, 19
+        load_clear      \i,  x2,  x9
+.endr
+        bl              idct16_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i,  x2,  x9
+.endr
+        bl              idct16_half
+.endif
+
+5:
         // Do two 8x8 transposes. Originally, v16-v31 contain the
         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
         // transposed 8x8 blocks.
@@ -654,38 +791,60 @@ endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
-// x3 = slice offset
+// w3 = eob
 // x9 = temp buffer stride
+// x13 = slice offset
 function \txfm\()16_1d_8x16_pass2_neon
         mov             x14, x30
+.ifc \txfm,idct
+        cmp             w3,  #10
+        b.le            3f
+        cmp             w3,  #38
+        b.le            4f
+.endif
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         load            \i,  x2,  x9
 .endr
-        cbz             x3,  1f
+        cbz             x13, 1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
         load            \i,  x2,  x9
 .endr
 1:
-        add             x3,  x0,  x1
-        lsl             x1,  x1,  #1
         bl              \txfm\()16
 
+.ifc \txfm,idct
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+        bl              idct16_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+        bl              idct16_half
+.endif
+5:
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
         srshr           \coef0, \coef0, #6
         ld1             {v2.8b},  [x0], x1
         srshr           \coef1, \coef1, #6
-        ld1             {v3.8b},  [x3], x1
+        ld1             {v3.8b},  [x8], x1
         srshr           \coef2, \coef2, #6
         ld1             {v4.8b},  [x0], x1
         srshr           \coef3, \coef3, #6
         uaddw           \coef0, \coef0, v2.8b
-        ld1             {v5.8b},  [x3], x1
+        ld1             {v5.8b},  [x8], x1
         uaddw           \coef1, \coef1, v3.8b
         srshr           \coef4, \coef4, #6
         ld1             {v6.8b},  [x0], x1
         srshr           \coef5, \coef5, #6
-        ld1             {v7.8b},  [x3], x1
+        ld1             {v7.8b},  [x8], x1
         sqxtun          v2.8b,  \coef0
         srshr           \coef6, \coef6, #6
         sqxtun          v3.8b,  \coef1
@@ -693,27 +852,27 @@ function \txfm\()16_1d_8x16_pass2_neon
         uaddw           \coef2, \coef2, v4.8b
         ld1             {\tmp1},  [x0], x1
         uaddw           \coef3, \coef3, v5.8b
-        ld1             {\tmp2},  [x3], x1
+        ld1             {\tmp2},  [x8], x1
         sqxtun          v4.8b,  \coef2
         sub             x0,  x0,  x1, lsl #2
-        sub             x3,  x3,  x1, lsl #2
+        sub             x8,  x8,  x1, lsl #2
         sqxtun          v5.8b,  \coef3
         uaddw           \coef4, \coef4, v6.8b
         st1             {v2.8b},  [x0], x1
         uaddw           \coef5, \coef5, v7.8b
-        st1             {v3.8b},  [x3], x1
+        st1             {v3.8b},  [x8], x1
         sqxtun          v6.8b,  \coef4
         st1             {v4.8b},  [x0], x1
         sqxtun          v7.8b,  \coef5
-        st1             {v5.8b},  [x3], x1
+        st1             {v5.8b},  [x8], x1
         uaddw           \coef6, \coef6, \tmp1
         st1             {v6.8b},  [x0], x1
         uaddw           \coef7, \coef7, \tmp2
-        st1             {v7.8b},  [x3], x1
+        st1             {v7.8b},  [x8], x1
         sqxtun          \tmp1,  \coef6
         sqxtun          \tmp2,  \coef7
         st1             {\tmp1},  [x0], x1
-        st1             {\tmp2},  [x3], x1
+        st1             {\tmp2},  [x8], x1
 .endm
         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
@@ -750,6 +909,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
         movrel          x10, idct_coeffs
 .ifnc \txfm1\()_\txfm2,idct_idct
         movrel          x11, iadst16_coeffs
+        mov             w3,  #256
 .endif
 .ifc \txfm1,idct
         ld1             {v0.8h,v1.8h}, [x10]
@@ -792,7 +952,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
         add             x0,  x4,  #(\i)
         mov             x1,  x5
         add             x2,  sp,  #(\i*2)
-        mov             x3,  #\i
+        mov             x13, #\i
         bl              \txfm2\()16_1d_8x16_pass2_neon
 .endr
 
@@ -848,7 +1008,7 @@ function idct32x32_dc_add_neon
         ret
 endfunc
 
-function idct32_odd
+.macro idct32_odd_full
         ld1             {v0.8h,v1.8h}, [x11]
 
         dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
@@ -875,7 +1035,10 @@ function idct32_odd
         dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
         dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
         dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+.endm
 
+.macro idct32_end
         butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
         butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
         butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
@@ -904,6 +1067,88 @@ function idct32_odd
         dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
         dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
         ret
+.endm
+
+function idct32_odd
+        idct32_odd_full
+endfunc
+
+function idct32_odd_half
+        ld1             {v0.8h,v1.8h}, [x11]
+
+        dmbutterfly_h1  v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly_h2  v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly_h1  v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        ld1             {v0.8h}, [x10]
+
+        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_quarter
+        ld1             {v0.8h,v1.8h}, [x11]
+
+        dsmull_h        v4,  v5,  v16, v0.h[0]
+        dsmull_h        v28, v29, v19, v0.h[7]
+        dsmull_h        v30, v31, v16, v0.h[1]
+        dsmull_h        v22, v23, v17, v1.h[6]
+        dsmull_h        v7,  v6,  v17, v1.h[7]
+        dsmull_h        v26, v27, v19, v0.h[6]
+        dsmull_h        v20, v21, v18, v1.h[0]
+        dsmull_h        v24, v25, v18, v1.h[1]
+
+        ld1             {v0.8h}, [x10]
+
+        neg             v28.4s,  v28.4s
+        neg             v29.4s,  v29.4s
+        neg             v7.4s,   v7.4s
+        neg             v6.4s,   v6.4s
+
+        drshrn_h        v4,  v4,  v5,  #14
+        drshrn_h        v5,  v28, v29, #14
+        drshrn_h        v29, v30, v31, #14
+        drshrn_h        v28, v22, v23, #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v31, v26, v27, #14
+        drshrn_h        v6,  v20, v21, #14
+        drshrn_h        v30, v24, v25, #14
+
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[3], v0.h[4]
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[3], v0.h[4]
+        drshrn_h        v23, v16, v17, #14
+        drshrn_h        v24, v18, v19, #14
+        neg             v20.4s,  v20.4s
+        neg             v21.4s,  v21.4s
+        drshrn_h        v27, v27, v26, #14
+        drshrn_h        v20, v20, v21, #14
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[5], v0.h[6]
+        drshrn_h        v21, v16, v17, #14
+        drshrn_h        v26, v18, v19, #14
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[5], v0.h[6]
+        drshrn_h        v25, v16, v17, #14
+        neg             v18.4s,  v18.4s
+        neg             v19.4s,  v19.4s
+        drshrn_h        v22, v18, v19, #14
+
+        idct32_end
 endfunc
 
 // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
@@ -923,6 +1168,11 @@ function idct32_1d_8x32_pass1_neon
 
         movi            v4.8h, #0
 
+        cmp             w3,  #4
+        b.le            3f
+        cmp             w3,  #135
+        b.le            4f
+
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2]
@@ -930,7 +1180,25 @@ function idct32_1d_8x32_pass1_neon
 .endr
 
         bl              idct16
+        sub             x2,  x2,  x9, lsl #4
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+        bl              idct16_quarter
+        sub             x2,  x2,  x9, lsl #2
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+        bl              idct16_half
+        sub             x2,  x2,  x9, lsl #3
+5:
 
         // Do two 8x8 transposes. Originally, v16-v31 contain the
         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
         // two transposed 8x8 blocks.
@@ -962,12 +1230,16 @@ function idct32_1d_8x32_pass1_neon
         sub             x0,  x0,  #512
 .purgem store_rev
 
-        // Move x2 back to the start of the input, and move
-        // to the first odd row
-        sub             x2,  x2,  x9, lsl #4
+        // Move x2 to the first odd row
         add             x2,  x2,  #64
 
         movi            v4.8h, #0
+
+        cmp             w3,  #34
+        b.le            3f
+        cmp             w3,  #135
+        b.le            4f
+
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2]
@@ -975,7 +1247,22 @@ function idct32_1d_8x32_pass1_neon
 .endr
 
         bl              idct32_odd
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+        bl              idct32_odd_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+        bl              idct32_odd_half
+5:
 
         transpose_8x8H  v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
         transpose_8x8H  v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
 
@@ -1027,6 +1314,11 @@ function idct32_1d_8x32_pass2_neon
         mov             x14, x30
         ld1             {v0.8h,v1.8h}, [x10]
 
+        cmp             w3,  #34
+        b.le            3f
+        cmp             w3,  #135
+        b.le            4f
+
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2], x9
@@ -1034,7 +1326,22 @@ function idct32_1d_8x32_pass2_neon
         sub             x2,  x2,  x9, lsl #4
 
         bl              idct16
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+        bl              idct16_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+        bl              idct16_half
+5:
 
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         st1             {v\i\().8h}, [x2], x9
 .endr
@@ -1042,15 +1349,35 @@ function idct32_1d_8x32_pass2_neon
         sub             x2,  x2,  x9, lsl #4
         add             x2,  x2,  #64
 
+        cmp             w3,  #34
+        b.le            3f
+        cmp             w3,  #135
+        b.le            4f
+
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2], x9
 .endr
         sub             x2,  x2,  x9, lsl #4
-        sub             x2,  x2,  #64
 
         bl              idct32_odd
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+        bl              idct32_odd_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+        bl              idct32_odd_half
+5:
 
+        sub             x2,  x2,  #64
 .macro load_acc_store a, b, c, d, neg=0
 .if \neg == 0
         ld1             {v4.8h},  [x2], x9
-- 
2.7.4