On 2016-11-24 00:09:35 +0200, Martin Storsjö wrote:
> ---
> libavcodec/aarch64/vp9itxfm_neon.S | 26 +++++++++++++++-----------
> 1 file changed, 15 insertions(+), 11 deletions(-)
>
> diff --git a/libavcodec/aarch64/vp9itxfm_neon.S
> b/libavcodec/aarch64/vp9itxfm_neon.S
> index 2dc6b75..f4194a6 100644
> --- a/libavcodec/aarch64/vp9itxfm_neon.S
> +++ b/libavcodec/aarch64/vp9itxfm_neon.S
> @@ -599,9 +599,9 @@ endfunc
> // x1 = unused
> // x2 = src
> // x3 = slice offset
> +// x9 = input stride
> .macro itxfm16_1d_funcs txfm
> function \txfm\()16_1d_8x16_pass1_neon
> - mov x9, #32
> movi v2.8h, #0
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> load_clear \i, x2, x9
> @@ -649,8 +649,8 @@ endfunc
> // x1 = dst stride
> // x2 = src (temp buffer)
> // x3 = slice offset
> +// x9 = temp buffer stride
> function \txfm\()16_1d_8x16_pass2_neon
> - mov x9, #32
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23
> load \i, x2, x9
> .endr
> @@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon,
> export=1
> .ifc \txfm1,idct
> ld1 {v0.8h,v1.8h}, [x10]
> .endif
> + mov x9, #32
>
> .irp i, 0, 8
> add x0, sp, #(\i*32)
> @@ -882,13 +883,12 @@ endfunc
> // x0 = dst (temp buffer)
> // x1 = unused
> // x2 = src
> +// x9 = double input stride
> // x10 = idct_coeffs
> // x11 = idct_coeffs + 32
> function idct32_1d_8x32_pass1_neon
> ld1 {v0.8h,v1.8h}, [x10]
>
> - // Double stride of the input, since we only read every other line
> - mov x9, #128
> movi v4.8h, #0
>
> // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
> @@ -987,12 +987,13 @@ endfunc
> // x0 = dst
> // x1 = dst stride
> // x2 = src (temp buffer)
> +// x7 = negative double temp buffer stride
> +// x9 = double temp buffer stride
> // x10 = idct_coeffs
> // x11 = idct_coeffs + 32
> function idct32_1d_8x32_pass2_neon
> ld1 {v0.8h,v1.8h}, [x10]
>
> - mov x9, #128
> // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> ld1 {v\i\().8h}, [x2], x9
> @@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
>
> idct16
>
> - mov x9, #128
> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> st1 {v\i\().8h}, [x2], x9
> .endr
> @@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
>
> idct32_odd
>
> - mov x9, #128
> .macro load_acc_store a, b, c, d, neg=0
> +.if \neg == 0
> ld1 {v4.8h}, [x2], x9
> ld1 {v5.8h}, [x2], x9
> -.if \neg == 0
> add v4.8h, v4.8h, v\a\().8h
> ld1 {v6.8h}, [x2], x9
> add v5.8h, v5.8h, v\b\().8h
> @@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
> add v6.8h, v6.8h, v\c\().8h
> add v7.8h, v7.8h, v\d\().8h
> .else
> + ld1 {v4.8h}, [x2], x7
> + ld1 {v5.8h}, [x2], x7
> sub v4.8h, v4.8h, v\a\().8h
> - ld1 {v6.8h}, [x2], x9
> + ld1 {v6.8h}, [x2], x7
> sub v5.8h, v5.8h, v\b\().8h
> - ld1 {v7.8h}, [x2], x9
> + ld1 {v7.8h}, [x2], x7
> sub v6.8h, v6.8h, v\c\().8h
> sub v7.8h, v7.8h, v\d\().8h
> .endif
> @@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
> load_acc_store 23, 22, 21, 20
> load_acc_store 19, 18, 17, 16
> sub x2, x2, x9
> - neg x9, x9
> load_acc_store 16, 17, 18, 19, 1
> load_acc_store 20, 21, 22, 23, 1
> load_acc_store 24, 25, 26, 27, 1
> @@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
> mov x5, x1
> mov x6, x2
>
> + // Double stride of the input, since we only read every other line
> + mov x9, #128
> + neg x7, x9
> +
> .irp i, 0, 8, 16, 24
> add x0, sp, #(\i*64)
> add x2, x6, #(\i*2)
ok

Janne
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel