On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote: > The theoretical maximum value of E is 193, so we can just > saturate the addition to 255. > > Before: Cortex A7 A8 A9 A53 A53/AArch64 > vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.8 88.0 87.7 > vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0 136.7 > vp9_loop_filter_v_16_8_neon: 497.0 419.5 379.7 293.0 275.7 > vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0 452.0 > After: > vp9_loop_filter_v_4_8_neon: 136.0 125.7 112.6 84.0 83.0 > vp9_loop_filter_v_8_8_neon: 234.0 195.5 171.5 136.0 133.7 > vp9_loop_filter_v_16_8_neon: 490.0 417.5 377.7 289.0 271.0 > vp9_loop_filter_v_16_16_neon: 951.2 814.7 732.3 571.0 446.7 > --- > libavcodec/aarch64/vp9lpf_neon.S | 40 > +++++++++------------------------------- > libavcodec/arm/vp9lpf_neon.S | 11 +++++------ > 2 files changed, 14 insertions(+), 37 deletions(-) > > diff --git a/libavcodec/aarch64/vp9lpf_neon.S > b/libavcodec/aarch64/vp9lpf_neon.S > index 3b8e6eb..4553173 100644 > --- a/libavcodec/aarch64/vp9lpf_neon.S > +++ b/libavcodec/aarch64/vp9lpf_neon.S > @@ -51,13 +51,6 @@ > // see the arm version instead. > > > -.macro uabdl_sz dst1, dst2, in1, in2, sz > - uabdl \dst1, \in1\().8b, \in2\().8b > -.ifc \sz, .16b > - uabdl2 \dst2, \in1\().16b, \in2\().16b > -.endif > -.endm > - > .macro add_sz dst1, dst2, in1, in2, in3, in4, sz > add \dst1, \in1, \in3 > .ifc \sz, .16b > @@ -86,20 +79,6 @@ > .endif > .endm > > -.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz > - cmhs \dst1, \in1, \in3 > -.ifc \sz, .16b > - cmhs \dst2, \in2, \in4 > -.endif > -.endm > - > -.macro xtn_sz dst, in1, in2, sz > - xtn \dst\().8b, \in1 > -.ifc \sz, .16b > - xtn2 \dst\().16b, \in2 > -.endif > -.endm > - > .macro usubl_sz dst1, dst2, in1, in2, sz > usubl \dst1, \in1\().8b, \in2\().8b > .ifc \sz, .16b > @@ -179,20 +158,20 @@ > // tmpq2 == tmp3 + tmp4, etc. 
> .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, > tmp8 > .if \mix == 0 > - dup v0.8h, w2 // E > - dup v1.8h, w2 // E > + dup v0\sz, w2 // E > dup v2\sz, w3 // I > dup v3\sz, w4 // H > .else > - dup v0.8h, w2 // E > + dup v0.8b, w2 // E > dup v2.8b, w3 // I > dup v3.8b, w4 // H > + lsr w5, w2, #8 > lsr w6, w3, #8 > lsr w7, w4, #8 > - ushr v1.8h, v0.8h, #8 // E > + dup v1.8b, w5 // E > dup v4.8b, w6 // I > - bic v0.8h, #255, lsl 8 // E > dup v5.8b, w7 // H > + trn1 v0.2d, v0.2d, v1.2d
Isn't this equivalent to

    dup  v0.8h,  w2
    uzp1 v0.16b, v0.16b, v0.16b

on little endian?

> trn1 v2.2d, v2.2d, v4.2d > trn1 v3.2d, v3.2d, v5.2d > .endif > @@ -206,16 +185,15 @@ > umax v4\sz, v4\sz, v5\sz > umax v5\sz, v6\sz, v7\sz > umax \tmp1\sz, \tmp1\sz, \tmp2\sz > - uabdl_sz v6.8h, v7.8h, v23, v24, \sz // abs(p0 - q0) > + uabd v6\sz, v23\sz, v24\sz // abs(p0 - q0) > umax v4\sz, v4\sz, v5\sz > - add_sz v6.8h, v7.8h, v6.8h, v7.8h, v6.8h, v7.8h, \sz > // abs(p0 - q0) * 2 > + uqadd v6\sz, v6\sz, v6\sz // abs(p0 - q0) * 2 > uabd v5\sz, v22\sz, v25\sz // abs(p1 - q1) > umax v4\sz, v4\sz, \tmp1\sz // max(abs(p3 - p2), > ..., abs(q2 - q3)) > ushr v5\sz, v5\sz, #1 > cmhs v4\sz, v2\sz, v4\sz // max(abs()) <= I > - uaddw_sz v6.8h, v7.8h, v6.8h, v7.8h, v5, \sz // abs(p0 - > q0) * 2 + abs(p1 - q1) >> 1 > - cmhs_sz v6.8h, v7.8h, v0.8h, v1.8h, v6.8h, v7.8h, \sz > - xtn_sz v5, v6.8h, v7.8h, \sz > + uqadd v6\sz, v6\sz, v5\sz // abs(p0 - q0) * 2 + > abs(p1 - q1) >> 1 > + cmhs v5\sz, v0\sz, v6\sz > and v4\sz, v4\sz, v5\sz // fm > > // If no pixels need filtering, just exit as soon as possible > diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S > index c57c0e9..5e154f6 100644 > --- a/libavcodec/arm/vp9lpf_neon.S > +++ b/libavcodec/arm/vp9lpf_neon.S > @@ -51,7 +51,7 @@ > @ and d28-d31 as temp registers, or d8-d15. 
> @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4 > .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, > tmpq1, tmpq2, tmpq3, tmpq4 > - vdup.u16 q0, r2 @ E > + vdup.u8 d0, r2 @ E > vdup.u8 d2, r3 @ I > ldr r3, [sp] > > @@ -64,16 +64,15 @@ > vmax.u8 d4, d4, d5 > vmax.u8 d5, d6, d7 > vmax.u8 \tmp1, \tmp1, \tmp2 > - vabdl.u8 q3, d23, d24 @ abs(p0 - q0) > + vabd.u8 d6, d23, d24 @ abs(p0 - q0) > vmax.u8 d4, d4, d5 > - vadd.u16 q3, q3, q3 @ abs(p0 - q0) * 2 > + vqadd.u8 d6, d6, d6 @ abs(p0 - q0) * 2 > vabd.u8 d5, d22, d25 @ abs(p1 - q1) > vmax.u8 d4, d4, \tmp1 @ max(abs(p3 - p2), ..., abs(q2 - > q3)) > vshr.u8 d5, d5, #1 > vcle.u8 d4, d4, d2 @ max(abs()) <= I > - vaddw.u8 q3, q3, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) > >> 1 > - vcle.u16 q3, q3, q0 > - vmovn.u16 d5, q3 > + vqadd.u8 d6, d6, d5 @ abs(p0 - q0) * 2 + abs(p1 - q1) > >> 1 > + vcle.u8 d5, d6, d0 > vand d4, d4, d5 @ fm > > vdup.u8 d3, r3 @ H

Otherwise OK.

Janne
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel