> On Feb 2, 2020, at 4:26 AM, Marton Balint <c...@passwd.hu> wrote: > > > > On Sat, 1 Feb 2020, quinkbl...@foxmail.com <mailto:quinkbl...@foxmail.com> > wrote: > >> From: Zhao Zhili <quinkbl...@foxmail.com> >> >> For 8 bit depth: >> ./ffmpeg -threads 1 -f lavfi -t 10 -i >> 'yuvtestsrc=size=4096x2048,format=yuv444p' -vf 'freezedetect' -f null >> -benchmark - >> >> Test results on Snapdragon 845: >> Before: >> frame= 250 fps= 23 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A >> speed=0.924x >> bench: utime=8.360s stime=2.350s rtime=10.820s >> After: >> frame= 250 fps= 51 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A >> speed=2.04x >> bench: utime=2.650s stime=2.210s rtime=4.909s >> >> Test results on HiSilicon Kirin 970: >> Before: >> frame= 250 fps=6.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A >> speed=0.239x >> bench: utime=35.156s stime=6.604s rtime=41.820s >> After: >> frame= 250 fps= 10 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A >> speed=0.403x >> bench: utime=18.400s stime=6.376s rtime=24.798s >> >> For 16 bit depth: >> ./ffmpeg -threads 1 -f lavfi -t 10 -i >> 'yuvtestsrc=size=4096x2048,format=yuv444p16' -vf 'freezedetect' -f null >> -benchmark - >> >> Test results on Snapdragon 845 >> Before: >> frame= 250 fps= 19 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A >> speed=0.756x >> bench: utime=8.700s stime=4.410s rtime=13.226s >> After: >> frame= 250 fps= 27 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A >> speed=1.07x >> bench: utime=4.920s stime=4.350s rtime=9.356s >> >> Test results on HiSilicon Kirin 970: >> Before: >> frame= 250 fps=4.0 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A >> speed=0.161x >> bench: utime=48.868s stime=13.124s rtime=62.110s >> After: >> frame= 250 fps=5.1 q=-0.0 Lsize=N/A time=00:00:10.00 bitrate=N/A >> speed=0.205x >> bench: utime=35.600s stime=13.036s rtime=48.708s >> --- >> libavfilter/aarch64/Makefile | 2 + >> libavfilter/aarch64/scene_sad_init.c | 37 +++++++ >> libavfilter/aarch64/scene_sad_neon.S | 149 
+++++++++++++++++++++++++++ >> libavfilter/scene_sad.c | 2 + >> libavfilter/scene_sad.h | 2 + >> 5 files changed, 192 insertions(+) >> create mode 100644 libavfilter/aarch64/scene_sad_init.c >> create mode 100644 libavfilter/aarch64/scene_sad_neon.S > > Does your ASM handle cases when width is not a multiple of the vector size? > If not, then you should probably do something similar to what is done for X86. >
The code after `+ // scalar loop` handles that. It supports width and height >= 1. > Thanks, > Marton > >> >> diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile >> index 6c727f9859..3a458f511f 100644 >> --- a/libavfilter/aarch64/Makefile >> +++ b/libavfilter/aarch64/Makefile >> @@ -1,7 +1,9 @@ >> OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_afir_init.o >> OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_anlmdn_init.o >> +OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/scene_sad_init.o >> OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o >> NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_afir_neon.o >> NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/af_anlmdn_neon.o >> +NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/scene_sad_neon.o >> NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o >> diff --git a/libavfilter/aarch64/scene_sad_init.c >> b/libavfilter/aarch64/scene_sad_init.c >> new file mode 100644 >> index 0000000000..8de769ac10 >> --- /dev/null >> +++ b/libavfilter/aarch64/scene_sad_init.c >> @@ -0,0 +1,37 @@ >> +/* >> + * This file is part of FFmpeg. >> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. >> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * Lesser General Public License for more details. 
>> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >> USA >> + */ >> + >> +#include "libavutil/aarch64/cpu.h" >> +#include "libavfilter/scene_sad.h" >> + >> +void ff_scene_sad_neon(SCENE_SAD_PARAMS); >> + >> +void ff_scene_sad16_neon(SCENE_SAD_PARAMS); >> + >> +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth) >> +{ >> + int cpu_flags = av_get_cpu_flags(); >> + if (have_neon(cpu_flags)) { >> + if (depth == 8) >> + return ff_scene_sad_neon; >> + if (depth == 16) >> + return ff_scene_sad16_neon; >> + } >> + >> + return NULL; >> +} >> diff --git a/libavfilter/aarch64/scene_sad_neon.S >> b/libavfilter/aarch64/scene_sad_neon.S >> new file mode 100644 >> index 0000000000..5b3b027a53 >> --- /dev/null >> +++ b/libavfilter/aarch64/scene_sad_neon.S >> @@ -0,0 +1,149 @@ >> +/* >> + * Copyright (c) 2020 Zhao Zhili >> + * >> + * This file is part of FFmpeg. >> + * >> + * FFmpeg is free software; you can redistribute it and/or >> + * modify it under the terms of the GNU Lesser General Public >> + * License as published by the Free Software Foundation; either >> + * version 2.1 of the License, or (at your option) any later version. >> + * >> + * FFmpeg is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> + * Lesser General Public License for more details. 
>> + * >> + * You should have received a copy of the GNU Lesser General Public >> + * License along with FFmpeg; if not, write to the Free Software >> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 >> USA >> + */ >> + >> +#include "libavutil/aarch64/asm.S" >> + >> +// void ff_scene_sadx_neon(const uint8_t *src1, ptrdiff_t stride1, >> +// const uint8_t *src2, ptrdiff_t stride2, >> +// ptrdiff_t width, ptrdiff_t height, >> +// uint64_t *sum) >> +.macro scene_sad_neon, depth=8 >> + // x0: src1 >> + // x1: stride1 >> + // x2: src2 >> + // x3: stride2 >> + // x4: width >> + // x5: height >> + // x6: sum >> + >> + // x7: step of width loop >> + // x8: index of row >> + // x9: width / x7 * x7 >> + // x10: sad >> + // x11: index of column >> + // w12: src1[x] >> + // w13: src2[x] >> + >> + mov x8, xzr >> + mov x10, xzr >> + >> +.if \depth == 8 >> + mov x7, #64 >> + and x9, x4, #0xFFFFFFFFFFFFFFC0 >> +.endif >> + >> +.if \depth == 16 >> + mov x7, #32 >> + and x9, x4, #0xFFFFFFFFFFFFFFE0 >> +.endif >> + >> +1: cmp x4, x7 // check width >> + mov x11, xzr >> + b.lt 3f >> + >> + mov v0.d[0], x10 >> + >> + // vector loop >> +2: >> +.if \depth == 8 >> + add x14, x0, x11 >> + add x15, x2, x11 >> +.endif >> + >> +.if \depth == 16 >> + add x14, x0, x11, lsl #1 >> + add x15, x2, x11, lsl #1 >> +.endif >> + ld1 {v16.4S, v17.4S, v18.4S, v19.4S}, [x14] >> + ld1 {v20.4S, v21.4S, v22.4S, v23.4S}, [x15] >> + add x11, x11, x7 >> + cmp x9, x11 >> + >> +.if \depth == 8 >> + uabd v16.16B, v16.16B, v20.16B >> + uabd v17.16B, v17.16B, v21.16B >> + uabd v18.16B, v18.16B, v22.16B >> + uabd v19.16B, v19.16B, v23.16B >> + uaddlv h16, v16.16B >> + uaddlv h17, v17.16B >> + uaddlv h18, v18.16B >> + uaddlv h19, v19.16B >> +.endif >> + >> +.if \depth == 16 >> + uabd v16.8H, v16.8H, v20.8H >> + uabd v17.8H, v17.8H, v21.8H >> + uabd v18.8H, v18.8H, v22.8H >> + uabd v19.8H, v19.8H, v23.8H >> + uaddlv s16, v16.8H >> + uaddlv s17, v17.8H >> + uaddlv s18, v18.8H >> + uaddlv s19, 
v19.8H >> +.endif >> + >> + add d16, d16, d17 >> + add d18, d18, d19 >> + add d0, d0, d16 >> + add d0, d0, d18 >> + >> + b.ne 2b >> + >> + cmp x9, x4 >> + fmov x10, d0 >> + b.eq 4f >> + >> + // scalar loop >> +3: >> +.if \depth == 8 >> + ldrb w12, [x0, x11] >> + ldrb w13, [x2, x11] >> +.endif >> + >> +.if \depth == 16 >> + ldrh w12, [x0, x11, lsl #1] >> + ldrh w13, [x2, x11, lsl #1] >> +.endif >> + add x11, x11, #1 >> + subs w12, w12, w13 >> + cneg w12, w12, mi >> + add x10, x10, x12 >> + cmp x11, x4 >> + b.ne 3b >> + >> + // next row >> +4: >> + add x8, x8, #1 // =1 >> + add x0, x0, x1 >> + cmp x8, x5 >> + add x2, x2, x3 >> + b.ne 1b >> + >> +5: >> + str x10, [x6] >> + ret >> +.endm >> + >> +function ff_scene_sad_neon, export=1 >> + scene_sad_neon depth=8 >> +endfunc >> + >> +function ff_scene_sad16_neon, export=1 >> + scene_sad_neon depth=16 >> +endfunc >> diff --git a/libavfilter/scene_sad.c b/libavfilter/scene_sad.c >> index 73d3eacbfa..ee0c71f659 100644 >> --- a/libavfilter/scene_sad.c >> +++ b/libavfilter/scene_sad.c >> @@ -61,6 +61,8 @@ ff_scene_sad_fn ff_scene_sad_get_fn(int depth) >> ff_scene_sad_fn sad = NULL; >> if (ARCH_X86) >> sad = ff_scene_sad_get_fn_x86(depth); >> + if (ARCH_AARCH64) >> + sad = ff_scene_sad_get_fn_aarch64(depth); >> if (!sad) { >> if (depth == 8) >> sad = ff_scene_sad_c; >> diff --git a/libavfilter/scene_sad.h b/libavfilter/scene_sad.h >> index 173a051f2b..c868200dc4 100644 >> --- a/libavfilter/scene_sad.h >> +++ b/libavfilter/scene_sad.h >> @@ -37,6 +37,8 @@ void ff_scene_sad_c(SCENE_SAD_PARAMS); >> void ff_scene_sad16_c(SCENE_SAD_PARAMS); >> +ff_scene_sad_fn ff_scene_sad_get_fn_aarch64(int depth); >> + >> ff_scene_sad_fn ff_scene_sad_get_fn_x86(int depth); >> ff_scene_sad_fn ff_scene_sad_get_fn(int depth); >> -- >> 2.22.0 >> >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> 
<https://ffmpeg.org/mailman/listinfo/ffmpeg-devel> >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-requ...@ffmpeg.org <mailto:ffmpeg-devel-requ...@ffmpeg.org> >> with subject "unsubscribe". > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org <mailto:ffmpeg-devel@ffmpeg.org> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > <https://ffmpeg.org/mailman/listinfo/ffmpeg-devel> > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org <mailto:ffmpeg-devel-requ...@ffmpeg.org> with > subject "unsubscribe". _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".