Re: [FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc epel assembly

2022-02-07 Thread Martin Storsjö

On Thu, 3 Feb 2022, J. Dekker wrote:


Thanks: Rafal Dabrowa 
---
libavcodec/aarch64/Makefile   |3 +-
libavcodec/aarch64/hevcdsp_epel_neon.S| 2501 +
libavcodec/aarch64/hevcdsp_init_aarch64.c |   52 +
3 files changed, 2555 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S


The same comments as for the qpel code apply here too.

// Martin

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2 2/2] lavc/aarch64: add hevc epel assembly

2022-02-03 Thread J. Dekker
Thanks: Rafal Dabrowa 
---
 libavcodec/aarch64/Makefile   |3 +-
 libavcodec/aarch64/hevcdsp_epel_neon.S| 2501 +
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   52 +
 3 files changed, 2555 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 8592692479..ebedc03bfa 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -61,7 +61,8 @@ NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_16bpp_neon.o \
 aarch64/vp9lpf_neon.o \
 aarch64/vp9mc_16bpp_neon.o \
 aarch64/vp9mc_neon.o
-NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_idct_neon.o \
+NEON-OBJS-$(CONFIG_HEVC_DECODER)+= aarch64/hevcdsp_epel_neon.o \
+   aarch64/hevcdsp_idct_neon.o \
 aarch64/hevcdsp_init_aarch64.o \
 aarch64/hevcdsp_qpel_neon.o \
 aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..bbf93c3d6a
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -0,0 +1,2501 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+// Widen a 4-sample-wide block of 8-bit samples to 16 bits, scaled by 64 (<< 6).
+// Registers as used below (the C prototype is not visible in this file --
+// presumably the HEVC put_hevc_pel callback; confirm against
+// hevcdsp_init_aarch64.c):
+//   x0  destination pointer, advanced by MAX_PB_SIZE * 2 bytes per row
+//   x1  source pointer, advanced by x2 bytes per row
+//   x2  source row stride in bytes
+//   w3  row count; must be non-zero on entry because the loop only tests it
+//       after the first row has been processed
+// Modifies x0, x1, w3, x7, v0, v4 and the condition flags.
+function ff_hevc_put_hevc_pel_pixels4_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)          // destination row stride in bytes
+1:      ld1             {v0.s}[0], [x1], x2             // load 4 source bytes, step to next source row
+        ushll           v4.8h, v0.8b, #6                // zero-extend bytes to 16 bit and shift left by 6
+        subs            w3, w3, #1                      // one row done; flags feed the b.ne below
+        st1             {v4.d}[0], [x0], x7             // store the 4 valid halfwords, step to next dst row
+        b.ne            1b
+        ret
+endfunc
+
+// Widen a 6-sample-wide block of 8-bit samples to 16 bits, scaled by 64 (<< 6).
+// Registers as used below (the C prototype is not visible in this file --
+// presumably the HEVC put_hevc_pel callback; confirm against
+// hevcdsp_init_aarch64.c):
+//   x0  destination pointer, advanced by MAX_PB_SIZE * 2 bytes per row in total
+//   x1  source pointer, advanced by x2 bytes per row
+//   x2  source row stride in bytes
+//   w3  row count; must be non-zero on entry because the loop only tests it
+//       after the first row has been processed
+// Each row loads 8 source bytes although only 6 results are stored, i.e. the
+// load touches 2 bytes beyond the 6 samples that are used.
+// Modifies x0, x1, w3, x7, v0, v4 and the condition flags.
+function ff_hevc_put_hevc_pel_pixels6_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2 - 8)      // row stride minus the 8 bytes added by the first store
+1:      ld1             {v0.8b}, [x1], x2               // load 8 source bytes, step to next source row
+        ushll           v4.8h, v0.8b, #6                // zero-extend bytes to 16 bit and shift left by 6
+        st1             {v4.d}[0], [x0], #8             // store halfwords 0..3
+        subs            w3, w3, #1                      // one row done; flags feed the b.ne below
+        st1             {v4.s}[2], [x0], x7             // store halfwords 4..5, step to next dst row
+        b.ne            1b
+        ret
+endfunc
+
+// Widen an 8-sample-wide block of 8-bit samples to 16 bits, scaled by 64 (<< 6).
+// Registers as used below (the C prototype is not visible in this file --
+// presumably the HEVC put_hevc_pel callback; confirm against
+// hevcdsp_init_aarch64.c):
+//   x0  destination pointer, advanced by MAX_PB_SIZE * 2 bytes per row
+//   x1  source pointer, advanced by x2 bytes per row
+//   x2  source row stride in bytes
+//   w3  row count; must be non-zero on entry because the loop only tests it
+//       after the first row has been processed
+// Modifies x0, x1, w3, x7, v0, v4 and the condition flags.
+function ff_hevc_put_hevc_pel_pixels8_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)          // destination row stride in bytes
+1:      ld1             {v0.8b}, [x1], x2               // load 8 source bytes, step to next source row
+        ushll           v4.8h, v0.8b, #6                // zero-extend bytes to 16 bit and shift left by 6
+        subs            w3, w3, #1                      // one row done; flags feed the b.ne below
+        st1             {v4.8h}, [x0], x7               // store 8 halfwords, step to next dst row
+        b.ne            1b
+        ret
+endfunc
+
+// Widen a 12-sample-wide block of 8-bit samples to 16 bits, scaled by 64 (<< 6).
+// Registers as used below (the C prototype is not visible in this file --
+// presumably the HEVC put_hevc_pel callback; confirm against
+// hevcdsp_init_aarch64.c):
+//   x0  destination pointer, advanced by MAX_PB_SIZE * 2 bytes per row in total
+//   x1  source pointer, advanced by x2 bytes per row
+//   x2  source row stride in bytes
+//   w3  row count; must be non-zero on entry because the loop only tests it
+//       after the first row has been processed
+// Each row loads 16 source bytes although only 12 results are stored, i.e.
+// the load touches 4 bytes beyond the 12 samples that are used.
+// Modifies x0, x1, w3, x7, v0, v1, v4, v5 and the condition flags.
+function ff_hevc_put_hevc_pel_pixels12_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2 - 16)     // row stride minus the 16 bytes added by the first store
+1:      ld1             {v0.8b, v1.8b}, [x1], x2        // load 16 source bytes, step to next source row
+        ushll           v4.8h, v0.8b, #6                // samples 0..7: zero-extend and shift left by 6
+        st1             {v4.8h}, [x0], #16              // store halfwords 0..7
+        ushll           v5.8h, v1.8b, #6                // samples 8..15: zero-extend and shift left by 6
+        subs            w3, w3, #1                      // one row done; flags feed the b.ne below
+        st1             {v5.d}[0], [x0], x7             // store halfwords 8..11, step to next dst row
+        b.ne            1b
+        ret
+endfunc
+
+// Widen a 16-sample-wide block of 8-bit samples to 16 bits, scaled by 64 (<< 6).
+// Registers as used below (the C prototype is not visible in this file --
+// presumably the HEVC put_hevc_pel callback; confirm against
+// hevcdsp_init_aarch64.c):
+//   x0  destination pointer, advanced by MAX_PB_SIZE * 2 bytes per row
+//   x1  source pointer, advanced by x2 bytes per row
+//   x2  source row stride in bytes
+//   w3  row count; must be non-zero on entry because the loop only tests it
+//       after the first row has been processed
+// Modifies x0, x1, w3, x7, v0, v1, v4, v5 and the condition flags.
+function ff_hevc_put_hevc_pel_pixels16_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)          // destination row stride in bytes
+1:      ld1             {v0.8b, v1.8b}, [x1], x2        // load 16 source bytes, step to next source row
+        ushll           v4.8h, v0.8b, #6                // samples 0..7: zero-extend and shift left by 6
+        ushll           v5.8h, v1.8b, #6                // samples 8..15: zero-extend and shift left by 6
+        subs            w3, w3, #1                      // one row done; flags feed the b.ne below
+        st1             {v4.8h, v5.8h}, [x0], x7        // store 16 halfwords, step to next dst row
+        b.ne            1b
+        ret
+endfunc
+
+// Widen a 24-sample-wide block of 8-bit samples to 16 bits, scaled by 64 (<< 6).
+// Registers as used below (the C prototype is not visible in this file --
+// presumably the HEVC put_hevc_pel callback; confirm against
+// hevcdsp_init_aarch64.c):
+//   x0  destination pointer, advanced by MAX_PB_SIZE * 2 bytes per row
+//   x1  source pointer, advanced by x2 bytes per row
+//   x2  source row stride in bytes
+//   w3  row count; must be non-zero on entry because the loop only tests it
+//       after the first row has been processed
+// Modifies x0, x1, w3, x7, v0-v2, v4-v6 and the condition flags.
+function ff_hevc_put_hevc_pel_pixels24_8_neon, export=1
+        mov             x7, #(MAX_PB_SIZE * 2)          // destination row stride in bytes
+1:      ld1             {v0.8b-v2.8b}, [x1], x2         // load 24 source bytes, step to next source row
+        ushll           v4.8h, v0.8b, #6                // samples 0..7: zero-extend and shift left by 6
+        ushll           v5.8h, v1.8b, #6                // samples 8..15: zero-extend and shift left by 6
+        ushll           v6.8h, v2.8b, #6                // samples 16..23: zero-extend and shift left by 6
+        subs            w3, w3, #1                      // one row done; flags feed the b.ne below
+        st1             {v4.8h-v6.8h}, [x0], x7         // store 24 halfwords, step to next dst row
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_pixels32_8_neon, export=1
+mov x7, #(MAX_PB_SIZE * 2)
+1:  ld1{v0.8b-v3.8b}, [x1], x2
+ushll   v4.8h, v0.8b, #6
+ushll   v5.8h, v1.8b, #6
+ushll   v6.8h, v2.8b, #6
+ushll   v7.8h, v3.8b, #6
+subsw3, w3, #1
+