On Thu, 23 Jun 2022, J. Dekker wrote:

Signed-off-by: J. Dekker <j...@itanimul.li>
---
libavcodec/aarch64/Makefile               |   3 +-
libavcodec/aarch64/hevcdsp_deblock_neon.S | 168 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c |  14 ++
3 files changed, 184 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/hevcdsp_deblock_neon.S

Passes FATE. I never completed the checkasm test for loop_filter; I am
working on that currently, alongside the luma loop filter. This asm could
also go into hevcdsp_sao_neon.S if you would prefer not to create an extra
file for it.

Having a separate file for deblocking is totally fine - I'd much rather have that than lump unrelated pieces together.

It'd be great to have a checkasm test for it before it goes in - see the existing checkasm tests for h264/vp8/vp9 for examples of how to do that.

diff --git a/libavcodec/aarch64/hevcdsp_deblock_neon.S b/libavcodec/aarch64/hevcdsp_deblock_neon.S
new file mode 100644
index 0000000000..d21ad0a54f
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_deblock_neon.S
@@ -0,0 +1,168 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomp...@vtt.fi>
+ * Copyright (c) 2022 J. Dekker <j...@itanimul.li>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#include "libavutil/aarch64/asm.S"
+#include "neon.S"
+
+// Shared prologue of the chroma loop filters.
+// Reads two 32-bit values from [x2] and [x2, #4] (presumably the tc[]
+// argument - confirm against the C prototype), scales them for
+// bitdepth > 8 and branches to local label 1, which the including function
+// must provide, when their sum is zero.
+// On fall-through:
+//   v16 = first value in the low four .8h lanes, second in the high four
+//   v17 = -v16
+//   v18 = 0                (bitdepth > 8 only)
+//   v19 = 0x03FF / 0x0FFF  (bitdepth 10 / 12 only)
+// Clobbers w14, w15, flags and w2 (so x2 no longer holds the pointer).
+.macro hevc_loop_filter_chroma_start bitdepth
+        ldr             w14, [x2]
+        ldr             w15, [x2, #4]
+.if \bitdepth > 8
+        lsl             w14, w14, #(\bitdepth - 8)
+        lsl             w15, w15, #(\bitdepth - 8)
+.endif
+        adds            w2, w14, w15 // sets Z when the two values sum to zero
+        b.eq            1f // nothing to filter: skip to the function's exit label
+        dup             v16.4h, w14
+        dup             v17.4h, w15
+        trn1            v16.2d, v16.2d, v17.2d // v16 = {w14 x4, w15 x4}
+.if \bitdepth == 10
+        mvni            v19.8h, #0xFC, lsl #8 // movi #0x03FF
+.endif
+.if \bitdepth == 12

Nit; this can be .elif

+        mvni            v19.8h, #0xF0, lsl #8 // movi #0x0FFF
+.endif
+.if \bitdepth > 8
+        movi            v18.8h, #0
+.endif
+        neg             v17.8h, v16.8h // v17 = negated limits
+.endm
+
+// Chroma deblocking arithmetic shared by the h and v filters.
+// In:  v0 = p1, v1 = p0, v2 = q0, v3 = q1 (.8b for bitdepth <= 8, .8h
+//      otherwise); v16 / v17 = upper / lower limit for delta;
+//      for bitdepth > 8 also v18 / v19 = lower / upper limit for the result.
+// Out: v1 = p0 + delta, v2 = q0 - delta, in the same element size as the
+//      inputs.
+// Clobbers v5 and v6. For bitdepth <= 8, v0 and v3 are left widened to .8h;
+// only v1 and v2 are narrowed again.
+.macro hevc_loop_filter_chroma_body bitdepth
+.if \bitdepth <= 8
+        uxtl            v0.8h, v0.8b // p1
+        uxtl            v1.8h, v1.8b // p0
+        uxtl            v2.8h, v2.8b // q0
+        uxtl            v3.8h, v3.8b // q1
+.endif
+        sub             v5.8h, v2.8h, v1.8h // q0 - p0
+        sub             v6.8h, v0.8h, v3.8h // p1 - q1
+        shl             v5.8h, v5.8h, #2 // 4 * (q0 - p0)
+        add             v5.8h, v6.8h, v5.8h // 4 * (q0 - p0) + (p1 - q1)
+        srshr           v5.8h, v5.8h, #3 // rounding shift: (x + 4) >> 3
+        smin            v5.8h, v5.8h, v16.8h // delta = min(delta, v16)
+        smax            v5.8h, v5.8h, v17.8h // delta = max(delta, v17)
+        sqadd           v1.8h, v1.8h, v5.8h // p0 + delta
+        sqsub           v2.8h, v2.8h, v5.8h // q0 - delta
+.if \bitdepth <= 8
+        sqxtun          v1.8b, v1.8h // narrow with unsigned saturation
+        sqxtun          v2.8b, v2.8h
+.else
+        smin            v1.8h, v1.8h, v19.8h // clamp to [v18, v19]
+        smin            v2.8h, v2.8h, v19.8h
+        smax            v1.8h, v1.8h, v18.8h
+        smax            v2.8h, v2.8h, v18.8h
+.endif
+.endm
+
+// Emits ff_hevc_h_loop_filter_chroma_<bitdepth>_neon.
+// As used here: x0 = pointer to the first row below the edge (q0),
+// x1 = byte stride between rows, x2 = pointer consumed by
+// hevc_loop_filter_chroma_start.
+// Loads the four rows p1, p0, q0, q1 (8 elements each), runs the shared
+// filter body and writes back only p0 and q0.
+.macro hevc_h_loop_filter_chroma bitdepth
+function ff_hevc_h_loop_filter_chroma_\bitdepth\()_neon, export=1
+        hevc_loop_filter_chroma_start \bitdepth
+        sub             x0, x0, x1, lsl #1 // x0 -> p1 row
+.if \bitdepth > 8
+        ld1             {v0.8h}, [x0], x1
+        ld1             {v1.8h}, [x0], x1
+        ld1             {v2.8h}, [x0], x1
+        ld1             {v3.8h}, [x0]
+.else
+        ld1             {v0.8b}, [x0], x1
+        ld1             {v1.8b}, [x0], x1
+        ld1             {v2.8b}, [x0], x1
+        ld1             {v3.8b}, [x0] // 8 bytes only; a .8h load would read 8 bytes past the row
+.endif
+        hevc_loop_filter_chroma_body \bitdepth
+        sub             x0, x0, x1, lsl #1 // x0 is at q1 here; step back to the p0 row
+.if \bitdepth > 8
+        st1             {v1.8h}, [x0], x1
+        st1             {v2.8h}, [x0]
+.else
+        st1             {v1.8b}, [x0], x1
+        st1             {v2.8b}, [x0]
+.endif
+1:      ret // also the early-exit target of hevc_loop_filter_chroma_start
+endfunc
+.endm
+
+// Emits ff_hevc_v_loop_filter_chroma_<bitdepth>_neon.
+// As used here: x0 = pointer to the first element right of the edge (q0) in
+// the first row, x1 = byte stride between rows, x2 = pointer consumed by
+// hevc_loop_filter_chroma_start.
+// Loads 8 rows of 8 elements starting 4 elements left of x0 and transposes
+// them so that each column becomes one vector (v0..v3 = p1, p0, q0, q1 as
+// expected by the filter body). After filtering, the block is transposed
+// back and all 8 rows are stored in full.
+// NOTE(review): transpose_8x8B/H come from neon.S; the trailing v24, v25
+// arguments are presumably scratch registers - confirm there.
+.macro hevc_v_loop_filter_chroma bitdepth
+function ff_hevc_v_loop_filter_chroma_\bitdepth\()_neon, export=1
+        hevc_loop_filter_chroma_start \bitdepth
+.if \bitdepth > 8
+        sub             x0, x0, #8 // back up 4 halfword elements
+        ld1             {v20.8h}, [x0], x1
+        ld1             {v21.8h}, [x0], x1
+        ld1             {v0.8h},  [x0], x1
+        ld1             {v1.8h},  [x0], x1
+        ld1             {v2.8h},  [x0], x1
+        ld1             {v3.8h},  [x0], x1
+        ld1             {v22.8h}, [x0], x1
+        ld1             {v23.8h}, [x0], x1
+        transpose_8x8H  v20, v21, v0, v1, v2, v3, v22, v23, v24, v25
+.else
+        sub             x0, x0, #4 // back up 4 byte elements
+        ld1             {v20.8b}, [x0], x1
+        ld1             {v21.8b}, [x0], x1
+        ld1             {v0.8b},  [x0], x1
+        ld1             {v1.8b},  [x0], x1
+        ld1             {v2.8b},  [x0], x1
+        ld1             {v3.8b},  [x0], x1
+        ld1             {v22.8b}, [x0], x1
+        ld1             {v23.8b}, [x0], x1
+        transpose_8x8B  v20, v21, v0, v1, v2, v3, v22, v23, v24, v25
+.endif
+        hevc_loop_filter_chroma_body \bitdepth
+        sub             x0, x0, x1, lsl #3 // rewind the 8 post-incremented rows
+.if \bitdepth > 8
+        transpose_8x8H  v20, v21, v0, v1, v2, v3, v22, v23, v24, v25
+        st1             {v20.8h}, [x0], x1
+        st1             {v21.8h}, [x0], x1
+        st1             {v0.8h},  [x0], x1
+        st1             {v1.8h},  [x0], x1
+        st1             {v2.8h},  [x0], x1
+        st1             {v3.8h},  [x0], x1
+        st1             {v22.8h}, [x0], x1
+        st1             {v23.8h}, [x0]
+.else
+        xtn             v0.8b, v0.8h // restore
+        xtn             v3.8b, v3.8h // the body widened v0/v3 but only narrows v1/v2
+        transpose_8x8B  v20, v21, v0, v1, v2, v3, v22, v23, v24, v25
+        st1             {v20.8b}, [x0], x1
+        st1             {v21.8b}, [x0], x1
+        st1             {v0.8b},  [x0], x1
+        st1             {v1.8b},  [x0], x1
+        st1             {v2.8b},  [x0], x1
+        st1             {v3.8b},  [x0], x1
+        st1             {v22.8b}, [x0], x1
+        st1             {v23.8b}, [x0]
+.endif
+1:      ret // also the early-exit target of hevc_loop_filter_chroma_start
+endfunc
+.endm
+
+// Instantiate the horizontal-edge filters for every supported bit depth.
+.irp depth, 8, 10, 12
+        hevc_h_loop_filter_chroma \depth
+.endr
+
+// Instantiate the vertical-edge filters for every supported bit depth.
+.irp depth, 8, 10, 12
+        hevc_v_loop_filter_chroma \depth
+.endr

Just like I mentioned for add_res; the 10/12 versions are almost identical except for setting up some registers at the start. It'd be good if we can share implementation between them, after the initial setup where things differ.

Self-review for this patch since I just thought of some things I could change: Stores/loads can be interleaved in both filters

I presume you mean using two alternating registers for loading/storing - yes, I was about to suggest that too.

and for the vertical filter maybe I don't need to write the full 8b/8h.

Yep, that could maybe be useful.

Other than that, maybe some slight scheduling improvements?

I don't really see much to improve on scheduling wise here. Most of the body is a sequential operation where each operation depends on the previous one, so there's not really much you can do there - unless you could operate on two blocks at the same time. But maybe for the luma case, you have more pixels to operate on?

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to