Hi Martin,

I have finished the modifications; please review again.

Thanks.


On 2023/5/26 16:34, Martin Storsjö wrote:
Hi,

Overall these patches seem mostly ok, but I've got a few minor points to make:

- The usdot instruction requires the i8mm extension (part of armv8.6-a), while udot or sdot would require the dotprod extension (available in armv8.4-a). If you could manage with udot or sdot, these functions would be usable on a wider set of CPUs.

Therefore, the current guards are wrong. Also, I finally got support implemented for optionally using these CPU extensions even if the compile-time baseline doesn't include them, by enabling them at runtime. See the patchset at https://patchwork.ffmpeg.org/project/ffmpeg/list/?series=9009.

To adapt your patches on top of this, see the two topmost commits at https://github.com/mstorsjo/ffmpeg/commits/archext.
Fixed.
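
For context on why the code keeps usdot (now behind the i8mm guards): the source pixels are unsigned bytes while the qpel coefficients are signed, which is exactly the mixed-signedness case usdot covers; with udot or sdot one operand would first have to be rebiased and the result compensated. A minimal intrinsics sketch of the operation, purely illustrative and not part of the patch:

#include <arm_neon.h>

/* One usdot accumulation step, conceptually what the qpel code does per group
 * of four taps; requires the i8mm extension (build with e.g. -march=armv8.6-a
 * or -march=armv8.2-a+i8mm). */
static inline int32x4_t qpel_dot4(uint8x16_t pixels, int8x16_t coeffs)
{
    int32x4_t acc = vdupq_n_s32(0);
    /* acc[i] += pixels[4*i + 0..3] * coeffs[4*i + 0..3], summed per 32-bit lane */
    return vusdotq_s32(acc, pixels, coeffs);
}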
- The indentation is inconsistent; in the first patch, you have some instructions written like this:

+        sqadd   v1.4s, v1.4s, v29.4s

While you later use this style:

+        dup             v1.16b, v28.b[1]

The latter seems to match the style we commonly use; please reformat your code to match that consistently.

With some macro invocations in the first patch, you also seem to have too much indentation in some places. See e.g. this:

+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B     v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2    v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f

(If the macro name is too long, that's ok, but here there's no need to have those lines unaligned.)
Fixed.

- In the third patch, you load multiple parameters from the stack like this:

+        ldp             x14, x15, [sp]          // mx, my
+        ldr             w13, [sp, #16]          // width

I see that the mx and my parameters are intptr_t; that's good, since if they were 32-bit integers, the ABI for such parameters on the stack would differ between macOS/Darwin and Linux. As long as they're intptr_t, they behave the same.
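
For reference, this is the prototype behind those loads (declared via NEON8_FNPROTO_PARTIAL_5 in the third patch), annotated with the stack layout as I understand the two ABIs; the offsets are why [sp], [sp, #8] and [sp, #16] work unchanged on both:

#include <stddef.h>
#include <stdint.h>

void ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_i8mm(
        uint8_t *_dst, ptrdiff_t _dststride,        /* x0, x1 */
        const uint8_t *_src, ptrdiff_t _srcstride,  /* x2, x3 */
        int height, int denom, int wx, int ox,      /* w4..w7 */
        intptr_t mx,    /* [sp]      on Linux and macOS alike             */
        intptr_t my,    /* [sp, #8]  on both                              */
        int width);     /* [sp, #16] on both; with 32-bit mx/my, Darwin   */
                        /* would pack them at sp/sp+4 and width at sp+8   */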

- At the same place, you're backing up a bunch of registers:

+        stp             x20, x21, [sp, #-16]!
+        stp             x22, x23, [sp, #-16]!
+        stp             x24, x25, [sp, #-16]!
+        stp             x26, x27, [sp, #-16]!
+        stp             x28, x30, [sp, #-16]!

This is inefficient; instead, do this:

+        stp             x28, x30, [sp, #-80]!
+        stp             x20, x21, [sp, #16]
+        stp             x22, x23, [sp, #32]
+        stp             x24, x25, [sp, #48]
+        stp             x26, x27, [sp, #64]

Also, following that, I see that you back up the stack pointer in x28. Why use x28 for that? Using x29 as the frame pointer would be customary.
I'm using the more efficient implementation now. x28 is just a common callee-saved register in this case, but I've switched to x19.

Aside from that, I think the rest of the patches looks acceptable.

// Martin

From 224e1b907b9273f6fecaef007730bd1168493515 Mon Sep 17 00:00:00 2001
From: myais <logan....@myais.com.cn>
Date: Fri, 5 May 2023 22:06:22 +0800
Subject: [PATCH 2/3] lavc/aarch64: new optimization for 8-bit
 hevc_qpel_uni_w_h

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  15 +-
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 434 ++++++++++++++++++++++
 2 files changed, 448 insertions(+), 1 deletion(-)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index 6b5341dd45..a7e62c7d15 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -145,6 +145,7 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
@@ -155,6 +156,12 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
+
+
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
         member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
@@ -174,9 +181,11 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
         member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
 
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
-    if (!have_neon(av_get_cpu_flags())) return;
+    int cpu_flags = av_get_cpu_flags();
+    if (!have_neon(cpu_flags)) return;
 
     if (bit_depth == 8) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_8_neon;
@@ -236,6 +245,10 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
         NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
 
+        if (have_i8mm(cpu_flags)) {
+            NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+        }
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 51df52e1ea..8e8b88c9ea 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -1192,3 +1192,437 @@ function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
         b.hi            3b
         ret
 endfunc
+
+#if HAVE_I8MM
+.macro QPEL_UNI_W_H_HEADER
+        ldr             x12, [sp]
+        sub             x2, x2, #3
+        movrel          x9, qpel_filters
+        add             x9, x9, x12, lsl #3
+        ldr             x11, [x9]
+        dup             v28.2d, x11
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.4s, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_h4_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b
+        usdot           v17.4s, v2.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s
+        mul             v16.4s, v16.4s, v30.4s
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqxtn           v16.4h, v16.4s
+        sqxtun          v16.8b, v16.8h
+        str             s16, [x0]
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h6_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x1, x1, #4
+1:
+        ld1             {v0.16b}, [x2], x3
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v0.16b, v28.16b
+        usdot           v17.4s, v2.16b, v28.16b
+        usdot           v18.4s, v4.16b, v28.16b
+        addp            v16.4s, v16.4s, v17.4s
+        addp            v18.4s, v18.4s, v18.4s
+        mul             v16.4s, v16.4s, v30.4s
+        mul             v18.2s, v18.2s, v30.2s
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqrshl          v18.2s, v18.2s, v31.2s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqadd           v18.2s, v18.2s, v29.2s
+        sqxtn           v16.4h, v16.4s
+        sqxtn2          v16.8h, v18.4s
+        sqxtun          v16.8b, v16.8h
+        str             s16, [x0], #4
+        st1             {v16.h}[2], [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+.macro  QPEL_UNI_W_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        movi            \d2\().2d, #0
+        movi            \d3\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        usdot           \d2\().4s, \s2\().16b, v28.16b
+        usdot           \d3\().4s, \s3\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s
+        addp            \d2\().4s, \d2\().4s, \d3\().4s
+        mul             \d0\().4s, \d0\().4s, v30.4s
+        mul             \d2\().4s, \d2\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+.endm
+
+.macro  QPEL_UNI_W_H_CALC_HALF s0, s1, d0, d1
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v28.16b
+        usdot           \d1\().4s, \s1\().16b, v28.16b
+        addp            \d0\().4s, \d0\().4s, \d1\().4s
+        mul             \d0\().4s, \d0\().4s, v30.4s
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+.endm
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h8_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v0.2d, v16.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC  v0, v2, v4, v6,  v18, v19, v20, v21
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v18.8b, v18.8h
+        str             d18, [x0]
+        add             x0, x0, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h12_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+        add             x13, x0, #8
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v18.2d, v16.2d, v1.2d
+        zip1            v19.2d, v2.2d, v3.2d
+        zip1            v20.2d, v4.2d, v5.2d
+        zip1            v21.2d, v6.2d, v7.2d
+        zip2            v22.2d, v16.2d, v1.2d
+        zip2            v23.2d, v2.2d, v3.2d
+        QPEL_UNI_W_H_CALC  v18, v19, v20, v21, v0, v2, v4, v6
+        QPEL_UNI_W_H_CALC_HALF v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v4.4s
+        sqxtn           v1.4h, v24.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+
+        str             d0, [x0]
+        str             s1, [x13]
+        add             x0, x0, x1
+        add             x13, x13, x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h16_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21   // v18: 0, 8, 2, 10 v20: 1, 9, 3, 11
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25    // v22: 4, 12, 6, 14 v24: 5, 13, 7, 15
+        sqxtn           v0.4h, v18.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v1.4h, v20.4s
+        sqxtn2          v1.8h, v24.4s
+        trn1            v2.8h, v0.8h, v1.8h
+        trn2            v3.8h, v0.8h, v1.8h
+        sqxtun          v0.8b, v2.8h
+        sqxtun2         v0.16b, v3.8h
+        st1             {v0.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h24_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x1, x1, #16
+1:
+        ld1             {v16.16b, v17.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v18, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v18.8h, v19.8h
+        trn2            v21.8h, v18.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h                         // 0-15
+        ext             v1.16b, v17.16b, v17.16b, #1
+        ext             v2.16b, v17.16b, v17.16b, #2
+        ext             v3.16b, v17.16b, v17.16b, #3
+        ext             v4.16b, v17.16b, v17.16b, #4
+        ext             v5.16b, v17.16b, v17.16b, #5
+        ext             v6.16b, v17.16b, v17.16b, #6
+        ext             v7.16b, v17.16b, v17.16b, #7
+        zip1            v0.2d, v17.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_UNI_W_H_CALC  v0, v2, v4, v6, v18, v19, v20, v21
+        sqxtn           v18.4h, v18.4s
+        sqxtn2          v18.8h, v20.4s
+        sqxtun          v27.8b, v18.8h
+
+        st1             {v26.16b}, [x0], #16
+        st1             {v27.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h32_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b, v18.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v26.8b, v20.8h
+        sqxtun2         v26.16b, v21.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v0, v19, v20, v21
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v23, v24, v25
+        sqxtn           v0.4h, v0.4s
+        sqxtn2          v0.8h, v22.4s
+        sqxtn           v19.4h, v20.4s
+        sqxtn2          v19.8h, v24.4s
+        trn1            v20.8h, v0.8h, v19.8h
+        trn2            v21.8h, v0.8h, v19.8h
+        sqxtun          v27.8b, v20.8h
+        sqxtun2         v27.16b, v21.8h                         // 16-31
+        st1             {v26.16b, v27.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_h48_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], x3
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v25.8b, v22.8h
+        sqxtun2         v25.16b, v23.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v26.8b, v22.8h
+        sqxtun2         v26.16b, v23.8h                         // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v27.8b, v22.8h
+        sqxtun2         v27.16b, v23.8h                         // 32-47
+        st1             {v25.16b, v26.16b, v27.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+
+
+function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
+        QPEL_UNI_W_H_HEADER
+        sub             x3, x3, #64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2], #64
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_UNI_W_H_CALC  v16, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v16.8b, v22.8h
+        sqxtun2         v16.16b, v23.8h                         // 0-15
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_UNI_W_H_CALC  v17, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v17.8b, v22.8h
+        sqxtun2         v17.16b, v23.8h                         // 16-31
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_UNI_W_H_CALC  v18, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        ld1             {v0.16b}, [x2], x3
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v18.8b, v22.8h
+        sqxtun2         v18.16b, v23.8h                         // 32-47
+        ext             v1.16b, v19.16b, v0.16b, #1
+        ext             v2.16b, v19.16b, v0.16b, #2
+        ext             v3.16b, v19.16b, v0.16b, #3
+        ext             v4.16b, v19.16b, v0.16b, #4
+        ext             v5.16b, v19.16b, v0.16b, #5
+        ext             v6.16b, v19.16b, v0.16b, #6
+        ext             v7.16b, v19.16b, v0.16b, #7
+        QPEL_UNI_W_H_CALC  v19, v2, v1, v3, v20, v24, v21, v0
+        QPEL_UNI_W_H_CALC  v4, v6, v5, v7, v22, v24, v23, v0
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        sqxtn           v21.4h, v21.4s
+        sqxtn2          v21.8h, v23.4s
+        trn1            v22.8h, v20.8h, v21.8h
+        trn2            v23.8h, v20.8h, v21.8h
+        sqxtun          v19.8b, v22.8h
+        sqxtun2         v19.16b, v23.8h                         // 48-63
+
+        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.hi            1b
+        ret
+endfunc
+
+#endif // HAVE_I8MM
-- 
2.38.0.windows.1
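
(For reviewers who want to cross-check the arithmetic in the functions above: in scalar terms the 8-bit qpel_uni_w_h core computes roughly the following. This is a sketch modelled on the generic C path, with illustrative names, not code from the patch.)

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* 8-tap horizontal filter followed by weighted uni prediction:
 * scale by wx, rounding shift by (denom + 6), add ox, clip to 8 bits. */
static void qpel_uni_w_h_ref(uint8_t *dst, ptrdiff_t dststride,
                             const uint8_t *src, ptrdiff_t srcstride,
                             int height, int denom, int wx, int ox,
                             const int8_t filter[8], int width)
{
    const int shift  = denom + 6;        /* 14 - BIT_DEPTH + denom, BIT_DEPTH = 8 */
    const int offset = 1 << (shift - 1); /* rounding, what sqrshl provides in the asm */

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int sum = 0;
            for (int k = 0; k < 8; k++)
                sum += filter[k] * src[x + k - 3];
            dst[x] = clip_u8(((sum * wx + offset) >> shift) + ox);
        }
        src += srcstride;
        dst += dststride;
    }
}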

From 6217b6ab771a8e3a04c3b9818141d3ffaf024641 Mon Sep 17 00:00:00 2001
From: myais <logan....@myais.com.cn>
Date: Sun, 28 May 2023 09:56:51 +0800
Subject: [PATCH 3/3] lavc/aarch64: new optimization for 8-bit hevc_qpel_h
 hevc_qpel_uni_w_hv

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   24 +
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 1080 +++++++++++++++++++++
 2 files changed, 1104 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index a7e62c7d15..483a9d5253 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -145,6 +145,13 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
     void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
     void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
 
+#define NEON8_FNPROTO_PARTIAL_5(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
 
 NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
@@ -156,11 +163,20 @@ NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width),);
 
+
+NEON8_FNPROTO(qpel_h, (int16_t *dst,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, intptr_t mx, intptr_t my, int width), _i8mm);
+
 NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         const uint8_t *_src, ptrdiff_t _srcstride,
         int height, int denom, int wx, int ox,
         intptr_t mx, intptr_t my, int width), _i8mm);
 
+NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width), _i8mm);
 
 #define NEON8_FNASSIGN(member, v, h, fn, ext) \
         member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
@@ -181,6 +197,12 @@ NEON8_FNPROTO(qpel_uni_w_h, (uint8_t *_dst,  ptrdiff_t _dststride,
         member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
         member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
 
+#define NEON8_FNASSIGN_PARTIAL_5(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
 
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
@@ -247,6 +269,8 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 
         if (have_i8mm(cpu_flags)) {
             NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm);
+            NEON8_FNASSIGN(c->put_hevc_qpel, 0, 1, qpel_h, _i8mm);
+            NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm);
         }
 
     }
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 8e8b88c9ea..3b62b3c752 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -1625,4 +1625,1084 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_neon_i8mm, export=1
         ret
 endfunc
 
+.macro QPEL_H_HEADER
+        movrel          x9, qpel_filters
+        add             x9, x9, x4, lsl #3
+        ldr             x11, [x9]
+        dup             v31.2d, x11
+        sub             x1, x1, #3
+.endm
+
+function ff_hevc_put_hevc_qpel_h4_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s
+        sqxtn           v16.4h, v16.4s
+        str             d16, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h6_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #8
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        usdot           v18.4s, v4.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s
+        addp            v18.4s, v18.4s, v18.4s
+        sqxtn           v16.4h, v16.4s
+        sqxtn           v18.4h, v18.4s
+        str             d16, [x0]
+        str             s18, [x15]
+        add             x0, x0, x10
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h8_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v0.16b}, [x1], x2
+        ext             v1.16b, v0.16b, v0.16b, #1
+        ext             v2.16b, v0.16b, v0.16b, #2
+        ext             v3.16b, v0.16b, v0.16b, #3
+        ext             v4.16b, v0.16b, v0.16b, #4
+        ext             v5.16b, v0.16b, v0.16b, #5
+        ext             v6.16b, v0.16b, v0.16b, #6
+        ext             v7.16b, v0.16b, v0.16b, #7
+        zip1            v0.2d, v0.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        movi            v16.2d, #0
+        movi            v17.2d, #0
+        movi            v18.2d, #0
+        movi            v19.2d, #0
+        usdot           v16.4s, v0.16b, v31.16b
+        usdot           v17.4s, v2.16b, v31.16b
+        usdot           v18.4s, v4.16b, v31.16b
+        usdot           v19.4s, v6.16b, v31.16b
+        addp            v16.4s, v16.4s, v17.4s
+        addp            v18.4s, v18.4s, v19.4s
+        sqxtn           v16.4h, v16.4s
+        sqxtn2          v16.8h, v18.4s
+        str             q16, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+.macro QPEL_H_CALC s0, s1, s2, s3, d0, d1, d2, d3
+        movi            \d0\().2d, #0
+        movi            \d1\().2d, #0
+        movi            \d2\().2d, #0
+        movi            \d3\().2d, #0
+        usdot           \d0\().4s, \s0\().16b, v31.16b
+        usdot           \d1\().4s, \s1\().16b, v31.16b
+        usdot           \d2\().4s, \s2\().16b, v31.16b
+        usdot           \d3\().4s, \s3\().16b, v31.16b
+.endm
+
+function ff_hevc_put_hevc_qpel_h12_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #16
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        zip1            v18.2d, v4.2d, v5.2d
+        zip1            v19.2d, v6.2d, v7.2d
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        movi            v24.2d, #0
+        movi            v25.2d, #0
+        usdot           v24.4s, v18.16b, v31.16b
+        usdot           v25.4s, v19.16b, v31.16b
+        addp            v24.4s, v24.4s, v25.4s
+        trn1            v26.4s, v20.4s, v21.4s
+        trn2            v27.4s, v20.4s, v21.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn           v27.4h, v27.4s
+        sqxtn2          v26.8h, v24.4s
+
+        str             q26, [x0]
+        str             d27, [x15]
+        add             x0, x0, x10
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h16_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+
+        sqxtn           v18.4h, v22.4s
+        sqxtn2          v18.8h, v26.4s
+        sqxtn           v19.4h, v23.4s
+        sqxtn2          v19.8h, v27.4s
+
+        stp             q18, q19, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h24_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #32
+1:
+        ld1             {v16.16b, v17.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v18.4h, v22.4s
+        sqxtn2          v18.8h, v26.4s
+        sqxtn           v19.4h, v23.4s
+        sqxtn2          v19.8h, v27.4s
+        stp             q18, q19, [x0]
+        add             x0, x0, x10
+        ext             v1.16b, v17.16b, v17.16b, #1
+        ext             v2.16b, v17.16b, v17.16b, #2
+        ext             v3.16b, v17.16b, v17.16b, #3
+        ext             v4.16b, v17.16b, v17.16b, #4
+        ext             v5.16b, v17.16b, v17.16b, #5
+        ext             v6.16b, v17.16b, v17.16b, #6
+        ext             v7.16b, v17.16b, v17.16b, #7
+        zip1            v0.2d, v17.2d, v1.2d
+        zip1            v2.2d, v2.2d, v3.2d
+        zip1            v4.2d, v4.2d, v5.2d
+        zip1            v6.2d, v6.2d, v7.2d
+        QPEL_H_CALC     v0, v2, v4, v6, v20, v21, v22, v23
+        addp            v20.4s, v20.4s, v21.4s
+        addp            v22.4s, v22.4s, v23.4s
+        sqxtn           v20.4h, v20.4s
+        sqxtn2          v20.8h, v22.4s
+        str             q20, [x15]
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h32_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2
+        add             x15, x0, #32
+1:
+        ld1             {v16.16b, v17.16b, v18.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0]
+        add             x0, x0, x10
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x15]
+        add             x15, x15, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h48_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        mov             x10, #MAX_PB_SIZE * 2 - 64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], x2
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0]
+        add             x0, x0, x10
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_h64_8_neon_i8mm, export=1
+        QPEL_H_HEADER
+        sub             x2, x2, #64
+1:
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64
+        ext             v1.16b, v16.16b, v17.16b, #1
+        ext             v2.16b, v16.16b, v17.16b, #2
+        ext             v3.16b, v16.16b, v17.16b, #3
+        ext             v4.16b, v16.16b, v17.16b, #4
+        ext             v5.16b, v16.16b, v17.16b, #5
+        ext             v6.16b, v16.16b, v17.16b, #6
+        ext             v7.16b, v16.16b, v17.16b, #7
+        QPEL_H_CALC     v16, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+
+        ext             v1.16b, v17.16b, v18.16b, #1
+        ext             v2.16b, v17.16b, v18.16b, #2
+        ext             v3.16b, v17.16b, v18.16b, #3
+        ext             v4.16b, v17.16b, v18.16b, #4
+        ext             v5.16b, v17.16b, v18.16b, #5
+        ext             v6.16b, v17.16b, v18.16b, #6
+        ext             v7.16b, v17.16b, v18.16b, #7
+        QPEL_H_CALC     v17, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        ext             v1.16b, v18.16b, v19.16b, #1
+        ext             v2.16b, v18.16b, v19.16b, #2
+        ext             v3.16b, v18.16b, v19.16b, #3
+        ext             v4.16b, v18.16b, v19.16b, #4
+        ext             v5.16b, v18.16b, v19.16b, #5
+        ext             v6.16b, v18.16b, v19.16b, #6
+        ext             v7.16b, v18.16b, v19.16b, #7
+        QPEL_H_CALC     v18, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        ld1             {v28.8b}, [x1], x2
+        ext             v1.16b, v19.16b, v28.16b, #1
+        ext             v2.16b, v19.16b, v28.16b, #2
+        ext             v3.16b, v19.16b, v28.16b, #3
+        ext             v4.16b, v19.16b, v28.16b, #4
+        ext             v5.16b, v19.16b, v28.16b, #5
+        ext             v6.16b, v19.16b, v28.16b, #6
+        ext             v7.16b, v19.16b, v28.16b, #7
+        QPEL_H_CALC     v19, v1, v2, v3, v20, v21, v22, v23
+        QPEL_H_CALC     v4, v5, v6, v7, v24, v25, v26, v27
+        addp            v20.4s, v20.4s, v22.4s
+        addp            v21.4s, v21.4s, v23.4s
+        addp            v24.4s, v24.4s, v26.4s
+        addp            v25.4s, v25.4s, v27.4s
+        trn1            v22.4s, v20.4s, v21.4s
+        trn2            v23.4s, v20.4s, v21.4s
+        trn1            v26.4s, v24.4s, v25.4s
+        trn2            v27.4s, v24.4s, v25.4s
+        sqxtn           v20.4h, v22.4s
+        sqxtn2          v20.8h, v26.4s
+        sqxtn           v21.4h, v23.4s
+        sqxtn2          v21.8h, v27.4s
+        stp             q20, q21, [x0], #32
+        subs            w3, w3, #1
+        b.ne            1b
+        ret
+endfunc
+
+.macro QPEL_UNI_W_HV_HEADER width
+        ldp             x14, x15, [sp]          // mx, my
+        ldr             w13, [sp, #16]          // width
+        stp             x19, x30, [sp, #-80]!
+        stp             x20, x21, [sp, #16]
+        stp             x22, x23, [sp, #32]
+        stp             x24, x25, [sp, #48]
+        stp             x26, x27, [sp, #64]
+        mov             x19, sp
+        mov             x11, #9088
+        sub             sp, sp, x11
+        mov             x20, x0
+        mov             x21, x1
+        mov             x0, sp
+        sub             x1, x2, x3, lsl #1
+        sub             x1, x1, x3
+        mov             x2, x3
+        add             w3, w4, #7
+        mov             w22, w4                 // height
+        mov             x4, x14                 // mx
+        mov             x23, x15                // my
+        mov             w24, w6                 // wx
+        mov             w25, w7                 // ox
+        mov             w26, #-6
+        sub             w26, w26, w5            // -shift
+        mov             w27, w13                // width
+        bl              X(ff_hevc_put_hevc_qpel_h\width\()_8_neon_i8mm)
+        movrel          x9, qpel_filters
+        add             x9, x9, x23, lsl #3
+        ld1             {v0.8b}, [x9]
+        sxtl            v0.8h, v0.8b
+        mov             x10, #(MAX_PB_SIZE * 2)
+        dup             v28.4s, w24
+        dup             v29.4s, w25
+        dup             v30.4s, w26
+.endm
+
+.macro QPEL_UNI_W_HV_END
+        mov             sp, x19
+        ldp             x19, x30, [sp]
+        ldp             x26, x27, [sp, #16]
+        ldp             x24, x25, [sp, #32]
+        ldp             x22, x23, [sp, #48]
+        ldp             x20, x21, [sp, #64]
+        add             sp, sp, #80
+.endm
+
+.macro QPEL_UNI_W_HV_4
+        sshr            v26.4s, v26.4s, #6
+        mul             v24.4s, v26.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.s}[0], [x20], x21
+.endm
+
+.macro QPEL_FILTER_H    dst, src0, src1, src2, src3, src4, src5, src6, src7
+        smull           \dst\().4s, \src0\().4h, v0.h[0]
+        smlal           \dst\().4s, \src1\().4h, v0.h[1]
+        smlal           \dst\().4s, \src2\().4h, v0.h[2]
+        smlal           \dst\().4s, \src3\().4h, v0.h[3]
+        smlal           \dst\().4s, \src4\().4h, v0.h[4]
+        smlal           \dst\().4s, \src5\().4h, v0.h[5]
+        smlal           \dst\().4s, \src6\().4h, v0.h[6]
+        smlal           \dst\().4s, \src7\().4h, v0.h[7]
+.endm
+
+.macro QPEL_FILTER_H2    dst, src0, src1, src2, src3, src4, src5, src6, src7
+        smull2          \dst\().4s, \src0\().8h, v0.h[0]
+        smlal2          \dst\().4s, \src1\().8h, v0.h[1]
+        smlal2          \dst\().4s, \src2\().8h, v0.h[2]
+        smlal2          \dst\().4s, \src3\().8h, v0.h[3]
+        smlal2          \dst\().4s, \src4\().8h, v0.h[4]
+        smlal2          \dst\().4s, \src5\().8h, v0.h[5]
+        smlal2          \dst\().4s, \src6\().8h, v0.h[6]
+        smlal2          \dst\().4s, \src7\().8h, v0.h[7]
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_hv4_8_neon_i8mm, export=1
+        QPEL_UNI_W_HV_HEADER 4
+        ldr             d16, [sp]
+        ldr             d17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ldr             d18, [sp]
+        ldr             d19, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ldr             d20, [sp]
+        ldr             d21, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ldr             d22, [sp]
+        add             sp, sp, x10
+1:
+        ldr             d23, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d16, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d17, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d18, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d19, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d20, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d21, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             d22, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_HV_4
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+.macro QPEL_UNI_W_HV_8
+        sshr            v26.4s, v26.4s, #6
+        sshr            v27.4s, v27.4s, #6
+        mul             v24.4s, v26.4s, v28.4s
+        mul             v25.4s, v27.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqrshl          v25.4s, v25.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.d}[0], [x20], x21
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_hv8_8_neon_i8mm, export=1
+        QPEL_UNI_W_HV_HEADER 8
+        ldr             q16, [sp]
+        ldr             q17, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ldr             q18, [sp]
+        ldr             q19, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ldr             q20, [sp]
+        ldr             q21, [sp, x10]
+        add             sp, sp, x10, lsl #1
+        ldr             q22, [sp]
+        add             sp, sp, x10
+1:
+        ldr             q23, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q16, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q17, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q18, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q19, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q20, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q21, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldr             q22, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_HV_8
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+.macro QPEL_UNI_W_HV_16
+        sshr            v24.4s, v24.4s, #6
+        sshr            v25.4s, v25.4s, #6
+        sshr            v26.4s, v26.4s, #6
+        sshr            v27.4s, v27.4s, #6
+        mul             v24.4s, v24.4s, v28.4s
+        mul             v25.4s, v25.4s, v28.4s
+        mul             v26.4s, v26.4s, v28.4s
+        mul             v27.4s, v27.4s, v28.4s
+        sqrshl          v24.4s, v24.4s, v30.4s
+        sqrshl          v25.4s, v25.4s, v30.4s
+        sqrshl          v26.4s, v26.4s, v30.4s
+        sqrshl          v27.4s, v27.4s, v30.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h
+        sqxtun2         v24.16b, v26.8h
+
+        st1             {v24.16b}, [x20], x21
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_hv16_8_neon_i8mm, export=1
+        QPEL_UNI_W_HV_HEADER 16
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+
+2:
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_hv32_8_neon_i8mm, export=1
+        QPEL_UNI_W_HV_HEADER 32
+        mov             x11, sp
+        mov             w12, w22
+        mov             x13, x20
+3:
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+2:
+        subs            w27, w27, #16
+        add             sp, x11, #32
+        add             x20, x13, #16
+        mov             w22, w12
+        mov             x11, sp
+        mov             x13, x20
+        b.hi            3b
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
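
The hv32 and hv64 entry points reuse the 16-wide inner loop and walk across the row in 16-pixel columns: x11/x13 hold the start of the intermediate buffer and of the destination row, and each pass of the outer "3:" loop re-bases them by 32 bytes of int16 intermediate and 16 output pixels while w27 counts down the remaining width. A minimal C sketch of that tiling, with a hypothetical callback standing in for the 16-wide inner loop:

#include <stddef.h>
#include <stdint.h>

typedef void (*hv16_col_fn)(uint8_t *dst, ptrdiff_t dststride,
                            const int16_t *tmp, ptrdiff_t tmpstride,
                            int height);

static void hv_tile_columns(uint8_t *dst, ptrdiff_t dststride,
                            const int16_t *tmp, ptrdiff_t tmpstride,
                            int width, int height, hv16_col_fn col16)
{
    for (int x = 0; x < width; x += 16)         /* the "3:" outer loop           */
        col16(dst + x, dststride, tmp + x,      /* tmp + x is 2*x bytes further  */
              tmpstride, height);               /* the "1:"/"2:" inner loop      */
}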
+
+function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1
+        QPEL_UNI_W_HV_HEADER 64
+        mov             x11, sp
+        mov             w12, w22
+        mov             x13, x20
+3:
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+1:
+        ldp             q23, q31, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H2  v25, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_H   v26,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_FILTER_H2  v27,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v31
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q16, q1, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H2  v25, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_H   v26,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_FILTER_H2  v27,  v2,  v3,  v4,  v5,  v6,  v7, v31,  v1
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q17, q2, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H2  v25, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_H   v26,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_FILTER_H2  v27,  v3,  v4,  v5,  v6,  v7, v31,  v1,  v2
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q18, q3, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H2  v25, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_H   v26,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_FILTER_H2  v27,  v4,  v5,  v6,  v7, v31,  v1,  v2,  v3
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q19, q4, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H2  v25, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_H   v26,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_FILTER_H2  v27,  v5,  v6,  v7, v31,  v1,  v2,  v3,  v4
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q20, q5, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H2  v25, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_H   v26,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_FILTER_H2  v27,  v6,  v7, v31,  v1,  v2,  v3,  v4,  v5
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q21, q6, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H2  v25, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_H   v26,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_FILTER_H2  v27,  v7, v31,  v1,  v2,  v3,  v4,  v5,  v6
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.eq            2f
+
+        ldp             q22, q7, [sp]
+        add             sp, sp, x10
+        QPEL_FILTER_H   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H2  v25, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_H   v26, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_FILTER_H2  v27, v31,  v1,  v2,  v3,  v4,  v5,  v6,  v7
+        QPEL_UNI_W_HV_16
+        subs            w22, w22, #1
+        b.hi            1b
+2:
+        subs            w27, w27, #16
+        add             sp, x11, #32
+        add             x20, x13, #16
+        mov             w22, w12
+        mov             x11, sp
+        mov             x13, x20
+        b.hi            3b
+        QPEL_UNI_W_HV_END
+        ret
+endfunc
+
 #endif // HAVE_I8MM
+
-- 
2.38.0.windows.1

From 4117cc5433ae3b9a3fd26fd44fb7b1adae168a2d Mon Sep 17 00:00:00 2001
From: myais <logan....@myais.com.cn>
Date: Wed, 3 May 2023 09:53:07 +0800
Subject: [PATCH 1/3] lavc/aarch64: new optimization for 8-bit
 hevc_pel_uni_w_pixels and qpel_uni_w_v

---
 libavcodec/aarch64/hevcdsp_init_aarch64.c |  51 ++
 libavcodec/aarch64/hevcdsp_qpel_neon.S    | 710 ++++++++++++++++++++++
 2 files changed, 761 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c
index be1049a2ec..6b5341dd45 100644
--- a/libavcodec/aarch64/hevcdsp_init_aarch64.c
+++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c
@@ -128,6 +128,52 @@ void ff_hevc_put_hevc_qpel_bi_h16_8_neon(uint8_t *_dst, ptrdiff_t _dststride, co
                                          ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
                                          mx, intptr_t my, int width);
 
+#define NEON8_FNPROTO(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##6_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##12_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##24_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##32_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##48_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+#define NEON8_FNPROTO_PARTIAL_4(fn, args, ext) \
+    void ff_hevc_put_hevc_##fn##4_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##8_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##16_8_neon##ext args; \
+    void ff_hevc_put_hevc_##fn##64_8_neon##ext args; \
+
+NEON8_FNPROTO(pel_uni_w_pixels, (uint8_t *_dst, ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+NEON8_FNPROTO_PARTIAL_4(qpel_uni_w_v, (uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width),);
+
+#define NEON8_FNASSIGN(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[4][v][h] = ff_hevc_put_hevc_##fn##12_8_neon##ext; \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[6][v][h] = ff_hevc_put_hevc_##fn##24_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##32_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##48_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
+#define NEON8_FNASSIGN_PARTIAL_4(member, v, h, fn, ext) \
+        member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext;  \
+        member[3][v][h] = ff_hevc_put_hevc_##fn##8_8_neon##ext;  \
+        member[5][v][h] = ff_hevc_put_hevc_##fn##16_8_neon##ext; \
+        member[7][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[8][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext; \
+        member[9][v][h] = ff_hevc_put_hevc_##fn##64_8_neon##ext;
+
 av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
 {
     if (!have_neon(av_get_cpu_flags())) return;
@@ -185,6 +231,11 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth)
         c->put_hevc_qpel_bi[7][0][1]   =
         c->put_hevc_qpel_bi[8][0][1]   =
         c->put_hevc_qpel_bi[9][0][1]   = ff_hevc_put_hevc_qpel_bi_h16_8_neon;
+
+        NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 0, pel_uni_w_pixels,);
+        NEON8_FNASSIGN_PARTIAL_4(c->put_hevc_qpel_uni_w, 1, 0, qpel_uni_w_v,);
+
     }
     if (bit_depth == 10) {
         c->hevc_h_loop_filter_chroma   = ff_hevc_h_loop_filter_chroma_10_neon;
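
A short note on the dispatch macros above: the [i][v][h] tables are indexed by a block-width bucket i and by whether vertical (v) and horizontal (h) filtering is required, so the pure-copy pel_uni_w_pixels functions land in the [i][0][0] slots and qpel_uni_w_v in [i][1][0]. NEON8_FNASSIGN_PARTIAL_4 can point the three widest buckets at the single _v64 routine because that routine reads its trailing `width` argument and processes the row in 16-pixel columns (see the "3:" loop in the assembly below). The width-to-index mapping, restated here only for orientation (the authoritative table lives in the HEVC decoder; the index-0 entry is my reading of it):

static const int block_width_for_index[10] = {
    2, 4, 6, 8, 12, 16, 24, 32, 48, 64
};
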
diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S
index 0e7b912678..51df52e1ea 100644
--- a/libavcodec/aarch64/hevcdsp_qpel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S
@@ -30,6 +30,13 @@ const qpel_filters, align=4
         .byte           0,  1, -5, 17, 58,-10, 4, -1
 endconst
 
+const qpel_filters_abs, align=4
+        .byte           0,  0,  0,  0,  0,  0, 0,  0
+        .byte           1,  4, 10, 58, 17,  5, 1,  0
+        .byte           1,  4, 11, 40, 40, 11, 4,  1
+        .byte           0,  1,  5, 17, 58, 10, 4,  1
+endconst
+
 .macro load_filter m
         movrel          x15, qpel_filters
         add             x15, x15, \m, lsl #3
@@ -482,3 +489,706 @@ endfunc
 put_hevc qpel
 put_hevc qpel_uni
 put_hevc qpel_bi
+
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+1:
+        ldr             s0, [x2]
+        ldr             s1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6
+        ushll           v1.8h, v1.8b, #6
+        smull           v0.4s, v0.4h, v30.4h
+        smull           v1.4s, v1.4h, v30.4h
+        sqrshl          v0.4s, v0.4s, v31.4s
+        sqrshl          v1.4s, v1.4s, v31.4s
+        sqadd           v0.4s, v0.4s, v29.4s
+        sqadd           v1.4s, v1.4s, v29.4s
+        sqxtn           v0.4h, v0.4s
+        sqxtn           v1.4h, v1.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        str             s0, [x0]
+        str             s1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+        sub             x1, x1, #4
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6
+        ushll           v1.8h, v1.8b, #6
+        smull           v4.4s, v0.4h, v30.4h
+        smull2          v5.4s, v0.8h, v30.8h
+        smull           v6.4s, v1.4h, v30.4h
+        smull2          v7.4s, v1.8h, v30.8h
+        sqrshl          v4.4s, v4.4s, v31.4s
+        sqrshl          v5.4s, v5.4s, v31.4s
+        sqrshl          v6.4s, v6.4s, v31.4s
+        sqrshl          v7.4s, v7.4s, v31.4s
+        sqadd           v4.4s, v4.4s, v29.4s
+        sqadd           v5.4s, v5.4s, v29.4s
+        sqadd           v6.4s, v6.4s, v29.4s
+        sqadd           v7.4s, v7.4s, v29.4s
+        sqxtn           v0.4h, v4.4s
+        sqxtn2          v0.8h, v5.4s
+        sqxtn           v1.4h, v6.4s
+        sqxtn2          v1.8h, v7.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        str             s0, [x0], #4
+        st1             {v0.h}[2], [x0], x1
+        str             s1, [x0], #4
+        st1             {v1.h}[2], [x0], x1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+1:
+        ldr             d0, [x2]
+        ldr             d1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v0.8h, v0.8b, #6
+        ushll           v1.8h, v1.8b, #6
+        smull           v4.4s, v0.4h, v30.4h
+        smull2          v5.4s, v0.8h, v30.8h
+        smull           v6.4s, v1.4h, v30.4h
+        smull2          v7.4s, v1.8h, v30.8h
+        sqrshl          v4.4s, v4.4s, v31.4s
+        sqrshl          v5.4s, v5.4s, v31.4s
+        sqrshl          v6.4s, v6.4s, v31.4s
+        sqrshl          v7.4s, v7.4s, v31.4s
+        sqadd           v4.4s, v4.4s, v29.4s
+        sqadd           v5.4s, v5.4s, v29.4s
+        sqadd           v6.4s, v6.4s, v29.4s
+        sqadd           v7.4s, v7.4s, v29.4s
+        sqxtn           v0.4h, v4.4s
+        sqxtn2          v0.8h, v5.4s
+        sqxtn           v1.4h, v6.4s
+        sqxtn2          v1.8h, v7.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        str             d0, [x0]
+        str             d1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+        sub             x1, x1, #8
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ushll           v4.8h, v0.8b, #6
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        ushll2          v7.8h, v1.16b, #6
+        smull           v16.4s, v4.4h, v30.4h
+        smull2          v17.4s, v4.8h, v30.8h
+        smull           v18.4s, v5.4h, v30.4h
+        smull2          v19.4s, v5.8h, v30.8h
+        smull           v20.4s, v6.4h, v30.4h
+        smull2          v21.4s, v6.8h, v30.8h
+        smull           v22.4s, v7.4h, v30.4h
+        smull2          v23.4s, v7.8h, v30.8h
+
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqrshl          v17.4s, v17.4s, v31.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqrshl          v19.4s, v19.4s, v31.4s
+        sqrshl          v20.4s, v20.4s, v31.4s
+        sqrshl          v21.4s, v21.4s, v31.4s
+        sqrshl          v22.4s, v22.4s, v31.4s
+        sqrshl          v23.4s, v23.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqadd           v17.4s, v17.4s, v29.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqadd           v19.4s, v19.4s, v29.4s
+        sqadd           v20.4s, v20.4s, v29.4s
+        sqadd           v21.4s, v21.4s, v29.4s
+        sqadd           v22.4s, v22.4s, v29.4s
+        sqadd           v23.4s, v23.4s, v29.4s
+        sqxtn           v0.4h, v16.4s
+        sqxtn2          v0.8h, v17.4s
+        sqxtn           v1.4h, v18.4s
+        sqxtn2          v1.8h, v19.4s
+        sqxtn           v2.4h, v20.4s
+        sqxtn2          v2.8h, v21.4s
+        sqxtn           v3.4h, v22.4s
+        sqxtn2          v3.8h, v23.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun2         v0.16b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        sqxtun2         v2.16b, v3.8h
+        str             d0, [x0], #8
+        st1             {v0.s}[2], [x0], x1
+        str             d2, [x0], #8
+        st1             {v2.s}[2], [x0], x1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+.macro PEL_UNI_W_PIXEL_CALC     s0, t0, t1, d0, d1, d2, d3
+        ushll           \t0\().8h, \s0\().8b, #6
+        ushll2          \t1\().8h, \s0\().16b, #6
+        smull           \d0\().4s, \t0\().4h, v30.4h
+        smull2          \d1\().4s, \t0\().8h, v30.8h
+        smull           \d2\().4s, \t1\().4h, v30.4h
+        smull2          \d3\().4s, \t1\().8h, v30.8h
+        sqrshl          \d0\().4s, \d0\().4s, v31.4s
+        sqrshl          \d1\().4s, \d1\().4s, v31.4s
+        sqrshl          \d2\().4s, \d2\().4s, v31.4s
+        sqrshl          \d3\().4s, \d3\().4s, v31.4s
+        sqadd           \d0\().4s, \d0\().4s, v29.4s
+        sqadd           \d1\().4s, \d1\().4s, v29.4s
+        sqadd           \d2\().4s, \d2\().4s, v29.4s
+        sqadd           \d3\().4s, \d3\().4s, v29.4s
+        sqxtn           \t0\().4h, \d0\().4s
+        sqxtn2          \t0\().8h, \d1\().4s
+        sqxtn           \t1\().4h, \d2\().4s
+        sqxtn2          \t1\().8h, \d3\().4s
+        sqxtun          \s0\().8b,  \t0\().8h
+        sqxtun2         \s0\().16b, \t1\().8h
+.endm
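
PEL_UNI_W_PIXEL_CALC, like the narrower pel_uni_w_pixels4/6/8/12 loops above, implements the plain weighted-copy case: the 8-bit sample is promoted to the 14-bit intermediate domain (<< 6), multiplied by wx, rounded down by denom + 6 and offset by ox, with v30 = wx, v31 = -(denom + 6) and v29 = ox. As a per-pixel C sketch (the sqrshl rounding corresponds to the "+ offset" term; not a copy of the template code):

#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

static inline uint8_t pel_uni_w_pixel(uint8_t src, int wx, int denom, int ox)
{
    int shift  = denom + 6;                /* w10 = -6 - denom in the assembly */
    int offset = 1 << (shift - 1);
    return av_clip_uint8((((src << 6) * wx + offset) >> shift) + ox);
}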
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+1:
+        ldr             q0, [x2]
+        ldr             q1, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        str             q0, [x0]
+        str             q1, [x0, x1]
+        add             x0, x0, x1, lsl #1
+        subs            w4, w4, #2
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        ushll           v4.8h, v0.8b, #6
+        ushll2          v5.8h, v0.16b, #6
+        ushll           v6.8h, v1.8b, #6
+        smull           v16.4s, v4.4h, v30.4h
+        smull2          v17.4s, v4.8h, v30.8h
+        smull           v18.4s, v5.4h, v30.4h
+        smull2          v19.4s, v5.8h, v30.8h
+        smull           v20.4s, v6.4h, v30.4h
+        smull2          v21.4s, v6.8h, v30.8h
+        sqrshl          v16.4s, v16.4s, v31.4s
+        sqrshl          v17.4s, v17.4s, v31.4s
+        sqrshl          v18.4s, v18.4s, v31.4s
+        sqrshl          v19.4s, v19.4s, v31.4s
+        sqrshl          v20.4s, v20.4s, v31.4s
+        sqrshl          v21.4s, v21.4s, v31.4s
+        sqadd           v16.4s, v16.4s, v29.4s
+        sqadd           v17.4s, v17.4s, v29.4s
+        sqadd           v18.4s, v18.4s, v29.4s
+        sqadd           v19.4s, v19.4s, v29.4s
+        sqadd           v20.4s, v20.4s, v29.4s
+        sqadd           v21.4s, v21.4s, v29.4s
+        sqxtn           v0.4h, v16.4s
+        sqxtn2          v0.8h, v17.4s
+        sqxtn           v1.4h, v18.4s
+        sqxtn2          v1.8h, v19.4s
+        sqxtn           v2.4h, v20.4s
+        sqxtn2          v2.8h, v21.4s
+        sqxtun          v0.8b, v0.8h
+        sqxtun          v1.8b, v1.8h
+        sqxtun          v2.8b, v2.8h
+        st1             {v0.8b, v1.8b, v2.8b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+1:
+        ld1             {v0.16b, v1.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+1:
+        ld1             {v0.16b, v1.16b, v2.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        st1             {v0.16b, v1.16b, v2.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_neon, export=1
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6
+        dup             v31.4s, w10
+        dup             v29.4s, w7
+1:
+        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
+        PEL_UNI_W_PIXEL_CALC v0, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v1, v6, v7, v20, v21, v22, v23
+        PEL_UNI_W_PIXEL_CALC v2, v4, v5, v16, v17, v18, v19
+        PEL_UNI_W_PIXEL_CALC v3, v6, v7, v20, v21, v22, v23
+        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
+        subs            w4, w4, #1
+        b.ne            1b
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_HEADER
+        ldur            x12, [sp, #8]          // my
+        sub             x2, x2, x3, lsl #1
+        sub             x2, x2, x3
+        movrel          x9, qpel_filters_abs
+        add             x9, x9, x12, lsl #3
+        ldr             d28, [x9]
+        dup             v0.16b, v28.b[0]
+        dup             v1.16b, v28.b[1]
+        dup             v2.16b, v28.b[2]
+        dup             v3.16b, v28.b[3]
+        dup             v4.16b, v28.b[4]
+        dup             v5.16b, v28.b[5]
+        dup             v6.16b, v28.b[6]
+        dup             v7.16b, v28.b[7]
+
+        mov             w10, #-6
+        sub             w10, w10, w5
+        dup             v30.8h, w6              // wx
+        dup             v31.4s, w10             // shift
+        dup             v29.4s, w7              // ox
+.endm
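
QPEL_UNI_W_V_HEADER picks up `my` from the stack because, under AAPCS64, only the first eight integer arguments travel in x0-x7; with the prototype declared in hevcdsp_init_aarch64.c the remaining three land at [sp], [sp, #8] and [sp, #16]. An annotated restatement of that prototype, for orientation (the register notes are mine):

#include <stddef.h>
#include <stdint.h>

void ff_hevc_put_hevc_qpel_uni_w_v4_8_neon(
        uint8_t *_dst,          /* x0         */
        ptrdiff_t _dststride,   /* x1         */
        const uint8_t *_src,    /* x2         */
        ptrdiff_t _srcstride,   /* x3         */
        int height,             /* w4         */
        int denom,              /* w5         */
        int wx,                 /* w6         */
        int ox,                 /* w7         */
        intptr_t mx,            /* [sp]       */
        intptr_t my,            /* [sp, #8]   */
        int width);             /* [sp, #16], read by the 64-pixel variant */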
+
+.macro QPEL_FILTER_B dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull           \dst\().8h, \src1\().8b, v1.8b
+        umlsl           \dst\().8h, \src0\().8b, v0.8b
+        umlsl           \dst\().8h, \src2\().8b, v2.8b
+        umlal           \dst\().8h, \src3\().8b, v3.8b
+        umlal           \dst\().8h, \src4\().8b, v4.8b
+        umlsl           \dst\().8h, \src5\().8b, v5.8b
+        umlal           \dst\().8h, \src6\().8b, v6.8b
+        umlsl           \dst\().8h, \src7\().8b, v7.8b
+.endm
+
+.macro QPEL_FILTER_B2 dst, src0, src1, src2, src3, src4, src5, src6, src7
+        umull2          \dst\().8h, \src1\().16b, v1.16b
+        umlsl2          \dst\().8h, \src0\().16b, v0.16b
+        umlsl2          \dst\().8h, \src2\().16b, v2.16b
+        umlal2          \dst\().8h, \src3\().16b, v3.16b
+        umlal2          \dst\().8h, \src4\().16b, v4.16b
+        umlsl2          \dst\().8h, \src5\().16b, v5.16b
+        umlal2          \dst\().8h, \src6\().16b, v6.16b
+        umlsl2          \dst\().8h, \src7\().16b, v7.16b
+.endm
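
QPEL_FILTER_B/QPEL_FILTER_B2 apply the signed 8-tap HEVC luma filter using unsigned multiplies: qpel_filters_abs stores only the tap magnitudes, and the sign of each tap is encoded by choosing umlal (accumulate) or umlsl (subtract). The QPEL_UNI_W_V_* macros that follow then apply the same weighting as above to the 16-bit filter sum. One output pixel of the vertical path, sketched in C with the standard HEVC coefficients (`src` points at the topmost of the eight taps, matching the 3-row rewind done in QPEL_UNI_W_V_HEADER; a sketch, not the template code):

#include <stddef.h>
#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

static const int8_t qpel_taps[4][8] = {
    {  0, 0,   0,  0,  0,   0, 0,  0 },
    { -1, 4, -10, 58, 17,  -5, 1,  0 },
    { -1, 4, -11, 40, 40, -11, 4, -1 },
    {  0, 1,  -5, 17, 58, -10, 4, -1 },
};

static uint8_t qpel_uni_w_v_pixel(const uint8_t *src, ptrdiff_t stride,
                                  int my, int wx, int denom, int ox)
{
    int shift  = denom + 6;
    int offset = 1 << (shift - 1);
    int sum    = 0;

    for (int i = 0; i < 8; i++)
        sum += qpel_taps[my][i] * src[i * stride];

    return av_clip_uint8(((sum * wx + offset) >> shift) + ox);
}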
+
+.macro  QPEL_UNI_W_V_4
+        smull           v24.4s, v24.4h, v30.4h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.s}[0], [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             s16, [x2]
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s18, [x2]
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s20, [x2]
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             s22, [x2]
+
+1:      ldr             s23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s16, [x2]
+        QPEL_FILTER_B   v24, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s18, [x2]
+        QPEL_FILTER_B   v24, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s20, [x2]
+        QPEL_FILTER_B   v24, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v24, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             s22, [x2]
+        QPEL_FILTER_B   v24, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_4
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_8
+        smull           v24.4s, v26.4h, v30.4h
+        smull2          v25.4s, v26.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtun          v24.8b, v24.8h
+        st1             {v24.d}[0], [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v8_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             d16, [x2]
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d18, [x2]
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d20, [x2]
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             d22, [x2]
+
+1:      ldr             d23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             d22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_8
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+.macro QPEL_UNI_W_V_16
+        smull           v24.4s, v26.4h, v30.4h
+        smull2          v25.4s, v26.8h, v30.8h
+        smull           v26.4s, v27.4h, v30.4h
+        smull2          v27.4s, v27.8h, v30.8h
+        sqrshl          v24.4s, v24.4s, v31.4s
+        sqrshl          v25.4s, v25.4s, v31.4s
+        sqrshl          v26.4s, v26.4s, v31.4s
+        sqrshl          v27.4s, v27.4s, v31.4s
+        sqadd           v24.4s, v24.4s, v29.4s
+        sqadd           v25.4s, v25.4s, v29.4s
+        sqadd           v26.4s, v26.4s, v29.4s
+        sqadd           v27.4s, v27.4s, v29.4s
+        sqxtn           v24.4h, v24.4s
+        sqxtn2          v24.8h, v25.4s
+        sqxtn           v26.4h, v26.4s
+        sqxtn2          v26.8h, v27.4s
+        sqxtun          v24.8b, v24.8h
+        sqxtun2         v24.16b, v26.8h
+        st1             {v24.16b}, [x0], x1
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v16_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        ret
+endfunc
+
+function ff_hevc_put_hevc_qpel_uni_w_v64_8_neon, export=1
+        QPEL_UNI_W_V_HEADER
+        ldur            w13, [sp, #16]
+        mov             x14, x0
+        mov             x15, x2
+        mov             w11, w4
+
+3:
+        ldr             q16, [x2]
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q18, [x2]
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q20, [x2]
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        ldr             q22, [x2]
+
+1:      ldr             q23, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_FILTER_B2  v27, v16, v17, v18, v19, v20, v21, v22, v23
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q16, [x2]
+        QPEL_FILTER_B   v26, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_FILTER_B2  v27, v17, v18, v19, v20, v21, v22, v23, v16
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q17, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_FILTER_B2  v27, v18, v19, v20, v21, v22, v23, v16, v17
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q18, [x2]
+        QPEL_FILTER_B   v26, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_FILTER_B2  v27, v19, v20, v21, v22, v23, v16, v17, v18
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q19, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_FILTER_B2  v27, v20, v21, v22, v23, v16, v17, v18, v19
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q20, [x2]
+        QPEL_FILTER_B   v26, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_FILTER_B2  v27, v21, v22, v23, v16, v17, v18, v19, v20
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q21, [x2, x3]
+        add             x2, x2, x3, lsl #1
+        QPEL_FILTER_B   v26, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_FILTER_B2  v27, v22, v23, v16, v17, v18, v19, v20, v21
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.eq            2f
+
+        ldr             q22, [x2]
+        QPEL_FILTER_B   v26, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_FILTER_B2  v27, v23, v16, v17, v18, v19, v20, v21, v22
+        QPEL_UNI_W_V_16
+        subs            w4, w4, #1
+        b.ne            1b
+2:
+        subs            w13, w13, #16
+        add             x14, x14, #16
+        add             x15, x15, #16
+        mov             x0, x14
+        mov             x2, x15
+        mov             w4, w11
+        b.hi            3b
+        ret
+endfunc
-- 
2.38.0.windows.1
