12-bit (branch master)

Georgii Zagoruiko via ffmpeg-cvslog Thu, 08 Jan 2026 14:00:03 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


The following commit(s) were added to refs/heads/master by this push:
     new 8acdffa22c aarch64/vvc: Optimisations of put_luma_v() functions for 10/12-bit
8acdffa22c is described below

commit 8acdffa22cf6d05928b28f82a0c466c9611a4aa0
Author:     Georgii Zagoruiko <[email protected]>
AuthorDate: Tue Dec 9 22:38:49 2025 +0000
Commit:     Georgii Zagoruiko <[email protected]>
CommitDate: Thu Jan 8 17:35:55 2026 +0000

    aarch64/vvc: Optimisations of put_luma_v() functions for 10/12-bit
    
    RPi4 (auto-vectorisation is on):
    put_luma_v_10_4x4_c:                                   303.3 ( 1.00x)
    put_luma_v_10_4x4_neon:                                 55.7 ( 5.45x)
    put_luma_v_10_8x8_c:                                  1106.7 ( 1.00x)
    put_luma_v_10_8x8_neon:                                163.8 ( 6.76x)
    put_luma_v_10_16x16_c:                                2242.1 ( 1.00x)
    put_luma_v_10_16x16_neon:                              672.7 ( 3.33x)
    put_luma_v_10_32x32_c:                                7057.3 ( 1.00x)
    put_luma_v_10_32x32_neon:                             2731.3 ( 2.58x)
    put_luma_v_10_64x64_c:                               25699.8 ( 1.00x)
    put_luma_v_10_64x64_neon:                            12145.6 ( 2.12x)
    put_luma_v_10_128x128_c:                             90694.6 ( 1.00x)
    put_luma_v_10_128x128_neon:                          44862.4 ( 2.02x)
    put_luma_v_12_4x4_c:                                   304.4 ( 1.00x)
    put_luma_v_12_4x4_neon:                                 55.6 ( 5.47x)
    put_luma_v_12_8x8_c:                                  1107.4 ( 1.00x)
    put_luma_v_12_8x8_neon:                                164.7 ( 6.72x)
    put_luma_v_12_16x16_c:                                2235.8 ( 1.00x)
    put_luma_v_12_16x16_neon:                              672.5 ( 3.32x)
    put_luma_v_12_32x32_c:                                7049.2 ( 1.00x)
    put_luma_v_12_32x32_neon:                             2731.6 ( 2.58x)
    put_luma_v_12_64x64_c:                               25706.5 ( 1.00x)
    put_luma_v_12_64x64_neon:                            12145.0 ( 2.12x)
    put_luma_v_12_128x128_c:                             90672.5 ( 1.00x)
    put_luma_v_12_128x128_neon:                          44857.1 ( 2.02x)
    
    Apple M4 (auto-vectorisation is on):
    put_luma_v_10_4x4_c:                                    25.6 ( 1.00x)
    put_luma_v_10_4x4_neon:                                  3.1 ( 8.18x)
    put_luma_v_10_8x8_c:                                    34.7 ( 1.00x)
    put_luma_v_10_8x8_neon:                                 10.5 ( 3.32x)
    put_luma_v_10_16x16_c:                                 103.9 ( 1.00x)
    put_luma_v_10_16x16_neon:                               42.3 ( 2.45x)
    put_luma_v_10_32x32_c:                                 399.7 ( 1.00x)
    put_luma_v_10_32x32_neon:                              161.8 ( 2.47x)
    put_luma_v_10_64x64_c:                                1276.7 ( 1.00x)
    put_luma_v_10_64x64_neon:                              840.1 ( 1.52x)
    put_luma_v_10_128x128_c:                              4981.3 ( 1.00x)
    put_luma_v_10_128x128_neon:                           3008.0 ( 1.66x)
    put_luma_v_12_4x4_c:                                    23.6 ( 1.00x)
    put_luma_v_12_4x4_neon:                                  2.0 (11.84x)
    put_luma_v_12_8x8_c:                                    31.8 ( 1.00x)
    put_luma_v_12_8x8_neon:                                 12.4 ( 2.55x)
    put_luma_v_12_16x16_c:                                 100.8 ( 1.00x)
    put_luma_v_12_16x16_neon:                               44.9 ( 2.25x)
    put_luma_v_12_32x32_c:                                 331.1 ( 1.00x)
    put_luma_v_12_32x32_neon:                              175.2 ( 1.89x)
    put_luma_v_12_64x64_c:                                1227.1 ( 1.00x)
    put_luma_v_12_64x64_neon:                              712.7 ( 1.72x)
    put_luma_v_12_128x128_c:                              5149.1 ( 1.00x)
    put_luma_v_12_128x128_neon:                           2809.3 ( 1.83x)
---
 libavcodec/aarch64/vvc/dsp_init.c |  31 +++
 libavcodec/aarch64/vvc/inter.S    | 392 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 423 insertions(+)

diff --git a/libavcodec/aarch64/vvc/dsp_init.c b/libavcodec/aarch64/vvc/dsp_init.c
index aa75d22b78..bc2677945e 100644
--- a/libavcodec/aarch64/vvc/dsp_init.c
+++ b/libavcodec/aarch64/vvc/dsp_init.c
@@ -43,6 +43,23 @@ void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdif
 void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
                                    const int height, const int8_t *hf, const int8_t *vf, const int width);
 
+void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                const int height, const int8_t *hf, const int8_t *vf, const int width); // NEON vertical luma filter, 4 columns, 10-bit
+void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                const int height, const int8_t *hf, const int8_t *vf, const int width); // NEON vertical luma filter, 8 columns, 10-bit
+void ff_vvc_put_luma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                 const int height, const int8_t *hf, const int8_t *vf, const int width); // NEON vertical luma filter, 16 columns, 10-bit
+void ff_vvc_put_luma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width); // NEON vertical luma filter, 16-column strips up to width, 10-bit
+void ff_vvc_put_luma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                const int height, const int8_t *hf, const int8_t *vf, const int width); // NEON vertical luma filter, 4 columns, 12-bit
+void ff_vvc_put_luma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                const int height, const int8_t *hf, const int8_t *vf, const int width); // NEON vertical luma filter, 8 columns, 12-bit
+void ff_vvc_put_luma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                 const int height, const int8_t *hf, const int8_t *vf, const int width); // NEON vertical luma filter, 16 columns, 12-bit
+void ff_vvc_put_luma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
+                                   const int height, const int8_t *hf, const int8_t *vf, const int width); // NEON vertical luma filter, 16-column strips up to width, 12-bit
+
 void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
 
 #define BIT_DEPTH 8
@@ -263,6 +280,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put[0][5][0][1] =
         c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon;
 
+        c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_10_neon;
+        c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_10_neon;
+        c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_10_neon;
+        c->inter.put[0][4][1][0] =
+        c->inter.put[0][5][1][0] =
+        c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_10_neon;
+
         c->alf.filter[LUMA] = alf_filter_luma_10_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
         c->alf.classify = alf_classify_10_neon;
@@ -279,6 +303,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
         c->inter.put[0][5][0][1] =
         c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_12_neon;
 
+        c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_12_neon;
+        c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_12_neon;
+        c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_12_neon;
+        c->inter.put[0][4][1][0] =
+        c->inter.put[0][5][1][0] =
+        c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_12_neon;
+
         c->alf.filter[LUMA] = alf_filter_luma_12_neon;
         c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
         c->alf.classify = alf_classify_12_neon;
diff --git a/libavcodec/aarch64/vvc/inter.S b/libavcodec/aarch64/vvc/inter.S
index 41444ec44c..887e456a66 100644
--- a/libavcodec/aarch64/vvc/inter.S
+++ b/libavcodec/aarch64/vvc/inter.S
@@ -1832,3 +1832,395 @@ endfunc
 function ff_vvc_put_luma_h_x16_12_neon, export=1
         put_luma_h_x16_xx_neon 4
 endfunc
+
+.macro put_luma_v4_xx_neon shift                    // 8-tap vertical filter, 4 columns wide, 4 output rows per loop pass; \shift = final narrowing shift
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)  // x9 = byte step between output rows (post-index of every st1 below)
+        sub             x1, x1, x2, lsl #1          // x1 -= 2 * x2 ...
+        ld1             {v0.8b}, [x5]               // eight int8 filter taps from x5 (vf in the C prototype)
+        sub             x1, x1, x2                  // ... x1 -= x2: source now starts 3 rows above the first output row
+        sxtl            v0.8h, v0.8b                // sign-extend the taps to 16 bit for the by-element multiplies
+        ld1             {v20.4h}, [x1], x2          // preload window rows 0..6 into v20..v26 (4 samples each)
+        ld1             {v21.4h}, [x1], x2
+        ld1             {v22.4h}, [x1], x2
+        ld1             {v23.4h}, [x1], x2
+        ld1             {v24.4h}, [x1], x2
+        ld1             {v25.4h}, [x1], x2
+        ld1             {v26.4h}, [x1], x2
+1:
+        ld1             {v27.4h}, [x1], x2          // row 7 completes the window for output row 0
+
+        smull           v1.4s, v20.4h, v0.h[0]      // output row 0: even taps accumulate in v1 ...
+        smull           v2.4s, v21.4h, v0.h[1]      // ... odd taps in v2 (two independent chains)
+        smlal           v1.4s, v22.4h, v0.h[2]
+        smlal           v2.4s, v23.4h, v0.h[3]
+        smlal           v1.4s, v24.4h, v0.h[4]
+        smlal           v2.4s, v25.4h, v0.h[5]
+        smlal           v1.4s, v26.4h, v0.h[6]
+        smlal           v2.4s, v27.4h, v0.h[7]
+
+        ld1             {v28.4h}, [x1], x2          // row 8, needed for output row 1
+
+        smull           v3.4s, v21.4h, v0.h[0]      // output row 1: same scheme with v3 (even) / v4 (odd)
+        smull           v4.4s, v22.4h, v0.h[1]
+        smlal           v3.4s, v23.4h, v0.h[2]
+        smlal           v4.4s, v24.4h, v0.h[3]
+        smlal           v3.4s, v25.4h, v0.h[4]
+        smlal           v4.4s, v26.4h, v0.h[5]
+        smlal           v3.4s, v27.4h, v0.h[6]
+        smlal           v4.4s, v28.4h, v0.h[7]
+        add             v1.4s, v1.4s, v2.4s         // merge the even and odd partial sums of row 0
+        add             v3.4s, v3.4s, v4.4s         // merge the even and odd partial sums of row 1
+        sqshrn          v1.4h, v1.4s, #(\shift)     // >> \shift with signed saturation, narrowed to 16 bit
+        sqshrn          v3.4h, v3.4s, #(\shift)
+
+        st1             {v1.4h}, [x0], x9           // store output row 0
+        ld1             {v29.4h}, [x1], x2          // row 9, needed for output row 2 (placed between the two stores)
+        st1             {v3.4h}, [x0], x9           // store output row 1
+
+        smull           v1.4s, v22.4h, v0.h[0]      // output row 2
+        smull           v2.4s, v23.4h, v0.h[1]
+        smlal           v1.4s, v24.4h, v0.h[2]
+        smlal           v2.4s, v25.4h, v0.h[3]
+        smlal           v1.4s, v26.4h, v0.h[4]
+        smlal           v2.4s, v27.4h, v0.h[5]
+        smlal           v1.4s, v28.4h, v0.h[6]
+        smlal           v2.4s, v29.4h, v0.h[7]
+
+        ld1             {v30.4h}, [x1], x2          // row 10, needed for output row 3
+
+        smull           v3.4s, v23.4h, v0.h[0]      // output row 3
+        smull           v4.4s, v24.4h, v0.h[1]
+        smlal           v3.4s, v25.4h, v0.h[2]
+        smlal           v4.4s, v26.4h, v0.h[3]
+        smlal           v3.4s, v27.4h, v0.h[4]
+        smlal           v4.4s, v28.4h, v0.h[5]
+        smlal           v3.4s, v29.4h, v0.h[6]
+        smlal           v4.4s, v30.4h, v0.h[7]
+        add             v1.4s, v1.4s, v2.4s
+        add             v3.4s, v3.4s, v4.4s
+        sqshrn          v1.4h, v1.4s, #(\shift)
+        sqshrn          v3.4h, v3.4s, #(\shift)
+
+        st1             {v1.4h}, [x0], x9           // store output row 2
+
+        mov             v20.16b, v24.16b            // slide the window down by 4 rows: rows 4..10 become rows 0..6
+        mov             v21.16b, v25.16b
+        mov             v22.16b, v26.16b
+        mov             v23.16b, v27.16b
+        mov             v24.16b, v28.16b
+        mov             v25.16b, v29.16b
+        mov             v26.16b, v30.16b
+
+        subs            w3, w3, #4                  // 4 rows done; NOTE(review): assumes height is a multiple of 4 - confirm with callers
+        st1             {v3.4h}, [x0], x9           // store output row 3 (st1 leaves the flags of subs intact)
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_luma_v4_10_neon, export=1
+        put_luma_v4_xx_neon 2                       // 10-bit entry point: 4-column kernel with a narrowing shift of 2
+endfunc
+
+function ff_vvc_put_luma_v4_12_neon, export=1
+        put_luma_v4_xx_neon 4                       // 12-bit entry point: 4-column kernel with a narrowing shift of 4
+endfunc
+
+.macro put_luma_v8_xx_neon shift                    // 8-tap vertical filter, 8 columns wide, 4 output rows per loop pass; \shift = final narrowing shift
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)  // x9 = byte step between output rows (post-index of every st1 below)
+        sub             x1, x1, x2, lsl #1          // x1 -= 2 * x2 ...
+        ld1             {v0.8b}, [x5]               // eight int8 filter taps from x5 (vf in the C prototype)
+        sub             x1, x1, x2                  // ... x1 -= x2: source now starts 3 rows above the first output row
+        sxtl            v0.8h, v0.8b                // sign-extend the taps to 16 bit for the by-element multiplies
+        ld1             {v20.8h}, [x1], x2          // preload window rows 0..6 into v20..v26 (8 samples each)
+        ld1             {v21.8h}, [x1], x2
+        ld1             {v22.8h}, [x1], x2
+        ld1             {v23.8h}, [x1], x2
+        ld1             {v24.8h}, [x1], x2
+        ld1             {v25.8h}, [x1], x2
+        ld1             {v26.8h}, [x1], x2
+1:
+        ld1             {v27.8h}, [x1], x2          // row 7 completes the window for output row 0
+
+        smull           v1.4s, v20.4h, v0.h[0]      // output row 0: v1 accumulates columns 0..3 ...
+        smull2          v2.4s, v20.8h, v0.h[0]      // ... v2 accumulates columns 4..7
+        smlal           v1.4s, v21.4h, v0.h[1]
+        smlal2          v2.4s, v21.8h, v0.h[1]
+        smlal           v1.4s, v22.4h, v0.h[2]
+        smlal2          v2.4s, v22.8h, v0.h[2]
+        smlal           v1.4s, v23.4h, v0.h[3]
+        smlal2          v2.4s, v23.8h, v0.h[3]
+        smlal           v1.4s, v24.4h, v0.h[4]
+        smlal2          v2.4s, v24.8h, v0.h[4]
+        smlal           v1.4s, v25.4h, v0.h[5]
+        smlal2          v2.4s, v25.8h, v0.h[5]
+        smlal           v1.4s, v26.4h, v0.h[6]
+        smlal2          v2.4s, v26.8h, v0.h[6]
+        smlal           v1.4s, v27.4h, v0.h[7]
+        smlal2          v2.4s, v27.8h, v0.h[7]
+        sqshrn          v1.4h, v1.4s, #(\shift)     // >> \shift with signed saturation, narrowed to 16 bit
+        sqshrn          v2.4h, v2.4s, #(\shift)
+
+        ld1             {v28.8h}, [x1], x2          // row 8, needed for output row 1
+        st1             {v1.4h-v2.4h}, [x0], x9     // store output row 0 (two 4-lane halves, 8 samples)
+
+        smull           v3.4s, v21.4h, v0.h[0]      // output row 1: v3 = columns 0..3, v4 = columns 4..7
+        smull2          v4.4s, v21.8h, v0.h[0]
+        smlal           v3.4s, v22.4h, v0.h[1]
+        smlal2          v4.4s, v22.8h, v0.h[1]
+        smlal           v3.4s, v23.4h, v0.h[2]
+        smlal2          v4.4s, v23.8h, v0.h[2]
+        smlal           v3.4s, v24.4h, v0.h[3]
+        smlal2          v4.4s, v24.8h, v0.h[3]
+        smlal           v3.4s, v25.4h, v0.h[4]
+        smlal2          v4.4s, v25.8h, v0.h[4]
+        smlal           v3.4s, v26.4h, v0.h[5]
+        smlal2          v4.4s, v26.8h, v0.h[5]
+        smlal           v3.4s, v27.4h, v0.h[6]
+        smlal2          v4.4s, v27.8h, v0.h[6]
+        smlal           v3.4s, v28.4h, v0.h[7]
+        smlal2          v4.4s, v28.8h, v0.h[7]
+        sqshrn          v3.4h, v3.4s, #(\shift)
+        sqshrn          v4.4h, v4.4s, #(\shift)
+
+        ld1             {v29.8h}, [x1], x2          // row 9, needed for output row 2
+        st1             {v3.4h-v4.4h}, [x0], x9     // store output row 1
+
+        smull           v1.4s, v22.4h, v0.h[0]      // output row 2
+        smull2          v2.4s, v22.8h, v0.h[0]
+        smlal           v1.4s, v23.4h, v0.h[1]
+        smlal2          v2.4s, v23.8h, v0.h[1]
+        smlal           v1.4s, v24.4h, v0.h[2]
+        smlal2          v2.4s, v24.8h, v0.h[2]
+        smlal           v1.4s, v25.4h, v0.h[3]
+        smlal2          v2.4s, v25.8h, v0.h[3]
+        smlal           v1.4s, v26.4h, v0.h[4]
+        smlal2          v2.4s, v26.8h, v0.h[4]
+        smlal           v1.4s, v27.4h, v0.h[5]
+        smlal2          v2.4s, v27.8h, v0.h[5]
+        smlal           v1.4s, v28.4h, v0.h[6]
+        smlal2          v2.4s, v28.8h, v0.h[6]
+        smlal           v1.4s, v29.4h, v0.h[7]
+        smlal2          v2.4s, v29.8h, v0.h[7]
+        sqshrn          v1.4h, v1.4s, #(\shift)
+        sqshrn          v2.4h, v2.4s, #(\shift)
+
+        ld1             {v30.8h}, [x1], x2          // row 10, needed for output row 3
+        st1             {v1.4h-v2.4h}, [x0], x9     // store output row 2
+
+        smull           v3.4s, v23.4h, v0.h[0]      // output row 3
+        smull2          v4.4s, v23.8h, v0.h[0]
+        smlal           v3.4s, v24.4h, v0.h[1]
+        smlal2          v4.4s, v24.8h, v0.h[1]
+        smlal           v3.4s, v25.4h, v0.h[2]
+        smlal2          v4.4s, v25.8h, v0.h[2]
+        smlal           v3.4s, v26.4h, v0.h[3]
+        smlal2          v4.4s, v26.8h, v0.h[3]
+        smlal           v3.4s, v27.4h, v0.h[4]
+        smlal2          v4.4s, v27.8h, v0.h[4]
+        smlal           v3.4s, v28.4h, v0.h[5]
+        smlal2          v4.4s, v28.8h, v0.h[5]
+        smlal           v3.4s, v29.4h, v0.h[6]
+        smlal2          v4.4s, v29.8h, v0.h[6]
+        smlal           v3.4s, v30.4h, v0.h[7]
+        smlal2          v4.4s, v30.8h, v0.h[7]
+        sqshrn          v3.4h, v3.4s, #(\shift)
+        sqshrn          v4.4h, v4.4s, #(\shift)
+
+        mov             v20.16b, v24.16b            // slide the window down by 4 rows: rows 4..10 become rows 0..6
+        mov             v21.16b, v25.16b
+        mov             v22.16b, v26.16b
+        mov             v23.16b, v27.16b
+        mov             v24.16b, v28.16b
+        mov             v25.16b, v29.16b
+        mov             v26.16b, v30.16b
+
+        subs            w3, w3, #4                  // 4 rows done; NOTE(review): assumes height is a multiple of 4 - confirm with callers
+        st1             {v3.4h-v4.4h}, [x0], x9     // store output row 3 (st1 leaves the flags of subs intact)
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_luma_v8_10_neon, export=1
+        put_luma_v8_xx_neon 2                       // 10-bit entry point: 8-column kernel with a narrowing shift of 2
+endfunc
+
+function ff_vvc_put_luma_v8_12_neon, export=1
+        put_luma_v8_xx_neon 4                       // 12-bit entry point: 8-column kernel with a narrowing shift of 4
+endfunc
+
+.macro put_luma_v_x16_vector_filter shift           // one 16-column output row: taps in v1, rows in v16..v31, result in v6/v7, clobbers v2-v5
+        smull           v2.4s, v16.4h, v1.h[0]      // columns 0..7: even registers v16..v30, tap lane k pairs with v(16+2k)
+        smull2          v3.4s, v16.8h, v1.h[0]      // v2 = columns 0..3, v3 = columns 4..7
+        smlal           v2.4s, v18.4h, v1.h[1]
+        smlal2          v3.4s, v18.8h, v1.h[1]
+        smlal           v2.4s, v20.4h, v1.h[2]
+        smlal2          v3.4s, v20.8h, v1.h[2]
+        smlal           v2.4s, v22.4h, v1.h[3]
+        smlal2          v3.4s, v22.8h, v1.h[3]
+        smlal           v2.4s, v24.4h, v1.h[4]
+        smlal2          v3.4s, v24.8h, v1.h[4]
+        smlal           v2.4s, v26.4h, v1.h[5]
+        smlal2          v3.4s, v26.8h, v1.h[5]
+        smlal           v2.4s, v28.4h, v1.h[6]
+        smlal2          v3.4s, v28.8h, v1.h[6]
+        smlal           v2.4s, v30.4h, v1.h[7]
+        smlal2          v3.4s, v30.8h, v1.h[7]
+
+        smull           v4.4s, v17.4h, v1.h[0]      // columns 8..15: odd registers v17..v31, same tap lanes
+        smull2          v5.4s, v17.8h, v1.h[0]      // v4 = columns 8..11, v5 = columns 12..15
+        smlal           v4.4s, v19.4h, v1.h[1]
+        smlal2          v5.4s, v19.8h, v1.h[1]
+        smlal           v4.4s, v21.4h, v1.h[2]
+        smlal2          v5.4s, v21.8h, v1.h[2]
+        smlal           v4.4s, v23.4h, v1.h[3]
+        smlal2          v5.4s, v23.8h, v1.h[3]
+        smlal           v4.4s, v25.4h, v1.h[4]
+        smlal2          v5.4s, v25.8h, v1.h[4]
+        smlal           v4.4s, v27.4h, v1.h[5]
+        smlal2          v5.4s, v27.8h, v1.h[5]
+        smlal           v4.4s, v29.4h, v1.h[6]
+        smlal2          v5.4s, v29.8h, v1.h[6]
+        smlal           v4.4s, v31.4h, v1.h[7]
+        smlal2          v5.4s, v31.8h, v1.h[7]
+
+        sqshrn          v6.4h, v2.4s, #(\shift)     // >> \shift with signed saturation, narrowed into the low half of v6
+        sqshrn          v7.4h, v4.4s, #(\shift)     // same for the low half of v7
+        sqshrn2         v6.8h, v3.4s, #(\shift)     // high halves: v6 = columns 0..7, v7 = columns 8..15
+        sqshrn2         v7.8h, v5.4s, #(\shift)
+.endm
+
+.macro put_luma_v16_xx_neon shift                   // 8-tap vertical filter, 16 columns wide, 4 output rows per loop pass; \shift = final narrowing shift
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)  // x9 = byte step between output rows (post-index of every st1 below)
+        sub             x1, x1, x2, lsl #1          // x1 -= 2 * x2 ...
+        ld1             {v0.8b}, [x5]               // eight int8 filter taps from x5 (vf in the C prototype)
+        sub             x1, x1, x2                  // ... x1 -= x2: source now starts 3 rows above the first output row
+        sxtl            v0.8h, v0.8b                // v0 keeps the taps in natural order; v1 carries the rotated working copy
+        ld1             {v16.8h-v17.8h}, [x1], x2   // preload window rows 0..6, one register pair per row
+        ld1             {v18.8h-v19.8h}, [x1], x2
+        ld1             {v20.8h-v21.8h}, [x1], x2
+        ld1             {v22.8h-v23.8h}, [x1], x2
+        ld1             {v24.8h-v25.8h}, [x1], x2
+        ld1             {v26.8h-v27.8h}, [x1], x2
+        ld1             {v28.8h-v29.8h}, [x1], x2
+1:
+        mov             v1.16b, v0.16b              // output row 0: rows sit in register order, so taps are used unrotated
+        ld1             {v30.8h-v31.8h}, [x1], x2   // row 7 completes the window
+
+        put_luma_v_x16_vector_filter \shift
+
+        ld1             {v16.8h-v17.8h}, [x1], x2   // row 8 overwrites the oldest row (row 0)
+        ext             v1.16b, v0.16b, v0.16b, #14 // rotate taps by one lane: lane 0 = tap 7, so the new row in v16/v17 meets the last tap
+        st1             {v6.8h-v7.8h}, [x0], x9     // store output row 0
+
+        put_luma_v_x16_vector_filter \shift
+
+        ld1             {v18.8h-v19.8h}, [x1], x2   // row 9 overwrites row 1
+        ext             v1.16b, v0.16b, v0.16b, #12 // rotate taps by two lanes: lanes 0/1 = taps 6/7
+        st1             {v6.8h-v7.8h}, [x0], x9     // store output row 1
+
+        put_luma_v_x16_vector_filter \shift
+
+        ld1             {v20.8h-v21.8h}, [x1], x2   // row 10 overwrites row 2
+        ext             v1.16b, v0.16b, v0.16b, #10 // rotate taps by three lanes: lanes 0..2 = taps 5..7
+        st1             {v6.8h-v7.8h}, [x0], x9     // store output row 2
+
+        put_luma_v_x16_vector_filter \shift
+
+        subs            w3, w3, #4                  // 4 rows done; the vector moves below leave these flags intact for b.gt
+        st1             {v6.8h-v7.8h}, [x0], x9     // store output row 3
+
+        mov             v2.16b, v16.16b             // restore register order for the next pass: swap (v16,v17) with (v24,v25) via v2/v3
+        mov             v3.16b, v17.16b
+        mov             v16.16b, v24.16b
+        mov             v17.16b, v25.16b
+        mov             v24.16b, v2.16b
+        mov             v25.16b, v3.16b
+
+        mov             v2.16b, v18.16b             // swap (v18,v19) with (v26,v27)
+        mov             v3.16b, v19.16b
+        mov             v18.16b, v26.16b
+        mov             v19.16b, v27.16b
+        mov             v26.16b, v2.16b
+        mov             v27.16b, v3.16b
+
+        mov             v2.16b, v20.16b             // swap (v20,v21) with (v28,v29)
+        mov             v3.16b, v21.16b
+        mov             v20.16b, v28.16b
+        mov             v21.16b, v29.16b
+        mov             v28.16b, v2.16b
+        mov             v29.16b, v3.16b
+
+        mov             v22.16b, v30.16b            // row 7 moves into the fourth slot; v30/v31 are reloaded at the top of the loop
+        mov             v23.16b, v31.16b
+        b.gt            1b                          // NOTE(review): assumes height is a multiple of 4 - confirm with callers
+        ret
+.endm
+
+function ff_vvc_put_luma_v16_10_neon, export=1
+        put_luma_v16_xx_neon 2                      // 10-bit entry point: 16-column kernel with a narrowing shift of 2
+endfunc
+
+function ff_vvc_put_luma_v16_12_neon, export=1
+        put_luma_v16_xx_neon 4                      // 12-bit entry point: 16-column kernel with a narrowing shift of 4
+endfunc
+
+
+.macro put_luma_v_x16_xx_neon shift                 // 8-tap vertical filter in 16-column strips up to the width in w6, 4 output rows per band
+        mov             x9, #(VVC_MAX_PB_SIZE * 2)  // x9 = byte step between output rows (post-index of every st1 below)
+        sub             x1, x1, x2, lsl #1          // x1 -= 2 * x2 ...
+        ld1             {v0.8b}, [x5]               // eight int8 filter taps from x5 (vf in the C prototype)
+        sub             x1, x1, x2                  // ... x1 -= x2: source now starts 3 rows above the first output row
+        sxtl            v0.8h, v0.8b                // v0 keeps the taps in natural order; v1 carries the rotated working copy
+1:                                                  // outer loop: one band of 4 output rows
+        mov             w8, #0                      // column offset in samples; a write to w8 also clears the upper half of x8
+2:                                                  // inner loop: one 16-column strip of the current band
+        add             x11, x1, x8, lsl #1         // x11 = source pointer of this strip (2 bytes per sample)
+        add             x10, x0, x8, lsl #1         // x10 = destination pointer of this strip
+        ld1             {v16.8h-v17.8h}, [x11], x2  // the full 8-row window is reloaded for every strip
+        add             x8, x8, #16                 // advance to the next strip
+        cmp             w8, w6                      // flags are consumed by b.lt far below; nothing in between writes NZCV
+        ld1             {v20.8h-v21.8h}, [x11], x2
+        mov             v1.16b, v0.16b              // output row 0 uses the taps unrotated
+        ld1             {v22.8h-v23.8h}, [x11], x2
+        ld1             {v24.8h-v25.8h}, [x11], x2
+        ld1             {v26.8h-v27.8h}, [x11], x2
+        ld1             {v28.8h-v29.8h}, [x11], x2
+        ld1             {v30.8h-v31.8h}, [x11], x2
+
+        put_luma_v_x16_vector_filter \shift
+
+        ld1             {v16.8h-v17.8h}, [x11], x2  // row 8 overwrites the oldest row (row 0)
+        ext             v1.16b, v0.16b, v0.16b, #14 // rotate taps by one lane: lane 0 = tap 7, matching the new row in v16/v17
+        st1             {v6.8h-v7.8h}, [x10], x9    // store output row 0
+
+        put_luma_v_x16_vector_filter \shift
+
+        st1             {v6.8h-v7.8h}, [x10], x9    // store output row 1
+        ext             v1.16b, v0.16b, v0.16b, #12 // rotate taps by two lanes: lanes 0/1 = taps 6/7
+        ld1             {v18.8h-v19.8h}, [x11], x2  // row 9 overwrites row 1
+
+        put_luma_v_x16_vector_filter \shift
+
+        ld1             {v20.8h-v21.8h}, [x11], x2  // row 10 overwrites row 2
+        ext             v1.16b, v0.16b, v0.16b, #10 // rotate taps by three lanes: lanes 0..2 = taps 5..7
+        st1             {v6.8h-v7.8h}, [x10], x9    // store output row 2
+
+        put_luma_v_x16_vector_filter \shift
+
+        st1             {v6.8h-v7.8h}, [x10], x9    // store output row 3
+        b.lt            2b                          // more strips left in this band (flags from the cmp above)
+        add             x0, x0, x9, lsl #2          // destination advances by 4 rows
+        subs            w3, w3, #4                  // 4 rows done; NOTE(review): assumes height is a multiple of 4 - confirm with callers
+        add             x1, x1, x2, lsl #2          // source advances by 4 rows (plain add, flags of subs stay intact)
+        b.gt            1b
+        ret
+.endm
+
+function ff_vvc_put_luma_v_x16_10_neon, export=1
+        put_luma_v_x16_xx_neon 2                    // 10-bit entry point: strip kernel with a narrowing shift of 2
+endfunc
+
+function ff_vvc_put_luma_v_x16_12_neon, export=1
+        put_luma_v_x16_xx_neon 4                    // 12-bit entry point: strip kernel with a narrowing shift of 4
+endfunc

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] aarch64/vvc: Optimisations of put_luma_v() functions for 10/12-bit (branch master)

Reply via email to