PR #23507 opened by Jun Zhao (mypopydev)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23507
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23507.patch

Add AArch64 NEON implementations of HEVC 8-bit intra angular prediction for 
mode 18 and the horizontal/vertical positive-angle mode families (modes 2-9 and 
27-34) across all block sizes (4x4/8x8/16x16/32x32), each commit self-contained 
and wired into per-size dispatch with C fallbacks for the not-yet-ported modes.


>From 610f267dbd05178ec2dd80db8cd187a1f6e4971a Mon Sep 17 00:00:00 2001
From: Jun Zhao <[email protected]>
Date: Tue, 16 Jun 2026 16:43:38 +0800
Subject: [PATCH 1/3] lavc/hevc: add aarch64 NEON for angular mode 18

Add NEON-optimized implementation for HEVC angular intra prediction
mode 18 (diagonal mode, angle=-32) at 8-bit depth.

Mode 18 is a special case where:
- angle = -32, so idx = -(y+1), fact = 0 (no interpolation needed)
- Row y copies from ref[-y..size-1-y], where ref is built from
  reversed left samples and top samples

Supports all block sizes (4x4, 8x8, 16x16, 32x32):
- 4x4/8x8: Uses register-based ref array with ext instructions
- 16x16: Uses dual-register EXT approach (v0=left reversed, v1=top)
- 32x32: Uses stack-based ref array for larger reference range

The pred_angular[] dispatch wrapper now routes mode 18 to the NEON
implementation, with the existing C fallback kept for the remaining
angular modes, so the decoder uses it immediately.

Speedup over C on Apple M4 (checkasm --bench, 10-run median):

    mode |  4x4    8x8   16x16  32x32
    -----+---------------------------
      18 | 4.50   8.40   4.71   3.14

Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcpred_init_aarch64.c |  27 +++-
 libavcodec/aarch64/hevcpred_neon.S         | 159 +++++++++++++++++++++
 2 files changed, 181 insertions(+), 5 deletions(-)

diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c 
b/libavcodec/aarch64/hevcpred_init_aarch64.c
index 03fc5c490e..6d5bde7d29 100644
--- a/libavcodec/aarch64/hevcpred_init_aarch64.c
+++ b/libavcodec/aarch64/hevcpred_init_aarch64.c
@@ -75,6 +75,20 @@ void ff_hevc_pred_angular_mode_26_8_neon(uint8_t *src, const 
uint8_t *top,
                                         const uint8_t *left, ptrdiff_t stride,
                                         int c_idx, int log2_size);
 
+// Mode 18 (diagonal, angle=-32); the asm ignores c_idx and log2_size
+void ff_hevc_pred_angular_mode_18_4x4_8_neon(uint8_t *src, const uint8_t *top,
+                                            const uint8_t *left, ptrdiff_t 
stride,
+                                            int c_idx, int log2_size);
+void ff_hevc_pred_angular_mode_18_8x8_8_neon(uint8_t *src, const uint8_t *top,
+                                            const uint8_t *left, ptrdiff_t 
stride,
+                                            int c_idx, int log2_size);
+void ff_hevc_pred_angular_mode_18_16x16_8_neon(uint8_t *src, const uint8_t 
*top,
+                                              const uint8_t *left, ptrdiff_t 
stride,
+                                              int c_idx, int log2_size);
+void ff_hevc_pred_angular_mode_18_32x32_8_neon(uint8_t *src, const uint8_t 
*top,
+                                              const uint8_t *left, ptrdiff_t 
stride,
+                                              int c_idx, int log2_size);
+
 static void pred_dc_neon(uint8_t *src, const uint8_t *top,
                          const uint8_t *left, ptrdiff_t stride,
                          int log2_size, int c_idx)
@@ -97,7 +111,7 @@ static void pred_dc_neon(uint8_t *src, const uint8_t *top,
     }
 }
 
-#define PRED_ANGULAR_NEON(IDX, LOG2)                                          \
+#define PRED_ANGULAR_NEON(IDX, SZ, LOG2)                                      \
 static void pred_angular_##IDX##_neon(uint8_t *src, const uint8_t *top,       \
                                       const uint8_t *left, ptrdiff_t stride,  \
                                       int c_idx, int mode)                    \
@@ -105,6 +119,9 @@ static void pred_angular_##IDX##_neon(uint8_t *src, const 
uint8_t *top,       \
     if (mode == 10)                                                           \
         ff_hevc_pred_angular_mode_10_8_neon(src, top, left, stride,           \
                                            c_idx, LOG2);                      \
+    else if (mode == 18)                                                      \
+        ff_hevc_pred_angular_mode_18_##SZ##_8_neon(src, top, left, stride,    \
+                                                   c_idx, LOG2);              \
     else if (mode == 26)                                                      \
         ff_hevc_pred_angular_mode_26_8_neon(src, top, left, stride,           \
                                            c_idx, LOG2);                      \
@@ -112,10 +129,10 @@ static void pred_angular_##IDX##_neon(uint8_t *src, const 
uint8_t *top,       \
         ff_hevc_pred_angular_##IDX##_8(src, top, left, stride, c_idx, mode);  \
 }
 
-PRED_ANGULAR_NEON(0, 2)
-PRED_ANGULAR_NEON(1, 3)
-PRED_ANGULAR_NEON(2, 4)
-PRED_ANGULAR_NEON(3, 5)
+PRED_ANGULAR_NEON(0, 4x4,   2)
+PRED_ANGULAR_NEON(1, 8x8,   3)
+PRED_ANGULAR_NEON(2, 16x16, 4)
+PRED_ANGULAR_NEON(3, 32x32, 5)
 
 #undef PRED_ANGULAR_NEON
 
diff --git a/libavcodec/aarch64/hevcpred_neon.S 
b/libavcodec/aarch64/hevcpred_neon.S
index f21492318c..1a49731965 100644
--- a/libavcodec/aarch64/hevcpred_neon.S
+++ b/libavcodec/aarch64/hevcpred_neon.S
@@ -1341,3 +1341,162 @@ function ff_hevc_pred_angular_mode_26_8_neon, export=1
 .Lmode26_ret:
         ret
 endfunc
+
+// 
-----------------------------------------------------------------------------
+// pred_angular_mode_18_4x4_8: Mode 18 prediction for 4x4 block
+// Row 0: top[-1], top[0], top[1], top[2]
+// Row 1: left[0], top[-1], top[0], top[1]
+// Row 2: left[1], left[0], top[-1], top[0]
+// Row 3: left[2], left[1], left[0], top[-1]
+// Arguments:
+// x0: src
+// x1: top
+// x2: left
+// x3: stride
+// w4: c_idx (unused, mode 18 has no edge smoothing)
+// w5: log2_size (unused, size is fixed per entry point)
+// 
-----------------------------------------------------------------------------
+function ff_hevc_pred_angular_mode_18_4x4_8_neon, export=1
+        // Build the 8-byte ref window ref[-4..3] in v0
+        // ref[-4..-1] = left[3], left[2], left[1], left[0]  (reversed)
+        // ref[0..3] = top[-1..2]
+
+        // Load left[0..3] and reverse the four bytes
+        ldr             s0, [x2]                // left[0..3]
+        rev32           v0.8b, v0.8b            // v0 = {left[3], left[2], 
left[1], left[0], ...}
+
+        // Load top[-1..2]: only these four bytes are consumed below
+        sub             x4, x1, #1              // x4 = &top[-1] (c_idx is not needed)
+        ldr             s1, [x4]                // top[-1..2]
+
+        // Combine into 8 bytes: {left[3,2,1,0], top[-1,0,1,2]}
+        ins             v0.s[1], v1.s[0]        // v0 = {left[3,2,1,0], 
top[-1,0,1,2], ...}
+
+        // Rows 0-3: slide a 4-byte window via ext, then batch the stores so
+        // only one pointer update is needed for the four rows.
+        ext             v2.8b, v0.8b, v0.8b, #4 // row 0: ref[0..3]  = 
top[-1..2]
+        ext             v3.8b, v0.8b, v0.8b, #3 // row 1: ref[-1..2] = v0[3..6]
+        ext             v4.8b, v0.8b, v0.8b, #2 // row 2: ref[-2..1] = v0[2..5]
+        ext             v5.8b, v0.8b, v0.8b, #1 // row 3: ref[-3..0] = v0[1..4]
+        str             s2, [x0]                // row 0
+        str             s3, [x0, x3]            // row 1
+        add             x0, x0, x3, lsl #1      // src += 2 * stride
+        str             s4, [x0]                // row 2
+        str             s5, [x0, x3]            // row 3
+
+        ret
+endfunc
+
+// 
-----------------------------------------------------------------------------
+// pred_angular_mode_18_8x8_8: Mode 18 prediction for 8x8 block
+// Arguments:
+// x0: src
+// x1: top
+// x2: left
+// x3: stride
+// w4: c_idx (unused, mode 18 has no edge smoothing)
+// w5: log2_size (unused, size is fixed per entry point)
+// 
-----------------------------------------------------------------------------
+function ff_hevc_pred_angular_mode_18_8x8_8_neon, export=1
+        // ref[-8..-1] = left[7..0] (reversed)
+        // ref[0..7] = top[-1..6]
+
+        // Load left[0..7] and reverse
+        ldr             d0, [x2]                // left[0..7]
+        rev64           v0.8b, v0.8b            // {left[7..0]}
+
+        // Load top[-1..6]: only these 8 bytes feed the ref window
+        sub             x4, x1, #1              // x4 = &top[-1] (c_idx is not needed)
+        ldr             d1, [x4]                // top[-1..6]
+
+        // Combine into v2 (16 bytes): {left[7..0], top[-1..6]}
+        mov             v2.d[0], v0.d[0]        // v2[0..7] = left[7..0]
+        mov             v2.d[1], v1.d[0]        // v2[8..15] = top[-1..6]
+
+        // Row 0: ref[0..7] = top[-1..6] = v2[8..15]
+        st1             {v2.d}[1], [x0], x3
+
+        // Rows 1-7 (offset = 8 - row): low 8 bytes of the ext are v2[offset..offset+7]
+.irp offset, 7, 6, 5, 4, 3, 2, 1
+        ext             v3.16b, v2.16b, v2.16b, #\offset
+        st1             {v3.8b}, [x0], x3
+.endr
+
+        ret
+endfunc
+
+// 
-----------------------------------------------------------------------------
+// pred_angular_mode_18_16x16_8: Mode 18 prediction for 16x16 block
+// ref[-16..-1] = left[15..0] reversed, ref[0..15] = top[-1..14]
+// Arguments:
+// x0: src
+// x1: top
+// x2: left
+// x3: stride
+// w4: c_idx (unused, mode 18 has no edge smoothing)
+// w5: log2_size (unused, size is fixed per entry point)
+// 
-----------------------------------------------------------------------------
+function ff_hevc_pred_angular_mode_18_16x16_8_neon, export=1
+        // Both halves of ref[] stay in registers; EXT slides a window across {v0:v1}.
+        // v0 = left[15..0] (reversed), v1 = top[-1..14]
+        // Row k: need ref[-k..15-k] = EXT(v0, v1, #16-k) for k=1..15, row 0 = 
v1.
+
+        ldr             q0, [x2]                // left[0..15]
+        rev64           v0.16b, v0.16b          // byte-reverse each 64-bit half
+        ext             v0.16b, v0.16b, v0.16b, #8  // swap halves: v0 = left[15..0]
+        sub             x4, x1, #1              // x4 = &top[-1] (c_idx is not needed)
+        ldr             q1, [x4]                // v1 = top[-1..14]
+
+        // Row 0: ref[0..15] = v1
+        st1             {v1.16b}, [x0], x3
+        // Rows 1-15 (offset = 16 - row): tail of v0 followed by the head of v1
+.irp offset, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+        ext             v2.16b, v0.16b, v1.16b, #\offset
+        st1             {v2.16b}, [x0], x3
+.endr
+
+        ret
+endfunc
+
+// 
-----------------------------------------------------------------------------
+// pred_angular_mode_18_32x32_8: Mode 18 prediction for 32x32 block
+// ref[-32..-1] = left[31..0] reversed, ref[0..31] = top[-1..30]
+// Arguments:
+// x0: src
+// x1: top
+// x2: left
+// x3: stride
+// w4: c_idx (unused, mode 18 has no edge smoothing)
+// w5: log2_size (unused, size is fixed per entry point)
+// 
-----------------------------------------------------------------------------
+function ff_hevc_pred_angular_mode_18_32x32_8_neon, export=1
+        // Build the 64-byte ref array (ref[-32..31]) on the stack, then slide 
a
+        // 32-byte window one byte left per row.
+        sub             sp, sp, #64             // reserve 64 bytes of scratch
+
+        // Store left[31..0] reversed at sp[0..31] (ref[-32..-1])
+        ldp             q0, q1, [x2]            // left[0..31]
+        rev64           v0.16b, v0.16b          // byte-reverse each 64-bit half
+        ext             v0.16b, v0.16b, v0.16b, #8  // swap halves: left[15..0]
+        rev64           v1.16b, v1.16b          // byte-reverse each 64-bit half
+        ext             v1.16b, v1.16b, v1.16b, #8  // swap halves: left[31..16]
+        stp             q1, q0, [sp]            // {left[31..16], left[15..0]}
+
+        // Store top[-1..30] at sp[32..63] (ref[0..31])
+        sub             x4, x1, #1              // x4 = &top[-1] (c_idx is not needed)
+        ldp             q2, q3, [x4]            // top[-1..30]
+        stp             q2, q3, [sp, #32]       // ref[0..31]
+
+        // ref_base = sp + 32 (so ref[0] = sp[32], ref[-1] = sp[31], etc.)
+        add             x4, sp, #32             // x4 = &ref[0], window start for row 0
+        mov             w5, #32                 // row counter (reuses the log2_size register)
+
+1:      ldp             q0, q1, [x4]            // row y = ref[-y..31-y]
+        sub             x4, x4, #1              // slide ref pointer one byte 
left
+        subs            w5, w5, #1              // one row done; flags drive the branch
+        st1             {v0.16b, v1.16b}, [x0], x3  // store 32 bytes, src += stride
+        b.gt            1b                      // loop until all 32 rows are written
+
+        add             sp, sp, #64             // release the scratch area
+        ret
+endfunc
-- 
2.52.0


>From 8093014d5310019fde1410bbfbe503cf5283371f Mon Sep 17 00:00:00 2001
From: Jun Zhao <[email protected]>
Date: Tue, 16 Jun 2026 16:44:41 +0800
Subject: [PATCH 2/3] lavc/hevc: add aarch64 NEON for angular V positive (modes
 27-34)

Add NEON-optimized implementations for HEVC angular intra prediction
modes 27-34 (vertical positive angles) at 8-bit depth.

These modes use the top reference with positive angles, computing:
- idx = ((y+1) * angle) >> 5
- fact = ((y+1) * angle) & 31
- Interpolate between ref[x+idx+1] and ref[x+idx+2] using fact (ref = top - 1)

Mode 34 (angle=32) is optimized as a pure diagonal copy since fact=0.

Supports all block sizes (4x4, 8x8, 16x16, 32x32).

The pred_angular[] dispatch wrapper now routes modes 27-34 to the NEON
implementation, with the C fallback kept for the remaining angular
modes, so the decoder uses it immediately.

Speedup over C on Apple M4 (checkasm --bench, 10-run median):

    mode |  4x4    8x8   16x16  32x32
    -----+---------------------------
      27 | 5.00   5.11   5.94   8.69
      28 | 5.00   5.19   7.18   8.96
      29 | 4.95   5.18   7.10   8.97
      30 | 5.05   5.18   7.14   8.95
      31 | 5.05   5.02   7.10   8.92
      32 | 5.02   5.09   7.12   8.93
      33 | 5.00   5.02   6.67   8.73
      34 | 2.00   6.07   1.89   1.90  (pure diagonal copy)

Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcpred_init_aarch64.c |  17 ++
 libavcodec/aarch64/hevcpred_neon.S         | 275 +++++++++++++++++++++
 2 files changed, 292 insertions(+)

diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c 
b/libavcodec/aarch64/hevcpred_init_aarch64.c
index 6d5bde7d29..f89635376f 100644
--- a/libavcodec/aarch64/hevcpred_init_aarch64.c
+++ b/libavcodec/aarch64/hevcpred_init_aarch64.c
@@ -89,6 +89,20 @@ void ff_hevc_pred_angular_mode_18_32x32_8_neon(uint8_t *src, 
const uint8_t *top,
                                               const uint8_t *left, ptrdiff_t 
stride,
                                               int c_idx, int log2_size);
 
+// Positive angle vertical modes (mode 27-34); last argument is the mode
+void ff_hevc_pred_angular_v_pos_4x4_8_neon(uint8_t *src, const uint8_t *top,
+                                          const uint8_t *left, ptrdiff_t 
stride,
+                                          int c_idx, int mode);
+void ff_hevc_pred_angular_v_pos_8x8_8_neon(uint8_t *src, const uint8_t *top,
+                                          const uint8_t *left, ptrdiff_t 
stride,
+                                          int c_idx, int mode);
+void ff_hevc_pred_angular_v_pos_16x16_8_neon(uint8_t *src, const uint8_t *top,
+                                            const uint8_t *left, ptrdiff_t 
stride,
+                                            int c_idx, int mode);
+void ff_hevc_pred_angular_v_pos_32x32_8_neon(uint8_t *src, const uint8_t *top,
+                                            const uint8_t *left, ptrdiff_t 
stride,
+                                            int c_idx, int mode);
+
 static void pred_dc_neon(uint8_t *src, const uint8_t *top,
                          const uint8_t *left, ptrdiff_t stride,
                          int log2_size, int c_idx)
@@ -125,6 +139,9 @@ static void pred_angular_##IDX##_neon(uint8_t *src, const 
uint8_t *top,       \
     else if (mode == 26)                                                      \
         ff_hevc_pred_angular_mode_26_8_neon(src, top, left, stride,           \
                                            c_idx, LOG2);                      \
+    else if (mode >= 27)                                                      \
+        ff_hevc_pred_angular_v_pos_##SZ##_8_neon(src, top, left, stride,      \
+                                                 c_idx, mode);                \
     else                                                                      \
         ff_hevc_pred_angular_##IDX##_8(src, top, left, stride, c_idx, mode);  \
 }
diff --git a/libavcodec/aarch64/hevcpred_neon.S 
b/libavcodec/aarch64/hevcpred_neon.S
index 1a49731965..bb49272764 100644
--- a/libavcodec/aarch64/hevcpred_neon.S
+++ b/libavcodec/aarch64/hevcpred_neon.S
@@ -1500,3 +1500,278 @@ function ff_hevc_pred_angular_mode_18_32x32_8_neon, 
export=1
         add             sp, sp, #64
         ret
 endfunc
+
+// 
=============================================================================
+// Angular Prediction - Vertical reference modes (Mode 27-34)
+// 
=============================================================================
+
+// Angle table for V reference positive angles (mode 27-34), one byte per mode
+// angle = intra_pred_angle_v[mode - 27]; callers load it with ldrsb
+const intra_pred_angle_v, align=4
+        .byte   2       // mode 27
+        .byte   5       // mode 28
+        .byte   9       // mode 29
+        .byte   13      // mode 30
+        .byte   17      // mode 31
+        .byte   21      // mode 32
+        .byte   26      // mode 33
+        .byte   32      // mode 34 (fact is always 0: handled as a pure copy)
+endconst
+
+// 
-----------------------------------------------------------------------------
+// pred_angular_v_pos_4x4_8: Vertical reference positive angle prediction 
(mode 27-34)
+// Arguments:
+// x0: src
+// x1: top
+// x2: left (unused for V reference modes)
+// x3: stride
+// w4: c_idx
+// w5: mode
+// 
-----------------------------------------------------------------------------
+function ff_hevc_pred_angular_v_pos_4x4_8_neon, export=1
+        // Look up the per-mode angle (signed byte, indexed by mode - 27)
+        movrel          x6, intra_pred_angle_v
+        sub             w7, w5, #27            // mode - 27 (index into angle 
table)
+        ldrsb           w8, [x6, w7, sxtw]     // angle = 
intra_pred_angle_v[mode-27]
+
+        // For mode 34 (angle=32), fact is always 0, optimize as pure copy
+        cmp             w8, #32
+        b.eq            .Lv_pos_4x4_mode34
+
+        mov             w10, #0                 // angle_acc = 0
+        movi            v18.16b, #32            // weight base: 32 for (32 - 
fact)
+
+.macro v_pos_4x4_row
+        add             w10, w10, w8            // angle_acc = (y+1) * angle
+        asr             w11, w10, #5            // idx = angle_acc >> 5
+        and             w12, w10, #31           // fact = angle_acc & 31
+
+        // Two overlapping 4-byte loads: top[idx..idx+3] and top[idx+1..idx+4]
+        add             x13, x1, w11, sxtw      // x13 = top + idx
+        dup             v17.8b, w12             // broadcast fact (hoisted for 
addr gap)
+        sub             v16.8b, v18.8b, v17.8b  // 32 - fact
+        ldr             s0, [x13]               // ref[idx+1] (2-instr gap 
from add)
+        ldr             s1, [x13, #1]           // ref[idx+2]
+
+        umull           v20.8h, v0.8b, v16.8b   // (32-fact) * ref[idx+1]
+        umlal           v20.8h, v1.8b, v17.8b   // + fact * ref[idx+2]
+        rshrn           v0.8b, v20.8h, #5       // rounding narrow: (sum + 16) >> 5
+
+        str             s0, [x0]                // str s is faster than 
lane-store on some cores
+        add             x0, x0, x3              // src += stride
+.endm
+        v_pos_4x4_row
+        v_pos_4x4_row
+        v_pos_4x4_row
+        v_pos_4x4_row
+.purgem v_pos_4x4_row
+
+        ret
+
+.Lv_pos_4x4_mode34:
+        // Mode 34: angle=32, each row copies from top[y+1..y+4]
+        // Row 0: top[1..4], Row 1: top[2..5], Row 2: top[3..6], Row 3: 
top[4..7]
+        ldr             s0, [x1, #1]            // row 0 source
+        ldr             s1, [x1, #2]            // row 1 source
+        ldr             s2, [x1, #3]            // row 2 source
+        ldr             s3, [x1, #4]            // row 3 source
+        str             s0, [x0]                // row 0
+        str             s1, [x0, x3]            // row 1
+        add             x0, x0, x3, lsl #1      // src += 2 * stride
+        str             s2, [x0]                // row 2
+        str             s3, [x0, x3]            // row 3
+        ret
+endfunc
+
+// 
-----------------------------------------------------------------------------
+// pred_angular_v_pos_8x8_8: Vertical reference positive angle prediction 
(mode 27-34)
+// Arguments:
+// x0: src
+// x1: top
+// x2: left (unused for V reference modes)
+// x3: stride
+// w4: c_idx
+// w5: mode
+// 
-----------------------------------------------------------------------------
+function ff_hevc_pred_angular_v_pos_8x8_8_neon, export=1
+        // Look up the per-mode angle (signed byte, indexed by mode - 27)
+        movrel          x6, intra_pred_angle_v
+        sub             w7, w5, #27            // mode - 27 (index into angle 
table)
+        ldrsb           w8, [x6, w7, sxtw]     // angle = 
intra_pred_angle_v[mode-27]
+
+        // Mode 34 (angle=32): fact is always 0, take the copy-only path
+        cmp             w8, #32
+        b.eq            .Lv_pos_8x8_mode34
+
+        mov             w9, #8                  // row counter
+        mov             w10, #0                 // angle_acc = 0
+        movi            v18.16b, #32            // weight base: 32 for (32 - 
fact)
+
+.Lv_pos_8x8_row_loop:
+        add             w10, w10, w8            // angle_acc = (y+1) * angle
+        asr             w11, w10, #5            // idx = angle_acc >> 5
+        and             w12, w10, #31           // fact = angle_acc & 31
+
+        add             x13, x1, w11, sxtw      // x13 = top + idx
+        dup             v17.8b, w12             // broadcast fact, hoisted for addr gap
+        sub             v16.8b, v18.8b, v17.8b  // 32 - fact
+        ldr             d0, [x13]               // ref[idx+1..idx+8]
+        ldr             d1, [x13, #1]           // ref[idx+2..idx+9]
+
+        umull           v20.8h, v0.8b, v16.8b   // (32-fact) * ref[x+idx+1]
+        umlal           v20.8h, v1.8b, v17.8b   // + fact * ref[x+idx+2]
+        rshrn           v0.8b, v20.8h, #5       // rounding narrow: (sum + 16) >> 5
+
+        subs            w9, w9, #1              // subs before store: helps 
in-order cores
+        st1             {v0.8b}, [x0], x3       // store row, src += stride
+        b.gt            .Lv_pos_8x8_row_loop
+
+        ret
+
+.Lv_pos_8x8_mode34:
+        // Mode 34: each row copies from top[y+1..y+8]; rows 0-6 unrolled here
+.irp off, 1, 2, 3, 4, 5, 6, 7
+        ldr             d0, [x1, #\off]
+        st1             {v0.8b}, [x0], x3
+.endr
+        ldr             d0, [x1, #8]            // row 7: top[8..15]
+        str             d0, [x0]                // last row: no pointer update 
needed
+        ret
+endfunc
+
+// 
-----------------------------------------------------------------------------
+// pred_angular_v_pos_16x16_8: Vertical reference positive angle prediction 
(mode 27-34)
+// Arguments:
+// x0: src
+// x1: top
+// x2: left (unused for V reference modes)
+// x3: stride
+// w4: c_idx
+// w5: mode
+// 
-----------------------------------------------------------------------------
+function ff_hevc_pred_angular_v_pos_16x16_8_neon, export=1
+        // Look up the per-mode angle (signed byte, indexed by mode - 27)
+        movrel          x6, intra_pred_angle_v
+        sub             w7, w5, #27            // mode - 27 (index into angle 
table)
+        ldrsb           w8, [x6, w7, sxtw]     // angle = 
intra_pred_angle_v[mode-27]
+
+        // Mode 34 (angle=32): fact is always 0, take the copy-only path
+        cmp             w8, #32
+        b.eq            .Lv_pos_16x16_mode34
+
+        mov             w9, #16                 // row counter
+        mov             w10, #0                 // angle_acc = 0
+        movi            v18.16b, #32            // weight base: 32 for (32 - 
fact)
+
+.Lv_pos_16x16_row_loop:
+        add             w10, w10, w8            // angle_acc = (y+1) * angle
+        asr             w11, w10, #5            // idx = angle_acc >> 5
+        and             w12, w10, #31           // fact = angle_acc & 31
+
+        add             x13, x1, w11, sxtw      // x13 = top + idx
+        dup             v17.16b, w12            // broadcast fact, hoisted for addr gap
+        sub             v16.16b, v18.16b, v17.16b   // 32 - fact
+        ldr             q0, [x13]               // ref[idx+1..idx+16]
+        ldr             q1, [x13, #1]           // ref[idx+2..idx+17]
+
+        // Low 8 bytes: (32-fact)*a + fact*b, then (sum + 16) >> 5
+        umull           v20.8h, v0.8b, v16.8b
+        umlal           v20.8h, v1.8b, v17.8b
+        rshrn           v2.8b, v20.8h, #5
+
+        // High 8 bytes: same blend on the upper halves
+        umull2          v21.8h, v0.16b, v16.16b
+        umlal2          v21.8h, v1.16b, v17.16b
+        rshrn2          v2.16b, v21.8h, #5
+
+        subs            w9, w9, #1              // subs before store: helps 
in-order cores
+        st1             {v2.16b}, [x0], x3      // store row, src += stride
+        b.gt            .Lv_pos_16x16_row_loop
+
+        ret
+
+.Lv_pos_16x16_mode34:
+        // Mode 34: each row copies from top[y+1..y+16], fully unrolled
+        add             x13, x1, #1             // x13 = top + 1
+.irp off, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+        ldr             q0, [x13, #\off]
+        st1             {v0.16b}, [x0], x3
+.endr
+        ret
+endfunc
+
+// 
-----------------------------------------------------------------------------
+// pred_angular_v_pos_32x32_8: Vertical reference positive angle prediction 
(mode 27-34)
+// Arguments:
+// x0: src
+// x1: top
+// x2: left (unused for V reference modes)
+// x3: stride
+// w4: c_idx
+// w5: mode
+// 
-----------------------------------------------------------------------------
+function ff_hevc_pred_angular_v_pos_32x32_8_neon, export=1
+        // Look up the per-mode angle (signed byte, indexed by mode - 27)
+        movrel          x6, intra_pred_angle_v
+        sub             w7, w5, #27            // mode - 27 (index into angle 
table)
+        ldrsb           w8, [x6, w7, sxtw]     // angle = 
intra_pred_angle_v[mode-27]
+
+        // Mode 34 (angle=32): fact is always 0, take the copy-only path
+        cmp             w8, #32
+        b.eq            .Lv_pos_32x32_mode34
+
+        mov             w9, #32                 // row counter
+        mov             w10, #0                 // angle_acc = 0
+        movi            v18.16b, #32            // weight base: 32 for (32 - 
fact)
+
+.Lv_pos_32x32_row_loop:
+        add             w10, w10, w8            // angle_acc = (y+1) * angle
+        asr             w11, w10, #5            // idx = angle_acc >> 5
+        and             w12, w10, #31           // fact = angle_acc & 31
+
+        add             x13, x1, w11, sxtw      // x13 = top + idx
+        dup             v17.16b, w12            // broadcast fact, hoisted for addr gap
+        sub             v16.16b, v18.16b, v17.16b   // 32 - fact
+
+        // Load 32 bytes twice, the second copy shifted by one byte (second tap)
+        ldr             q0, [x13]               // ref[idx+1..idx+16]
+        ldr             q1, [x13, #1]           // ref[idx+2..idx+17]
+        ldr             q2, [x13, #16]          // ref[idx+17..idx+32]
+        ldr             q3, [x13, #17]          // ref[idx+18..idx+33]
+
+        // First 16 bytes: (32-fact)*a + fact*b, then (sum + 16) >> 5
+        umull           v20.8h, v0.8b, v16.8b
+        umlal           v20.8h, v1.8b, v17.8b
+        rshrn           v4.8b, v20.8h, #5
+
+        umull2          v21.8h, v0.16b, v16.16b
+        umlal2          v21.8h, v1.16b, v17.16b
+        rshrn2          v4.16b, v21.8h, #5
+
+        // Second 16 bytes: same blend on q2/q3
+        umull           v22.8h, v2.8b, v16.8b
+        umlal           v22.8h, v3.8b, v17.8b
+        rshrn           v5.8b, v22.8h, #5
+
+        umull2          v23.8h, v2.16b, v16.16b
+        umlal2          v23.8h, v3.16b, v17.16b
+        rshrn2          v5.16b, v23.8h, #5
+
+        subs            w9, w9, #1              // subs before store: helps 
in-order cores
+        st1             {v4.16b, v5.16b}, [x0], x3  // store row, src += stride
+        b.gt            .Lv_pos_32x32_row_loop
+
+        ret
+
+.Lv_pos_32x32_mode34:
+        // Mode 34: each row copies from top[y+1..y+32]
+        add             x13, x1, #1             // top + 1
+        mov             w9, #32                 // row counter
+.Lv_pos_32x32_mode34_loop:
+        ldp             q0, q1, [x13]           // top[y+1..y+32]
+        add             x13, x13, #1            // next row starts one byte later
+        subs            w9, w9, #1              // one row done; flags drive the branch
+        st1             {v0.16b, v1.16b}, [x0], x3  // store row, src += stride
+        b.gt            .Lv_pos_32x32_mode34_loop
+        ret
+endfunc
-- 
2.52.0


>From 020e1334d83bc68455c99f71a0cb310e83dcb793 Mon Sep 17 00:00:00 2001
From: Jun Zhao <[email protected]>
Date: Tue, 16 Jun 2026 16:45:35 +0800
Subject: [PATCH 3/3] lavc/hevc: add aarch64 NEON for angular H positive (modes
 2-9)

Add NEON-optimized implementations for HEVC angular intra prediction
modes 2-9 (horizontal positive angles) at 8-bit depth.

These modes use the left reference with positive angles, computing:
- idx = ((x+1) * angle) >> 5
- fact = ((x+1) * angle) & 31
- Interpolate between ref[y+idx+1] and ref[y+idx+2] using fact (ref = left - 1)

Uses batch column computation with matrix transpose to convert
column-oriented interpolation results into contiguous row stores.

Mode 2 (angle=32) is optimized with direct row-wise contiguous writes
since each row copies left[y+1..y+size], avoiding interpolation.

Supports all block sizes (4x4, 8x8, 16x16, 32x32).

The pred_angular[] dispatch wrapper now routes modes 2-9 to the NEON
implementation, with the C fallback kept for the remaining angular
modes, so the decoder uses it immediately.

Speedup over C on Apple M4 (checkasm --bench, 10-run median):

    mode |  4x4    8x8   16x16   32x32
    -----+----------------------------
       2 | 6.71   8.29  22.69   32.68  (pure horizontal copy)
       3 | 3.57   5.11   6.04    8.44
       4 | 3.55   5.09   6.12    8.32
       5 | 3.55   4.97   6.13    8.33
       6 | 3.52   5.09   6.13    8.37
       7 | 3.54   5.08   6.14    8.35
       8 | 3.54   5.08   6.13    8.33
       9 | 3.59   4.82   5.66    8.42

Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcpred_init_aarch64.c |  17 +
 libavcodec/aarch64/hevcpred_neon.S         | 428 +++++++++++++++++++++
 2 files changed, 445 insertions(+)

diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c 
b/libavcodec/aarch64/hevcpred_init_aarch64.c
index f89635376f..2c7a5215d6 100644
--- a/libavcodec/aarch64/hevcpred_init_aarch64.c
+++ b/libavcodec/aarch64/hevcpred_init_aarch64.c
@@ -103,6 +103,20 @@ void ff_hevc_pred_angular_v_pos_32x32_8_neon(uint8_t *src, 
const uint8_t *top,
                                             const uint8_t *left, ptrdiff_t 
stride,
                                             int c_idx, int mode);
 
+// Positive angle horizontal modes (mode 2-9); last argument is the mode
+void ff_hevc_pred_angular_h_pos_4x4_8_neon(uint8_t *src, const uint8_t *top,
+                                          const uint8_t *left, ptrdiff_t 
stride,
+                                          int c_idx, int mode);
+void ff_hevc_pred_angular_h_pos_8x8_8_neon(uint8_t *src, const uint8_t *top,
+                                          const uint8_t *left, ptrdiff_t 
stride,
+                                          int c_idx, int mode);
+void ff_hevc_pred_angular_h_pos_16x16_8_neon(uint8_t *src, const uint8_t *top,
+                                            const uint8_t *left, ptrdiff_t 
stride,
+                                            int c_idx, int mode);
+void ff_hevc_pred_angular_h_pos_32x32_8_neon(uint8_t *src, const uint8_t *top,
+                                            const uint8_t *left, ptrdiff_t 
stride,
+                                            int c_idx, int mode);
+
 static void pred_dc_neon(uint8_t *src, const uint8_t *top,
                          const uint8_t *left, ptrdiff_t stride,
                          int log2_size, int c_idx)
@@ -142,6 +156,9 @@ static void pred_angular_##IDX##_neon(uint8_t *src, const 
uint8_t *top,       \
     else if (mode >= 27)                                                      \
         ff_hevc_pred_angular_v_pos_##SZ##_8_neon(src, top, left, stride,      \
                                                  c_idx, mode);                \
+    else if (mode <= 9)                                                       \
+        ff_hevc_pred_angular_h_pos_##SZ##_8_neon(src, top, left, stride,      \
+                                                 c_idx, mode);                \
     else                                                                      \
         ff_hevc_pred_angular_##IDX##_8(src, top, left, stride, c_idx, mode);  \
 }
diff --git a/libavcodec/aarch64/hevcpred_neon.S 
b/libavcodec/aarch64/hevcpred_neon.S
index bb49272764..d5b1036707 100644
--- a/libavcodec/aarch64/hevcpred_neon.S
+++ b/libavcodec/aarch64/hevcpred_neon.S
@@ -21,6 +21,7 @@
  */
 
 #include "libavutil/aarch64/asm.S"
+#include "neon.S"
 
 /* HEVC Intra Prediction NEON functions
  *
@@ -1775,3 +1776,430 @@ function ff_hevc_pred_angular_v_pos_32x32_8_neon, export=1
         b.gt            .Lv_pos_32x32_mode34_loop
         ret
 endfunc
+
+// =============================================================================
+// Angular Prediction - Horizontal reference modes, positive angle (Mode 2-9)
+// =============================================================================
+
+const intra_pred_angle_h, align=4
+        .byte   32      // mode 2 (entry 0; the functions below index this table with mode - 2)
+        .byte   26      // mode 3
+        .byte   21      // mode 4
+        .byte   17      // mode 5
+        .byte   13      // mode 6
+        .byte   9       // mode 7
+        .byte   5       // mode 8
+        .byte   2       // mode 9 (last entry; modes above 9 must not reach this table)
+endconst
+
+// Shared per-column step for the H positive 4x4/8x8 entry points: advances w10
+// by the angle, then blends 8 left bytes with their +1 neighbours into \dst.
+// In: v21 = 32, w8 = angle, x2 = left base. Clobbers w11, w12, x13, v16-v20.
+.macro h_pos_col_8b dst
+        add             w10, w10, w8            // angle_acc += angle
+        asr             w11, w10, #5            // idx  = angle_acc >> 5
+        and             w12, w10, #31           // fact = angle_acc & 31
+        add             x13, x2, w11, sxtw      // x13 = left + idx
+        dup             v17.8b, w12             // v17 = fact; hoisted for addr gap
+        sub             v16.8b, v21.8b, v17.8b  // v16 = 32 - fact
+        ldr             d18, [x13]              // left[idx   .. idx+7]
+        ldr             d19, [x13, #1]          // left[idx+1 .. idx+8]
+        umull           v20.8h, v18.8b, v16.8b  // (32 - fact) * left[idx + y]
+        umlal           v20.8h, v19.8b, v17.8b  // + fact * left[idx + y + 1]
+        rshrn           \dst\().8b, v20.8h, #5  // (sum + 16) >> 5, narrowed to bytes
+.endm
+
+// -----------------------------------------------------------------------------
+// pred_angular_h_pos_4x4_8: Horizontal reference positive angle prediction (mode 2-9)
+// Arguments:
+// x0: src (output block; advanced in place while storing rows)
+// x1: top (unused for H reference modes)
+// x2: left (reference samples; bytes up to left[11] are loaded, only 4 rows are stored)
+// x3: stride (byte distance between output rows)
+// w4: c_idx (not read by this function)
+// w5: mode (2..9, used as index mode - 2 into intra_pred_angle_h)
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_h_pos_4x4_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_h
+        sub             w7, w5, #2              // mode - 2 (index into angle table)
+        ldrb            w8, [x6, w7, uxtw]      // angle = intra_pred_angle_h[mode-2]
+
+        // For mode 2 (angle=32), fact is always 0, optimize as pure copy
+        cmp             w8, #32
+        b.eq            .Lh_pos_4x4_mode2
+
+        // === Fully unrolled 4-column computation with transpose ===
+        mov             w10, #0                 // angle_acc
+        movi            v21.16b, #32            // weight base: 32 for (32 - fact)
+
+        h_pos_col_8b    v0
+        h_pos_col_8b    v1
+        h_pos_col_8b    v2
+        h_pos_col_8b    v3
+
+        transpose_4x8B  v0, v1, v2, v3, v16, v17, v18, v19 // columns -> rows (macro from neon.S)
+
+        str             s0, [x0]                // row 0 (low 4 bytes only)
+        str             s1, [x0, x3]            // row 1
+        add             x0, x0, x3, lsl #1      // src += 2 * stride
+        str             s2, [x0]                // row 2
+        str             s3, [x0, x3]            // row 3
+        ret
+
+.Lh_pos_4x4_mode2:
+        // Mode 2: Row-wise optimization
+        // Row y contains left[y+1..y+4], which is a contiguous read + contiguous write
+        // Row 0: left[1..4], Row 1: left[2..5], Row 2: left[3..6], Row 3: left[4..7]
+        add             x5, x2, #1              // left + 1
+        ldr             s0, [x5]                // row 0: left[1..4]
+        ldr             s1, [x5, #1]            // row 1: left[2..5]
+        ldr             s2, [x5, #2]            // row 2: left[3..6]
+        ldr             s3, [x5, #3]            // row 3: left[4..7]
+        str             s0, [x0]
+        str             s1, [x0, x3]
+        add             x0, x0, x3, lsl #1      // src += 2 * stride
+        str             s2, [x0]
+        str             s3, [x0, x3]
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_h_pos_8x8_8: Horizontal reference positive angle prediction (mode 2-9)
+// Arguments:
+// x0: src (output block; post-incremented by stride after every row store)
+// x1: top (unused for H reference modes)
+// x2: left (reference samples; bytes up to left[15] are loaded)
+// x3: stride (byte distance between output rows)
+// w4: c_idx (not read by this function)
+// w5: mode (2..9, used as index mode - 2 into intra_pred_angle_h)
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_h_pos_8x8_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_h
+        sub             w7, w5, #2              // mode - 2 (index into angle table)
+        ldrb            w8, [x6, w7, uxtw]      // angle
+
+        // Mode 2 optimization
+        cmp             w8, #32                 // angle 32 => fact always 0, plain copy
+        b.eq            .Lh_pos_8x8_mode2
+
+        // === Fully unrolled 8-column computation with transpose ===
+        mov             w10, #0                 // angle_acc
+        movi            v21.16b, #32            // weight base: 32 for (32 - fact)
+
+        h_pos_col_8b    v0
+        h_pos_col_8b    v1
+        h_pos_col_8b    v2
+        h_pos_col_8b    v3
+        h_pos_col_8b    v4
+        h_pos_col_8b    v5
+        h_pos_col_8b    v6
+        h_pos_col_8b    v7
+.purgem h_pos_col_8b                            // last use of the shared 8-byte column macro
+
+        transpose_8x8B  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 // columns -> rows (macro from neon.S)
+
+        st1             {v0.8b}, [x0], x3       // row 0
+        st1             {v1.8b}, [x0], x3       // row 1
+        st1             {v2.8b}, [x0], x3       // row 2
+        st1             {v3.8b}, [x0], x3       // row 3
+        st1             {v4.8b}, [x0], x3       // row 4
+        st1             {v5.8b}, [x0], x3       // row 5
+        st1             {v6.8b}, [x0], x3       // row 6
+        st1             {v7.8b}, [x0], x3       // row 7
+        ret
+
+.Lh_pos_8x8_mode2:
+        // Mode 2: Row-wise optimization
+        // Row y contains left[y+1..y+8], contiguous read + contiguous write
+        add             x5, x2, #1              // left + 1
+        ldr             d0, [x5]                // row 0: left[1..8]
+        ldr             d1, [x5, #1]            // row 1: left[2..9]
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        ldr             d0, [x5, #2]            // row 2: left[3..10]
+        ldr             d1, [x5, #3]            // row 3: left[4..11]
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        ldr             d0, [x5, #4]            // row 4: left[5..12]
+        ldr             d1, [x5, #5]            // row 5: left[6..13]
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        ldr             d0, [x5, #6]            // row 6: left[7..14]
+        ldr             d1, [x5, #7]            // row 7: left[8..15]
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_h_pos_16x16_8: Horizontal reference positive angle prediction (mode 2-9)
+// Arguments:
+// x0: src (output block; copied to x15 on the interpolating path)
+// x1: top (unused for H reference modes)
+// x2: left (reference samples; bytes up to left[31] are loaded)
+// x3: stride (byte distance between output rows)
+// w4: c_idx (not read by this function)
+// w5: mode (2..9, used as index mode - 2 into intra_pred_angle_h)
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_h_pos_16x16_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_h
+        sub             w7, w5, #2              // mode - 2 (index into angle table)
+        ldrb            w8, [x6, w7, uxtw]      // angle
+
+        // Mode 2 optimization
+        cmp             w8, #32                 // angle 32 => fact always 0, plain copy
+        b.eq            .Lh_pos_16x16_mode2
+
+        // === Two batches of 8 columns with 16-byte transpose ===
+        mov             x15, x0                 // save base dst
+        movi            v22.16b, #32            // weight base: 32 for (32 - fact)
+
+.macro h_pos_16x16_col dst
+        add             w10, w10, w8            // angle_acc += angle
+        asr             w11, w10, #5            // idx  = angle_acc >> 5
+        and             w12, w10, #31           // fact = angle_acc & 31
+        add             x13, x2, w11, sxtw      // x13 = left + idx
+        dup             v17.16b, w12            // v17 = fact; hoisted for addr gap
+        sub             v16.16b, v22.16b, v17.16b // v16 = 32 - fact
+        ldr             q18, [x13]              // left[idx   .. idx+15]
+        ldr             q19, [x13, #1]          // left[idx+1 .. idx+16]
+        umull           v20.8h, v18.8b, v16.8b  // rows 0-7: (32 - fact) * a
+        umlal           v20.8h, v19.8b, v17.8b  //           + fact * b
+        rshrn           \dst\().8b, v20.8h, #5  // (sum + 16) >> 5 into low half
+        umull2          v21.8h, v18.16b, v16.16b // rows 8-15, same blend
+        umlal2          v21.8h, v19.16b, v17.16b
+        rshrn2          \dst\().16b, v21.8h, #5 // (sum + 16) >> 5 into high half
+.endm
+
+        // Batch 1: columns 0-7
+        mov             w10, #0                 // angle_acc
+        h_pos_16x16_col v0
+        h_pos_16x16_col v1
+        h_pos_16x16_col v2
+        h_pos_16x16_col v3
+        h_pos_16x16_col v4
+        h_pos_16x16_col v5
+        h_pos_16x16_col v6
+        h_pos_16x16_col v7
+
+        mov             w9, w10                 // save angle_acc; NOTE(review): no visible write to w10 before the reload below - pair looks redundant, confirm
+
+        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 // low halves -> rows 0-7, high halves -> rows 8-15
+
+        // Store cols 0-7 of rows 0-7
+        mov             x16, x15                // write pointer = base dst
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().8b}, [x16], x3
+        .endr
+        // Store cols 0-7 of rows 8-15
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().d}[1], [x16], x3
+        .endr
+
+        // Batch 2: columns 8-15
+        mov             w10, w9                 // continue angle_acc from batch 1
+        h_pos_16x16_col v0
+        h_pos_16x16_col v1
+        h_pos_16x16_col v2
+        h_pos_16x16_col v3
+        h_pos_16x16_col v4
+        h_pos_16x16_col v5
+        h_pos_16x16_col v6
+        h_pos_16x16_col v7
+.purgem h_pos_16x16_col
+
+        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17 // same layout as batch 1
+
+        // Store cols 8-15 of rows 0-7
+        add             x16, x15, #8            // write pointer = base dst + 8 bytes
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().8b}, [x16], x3
+        .endr
+        // Store cols 8-15 of rows 8-15
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().d}[1], [x16], x3
+        .endr
+
+        ret
+
+.Lh_pos_16x16_mode2:
+        // Mode 2: Row-wise optimization with loop unrolling
+        // Row y contains left[y+1..y+16], contiguous read + contiguous write
+        add             x5, x2, #1              // left + 1
+
+        // Rows 0-3
+        ldr             q0, [x5]                // left[1..16]
+        ldr             q1, [x5, #1]            // left[2..17]
+        ldr             q2, [x5, #2]            // left[3..18]
+        ldr             q3, [x5, #3]            // left[4..19]
+        st1             {v0.16b}, [x0], x3
+        st1             {v1.16b}, [x0], x3
+        st1             {v2.16b}, [x0], x3
+        st1             {v3.16b}, [x0], x3
+
+        // Rows 4-7
+        ldr             q0, [x5, #4]
+        ldr             q1, [x5, #5]
+        ldr             q2, [x5, #6]
+        ldr             q3, [x5, #7]
+        st1             {v0.16b}, [x0], x3
+        st1             {v1.16b}, [x0], x3
+        st1             {v2.16b}, [x0], x3
+        st1             {v3.16b}, [x0], x3
+
+        // Rows 8-11
+        ldr             q0, [x5, #8]
+        ldr             q1, [x5, #9]
+        ldr             q2, [x5, #10]
+        ldr             q3, [x5, #11]
+        st1             {v0.16b}, [x0], x3
+        st1             {v1.16b}, [x0], x3
+        st1             {v2.16b}, [x0], x3
+        st1             {v3.16b}, [x0], x3
+
+        // Rows 12-15
+        ldr             q0, [x5, #12]
+        ldr             q1, [x5, #13]
+        ldr             q2, [x5, #14]
+        ldr             q3, [x5, #15]           // left[16..31]
+        st1             {v0.16b}, [x0], x3
+        st1             {v1.16b}, [x0], x3
+        st1             {v2.16b}, [x0], x3
+        st1             {v3.16b}, [x0], x3
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_h_pos_32x32_8: Horizontal reference positive angle prediction (mode 2-9)
+// Arguments:
+// x0: src (output block; copied to x15 on the interpolating path)
+// x1: top (unused for H reference modes)
+// x2: left (reference samples; bytes up to left[63] are loaded)
+// x3: stride (byte distance between output rows)
+// w4: c_idx (not read by this function)
+// w5: mode (2..9, used as index mode - 2 into intra_pred_angle_h)
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_h_pos_32x32_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_h
+        sub             w7, w5, #2              // mode - 2 (index into angle table)
+        ldrb            w8, [x6, w7, uxtw]      // angle
+
+        // Mode 2 optimization
+        cmp             w8, #32                 // angle 32 => fact always 0, plain copy
+        b.eq            .Lh_pos_32x32_mode2
+
+        // === 4 batches of 8 columns with 32-byte transpose ===
+        // v0-v7/v24-v31 hold column data and v16-v23 are scratch, so the constant 32
+        // lives in callee-saved v15; its low 64 bits are saved/restored per AAPCS64.
+        str             d15, [sp, #-16]!        // push d15, keeping sp 16-byte aligned
+        mov             x15, x0                 // save base dst
+        movi            v15.16b, #32            // weight base: 32 for (32 - fact)
+
+.macro h_pos_32_col dst_hi, dst_lo
+        add             w10, w10, w8            // angle_acc += angle
+        asr             w11, w10, #5            // idx  = angle_acc >> 5
+        and             w12, w10, #31           // fact = angle_acc & 31
+        add             x13, x2, w11, sxtw      // x13 = left + idx
+        dup             v17.16b, w12            // v17 = fact; hoisted for addr gap
+        sub             v16.16b, v15.16b, v17.16b // v16 = 32 - fact
+        ldr             q18, [x13]              // ref rows 0-15
+        ldr             q19, [x13, #1]          // +1 neighbours of rows 0-15
+        ldr             q20, [x13, #16]         // ref rows 16-31
+        ldr             q21, [x13, #17]         // +1 neighbours of rows 16-31
+        umull           v22.8h, v18.8b, v16.8b  // rows 0-7: (32 - fact) * a
+        umlal           v22.8h, v19.8b, v17.8b  //           + fact * b
+        rshrn           \dst_hi\().8b, v22.8h, #5 // (sum + 16) >> 5
+        umull2          v23.8h, v18.16b, v16.16b // rows 8-15
+        umlal2          v23.8h, v19.16b, v17.16b
+        rshrn2          \dst_hi\().16b, v23.8h, #5
+        umull           v22.8h, v20.8b, v16.8b  // rows 16-23
+        umlal           v22.8h, v21.8b, v17.8b
+        rshrn           \dst_lo\().8b, v22.8h, #5
+        umull2          v23.8h, v20.16b, v16.16b // rows 24-31
+        umlal2          v23.8h, v21.16b, v17.16b
+        rshrn2          \dst_lo\().16b, v23.8h, #5
+.endm
+
+        mov             w10, #0                 // angle_acc
+        mov             x9, #0                  // column byte offset
+        mov             w6, #4                  // batch counter
+
+.Lh_pos_32x32_batch:
+        h_pos_32_col    v0, v24
+        h_pos_32_col    v1, v25
+        h_pos_32_col    v2, v26
+        h_pos_32_col    v3, v27
+        h_pos_32_col    v4, v28
+        h_pos_32_col    v5, v29
+        h_pos_32_col    v6, v30
+        h_pos_32_col    v7, v31
+
+        mov             w11, w10                // save angle_acc; NOTE(review): no visible write to w10 before the reload below - pair looks redundant, confirm
+
+        // Transpose upper half (rows 0-15)
+        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+        // Transpose lower half (rows 16-31)
+        transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v16, v17
+
+        add             x16, x15, x9            // write pointer = base dst + column offset
+
+        // Rows 0-7 (8 columns of this batch)
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().8b}, [x16], x3
+        .endr
+        // Rows 8-15
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().d}[1], [x16], x3
+        .endr
+        // Rows 16-23
+        .irp reg, v24, v25, v26, v27, v28, v29, v30, v31
+        st1             {\reg\().8b}, [x16], x3
+        .endr
+        // Rows 24-31
+        .irp reg, v24, v25, v26, v27, v28, v29, v30, v31
+        st1             {\reg\().d}[1], [x16], x3
+        .endr
+
+        mov             w10, w11                // restore angle_acc
+        add             x9, x9, #8             // advance column offset
+        subs            w6, w6, #1              // one batch of 8 columns done
+        b.gt            .Lh_pos_32x32_batch
+
+.purgem h_pos_32_col
+
+        ldr             d15, [sp], #16          // pop d15
+        ret
+
+.Lh_pos_32x32_mode2:
+        // Mode 2: Row-wise optimization with loop unrolling (4 rows per iteration)
+        // Row y contains left[y+1..y+32], contiguous read + contiguous write
+        add             x5, x2, #1              // left + 1
+        mov             w6, #0                  // row offset
+        mov             w9, #8                  // batch counter (32/4 = 8)
+.Lh_pos_32x32_mode2_row4:
+        // Process 4 rows at a time
+        add             x7, x5, w6, uxtw        // base for row y
+        ldp             q0, q1, [x7]            // row y
+        st1             {v0.16b, v1.16b}, [x0], x3
+
+        add             x8, x7, #1              // base for row y+1
+        ldp             q0, q1, [x8]            // row y+1
+        st1             {v0.16b, v1.16b}, [x0], x3
+
+        add             x8, x7, #2              // base for row y+2
+        ldp             q0, q1, [x8]            // row y+2
+        st1             {v0.16b, v1.16b}, [x0], x3
+
+        add             x8, x7, #3              // base for row y+3
+        ldp             q0, q1, [x8]            // row y+3
+        st1             {v0.16b, v1.16b}, [x0], x3
+
+        add             w6, w6, #4             // advance row offset
+        subs            w9, w9, #1              // one group of 4 rows done
+        b.gt            .Lh_pos_32x32_mode2_row4
+        ret
+endfunc
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to