PR #23507 opened by Jun Zhao (mypopydev) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23507 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23507.patch
Add AArch64 NEON implementations of HEVC 8-bit intra angular prediction for mode 18 and the vertical/horizontal positive-angle mode families (modes 2-9 and 27-34) across all block sizes (4x4/8x8/16x16/32x32), each commit self-contained and wired into per-size dispatch with C fallbacks for the not-yet-ported modes. >From 610f267dbd05178ec2dd80db8cd187a1f6e4971a Mon Sep 17 00:00:00 2001 From: Jun Zhao <[email protected]> Date: Tue, 16 Jun 2026 16:43:38 +0800 Subject: [PATCH 1/3] lavc/hevc: add aarch64 NEON for angular mode 18 Add NEON-optimized implementation for HEVC angular intra prediction mode 18 (diagonal mode, angle=-32) at 8-bit depth. Mode 18 is a special case where: - angle = -32, so idx = -(y+1), fact = 0 (no interpolation needed) - Row y copies from ref[-y..size-1-y], where ref is built from reversed left samples and top samples Supports all block sizes (4x4, 8x8, 16x16, 32x32): - 4x4/8x8: Uses register-based ref array with ext instructions - 16x16: Uses dual-register EXT approach (v0=left reversed, v1=top) - 32x32: Uses stack-based ref array for larger reference range The pred_angular[] dispatch wrapper now routes mode 18 to the NEON implementation, with the existing C fallback kept for the remaining angular modes, so the decoder uses it immediately. 
Speedup over C on Apple M4 (checkasm --bench, 10-run median):

mode |  4x4   8x8  16x16  32x32
-----+---------------------------
  18 | 4.50  8.40   4.71   3.14

Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcpred_init_aarch64.c |  27 +++-
 libavcodec/aarch64/hevcpred_neon.S         | 159 +++++++++++++++++++++
 2 files changed, 181 insertions(+), 5 deletions(-)

diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c b/libavcodec/aarch64/hevcpred_init_aarch64.c
index 03fc5c490e..6d5bde7d29 100644
--- a/libavcodec/aarch64/hevcpred_init_aarch64.c
+++ b/libavcodec/aarch64/hevcpred_init_aarch64.c
@@ -75,6 +75,20 @@ void ff_hevc_pred_angular_mode_26_8_neon(uint8_t *src, const uint8_t *top,
                                          const uint8_t *left, ptrdiff_t stride,
                                          int c_idx, int log2_size);
 
+// Mode 18 (diagonal, angle=-32): one fixed-size entry point per block size; c_idx and log2_size are accepted but unused
+void ff_hevc_pred_angular_mode_18_4x4_8_neon(uint8_t *src, const uint8_t *top,
+                                             const uint8_t *left, ptrdiff_t stride,
+                                             int c_idx, int log2_size);
+void ff_hevc_pred_angular_mode_18_8x8_8_neon(uint8_t *src, const uint8_t *top,
+                                             const uint8_t *left, ptrdiff_t stride,
+                                             int c_idx, int log2_size);
+void ff_hevc_pred_angular_mode_18_16x16_8_neon(uint8_t *src, const uint8_t *top,
+                                               const uint8_t *left, ptrdiff_t stride,
+                                               int c_idx, int log2_size);
+void ff_hevc_pred_angular_mode_18_32x32_8_neon(uint8_t *src, const uint8_t *top,
+                                               const uint8_t *left, ptrdiff_t stride,
+                                               int c_idx, int log2_size);
+
 static void pred_dc_neon(uint8_t *src, const uint8_t *top,
                          const uint8_t *left, ptrdiff_t stride,
                          int log2_size, int c_idx)
@@ -97,7 +111,7 @@ static void pred_dc_neon(uint8_t *src, const uint8_t *top,
     }
 }
 
-#define PRED_ANGULAR_NEON(IDX, LOG2) \
+#define PRED_ANGULAR_NEON(IDX, SZ, LOG2) \
 static void pred_angular_##IDX##_neon(uint8_t *src, const uint8_t *top, \
                               const uint8_t *left, ptrdiff_t stride, \
                               int c_idx, int mode) \
@@ -105,6 +119,9 @@ static void pred_angular_##IDX##_neon(uint8_t *src, const uint8_t *top, \
     if (mode == 10) \
         ff_hevc_pred_angular_mode_10_8_neon(src, top, left, stride, \
                                             c_idx, LOG2); \
+    else if (mode == 18) \
+        ff_hevc_pred_angular_mode_18_##SZ##_8_neon(src, top, left, stride, \
+                                                   c_idx, LOG2); \
     else if (mode == 26) \
         ff_hevc_pred_angular_mode_26_8_neon(src, top, left, stride, \
                                             c_idx, LOG2); \
@@ -112,10 +129,10 @@ static void pred_angular_##IDX##_neon(uint8_t *src, const uint8_t *top, \
         ff_hevc_pred_angular_##IDX##_8(src, top, left, stride, c_idx, mode); \
 }
 
-PRED_ANGULAR_NEON(0, 2)
-PRED_ANGULAR_NEON(1, 3)
-PRED_ANGULAR_NEON(2, 4)
-PRED_ANGULAR_NEON(3, 5)
+PRED_ANGULAR_NEON(0, 4x4, 2)
+PRED_ANGULAR_NEON(1, 8x8, 3)
+PRED_ANGULAR_NEON(2, 16x16, 4)
+PRED_ANGULAR_NEON(3, 32x32, 5)
 
 #undef PRED_ANGULAR_NEON
 
diff --git a/libavcodec/aarch64/hevcpred_neon.S b/libavcodec/aarch64/hevcpred_neon.S
index f21492318c..1a49731965 100644
--- a/libavcodec/aarch64/hevcpred_neon.S
+++ b/libavcodec/aarch64/hevcpred_neon.S
@@ -1341,3 +1341,162 @@ function ff_hevc_pred_angular_mode_26_8_neon, export=1
 .Lmode26_ret:
         ret
 endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_mode_18_4x4_8: Mode 18 prediction for 4x4 block
+// Row 0: top[-1], top[0], top[1], top[2]
+// Row 1: left[0], top[-1], top[0], top[1]
+// Row 2: left[1], left[0], top[-1], top[0]
+// Row 3: left[2], left[1], left[0], top[-1]
+// Arguments:
+//   x0: src
+//   x1: top
+//   x2: left
+//   x3: stride
+//   w4: c_idx (unused, mode 18 has no edge smoothing)
+//   w5: log2_size (unused, size is fixed per entry point)
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_mode_18_4x4_8_neon, export=1
+        // Build the 8-entry ref array in the low half of v0
+        // ref[-4..-1] = left[3], left[2], left[1], left[0] (reversed)
+        // ref[0..3] = top[-1..2]
+
+        // Load left[0..3] and reverse
+        ldr             s0, [x2]                        // left[0..3]
+        rev32           v0.8b, v0.8b                    // v0 = {left[3], left[2], left[1], left[0], ...}
+
+        // Load 8 bytes starting at top - 1; only top[-1..2] end up in ref
+        sub             x4, x1, #1                      // x4 = &top[-1] (the c_idx argument in w4 is not needed)
+        ldr             d1, [x4]                        // top[-1..6]
+
+        // Combine into the low 8 bytes: {left[3,2,1,0], top[-1,0,1,2]}
+        ins             v0.s[1], v1.s[0]                // v0 = {left[3,2,1,0], top[-1,0,1,2], ...}
+
+        // Rows 0-3: slide a 4-byte window via ext, then batch the stores so
+        // only one pointer update is needed for the four rows.
+        ext             v2.8b, v0.8b, v0.8b, #4         // row 0: ref[0..3] = top[-1..2]
+        ext             v3.8b, v0.8b, v0.8b, #3         // row 1: ref[-1..2] = v0[3..6]
+        ext             v4.8b, v0.8b, v0.8b, #2         // row 2: ref[-2..1] = v0[2..5]
+        ext             v5.8b, v0.8b, v0.8b, #1         // row 3: ref[-3..0] = v0[1..4]
+        str             s2, [x0]                        // row 0
+        str             s3, [x0, x3]                    // row 1
+        add             x0, x0, x3, lsl #1              // src += 2 * stride
+        str             s4, [x0]                        // row 2
+        str             s5, [x0, x3]                    // row 3
+
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_mode_18_8x8_8: Mode 18 prediction for 8x8 block
+// Arguments:
+//   x0: src
+//   x1: top
+//   x2: left
+//   x3: stride
+//   w4: c_idx (unused, mode 18 has no edge smoothing)
+//   w5: log2_size (unused, size is fixed per entry point)
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_mode_18_8x8_8_neon, export=1
+        // ref[-8..-1] = left[7..0] (reversed)
+        // ref[0..7] = top[-1..6]
+
+        // Load left[0..7] and reverse
+        ldr             d0, [x2]                        // left[0..7]
+        rev64           v0.8b, v0.8b                    // {left[7..0]}
+
+        // Load 16 bytes starting at top - 1; only top[-1..6] end up in ref
+        sub             x4, x1, #1                      // x4 = &top[-1]
+        ldr             q1, [x4]                        // top[-1..14]
+
+        // Combine into v2 (16 bytes): {left[7..0], top[-1..6]}
+        mov             v2.d[0], v0.d[0]                // v2[0..7] = left[7..0]
+        mov             v2.d[1], v1.d[0]                // v2[8..15] = top[-1..6]
+
+        // Row 0: ref[0..7] = top[-1..6] = v2[8..15]
+        st1             {v2.d}[1], [x0], x3
+
+        // Rows 1-7: row y is v2[8-y..15-y]; ext rotates that window into the low 8 bytes
+.irp offset, 7, 6, 5, 4, 3, 2, 1
+        ext             v3.16b, v2.16b, v2.16b, #\offset
+        st1             {v3.8b}, [x0], x3
+.endr
+
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_mode_18_16x16_8: Mode 18 prediction for 16x16 block
+// ref[-16..-1] = left[15..0] reversed, ref[0..15] = top[-1..14]
+// Arguments:
+//   x0: src
+//   x1: top
+//   x2: left
+//   x3: stride
+//   w4: c_idx (unused, mode 18 has no edge smoothing)
+//   w5: log2_size (unused, size is fixed per entry point)
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_mode_18_16x16_8_neon, export=1
+        // Register-based approach using EXT to slide a window across {v0:v1}.
+        // v0 = left[15..0] (reversed), v1 = top[-1..14]
+        // Row k: need ref[-k..15-k] = EXT(v0, v1, #16-k) for k=1..15, row 0 = v1.
+
+        ldr             q0, [x2]                        // left[0..15]
+        rev64           v0.16b, v0.16b                  // reverse in 64-bit lanes
+        ext             v0.16b, v0.16b, v0.16b, #8      // v0 = left[15..0]
+        sub             x4, x1, #1                      // x4 = &top[-1]
+        ldr             q1, [x4]                        // v1 = top[-1..14]
+
+        // Row 0: ref[0..15] = v1
+        st1             {v1.16b}, [x0], x3
+        // Row 1-15: EXT(v0, v1, #N) slides window across {v0:v1}
+.irp offset, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+        ext             v2.16b, v0.16b, v1.16b, #\offset
+        st1             {v2.16b}, [x0], x3
+.endr
+
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_mode_18_32x32_8: Mode 18 prediction for 32x32 block
+// ref[-32..-1] = left[31..0] reversed, ref[0..31] = top[-1..30]
+// Arguments:
+//   x0: src
+//   x1: top
+//   x2: left
+//   x3: stride
+//   w4: c_idx (unused, mode 18 has no edge smoothing)
+//   w5: log2_size (unused, size is fixed per entry point)
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_mode_18_32x32_8_neon, export=1
+        // Build the 64-byte ref array (ref[-32..31]) on the stack, then slide a
+        // 32-byte window one byte left per row.
+        sub             sp, sp, #64                     // 64-byte scratch buffer for ref[]
+
+        // Store left[31..0] reversed at sp[0..31] (ref[-32..-1])
+        ldp             q0, q1, [x2]                    // left[0..31]
+        rev64           v0.16b, v0.16b
+        ext             v0.16b, v0.16b, v0.16b, #8      // left[15..0]
+        rev64           v1.16b, v1.16b
+        ext             v1.16b, v1.16b, v1.16b, #8      // left[31..16]
+        stp             q1, q0, [sp]                    // {left[31..16], left[15..0]}
+
+        // Store top[-1..30] at sp[32..63] (ref[0..31])
+        sub             x4, x1, #1                      // x4 = &top[-1]
+        ldp             q2, q3, [x4]                    // top[-1..30]
+        stp             q2, q3, [sp, #32]
+
+        // ref_base = sp + 32 (so ref[0] = sp[32], ref[-1] = sp[31], etc.)
+        add             x4, sp, #32
+        mov             w5, #32                         // row counter (w5 is free: log2_size is unused)
+
+1:      ldp             q0, q1, [x4]                    // row y = ref[-y..31-y]
+        sub             x4, x4, #1                      // slide ref pointer one byte left
+        subs            w5, w5, #1
+        st1             {v0.16b, v1.16b}, [x0], x3
+        b.gt            1b
+
+        add             sp, sp, #64                     // release the scratch buffer
+        ret
+endfunc
-- 
2.52.0

>From 8093014d5310019fde1410bbfbe503cf5283371f Mon Sep 17 00:00:00 2001
From: Jun Zhao <[email protected]>
Date: Tue, 16 Jun 2026 16:44:41 +0800
Subject: [PATCH 2/3] lavc/hevc: add aarch64 NEON for angular V positive (modes 27-34)

Add NEON-optimized implementations for HEVC angular intra prediction
modes 27-34 (vertical positive angles) at 8-bit depth.

These modes use the top reference with positive angles, computing:
- idx = ((y+1) * angle) >> 5
- fact = ((y+1) * angle) & 31
- Interpolate between ref[idx] and ref[idx+1] using fact

Mode 34 (angle=32) is optimized as a pure diagonal copy since fact=0.

Supports all block sizes (4x4, 8x8, 16x16, 32x32).

The pred_angular[] dispatch wrapper now routes modes 27-34 to the NEON
implementation, with the C fallback kept for the remaining angular
modes, so the decoder uses it immediately.
Speedup over C on Apple M4 (checkasm --bench, 10-run median):

mode |  4x4   8x8  16x16  32x32
-----+---------------------------
  27 | 5.00  5.11   5.94   8.69
  28 | 5.00  5.19   7.18   8.96
  29 | 4.95  5.18   7.10   8.97
  30 | 5.05  5.18   7.14   8.95
  31 | 5.05  5.02   7.10   8.92
  32 | 5.02  5.09   7.12   8.93
  33 | 5.00  5.02   6.67   8.73
  34 | 2.00  6.07   1.89   1.90  (pure diagonal copy)

Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcpred_init_aarch64.c |  17 ++
 libavcodec/aarch64/hevcpred_neon.S         | 275 +++++++++++++++++++++
 2 files changed, 292 insertions(+)

diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c b/libavcodec/aarch64/hevcpred_init_aarch64.c
index 6d5bde7d29..f89635376f 100644
--- a/libavcodec/aarch64/hevcpred_init_aarch64.c
+++ b/libavcodec/aarch64/hevcpred_init_aarch64.c
@@ -89,6 +89,20 @@ void ff_hevc_pred_angular_mode_18_32x32_8_neon(uint8_t *src, const uint8_t *top,
                                                const uint8_t *left, ptrdiff_t stride,
                                                int c_idx, int log2_size);
 
+// Positive angle vertical modes (mode 27-34); note the last argument is the intra mode, not log2_size
+void ff_hevc_pred_angular_v_pos_4x4_8_neon(uint8_t *src, const uint8_t *top,
+                                           const uint8_t *left, ptrdiff_t stride,
+                                           int c_idx, int mode);
+void ff_hevc_pred_angular_v_pos_8x8_8_neon(uint8_t *src, const uint8_t *top,
+                                           const uint8_t *left, ptrdiff_t stride,
+                                           int c_idx, int mode);
+void ff_hevc_pred_angular_v_pos_16x16_8_neon(uint8_t *src, const uint8_t *top,
+                                             const uint8_t *left, ptrdiff_t stride,
+                                             int c_idx, int mode);
+void ff_hevc_pred_angular_v_pos_32x32_8_neon(uint8_t *src, const uint8_t *top,
+                                             const uint8_t *left, ptrdiff_t stride,
+                                             int c_idx, int mode);
+
 static void pred_dc_neon(uint8_t *src, const uint8_t *top,
                          const uint8_t *left, ptrdiff_t stride,
                          int log2_size, int c_idx)
@@ -125,6 +139,9 @@ static void pred_angular_##IDX##_neon(uint8_t *src, const uint8_t *top, \
     else if (mode == 26) \
         ff_hevc_pred_angular_mode_26_8_neon(src, top, left, stride, \
                                             c_idx, LOG2); \
+    else if (mode >= 27) \
+        ff_hevc_pred_angular_v_pos_##SZ##_8_neon(src, top, left, stride, \
+                                                 c_idx, mode); \
     else \
         ff_hevc_pred_angular_##IDX##_8(src, top, left, stride, c_idx, mode); \
 }
diff --git a/libavcodec/aarch64/hevcpred_neon.S b/libavcodec/aarch64/hevcpred_neon.S
index 1a49731965..bb49272764 100644
--- a/libavcodec/aarch64/hevcpred_neon.S
+++ b/libavcodec/aarch64/hevcpred_neon.S
@@ -1500,3 +1500,278 @@ function ff_hevc_pred_angular_mode_18_32x32_8_neon, export=1
         add             sp, sp, #64
         ret
 endfunc
+
+// =============================================================================
+// Angular Prediction - Vertical reference modes (Mode 27-34)
+// =============================================================================
+
+// Angle table for V reference positive angles (mode 27-34), one byte per mode
+// angle = intra_pred_angle_v[mode - 27]; the caller must pass mode in 27..34
+const intra_pred_angle_v, align=4
+        .byte           2                               // mode 27
+        .byte           5                               // mode 28
+        .byte           9                               // mode 29
+        .byte           13                              // mode 30
+        .byte           17                              // mode 31
+        .byte           21                              // mode 32
+        .byte           26                              // mode 33
+        .byte           32                              // mode 34
+endconst
+
+// -----------------------------------------------------------------------------
+// pred_angular_v_pos_4x4_8: Vertical reference positive angle prediction (mode 27-34)
+// Arguments:
+//   x0: src
+//   x1: top
+//   x2: left (unused for V reference modes)
+//   x3: stride
+//   w4: c_idx (not read by this function)
+//   w5: mode
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_v_pos_4x4_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_v
+        sub             w7, w5, #27                     // mode - 27 (index into angle table)
+        ldrsb           w8, [x6, w7, sxtw]              // angle = intra_pred_angle_v[mode-27]
+
+        // For mode 34 (angle=32), fact is always 0, optimize as pure copy
+        cmp             w8, #32
+        b.eq            .Lv_pos_4x4_mode34
+
+        mov             w10, #0                         // angle_acc = 0
+        movi            v18.16b, #32                    // weight base: 32 for (32 - fact)
+
+.macro v_pos_4x4_row
+        add             w10, w10, w8                    // angle_acc = (y+1) * angle
+        asr             w11, w10, #5                    // idx = angle_acc >> 5
+        and             w12, w10, #31                   // fact = angle_acc & 31
+
+        // Load reference pixels top[idx..idx+4]; comments below use ref = top - 1
+        add             x13, x1, w11, sxtw              // x13 = top + idx
+        dup             v17.8b, w12                     // broadcast fact (hoisted for addr gap)
+        sub             v16.8b, v18.8b, v17.8b          // 32 - fact
+        ldr             s0, [x13]                       // ref[idx+1] (2-instr gap from add)
+        ldr             s1, [x13, #1]                   // ref[idx+2]
+
+        umull           v20.8h, v0.8b, v16.8b           // (32-fact) * ref[idx+1]
+        umlal           v20.8h, v1.8b, v17.8b           // + fact * ref[idx+2]
+        rshrn           v0.8b, v20.8h, #5               // (result + 16) >> 5
+
+        str             s0, [x0]                        // str s is faster than lane-store on some cores
+        add             x0, x0, x3                      // advance src to the next row
+.endm
+        v_pos_4x4_row
+        v_pos_4x4_row
+        v_pos_4x4_row
+        v_pos_4x4_row
+.purgem v_pos_4x4_row
+
+        ret
+
+.Lv_pos_4x4_mode34:
+        // Mode 34: angle=32, each row copies from top[y+1..y+4]
+        // Row 0: top[1..4], Row 1: top[2..5], Row 2: top[3..6], Row 3: top[4..7]
+        ldr             s0, [x1, #1]
+        ldr             s1, [x1, #2]
+        ldr             s2, [x1, #3]
+        ldr             s3, [x1, #4]
+        str             s0, [x0]
+        str             s1, [x0, x3]
+        add             x0, x0, x3, lsl #1
+        str             s2, [x0]
+        str             s3, [x0, x3]
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_v_pos_8x8_8: Vertical reference positive angle prediction (mode 27-34)
+// Arguments:
+//   x0: src
+//   x1: top
+//   x2: left (unused for V reference modes)
+//   x3: stride
+//   w4: c_idx (not read by this function)
+//   w5: mode
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_v_pos_8x8_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_v
+        sub             w7, w5, #27                     // mode - 27 (index into angle table)
+        ldrsb           w8, [x6, w7, sxtw]              // angle = intra_pred_angle_v[mode-27]
+
+        // Mode 34 (angle=32): fact is always 0, so take the copy-only path
+        cmp             w8, #32
+        b.eq            .Lv_pos_8x8_mode34
+
+        mov             w9, #8                          // row counter
+        mov             w10, #0                         // angle_acc = 0
+        movi            v18.16b, #32                    // weight base: 32 for (32 - fact)
+
+.Lv_pos_8x8_row_loop:
+        add             w10, w10, w8                    // angle_acc = (y+1) * angle
+        asr             w11, w10, #5                    // idx
+        and             w12, w10, #31                   // fact
+
+        add             x13, x1, w11, sxtw              // x13 = top + idx
+        dup             v17.8b, w12                     // hoisted for addr gap
+        sub             v16.8b, v18.8b, v17.8b          // 32 - fact
+        ldr             d0, [x13]                       // ref[idx+1..idx+8]
+        ldr             d1, [x13, #1]                   // ref[idx+2..idx+9]
+
+        umull           v20.8h, v0.8b, v16.8b           // (32-fact) * ref[idx+1+x]
+        umlal           v20.8h, v1.8b, v17.8b           // + fact * ref[idx+2+x]
+        rshrn           v0.8b, v20.8h, #5               // (sum + 16) >> 5
+
+        subs            w9, w9, #1                      // subs before store: helps in-order cores
+        st1             {v0.8b}, [x0], x3
+        b.gt            .Lv_pos_8x8_row_loop
+
+        ret
+
+.Lv_pos_8x8_mode34:
+        // Mode 34: each row copies from top[y+1..y+8]
+.irp off, 1, 2, 3, 4, 5, 6, 7
+        ldr             d0, [x1, #\off]
+        st1             {v0.8b}, [x0], x3
+.endr
+        ldr             d0, [x1, #8]
+        str             d0, [x0]                        // last row: no pointer update needed
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_v_pos_16x16_8: Vertical reference positive angle prediction (mode 27-34)
+// Arguments:
+//   x0: src
+//   x1: top
+//   x2: left (unused for V reference modes)
+//   x3: stride
+//   w4: c_idx (not read by this function)
+//   w5: mode
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_v_pos_16x16_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_v
+        sub             w7, w5, #27                     // mode - 27 (index into angle table)
+        ldrsb           w8, [x6, w7, sxtw]              // angle = intra_pred_angle_v[mode-27]
+
+        // Mode 34 (angle=32): fact is always 0, so take the copy-only path
+        cmp             w8, #32
+        b.eq            .Lv_pos_16x16_mode34
+
+        mov             w9, #16                         // row counter
+        mov             w10, #0                         // angle_acc = 0
+        movi            v18.16b, #32                    // weight base: 32 for (32 - fact)
+
+.Lv_pos_16x16_row_loop:
+        add             w10, w10, w8                    // angle_acc = (y+1) * angle
+        asr             w11, w10, #5                    // idx
+        and             w12, w10, #31                   // fact
+
+        add             x13, x1, w11, sxtw              // x13 = top + idx
+        dup             v17.16b, w12                    // hoisted for addr gap
+        sub             v16.16b, v18.16b, v17.16b       // 32 - fact
+        ldr             q0, [x13]                       // ref[idx+1..idx+16]
+        ldr             q1, [x13, #1]                   // ref[idx+2..idx+17]
+
+        // Low 8 bytes
+        umull           v20.8h, v0.8b, v16.8b
+        umlal           v20.8h, v1.8b, v17.8b
+        rshrn           v2.8b, v20.8h, #5
+
+        // High 8 bytes
+        umull2          v21.8h, v0.16b, v16.16b
+        umlal2          v21.8h, v1.16b, v17.16b
+        rshrn2          v2.16b, v21.8h, #5
+
+        subs            w9, w9, #1                      // subs before store: helps in-order cores
+        st1             {v2.16b}, [x0], x3
+        b.gt            .Lv_pos_16x16_row_loop
+
+        ret
+
+.Lv_pos_16x16_mode34:
+        // Mode 34: each row copies from top[y+1..y+16], fully unrolled
+        add             x13, x1, #1                     // x13 = top + 1
+.irp off, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+        ldr             q0, [x13, #\off]
+        st1             {v0.16b}, [x0], x3
+.endr
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_v_pos_32x32_8: Vertical reference positive angle prediction (mode 27-34)
+// Arguments:
+//   x0: src
+//   x1: top
+//   x2: left (unused for V reference modes)
+//   x3: stride
+//   w4: c_idx (not read by this function)
+//   w5: mode
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_v_pos_32x32_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_v
+        sub             w7, w5, #27                     // mode - 27 (index into angle table)
+        ldrsb           w8, [x6, w7, sxtw]              // angle = intra_pred_angle_v[mode-27]
+
+        // Mode 34 (angle=32): fact is always 0, so take the copy-only path
+        cmp             w8, #32
+        b.eq            .Lv_pos_32x32_mode34
+
+        mov             w9, #32                         // row counter
+        mov             w10, #0                         // angle_acc = 0
+        movi            v18.16b, #32                    // weight base: 32 for (32 - fact)
+
+.Lv_pos_32x32_row_loop:
+        add             w10, w10, w8                    // angle_acc = (y+1) * angle
+        asr             w11, w10, #5                    // idx
+        and             w12, w10, #31                   // fact
+
+        add             x13, x1, w11, sxtw              // x13 = top + idx
+        dup             v17.16b, w12                    // hoisted for addr gap
+        sub             v16.16b, v18.16b, v17.16b       // 32 - fact
+
+        // Load 32 bytes + 1 for interpolation (two overlapping 16-byte pairs)
+        ldr             q0, [x13]                       // ref[idx+1..idx+16]
+        ldr             q1, [x13, #1]                   // ref[idx+2..idx+17]
+        ldr             q2, [x13, #16]                  // ref[idx+17..idx+32]
+        ldr             q3, [x13, #17]                  // ref[idx+18..idx+33]
+
+        // First 16 bytes
+        umull           v20.8h, v0.8b, v16.8b
+        umlal           v20.8h, v1.8b, v17.8b
+        rshrn           v4.8b, v20.8h, #5
+
+        umull2          v21.8h, v0.16b, v16.16b
+        umlal2          v21.8h, v1.16b, v17.16b
+        rshrn2          v4.16b, v21.8h, #5
+
+        // Second 16 bytes
+        umull           v22.8h, v2.8b, v16.8b
+        umlal           v22.8h, v3.8b, v17.8b
+        rshrn           v5.8b, v22.8h, #5
+
+        umull2          v23.8h, v2.16b, v16.16b
+        umlal2          v23.8h, v3.16b, v17.16b
+        rshrn2          v5.16b, v23.8h, #5
+
+        subs            w9, w9, #1                      // subs before store: helps in-order cores
+        st1             {v4.16b, v5.16b}, [x0], x3
+        b.gt            .Lv_pos_32x32_row_loop
+
+        ret
+
+.Lv_pos_32x32_mode34:
+        // Mode 34: each row copies from top[y+1..y+32]
+        add             x13, x1, #1                     // top + 1
+        mov             w9, #32                         // row counter
+.Lv_pos_32x32_mode34_loop:
+        ldp             q0, q1, [x13]
+        add             x13, x13, #1
+        subs            w9, w9, #1
+        st1             {v0.16b, v1.16b}, [x0], x3
+        b.gt            .Lv_pos_32x32_mode34_loop
+        ret
+endfunc
-- 
2.52.0

>From 020e1334d83bc68455c99f71a0cb310e83dcb793 Mon Sep 17 00:00:00 2001
From: Jun Zhao <[email protected]>
Date: Tue, 16 Jun 2026 16:45:35 +0800
Subject: [PATCH 3/3] lavc/hevc: add aarch64 NEON for angular H positive (modes 2-9)

Add NEON-optimized implementations for HEVC angular intra prediction
modes 2-9 (horizontal positive angles) at 8-bit depth.

These modes use the left reference with positive angles, computing:
- idx = ((x+1) * angle) >> 5
- fact = ((x+1) * angle) & 31
- Interpolate between ref[idx] and ref[idx+1] using fact

Uses batch column computation with matrix transpose to convert
column-oriented interpolation results into contiguous row stores.

Mode 2 (angle=32) is optimized with direct row-wise contiguous writes
since each row copies left[y+1..y+size], avoiding interpolation.

Supports all block sizes (4x4, 8x8, 16x16, 32x32).

The pred_angular[] dispatch wrapper now routes modes 2-9 to the NEON
implementation, with the C fallback kept for the remaining angular
modes, so the decoder uses it immediately.
Speedup over C on Apple M4 (checkasm --bench, 10-run median):

mode |  4x4   8x8  16x16  32x32
-----+----------------------------
   2 | 6.71  8.29  22.69  32.68  (pure horizontal copy)
   3 | 3.57  5.11   6.04   8.44
   4 | 3.55  5.09   6.12   8.32
   5 | 3.55  4.97   6.13   8.33
   6 | 3.52  5.09   6.13   8.37
   7 | 3.54  5.08   6.14   8.35
   8 | 3.54  5.08   6.13   8.33
   9 | 3.59  4.82   5.66   8.42

Signed-off-by: Jun Zhao <[email protected]>
---
 libavcodec/aarch64/hevcpred_init_aarch64.c |  17 +
 libavcodec/aarch64/hevcpred_neon.S         | 428 +++++++++++++++++++++
 2 files changed, 445 insertions(+)

diff --git a/libavcodec/aarch64/hevcpred_init_aarch64.c b/libavcodec/aarch64/hevcpred_init_aarch64.c
index f89635376f..2c7a5215d6 100644
--- a/libavcodec/aarch64/hevcpred_init_aarch64.c
+++ b/libavcodec/aarch64/hevcpred_init_aarch64.c
@@ -103,6 +103,20 @@ void ff_hevc_pred_angular_v_pos_32x32_8_neon(uint8_t *src, const uint8_t *top,
                                              const uint8_t *left, ptrdiff_t stride,
                                              int c_idx, int mode);
 
+// Positive angle horizontal modes (mode 2-9); the last argument is the intra mode, not log2_size
+void ff_hevc_pred_angular_h_pos_4x4_8_neon(uint8_t *src, const uint8_t *top,
+                                           const uint8_t *left, ptrdiff_t stride,
+                                           int c_idx, int mode);
+void ff_hevc_pred_angular_h_pos_8x8_8_neon(uint8_t *src, const uint8_t *top,
+                                           const uint8_t *left, ptrdiff_t stride,
+                                           int c_idx, int mode);
+void ff_hevc_pred_angular_h_pos_16x16_8_neon(uint8_t *src, const uint8_t *top,
+                                             const uint8_t *left, ptrdiff_t stride,
+                                             int c_idx, int mode);
+void ff_hevc_pred_angular_h_pos_32x32_8_neon(uint8_t *src, const uint8_t *top,
+                                             const uint8_t *left, ptrdiff_t stride,
+                                             int c_idx, int mode);
+
 static void pred_dc_neon(uint8_t *src, const uint8_t *top,
                          const uint8_t *left, ptrdiff_t stride,
                          int log2_size, int c_idx)
@@ -142,6 +156,9 @@ static void pred_angular_##IDX##_neon(uint8_t *src, const uint8_t *top, \
     else if (mode >= 27) \
         ff_hevc_pred_angular_v_pos_##SZ##_8_neon(src, top, left, stride, \
                                                  c_idx, mode); \
+    else if (mode <= 9) \
+        ff_hevc_pred_angular_h_pos_##SZ##_8_neon(src, top, left, stride, \
+                                                 c_idx, mode); \
     else \
         ff_hevc_pred_angular_##IDX##_8(src, top, left, stride, c_idx, mode); \
 }
diff --git a/libavcodec/aarch64/hevcpred_neon.S b/libavcodec/aarch64/hevcpred_neon.S
index bb49272764..d5b1036707 100644
--- a/libavcodec/aarch64/hevcpred_neon.S
+++ b/libavcodec/aarch64/hevcpred_neon.S
@@ -21,6 +21,7 @@
  */
 
 #include "libavutil/aarch64/asm.S"
+#include "neon.S"
 
 /* HEVC Intra Prediction NEON functions
  *
@@ -1775,3 +1776,430 @@ function ff_hevc_pred_angular_v_pos_32x32_8_neon, export=1
         b.gt            .Lv_pos_32x32_mode34_loop
         ret
 endfunc
+
+// =============================================================================
+// Angular Prediction - Horizontal reference modes, positive angle (Mode 2-9)
+// =============================================================================
+
+const intra_pred_angle_h, align=4
+        .byte           32                              // mode 2
+        .byte           26                              // mode 3
+        .byte           21                              // mode 4
+        .byte           17                              // mode 5
+        .byte           13                              // mode 6
+        .byte           9                               // mode 7
+        .byte           5                               // mode 8
+        .byte           2                               // mode 9
+endconst
+
+// Shared interpolation column for the H positive 4x4/8x8 entry points. Loads the
+// left reference via ref0/ref1 (default d18/d19 = 8 rows; 4x4 passes s18/s19 so it
+// never reads past left[7]) into dst. Needs v21 = 32, w8 = angle, w10 = angle_acc, x2 = left.
+.macro h_pos_col_8b dst, ref0=d18, ref1=d19
+        add             w10, w10, w8                    // angle_acc = (x+1) * angle
+        asr             w11, w10, #5                    // idx
+        and             w12, w10, #31                   // fact
+        add             x13, x2, w11, sxtw              // x13 = left + idx
+        dup             v17.8b, w12                     // hoisted for addr gap
+        sub             v16.8b, v21.8b, v17.8b          // 32 - fact
+        ldr             \ref0, [x13]
+        ldr             \ref1, [x13, #1]
+        umull           v20.8h, v18.8b, v16.8b
+        umlal           v20.8h, v19.8b, v17.8b
+        rshrn           \dst\().8b, v20.8h, #5
+.endm
+
+// -----------------------------------------------------------------------------
+// pred_angular_h_pos_4x4_8: Horizontal reference positive angle prediction (mode 2-9)
+// Arguments:
+//   x0: src
+//   x1: top (unused for H reference modes)
+//   x2: left
+//   x3: stride
+//   w4: c_idx
+//   w5: mode
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_h_pos_4x4_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_h
+        sub             w7, w5, #2                      // mode - 2 (index into angle table)
+        ldrb            w8, [x6, w7, uxtw]              // angle = intra_pred_angle_h[mode-2]
+
+        // For mode 2 (angle=32), fact is always 0, optimize as pure copy
+        cmp             w8, #32
+        b.eq            .Lh_pos_4x4_mode2
+
+        // === Fully unrolled 4-column computation with transpose ===
+        mov             w10, #0                         // angle_acc
+        movi            v21.16b, #32                    // weight base: 32 for (32 - fact)
+
+        h_pos_col_8b    v0, s18, s19
+        h_pos_col_8b    v1, s18, s19
+        h_pos_col_8b    v2, s18, s19
+        h_pos_col_8b    v3, s18, s19
+
+        transpose_4x8B  v0, v1, v2, v3, v16, v17, v18, v19
+
+        str             s0, [x0]
+        str             s1, [x0, x3]
+        add             x0, x0, x3, lsl #1
+        str             s2, [x0]
+        str             s3, [x0, x3]
+        ret
+
+.Lh_pos_4x4_mode2:
+        // Mode 2: Row-wise optimization
+        // Row y contains left[y+1..y+4], which is a contiguous read + contiguous write
+        // Row 0: left[1..4], Row 1: left[2..5], Row 2: left[3..6], Row 3: left[4..7]
+        add             x5, x2, #1                      // left + 1
+        ldr             s0, [x5]                        // row 0: left[1..4]
+        ldr             s1, [x5, #1]                    // row 1: left[2..5]
+        ldr             s2, [x5, #2]                    // row 2: left[3..6]
+        ldr             s3, [x5, #3]                    // row 3: left[4..7]
+        str             s0, [x0]
+        str             s1, [x0, x3]
+        add             x0, x0, x3, lsl #1
+        str             s2, [x0]
+        str             s3, [x0, x3]
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_h_pos_8x8_8: Horizontal reference positive angle prediction (mode 2-9)
+// Arguments:
+//   x0: src
+//   x1: top (unused for H reference modes)
+//   x2: left
+//   x3: stride
+//   w4: c_idx
+//   w5: mode
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_h_pos_8x8_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_h
+        sub             w7, w5, #2
+        ldrb            w8, [x6, w7, uxtw]              // angle
+
+        // Mode 2 optimization
+        cmp             w8, #32
+        b.eq            .Lh_pos_8x8_mode2
+
+        // === Fully unrolled 8-column computation with transpose ===
+        mov             w10, #0                         // angle_acc
+        movi            v21.16b, #32                    // weight base: 32 for (32 - fact)
+
+        h_pos_col_8b    v0
+        h_pos_col_8b    v1
+        h_pos_col_8b    v2
+        h_pos_col_8b    v3
+        h_pos_col_8b    v4
+        h_pos_col_8b    v5
+        h_pos_col_8b    v6
+        h_pos_col_8b    v7
+.purgem h_pos_col_8b
+
+        transpose_8x8B  v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        st1             {v2.8b}, [x0], x3
+        st1             {v3.8b}, [x0], x3
+        st1             {v4.8b}, [x0], x3
+        st1             {v5.8b}, [x0], x3
+        st1             {v6.8b}, [x0], x3
+        st1             {v7.8b}, [x0], x3
+        ret
+
+.Lh_pos_8x8_mode2:
+        // Mode 2: Row-wise optimization
+        // Row y contains left[y+1..y+8], contiguous read + contiguous write
+        add             x5, x2, #1                      // left + 1
+        ldr             d0, [x5]                        // row 0: left[1..8]
+        ldr             d1, [x5, #1]                    // row 1: left[2..9]
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        ldr             d0, [x5, #2]                    // row 2: left[3..10]
+        ldr             d1, [x5, #3]                    // row 3: left[4..11]
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        ldr             d0, [x5, #4]                    // row 4: left[5..12]
+        ldr             d1, [x5, #5]                    // row 5: left[6..13]
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        ldr             d0, [x5, #6]                    // row 6: left[7..14]
+        ldr             d1, [x5, #7]                    // row 7: left[8..15]
+        st1             {v0.8b}, [x0], x3
+        st1             {v1.8b}, [x0], x3
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_h_pos_16x16_8: Horizontal reference positive angle prediction (mode 2-9)
+// Arguments:
+//   x0: src
+//   x1: top (unused for H reference modes)
+//   x2: left
+//   x3: stride
+//   w4: c_idx
+//   w5: mode
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_h_pos_16x16_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_h
+        sub             w7, w5, #2
+        ldrb            w8, [x6, w7, uxtw]
+
+        // Mode 2 optimization
+        cmp             w8, #32
+        b.eq            .Lh_pos_16x16_mode2
+
+        // === Two batches of 8 columns with 16-byte transpose ===
+        mov             x15, x0                         // save base dst
+        movi            v22.16b, #32                    // weight base: 32 for (32 - fact)
+
+.macro h_pos_16x16_col dst
+        add             w10, w10, w8
+        asr             w11, w10, #5
+        and             w12, w10, #31
+        add             x13, x2, w11, sxtw
+        dup             v17.16b, w12                    // hoisted for addr gap
+        sub             v16.16b, v22.16b, v17.16b
+        ldr             q18, [x13]
+        ldr             q19, [x13, #1]
+        umull           v20.8h, v18.8b, v16.8b
+        umlal           v20.8h, v19.8b, v17.8b
+        rshrn           \dst\().8b, v20.8h, #5
+        umull2          v21.8h, v18.16b, v16.16b
+        umlal2          v21.8h, v19.16b, v17.16b
+        rshrn2          \dst\().16b, v21.8h, #5
+.endm
+
+        // Batch 1: columns 0-7
+        mov             w10, #0
+        h_pos_16x16_col v0
+        h_pos_16x16_col v1
+        h_pos_16x16_col v2
+        h_pos_16x16_col v3
+        h_pos_16x16_col v4
+        h_pos_16x16_col v5
+        h_pos_16x16_col v6
+        h_pos_16x16_col v7
+
+        // angle_acc stays live in w10: the transpose and stores below only use SIMD registers and x16
+
+        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+
+        // Store cols 0-7 of rows 0-7
+        mov             x16, x15
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().8b}, [x16], x3
+        .endr
+        // Store cols 0-7 of rows 8-15
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().d}[1], [x16], x3
+        .endr
+
+        // Batch 2: columns 8-15
+        // w10 still holds angle_acc after column 7, so column 8 continues from it
+        h_pos_16x16_col v0
+        h_pos_16x16_col v1
+        h_pos_16x16_col v2
+        h_pos_16x16_col v3
+        h_pos_16x16_col v4
+        h_pos_16x16_col v5
+        h_pos_16x16_col v6
+        h_pos_16x16_col v7
+.purgem h_pos_16x16_col
+
+        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+
+        // Store cols 8-15 of rows 0-7
+        add             x16, x15, #8
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().8b}, [x16], x3
+        .endr
+        // Store cols 8-15 of rows 8-15
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().d}[1], [x16], x3
+        .endr
+
+        ret
+
+.Lh_pos_16x16_mode2:
+        // Mode 2: Row-wise optimization with loop unrolling
+        // Row y contains left[y+1..y+16], contiguous read + contiguous write
+        add             x5, x2, #1                      // left + 1
+
+        // Rows 0-3
+        ldr             q0, [x5]
+        ldr             q1, [x5, #1]
+        ldr             q2, [x5, #2]
+        ldr             q3, [x5, #3]
+        st1             {v0.16b}, [x0], x3
+        st1             {v1.16b}, [x0], x3
+        st1             {v2.16b}, [x0], x3
+        st1             {v3.16b}, [x0], x3
+
+        // Rows 4-7
+        ldr             q0, [x5, #4]
+        ldr             q1, [x5, #5]
+        ldr             q2, [x5, #6]
+        ldr             q3, [x5, #7]
+        st1             {v0.16b}, [x0], x3
+        st1             {v1.16b}, [x0], x3
+        st1             {v2.16b}, [x0], x3
+        st1             {v3.16b}, [x0], x3
+
+        // Rows 8-11
+        ldr             q0, [x5, #8]
+        ldr             q1, [x5, #9]
+        ldr             q2, [x5, #10]
+        ldr             q3, [x5, #11]
+        st1             {v0.16b}, [x0], x3
+        st1             {v1.16b}, [x0], x3
+        st1             {v2.16b}, [x0], x3
+        st1             {v3.16b}, [x0], x3
+
+        // Rows 12-15
+        ldr             q0, [x5, #12]
+        ldr             q1, [x5, #13]
+        ldr             q2, [x5, #14]
+        ldr             q3, [x5, #15]
+        st1             {v0.16b}, [x0], x3
+        st1             {v1.16b}, [x0], x3
+        st1             {v2.16b}, [x0], x3
+        st1             {v3.16b}, [x0], x3
+        ret
+endfunc
+
+// -----------------------------------------------------------------------------
+// pred_angular_h_pos_32x32_8: Horizontal reference positive angle prediction (mode 2-9)
+// Arguments:
+//   x0: src
+//   x1: top (unused for H reference modes)
+//   x2: left
+//   x3: stride
+//   w4: c_idx
+//   w5: mode
+// -----------------------------------------------------------------------------
+function ff_hevc_pred_angular_h_pos_32x32_8_neon, export=1
+        // Load angle from table
+        movrel          x6, intra_pred_angle_h
+        sub             w7, w5, #2
+        ldrb            w8, [x6, w7, uxtw]
+
+        // Mode 2 optimization
+        cmp             w8, #32
+        b.eq            .Lh_pos_32x32_mode2
+
+        // === 4 batches of 8 columns with 32-byte transpose ===
+        // v0-v7 and v24-v31 hold transposed rows and v16-v23 are scratch, so the weight
+        // constant lives in callee-saved v15; save/restore its low 64 bits per AAPCS64.
+        str             d15, [sp, #-16]!
+        mov             x15, x0                         // save base dst
+        movi            v15.16b, #32                    // weight base: 32 for (32 - fact)
+
+.macro h_pos_32_col dst_hi, dst_lo
+        add             w10, w10, w8
+        asr             w11, w10, #5
+        and             w12, w10, #31
+        add             x13, x2, w11, sxtw
+        dup             v17.16b, w12                    // hoisted for addr gap
+        sub             v16.16b, v15.16b, v17.16b
+        ldr             q18, [x13]                      // ref rows 0-15
+        ldr             q19, [x13, #1]
+        ldr             q20, [x13, #16]                 // ref rows 16-31
+        ldr             q21, [x13, #17]
+        umull           v22.8h, v18.8b, v16.8b
+        umlal           v22.8h, v19.8b, v17.8b
+        rshrn           \dst_hi\().8b, v22.8h, #5
+        umull2          v23.8h, v18.16b, v16.16b
+        umlal2          v23.8h, v19.16b, v17.16b
+        rshrn2          \dst_hi\().16b, v23.8h, #5
+        umull           v22.8h, v20.8b, v16.8b
+        umlal           v22.8h, v21.8b, v17.8b
+        rshrn           \dst_lo\().8b, v22.8h, #5
+        umull2          v23.8h, v20.16b, v16.16b
+        umlal2          v23.8h, v21.16b, v17.16b
+        rshrn2          \dst_lo\().16b, v23.8h, #5
+.endm
+
+        mov             w10, #0                         // angle_acc
+        mov             x9, #0                          // column byte offset
+        mov             w6, #4                          // batch counter
+
+.Lh_pos_32x32_batch:
+        h_pos_32_col    v0, v24
+        h_pos_32_col    v1, v25
+        h_pos_32_col    v2, v26
+        h_pos_32_col    v3, v27
+        h_pos_32_col    v4, v28
+        h_pos_32_col    v5, v29
+        h_pos_32_col    v6, v30
+        h_pos_32_col    v7, v31
+
+        // angle_acc stays live in w10 across the batch: nothing below writes it
+
+        // Transpose upper half (rows 0-15)
+        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v16, v17
+        // Transpose lower half (rows 16-31)
+        transpose_8x16B v24, v25, v26, v27, v28, v29, v30, v31, v16, v17
+
+        add             x16, x15, x9
+
+        // Rows 0-7
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().8b}, [x16], x3
+        .endr
+        // Rows 8-15
+        .irp reg, v0, v1, v2, v3, v4, v5, v6, v7
+        st1             {\reg\().d}[1], [x16], x3
+        .endr
+        // Rows 16-23
+        .irp reg, v24, v25, v26, v27, v28, v29, v30, v31
+        st1             {\reg\().8b}, [x16], x3
+        .endr
+        // Rows 24-31
+        .irp reg, v24, v25, v26, v27, v28, v29, v30, v31
+        st1             {\reg\().d}[1], [x16], x3
+        .endr
+
+        // next batch continues from the angle_acc already in w10
+        add             x9, x9, #8                      // advance column offset
+        subs            w6, w6, #1
+        b.gt            .Lh_pos_32x32_batch
+
+.purgem h_pos_32_col
+
+        ldr             d15, [sp], #16
+        ret
+
+.Lh_pos_32x32_mode2:
+        // Mode 2: Row-wise optimization with loop unrolling (4 rows per iteration)
+        // Row y contains left[y+1..y+32], contiguous read + contiguous write
+        add             x5, x2, #1                      // left + 1
+        mov             w6, #0                          // row offset
+        mov             w9, #8                          // batch counter (32/4 = 8)
+.Lh_pos_32x32_mode2_row4:
+        // Process 4 rows at a time
+        add             x7, x5, w6, uxtw                // base for row y
+        ldp             q0, q1, [x7]                    // row y
+        st1             {v0.16b, v1.16b}, [x0], x3
+
+        add             x8, x7, #1                      // base for row y+1
+        ldp             q0, q1, [x8]
+        st1             {v0.16b, v1.16b}, [x0], x3
+
+        add             x8, x7, #2                      // base for row y+2
+        ldp             q0, q1, [x8]
+        st1             {v0.16b, v1.16b}, [x0], x3
+
+        add             x8, x7, #3                      // base for row y+3
+        ldp             q0, q1, [x8]
+        st1             {v0.16b, v1.16b}, [x0], x3
+
+        add             w6, w6, #4                      // advance row offset
+        subs            w9, w9, #1
+        b.gt            .Lh_pos_32x32_mode2_row4
+        ret
+endfunc
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]
