PR #23432 opened by Lynne URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23432 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23432.patch
C vs NEON float_fft_120 25.2 3.7 6.85x float_fft_960 189.3 36.3 5.22x float_fft_1920 200.2 78.3 2.56x float_imdct_120 5.1 2.8 1.83x float_imdct_960 51.7 25.4 2.04x float_imdct_1024 52.2 24.5 2.13x float_imdct_16384 1117.3 513.9 2.17x From 49355004fbd5ae85e533d78d3b6ac9537316c15e Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Thu, 4 Jun 2026 06:03:55 +0900 Subject: [PATCH 1/4] lavu/tx: add AArch64 NEON fft15 codelet Based on the C code in doc/transforms.md. --- libavutil/aarch64/tx_float_init.c | 33 +++++ libavutil/aarch64/tx_float_neon.S | 233 ++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) diff --git a/libavutil/aarch64/tx_float_init.c b/libavutil/aarch64/tx_float_init.c index 8300472c4c..47f1e12700 100644 --- a/libavutil/aarch64/tx_float_init.c +++ b/libavutil/aarch64/tx_float_init.c @@ -26,6 +26,8 @@ TX_DECL_FN(fft4_fwd, neon) TX_DECL_FN(fft4_inv, neon) TX_DECL_FN(fft8, neon) TX_DECL_FN(fft8_ns, neon) +TX_DECL_FN(fft15, neon) +TX_DECL_FN(fft15_ns, neon) TX_DECL_FN(fft16, neon) TX_DECL_FN(fft16_ns, neon) TX_DECL_FN(fft32, neon) @@ -44,6 +46,35 @@ static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd, return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, 8, 0); } +static av_cold int fft15_init(AVTXContext *s, const FFTXCodelet *cd, + uint64_t flags, FFTXCodeletOptions *opts, + int len, int inv, const void *scale) +{ + int ret, cnt = 0, tmp[15]; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; + + ff_tx_init_tabs_float(len); + + if ((ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5)) < 0) + return ret; + + /* Reorder the 15-pt map so the loads in the pre-permuted assembly path + * become simple contiguous chunks. Mirrors the x86 FFT15 init. 
*/ + memcpy(tmp, s->map, 15*sizeof(*tmp)); + for (int i = 1; i < 15; i += 3) + s->map[cnt++] = tmp[i]; + for (int i = 2; i < 15; i += 3) + s->map[cnt++] = tmp[i]; + for (int i = 0; i < 15; i += 3) + s->map[cnt++] = tmp[i]; + memmove(&s->map[7], &s->map[6], 4*sizeof(int)); + memmove(&s->map[3], &s->map[1], 4*sizeof(int)); + s->map[1] = tmp[2]; + s->map[2] = tmp[0]; + + return 0; +} + const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE, 0), TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), @@ -52,6 +83,8 @@ const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0), TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft15, FFT, 15, 15, 15, 0, 128, fft15_init, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 192, fft15_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), TX_DEF(fft16, FFT, 16, 16, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), TX_DEF(fft32, FFT, 32, 32, 2, 0, 128, neon_init, neon, NEON, AV_TX_INPLACE, 0), diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S index 12c4e880dc..3bc759ed87 100644 --- a/libavutil/aarch64/tx_float_neon.S +++ b/libavutil/aarch64/tx_float_neon.S @@ -438,6 +438,239 @@ endfunc FFT16_FN float, 0 FFT16_FN ns_float, 1 +const tab_15pt, align=4 + .float 1.0, 1.0, -1.0, -1.0 +endconst + +// Tab_53 twiddles (v28..v30) duplicated/pre-signed once, instead of per +// transform: v8/v9 = -+tab[8,9]/[10,11], v25/v28/v29 = tab[0,1]/[2,3]/[4,5], +// v10 = +-tab[6,7]. v30 keeps tab[8..11]. Callers preserve d8-d10. 
+.macro FFT15_DERIVE_CONSTS + dup v8.2d, v30.d[0] + dup v9.2d, v30.d[1] + dup v10.2d, v29.d[1] + dup v25.2d, v28.d[0] + dup v28.2d, v28.d[1] + dup v29.2d, v29.d[0] + fmul v8.4s, v8.4s, v31.4s + fmul v9.4s, v9.4s, v31.4s + fmul v10.4s, v10.4s, v24.4s +.endm + +.macro FFT15_LOAD no_perm, advance=0 +.if \no_perm == 1 + // Writebacks leave x2 a whole transform (120B) ahead for the PFA loop + ld1 { v0.4s }, [x2], #16 // in[0,1] + ld1r { v1.2d }, [x2], #8 // in[2] duplicated + ld1 { v2.4s, v3.4s, v4.4s }, [x2], #48 // in[3..8] + ld1 { v5.4s, v6.4s, v7.4s }, [x2], #48 // in[9..14] +.else + ldp w10, w11, [x4] // lut[0,1] + ldr w12, [x4, #8] // lut[2] + ldp w13, w14, [x4, #12] // lut[3,4] + ldp w15, w16, [x4, #20] // lut[5,6] + + ldr d0, [x2, x10, lsl #3] + add x10, x2, x11, lsl #3 + add x12, x2, x12, lsl #3 + ld1 { v0.d }[1], [x10] + ld1r { v1.2d }, [x12] + + ldr d2, [x2, x13, lsl #3] + add x13, x2, x14, lsl #3 + ldr d3, [x2, x15, lsl #3] + add x15, x2, x16, lsl #3 + ld1 { v2.d }[1], [x13] + ld1 { v3.d }[1], [x15] + + ldp w10, w11, [x4, #28] // lut[7,8] + ldp w12, w13, [x4, #36] // lut[9,10] + ldp w14, w15, [x4, #44] // lut[11,12] + ldp w16, w17, [x4, #52] // lut[13,14] +.if \advance == 1 + add x4, x4, #60 +.endif + + ldr d4, [x2, x10, lsl #3] + add x10, x2, x11, lsl #3 + ldr d5, [x2, x12, lsl #3] + add x12, x2, x13, lsl #3 + ldr d6, [x2, x14, lsl #3] + add x14, x2, x15, lsl #3 + ldr d7, [x2, x16, lsl #3] + add x16, x2, x17, lsl #3 + ld1 { v4.d }[1], [x10] + ld1 { v5.d }[1], [x12] + ld1 { v6.d }[1], [x14] + ld1 { v7.d }[1], [x16] +.endif +.endm + +// Single 15-point FFT (see doc/transforms.md and the AVX2 FFT15); each ymm +// becomes a pair of quads holding 2 complex each. Uses the derived constants +// and tab_15pt in v24 (dc fold sign); with hoist_strides=1 the caller +// provides x6/x7 = stride*3/*5. 
+.macro FFT15_CORE hoist_strides=0 +.if \hoist_strides == 0 + add x6, x3, x3, lsl #1 // stride*3 + add x7, x3, x3, lsl #2 // stride*5 +.endif + add x8, x1, x7 // &out[5] + add x9, x8, x7 // &out[10] + + // 4x parallel 3pt over in[3..14] (the in[11..14] -+ signs are folded + // into the twiddles: k = in[11..14] + Q4 -+ Q0), interleaved with the + // dc 3pt over in[0..2] ([dc] tagged, v0 = dc[0] dup, v1 = dc[1,2]) + fsub v16.4s, v2.4s, v4.4s // q[0,1]raw = in[3,4]-in[7,8] + ext v26.16b, v0.16b, v0.16b, #8 // [dc] (in1, in0) + fsub v17.4s, v3.4s, v5.4s // q[2,3]raw = in[5,6]-in[9,10] + fadd v27.4s, v0.4s, v26.4s // [dc] pc[1]raw = in0+in1 + fadd v2.4s, v2.4s, v4.4s // q[4,5]raw + fsub v20.4s, v0.4s, v26.4s // [dc] (in0-in1, in1-in0) + fadd v3.4s, v3.4s, v5.4s // q[6,7]raw + rev64 v20.4s, v20.4s // [dc] pc[0]raw in hi half + rev64 v16.4s, v16.4s // q[0,1]raw re/im-swapped + ext v21.16b, v20.16b, v27.16b, #8 // [dc] (pc[0], pc[1]) + rev64 v17.4s, v17.4s // q[2,3]raw re/im-swapped + fadd v0.4s, v1.4s, v27.4s // [dc] dc[0] = in2 + pc[1]raw (dup) + fadd v22.4s, v6.4s, v2.4s // y[0,1] = in[11,12] + q[4,5] + fmul v21.4s, v21.4s, v30.4s // [dc] pc[0,1] scaled by tab[8..11] + fadd v23.4s, v7.4s, v3.4s // y[2,3] = in[13,14] + q[6,7] + fmul v16.4s, v16.4s, v8.4s // Q0[0,1] + ext v26.16b, v21.16b, v21.16b, #8 // [dc] (pc[1], pc[0]) + fmul v17.4s, v17.4s, v8.4s // Q0[2,3] + fmla v21.4s, v26.4s, v24.4s // [dc] (dc[1]_int, dc[2]_int) + fmla v6.4s, v2.4s, v9.4s // M[0,1] = in[11,12] + q*Q4mult + fmla v7.4s, v3.4s, v9.4s // M[2,3] = in[13,14] + q*Q4mult + fmla v1.4s, v21.4s, v31.4s // [dc] v1 = (dc[1], dc[2]) — DC done + fsub v4.4s, v6.4s, v16.4s // k[0,1] = M[0,1] - Q0[0,1] + fsub v5.4s, v7.4s, v17.4s // k[2,3] = M[2,3] - Q0[2,3] + fadd v2.4s, v6.4s, v16.4s // k[4,5] = M[0,1] + Q0[0,1] + fadd v3.4s, v7.4s, v17.4s // k[6,7] = M[2,3] + Q0[2,3] + + // 4pt butterflies on y (v22,v23), k[0..3] (v4,v5), k[4..7] (v2,v3); + // one shared swapped operand per pair leaves the hi t's 
half-swapped, + // which the dup-symmetric twiddles absorb and the output stage uses + ext v16.16b, v23.16b, v23.16b, #8 // (y3, y2) + ext v17.16b, v5.16b, v5.16b, #8 // (k3, k2) + ext v20.16b, v3.16b, v3.16b, #8 // (k7, k6) + fsub v21.4s, v22.4s, v16.4s // (t3, t2) + fadd v22.4s, v22.4s, v16.4s // (t0, t1) + fsub v26.4s, v4.4s, v17.4s // (t7, t6) + fadd v4.4s, v4.4s, v17.4s // (t4, t5) + fsub v27.4s, v2.4s, v20.4s // (t11, t10) + fadd v2.4s, v2.4s, v20.4s // (t8, t9) + + // the 3 direct outputs: out[0,10,5] = dc[0,1,2] + t[0,4,8] + t[1,5,9] + ext v16.16b, v22.16b, v22.16b, #8 // (t1, t0) + zip1 v17.2d, v4.2d, v2.2d // (t4, t8) + zip2 v20.2d, v4.2d, v2.2d // (t5, t9) + fadd v16.4s, v16.4s, v22.4s // t[0]+t[1] + fadd v17.4s, v17.4s, v20.4s // (t[4]+t[5], t[8]+t[9]) + fadd v16.4s, v16.4s, v0.4s // out[0] + fadd v17.4s, v17.4s, v1.4s // (out[10], out[5]) + st1 { v16.d }[0], [x1] + st1 { v17.d }[1], [x8] + st1 { v17.d }[0], [x9] + + // twiddles; swap(t * tab) = swap(t) * tab as every multiplier is + // dup-symmetric. lo chunks seed the accumulator with dc[] (= the + // output stage's dc preadd): m = dc + t*v25 - swap(t)*v28; hi chunks + // r = t*v29 + swap(t)*v10, v10's +- giving r[3] += t[2]/r[2] -= t[3] + // in half-swapped order. 
Accumulates are spread out for the A53 + ext v16.16b, v22.16b, v22.16b, #8 // (t1, t0) + mov v6.16b, v0.16b // m0 = dc[0] + ext v17.16b, v21.16b, v21.16b, #8 // (t2, t3) + fmul v7.4s, v21.4s, v29.4s // (r3, r2) + fmla v6.4s, v22.4s, v25.4s // m0 += t[0,1]*r_lo + dup v18.2d, v1.d[0] // m1 = dc[1] + fmla v18.4s, v4.4s, v25.4s // m1 += t[4,5]*r_lo + fmls v6.4s, v16.4s, v28.4s // m0 -= swap*nt_lo + ext v16.16b, v4.16b, v4.16b, #8 // (t5, t4) + fmla v7.4s, v17.4s, v10.4s // (r3, r2) += swap*nt_hi + ext v17.16b, v26.16b, v26.16b, #8 // (t6, t7) + fmls v18.4s, v16.4s, v28.4s // m1 -= swap*nt_lo + fmul v23.4s, v26.4s, v29.4s // (r7, r6) + dup v19.2d, v1.d[1] // m2 = dc[2] + fmla v19.4s, v2.4s, v25.4s // m2 += t[8,9]*r_lo + ext v16.16b, v2.16b, v2.16b, #8 // (t9, t8) + fmla v23.4s, v17.4s, v10.4s // (r7, r6) + ext v17.16b, v27.16b, v27.16b, #8 // (t10, t11) + fmul v5.4s, v27.4s, v29.4s // (r11, r10) + fmls v19.4s, v16.4s, v28.4s // m2 -= swap*nt_lo + fmla v5.4s, v17.4s, v10.4s // (r11, r10) + + // output butterflies around rot(x) = (x.im, -x.re): out = m +- rot(r_hi). 
+ // The half-swap makes rev(r_hi) a plain rev64, and u = rev*v31 is + // exact (+-1.0), so each +-rot pair is one non-destructive fsub/fadd + rev64 v16.4s, v7.4s // (r3.im, r3.re, r2.im, r2.re) + rev64 v17.4s, v23.4s // (r7.im, r7.re, r6.im, r6.re) + rev64 v20.4s, v5.4s // (r11.im, ..., r10.re) + fmul v16.4s, v16.4s, v31.4s // u0 + fmul v17.4s, v17.4s, v31.4s // u1 + fmul v20.4s, v20.4s, v31.4s // u2 + fsub v7.4s, v6.4s, v16.4s // (out6, out3) = m0 + rot + fadd v6.4s, v6.4s, v16.4s // (out9, out12) = m0 - rot + fsub v23.4s, v18.4s, v17.4s // (out1, out13) + fadd v22.4s, v18.4s, v17.4s // (out4, out7) + fsub v5.4s, v19.4s, v20.4s // (out11, out8) + fadd v4.4s, v19.4s, v20.4s // (out14, out2) + + add x10, x1, x6, lsl #1 // &out[6] + add x11, x1, x6 // &out[3] + st1 { v7.d }[0], [x10] + st1 { v7.d }[1], [x11] + add x12, x8, x3, lsl #2 // &out[9] + add x13, x1, x6, lsl #2 // &out[12] + st1 { v6.d }[0], [x12] + st1 { v6.d }[1], [x13] + add x10, x1, x3 // &out[1] + add x11, x9, x6 // &out[13] + st1 { v23.d }[0], [x10] + st1 { v23.d }[1], [x11] + add x12, x1, x3, lsl #2 // &out[4] + add x13, x8, x3, lsl #1 // &out[7] + st1 { v22.d }[0], [x12] + st1 { v22.d }[1], [x13] + add x10, x9, x3 // &out[11] + add x11, x1, x3, lsl #3 // &out[8] + st1 { v5.d }[0], [x10] + st1 { v5.d }[1], [x11] + add x12, x9, x3, lsl #2 // &out[14] + add x13, x1, x3, lsl #1 // &out[2] + st1 { v4.d }[0], [x12] + st1 { v4.d }[1], [x13] +.endm + +.macro FFT15_FN name, no_perm +function ff_tx_fft15_\name\()_neon, export=1 + stp d8, d9, [sp, #-32]! 
+ str d10, [sp, #16] + + SETUP_LUT \no_perm + + movrel x5, X(ff_tx_tab_53_float) + ld1 { v28.4s, v29.4s, v30.4s }, [x5] // 5pt cos, 5pt sin, 3pt + + movrel x5, tab_15pt + ld1 { v24.4s }, [x5] // sign mask + + LOAD_SUBADD // v31 = (-1,+1,-1,+1) + FFT15_DERIVE_CONSTS + + FFT15_LOAD \no_perm + + FFT15_CORE + + ldr d10, [sp, #16] + ldp d8, d9, [sp], #32 + ret +endfunc +.endm + +FFT15_FN float, 0 +FFT15_FN ns_float, 1 + .macro SETUP_SR_RECOMB len, re, im, dec ldr w5, =(\len - 4*7) movrel \re, X(ff_tx_tab_\len\()_float) -- 2.52.0 From 2c10eed62bda07ee7def46709d66e0465f0b94bd Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Thu, 4 Jun 2026 15:04:13 +0900 Subject: [PATCH 2/4] lavu/tx: add AArch64 NEON fft_pfa_15xM Same as the x86 version. --- libavutil/aarch64/tx_float_init.c | 77 ++++++++++++++++++++----- libavutil/aarch64/tx_float_neon.S | 96 +++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 14 deletions(-) diff --git a/libavutil/aarch64/tx_float_init.c b/libavutil/aarch64/tx_float_init.c index 47f1e12700..a049562609 100644 --- a/libavutil/aarch64/tx_float_init.c +++ b/libavutil/aarch64/tx_float_init.c @@ -19,6 +19,7 @@ #define TX_FLOAT #include "libavutil/tx_priv.h" #include "libavutil/attributes.h" +#include "libavutil/mem.h" #include "libavutil/aarch64/cpu.h" TX_DECL_FN(fft2, neon) @@ -34,6 +35,8 @@ TX_DECL_FN(fft32, neon) TX_DECL_FN(fft32_ns, neon) TX_DECL_FN(fft_sr, neon) TX_DECL_FN(fft_sr_ns, neon) +TX_DECL_FN(fft_pfa_15xM, neon) +TX_DECL_FN(fft_pfa_15xM_ns, neon) static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, @@ -46,11 +49,29 @@ static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd, return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, 8, 0); } +/* Reorder one 15-point map so the loads in the pre-permuted assembly path + * become simple contiguous chunks. Mirrors the x86 FFT15 init. 
*/ +static void fft15_permute_map(int *map) +{ + int cnt = 0, tmp[15]; + memcpy(tmp, map, 15*sizeof(*tmp)); + for (int i = 1; i < 15; i += 3) + map[cnt++] = tmp[i]; + for (int i = 2; i < 15; i += 3) + map[cnt++] = tmp[i]; + for (int i = 0; i < 15; i += 3) + map[cnt++] = tmp[i]; + memmove(&map[7], &map[6], 4*sizeof(int)); + memmove(&map[3], &map[1], 4*sizeof(int)); + map[1] = tmp[2]; + map[2] = tmp[0]; +} + static av_cold int fft15_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale) { - int ret, cnt = 0, tmp[15]; + int ret; FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; ff_tx_init_tabs_float(len); @@ -58,19 +79,44 @@ static av_cold int fft15_init(AVTXContext *s, const FFTXCodelet *cd, if ((ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5)) < 0) return ret; - /* Reorder the 15-pt map so the loads in the pre-permuted assembly path - * become simple contiguous chunks. Mirrors the x86 FFT15 init. */ - memcpy(tmp, s->map, 15*sizeof(*tmp)); - for (int i = 1; i < 15; i += 3) - s->map[cnt++] = tmp[i]; - for (int i = 2; i < 15; i += 3) - s->map[cnt++] = tmp[i]; - for (int i = 0; i < 15; i += 3) - s->map[cnt++] = tmp[i]; - memmove(&s->map[7], &s->map[6], 4*sizeof(int)); - memmove(&s->map[3], &s->map[1], 4*sizeof(int)); - s->map[1] = tmp[2]; - s->map[2] = tmp[0]; + fft15_permute_map(s->map); + + return 0; +} + +/* 15xM prime-factor FFT: M inlined 15-point transforms followed by 15 calls to + * a power-of-two subtransform. Mirrors the x86 fft_pfa_15xM, but the aarch64 + * ABI lets us call the subtransform normally, so no FF_TX_ASM_CALL is needed. 
*/ +static av_cold int fft_pfa_init(AVTXContext *s, const FFTXCodelet *cd, + uint64_t flags, FFTXCodeletOptions *opts, + int len, int inv, const void *scale) +{ + int ret; + int sub_len = len / cd->factors[0]; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER }; + + flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */ + flags |= AV_TX_INPLACE; /* in-place */ + flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ + + if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, + sub_len, inv, scale))) + return ret; + + if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, + cd->factors[0], sub_len))) + return ret; + + /* The 15-point transform is itself a compound one, so embed its input map + * and apply the same load-friendly reorder used by fft15_init. */ + TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5); + for (int k = 0; k < sub_len; k++) + fft15_permute_map(&s->map[k*15]); + + if (!(s->tmp = av_malloc(len*sizeof(*s->tmp)))) + return AVERROR(ENOMEM); + + ff_tx_init_tabs_float(len / sub_len); return 0; } @@ -93,5 +139,8 @@ const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { TX_DEF(fft_sr, FFT, 64, 131072, 2, 0, 128, neon_init, neon, NEON, 0, 0), TX_DEF(fft_sr_ns, FFT, 64, 131072, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 128, fft_pfa_init, neon, NEON, AV_TX_INPLACE, 0), + TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 192, fft_pfa_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + NULL, }; diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S index 3bc759ed87..7f0f7a006e 100644 --- a/libavutil/aarch64/tx_float_neon.S +++ b/libavutil/aarch64/tx_float_neon.S @@ -671,6 +671,102 @@ endfunc FFT15_FN float, 0 FFT15_FN ns_float, 1 +// 15xM PFA (len = 15*M, M a power of two), like the x86 fft_pfa_15xM: +// dim1 = M 15pt transforms scattered into s->tmp at sub_map[i], spaced M +// 
apart; dim2 = 15 in-place M-pt subtransforms (plain blr, the uniform ABI +// needs no asm-call variant); post = out[i*stride] = s->tmp[out_map[i]]. +// AVTXContext offsets: len=0, map=8, tmp=24, sub=32, fn[0]=40. +.macro PFA_15_FN name, no_perm +function ff_tx_fft_pfa_15xM_\name\()_neon, export=1 + stp x29, x30, [sp, #-128]! + stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp x25, x26, [sp, #64] + stp x27, x28, [sp, #80] + stp d8, d9, [sp, #96] + str d10, [sp, #112] + + mov x25, x0 // root context + mov x26, x1 // user out + mov x27, x3 // user stride (bytes) + + ldr w19, [x0, #0] // len = 15*M + ldr x20, [x0, #24] // s->tmp + ldr x23, [x0, #32] // s->sub + ldr w24, [x23, #0] // M + ldr x28, [x23, #8] // sub_map + lsl x3, x24, #3 // M*8 = 15pt output stride +.if \no_perm == 0 + ldr x4, [x0, #8] // in_map, advanced by the loads +.endif + + movrel x5, X(ff_tx_tab_53_float) + ld1 { v28.4s, v29.4s, v30.4s }, [x5] + movrel x5, tab_15pt + ld1 { v24.4s }, [x5] + LOAD_SUBADD // v31 + FFT15_DERIVE_CONSTS + add x6, x3, x3, lsl #1 // stride*3 + add x7, x3, x3, lsl #2 // stride*5 +1: + ldr w10, [x28], #4 // sub_map[i] + add x1, x20, x10, lsl #3 + FFT15_LOAD \no_perm, advance=1 + FFT15_CORE hoist_strides=1 + subs w19, w19, #15 + b.gt 1b + + ldr x28, [x25, #40] // ctx->fn[0] + mov x21, x20 // column base + mov w19, #15 +2: + mov x0, x23 + mov x1, x21 + mov x2, x21 + mov x3, #8 + blr x28 // M-point FFT, in-place + add x21, x21, x24, lsl #3 + subs w19, w19, #1 + b.gt 2b + + // out[i*stride] = s->tmp[out_map[i]], unrolled by 4 (len % 60 == 0) + ldr w19, [x25, #0] // len + ldr x22, [x25, #8] // s->map + add x22, x22, x19, lsl #2 // out_map = map + len + mov x10, x26 + lsl x11, x27, #1 // 2*stride + add x12, x27, x11 // 3*stride +3: + ldp w13, w14, [x22], #8 // out_map[i, i+1] + ldp w15, w16, [x22], #8 // out_map[i+2, i+3] + ldr d0, [x20, x13, lsl #3] + ldr d1, [x20, x14, lsl #3] + ldr d2, [x20, x15, lsl #3] + ldr d3, [x20, x16, lsl #3] + str d0, [x10] 
+ str d1, [x10, x27] + str d2, [x10, x11] + str d3, [x10, x12] + add x10, x10, x11, lsl #1 + subs w19, w19, #4 + b.gt 3b + + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp x25, x26, [sp, #64] + ldp x27, x28, [sp, #80] + ldp d8, d9, [sp, #96] + ldr d10, [sp, #112] + ldp x29, x30, [sp], #128 + ret +endfunc +.endm + +PFA_15_FN float, 0 +PFA_15_FN ns_float, 1 + .macro SETUP_SR_RECOMB len, re, im, dec ldr w5, =(\len - 4*7) movrel \re, X(ff_tx_tab_\len\()_float) -- 2.52.0 From 8bfa8839d14b7f30f4bc5ab1d7b7cd6889c13efb Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Thu, 4 Jun 2026 17:50:57 +0900 Subject: [PATCH 3/4] lavu/tx: add AArch64 NEON inverse MDCT Same as the x86 version. --- libavutil/aarch64/tx_float_init.c | 46 ++++++++++++ libavutil/aarch64/tx_float_neon.S | 119 ++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) diff --git a/libavutil/aarch64/tx_float_init.c b/libavutil/aarch64/tx_float_init.c index a049562609..70be9e8483 100644 --- a/libavutil/aarch64/tx_float_init.c +++ b/libavutil/aarch64/tx_float_init.c @@ -37,6 +37,7 @@ TX_DECL_FN(fft_sr, neon) TX_DECL_FN(fft_sr_ns, neon) TX_DECL_FN(fft_pfa_15xM, neon) TX_DECL_FN(fft_pfa_15xM_ns, neon) +TX_DECL_FN(mdct_inv, neon) static av_cold int neon_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, @@ -121,6 +122,49 @@ static av_cold int fft_pfa_init(AVTXContext *s, const FFTXCodelet *cd, return 0; } +/* Inverse MDCT: a pre-rotation, an in-place len/2 complex FFT, and a + * post-rotation. Mirrors the generic ff_tx_mdct_init / x86 m_inv_init, but the + * subtransform is called with a normal blr (no FF_TX_ASM_CALL on aarch64). 
*/ +static av_cold int mdct_inv_init(AVTXContext *s, const FFTXCodelet *cd, + uint64_t flags, FFTXCodeletOptions *opts, + int len, int inv, const void *scale) +{ + int ret; + FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER }; + + /* The pre-rotation processes two output complex at a time, so len/2 must + * be even. Real codecs always satisfy this; bail out otherwise so the + * generic C MDCT is used. */ + if (len & 3) + return AVERROR(ENOSYS); + + s->scale_d = *((const float *)scale); + s->scale_f = s->scale_d; + + flags &= ~FF_TX_OUT_OF_PLACE; /* The subtransform is in-place */ + flags |= AV_TX_INPLACE; + flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */ + + if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1, + inv, scale))) + return ret; + + s->map = av_malloc((len >> 1)*sizeof(*s->map)); + if (!s->map) + return AVERROR(ENOMEM); + + memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map)); + + if ((ret = ff_tx_mdct_gen_exp_float(s, s->map))) + return ret; + + /* Pre-double the map indices (saves a shift in the hot path). 
*/ + for (int i = 0; i < (len >> 1); i++) + s->map[i] <<= 1; + + return 0; +} + const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, neon, NEON, AV_TX_INPLACE, 0), TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, neon_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), @@ -142,5 +186,7 @@ const FFTXCodelet * const ff_tx_codelet_list_float_aarch64[] = { TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 128, fft_pfa_init, neon, NEON, AV_TX_INPLACE, 0), TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 192, fft_pfa_init, neon, NEON, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0), + TX_DEF(mdct_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 256, mdct_inv_init, neon, NEON, FF_TX_INVERSE_ONLY, 0), + NULL, }; diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S index 7f0f7a006e..0b24c355b8 100644 --- a/libavutil/aarch64/tx_float_neon.S +++ b/libavutil/aarch64/tx_float_neon.S @@ -767,6 +767,125 @@ endfunc PFA_15_FN float, 0 PFA_15_FN ns_float, 1 +// Inverse MDCT (see ff_tx_mdct_inv): pre-rotation, in-place N/2pt FFT, +// post-rotation. out is the N/2-complex buffer z, in the N-real input. +// Where ld2 can split the re/im planes the multiplies are planar and fused +// (fmul + fmls/fmla); the gather and the odd tail use interleaved pairs +// (trn1/trn2 broadcast, rev64 swap, the v31 fmla). +// AVTXContext offsets: len=0, map=8 (doubled in init), exp=16, sub=32, fn=40. +function ff_tx_mdct_inv_float_neon, export=1 + stp x29, x30, [sp, #-48]! 
+ stp x19, x20, [sp, #16] + stp x21, x22, [sp, #32] + + mov x19, x1 // z + mov x20, x0 // ctx + ldr w21, [x0, #0] // N + ldr x22, [x0, #16] // exp + ldr x4, [x0, #8] // map (doubled) + + LOAD_SUBADD + + sub x5, x21, #1 + madd x14, x5, x3, x2 // in2 = in + (N-1)*stride + lsr w7, w21, #1 // len2 + + // pre-rotation via gather: z[i] = (in2[-k], in1[k])*exp[i], 2 cx/iter + mov x13, x19 + mov x15, x22 +1: + ldp w5, w6, [x4], #8 // k0, k1 + madd x8, x5, x3, x2 // &in[k0] (im0) + msub x9, x5, x3, x14 // &in2[-k0] (re0) + madd x10, x6, x3, x2 // &in[k1] (im1) + msub x11, x6, x3, x14 // &in2[-k1] (re1) + ldr s0, [x9] + ld1 { v0.s }[1], [x8] + ld1 { v0.s }[2], [x11] + ld1 { v0.s }[3], [x10] // (re0, im0, re1, im1) = tmp + ld1 { v1.4s }, [x15], #16 // exp[i,i+1] + trn1 v2.4s, v0.4s, v0.4s // re dup + trn2 v3.4s, v0.4s, v0.4s // im dup + rev64 v4.4s, v1.4s // exp swapped + fmul v5.4s, v2.4s, v1.4s + fmul v6.4s, v3.4s, v4.4s + fmla v5.4s, v6.4s, v31.4s // z = tmp*exp + st1 { v5.4s }, [x13], #16 + subs w7, w7, #2 + b.gt 1b + + ldr x5, [x20, #40] // fn[0] + ldr x0, [x20, #32] // sub[0] + mov x1, x19 + mov x2, x19 + mov x3, #8 + blr x5 // N/2pt FFT, in-place + + // post-rotation over symmetric pairs (i0 = len4+i, i1 = len4-1-i), + // 2 pairs/iter, planes ld2-split: r = swap(z)*swap(e), the im parts + // crossing partners (z[i0] = (r_i0.re, r_i1.im) and vice versa), so + // each side stores its re plane zipped with the other's reversed ims + add x15, x22, x21, lsl #2 // exp_post = exp + len2 + add x8, x19, x21, lsl #1 // p_i0 = z + len4 + sub x9, x8, #16 // p_i1 = z + len4 - 2 + add x10, x15, x21, lsl #1 // e_i0 = exp_post + len4 + sub x11, x10, #16 // e_i1 = exp_post + len4 - 2 + lsr w7, w21, #2 // len4 + lsr w6, w7, #1 // pairs of pairs + cbz w6, 8f // N == 4: lone middle pair +2: + ld2 { v0.2s, v1.2s }, [x8] // z asc (i0, i0+1): re, im planes + ld2 { v2.2s, v3.2s }, [x9] // z desc (i1-1, i1) + ld2 { v4.2s, v5.2s }, [x10], #16 // e asc: er, ei + ld2 { v6.2s, v7.2s }, [x11] // e 
desc + sub x11, x11, #16 + fmul v16.2s, v1.2s, v5.2s // asc: z.im*e.im + fmul v18.2s, v1.2s, v4.2s // asc: z.im*e.re + fmul v20.2s, v3.2s, v7.2s // desc: z.im*e.im + fmul v22.2s, v3.2s, v6.2s // desc: z.im*e.re + fmls v16.2s, v0.2s, v4.2s // re plane (r_i0, r_i0+1) + fmla v18.2s, v0.2s, v5.2s // im plane (r_i0, r_i0+1) + fmls v20.2s, v2.2s, v6.2s // re plane (r_i1-1, r_i1) + fmla v22.2s, v2.2s, v7.2s // im plane (r_i1-1, r_i1) + rev64 v22.2s, v22.2s // (r_i1.im, r_i1-1.im) + rev64 v18.2s, v18.2s // (r_i0+1.im, r_i0.im) + zip1 v0.4s, v16.4s, v22.4s // (z[i0], z[i0+1]) + zip1 v2.4s, v20.4s, v18.4s // (z[i1-1], z[i1]) + st1 { v0.4s }, [x8], #16 + st1 { v2.4s }, [x9] + sub x9, x9, #16 + subs w6, w6, #1 + b.gt 2b +8: + // odd len4 (N % 8 == 4): one leftover pair, (i0, i1) = (len2-1, 0). + // x8/x10 already point at it; x9/x11 sit one complex below their slot. + tbz w7, #0, 9f + LOAD_SUBADD // v31 (clobbered by the FFT) + add x9, x9, #8 + add x11, x11, #8 + ldr d0, [x8] // z[i0] + ld1 { v0.d }[1], [x9] // v0 = (z[i0], z[i1]) + ldr d1, [x10] // exp[i0] + ld1 { v1.d }[1], [x11] // v1 = (exp[i0], exp[i1]) + rev64 v2.4s, v0.4s // swap(z) = a + rev64 v3.4s, v1.4s // swap(exp) = b + trn1 v4.4s, v2.4s, v2.4s // a.re dup + trn2 v5.4s, v2.4s, v2.4s // a.im dup + fmul v6.4s, v4.4s, v3.4s // a.re*b + fmul v7.4s, v5.4s, v1.4s // a.im*b_swap (b_swap = orig exp) + fmla v6.4s, v7.4s, v31.4s // v6 = (r0.re, r0.im, r1.re, r1.im) + mov v16.16b, v6.16b + ins v16.s[1], v6.s[3] // (r0.re, r1.im, r1.re, r1.im) + ins v16.s[3], v6.s[1] // (r0.re, r1.im, r1.re, r0.im) + st1 { v16.d }[0], [x8] // z[i0] = (r0.re, r1.im) + st1 { v16.d }[1], [x9] // z[i1] = (r1.re, r0.im) +9: + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x29, x30, [sp], #48 + ret +endfunc + .macro SETUP_SR_RECOMB len, re, im, dec ldr w5, =(\len - 4*7) movrel \re, X(ff_tx_tab_\len\()_float) -- 2.52.0 From d229b4c1242870a8b80942f149a4b81bd25b7c5e Mon Sep 17 00:00:00 2001 From: Lynne <[email protected]> Date: Thu, 4 Jun 
2026 18:49:35 +0900 Subject: [PATCH 4/4] lavu/tx: contiguous fast path for the AArch64 inverse MDCT pre-rotation An optimization that gives 20% speedup in return for a dozen more tail complete instructions. --- libavutil/aarch64/tx_float_init.c | 12 +++++-- libavutil/aarch64/tx_float_neon.S | 52 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/libavutil/aarch64/tx_float_init.c b/libavutil/aarch64/tx_float_init.c index 70be9e8483..f5b4f71ec9 100644 --- a/libavutil/aarch64/tx_float_init.c +++ b/libavutil/aarch64/tx_float_init.c @@ -149,7 +149,11 @@ static av_cold int mdct_inv_init(AVTXContext *s, const FFTXCodelet *cd, inv, scale))) return ret; - s->map = av_malloc((len >> 1)*sizeof(*s->map)); + /* The map holds the gather map (first half, used by the strided + * pre-rotation) followed by its inverse (second half): the contiguous + * stride==4 path reads the input in order and stores the complex built + * from contiguous input index j at output index s->map[len2 + j]. */ + s->map = av_malloc(len*sizeof(*s->map)); if (!s->map) return AVERROR(ENOMEM); @@ -158,7 +162,11 @@ static av_cold int mdct_inv_init(AVTXContext *s, const FFTXCodelet *cd, if ((ret = ff_tx_mdct_gen_exp_float(s, s->map))) return ret; - /* Pre-double the map indices (saves a shift in the hot path). */ + /* Invert the gather map for the contiguous path (before doubling). */ + for (int i = 0; i < (len >> 1); i++) + s->map[(len >> 1) + s->map[i]] = i; + + /* Pre-double the gather-map indices (saves a shift in the strided path). 
*/ for (int i = 0; i < (len >> 1); i++) s->map[i] <<= 1; diff --git a/libavutil/aarch64/tx_float_neon.S b/libavutil/aarch64/tx_float_neon.S index 0b24c355b8..f26eb7dff3 100644 --- a/libavutil/aarch64/tx_float_neon.S +++ b/libavutil/aarch64/tx_float_neon.S @@ -790,6 +790,12 @@ function ff_tx_mdct_inv_float_neon, export=1 madd x14, x5, x3, x2 // in2 = in + (N-1)*stride lsr w7, w21, #1 // len2 + cmp x3, #4 // contiguous input and + b.ne 6f // len2 % 4 == 0 takes the + tst w21, #7 // single-sweep path + b.eq 3f + +6: // pre-rotation via gather: z[i] = (in2[-k], in1[k])*exp[i], 2 cx/iter mov x13, x19 mov x15, x22 @@ -813,7 +819,53 @@ function ff_tx_mdct_inv_float_neon, export=1 st1 { v5.4s }, [x13], #16 subs w7, w7, #2 b.gt 1b + b 4f + // pre-rotation, contiguous: the input is effectively (im, re, im, ...) + // interleaved, so sweep it once from both ends with the planes + // ld2-split: low ims pair with high res (outputs p, p+1) and high ims + // with low res (outputs p'-1, p', p' = len2-1-p), scattered through + // the inverse map (2nd half of s->map) with the natural twiddles +3: + add x15, x22, x21, lsl #2 // exp + len2 (asc) + add x4, x4, x21, lsl #1 // map + len2 (asc) + sub x10, x14, #12 // &in[N-4] (desc) + add x12, x15, x21, lsl #2 + sub x12, x12, #16 // exp + (N-2) (desc) + add x13, x4, x21, lsl #1 + sub x13, x13, #8 // map + (N-2) (desc) +5: + ld2 { v6.2s, v7.2s }, [x2], #16 // (im_p, im_p1) (re_p', re_p'm1) + ld2 { v16.2s, v17.2s }, [x10] // (im_p'm1, im_p') (re_p1, re_p) + sub x10, x10, #16 + rev64 v17.2s, v17.2s // (re_p, re_p1) + ld2 { v1.2s, v2.2s }, [x15], #16 // A: er, ei + rev64 v7.2s, v7.2s // (re_p'm1, re_p') + ld2 { v3.2s, v4.2s }, [x12] // B: er, ei + sub x12, x12, #16 + fmul v5.2s, v17.2s, v1.2s // A: re*er + fmul v18.2s, v17.2s, v2.2s // A: re*ei + fmul v19.2s, v7.2s, v3.2s // B: re*er + fmul v20.2s, v7.2s, v4.2s // B: re*ei + fmls v5.2s, v6.2s, v2.2s // A: z.re plane + fmla v18.2s, v6.2s, v1.2s // A: z.im plane + fmls v19.2s, v16.2s, v4.2s // 
B: z.re plane + fmla v20.2s, v16.2s, v3.2s // B: z.im plane + ldp w16, w17, [x4], #8 // inv_map[p, p+1] + zip1 v0.2s, v5.2s, v18.2s // z_p + zip2 v1.2s, v5.2s, v18.2s // z_p1 + str d0, [x19, w16, uxtw #3] + str d1, [x19, w17, uxtw #3] + ldp w16, w17, [x13] // inv_map[p'-1, p'] + sub x13, x13, #8 + zip1 v2.2s, v19.2s, v20.2s // z_p'm1 + zip2 v3.2s, v19.2s, v20.2s // z_p' + str d2, [x19, w16, uxtw #3] + str d3, [x19, w17, uxtw #3] + subs w7, w7, #4 + b.gt 5b + +4: ldr x5, [x20, #40] // fn[0] ldr x0, [x20, #32] // sub[0] mov x1, x19 -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
