[FFmpeg-devel] [PR] avfilter/aarch64: NEON threshold filter (PR #23448)

DROO AMOR via ffmpeg-devel Wed, 10 Jun 2026 15:48:20 -0700

PR #23448 opened by DROO AMOR (DROOdotFOO)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23448
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23448.patch


NEON paths for `vf_threshold`. depth==8 routes to threshold8_neon (16 
bytes/iter); depth > 8 to threshold16_neon (8 shorts/iter, covering 
9/10/12/14/16). 

C is not auto-vectorized because the four independent input streams (in, 
threshold, min, max) defeat clang's vectorizer. 
Per chunk the NEON kernel is two instructions: cmhs builds the in <= threshold 
mask, bsl selects min or max.

Test Name                    M1-clang
-----------------------------------
threshold8_neon         26.7 (12.84x)
threshold10_neon        26.9 ( 6.89x)
threshold12_neon        26.7 ( 6.99x)
threshold16_neon        26.7 ( 7.03x)

Tested on Apple M1 (clang, -O3):
- checkasm --test=vf_threshold across 5 seeds {1, 42, 999, 314159, 271828}: all 
4 depths pass each run. The test iterates widths 1..w-1 with 0xAA-sentinel 
output buffers, so both correctness in [0, w*step) and over-writes past w*step 
are checked. Multi-row + distinct-linesizes phases cover the per-pointer 
row-advance cascade.
- full checkasm: 7845/7845 pass
- fate-checkasm-vf_threshold: passes
- Callee-saved SIMD registers v8-v15 are not touched; only x19/x20 (callee-saved 
per AAPCS-64) are spilled, as scalar-tail scratch, and restored by ldp before ret.

-- 

Follow-up (separate patch): ff_threshold_init_x86 currently gates the 16-bit 
SSE4/AVX2 paths on s->depth == 16; depths 9/10/12/14 fall through to scalar C 
even though the kernel (pminuw + pcmpeqw + PBLENDVB) is correct for any 
unsigned 16-bit value.  One-line dispatch fix to be sent after this lands.


From ea3e2ce6242c73a471f8a23d5eb52c9638fd2827 Mon Sep 17 00:00:00 2001
From: DROOdotFOO <[email protected]>
Date: Sat, 30 May 2026 00:14:40 +0200
Subject: [PATCH] avfilter/aarch64: NEON threshold filter

depth==8 routes to threshold8_neon (16 bytes/iter); depth > 8 to
threshold16_neon (8 shorts/iter, covering 9/10/12/14/16). C is not
auto-vectorized: four independent input streams defeat clang's
vectorizer. Per chunk the NEON kernel is cmhs+bsl.

Test Name              M1-clang
-------------------------------
threshold8_neon         26.7 (12.84x)
threshold10_neon        26.9 ( 6.89x)
threshold12_neon        26.7 ( 6.99x)
threshold16_neon        26.7 ( 7.03x)

Signed-off-by: DROOdotFOO <[email protected]>
---
 libavfilter/aarch64/Makefile            |   2 +
 libavfilter/aarch64/vf_threshold_init.c |  51 ++++++++++
 libavfilter/aarch64/vf_threshold_neon.S | 119 ++++++++++++++++++++++++
 libavfilter/threshold.h                 |   1 +
 libavfilter/vf_threshold_init.h         |   2 +
 tests/checkasm/vf_threshold.c           |  82 ++++++++++------
 6 files changed, 231 insertions(+), 26 deletions(-)
 create mode 100644 libavfilter/aarch64/vf_threshold_init.c
 create mode 100644 libavfilter/aarch64/vf_threshold_neon.S

diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
index c7b7e18467..d09f7e4080 100644
--- a/libavfilter/aarch64/Makefile
+++ b/libavfilter/aarch64/Makefile
@@ -1,7 +1,9 @@
 OBJS-$(CONFIG_BWDIF_FILTER)                  += aarch64/vf_bwdif_init_aarch64.o
 OBJS-$(CONFIG_COLORDETECT_FILTER)            += aarch64/vf_colordetect_init.o
 OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
+OBJS-$(CONFIG_THRESHOLD_FILTER)              += aarch64/vf_threshold_init.o
 
 NEON-OBJS-$(CONFIG_BWDIF_FILTER)             += aarch64/vf_bwdif_neon.o
 NEON-OBJS-$(CONFIG_COLORDETECT_FILTER)       += aarch64/vf_colordetect_neon.o
 NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
+NEON-OBJS-$(CONFIG_THRESHOLD_FILTER)         += aarch64/vf_threshold_neon.o
diff --git a/libavfilter/aarch64/vf_threshold_init.c b/libavfilter/aarch64/vf_threshold_init.c
new file mode 100644
index 0000000000..9b14d644a9
--- /dev/null
+++ b/libavfilter/aarch64/vf_threshold_init.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2026 DROOdotFOO <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavfilter/threshold.h"
+
+void ff_threshold8_neon(const uint8_t *in, const uint8_t *threshold,
+                        const uint8_t *min, const uint8_t *max,
+                        uint8_t *out,
+                        ptrdiff_t ilinesize, ptrdiff_t tlinesize,
+                        ptrdiff_t flinesize, ptrdiff_t slinesize,
+                        ptrdiff_t olinesize,
+                        int w, int h);
+
+void ff_threshold16_neon(const uint8_t *in, const uint8_t *threshold,
+                         const uint8_t *min, const uint8_t *max,
+                         uint8_t *out,
+                         ptrdiff_t ilinesize, ptrdiff_t tlinesize,
+                         ptrdiff_t flinesize, ptrdiff_t slinesize,
+                         ptrdiff_t olinesize,
+                         int w, int h);
+
+av_cold void ff_threshold_init_aarch64(ThresholdContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {     /* otherwise s->threshold is left as is */
+        if (s->depth == 8)
+            s->threshold = ff_threshold8_neon;
+        else                        /* any other depth: 16-bit element kernel */
+            s->threshold = ff_threshold16_neon;
+    }
+}
diff --git a/libavfilter/aarch64/vf_threshold_neon.S b/libavfilter/aarch64/vf_threshold_neon.S
new file mode 100644
index 0000000000..6d76d7c5e9
--- /dev/null
+++ b/libavfilter/aarch64/vf_threshold_neon.S
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2026 DROOdotFOO <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+// Stack-arg slot size for an int. Apple packs ints to 4 bytes on the stack;
+// standard AAPCS-64 pads to 8.
+#ifdef __APPLE__
+.set    SP_INT, 4
+#else
+.set    SP_INT, 8
+#endif
+
+// THRESHOLD <bits> <vecsz> <shape> <ldop> <stop> <step>
+//   vecsz: elements per q-reg; ldop/stop: tail load/store; step: element bytes
+// Generates:
+//   void ff_threshold<bits>_neon(const uint8_t *in, const uint8_t *threshold,
+//                                const uint8_t *min, const uint8_t *max,
+//                                uint8_t *out,
+//                                ptrdiff_t ilinesize, ptrdiff_t tlinesize,
+//                                ptrdiff_t flinesize, ptrdiff_t slinesize,
+//                                ptrdiff_t olinesize, int w, int h);
+//
+// Per-pixel: out[x] = in[x] <= threshold[x] ? min[x] : max[x].
+//
+// Stack layout after the x19/x20 spill:
+//   [sp +  0]: saved x19, x20
+//   [sp + 16]: slinesize       (ptrdiff_t, 8 bytes)
+//   [sp + 24]: olinesize       (ptrdiff_t, 8 bytes)
+//   [sp + 32]: w               (int; 4 or 8 byte slot per SP_INT)
+//   [sp + 32 + SP_INT]: h      (int)
+//
+// Register allocation:
+//   x0..x4 : row bases (in, threshold, min, max, out), advanced once per row
+//   x5..x7 : ilinesize, tlinesize, flinesize - caller-passed, kept
+//   x8, x9 : slinesize, olinesize (loaded from stack)
+//   w10, w11 : w, h (loaded from stack); w11 doubles as the row counter
+//   x12..x16: per-row working pointers (post-incremented in the row body)
+//   w17    : column counter
+//   w19, w20: scalar-tail scratch (x0-x17 all hold row state and x18 is left
+//             alone, so the tail borrows two callee-saved GPRs).
+//   v0..v4 : load buffers + result
+.macro THRESHOLD bits, vecsz, shape, ldop, stop, step
+function ff_threshold\bits\()_neon, export=1
+        stp             x19, x20, [sp, #-16]!
+        ldr             x8,  [sp, #16]              // slinesize
+        ldr             x9,  [sp, #24]              // olinesize
+        ldr             w10, [sp, #32]              // w
+        ldr             w11, [sp, #32+SP_INT]       // h
+        cmp             w11, #0
+        b.le            9f                          // h <= 0: nothing to do
+        cmp             w10, #0
+        b.le            9f                          // w <= 0: nothing to do
+1:      // row_loop: reload the working pointers from the row bases
+        mov             x12, x0                     // in
+        mov             x13, x1                     // threshold
+        mov             x14, x2                     // min
+        mov             x15, x3                     // max
+        mov             x16, x4                     // out
+        mov             w17, w10                    // elements left in this row
+2:      // vec_loop: one 16-byte q-reg per stream per iteration
+        cmp             w17, #\vecsz
+        b.lt            3f
+        ld1             {v0.16b}, [x12], #16
+        ld1             {v1.16b}, [x13], #16
+        ld1             {v2.16b}, [x14], #16
+        ld1             {v3.16b}, [x15], #16
+        cmhs            v4.\shape, v1.\shape, v0.\shape   // thr >= in (= in <= thr)
+        bsl             v4.16b, v2.16b, v3.16b            // mask ? min : max
+        st1             {v4.16b}, [x16], #16
+        sub             w17, w17, #\vecsz
+        b               2b
+3:      // tail: fewer elements than one full vector remain
+        cbz             w17, 5f
+4:      // scalar_loop: one element per iteration
+        // cmp below sets NZCV; the two ldrb/ldrh loads after it do not touch
+        // flags, so the csel reads the cmp's result.
+        \ldop           w19, [x12], #\step
+        \ldop           w20, [x13], #\step
+        cmp             w19, w20                    // in vs threshold
+        \ldop           w19, [x14], #\step          // overwrites in  -> min
+        \ldop           w20, [x15], #\step          // overwrites thr -> max
+        csel            w19, w19, w20, ls           // ls = unsigned in <= thr
+        \stop           w19, [x16], #\step
+        subs            w17, w17, #1
+        b.gt            4b
+5:      // end_row:
+        add             x0, x0, x5                  // in        += ilinesize
+        add             x1, x1, x6                  // threshold += tlinesize
+        add             x2, x2, x7                  // min       += flinesize
+        add             x3, x3, x8                  // max       += slinesize
+        add             x4, x4, x9                  // out       += olinesize
+        subs            w11, w11, #1
+        b.gt            1b
+9:
+        ldp             x19, x20, [sp], #16
+        ret
+endfunc
+.endm
+
+THRESHOLD 8,  16, 16b, ldrb, strb, 1
+THRESHOLD 16, 8,  8h,  ldrh, strh, 2
diff --git a/libavfilter/threshold.h b/libavfilter/threshold.h
index 8b55ad6ba1..1efdc9d5e9 100644
--- a/libavfilter/threshold.h
+++ b/libavfilter/threshold.h
@@ -47,5 +47,6 @@ typedef struct ThresholdContext {
 } ThresholdContext;
 
 void ff_threshold_init_x86(ThresholdContext *s);
+void ff_threshold_init_aarch64(ThresholdContext *s);
 
 #endif /* AVFILTER_THRESHOLD_H */
diff --git a/libavfilter/vf_threshold_init.h b/libavfilter/vf_threshold_init.h
index fb319c6cf8..87e2ef20f9 100644
--- a/libavfilter/vf_threshold_init.h
+++ b/libavfilter/vf_threshold_init.h
@@ -86,6 +86,8 @@ av_unused static void ff_threshold_init(ThresholdContext *s)
 
 #if ARCH_X86 && HAVE_X86ASM
     ff_threshold_init_x86(s);
+#elif ARCH_AARCH64
+    ff_threshold_init_aarch64(s);
 #endif
 }
 
diff --git a/tests/checkasm/vf_threshold.c b/tests/checkasm/vf_threshold.c
index e6a425edfe..36fdda167c 100644
--- a/tests/checkasm/vf_threshold.c
+++ b/tests/checkasm/vf_threshold.c
@@ -22,8 +22,11 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
 
-#define WIDTH 256
-#define WIDTH_PADDED 256 + 32
+#define WIDTH        256
+#define HEIGHT       3
+#define WIDTH_PADDED (WIDTH + 32)
+#define LINESIZE_MAX (WIDTH_PADDED + 32)
+#define BUF_SIZE     (LINESIZE_MAX * HEIGHT)
 
 #define randomize_buffers(buf, size)     \
     do {                                 \
@@ -34,14 +37,15 @@
     } while (0)
 
 static void check_threshold(int depth){
-    LOCAL_ALIGNED_32(uint8_t, in       , [WIDTH_PADDED]);
-    LOCAL_ALIGNED_32(uint8_t, threshold, [WIDTH_PADDED]);
-    LOCAL_ALIGNED_32(uint8_t, min      , [WIDTH_PADDED]);
-    LOCAL_ALIGNED_32(uint8_t, max      , [WIDTH_PADDED]);
-    LOCAL_ALIGNED_32(uint8_t, out_ref  , [WIDTH_PADDED]);
-    LOCAL_ALIGNED_32(uint8_t, out_new  , [WIDTH_PADDED]);
+    LOCAL_ALIGNED_32(uint8_t, in       , [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, threshold, [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, min      , [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, max      , [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, out_ref  , [BUF_SIZE]);
+    LOCAL_ALIGNED_32(uint8_t, out_new  , [BUF_SIZE]);
     ptrdiff_t line_size = WIDTH_PADDED;
-    int w = WIDTH;
+    int step = depth > 8 ? 2 : 1;
+    int w = WIDTH / step;
 
     declare_func(void, const uint8_t *in, const uint8_t *threshold,
                  const uint8_t *min, const uint8_t *max, uint8_t *out,
@@ -53,26 +57,46 @@ static void check_threshold(int depth){
     s.depth = depth;
     ff_threshold_init(&s);
 
-    memset(in,     0, WIDTH_PADDED);
-    memset(threshold, 0, WIDTH_PADDED);
-    memset(min, 0, WIDTH_PADDED);
-    memset(max, 0, WIDTH_PADDED);
-    memset(out_ref, 0, WIDTH_PADDED);
-    memset(out_new, 0, WIDTH_PADDED);
-    randomize_buffers(in, WIDTH);
-    randomize_buffers(threshold, WIDTH);
-    randomize_buffers(min, WIDTH);
-    randomize_buffers(max, WIDTH);
-
-    if (depth == 16)
-        w /= 2;
+    memset(in,        0, BUF_SIZE);
+    memset(threshold, 0, BUF_SIZE);
+    memset(min,       0, BUF_SIZE);
+    memset(max,       0, BUF_SIZE);
+    randomize_buffers(in,        BUF_SIZE);
+    randomize_buffers(threshold, BUF_SIZE);
+    randomize_buffers(min,       BUF_SIZE);
+    randomize_buffers(max,       BUF_SIZE);
 
     if (check_func(s.threshold, "threshold%d", depth)) {
-        call_ref(in, threshold, min, max, out_ref, line_size, line_size, line_size, line_size, line_size, w, 1);
-        call_new(in, threshold, min, max, out_new, line_size, line_size, line_size, line_size, line_size, w, 1);
-        if (memcmp(out_ref, out_new, WIDTH))
+        for (int i = 1; i < w; i++) {
+            memset(out_ref, 0xAA, BUF_SIZE);
+            memset(out_new, 0xAA, BUF_SIZE);
+            call_ref(in, threshold, min, max, out_ref,
+                     line_size, line_size, line_size, line_size, line_size, i, 1);
+            call_new(in, threshold, min, max, out_new,
+                     line_size, line_size, line_size, line_size, line_size, i, 1);
+            if (memcmp(out_ref, out_new, BUF_SIZE))
+                fail();
+        }
+        memset(out_ref, 0xAA, BUF_SIZE);
+        memset(out_new, 0xAA, BUF_SIZE);
+        call_ref(in, threshold, min, max, out_ref,
+                 line_size, line_size, line_size, line_size, line_size, w, HEIGHT);
+        call_new(in, threshold, min, max, out_new,
+                 line_size, line_size, line_size, line_size, line_size, w, HEIGHT);
+        if (memcmp(out_ref, out_new, BUF_SIZE))
             fail();
-        bench_new(in, threshold, min, max, out_new, line_size, line_size, line_size, line_size, line_size, w, 1);
+        memset(out_ref, 0xAA, BUF_SIZE);
+        memset(out_new, 0xAA, BUF_SIZE);
+        call_ref(in, threshold, min, max, out_ref,
+                 line_size + 0,  line_size + 8,  line_size + 16,
+                 line_size + 24, line_size + 32, w, HEIGHT);
+        call_new(in, threshold, min, max, out_new,
+                 line_size + 0,  line_size + 8,  line_size + 16,
+                 line_size + 24, line_size + 32, w, HEIGHT);
+        if (memcmp(out_ref, out_new, BUF_SIZE))
+            fail();
+        bench_new(in, threshold, min, max, out_new,
+                  line_size, line_size, line_size, line_size, line_size, w, 1);
     }
 }
 
@@ -81,6 +105,12 @@ void checkasm_check_vf_threshold(void)
     check_threshold(8);
     report("threshold8");
 
+    check_threshold(10);
+    report("threshold10");
+
+    check_threshold(12);
+    report("threshold12");
+
     check_threshold(16);
     report("threshold16");
 }
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PR] avfilter/aarch64: NEON threshold filter (PR #23448)

Reply via email to