This is an automated email from the git hooks/post-receive script.
Git pushed a commit to branch master
in repository ffmpeg.
The following commit(s) were added to refs/heads/master by this push:
new 91ae6d10ab lavfi/nlmeans: add aarch64 neon for compute_weights_line
91ae6d10ab is described below
commit 91ae6d10abad2f74f6e5b8ec53bd63563c82110e
Author: Jun Zhao <[email protected]>
AuthorDate: Fri Jan 9 21:52:52 2026 +0800
Commit: Zhao Zhili <[email protected]>
CommitDate: Fri Jan 9 16:10:10 2026 +0000
lavfi/nlmeans: add aarch64 neon for compute_weights_line
Implement NEON optimization for compute_weights_line.
Also update the function signature to use ptrdiff_t for stack arguments
(max_meaningful_diff, startx, endx). This is done to unify the stack
layout between Apple platforms (which pack 32-bit stack arguments tightly)
and the generic AAPCS64 ABI (which requires 8-byte stack slots for 32-bit
arguments). Using ptrdiff_t ensures 8-byte slots are used on all AArch64
platforms, avoiding ABI mismatches with the assembly implementation.
The x86 AVX2 prototype is updated to match the new signature.
Performance benchmark (AArch64) on macOS (Apple M4):
./tests/checkasm/checkasm --test=vf_nlmeans --bench
compute_weights_line_c: 151.1 ( 1.00x)
compute_weights_line_neon: 62.6 ( 2.42x)
Reviewed-by: Martin Storsjö <[email protected]>
Signed-off-by: Jun Zhao <[email protected]>
---
libavfilter/aarch64/vf_nlmeans_init.c | 15 +++-
libavfilter/aarch64/vf_nlmeans_neon.S | 126 ++++++++++++++++++++++++++++++++++
libavfilter/vf_nlmeans.h | 4 +-
libavfilter/vf_nlmeans_init.h | 4 +-
libavfilter/x86/vf_nlmeans_init.c | 4 +-
tests/checkasm/vf_nlmeans.c | 78 +++++++++++++++++++++
6 files changed, 224 insertions(+), 7 deletions(-)
diff --git a/libavfilter/aarch64/vf_nlmeans_init.c
b/libavfilter/aarch64/vf_nlmeans_init.c
index 6793370a4a..fbee336322 100644
--- a/libavfilter/aarch64/vf_nlmeans_init.c
+++ b/libavfilter/aarch64/vf_nlmeans_init.c
@@ -25,10 +25,23 @@ void ff_compute_safe_ssd_integral_image_neon(uint32_t *dst,
ptrdiff_t dst_linesi
const uint8_t *s2, ptrdiff_t
linesize2,
int w, int h);
+void ff_compute_weights_line_neon(const uint32_t *const iia,
+ const uint32_t *const iib,
+ const uint32_t *const iid,
+ const uint32_t *const iie,
+ const uint8_t *const src,
+ float *total_weight,
+ float *sum,
+ const float *const weight_lut,
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx);
+
av_cold void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
- if (have_neon(cpu_flags))
+ if (have_neon(cpu_flags)) {
dsp->compute_safe_ssd_integral_image =
ff_compute_safe_ssd_integral_image_neon;
+ dsp->compute_weights_line = ff_compute_weights_line_neon;
+ }
}
diff --git a/libavfilter/aarch64/vf_nlmeans_neon.S
b/libavfilter/aarch64/vf_nlmeans_neon.S
index a788cffd85..fd8eca8f76 100644
--- a/libavfilter/aarch64/vf_nlmeans_neon.S
+++ b/libavfilter/aarch64/vf_nlmeans_neon.S
@@ -78,3 +78,129 @@ function ff_compute_safe_ssd_integral_image_neon, export=1
b.ne 1b
ret
endfunc
+
+function ff_compute_weights_line_neon, export=1
+ // Accumulates LUT weights for pixels x in [startx, endx). x0 = iia, x1 = iib, x2 = iid, x3 = iie
+ // x4 = src (uint8), x5 = total_weight (float, updated in place), x6 = sum (float, updated in place), x7 = weight_lut
+ // stack: [sp+0] = max_meaningful_diff, [sp+8] = startx, [sp+16] = endx (8-byte slots, only 32 bits of each are read)
+ // Per pixel: d = min(iie[x] - iid[x] - iib[x] + iia[x], max_meaningful_diff) (unsigned); w = weight_lut[d]; total_weight[x] += w; sum[x] += w * src[x]
+ ldr w13, [sp, #0] // max_meaningful_diff; NOTE(review): 32-bit load from an 8-byte slot reads the low half only on little-endian - confirm for big-endian builds
+ ldr w9, [sp, #8] // startx
+ ldr w10, [sp, #16] // endx
+
+ cmp w9, w10 // signed 32-bit comparison of startx and endx
+ b.ge 9f // if startx >= endx
return
+
+ // Offset pointers
+ lsl x11, x9, #2 // startx * 4 (for
uint32/float)
+ add x0, x0, x11 // iia += startx
+ add x1, x1, x11 // iib += startx
+ add x2, x2, x11 // iid += startx
+ add x3, x3, x11 // iie += startx
+ add x5, x5, x11 // total_weight +=
startx
+ add x6, x6, x11 // sum += startx
+
+ // src is uint8, so offset is just startx
+ add x4, x4, x9 // src += startx
+
+ dup v7.4s, w13 // v7 =
max_meaningful_diff (for vector ops)
+
+ sub w10, w10, w9 // count = endx -
startx
+
+1: // Main loop: processes 4 pixels per iteration
+ cmp w10, #4 // fewer than 4 pixels left?
+ b.lt 2f // Handle leftovers
+
+ // Load integral image values
+ ld1 {v0.4s}, [x0], #16 // iia
+ ld1 {v1.4s}, [x1], #16 // iib
+ ld1 {v2.4s}, [x2], #16 // iid
+ ld1 {v3.4s}, [x3], #16 // iie
+
+ // diff = a - b + e - d = e - d - b + a
+ sub v0.4s, v0.4s, v1.4s // v0 = a - b
+ sub v3.4s, v3.4s, v2.4s // v3 = e - d
+ add v3.4s, v3.4s, v0.4s // v3 = diff (a - b +
e - d)
+
+ // min(diff, max)
+ umin v3.4s, v3.4s, v7.4s // unsigned min, so each LUT index is at most max_meaningful_diff
+
+ // Schedule independent loads early
+ ld1 {v0.4s}, [x5] // v0 = total_weight
+ ld1 {v1.s}[0], [x4], #4 // v1 = src pixels
(low 4 bytes)
+ ld1 {v2.4s}, [x6] // v2 = sum
+
+ // Move to scalar registers to address lut
+ mov w8, v3.s[0] // index for pixel 0
+ mov w9, v3.s[1] // index for pixel 1 (w9 no longer holds startx; it was last needed for the count above)
+ mov w11, v3.s[2] // index for pixel 2
+ mov w12, v3.s[3] // index for pixel 3
+
+ // Load 4 float weights using scalar registers
+ // Interleave with src conversion to hide latency
+ ldr s3, [x7, w8, uxtw #2] // w0 -> v3.s[0] (v3
is now free)
+ ldr s4, [x7, w9, uxtw #2] // w1 -> v4.s[0]
+ ldr s5, [x7, w11, uxtw #2] // w2 -> v5.s[0]
+ ldr s6, [x7, w12, uxtw #2] // w3 -> v6.s[0]
+
+ // Convert src pixels to float (independent of weights)
+ uxtl v1.8h, v1.8b // widen u8 -> u16
+ uxtl v1.4s, v1.4h // widen u16 -> u32
+ ucvtf v1.4s, v1.4s // u32 -> float
+
+ // Merge weights into v3.4s
+ trn1 v3.2s, v3.2s, v4.2s // v3 = [w0, w1, ?, ?]
+ trn1 v5.2s, v5.2s, v6.2s // v5 = [w2, w3, ?, ?]
+ trn1 v3.2d, v3.2d, v5.2d // v3 = [w0, w1, w2,
w3]
+
+ // Update total_weight and sum
+ fadd v0.4s, v0.4s, v3.4s // total_weight +=
weight
+ fmla v2.4s, v1.4s, v3.4s // sum += src * weight
+
+ // Store back
+ st1 {v0.4s}, [x5], #16 // write total_weight, advance by 4 floats
+ st1 {v2.4s}, [x6], #16 // write sum, advance by 4 floats
+
+ sub w10, w10, #4 // count -= 4
+ b 1b
+
+2: // Leftovers: one pixel per iteration
+ cmp w10, #0 // stop once no pixels remain
+ b.le 9f
+
+ // Single pixel handling
+ ldr w8, [x0], #4 // iia (reuse w8)
+ ldr w9, [x1], #4 // iib (reuse w9)
+ ldr w11, [x2], #4 // iid (reuse w11)
+ ldr w12, [x3], #4 // iie (reuse w12)
+
+ sub w12, w12, w11 // e - d
+ sub w12, w12, w9 // e - d - b
+ add w12, w12, w8 // diff = e - d - b + a
+
+ // min (unsigned comparison) - use preloaded w13
+ cmp w12, w13
+ csel w12, w12, w13, ls // unsigned lower or
same
+
+ // Load weight
+ ldr s0, [x7, w12, uxtw #2]
+
+ // Load src
+ ldrb w8, [x4], #1 // src (reuse w8)
+ ucvtf s1, w8 // src pixel as float
+
+ // Load acc
+ ldr s2, [x5] // current total_weight value
+ ldr s3, [x6] // current sum value
+
+ fadd s2, s2, s0 // total_weight += weight
+ fmadd s3, s1, s0, s3 // sum += src * weight
+
+ str s2, [x5], #4 // write total_weight, advance by 1 float
+ str s3, [x6], #4 // write sum, advance by 1 float
+
+ sub w10, w10, #1 // count -= 1
+ b 2b
+
+9: ret
+endfunc
diff --git a/libavfilter/vf_nlmeans.h b/libavfilter/vf_nlmeans.h
index 61377f8c69..4d6ab47f54 100644
--- a/libavfilter/vf_nlmeans.h
+++ b/libavfilter/vf_nlmeans.h
@@ -35,8 +35,8 @@ typedef struct NLMeansDSPContext {
float *total_weight,
float *sum,
const float *const weight_lut,
- int max_meaningful_diff,
- int startx, int endx);
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx);
} NLMeansDSPContext;
void ff_nlmeans_init_aarch64(NLMeansDSPContext *dsp);
diff --git a/libavfilter/vf_nlmeans_init.h b/libavfilter/vf_nlmeans_init.h
index cf31e74bd7..58ba0fdd83 100644
--- a/libavfilter/vf_nlmeans_init.h
+++ b/libavfilter/vf_nlmeans_init.h
@@ -79,8 +79,8 @@ static void compute_weights_line_c(const uint32_t *const iia,
float *total_weight,
float *sum,
const float *const weight_lut,
- int max_meaningful_diff,
- int startx, int endx)
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx)
{
for (int x = startx; x < endx; x++) {
/*
diff --git a/libavfilter/x86/vf_nlmeans_init.c
b/libavfilter/x86/vf_nlmeans_init.c
index 5d67090a98..0adb2c7e8a 100644
--- a/libavfilter/x86/vf_nlmeans_init.c
+++ b/libavfilter/x86/vf_nlmeans_init.c
@@ -28,8 +28,8 @@ void ff_compute_weights_line_avx2(const uint32_t *const iia,
float *total_weight,
float *sum,
const float *const weight_lut,
- int max_meaningful_diff,
- int startx, int endx);
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx);
av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp)
{
diff --git a/tests/checkasm/vf_nlmeans.c b/tests/checkasm/vf_nlmeans.c
index e61a2efae6..26bd2d5890 100644
--- a/tests/checkasm/vf_nlmeans.c
+++ b/tests/checkasm/vf_nlmeans.c
@@ -18,10 +18,12 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <math.h>
#include "checkasm.h"
#include "libavfilter/vf_nlmeans_init.h"
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
#define randomize_buffer(buf, size) do { \
int i; \
@@ -110,5 +112,81 @@ void checkasm_check_nlmeans(void)
av_freep(&src);
}
+ if (check_func(dsp.compute_weights_line, "compute_weights_line")) {
+#define TEST_W 256
+#define MAX_MEANINGFUL_DIFF 255
+ const int startx = 10;
+ const int endx = 200;
+
+ // Allocate aligned buffers on stack
+ LOCAL_ALIGNED_32(uint32_t, iia, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(uint32_t, iib, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(uint32_t, iid, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(uint32_t, iie, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(uint8_t, src, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(float, tw_ref, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(float, tw_new, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(float, sum_ref, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(float, sum_new, [TEST_W + 16]);
+ LOCAL_ALIGNED_32(float, lut, [MAX_MEANINGFUL_DIFF + 1]);
+
+ declare_func(void, const uint32_t *const iia,
+ const uint32_t *const iib,
+ const uint32_t *const iid,
+ const uint32_t *const iie,
+ const uint8_t *const src,
+ float *total_weight,
+ float *sum,
+ const float *const weight_lut,
+ ptrdiff_t max_meaningful_diff,
+ ptrdiff_t startx, ptrdiff_t endx);
+
+ // Initialize LUT: weight = exp(-diff * scale)
+ // Using scale = 0.01 for testing
+ for (int i = 0; i <= MAX_MEANINGFUL_DIFF; i++)
+ lut[i] = expf(-i * 0.01f);
+
+ // Initialize source pixels
+ for (int i = 0; i < TEST_W; i++)
+ src[i] = rnd() & 0xff;
+
+ // Initialize integral images
+ // We need to ensure diff = e - d - b + a is non-negative and within
range
+ // Set up as if computing real integral image values
+ for (int i = 0; i < TEST_W; i++) {
+ uint32_t base = rnd() % 1000;
+ iia[i] = base;
+ iib[i] = base + (rnd() % 100);
+ iid[i] = base + (rnd() % 100);
+ // e = a + (b - a) + (d - a) + diff
+ // So diff = e - d - b + a will be in range [0,
max_meaningful_diff]
+ uint32_t diff = rnd() % (MAX_MEANINGFUL_DIFF + 1);
+ iie[i] = iia[i] + (iib[i] - iia[i]) + (iid[i] - iia[i]) + diff;
+ }
+
+ // Clear output buffers
+ memset(tw_ref, 0, (TEST_W + 16) * sizeof(float));
+ memset(tw_new, 0, (TEST_W + 16) * sizeof(float));
+ memset(sum_ref, 0, (TEST_W + 16) * sizeof(float));
+ memset(sum_new, 0, (TEST_W + 16) * sizeof(float));
+
+ call_ref(iia, iib, iid, iie, src, tw_ref, sum_ref, lut,
+ MAX_MEANINGFUL_DIFF, startx, endx);
+ call_new(iia, iib, iid, iie, src, tw_new, sum_new, lut,
+ MAX_MEANINGFUL_DIFF, startx, endx);
+
+ // Compare results with small tolerance for floating point
+ if (!float_near_abs_eps_array(tw_ref + startx, tw_new + startx, 1e-5f,
endx - startx))
+ fail();
+ if (!float_near_abs_eps_array(sum_ref + startx, sum_new + startx,
1e-4f, endx - startx))
+ fail();
+
+ // Benchmark
+ memset(tw_new, 0, (TEST_W + 16) * sizeof(float));
+ memset(sum_new, 0, (TEST_W + 16) * sizeof(float));
+ bench_new(iia, iib, iid, iie, src, tw_new, sum_new, lut,
+ MAX_MEANINGFUL_DIFF, startx, endx);
+ }
+
report("dsp");
}
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]