aarch64: add pngdsp

Zhao Zhili via ffmpeg-cvslog Tue, 03 Feb 2026 22:41:52 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit f54841d3751864357934f644232c7fb176923867
Author:     Zhao Zhili <[email protected]>
AuthorDate: Mon Jan 5 12:24:45 2026 +0800
Commit:     Zhao Zhili <[email protected]>
CommitDate: Wed Feb 4 12:05:35 2026 +0800

    avcodec/aarch64: add pngdsp
    
    Test Name                    A55-gcc-11        M1-clang           A76-gcc-12          A510-clang        X3-clang
    -------------------------------------------------------------------------------------------------------------------
    add_bytes_l2_4096_neon        1807.2 ( 2.01x)    1.6 ( 1.94x)    333.0 ( 6.35x)   1058.2 ( 2.34x)    214.3 ( 1.99x)
    add_paeth_prediction_3_neon  33036.1 ( 2.41x)  145.1 ( 1.66x)  20443.3 ( 1.97x)  35225.1 ( 1.23x)  19420.8 ( 1.05x)
    add_paeth_prediction_4_neon  24368.6 ( 3.26x)  106.7 ( 2.01x)  15163.8 ( 2.77x)  26454.7 ( 1.62x)  14319.0 ( 1.35x)
    add_paeth_prediction_6_neon  17900.6 ( 4.44x)   72.0 ( 2.70x)  10214.3 ( 4.20x)  18296.9 ( 2.27x)   9693.1 ( 1.97x)
    add_paeth_prediction_8_neon  12615.4 ( 6.31x)   54.1 ( 2.58x)   7706.0 ( 5.45x)  13733.3 ( 2.94x)   7272.6 ( 2.63x)
    
    Signed-off-by: Zhao Zhili <[email protected]>
---
 libavcodec/aarch64/Makefile               |   2 +
 libavcodec/{x86 => aarch64}/pngdsp_init.c |  24 +--
 libavcodec/aarch64/pngdsp_neon.S          | 289 ++++++++++++++++++++++++++++++
 libavcodec/pngdsp.c                       |   4 +-
 libavcodec/pngdsp.h                       |   1 +
 5 files changed, 307 insertions(+), 13 deletions(-)

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 1e838ad901..e3abdbfd72 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
 OBJS-$(CONFIG_MPEGVIDEOENCDSP)          += aarch64/mpegvideoencdsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
+OBJS-$(CONFIG_PNG_DECODER)              += aarch64/pngdsp_init.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
 OBJS-$(CONFIG_VP8DSP)                   += aarch64/vp8dsp_init_aarch64.o
 
@@ -56,6 +57,7 @@ NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_MPEGVIDEOENCDSP)     += aarch64/mpegvideoencdsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
+NEON-OBJS-$(CONFIG_PNG_DECODER)         += aarch64/pngdsp_neon.o
 NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
 
diff --git a/libavcodec/x86/pngdsp_init.c b/libavcodec/aarch64/pngdsp_init.c
similarity index 62%
copy from libavcodec/x86/pngdsp_init.c
copy to libavcodec/aarch64/pngdsp_init.c
index 5121140dcc..e88814fadb 100644
--- a/libavcodec/x86/pngdsp_init.c
+++ b/libavcodec/aarch64/pngdsp_init.c
@@ -1,6 +1,5 @@
 /*
- * x86 PNG optimizations.
- * Copyright (c) 2008 Loren Merrit <[email protected]>
+ * Copyright (c) 2026 Zhao Zhili <[email protected]>
  *
  * This file is part of FFmpeg.
  *
@@ -20,21 +19,22 @@
  */
 
 #include <stdint.h>
+
 #include "libavutil/attributes.h"
-#include "libavutil/x86/cpu.h"
+#include "libavutil/aarch64/cpu.h"
 #include "libavcodec/pngdsp.h"
 
-void ff_png_add_paeth_prediction_ssse3(uint8_t *dst, const uint8_t *src,
-                                       const uint8_t *top, int w, int bpp);
-void ff_add_bytes_l2_sse2(uint8_t *dst, const uint8_t *src1,
-                          const uint8_t *src2, int w);
+void ff_png_add_bytes_l2_neon(uint8_t *dst, const uint8_t *src1,
+                              const uint8_t *src2, int w);
+void ff_png_add_paeth_prediction_neon(uint8_t *dst, const uint8_t *src,
+                                      const uint8_t *top, int w, int bpp);
 
-av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
+av_cold void ff_pngdsp_init_aarch64(PNGDSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_SSE2(cpu_flags))
-        dsp->add_bytes_l2         = ff_add_bytes_l2_sse2;
-    if (EXTERNAL_SSSE3(cpu_flags))
-        dsp->add_paeth_prediction = ff_png_add_paeth_prediction_ssse3;
+    if (have_neon(cpu_flags)) {
+        dsp->add_bytes_l2         = ff_png_add_bytes_l2_neon;
+        dsp->add_paeth_prediction = ff_png_add_paeth_prediction_neon;
+    }
 }
diff --git a/libavcodec/aarch64/pngdsp_neon.S b/libavcodec/aarch64/pngdsp_neon.S
new file mode 100644
index 0000000000..20ae602a8a
--- /dev/null
+++ b/libavcodec/aarch64/pngdsp_neon.S
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2026 Zhao Zhili <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+/* void ff_png_add_bytes_l2_neon(uint8_t *dst, const uint8_t *src1,
+ *                               const uint8_t *src2, int w);
+ * x0: dst
+ * x1: src1
+ * x2: src2
+ * w3: w
+ */
+function ff_png_add_bytes_l2_neon, export=1
+        bic             w4, w3, #63
+        and             w3, w3, #63
+        cbz             w4, 2f
+1:
+        // 64 bytes per loop iteration
+        ldp             q0, q1, [x1]
+        ldp             q2, q3, [x1, #32]
+        subs            w4, w4, #64
+        ldp             q4, q5, [x2]
+        ldp             q6, q7, [x2, #32]
+        add             x1, x1, #64
+        add             v0.16b, v0.16b, v4.16b
+        add             v1.16b, v1.16b, v5.16b
+        add             x2, x2, #64
+        add             v2.16b, v2.16b, v6.16b
+        add             v3.16b, v3.16b, v7.16b
+        st1             {v0.16b - v3.16b}, [x0], #64
+        b.ne            1b
+2:
+        bic             w4, w3, #15
+        and             w3, w3, #15
+        cbz             w4, 4f
+3:
+        // 16 bytes per loop iteration
+        ld1             {v0.16b}, [x1], #16
+        ld1             {v4.16b}, [x2], #16
+        subs            w4, w4, #16
+        add             v0.16b, v0.16b, v4.16b
+        st1             {v0.16b}, [x0], #16
+        b.ne            3b
+4:
+        cbz             w3, 6f
+5:
+        ldrb            w5, [x1], #1
+        ldrb            w6, [x2], #1
+        subs            w3, w3, #1
+        add             w5, w5, w6
+        strb            w5, [x0], #1
+        b.ne            5b
+6:
+        ret
+endfunc
+
+/* This is an iterative process where dst[n] depends on dst[n-bpp], so
+ * add_paeth_prediction can only process bpp bytes each time.
+ *
+ * There are three states:
+ * 1. load: load data from memory
+ * 2. shift: simple shift from previous iteration
+ * 3. extract: extract data from registers that were loaded in state 1.
+ *      Data is assembled by the caller.
+ */
+.macro add_paeth_prediction, bpp, state
+        // load data from memory
+.ifc \state,load
+        ld1             {v18.16b, v19.16b, v20.16b}, [x2], x7
+        ld1             {v21.16b, v22.16b, v23.16b}, [x1], x7
+        mov             v1.8b, v18.8b                           // c = top[i - bpp]
+        ext             v2.16b, v18.16b, v19.16b, #\bpp         // b = top[i]
+        mov             v17.16b, v21.16b                        // src
+.endif
+        // simple shift from previous iteration
+.ifc \state,shift
+        mov             v1.8b, v2.8b
+        ext             v2.16b, v2.16b, v2.16b, #(\bpp)
+        ext             v17.16b, v17.16b, v17.16b, #(\bpp)
+.endif
+        // Only the first bpp bytes are useful.
+        uabd            v4.8b, v2.8b, v1.8b         // pa = abs(b - c)
+        uaddl           v7.8h, v1.8b, v1.8b         // 2 * c
+        uabd            v3.8b, v0.8b, v1.8b         // pb = abs(a - c)
+        uaddl           v5.8h, v0.8b, v2.8b         // a + b
+
+        cmhs            v16.8b, v3.8b, v4.8b        // pb >= pa
+        uabd            v5.8h, v5.8h, v7.8h
+        umin            v6.8b, v4.8b, v3.8b         // min(pa, pb)
+        uqxtn           v5.8b, v5.8h
+
+        bsl             v16.8b, v0.8b, v2.8b        // pb >= pa ? a : b
+        cmhs            v6.8b, v5.8b, v6.8b         // pc >= min(pa, pb)
+        bsl             v6.8b, v16.8b, v1.8b        // pc >= min ? (a or b) : c
+
+        add             v0.8b, v6.8b, v17.8b
+.if \bpp == 3 || \bpp == 4
+        str             s0, [x0], #\bpp
+.else
+        str             d0, [x0], #\bpp
+.endif
+.endm
+
+/* void ff_png_add_paeth_prediction_neon(uint8_t *dst, const uint8_t *src,
+ *                                       const uint8_t *top, int w, int bpp);
+ * x0: dst
+ * x1: src
+ * x2: top
+ * w3: w
+ * w4: bpp
+ */
+function ff_png_add_paeth_prediction_neon, export=1
+        cmp             w4, #3
+        /* Load 48 bytes from memory in each loop.
+         * The number of bytes processed in each loop is (48 - bpp)
+         */
+        mov             w7, #48
+        /* SIMD stores overwrite 1 extra byte when bpp = 3, and 2 when bpp = 6.
+         * Let w5 = (w - 2) / (48 - bpp) * (48 - bpp), then fix the overwrite
+         * in the loop tail.
+         */
+        sub             w5, w3, #2
+        sub             w7, w7, w4              // (48 - bpp)
+        udiv            w5, w5, w7
+        neg             w6, w4                  // -bpp
+        sub             x2, x2, w4, uxtw        // top - bpp
+        mul             w5, w5, w7              // w5 = (w - 2) / (48 - bpp) * (48 - bpp)
+        sub             w3, w3, w5
+        cbz             w5, 2f
+
+        ldr             d0, [x0, w6, sxtw]
+
+        b.gt            40f
+30:     // bpp = 3
+        // 15 bytes
+        add_paeth_prediction 3, state=load
+        subs            w5, w5, w7
+.rept   4
+        add_paeth_prediction 3, state=shift
+.endr
+        // 15 + 15 = 30 bytes
+        ext             v1.16b, v18.16b, v19.16b, #15
+        ext             v2.16b, v19.16b, v20.16b, #2
+        ext             v17.16b, v21.16b, v22.16b, #15
+        add_paeth_prediction 3, state=extract
+.rept   4
+        add_paeth_prediction 3, state=shift
+.endr
+        // 30 + 15 = 45 bytes
+        ext             v1.16b, v19.16b, v20.16b, #14
+        ext             v2.16b, v20.16b, v20.16b, #1
+        ext             v17.16b, v22.16b, v23.16b, #14
+        add_paeth_prediction 3, state=extract
+.rept   4
+        add_paeth_prediction 3, state=shift
+.endr
+        b.ne            30b
+        b               2f
+
+40:     // check bpp = 4
+        cmp             w4, #4
+        b.gt            60f
+        // 44 bytes per loop
+41:
+        // 16 bytes
+        add_paeth_prediction 4, state=load
+        subs            w5, w5, w7
+.rept   3
+        add_paeth_prediction 4, state=shift
+.endr
+        // 16 + 16 = 32 bytes
+        mov             v1.8b, v19.8b
+        ext             v2.16b, v19.16b, v20.16b, #4
+        mov             v17.16b, v22.16b
+        add_paeth_prediction 4, state=extract
+.rept   3
+        add_paeth_prediction 4, state=shift
+.endr
+        // 32 + 12 bytes
+        mov             v1.8b, v20.8b
+        ext             v2.16b, v20.16b, v20.16b, #4
+        mov             v17.16b, v23.16b
+        add_paeth_prediction 4, state=extract
+.rept   2
+        add_paeth_prediction 4, state=shift
+.endr
+        b.ne            41b
+        b               2f
+
+60:     // check bpp = 6
+        cmp             w4, #6
+        b.gt            80f
+61:
+        // process 12 bytes
+        add_paeth_prediction 6, state=load
+        add_paeth_prediction 6, state=shift
+        subs            w5, w5, w7
+
+        // 12 + 12 = 24 bytes
+        ext             v1.16b, v18.16b, v19.16b, #12
+        ext             v2.16b, v19.16b, v20.16b, #2
+        ext             v17.16b, v21.16b, v22.16b, #12
+        add_paeth_prediction 6, state=extract
+        add_paeth_prediction 6, state=shift
+        // 24 + 12 = 36 bytes
+        ext             v1.16b, v19.16b, v20.16b, #8
+        ext             v2.16b, v19.16b, v20.16b, #14
+        ext             v17.16b, v22.16b, v23.16b, #8
+        add_paeth_prediction 6, state=extract
+        add_paeth_prediction 6, state=shift
+        // 36 + 6 = 42 bytes
+        ext             v1.16b, v20.16b, v20.16b, #4
+        ext             v2.16b, v20.16b, v20.16b, #10
+        ext             v17.16b, v23.16b, v23.16b, #4
+        add_paeth_prediction 6, state=extract
+
+        b.ne            61b
+        b               2f
+
+80:     // 40 bytes per loop
+        // 16 bytes
+        add_paeth_prediction 8, state=load
+        add_paeth_prediction 8, state=shift
+        subs            w5, w5, w7
+
+        // 16 + 16 = 32 bytes
+        mov             v1.8b, v19.8b
+        ext             v2.16b, v19.16b, v20.16b, #8
+        mov             v17.16b, v22.16b
+        add_paeth_prediction 8, state=extract
+        add_paeth_prediction 8, state=shift
+
+        // 32 + 8 = 40 bytes
+        mov             v1.8b, v20.8b
+        ext             v2.16b, v20.16b, v20.16b, #8
+        mov             v17.8b, v23.8b
+        add_paeth_prediction 8, state=extract
+
+        b.ne            80b
+2:
+        cbz             w3, 8f
+3:
+        ldrb            w7, [x0, w6, sxtw]          // a = dst[i - bpp]
+        ldrb            w8, [x2, w4, uxtw]          // b = top[i]
+        ldrb            w9, [x2], #1                // c = top[i - bpp]
+
+        sub             w10, w8, w9                 // p = b - c
+        sub             w11, w7, w9                 // a - c
+
+        cmp             w10, #0
+        cneg            w12, w10, lt                // pa = abs(b - c)
+        cmp             w11, #0
+        add             w14, w10, w11
+        cneg            w13, w11, lt                // pb = abs(a - c)
+        cmp             w14, #0
+        cneg            w14, w14, lt                // pc = abs(a + b - 2*c)
+
+        ldrb            w16, [x1], #1
+
+        cmp             w13, w14                    // pb vs pc
+        csel            w15, w8, w9, le             // w15 = (pb <= pc) ? b : c
+        cmp             w12, w13                    // pa vs pb
+        ccmp            w12, w14, #2, le            // if pa <= pb, check pa vs pc
+        csel            w15, w7, w15, le            // p = (pa <= pb && pa <= pc) ? a : w15
+
+        subs            w3, w3, #1
+        add             w15, w15, w16
+        strb            w15, [x0], #1
+        b.ne            3b
+8:
+        ret
+endfunc
diff --git a/libavcodec/pngdsp.c b/libavcodec/pngdsp.c
index b2f5b6018e..a932701b00 100644
--- a/libavcodec/pngdsp.c
+++ b/libavcodec/pngdsp.c
@@ -87,7 +87,9 @@ av_cold void ff_pngdsp_init(PNGDSPContext *dsp)
     dsp->add_bytes_l2         = add_bytes_l2_c;
     dsp->add_paeth_prediction = ff_png_add_paeth_prediction;
 
-#if ARCH_X86 && HAVE_X86ASM
+#if ARCH_AARCH64
+    ff_pngdsp_init_aarch64(dsp);
+#elif ARCH_X86 && HAVE_X86ASM
     ff_pngdsp_init_x86(dsp);
 #endif
 }
diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h
index bd35431ae7..99f578f535 100644
--- a/libavcodec/pngdsp.h
+++ b/libavcodec/pngdsp.h
@@ -41,6 +41,7 @@ void ff_png_add_paeth_prediction(uint8_t *dst, const uint8_t *src,
                                  const uint8_t *top, int w, int bpp);
 
 void ff_pngdsp_init(PNGDSPContext *dsp);
+void ff_pngdsp_init_aarch64(PNGDSPContext *dsp);
 void ff_pngdsp_init_x86(PNGDSPContext *dsp);
 FF_VISIBILITY_POP_HIDDEN
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 02/02: avcodec/aarch64: add pngdsp

Reply via email to