aarch64: Add neon implementation for pix_abs8 functions.

Martin Storsjö Wed, 28 Sep 2022 02:07:16 -0700

On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:

Provide optimized implementation of pix_abs8 function for arm64.


Performance comparison tests are shown below:
pix_abs_1_1_c: 162.5
pix_abs_1_1_neon: 27.0
pix_abs_1_2_c: 174.0
pix_abs_1_2_neon: 23.5
pix_abs_1_3_c: 203.2
pix_abs_1_3_neon: 34.7

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Grzegorz Bernacki <g...@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c |   9 ++
libavcodec/aarch64/me_cmp_neon.S         | 193 +++++++++++++++++++++++
2 files changed, 202 insertions(+)

I don't see any changes compared to the version you sent last week? Ifreposting a patchset, please do mention what has changed - or ping the oldone. (I had it on my radar to review, but reviews of larger amounts ofcode doesn't happen immediately, unfortunately.)

Overall, you seem to have mixed in tabs among regular spaces. Please dofix that. (I would have kinda expected us to have a fate test that checksthis, but apparently we don't.)

diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c 
b/libavcodec/aarch64/me_cmp_init_aarch64.c
index e143f0816e..3459403ee5 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -59,6 +59,12 @@ int pix_median_abs16_neon(MpegEncContext *v, const uint8_t 
*pix1, const uint8_t
                          ptrdiff_t stride, int h);
int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
                         ptrdiff_t stride, int h);
+int ff_pix_abs8_x2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs8_y2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
+                        ptrdiff_t stride, int h);
+int ff_pix_abs8_xy2_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t 
*pix2,
+                         ptrdiff_t stride, int h);

av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
@@ -70,6 +76,9 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, 
AVCodecContext *avctx)
        c->pix_abs[0][2] = ff_pix_abs16_y2_neon;
        c->pix_abs[0][3] = ff_pix_abs16_xy2_neon;
        c->pix_abs[1][0] = ff_pix_abs8_neon;
+           c->pix_abs[1][1] = ff_pix_abs8_x2_neon;
+           c->pix_abs[1][2] = ff_pix_abs8_y2_neon;
+           c->pix_abs[1][3] = ff_pix_abs8_xy2_neon;


In most mediums, the diff here shows that there's something off - tabs.


        c->sad[0] = ff_pix_abs16_neon;
        c->sad[1] = ff_pix_abs8_neon;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 11af4849f9..e03c0c26cd 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -119,6 +119,199 @@ function ff_pix_abs8_neon, export=1
        ret
endfunc

+function ff_pix_abs8_x2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        cmp             w4, #4
+        movi            v26.8h, #0
+        add             x5, x2, #1 // pix2 + 1
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        ld1             {v1.8b}, [x2], x3
+        ld1             {v2.8b}, [x5], x3
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v4.8b}, [x2], x3
+       urhadd          v30.8b, v1.8b, v2.8b
+        ld1             {v5.8b}, [x5], x3
+        uabal           v26.8h, v0.8b, v30.8b
+        ld1             {v6.8b}, [x1], x3
+       urhadd          v29.8b, v4.8b, v5.8b
+        ld1             {v7.8b}, [x2], x3
+        ld1             {v20.8b}, [x5], x3
+        uabal           v26.8h, v6.8b, v29.8b
+        ld1             {v21.8b}, [x1], x3
+       urhadd          v28.8b, v7.8b, v20.8b
+        ld1             {v22.8b}, [x2], x3
+        ld1             {v23.8b}, [x5], x3
+        uabal           v26.8h, v21.8b, v28.8b
+        sub             w4, w4, #4
+        ld1             {v24.8b}, [x1], x3
+       urhadd          v27.8b, v22.8b, v23.8b
+        cmp             w4, #4
+        uabal           v26.8h, v24.8b, v27.8b
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v1.8b}, [x2], x3
+        ld1             {v2.8b}, [x5], x3
+        ld1             {v0.8b}, [x1], x3
+       urhadd          v30.8b, v1.8b, v2.8b
+        subs            w4, w4, #1
+        uabal           v26.8h, v0.8b, v30.8b
+
+        b.ne            2b
+3:
+        uaddlv          s20, v26.8h
+        fmov            w0, s20
+
+        ret
+
+endfunc
+
+function ff_pix_abs8_y2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        cmp             w4, #4
+        movi            v26.8h, #0
+        ld1             {v1.8b}, [x2], x3
+        b.lt            2f
+
+// make 4 iterations at once
+1:
+        ld1             {v2.8b}, [x2], x3
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v6.8b}, [x1], x3
+       urhadd          v30.8b, v1.8b, v2.8b

Good thing that you wrote this better than the currentff_pix_abs16_y2_neon, reusing the data from the previous round. But thescheduling could be much improved here (why load v6 directly after v0 - x1will delay it due to being updated in the previous instruction, and v6won't be needed for a long time here yet).

By fixing the scheduling of this function (and getting rid of theunnecessary mov instruction at the end of the unrolled loop), I got itdown from 74 cycles to 62 cycles on a Cortex A53. I'm attaching the fixupI did, so you can apply it on top instead of having to describe itverbally.

+        ld1             {v5.8b}, [x2], x3
+        ld1             {v21.8b}, [x1], x3
+        uabal           v26.8h, v0.8b, v30.8b
+       urhadd          v29.8b, v2.8b, v5.8b
+        ld1             {v20.8b}, [x2], x3
+        ld1             {v24.8b}, [x1], x3
+        uabal           v26.8h, v6.8b, v29.8b
+       urhadd          v28.8b, v5.8b, v20.8b
+        uabal           v26.8h, v21.8b, v28.8b
+        ld1             {v23.8b}, [x2], x3
+        mov             v1.8b, v23.8b
+        sub             w4, w4, #4
+       urhadd          v27.8b, v20.8b, v23.8b
+        cmp             w4, #4
+        uabal           v26.8h, v24.8b, v27.8b
+
+        b.ge            1b
+        cbz             w4, 3f
+
+// iterate by one
+2:
+        ld1             {v0.8b}, [x1], x3
+        ld1             {v2.8b}, [x2], x3
+       urhadd          v30.8b, v1.8b, v2.8b
+        subs            w4, w4, #1
+        uabal           v26.8h, v0.8b, v30.8b
+        mov             v1.8b, v2.8b
+
+        b.ne            2b
+3:
+        uaddlv          s20, v26.8h
+        fmov            w0, s20
+
+        ret
+
+endfunc
+
+function ff_pix_abs8_xy2_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        movi            v31.8h, #0
+        add             x0, x2, 1  // pix2 + 1
+
+        add             x5, x2, x3 // pix2 + stride = pix3
+        cmp             w4, #4
+        add             x6, x5, 1 // pix3 + stride + 1
+
+        b.lt            2f
+
+        ld1             {v0.8b}, [x2], x3
+        ld1             {v1.8b}, [x0], x3
+        uaddl           v2.8h, v0.8b, v1.8b

If we'd start out with h<4, we'd jump to the 2f label above, missing thesetup of v0-v2 here.


Other than that, the patch looks reasonable to me.

// Martin

From 7909432b3cffdcb31c0953279d862f820a738ec4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <mar...@martin.st>
Date: Wed, 28 Sep 2022 11:29:57 +0300
Subject: [PATCH 1/2] aarch64: me_cmp: Improve scheduling in
 ff_pix_abs8_y2_neon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before:       Cortex A53    A72    A73
pix_abs_1_2_neon:   73.7   31.0   25.7
After:
pix_abs_1_2_neon:   61.7   30.2   24.7

Signed-off-by: Martin StorsjÃ¶ <mar...@martin.st>
---
 libavcodec/aarch64/me_cmp_neon.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index a900ae2190..7f2b7c754f 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -193,21 +193,20 @@ function ff_pix_abs8_y2_neon, export=1
 1:
         ld1             {v2.8b}, [x2], x3
         ld1             {v0.8b}, [x1], x3
-        ld1             {v6.8b}, [x1], x3
 	urhadd          v30.8b, v1.8b, v2.8b
         ld1             {v5.8b}, [x2], x3
-        ld1             {v21.8b}, [x1], x3
+        ld1             {v6.8b}, [x1], x3
         uabal           v26.8h, v0.8b, v30.8b
 	urhadd          v29.8b, v2.8b, v5.8b
         ld1             {v20.8b}, [x2], x3
-        ld1             {v24.8b}, [x1], x3
+        ld1             {v21.8b}, [x1], x3
         uabal           v26.8h, v6.8b, v29.8b
 	urhadd          v28.8b, v5.8b, v20.8b
-        uabal           v26.8h, v21.8b, v28.8b
-        ld1             {v23.8b}, [x2], x3
-        mov             v1.8b, v23.8b
+        ld1             {v1.8b},  [x2], x3
+        ld1             {v24.8b}, [x1], x3
+	urhadd          v27.8b, v20.8b, v1.8b
         sub             w4, w4, #4
-	urhadd          v27.8b, v20.8b, v23.8b
+        uabal           v26.8h, v21.8b, v28.8b
         cmp             w4, #4
         uabal           v26.8h, v24.8b, v27.8b
 
-- 
2.25.1

From dfc91453aa3608b6b6b7c0e832292668150fd0d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <mar...@martin.st>
Date: Wed, 28 Sep 2022 11:43:02 +0300
Subject: [PATCH 2/2] aarch64: me_cmp: Fix up the prologue of
 ff_pix_abs8_xy2_neon

This initializes things properly if this were to be called with
h < 4.
---
 libavcodec/aarch64/me_cmp_neon.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 7f2b7c754f..4037953488 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -245,12 +245,12 @@ function ff_pix_abs8_xy2_neon, export=1
         cmp             w4, #4
         add             x6, x5, 1 // pix3 + stride + 1
 
-        b.lt            2f
-
         ld1             {v0.8b}, [x2], x3
         ld1             {v1.8b}, [x0], x3
         uaddl           v2.8h, v0.8b, v1.8b
 
+        b.lt            2f
+
 // make 4 iterations at once
 1:
         ld1             {v4.8b}, [x5], x3
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: Add neon implementation for pix_abs8 functions.

Reply via email to