On Mon, 22 Aug 2022, Hubert Mazur wrote:

Provide optimized implementation of vsad16 function for arm64.

Performance comparison tests are shown below.
- vsad_0_c: 285.0
- vsad_0_neon: 42.5

Benchmarks and tests are run with checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <h...@semihalf.com>
---
libavcodec/aarch64/me_cmp_init_aarch64.c |  5 ++
libavcodec/aarch64/me_cmp_neon.S         | 75 ++++++++++++++++++++++++
2 files changed, 80 insertions(+)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4198985c6c..d4c0099854 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -584,3 +584,78 @@ function sse4_neon, export=1

        ret
endfunc
+
+function vsad16_neon, export=1
+        // x0           unused
+        // x1           uint8_t *pix1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride
+        // w4           int h
+
+        sub             w4, w4, #1                      // we need to make h-1 iterations
+        movi            v16.8h, #0
+
+        cmp             w4, #3                          // check if we can make 3 iterations at once
+        add             x5, x1, x3                      // pix1 + stride
+        add             x6, x2, x3                      // pix2 + stride
+        b.le            2f
+
+1:
+        // abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
+        // abs(x) = (x < 0 ? (-x) : (x))

This comment (the second line) about abs() doesn't really add anything of value here, does it?

+        ld1             {v0.16b}, [x1], x3              // Load pix1[0], first iteration
+        ld1             {v1.16b}, [x2], x3              // Load pix2[0], first iteration
+        ld1             {v2.16b}, [x5], x3              // Load pix1[0 + stride], first iteration
+        usubl           v31.8h, v0.8b, v1.8b            // Signed difference pix1[0] - pix2[0], first iteration
+        ld1             {v3.16b}, [x6], x3              // Load pix2[0 + stride], first iteration
+        usubl2          v30.8h, v0.16b, v1.16b          // Signed difference pix1[0] - pix2[0], first iteration
+        usubl           v29.8h, v2.8b, v3.8b            // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
+        ld1             {v4.16b}, [x1], x3              // Load pix1[0], second iteration
+        usubl2          v28.8h, v2.16b, v3.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
+        ld1             {v5.16b}, [x2], x3              // Load pix2[0], second iteration
+        saba            v16.8h, v31.8h, v29.8h          // Signed absolute difference and accumulate the result. first iteration
+        ld1             {v6.16b}, [x5], x3              // Load pix1[0 + stride], second iteration
+        saba            v16.8h, v30.8h, v28.8h          // Signed absolute difference and accumulate the result. first iteration
+        usubl           v27.8h, v4.8b, v5.8b            // Signed difference pix1[0] - pix2[0], second iteration
+        ld1             {v7.16b}, [x6], x3              // Load pix2[0 + stride], second iteration
+        usubl2          v26.8h, v4.16b, v5.16b          // Signed difference pix1[0] - pix2[0], second iteration
+        usubl           v25.8h, v6.8b, v7.8b            // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
+        ld1             {v17.16b}, [x1], x3             // Load pix1[0], third iteration
+        usubl2          v24.8h, v6.16b, v7.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
+        ld1             {v18.16b}, [x2], x3             // Load pix2[0], second iteration
+        saba            v16.8h, v27.8h, v25.8h          // Signed absolute difference and accumulate the result. second iteration
+        ld1             {v19.16b}, [x5], x3             // Load pix1[0 + stride], third iteration
+        saba            v16.8h, v26.8h, v24.8h          // Signed absolute difference and accumulate the result. second iteration
+        usubl           v23.8h, v17.8b, v18.8b          // Signed difference pix1[0] - pix2[0], third iteration
+        ld1             {v20.16b}, [x6], x3             // Load pix2[0 + stride], third iteration
+        usubl2          v22.8h, v17.16b, v18.16b        // Signed difference pix1[0] - pix2[0], third iteration
+        usubl           v21.8h, v19.8b, v20.8b          // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+        sub             w4, w4, #3                      // h -= 3
+        saba            v16.8h, v23.8h, v21.8h          // Signed absolute difference and accumulate the result. third iteration
+        usubl2          v31.8h, v19.16b, v20.16b        // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+        cmp             w4, #3
+        saba            v16.8h, v22.8h, v31.8h          // Signed absolute difference and accumulate the result. third iteration

This unrolled implementation isn't really interleaved much at all. In such a case there's not really much benefit from an unrolled implementation, other than saving a couple decrements/branch instructions.

I tested playing with your code here. Originally, checkasm gave me these benchmark numbers:

         Cortex A53    A72    A73
vsad_0_neon:  162.7   74.5   67.7

When I switched to only running the non-unrolled version below, I got these numbers:

vsad_0_neon:  165.2   78.5   76.0

I.e. hardly any difference at all. In such a case, we're just wasting a lot of code size (and code maintainability!) on big unrolled code without much benefit at all. But we can do better.

+
+        b.ge            1b
+        cbz             w4, 3f
+2:
+
+        ld1             {v0.16b}, [x1], x3
+        ld1             {v1.16b}, [x2], x3
+        ld1             {v2.16b}, [x5], x3
+        usubl           v30.8h, v0.8b, v1.8b
+        ld1             {v3.16b}, [x6], x3
+        usubl2          v29.8h, v0.16b, v1.16b
+        usubl           v28.8h, v2.8b, v3.8b
+        usubl2          v27.8h, v2.16b, v3.16b
+        saba            v16.8h, v30.8h, v28.8h
+        subs            w4, w4, #1
+        saba            v16.8h, v29.8h, v27.8h

Now let's look at this simple non-unrolled implementation first. We're doing a lot of redundant work here (and in the other implementation).

When we've loaded v2/v3 in the first round in this loop, and calculated v28/v27 from them, and we then proceed to the next iteration, v0/v1 will be identical to v2/v3, and v30/v29 will be identical to v28/v27 from the previous iteration.

So if we just load one row from pix1/pix2 each and calculate their difference in the start of the function, and then just load one row from each, subtract them and accumulate the difference to the previous one, we're done.

Originally with your non-unrolled implementation, I got these benchmark numbers:

vsad_0_neon:  165.2   78.5   76.0

After this simplification, I got these numbers:

vsad_0_neon:  131.2   68.5   70.0

In this case, my code looked like this:

        ld1             {v0.16b}, [x1], x3
        ld1             {v1.16b}, [x2], x3
        usubl           v31.8h, v0.8b,  v1.8b
        usubl2          v30.8h, v0.16b, v1.16b

2:
        ld1             {v0.16b}, [x1], x3
        ld1             {v1.16b}, [x2], x3
        subs            w4, w4, #1
        usubl           v29.8h,  v0.8b,   v1.8b
        usubl2          v28.8h,  v0.16b,  v1.16b
        saba            v16.8h,  v31.8h,  v29.8h
        mov             v31.16b, v29.16b
        saba            v16.8h,  v30.8h,  v28.8h
        mov             v30.16b, v28.16b
        b.ne            2b

Isn't that much simpler? Now after that, I tried doing the same modification to the unrolled version. FWIW, I did it like this: For an algorithm this simple, where there's more than enough registers, I wrote the unrolled implementation like this:


    ld1 // first iter
    ld1 // first iter
    ld1 // second iter
    ld1 // second iter
    ld1 // third iter
    ld1 // third iter
    usubl  // first
    usubl2 // first
    usubl ..
    usubl2 ..
    ...
    saba // first
    saba // first
    ...

After that, I reordered them so that we start with the first usubl's after a couple of instructions after the corresponding loads, then moved the following saba's closer.

With that, I'm getting these checkasm numbers:

         Cortex A53    A72    A73
vsad_0_neon:  108.7   61.2   58.0

As context, this patch originally gave these numbers:

vsad_0_neon:  162.7   74.5   67.7

I.e. a 1.15x speedup on A73, a 1.21x speedup on A72 and a 1.5x speedup on A53.

You can see my version in the attached patch; apply it on top of yours.

// Martin
From 76eb1f213a72cdfd04a62c773442336cd56e0858 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <mar...@martin.st>
Date: Sat, 3 Sep 2022 00:45:55 +0300
Subject: [PATCH] Improve vsad16_neon

---
 libavcodec/aarch64/me_cmp_neon.S | 71 ++++++++++++++------------------
 1 file changed, 31 insertions(+), 40 deletions(-)

diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index ecc9c793d6..7ab8744e0d 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -592,65 +592,56 @@ function vsad16_neon, export=1
         // x3           ptrdiff_t stride
         // w4           int h
 
+        ld1             {v0.16b}, [x1], x3              // Load pix1[0], first iteration
+        ld1             {v1.16b}, [x2], x3              // Load pix2[0], first iteration
+
         sub             w4, w4, #1                      // we need to make h-1 iterations
         movi            v16.8h, #0
 
         cmp             w4, #3                          // check if we can make 3 iterations at once
-        add             x5, x1, x3                      // pix1 + stride
-        add             x6, x2, x3                      // pix2 + stride
-        b.le            2f
+        usubl           v31.8h, v0.8b, v1.8b            // Signed difference pix1[0] - pix2[0], first iteration
+        usubl2          v30.8h, v0.16b, v1.16b          // Signed difference pix1[0] - pix2[0], first iteration
+
+        b.lt            2f
 
 1:
         // abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
         // abs(x) = (x < 0 ? (-x) : (x))
-        ld1             {v0.16b}, [x1], x3              // Load pix1[0], first iteration
-        ld1             {v1.16b}, [x2], x3              // Load pix2[0], first iteration
-        ld1             {v2.16b}, [x5], x3              // Load pix1[0 + stride], first iteration
-        usubl           v31.8h, v0.8b, v1.8b            // Signed difference pix1[0] - pix2[0], first iteration
-        ld1             {v3.16b}, [x6], x3              // Load pix2[0 + stride], first iteration
-        usubl2          v30.8h, v0.16b, v1.16b          // Signed difference pix1[0] - pix2[0], first iteration
-        usubl           v29.8h, v2.8b, v3.8b            // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
-        ld1             {v4.16b}, [x1], x3              // Load pix1[0], second iteration
-        usubl2          v28.8h, v2.16b, v3.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
-        ld1             {v5.16b}, [x2], x3              // Load pix2[0], second iteration
+        ld1             {v0.16b}, [x1], x3              // Load pix1[0 + stride], first iteration
+        ld1             {v1.16b}, [x2], x3              // Load pix2[0 + stride], first iteration
+        ld1             {v2.16b}, [x1], x3              // Load pix1[0 + stride], second iteration
+        ld1             {v3.16b}, [x2], x3              // Load pix2[0 + stride], second iteration
+        usubl           v29.8h, v0.8b,  v1.8b           // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
+        usubl2          v28.8h, v0.16b, v1.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
+        ld1             {v4.16b}, [x1], x3              // Load pix1[0 + stride], third iteration
+        ld1             {v5.16b}, [x2], x3              // Load pix2[0 + stride], third iteration
+        usubl           v27.8h, v2.8b,  v3.8b           // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
         saba            v16.8h, v31.8h, v29.8h          // Signed absolute difference and accumulate the result. first iteration
-        ld1             {v6.16b}, [x5], x3              // Load pix1[0 + stride], second iteration
+        usubl2          v26.8h, v2.16b, v3.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
         saba            v16.8h, v30.8h, v28.8h          // Signed absolute difference and accumulate the result. first iteration
-        usubl           v27.8h, v4.8b, v5.8b            // Signed difference pix1[0] - pix2[0], second iteration
-        ld1             {v7.16b}, [x6], x3              // Load pix2[0 + stride], second iteration
-        usubl2          v26.8h, v4.16b, v5.16b          // Signed difference pix1[0] - pix2[0], second iteration
-        usubl           v25.8h, v6.8b, v7.8b            // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
-        ld1             {v17.16b}, [x1], x3             // Load pix1[0], third iteration
-        usubl2          v24.8h, v6.16b, v7.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
-        ld1             {v18.16b}, [x2], x3             // Load pix2[0], second iteration
-        saba            v16.8h, v27.8h, v25.8h          // Signed absolute difference and accumulate the result. second iteration
-        ld1             {v19.16b}, [x5], x3             // Load pix1[0 + stride], third iteration
-        saba            v16.8h, v26.8h, v24.8h          // Signed absolute difference and accumulate the result. second iteration
-        usubl           v23.8h, v17.8b, v18.8b          // Signed difference pix1[0] - pix2[0], third iteration
-        ld1             {v20.16b}, [x6], x3             // Load pix2[0 + stride], third iteration
-        usubl2          v22.8h, v17.16b, v18.16b        // Signed difference pix1[0] - pix2[0], third iteration
-        usubl           v21.8h, v19.8b, v20.8b          // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+        usubl           v25.8h, v4.8b,  v5.8b           // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+        usubl2          v24.8h, v4.16b, v5.16b          // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+        saba            v16.8h, v29.8h, v27.8h          // Signed absolute difference and accumulate the result. second iteration
+        mov             v31.16b, v25.16b
+        saba            v16.8h, v28.8h, v26.8h          // Signed absolute difference and accumulate the result. second iteration
         sub             w4, w4, #3                      // h -= 3
-        saba            v16.8h, v23.8h, v21.8h          // Signed absolute difference and accumulate the result. third iteration
-        usubl2          v31.8h, v19.16b, v20.16b        // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
+        mov             v30.16b, v24.16b
+        saba            v16.8h, v27.8h, v25.8h          // Signed absolute difference and accumulate the result. third iteration
         cmp             w4, #3
-        saba            v16.8h, v22.8h, v31.8h          // Signed absolute difference and accumulate the result. third iteration
+        saba            v16.8h, v26.8h, v24.8h          // Signed absolute difference and accumulate the result. third iteration
 
         b.ge            1b
         cbz             w4, 3f
 2:
-
         ld1             {v0.16b}, [x1], x3
         ld1             {v1.16b}, [x2], x3
-        ld1             {v2.16b}, [x5], x3
-        usubl           v30.8h, v0.8b, v1.8b
-        ld1             {v3.16b}, [x6], x3
-        usubl2          v29.8h, v0.16b, v1.16b
-        usubl           v28.8h, v2.8b, v3.8b
-        usubl2          v27.8h, v2.16b, v3.16b
-        saba            v16.8h, v30.8h, v28.8h
         subs            w4, w4, #1
-        saba            v16.8h, v29.8h, v27.8h
+        usubl           v29.8h,  v0.8b,   v1.8b
+        usubl2          v28.8h,  v0.16b,  v1.16b
+        saba            v16.8h,  v31.8h,  v29.8h
+        mov             v31.16b, v29.16b
+        saba            v16.8h,  v30.8h,  v28.8h
+        mov             v30.16b, v28.16b
 
         b.ne            2b
 3:
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to