On Mon, Nov 25, 2019 at 11:20 PM Jean-Baptiste Kempf <j...@videolan.org> wrote:
> > Is there a coding rule in ffmpeg that restricts the use of intrinsics?
>
> Yes. See doc/optimization.txt.
> Use external asm (nasm/yasm) or inline asm (__asm__()), do not use intrinsics.

Thanks for the pointer.

> Also, here, you're changing some existing code, please improve the code and 
> do not duplicate code.
>
> > If that is the case, I can adapt my code to the existing asm code.
>
> Please.

Please find attached a patch that improves the existing code in aarch64/hscale.S.
Performance tests with gcc and clang show that the patch improves
performance by 34% on Graviton A1 instances:

$ ffmpeg -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -

before: t:0.040303 avg:0.040287 max:0.040371 min:0.039214
after:  t:0.030079 avg:0.030102 max:0.030462 min:0.030051

Tested with `make check` on aarch64-linux.
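
For context, ff_hscale_8_to_15_neon computes the horizontal scaling
convolution. Below is a minimal C sketch of the scalar equivalent, modeled
on libswscale's hScale8To15_c; the function name and details are
illustrative, not the exact FFmpeg source:

    #include <stdint.h>

    /* For each output pixel i, accumulate filterSize products of 8-bit
     * source samples with 16-bit filter coefficients, then shift by 7
     * and clip to 15 bits (like sqshrn #7 in the asm). */
    static void hscale_8_to_15_ref(int16_t *dst, int dstW, const uint8_t *src,
                                   const int16_t *filter,
                                   const int32_t *filterPos, int filterSize)
    {
        for (int i = 0; i < dstW; i++) {
            int val = 0;
            for (int j = 0; j < filterSize; j++)
                val += src[filterPos[i] + j] * filter[filterSize * i + j];
            int v = val >> 7;
            dst[i] = v > 32767 ? 32767 : (int16_t)v;
        }
    }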

Please let me know if I can make the patch better.

Thank you,
Sebastian
From e04f9606f7ea581d8398eb2f37df2f59add8b374 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@amazon.com>
Date: Sun, 17 Nov 2019 14:13:13 -0600
Subject: [PATCH] [aarch64] use FMA and increase vector factor to 4

This patch implements ff_hscale_8_to_15_neon with NEON fused multiply accumulate
and bumps the vectorization factor from 2 to 4.
The speedup is 34% on Graviton A1 instances, which are based on Cortex-A72 CPUs:

$ ffmpeg -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -
before: t:0.040303 avg:0.040287 max:0.040371 min:0.039214
after:  t:0.030079 avg:0.030102 max:0.030462 min:0.030051

Tested with `make check` on aarch64-linux.
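
In intrinsics terms, the core change is to accumulate directly into the
32-bit running sums instead of multiplying into temporaries and adding them
afterwards. Roughly (a sketch only, not part of the patch; the patch itself
is written in external asm):

    #include <arm_neon.h>

    /* old per-iteration pattern: smull/smull2, pairwise add, then add */
    static inline int32x4_t hscale_step_old(int32x4_t acc, int16x8_t s, int16x8_t f) {
        int32x4_t lo = vmull_s16(vget_low_s16(s), vget_low_s16(f));
        int32x4_t hi = vmull_high_s16(s, f);
        return vaddq_s32(acc, vpaddq_s32(lo, hi));
    }

    /* new per-iteration pattern: smlal/smlal2 accumulate in place
     * (lane layout differs from the old pattern, but the final
     * horizontal sum over all four lanes is the same) */
    static inline int32x4_t hscale_step_new(int32x4_t acc, int16x8_t s, int16x8_t f) {
        acc = vmlal_s16(acc, vget_low_s16(s), vget_low_s16(f));
        return vmlal_high_s16(acc, s, f);
    }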
---
 libswscale/aarch64/hscale.S | 176 +++++++++++++++++++++++++++++-------
 1 file changed, 142 insertions(+), 34 deletions(-)

diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index cc78c1901d..fb4de4d9a8 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -20,40 +20,148 @@
 
 #include "libavutil/aarch64/asm.S"
 
+/* #include "arm_neon.h"
+
+void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize);
+
+static inline int32x4_t ff_reduce(int32x4_t x) {
+  x = vpaddq_s32(x, x);
+  x = vpaddq_s32(x, x);
+  return x;
+}
+
+static inline int32x4_t ff_hscale(int32x4_t x, const uint8_t *src,
+                                  const int16_t *filter) {
+  uint8x8_t a = vld1_u8(src);
+  int16x8_t s = vreinterpretq_s16_u16(vshll_n_u8(a, 0));
+  int16x8_t f = vld1q_s16(filter);
+  x = vmlal_s16(x, vget_low_s16(s), vget_low_s16(f));
+  x = vmlal_high_s16(x, s, f);
+  return x;
+}
+
+static inline int32x4_t ff_zip64(int32x4_t a, int32x4_t b) {
+    int64x2_t x0 = vreinterpretq_s64_s32(a);
+    int64x2_t x1 = vreinterpretq_s64_s32(b);
+    int64x2_t x2 = vzip1q_s64(x0, x1);
+    int32x4_t x3 = vreinterpretq_s32_s64(x2);
+    return x3;
+}
+
+static void ff_hscale_8_to_15_neon_1(SwsContext *c, int16_t *dst, int dstW,
+                              const uint8_t *src, const int16_t *filter,
+                              const int32_t *filterPos, int filterSize)
+{
+    int i;
+    if (dstW <= 0 || filterSize <= 0)
+      return;
+
+    for (i = 0; i < dstW; i += 4) {
+        int j;
+        int32x4_t x0 = vdupq_n_s32(0);
+        int32x4_t x1 = vdupq_n_s32(0);
+        int32x4_t x2 = vdupq_n_s32(0);
+        int32x4_t x3 = vdupq_n_s32(0);
+        int32x4_t x4, x5, x6;
+        int16x4_t x7;
+        const uint8_t *src0 = src + filterPos[i];
+        const uint8_t *src1 = src + filterPos[i + 1];
+        const uint8_t *src2 = src + filterPos[i + 2];
+        const uint8_t *src3 = src + filterPos[i + 3];
+        const int16_t *filter0 = filter;
+        const int16_t *filter1 = filter0 + filterSize;
+        const int16_t *filter2 = filter1 + filterSize;
+        const int16_t *filter3 = filter2 + filterSize;
+        filter = filter3 + filterSize;
+        for (j = 0; j < filterSize; j += 8) {
+            x0 = ff_hscale(x0, src0 + j, filter0 + j);
+            x1 = ff_hscale(x1, src1 + j, filter1 + j);
+            x2 = ff_hscale(x2, src2 + j, filter2 + j);
+            x3 = ff_hscale(x3, src3 + j, filter3 + j);
+        }
+        x0 = ff_reduce(x0);
+        x1 = ff_reduce(x1);
+        x2 = ff_reduce(x2);
+        x3 = ff_reduce(x3);
+
+        x4 = vzip1q_s32(x0, x1);
+        x5 = vzip1q_s32(x2, x3);
+        x6 = ff_zip64(x4, x5);
+        x7 = vqshrn_n_s32(x6, 7);
+        vst1_s16(&dst[i], x7);
+    }
+} */
+
 function ff_hscale_8_to_15_neon, export=1
-        add                 x10, x4, w6, UXTW #1        // filter2 = filter + filterSize*2 (x2 because int16)
-1:      ldr                 w8, [x5], #4                // filterPos[0]
-        ldr                 w9, [x5], #4                // filterPos[1]
-        movi                v4.4S, #0                   // val sum part 1 (for dst[0])
-        movi                v5.4S, #0                   // val sum part 2 (for dst[1])
-        mov                 w7, w6                      // filterSize counter
-        mov                 x13, x3                     // srcp = src
-2:      add                 x11, x13, w8, UXTW          // srcp + filterPos[0]
-        add                 x12, x13, w9, UXTW          // srcp + filterPos[1]
-        ld1                 {v0.8B}, [x11]              // srcp[filterPos[0] + {0..7}]
-        ld1                 {v1.8B}, [x12]              // srcp[filterPos[1] + {0..7}]
-        ld1                 {v2.8H}, [x4],  #16         // load 8x16-bit filter values, part 1
-        ld1                 {v3.8H}, [x10], #16         // ditto at filter+filterSize for part 2
-        uxtl                v0.8H, v0.8B                // unpack part 1 to 16-bit
-        uxtl                v1.8H, v1.8B                // unpack part 2 to 16-bit
-        smull               v16.4S, v0.4H, v2.4H        // v16.i32{0..3} = part 1 of: srcp[filterPos[0] + {0..7}] * filter[{0..7}]
-        smull               v18.4S, v1.4H, v3.4H        // v18.i32{0..3} = part 1 of: srcp[filterPos[1] + {0..7}] * filter[{0..7}]
-        smull2              v17.4S, v0.8H, v2.8H        // v17.i32{0..3} = part 2 of: srcp[filterPos[0] + {0..7}] * filter[{0..7}]
-        smull2              v19.4S, v1.8H, v3.8H        // v19.i32{0..3} = part 2 of: srcp[filterPos[1] + {0..7}] * filter[{0..7}]
-        addp                v16.4S, v16.4S, v17.4S      // horizontal pair adding of the 8x32-bit multiplied values for part 1 into 4x32-bit
-        addp                v18.4S, v18.4S, v19.4S      // horizontal pair adding of the 8x32-bit multiplied values for part 2 into 4x32-bit
-        add                 v4.4S, v4.4S, v16.4S        // update val accumulator for part 1
-        add                 v5.4S, v5.4S, v18.4S        // update val accumulator for part 2
-        add                 x13, x13, #8                // srcp += 8
-        subs                w7, w7, #8                  // processed 8/filterSize
-        b.gt                2b                          // inner loop if filterSize not consumed completely
-        mov                 x4, x10                     // filter = filter2
-        add                 x10, x10, w6, UXTW #1       // filter2 += filterSize*2
-        addp                v4.4S, v4.4S, v5.4S         // horizontal pair adding of the 8x32-bit sums into 4x32-bit
-        addp                v4.4S, v4.4S, v4.4S         // horizontal pair adding of the 4x32-bit sums into 2x32-bit
-        sqshrn              v4.4H, v4.4S, #7            // shift and clip the 2x16-bit final values
-        st1                 {v4.S}[0], [x1], #4         // write to destination
-        subs                w2, w2, #2                  // dstW -= 2
-        b.gt                1b                          // loop until end of line
+        sxtw                x9, w6                      // x9 = filterSize
+        sbfiz               x12, x6, #1, #32            // x12 = filterSize*2, byte offset of filter+filterSize
+        add                 x14, x12, x9                // x14 = filterSize*3
+        mov                 x8, xzr                     // i = 0
+        sxtw                x10, w2                     // x10 = dstW
+        sbfiz               x11, x6, #3, #32            // x11 = filterSize*8, bytes to advance filter by 4 rows
+        sbfiz               x13, x6, #2, #32            // x13 = filterSize*4, byte offset of filter+2*filterSize
+        lsl                 x14, x14, #1                // x14 = filterSize*6, byte offset of filter+3*filterSize
+1:      lsl                 x17, x8, #2                 // x17 = i*4, byte offset into filterPos
+        ldrsw               x18, [x5, x17]              // filterPos[0]
+        orr                 x0, x17, #0x4               // byte offset of filterPos[1]
+        orr                 x2, x17, #0x8               // byte offset of filterPos[2]
+        orr                 x17, x17, #0xc              // byte offset of filterPos[3]
+        ldrsw               x0, [x5, x0]                // filterPos[1]
+        ldrsw               x2, [x5, x2]                // filterPos[2]
+        ldrsw               x6, [x5, x17]               // filterPos[3]
+        mov                 x15, xzr                    // j = 0
+        mov                 x16, x4                     // filter0 = filter
+        movi                v0.2d, #0                   // val sum part 1 (for dst[0])
+        movi                v2.2d, #0                   // val sum part 2 (for dst[1])
+        movi                v1.2d, #0                   // val sum part 3 (for dst[2])
+        movi                v3.2d, #0                   // val sum part 4 (for dst[3])
+        add                 x17, x3, x18                // srcp + filterPos[0]
+        add                 x18, x3, x0                 // srcp + filterPos[1]
+        add                 x0, x3, x2                  // srcp + filterPos[2]
+        add                 x2, x3, x6                  // srcp + filterPos[3]
+2:      ldr                 d4, [x17, x15]              // srcp[filterPos[0] + {0..7}]
+        ldr                 q5, [x16]                   // load 8x16-bit filter values, part 1
+        ldr                 d6, [x18, x15]              // srcp[filterPos[1] + {0..7}]
+        ldr                 q7, [x16, x12]              // load 8x16-bit at filter+filterSize
+        ushll               v4.8h, v4.8b, #0            // unpack part 1 to 16-bit
+        smlal               v0.4s, v4.4h, v5.4h         // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
+        smlal2              v0.4s, v4.8h, v5.8h         // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
+        ldr                 d4, [x0, x15]               // srcp[filterPos[2] + {0..7}]
+        ldr                 q5, [x16, x13]              // load 8x16-bit at filter+2*filterSize
+        ushll               v6.8h, v6.8b, #0            // unpack part 2 to 16-bit
+        smlal               v2.4s, v6.4h, v7.4h         // v2 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+        ushll               v4.8h, v4.8b, #0            // unpack part 3 to 16-bit
+        smlal               v1.4s, v4.4h, v5.4h         // v1 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+        smlal2              v1.4s, v4.8h, v5.8h         // v1 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+        ldr                 d4, [x2, x15]               // srcp[filterPos[3] + {0..7}]
+        ldr                 q5, [x16, x14]              // load 8x16-bit at filter+3*filterSize
+        add                 x15, x15, #8                // j += 8
+        smlal2              v2.4s, v6.8h, v7.8h         // v2 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+        ushll               v4.8h, v4.8b, #0            // unpack part 4 to 16-bit
+        smlal               v3.4s, v4.4h, v5.4h         // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+        cmp                 x15, x9                     // j < filterSize
+        smlal2              v3.4s, v4.8h, v5.8h         // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+        add                 x16, x16, #16               // filter0 += 16
+        b.lt                2b                          // inner loop if filterSize not consumed completely
+        addp                v0.4s, v0.4s, v0.4s         // part1 horizontal pair adding
+        addp                v2.4s, v2.4s, v2.4s         // part2 horizontal pair adding
+        addp                v1.4s, v1.4s, v1.4s         // part3 horizontal pair adding
+        addp                v3.4s, v3.4s, v3.4s         // part4 horizontal pair adding
+        addp                v0.4s, v0.4s, v0.4s         // part1 horizontal pair adding
+        addp                v2.4s, v2.4s, v2.4s         // part2 horizontal pair adding
+        addp                v1.4s, v1.4s, v1.4s         // part3 horizontal pair adding
+        addp                v3.4s, v3.4s, v3.4s         // part4 horizontal pair adding
+        zip1                v0.4s, v0.4s, v2.4s         // part12 = zip values from part1 and part2
+        zip1                v1.4s, v1.4s, v3.4s         // part34 = zip values from part3 and part4
+        lsl                 x15, x8, #1                 // x15 = i*2, byte offset into dst
+        add                 x8, x8, #4                  // i += 4
+        mov                 v0.d[1], v1.d[0]            // part1234 = zip values from part12 and part34
+        cmp                 x8, x10                     // i < dstW
+        sqshrn              v0.4h, v0.4s, #7            // shift and clip the 4x16-bit final values
+        add                 x4, x4, x11                 // filter += filterSize*4
+        str                 d0, [x1, x15]               // write to destination part1234
+        b.lt                1b                          // loop until end of line
         ret
 endfunc
-- 
2.20.1
