On Mon, Nov 25, 2019 at 11:20 PM Jean-Baptiste Kempf <j...@videolan.org> wrote:
> > Is there a coding rule in ffmpeg that restricts the use of intrinsics?
>
> Yes. See doc/optimization.txt.
> Use external asm (nasm/yasm) or inline asm (__asm__()), do not use intrinsics.
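(To make sure I read the rule right: the same widening multiply-accumulate
can be written in either style. A minimal sketch follows; the helper names
are mine and the arm_neon.h types are only there to keep the snippet short,
so this is not FFmpeg code:

    #include <arm_neon.h>

    /* Intrinsics style: the form doc/optimization.txt disallows. */
    static inline int32x4_t mac_intrinsics(int32x4_t acc, int16x4_t a, int16x4_t b)
    {
        return vmlal_s16(acc, a, b);        /* acc += widen(a) * widen(b) */
    }

    /* Inline asm via __asm__(): one of the accepted forms (external
     * nasm/yasm asm being the x86 counterpart; aarch64 uses GAS-syntax
     * .S files such as hscale.S). */
    static inline int32x4_t mac_inline_asm(int32x4_t acc, int16x4_t a, int16x4_t b)
    {
        __asm__ ("smlal %0.4s, %1.4h, %2.4h"
                 : "+w" (acc)
                 : "w" (a), "w" (b));
        return acc;
    }
)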
Thanks for the pointer.

> Also, here, you're changing some existing code, please improve the code and
> do not duplicate code.
>
> > If that is the case, I can adapt my code to the existing asm code.
>
> Please.

Please find attached a patch that improves the existing code in
aarch64/hscale.S.

Performance testing with gcc and clang shows that the patch improves
performance by 34% on Graviton A1 instances:

$ ffmpeg -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -
before: t:0.040303 avg:0.040287 max:0.040371 min:0.039214
after:  t:0.030079 avg:0.030102 max:0.030462 min:0.030051

Tested with `make check` on aarch64-linux.

Please let me know if I can make the patch better.

Thank you,
Sebastian
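PS: for reviewers who want the exact semantics the NEON path has to
preserve, the scalar reference is hScale8To15_c in libswscale/swscale.c.
From memory it is roughly the following sketch (check the tree for the
authoritative version; FFMIN comes from libavutil/common.h):

    static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW,
                              const uint8_t *src, const int16_t *filter,
                              const int32_t *filterPos, int filterSize)
    {
        int i;
        for (i = 0; i < dstW; i++) {
            int j;
            int srcPos = filterPos[i];
            int val    = 0;
            /* dst[i] = clip((sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7) */
            for (j = 0; j < filterSize; j++)
                val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
            dst[i] = FFMIN(val >> 7, (1 << 15) - 1);    /* 15-bit output */
        }
    }

The asm (both the old and the new version) obtains the clip from sqshrn's
signed saturation instead of the explicit FFMIN.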
From e04f9606f7ea581d8398eb2f37df2f59add8b374 Mon Sep 17 00:00:00 2001
From: Sebastian Pop <spop@amazon.com>
Date: Sun, 17 Nov 2019 14:13:13 -0600
Subject: [PATCH] [aarch64] use FMA and increase vector factor to 4

This patch implements ff_hscale_8_to_15_neon with NEON fused multiply
accumulate and bumps the vectorization factor from 2 to 4.
The speedup is 34% on Graviton A1 instances based on Cortex-A72 CPUs:

$ ffmpeg -nostats -f lavfi -i testsrc2=4k:d=2 -vf bench=start,scale=1024x1024,bench=stop -f null -
before: t:0.040303 avg:0.040287 max:0.040371 min:0.039214
after:  t:0.030079 avg:0.030102 max:0.030462 min:0.030051

Tested with `make check` on aarch64-linux.
---
 libswscale/aarch64/hscale.S | 176 +++++++++++++++++++++++++++++-------
 1 file changed, 142 insertions(+), 34 deletions(-)

diff --git a/libswscale/aarch64/hscale.S b/libswscale/aarch64/hscale.S
index cc78c1901d..fb4de4d9a8 100644
--- a/libswscale/aarch64/hscale.S
+++ b/libswscale/aarch64/hscale.S
@@ -20,40 +20,148 @@
 
 #include "libavutil/aarch64/asm.S"
 
+/* #include "arm_neon.h"
+
+void ff_hscale_8_to_15_neon(SwsContext *c, int16_t *dst, int dstW,
+                            const uint8_t *src, const int16_t *filter,
+                            const int32_t *filterPos, int filterSize);
+
+static inline int32x4_t ff_reduce(int32x4_t x) {
+    x = vpaddq_s32(x, x);
+    x = vpaddq_s32(x, x);
+    return x;
+}
+
+static inline int32x4_t ff_hscale(int32x4_t x, const uint8_t *src,
+                                  const int16_t *filter) {
+    uint8x8_t a = vld1_u8(src);
+    int16x8_t s = vreinterpretq_s16_u16(vshll_n_u8(a, 0));
+    int16x8_t f = vld1q_s16(filter);
+    x = vmlal_s16(x, vget_low_s16(s), vget_low_s16(f));
+    x = vmlal_high_s16(x, s, f);
+    return x;
+}
+
+static inline int32x4_t ff_zip64(int32x4_t a, int32x4_t b) {
+    int64x2_t x0 = vreinterpretq_s64_s32(a);
+    int64x2_t x1 = vreinterpretq_s64_s32(b);
+    int64x2_t x2 = vzip1q_s64(x0, x1);
+    int32x4_t x3 = vreinterpretq_s32_s64(x2);
+    return x3;
+}
+
+static void ff_hscale_8_to_15_neon_1(SwsContext *c, int16_t *dst, int dstW,
+                                     const uint8_t *src, const int16_t *filter,
+                                     const int32_t *filterPos, int filterSize)
+{
+    int i;
+    if (dstW <= 0 || filterSize <= 0)
+        return;
+
+    for (i = 0; i < dstW; i += 4) {
+        int j;
+        int32x4_t x0 = vdupq_n_s32(0);
+        int32x4_t x1 = vdupq_n_s32(0);
+        int32x4_t x2 = vdupq_n_s32(0);
+        int32x4_t x3 = vdupq_n_s32(0);
+        int32x4_t x4, x5, x6;
+        int16x4_t x7;
+        const uint8_t *src0 = src + filterPos[i];
+        const uint8_t *src1 = src + filterPos[i + 1];
+        const uint8_t *src2 = src + filterPos[i + 2];
+        const uint8_t *src3 = src + filterPos[i + 3];
+        const int16_t *filter0 = filter;
+        const int16_t *filter1 = filter0 + filterSize;
+        const int16_t *filter2 = filter1 + filterSize;
+        const int16_t *filter3 = filter2 + filterSize;
+        filter = filter3 + filterSize;
+        for (j = 0; j < filterSize; j += 8) {
+            x0 = ff_hscale(x0, src0 + j, filter0 + j);
+            x1 = ff_hscale(x1, src1 + j, filter1 + j);
+            x2 = ff_hscale(x2, src2 + j, filter2 + j);
+            x3 = ff_hscale(x3, src3 + j, filter3 + j);
+        }
+        x0 = ff_reduce(x0);
+        x1 = ff_reduce(x1);
+        x2 = ff_reduce(x2);
+        x3 = ff_reduce(x3);
+
+        x4 = vzip1q_s32(x0, x1);
+        x5 = vzip1q_s32(x2, x3);
+        x6 = ff_zip64(x4, x5);
+        x7 = vqshrn_n_s32(x6, 7);
+        vst1_s16(&dst[i], x7);
+    }
+} */
+
 function ff_hscale_8_to_15_neon, export=1
-        add             x10, x4, w6, UXTW #1        // filter2 = filter + filterSize*2 (x2 because int16)
-1:      ldr             w8, [x5], #4                // filterPos[0]
-        ldr             w9, [x5], #4                // filterPos[1]
-        movi            v4.4S, #0                   // val sum part 1 (for dst[0])
-        movi            v5.4S, #0                   // val sum part 2 (for dst[1])
-        mov             w7, w6                      // filterSize counter
-        mov             x13, x3                     // srcp = src
-2:      add             x11, x13, w8, UXTW          // srcp + filterPos[0]
-        add             x12, x13, w9, UXTW          // srcp + filterPos[1]
-        ld1             {v0.8B}, [x11]              // srcp[filterPos[0] + {0..7}]
-        ld1             {v1.8B}, [x12]              // srcp[filterPos[1] + {0..7}]
-        ld1             {v2.8H}, [x4], #16          // load 8x16-bit filter values, part 1
-        ld1             {v3.8H}, [x10], #16         // ditto at filter+filterSize for part 2
-        uxtl            v0.8H, v0.8B                // unpack part 1 to 16-bit
-        uxtl            v1.8H, v1.8B                // unpack part 2 to 16-bit
-        smull           v16.4S, v0.4H, v2.4H        // v16.i32{0..3} = part 1 of: srcp[filterPos[0] + {0..7}] * filter[{0..7}]
-        smull           v18.4S, v1.4H, v3.4H        // v18.i32{0..3} = part 1 of: srcp[filterPos[1] + {0..7}] * filter[{0..7}]
-        smull2          v17.4S, v0.8H, v2.8H        // v17.i32{0..3} = part 2 of: srcp[filterPos[0] + {0..7}] * filter[{0..7}]
-        smull2          v19.4S, v1.8H, v3.8H        // v19.i32{0..3} = part 2 of: srcp[filterPos[1] + {0..7}] * filter[{0..7}]
-        addp            v16.4S, v16.4S, v17.4S      // horizontal pair adding of the 8x32-bit multiplied values for part 1 into 4x32-bit
-        addp            v18.4S, v18.4S, v19.4S      // horizontal pair adding of the 8x32-bit multiplied values for part 2 into 4x32-bit
-        add             v4.4S, v4.4S, v16.4S        // update val accumulator for part 1
-        add             v5.4S, v5.4S, v18.4S        // update val accumulator for part 2
-        add             x13, x13, #8                // srcp += 8
-        subs            w7, w7, #8                  // processed 8/filterSize
-        b.gt            2b                          // inner loop if filterSize not consumed completely
-        mov             x4, x10                     // filter = filter2
-        add             x10, x10, w6, UXTW #1       // filter2 += filterSize*2
-        addp            v4.4S, v4.4S, v5.4S         // horizontal pair adding of the 8x32-bit sums into 4x32-bit
-        addp            v4.4S, v4.4S, v4.4S         // horizontal pair adding of the 4x32-bit sums into 2x32-bit
-        sqshrn          v4.4H, v4.4S, #7            // shift and clip the 2x16-bit final values
-        st1             {v4.S}[0], [x1], #4         // write to destination
-        subs            w2, w2, #2                  // dstW -= 2
-        b.gt            1b                          // loop until end of line
+        sxtw            x9, w6                      // x9 = filterSize
+        sbfiz           x12, x6, #1, #32            // x12 = filterSize*2 (byte offset of filter1)
+        add             x14, x12, x9                // x14 = filterSize*3
+        mov             x8, xzr                     // i = 0
+        sxtw            x10, w2                     // x10 = dstW
+        sbfiz           x11, x6, #3, #32            // x11 = filterSize*8 (bytes in 4 filter rows)
+        sbfiz           x13, x6, #2, #32            // x13 = filterSize*4 (byte offset of filter2)
+        lsl             x14, x14, #1                // x14 = filterSize*6 (byte offset of filter3)
+1:      lsl             x17, x8, #2                 // x17 = i*4, byte offset of filterPos[i]
+        ldrsw           x18, [x5, x17]              // filterPos[0]
+        orr             x0, x17, #0x4               // byte offset of filterPos[i+1] (i*4 has low bits clear)
+        orr             x2, x17, #0x8               // byte offset of filterPos[i+2]
+        orr             x17, x17, #0xc              // byte offset of filterPos[i+3]
+        ldrsw           x0, [x5, x0]                // filterPos[1]
+        ldrsw           x2, [x5, x2]                // filterPos[2]
+        ldrsw           x6, [x5, x17]               // filterPos[3]
+        mov             x15, xzr                    // j = 0
+        mov             x16, x4                     // filter0 = filter
+        movi            v0.2d, #0                   // val sum part 1 (for dst[0])
+        movi            v2.2d, #0                   // val sum part 2 (for dst[1])
+        movi            v1.2d, #0                   // val sum part 3 (for dst[2])
+        movi            v3.2d, #0                   // val sum part 4 (for dst[3])
+        add             x17, x3, x18                // srcp + filterPos[0]
+        add             x18, x3, x0                 // srcp + filterPos[1]
+        add             x0, x3, x2                  // srcp + filterPos[2]
+        add             x2, x3, x6                  // srcp + filterPos[3]
+2:      ldr             d4, [x17, x15]              // srcp[filterPos[0] + {0..7}]
+        ldr             q5, [x16]                   // load 8x16-bit filter values, part 1
+        ldr             d6, [x18, x15]              // srcp[filterPos[1] + {0..7}]
+        ldr             q7, [x16, x12]              // load 8x16-bit at filter+filterSize
+        ushll           v4.8h, v4.8b, #0            // unpack part 1 to 16-bit
+        smlal           v0.4s, v4.4h, v5.4h         // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
+        smlal2          v0.4s, v4.8h, v5.8h         // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
+        ldr             d4, [x0, x15]               // srcp[filterPos[2] + {0..7}]
+        ldr             q5, [x16, x13]              // load 8x16-bit at filter+2*filterSize
+        ushll           v6.8h, v6.8b, #0            // unpack part 2 to 16-bit
+        smlal           v2.4s, v6.4h, v7.4h         // v2 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
+        ushll           v4.8h, v4.8b, #0            // unpack part 3 to 16-bit
+        smlal           v1.4s, v4.4h, v5.4h         // v1 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
+        smlal2          v1.4s, v4.8h, v5.8h         // v1 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
+        ldr             d4, [x2, x15]               // srcp[filterPos[3] + {0..7}]
+        ldr             q5, [x16, x14]              // load 8x16-bit at filter+3*filterSize
+        add             x15, x15, #8                // j += 8
+        smlal2          v2.4s, v6.8h, v7.8h         // v2 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
+        ushll           v4.8h, v4.8b, #0            // unpack part 4 to 16-bit
+        smlal           v3.4s, v4.4h, v5.4h         // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
+        cmp             x15, x9                     // j < filterSize
+        smlal2          v3.4s, v4.8h, v5.8h         // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
+        add             x16, x16, #16               // filter0 += 16
+        b.lt            2b                          // inner loop if filterSize not consumed completely
+        addp            v0.4s, v0.4s, v0.4s         // part1 horizontal pair adding
+        addp            v2.4s, v2.4s, v2.4s         // part2 horizontal pair adding
+        addp            v1.4s, v1.4s, v1.4s         // part3 horizontal pair adding
+        addp            v3.4s, v3.4s, v3.4s         // part4 horizontal pair adding
+        addp            v0.4s, v0.4s, v0.4s         // part1 horizontal pair adding
+        addp            v2.4s, v2.4s, v2.4s         // part2 horizontal pair adding
+        addp            v1.4s, v1.4s, v1.4s         // part3 horizontal pair adding
+        addp            v3.4s, v3.4s, v3.4s         // part4 horizontal pair adding
+        zip1            v0.4s, v0.4s, v2.4s         // part12 = zip values from part1 and part2
+        zip1            v1.4s, v1.4s, v3.4s         // part34 = zip values from part3 and part4
+        lsl             x15, x8, #1                 // x15 = i*2, byte offset into dst
+        add             x8, x8, #4                  // i += 4
+        mov             v0.d[1], v1.d[0]            // part1234 = zip values from part12 and part34
+        cmp             x8, x10                     // i < dstW
+        sqshrn          v0.4h, v0.4s, #7            // shift and clip the 4x16-bit final values
+        add             x4, x4, x11                 // filter += filterSize*4
+        str             d0, [x1, x15]               // write to destination part1234
+        b.lt            1b                          // loop until end of line
         ret
 endfunc
-- 
2.20.1
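One assumption worth calling out explicitly: the unrolled kernel produces
4 output pixels per outer-loop iteration and consumes 8 filter taps per
inner-loop iteration, so it relies on dstW % 4 == 0 and filterSize % 8 == 0.
A hypothetical init-time guard could look like the sketch below; this is
illustrative only, not the actual swscale init code, and the hyScale field
and hScale8To15_c names are taken from swscale_internal.h/swscale.c as I
remember them:

    #include "libavutil/aarch64/cpu.h"   /* av_get_cpu_flags(), have_neon() */

    /* Hypothetical dispatch sketch, NOT the actual libswscale init code:
     * select the NEON kernel only when its divisibility assumptions hold,
     * otherwise keep the scalar fallback. */
    static void choose_hscale(SwsContext *c)
    {
        int cpu_flags = av_get_cpu_flags();

        c->hyScale = hScale8To15_c;                  /* scalar fallback */
        if (have_neon(cpu_flags) &&
            c->dstW % 4 == 0 &&
            c->hLumFilterSize % 8 == 0)
            c->hyScale = ff_hscale_8_to_15_neon;     /* 4-wide NEON path */
    }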
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".