On Mon, 26 Sep 2022, Grzegorz Bernacki wrote:
Provide an optimized NEON implementation of vsse_intra8 for arm64.
Benchmark results are shown below:
- vsse_5_c: 87.7
- vsse_5_neon: 26.2
Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.
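To reproduce, something along these lines should work in a configured FFmpeg
tree (a sketch: that the me_cmp functions live in the "motion" checkasm test
group, and the exact flags, are assumptions based on checkasm's usual
interface):

    make checkasm
    tests/checkasm/checkasm --test=motion --bench

With --bench, checkasm prints per-function timings like the vsse_5_c /
vsse_5_neon numbers quoted above.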
---
libavcodec/aarch64/me_cmp_init_aarch64.c | 4 ++
libavcodec/aarch64/me_cmp_neon.S | 53 ++++++++++++++++++++++++
2 files changed, 57 insertions(+)
diff --git a/libavcodec/aarch64/me_cmp_init_aarch64.c b/libavcodec/aarch64/me_cmp_init_aarch64.c
index f247372c94..defec37478 100644
--- a/libavcodec/aarch64/me_cmp_init_aarch64.c
+++ b/libavcodec/aarch64/me_cmp_init_aarch64.c
@@ -74,6 +74,9 @@ int nsse8_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
int vsse8_neon(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
+int vsse_intra8_neon(MpegEncContext *c, const uint8_t *s, const uint8_t *dummy,
+ ptrdiff_t stride, int h);
+
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
@@ -102,6 +105,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->vsse[1] = vsse8_neon;
c->vsse[4] = vsse_intra16_neon;
+ c->vsse[5] = vsse_intra8_neon;
c->nsse[0] = nsse16_neon_wrapper;
c->nsse[1] = nsse8_neon_wrapper;
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 386d2de0c5..82ff05d3f0 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1111,6 +1111,59 @@ function vsse_intra16_neon, export=1
ret
endfunc
+function vsse_intra8_neon, export=1
+ // x0 unused
+ // x1 uint8_t *pix1
+ // x2 uint8_t *dummy
+ // x3 ptrdiff_t stride
+ // w4 int h
+
+ ld1 {v0.8b}, [x1], x3
+ movi v16.4s, #0
+
+ sub w4, w4, #1 // we need h-1 iterations
+ cmp w4, #3
+ b.lt 2f
+
+1:
+ // v = abs( pix1[0] - pix1[0 + stride] )
+ // score = sum( v * v )
+ ld1 {v1.8b}, [x1], x3
+ ld1 {v2.8b}, [x1], x3
+ uabd v30.8b, v0.8b, v1.8b
+ ld1 {v3.8b}, [x1], x3
+ umull v29.8h, v30.8b, v30.8b
+ uabd v27.8b, v1.8b, v2.8b
+ uadalp v16.4s, v29.8h
The scheduling here can be improved; please see the attached patch.
Other than that, it looks reasonable.
// Martin
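For reference, here is a scalar model of what vsse_intra8 computes,
reconstructed from the comments in the patch above (a sketch: the
vsse_intra8_ref name is mine, and the canonical C reference lives in
libavcodec/me_cmp.c):

#include <stddef.h>
#include <stdint.h>

static int vsse_intra8_ref(const uint8_t *pix, ptrdiff_t stride, int h)
{
    int score = 0;
    // Sum of squared differences between each pixel and the pixel
    // directly below it: 8 columns, h-1 row pairs.
    for (int y = 0; y < h - 1; y++) {
        for (int x = 0; x < 8; x++) {
            int d = pix[x] - pix[x + stride];
            score += d * d;
        }
        pix += stride;
    }
    return score;
}

The NEON loop implements exactly this, three row pairs per iteration:
uabd for the absolute difference, umull to square it into 16-bit lanes,
and uadalp to accumulate pairwise into the 32-bit sums in v16.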
From d0345bceaf013bea2023b1a02b372f2a64c6efaf Mon Sep 17 00:00:00 2001
From: Martin Storsjö <mar...@martin.st>
Date: Wed, 28 Sep 2022 11:53:55 +0300
Subject: [PATCH] aarch64: me_cmp: Improve scheduling in vsse_intra8
Before:        Cortex A53     A72     A73
vsse_5_neon:         74.7    31.5    26.0
After:
vsse_5_neon:         62.7    32.5    25.7
---
libavcodec/aarch64/me_cmp_neon.S | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/libavcodec/aarch64/me_cmp_neon.S b/libavcodec/aarch64/me_cmp_neon.S
index 4037953488..dc0b1e5f43 100644
--- a/libavcodec/aarch64/me_cmp_neon.S
+++ b/libavcodec/aarch64/me_cmp_neon.S
@@ -1113,11 +1113,11 @@ function vsse_intra8_neon, export=1
// x3 ptrdiff_t stride
// w4 int h
+ sub w4, w4, #1 // we need h-1 iterations
ld1 {v0.8b}, [x1], x3
+ cmp w4, #3
movi v16.4s, #0
- sub w4, w4, #1 // we need h-1 iterations
- cmp w4, #3
b.lt 2f
1:
@@ -1127,13 +1127,13 @@ function vsse_intra8_neon, export=1
ld1 {v2.8b}, [x1], x3
uabd v30.8b, v0.8b, v1.8b
ld1 {v3.8b}, [x1], x3
- umull v29.8h, v30.8b, v30.8b
uabd v27.8b, v1.8b, v2.8b
- uadalp v16.4s, v29.8h
- umull v26.8h, v27.8b, v27.8b
+ umull v29.8h, v30.8b, v30.8b
uabd v25.8b, v2.8b, v3.8b
- uadalp v16.4s, v26.8h
+ umull v26.8h, v27.8b, v27.8b
+ uadalp v16.4s, v29.8h
umull v24.8h, v25.8b, v25.8b
+ uadalp v16.4s, v26.8h
sub w4, w4, #3
uadalp v16.4s, v24.8h
cmp w4, #3
--
2.25.1
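A note on why the reordering helps: each uabd -> umull -> uadalp sequence
is a dependency chain, and on an in-order core like the Cortex-A53 an
instruction whose input is not yet ready stalls the pipeline. The patch
interleaves the three chains so that each instruction sits further from the
one producing its input, which is consistent with the numbers above: the
A53 improves by roughly 16% while the out-of-order A72 and A73 are
essentially unchanged.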