Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

2023-10-31 Thread Martin Storsjö

On Thu, 26 Oct 2023, Logan.Lyu wrote:

And I missed submitting a commit that was earlier than these four commits, 
which caused the corrupted whitespace problem. Now I have recreated these 
patches.


In addition, I rebased it to ensure that these patches can be successfully 
applied on the latest master branch.


Please check again, thank you.


Thanks, now these were possible to apply, and they looked mostly ok, so I 
touched up the last details I noticed and pushed them.


Things I noticed and fixed before pushing:

A bunch of minor cosmetics, you had minor misindentations in a few places 
(that were copypasted around in lots of places), that I fixed like this:


 ld1 {v18.16b}, [x1], x2
 .macro calc src0, src1, src2, src3
-ld1{\src3\().16b}, [x1], x2
+ld1 {\src3\().16b}, [x1], x2
 movi v4.8h, #0
 movi v5.8h, #0
 calc_epelb  v4, \src0, \src1, \src2, \src3
@@ -461,7 +461,7 @@ function ff_hevc_put_hevc_epel_v64_8_neon, export=1
 .endm
 1:  calc_all16
 .purgem calc
-2: ld1 {v8.8b-v11.8b}, [sp]
+2:  ld1 {v8.8b-v11.8b}, [sp]
 add sp, sp, #32
 ret

The first patch, with mostly small trivial functions, can probably be 
scheduled better for in-order cores. I'll send a patch if I can make them 
measurably faster.


In almost every patch, you have loads/stores to the stack; you use the 
fused stack decrement nicely everywhere possible, but for the loading, 
you're almost always lacking the fused stack increment. I've fixed it now 
for this patchset, but please do keep this in mind and fix it up before 
submitting any further patches. I've fixed that up like this:


 bl  X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm)
-ldp x5, x30, [sp]
 ldp x0, x3, [sp, #16]
-add sp, sp, #32
+ldp x5, x30, [sp], #32
 load_epel_filterh x5, x4

(In many places.)

In one place, you wrote below the stack pointer before decrementing it. 
That's ok on OSes with a defined red zone, but we shouldn't need to assume 
that; I've fixed that like this:


 function ff_hevc_put_hevc_qpel_v48_8_neon, export=1
-stp x5, x30, [sp, #-16]
-stp x0, x1, [sp, #-32]
 stp x2, x3, [sp, #-48]!
+stp x0, x1, [sp, #16]
+stp x5, x30, [sp, #32]

I'll push the patchset with these changes soon.


// Martin

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

2023-10-26 Thread Logan.Lyu

Hi,

I'm sorry that I missed the message from Michael on Oct 14th due to my 
negligence.


And I missed submitting a commit that was earlier than these four 
commits, which caused the corrupted whitespace problem. Now I have 
recreated these patches.


In addition, I rebased it to ensure that these patches can be 
successfully applied on the latest master branch.


Please check again, thank you.


在 2023/10/23 1:18, Martin Storsjö 写道:

On Sun, 22 Oct 2023, Logan.Lyu wrote:


Hi, Martin,

Could you please review these patches and let me know if there are 
any changes needed.


Did you see the message from Michael on Oct 14th? Your patches have 
corrupted whitespace and can't be applied. Earlier you've submitted 
some patches as attached files, and those have been possible to apply.


Secondly, I just pushed some indentation cleanup for aarch64 assembly 
yesterday. In case there are conflicts with your patches, please 
rebase your patches before attempting to resubmit them, so they apply 
cleanly.


// Martin
From 443447657b8ea8684ab2687789b7f77845c83f3f Mon Sep 17 00:00:00 2001
From: Logan Lyu 
Date: Thu, 26 Oct 2023 09:15:24 +0800
Subject: [PATCH 2/5] lavc/aarch64: new optimization for 8-bit hevc_epel_v

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker 
---
 libavcodec/aarch64/hevcdsp_epel_neon.S| 223 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S
index 0705213eed..363750ee7f 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1
 ret
 endfunc
 
+
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ldr s16, [x1]
+ldr s17, [x1 ,x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+ld1 {\src3\().s}[0], [x1], x2
+moviv4.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+subsw3, w3, #1
+st1 {v4.4h}, [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2 - 8)
+ldr d16, [x1]
+ldr d17, [x1, x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ld1 {\src3\().8b}, [x1], x2
+moviv4.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+st1 {v4.d}[0], [x0], #8
+subsw3, w3, #1
+st1 {v4.s}[2], [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ldr d16, [x1]
+ldr d17, [x1, x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ld1 {\src3\().8b}, [x1], x2
+moviv4.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+subsw3, w3, #1
+st1 {v4.8h}, [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ldr q16, [x1]
+ldr q17, [x1, x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ld1 {\src3\().16b}, [x1], x2
+moviv4.8h, #0
+moviv5.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+calc_epelb2 v5, \src0, \src1, \src2, \src3
+str q4, [x0]
+subsw3, w3, #1
+

Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

2023-10-22 Thread Martin Storsjö

On Sun, 22 Oct 2023, Logan.Lyu wrote:


Hi, Martin,

Could you please review these patches and let me know if there are any 
changes needed.


Did you see the message from Michael on Oct 14th? Your patches have 
corrupted whitespace and can't be applied. Earlier you've submitted some 
patches as attached files, and those have been possible to apply.


Secondly, I just pushed some indentation cleanup for aarch64 assembly 
yesterday. In case there are conflicts with your patches, please rebase 
your patches before attempting to resubmit them, so they apply cleanly.


// Martin

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

2023-10-22 Thread Logan.Lyu

Hi, Martin,

Could you please review these patches and let me know if there are any 
changes needed.


Thanks.


Logan Lyu

在 2023/10/14 16:45, Logan.Lyu 写道:

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_epel_neon.S    | 223 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S

index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, 
export=1

 ret
 endfunc
 +
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub x1, x1, x2
+    mov x10, #(MAX_PB_SIZE * 2)
+    ldr s16, [x1]
+    ldr s17, [x1 ,x2]
+    add x1, x1, x2, lsl #1
+    ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+    ld1 {\src3\().s}[0], [x1], x2
+    movi    v4.8h, #0
+    calc_epelb  v4, \src0, \src1, \src2, \src3
+    subs    w3, w3, #1
+    st1 {v4.4h}, [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub x1, x1, x2
+    mov x10, #(MAX_PB_SIZE * 2 - 8)
+    ldr d16, [x1]
+    ldr d17, [x1, x2]
+    add x1, x1, x2, lsl #1
+    ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+    ld1 {\src3\().8b}, [x1], x2
+    movi    v4.8h, #0
+    calc_epelb  v4, \src0, \src1, \src2, \src3
+    st1 {v4.d}[0], [x0], #8
+    subs    w3, w3, #1
+    st1 {v4.s}[2], [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub x1, x1, x2
+    mov x10, #(MAX_PB_SIZE * 2)
+    ldr d16, [x1]
+    ldr d17, [x1, x2]
+    add x1, x1, x2, lsl #1
+    ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+    ld1 {\src3\().8b}, [x1], x2
+    movi    v4.8h, #0
+    calc_epelb  v4, \src0, \src1, \src2, \src3
+    subs    w3, w3, #1
+    st1 {v4.8h}, [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub x1, x1, x2
+    mov x10, #(MAX_PB_SIZE * 2)
+    ldr q16, [x1]
+    ldr q17, [x1, x2]
+    add x1, x1, x2, lsl #1
+    ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+    ld1 {\src3\().16b}, [x1], x2
+    movi    v4.8h, #0
+    movi    v5.8h, #0
+    calc_epelb  v4, \src0, \src1, \src2, \src3
+    calc_epelb2 v5, \src0, \src1, \src2, \src3
+    str q4, [x0]
+    subs    w3, w3, #1
+    str d5, [x0, #16]
+    add x0, x0, x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub x1, x1, x2
+    mov x10, #(MAX_PB_SIZE * 2)
+    ldr q16, [x1]
+    ldr q17, [x1, x2]
+    add x1, x1, x2, lsl #1
+    ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+    ld1    {\src3\().16b}, [x1], x2
+    movi    v4.8h, #0
+    movi    v5.8h, #0
+    calc_epelb  v4, \src0, \src1, \src2, \src3
+    calc_epelb2 v5, \src0, \src1, \src2, \src3
+    subs    w3, w3, #1
+    st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+    load_epel_filterb x5, x4
+    sub x1, x1, x2
+    mov x10, 

Re: [FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

2023-10-14 Thread Michael Niedermayer
On Sat, Oct 14, 2023 at 04:45:39PM +0800, Logan.Lyu wrote:
[...]
> diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
> b/libavcodec/aarch64/hevcdsp_epel_neon.S
> index b4ca1e4c20..e541db5430 100644
> --- a/libavcodec/aarch64/hevcdsp_epel_neon.S
> +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
> @@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon,
> export=1
>  ret
>  endfunc
>  +
> +function ff_hevc_put_hevc_epel_v4_8_neon, export=1
> +load_epel_filterb x5, x4

This is not a valid diff; some whitespace and newlines here are not as
they should be.

thx


[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Rewriting code that is poorly written but fully understood is good.
Rewriting code that one doesn't understand is a sign that one is less smart
than the original author, trying to rewrite it will not make it better.


signature.asc
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/4] lavc/aarch64: new optimization for 8-bit hevc_epel_v

2023-10-14 Thread Logan.Lyu

checkasm bench:
put_hevc_epel_v4_8_c: 79.9
put_hevc_epel_v4_8_neon: 25.7
put_hevc_epel_v6_8_c: 151.4
put_hevc_epel_v6_8_neon: 46.4
put_hevc_epel_v8_8_c: 250.9
put_hevc_epel_v8_8_neon: 41.7
put_hevc_epel_v12_8_c: 542.7
put_hevc_epel_v12_8_neon: 108.7
put_hevc_epel_v16_8_c: 939.4
put_hevc_epel_v16_8_neon: 169.2
put_hevc_epel_v24_8_c: 2104.9
put_hevc_epel_v24_8_neon: 307.9
put_hevc_epel_v32_8_c: 3713.9
put_hevc_epel_v32_8_neon: 524.2
put_hevc_epel_v48_8_c: 8175.2
put_hevc_epel_v48_8_neon: 1197.2
put_hevc_epel_v64_8_c: 16049.4
put_hevc_epel_v64_8_neon: 2094.9

Co-Authored-By: J. Dekker 
Signed-off-by: Logan Lyu 
---
 libavcodec/aarch64/hevcdsp_epel_neon.S| 223 ++
 libavcodec/aarch64/hevcdsp_init_aarch64.c |   5 +
 2 files changed, 228 insertions(+)

diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S 
b/libavcodec/aarch64/hevcdsp_epel_neon.S

index b4ca1e4c20..e541db5430 100644
--- a/libavcodec/aarch64/hevcdsp_epel_neon.S
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -243,6 +243,229 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, 
export=1

 ret
 endfunc
 +
+function ff_hevc_put_hevc_epel_v4_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ldr s16, [x1]
+ldr s17, [x1 ,x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.s}[0], [x1], x2
+.macro calc src0, src1, src2, src3
+ld1 {\src3\().s}[0], [x1], x2
+moviv4.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+subsw3, w3, #1
+st1 {v4.4h}, [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v6_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2 - 8)
+ldr d16, [x1]
+ldr d17, [x1, x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ld1 {\src3\().8b}, [x1], x2
+moviv4.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+st1 {v4.d}[0], [x0], #8
+subsw3, w3, #1
+st1 {v4.s}[2], [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v8_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ldr d16, [x1]
+ldr d17, [x1, x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.8b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ld1 {\src3\().8b}, [x1], x2
+moviv4.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+subsw3, w3, #1
+st1 {v4.8h}, [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v12_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ldr q16, [x1]
+ldr q17, [x1, x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ld1 {\src3\().16b}, [x1], x2
+moviv4.8h, #0
+moviv5.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+calc_epelb2 v5, \src0, \src1, \src2, \src3
+str q4, [x0]
+subsw3, w3, #1
+str d5, [x0, #16]
+add x0, x0, x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v16_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ldr q16, [x1]
+ldr q17, [x1, x2]
+add x1, x1, x2, lsl #1
+ld1 {v18.16b}, [x1], x2
+.macro calc src0, src1, src2, src3
+ld1{\src3\().16b}, [x1], x2
+moviv4.8h, #0
+moviv5.8h, #0
+calc_epelb  v4, \src0, \src1, \src2, \src3
+calc_epelb2 v5, \src0, \src1, \src2, \src3
+subsw3, w3, #1
+st1 {v4.8h, v5.8h}, [x0], x10
+.endm
+1:  calc_all4
+.purgem calc
+2:  ret
+endfunc
+
+function ff_hevc_put_hevc_epel_v24_8_neon, export=1
+load_epel_filterb x5, x4
+sub x1, x1, x2
+mov x10, #(MAX_PB_SIZE * 2)
+ld1 {v16.8b, v17.8b, v18.8b}, [x1], x2
+ld1 {v19.8b, v20.8b, v21.8b}, [x1], x2
+ld1