[FFmpeg-cvslog] aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it

2017-04-04 Thread Martin Storsjö
ffmpeg | branch: master | Martin Storsjö | Tue Nov 22 15:47:17 2016 +0200| [2f99117f6ff24ce5be2abb9e014cb8b86c2aa0e0] | committer: Martin Storsjö

aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it

Signed-off-by: Martin Storsjö 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2f99117f6ff24ce5be2abb9e014cb8b86c2aa0e0
---

 libavcodec/aarch64/vp9itxfm_neon.S | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 2dc6b75..f4194a6 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -599,9 +599,9 @@ endfunc
 // x1 = unused
 // x2 = src
 // x3 = slice offset
+// x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
-mov x9, #32
 movi v2.8h, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 load_clear  \i,  x2,  x9
@@ -649,8 +649,8 @@ endfunc
 // x1 = dst stride
 // x2 = src (temp buffer)
 // x3 = slice offset
+// x9 = temp buffer stride
 function \txfm\()16_1d_8x16_pass2_neon
-mov x9, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 load \i,  x2,  x9
 .endr
@@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
 ld1 {v0.8h,v1.8h}, [x10]
 .endif
+mov x9, #32
 
 .irp i, 0, 8
 add x0,  sp,  #(\i*32)
@@ -882,13 +883,12 @@ endfunc
 // x0 = dst (temp buffer)
 // x1 = unused
 // x2 = src
+// x9 = double input stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass1_neon
 ld1 {v0.8h,v1.8h}, [x10]
 
-// Double stride of the input, since we only read every other line
-mov x9,  #128
 movi v4.8h, #0
 
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -987,12 +987,13 @@ endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass2_neon
 ld1 {v0.8h,v1.8h}, [x10]
 
-mov x9, #128
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 ld1 {v\i\().8h}, [x2], x9
@@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
 
 idct16
 
-mov x9,  #128
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 st1 {v\i\().8h}, [x2], x9
 .endr
@@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
 
 idct32_odd
 
-mov x9,  #128
 .macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
 ld1 {v4.8h},  [x2], x9
 ld1 {v5.8h},  [x2], x9
-.if \neg == 0
 add v4.8h, v4.8h, v\a\().8h
 ld1 {v6.8h},  [x2], x9
 add v5.8h, v5.8h, v\b\().8h
@@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
 add v6.8h, v6.8h, v\c\().8h
 add v7.8h, v7.8h, v\d\().8h
 .else
+ld1 {v4.8h},  [x2], x7
+ld1 {v5.8h},  [x2], x7
 sub v4.8h, v4.8h, v\a\().8h
-ld1 {v6.8h},  [x2], x9
+ld1 {v6.8h},  [x2], x7
 sub v5.8h, v5.8h, v\b\().8h
-ld1 {v7.8h},  [x2], x9
+ld1 {v7.8h},  [x2], x7
 sub v6.8h, v6.8h, v\c\().8h
 sub v7.8h, v7.8h, v\d\().8h
 .endif
@@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
 load_acc_store  23, 22, 21, 20
 load_acc_store  19, 18, 17, 16
 sub x2,  x2,  x9
-neg x9,  x9
 load_acc_store  16, 17, 18, 19, 1
 load_acc_store  20, 21, 22, 23, 1
 load_acc_store  24, 25, 26, 27, 1
@@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
 mov x5,  x1
 mov x6,  x2
 
+// Double stride of the input, since we only read every other line
+mov x9,  #128
+neg x7,  x9
+
 .irp i, 0, 8, 16, 24
 add x0,  sp,  #(\i*64)
 add x2,  x6,  #(\i*2)

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog


[FFmpeg-cvslog] aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it

2017-01-14 Thread Martin Storsjö
ffmpeg | branch: master | Martin Storsjö | Tue Jan 10 00:15:13 2017 +0200| [37cb224e3e65b92eb6d77f1a788d882fbee972c3] | committer: Michael Niedermayer

aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it

This is cherrypicked from libav commit
2f99117f6ff24ce5be2abb9e014cb8b86c2aa0e0.

Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=37cb224e3e65b92eb6d77f1a788d882fbee972c3
---

 libavcodec/aarch64/vp9itxfm_neon.S | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index d5165bf..e5fc612 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -599,9 +599,9 @@ endfunc
 // x1 = unused
 // x2 = src
 // x3 = slice offset
+// x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
-mov x9, #32
 movi v2.8h, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 load_clear  \i,  x2,  x9
@@ -649,8 +649,8 @@ endfunc
 // x1 = dst stride
 // x2 = src (temp buffer)
 // x3 = slice offset
+// x9 = temp buffer stride
 function \txfm\()16_1d_8x16_pass2_neon
-mov x9, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
 load \i,  x2,  x9
 .endr
@@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
 ld1 {v0.8h,v1.8h}, [x10]
 .endif
+mov x9, #32
 
 .irp i, 0, 8
 add x0,  sp,  #(\i*32)
@@ -882,13 +883,12 @@ endfunc
 // x0 = dst (temp buffer)
 // x1 = unused
 // x2 = src
+// x9 = double input stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass1_neon
 ld1 {v0.8h,v1.8h}, [x10]
 
-// Double stride of the input, since we only read every other line
-mov x9,  #128
 movi v4.8h, #0
 
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -987,12 +987,13 @@ endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass2_neon
 ld1 {v0.8h,v1.8h}, [x10]
 
-mov x9, #128
 // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 ld1 {v\i\().8h}, [x2], x9
@@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
 
 idct16
 
-mov x9,  #128
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 st1 {v\i\().8h}, [x2], x9
 .endr
@@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
 
 idct32_odd
 
-mov x9,  #128
 .macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
 ld1 {v4.8h},  [x2], x9
 ld1 {v5.8h},  [x2], x9
-.if \neg == 0
 add v4.8h, v4.8h, v\a\().8h
 ld1 {v6.8h},  [x2], x9
 add v5.8h, v5.8h, v\b\().8h
@@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
 add v6.8h, v6.8h, v\c\().8h
 add v7.8h, v7.8h, v\d\().8h
 .else
+ld1 {v4.8h},  [x2], x7
+ld1 {v5.8h},  [x2], x7
 sub v4.8h, v4.8h, v\a\().8h
-ld1 {v6.8h},  [x2], x9
+ld1 {v6.8h},  [x2], x7
 sub v5.8h, v5.8h, v\b\().8h
-ld1 {v7.8h},  [x2], x9
+ld1 {v7.8h},  [x2], x7
 sub v6.8h, v6.8h, v\c\().8h
 sub v7.8h, v7.8h, v\d\().8h
 .endif
@@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
 load_acc_store  23, 22, 21, 20
 load_acc_store  19, 18, 17, 16
 sub x2,  x2,  x9
-neg x9,  x9
 load_acc_store  16, 17, 18, 19, 1
 load_acc_store  20, 21, 22, 23, 1
 load_acc_store  24, 25, 26, 27, 1
@@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
 mov x5,  x1
 mov x6,  x2
 
+// Double stride of the input, since we only read every other line
+mov x9,  #128
+neg x7,  x9
+
 .irp i, 0, 8, 16, 24
 add x0,  sp,  #(\i*64)
 add x2,  x6,  #(\i*2)

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog