Re: [FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-12-03 Thread Reimar Döffinger
On 03.12.2014, at 01:40, Michael Niedermayer michae...@gmx.at wrote:
 On Sat, Nov 22, 2014 at 02:09:01PM +0100, Reimar Döffinger wrote:
 On Mon, Nov 17, 2014 at 01:41:13PM +0100, Michael Niedermayer wrote:
 On Mon, Nov 17, 2014 at 08:19:32AM +0100, Reimar Döffinger wrote:
 On 17.11.2014, at 02:37, Michael Niedermayer michae...@gmx.at wrote:
 On Sat, Nov 15, 2014 at 06:16:03PM +0100, Reimar Döffinger wrote:
 11674 - 10877 decicycles on my Phenom II.
 Overall speedup was unfortunately within measurement error.
 
 here its  10153 -10135
 
 I suspect it also depends a bit on the compiler and how it changes the 
 surrounding code.
 Note that I also tested with PIC actually.
 
 but I've a slightly odd feeling about the changes to the asm code,
 I am not sure if all assemblers will be happy about the changed
 code
 
 Do you mean particularly the movzbl change?
 
 yes and the k stuff
 
 
 I am also unsure about that, I think there was a reason for that %k6 
 mess...
 But this as well as movzx seemed to work for me...
 
 it works here too, I just have the feeling it might fail on some odd
 assembler or platform. That's not meant to keep you from pushing this,
 just that it might need to be reverted or fixed if such
 problems actually occur
 
 I pushed it.
 If anyone sees issues please tell me and I'll look into it!
 
 I think these FATE failures are caused by it, but that's based just
 on the other commits in the range looking unlikely:
 
 http://fate.ffmpeg.org/report.cgi?time=20141122231657slot=x86_64-darwin-clang-3.5-O3
 http://fate.ffmpeg.org/report.cgi?time=2014113720slot=x86_64-darwin-clang-3.5

That's annoying, I only expected compile errors; this looks more like a
compiler bug.
Can someone run tests?
Does just using the "m" instead of the "r" constraint, as on 32 bit, fix it?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-12-03 Thread Michael Niedermayer
On Wed, Dec 03, 2014 at 09:00:39AM +0100, Reimar Döffinger wrote:
 On 03.12.2014, at 01:40, Michael Niedermayer michae...@gmx.at wrote:
  On Sat, Nov 22, 2014 at 02:09:01PM +0100, Reimar Döffinger wrote:
  On Mon, Nov 17, 2014 at 01:41:13PM +0100, Michael Niedermayer wrote:
  On Mon, Nov 17, 2014 at 08:19:32AM +0100, Reimar Döffinger wrote:
  On 17.11.2014, at 02:37, Michael Niedermayer michae...@gmx.at wrote:
  On Sat, Nov 15, 2014 at 06:16:03PM +0100, Reimar Döffinger wrote:
  11674 - 10877 decicycles on my Phenom II.
  Overall speedup was unfortunately within measurement error.
  
  here its  10153 -10135
  
  I suspect it also depends a bit on the compiler and how it changes the 
  surrounding code.
  Note that I also tested with PIC actually.
  
  but ive a slightly odd feeling about the chnages to the asm code,
  iam not sure if all assemblers will be happy about the changed
  code
  
  Do you mean particularly the movzbl change?
  
  yes and the k stuff
  
  
  I am also unsure about that, I think there was a reason for that %k6 
  mess...
  But this as well as movzx seemed to work for me...
  
  it works here too i just have the feeling it might fail on some odd
  assembler or platform. Thats not meant to keep you from pushing this
  just that it might require to be reverted or fixed if such
  problems actually occor
  
  I pushed it.
  If anyone sees issues please tell me and I'll look into it!
  
  i think these fate failures are caused by it but thats based just
  on other commits in the range looking unlikely:
  
  http://fate.ffmpeg.org/report.cgi?time=20141122231657slot=x86_64-darwin-clang-3.5-O3
  http://fate.ffmpeg.org/report.cgi?time=2014113720slot=x86_64-darwin-clang-3.5
 
 That's annoying, I only expected compile errors, this looks more like a 
 compiler bug.
 Can someone run tests?
 Does just using the m instead of r constraint like on 32 bit fix it?

still aborts with:

@@ -37,7 +37,7 @@
 #if HAVE_INLINE_ASM

 #if ARCH_X86_64
-#define REG64 r
+#define REG64 m
 #else
 #define REG64 m
 #endif

ggdb does not show much that is useful:
Program received signal SIGABRT, Aborted.
0x7fff82a31866 in ?? ()
(gdb) bt
#0  0x7fff82a31866 in ?? ()
#1  0x7fff8ec4735c in ?? ()
warning: (Internal error: pc 0x0 in read in psymtab, but not in symtab.)

#2  0x in ?? ()
(gdb) disassemble $rip-32,$rip+32
Dump of assembler code from 0x7fff82a31846 to 0x7fff82a31886:
   0x7fff82a31846:  add    %eax,(%rax)
   0x7fff82a31848:  add    -0x77(%rcx),%cl
   0x7fff82a3184b:  lret   $0x50f
   0x7fff82a3184e:  jae    0x7fff82a31858
   0x7fff82a31850:  mov    %rax,%rdi
   0x7fff82a31853:  jmpq   0x7fff82a2e175
   0x7fff82a31858:  retq
   0x7fff82a31859:  nop
   0x7fff82a3185a:  nop
   0x7fff82a3185b:  nop
   0x7fff82a3185c:  mov    $0x2000148,%eax
   0x7fff82a31861:  mov    %rcx,%r10
   0x7fff82a31864:  syscall
=> 0x7fff82a31866:  jae    0x7fff82a31870
   0x7fff82a31868:  mov    %rax,%rdi
   0x7fff82a3186b:  jmpq   0x7fff82a2e175
   0x7fff82a31870:  retq
   0x7fff82a31871:  nop
   0x7fff82a31872:  nop
   0x7fff82a31873:  nop
   0x7fff82a31874:  mov    $0x200014c,%eax
   0x7fff82a31879:  mov    %rcx,%r10
   0x7fff82a3187c:  syscall
   0x7fff82a3187e:  jae    0x7fff82a31888
   0x7fff82a31880:  mov    %rax,%rdi
   0x7fff82a31883:  jmpq   0x7fff82a2e175



-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Old school: Use the lowest level language in which you can solve the problem
conveniently.
New school: Use the highest level language in which the latest supercomputer
can solve the problem without the user falling asleep waiting.


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-12-03 Thread Reimar Döffinger
On Wed, Dec 03, 2014 at 01:19:48PM +0100, Michael Niedermayer wrote:
 On Wed, Dec 03, 2014 at 09:00:39AM +0100, Reimar Döffinger wrote:
  On 03.12.2014, at 01:40, Michael Niedermayer michae...@gmx.at wrote:
   On Sat, Nov 22, 2014 at 02:09:01PM +0100, Reimar Döffinger wrote:
   On Mon, Nov 17, 2014 at 01:41:13PM +0100, Michael Niedermayer wrote:
   On Mon, Nov 17, 2014 at 08:19:32AM +0100, Reimar Döffinger wrote:
   On 17.11.2014, at 02:37, Michael Niedermayer michae...@gmx.at wrote:
   On Sat, Nov 15, 2014 at 06:16:03PM +0100, Reimar Döffinger wrote:
   11674 - 10877 decicycles on my Phenom II.
   Overall speedup was unfortunately within measurement error.
   
   here its  10153 -10135
   
   I suspect it also depends a bit on the compiler and how it changes the 
   surrounding code.
   Note that I also tested with PIC actually.
   
   but ive a slightly odd feeling about the chnages to the asm code,
   iam not sure if all assemblers will be happy about the changed
   code
   
   Do you mean particularly the movzbl change?
   
   yes and the k stuff
   
   
   I am also unsure about that, I think there was a reason for that %k6 
   mess...
   But this as well as movzx seemed to work for me...
   
   it works here too i just have the feeling it might fail on some odd
   assembler or platform. Thats not meant to keep you from pushing this
   just that it might require to be reverted or fixed if such
   problems actually occor
   
   I pushed it.
   If anyone sees issues please tell me and I'll look into it!
   
   i think these fate failures are caused by it but thats based just
   on other commits in the range looking unlikely:
   
   http://fate.ffmpeg.org/report.cgi?time=20141122231657slot=x86_64-darwin-clang-3.5-O3
   http://fate.ffmpeg.org/report.cgi?time=2014113720slot=x86_64-darwin-clang-3.5
  
  That's annoying, I only expected compile errors, this looks more like a 
  compiler bug.
  Can someone run tests?
  Does just using the m instead of r constraint like on 32 bit fix it?
 
 still aborts with:

Oh dear.
On re-reading the code it seems I got a bit confused about what %0 actually
points to (I somehow thought it actually pointed to the on-stack x86_reg).
I can't test and benchmark today, but I think this one might fix it:
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -178,7 +178,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
 
 mov %2, %0 \n\t
 mov %1, %6 \n\t
-mov %6, (%0)   \n\t
+mov %k6, (%0)  \n\t
 
 test $1, %4\n\t
  jnz 5f\n\t
@@ -191,7 +191,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
 cmp $63, %6\n\t
  jb 3b \n\t
 mov %2, %0 \n\t
-mov %6, (%0)   \n\t
+mov %k6, (%0)  \n\t
 5: \n\t
 addl %8, %k0   \n\t
 shr $2, %k0\n\t

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-12-03 Thread Michael Niedermayer
On Wed, Dec 03, 2014 at 10:39:00PM +0100, Reimar Döffinger wrote:
 On Wed, Dec 03, 2014 at 01:19:48PM +0100, Michael Niedermayer wrote:
  On Wed, Dec 03, 2014 at 09:00:39AM +0100, Reimar Döffinger wrote:
   On 03.12.2014, at 01:40, Michael Niedermayer michae...@gmx.at wrote:
On Sat, Nov 22, 2014 at 02:09:01PM +0100, Reimar Döffinger wrote:
On Mon, Nov 17, 2014 at 01:41:13PM +0100, Michael Niedermayer wrote:
On Mon, Nov 17, 2014 at 08:19:32AM +0100, Reimar Döffinger wrote:
On 17.11.2014, at 02:37, Michael Niedermayer michae...@gmx.at 
wrote:
On Sat, Nov 15, 2014 at 06:16:03PM +0100, Reimar Döffinger wrote:
11674 - 10877 decicycles on my Phenom II.
Overall speedup was unfortunately within measurement error.

here its  10153 -10135

I suspect it also depends a bit on the compiler and how it changes 
the surrounding code.
Note that I also tested with PIC actually.

but ive a slightly odd feeling about the chnages to the asm code,
iam not sure if all assemblers will be happy about the changed
code

Do you mean particularly the movzbl change?

yes and the k stuff


I am also unsure about that, I think there was a reason for that %k6 
mess...
But this as well as movzx seemed to work for me...

it works here too i just have the feeling it might fail on some odd
assembler or platform. Thats not meant to keep you from pushing this
just that it might require to be reverted or fixed if such
problems actually occor

I pushed it.
If anyone sees issues please tell me and I'll look into it!

i think these fate failures are caused by it but thats based just
on other commits in the range looking unlikely:

http://fate.ffmpeg.org/report.cgi?time=20141122231657slot=x86_64-darwin-clang-3.5-O3
http://fate.ffmpeg.org/report.cgi?time=2014113720slot=x86_64-darwin-clang-3.5
   
   That's annoying, I only expected compile errors, this looks more like a 
   compiler bug.
   Can someone run tests?
   Does just using the m instead of r constraint like on 32 bit fix it?
  
  still aborts with:
 
 Oh dear.
 On re-reading the code it seems I got a bit confused on what %0 actually
 points to (I somehow thought it actually pointed to the on-stack x86_reg).
 I can't test and benchmark today, but I think this one might fix it:

applied

thanks

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

He who knows, does not speak. He who speaks, does not know. -- Lao Tsu


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-12-02 Thread Michael Niedermayer
On Sat, Nov 22, 2014 at 02:09:01PM +0100, Reimar Döffinger wrote:
 On Mon, Nov 17, 2014 at 01:41:13PM +0100, Michael Niedermayer wrote:
  On Mon, Nov 17, 2014 at 08:19:32AM +0100, Reimar Döffinger wrote:
   On 17.11.2014, at 02:37, Michael Niedermayer michae...@gmx.at wrote:
On Sat, Nov 15, 2014 at 06:16:03PM +0100, Reimar Döffinger wrote:
11674 - 10877 decicycles on my Phenom II.
Overall speedup was unfortunately within measurement error.

here its  10153 -10135
   
   I suspect it also depends a bit on the compiler and how it changes the 
   surrounding code.
   Note that I also tested with PIC actually.
   
but ive a slightly odd feeling about the chnages to the asm code,
iam not sure if all assemblers will be happy about the changed
code
   
   Do you mean particularly the movzbl change?
  
  yes and the k stuff
  
  
   I am also unsure about that, I think there was a reason for that %k6 
   mess...
   But this as well as movzx seemed to work for me...
  
  it works here too i just have the feeling it might fail on some odd
  assembler or platform. Thats not meant to keep you from pushing this
  just that it might require to be reverted or fixed if such
  problems actually occor
 
 I pushed it.
 If anyone sees issues please tell me and I'll look into it!

I think these FATE failures are caused by it, but that's based just
on the other commits in the range looking unlikely:

http://fate.ffmpeg.org/report.cgi?time=20141122231657&slot=x86_64-darwin-clang-3.5-O3
http://fate.ffmpeg.org/report.cgi?time=2014113720&slot=x86_64-darwin-clang-3.5

[...]

-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

There will always be a question for which you do not know the correct answer.


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-11-22 Thread Reimar Döffinger
On Mon, Nov 17, 2014 at 01:41:13PM +0100, Michael Niedermayer wrote:
 On Mon, Nov 17, 2014 at 08:19:32AM +0100, Reimar Döffinger wrote:
  On 17.11.2014, at 02:37, Michael Niedermayer michae...@gmx.at wrote:
   On Sat, Nov 15, 2014 at 06:16:03PM +0100, Reimar Döffinger wrote:
   11674 - 10877 decicycles on my Phenom II.
   Overall speedup was unfortunately within measurement error.
   
   here its  10153 -10135
  
  I suspect it also depends a bit on the compiler and how it changes the 
  surrounding code.
  Note that I also tested with PIC actually.
  
   but ive a slightly odd feeling about the chnages to the asm code,
   iam not sure if all assemblers will be happy about the changed
   code
  
  Do you mean particularly the movzbl change?
 
 yes and the k stuff
 
 
  I am also unsure about that, I think there was a reason for that %k6 mess...
  But this as well as movzx seemed to work for me...
 
 it works here too, I just have the feeling it might fail on some odd
 assembler or platform. That's not meant to keep you from pushing this,
 just that it might need to be reverted or fixed if such
 problems actually occur

I pushed it.
If anyone sees issues please tell me and I'll look into it!
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-11-16 Thread Michael Niedermayer
On Sat, Nov 15, 2014 at 06:16:03PM +0100, Reimar Döffinger wrote:
 11674 - 10877 decicycles on my Phenom II.
 Overall speedup was unfortunately within measurement error.

here it's 10153 -> 10135

but I've a slightly odd feeling about the changes to the asm code,
I am not sure if all assemblers will be happy about the changed
code

[...]
-- 
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

The greatest way to live with honor in this world is to be what we pretend
to be. -- Socrates


signature.asc
Description: Digital signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-11-16 Thread Reimar Döffinger
On 17.11.2014, at 02:37, Michael Niedermayer michae...@gmx.at wrote:
 On Sat, Nov 15, 2014 at 06:16:03PM +0100, Reimar Döffinger wrote:
 11674 - 10877 decicycles on my Phenom II.
 Overall speedup was unfortunately within measurement error.
 
 here its  10153 -10135

I suspect it also depends a bit on the compiler and how it changes the 
surrounding code.
Note that I also tested with PIC actually.

 but I've a slightly odd feeling about the changes to the asm code,
 I am not sure if all assemblers will be happy about the changed
 code

Do you mean particularly the movzbl change?
I am also unsure about that, I think there was a reason for that %k6 mess...
But this as well as movzx seemed to work for me...
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


[FFmpeg-devel] [PATCH] h264_i386: Optimize decode_significance_8x8_x86 for 64 bit.

2014-11-15 Thread Reimar Döffinger
11674 -> 10877 decicycles on my Phenom II.
Overall speedup was unfortunately within measurement error.

Signed-off-by: Reimar Döffinger reimar.doeffin...@gmx.de
---
 libavcodec/x86/h264_i386.h | 30 ++
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/h264_i386.h b/libavcodec/x86/h264_i386.h
index ef65cf8..dcba42d 100644
--- a/libavcodec/x86/h264_i386.h
+++ b/libavcodec/x86/h264_i386.h
@@ -36,6 +36,12 @@
 
 #if HAVE_INLINE_ASM
 
+#if ARCH_X86_64
+#define REG64 r
+#else
+#define REG64 m
+#endif
+
 //FIXME use some macros to avoid duplicating get_cabac (cannot be done yet
 //as that would make optimization work hard)
 #if HAVE_7REGS  !BROKEN_COMPILER
@@ -140,7 +146,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
 3: \n\t
 
 mov %10, %0\n\t
-movzbl (%0, %6), %k6   \n\t
+movzb (%0, %6), %6 \n\t
 add %9, %6 \n\t
 
 BRANCHLESS_GET_CABAC(%4, %q4, (%6), %3, %w3,
@@ -151,14 +157,14 @@ static int decode_significance_8x8_x86(CABACContext *c,
  AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
  %15)
 
-mov %1, %k6\n\t
+mov %1, %6 \n\t
 test $1, %4\n\t
  jz 4f \n\t
 
 #ifdef BROKEN_RELOCATIONS
-movzbl %c14(%15, %q6), %k6\n\t
+movzb %c14(%15, %q6), %6\n\t
 #else
-movzbl MANGLE(ff_h264_cabac_tables)+%c14(%k6), %k6\n\t
+movzb MANGLE(ff_h264_cabac_tables)+%c14(%6), %6\n\t
 #endif
 add %11, %6\n\t
 
@@ -171,8 +177,8 @@ static int decode_significance_8x8_x86(CABACContext *c,
  %15)
 
 mov %2, %0 \n\t
-mov %1, %k6\n\t
-movl %k6, (%0) \n\t
+mov %1, %6 \n\t
+mov %6, (%0)   \n\t
 
 test $1, %4\n\t
  jnz 5f\n\t
@@ -180,19 +186,19 @@ static int decode_significance_8x8_x86(CABACContext *c,
 addOPSIZE  $4, %2\n\t
 
 4: \n\t
-addl $1, %k6   \n\t
-mov %k6, %1\n\t
-cmpl $63, %k6  \n\t
+add $1, %6 \n\t
+mov %6, %1 \n\t
+cmp $63, %6\n\t
  jb 3b \n\t
 mov %2, %0 \n\t
-movl %k6, (%0) \n\t
+mov %6, (%0)   \n\t
 5: \n\t
 addl %8, %k0   \n\t
 shr $2, %k0\n\t
-: =q(coeff_count), +m(last), +m(index), +r(c-low),
+: =q(coeff_count), +REG64(last), +REG64(index), +r(c-low),
   =r(bit), +r(c-range), =r(state)
 : r(c), m(minusindex), m(significant_coeff_ctx_base),
-  m(sig_off), m(last_coeff_ctx_base),
+  REG64(sig_off), REG64(last_coeff_ctx_base),
   i(offsetof(CABACContext, bytestream)),
   i(offsetof(CABACContext, bytestream_end)),
   i(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
-- 
2.1.3

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel