Ronald S. Bultje <[email protected]> added the comment:
I was wrong; it was the 16x16, there was an overflow in one of the
functions. The attached patch fixes it. I've rearranged the calls a little so
that it hopefully doesn't get slower. I haven't benchmarked the change,
but the effect should be minimal (if any).
________________________________________________
FFmpeg issue tracker <[email protected]>
<https://roundup.ffmpeg.org/issue2547>
________________________________________________
Index: libavcodec/x86/h264_intrapred.asm
===================================================================
--- libavcodec/x86/h264_intrapred.asm (revision 26343)
+++ libavcodec/x86/h264_intrapred.asm (working copy)
@@ -344,28 +344,6 @@
%endif
paddw m0, m1 ; sum of H coefficients
-%ifidn %3, h264
- pmullw m0, [pw_5]
- paddw m0, [pw_32]
- psraw m0, 6
-%elifidn %3, rv40
- pmullw m0, [pw_5]
- psraw m0, 6
-%elifidn %3, svq3
- movd r3d, m0
- movsx r3, r3w
- test r3, r3
- lea r4, [r3+3]
- cmovs r3, r4
- sar r3, 2 ; H/4
- lea r3, [r3*5] ; 5*(H/4)
- test r3, r3
- lea r4, [r3+15]
- cmovs r3, r4
- sar r3, 4 ; (5*(H/4))/16
- movd m0, r3d
-%endif
-
lea r4, [r0+r2*8-1]
lea r3, [r0+r2*4-1]
add r4, r2
@@ -468,8 +446,30 @@
movzx r3, byte [r3+r2*2 ]
lea r3, [r3+r4+1]
shl r3, 4
+
movd r1d, m0
movsx r1d, r1w
+%ifnidn %3, svq3
+%ifidn %3, h264
+ lea r1d, [r1d*5+32]
+%else ; rv40
+ lea r1d, [r1d*5]
+%endif
+ sar r1d, 6
+ movd m0, r1d
+%else ; svq3
+ test r1, r1
+ lea r4, [r1+3]
+ cmovs r1, r4
+ sar r1, 2 ; H/4
+ lea r1, [r1*5] ; 5*(H/4)
+ test r1, r1
+ lea r4, [r1+15]
+ cmovs r1, r4
+ sar r1, 4 ; (5*(H/4))/16
+ movd m0, r1d
+%endif
+
add r1d, r5d
add r3d, r1d
shl r1d, 3