Hi,

2014-12-02 14:31 GMT+01:00 Benoit Fouet <benoit.fo...@free.fr>:
> Fixes ticket #4148

Please try that one instead.

As all your changes have been reverted, I've put myself as the only
author. I've left your "signed-off-by", but I'm not sure of its
purpose now.

Christophe
From 76ddca41e1f4ab4df348b64ddbf63c58153c8c50 Mon Sep 17 00:00:00 2001
From: Christophe Gisquet <christophe.gisq...@gmail.com>
Date: Tue, 2 Dec 2014 14:31:49 +0100
Subject: [PATCH] pngdsp x86: use unaligned access

For test images manually generated to contain only up prediction,
timing results:
         8380x3032    255x185
before:   138635       1992
after:    139232       1996

Timing when actually jumping to the proper version depending on the alignment:
8380x3032: 138767

A 0.5% speed improvement for gigantic images is not worth the code
duplication.

Fixes ticket #4148

Signed-off-by: Christophe Gisquet <christophe.gisq...@gmail.com>
Signed-off-by: Benoit Fouet <benoit.fo...@gmail.com>
---
 libavcodec/pngdsp.h       |  4 ++--
 libavcodec/x86/pngdsp.asm | 12 ++++++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/libavcodec/pngdsp.h b/libavcodec/pngdsp.h
index 1475b0c..fbc1a50 100644
--- a/libavcodec/pngdsp.h
+++ b/libavcodec/pngdsp.h
@@ -25,9 +25,9 @@
 #include <stdint.h>
 
 typedef struct PNGDSPContext {
-    void (*add_bytes_l2)(uint8_t *dst  /* align 16 */,
+    void (*add_bytes_l2)(uint8_t *dst,
                          uint8_t *src1 /* align 16 */,
-                         uint8_t *src2 /* align 16 */, int w);
+                         uint8_t *src2, int w);
 
     /* this might write to dst[w] */
     void (*add_paeth_prediction)(uint8_t *dst, uint8_t *src,
diff --git a/libavcodec/x86/pngdsp.asm b/libavcodec/x86/pngdsp.asm
index 8e23ccf..678a032 100644
--- a/libavcodec/x86/pngdsp.asm
+++ b/libavcodec/x86/pngdsp.asm
@@ -42,12 +42,12 @@ cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
     and                waq, ~(mmsize*2-1)
     jmp .end_v
 .loop_v:
-    mova                m0, [src1q+iq]
-    mova                m1, [src1q+iq+mmsize]
-    paddb               m0, [src2q+iq]
-    paddb               m1, [src2q+iq+mmsize]
-    mova  [dstq+iq       ], m0
-    mova  [dstq+iq+mmsize], m1
+    movu                m0, [src2q+iq]
+    movu                m1, [src2q+iq+mmsize]
+    paddb               m0, [src1q+iq]
+    paddb               m1, [src1q+iq+mmsize]
+    movu  [dstq+iq       ], m0
+    movu  [dstq+iq+mmsize], m1
     add                 iq, mmsize*2
 .end_v:
     cmp                 iq, waq
-- 
1.9.2.msysgit.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to