30-50% faster than the C implementation, 0.5% overall speedup on
bourne.rmvb.
---
 libavcodec/arm/rv34dsp_init_neon.c |    9 +++++++--
 libavcodec/arm/rv34dsp_neon.S      |   29 +++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c
index 9a09fde..16bda46 100644
--- a/libavcodec/arm/rv34dsp_init_neon.c
+++ b/libavcodec/arm/rv34dsp_init_neon.c
@@ -26,8 +26,13 @@
 void ff_rv34_inv_transform_neon(DCTELEM *block);
 void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
 
+void ff_rv34_inv_transform_dc_neon(DCTELEM *block);
+void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
+
 void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
 {
-    c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
-    c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
+    c->rv34_inv_transform_tab[0]    = ff_rv34_inv_transform_neon;
+    c->rv34_inv_transform_tab[1]    = ff_rv34_inv_transform_noround_neon;
+    c->rv34_inv_transform_dc_tab[0] = ff_rv34_inv_transform_dc_neon; /* reads only block[0] */
+    c->rv34_inv_transform_dc_tab[1] = ff_rv34_inv_transform_noround_dc_neon; /* same, truncating shift */
 }
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index f700f5c..e776af0 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -107,3 +107,32 @@ function ff_rv34_inv_transform_noround_neon, export=1
         vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2,:64], r1
         bx              lr
 endfunc
+
+/* void rv34_inv_transform_dc_c(DCTELEM *block): 4 stores of 4 x ((169 * block[0] + 512) >> 10) */
+function ff_rv34_inv_transform_dc_neon, export=1
+        vld1.16         d28[], [r0:16]         @ block[0], replicated into all 4 lanes
+        vmov.i16        d4,  #169              @ 13^2
+        mov             r1,  #16               @ byte step added to r0 after each store
+        vmull.s16       q3,  d28, d4           @ signed 32-bit products block[0] * 169
+        vrshrn.s32      d0,  q3,  #10          @ rounding >> 10, narrowed back to 16 bit
+        vst1.16         {d0}, [r0:64], r1      @ store 1 of 4: 4 values, then r0 += r1
+        vst1.16         {d0}, [r0:64], r1      @ store 2 of 4
+        vst1.16         {d0}, [r0:64], r1      @ store 3 of 4
+        vst1.16         {d0}, [r0:64], r1      @ store 4 of 4
+        bx              lr
+endfunc
+
+/* void rv34_inv_transform_dc_noround_c(DCTELEM *block): 4 stores of 4 x ((507 * block[0]) >> 11) */
+function ff_rv34_inv_transform_noround_dc_neon, export=1
+        vld1.16         d28[], [r0:16]         @ block[0], replicated into all 4 lanes
+        vmov.i16        d4,  #251              @ low byte of 507; 507 is not a valid vmov.i16 immediate
+        vorr.s16        d4,  #256              @ 13^2 * 3
+        mov             r1,  #16               @ byte step added to r0 after each store
+        vmull.s16       q3,  d28, d4           @ signed 32-bit products block[0] * 507
+        vshrn.s32       d0,  q3,  #11          @ truncating >> 11 (no rounding), narrowed to 16 bit
+        vst1.64         {d0}, [r0:64], r1      @ store 1 of 4: 8 bytes, then r0 += r1
+        vst1.64         {d0}, [r0:64], r1      @ store 2 of 4
+        vst1.64         {d0}, [r0:64], r1      @ store 3 of 4
+        vst1.64         {d0}, [r0:64], r1      @ store 4 of 4
+        bx              lr
+endfunc
-- 
1.7.8.3

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to