[FFmpeg-devel] [PATCH 8/9] sbcenc: add armv6 and neon asm optimizations

2018-02-21 Thread Aurelien Jacobs
This code was originally based on libsbc and has been fully integrated into FFmpeg.
---
 libavcodec/arm/Makefile  |   3 +
 libavcodec/arm/sbcdsp_armv6.S| 245 ++
 libavcodec/arm/sbcdsp_init_arm.c | 105 ++
 libavcodec/arm/sbcdsp_neon.S | 714 +++
 libavcodec/sbcdsp.c  |   2 +
 libavcodec/sbcdsp.h  |   1 +
 6 files changed, 1070 insertions(+)
 create mode 100644 libavcodec/arm/sbcdsp_armv6.S
 create mode 100644 libavcodec/arm/sbcdsp_init_arm.c
 create mode 100644 libavcodec/arm/sbcdsp_neon.S

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 1eeac5449e..fd2401f4e5 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -42,6 +42,7 @@ OBJS-$(CONFIG_DCA_DECODER) += 
arm/synth_filter_init_arm.o
 OBJS-$(CONFIG_HEVC_DECODER)+= arm/hevcdsp_init_arm.o
 OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)+= arm/rv40dsp_init_arm.o
+OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
 OBJS-$(CONFIG_VORBIS_DECODER)  += arm/vorbisdsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
 OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o   \
@@ -81,6 +82,7 @@ ARMV6-OBJS-$(CONFIG_VP8DSP)+= arm/vp8_armv6.o 
  \
 
 # decoders/encoders
 ARMV6-OBJS-$(CONFIG_MLP_DECODER)   += arm/mlpdsp_armv6.o
+ARMV6-OBJS-$(CONFIG_SBC_ENCODER)   += arm/sbcdsp_armv6.o
 
 
 # VFP optimizations
@@ -140,6 +142,7 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)   += 
arm/hevcdsp_init_neon.o   \
 NEON-OBJS-$(CONFIG_RV30_DECODER)   += arm/rv34dsp_neon.o
 NEON-OBJS-$(CONFIG_RV40_DECODER)   += arm/rv34dsp_neon.o\
   arm/rv40dsp_neon.o
+NEON-OBJS-$(CONFIG_SBC_ENCODER)+= arm/sbcdsp_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP6_DECODER)+= arm/vp6dsp_neon.o
 NEON-OBJS-$(CONFIG_VP9_DECODER)+= arm/vp9itxfm_16bpp_neon.o \
diff --git a/libavcodec/arm/sbcdsp_armv6.S b/libavcodec/arm/sbcdsp_armv6.S
new file mode 100644
index 00..f1ff845798
--- /dev/null
+++ b/libavcodec/arm/sbcdsp_armv6.S
@@ -0,0 +1,245 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs 
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann 
+ * Copyright (C) 2004-2005  Henryk Ploetz 
+ * Copyright (C) 2005-2006  Brad Midgley 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC ARMv6 optimizations. The instructions are scheduled for the ARM11 pipeline.
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_sbc_analyze_4_armv6, export=1
+@ r0 = in, r1 = out, r2 = consts
+push{r1, r3-r7, lr}
+push{r8-r12, r14}
+ldrdr4,  r5,  [r0, #0]
+ldrdr6,  r7,  [r2, #0]
+ldrdr8,  r9,  [r0, #16]
+ldrdr10, r11, [r2, #16]
+mov r14, #0x8000
+smlad   r3,  r4,  r6,  r14
+smlad   r12, r5,  r7,  r14
+ldrdr4,  r5,  [r0, #32]
+ldrdr6,  r7,  [r2, #32]
+smlad   r3,  r8,  r10, r3
+smlad   r12, r9,  r11, r12
+ldrdr8,  r9,  [r0, #48]
+ldrdr10, r11, [r2, #48]
+smlad   r3,  r4,  r6,  r3
+smlad   r12, r5,  r7,  r12
+ldrdr4,  r5,  [r0, #64]
+ldrdr6,  r7,  [r2, #64]
+smlad   r3,  r8,  r10, r3
+smlad   r12, r9,  r11, r12
+ldrdr8,  r9,  [r0, #8]
+ldrdr10, r11, [r2, #8]
+smlad   r3,  r4,  r6,  r3@ t1[0] is done
+smlad   r12, r5,  r7,  r12   @ t1[1] is done
+ldrdr4,  r5,  [r0, #24]
+ldrdr6,  r7,  [r2, #24]
+pkhtb   r3,  r12, r3, asr #16@ combine t1[0] and t1[1]
+smlad   r12, r8,  r10, r14
+smlad   r14, r9,  r11, r14
+ldrd   

[FFmpeg-devel] [PATCH 8/9] sbcenc: add armv6 and neon asm optimizations

2017-12-23 Thread Aurelien Jacobs
This code was originally based on libsbc and has been fully integrated into FFmpeg.
---
 libavcodec/arm/Makefile  |   3 +
 libavcodec/arm/sbcdsp_armv6.S| 245 ++
 libavcodec/arm/sbcdsp_init_arm.c | 105 ++
 libavcodec/arm/sbcdsp_neon.S | 714 +++
 libavcodec/sbcdsp.c  |   2 +
 libavcodec/sbcdsp.h  |   1 +
 6 files changed, 1070 insertions(+)
 create mode 100644 libavcodec/arm/sbcdsp_armv6.S
 create mode 100644 libavcodec/arm/sbcdsp_init_arm.c
 create mode 100644 libavcodec/arm/sbcdsp_neon.S

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 1eeac5449e..fd2401f4e5 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -42,6 +42,7 @@ OBJS-$(CONFIG_DCA_DECODER) += 
arm/synth_filter_init_arm.o
 OBJS-$(CONFIG_HEVC_DECODER)+= arm/hevcdsp_init_arm.o
 OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
 OBJS-$(CONFIG_RV40_DECODER)+= arm/rv40dsp_init_arm.o
+OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
 OBJS-$(CONFIG_VORBIS_DECODER)  += arm/vorbisdsp_init_arm.o
 OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
 OBJS-$(CONFIG_VP9_DECODER) += arm/vp9dsp_init_10bpp_arm.o   \
@@ -81,6 +82,7 @@ ARMV6-OBJS-$(CONFIG_VP8DSP)+= arm/vp8_armv6.o 
  \
 
 # decoders/encoders
 ARMV6-OBJS-$(CONFIG_MLP_DECODER)   += arm/mlpdsp_armv6.o
+ARMV6-OBJS-$(CONFIG_SBC_ENCODER)   += arm/sbcdsp_armv6.o
 
 
 # VFP optimizations
@@ -140,6 +142,7 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER)   += 
arm/hevcdsp_init_neon.o   \
 NEON-OBJS-$(CONFIG_RV30_DECODER)   += arm/rv34dsp_neon.o
 NEON-OBJS-$(CONFIG_RV40_DECODER)   += arm/rv34dsp_neon.o\
   arm/rv40dsp_neon.o
+NEON-OBJS-$(CONFIG_SBC_ENCODER)+= arm/sbcdsp_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP6_DECODER)+= arm/vp6dsp_neon.o
 NEON-OBJS-$(CONFIG_VP9_DECODER)+= arm/vp9itxfm_16bpp_neon.o \
diff --git a/libavcodec/arm/sbcdsp_armv6.S b/libavcodec/arm/sbcdsp_armv6.S
new file mode 100644
index 00..f1ff845798
--- /dev/null
+++ b/libavcodec/arm/sbcdsp_armv6.S
@@ -0,0 +1,245 @@
+/*
+ * Bluetooth low-complexity, subband codec (SBC)
+ *
+ * Copyright (C) 2017  Aurelien Jacobs 
+ * Copyright (C) 2008-2010  Nokia Corporation
+ * Copyright (C) 2004-2010  Marcel Holtmann 
+ * Copyright (C) 2004-2005  Henryk Ploetz 
+ * Copyright (C) 2005-2006  Brad Midgley 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * SBC ARMv6 optimizations. The instructions are scheduled for the ARM11 pipeline.
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_sbc_analyze_4_armv6, export=1
+@ r0 = in, r1 = out, r2 = consts
+push{r1, r3-r7, lr}
+push{r8-r12, r14}
+ldrdr4,  r5,  [r0, #0]
+ldrdr6,  r7,  [r2, #0]
+ldrdr8,  r9,  [r0, #16]
+ldrdr10, r11, [r2, #16]
+mov r14, #0x8000
+smlad   r3,  r4,  r6,  r14
+smlad   r12, r5,  r7,  r14
+ldrdr4,  r5,  [r0, #32]
+ldrdr6,  r7,  [r2, #32]
+smlad   r3,  r8,  r10, r3
+smlad   r12, r9,  r11, r12
+ldrdr8,  r9,  [r0, #48]
+ldrdr10, r11, [r2, #48]
+smlad   r3,  r4,  r6,  r3
+smlad   r12, r5,  r7,  r12
+ldrdr4,  r5,  [r0, #64]
+ldrdr6,  r7,  [r2, #64]
+smlad   r3,  r8,  r10, r3
+smlad   r12, r9,  r11, r12
+ldrdr8,  r9,  [r0, #8]
+ldrdr10, r11, [r2, #8]
+smlad   r3,  r4,  r6,  r3@ t1[0] is done
+smlad   r12, r5,  r7,  r12   @ t1[1] is done
+ldrdr4,  r5,  [r0, #24]
+ldrdr6,  r7,  [r2, #24]
+pkhtb   r3,  r12, r3, asr #16@ combine t1[0] and t1[1]
+smlad   r12, r8,  r10, r14
+smlad   r14, r9,  r11, r14
+ldrd