On 2014-02-06 00:40:51 +0000, Christophe Gisquet wrote:
> For the callable function (as opposed to the inline one):
> C SSE SSE2 SSE4
> Win32: 47 42 29 26
> Win64: 30 33 25 23
> The SSE version is neither compiled nor set for 64bits.
Are those CPU cycles?
> When the proper compile macros are set (e.g. ARCH_X86_64 or HAVE_SSEx),
> the macro reverts to use the inline function.
No, the use of ARCH_X86_64 and HAVE_SSEx_INLINE unfortunately doesn't work
that way.
> ---
> libavcodec/dcadec.c | 3 ++
> libavcodec/dcadsp.c | 1 +
> libavcodec/dcadsp.h | 1 +
> libavcodec/x86/Makefile | 2 +
> libavcodec/x86/dca.h | 56 +++++++++++++++++++++++++++
> libavcodec/x86/dcadsp.asm | 90
> ++++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/dcadsp_init.c | 47 +++++++++++++++++++++++
> 7 files changed, 200 insertions(+)
> create mode 100644 libavcodec/x86/dca.h
> create mode 100644 libavcodec/x86/dcadsp.asm
> create mode 100644 libavcodec/x86/dcadsp_init.c
>
> diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
> index a3ca02c..c6d9be8 100644
> --- a/libavcodec/dcadec.c
> +++ b/libavcodec/dcadec.c
> @@ -50,6 +50,9 @@
> #if ARCH_ARM
> # include "arm/dca.h"
> #endif
> +#if ARCH_X86
> +# include "x86/dca.h"
> +#endif
>
> //#define TRACE
>
> diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
> index b984864..148f6dd 100644
> --- a/libavcodec/dcadsp.c
> +++ b/libavcodec/dcadsp.c
> @@ -88,4 +88,5 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
> s->qmf_32_subbands = dca_qmf_32_subbands;
> s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
> if (ARCH_ARM) ff_dcadsp_init_arm(s);
> + if (ARCH_X86) ff_dcadsp_init_x86(s);
> }
> diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
> index 3feea9f..afe40c4 100644
> --- a/libavcodec/dcadsp.h
> +++ b/libavcodec/dcadsp.h
> @@ -39,5 +39,6 @@ typedef struct DCADSPContext {
>
> void ff_dcadsp_init(DCADSPContext *s);
> void ff_dcadsp_init_arm(DCADSPContext *s);
> +void ff_dcadsp_init_x86(DCADSPContext *s);
>
> #endif /* AVCODEC_DCADSP_H */
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 6f4935b..f985525 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -4,6 +4,7 @@ OBJS += x86/constants.o
> \
> OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
> OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
> OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
> +OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o
> OBJS-$(CONFIG_DCT) += x86/dct_init.o
> OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
> OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \
> @@ -54,6 +55,7 @@ YASM-OBJS += x86/deinterlace.o
> \
>
> YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
> YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
> +YASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o
> YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
> YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
> x86/fpel.o \
> diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h
> new file mode 100644
> index 0000000..6aa8f9d
> --- /dev/null
> +++ b/libavcodec/x86/dca.h
> @@ -0,0 +1,56 @@
> +/*
> + * Copyright (c) 2012 Christophe Gisquet <[email protected]>
> + *
> + * This file is part of Libav.
> + *
> + * Libav is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * Libav is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#if HAVE_SSE2_INLINE
See below; the same as for HAVE_SSE4_INLINE applies. ARCH_X86_64 can be used
for the inline version.
> +# include "libavutil/x86/asm.h"
> +# include "libavutil/mem.h"
> +
> +#undef int8x8_fmul_int32
this still looks backwards to me
> +static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int
> scale)
> +{
> + DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000;
> + __asm__ volatile (
> + "cvtsi2ss %2, %%xmm0 \n\t"
> + "mulss %3, %%xmm0 \n\t"
> +# if HAVE_SSE4_INLINE
Just because HAVE_SSE4_INLINE is set doesn't mean the target processor
supports it. The semantics of this differ between x86 and ARM. You can't
do it like this for the inline variant. I don't think we have a way to
specify that SSE4 is always available.
> + "pmovsxbd 0(%1), %%xmm1 \n\t"
> + "pmovsxbd 4(%1), %%xmm2 \n\t"
> +# else
> + "movq (%1), %%xmm1 \n\t"
> + "punpcklbw %%xmm1, %%xmm1 \n\t"
> + "movaps %%xmm1, %%xmm2 \n\t"
> + "punpcklwd %%xmm1, %%xmm1 \n\t"
> + "punpckhwd %%xmm2, %%xmm2 \n\t"
> + "psrad $24, %%xmm1 \n\t"
> + "psrad $24, %%xmm2 \n\t"
> +# endif
> + "shufps $0, %%xmm0, %%xmm0 \n\t"
> + "cvtdq2ps %%xmm1, %%xmm1 \n\t"
> + "cvtdq2ps %%xmm2, %%xmm2 \n\t"
> + "mulps %%xmm0, %%xmm1 \n\t"
> + "mulps %%xmm0, %%xmm2 \n\t"
> + "movaps %%xmm1, 0(%0) \n\t"
> + "movaps %%xmm2, 16(%0) \n\t"
> + :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16)
> + XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2")
> + );
> +}
> +#define int8x8_fmul_int32(dsp) int8x8_fmul_int32
> +#endif /* HAVE_SSE2_INLINE */
> diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
> new file mode 100644
> index 0000000..03593ce
> --- /dev/null
> +++ b/libavcodec/x86/dcadsp.asm
ignored by me
> diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
> new file mode 100644
> index 0000000..39490a2
> --- /dev/null
> +++ b/libavcodec/x86/dcadsp_init.c
> @@ -0,0 +1,47 @@
> +/*
> + * Copyright (c) 2012 Christophe Gisquet <[email protected]>
> + *
> + * This file is part of Libav.
> + *
> + * Libav is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * Libav is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
> USA
> + */
> +
> +#include "libavutil/cpu.h"
> +#include "libavutil/attributes.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/dcadsp.h"
> +
> +void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
> +void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
> +void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
> +
> +av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
> +{
> + int mm_flags = av_get_cpu_flags();
> +
> + if (EXTERNAL_SSE(mm_flags)) {
> +#if ARCH_X86_32
if (ARCH_X86_32 && EXTERNAL_SSE(mm_flags))
is simpler
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel