From: "Ronald S. Bultje" <[email protected]>
---
arch.mak | 2 +
libavcodec/cabac_functions.h | 2 +-
libavcodec/cavsdsp.c | 3 +-
libavcodec/dct-test.c | 4 +-
libavcodec/dnxhdenc.c | 2 +-
libavcodec/dwt.c | 2 +-
libavcodec/h264_cabac.c | 2 +-
libavcodec/imgconvert.c | 2 +-
libavcodec/lpc.c | 2 +-
libavcodec/mathops.h | 2 +-
libavcodec/mlpdsp.c | 2 +-
libavcodec/mpegvideo.c | 2 +-
libavcodec/x86/Makefile | 25 ++++----
libavcodec/x86/ac3dsp_mmx.c | 3 +-
libavcodec/x86/deinterlace_mmx.h | 39 ++++++++++++
libavcodec/x86/dsputil_mmx.c | 118 ++++++++++++++++++++++-------------
libavcodec/x86/dsputil_mmx.h | 15 -----
libavcodec/x86/dsputilenc_mmx.c | 81 +++++++++++++-----------
libavcodec/x86/fft.c | 4 +-
libavcodec/x86/h264_qpel_mmx.c | 4 +-
libavcodec/x86/h264dsp_mmx.c | 7 +++
libavcodec/x86/mpegaudiodec_mmx.c | 6 +-
libavcodec/x86/rv40dsp_init.c | 7 ++-
libavcodec/x86/vc1dsp_mmx.c | 7 ++-
libavcodec/x86/vp3dsp.asm | 122 ++++++++++++++++++++++++-------------
libavfilter/vf_gradfun.c | 2 +
libavfilter/vf_yadif.c | 6 +-
libavfilter/x86/Makefile | 4 +-
libavutil/internal.h | 9 ++-
libavutil/intmath.h | 2 +-
libavutil/x86/cpu.c | 37 +++++++++++
libavutil/x86/timer.h | 8 +++
libavutil/x86_cpu.h | 2 +
libswscale/rgb2rgb.c | 2 +-
libswscale/swscale.c | 4 +-
libswscale/utils.c | 10 +--
libswscale/x86/Makefile | 5 +-
libswscale/x86/swscale_mmx.c | 6 ++
libswscale/yuv2rgb.c | 2 +-
39 files changed, 383 insertions(+), 181 deletions(-)
create mode 100644 libavcodec/x86/deinterlace_mmx.h
diff --git a/arch.mak b/arch.mak
index 33018f3..0564833 100644
--- a/arch.mak
+++ b/arch.mak
@@ -10,4 +10,6 @@ OBJS-$(HAVE_ALTIVEC) += $(ALTIVEC-OBJS) $(ALTIVEC-OBJS-yes)
OBJS-$(HAVE_VIS) += $(VIS-OBJS) $(VIS-OBJS-yes)
OBJS-$(HAVE_MMX) += $(MMX-OBJS) $(MMX-OBJS-yes)
+OBJS-$(HAVE_INLINE_ASM) += $(INLINEASM-OBJS) $(INLINEASM-OBJS-yes)
+YASM-OBJS-$(HAVE_INLINE_ASM) += $(INLINEYASM-OBJS) $(INLINEYASM-OBJS-yes)
OBJS-$(HAVE_YASM) += $(YASM-OBJS) $(YASM-OBJS-yes)
diff --git a/libavcodec/cabac_functions.h b/libavcodec/cabac_functions.h
index 484ba85..d9a7a46 100644
--- a/libavcodec/cabac_functions.h
+++ b/libavcodec/cabac_functions.h
@@ -32,7 +32,7 @@
#include "cabac.h"
#include "config.h"
-#if ARCH_X86
+#if ARCH_X86 && HAVE_INLINE_ASM
# include "x86/cabac.h"
#endif
diff --git a/libavcodec/cavsdsp.c b/libavcodec/cavsdsp.c
index 04e521b..36ac8bb 100644
--- a/libavcodec/cavsdsp.c
+++ b/libavcodec/cavsdsp.c
@@ -544,5 +544,6 @@ av_cold void ff_cavsdsp_init(CAVSDSPContext* c,
AVCodecContext *avctx) {
c->cavs_filter_ch = cavs_filter_ch_c;
c->cavs_idct8_add = cavs_idct8_add_c;
- if (HAVE_MMX) ff_cavsdsp_init_mmx(c, avctx);
+ if (HAVE_MMX && HAVE_INLINE_ASM)
+ ff_cavsdsp_init_mmx(c, avctx);
}
diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index 4647642..e56bb4e 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -82,7 +82,7 @@ static const struct algo fdct_tab[] = {
{ "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
{ "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_INLINE_ASM
{ "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX
},
{ "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2
},
{ "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2
},
@@ -105,7 +105,7 @@ static const struct algo idct_tab[] = {
{ "INT", ff_j_rev_dct, MMX_PERM },
{ "SIMPLE-C", ff_simple_idct_8, NO_PERM },
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_INLINE_ASM
#if CONFIG_GPL
{ "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1
},
{ "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1
},
diff --git a/libavcodec/dnxhdenc.c b/libavcodec/dnxhdenc.c
index 7ceae92..7741ed9 100644
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -275,7 +275,7 @@ static int dnxhd_encode_init(AVCodecContext *avctx)
ctx->block_width_l2 = 3;
}
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_INLINE_ASM
ff_dnxhd_init_mmx(ctx);
#endif
diff --git a/libavcodec/dwt.c b/libavcodec/dwt.c
index 56e4a57..f2fb34a 100644
--- a/libavcodec/dwt.c
+++ b/libavcodec/dwt.c
@@ -992,6 +992,6 @@ void ff_dwt_init(DWTContext *c)
c->horizontal_compose97i = ff_snow_horizontal_compose97i;
c->inner_add_yblock = ff_snow_inner_add_yblock;
- if (HAVE_MMX)
+ if (HAVE_MMX && HAVE_INLINE_ASM)
ff_dwt_init_x86(c);
}
diff --git a/libavcodec/h264_cabac.c b/libavcodec/h264_cabac.c
index 08a6a5b..dc67724 100644
--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -38,7 +38,7 @@
#include "h264_mvpred.h"
#include "golomb.h"
-#if ARCH_X86
+#if ARCH_X86 && HAVE_INLINE_ASM
#include "x86/h264_i386.h"
#endif
diff --git a/libavcodec/imgconvert.c b/libavcodec/imgconvert.c
index 90c9b7b..5eaa5b6 100644
--- a/libavcodec/imgconvert.c
+++ b/libavcodec/imgconvert.c
@@ -39,7 +39,7 @@
#include "libavutil/imgutils.h"
#if HAVE_MMX && HAVE_YASM
-#include "x86/dsputil_mmx.h"
+#include "x86/deinterlace_mmx.h"
#endif
#define FF_COLOR_RGB 0 /**< RGB color space */
diff --git a/libavcodec/lpc.c b/libavcodec/lpc.c
index 0d6910f..9e5d93e 100644
--- a/libavcodec/lpc.c
+++ b/libavcodec/lpc.c
@@ -258,7 +258,7 @@ av_cold int ff_lpc_init(LPCContext *s, int blocksize, int
max_order,
s->lpc_apply_welch_window = lpc_apply_welch_window_c;
s->lpc_compute_autocorr = lpc_compute_autocorr_c;
- if (HAVE_MMX)
+ if (HAVE_MMX && HAVE_INLINE_ASM)
ff_lpc_init_x86(s);
return 0;
diff --git a/libavcodec/mathops.h b/libavcodec/mathops.h
index d6eb98d..0b232e0 100644
--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@@ -35,7 +35,7 @@
# include "mips/mathops.h"
#elif ARCH_PPC
# include "ppc/mathops.h"
-#elif ARCH_X86
+#elif ARCH_X86 && HAVE_INLINE_ASM
# include "x86/mathops.h"
#endif
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index 7d01c75..a27b1fc 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -58,6 +58,6 @@ static void ff_mlp_filter_channel(int32_t *state, const
int32_t *coeff,
void ff_mlp_init(DSPContext* c, AVCodecContext *avctx)
{
c->mlp_filter_channel = ff_mlp_filter_channel;
- if (ARCH_X86)
+ if (ARCH_X86 && HAVE_INLINE_ASM)
ff_mlp_init_x86(c, avctx);
}
diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index 708199a..e042fc6 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -187,7 +187,7 @@ av_cold int ff_dct_common_init(MpegEncContext *s)
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_bitexact;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_c;
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_INLINE_ASM
ff_MPV_common_init_mmx(s);
#elif ARCH_ALPHA
ff_MPV_common_init_axp(s);
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 6602cce..7ee76e9 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -1,10 +1,11 @@
-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
+INLINEASM-OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
+INLINEASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
MMX-OBJS += x86/dsputil_mmx.o \
- x86/fdct_mmx.o \
- x86/fmtconvert_mmx.o \
+ x86/fmtconvert_mmx.o
+
+INLINEASM-OBJS += x86/fdct_mmx.o \
x86/idct_mmx_xvid.o \
x86/idct_sse2_xvid.o \
x86/motion_est_mmx.o \
@@ -13,15 +14,15 @@ MMX-OBJS += x86/dsputil_mmx.o
\
MMX-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
MMX-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_mmx.o
-MMX-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o
-MMX-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhd_mmx.o
-MMX-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o
+INLINEASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp_mmx.o
+INLINEASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhd_mmx.o
+INLINEASM-OBJS-$(CONFIG_DWT) += x86/snowdsp_mmx.o
MMX-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o
MMX-OBJS-$(CONFIG_FFT) += x86/fft.o
-MMX-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o
+INLINEASM-OBJS-$(CONFIG_GPL) += x86/idct_mmx.o
MMX-OBJS-$(CONFIG_H264DSP) += x86/h264dsp_mmx.o
MMX-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
-MMX-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o
+INLINEASM-OBJS-$(CONFIG_LPC) += x86/lpc_mmx.o
MMX-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodec_mmx.o
MMX-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp-init.o
MMX-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp-init.o
@@ -37,10 +38,10 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
-YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
-YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
-YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
-YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
- $(YASM-OBJS-FFT-yes)
+INLINEYASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
+INLINEYASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
+INLINEYASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o
+INLINEYASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
+ $(INLINEYASM-OBJS-FFT-yes)
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 1a43183..3ba55d7 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -19,8 +19,9 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
-#include "dsputil_mmx.h"
#include "libavcodec/ac3dsp.h"
extern void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int
nb_coefs);
diff --git a/libavcodec/x86/deinterlace_mmx.h b/libavcodec/x86/deinterlace_mmx.h
new file mode 100644
index 0000000..7a90d86
--- /dev/null
+++ b/libavcodec/x86/deinterlace_mmx.h
@@ -0,0 +1,39 @@
+/*
+ * Prototypes for the MMX deinterlacing line functions
+ * Copyright (c) 2007 Aurelien Jacobs <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_DEINTERLACE_MMX_H
+#define AVCODEC_X86_DEINTERLACE_MMX_H
+
+#include <stdint.h>
+
+void ff_deinterlace_line_mmx(uint8_t *dst,
+ const uint8_t *lum_m4, const uint8_t *lum_m3,
+ const uint8_t *lum_m2, const uint8_t *lum_m1,
+ const uint8_t *lum,
+ int size);
+
+void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4,
+ const uint8_t *lum_m3,
+ const uint8_t *lum_m2,
+ const uint8_t *lum_m1,
+ const uint8_t *lum, int size);
+
+#endif /* AVCODEC_X86_DEINTERLACE_MMX_H */
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 434d185..c2c59f0 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -29,8 +29,10 @@
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
+#if HAVE_INLINE_ASM
#include "dsputil_mmx.h"
#include "idct_xvid.h"
+#endif
//#undef NDEBUG
//#include <assert.h>
@@ -84,6 +86,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = {
0xFEFEFEFEFEFEFEFEULL, 0xFEF
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
+#if HAVE_INLINE_ASM
+
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
@@ -1808,6 +1812,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t
*src, int stride)
avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
+#endif /* HAVE_INLINE_ASM */
+
#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
x86_reg linesize, x86_reg start_y,
@@ -1876,6 +1882,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t
*buf, const uint8_t *src,
}
#endif /* HAVE_YASM */
+#if HAVE_INLINE_ASM
+
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h);
@@ -2045,6 +2053,8 @@ PREFETCH(prefetch_mmx2, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
+#endif /* HAVE_INLINE_ASM */
+
#include "h264_qpel_mmx.c"
void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
@@ -2090,6 +2100,8 @@ CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
+#if HAVE_INLINE_ASM
+
/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
@@ -2448,6 +2460,8 @@ static void vector_clipf_sse(float *dst, const float *src,
);
}
+#endif /* HAVE_INLINE_ASM */
+
void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
@@ -2574,6 +2588,7 @@ static void dsputil_init_mmx(DSPContext *c,
AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
@@ -2596,10 +2611,6 @@ static void dsputil_init_mmx(DSPContext *c,
AVCodecContext *avctx, int mm_flags)
#if ARCH_X86_32 || !HAVE_YASM
c->gmc = gmc_mmx;
#endif
-#if ARCH_X86_32 && HAVE_YASM
- if (!high_bit_depth)
- c->emulated_edge_mc = emulated_edge_mc_mmx;
-#endif
c->add_bytes = add_bytes_mmx;
@@ -2607,8 +2618,14 @@ static void dsputil_init_mmx(DSPContext *c,
AVCodecContext *avctx, int mm_flags)
c->h263_v_loop_filter = h263_v_loop_filter_mmx;
c->h263_h_loop_filter = h263_h_loop_filter_mmx;
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
+#if ARCH_X86_32
+ if (!high_bit_depth)
+ c->emulated_edge_mc = emulated_edge_mc_mmx;
+#endif
+
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@@ -2625,6 +2642,7 @@ static void dsputil_init_mmx2(DSPContext *c,
AVCodecContext *avctx,
const int bit_depth = avctx->bits_per_raw_sample;
const int high_bit_depth = bit_depth > 8;
+#if HAVE_INLINE_ASM
c->prefetch = prefetch_mmx2;
if (!high_bit_depth) {
@@ -2683,8 +2701,18 @@ static void dsputil_init_mmx2(DSPContext *c,
AVCodecContext *avctx,
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
- } else if (bit_depth == 10) {
+ }
+
+ SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
+ SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
+ SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
+ SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
+ }
+#endif /* HAVE_INLINE_ASM */
+
#if HAVE_YASM
+ if (CONFIG_H264QPEL) {
+ if (bit_depth == 10) {
#if !ARCH_X86_64
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
@@ -2693,16 +2721,9 @@ static void dsputil_init_mmx2(DSPContext *c,
AVCodecContext *avctx,
#endif
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
-#endif
}
-
- SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
- SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
- SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
- SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
}
-#if HAVE_YASM
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
@@ -2734,6 +2755,7 @@ static void dsputil_init_3dnow(DSPContext *c,
AVCodecContext *avctx,
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
c->prefetch = prefetch_3dnow;
if (!high_bit_depth) {
@@ -2791,24 +2813,25 @@ static void dsputil_init_3dnow(DSPContext *c,
AVCodecContext *avctx,
SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
}
+ c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+
+#if HAVE_7REGS
+ c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
+#endif
+#endif /* HAVE_INLINE_ASM */
+
#if HAVE_YASM
if (!high_bit_depth && CONFIG_H264CHROMA) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
}
#endif
-
- c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
-
-#if HAVE_7REGS
- c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
-#endif
}
static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
int mm_flags)
{
-#if HAVE_6REGS
+#if HAVE_6REGS && HAVE_INLINE_ASM
c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
}
@@ -2817,6 +2840,7 @@ static void dsputil_init_sse(DSPContext *c,
AVCodecContext *avctx, int mm_flags)
{
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+#if HAVE_INLINE_ASM
if (!high_bit_depth) {
if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
@@ -2827,24 +2851,27 @@ static void dsputil_init_sse(DSPContext *c,
AVCodecContext *avctx, int mm_flags)
c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
c->ac3_downmix = ac3_downmix_sse;
-#if HAVE_YASM
- c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
- c->vector_fmul_add = ff_vector_fmul_add_sse;
-#endif
#if HAVE_6REGS
c->vector_fmul_window = vector_fmul_window_sse;
#endif
-
+
c->vector_clipf = vector_clipf_sse;
#if HAVE_YASM
+ c->gmc = gmc_sse;
+#endif
+#endif /* HAVE_INLINE_ASM */
+
+#if HAVE_YASM
+ c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
+ c->vector_fmul_add = ff_vector_fmul_add_sse;
+
c->scalarproduct_float = ff_scalarproduct_float_sse;
c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
if (!high_bit_depth)
c->emulated_edge_mc = emulated_edge_mc_sse;
- c->gmc = gmc_sse;
#endif
}
@@ -2854,6 +2881,7 @@ static void dsputil_init_sse2(DSPContext *c,
AVCodecContext *avctx,
const int bit_depth = avctx->bits_per_raw_sample;
const int high_bit_depth = bit_depth > 8;
+#if HAVE_INLINE_ASM
if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
// these functions are slower than mmx on AMD, but faster on Intel
if (!high_bit_depth) {
@@ -2879,6 +2907,7 @@ static void dsputil_init_sse2(DSPContext *c,
AVCodecContext *avctx,
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
if (bit_depth == 10) {
@@ -2920,6 +2949,7 @@ static void dsputil_init_ssse3(DSPContext *c,
AVCodecContext *avctx,
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
const int bit_depth = avctx->bits_per_raw_sample;
+#if HAVE_INLINE_ASM
if (!high_bit_depth && CONFIG_H264QPEL) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
@@ -2934,8 +2964,10 @@ static void dsputil_init_ssse3(DSPContext *c,
AVCodecContext *avctx,
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
+#endif /* HAVE_INLINE_ASM */
+
#if HAVE_YASM
- else if (bit_depth == 10 && CONFIG_H264QPEL) {
+ if (bit_depth == 10 && CONFIG_H264QPEL) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
@@ -3017,6 +3049,7 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext
*avctx)
const int idct_algo = avctx->idct_algo;
if (avctx->bits_per_raw_sample <= 8) {
+#if HAVE_INLINE_ASM
if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
c->idct_put = ff_simple_idct_put_mmx;
c->idct_add = ff_simple_idct_add_mmx;
@@ -3035,22 +3068,8 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext
*avctx)
}
c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif
- } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER ||
- CONFIG_VP6_DECODER) &&
- idct_algo == FF_IDCT_VP3 && HAVE_YASM) {
- if (mm_flags & AV_CPU_FLAG_SSE2) {
- c->idct_put = ff_vp3_idct_put_sse2;
- c->idct_add = ff_vp3_idct_add_sse2;
- c->idct = ff_vp3_idct_sse2;
- c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
- } else {
- c->idct_put = ff_vp3_idct_put_mmx;
- c->idct_add = ff_vp3_idct_add_mmx;
- c->idct = ff_vp3_idct_mmx;
- c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
- }
} else if (idct_algo == FF_IDCT_CAVS) {
- c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
} else if (idct_algo == FF_IDCT_XVIDMMX) {
if (mm_flags & AV_CPU_FLAG_SSE2) {
c->idct_put = ff_idct_xvid_sse2_put;
@@ -3067,6 +3086,23 @@ void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext
*avctx)
c->idct = ff_idct_xvid_mmx;
}
}
+#endif /* HAVE_INLINE_ASM */
+#if HAVE_YASM
+ if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER ||
+ CONFIG_VP6_DECODER) && idct_algo == FF_IDCT_VP3) {
+ if (mm_flags & AV_CPU_FLAG_SSE2) {
+ c->idct_put = ff_vp3_idct_put_sse2;
+ c->idct_add = ff_vp3_idct_add_sse2;
+ c->idct = ff_vp3_idct_sse2;
+ c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
+ } else {
+ c->idct_put = ff_vp3_idct_put_mmx;
+ c->idct_add = ff_vp3_idct_add_mmx;
+ c->idct = ff_vp3_idct_mmx;
+ c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
+ }
+ }
+#endif /* HAVE_YASM */
}
dsputil_init_mmx(c, avctx, mm_flags);
diff --git a/libavcodec/x86/dsputil_mmx.h b/libavcodec/x86/dsputil_mmx.h
index 37f4581..3150b03 100644
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -26,8 +26,6 @@
#include "libavcodec/dsputil.h"
#include "libavutil/x86_cpu.h"
-typedef struct { uint64_t a, b; } xmm_reg;
-
extern const uint64_t ff_bone;
extern const uint64_t ff_wtwo;
@@ -207,17 +205,4 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *block, uint8_t
*pixels, int line_size)
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);
-
-void ff_deinterlace_line_mmx(uint8_t *dst,
- const uint8_t *lum_m4, const uint8_t *lum_m3,
- const uint8_t *lum_m2, const uint8_t *lum_m1,
- const uint8_t *lum,
- int size);
-
-void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4,
- const uint8_t *lum_m3,
- const uint8_t *lum_m2,
- const uint8_t *lum_m1,
- const uint8_t *lum, int size);
-
#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index d8a60e1..83b35a6 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -27,8 +27,9 @@
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
-#include "dsputil_mmx.h"
+#if HAVE_INLINE_ASM
+#include "dsputil_mmx.h"
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int
line_size)
{
@@ -323,8 +324,6 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t *
pix2, int line_size, int
return tmp;
}
-int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
h);
-
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
int tmp;
__asm__ volatile (
@@ -925,17 +924,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst,
const uint8_t *src1, c
"paddusw "#t", "#a" \n\t"\
"movd "#a", "#dst" \n\t"\
-#define hadamard_func(cpu) \
-int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
- int stride, int h); \
-int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
- int stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmx2)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
#define DCT_SAD4(m,mm,o)\
"mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
"mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
@@ -1094,12 +1082,27 @@ static int ssd_int8_vs_int16_mmx(const int8_t *pix1,
const int16_t *pix2, int si
#undef PHADDD
#endif //HAVE_SSSE3
+#endif /* HAVE_INLINE_ASM */
+
+int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
h);
+
+#define hadamard_func(cpu) \
+int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
+ int stride, int h); \
+int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
+ int stride, int h);
+
+hadamard_func(mmx)
+hadamard_func(mmx2)
+hadamard_func(sse2)
+hadamard_func(ssse3)
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
int bit_depth = avctx->bits_per_raw_sample;
+#if HAVE_INLINE_ASM
if (mm_flags & AV_CPU_FLAG_MMX) {
const int dct_algo = avctx->dct_algo;
if (avctx->bits_per_raw_sample <= 8 &&
@@ -1121,11 +1124,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c,
AVCodecContext *avctx)
c->diff_bytes= diff_bytes_mmx;
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
-#if HAVE_YASM
- c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
- c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
-#endif
-
c->pix_norm1 = pix_norm1_mmx;
c->sse[0] = sse16_mmx;
c->sse[1] = sse8_mmx;
@@ -1147,10 +1145,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c,
AVCodecContext *avctx)
if (mm_flags & AV_CPU_FLAG_MMX2) {
c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
-#if HAVE_YASM
- c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
- c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
-#endif
c->vsad[4]= vsad_intra16_mmx2;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -1164,13 +1158,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c,
AVCodecContext *avctx)
if (bit_depth <= 8)
c->get_pixels = get_pixels_sse2;
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
-#if HAVE_YASM
- c->sse[0] = ff_sse16_sse2;
-#if HAVE_ALIGNED_STACK
- c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
- c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
-#endif
-#endif
}
#if HAVE_SSSE3
@@ -1180,10 +1167,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c,
AVCodecContext *avctx)
}
c->add_8x8basis= add_8x8basis_ssse3;
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
-#if HAVE_YASM && HAVE_ALIGNED_STACK
- c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
- c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
-#endif
}
#endif
@@ -1196,4 +1179,34 @@ void ff_dsputilenc_init_mmx(DSPContext* c,
AVCodecContext *avctx)
}
ff_dsputil_init_pix_mmx(c, avctx);
+#endif /* HAVE_INLINE_ASM */
+
+#if HAVE_YASM
+ if (mm_flags & AV_CPU_FLAG_MMX) {
+ c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
+ c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
+
+ if (mm_flags & AV_CPU_FLAG_MMX2) {
+ c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
+ c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
+ }
+
+ if(mm_flags & AV_CPU_FLAG_SSE2){
+ c->sse[0] = ff_sse16_sse2;
+#if HAVE_ALIGNED_STACK
+ c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
+ c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
+#endif
+ }
+
+#if HAVE_SSSE3
+ if(mm_flags & AV_CPU_FLAG_SSSE3){
+#if HAVE_ALIGNED_STACK
+ c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
+ c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
+#endif
+ }
+#endif
+ }
+#endif /* HAVE_YASM */
}
diff --git a/libavcodec/x86/fft.c b/libavcodec/x86/fft.c
index 6349c23..b0fdc55 100644
--- a/libavcodec/x86/fft.c
+++ b/libavcodec/x86/fft.c
@@ -23,7 +23,7 @@
av_cold void ff_fft_init_mmx(FFTContext *s)
{
-#if HAVE_YASM
+#if HAVE_YASM && HAVE_INLINE_ASM
int has_vectors = av_get_cpu_flags();
if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
/* 3DNow! for K6-2/3 */
@@ -39,8 +39,8 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
}
if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
/* SSE for P3/P4/K8 */
- s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse;
+ s->imdct_calc = ff_imdct_calc_sse;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_sse;
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
diff --git a/libavcodec/x86/h264_qpel_mmx.c b/libavcodec/x86/h264_qpel_mmx.c
index 85ae07e..113d80a 100644
--- a/libavcodec/x86/h264_qpel_mmx.c
+++ b/libavcodec/x86/h264_qpel_mmx.c
@@ -19,6 +19,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#if HAVE_INLINE_ASM
+
#include "dsputil_mmx.h"
/***********************************/
@@ -1191,7 +1193,7 @@ H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif
-
+#endif /* HAVE_INLINE_ASM */
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c
index dcd9180..0b0ab09 100644
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -21,9 +21,12 @@
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/h264dsp.h"
+
+#if HAVE_INLINE_ASM
#include "dsputil_mmx.h"
DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
+#endif
/***********************************/
/* IDCT */
@@ -88,6 +91,7 @@ void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output,
DCTELEM *input, int qmul
/***********************************/
/* deblocking */
+#if HAVE_INLINE_ASM
#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir,
edges, step, mask_mv, dir, d_idx, mask_dir) \
do { \
x86_reg b_idx; \
@@ -240,6 +244,7 @@ static void h264_loop_filter_strength_mmx2( int16_t
bS[2][4][4], uint8_t nnz[40]
:"memory"
);
}
+#endif /* HAVE_INLINE_ASM */
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix,
int stride, \
@@ -344,9 +349,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int
bit_depth, const int chrom
{
int mm_flags = av_get_cpu_flags();
+#if HAVE_INLINE_ASM
if (chroma_format_idc == 1 && mm_flags & AV_CPU_FLAG_MMX2) {
c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
}
+#endif
if (bit_depth == 8) {
#if HAVE_YASM
diff --git a/libavcodec/x86/mpegaudiodec_mmx.c
b/libavcodec/x86/mpegaudiodec_mmx.c
index f51a06d..996ce46 100644
--- a/libavcodec/x86/mpegaudiodec_mmx.c
+++ b/libavcodec/x86/mpegaudiodec_mmx.c
@@ -36,6 +36,8 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float
*in, float *win,
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
+#if HAVE_INLINE_ASM
+
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
@@ -177,7 +179,7 @@ static void apply_window_mp3(float *in, float *win, int
*unused, float *out,
SUM8(MLSS, sum, win + 16 + 32, in + 32);
*out = sum;
}
-
+#endif /* HAVE_INLINE_ASM */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
@@ -235,9 +237,11 @@ void ff_mpadsp_init_mmx(MPADSPContext *s)
}
}
+#if HAVE_INLINE_ASM
if (mm_flags & AV_CPU_FLAG_SSE2) {
s->apply_window_float = apply_window_mp3;
}
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
s->imdct36_blocks_float = imdct36_blocks_avx;
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index cc1ea45..32ca4fd 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -26,7 +26,10 @@
* 3,3 is bugged in the rv40 format and maps to _xy2 version
*/
-#include "libavcodec/x86/dsputil_mmx.h"
+#include "config.h"
+#if HAVE_INLINE_ASM
+#include "dsputil_mmx.h"
+#endif
#include "libavcodec/rv34dsp.h"
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
@@ -190,10 +193,12 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext
*dsp)
if (mm_flags & AV_CPU_FLAG_MMX) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
+#if HAVE_INLINE_ASM
c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_mmx;
c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_mmx;
c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_mmx;
c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_mmx;
+#endif
#if ARCH_X86_32
QPEL_MC_SET(put_, _mmx)
#endif
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index e1f5145..a6da9b4 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -27,8 +27,9 @@
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
-#include "dsputil_mmx.h"
#include "libavcodec/vc1dsp.h"
+#if HAVE_INLINE_ASM
+#include "dsputil_mmx.h"
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
@@ -682,6 +683,8 @@ static void vc1_inv_trans_8x8_dc_mmx2(uint8_t *dest, int
linesize, DCTELEM *bloc
);
}
+#endif /* HAVE_INLINE_ASM */
+
#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
@@ -730,6 +733,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
int mm_flags = av_get_cpu_flags();
+#if HAVE_INLINE_ASM
if (mm_flags & AV_CPU_FLAG_MMX) {
dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
@@ -778,6 +782,7 @@ void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2;
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2;
}
+#endif /* HAVE_INLINE_ASM */
#define ASSIGN_LF(EXT) \
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 791cc8e..6d07214 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -38,13 +38,11 @@ cextern pb_1
cextern pb_3
cextern pb_7
cextern pb_1F
+cextern pb_80
cextern pb_81
cextern pw_8
-cextern put_signed_pixels_clamped_mmx
-cextern add_pixels_clamped_mmx
-
SECTION .text
; this is off by one or two for some cases when filter_limit is greater than 63
@@ -523,60 +521,100 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%endmacro
-%macro vp3_idct_funcs 3
-cglobal vp3_idct_%1, 1, 1, %2
+%macro vp3_idct_funcs 1
+cglobal vp3_idct_%1, 1, 1, 9
VP3_IDCT_%1 r0
RET
-cglobal vp3_idct_put_%1, 3, %3, %2
+cglobal vp3_idct_put_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
+
+ movsxdifnidn r1, r1d
+ mova m4, [pb_80]
+ lea r3, [r1*3]
+%assign %%i 0
+%rep 16/mmsize
+ mova m0, [r2+mmsize*0+%%i]
+ mova m1, [r2+mmsize*2+%%i]
+ mova m2, [r2+mmsize*4+%%i]
+ mova m3, [r2+mmsize*6+%%i]
+ packsswb m0, [r2+mmsize*1+%%i]
+ packsswb m1, [r2+mmsize*3+%%i]
+ packsswb m2, [r2+mmsize*5+%%i]
+ packsswb m3, [r2+mmsize*7+%%i]
+ paddb m0, m4
+ paddb m1, m4
+ paddb m2, m4
+ paddb m3, m4
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1 ], m1
+ movq [r0+r1*2], m2
+ movq [r0+r3 ], m3
%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+ movhps [r0+r1 ], m0
+ movq [r0+r1*2], m1
+ movhps [r0+r3 ], m1
%endif
-%if WIN64
- call put_signed_pixels_clamped_mmx
- RET
-%else
- jmp put_signed_pixels_clamped_mmx
+%if %%i == 0
+ lea r0, [r0+r1*4]
+%endif
+%if mmsize == 16
+ movq [r0 ], m2
+ movhps [r0+r1 ], m2
+ movq [r0+r1*2], m3
+ movhps [r0+r3 ], m3
%endif
+%assign %%i %%i+64
+%endrep
+ RET
-cglobal vp3_idct_add_%1, 3, %3, %2
+cglobal vp3_idct_add_%1, 3, 4, 9
VP3_IDCT_%1 r2
-%if ARCH_X86_64
- mov r3, r2
- mov r2, r1
- mov r1, r0
- mov r0, r3
-%else
- mov r0m, r2
- mov r1m, r0
- mov r2m, r1
+
+ mov r3, 4
+ pxor m4, m4
+ movsxdifnidn r1, r1d
+.loop:
+ movq m0, [r0]
+ movq m1, [r0+r1]
+%if mmsize == 8
+ mova m2, m0
+ mova m3, m1
%endif
-%if WIN64
- call add_pixels_clamped_mmx
- RET
-%else
- jmp add_pixels_clamped_mmx
+ punpcklbw m0, m4
+ punpcklbw m1, m4
+%if mmsize == 8
+ punpckhbw m2, m4
+ punpckhbw m3, m4
+%endif
+ paddsw m0, [r2+ 0]
+ paddsw m1, [r2+16]
+%if mmsize == 8
+ paddsw m2, [r2+ 8]
+ paddsw m3, [r2+24]
+ packuswb m0, m2
+ packuswb m1, m3
+%else ; mmsize == 16
+ packuswb m0, m1
%endif
+ movq [r0 ], m0
+%if mmsize == 8
+ movq [r0+r1], m1
+%else ; mmsize == 16
+ movhps [r0+r1], m0
+%endif
+ lea r0, [r0+r1*2]
+ add r2, 32
+ dec r3
+ jg .loop
+ RET
%endmacro
-%if ARCH_X86_64
-%define REGS 4
-%else
-%define REGS 3
-%endif
INIT_MMX
-vp3_idct_funcs mmx, 0, REGS
+vp3_idct_funcs mmx
INIT_XMM
-vp3_idct_funcs sse2, 9, REGS
-%undef REGS
+vp3_idct_funcs sse2
%macro DC_ADD 0
movq m2, [r0 ]
diff --git a/libavfilter/vf_gradfun.c b/libavfilter/vf_gradfun.c
index 303e54c..0965e6d 100644
--- a/libavfilter/vf_gradfun.c
+++ b/libavfilter/vf_gradfun.c
@@ -135,12 +135,14 @@ static av_cold int init(AVFilterContext *ctx, const char
*args, void *opaque)
gf->blur_line = ff_gradfun_blur_line_c;
gf->filter_line = ff_gradfun_filter_line_c;
+#if HAVE_INLINE_ASM
if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX2)
gf->filter_line = ff_gradfun_filter_line_mmx2;
if (HAVE_SSSE3 && cpu_flags & AV_CPU_FLAG_SSSE3)
gf->filter_line = ff_gradfun_filter_line_ssse3;
if (HAVE_SSE && cpu_flags & AV_CPU_FLAG_SSE2)
gf->blur_line = ff_gradfun_blur_line_sse2;
+#endif
av_log(ctx, AV_LOG_INFO, "threshold:%.2f radius:%d\n", thresh, gf->radius);
diff --git a/libavfilter/vf_yadif.c b/libavfilter/vf_yadif.c
index baf8b7a..94aac83 100644
--- a/libavfilter/vf_yadif.c
+++ b/libavfilter/vf_yadif.c
@@ -171,9 +171,7 @@ static void filter(AVFilterContext *ctx, AVFilterBufferRef
*dstpic,
}
}
}
-#if HAVE_MMX
- __asm__ volatile("emms \n\t" : : : "memory");
-#endif
+ emms_c();
}
static AVFilterBufferRef *get_video_buffer(AVFilterLink *link, int perms, int
w, int h)
@@ -407,12 +405,14 @@ static av_cold int init(AVFilterContext *ctx, const char
*args, void *opaque)
if (args) sscanf(args, "%d:%d:%d", &yadif->mode, &yadif->parity,
&yadif->auto_enable);
yadif->filter_line = filter_line_c;
+#if HAVE_INLINE_ASM
if (HAVE_SSSE3 && cpu_flags & AV_CPU_FLAG_SSSE3)
yadif->filter_line = ff_yadif_filter_line_ssse3;
else if (HAVE_SSE && cpu_flags & AV_CPU_FLAG_SSE2)
yadif->filter_line = ff_yadif_filter_line_sse2;
else if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX)
yadif->filter_line = ff_yadif_filter_line_mmx;
+#endif
av_log(ctx, AV_LOG_INFO, "mode:%d parity:%d auto_enable:%d\n",
yadif->mode, yadif->parity, yadif->auto_enable);
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index e98693d..a8e5e2d 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,2 +1,2 @@
-MMX-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o
-MMX-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/gradfun.o
+INLINEASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/yadif.o
+INLINEASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/gradfun.o
diff --git a/libavutil/internal.h b/libavutil/internal.h
index ae678d5..14fc303 100644
--- a/libavutil/internal.h
+++ b/libavutil/internal.h
@@ -35,6 +35,9 @@
#include <stddef.h>
#include <assert.h>
#include "config.h"
+#if HAVE_MMX && !HAVE_INLINE_ASM
+#include <mmintrin.h> /* _mm_empty(), used by emms_c() when inline asm is unavailable */
+#endif
#include "attributes.h"
#include "timer.h"
#include "dict.h"
@@ -110,7 +113,7 @@ struct AVDictionary {
/* math */
-#if ARCH_X86
+#if ARCH_X86 && HAVE_INLINE_ASM
#define MASK_ABS(mask, level)\
__asm__ volatile(\
"cltd \n\t"\
@@ -239,7 +242,11 @@ struct AVDictionary {
*/
static av_always_inline void emms_c(void)
{
+#if HAVE_INLINE_ASM
__asm__ volatile ("emms" ::: "memory");
+#else
+ _mm_empty();
+#endif
}
#else /* HAVE_MMX */
#define emms_c()
diff --git a/libavutil/intmath.h b/libavutil/intmath.h
index e6a2e10..f7baff5 100644
--- a/libavutil/intmath.h
+++ b/libavutil/intmath.h
@@ -34,7 +34,7 @@ extern const uint32_t ff_inverse[257];
#if ARCH_ARM
# include "arm/intmath.h"
-#elif ARCH_X86
+#elif ARCH_X86 && HAVE_INLINE_ASM
# include "x86/intmath.h"
#endif
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index 2424fe4..122f20f 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -25,6 +25,27 @@
#include "libavutil/x86_cpu.h"
#include "libavutil/cpu.h"
+#if !HAVE_INLINE_ASM
+#include <intrin.h>
+#include <immintrin.h>
+
+#define cpuid(index, eax, ebx, ecx, edx) \
+ do { \
+ int info[4]; /* filled by __cpuid as { EAX, EBX, ECX, EDX } */ \
+ __cpuid(info, index); \
+ eax = info[0]; \
+ ebx = info[1]; \
+ ecx = info[2]; \
+ edx = info[3]; \
+ } while (0)
+
+#define xgetbv(index, a, d) \
+ do { \
+ uint64_t res = __xgetbv(index); /* 64-bit result of the intrinsic */ \
+ a = res; /* low half goes to a (truncated if a is 32-bit) */ \
+ d = res >> 32; /* upper 32 bits go to d */ \
+ } while (0)
+#else
/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
#define cpuid(index,eax,ebx,ecx,edx)\
__asm__ volatile\
@@ -37,6 +58,7 @@
#define xgetbv(index,eax,edx) \
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c" (index))
+#endif /* HAVE_INLINE_ASM */
/* Function to test if multimedia instructions are supported... */
int ff_get_cpu_flags_x86(void)
@@ -49,6 +71,20 @@ int ff_get_cpu_flags_x86(void)
#if ARCH_X86_32
x86_reg a, c;
+#if !HAVE_INLINE_ASM
+ __asm {
+ pushfd
+ pop eax
+ mov ebx, eax
+ xor eax, 0x200000
+ push eax
+ popfd
+ pushfd
+ pop eax
+ mov a, eax
+ mov c, ebx
+ }
+#else
__asm__ volatile (
/* See if CPUID instruction is supported ... */
/* ... Get copies of EFLAGS into eax and ecx */
@@ -69,6 +105,7 @@ int ff_get_cpu_flags_x86(void)
:
: "cc"
);
+#endif /* HAVE_INLINE_ASM */
if (a == c)
return 0; /* CPUID not supported */
diff --git a/libavutil/x86/timer.h b/libavutil/x86/timer.h
index 7f51816..31bf6f9 100644
--- a/libavutil/x86/timer.h
+++ b/libavutil/x86/timer.h
@@ -22,14 +22,22 @@
#define AVUTIL_X86_TIMER_H
#include <stdint.h>
+#include "config.h"
+#if !HAVE_INLINE_ASM
+#include <intrin.h>
+#endif
#define AV_READ_TIME read_time
static inline uint64_t read_time(void)
{
+#if HAVE_INLINE_ASM
uint32_t a, d;
__asm__ volatile("rdtsc" : "=a" (a), "=d" (d));
return ((uint64_t)d << 32) + a;
+#else
+ return __rdtsc();
+#endif
}
#endif /* AVUTIL_X86_TIMER_H */
diff --git a/libavutil/x86_cpu.h b/libavutil/x86_cpu.h
index f84eba6..4e8508c 100644
--- a/libavutil/x86_cpu.h
+++ b/libavutil/x86_cpu.h
@@ -95,4 +95,6 @@ typedef int x86_reg;
# define XMM_CLOBBERS_ONLY(...)
#endif
+typedef struct { uint64_t a, b; } xmm_reg;
+
#endif /* AVUTIL_X86_CPU_H */
diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
index 14b595f..eefc001 100644
--- a/libswscale/rgb2rgb.c
+++ b/libswscale/rgb2rgb.c
@@ -128,7 +128,7 @@ void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t
*vdst,
void sws_rgb2rgb_init(void)
{
rgb2rgb_init_c();
- if (HAVE_MMX)
+ if (HAVE_MMX && HAVE_INLINE_ASM)
rgb2rgb_init_x86();
}
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 7ae5af3..0f8ef2b 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -518,7 +518,7 @@ static int swScale(SwsContext *c, const uint8_t *src[],
if (!enough_lines)
break; // we can't output a dstY line so let's try with the next
slice
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_INLINE_ASM
updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex,
lastInLumBuf, lastInChrBuf);
#endif
@@ -661,7 +661,7 @@ static int swScale(SwsContext *c, const uint8_t *src[],
if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf)
fillPlane(dst[3], dstStride[3], dstW, dstY - lastDstY, lastDstY, 255);
-#if HAVE_MMX2
+#if HAVE_MMX2 && HAVE_INLINE_ASM
if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
__asm__ volatile ("sfence" ::: "memory");
#endif
diff --git a/libswscale/utils.c b/libswscale/utils.c
index d8fee58..a6b5a18 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -576,7 +576,7 @@ fail:
return ret;
}
-#if HAVE_MMX2
+#if HAVE_MMX2 && HAVE_INLINE_ASM
static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode,
int16_t *filter, int32_t *filterPos, int numSplits)
{
@@ -739,7 +739,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t
*filterCode,
return fragmentPos + 1;
}
-#endif /* HAVE_MMX2 */
+#endif /* HAVE_MMX2 && HAVE_INLINE_ASM */
static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
{
@@ -971,7 +971,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
SwsFilter *dstFilter)
FF_ALLOC_OR_GOTO(c, c->formatConvBuffer,
(FFALIGN(srcW, 16) * 2 * FFALIGN(c->srcBpc, 8) >> 3) + 16,
fail);
- if (HAVE_MMX2 && cpu_flags & AV_CPU_FLAG_MMX2 &&
+ if (HAVE_MMX2 && HAVE_INLINE_ASM && cpu_flags & AV_CPU_FLAG_MMX2 &&
c->srcBpc == 8 && c->dstBpc <= 10) {
c->canMMX2BeUsed = (dstW >= srcW && (dstW & 31) == 0 &&
(srcW & 15) == 0) ? 1 : 0;
@@ -1010,7 +1010,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
SwsFilter *dstFilter)
/* precalculate horizontal scaler filter coefficients */
{
-#if HAVE_MMX2
+#if HAVE_MMX2 && HAVE_INLINE_ASM
// can't downscale !!!
if (c->canMMX2BeUsed && (flags & SWS_FAST_BILINEAR)) {
c->lumMmx2FilterCodeSize = initMMX2HScaler(dstW, c->lumXInc, NULL,
@@ -1046,7 +1046,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
SwsFilter *dstFilter)
mprotect(c->chrMmx2FilterCode, c->chrMmx2FilterCodeSize, PROT_EXEC
| PROT_READ);
#endif
} else
-#endif /* HAVE_MMX2 */
+#endif /* HAVE_MMX2 && HAVE_INLINE_ASM */
{
const int filterAlign =
(HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) ? 4 :
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 7f37799..0ff5f78 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -1,7 +1,8 @@
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
-MMX-OBJS += x86/rgb2rgb.o \
- x86/swscale_mmx.o \
+MMX-OBJS += x86/swscale_mmx.o
+
+INLINEASM-OBJS += x86/rgb2rgb.o \
x86/yuv2rgb_mmx.o \
YASM-OBJS += x86/input.o \
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index 99b3262..87eb2ee 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -27,6 +27,8 @@
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
+#if HAVE_INLINE_ASM
+
#define DITHER1XBPP
DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
@@ -199,6 +201,8 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int
lumBufIndex, int chrBufI
}
}
+#endif /* HAVE_INLINE_ASM */
+
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ##
opt( \
SwsContext *c, int16_t *data, \
@@ -300,12 +304,14 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
+#if HAVE_INLINE_ASM
if (cpu_flags & AV_CPU_FLAG_MMX)
sws_init_swScale_MMX(c);
#if HAVE_MMX2
if (cpu_flags & AV_CPU_FLAG_MMX2)
sws_init_swScale_MMX2(c);
#endif
+#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
diff --git a/libswscale/yuv2rgb.c b/libswscale/yuv2rgb.c
index 1c44a2f..6aae098 100644
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -533,7 +533,7 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
{
SwsFunc t = NULL;
- if (HAVE_MMX)
+ if (HAVE_MMX && HAVE_INLINE_ASM)
t = ff_yuv2rgb_init_mmx(c);
else if (HAVE_VIS)
t = ff_yuv2rgb_init_vis(c);
--
1.7.9.2
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel