On Mon, Jan 28, 2013 at 07:24:46AM -0800, Ronald S. Bultje wrote:
> From: "Ronald S. Bultje" <[email protected]>
>
> This allows objects to use just halfpel MC without depending on all
> of dsputil. E.g. indeo3, interplayvideo and svq1dec become dsputil-
> independent. The fine-grained HAVE_HPEL_* flags allow only compiling
> a subset of the HPEL functions if a codec only uses a subset of them.
> Currently, only the C code uses this, I'll add this to the assembly
> at a later point.
Do you have patches for that already?
> In addition, I rewrote the C code, it's a little faster on 32bit x86
> and a lot faster on x86-64. The HAVE_HPEL_* flags could be moved to
> configure so they can accessed from assembly as well to conditionally
> compile optimizations, thus allowing further tuning of binary size.
Please split these off into separate patches. This thing is huge and
unreviewable currently.
> --- a/configure
> +++ b/configure
> @@ -1456,10 +1459,12 @@ log2_deps="!msvcrt"
> mpegaudio_select="mpegaudiodsp"
> mpegaudiodsp_select="dct"
> -mpegvideo_select="videodsp"
> -mpegvideoenc_select="mpegvideo"
> +mpegvideo_select="hpelnornddsp videodsp"
> +mpegvideoenc_select="hpelnornddsp mpegvideo"
The change for mpegvideoenc is unnecessary.
> --- a/libavcodec/alpha/dsputil_alpha.c
> +++ b/libavcodec/alpha/hpeldsp_alpha.c
> @@ -212,7 +120,7 @@ static inline uint64_t avg4(uint64_t l1, uint64_t l2,
> uint64_t l3, uint64_t l4)
> #define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
> static void OPNAME ## _pixels ## SUFF ## _axp \
> (uint8_t *restrict block, const uint8_t *restrict pixels, \
> - int line_size, int h) \
> + ptrdiff_t line_size, int h) \
> { \
> @@ -223,7 +131,7 @@ static void OPNAME ## _pixels ## SUFF ## _axp
> \
> \
> static void OPNAME ## _pixels16 ## SUFF ## _axp \
> (uint8_t *restrict block, const uint8_t *restrict pixels, \
> - int line_size, int h) \
> + ptrdiff_t line_size, int h) \
> { \
> @@ -262,17 +170,14 @@ PIXOP(put_no_rnd, STORE);
> static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
> - int line_size, int h)
> + ptrdiff_t line_size, int h)
Probably a great idea, but could all be done separately.
See the patch I sent.
> --- /dev/null
> +++ b/libavcodec/arm/hpeldsp_arm.h
> @@ -0,0 +1,29 @@
> +/*
> + * Copyright (c) 2009 Mans Rullgard <[email protected]>
Mans did not write this file.
> --- /dev/null
> +++ b/libavcodec/arm/hpeldsp_init_arm.c
> @@ -0,0 +1,68 @@
> +
> +av_cold void ff_hpeldsp_init_arm(HpelDSPContext* c, int flags)
> +{
> +
> + if (have_armv6(cpu_flags)) ff_hpeldsp_init_armv6(c, flags);
> + if (have_neon(cpu_flags)) ff_hpeldsp_init_neon(c, flags);
Break the lines please.
> --- a/libavcodec/bfin/dsputil_bfin.c
> +++ b/libavcodec/bfin/dsputil_bfin.c
> @@ -203,9 +130,10 @@ void ff_dsputil_init_bfin( DSPContext* c, AVCodecContext
> *avctx )
> c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
> c->add_pixels_clamped = ff_bfin_add_pixels_clamped;
>
> - if (!high_bit_depth)
> + if (!high_bit_depth) {
> c->get_pixels = ff_bfin_get_pixels;
> c->clear_blocks = bfin_clear_blocks;
> + }
> c->pix_sum = ff_bfin_pix_sum;
> c->pix_norm1 = ff_bfin_pix_norm1;
This looks either wrong or unrelated - why does clear_blocks suddenly
depend on bit depth?
> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -2748,7 +2730,7 @@ av_cold void ff_dsputil_init(DSPContext* c,
> AVCodecContext *avctx)
>
> - if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
> + //if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
I guess you now owe me a beer for saving you from embarassment ...
> --- /dev/null
> +++ b/libavcodec/hpeldsp.h
> @@ -0,0 +1,132 @@
> +
> +#ifndef AVCODEC_HPELDSP_H
> +#define AVCODEC_HPELDSP_H
> +
> +typedef struct HpelDSPContext {
> + hpel_mc_func put_pixels_tab[4][4];
> + // FIXME move these flags to configure so .asm files can use them
> + // (from config.asm) also
> +#define HAVE_HPEL_PUT_RND_16 \
> + (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER || CONFIG_SVQ1_DECODER || \
> + CONFIG_SVQ1_ENCODER || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_PUT_RND_8 \
> + (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER || CONFIG_SVQ1_DECODER || \
> + CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_PUT_RND_4 \
> + (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_PUT_RND_2 \
> + (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +
> + /* these use fullpel version only */
> +#define HAVE_HPEL_PUT_FULLPEL_16 \
> + (CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_INDEO3_DECODER || \
> + CONFIG_INTERPLAY_VIDEO_DECODER)
> +#define HAVE_HPEL_PUT_FULLPEL_8 \
> + (CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_MIMIC_DECODER || \
> + CONFIG_INTERPLAY_VIDEO_DECODER || CONFIG_INDEO3_DECODER || \
> + CONFIG_BINK_DECODER)
> +
> + hpel_mc_func avg_pixels_tab[4][4];
> +#define HAVE_HPEL_AVG_RND_16 \
> + (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_AVG_RND_8 \
> + (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_AVG_RND_4 \
> + (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_AVG_RND_2 \
> + (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +
> + hpel_mc_func put_no_rnd_pixels_tab[2][4];
> +#define HAVE_HPEL_PUT_NO_RND_16 \
> + (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER)
> +#define HAVE_HPEL_PUT_NO_RND_8 \
> + (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER || CONFIG_VP3_DECODER)
> +
> + hpel_mc_func avg_no_rnd_pixels_tab[4];
> +#define HAVE_HPEL_AVG_NO_RND_16 \
> + (CONFIG_VC1_DECODER)
> +} HpelDSPContext;
This FIXME ugliness needs to be addressed.
> +/* for internal use only */
> +void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_sh4(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_vis(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);
Given that everything in uninstalled headers is for internal use only,
the comment seems kind of redundant.
> --- a/libavcodec/x86/hpeldsp.asm
> +++ b/libavcodec/x86/hpeldsp.asm
> @@ -463,3 +452,44 @@ INIT_MMX mmxext
> AVG_PIXELS8_XY2
> INIT_MMX 3dnow
> AVG_PIXELS8_XY2
> +
> +INIT_XMM sse2
> +; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
> line_size, int h)
> +cglobal put_pixels16, 4,5,4
> + lea r4, [r2*3]
> +.loop:
> + movu m0, [r1]
> + movu m1, [r1+r2]
> + movu m2, [r1+r2*2]
> + movu m3, [r1+r4]
> + lea r1, [r1+r2*4]
> + mova [r0], m0
> + mova [r0+r2], m1
> + mova [r0+r2*2], m2
> + mova [r0+r4], m3
> + sub r3d, 4
> + lea r0, [r0+r2*4]
> + jnz .loop
> + REP_RET
> +
> +; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
> line_size, int h)
> +cglobal avg_pixels16, 4,5,4
> + lea r4, [r2*3]
> +.loop:
> + movu m0, [r1]
> + movu m1, [r1+r2]
> + movu m2, [r1+r2*2]
> + movu m3, [r1+r4]
> + lea r1, [r1+r2*4]
> + pavgb m0, [r0]
> + pavgb m1, [r0+r2]
> + pavgb m2, [r0+r2*2]
> + pavgb m3, [r0+r4]
> + mova [r0], m0
> + mova [r0+r2], m1
> + mova [r0+r2*2], m2
> + mova [r0+r4], m3
> + sub r3d, 4
> + lea r0, [r0+r2*4]
> + jnz .loop
> + REP_RET
> --- a/libavcodec/x86/dsputil.asm
> +++ b/libavcodec/x86/dsputil.asm
> @@ -648,46 +648,3 @@ BSWAP32_BUF
>
> INIT_XMM ssse3
> BSWAP32_BUF
> -
> -INIT_XMM sse2
> -; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int
> line_size, int h)
> -cglobal put_pixels16, 4,5,4
> - movsxdifnidn r2, r2d
> - lea r4, [r2*3]
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+r2]
> - movu m2, [r1+r2*2]
> - movu m3, [r1+r4]
> - lea r1, [r1+r2*4]
> - mova [r0], m0
> - mova [r0+r2], m1
> - mova [r0+r2*2], m2
> - mova [r0+r4], m3
> - sub r3d, 4
> - lea r0, [r0+r2*4]
> - jnz .loop
> - REP_RET
> -
> -; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int
> line_size, int h)
> -cglobal avg_pixels16, 4,5,4
> - movsxdifnidn r2, r2d
> - lea r4, [r2*3]
> -.loop:
> - movu m0, [r1]
> - movu m1, [r1+r2]
> - movu m2, [r1+r2*2]
> - movu m3, [r1+r4]
> - lea r1, [r1+r2*4]
> - pavgb m0, [r0]
> - pavgb m1, [r0+r2]
> - pavgb m2, [r0+r2*2]
> - pavgb m3, [r0+r4]
> - mova [r0], m0
> - mova [r0+r2], m1
> - mova [r0+r2*2], m2
> - mova [r0+r4], m3
> - sub r3d, 4
> - lea r0, [r0+r2*4]
> - jnz .loop
> - REP_RET
This can be split off; has been split off already.
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -93,56 +89,14 @@ void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst,
> uint8_t *src1,
> -void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
> - int line_size, int h);
> -void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
> - int line_size, int h);
> -void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
> - int line_size, int h);
> -void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
> - int line_size, int h);
> -void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
> - int line_size, int h);
> -void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
> - int line_size, int h);
> -void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
> - int line_size, int h);
>
> void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int
> line_size, int h);
The last declaration looks like a stray leftover.
> --- /dev/null
> +++ b/libavcodec/x86/hpeldsp_init.c
> @@ -0,0 +1,422 @@
> +
> +#if HAVE_YASM
> +[...]
> +void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
> + ptrdiff_t line_size, int h);
> +
> +void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t
> line_size, int h);
> +
> +#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
> +#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
> +#endif /* HAVE_YASM */
The declarations are harmless and need no guarding ifdefs, the defines
likely as well.
> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -194,17 +148,10 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t
> *dst, uint8_t *src,
> -#define MOVQ_BONE(regd) \
> - __asm__ volatile ( \
> - "pcmpeqd %%"#regd", %%"#regd" \n\t" \
> - "psrlw $15, %%"#regd" \n\t" \
> - "packuswb %%"#regd", %%"#regd" \n\t" ::)
> -
> #define MOVQ_WTWO(regd) \
> __asm__ volatile ( \
> "pcmpeqd %%"#regd", %%"#regd" \n\t" \
> @@ -216,14 +163,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t
> *dst, uint8_t *src,
> #define PAVGB_MMX(rega, regb, regr, regfe) \
> "movq "#rega", "#regr" \n\t" \
> "por "#regb", "#regr" \n\t" \
> --- /dev/null
> +++ b/libavcodec/x86/hpeldsp_init.c
> @@ -0,0 +1,422 @@
> +#define MOVQ_BONE(regd) \
> + __asm__ volatile ( \
> + "pcmpeqd %%"#regd", %%"#regd" \n\t" \
> + "psrlw $15, %%"#regd" \n\t" \
> + "packuswb %%"#regd", %%"#regd" \n\t" ::)
> +
> +#define MOVQ_WTWO(regd) \
> + __asm__ volatile ( \
> + "pcmpeqd %%"#regd", %%"#regd" \n\t" \
> + "psrlw $15, %%"#regd" \n\t" \
> + "psllw $1, %%"#regd" \n\t"::)
> +
> +#endif
> +
> +#define PAVGB_MMX(rega, regb, regr, regfe) \
> + "movq "#rega", "#regr" \n\t" \
> + "por "#regb", "#regr" \n\t" \
> + "pxor "#rega", "#regb" \n\t" \
> + "pand "#regfe", "#regb" \n\t" \
> + "psrlq $1, "#regb" \n\t" \
> + "psubb "#regb", "#regr" \n\t"
> +
> +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
> + "movq "#rega", "#regr" \n\t" \
> + "movq "#regc", "#regp" \n\t" \
> + "por "#regb", "#regr" \n\t" \
> + "por "#regd", "#regp" \n\t" \
> + "pxor "#rega", "#regb" \n\t" \
> + "pxor "#regc", "#regd" \n\t" \
> + "pand %%mm6, "#regb" \n\t" \
> + "pand %%mm6, "#regd" \n\t" \
> + "psrlq $1, "#regd" \n\t" \
> + "psrlq $1, "#regb" \n\t" \
> + "psubb "#regb", "#regr" \n\t" \
> + "psubb "#regd", "#regp" \n\t"
Hmm, at least MOVQ_WTWO and PAVGB_MMX seem to be duplicated now -
what changed?
Diego
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel