On Mon, Jan 28, 2013 at 07:24:46AM -0800, Ronald S. Bultje wrote:
> From: "Ronald S. Bultje" <[email protected]>
> 
> This allows objects to use just halfpel MC without depending on all
> of dsputil. E.g. indeo3, interplayvideo and svq1dec become dsputil-
> independent. The fine-grained HAVE_HPEL_* flags allow only compiling
> a subset of the HPEL functions if a codec only uses a subset of them.
> Currently, only the C code uses this, I'll add this to the assembly
> at a later point.

Do you have patches for that already?

> In addition, I rewrote the C code, it's a little faster on 32bit x86
> and a lot faster on x86-64. The HAVE_HPEL_* flags could be moved to
> configure so they can accessed from assembly as well to conditionally
> compile optimizations, thus allowing further tuning of binary size.

Please split these off into separate patches.  This thing is huge and
unreviewable currently.

> --- a/configure
> +++ b/configure
> @@ -1456,10 +1459,12 @@ log2_deps="!msvcrt"
>  mpegaudio_select="mpegaudiodsp"
>  mpegaudiodsp_select="dct"
> -mpegvideo_select="videodsp"
> -mpegvideoenc_select="mpegvideo"
> +mpegvideo_select="hpelnornddsp videodsp"
> +mpegvideoenc_select="hpelnornddsp mpegvideo"

The change for mpegvideoenc is unnecessary.

> --- a/libavcodec/alpha/dsputil_alpha.c
> +++ b/libavcodec/alpha/hpeldsp_alpha.c
> @@ -212,7 +120,7 @@ static inline uint64_t avg4(uint64_t l1, uint64_t l2, 
> uint64_t l3, uint64_t l4)
>  #define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
>  static void OPNAME ## _pixels ## SUFF ## _axp                               \
>          (uint8_t *restrict block, const uint8_t *restrict pixels,           \
> -         int line_size, int h)                                              \
> +         ptrdiff_t line_size, int h)                                        \
>  {                                                                           \
> @@ -223,7 +131,7 @@ static void OPNAME ## _pixels ## SUFF ## _axp             
>                   \
>                                                                              \
>  static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
>          (uint8_t *restrict block, const uint8_t *restrict pixels,           \
> -         int line_size, int h)                                              \
> +         ptrdiff_t line_size, int h)                                        \
>  {                                                                           \
> @@ -262,17 +170,14 @@ PIXOP(put_no_rnd, STORE);
>  static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
> -                                 int line_size, int h)
> +                                 ptrdiff_t line_size, int h)

Probably a great idea, but could all be done separately.
See the patch I sent.

> --- /dev/null
> +++ b/libavcodec/arm/hpeldsp_arm.h
> @@ -0,0 +1,29 @@
> +/*
> + * Copyright (c) 2009 Mans Rullgard <[email protected]>

Mans did not write this file.

> --- /dev/null
> +++ b/libavcodec/arm/hpeldsp_init_arm.c
> @@ -0,0 +1,68 @@
> +
> +av_cold void ff_hpeldsp_init_arm(HpelDSPContext* c, int flags)
> +{
> +
> +    if (have_armv6(cpu_flags))   ff_hpeldsp_init_armv6(c, flags);
> +    if (have_neon(cpu_flags))    ff_hpeldsp_init_neon(c, flags);

Break the lines please.

> --- a/libavcodec/bfin/dsputil_bfin.c
> +++ b/libavcodec/bfin/dsputil_bfin.c
> @@ -203,9 +130,10 @@ void ff_dsputil_init_bfin( DSPContext* c, AVCodecContext 
> *avctx )
>      c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
>      c->add_pixels_clamped = ff_bfin_add_pixels_clamped;
>  
> -    if (!high_bit_depth)
> +    if (!high_bit_depth) {
>      c->get_pixels         = ff_bfin_get_pixels;
>      c->clear_blocks       = bfin_clear_blocks;
> +    }
>      c->pix_sum            = ff_bfin_pix_sum;
>      c->pix_norm1          = ff_bfin_pix_norm1;

This looks either wrong or unrelated - why does clear_blocks suddenly
depend on bit depth?

> --- a/libavcodec/dsputil.c
> +++ b/libavcodec/dsputil.c
> @@ -2748,7 +2730,7 @@ av_cold void ff_dsputil_init(DSPContext* c, 
> AVCodecContext *avctx)
>  
> -    if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
> +    //if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);

I guess you now owe me a beer for saving you from embarassment ...

> --- /dev/null
> +++ b/libavcodec/hpeldsp.h
> @@ -0,0 +1,132 @@
> +
> +#ifndef AVCODEC_HPELDSP_H
> +#define AVCODEC_HPELDSP_H
> +
> +typedef struct HpelDSPContext {
> +    hpel_mc_func put_pixels_tab[4][4];
> +    // FIXME move these flags to configure so .asm files can use them
> +    // (from config.asm) also
> +#define HAVE_HPEL_PUT_RND_16 \
> +    (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER || CONFIG_SVQ1_DECODER || \
> +     CONFIG_SVQ1_ENCODER || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_PUT_RND_8 \
> +    (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER || CONFIG_SVQ1_DECODER || \
> +     CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_PUT_RND_4 \
> +    (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_PUT_RND_2 \
> +    (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +
> +    /* these use fullpel version only */
> +#define HAVE_HPEL_PUT_FULLPEL_16 \
> +    (CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_INDEO3_DECODER || \
> +     CONFIG_INTERPLAY_VIDEO_DECODER)
> +#define HAVE_HPEL_PUT_FULLPEL_8 \
> +    (CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_MIMIC_DECODER || \
> +     CONFIG_INTERPLAY_VIDEO_DECODER || CONFIG_INDEO3_DECODER || \
> +     CONFIG_BINK_DECODER)
> +
> +    hpel_mc_func avg_pixels_tab[4][4];
> +#define HAVE_HPEL_AVG_RND_16 \
> +    (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_AVG_RND_8 \
> +    (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_AVG_RND_4 \
> +    (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +#define HAVE_HPEL_AVG_RND_2 \
> +    (CONFIG_MPEGVIDEO || CONFIG_SVQ3_DECODER)
> +
> +    hpel_mc_func put_no_rnd_pixels_tab[2][4];
> +#define HAVE_HPEL_PUT_NO_RND_16 \
> +    (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER)
> +#define HAVE_HPEL_PUT_NO_RND_8 \
> +    (CONFIG_MPEGVIDEO || CONFIG_VC1_DECODER || CONFIG_VP3_DECODER)
> +
> +    hpel_mc_func avg_no_rnd_pixels_tab[4];
> +#define HAVE_HPEL_AVG_NO_RND_16 \
> +    (CONFIG_VC1_DECODER)
> +} HpelDSPContext;

This FIXME ugliness needs to be addressed.

> +/* for internal use only */
> +void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_sh4(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_vis(HpelDSPContext *c, int flags);
> +void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);

Given that everything in uninstalled headers is for internal use only,
the comment seems kind of redundant.

> --- a/libavcodec/x86/hpeldsp.asm
> +++ b/libavcodec/x86/hpeldsp.asm
> @@ -463,3 +452,44 @@ INIT_MMX mmxext
>  AVG_PIXELS8_XY2
>  INIT_MMX 3dnow
>  AVG_PIXELS8_XY2
> +
> +INIT_XMM sse2
> +; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t 
> line_size, int h)
> +cglobal put_pixels16, 4,5,4
> +    lea          r4, [r2*3]
> +.loop:
> +    movu         m0, [r1]
> +    movu         m1, [r1+r2]
> +    movu         m2, [r1+r2*2]
> +    movu         m3, [r1+r4]
> +    lea          r1, [r1+r2*4]
> +    mova       [r0], m0
> +    mova    [r0+r2], m1
> +    mova  [r0+r2*2], m2
> +    mova    [r0+r4], m3
> +    sub         r3d, 4
> +    lea          r0, [r0+r2*4]
> +    jnz       .loop
> +    REP_RET
> +
> +; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t 
> line_size, int h)
> +cglobal avg_pixels16, 4,5,4
> +    lea          r4, [r2*3]
> +.loop:
> +    movu         m0, [r1]
> +    movu         m1, [r1+r2]
> +    movu         m2, [r1+r2*2]
> +    movu         m3, [r1+r4]
> +    lea          r1, [r1+r2*4]
> +    pavgb        m0, [r0]
> +    pavgb        m1, [r0+r2]
> +    pavgb        m2, [r0+r2*2]
> +    pavgb        m3, [r0+r4]
> +    mova       [r0], m0
> +    mova    [r0+r2], m1
> +    mova  [r0+r2*2], m2
> +    mova    [r0+r4], m3
> +    sub         r3d, 4
> +    lea          r0, [r0+r2*4]
> +    jnz       .loop
> +    REP_RET
> --- a/libavcodec/x86/dsputil.asm
> +++ b/libavcodec/x86/dsputil.asm
> @@ -648,46 +648,3 @@ BSWAP32_BUF
>  
>  INIT_XMM ssse3
>  BSWAP32_BUF
> -
> -INIT_XMM sse2
> -; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int 
> line_size, int h)
> -cglobal put_pixels16, 4,5,4
> -    movsxdifnidn r2, r2d
> -    lea          r4, [r2*3]
> -.loop:
> -    movu         m0, [r1]
> -    movu         m1, [r1+r2]
> -    movu         m2, [r1+r2*2]
> -    movu         m3, [r1+r4]
> -    lea          r1, [r1+r2*4]
> -    mova       [r0], m0
> -    mova    [r0+r2], m1
> -    mova  [r0+r2*2], m2
> -    mova    [r0+r4], m3
> -    sub         r3d, 4
> -    lea          r0, [r0+r2*4]
> -    jnz       .loop
> -    REP_RET
> -
> -; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int 
> line_size, int h)
> -cglobal avg_pixels16, 4,5,4
> -    movsxdifnidn r2, r2d
> -    lea          r4, [r2*3]
> -.loop:
> -    movu         m0, [r1]
> -    movu         m1, [r1+r2]
> -    movu         m2, [r1+r2*2]
> -    movu         m3, [r1+r4]
> -    lea          r1, [r1+r2*4]
> -    pavgb        m0, [r0]
> -    pavgb        m1, [r0+r2]
> -    pavgb        m2, [r0+r2*2]
> -    pavgb        m3, [r0+r4]
> -    mova       [r0], m0
> -    mova    [r0+r2], m1
> -    mova  [r0+r2*2], m2
> -    mova    [r0+r4], m3
> -    sub         r3d, 4
> -    lea          r0, [r0+r2*4]
> -    jnz       .loop
> -    REP_RET

This can be split off; has been split off already.

> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -93,56 +89,14 @@ void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, 
> uint8_t *src1,
> -void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
> -                          int line_size, int h);
> -void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
> -                              int line_size, int h);
> -void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
> -                             int line_size, int h);
> -void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
> -                              int line_size, int h);
> -void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
> -                             int line_size, int h);
> -void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
> -                               int line_size, int h);
> -void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
> -                              int line_size, int h);
>  
>  void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int 
> line_size, int h);

The last declaration looks like a stray leftover.

> --- /dev/null
> +++ b/libavcodec/x86/hpeldsp_init.c
> @@ -0,0 +1,422 @@
> +
> +#if HAVE_YASM
> +[...]
> +void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
> +                           ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
> +                          ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
> +                              ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
> +                             ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
> +                              ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
> +                             ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
> +                               ptrdiff_t line_size, int h);
> +void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
> +                              ptrdiff_t line_size, int h);
> +
> +void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t 
> line_size, int h);
> +
> +#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
> +#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
> +#endif /* HAVE_YASM */

The declarations are harmless and need no guarding ifdefs, the defines
likely as well.

> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -194,17 +148,10 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t 
> *dst, uint8_t *src,
> -#define MOVQ_BONE(regd)                                 \
> -    __asm__ volatile (                                  \
> -        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
> -        "psrlw          $15, %%"#regd"  \n\t"           \
> -        "packuswb %%"#regd", %%"#regd"  \n\t" ::)
> -
>  #define MOVQ_WTWO(regd)                                 \
>      __asm__ volatile (                                  \
>          "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
> @@ -216,14 +163,6 @@ void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t 
> *dst, uint8_t *src,
>  #define PAVGB_MMX(rega, regb, regr, regfe)                       \
>      "movq   "#rega", "#regr"            \n\t"                    \
>      "por    "#regb", "#regr"            \n\t"                    \
> --- /dev/null
> +++ b/libavcodec/x86/hpeldsp_init.c
> @@ -0,0 +1,422 @@
> +#define MOVQ_BONE(regd)                                 \
> +    __asm__ volatile (                                  \
> +        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
> +        "psrlw          $15, %%"#regd"  \n\t"           \
> +        "packuswb %%"#regd", %%"#regd"  \n\t" ::)
> +
> +#define MOVQ_WTWO(regd)                                 \
> +    __asm__ volatile (                                  \
> +        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
> +        "psrlw         $15, %%"#regd"   \n\t"           \
> +        "psllw          $1, %%"#regd"   \n\t"::)
> +
> +#endif
> +
> +#define PAVGB_MMX(rega, regb, regr, regfe)                       \
> +    "movq   "#rega", "#regr"            \n\t"                    \
> +    "por    "#regb", "#regr"            \n\t"                    \
> +    "pxor   "#rega", "#regb"            \n\t"                    \
> +    "pand  "#regfe", "#regb"            \n\t"                    \
> +    "psrlq       $1, "#regb"            \n\t"                    \
> +    "psubb  "#regb", "#regr"            \n\t"
> +
> +#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
> +    "movq  "#rega", "#regr"             \n\t"                    \
> +    "movq  "#regc", "#regp"             \n\t"                    \
> +    "por   "#regb", "#regr"             \n\t"                    \
> +    "por   "#regd", "#regp"             \n\t"                    \
> +    "pxor  "#rega", "#regb"             \n\t"                    \
> +    "pxor  "#regc", "#regd"             \n\t"                    \
> +    "pand    %%mm6, "#regb"             \n\t"                    \
> +    "pand    %%mm6, "#regd"             \n\t"                    \
> +    "psrlq      $1, "#regd"             \n\t"                    \
> +    "psrlq      $1, "#regb"             \n\t"                    \
> +    "psubb "#regb", "#regr"             \n\t"                    \
> +    "psubb "#regd", "#regp"             \n\t"

Hmm, at least MOVQ_WTWO and PAVGB_MMX seem to be duplicated now -
what changed?

Diego
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to