On Mon, Jan 18, 2016 at 4:38 PM, Diego Biurrun <[email protected]> wrote:
> ---
>  libavcodec/x86/Makefile           |   2 +
>  libavcodec/x86/hpeldsp.asm        |  89 ------------------------------
>  libavcodec/x86/hpeldsp.h          |   4 ++
>  libavcodec/x86/hpeldsp_init.c     |  25 ++-------
>  libavcodec/x86/hpeldsp_vp3.asm    | 111 ++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/hpeldsp_vp3_init.c |  54 +++++++++++++++++++
>  6 files changed, 174 insertions(+), 111 deletions(-)
>  create mode 100644 libavcodec/x86/hpeldsp_vp3.asm
>  create mode 100644 libavcodec/x86/hpeldsp_vp3_init.c
>
> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
> index 4afd0a7..1ccea035 100644
> --- a/libavcodec/x86/Makefile
> +++ b/libavcodec/x86/Makefile
> @@ -50,6 +50,7 @@ OBJS-$(CONFIG_TRUEHD_DECODER)          += x86/mlpdsp.o
>  OBJS-$(CONFIG_V210_ENCODER)            += x86/v210enc_init.o
>  OBJS-$(CONFIG_VC1_DECODER)             += x86/vc1dsp_init.o
>  OBJS-$(CONFIG_VORBIS_DECODER)          += x86/vorbisdsp_init.o
> +OBJS-$(CONFIG_VP3_DECODER)             += x86/hpeldsp_vp3_init.o
>  OBJS-$(CONFIG_VP6_DECODER)             += x86/vp6dsp_init.o
>  OBJS-$(CONFIG_VP9_DECODER)             += x86/vp9dsp_init.o
>
> @@ -120,5 +121,6 @@ YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv40dsp.o
>  YASM-OBJS-$(CONFIG_V210_ENCODER)       += x86/v210enc.o
>  YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
>  YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
> +YASM-OBJS-$(CONFIG_VP3_DECODER)        += x86/hpeldsp_vp3.o
>  YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp6dsp.o
>  YASM-OBJS-$(CONFIG_VP9_DECODER)        += x86/vp9dsp.o
> diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
> index b8929b9..8e21114 100644
> --- a/libavcodec/x86/hpeldsp.asm
> +++ b/libavcodec/x86/hpeldsp.asm
> @@ -142,53 +142,6 @@ INIT_MMX 3dnow
>  PUT_NO_RND_PIXELS8_X2
>
>
> -; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
> -%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
> -cglobal put_no_rnd_pixels8_x2_exact, 4,5
> -    lea          r4, [r2*3]
> -    pcmpeqb      m6, m6
> -.loop:
> -    mova         m0, [r1]
> -    mova         m2, [r1+r2]
> -    mova         m1, [r1+1]
> -    mova         m3, [r1+r2+1]
> -    pxor         m0, m6
> -    pxor         m2, m6
> -    pxor         m1, m6
> -    pxor         m3, m6
> -    PAVGB        m0, m1
> -    PAVGB        m2, m3
> -    pxor         m0, m6
> -    pxor         m2, m6
> -    mova       [r0], m0
> -    mova    [r0+r2], m2
> -    mova         m0, [r1+r2*2]
> -    mova         m1, [r1+r2*2+1]
> -    mova         m2, [r1+r4]
> -    mova         m3, [r1+r4+1]
> -    pxor         m0, m6
> -    pxor         m1, m6
> -    pxor         m2, m6
> -    pxor         m3, m6
> -    PAVGB        m0, m1
> -    PAVGB        m2, m3
> -    pxor         m0, m6
> -    pxor         m2, m6
> -    mova  [r0+r2*2], m0
> -    mova    [r0+r4], m2
> -    lea          r1, [r1+r2*4]
> -    lea          r0, [r0+r2*4]
> -    sub         r3d, 4
> -    jg .loop
> -    REP_RET
> -%endmacro
> -
> -INIT_MMX mmxext
> -PUT_NO_RND_PIXELS8_X2_EXACT
> -INIT_MMX 3dnow
> -PUT_NO_RND_PIXELS8_X2_EXACT
> -
> -
>  ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
>  %macro PUT_PIXELS8_Y2 0
>  cglobal put_pixels8_y2, 4,5
> @@ -260,48 +213,6 @@ INIT_MMX 3dnow
>  PUT_NO_RND_PIXELS8_Y2
>
>
> -; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
> -%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
> -cglobal put_no_rnd_pixels8_y2_exact, 4,5
> -    lea          r4, [r2*3]
> -    mova         m0, [r1]
> -    pcmpeqb      m6, m6
> -    add          r1, r2
> -    pxor         m0, m6
> -.loop:
> -    mova         m1, [r1]
> -    mova         m2, [r1+r2]
> -    pxor         m1, m6
> -    pxor         m2, m6
> -    PAVGB        m0, m1
> -    PAVGB        m1, m2
> -    pxor         m0, m6
> -    pxor         m1, m6
> -    mova       [r0], m0
> -    mova    [r0+r2], m1
> -    mova         m1, [r1+r2*2]
> -    mova         m0, [r1+r4]
> -    pxor         m1, m6
> -    pxor         m0, m6
> -    PAVGB        m2, m1
> -    PAVGB        m1, m0
> -    pxor         m2, m6
> -    pxor         m1, m6
> -    mova  [r0+r2*2], m2
> -    mova    [r0+r4], m1
> -    lea          r1, [r1+r2*4]
> -    lea          r0, [r0+r2*4]
> -    sub         r3d, 4
> -    jg .loop
> -    REP_RET
> -%endmacro
> -
> -INIT_MMX mmxext
> -PUT_NO_RND_PIXELS8_Y2_EXACT
> -INIT_MMX 3dnow
> -PUT_NO_RND_PIXELS8_Y2_EXACT
> -
> -
>  ; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
>  %macro AVG_PIXELS8 0
>  cglobal avg_pixels8, 4,5
> diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
> index 47b0b8b..d624ed9 100644
> --- a/libavcodec/x86/hpeldsp.h
> +++ b/libavcodec/x86/hpeldsp.h
> @@ -22,6 +22,8 @@
>  #include <stddef.h>
>  #include <stdint.h>
>
> +#include "libavcodec/hpeldsp.h"
> +
>  void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
>                             ptrdiff_t line_size, int h);
>
> @@ -35,4 +37,6 @@ void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t 
> *pixels,
>  void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
>                               ptrdiff_t line_size, int h);
>
> +void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int flags);
> +
>  #endif /* AVCODEC_X86_HPELDSP_H */
> diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
> index 59cb5e1..9ca2505 100644
> --- a/libavcodec/x86/hpeldsp_init.c
> +++ b/libavcodec/x86/hpeldsp_init.c
> @@ -44,12 +44,6 @@ void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const 
> uint8_t *pixels,
>                                       ptrdiff_t line_size, int h);
>  void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
>                                      ptrdiff_t line_size, int h);
> -void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
> -                                           const uint8_t *pixels,
> -                                           ptrdiff_t line_size, int h);
> -void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
> -                                          const uint8_t *pixels,
> -                                          ptrdiff_t line_size, int h);
>  void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
>                                ptrdiff_t line_size, int h);
>  void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
> @@ -58,12 +52,6 @@ void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const 
> uint8_t *pixels,
>                                       ptrdiff_t line_size, int h);
>  void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
>                                      ptrdiff_t line_size, int h);
> -void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
> -                                           const uint8_t *pixels,
> -                                           ptrdiff_t line_size, int h);
> -void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
> -                                          const uint8_t *pixels,
> -                                          ptrdiff_t line_size, int h);
>  void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
>                            ptrdiff_t line_size, int h);
>  void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
> @@ -210,11 +198,6 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int 
> flags, int cpu_flags)
>          c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
>          c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
>      }
> -
> -    if (flags & AV_CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
> -        c->put_no_rnd_pixels_tab[1][1] = 
> ff_put_no_rnd_pixels8_x2_exact_mmxext;
> -        c->put_no_rnd_pixels_tab[1][2] = 
> ff_put_no_rnd_pixels8_y2_exact_mmxext;
> -    }
>  #endif /* HAVE_MMXEXT_EXTERNAL */
>  }
>
> @@ -244,11 +227,6 @@ static void hpeldsp_init_3dnow(HpelDSPContext *c, int 
> flags, int cpu_flags)
>          c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
>          c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
>      }
> -
> -    if (flags & AV_CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
> -        c->put_no_rnd_pixels_tab[1][1] = 
> ff_put_no_rnd_pixels8_x2_exact_3dnow;
> -        c->put_no_rnd_pixels_tab[1][2] = 
> ff_put_no_rnd_pixels8_y2_exact_3dnow;
> -    }
>  #endif /* HAVE_AMD3DNOW_EXTERNAL */
>  }
>
> @@ -279,4 +257,7 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int 
> flags)
>
>      if (EXTERNAL_SSE2(cpu_flags))
>          hpeldsp_init_sse2(c, flags, cpu_flags);
> +
> +    if (CONFIG_VP3_DECODER)
> +        ff_hpeldsp_vp3_init_x86(c, cpu_flags, flags);
>  }
> diff --git a/libavcodec/x86/hpeldsp_vp3.asm b/libavcodec/x86/hpeldsp_vp3.asm
> new file mode 100644
> index 0000000..513f14e
> --- /dev/null
> +++ b/libavcodec/x86/hpeldsp_vp3.asm
> @@ -0,0 +1,111 @@
> +;******************************************************************************
> +;* SIMD-optimized halfpel functions for VP3
> +;*
> +;* This file is part of Libav.
> +;*
> +;* Libav is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* Libav is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with Libav; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;******************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +SECTION .text
> +
> +; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
> +%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
> +cglobal put_no_rnd_pixels8_x2_exact, 4,5
> +    lea          r4, [r2*3]
> +    pcmpeqb      m6, m6
> +.loop:
> +    mova         m0, [r1]
> +    mova         m2, [r1+r2]
> +    mova         m1, [r1+1]
> +    mova         m3, [r1+r2+1]
> +    pxor         m0, m6
> +    pxor         m2, m6
> +    pxor         m1, m6
> +    pxor         m3, m6
> +    PAVGB        m0, m1
> +    PAVGB        m2, m3
> +    pxor         m0, m6
> +    pxor         m2, m6
> +    mova       [r0], m0
> +    mova    [r0+r2], m2
> +    mova         m0, [r1+r2*2]
> +    mova         m1, [r1+r2*2+1]
> +    mova         m2, [r1+r4]
> +    mova         m3, [r1+r4+1]
> +    pxor         m0, m6
> +    pxor         m1, m6
> +    pxor         m2, m6
> +    pxor         m3, m6
> +    PAVGB        m0, m1
> +    PAVGB        m2, m3
> +    pxor         m0, m6
> +    pxor         m2, m6
> +    mova  [r0+r2*2], m0
> +    mova    [r0+r4], m2
> +    lea          r1, [r1+r2*4]
> +    lea          r0, [r0+r2*4]
> +    sub         r3d, 4
> +    jg .loop
> +    REP_RET
> +%endmacro
> +
> +INIT_MMX mmxext
> +PUT_NO_RND_PIXELS8_X2_EXACT
> +INIT_MMX 3dnow
> +PUT_NO_RND_PIXELS8_X2_EXACT
> +
> +
> +; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
> +%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
> +cglobal put_no_rnd_pixels8_y2_exact, 4,5
> +    lea          r4, [r2*3]
> +    mova         m0, [r1]
> +    pcmpeqb      m6, m6
> +    add          r1, r2
> +    pxor         m0, m6
> +.loop:
> +    mova         m1, [r1]
> +    mova         m2, [r1+r2]
> +    pxor         m1, m6
> +    pxor         m2, m6
> +    PAVGB        m0, m1
> +    PAVGB        m1, m2
> +    pxor         m0, m6
> +    pxor         m1, m6
> +    mova       [r0], m0
> +    mova    [r0+r2], m1
> +    mova         m1, [r1+r2*2]
> +    mova         m0, [r1+r4]
> +    pxor         m1, m6
> +    pxor         m0, m6
> +    PAVGB        m2, m1
> +    PAVGB        m1, m0
> +    pxor         m2, m6
> +    pxor         m1, m6
> +    mova  [r0+r2*2], m2
> +    mova    [r0+r4], m1
> +    lea          r1, [r1+r2*4]
> +    lea          r0, [r0+r2*4]
> +    sub         r3d, 4
> +    jg .loop
> +    REP_RET
> +%endmacro
> +
> +INIT_MMX mmxext
> +PUT_NO_RND_PIXELS8_Y2_EXACT
> +INIT_MMX 3dnow
> +PUT_NO_RND_PIXELS8_Y2_EXACT
> diff --git a/libavcodec/x86/hpeldsp_vp3_init.c b/libavcodec/x86/hpeldsp_vp3_init.c
> new file mode 100644
> index 0000000..06a9d67
> --- /dev/null
> +++ b/libavcodec/x86/hpeldsp_vp3_init.c
> @@ -0,0 +1,54 @@
> +/*
> + * This file is part of Libav.
> + *
> + * Libav is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * Libav is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/x86/cpu.h"
> +#include "libavcodec/avcodec.h"
> +#include "libavcodec/hpeldsp.h"
> +#include "hpeldsp.h"
> +
> +void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
> +                                           const uint8_t *pixels,
> +                                           ptrdiff_t line_size, int h);
> +void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
> +                                          const uint8_t *pixels,
> +                                          ptrdiff_t line_size, int h);
> +void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
> +                                           const uint8_t *pixels,
> +                                           ptrdiff_t line_size, int h);
> +void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
> +                                          const uint8_t *pixels,
> +                                          ptrdiff_t line_size, int h);
> +
> +av_cold void ff_hpeldsp_vp3_init_x86(HpelDSPContext *c, int cpu_flags, int 
> flags)
> +{
> +    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
> +        if (flags & AV_CODEC_FLAG_BITEXACT) {
> +            c->put_no_rnd_pixels_tab[1][1] = 
> ff_put_no_rnd_pixels8_x2_exact_3dnow;
> +            c->put_no_rnd_pixels_tab[1][2] = 
> ff_put_no_rnd_pixels8_y2_exact_3dnow;
> +        }
> +    }
> +
> +    if (EXTERNAL_MMXEXT(cpu_flags)) {
> +        if (flags & AV_CODEC_FLAG_BITEXACT) {
> +            c->put_no_rnd_pixels_tab[1][1] = 
> ff_put_no_rnd_pixels8_x2_exact_mmxext;
> +            c->put_no_rnd_pixels_tab[1][2] = 
> ff_put_no_rnd_pixels8_y2_exact_mmxext;
> +        }
> +    }
> +}
> --

IMHO, either split the VP3-specific functions off entirely into their own
DSP context, or keep them grouped with the other hpeldsp functions.

- Hendrik
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to