Re: [libav-devel] [PATCH 03/15] lavr: x86: optimized 2-channel s16p to flt conversion

2012-07-28 Thread Ronald S. Bultje
Hi,

On Sat, Jul 28, 2012 at 4:57 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   49 
 
  libavresample/x86/audio_convert_init.c |9 ++
  2 files changed, 58 insertions(+), 0 deletions(-)

OK.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/3] avconv: expand AVRational literals.

2012-07-28 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This way, the code looks less like spaghetti, and is easier to parse
for external preprocessors.
---
 avconv.c |   44 
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/avconv.c b/avconv.c
index 439672a..fcf2b69 100644
--- a/avconv.c
+++ b/avconv.c
@@ -797,14 +797,16 @@ static int configure_input_video_filter(FilterGraph *fg, 
InputFilter *ifilter,
 AVFilterContext *first_filter = in-filter_ctx;
 AVFilter *filter = avfilter_get_by_name(buffer);
 InputStream *ist = ifilter-ist;
-AVRational tb = ist-framerate.num ? (AVRational){ist-framerate.den,
-  ist-framerate.num} :
- ist-st-time_base;
+AVRational tb = ist-st-time_base;
 AVRational sar;
 char args[255], name[255];
 int pad_idx = in-pad_idx;
 int ret;
 
+if (ist-framerate.num) {
+tb.num = ist-framerate.den;
+tb.den = ist-framerate.num;
+}
 sar = ist-st-sample_aspect_ratio.num ?
   ist-st-sample_aspect_ratio :
   ist-st-codec-sample_aspect_ratio;
@@ -2196,11 +2198,10 @@ static int output_packet(InputStream *ist, const 
AVPacket *pkt)
 ret = decode_video(ist, avpkt, got_output);
 if (avpkt.duration)
 ist-next_dts += av_rescale_q(avpkt.duration, 
ist-st-time_base, AV_TIME_BASE_Q);
-else if (ist-st-r_frame_rate.num)
-ist-next_dts += av_rescale_q(1, 
(AVRational){ist-st-r_frame_rate.den,
-  
ist-st-r_frame_rate.num},
-  AV_TIME_BASE_Q);
-else if (ist-st-codec-time_base.num != 0) {
+else if (ist-st-r_frame_rate.num) {
+AVRational ifps = { ist-st-r_frame_rate.den, 
ist-st-r_frame_rate.num };
+ist-next_dts += av_rescale_q(1, ifps, AV_TIME_BASE_Q);
+} else if (ist-st-codec-time_base.num != 0) {
 int ticks  = ist-st-parser ? 
ist-st-parser-repeat_pict + 1 :

ist-st-codec-ticks_per_frame;
 ist-next_dts += av_rescale_q(ticks, 
ist-st-codec-time_base, AV_TIME_BASE_Q);
@@ -2479,11 +2480,14 @@ static int transcode_init(void)
 codec-height = icodec-height;
 codec-has_b_frames   = icodec-has_b_frames;
 if (!codec-sample_aspect_ratio.num) {
-codec-sample_aspect_ratio   =
-ost-st-sample_aspect_ratio =
-ist-st-sample_aspect_ratio.num ? 
ist-st-sample_aspect_ratio :
-ist-st-codec-sample_aspect_ratio.num ?
-ist-st-codec-sample_aspect_ratio : (AVRational){0, 
1};
+if (ist-st-sample_aspect_ratio.num) {
+codec-sample_aspect_ratio = 
ist-st-sample_aspect_ratio;
+} else if (ist-st-codec-sample_aspect_ratio.num) {
+codec-sample_aspect_ratio = 
ist-st-codec-sample_aspect_ratio;
+} else {
+codec-sample_aspect_ratio = (AVRational) { 0, 1 };
+}
+ost-st-sample_aspect_ratio = codec-sample_aspect_ratio;
 }
 break;
 case AVMEDIA_TYPE_SUBTITLE:
@@ -2526,7 +2530,11 @@ static int transcode_init(void)
 (video_sync_method ==  VSYNC_CFR ||
  (video_sync_method ==  VSYNC_AUTO 
   !(oc-oformat-flags  (AVFMT_NOTIMESTAMPS | 
AVFMT_VARIABLE_FPS) {
-ost-frame_rate = ist-st-r_frame_rate.num ? 
ist-st-r_frame_rate : (AVRational){25, 1};
+if (ist-st-r_frame_rate.num) {
+ost-frame_rate = ist-st-r_frame_rate;
+} else {
+ost-frame_rate = (AVRational) { 25, 1 };
+}
 if (ost-enc  ost-enc-supported_framerates  
!ost-force_fps) {
 int idx = av_find_nearest_q_idx(ost-frame_rate, 
ost-enc-supported_framerates);
 ost-frame_rate = ost-enc-supported_framerates[idx];
@@ -4095,9 +4103,13 @@ static int copy_chapters(InputFile *ifile, OutputFile 
*ofile, int copy_metadata)
 AVChapter *in_ch = is-chapters[i], *out_ch;
 int64_t ts_off   = av_rescale_q(ofile-start_time - ifile-ts_offset,
AV_TIME_BASE_Q, in_ch-time_base);
-int64_t rt   = (ofile-recording_time == INT64_MAX) ? INT64_MAX :
-   av_rescale_q(ofile-recording_time, AV_TIME_BASE_Q, 
in_ch-time_base);
+int64_t rt;
 
+if (ofile-recording_time == INT64_MAX) {
+rt = INT64_MAX;
+} else {
+rt = av_rescale_q(ofile-recording_time, AV_TIME_BASE_Q

[libav-devel] [PATCH 2/3] lavf/utils.c: expand AVRational literals.

2012-07-28 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This way, the code looks less like spaghetti, and is easier to parse
for external preprocessors.
---
 libavformat/utils.c |9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/libavformat/utils.c b/libavformat/utils.c
index 4ec70b7..a78b9e7 100644
--- a/libavformat/utils.c
+++ b/libavformat/utils.c
@@ -2205,8 +2205,13 @@ static void compute_chapters_end(AVFormatContext *s)
 for (i = 0; i  s-nb_chapters; i++)
 if (s-chapters[i]-end == AV_NOPTS_VALUE) {
 AVChapter *ch = s-chapters[i];
-int64_t   end = max_time ? av_rescale_q(max_time, AV_TIME_BASE_Q, 
ch-time_base)
- : INT64_MAX;
+int64_t end;
+
+if (max_time) {
+end = av_rescale_q(max_time, AV_TIME_BASE_Q, ch-time_base);
+} else {
+end = INT64_MAX;
+}
 
 for (j = 0; j  s-nb_chapters; j++) {
 AVChapter *ch1 = s-chapters[j];
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 3/3] lavfi: expand AVRational literals.

2012-07-28 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This way, the code looks less like spaghetti, and is easier to parse
for external preprocessors.
---
 libavfilter/avfilter.c |   10 +++---
 libavfilter/vsrc_testsrc.c |7 +--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
index d302264..0df26d4 100644
--- a/libavfilter/avfilter.c
+++ b/libavfilter/avfilter.c
@@ -170,9 +170,13 @@ int avfilter_config_links(AVFilterContext *filter)
 return ret;
 }
 
-if (link-time_base.num == 0  link-time_base.den == 0)
-link-time_base = link-src  link-src-nb_inputs ?
-link-src-inputs[0]-time_base : AV_TIME_BASE_Q;
+if (link-time_base.num == 0  link-time_base.den == 0) {
+if (link-src  link-src-nb_inputs) {
+link-time_base = link-src-inputs[0]-time_base;
+} else {
+link-time_base = AV_TIME_BASE_Q;
+}
+}
 
 if (link-type == AVMEDIA_TYPE_VIDEO) {
 if (!link-sample_aspect_ratio.num  
!link-sample_aspect_ratio.den)
diff --git a/libavfilter/vsrc_testsrc.c b/libavfilter/vsrc_testsrc.c
index 42cd58e..12d4985 100644
--- a/libavfilter/vsrc_testsrc.c
+++ b/libavfilter/vsrc_testsrc.c
@@ -102,8 +102,11 @@ static av_cold int init_common(AVFilterContext *ctx, const 
char *args)
 
 test-time_base.num = frame_rate_q.den;
 test-time_base.den = frame_rate_q.num;
-test-max_pts = duration = 0 ?
-av_rescale_q(duration, AV_TIME_BASE_Q, test-time_base) : -1;
+if (duration = 0) {
+test-max_pts = av_rescale_q(duration, AV_TIME_BASE_Q, 
test-time_base);
+} else {
+test-max_pts = -1;
+}
 test-nb_frame = 0;
 test-pts = 0;
 
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] h264_ps: declare array of colorspace strings on its own line.

2012-07-28 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/h264_ps.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c
index 3f53af8..7d9d596 100644
--- a/libavcodec/h264_ps.c
+++ b/libavcodec/h264_ps.c
@@ -431,6 +431,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
 sps-sar.den= 1;
 
 if(s-avctx-debugFF_DEBUG_PICT_INFO){
+static const char csp[4][5] = { Gray, 420, 422, 444 };
 av_log(h-s.avctx, AV_LOG_DEBUG, sps:%u profile:%d/%d poc:%d ref:%d 
%dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n,
sps_id, sps-profile_idc, sps-level_idc,
sps-poc_type,
@@ -441,7 +442,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){
sps-crop_left, sps-crop_right,
sps-crop_top, sps-crop_bottom,
sps-vui_parameters_present_flag ? VUI : ,
-   ((const 
char*[]){Gray,420,422,444})[sps-chroma_format_idc],
+   csp[sps-chroma_format_idc],
sps-timing_info_present_flag ? sps-num_units_in_tick : 0,
sps-timing_info_present_flag ? sps-time_scale : 0
);
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] vp3: don't compile mmx IDCT functions on x86-64.

2012-07-27 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 11:40 PM, Luca Barbato lu_z...@gentoo.org wrote:
 On 07/27/2012 07:16 AM, Ronald S. Bultje wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 64-bit CPUs always have SSE2, and a SSE2 version exists, thus the MMX
 version will never be used.
 ---
  libavcodec/x86/vp3dsp.asm|3 +++
  libavcodec/x86/vp3dsp_init.c |2 ++
  2 files changed, 5 insertions(+)


 Fine for me as well, somebody might want to run those by selecting the
 cpu flags directly.

Unlikely, that's only for testing. Since (see commit msg) this never
happens in reality, testing for it seems kind of pointless.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/3] dsputil: x86: convert PMINSD, PMAXSD, and CLIPD macros to use cpuflags

2012-07-27 Thread Ronald S. Bultje
Hi,

On Fri, Jul 27, 2012 at 8:54 AM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 On 07/21/2012 05:39 PM, Justin Ruggles wrote:
 ---
 Updated patch to allow float vs. dword min/max as a parameter to CLIPD
 instead of using 2 separate macros.

  libavcodec/x86/dsputil_mmx.c|6 ++--
  libavcodec/x86/dsputil_yasm.asm |   66 
 +++
  libavutil/x86/x86util.asm   |   34 
  3 files changed, 56 insertions(+), 50 deletions(-)

 ping.

OK.

Can you replace swscale.asm in sws around line 375 with this also?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/3] dsputil: x86: convert some of the SPLATD macros to use cpuflags

2012-07-27 Thread Ronald S. Bultje
Hi,

On Sat, Jul 21, 2012 at 2:39 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
  %macro VECTOR_CLIP_INT32 2
  cglobal vector_clip_int32, 5,5,11, dst, src, min, max, len
 +SPLATD_LOW m4, minm
 +SPLATD_LOW m5, maxm
  %if notcpuflag(sse4)  cpuflag(sse2)  notcpuflag(atom)
 -cvtsi2ss   m4, minm
 -cvtsi2ss   m5, maxm
 +cvtdq2ps   m4, m4
 +cvtdq2ps   m5, m5
  %assign is_float 1
  %else

Doesn't this add an instruction?

There's only one user left of SPLATD (sws), isn't it easier to rewrite
that to use this also, and remove SPLATD and rename this to SPLATD
again?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86/dsputilenc: bury inline asm under HAVE_INLINE_ASM.

2012-07-27 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 6:42 AM, Måns Rullgård m...@mansr.com wrote:
 Ronald S. Bultje rsbul...@gmail.com writes:

 Hi,

 On Thu, Jul 26, 2012 at 2:23 AM, Måns Rullgård m...@mansr.com wrote:
 Ronald S. Bultje rsbul...@gmail.com writes:

 From: Ronald S. Bultje rsbul...@gmail.com

 ---
  libavcodec/dct-test.c   |2 +-
  libavcodec/x86/dsputilenc_mmx.c |   80 
 +++
  libavcodec/x86/fdct_mmx.c   |4 ++
  libavcodec/x86/motion_est_mmx.c |6 +++
  libavcodec/x86/mpegvideo_mmx.c  |6 +++
  5 files changed, 64 insertions(+), 34 deletions(-)

 diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
 index 5046544..9e19e0c 100644
 --- a/libavcodec/dct-test.c
 +++ b/libavcodec/dct-test.c
 @@ -85,7 +85,7 @@ static const struct algo fdct_tab[] = {
  { IJG-AAN-INT,ff_fdct_ifast, SCALE_PERM },
  { IJG-LLM-INT,ff_jpeg_fdct_islow_8,  NO_PERM},

 -#if HAVE_MMX
 +#if HAVE_MMX  HAVE_INLINE_ASM
  { MMX,ff_fdct_mmx,   NO_PERM,   AV_CPU_FLAG_MMX 
 },

 This is just as wrong now as it was the first time.

 Why?

 Same reason.

 What do you suggest instead?

 It's probably quicker if I just show you.

I'm still not seeing it.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.

2012-07-27 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 9:46 AM, Ronald S. Bultje rsbul...@gmail.com wrote:
 On Thu, Jul 26, 2012 at 9:05 AM, Måns Rullgård m...@mansr.com wrote:
 Ronald S. Bultje rsbul...@gmail.com writes:
 On Thu, Jul 26, 2012 at 7:30 AM, Martin Storsjö mar...@martin.st wrote:
 On Thu, 26 Jul 2012, Ronald S. Bultje wrote:
 On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote:
 On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote:
 On 07/26/2012 04:27 AM, Ronald S. Bultje wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 ---
  libswscale/swscale.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


 Ok.


 No, not OK.  This is just a repackaged piece of another patch that
 has review questions that were never answered.  Until those questions
 are settled, this cannot go in.


 I've looked at all emails in:
 http://comments.gmane.org/gmane.comp.video.libav.devel/28861

 including yours:
 http://permalink.gmane.org/gmane.comp.video.libav.devel/28871

 and Mans':
 http://permalink.gmane.org/gmane.comp.video.libav.devel/28863

 My original mail has the fence part in it (simply ctrl-F in your
 browser), and neither you nor Mans respond to that particular section.
 So I'm lost now. What is the specific comment you want me to respond
 to?


 http://article.gmane.org/gmane.comp.video.libav.devel/30834

 If someone feels like rewriting swscale, I'm all supportive of that
 effort. For now, sws uses movntq in its inline assembly mmx/3dnow
 optimizations and we'll have to deal with it until someone changes it
 not to do that.

 Doing it in generic code is silly because in practice there is never
 any advantage to doing movntq. Thus, we should discourage its use.
 Adding generic versions of sfence does not contribute to that. The
 whole goal - back when I worked on sws - was to kill all these old
 mmx/3dnow optimizations and replace with modern sse2/avx, which would
 mean we don't need a call to sfence anymore anyways.

 I'm still missing an explanation of why sfence is needed here other than
 movntq somehow being involved.

 My understanding is that if you use movntq and not sfence, the data
 may not be in the destination memory pointer by the time swScale()
 returns.

 But I didn't write this code.

Ping.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86inc: clip num_args to 7 on x86-32.

2012-07-27 Thread Ronald S. Bultje
From: Loren Merritt lor...@u.washington.edu

This allows us to unconditionally set the cglobal num_args
parameter to a bigger value, thus making writing yasm code
even easier than before.

Signed-off-by: Ronald S. Bultje rsbul...@gmail.com
---
 libavutil/x86/x86inc.asm |3 +++
 1 file changed, 3 insertions(+)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index b76a10c..dd441b2 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -451,6 +451,9 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
 %assign num_args %1
 %assign regs_used %2
+%if num_args  7
+%assign num_args 7
+%endif
 %if regs_used  7
 %assign regs_used 7
 %endif
-- 
1.7.9.2

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.

2012-07-27 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This completes the conversion of h264dsp to yasm; note that h264 also
uses some dsputil functions, most notably qpel. Performance-wise, the
yasm-version is ~10 cycles faster (182 -> 172) on x86-64, and ~8 cycles
faster (201 -> 193) on x86-32.
---
 libavcodec/x86/h264_deblock.asm |  168 +++
 libavcodec/x86/h264dsp_mmx.c|  162 ++---
 2 files changed, 175 insertions(+), 155 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 1982dc4..77b25d2 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -27,6 +27,10 @@
 %include x86inc.asm
 %include x86util.asm
 
+SECTION_RODATA
+
+pb_3_1: times 4 db 3, 1
+
 SECTION .text
 
 cextern pb_0
@@ -911,3 +915,167 @@ ff_chroma_intra_body_mmxext:
 paddb  m1, m5
 paddb  m2, m6
 ret
+
+;-
+; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
+;int8_t ref[2][40], int16_t mv[2][40][2],
+;int bidir,int edges,int step,
+;int mask_mv0, int mask_mv1, int field);
+;
+; bidiris 0 or 1
+; edgesis 1 or 4
+; step is 1 or 2
+; mask_mv0 is 0 or 3
+; mask_mv1 is 0 or 1
+; fieldis 0 or 1
+;-
+%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
+; dir, d_idx, mask_dir, bidir
+%define edgesd%1
+%define stepd %2
+%define mask_mvd  %3
+%define dir   %4
+%define d_idx %5
+%define mask_dir  %6
+%define bidir %7
+xor  b_idxd, b_idxd ; for (b_idx = 0; b_idx  edges; b_idx += step)
+.b_idx_loop_ %+ dir %+ _ %+ bidir:
+%if mask_dir == 0
+pxor m0, m0
+%endif
+test b_idxd, dword mask_mvd
+jnz .skip_loop_iter_ %+ dir %+ _ %+ bidir ; if (!(b_idx  mask_mv))
+%if bidir == 1
+movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
+punpckldqm2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
+pshufw   m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
+pshufw   m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
+pshufw   m3, m2, 0x4E   ; { ref1[bn], ref0[bn] }
+psubbm0, m2 ; { ref0[b] != ref0[bn],
+;   ref0[b] != ref1[bn] }
+psubbm1, m3 ; { ref1[b] != ref1[bn],
+;   ref1[b] != ref0[bn] }
+
+por  m0, m1
+mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
+mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
+mova m3, m1
+mova m4, m2
+psubwm1, [mvq+b_idxq*4+12*4]
+psubwm2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm3, [mvq+b_idxq*4+52*4]
+psubwm4, [mvq+b_idxq*4+52*4+mmsize]
+packsswb m1, m2
+packsswb m3, m4
+paddbm1, m6
+paddbm3, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) = limit
+psubusb  m3, m5
+packsswb m1, m3
+
+por  m0, m1
+mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
+mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
+mova m3, m1
+mova m4, m2
+psubwm1, [mvq+b_idxq*4+12*4]
+psubwm2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm3, [mvq+b_idxq*4+52*4]
+psubwm4, [mvq+b_idxq*4+52*4+mmsize]
+packsswb m1, m2
+packsswb m3, m4
+paddbm1, m6
+paddbm3, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) = limit
+psubusb  m3, m5
+packsswb m1, m3
+
+pshufw   m1, m1, 0x4E
+por  m0, m1
+pshufw   m1, m0, 0x4E
+pminub   m0, m1
+%else ; bidir == 0
+movd m0, [refq+b_idxq+12]
+psubbm0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]
+
+mova m1, [mvq+b_idxq*4+12*4]
+mova m2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm1, [mvq+b_idxq*4+(d_idx+12)*4]
+psubwm2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
+packsswb m1, m2
+paddbm1, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) = limit
+packsswb m1, m1
+por  m0, m1
+%endif ; bidir == 1/0
+
+.skip_loop_iter_ %+ dir %+ _ %+ bidir:
+movd m1, [nnzq+b_idxq+12]
+por  m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]
+
+pminub   m1, m7
+pminub   m0, m7
+psllwm1, 1
+pxor m2, m2
+pmaxub

Re: [libav-devel] [PATCH] proresdsp: port x86 assembly to cpuflags.

2012-07-27 Thread Ronald S. Bultje
Hi,

On Fri, Jul 27, 2012 at 11:39 AM, Diego Biurrun di...@biurrun.de wrote:
 On Thu, Jul 26, 2012 at 08:38:27PM -0700, Ronald S. Bultje wrote:
 --- a/libavcodec/x86/proresdsp.asm
 +++ b/libavcodec/x86/proresdsp.asm
 @@ -406,27 +405,25 @@ cglobal prores_idct_put_10_%1, 4, 4, %2

 -INIT_XMM
 -%define SIGNEXTEND signextend_sse2
 -idct_put_fn sse2, 16
 -INIT_XMM
 -%define SIGNEXTEND signextend_sse4
 -idct_put_fn sse4, 16
 -INIT_AVX
 -idct_put_fn avx,  16
 +INIT_XMM sse2
 +idct_put_fn 16
 +INIT_XMM sse4
 +idct_put_fn 16
 +INIT_XMM avx
 +idct_put_fn 16

 What's with

   INIT_XMM avx

 vs.

   INIT_AVX

 ?

 Patch does LGTM otherwise.

See x86inc.asm, INIT_AVX is the deprecated method, INIT_XMM avx is the
correct method.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.

2012-07-27 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/h264_deblock.asm   |  126 +++--
 libavcodec/x86/h264_deblock_10bit.asm |   77 ++--
 libavcodec/x86/h264dsp_mmx.c  |   60 
 3 files changed, 141 insertions(+), 122 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 1982dc4..76a458b 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -282,8 +282,8 @@ cextern pb_A1
 ;-
 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-%macro DEBLOCK_LUMA 1
-cglobal deblock_v_luma_8_%1, 5,5,10
+%macro DEBLOCK_V_LUMA 0
+cglobal deblock_v_luma_8, 5,5,10
 movdm8, [r4] ; tc0
 lea r4, [r1*3]
 dec r2d; alpha-1
@@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10
 mova[r4+2*r1], m1
 mova[r0], m2
 RET
+%endmacro
 
 ;-
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-INIT_MMX
-cglobal deblock_h_luma_8_%1, 5,9
+%macro DEBLOCK_H_LUMA 0
+cglobal deblock_h_luma_8, 5,9
 movsxd r7,  r1d
 lear8,  [r7+r7*2]
 lear6,  [r0-4]
@@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9
 %if WIN64
 mov[rsp+0x20], r4
 %endif
-call   deblock_v_luma_8_%1
+call   deblock_v_luma_8
 
 ; transpose 16x4 - original space  (only the middle 4 rows were changed 
by the filter)
 addr6, 2
@@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9
 RET
 %endmacro
 
-INIT_XMM
-DEBLOCK_LUMA sse2
-INIT_AVX
-DEBLOCK_LUMA avx
+INIT_XMM sse2
+DEBLOCK_V_LUMA
+INIT_MMX sse2
+DEBLOCK_H_LUMA
+
+INIT_XMM avx
+DEBLOCK_V_LUMA
+INIT_MMX avx
+DEBLOCK_H_LUMA
 
 %else
 
-%macro DEBLOCK_LUMA 3
+%macro DEBLOCK_V_LUMA 2
 ;-
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-cglobal deblock_%2_luma_8_%1, 5,5
+cglobal deblock_%1_luma_8, 5,5
 lea r4, [r1*3]
 dec r2 ; alpha-1
 neg r4
 dec r3 ; beta-1
 add r4, r0 ; pix-3*stride
-%assign pad 2*%3+12-(stack_offset15)
+%assign pad 2*%2+12-(stack_offset15)
 SUB esp, pad
 
 movam0, [r4+r1]   ; p1
@@ -415,7 +421,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 movdm4, [r3] ; tc0
 punpcklbw m4, m4
 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
-mova   [esp+%3], m4 ; tc
+mova   [esp+%2], m4 ; tc
 pcmpgtb m4, m3
 movam3, [r4] ; p2
 pandm4, m7
@@ -423,7 +429,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 
 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0|  beta-1
 pandm6, m4
-pandm4, [esp+%3] ; tc
+pandm4, [esp+%2] ; tc
 psubb   m7, m4, m6
 pandm6, m4
 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
@@ -431,7 +437,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 movam4, [r0+2*r1] ; q2
 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0|  beta-1
 pandm6, [esp] ; mask
-movam5, [esp+%3] ; tc
+movam5, [esp+%2] ; tc
 psubb   m7, m6
 pandm5, m6
 movam3, [r0+r1]
@@ -442,12 +448,13 @@ cglobal deblock_%2_luma_8_%1, 5,5
 mova[r0], m2
 ADD esp, pad
 RET
+%endmacro
 
 ;-
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-INIT_MMX
-cglobal deblock_h_luma_8_%1, 0,5
+%macro DEBLOCK_H_LUMA 1
+cglobal deblock_h_luma_8, 0,5
 movr0, r0mp
 movr3, r1m
 lear4, [r3*3]
@@ -470,11 +477,11 @@ cglobal deblock_h_luma_8_%1, 0,5
 PUSH   dword r2m
 PUSH   dword 16
 PUSH   dword r0
-call   deblock_%2_luma_8_%1
-%ifidn %2, v8
+call   deblock_%1_luma_8
+%ifidn %1, v8
 adddword [esp   ], 8 ; pix_tmp+0x38
 adddword [esp+16], 2 ; tc0+2
-call   deblock_%2_luma_8_%1
+call   deblock_%1_luma_8
 %endif
 ADDesp, 20
 
@@ -501,12 +508,17 @@ cglobal deblock_h_luma_8_%1, 0,5
 RET
 %endmacro ; DEBLOCK_LUMA
 
-INIT_MMX
-DEBLOCK_LUMA mmxext, v8, 8
-INIT_XMM
-DEBLOCK_LUMA sse2, v, 16
-INIT_AVX
-DEBLOCK_LUMA avx, v, 16
+INIT_MMX mmx2
+DEBLOCK_V_LUMA v8, 8
+DEBLOCK_H_LUMA v8
+INIT_XMM sse2
+DEBLOCK_V_LUMA v, 16
+INIT_MMX sse2
+DEBLOCK_H_LUMA v
+INIT_XMM avx
+DEBLOCK_V_LUMA v, 16
+INIT_MMX avx
+DEBLOCK_H_LUMA v
 
 %endif ; ARCH
 
@@ -608,7 +620,7 @@ DEBLOCK_LUMA avx, v, 16
 %define mask1p mask1q
 %endmacro

[libav-devel] [PATCH] vp3: port x86 SIMD to cpuflags.

2012-07-27 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/vp3dsp.asm |   94 ++---
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index af2f60c..5877520 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -102,8 +102,8 @@ SECTION .text
 mov  [r0+r3  -1], r2w
 %endmacro
 
-INIT_MMX
-cglobal vp3_v_loop_filter_mmx2, 3, 4
+INIT_MMX mmx2
+cglobal vp3_v_loop_filter, 3, 4
 %if ARCH_X86_64
 movsxdr1, r1d
 %endif
@@ -120,7 +120,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4
 movq [r0   ], m3
 RET
 
-cglobal vp3_h_loop_filter_mmx2, 3, 4
+cglobal vp3_h_loop_filter, 3, 4
 %if ARCH_X86_64
 movsxdr1, r1d
 %endif
@@ -354,38 +354,6 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
 movqI(2), m2
 %endmacro
 
-%macro VP3_IDCT_mmx 1
-; eax = quantized input
-; ebx = dequantizer matrix
-; ecx = IDCT constants
-;  M(I) = ecx + MaskOffset(0) + I * 8
-;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
-; edx = output
-; r0..r7 = mm0..mm7
-%define OC_8 [pw_8]
-%define C(x) [vp3_idct_data+16*(x-1)]
-
-; at this point, function has completed dequantization + dezigzag +
-; partial transposition; now do the idct itself
-%define I(x) [%1+16* x ]
-%define J(x) [%1+16*(x-4)+8]
-RowIDCT
-Transpose
-
-%define I(x) [%1+16* x   +64]
-%define J(x) [%1+16*(x-4)+72]
-RowIDCT
-Transpose
-
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
-ColumnIDCT
-
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
-ColumnIDCT
-%endmacro
-
 %macro VP3_1D_IDCT_SSE2 0
 movdqam2, I(3)  ; xmm2 = i3
 movdqam6, C(3)  ; xmm6 = c3
@@ -501,7 +469,8 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
 movdqa  O(7), m%8
 %endmacro
 
-%macro VP3_IDCT_sse2 1
+%macro VP3_IDCT 1
+%if mmsize == 16
 %define I(x) [%1+16*x]
 %define O(x) [%1+16*x]
 %define C(x) [vp3_idct_data+16*(x-1)]
@@ -519,11 +488,42 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
 %define ADD(x)   paddsw x, [pw_8]
 VP3_1D_IDCT_SSE2
 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
+%else ; mmsize == 8
+; eax = quantized input
+; ebx = dequantizer matrix
+; ecx = IDCT constants
+;  M(I) = ecx + MaskOffset(0) + I * 8
+;  C(I) = ecx + CosineOffset(32) + (I-1) * 8
+; edx = output
+; r0..r7 = mm0..mm7
+%define OC_8 [pw_8]
+%define C(x) [vp3_idct_data+16*(x-1)]
+
+; at this point, function has completed dequantization + dezigzag +
+; partial transposition; now do the idct itself
+%define I(x) [%1+16* x ]
+%define J(x) [%1+16*(x-4)+8]
+RowIDCT
+Transpose
+
+%define I(x) [%1+16* x   +64]
+%define J(x) [%1+16*(x-4)+72]
+RowIDCT
+Transpose
+
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
+ColumnIDCT
+
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
+ColumnIDCT
+%endif ; mmsize == 16/8
 %endmacro
 
-%macro vp3_idct_funcs 1
-cglobal vp3_idct_put_%1, 3, 4, 9
-VP3_IDCT_%1   r2
+%macro vp3_idct_funcs 0
+cglobal vp3_idct_put, 3, 4, 9
+VP3_IDCT  r2
 
 movsxdifnidn  r1, r1d
 mova  m4, [pb_80]
@@ -565,8 +565,8 @@ cglobal vp3_idct_put_%1, 3, 4, 9
 %endrep
 RET
 
-cglobal vp3_idct_add_%1, 3, 4, 9
-VP3_IDCT_%1   r2
+cglobal vp3_idct_add, 3, 4, 9
+VP3_IDCT  r2
 
 mov   r3, 4
 pxor  m4, m4
@@ -607,10 +607,10 @@ cglobal vp3_idct_add_%1, 3, 4, 9
 RET
 %endmacro
 
-INIT_MMX
-vp3_idct_funcs mmx
-INIT_XMM
-vp3_idct_funcs sse2
+INIT_MMX mmx
+vp3_idct_funcs
+INIT_XMM sse2
+vp3_idct_funcs
 
 %macro DC_ADD 0
 movq  m2, [r0 ]
@@ -631,8 +631,8 @@ vp3_idct_funcs sse2
 movq   [r0+r3  ], m5
 %endmacro
 
-INIT_MMX
-cglobal vp3_idct_dc_add_mmx2, 3, 4
+INIT_MMX mmx2
+cglobal vp3_idct_dc_add, 3, 4
 %if ARCH_X86_64
 movsxdr1, r1d
 %endif
-- 
1.7.9.2

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264_idct_10bit: port x86 assembly to cpuflags.

2012-07-27 Thread Ronald S. Bultje
Hi,

On Fri, Jul 27, 2012 at 2:49 PM, Diego Biurrun di...@biurrun.de wrote:
 On Thu, Jul 26, 2012 at 08:54:30PM -0700, Ronald S. Bultje wrote:
 --- a/libavcodec/x86/h264_idct_10bit.asm
 +++ b/libavcodec/x86/h264_idct_10bit.asm
 @@ -72,25 +72,25 @@ SECTION .text
  ;;; NO FATE SAMPLES TRIGGER THIS
 -%macro ADD4x4IDCT 1
 -add4x4_idct_%1:
 +%macro ADD4x4IDCT 0
 +add4x4_idct_ %+ SUFFIX:
  add   r5, r0
 @@ -107,28 +107,28 @@ add4x4_idct_%1:

  %macro ADD16_OP 3
  cmp  byte [r4+%3], 0
  jz .skipblock%2
  mov r5d, [r1+%2*4]
 -call add4x4_idct_%1
 +call add4x4_idct_ %+ SUFFIX

 You don't need this SUFFIX mangling, same below.

We're not using cglobal for the called labels (they're not functions),
so function_defined is not set, so call suffix completion doesn't
work. Thus, yes, we need the SUFFIX (without _, fixed locally) or else
it'll simply not compile.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.

2012-07-27 Thread Ronald S. Bultje
Hi,

On Fri, Jul 27, 2012 at 4:45 PM, Diego Biurrun di...@biurrun.de wrote:
 On Fri, Jul 27, 2012 at 03:08:26PM -0700, Ronald S. Bultje wrote:

 --- a/libavcodec/x86/h264_deblock.asm
 +++ b/libavcodec/x86/h264_deblock.asm
 @@ -282,8 +282,8 @@ cextern pb_A1
  
 ;-
  ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, 
 int8_t *tc0 )
  
 ;-
 -%macro DEBLOCK_LUMA 1
 -cglobal deblock_v_luma_8_%1, 5,5,10
 +%macro DEBLOCK_V_LUMA 0
 +cglobal deblock_v_luma_8, 5,5,10
  movdm8, [r4] ; tc0
  lea r4, [r1*3]
  dec r2d; alpha-1
 @@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10
  mova[r4+2*r1], m1
  mova[r0], m2
  RET
 +%endmacro

  
 ;-
  ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, 
 int8_t *tc0 )
  
 ;-
 -INIT_MMX
 -cglobal deblock_h_luma_8_%1, 5,9
 +%macro DEBLOCK_H_LUMA 0
 +cglobal deblock_h_luma_8, 5,9
  movsxd r7,  r1d
  lear8,  [r7+r7*2]
  lear6,  [r0-4]
 @@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9
  %if WIN64
  mov[rsp+0x20], r4
  %endif
 -call   deblock_v_luma_8_%1
 +call   deblock_v_luma_8

  ; transpose 16x4 - original space  (only the middle 4 rows were 
 changed by the filter)
  addr6, 2
 @@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9
  RET
  %endmacro

 -INIT_XMM
 -DEBLOCK_LUMA sse2
 -INIT_AVX
 -DEBLOCK_LUMA avx
 +INIT_XMM sse2
 +DEBLOCK_V_LUMA
 +INIT_MMX sse2
 +DEBLOCK_H_LUMA
 +
 +INIT_XMM avx
 +DEBLOCK_V_LUMA
 +INIT_MMX avx
 +DEBLOCK_H_LUMA

 I would suggest that you move the DEBLOCK_V_LUMA macro invocations
 directly below that macro.  This is what we do everywhere.  Not
 seeing the invocations directly below the definition is confusing.
 Same below for the parameterized variants of the macros.

That actually has code cache implications.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.

2012-07-27 Thread Ronald S. Bultje
Hi,

On Fri, Jul 27, 2012 at 5:04 PM, Diego Biurrun di...@biurrun.de wrote:
 On Fri, Jul 27, 2012 at 04:49:18PM -0700, Ronald S. Bultje wrote:
 On Fri, Jul 27, 2012 at 4:45 PM, Diego Biurrun di...@biurrun.de wrote:
  On Fri, Jul 27, 2012 at 03:08:26PM -0700, Ronald S. Bultje wrote:
 
  --- a/libavcodec/x86/h264_deblock.asm
  +++ b/libavcodec/x86/h264_deblock.asm
  @@ -282,8 +282,8 @@ cextern pb_A1
   
  ;-
   ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, 
  int8_t *tc0 )
   
  ;-
  -%macro DEBLOCK_LUMA 1
  -cglobal deblock_v_luma_8_%1, 5,5,10
  +%macro DEBLOCK_V_LUMA 0
  +cglobal deblock_v_luma_8, 5,5,10
   movdm8, [r4] ; tc0
   lea r4, [r1*3]
   dec r2d; alpha-1
  @@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10
   mova[r4+2*r1], m1
   mova[r0], m2
   RET
  +%endmacro
 
   
  ;-
   ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, 
  int8_t *tc0 )
   
  ;-
  -INIT_MMX
  -cglobal deblock_h_luma_8_%1, 5,9
  +%macro DEBLOCK_H_LUMA 0
  +cglobal deblock_h_luma_8, 5,9
   movsxd r7,  r1d
   lear8,  [r7+r7*2]
   lear6,  [r0-4]
  @@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9
   %if WIN64
   mov[rsp+0x20], r4
   %endif
  -call   deblock_v_luma_8_%1
  +call   deblock_v_luma_8
 
   ; transpose 16x4 - original space  (only the middle 4 rows were 
  changed by the filter)
   addr6, 2
  @@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9
   RET
   %endmacro
 
  -INIT_XMM
  -DEBLOCK_LUMA sse2
  -INIT_AVX
  -DEBLOCK_LUMA avx
  +INIT_XMM sse2
  +DEBLOCK_V_LUMA
  +INIT_MMX sse2
  +DEBLOCK_H_LUMA
  +
  +INIT_XMM avx
  +DEBLOCK_V_LUMA
  +INIT_MMX avx
  +DEBLOCK_H_LUMA
 
  I would suggest that you move the DEBLOCK_V_LUMA macro invocations
  directly below that macro.  This is what we do everywhere.  Not
  seeing the invocations directly below the definition is confusing.
  Same below for the parameterized variants of the macros.

 That actually has code cache implications.

 OK, patch fine with me then.  One last question: Why did you split the
 macros into H/V variants?

I didn't see Loren's INIT_MMX cpuname suggestion. I can revert that part back.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.

2012-07-27 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/h264_deblock.asm   |  104 -
 libavcodec/x86/h264_deblock_10bit.asm |   77 
 libavcodec/x86/h264dsp_mmx.c  |   60 +--
 3 files changed, 120 insertions(+), 121 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 1982dc4..0891ef3 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -282,8 +282,8 @@ cextern pb_A1
 ;-
 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-%macro DEBLOCK_LUMA 1
-cglobal deblock_v_luma_8_%1, 5,5,10
+%macro DEBLOCK_LUMA 0
+cglobal deblock_v_luma_8, 5,5,10
 movdm8, [r4] ; tc0
 lea r4, [r1*3]
 dec r2d; alpha-1
@@ -327,8 +327,8 @@ cglobal deblock_v_luma_8_%1, 5,5,10
 ;-
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-INIT_MMX
-cglobal deblock_h_luma_8_%1, 5,9
+INIT_MMX cpuname
+cglobal deblock_h_luma_8, 5,9
 movsxd r7,  r1d
 lear8,  [r7+r7*2]
 lear6,  [r0-4]
@@ -355,7 +355,7 @@ cglobal deblock_h_luma_8_%1, 5,9
 %if WIN64
 mov[rsp+0x20], r4
 %endif
-call   deblock_v_luma_8_%1
+call   deblock_v_luma_8
 
 ; transpose 16x4 - original space  (only the middle 4 rows were changed 
by the filter)
 addr6, 2
@@ -384,24 +384,24 @@ cglobal deblock_h_luma_8_%1, 5,9
 RET
 %endmacro
 
-INIT_XMM
-DEBLOCK_LUMA sse2
-INIT_AVX
-DEBLOCK_LUMA avx
+INIT_XMM sse2
+DEBLOCK_LUMA
+INIT_XMM avx
+DEBLOCK_LUMA
 
 %else
 
-%macro DEBLOCK_LUMA 3
+%macro DEBLOCK_LUMA 2
 ;-
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-cglobal deblock_%2_luma_8_%1, 5,5
+cglobal deblock_%1_luma_8, 5,5
 lea r4, [r1*3]
 dec r2 ; alpha-1
 neg r4
 dec r3 ; beta-1
 add r4, r0 ; pix-3*stride
-%assign pad 2*%3+12-(stack_offset15)
+%assign pad 2*%2+12-(stack_offset15)
 SUB esp, pad
 
 movam0, [r4+r1]   ; p1
@@ -415,7 +415,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 movdm4, [r3] ; tc0
 punpcklbw m4, m4
 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
-mova   [esp+%3], m4 ; tc
+mova   [esp+%2], m4 ; tc
 pcmpgtb m4, m3
 movam3, [r4] ; p2
 pandm4, m7
@@ -423,7 +423,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 
 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0|  beta-1
 pandm6, m4
-pandm4, [esp+%3] ; tc
+pandm4, [esp+%2] ; tc
 psubb   m7, m4, m6
 pandm6, m4
 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
@@ -431,7 +431,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 movam4, [r0+2*r1] ; q2
 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0|  beta-1
 pandm6, [esp] ; mask
-movam5, [esp+%3] ; tc
+movam5, [esp+%2] ; tc
 psubb   m7, m6
 pandm5, m6
 movam3, [r0+r1]
@@ -446,8 +446,8 @@ cglobal deblock_%2_luma_8_%1, 5,5
 ;-
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-INIT_MMX
-cglobal deblock_h_luma_8_%1, 0,5
+INIT_MMX cpuname
+cglobal deblock_h_luma_8, 0,5
 movr0, r0mp
 movr3, r1m
 lear4, [r3*3]
@@ -470,11 +470,11 @@ cglobal deblock_h_luma_8_%1, 0,5
 PUSH   dword r2m
 PUSH   dword 16
 PUSH   dword r0
-call   deblock_%2_luma_8_%1
-%ifidn %2, v8
+call   deblock_%1_luma_8
+%ifidn %1, v8
 adddword [esp   ], 8 ; pix_tmp+0x38
 adddword [esp+16], 2 ; tc0+2
-call   deblock_%2_luma_8_%1
+call   deblock_%1_luma_8
 %endif
 ADDesp, 20
 
@@ -501,12 +501,12 @@ cglobal deblock_h_luma_8_%1, 0,5
 RET
 %endmacro ; DEBLOCK_LUMA
 
-INIT_MMX
-DEBLOCK_LUMA mmxext, v8, 8
-INIT_XMM
-DEBLOCK_LUMA sse2, v, 16
-INIT_AVX
-DEBLOCK_LUMA avx, v, 16
+INIT_MMX mmx2
+DEBLOCK_LUMA v8, 8
+INIT_XMM sse2
+DEBLOCK_LUMA v, 16
+INIT_XMM avx
+DEBLOCK_LUMA v, 16
 
 %endif ; ARCH
 
@@ -608,7 +608,7 @@ DEBLOCK_LUMA avx, v, 16
 %define mask1p mask1q
 %endmacro
 
-%macro DEBLOCK_LUMA_INTRA 2
+%macro DEBLOCK_LUMA_INTRA 1
 %define p1 m0
 %define p0 m1
 %define q0 m2
@@ -643,7 +643,7 @@ DEBLOCK_LUMA avx, v, 16
 ;-
 ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int

Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.

2012-07-27 Thread Ronald S. Bultje
Hi,

On Fri, Jul 27, 2012 at 2:43 PM, Måns Rullgård m...@mansr.com wrote:
 However, the question still remains why it is in generic code.

That's hard to say in hindsight, but it seems it was for simplicity so
that you don't have to add it to each individual mmx function, thus
making the assumption they would all use movntq. See also (directly
under the #endif, just outside the context in this patch) its use of
EMMS, even if it only called SSE2 functions and thus the MMX state was
never clobbered...

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] msvc: check for snprintf

2012-07-26 Thread Ronald S. Bultje
Hi,

On Wed, Jul 25, 2012 at 10:32 PM, Luca Barbato lu_z...@gentoo.org wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 ---

 Here is my initial twist about it; ideally I'd consider moving os_support
 into libavu and including it automagically from config.h

I'm not sure why, we do similar hacks for pretty much all math
functions and a few other string-related functions in lavu already.
Why is snprintf() different?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] eval: fix printing of NaN in eval fate test.

2012-07-26 Thread Ronald S. Bultje
Hi,

On Wed, Jul 25, 2012 at 11:05 PM, Alex Converse alex.conve...@gmail.com wrote:
 On Wed, Jul 25, 2012 at 8:42 PM, Ronald S. Bultje rsbul...@gmail.com wrote:

 From: Ronald S. Bultje rsbul...@gmail.com

 This fixes make fate-eval on MSVC builds. Without this, the test outputs
 -1.#NaN instead of nan on MSVS 2010.
 ---
  libavutil/eval.c |5 +
  1 file changed, 5 insertions(+)

 diff --git a/libavutil/eval.c b/libavutil/eval.c
 index ef37ad8..6131263 100644
 --- a/libavutil/eval.c
 +++ b/libavutil/eval.c
 @@ -671,6 +671,11 @@ int main(int argc, char **argv)
  av_expr_parse_and_eval(d, *expr,
 const_names, const_values,
 NULL, NULL, NULL, NULL, NULL, 0, NULL);
 +#ifdef _MSC_VER
 +if (isnan(d))
 +printf('%s' - nan\n\n, *expr);
 +else
 +#endif
  printf('%s' - %f\n\n, *expr, d);
  }


 Funny, when I proposed this without the MSC ifdef, you were wholly against it.

 And once again I will state that a conformant libc has the freedom to
 print [-]nan(n-char-sequence). So why not just drop the ifdef?

I'll drop the ifdef.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote:
 On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote:
 On 07/26/2012 04:27 AM, Ronald S. Bultje wrote:
  From: Ronald S. Bultje rsbul...@gmail.com
 
  ---
   libswscale/swscale.c |2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)

 Ok.

 No, not OK.  This is just a repackaged piece of another patch that
 has review questions that were never answered.  Until those questions
 are settled, this cannot go in.

And that question is ... ?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86/dsputilenc: bury inline asm under HAVE_INLINE_ASM.

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 2:23 AM, Måns Rullgård m...@mansr.com wrote:
 Ronald S. Bultje rsbul...@gmail.com writes:

 From: Ronald S. Bultje rsbul...@gmail.com

 ---
  libavcodec/dct-test.c   |2 +-
  libavcodec/x86/dsputilenc_mmx.c |   80 
 +++
  libavcodec/x86/fdct_mmx.c   |4 ++
  libavcodec/x86/motion_est_mmx.c |6 +++
  libavcodec/x86/mpegvideo_mmx.c  |6 +++
  5 files changed, 64 insertions(+), 34 deletions(-)

 diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
 index 5046544..9e19e0c 100644
 --- a/libavcodec/dct-test.c
 +++ b/libavcodec/dct-test.c
 @@ -85,7 +85,7 @@ static const struct algo fdct_tab[] = {
  { IJG-AAN-INT,ff_fdct_ifast, SCALE_PERM },
  { IJG-LLM-INT,ff_jpeg_fdct_islow_8,  NO_PERM},

 -#if HAVE_MMX
 +#if HAVE_MMX  HAVE_INLINE_ASM
  { MMX,ff_fdct_mmx,   NO_PERM,   AV_CPU_FLAG_MMX   
   },

 This is just as wrong now as it was the first time.

Why?

What do you suggest instead?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote:
 On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote:
 On 07/26/2012 04:27 AM, Ronald S. Bultje wrote:
  From: Ronald S. Bultje rsbul...@gmail.com
 
  ---
   libswscale/swscale.c |2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)

 Ok.

 No, not OK.  This is just a repackaged piece of another patch that
 has review questions that were never answered.  Until those questions
 are settled, this cannot go in.

I've looked at all emails in:
http://comments.gmane.org/gmane.comp.video.libav.devel/28861

including yours:
http://permalink.gmane.org/gmane.comp.video.libav.devel/28871

and Mans':
http://permalink.gmane.org/gmane.comp.video.libav.devel/28863

My original mail has the fence part in it (simply ctrl-F in your
browser), and neither you nor Mans respond to that particular section.
So I'm lost now. What is the specific comment you want me to respond
to?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 7:30 AM, Martin Storsjö mar...@martin.st wrote:
 On Thu, 26 Jul 2012, Ronald S. Bultje wrote:

 Hi,

 On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote:

 On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote:

 On 07/26/2012 04:27 AM, Ronald S. Bultje wrote:

 From: Ronald S. Bultje rsbul...@gmail.com

 ---
  libswscale/swscale.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


 Ok.


 No, not OK.  This is just a repackaged piece of another patch that
 has review questions that were never answered.  Until those questions
 are settled, this cannot go in.


 I've looked at all emails in:
 http://comments.gmane.org/gmane.comp.video.libav.devel/28861

 including yours:
 http://permalink.gmane.org/gmane.comp.video.libav.devel/28871

 and Mans':
 http://permalink.gmane.org/gmane.comp.video.libav.devel/28863

 My original mail has the fence part in it (simply ctrl-F in your
 browser), and neither you nor Mans respond to that particular section.
 So I'm lost now. What is the specific comment you want me to respond
 to?


 http://article.gmane.org/gmane.comp.video.libav.devel/30834

If someone feels like rewriting swscale, I'm all supportive of that
effort. For now, sws uses movntq in its inline assembly mmx/3dnow
optimizations and we'll have to deal with it until someone changes it
not to do that.

Doing it in generic code is silly because in practice there is never
any advantage to doing movntq. Thus, we should discourage its use.
Adding generic versions of sfence does not contribute to that. The
whole goal - back when I worked on sws - was to kill all these old
mmx/3dnow optimizations and replace with modern sse2/avx, which would
mean we don't need a call to sfence anymore anyways.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 9:05 AM, Måns Rullgård m...@mansr.com wrote:
 Ronald S. Bultje rsbul...@gmail.com writes:
 On Thu, Jul 26, 2012 at 7:30 AM, Martin Storsjö mar...@martin.st wrote:
 On Thu, 26 Jul 2012, Ronald S. Bultje wrote:
 On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote:
 On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote:
 On 07/26/2012 04:27 AM, Ronald S. Bultje wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 ---
  libswscale/swscale.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


 Ok.


 No, not OK.  This is just a repackaged piece of another patch that
 has review questions that were never answered.  Until those questions
 are settled, this cannot go in.


 I've looked at all emails in:
 http://comments.gmane.org/gmane.comp.video.libav.devel/28861

 including yours:
 http://permalink.gmane.org/gmane.comp.video.libav.devel/28871

 and Mans':
 http://permalink.gmane.org/gmane.comp.video.libav.devel/28863

 My original mail has the fence part in it (simply ctrl-F in your
 browser), and neither you nor Mans respond to that particular section.
 So I'm lost now. What is the specific comment you want me to respond
 to?


 http://article.gmane.org/gmane.comp.video.libav.devel/30834

 If someone feels like rewriting swscale, I'm all supportive of that
 effort. For now, sws uses movntq in its inline assembly mmx/3dnow
 optimizations and we'll have to deal with it until someone changes it
 not to do that.

 Doing it in generic code is silly because in practice there is never
 any advantage to doing movntq. Thus, we should discourage its use.
 Adding generic versions of sfence does not contribute to that. The
 whole goal - back when I worked on sws - was to kill all these old
 mmx/3dnow optimizations and replace with modern sse2/avx, which would
 mean we don't need a call to sfence anymore anyways.

 I'm still missing an explanation of why sfence is needed here other than
 movntq somehow being involved.

My understanding is that if you use movntq and not sfence, the data
may not be in the destination memory pointer by the time swScale()
returns.

But I didn't write this code.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [RFC] split HAVE_MMX/MMX2/SSE/SSE2/SSSE3/AVX for inline asm vs. yasm

2012-07-26 Thread Ronald S. Bultje
Hi guys,

discussion thread. We currently use HAVE_SSSE3 and related macros to
indicate that we want to compile these and that our compiler tools are
good enough to know what to do with it. As a result, we currently use
HAVE_AVX around all avx code (yasm only - we don't have any avx inline
asm), HAVE_SSSE3 around some yasm and all inline asm code that uses
ssse3 instructions, and sometimes HAVE_SSE/2 around inline asm using
xmm regs. There is no HAVE_SSE4. HAVE_MMX2 is almost never used but
does exist. HAVE_MMX is something entirely different and is used as an
alternative form of ARCH_X86.

In addition to that, we're using inline asm checks to test whether to
enable HAVE_SSSE3 and HAVE_SSE2 (line 2850 of configure).

Can we split these macros in something for yasm vs something for
inline asm? This means e.g. that we can use ssse3 if yasm (but not
inline asm) supports it, if inline asm is lacking, etc.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [RFC] split HAVE_MMX/MMX2/SSE/SSE2/SSSE3/AVX for inline asm vs. yasm

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 2:39 PM, Diego Biurrun di...@biurrun.de wrote:
 On Thu, Jul 26, 2012 at 01:50:17PM -0700, Ronald S. Bultje wrote:

 discussion thread. We currently use HAVE_SSSE3 and related macros to
 indicate that we want to compile these and that our compiler tools are
 good enough to know what to do with it. As a result, we currently use
 HAVE_AVX around all avx code (yasm only - we don't have any avx inline
 asm), HAVE_SSSE3 around some yasm and all inline asm code that uses
 ssse3 instructions, and sometimes HAVE_SSE/2 around inline asm using
 xmm regs. There is no HAVE_SSE4. HAVE_MMX2 is almost never used but
 does exist.

 Do we need HAVE_SSE4?  It should be easy enough to add.

 HAVE_MMX is something entirely different and is used as an
 alternative form of ARCH_X86.

 No, HAVE_MMX is just that.  True, it's abused in some places where
 ARCH_X86 would be better (when invoking init functions), but that
 is an issue that needs to be addressed at some point.

 In addition to that, we're using inline asm checks to test whether to
 enable HAVE_SSSE3 and HAVE_SSE2 (line 2850 of configure).

 Can we split these macros in something for yasm vs something for
 inline asm? This means e.g. that we can use ssse3 if yasm (but not
 inline asm) supports it, if inline asm is lacking, etc.

 What is your goal?  Do you want to write something like

   #if HAVE_INLINE_SSSE3

 instead of

   #if HAVE_SSSE3  HAVE_INLINE_ASM

 ?

Right now, in practice:

HAVE_SSSE3 means we support inline ssse3
HAVE_SSE2 means we support inline sse2
HAVE_AVX means we support yasm avx but depends on HAVE_SSSE3

I wonder whether it makes sense to have a generic HAVE_SSSE3 anyway
- when would we use it, what would it mean? I think in practice, we
probably want a HAVE_INLINE_SSSE3, as you said, because yes, there's
compilers that don't support this, but do support HAVE_INLINE_ASM in
general. Likewise, HAVE_AVX could be renamed HAVE_YASM_AVX or so.
Having HAVE_YASM_SSSE3 seems pointless, I don't think we support any
yasm/nasm version that doesn't understand ssse3, so it'd always be 1.
However, this would make it clear that HAVE_SSSE3 and HAVE_AVX don't
and shouldn't depend on each other.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 3/7] vf_hqdn3d: simplify and optimize

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 3:51 PM, Loren Merritt lor...@u.washington.edu wrote:
 14% faster on penryn, 2% on sandybridge, 9% on bulldozer
 ---
  libavfilter/vf_hqdn3d.c |  157 +++---
  1 files changed, 51 insertions(+), 106 deletions(-)

Looks good.

I am going to ask a very stupid question: why is this faster? I see a
lot of simplification, which is good, but I'm not quite sure which
part actually has a clear speed impact.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 4/7] vf_hqdn3d: reduce intermediate precision

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 3:51 PM, Loren Merritt lor...@u.washington.edu wrote:
 11% faster on penryn, 7% on sandybridge, 5% on bulldozer
 Negligible change to output.
 ---
  libavfilter/vf_hqdn3d.c |   62 --
  1 files changed, 32 insertions(+), 30 deletions(-)

Looks good.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 5/7] vf_hqdn3d: support 10bit colordepth

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 3:51 PM, Loren Merritt lor...@u.washington.edu wrote:
 ---
  libavfilter/vf_hqdn3d.c |   68 +-
  1 files changed, 49 insertions(+), 19 deletions(-)

Can you add 9bpp support also? Not that it's used much, but it'll use
the exact same codepath, I think.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [RFC] split HAVE_MMX/MMX2/SSE/SSE2/SSSE3/AVX for inline asm vs. yasm

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 3:54 PM, Diego Biurrun di...@biurrun.de wrote:
 On Thu, Jul 26, 2012 at 03:42:24PM -0700, Ronald S. Bultje wrote:
 On Thu, Jul 26, 2012 at 2:39 PM, Diego Biurrun di...@biurrun.de wrote:
  On Thu, Jul 26, 2012 at 01:50:17PM -0700, Ronald S. Bultje wrote:
 
  discussion thread. We currently use HAVE_SSSE3 and related macros to
  indicate that we want to compile these and that our compiler tools are
  good enough to know what to do with it. As a result, we currently use
  HAVE_AVX around all avx code (yasm only - we don't have any avx inline
  asm), HAVE_SSSE3 around some yasm and all inline asm code that uses
  ssse3 instructions, and sometimes HAVE_SSE/2 around inline asm using
  xmm regs. There is no HAVE_SSE4. HAVE_MMX2 is almost never used but
  does exist.
 
  Do we need HAVE_SSE4?  It should be easy enough to add.
 
  HAVE_MMX is something entirely different and is used as an
  alternative form of ARCH_X86.
 
  No, HAVE_MMX is just that.  True, it's abused in some places where
  ARCH_X86 would be better (when invoking init functions), but that
  is an issue that needs to be addressed at some point.
 
  In addition to that, we're using inline asm checks to test whether to
  enable HAVE_SSSE3 and HAVE_SSE2 (line 2850 of configure).
 
  Can we split these macros in something for yasm vs something for
  inline asm? This means e.g. that we can use ssse3 if yasm (but not
  inline asm) supports it, if inline asm is lacking, etc.
 
  What is your goal?  Do you want to write something like
 
#if HAVE_INLINE_SSSE3
 
  instead of
 
#if HAVE_SSSE3  HAVE_INLINE_ASM
 
  ?

 Right now, in practice:

 HAVE_SSSE3 means we support inline ssse3
 HAVE_SSE2 means we support inline sse2
 HAVE_AVX means we support yasm avx but depends on HAVE_SSSE3

 I wonder whether it makes sense to have a generic HAVE_SSSE3 anyway
 - when would we use it, what would it mean? I think in practice, we
 probably want a HAVE_INLINE_SSSE3, as you said, because yes, there's
 compilers that don't support this, but do support HAVE_INLINE_ASM in
 general. Likewise, HAVE_AVX could be renamed HAVE_YASM_AVX or so.
 Having HAVE_YASM_SSSE3 seems pointless, I don't think we support any
 yasm/nasm version that doesn't understand ssse3, so it'd always be 1.
 However, this would make it clear that HAVE_SSSE3 and HAVE_AVX don't
 and shouldn't depend on each other.

 Try dropping the line

   avx_deps=ssse3

 from configure and see if that works out the way you want it to.

I'm still wondering if it makes sense to change the names to reflect
what they do, to prevent more misunderstandings.

Plus, someone (i.e. me) needs to go over all our x86 simd function
pointer inits and make sure we use HAVE_INLINE_SSSE3 only for inline,
not yasm. Also, HAVE_SSE2, HAVE_SSE, HAVE_MMX2, HAVE_MMX need such
rules (are they inline? yasm? both?) and the same check in init
functions.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/6] build: Only compile and run dct-test if AAN DCT tables are enabled

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 5:15 PM, Diego Biurrun di...@biurrun.de wrote:
 ---
  libavcodec/Makefile |2 +-
  tests/fate/dct.mak  |2 +-
  2 files changed, 2 insertions(+), 2 deletions(-)

This test tests a lot more than just aan dct?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86inc: sync to latest version from x264.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavutil/x86/x86inc.asm |  216 ++
 1 file changed, 124 insertions(+), 92 deletions(-)

diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index b76a10c..23d9d57 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -36,8 +36,8 @@
 
 %define program_name ff
 
-%define UNIX64 0
 %define WIN64  0
+%define UNIX64 0
 %if ARCH_X86_64
 %ifidn __OUTPUT_FORMAT__,win32
 %define WIN64  1
@@ -54,11 +54,6 @@
 %define mangle(x) x
 %endif
 
-; FIXME: All of the 64bit asm functions that take a stride as an argument
-; via register, assume that the high dword of that register is filled with 0.
-; This is true in practice (since we never do any 64bit arithmetic on strides,
-; and x264's strides are all positive), but is not guaranteed by the ABI.
-
 ; Name of the .rodata section.
 ; Kludge: Something on OS X fails to align .rodata even given an align 
attribute,
 ; so use a different read-only section.
@@ -129,34 +124,38 @@ CPU amdnop
 ; registers:
 ; rN and rNq are the native-size register holding function argument N
 ; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
 ; rNm is the original location of arg N (a register or on the stack), dword
 ; rNmp is native size
 
-%macro DECLARE_REG 5-6
+%macro DECLARE_REG 2-3
 %define r%1q %2
-%define r%1d %3
-%define r%1w %4
-%define r%1b %5
-%if %0 == 5
-%define r%1m  %3
+%define r%1d %2d
+%define r%1w %2w
+%define r%1b %2b
+%define r%1h %2h
+%if %0 == 2
+%define r%1m  %2d
 %define r%1mp %2
 %elif ARCH_X86_64 ; memory
-%define r%1m [rsp + stack_offset + %6]
+%define r%1m [rsp + stack_offset + %3]
 %define r%1mp qword r %+ %1m
 %else
-%define r%1m [esp + stack_offset + %6]
+%define r%1m [esp + stack_offset + %3]
 %define r%1mp dword r %+ %1m
 %endif
 %define r%1  %2
 %endmacro
 
-%macro DECLARE_REG_SIZE 2
+%macro DECLARE_REG_SIZE 3
 %define r%1q r%1
 %define e%1q r%1
 %define r%1d e%1
 %define e%1d e%1
 %define r%1w %1
 %define e%1w %1
+%define r%1h %3
+%define e%1h %3
 %define r%1b %2
 %define e%1b %2
 %if ARCH_X86_64 == 0
@@ -164,13 +163,13 @@ CPU amdnop
 %endif
 %endmacro
 
-DECLARE_REG_SIZE ax, al
-DECLARE_REG_SIZE bx, bl
-DECLARE_REG_SIZE cx, cl
-DECLARE_REG_SIZE dx, dl
-DECLARE_REG_SIZE si, sil
-DECLARE_REG_SIZE di, dil
-DECLARE_REG_SIZE bp, bpl
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
 
 ; t# defines for when per-arch register allocation is more complex than just 
function arguments
 
@@ -188,6 +187,7 @@ DECLARE_REG_SIZE bp, bpl
 %define t%1q t%1 %+ q
 %define t%1d t%1 %+ d
 %define t%1w t%1 %+ w
+%define t%1h t%1 %+ h
 %define t%1b t%1 %+ b
 %rotate 1
 %endrep
@@ -277,6 +277,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 CAT_UNDEF arg_name %+ %%i, q
 CAT_UNDEF arg_name %+ %%i, d
 CAT_UNDEF arg_name %+ %%i, w
+CAT_UNDEF arg_name %+ %%i, h
 CAT_UNDEF arg_name %+ %%i, b
 CAT_UNDEF arg_name %+ %%i, m
 CAT_UNDEF arg_name %+ %%i, mp
@@ -292,6 +293,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %xdefine %1q r %+ %%i %+ q
 %xdefine %1d r %+ %%i %+ d
 %xdefine %1w r %+ %%i %+ w
+%xdefine %1h r %+ %%i %+ h
 %xdefine %1b r %+ %%i %+ b
 %xdefine %1m r %+ %%i %+ m
 %xdefine %1mp r %+ %%i %+ mp
@@ -305,21 +307,21 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 
 %if WIN64 ; Windows x64 ;=
 
-DECLARE_REG 0,  rcx, ecx,  cx,   cl
-DECLARE_REG 1,  rdx, edx,  dx,   dl
-DECLARE_REG 2,  R8,  R8D,  R8W,  R8B
-DECLARE_REG 3,  R9,  R9D,  R9W,  R9B
-DECLARE_REG 4,  R10, R10D, R10W, R10B, 40
-DECLARE_REG 5,  R11, R11D, R11W, R11B, 48
-DECLARE_REG 6,  rax, eax,  ax,   al,   56
-DECLARE_REG 7,  rdi, edi,  di,   dil,  64
-DECLARE_REG 8,  rsi, esi,  si,   sil,  72
-DECLARE_REG 9,  rbx, ebx,  bx,   bl,   80
-DECLARE_REG 10, rbp, ebp,  bp,   bpl,  88
-DECLARE_REG 11, R12, R12D, R12W, R12B, 96
-DECLARE_REG 12, R13, R13D, R13W, R13B, 104
-DECLARE_REG 13, R14, R14D, R14W, R14B, 112
-DECLARE_REG 14, R15, R15D, R15W, R15B, 120
+DECLARE_REG 0,  rcx
+DECLARE_REG 1,  rdx
+DECLARE_REG 2,  R8
+DECLARE_REG 3,  R9
+DECLARE_REG 4,  R10, 40
+DECLARE_REG 5,  R11, 48
+DECLARE_REG 6,  rax, 56
+DECLARE_REG 7,  rdi, 64
+DECLARE_REG 8,  rsi, 72
+DECLARE_REG 9,  rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
 
 %macro PROLOGUE 2-4+ 0 ; #args, #regs

Re: [libav-devel] [PATCH 5/7] vf_hqdn3d: support 9 and 10bit colordepth

2012-07-26 Thread Ronald S. Bultje
Hi,

On Thu, Jul 26, 2012 at 6:42 PM, Loren Merritt lor...@u.washington.edu wrote:
 ---
  libavfilter/vf_hqdn3d.c |   72 ++
  1 files changed, 53 insertions(+), 19 deletions(-)

OK.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] proresdsp: port x86 assembly to cpuflags.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/proresdsp.asm |   39 ++-
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm
index 9b2e11e..70fd686 100644
--- a/libavcodec/x86/proresdsp.asm
+++ b/libavcodec/x86/proresdsp.asm
@@ -82,8 +82,7 @@ section .text align=16
 
 ; %1 = row or col (for rounding variable)
 ; %2 = number of bits to shift at the end
-; %3 = optimization
-%macro IDCT_1D 3
+%macro IDCT_1D 2
 ; a0 = (W4 * row[0]) + (1  (15 - 1));
 ; a1 = a0;
 ; a2 = a0;
@@ -330,8 +329,8 @@ section .text align=16
 
 ; void prores_idct_put_10_opt(uint8_t *pixels, int stride,
 ;   DCTELEM *block, const int16_t *qmat);
-%macro idct_put_fn 2
-cglobal prores_idct_put_10_%1, 4, 4, %2
+%macro idct_put_fn 1
+cglobal prores_idct_put_10, 4, 4, %1
 movsxd  r1,  r1d
 pxorm15, m15   ; zero
 
@@ -347,7 +346,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
 pmullw  m13,[r3+64]
 pmullw  m12,[r3+96]
 
-IDCT_1D row, 17,  %1
+IDCT_1D row, 17
 
 ; transpose for second part of IDCT
 TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
@@ -362,7 +361,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
 
 ; for (i = 0; i  8; i++)
 ; idctSparseColAdd(dest + i, line_size, block + i);
-IDCT_1D col, 20,  %1
+IDCT_1D col, 20
 
 ; clip/store
 movam6, [pw_512]
@@ -406,27 +405,25 @@ cglobal prores_idct_put_10_%1, 4, 4, %2
 RET
 %endmacro
 
-%macro signextend_sse2 3 ; dstlow, dsthigh, tmp
+%macro SIGNEXTEND 2-3 ; dstlow, dsthigh, tmp
+%if cpuflag(sse4)
+movhlps %2,  %1
+pmovsxwd%1,  %1
+pmovsxwd%2,  %2
+%else ; sse2
 pxor%3,  %3
 pcmpgtw %3,  %1
 mova%2,  %1
 punpcklwd   %1,  %3
 punpckhwd   %2,  %3
+%endif
 %endmacro
 
-%macro signextend_sse4 2-3 ; dstlow, dsthigh
-movhlps %2,  %1
-pmovsxwd%1,  %1
-pmovsxwd%2,  %2
-%endmacro
-
-INIT_XMM
-%define SIGNEXTEND signextend_sse2
-idct_put_fn sse2, 16
-INIT_XMM
-%define SIGNEXTEND signextend_sse4
-idct_put_fn sse4, 16
-INIT_AVX
-idct_put_fn avx,  16
+INIT_XMM sse2
+idct_put_fn 16
+INIT_XMM sse4
+idct_put_fn 16
+INIT_XMM avx
+idct_put_fn 16
 
 %endif
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] h264_chromamc_10bit: port x86 simd to cpuflags.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/dsputil_mmx.c   |   16 ++---
 libavcodec/x86/h264_chromamc_10bit.asm |   40 
 2 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index e91ede5..afbb531 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2117,10 +2117,10 @@ void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH 
## _ ## OPT  \
   (uint8_t *dst, uint8_t *src,  \
int stride, int h, int x, int y);
 
-CHROMA_MC(put, 2, 10, mmxext)
-CHROMA_MC(avg, 2, 10, mmxext)
-CHROMA_MC(put, 4, 10, mmxext)
-CHROMA_MC(avg, 4, 10, mmxext)
+CHROMA_MC(put, 2, 10, mmx2)
+CHROMA_MC(avg, 2, 10, mmx2)
+CHROMA_MC(put, 4, 10, mmx2)
+CHROMA_MC(avg, 4, 10, mmx2)
 CHROMA_MC(put, 8, 10, sse2)
 CHROMA_MC(avg, 8, 10, sse2)
 CHROMA_MC(put, 8, 10, avx)
@@ -2740,10 +2740,10 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx,
 c-put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
 }
 if (bit_depth == 10  CONFIG_H264CHROMA) {
-c-put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
-c-avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
-c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
-c-avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
+c-put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2;
+c-avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2;
+c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2;
+c-avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2;
 }
 
 c-add_hfyu_median_prediction   = ff_add_hfyu_median_prediction_mmx2;
diff --git a/libavcodec/x86/h264_chromamc_10bit.asm 
b/libavcodec/x86/h264_chromamc_10bit.asm
index 3f7c513..370c7b5 100644
--- a/libavcodec/x86/h264_chromamc_10bit.asm
+++ b/libavcodec/x86/h264_chromamc_10bit.asm
@@ -60,10 +60,10 @@ SECTION .text
 ;-
 ; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int 
mx, int my)
 ;-
-%macro CHROMA_MC8 2
+%macro CHROMA_MC8 1
 ; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
 ;  int stride, int h, int mx, int my)
-cglobal %1_h264_chroma_mc8_10_%2, 6,7,8
+cglobal %1_h264_chroma_mc8_10, 6,7,8
 movsxdifnidn  r2, r2d
 mov  r6d, r5d
 or   r6d, r4d
@@ -173,8 +173,8 @@ cglobal %1_h264_chroma_mc8_10_%2, 6,7,8
 add   r0, r2
 %endmacro
 
-%macro CHROMA_MC4 2
-cglobal %1_h264_chroma_mc4_10_%2, 6,6,7
+%macro CHROMA_MC4 1
+cglobal %1_h264_chroma_mc4_10, 6,6,7
 movsxdifnidn  r2, r2d
 movd  m2, r4m ; x
 movd  m3, r5m ; y
@@ -203,8 +203,8 @@ cglobal %1_h264_chroma_mc4_10_%2, 6,6,7
 ;-
 ; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int 
mx, int my)
 ;-
-%macro CHROMA_MC2 2
-cglobal %1_h264_chroma_mc2_10_%2, 6,7
+%macro CHROMA_MC2 1
+cglobal %1_h264_chroma_mc2_10, 6,7
 movsxdifnidn  r2, r2d
 mov  r6d, r4d
 shl  r4d, 16
@@ -250,24 +250,24 @@ cglobal %1_h264_chroma_mc2_10_%2, 6,7
 %endmacro
 
 %define CHROMAMC_AVG  NOTHING
-INIT_XMM
-CHROMA_MC8 put, sse2
+INIT_XMM sse2
+CHROMA_MC8 put
 %if HAVE_AVX
-INIT_AVX
-CHROMA_MC8 put, avx
+INIT_XMM avx
+CHROMA_MC8 put
 %endif
-INIT_MMX
-CHROMA_MC4 put, mmxext
-CHROMA_MC2 put, mmxext
+INIT_MMX mmx2
+CHROMA_MC4 put
+CHROMA_MC2 put
 
 %define CHROMAMC_AVG  AVG
 %define PAVG  pavgw
-INIT_XMM
-CHROMA_MC8 avg, sse2
+INIT_XMM sse2
+CHROMA_MC8 avg
 %if HAVE_AVX
-INIT_AVX
-CHROMA_MC8 avg, avx
+INIT_XMM avx
+CHROMA_MC8 avg
 %endif
-INIT_MMX
-CHROMA_MC4 avg, mmxext
-CHROMA_MC2 avg, mmxext
+INIT_MMX mmx2
+CHROMA_MC4 avg
+CHROMA_MC2 avg
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] h264_idct_10bit: port x86 assembly to cpuflags.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/h264_idct_10bit.asm |  210 ++--
 1 file changed, 105 insertions(+), 105 deletions(-)

diff --git a/libavcodec/x86/h264_idct_10bit.asm 
b/libavcodec/x86/h264_idct_10bit.asm
index 934a7ff..fd61c98 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -72,25 +72,25 @@ SECTION .text
 STORE_DIFFx2 m2, m3, m4, m5, %1, %3
 %endmacro
 
-%macro IDCT_ADD_10 1
-cglobal h264_idct_add_10_%1, 3,3
+%macro IDCT_ADD_10 0
+cglobal h264_idct_add_10, 3,3
 IDCT4_ADD_10 r0, r1, r2
 RET
 %endmacro
 
-INIT_XMM
-IDCT_ADD_10 sse2
+INIT_XMM sse2
+IDCT_ADD_10
 %if HAVE_AVX
-INIT_AVX
-IDCT_ADD_10 avx
+INIT_XMM avx
+IDCT_ADD_10
 %endif
 
 ;-
 ; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int 
stride, const uint8_t nnzc[6*8])
 ;-
 ;;; NO FATE SAMPLES TRIGGER THIS
-%macro ADD4x4IDCT 1
-add4x4_idct_%1:
+%macro ADD4x4IDCT 0
+add4x4_idct_ %+ SUFFIX:
 add   r5, r0
 mova  m0, [r2+ 0]
 mova  m1, [r2+16]
@@ -107,28 +107,28 @@ add4x4_idct_%1:
 ret
 %endmacro
 
-INIT_XMM
+INIT_XMM sse2
 ALIGN 16
-ADD4x4IDCT sse2
+ADD4x4IDCT
 %if HAVE_AVX
-INIT_AVX
+INIT_XMM avx
 ALIGN 16
-ADD4x4IDCT avx
+ADD4x4IDCT
 %endif
 
 %macro ADD16_OP 3
 cmp  byte [r4+%3], 0
 jz .skipblock%2
 mov r5d, [r1+%2*4]
-call add4x4_idct_%1
+call add4x4_idct_ %+ SUFFIX
 .skipblock%2:
 %if %2<15
 add  r2, 64
 %endif
 %endmacro
 
-%macro IDCT_ADD16_10 1
-cglobal h264_idct_add16_10_%1, 5,6
+%macro IDCT_ADD16_10 0
+cglobal h264_idct_add16_10, 5,6
 ADD16_OP %1, 0, 4+1*8
 ADD16_OP %1, 1, 5+1*8
 ADD16_OP %1, 2, 4+2*8
@@ -148,11 +148,11 @@ cglobal h264_idct_add16_10_%1, 5,6
 REP_RET
 %endmacro
 
-INIT_XMM
-IDCT_ADD16_10 sse2
+INIT_XMM sse2
+IDCT_ADD16_10
 %if HAVE_AVX
-INIT_AVX
-IDCT_ADD16_10 avx
+INIT_XMM avx
+IDCT_ADD16_10
 %endif
 
 ;-
@@ -185,8 +185,8 @@ IDCT_ADD16_10 avx
 mova [%1+%3  ], m4
 %endmacro
 
-INIT_MMX
-cglobal h264_idct_dc_add_10_mmx2,3,3
+INIT_MMX mmx2
+cglobal h264_idct_dc_add_10,3,3
 movd  m0, [r1]
 paddd m0, [pd_32]
 psrad m0, 6
@@ -199,8 +199,8 @@ cglobal h264_idct_dc_add_10_mmx2,3,3
 ;-
 ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
 ;-
-%macro IDCT8_DC_ADD 1
-cglobal h264_idct8_dc_add_10_%1,3,3,7
+%macro IDCT8_DC_ADD 0
+cglobal h264_idct8_dc_add_10,3,3,7
 mov  r1d, [r1]
 add   r1, 32
 sar   r1, 6
@@ -214,45 +214,45 @@ cglobal h264_idct8_dc_add_10_%1,3,3,7
 RET
 %endmacro
 
-INIT_XMM
-IDCT8_DC_ADD sse2
+INIT_XMM sse2
+IDCT8_DC_ADD
 %if HAVE_AVX
-INIT_AVX
-IDCT8_DC_ADD avx
+INIT_XMM avx
+IDCT8_DC_ADD
 %endif
 
 ;-
 ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, 
int stride, const uint8_t nnzc[6*8])
 ;-
-%macro AC 2
-.ac%2
-mov  r5d, [r1+(%2+0)*4]
-call add4x4_idct_%1
-mov  r5d, [r1+(%2+1)*4]
+%macro AC 1
+.ac%1
+mov  r5d, [r1+(%1+0)*4]
+call add4x4_idct_ %+ SUFFIX
+mov  r5d, [r1+(%1+1)*4]
 add  r2, 64
-call add4x4_idct_%1
+call add4x4_idct_ %+ SUFFIX
 add  r2, 64
-jmp .skipadd%2
+jmp .skipadd%1
 %endmacro
 
 %assign last_block 16
-%macro ADD16_OP_INTRA 3
-cmp  word [r4+%3], 0
-jnz .ac%2
+%macro ADD16_OP_INTRA 2
+cmp  word [r4+%2], 0
+jnz .ac%1
 mov  r5d, [r2+ 0]
 or   r5d, [r2+64]
-jz .skipblock%2
-mov  r5d, [r1+(%2+0)*4]
-call idct_dc_add_%1
-.skipblock%2:
-%if %2<last_block-2
+jz .skipblock%1
+mov  r5d, [r1+(%1+0)*4]
+call idct_dc_add_ %+ SUFFIX
+.skipblock%1:
+%if %1<last_block-2
 add   r2, 128
 %endif
-.skipadd%2:
+.skipadd%1:
 %endmacro
 
-%macro IDCT_ADD16INTRA_10 1
-idct_dc_add_%1:
+%macro IDCT_ADD16INTRA_10 0
+idct_dc_add_ %+ SUFFIX:
 add   r5, r0
 movq  m0, [r2+ 0]
 movhpsm0, [r2+64]
@@ -265,46 +265,46 @@ idct_dc_add_%1:
 IDCT_DC_ADD_OP_10 r5, r3, r6
 ret
 
-cglobal h264_idct_add16intra_10_%1,5,7,8
-ADD16_OP_INTRA %1, 0, 4+1*8
-ADD16_OP_INTRA %1, 2, 4+2*8
-ADD16_OP_INTRA %1, 4, 6+1*8
-ADD16_OP_INTRA %1, 6, 6+2*8
-ADD16_OP_INTRA %1, 8, 4+3*8
-ADD16_OP_INTRA %1, 10, 4+4*8
-ADD16_OP_INTRA %1, 12, 6+3*8
-ADD16_OP_INTRA %1, 14, 6+4*8
+cglobal h264_idct_add16intra_10,5,7,8
+ADD16_OP_INTRA 0, 4+1*8
+ADD16_OP_INTRA 2, 4+2*8
+ADD16_OP_INTRA 4, 6+1*8
+ADD16_OP_INTRA 6, 6+2*8
+ADD16_OP_INTRA

[libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/h264_deblock.asm |  120 ++-
 libavcodec/x86/h264dsp_mmx.c|   42 +++---
 2 files changed, 88 insertions(+), 74 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 1982dc4..94ff27b 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -282,8 +282,8 @@ cextern pb_A1
 ;-
 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-%macro DEBLOCK_LUMA 1
-cglobal deblock_v_luma_8_%1, 5,5,10
+%macro DEBLOCK_V_LUMA 0
+cglobal deblock_v_luma_8, 5,5,10
 movdm8, [r4] ; tc0
 lea r4, [r1*3]
 dec r2d; alpha-1
@@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10
 mova[r4+2*r1], m1
 mova[r0], m2
 RET
+%endmacro
 
 ;-
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-INIT_MMX
-cglobal deblock_h_luma_8_%1, 5,9
+%macro DEBLOCK_H_LUMA 0
+cglobal deblock_h_luma_8, 5,9
 movsxd r7,  r1d
 lear8,  [r7+r7*2]
 lear6,  [r0-4]
@@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9
 %if WIN64
 mov[rsp+0x20], r4
 %endif
-call   deblock_v_luma_8_%1
+call   deblock_v_luma_8 %+ SUFFIX
 
 ; transpose 16x4 - original space  (only the middle 4 rows were changed 
by the filter)
 addr6, 2
@@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9
 RET
 %endmacro
 
-INIT_XMM
-DEBLOCK_LUMA sse2
-INIT_AVX
-DEBLOCK_LUMA avx
+INIT_XMM sse2
+DEBLOCK_V_LUMA
+INIT_MMX sse2
+DEBLOCK_H_LUMA
+
+INIT_XMM avx
+DEBLOCK_V_LUMA
+INIT_MMX avx
+DEBLOCK_H_LUMA
 
 %else
 
-%macro DEBLOCK_LUMA 3
+%macro DEBLOCK_LUMA 2
 ;-
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-cglobal deblock_%2_luma_8_%1, 5,5
+cglobal deblock_%1_luma_8, 5,5
 lea r4, [r1*3]
 dec r2 ; alpha-1
 neg r4
 dec r3 ; beta-1
 add r4, r0 ; pix-3*stride
-%assign pad 2*%3+12-(stack_offset&15)
+%assign pad 2*%2+12-(stack_offset&15)
 SUB esp, pad
 
 movam0, [r4+r1]   ; p1
@@ -415,7 +421,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 movdm4, [r3] ; tc0
 punpcklbw m4, m4
 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
-mova   [esp+%3], m4 ; tc
+mova   [esp+%2], m4 ; tc
 pcmpgtb m4, m3
 movam3, [r4] ; p2
 pandm4, m7
@@ -423,7 +429,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 
 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0|  beta-1
 pandm6, m4
-pandm4, [esp+%3] ; tc
+pandm4, [esp+%2] ; tc
 psubb   m7, m4, m6
 pandm6, m4
 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
@@ -431,7 +437,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 movam4, [r0+2*r1] ; q2
 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0|  beta-1
 pandm6, [esp] ; mask
-movam5, [esp+%3] ; tc
+movam5, [esp+%2] ; tc
 psubb   m7, m6
 pandm5, m6
 movam3, [r0+r1]
@@ -446,8 +452,8 @@ cglobal deblock_%2_luma_8_%1, 5,5
 ;-
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-INIT_MMX
-cglobal deblock_h_luma_8_%1, 0,5
+INIT_MMX SUFFIX
+cglobal deblock_h_luma_8, 0,5
 movr0, r0mp
 movr3, r1m
 lear4, [r3*3]
@@ -470,11 +476,11 @@ cglobal deblock_h_luma_8_%1, 0,5
 PUSH   dword r2m
 PUSH   dword 16
 PUSH   dword r0
-call   deblock_%2_luma_8_%1
-%ifidn %2, v8
+call   deblock_%1_luma_8_ %+ SUFFIX
+%ifidn %1, v8
 adddword [esp   ], 8 ; pix_tmp+0x38
 adddword [esp+16], 2 ; tc0+2
-call   deblock_%2_luma_8_%1
+call   deblock_%1_luma_8_ %+ SUFFIX
 %endif
 ADDesp, 20
 
@@ -501,12 +507,12 @@ cglobal deblock_h_luma_8_%1, 0,5
 RET
 %endmacro ; DEBLOCK_LUMA
 
-INIT_MMX
-DEBLOCK_LUMA mmxext, v8, 8
-INIT_XMM
-DEBLOCK_LUMA sse2, v, 16
-INIT_AVX
-DEBLOCK_LUMA avx, v, 16
+INIT_MMX mmx2
+DEBLOCK_LUMA v8, 8
+INIT_XMM sse2
+DEBLOCK_LUMA v, 16
+INIT_XMM avx
+DEBLOCK_LUMA v, 16
 
 %endif ; ARCH
 
@@ -608,7 +614,7 @@ DEBLOCK_LUMA avx, v, 16
 %define mask1p mask1q
 %endmacro
 
-%macro DEBLOCK_LUMA_INTRA 2
+%macro DEBLOCK_V_LUMA_INTRA 1
 %define p1 m0
 %define p0 m1
 %define q0 m2
@@ -643,7 +649,7 @@ DEBLOCK_LUMA avx, v, 16

[libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/h264_deblock.asm   |  124 +++--
 libavcodec/x86/h264_deblock_10bit.asm |   77 ++--
 libavcodec/x86/h264dsp_mmx.c  |   60 
 3 files changed, 139 insertions(+), 122 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 1982dc4..b5e81e7 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -282,8 +282,8 @@ cextern pb_A1
 ;-
 ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-%macro DEBLOCK_LUMA 1
-cglobal deblock_v_luma_8_%1, 5,5,10
+%macro DEBLOCK_V_LUMA 0
+cglobal deblock_v_luma_8, 5,5,10
 movdm8, [r4] ; tc0
 lea r4, [r1*3]
 dec r2d; alpha-1
@@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10
 mova[r4+2*r1], m1
 mova[r0], m2
 RET
+%endmacro
 
 ;-
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-INIT_MMX
-cglobal deblock_h_luma_8_%1, 5,9
+%macro DEBLOCK_H_LUMA 0
+cglobal deblock_h_luma_8, 5,9
 movsxd r7,  r1d
 lear8,  [r7+r7*2]
 lear6,  [r0-4]
@@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9
 %if WIN64
 mov[rsp+0x20], r4
 %endif
-call   deblock_v_luma_8_%1
+call   deblock_v_luma_8 %+ SUFFIX
 
 ; transpose 16x4 - original space  (only the middle 4 rows were changed 
by the filter)
 addr6, 2
@@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9
 RET
 %endmacro
 
-INIT_XMM
-DEBLOCK_LUMA sse2
-INIT_AVX
-DEBLOCK_LUMA avx
+INIT_XMM sse2
+DEBLOCK_V_LUMA
+INIT_MMX sse2
+DEBLOCK_H_LUMA
+
+INIT_XMM avx
+DEBLOCK_V_LUMA
+INIT_MMX avx
+DEBLOCK_H_LUMA
 
 %else
 
-%macro DEBLOCK_LUMA 3
+%macro DEBLOCK_V_LUMA 2
 ;-
 ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-cglobal deblock_%2_luma_8_%1, 5,5
+cglobal deblock_%1_luma_8, 5,5
 lea r4, [r1*3]
 dec r2 ; alpha-1
 neg r4
 dec r3 ; beta-1
 add r4, r0 ; pix-3*stride
-%assign pad 2*%3+12-(stack_offset&15)
+%assign pad 2*%2+12-(stack_offset&15)
 SUB esp, pad
 
 movam0, [r4+r1]   ; p1
@@ -415,7 +421,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 movdm4, [r3] ; tc0
 punpcklbw m4, m4
 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
-mova   [esp+%3], m4 ; tc
+mova   [esp+%2], m4 ; tc
 pcmpgtb m4, m3
 movam3, [r4] ; p2
 pandm4, m7
@@ -423,7 +429,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 
 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0|  beta-1
 pandm6, m4
-pandm4, [esp+%3] ; tc
+pandm4, [esp+%2] ; tc
 psubb   m7, m4, m6
 pandm6, m4
 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
@@ -431,7 +437,7 @@ cglobal deblock_%2_luma_8_%1, 5,5
 movam4, [r0+2*r1] ; q2
 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0|  beta-1
 pandm6, [esp] ; mask
-movam5, [esp+%3] ; tc
+movam5, [esp+%2] ; tc
 psubb   m7, m6
 pandm5, m6
 movam3, [r0+r1]
@@ -442,12 +448,13 @@ cglobal deblock_%2_luma_8_%1, 5,5
 mova[r0], m2
 ADD esp, pad
 RET
+%endmacro
 
 ;-
 ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t 
*tc0 )
 ;-
-INIT_MMX
-cglobal deblock_h_luma_8_%1, 0,5
+%macro DEBLOCK_H_LUMA 1
+cglobal deblock_h_luma_8, 0,5
 movr0, r0mp
 movr3, r1m
 lear4, [r3*3]
@@ -470,11 +477,11 @@ cglobal deblock_h_luma_8_%1, 0,5
 PUSH   dword r2m
 PUSH   dword 16
 PUSH   dword r0
-call   deblock_%2_luma_8_%1
-%ifidn %2, v8
+call   deblock_%1_luma_8 %+ SUFFIX
+%ifidn %1, v8
 adddword [esp   ], 8 ; pix_tmp+0x38
 adddword [esp+16], 2 ; tc0+2
-call   deblock_%2_luma_8_%1
+call   deblock_%1_luma_8 %+ SUFFIX
 %endif
 ADDesp, 20
 
@@ -501,12 +508,15 @@ cglobal deblock_h_luma_8_%1, 0,5
 RET
 %endmacro ; DEBLOCK_LUMA
 
-INIT_MMX
-DEBLOCK_LUMA mmxext, v8, 8
-INIT_XMM
-DEBLOCK_LUMA sse2, v, 16
-INIT_AVX
-DEBLOCK_LUMA avx, v, 16
+INIT_MMX mmx2
+DEBLOCK_V_LUMA v8, 8
+DEBLOCK_H_LUMA v8
+INIT_XMM sse2
+DEBLOCK_V_LUMA v, 16
+DEBLOCK_H_LUMA v
+INIT_XMM avx
+DEBLOCK_V_LUMA v, 16
+DEBLOCK_H_LUMA h
 
 %endif ; ARCH
 
@@ -608,7 +618,7 @@ DEBLOCK_LUMA avx, v, 16
 %define mask1p mask1q

[libav-devel] [PATCH] vp56: port x86 simd to cpuflags.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/vp56dsp.asm |   34 +++---
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm
index 66a97f1..27a82bc 100644
--- a/libavcodec/x86/vp56dsp.asm
+++ b/libavcodec/x86/vp56dsp.asm
@@ -27,7 +27,8 @@ cextern pw_64
 
 SECTION .text
 
-%macro DIAG4_MMX 6
+%macro DIAG4 6
+%if mmsize == 8
 movq  m0, [%1+%2]
 movq  m1, [%1+%3]
 movq  m3, m0
@@ -64,9 +65,7 @@ SECTION .text
 psraw m3, 7
 packuswb  m0, m3
 movq[%6], m0
-%endmacro
-
-%macro DIAG4_SSE2 6
+%else ; mmsize == 16
 movq  m0, [%1+%2]
 movq  m1, [%1+%3]
 punpcklbw m0, m7
@@ -86,9 +85,11 @@ SECTION .text
 psraw m0, 7
 packuswb  m0, m0
 movq[%6], m0
+%endif ; mmsize == 8/16
 %endmacro
 
-%macro SPLAT4REGS_MMX 0
+%macro SPLAT4REGS 0
+%if mmsize == 8
 movq m5, m3
 punpcklwdm3, m3
 movq m4, m3
@@ -102,9 +103,7 @@ SECTION .text
 movq [rsp+8*12], m4
 movq [rsp+8*13], m5
 movq [rsp+8*14], m2
-%endmacro
-
-%macro SPLAT4REGS_SSE2 0
+%else ; mmsize == 16
 pshuflw  m4, m3, 0x0
 pshuflw  m5, m3, 0x55
 pshuflw  m6, m3, 0xAA
@@ -113,15 +112,16 @@ SECTION .text
 punpcklqdq   m5, m5
 punpcklqdq   m6, m6
 punpcklqdq   m3, m3
+%endif ; mmsize == 8/16
 %endmacro
 
-%macro vp6_filter_diag4 2
+%macro vp6_filter_diag4 0
 ; void ff_vp6_filter_diag4_opt(uint8_t *dst, uint8_t *src, int stride,
 ;const int16_t h_weight[4], const int16_t 
v_weights[4])
-cglobal vp6_filter_diag4_%1, 5, 7, %2
+cglobal vp6_filter_diag4, 5, 7, 8
 mov  r5, rsp ; backup stack pointer
 and rsp, ~(mmsize-1) ; align stack
-%ifidn %1, sse2
+%if mmsize == 16
 sub rsp, 8*11
 %else
 sub rsp, 8*15
@@ -162,12 +162,8 @@ cglobal vp6_filter_diag4_%1, 5, 7, %2
 RET
 %endmacro
 
-INIT_MMX
-%define DIAG4  DIAG4_MMX
-%define SPLAT4REGS SPLAT4REGS_MMX
-vp6_filter_diag4 mmx,  0
+INIT_MMX mmx
+vp6_filter_diag4
 
-INIT_XMM
-%define DIAG4  DIAG4_SSE2
-%define SPLAT4REGS SPLAT4REGS_SSE2
-vp6_filter_diag4 sse2, 8
+INIT_XMM sse2
+vp6_filter_diag4
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] vp56: only compile MMX SIMD on x86-32.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

All x86-64 CPUs have SSE2, so the MMX version will never be used. This
leads to smaller binaries.
---
 libavcodec/x86/vp56dsp.asm|2 ++
 libavcodec/x86/vp56dsp_init.c |2 ++
 2 files changed, 4 insertions(+)

diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm
index 27a82bc..ca4d97e 100644
--- a/libavcodec/x86/vp56dsp.asm
+++ b/libavcodec/x86/vp56dsp.asm
@@ -162,8 +162,10 @@ cglobal vp6_filter_diag4, 5, 7, 8
 RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 vp6_filter_diag4
+%endif
 
 INIT_XMM sse2
 vp6_filter_diag4
diff --git a/libavcodec/x86/vp56dsp_init.c b/libavcodec/x86/vp56dsp_init.c
index 2989281..ae04440 100644
--- a/libavcodec/x86/vp56dsp_init.c
+++ b/libavcodec/x86/vp56dsp_init.c
@@ -36,9 +36,11 @@ av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum 
CodecID codec)
 int mm_flags = av_get_cpu_flags();
 
 if (CONFIG_VP6_DECODER && codec == CODEC_ID_VP6) {
+#if ARCH_X86_32
 if (mm_flags & AV_CPU_FLAG_MMX) {
 c-vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
 }
+#endif
 
 if (mm_flags & AV_CPU_FLAG_SSE2) {
 c-vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] vp3: port x86 SIMD to cpuflags.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/vp3dsp.asm |   36 ++--
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index af2f60c..98b1cb5 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -102,8 +102,8 @@ SECTION .text
 mov  [r0+r3  -1], r2w
 %endmacro
 
-INIT_MMX
-cglobal vp3_v_loop_filter_mmx2, 3, 4
+INIT_MMX mmx2
+cglobal vp3_v_loop_filter, 3, 4
 %if ARCH_X86_64
 movsxdr1, r1d
 %endif
@@ -120,7 +120,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4
 movq [r0   ], m3
 RET
 
-cglobal vp3_h_loop_filter_mmx2, 3, 4
+cglobal vp3_h_loop_filter, 3, 4
 %if ARCH_X86_64
 movsxdr1, r1d
 %endif
@@ -521,9 +521,17 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
 %endmacro
 
-%macro vp3_idct_funcs 1
-cglobal vp3_idct_put_%1, 3, 4, 9
-VP3_IDCT_%1   r2
+%macro VP3_IDCT 1
+%if mmsize == 8
+VP3_IDCT_mmx %1
+%else
+VP3_IDCT_sse2 %1
+%endif
+%endmacro
+
+%macro vp3_idct_funcs 0
+cglobal vp3_idct_put, 3, 4, 9
+VP3_IDCT  r2
 
 movsxdifnidn  r1, r1d
 mova  m4, [pb_80]
@@ -565,8 +573,8 @@ cglobal vp3_idct_put_%1, 3, 4, 9
 %endrep
 RET
 
-cglobal vp3_idct_add_%1, 3, 4, 9
-VP3_IDCT_%1   r2
+cglobal vp3_idct_add, 3, 4, 9
+VP3_IDCT  r2
 
 mov   r3, 4
 pxor  m4, m4
@@ -607,10 +615,10 @@ cglobal vp3_idct_add_%1, 3, 4, 9
 RET
 %endmacro
 
-INIT_MMX
-vp3_idct_funcs mmx
-INIT_XMM
-vp3_idct_funcs sse2
+INIT_MMX mmx
+vp3_idct_funcs
+INIT_XMM sse2
+vp3_idct_funcs
 
 %macro DC_ADD 0
 movq  m2, [r0 ]
@@ -631,8 +639,8 @@ vp3_idct_funcs sse2
 movq   [r0+r3  ], m5
 %endmacro
 
-INIT_MMX
-cglobal vp3_idct_dc_add_mmx2, 3, 4
+INIT_MMX mmx2
+cglobal vp3_idct_dc_add, 3, 4
 %if ARCH_X86_64
 movsxdr1, r1d
 %endif
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] vp3: don't compile mmx IDCT functions on x86-64.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

64-bit CPUs always have SSE2, and a SSE2 version exists, thus the MMX
version will never be used.
---
 libavcodec/x86/vp3dsp.asm|3 +++
 libavcodec/x86/vp3dsp_init.c |2 ++
 2 files changed, 5 insertions(+)

diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 98b1cb5..0b3eaa0 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -615,8 +615,11 @@ cglobal vp3_idct_add, 3, 4, 9
 RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 vp3_idct_funcs
+%endif
+
 INIT_XMM sse2
 vp3_idct_funcs
 
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index cd8e206..704d4a6 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -41,11 +41,13 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
 #if HAVE_YASM
 int cpuflags = av_get_cpu_flags();
 
+#if ARCH_X86_32
 if (HAVE_MMX && cpuflags & AV_CPU_FLAG_MMX) {
 c-idct_put  = ff_vp3_idct_put_mmx;
 c-idct_add  = ff_vp3_idct_add_mmx;
 c-idct_perm = FF_PARTTRANS_IDCT_PERM;
 }
+#endif
 
 if (HAVE_MMX2 && cpuflags & AV_CPU_FLAG_MMX2) {
 c-idct_dc_add = ff_vp3_idct_dc_add_mmx2;
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] rv34: port x86 SIMD to cpuflags.

2012-07-26 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/rv34dsp.asm |   11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 32bcdce..c43b77a 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -46,7 +46,7 @@ SECTION .text
 %endmacro
 
 %macro rv34_idct 1
-cglobal rv34_idct_%1_mmx2, 1, 2, 0
+cglobal rv34_idct_%1, 1, 2, 0
 movsx   r1, word [r0]
 IDCT_DC r1
 movdm0, r1
@@ -58,14 +58,15 @@ cglobal rv34_idct_%1_mmx2, 1, 2, 0
 REP_RET
 %endmacro
 
-INIT_MMX
+INIT_MMX mmx2
 %define IDCT_DC IDCT_DC_ROUND
 rv34_idct dc
 %define IDCT_DC IDCT_DC_NOROUND
 rv34_idct dc_noround
 
 ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
-cglobal rv34_idct_dc_add_mmx, 3, 3
+INIT_MMX mmx
+cglobal rv34_idct_dc_add, 3, 3
 ; calculate DC
 IDCT_DC_ROUND r2
 pxor   m1, m1
@@ -167,8 +168,8 @@ cglobal rv34_idct_add, 3,3,0, d, s, b
 ret
 
 ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
-INIT_XMM
-cglobal rv34_idct_dc_add_sse4, 3, 3, 6
+INIT_XMM sse4
+cglobal rv34_idct_dc_add, 3, 3, 6
 ; load data
 IDCT_DC_ROUND r2
 pxor   m1, m1
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] dsputil_mmx: fix incorrect assembly code

2012-07-25 Thread Ronald S. Bultje
Hi,

On Mon, Jul 23, 2012 at 5:30 PM, Derek Buitenhuis
derek.buitenh...@gmail.com wrote:
 From: Yang Wang yang.y.w...@intel.com

 In ff_put_pixels_clamped_mmx(), there are two assembly code blocks.
 In the first block (in the unrolled loop), the instructions
 movq 8%3, %%mm1 \n\t, and so forth, have problems.

 From above instruction, it is clear what the programmer wants: a load from
 p + 8. But this assembly code doesn’t guarantee that. It only works if the
 compiler puts p in a register to produce an instruction like this:
 movq 8(%edi), %mm1. During compiler optimization, it is possible that the
 compiler will be able to constant propagate into p. Suppose p = x[1].
 Then operand 3 can become 1(%edi), where %edi holds x. And the 
 instruction
 becomes movq 81(%edx). That is, it will stride by 81 instead of 8.

 This will cause a segmentation fault.

 This error was fixed in the second block of the assembly code, but not in
 the unrolled loop.

 How to reproduce:
 This error is exposed when we build the ffmpeg using Intel C++ Compiler,
 IPO+PGO optimization. Crashed when decoding an MJPEG video.

 Signed-off-by: Michael Niedermayer michae...@gmx.at
 Signed-off-by: Derek Buitenhuis derek.buitenh...@gmail.com
 ---
  libavcodec/x86/dsputil_mmx.c |   18 +-
  1 file changed, 9 insertions(+), 9 deletions(-)

 diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
 index 5eb4a24..522a565 100644
 --- a/libavcodec/x86/dsputil_mmx.c
 +++ b/libavcodec/x86/dsputil_mmx.c
 @@ -245,14 +245,14 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, 
 uint8_t *pixels,
  pix = pixels;
  /* unrolled loop */
  __asm__ volatile (
 -movq%3, %%mm0  \n\t
 -movq   8%3, %%mm1  \n\t
 -movq  16%3, %%mm2  \n\t
 -movq  24%3, %%mm3  \n\t
 -movq  32%3, %%mm4  \n\t
 -movq  40%3, %%mm5  \n\t
 -movq  48%3, %%mm6  \n\t
 -movq  56%3, %%mm7  \n\t
 +movq  (%3), %%mm0  \n\t
 +movq 8(%3), %%mm1  \n\t
 +movq16(%3), %%mm2  \n\t
 +movq24(%3), %%mm3  \n\t
 +movq32(%3), %%mm4  \n\t
 +movq40(%3), %%mm5  \n\t
 +movq48(%3), %%mm6  \n\t
 +movq56(%3), %%mm7  \n\t
  packuswb %%mm1, %%mm0  \n\t
  packuswb %%mm3, %%mm2  \n\t
  packuswb %%mm5, %%mm4  \n\t
 @@ -262,7 +262,7 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, 
 uint8_t *pixels,
  movq %%mm4, (%0, %1, 2)\n\t
  movq %%mm6, (%0, %2)   \n\t
  :: r(pix), r((x86_reg)line_size), r((x86_reg)line_size * 3),
 -   m(*p)
 +   r(p)
  : memory);
  pix += line_size * 4;
  p   += 32;

OK.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] x86: add support for fmaddps fma4 instruction with abstraction to avx/sse

2012-07-25 Thread Ronald S. Bultje
Hi,

On Tue, Jul 24, 2012 at 2:03 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  configure|5 +
  libavutil/x86/x86inc.asm |   16 +++-
  2 files changed, 16 insertions(+), 5 deletions(-)

OK.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.

2012-07-25 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libswscale/swscale.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 5cfa7f2..0f8ef2b 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -661,7 +661,7 @@ static int swScale(SwsContext *c, const uint8_t *src[],
 if (isPlanar(dstFormat)  isALPHA(dstFormat)  !alpPixBuf)
 fillPlane(dst[3], dstStride[3], dstW, dstY - lastDstY, lastDstY, 255);
 
-#if HAVE_MMX2
+#if HAVE_MMX2 && HAVE_INLINE_ASM
 if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
 __asm__ volatile (sfence ::: memory);
 #endif
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86inc: automatically insert vzeroupper for YMM functions.

2012-07-25 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/dct32_sse.asm|2 --
 libavcodec/x86/dsputil_yasm.asm |   14 --
 libavcodec/x86/fft_mmx.asm  |6 --
 libavresample/x86/audio_convert.asm |   10 --
 libavresample/x86/audio_mix.asm |   10 --
 libavutil/x86/float_dsp.asm |   10 --
 libavutil/x86/x86inc.asm|5 -
 7 files changed, 4 insertions(+), 53 deletions(-)

diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index e3c8a45..351c88d 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -278,8 +278,6 @@ cglobal dct32_float_avx, 2,3,8, out, in, tmp
 vperm2f128  m0, m1, m1, 0x31
 vmovaps [outq+96], m1
 
-vzeroupper
-
 ;pass 6, no SIMD...
 INIT_XMM
 PASS6_AND_PERMUTE
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 077f3a0..70a0aa1 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1158,12 +1158,7 @@ ALIGN 16
 add src1q, 2*mmsize
 sub lenq,  2*mmsize
 jge .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
@@ -1193,12 +1188,7 @@ ALIGN 16
 
 sub lenq,   2*mmsize
 jge .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
@@ -1243,10 +1233,6 @@ cglobal butterflies_float_interleave, 4,4,3, dst, src0, 
src1, len
 %endif
 add   lenq, mmsize
 jl .loop
-%if mmsize == 32
-vzeroupper
-RET
-%endif
 .end:
 REP_RET
 %endmacro
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 1a430b9..527e215 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -749,9 +749,6 @@ section .text
 ; The others pass args in registers and don't spill anything.
 cglobal fft_dispatch%2, 2,5,8, z, nbits
 FFT_DISPATCH fullsuffix, nbits
-%if mmsize == 32
-vzeroupper
-%endif
 RET
 %endmacro ; DECL_FFT
 
@@ -957,9 +954,6 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample 
*output, const FFTSample *i
 %if ARCH_X86_64 == 0
 add esp, 12
 %endif
-%if mmsize == 32
-vzeroupper
-%endif
 RET
 %endmacro
 
diff --git a/libavresample/x86/audio_convert.asm 
b/libavresample/x86/audio_convert.asm
index 7b3cc22..244c4d1 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -145,12 +145,7 @@ cglobal conv_s32_to_flt, 3,3,3, dst, src, len
 mova  [dstq+lenq+mmsize], m2
 add lenq, mmsize*2
 jl .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse2
@@ -218,12 +213,7 @@ cglobal conv_flt_to_s32, 3,3,5, dst, src, len
 mova  [dstq+lenq+3*mmsize], m3
 add lenq, mmsize*4
 jl .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse2
diff --git a/libavresample/x86/audio_mix.asm b/libavresample/x86/audio_mix.asm
index 58a4ded..dbfaa69 100644
--- a/libavresample/x86/audio_mix.asm
+++ b/libavresample/x86/audio_mix.asm
@@ -51,12 +51,7 @@ cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
 addsrcq, mmsize*2
 sublend, mmsize*2/4
 jg .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
@@ -175,12 +170,7 @@ cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, 
src1, matrix1
 add   src0q, mmsize
 sublend, mmsize/4
 jg .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 66ef093..c4e0c66 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -40,12 +40,7 @@ ALIGN 16
 
 sub   lenq, 2*mmsize
 jge   .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
@@ -86,12 +81,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
 mova  [dstq+lenq+mmsize], m2
 sublenq, 2*mmsize
 jge .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 42ba97a..4b523e9 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -369,11 +369,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 120
 %macro RET 0
 WIN64_RESTORE_XMM_INTERNAL rsp
 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+vzeroupper
+%endif
 ret
 %endmacro
 
 %macro REP_RET 0
-%if regs_used > 7 || xmm_regs_used > 6
+%if regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
 RET
 %else
 rep ret
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86/dsputilenc: bury inline asm under HAVE_INLINE_ASM.

2012-07-25 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/dct-test.c   |2 +-
 libavcodec/x86/dsputilenc_mmx.c |   80 +++
 libavcodec/x86/fdct_mmx.c   |4 ++
 libavcodec/x86/motion_est_mmx.c |6 +++
 libavcodec/x86/mpegvideo_mmx.c  |6 +++
 5 files changed, 64 insertions(+), 34 deletions(-)

diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index 5046544..9e19e0c 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -85,7 +85,7 @@ static const struct algo fdct_tab[] = {
 { IJG-AAN-INT,ff_fdct_ifast, SCALE_PERM },
 { IJG-LLM-INT,ff_jpeg_fdct_islow_8,  NO_PERM},
 
-#if HAVE_MMX
+#if HAVE_MMX && HAVE_INLINE_ASM
 { MMX,ff_fdct_mmx,   NO_PERM,   AV_CPU_FLAG_MMX 
},
 { MMX2,   ff_fdct_mmx2,  NO_PERM,   AV_CPU_FLAG_MMX2
},
 { SSE2,   ff_fdct_sse2,  NO_PERM,   AV_CPU_FLAG_SSE2
},
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index 47fa5ca..3cac979 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -30,6 +30,8 @@
 #include dsputil_mmx.h
 
 
+#if HAVE_INLINE_ASM
+
 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int 
line_size)
 {
 __asm__ volatile(
@@ -323,8 +325,6 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * 
pix2, int line_size, int
 return tmp;
 }
 
-int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int 
h);
-
 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
 int tmp;
   __asm__ volatile (
@@ -925,17 +925,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, 
const uint8_t *src1, c
 paddusw #t, #a   \n\t\
 movd #a, #dst\n\t\
 
-#define hadamard_func(cpu) \
-int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
-  int stride, int h); \
-int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
-  int stride, int h);
-
-hadamard_func(mmx)
-hadamard_func(mmx2)
-hadamard_func(sse2)
-hadamard_func(ssse3)
-
 #define DCT_SAD4(m,mm,o)\
 mov#m #o+ 0(%1), #mm2  \n\t\
 mov#m #o+16(%1), #mm3  \n\t\
@@ -1094,10 +1083,26 @@ static int ssd_int8_vs_int16_mmx(const int8_t *pix1, 
const int16_t *pix2, int si
 #undef PHADDD
 #endif //HAVE_SSSE3
 
+#endif /* HAVE_INLINE_ASM */
+
+int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int 
h);
+
+#define hadamard_func(cpu) \
+int ff_hadamard8_diff_##cpu  (void *s, uint8_t *src1, uint8_t *src2, \
+  int stride, int h); \
+int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
+  int stride, int h);
+
+hadamard_func(mmx)
+hadamard_func(mmx2)
+hadamard_func(sse2)
+hadamard_func(ssse3)
 
 void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
 int mm_flags = av_get_cpu_flags();
+
+#if HAVE_INLINE_ASM
 int bit_depth = avctx-bits_per_raw_sample;
 
 if (mm_flags  AV_CPU_FLAG_MMX) {
@@ -1121,11 +1126,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, 
AVCodecContext *avctx)
 c-diff_bytes= diff_bytes_mmx;
 c-sum_abs_dctelem= sum_abs_dctelem_mmx;
 
-#if HAVE_YASM
-c-hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
-c-hadamard8_diff[1]= ff_hadamard8_diff_mmx;
-#endif
-
 c-pix_norm1 = pix_norm1_mmx;
 c-sse[0] = sse16_mmx;
 c-sse[1] = sse8_mmx;
@@ -1146,10 +1146,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, 
AVCodecContext *avctx)
 
 
 if (mm_flags  AV_CPU_FLAG_MMX2) {
-#if HAVE_YASM
-c-hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
-c-hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
-#endif
 c-sum_abs_dctelem= sum_abs_dctelem_mmx2;
 c-vsad[4]= vsad_intra16_mmx2;
 
@@ -1164,13 +1160,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, 
AVCodecContext *avctx)
 if (bit_depth = 8)
 c-get_pixels = get_pixels_sse2;
 c-sum_abs_dctelem= sum_abs_dctelem_sse2;
-#if HAVE_YASM
-c-sse[0] = ff_sse16_sse2;
-#if HAVE_ALIGNED_STACK
-c-hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
-c-hadamard8_diff[1]= ff_hadamard8_diff_sse2;
-#endif
-#endif
 }
 
 #if HAVE_SSSE3
@@ -1180,10 +1169,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, 
AVCodecContext *avctx)
 }
 c-add_8x8basis= add_8x8basis_ssse3;
 c-sum_abs_dctelem= sum_abs_dctelem_ssse3;
-#if HAVE_YASM  HAVE_ALIGNED_STACK
-c-hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
-c-hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
-#endif
 }
 #endif
 
@@ -1194,6 +1179,35 @@ void ff_dsputilenc_init_mmx(DSPContext* c, 
AVCodecContext *avctx)
 c-add_8x8basis= add_8x8basis_3dnow;
 }
 }
+#endif /* HAVE_INLINE_ASM */
+
+#if HAVE_YASM

[libav-devel] [PATCH] mpegaudio: bury inline asm under HAVE_INLINE_ASM.

2012-07-25 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/mpegaudiodec_mmx.c |5 +
 1 file changed, 5 insertions(+)

diff --git a/libavcodec/x86/mpegaudiodec_mmx.c 
b/libavcodec/x86/mpegaudiodec_mmx.c
index f51a06d..88a3477 100644
--- a/libavcodec/x86/mpegaudiodec_mmx.c
+++ b/libavcodec/x86/mpegaudiodec_mmx.c
@@ -36,6 +36,8 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float 
*in, float *win,
 
 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
 
+#if HAVE_INLINE_ASM
+
 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
 
@@ -178,6 +180,7 @@ static void apply_window_mp3(float *in, float *win, int 
*unused, float *out,
 *out = sum;
 }
 
+#endif /* HAVE_INLINE_ASM */
 
 #define DECL_IMDCT_BLOCKS(CPU1, CPU2)   \
 static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,  \
@@ -235,9 +238,11 @@ void ff_mpadsp_init_mmx(MPADSPContext *s)
 }
 }
 
+#if HAVE_INLINE_ASM
 if (mm_flags  AV_CPU_FLAG_SSE2) {
 s-apply_window_float = apply_window_mp3;
 }
+#endif /* HAVE_INLINE_ASM */
 #if HAVE_YASM
 if (mm_flags  AV_CPU_FLAG_AVX  HAVE_AVX) {
 s-imdct36_blocks_float = imdct36_blocks_avx;
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] eval: add workaround for broken strtod() in MSVS.

2012-07-25 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavutil/eval.c |   35 +++
 1 file changed, 35 insertions(+)

diff --git a/libavutil/eval.c b/libavutil/eval.c
index ff3191d..ef37ad8 100644
--- a/libavutil/eval.c
+++ b/libavutil/eval.c
@@ -26,6 +26,7 @@
  * see http://joe.hotchkiss.com/programming/eval/eval.html
  */
 
+#include "avstring.h"
 #include "avutil.h"
 #include "eval.h"
 #include "log.h"
@@ -77,6 +78,40 @@ double av_strtod(const char *numstr, char **tail)
 {
 double d;
 char *next;
+#ifdef _MSC_VER
+/* MSVC does not support hexadecimal input, nor does it understand
+ * strings such as inf[inity] or nan. Support them manually. */
+if (!av_strncasecmp(numstr, "inf", 3)) {
+d = INFINITY;
+next = numstr + 3;
+} else if (!av_strncasecmp(numstr, "infinity", 8)) {
+d = INFINITY;
+next = numstr + 8;
+} else if (!av_strncasecmp(numstr, "+inf", 4)) {
+d = INFINITY;
+next = numstr + 4;
+} else if (!av_strncasecmp(numstr, "+infinity", 9)) {
+d = INFINITY;
+next = numstr + 9;
+} else if (!av_strncasecmp(numstr, "-inf", 4)) {
+d = -INFINITY;
+next = numstr + 4;
+} else if (!av_strncasecmp(numstr, "-infinity", 9)) {
+d = -INFINITY;
+next = numstr + 9;
+} else if (!av_strncasecmp(numstr, "nan", 3)) {
+d = NAN;
+next = numstr + 3;
+} else if (!av_strncasecmp(numstr, "+nan", 4) ||
+   !av_strncasecmp(numstr, "-nan", 4)) {
+d = NAN;
+next = numstr + 4;
+} else if (!av_strncasecmp(numstr, "0x", 2) ||
+   !av_strncasecmp(numstr, "-0x", 3) ||
+   !av_strncasecmp(numstr, "+0x", 3)) {
+d = strtol(numstr, &next, 16);
+} else
+#endif
 d = strtod(numstr, &next);
 /* if parsing succeeded, check for and interpret postfixes */
 if (next!=numstr) {
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] eval: fix printing of NaN in eval fate test.

2012-07-25 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This fixes make fate-eval on MSVC builds. Without this, the test outputs
-1.#NaN instead of nan on MSVS 2010.
---
 libavutil/eval.c |5 +
 1 file changed, 5 insertions(+)

diff --git a/libavutil/eval.c b/libavutil/eval.c
index ef37ad8..6131263 100644
--- a/libavutil/eval.c
+++ b/libavutil/eval.c
@@ -671,6 +671,11 @@ int main(int argc, char **argv)
 av_expr_parse_and_eval(d, *expr,
const_names, const_values,
NULL, NULL, NULL, NULL, NULL, 0, NULL);
+#ifdef _MSC_VER
+if (isnan(d))
+printf("'%s' -> nan\n\n", *expr);
+else
+#endif
 printf("'%s' -> %f\n\n", *expr, d);
 }
 
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86inc: automatically insert vzeroupper for YMM functions.

2012-07-25 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libavcodec/x86/dct32_sse.asm|2 --
 libavcodec/x86/dsputil_yasm.asm |   14 --
 libavcodec/x86/fft_mmx.asm  |6 --
 libavresample/x86/audio_convert.asm |   10 --
 libavresample/x86/audio_mix.asm |   10 --
 libavutil/x86/float_dsp.asm |   10 --
 libavutil/x86/x86inc.asm|   15 ---
 7 files changed, 12 insertions(+), 55 deletions(-)

diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm
index e3c8a45..351c88d 100644
--- a/libavcodec/x86/dct32_sse.asm
+++ b/libavcodec/x86/dct32_sse.asm
@@ -278,8 +278,6 @@ cglobal dct32_float_avx, 2,3,8, out, in, tmp
 vperm2f128  m0, m1, m1, 0x31
 vmovaps [outq+96], m1
 
-vzeroupper
-
 ;pass 6, no SIMD...
 INIT_XMM
 PASS6_AND_PERMUTE
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
index 077f3a0..70a0aa1 100644
--- a/libavcodec/x86/dsputil_yasm.asm
+++ b/libavcodec/x86/dsputil_yasm.asm
@@ -1158,12 +1158,7 @@ ALIGN 16
 add src1q, 2*mmsize
 sub lenq,  2*mmsize
 jge .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
@@ -1193,12 +1188,7 @@ ALIGN 16
 
 sub lenq,   2*mmsize
 jge .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
@@ -1243,10 +1233,6 @@ cglobal butterflies_float_interleave, 4,4,3, dst, src0, 
src1, len
 %endif
 add   lenq, mmsize
 jl .loop
-%if mmsize == 32
-vzeroupper
-RET
-%endif
 .end:
 REP_RET
 %endmacro
diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm
index 1a430b9..527e215 100644
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -749,9 +749,6 @@ section .text
 ; The others pass args in registers and don't spill anything.
 cglobal fft_dispatch%2, 2,5,8, z, nbits
 FFT_DISPATCH fullsuffix, nbits
-%if mmsize == 32
-vzeroupper
-%endif
 RET
 %endmacro ; DECL_FFT
 
@@ -957,9 +954,6 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample 
*output, const FFTSample *i
 %if ARCH_X86_64 == 0
 add esp, 12
 %endif
-%if mmsize == 32
-vzeroupper
-%endif
 RET
 %endmacro
 
diff --git a/libavresample/x86/audio_convert.asm 
b/libavresample/x86/audio_convert.asm
index 7b3cc22..244c4d1 100644
--- a/libavresample/x86/audio_convert.asm
+++ b/libavresample/x86/audio_convert.asm
@@ -145,12 +145,7 @@ cglobal conv_s32_to_flt, 3,3,3, dst, src, len
 mova  [dstq+lenq+mmsize], m2
 add lenq, mmsize*2
 jl .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse2
@@ -218,12 +213,7 @@ cglobal conv_flt_to_s32, 3,3,5, dst, src, len
 mova  [dstq+lenq+3*mmsize], m3
 add lenq, mmsize*4
 jl .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse2
diff --git a/libavresample/x86/audio_mix.asm b/libavresample/x86/audio_mix.asm
index 58a4ded..dbfaa69 100644
--- a/libavresample/x86/audio_mix.asm
+++ b/libavresample/x86/audio_mix.asm
@@ -51,12 +51,7 @@ cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
 addsrcq, mmsize*2
 sublend, mmsize*2/4
 jg .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
@@ -175,12 +170,7 @@ cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, 
src1, matrix1
 add   src0q, mmsize
 sublend, mmsize/4
 jg .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm
index 66ef093..c4e0c66 100644
--- a/libavutil/x86/float_dsp.asm
+++ b/libavutil/x86/float_dsp.asm
@@ -40,12 +40,7 @@ ALIGN 16
 
 sub   lenq, 2*mmsize
 jge   .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
@@ -86,12 +81,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
 mova  [dstq+lenq+mmsize], m2
 sublenq, 2*mmsize
 jge .loop
-%if mmsize == 32
-vzeroupper
-RET
-%else
 REP_RET
-%endif
 %endmacro
 
 INIT_XMM sse
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index 42ba97a..b76a10c 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -369,11 +369,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 120
 %macro RET 0
 WIN64_RESTORE_XMM_INTERNAL rsp
 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+vzeroupper
+%endif
 ret
 %endmacro
 
 %macro REP_RET 0
-%if regs_used > 7 || xmm_regs_used > 6
+%if regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
 RET
 %else
 rep ret
@@ -410,11 +413,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 72
 
 %macro RET 0
 POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32
+vzeroupper
+%endif
 ret
 %endmacro

[libav-devel] [PATCH] avprobe/avconv: fix tentative declaration compile errors on MSVS.

2012-07-25 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 avconv.c  |5 +++--
 avprobe.c |5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/avconv.c b/avconv.c
index 7142ab4..439672a 100644
--- a/avconv.c
+++ b/avconv.c
@@ -104,7 +104,7 @@ typedef struct MetadataMap {
 int  index; /// stream/chapter/program number
 } MetadataMap;
 
-static const OptionDef options[];
+static const OptionDef *options;
 
 static int video_discard = 0;
 static int same_quant = 0;
@@ -4858,7 +4858,7 @@ static int opt_filter_complex(const char *opt, const char 
*arg)
 }
 
 #define OFFSET(x) offsetof(OptionsContext, x)
-static const OptionDef options[] = {
+static const OptionDef real_options[] = {
 /* main options */
 #include cmdutils_common_opts.h
 { f, HAS_ARG | OPT_STRING | OPT_OFFSET, {.off = OFFSET(format)}, force 
format, fmt },
@@ -4975,6 +4975,7 @@ int main(int argc, char **argv)
 OptionsContext o = { 0 };
 int64_t ti;
 
+options = real_options;
 reset_options(o);
 
 av_log_set_flags(AV_LOG_SKIP_REPEATED);
diff --git a/avprobe.c b/avprobe.c
index 8e93d05..aa7dae4 100644
--- a/avprobe.c
+++ b/avprobe.c
@@ -44,7 +44,7 @@ static int use_byte_value_binary_prefix = 0;
 static int use_value_sexagesimal_format = 0;
 
 /* globals */
-static const OptionDef options[];
+static const OptionDef *options;
 
 /* AVprobe context */
 static const char *input_filename;
@@ -887,7 +887,7 @@ static void opt_pretty(void)
 use_value_sexagesimal_format = 1;
 }
 
-static const OptionDef options[] = {
+static const OptionDef real_options[] = {
 #include cmdutils_common_opts.h
 { f, HAS_ARG, {(void*)opt_format}, force format, format },
 { of, HAS_ARG, {(void*)opt_output_format}, output the document either 
as ini or json, output_format },
@@ -927,6 +927,7 @@ int main(int argc, char **argv)
 if (!buffer)
 exit(1);
 
+options = real_options;
 parse_loglevel(argc, argv, options);
 av_register_all();
 avformat_network_init();
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H264: Remove 3dnow qpel code.

2012-07-24 Thread Ronald S. Bultje
Hi,

On Tue, Jul 24, 2012 at 7:25 AM, Luca Barbato lu_z...@gentoo.org wrote:
 On 7/24/12 4:45 AM, Ronald S. Bultje wrote:

 Hi,

 On Mon, Jul 23, 2012 at 7:45 PM, Ronald S. Bultje rsbul...@gmail.com
 wrote:

 Hi,

 On Mon, Jul 23, 2012 at 5:37 PM, Daniel Kang daniel.d.k...@gmail.com
 wrote:

 On Mon, Jul 23, 2012 at 5:21 PM, Diego Biurrun di...@biurrun.de wrote:


 On Mon, Jul 23, 2012 at 05:12:23PM -0700, Daniel Kang wrote:

 From: Daniel Kang daniel.d.k...@gmail.com

 The only CPUs that have 3dnow and don't have mmxext are 12 years old.
 ---
   libavcodec/x86/dsputil_mmx.c   |9 -
   libavcodec/x86/h264_qpel_mmx.c |4 
   2 files changed, 0 insertions(+), 13 deletions(-)


 What sort of maintenance burden does this relieve us from?
 I'm writing this mail on a system fitting the description
 you mention, my trusty old K6-III.



 [..]

 4. You can probably decode 260p H.264 with a K6-III. Who seriously would
 use
 this?


 This really is the killer. Is there any sort of reasonable expectation
 that a k6-3 can get useful work done when it comes to H264 decoding? I
 wouldn't even mind dropping all MMX optimizations (where MMX2 - i.e.
 SSE - or higher exists) altogether, i.e. going the H264 way


 .. x264 way. :).


 Let's discuss a bit or put a news item, the pros is to having a leaner
 system, the cons is cutting dry systems that might work fine for special
 purposes now.

 If we can have some compelling improvement (e.g. yasm) why not?

The alternatively that I have suggested to Daniel is to keep the old
inline asm code for 3dnow only. The disadvantage of that is that we
keep pretty much all code around, just for 3dnow alone, and duplicate
it in yasm form for all other optimization types. So it's practically
possible, but at a high cost (+ that since it's duplicated, it'll be
orphaned and unmaintained; any improvements to the new qpel code will
not hit the 3dnow optimizations).

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] Replace x*155/100 by (x*101581)>>16.

2012-07-24 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

Idea stolen from webp (by Pascal Massimino) - because it's Cool.
---
 libavcodec/vp8.c |   12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c
index d0e2a0c..e4cfbcb 100644
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -249,12 +249,12 @@ static void get_quants(VP8Context *s)
 } else
 base_qi = yac_qi;
 
-s-qmat[i].luma_qmul[0]=   
vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
-s-qmat[i].luma_qmul[1]=   
vp8_ac_qlookup[av_clip_uintp2(base_qi , 7)];
-s-qmat[i].luma_dc_qmul[0] =   2 * 
vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
-s-qmat[i].luma_dc_qmul[1] = 155 * 
vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
-s-qmat[i].chroma_qmul[0]  =   
vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
-s-qmat[i].chroma_qmul[1]  =   
vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
+s-qmat[i].luma_qmul[0]=   
vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
+s-qmat[i].luma_qmul[1]=   
vp8_ac_qlookup[av_clip_uintp2(base_qi , 7)];
+s-qmat[i].luma_dc_qmul[0] =   2 * 
vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
+s->qmat[i].luma_dc_qmul[1] = (101581 * 
vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
+s-qmat[i].chroma_qmul[0]  =   
vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
+s-qmat[i].chroma_qmul[1]  =   
vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 
 s-qmat[i].luma_dc_qmul[1] = FFMAX(s-qmat[i].luma_dc_qmul[1], 8);
 s-qmat[i].chroma_qmul[0]  = FFMIN(s-qmat[i].chroma_qmul[0], 132);
-- 
1.7.9.2

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [libav-commits] x86: fft: convert sse inline asm to yasm

2012-07-24 Thread Ronald S. Bultje
Hi,

On Tue, Jul 24, 2012 at 3:05 PM, Jason Garrett-Glaser ja...@x264.com wrote:
 On Tue, Jul 24, 2012 at 9:02 AM, John Stebbins stebb...@jetheaddev.com 
 wrote:
 On 07/24/2012 05:53 PM, Jason Garrett-Glaser wrote:

 On Tue, Jul 24, 2012 at 8:34 AM, Måns Rullgård m...@mansr.com wrote:

 Jason Garrett-Glaser ja...@x264.com writes:

 On Tue, Jul 24, 2012 at 8:05 AM, John Stebbins stebb...@jetheaddev.com
 wrote:

 On 06/25/2012 02:42 PM, Mans Rullgard wrote:

 Module: libav
 Branch: master
 Commit: 82992604706144910f4a2f875d48cfc66c1b70d7

 Author:Mans Rullgard m...@mansr.com
 Committer: Mans Rullgard m...@mansr.com
 Date:  Sat Jun 23 19:08:11 2012 +0100

 x86: fft: convert sse inline asm to yasm

 ---

libavcodec/x86/Makefile|1 -
libavcodec/x86/fft_mmx.asm |  139
 ---
libavcodec/x86/fft_sse.c   |  110
 --
3 files changed, 129 insertions(+), 121 deletions(-)

 Hi,

 This commit is causing some strange interaction with libx264 in
 HandBrake
 under certain conditions.  x264 is encoding at about 1/10th it's normal
 rate
 after updating to this commit.

 A little more background.  When doing ac3 passthru HandBrake encodes a
 single packet of silence data to ac3 that is uses for filling any gaps
 that
 it detects in the audio.  Encoding of this packet happens before any
 other
 encoding or decoding starts. For some crazy reason, if we encode this
 silence, we get the x264 slowdown.  If we do not encode the silence,
 the
 speed is ok.  I ran gprof on the code to see where all the time is
 being
 spent and it is all in x264.  So it's not like there is some run-away
 loop
 somewhere that is bringing everything to it's knees.  I'm guessing some
 cpu
 state must not be getting cleared or restored properly somewhere.

 John

 Could it have anything to do with denormals/NaN?

 Does x264 use floating-point SSE instructions anywhere?

 Yes, in macroblock-tree (because floating-point reciprocal is fast and
 IDIV is slow), and in ratecontrol.



 I don't know if it is of any help, but here's the top entries from gprof
 when this slowdown is happening.
 x264 defaults + b-adapt=2

 Each sample counts as 0.01 seconds.
   %   cumulative   self  self total
  time   seconds   secondscalls  ms/call  ms/call  name
  19.56 26.7126.71 x264_pixel_satd_16x4_internal_avx
  17.85 51.0824.37 x264_pixel_satd_8x8_internal_avx
  10.22 65.0313.95 x264_sub8x8_dct_avx.skip_prologue
   9.11 77.4712.44 x264_hadamard_ac_8x8_avx
   9.08 89.8712.40 x264_intra_sa8d_x9_8x8_avx
   5.08 96.81 6.94 x264_sub8x8_dct8_avx.skip_prologue
   2.96100.85 4.04 x264_pixel_satd_4x4_avx
   2.45104.20 3.35 x264_intra_satd_x9_4x4_avx
   1.80106.66 2.46 x264_mc_chroma_avx
   1.58108.82 2.16 x264_hpel_filter_avx
   1.46110.81 1.99 x264_pixel_ssim_4x4x2_core_avx
   1.21112.46 1.65 x264_add8x8_idct_avx.skip_prologue
   1.09113.95 1.49 x264_pixel_ssd_16x16_avx
   1.09115.44 1.49 x264_me_search_ref
   1.02116.83 1.39 x264_add8x8_idct8_avx.skip_prologue

 According to top, all CPUs are fully saturated

 That's an incredibly distorted profile -- it looks like all the AVX
 functions are running incredibly slowly.

 Note that all those functions do not use 256-bit AVX, only 128-bit
 AVX; Intel hasn't documented any sort of slowdown when mixing 128-bit
 SSE and 128-bit AVX, which we do without problems.

 Could the problem be that ffmpeg is doing 256-bit AVX, but then not
 using vzeroupper afterwards?  Which CPU is this anyways?

Do the x264 functions sign-extend all their integer arguments? Or put
differently, does the problem occur for 32-bit builds also, or only
for 64-bit builds?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] lavr: x86: improve non-SSE4 version of S16_TO_S32_SX macro

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 10:33 AM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 On 06/26/2012 04:55 PM, Justin Ruggles wrote:
 Removes a false dependency on existing contents of the 2nd dst register,
 giving better performance for OOE.
 ---
  libavresample/x86/util.asm |3 ++-
  1 files changed, 2 insertions(+), 1 deletions(-)

 diff --git a/libavresample/x86/util.asm b/libavresample/x86/util.asm
 index 501f662..ca7fde5 100644
 --- a/libavresample/x86/util.asm
 +++ b/libavresample/x86/util.asm
 @@ -26,7 +26,8 @@
  pmovsxwd m%1, m%1
  SWAP %1, %2
  %else
 -punpckhwdm%2, m%1
 +mova m%2, m%1
 +punpckhwdm%2, m%2
  punpcklwdm%1, m%1
  psradm%2, 16
  psradm%1, 16

 ping.

Ok.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 01/15] lavr: x86: optimized 2-channel s16p to s16 conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   36 
 
  libavresample/x86/audio_convert_init.c |   13 +++
  2 files changed, 49 insertions(+), 0 deletions(-)

 diff --git a/libavresample/x86/audio_convert.asm 
 b/libavresample/x86/audio_convert.asm
 index 7b3cc22..0ca562a 100644
 --- a/libavresample/x86/audio_convert.asm
 +++ b/libavresample/x86/audio_convert.asm
 @@ -233,6 +233,42 @@ INIT_YMM avx
  CONV_FLT_TO_S32
  %endif

 +;--
 +; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len,
 +;  int channels);
 +;--
 +
 +%macro CONV_S16P_TO_S16_2CH 0
 +cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src, len, src1
 +mov   src1q, [srcq+gprsize]
 +movsrcq, [srcq]
 +sub   src1q, srcq
 +ALIGN 16
 +.loop
 +mova m0, [srcq ]
 +mova m1, [srcq+src1q   ]
 +mova m2, [srcq  +mmsize]
 +mova m3, [srcq+src1q+mmsize]
 +SBUTTERFLY2  wd, 0, 1, 4
 +SBUTTERFLY2  wd, 2, 3, 4
 +mova  [dstq ], m0
 +mova  [dstq+1*mmsize], m1
 +mova  [dstq+2*mmsize], m2
 +mova  [dstq+3*mmsize], m3
 +addsrcq, mmsize*2
 +adddstq, mmsize*4
 +sublend, mmsize
 +jg .loop
 +REP_RET
 +%endmacro

I'm bored, so... Does:

sub dstq, srcq
sub dstq, srcq

in the init code, and then:

mova [dstq+srcq*2+0*mmsize], m0
mova [dstq+srcq*2+1*mmsize], m1
mova [dstq+srcq*2+2*mmsize], m2
mova [dstq+srcq*2+3*mmsize], m3
add srcq, mmsize*2
sub lend, mmsize*2
jg .loop

Lead to faster execution? If not, you can also add dstq, lenq (after
len sign extend on x86-64), and then neg lenq and use dstq+lenq*2
instead. In both cases, the goal is to get rid of the 2xadd+sub in the
inner loop.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 02/15] lavr: x86: optimized 6-channel s16p to s16 conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   62 
 
  libavresample/x86/audio_convert_init.c |9 +
  2 files changed, 71 insertions(+), 0 deletions(-)

 diff --git a/libavresample/x86/audio_convert.asm 
 b/libavresample/x86/audio_convert.asm
 index 0ca562a..fdcea3a 100644
 --- a/libavresample/x86/audio_convert.asm
 +++ b/libavresample/x86/audio_convert.asm
 @@ -269,6 +269,68 @@ INIT_XMM avx
  CONV_S16P_TO_S16_2CH
  %endif

 +;--
 +; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len,
 +;  int channels);
 +;--
 +
 +%macro CONV_S16P_TO_S16_6CH 0
 +cglobal conv_s16p_to_s16_6ch, 2,8,6, dst, src, src1, src2, src3, src4, src5, 
 len
 +%if ARCH_X86_64
 +mov lend, r2d
 +%else
 +%define lend dword r2m
 +%endif

Eehw, just do:

%if ARCH_X86_64
cglobal ..., 3, 8, 6, dst, src, len, src1, src2, ..
%else
.. what you do up there ..
%endif

 +movq   [dstq   ], m1
 +movq   [dstq+ 8], m0
 +movq   [dstq+16], m2
 +movhps [dstq+24], m1
 +movhps [dstq+32], m0
 +movhps [dstq+40], m2
 +add  srcq, mmsize/2
 +add  dstq, mmsize*3
 +sub  lend, mmsize/4
 +jg .loop
 +REP_RET
 +%endmacro

Here, too, I think you can use imul lenq, 6, then add that to dstq,
neg it and index dstq as [dstq+lenq+0/8/16/..]. Then add lend,
mmsize/4 instead of sub, and jl instead of jg, and you can remove the
add dstq, mmsize*3 from the inner loop.

Does unrolling this by another factor of 2 (and thus being able to use
aligned loads/stores) make a performance difference?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 03/15] lavr: x86: optimized 2-channel s16p to flt conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   49 
 
  libavresample/x86/audio_convert_init.c |9 ++
  2 files changed, 58 insertions(+), 0 deletions(-)

LGTM.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 04/15] lavr: x86: optimized 6-channel s16p to flt conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 21, 2012 at 12:12 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 +%if cpuflag(ssse3)
 +pshufb m3, m0, unpack_odd   ; m3 =  12, 13, 14, 15
 +pshufb m0, unpack_even  ; m0 =   0,  1,  2,  3
 +pshufb m4, m1, unpack_odd   ; m4 =  16, 17, 18, 19
 +pshufb m1, unpack_even  ; m1 =   4,  5,  6,  7
 +pshufb m5, m2, unpack_odd   ; m5 =  20, 21, 22, 23
 +pshufb m2, unpack_even  ; m2 =   8,  9, 10, 11
 +%else

I'm going to assume you tested vpperm and it was not faster?

 +mova  [dstq ], m0
 +mova  [dstq+  mmsize], m1
 +mova  [dstq+2*mmsize], m2
 +mova  [dstq+3*mmsize], m3
 +mova  [dstq+4*mmsize], m4
 +mova  [dstq+5*mmsize], m5
 +add  srcq, mmsize/2
 +add  dstq, mmsize*6
 +sub  lend, mmsize/4

Can you try the pointer munging trick here too (i.e. sign-extend lend;
imul lenq, x; add dstq, lenq; neg lenq) so add dstq, mmsize*6 and sub
lend, mmsize/4 can be merged and we can remove one from the inner
loop?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 05/15] lavr: x86: optimized 2-channel fltp to s16 conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   37 
 
  libavresample/x86/audio_convert_init.c |9 +++
  2 files changed, 46 insertions(+), 0 deletions(-)

 diff --git a/libavresample/x86/audio_convert.asm 
 b/libavresample/x86/audio_convert.asm
 index ba6cb60..b241542 100644
 --- a/libavresample/x86/audio_convert.asm
 +++ b/libavresample/x86/audio_convert.asm
 @@ -463,6 +463,43 @@ INIT_XMM avx
  CONV_S16P_TO_FLT_6CH
  %endif

 +;--
 +; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len,
 +;  int channels);
 +;--
 +
 +%macro CONV_FLTP_TO_S16_2CH 0
 +cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1
 +lea  lenq, [4*lend]
 +mov src1q, [src0q+gprsize]
 +mov src0q, [src0q]
 +add  dstq, lenq
 +add src0q, lenq
 +add src1q, lenq
 +neg  lenq
 +mova   m2, [pf_s16_scale]
 +ALIGN 16
 +.loop:
 +mulps  m0, m2, [src0q+lenq]
 +mulps  m1, m2, [src1q+lenq]
 +cvtps2dq   m0, m0
 +cvtps2dq   m1, m1
 +packssdw   m0, m1
 +movhlpsm1, m0
 +punpcklwd  m0, m1

You should be able to get slightly better performance (because of
smaller dependency chain) by using:

packssdw m0, m0
packssdw m1, m1
punpcklwd m0, m1

Please modify it to use that if faster. Otherwise OK.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 06/15] lavr: x86: optimized 6-channel fltp to s16 conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 +%else ; sse
 +mova  xmm0, [srcq  ]
 +mova  xmm1, [srcq+src1q]
 +mova  xmm2, [srcq+src2q]
 +mova  xmm3, [srcq+src3q]
 +mova  xmm4, [srcq+src4q]
 +mova  xmm5, [srcq+src5q]
 +mulps xmm0, xmm6
 +mulps xmm1, xmm6
 +mulps xmm2, xmm6
 +mulps xmm3, xmm6
 +mulps xmm4, xmm6
 +mulps xmm5, xmm6
 +cvtps2pim0, xmm0
 +cvtps2pim1, xmm1
 +cvtps2pim2, xmm2
 +cvtps2pim3, xmm3
 +cvtps2pim4, xmm4
 +cvtps2pim5, xmm5
 +packssdwm0, m3  ; m0 =  0,  6,  3,  9
 +packssdwm1, m4  ; m1 =  1,  7,  4, 10
 +packssdwm2, m5  ; m2 =  2,  8,  5, 11
 +; unpack words
 +pshufw  m3, m0, q1032   ; m3 =  3,  9,  0,  6
 +punpcklwd   m0, m1  ; m0 =  0,  1,  6,  7
 +punpckhwd   m1, m2  ; m1 =  4,  5, 10, 11
 +punpcklwd   m2, m3  ; m2 =  2,  3,  8,  9
 +; unpack dwords
 +pshufw  m3, m0, q1032   ; m3 =  6,  7,  0,  1
 +punpckldq   m0, m2  ; m0 =  0,  1,  2,  3 (final)
 +punpckhdq   m2, m1  ; m2 =  8,  9, 10, 11 (final)
 +punpckldq   m1, m3  ; m1 =  4,  5,  6,  7 (final)
 +mova [dstq   ], m0
 +mova [dstq+ 8], m1
 +mova [dstq+16], m2

I'd agree with Loren that the use of sse as a function name, but
having mX refer to mmx registers, is somewhat confusing. I guess
it's OK since it's obvious from the code what is intended, just wanted
to note that it's confusing.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 07/15] lavr: x86: optimized 2-channel fltp to flt conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 +mova  [dstq ], m0
 +mova  [dstq+1*mmsize], m1
 +mova  [dstq+2*mmsize], m2
 +mova  [dstq+3*mmsize], m3
 +add   srcq, mmsize*2
 +add   dstq, mmsize*4
 +sub   lend, mmsize/2

You can probably munge pointers such that one add suffices.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 08/15] lavr: x86: optimized 2-channel s16 to s16p conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   38 
 
  libavresample/x86/audio_convert_init.c |   11 +
  2 files changed, 49 insertions(+), 0 deletions(-)

 diff --git a/libavresample/x86/audio_convert.asm 
 b/libavresample/x86/audio_convert.asm
 index 9ba7251..70519e1 100644
 --- a/libavresample/x86/audio_convert.asm
 +++ b/libavresample/x86/audio_convert.asm
 @@ -734,3 +734,41 @@ CONV_FLTP_TO_FLT_6CH
  INIT_XMM avx
  CONV_FLTP_TO_FLT_6CH
  %endif
 +
 +;--
 +; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
 +;  int channels);
 +;--
 +
 +%macro CONV_S16_TO_S16P_2CH 0
 +cglobal conv_s16_to_s16p_2ch, 3,4,3, dst0, src, len, dst1
 +lea   lenq, [2*lend]
 +mov  dst1q, [dst0q+gprsize]
 +mov  dst0q, [dst0q]
 +lea   srcq, [srcq+2*lenq]
 +add  dst0q, lenq
 +add  dst1q, lenq
 +neg   lenq
 +ALIGN 16
 +.loop:
 +movam0, [srcq+2*lenq   ]
 +movam1, [srcq+2*lenq+mmsize]
 +pshuflw m0, m0, q3120
 +pshufhw m0, m0, q3120
 +pshuflw m1, m1, q3120
 +pshufhw m1, m1, q3120
 +shufps  m2, m0, m1, q2020
 +shufps  m0, m1, q3131
 +mova  [dst0q+lenq], m2
 +mova  [dst1q+lenq], m0

The more common way to do this (I believe) is to set up mask reg:

pcmpeqb m4, m4
psrlw m4, 8 ; 0x00ff

Then mask/shift:

mova m0, [srcq+2*lenq+0*mmsize]
mova m1, [srcq+2*lenq+1*mmsize]
psrlw m2, m0, 8
psrlw m3, m1, 8
pand m0, m4
pand m1, m4
packsswb m0, m1
packsswb m2, m3
mova [dst1q+lenq], m0
mova [dst2q+lenq], m2

However, that's not less instructions, maybe worth checking anyway.

Alternatively, a pshufb version:

mova m3, [pb_02468ace13579bdf]
.loop:
mova m0, [srcq+2*lenq+0*mmsize]
mova m1, [srcq+2*lenq+1*mmsize]
pshufb m0, m3
pshufb m1, m3
punpcklqdq m2, m0, m1
punpckhqdq m0, m1
mova [dst1q+lenq], m2
mova [dst2q+lenq], m0

2 instructions less, and only 2 unpacks as opposed to all the
shuffles, so potentially faster (except on Atom where pshufb is
dog-slow).

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 08/15] lavr: x86: optimized 2-channel s16 to s16p conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Tue, Jul 24, 2012 at 9:41 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 Hi,

 On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
 justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   38 
 
  libavresample/x86/audio_convert_init.c |   11 +
  2 files changed, 49 insertions(+), 0 deletions(-)

 diff --git a/libavresample/x86/audio_convert.asm 
 b/libavresample/x86/audio_convert.asm
 index 9ba7251..70519e1 100644
 --- a/libavresample/x86/audio_convert.asm
 +++ b/libavresample/x86/audio_convert.asm
 @@ -734,3 +734,41 @@ CONV_FLTP_TO_FLT_6CH
  INIT_XMM avx
  CONV_FLTP_TO_FLT_6CH
  %endif
 +
 +;--
 +; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len,
 +;  int channels);
 +;--
 +
 +%macro CONV_S16_TO_S16P_2CH 0
 +cglobal conv_s16_to_s16p_2ch, 3,4,3, dst0, src, len, dst1
 +lea   lenq, [2*lend]
 +mov  dst1q, [dst0q+gprsize]
 +mov  dst0q, [dst0q]
 +lea   srcq, [srcq+2*lenq]
 +add  dst0q, lenq
 +add  dst1q, lenq
 +neg   lenq
 +ALIGN 16
 +.loop:
 +movam0, [srcq+2*lenq   ]
 +movam1, [srcq+2*lenq+mmsize]
 +pshuflw m0, m0, q3120
 +pshufhw m0, m0, q3120
 +pshuflw m1, m1, q3120
 +pshufhw m1, m1, q3120
 +shufps  m2, m0, m1, q2020
 +shufps  m0, m1, q3131
 +mova  [dst0q+lenq], m2
 +mova  [dst1q+lenq], m0

 The more common way to do this (I believe) is to set up mask reg:

 pcmpeqb m4, m4
 psrlw m4, 8 ; 0x00ff

 Then mask/shift:

 mova m0, [srcq+2*lenq+0*mmsize]
 mova m1, [srcq+2*lenq+1*mmsize]
 psrlw m2, m0, 8
 psrlw m3, m1, 8
 pand m0, m4
 pand m1, m4
 packsswb m0, m1
 packsswb m2, m3
 mova [dst1q+lenq], m0
 mova [dst2q+lenq], m2

 However, that's not less instructions, maybe worth checking anyway.

 Alternatively, a pshufb version:

 mova m3, [pb_02468ace13579bdf]
 .loop:
 mova m0, [srcq+2*lenq+0*mmsize]
 mova m1, [srcq+2*lenq+1*mmsize]
 pshufb m0, m3
 pshufb m1, m3
 punpcklqdq m2, m0, m1
 punpckhqdq m0, m1
 mova [dst1q+lenq], m2
 mova [dst2q+lenq], m0

 2 instructions less, and only 2 unpacks as opposed to all the
 shuffles, so potentially faster (except on Atom where pshufb is
 dog-slow).

Actually that's all byte-based, but I guess it's obvious what I mean
so should be easy to convert to word-speak.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 09/15] lavr: x86: optimized 6-channel s16 to s16p conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 +mova   m0, [srcq ]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
 +mova   m2, [srcq+2*mmsize]  ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
 +movq   m3, [srcq+  mmsize+mmsize/2]
 +movlhpsm3, m2   ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
 +movhpd m1, [srcq+  mmsize]
 +movhlpsm1, m0   ; m1 =  4,  5,  6,  7,  8,  9, 10, 11
 +psrldq m1, 4; m1 =  6,  7,  8,  9, 10, 11,  x,  x
 +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23,  x,  x
 +psrldq m1, 4; m1 =  6,  7,  8,  9, 10, 11,  x,  x
 +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23,  x,  x

That's ... weird (at least for the AVX version):

mova m0, [srcq+0*mmsize] ; 0-7
mova m2, [srcq+1*mmsize] ; 8-15
mova m3, [srcq+2*mmsize] ; 16-23
palignr m1, m0, m2, 12 ; 6-11
palignr m2, m3, 8 ; 12-17
psrldq m3, 4 ; 18-23

 +punpcklwd  m4, m0, m1   ; m4 =  0,  6,  1,  7,  2,  8,  3,  9
 +punpckhwd  m0, m1   ; m0 =  4, 10,  5, 11,  x,  x,  x,  x
 +punpcklwd  m1, m3, m2   ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
 +punpckhwd  m3, m2   ; m3 = 16, 22, 17, 23,  x,  x,  x,  x
 +punpckldq  m2, m4, m1   ; m2 =  0,  6, 12, 18,  1,  7, 13, 19
 +punpckhdq  m4, m1   ; m4 =  2,  8, 14, 20,  3,  9, 15, 21
 +punpckldq  m0, m3   ; m0 =  4, 10, 16, 22,  5, 11, 17, 23
 +movq[dstq  ], m2
 +movhps  [dstq+dst1q], m2
 +movq[dstq+dst2q], m4
 +movhps  [dstq+dst3q], m4
 +movq[dstq+dst4q], m0
 +movhps  [dstq+dst5q], m0
 +add  srcq, mmsize*3
 +add  dstq, mmsize/2
 +sub  lend, mmsize/4

Pointer munging should allow getting rid of one sub/add.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 10/15] lavr: x86: optimized 2-channel s16 to fltp conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   41 
 
  libavresample/x86/audio_convert_init.c |   13 ++
  2 files changed, 54 insertions(+), 0 deletions(-)

LGTM.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 11/15] lavr: x86: optimized 6-channel s16 to fltp conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 +mova   m0, [srcq ]  ; m0 =  0,  1,  2,  3,  4,  5,  6,  7
 +mova   m1, [srcq+  mmsize]  ; m1 =  8,  9, 10, 11, 12, 13, 14, 15
 +mova   m2, [srcq+2*mmsize]  ; m2 = 16, 17, 18, 19, 20, 21, 22, 23
 +movhlpsm3, m1
 +movlhpsm3, m2   ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
 +movlhpsm1, m1
 +movhlpsm1, m0   ; m1 =  4,  5,  6,  7,  8,  9, 10, 11
 +psrldq m1, 4; m1 =  6,  7,  8,  9, 10, 11,  x,  x
 +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23,  x,  x

See 10/15, should be able to do this using palignr x2+psrldqx1 instead.

 +punpcklwd  m4, m0, m1   ; m4 =  0,  6,  1,  7,  2,  8,  3,  9
 +punpckhwd  m0, m1   ; m0 =  4, 10,  5, 11,  x,  x,  x,  x
 +punpcklwd  m1, m3, m2   ; m1 = 12, 18, 13, 19, 14, 20, 15, 21
 +punpckhwd  m3, m2   ; m3 = 16, 22, 17, 23,  x,  x,  x,  x
 +punpckldq  m2, m4, m1   ; m2 =  0,  6, 12, 18,  1,  7, 13, 19
 +punpckhdq  m4, m1   ; m4 =  2,  8, 14, 20,  3,  9, 15, 21
 +punpckldq  m0, m3   ; m0 =  4, 10, 16, 22,  5, 11, 17, 23
 +movhlpsm3, m2   ; m3 =  1,  7, 13, 19,  x,  x,  x,  x
 +movhlpsm5, m4   ; m5 =  3,  9, 15, 21,  x,  x,  x,  x
 +movhlpsm1, m0   ; m1 =  5, 11, 17, 23,  x,  x,  x,  x
 +PMOVSXWD   m0, m0
 +PMOVSXWD   m1, m1
 +PMOVSXWD   m2, m2
 +PMOVSXWD   m3, m3
 +PMOVSXWD   m4, m4
 +PMOVSXWD   m5, m5
 +cvtdq2ps   m0, m0
 +cvtdq2ps   m1, m1
 +cvtdq2ps   m2, m2
 +cvtdq2ps   m3, m3
 +cvtdq2ps   m4, m4
 +cvtdq2ps   m5, m5
 +mulps  m0, m6
 +mulps  m1, m6
 +mulps  m2, m6
 +mulps  m3, m6
 +mulps  m4, m6
 +mulps  m5, m6
 +mova  [dstq  ], m2
 +mova  [dstq+dst1q], m3
 +mova  [dstq+dst2q], m4
 +mova  [dstq+dst3q], m5
 +mova  [dstq+dst4q], m0
 +mova  [dstq+dst5q], m1
 +add  srcq, mmsize*3
 +add  dstq, mmsize
 +sub  lend, mmsize/4

Pointer munging allows to remove one add/sub.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 12/15] lavr: x86: optimized 2-channel flt to s16p conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   49 
 
  libavresample/x86/audio_convert_init.c |9 ++
  2 files changed, 58 insertions(+), 0 deletions(-)

LGTM.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 13/15] lavr: x86: optimized 6-channel flt to s16p conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 +movhlpsm3, m1
 +movlhpsm3, m2   ; m3 = 12, 13, 14, 15, 16, 17, 18, 19
 +movlhpsm1, m1
 +movhlpsm1, m0   ; m1 =  4,  5,  6,  7,  8,  9, 10, 11
 +psrldq m1, 4; m1 =  6,  7,  8,  9, 10, 11,  x,  x
 +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23,  x,  x

palignrx2+psrldqx1, saves 3 instructions.

 +add  srcq, mmsize*6
 +add  dstq, mmsize/2
 +sub  lend, mmsize/4

Pointer munging to remove one add/sub.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 14/15] lavr: x86: optimized 2-channel flt to fltp conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   34 
 
  libavresample/x86/audio_convert_init.c |9 
  2 files changed, 43 insertions(+), 0 deletions(-)

OK.

(Can this be implemented in YMM with the current instructions available?)

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 15/15] lavr: x86: optimized 6-channel flt to fltp conversion

2012-07-24 Thread Ronald S. Bultje
Hi,

On Tue, Jul 17, 2012 at 6:16 AM, Justin Ruggles
justin.rugg...@gmail.com wrote:
 ---
  libavresample/x86/audio_convert.asm|   63 
 
  libavresample/x86/audio_convert_init.c |9 +
  2 files changed, 72 insertions(+), 0 deletions(-)

(I'm going to assume Loren had no further comments) LGTM.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/7] vf_hqdn3d: simplify and optimize

2012-07-24 Thread Ronald S. Bultje
Hi,

On Tue, Jul 24, 2012 at 7:45 PM, Loren Merritt lor...@u.washington.edu wrote:
 -long x, y;
 -uint32_t pixel;
 +uint32_t tmp;

 -for (y = 0; y < h; y++) {
 -for (x = 0; x < w; x++) {
 -pixel = lowpass(frame_ant[x]<<8, src[x]<<16, temporal);
 -frame_ant[x] = ((pixel+0x107F)>>8);
 -dst[x]= ((pixel+0x10007FFF)>>16);
 +for (long y = 0; y < h; y++) {
 +for (long x = 0; x < w; x++) {

Unfortunately, this won't compile on MSVC, please do declare the
variables outside the loop.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] vp3: don't use calls to inline asm in yasm code.

2012-07-24 Thread Ronald S. Bultje
Hi,

On Mon, Jul 23, 2012 at 7:12 AM, Ronald S. Bultje rsbul...@gmail.com wrote:
 On Sun, Jul 22, 2012 at 2:38 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 Mixing yasm and inline asm is a bad idea, since if either yasm or inline
 asm is not supported by your toolchain, all of the asm stops working.
 Thus, better to use either one or the other alone.
 ---
  libavcodec/x86/vp3dsp.asm |  120 
 +
  1 file changed, 79 insertions(+), 41 deletions(-)

 Ping.

Jason LGTM'ed this over the weekend on IRC, I'll push this if there's
no further comments.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] x86/dsputil: put inline asm under HAVE_INLINE_ASM.

2012-07-24 Thread Ronald S. Bultje
Hi,

On Sun, Jul 22, 2012 at 3:27 PM, Derek Buitenhuis
derek.buitenh...@gmail.com wrote:
 On 22/07/2012 6:14 PM, Ronald S. Bultje wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 This allows compiling with compilers that don't support gcc-style
 inline assembly.
 ---

 I think this looks OK, assuming:

 1) You've tested every yasm/inline asm enable/disable combination
 2) Everybody else is OK with using ifdefs.

If there's no further comments, I'll push this tomorrow.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264: refactor NAL decode loop.

2012-07-23 Thread Ronald S. Bultje
Hi,

On Mon, Jul 23, 2012 at 2:05 AM, Diego Biurrun di...@biurrun.de wrote:
 On Sun, Jul 22, 2012 at 08:46:10PM -0700, Ronald S. Bultje wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 Write out the NAL decoding loops in full so that they are easier to
 parse for a preprocessor without it having to be aware of macros or
 other such things in C code.

 This also makes the code more readable.
 ---
  libavcodec/h264.c |   42 +-
  1 file changed, 25 insertions(+), 17 deletions(-)

 LGTM

 --- a/libavcodec/h264.c
 +++ b/libavcodec/h264.c
 @@ -175,42 +175,50 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, 
 const uint8_t *src,

 +#define STARTCODE_TEST \
 +if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) { \
 +if (src[i + 2] != 3) { \
 +/* startcode, so we must be past the end */ \
 +length = i; \
 +} \
 +break; \
 +}
  #if HAVE_FAST_UNALIGNED
 +#define FIND_FIRST_ZERO \
 +if (i > 0 && !src[i]) \
 +i--; \
 +while (src[i]) \
 +i++

 h264.c is one of the pretty files now, so please align the '\',
 preferably on column 72.

My editor doesn't support that - can you do that?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 2/2] vp3: don't use calls to inline asm in yasm code.

2012-07-23 Thread Ronald S. Bultje
Hi,

On Sun, Jul 22, 2012 at 2:38 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 Mixing yasm and inline asm is a bad idea, since if either yasm or inline
 asm is not supported by your toolchain, all of the asm stops working.
 Thus, better to use either one or the other alone.
 ---
  libavcodec/x86/vp3dsp.asm |  120 
 +
  1 file changed, 79 insertions(+), 41 deletions(-)

Ping.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.

2012-07-23 Thread Ronald S. Bultje
Hi,

On Sun, Jul 22, 2012 at 1:16 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 This completes the conversion of h264dsp to yasm; note that h264 also
 uses some dsputil functions, most notably qpel. Performance-wise, the
 yasm-version is ~10 cycles faster (182-172) on x86-64, and ~8 cycles
 faster (201-193) on x86-32.
 ---
  libavcodec/x86/h264_deblock.asm |  168 
 +++
  libavcodec/x86/h264dsp_mmx.c|  162 ++---
  2 files changed, 175 insertions(+), 155 deletions(-)

Ping.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264: refactor NAL decode loop.

2012-07-23 Thread Ronald S. Bultje
Hi,

On Mon, Jul 23, 2012 at 7:14 AM, Kostya Shishkov
kostya.shish...@gmail.com wrote:
 On Mon, Jul 23, 2012 at 07:11:49AM -0700, Ronald S. Bultje wrote:
 Hi,

 On Mon, Jul 23, 2012 at 2:05 AM, Diego Biurrun di...@biurrun.de wrote:
  On Sun, Jul 22, 2012 at 08:46:10PM -0700, Ronald S. Bultje wrote:
  From: Ronald S. Bultje rsbul...@gmail.com
 
  Write out the NAL decoding loops in full so that they are easier to
  parse for a preprocessor without it having to be aware of macros or
  other such things in C code.
 
  This also makes the code more readable.
  ---
   libavcodec/h264.c |   42 +-
   1 file changed, 25 insertions(+), 17 deletions(-)
 
  LGTM
 
  --- a/libavcodec/h264.c
  +++ b/libavcodec/h264.c
  @@ -175,42 +175,50 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, 
  const uint8_t *src,
 
  +#define STARTCODE_TEST \
  +if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) { \
  +if (src[i + 2] != 3) { \
  +/* startcode, so we must be past the end */ \
  +length = i; \
  +} \
  +break; \
  +}
   #if HAVE_FAST_UNALIGNED
  +#define FIND_FIRST_ZERO \
  +if (i > 0 && !src[i]) \
  +i--; \
  +while (src[i]) \
  +i++
 
  h264.c is one of the pretty files now, so please align the '\',
  preferably on column 72.

 My editor doesn't support that - can you do that?

 Mine neither, but pressing spacebar and releasing it after some time does the
 trick just fine.

Hard to say if it's at 72 or not, is it?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264: refactor NAL decode loop

2012-07-23 Thread Ronald S. Bultje
Hi,

On Mon, Jul 23, 2012 at 7:29 AM, Luca Barbato lu_z...@gentoo.org wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 Write out the NAL decoding loops in full so that they are easier
 to parse for a preprocessor without it having to be aware of macros
 or other such things in C code.

 This also makes the code more readable.

 Signed-off-by: Luca Barbato lu_z...@gentoo.org
 ---
  libavcodec/h264.c |   42 +-
  1 file changed, 25 insertions(+), 17 deletions(-)

Thanks, go ahead and commit this.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H264: Remove 3dnow qpel code.

2012-07-23 Thread Ronald S. Bultje
Hi,

On Mon, Jul 23, 2012 at 5:37 PM, Daniel Kang daniel.d.k...@gmail.com wrote:
 On Mon, Jul 23, 2012 at 5:21 PM, Diego Biurrun di...@biurrun.de wrote:

 On Mon, Jul 23, 2012 at 05:12:23PM -0700, Daniel Kang wrote:
  From: Daniel Kang daniel.d.k...@gmail.com
 
  The only CPUs that have 3dnow and don't have mmxext are 12 years old.
  ---
   libavcodec/x86/dsputil_mmx.c   |9 -
   libavcodec/x86/h264_qpel_mmx.c |4 
   2 files changed, 0 insertions(+), 13 deletions(-)

 What sort of maintenance burden does this relieve us from?
 I'm writing this mail on a system fitting the description
 you mention, my trusty old K6-III.


[..]
 4. You can probably decode 260p H.264 with a K6-III. Who seriously would use
 this?

This really is the killer. Is there any sort of reasonable expectation
that a k6-3 can get useful work done when it comes to H264 decoding? I
wouldn't even mind dropping all MMX optimizations (where MMX2 - i.e.
SSE - or higher exists) altogether, i.e. going the H264 way and
requiring SSE for useful x86 performance (i.e. non-C-only).

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] H264: Remove 3dnow qpel code.

2012-07-23 Thread Ronald S. Bultje
Hi,

On Mon, Jul 23, 2012 at 7:45 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 Hi,

 On Mon, Jul 23, 2012 at 5:37 PM, Daniel Kang daniel.d.k...@gmail.com wrote:
 On Mon, Jul 23, 2012 at 5:21 PM, Diego Biurrun di...@biurrun.de wrote:

 On Mon, Jul 23, 2012 at 05:12:23PM -0700, Daniel Kang wrote:
  From: Daniel Kang daniel.d.k...@gmail.com
 
  The only CPUs that have 3dnow and don't have mmxext are 12 years old.
  ---
   libavcodec/x86/dsputil_mmx.c   |9 -
   libavcodec/x86/h264_qpel_mmx.c |4 
   2 files changed, 0 insertions(+), 13 deletions(-)

 What sort of maintenance burden does this relieve us from?
 I'm writing this mail on a system fitting the description
 you mention, my trusty old K6-III.


 [..]
 4. You can probably decode 260p H.264 with a K6-III. Who seriously would use
 this?

 This really is the killer. Is there any sort of reasonable expectation
 that a k6-3 can get useful work done when it comes to H264 decoding? I
 wouldn't even mind dropping all MMX optimizations (where MMX2 - i.e.
 SSE - or higher exists) altogether, i.e. going the H264 way

.. x264 way. :).

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] lavfi: put inline assembly under HAVE_INLINE_ASM.

2012-07-22 Thread Ronald S. Bultje
Hi,

On Sat, Jul 21, 2012 at 5:03 PM, Ronald S. Bultje rsbul...@gmail.com wrote:
 From: Ronald S. Bultje rsbul...@gmail.com

 This allows compiling this code using compilers that do not understand
 gcc-style inline assembly.
 ---
  libavfilter/x86/gradfun.c |6 ++
  libavfilter/x86/yadif.c   |6 ++
  2 files changed, 12 insertions(+)

Ping.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264: refactor NAL decoding loop.

2012-07-22 Thread Ronald S. Bultje
Hi,

On Sat, Jul 21, 2012 at 5:19 PM, Måns Rullgård m...@mansr.com wrote:
 Ronald S. Bultje rsbul...@gmail.com writes:

 From: Ronald S. Bultje rsbul...@gmail.com

 This removes some code duplication between the 3 different versions,
 and aligns brackets in such a way that it is now possible to pull
 this code through a naive pre-processor that doesn't necessarily have
 to be aware of compiler-macros.
 ---
  libavcodec/h264.c |   36 
  1 file changed, 20 insertions(+), 16 deletions(-)

 diff --git a/libavcodec/h264.c b/libavcodec/h264.c
 index a4afcc8..20fa7c3 100644
 --- a/libavcodec/h264.c
 +++ b/libavcodec/h264.c
 @@ -178,30 +178,34 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, 
 const uint8_t *src,
  #if HAVE_FAST_UNALIGNED
  #if HAVE_FAST_64BIT
  #define RS 7
 -for (i = 0; i + 1 < length; i += 9) {
 -if (!((~AV_RN64A(src + i) &
 -   (AV_RN64A(src + i) - 0x0100010001000101ULL)) &
 +#define MASKCHECK \
 +if (!((~AV_RN64A(src + i) & \
 +   (AV_RN64A(src + i) - 0x0100010001000101ULL)) & \
0x8000800080008080ULL))
  #else
  #define RS 3
 -for (i = 0; i + 1 < length; i += 5) {
 -if (!((~AV_RN32A(src + i) &
 -   (AV_RN32A(src + i) - 0x01000101U)) &
 +#define MASKCHECK \
 +if (!((~AV_RN32A(src + i) & \
 +   (AV_RN32A(src + i) - 0x01000101U)) & \
0x80008080U))
  #endif
 -continue;
 -if (i > 0 && !src[i])
 -i--;
 -while (src[i])
 -i++;
 +#define LOOPCHECK \
 +MASKCHECK \
 +continue; \
 +if (i > 0 && !src[i]) \
 +i--; \
 +while (src[i]) \
 +i++
  #else
  #define RS 0
 -for (i = 0; i + 1 < length; i += 2) {
 -if (src[i])
 -continue;
 -if (i > 0 && src[i - 1] == 0)
 -i--;
 +#define LOOPCHECK \
 +if (src[i]) \
 +continue; \
 +if (i > 0 && src[i - 1] == 0) \
 +i--
  #endif
 +for (i = 0; i + 1 < length; i += RS + 2) {
 +LOOPCHECK;
  if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) {
  if (src[i + 2] != 3) {
  /* startcode, so we must be past the end */
 --

 This manner of splitting things is incredibly weird-looking.  Instead of
 trying to unify these rather different fragments, turning the second
 half of the loop into a macro and writing out separate loops, each
 calling the macro for the common part, would probably look much more
 sane.

#define LOOP_COMMON_PART \
if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) { \
if (src[i + 2] != 3) { \
/* startcode, so we must be past the end */ \
length = i; \
} \
break; \
}
#if HAVE_FAST_UNALIGNED
#define CHECK_COMMON_PART \
if (i > 0 && !src[i]) \
i--; \
while (src[i]) \
i++
#if HAVE_FAST_64BIT
for (i = 0; i + 1 < length; i += 9) {
if (!((~AV_RN64A(src + i) &
   (AV_RN64A(src + i) - 0x0100010001000101ULL)) &
  0x8000800080008080ULL))
continue;
CHECK_COMMON_PART;
LOOP_COMMON_PART;
i -= 7;
}
#else
for (i = 0; i + 1 < length; i += 5) {
if (!((~AV_RN32A(src + i) &
   (AV_RN32A(src + i) - 0x01000101U)) &
  0x80008080U))
continue;
CHECK_COMMON_PART;
LOOP_COMMON_PART;
i -= 3;
}
#endif
#else
for (i = 0; i + 1 < length; i += 2) {
if (src[i])
continue;
if (i > 0 && src[i - 1] == 0)
i--;
LOOP_COMMON_PART;
}
#endif

Pick your bet and commit whichever is nicer; both work with the
preprocessor. (I think the earlier one looks better.)

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] h264: refactor NAL decoding loop.

2012-07-22 Thread Ronald S. Bultje
Hi,

On Sun, Jul 22, 2012 at 8:17 AM, Måns Rullgård m...@mansr.com wrote:
 Ronald S. Bultje rsbul...@gmail.com writes:

 This manner of splitting things is incredibly weird-looking.  Instead of
 trying to unify these rather different fragments, turning the second
 half of the loop into a macro and writing out separate loops, each
 calling the macro for the common part, would probably look much more
 sane.

 #define LOOP_COMMON_PART \
 if (i + 2 < length && src[i + 1] == 0 && src[i + 2] <= 3) { \
 if (src[i + 2] != 3) { \
 /* startcode, so we must be past the end */ \
 length = i; \
 } \
 break; \
 }
 #if HAVE_FAST_UNALIGNED
 #define CHECK_COMMON_PART \
 if (i > 0 && !src[i]) \
 i--; \
 while (src[i]) \
 i++
 #if HAVE_FAST_64BIT
 for (i = 0; i + 1 < length; i += 9) {
 if (!((~AV_RN64A(src + i) &
(AV_RN64A(src + i) - 0x0100010001000101ULL)) &
   0x8000800080008080ULL))
 continue;
 CHECK_COMMON_PART;
 LOOP_COMMON_PART;
 i -= 7;
 }
 #else
 for (i = 0; i + 1 < length; i += 5) {
 if (!((~AV_RN32A(src + i) &
(AV_RN32A(src + i) - 0x01000101U)) &
   0x80008080U))
 continue;
 CHECK_COMMON_PART;
 LOOP_COMMON_PART;
 i -= 3;
 }
 #endif
 #else
 for (i = 0; i + 1 < length; i += 2) {
 if (src[i])
 continue;
 if (i > 0 && src[i - 1] == 0)
 i--;
 LOOP_COMMON_PART;
 }
 #endif

 With a bit saner names for the macros, this is way more readable.

Given the risk of having to send the patch 10x, why don't you guys
suggest some names to make review shorter?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This completes the conversion of h264dsp to yasm; note that h264 also
uses some dsputil functions, most notably qpel. Performance-wise, the
yasm-version is ~10 cycles faster (182-172) on x86-64, and ~8 cycles
faster (201-193) on x86-32.
---
 libavcodec/x86/h264_deblock.asm |  168 +++
 libavcodec/x86/h264dsp_mmx.c|  160 ++---
 2 files changed, 174 insertions(+), 154 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 1982dc4..43c7b95 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -27,6 +27,10 @@
 %include x86inc.asm
 %include x86util.asm
 
+SECTION_RODATA
+
+pb_3_1: times 4 db 3, 1
+
 SECTION .text
 
 cextern pb_0
@@ -911,3 +915,167 @@ ff_chroma_intra_body_mmxext:
 paddb  m1, m5
 paddb  m2, m6
 ret
+
+;-
+; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
+;int8_t ref[2][40], int16_t mv[2][40][2],
+;int bidir,int edges,int step,
+;int mask_mv0, int mask_mv1, int field);
+;
+; bidiris 0 or 1
+; edgesis 1 or 4
+; step is 1 or 2
+; mask_mv0 is 0 or 3
+; mask_mv1 is 0 or 1
+; fieldis 0 or 1
+;-
+%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
+; dir, d_idx, mask_dir, bidir
+%define edgesm%1
+%define stepm %2
+%define mask_mvm  %3
+%define dir   %4
+%define d_idx %5
+%define mask_dir  %6
+%define bidir %7
+xor  b_idxq, b_idxq ; for (b_idx = 0; b_idx < edges; b_idx += step)
+.b_idx_loop_ %+ dir %+ _ %+ bidir:
+%if mask_dir == 0
+pxor m0, m0
+%endif
+test b_idxd, dword mask_mvm
+jnz .skip_loop_iter_ %+ dir %+ _ %+ bidir ; if (!(b_idx & mask_mv))
+%if bidir == 1
+movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
+punpckldqm2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
+pshufw   m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
+pshufw   m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
+pshufw   m3, m2, 0x4E   ; { ref1[bn], ref0[bn] }
+psubbm0, m2 ; { ref0[b] != ref0[bn],
+;   ref0[b] != ref1[bn] }
+psubbm1, m3 ; { ref1[b] != ref1[bn],
+;   ref1[b] != ref0[bn] }
+
+por  m0, m1
+mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
+mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
+mova m3, m1
+mova m4, m2
+psubwm1, [mvq+b_idxq*4+12*4]
+psubwm2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm3, [mvq+b_idxq*4+52*4]
+psubwm4, [mvq+b_idxq*4+52*4+mmsize]
+packsswb m1, m2
+packsswb m3, m4
+paddbm1, m6
+paddbm3, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+psubusb  m3, m5
+packsswb m1, m3
+
+por  m0, m1
+mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
+mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
+mova m3, m1
+mova m4, m2
+psubwm1, [mvq+b_idxq*4+12*4]
+psubwm2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm3, [mvq+b_idxq*4+52*4]
+psubwm4, [mvq+b_idxq*4+52*4+mmsize]
+packsswb m1, m2
+packsswb m3, m4
+paddbm1, m6
+paddbm3, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+psubusb  m3, m5
+packsswb m1, m3
+
+pshufw   m1, m1, 0x4E
+por  m0, m1
+pshufw   m1, m0, 0x4E
+pminub   m0, m1
+%else ; bidir == 0
+movd m0, [refq+b_idxq+12]
+psubbm0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]
+
+mova m1, [mvq+b_idxq*4+12*4]
+mova m2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm1, [mvq+b_idxq*4+(d_idx+12)*4]
+psubwm2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
+packsswb m1, m2
+paddbm1, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+packsswb m1, m1
+por  m0, m1
+%endif ; bidir == 1/0
+
+.skip_loop_iter_ %+ dir %+ _ %+ bidir:
+movd m1, [nnzq+b_idxq+12]
+por  m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]
+
+pminub   m1, m7
+pminub   m0, m7
+psllwm1, 1
+pxor m2, m2
+pmaxub

[libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This completes the conversion of h264dsp to yasm; note that h264 also
uses some dsputil functions, most notably qpel. Performance-wise, the
yasm-version is ~10 cycles faster (182-172) on x86-64, and ~8 cycles
faster (201-193) on x86-32.
---
 libavcodec/x86/h264_deblock.asm |  168 +++
 libavcodec/x86/h264dsp_mmx.c|  162 ++---
 2 files changed, 175 insertions(+), 155 deletions(-)

diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm
index 1982dc4..43c7b95 100644
--- a/libavcodec/x86/h264_deblock.asm
+++ b/libavcodec/x86/h264_deblock.asm
@@ -27,6 +27,10 @@
 %include x86inc.asm
 %include x86util.asm
 
+SECTION_RODATA
+
+pb_3_1: times 4 db 3, 1
+
 SECTION .text
 
 cextern pb_0
@@ -911,3 +915,167 @@ ff_chroma_intra_body_mmxext:
 paddb  m1, m5
 paddb  m2, m6
 ret
+
+;-
+; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
+;int8_t ref[2][40], int16_t mv[2][40][2],
+;int bidir,int edges,int step,
+;int mask_mv0, int mask_mv1, int field);
+;
+; bidiris 0 or 1
+; edgesis 1 or 4
+; step is 1 or 2
+; mask_mv0 is 0 or 3
+; mask_mv1 is 0 or 1
+; fieldis 0 or 1
+;-
+%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
+; dir, d_idx, mask_dir, bidir
+%define edgesm%1
+%define stepm %2
+%define mask_mvm  %3
+%define dir   %4
+%define d_idx %5
+%define mask_dir  %6
+%define bidir %7
+xor  b_idxq, b_idxq ; for (b_idx = 0; b_idx < edges; b_idx += step)
+.b_idx_loop_ %+ dir %+ _ %+ bidir:
+%if mask_dir == 0
+pxor m0, m0
+%endif
+test b_idxd, dword mask_mvm
+jnz .skip_loop_iter_ %+ dir %+ _ %+ bidir ; if (!(b_idx & mask_mv))
+%if bidir == 1
+movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
+punpckldqm2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
+pshufw   m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
+pshufw   m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
+pshufw   m3, m2, 0x4E   ; { ref1[bn], ref0[bn] }
+psubbm0, m2 ; { ref0[b] != ref0[bn],
+;   ref0[b] != ref1[bn] }
+psubbm1, m3 ; { ref1[b] != ref1[bn],
+;   ref1[b] != ref0[bn] }
+
+por  m0, m1
+mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
+mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
+mova m3, m1
+mova m4, m2
+psubwm1, [mvq+b_idxq*4+12*4]
+psubwm2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm3, [mvq+b_idxq*4+52*4]
+psubwm4, [mvq+b_idxq*4+52*4+mmsize]
+packsswb m1, m2
+packsswb m3, m4
+paddbm1, m6
+paddbm3, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+psubusb  m3, m5
+packsswb m1, m3
+
+por  m0, m1
+mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
+mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
+mova m3, m1
+mova m4, m2
+psubwm1, [mvq+b_idxq*4+12*4]
+psubwm2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm3, [mvq+b_idxq*4+52*4]
+psubwm4, [mvq+b_idxq*4+52*4+mmsize]
+packsswb m1, m2
+packsswb m3, m4
+paddbm1, m6
+paddbm3, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+psubusb  m3, m5
+packsswb m1, m3
+
+pshufw   m1, m1, 0x4E
+por  m0, m1
+pshufw   m1, m0, 0x4E
+pminub   m0, m1
+%else ; bidir == 0
+movd m0, [refq+b_idxq+12]
+psubbm0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]
+
+mova m1, [mvq+b_idxq*4+12*4]
+mova m2, [mvq+b_idxq*4+12*4+mmsize]
+psubwm1, [mvq+b_idxq*4+(d_idx+12)*4]
+psubwm2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
+packsswb m1, m2
+paddbm1, m6
+psubusb  m1, m5 ; abs(mv[b] - mv[bn]) >= limit
+packsswb m1, m1
+por  m0, m1
+%endif ; bidir == 1/0
+
+.skip_loop_iter_ %+ dir %+ _ %+ bidir:
+movd m1, [nnzq+b_idxq+12]
+por  m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]
+
+pminub   m1, m7
+pminub   m0, m7
+psllwm1, 1
+pxor m2, m2
+pmaxub

[libav-devel] [PATCH] swscale: add missing HAVE_INLINE_ASM check.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

The function called in this block is under HAVE_INLINE_ASM itself also.
---
 libswscale/swscale.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 7ae5af3..5cfa7f2 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -518,7 +518,7 @@ static int swScale(SwsContext *c, const uint8_t *src[],
 if (!enough_lines)
 break;  // we can't output a dstY line so let's try with the next 
slice
 
-#if HAVE_MMX
+#if HAVE_MMX  HAVE_INLINE_ASM
 updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex,
   lastInLumBuf, lastInChrBuf);
 #endif
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] swscale: add fast bilinear scaler under HAVE_INLINE_ASM.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libswscale/utils.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index d8fee58..a6b5a18 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -576,7 +576,7 @@ fail:
 return ret;
 }
 
-#if HAVE_MMX2
+#if HAVE_MMX2  HAVE_INLINE_ASM
 static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode,
int16_t *filter, int32_t *filterPos, int numSplits)
 {
@@ -739,7 +739,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t 
*filterCode,
 
 return fragmentPos + 1;
 }
-#endif /* HAVE_MMX2 */
+#endif /* HAVE_MMX2  HAVE_INLINE_ASM */
 
 static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
 {
@@ -971,7 +971,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, 
SwsFilter *dstFilter)
 FF_ALLOC_OR_GOTO(c, c-formatConvBuffer,
  (FFALIGN(srcW, 16) * 2 * FFALIGN(c-srcBpc, 8)  3) + 16,
  fail);
-if (HAVE_MMX2  cpu_flags  AV_CPU_FLAG_MMX2 
+if (HAVE_MMX2  HAVE_INLINE_ASM  cpu_flags  AV_CPU_FLAG_MMX2 
 c-srcBpc == 8  c-dstBpc = 10) {
 c-canMMX2BeUsed = (dstW = srcW  (dstW  31) == 0 
 (srcW  15) == 0) ? 1 : 0;
@@ -1010,7 +1010,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, 
SwsFilter *dstFilter)
 
 /* precalculate horizontal scaler filter coefficients */
 {
-#if HAVE_MMX2
+#if HAVE_MMX2  HAVE_INLINE_ASM
 // can't downscale !!!
 if (c-canMMX2BeUsed  (flags  SWS_FAST_BILINEAR)) {
 c-lumMmx2FilterCodeSize = initMMX2HScaler(dstW, c-lumXInc, NULL,
@@ -1046,7 +1046,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, 
SwsFilter *dstFilter)
 mprotect(c-chrMmx2FilterCode, c-chrMmx2FilterCodeSize, PROT_EXEC 
| PROT_READ);
 #endif
 } else
-#endif /* HAVE_MMX2 */
+#endif /* HAVE_MMX2  HAVE_INLINE_ASM */
 {
 const int filterAlign =
 (HAVE_MMX  cpu_flags  AV_CPU_FLAG_MMX) ? 4 :
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH 1/2] x86/dsputil: put inline asm under HAVE_INLINE_ASM.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This allows compiling with compilers that don't support gcc-style
inline assembly.
---
 libavcodec/x86/dsputil_mmx.c |   69 --
 libavcodec/x86/h264_qpel_mmx.c   |4 ++-
 libavcodec/x86/idct_mmx.c|4 +++
 libavcodec/x86/idct_mmx_xvid.c   |4 +++
 libavcodec/x86/idct_sse2_xvid.c  |4 +++
 libavcodec/x86/rv40dsp_init.c|4 +++
 libavcodec/x86/simple_idct_mmx.c |4 +++
 7 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 5eb4a24..a8b31e4 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -84,6 +84,8 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 
0xFEFEFEFEFEFEFEFEULL, 0xFEF
 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
 
+#if HAVE_INLINE_ASM
+
 #define JUMPALIGN() __asm__ volatile (.p2align 3::)
 #define MOVQ_ZERO(regd) __asm__ volatile (pxor %%#regd, %%#regd ::)
 
@@ -1836,6 +1838,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t 
*src, int stride)
   avg_pixels16_xy2_mmx(dst, src, stride, 16);
 }
 
+#endif /* HAVE_INLINE_ASM */
+
 #if HAVE_YASM
 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
 x86_reg linesize, x86_reg start_y,
@@ -1904,6 +1908,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t 
*buf, const uint8_t *src,
 }
 #endif /* HAVE_YASM */
 
+#if HAVE_INLINE_ASM
+
 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h);
@@ -2073,6 +2079,8 @@ PREFETCH(prefetch_mmx2,  prefetcht0)
 PREFETCH(prefetch_3dnow, prefetch)
 #undef PREFETCH
 
+#endif /* HAVE_INLINE_ASM */
+
 #include h264_qpel_mmx.c
 
 void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
@@ -2118,6 +2126,8 @@ CHROMA_MC(avg, 8, 10, sse2)
 CHROMA_MC(put, 8, 10, avx)
 CHROMA_MC(avg, 8, 10, avx)
 
+#if HAVE_INLINE_ASM
+
 /* CAVS-specific */
 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
 {
@@ -2476,6 +2486,8 @@ static void vector_clipf_sse(float *dst, const float *src,
 );
 }
 
+#endif /* HAVE_INLINE_ASM */
+
 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
 int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
@@ -2588,6 +2600,7 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 {
 const int high_bit_depth = avctx-bits_per_raw_sample  8;
 
+#if HAVE_INLINE_ASM
 c-put_pixels_clamped= ff_put_pixels_clamped_mmx;
 c-put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
 c-add_pixels_clamped= ff_add_pixels_clamped_mmx;
@@ -2610,10 +2623,6 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 #if ARCH_X86_32 || !HAVE_YASM
 c-gmc = gmc_mmx;
 #endif
-#if ARCH_X86_32  HAVE_YASM
-if (!high_bit_depth)
-c-emulated_edge_mc = emulated_edge_mc_mmx;
-#endif
 
 c-add_bytes = add_bytes_mmx;
 
@@ -2621,8 +2630,14 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 c-h263_v_loop_filter = h263_v_loop_filter_mmx;
 c-h263_h_loop_filter = h263_h_loop_filter_mmx;
 }
+#endif /* HAVE_INLINE_ASM */
 
 #if HAVE_YASM
+#if ARCH_X86_32
+if (!high_bit_depth)
+c-emulated_edge_mc = emulated_edge_mc_mmx;
+#endif
+
 if (!high_bit_depth  CONFIG_H264CHROMA) {
 c-put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
 c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@@ -2639,6 +2654,7 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx,
 const int bit_depth  = avctx-bits_per_raw_sample;
 const int high_bit_depth = bit_depth  8;
 
+#if HAVE_INLINE_ASM
 c-prefetch = prefetch_mmx2;
 
 if (!high_bit_depth) {
@@ -2674,22 +2690,27 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx,
 c-put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
 c-put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
 }
+#endif /* HAVE_INLINE_ASM */
 
 if (CONFIG_H264QPEL) {
+#if HAVE_INLINE_ASM
 SET_QPEL_FUNCS(put_qpel,0, 16, mmx2, );
 SET_QPEL_FUNCS(put_qpel,1,  8, mmx2, );
 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
 SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
 SET_QPEL_FUNCS(avg_qpel,0, 16, mmx2, );
 SET_QPEL_FUNCS(avg_qpel,1,  8, mmx2, );
+#endif /* HAVE_INLINE_ASM */
 
 if (!high_bit_depth) {
+#if HAVE_INLINE_ASM
 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
 SET_QPEL_FUNCS(put_h264_qpel, 1

[libav-devel] [PATCH 2/2] vp3: don't use calls to inline asm in yasm code.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

Mixing yasm and inline asm is a bad idea, since if either yasm or inline
asm is not supported by your toolchain, all of the asm stops working.
Thus, better to use either one or the other alone.
---
 libavcodec/x86/vp3dsp.asm |  120 +
 1 file changed, 79 insertions(+), 41 deletions(-)

diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index 58fa1f7..af2f60c 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -38,13 +38,11 @@ cextern pb_1
 cextern pb_3
 cextern pb_7
 cextern pb_1F
+cextern pb_80
 cextern pb_81
 
 cextern pw_8
 
-cextern put_signed_pixels_clamped_mmx
-cextern add_pixels_clamped_mmx
-
 SECTION .text
 
 ; this is off by one or two for some cases when filter_limit is greater than 63
@@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4
 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
 %endmacro
 
-%macro vp3_idct_funcs 3
-cglobal vp3_idct_put_%1, 3, %3, %2
+%macro vp3_idct_funcs 1
+cglobal vp3_idct_put_%1, 3, 4, 9
 VP3_IDCT_%1   r2
-%if ARCH_X86_64
-mov   r3, r2
-mov   r2, r1
-mov   r1, r0
-mov   r0, r3
+
+movsxdifnidn  r1, r1d
+mova  m4, [pb_80]
+lea   r3, [r1*3]
+%assign %%i 0
+%rep 16/mmsize
+mova  m0, [r2+mmsize*0+%%i]
+mova  m1, [r2+mmsize*2+%%i]
+mova  m2, [r2+mmsize*4+%%i]
+mova  m3, [r2+mmsize*6+%%i]
+packsswb  m0, [r2+mmsize*1+%%i]
+packsswb  m1, [r2+mmsize*3+%%i]
+packsswb  m2, [r2+mmsize*5+%%i]
+packsswb  m3, [r2+mmsize*7+%%i]
+paddb m0, m4
+paddb m1, m4
+paddb m2, m4
+paddb m3, m4
+movq   [r0 ], m0
+%if mmsize == 8
+movq   [r0+r1  ], m1
+movq   [r0+r1*2], m2
+movq   [r0+r3  ], m3
 %else
-mov  r0m, r2
-mov  r1m, r0
-mov  r2m, r1
+movhps [r0+r1  ], m0
+movq   [r0+r1*2], m1
+movhps [r0+r3  ], m1
 %endif
-%if WIN64
-call put_signed_pixels_clamped_mmx
-RET
-%else
-jmp put_signed_pixels_clamped_mmx
+%if %%i == 0
+lea   r0, [r0+r1*4]
+%endif
+%if mmsize == 16
+movq   [r0 ], m2
+movhps [r0+r1  ], m2
+movq   [r0+r1*2], m3
+movhps [r0+r3  ], m3
 %endif
+%assign %%i %%i+64
+%endrep
+RET
 
-cglobal vp3_idct_add_%1, 3, %3, %2
+cglobal vp3_idct_add_%1, 3, 4, 9
 VP3_IDCT_%1   r2
-%if ARCH_X86_64
-mov   r3, r2
-mov   r2, r1
-mov   r1, r0
-mov   r0, r3
-%else
-mov  r0m, r2
-mov  r1m, r0
-mov  r2m, r1
+
+mov   r3, 4
+pxor  m4, m4
+movsxdifnidn  r1, r1d
+.loop:
+movq  m0, [r0]
+movq  m1, [r0+r1]
+%if mmsize == 8
+mova  m2, m0
+mova  m3, m1
 %endif
-%if WIN64
-call add_pixels_clamped_mmx
-RET
-%else
-jmp add_pixels_clamped_mmx
+punpcklbw m0, m4
+punpcklbw m1, m4
+%if mmsize == 8
+punpckhbw m2, m4
+punpckhbw m3, m4
+%endif
+paddswm0, [r2+ 0]
+paddswm1, [r2+16]
+%if mmsize == 8
+paddswm2, [r2+ 8]
+paddswm3, [r2+24]
+packuswb  m0, m2
+packuswb  m1, m3
+%else ; mmsize == 16
+packuswb  m0, m1
 %endif
+movq [r0   ], m0
+%if mmsize == 8
+movq [r0+r1], m1
+%else ; mmsize == 16
+movhps   [r0+r1], m0
+%endif
+lea   r0, [r0+r1*2]
+add   r2, 32
+dec   r3
+jg .loop
+RET
 %endmacro
 
-%if ARCH_X86_64
-%define REGS 4
-%else
-%define REGS 3
-%endif
 INIT_MMX
-vp3_idct_funcs mmx,  0, REGS
+vp3_idct_funcs mmx
 INIT_XMM
-vp3_idct_funcs sse2, 9, REGS
-%undef REGS
+vp3_idct_funcs sse2
 
 %macro DC_ADD 0
 movq  m2, [r0 ]
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


[libav-devel] [PATCH] x86/dsputil: put inline asm under HAVE_INLINE_ASM.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This allows compiling with compilers that don't support gcc-style
inline assembly.
---
 libavcodec/dct-test.c|2 +-
 libavcodec/x86/dsputil_mmx.c |   69 --
 libavcodec/x86/h264_qpel_mmx.c   |4 ++-
 libavcodec/x86/idct_mmx.c|4 +++
 libavcodec/x86/idct_mmx_xvid.c   |4 +++
 libavcodec/x86/idct_sse2_xvid.c  |4 +++
 libavcodec/x86/rv40dsp_init.c|4 +++
 libavcodec/x86/simple_idct_mmx.c |4 +++
 8 files changed, 76 insertions(+), 19 deletions(-)

diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index ceff448..5046544 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -108,7 +108,7 @@ static const struct algo idct_tab[] = {
 { INT,ff_j_rev_dct,  MMX_PERM },
 { SIMPLE-C,   ff_simple_idct_8,  NO_PERM  },
 
-#if HAVE_MMX
+#if HAVE_MMX  HAVE_INLINE_ASM
 #if CONFIG_GPL
 { LIBMPEG2-MMX,   ff_mmx_idct,   MMX_PERM,  AV_CPU_FLAG_MMX,  1 
},
 { LIBMPEG2-MMX2,  ff_mmxext_idct,MMX_PERM,  AV_CPU_FLAG_MMX2, 1 
},
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 5eb4a24..a8b31e4 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -84,6 +84,8 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 
0xFEFEFEFEFEFEFEFEULL, 0xFEF
 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
 
+#if HAVE_INLINE_ASM
+
 #define JUMPALIGN() __asm__ volatile (.p2align 3::)
 #define MOVQ_ZERO(regd) __asm__ volatile (pxor %%#regd, %%#regd ::)
 
@@ -1836,6 +1838,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t 
*src, int stride)
   avg_pixels16_xy2_mmx(dst, src, stride, 16);
 }
 
+#endif /* HAVE_INLINE_ASM */
+
 #if HAVE_YASM
 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
 x86_reg linesize, x86_reg start_y,
@@ -1904,6 +1908,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t 
*buf, const uint8_t *src,
 }
 #endif /* HAVE_YASM */
 
+#if HAVE_INLINE_ASM
+
 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h);
@@ -2073,6 +2079,8 @@ PREFETCH(prefetch_mmx2,  prefetcht0)
 PREFETCH(prefetch_3dnow, prefetch)
 #undef PREFETCH
 
+#endif /* HAVE_INLINE_ASM */
+
 #include h264_qpel_mmx.c
 
 void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
@@ -2118,6 +2126,8 @@ CHROMA_MC(avg, 8, 10, sse2)
 CHROMA_MC(put, 8, 10, avx)
 CHROMA_MC(avg, 8, 10, avx)
 
+#if HAVE_INLINE_ASM
+
 /* CAVS-specific */
 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
 {
@@ -2476,6 +2486,8 @@ static void vector_clipf_sse(float *dst, const float *src,
 );
 }
 
+#endif /* HAVE_INLINE_ASM */
+
 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
 int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
@@ -2588,6 +2600,7 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 {
 const int high_bit_depth = avctx-bits_per_raw_sample  8;
 
+#if HAVE_INLINE_ASM
 c-put_pixels_clamped= ff_put_pixels_clamped_mmx;
 c-put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
 c-add_pixels_clamped= ff_add_pixels_clamped_mmx;
@@ -2610,10 +2623,6 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 #if ARCH_X86_32 || !HAVE_YASM
 c-gmc = gmc_mmx;
 #endif
-#if ARCH_X86_32  HAVE_YASM
-if (!high_bit_depth)
-c-emulated_edge_mc = emulated_edge_mc_mmx;
-#endif
 
 c-add_bytes = add_bytes_mmx;
 
@@ -2621,8 +2630,14 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 c-h263_v_loop_filter = h263_v_loop_filter_mmx;
 c-h263_h_loop_filter = h263_h_loop_filter_mmx;
 }
+#endif /* HAVE_INLINE_ASM */
 
 #if HAVE_YASM
+#if ARCH_X86_32
+if (!high_bit_depth)
+c-emulated_edge_mc = emulated_edge_mc_mmx;
+#endif
+
 if (!high_bit_depth  CONFIG_H264CHROMA) {
 c-put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
 c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@@ -2639,6 +2654,7 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx,
 const int bit_depth  = avctx-bits_per_raw_sample;
 const int high_bit_depth = bit_depth  8;
 
+#if HAVE_INLINE_ASM
 c-prefetch = prefetch_mmx2;
 
 if (!high_bit_depth) {
@@ -2674,22 +2690,27 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx,
 c-put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
 c-put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
 }
+#endif

[libav-devel] [PATCH] x86/dsputil: put inline asm under HAVE_INLINE_ASM.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

This allows compiling with compilers that don't support gcc-style
inline assembly.
---
 libavcodec/dct-test.c|2 +-
 libavcodec/x86/dsputil_mmx.c |   69 --
 libavcodec/x86/h264_qpel_mmx.c   |4 ++-
 libavcodec/x86/idct_mmx.c|4 +++
 libavcodec/x86/idct_mmx_xvid.c   |4 +++
 libavcodec/x86/idct_sse2_xvid.c  |4 +++
 libavcodec/x86/rv40dsp_init.c|2 ++
 libavcodec/x86/simple_idct_mmx.c |4 +++
 8 files changed, 74 insertions(+), 19 deletions(-)

diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c
index ceff448..5046544 100644
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -108,7 +108,7 @@ static const struct algo idct_tab[] = {
 { INT,ff_j_rev_dct,  MMX_PERM },
 { SIMPLE-C,   ff_simple_idct_8,  NO_PERM  },
 
-#if HAVE_MMX
+#if HAVE_MMX  HAVE_INLINE_ASM
 #if CONFIG_GPL
 { LIBMPEG2-MMX,   ff_mmx_idct,   MMX_PERM,  AV_CPU_FLAG_MMX,  1 
},
 { LIBMPEG2-MMX2,  ff_mmxext_idct,MMX_PERM,  AV_CPU_FLAG_MMX2, 1 
},
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index 5eb4a24..a8b31e4 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -84,6 +84,8 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 
0xFEFEFEFEFEFEFEFEULL, 0xFEF
 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
 
+#if HAVE_INLINE_ASM
+
 #define JUMPALIGN() __asm__ volatile (.p2align 3::)
 #define MOVQ_ZERO(regd) __asm__ volatile (pxor %%#regd, %%#regd ::)
 
@@ -1836,6 +1838,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t 
*src, int stride)
   avg_pixels16_xy2_mmx(dst, src, stride, 16);
 }
 
+#endif /* HAVE_INLINE_ASM */
+
 #if HAVE_YASM
 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
 x86_reg linesize, x86_reg start_y,
@@ -1904,6 +1908,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t 
*buf, const uint8_t *src,
 }
 #endif /* HAVE_YASM */
 
+#if HAVE_INLINE_ASM
+
 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
int linesize, int block_w, int block_h,
int src_x, int src_y, int w, int h);
@@ -2073,6 +2079,8 @@ PREFETCH(prefetch_mmx2,  prefetcht0)
 PREFETCH(prefetch_3dnow, prefetch)
 #undef PREFETCH
 
+#endif /* HAVE_INLINE_ASM */
+
 #include h264_qpel_mmx.c
 
 void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
@@ -2118,6 +2126,8 @@ CHROMA_MC(avg, 8, 10, sse2)
 CHROMA_MC(put, 8, 10, avx)
 CHROMA_MC(avg, 8, 10, avx)
 
+#if HAVE_INLINE_ASM
+
 /* CAVS-specific */
 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
 {
@@ -2476,6 +2486,8 @@ static void vector_clipf_sse(float *dst, const float *src,
 );
 }
 
+#endif /* HAVE_INLINE_ASM */
+
 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
 int order);
 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
@@ -2588,6 +2600,7 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 {
 const int high_bit_depth = avctx-bits_per_raw_sample  8;
 
+#if HAVE_INLINE_ASM
 c-put_pixels_clamped= ff_put_pixels_clamped_mmx;
 c-put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
 c-add_pixels_clamped= ff_add_pixels_clamped_mmx;
@@ -2610,10 +2623,6 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 #if ARCH_X86_32 || !HAVE_YASM
 c-gmc = gmc_mmx;
 #endif
-#if ARCH_X86_32  HAVE_YASM
-if (!high_bit_depth)
-c-emulated_edge_mc = emulated_edge_mc_mmx;
-#endif
 
 c-add_bytes = add_bytes_mmx;
 
@@ -2621,8 +2630,14 @@ static void dsputil_init_mmx(DSPContext *c, 
AVCodecContext *avctx, int mm_flags)
 c-h263_v_loop_filter = h263_v_loop_filter_mmx;
 c-h263_h_loop_filter = h263_h_loop_filter_mmx;
 }
+#endif /* HAVE_INLINE_ASM */
 
 #if HAVE_YASM
+#if ARCH_X86_32
+if (!high_bit_depth)
+c-emulated_edge_mc = emulated_edge_mc_mmx;
+#endif
+
 if (!high_bit_depth  CONFIG_H264CHROMA) {
 c-put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
 c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@@ -2639,6 +2654,7 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx,
 const int bit_depth  = avctx-bits_per_raw_sample;
 const int high_bit_depth = bit_depth  8;
 
+#if HAVE_INLINE_ASM
 c-prefetch = prefetch_mmx2;
 
 if (!high_bit_depth) {
@@ -2674,22 +2690,27 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx,
 c-put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
 c-put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
 }
+#endif

[libav-devel] [PATCH] swscale: place inline assembly bilinear scaler under HAVE_INLINE_ASM.

2012-07-22 Thread Ronald S. Bultje
From: Ronald S. Bultje rsbul...@gmail.com

---
 libswscale/utils.c |   10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index d8fee58..a6b5a18 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -576,7 +576,7 @@ fail:
 return ret;
 }
 
-#if HAVE_MMX2
+#if HAVE_MMX2  HAVE_INLINE_ASM
 static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode,
int16_t *filter, int32_t *filterPos, int numSplits)
 {
@@ -739,7 +739,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t 
*filterCode,
 
 return fragmentPos + 1;
 }
-#endif /* HAVE_MMX2 */
+#endif /* HAVE_MMX2  HAVE_INLINE_ASM */
 
 static void getSubSampleFactors(int *h, int *v, enum PixelFormat format)
 {
@@ -971,7 +971,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, 
SwsFilter *dstFilter)
 FF_ALLOC_OR_GOTO(c, c-formatConvBuffer,
  (FFALIGN(srcW, 16) * 2 * FFALIGN(c-srcBpc, 8)  3) + 16,
  fail);
-if (HAVE_MMX2  cpu_flags  AV_CPU_FLAG_MMX2 
+if (HAVE_MMX2  HAVE_INLINE_ASM  cpu_flags  AV_CPU_FLAG_MMX2 
 c-srcBpc == 8  c-dstBpc = 10) {
 c-canMMX2BeUsed = (dstW = srcW  (dstW  31) == 0 
 (srcW  15) == 0) ? 1 : 0;
@@ -1010,7 +1010,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, 
SwsFilter *dstFilter)
 
 /* precalculate horizontal scaler filter coefficients */
 {
-#if HAVE_MMX2
+#if HAVE_MMX2  HAVE_INLINE_ASM
 // can't downscale !!!
 if (c-canMMX2BeUsed  (flags  SWS_FAST_BILINEAR)) {
 c-lumMmx2FilterCodeSize = initMMX2HScaler(dstW, c-lumXInc, NULL,
@@ -1046,7 +1046,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, 
SwsFilter *dstFilter)
 mprotect(c-chrMmx2FilterCode, c-chrMmx2FilterCodeSize, PROT_EXEC 
| PROT_READ);
 #endif
 } else
-#endif /* HAVE_MMX2 */
+#endif /* HAVE_MMX2  HAVE_INLINE_ASM */
 {
 const int filterAlign =
 (HAVE_MMX  cpu_flags  AV_CPU_FLAG_MMX) ? 4 :
-- 
1.7.9.5

___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH] swscale: Mark all init functions as av_cold

2012-07-22 Thread Ronald S. Bultje
Hi,

On Sun, Jul 22, 2012 at 3:30 PM, Diego Biurrun di...@biurrun.de wrote:
 ---
  libswscale/output.c  |   15 ---
  libswscale/ppc/swscale_altivec.c |3 ++-
  libswscale/ppc/yuv2rgb_altivec.c |   11 +++
  libswscale/rgb2rgb.c |3 ++-
  libswscale/sparc/yuv2rgb_vis.c   |3 ++-
  libswscale/utils.c   |4 +++-
  libswscale/x86/rgb2rgb.c |3 ++-
  libswscale/x86/swscale.c |3 ++-
  libswscale/x86/yuv2rgb.c |3 ++-
  9 files changed, 30 insertions(+), 18 deletions(-)

OK.

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


Re: [libav-devel] [PATCH 1/2] build: support non-standard replacements for -E flag

2012-07-22 Thread Ronald S. Bultje
Hi,

On Sun, Jul 22, 2012 at 5:35 PM, Måns Rullgård m...@mansr.com wrote:
 Diego Biurrun di...@biurrun.de writes:

 On Mon, Jul 23, 2012 at 01:16:07AM +0100, Måns Rullgård wrote:
 Diego Biurrun di...@biurrun.de writes:

  On Mon, Jul 23, 2012 at 12:16:41AM +0100, Mans Rullgard wrote:
  This allows using non-standard flags for running the C preprocessor.
  The -o flag must be included in this setting due to strange syntax
  required by some compilers.
 
  --- a/configure
  +++ b/configure
  @@ -632,7 +636,7 @@ check_cpp(){
   log_file $TMPC
  -check_cmd $cc $CPPFLAGS $CFLAGS $@ -E -o $TMPO $TMPC
  +check_cmd $cc $CPPFLAGS $CFLAGS $@ $(cc_e $TMPO) $TMPC
   }
 
  @@ -1724,6 +1728,7 @@ cflags_filter=echo
 
   AS_O='-o $@'
  +CC_E='-E -o $@'
   CC_O='-o $@'
   LD_O='-o $@'
   HOSTCC_O='-o $@'
  @@ -2042,7 +2047,8 @@ probe_cc(){
 
  -unset _type _ident _cc_o _flags _cflags _ldflags _depflags _DEPCMD 
  _DEPFLAGS
  +unset _type _ident _cc_e _cc_o _flags _cflags _ldflags
  +unset _depflags _DEPCMD _DEPFLAGS
   _flags_filter=echo
 
  @@ -2105,6 +2111,7 @@ probe_cc(){
   _flags='--gcc --abi=eabi -me'
   _cflags='-D__gnuc_va_list=va_list -D__USER_LABEL_PREFIX__='
  +_cc_e='-ppl -fe=$@'
   _cc_o='-fe=$@'
 
  Why not set CC_E (or whatever) to -E, -ppl and combine it with
  the cc_o function we already have? Something like this:
 
  for gcc (default):
  CC_E='-E'
  for that strange beast:
  CC_E='-ppl'
 
  -check_cmd $cc $CPPFLAGS $CFLAGS $@ -E -o $TMPO $TMPC
  +check_cmd $cc $CPPFLAGS $CFLAGS $@ $CC_E $(cc_o $TMPO) $TMPC

 Because there are even stranger beasts in the compiler jungle.  For
 example, the IAR compiler needs CC_E='--preprocess=n $@' (note the lack
 of -o).  It's probably not the only one.

 CC_E='--preprocess=n'
 _cc_o='$@'

 Why would something like that not work?

 That would break normal compilation, because *then* it wants -o.

How do you intend to use this with MSVC? See:
http://msdn.microsoft.com/en-us/library/8z9z0bx6(v=vs.71).aspx
and
http://msdn.microsoft.com/en-us/library/3xkfswhy(v=vs.71)

Note how the second writes to stdout (leading to HUUGE config.log
files), and the first unfortunately does not allow specifying an
output file, instead writing to `pwd`/inputfilebasename.i. Using -P is
probably useless unless it can be combined with a mv, but maybe -E can
be redirected if our configure script supports that?

Ronald
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel


<    1   2   3   4   5   6   7   8   9   10   >