Re: [libav-devel] [PATCH 03/15] lavr: x86: optimized 2-channel s16p to flt conversion
Hi, On Sat, Jul 28, 2012 at 4:57 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 49 libavresample/x86/audio_convert_init.c |9 ++ 2 files changed, 58 insertions(+), 0 deletions(-) OK. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/3] avconv: expand AVRational literals.
From: Ronald S. Bultje rsbul...@gmail.com This way, the code looks less like spaghetti, and is easier to parse for external preprocessors. --- avconv.c | 44 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/avconv.c b/avconv.c index 439672a..fcf2b69 100644 --- a/avconv.c +++ b/avconv.c @@ -797,14 +797,16 @@ static int configure_input_video_filter(FilterGraph *fg, InputFilter *ifilter, AVFilterContext *first_filter = in-filter_ctx; AVFilter *filter = avfilter_get_by_name(buffer); InputStream *ist = ifilter-ist; -AVRational tb = ist-framerate.num ? (AVRational){ist-framerate.den, - ist-framerate.num} : - ist-st-time_base; +AVRational tb = ist-st-time_base; AVRational sar; char args[255], name[255]; int pad_idx = in-pad_idx; int ret; +if (ist-framerate.num) { +tb.num = ist-framerate.den; +tb.den = ist-framerate.num; +} sar = ist-st-sample_aspect_ratio.num ? ist-st-sample_aspect_ratio : ist-st-codec-sample_aspect_ratio; @@ -2196,11 +2198,10 @@ static int output_packet(InputStream *ist, const AVPacket *pkt) ret = decode_video(ist, avpkt, got_output); if (avpkt.duration) ist-next_dts += av_rescale_q(avpkt.duration, ist-st-time_base, AV_TIME_BASE_Q); -else if (ist-st-r_frame_rate.num) -ist-next_dts += av_rescale_q(1, (AVRational){ist-st-r_frame_rate.den, - ist-st-r_frame_rate.num}, - AV_TIME_BASE_Q); -else if (ist-st-codec-time_base.num != 0) { +else if (ist-st-r_frame_rate.num) { +AVRational ifps = { ist-st-r_frame_rate.den, ist-st-r_frame_rate.num }; +ist-next_dts += av_rescale_q(1, ifps, AV_TIME_BASE_Q); +} else if (ist-st-codec-time_base.num != 0) { int ticks = ist-st-parser ? 
ist-st-parser-repeat_pict + 1 : ist-st-codec-ticks_per_frame; ist-next_dts += av_rescale_q(ticks, ist-st-codec-time_base, AV_TIME_BASE_Q); @@ -2479,11 +2480,14 @@ static int transcode_init(void) codec-height = icodec-height; codec-has_b_frames = icodec-has_b_frames; if (!codec-sample_aspect_ratio.num) { -codec-sample_aspect_ratio = -ost-st-sample_aspect_ratio = -ist-st-sample_aspect_ratio.num ? ist-st-sample_aspect_ratio : -ist-st-codec-sample_aspect_ratio.num ? -ist-st-codec-sample_aspect_ratio : (AVRational){0, 1}; +if (ist-st-sample_aspect_ratio.num) { +codec-sample_aspect_ratio = ist-st-sample_aspect_ratio; +} else if (ist-st-codec-sample_aspect_ratio.num) { +codec-sample_aspect_ratio = ist-st-codec-sample_aspect_ratio; +} else { +codec-sample_aspect_ratio = (AVRational) { 0, 1 }; +} +ost-st-sample_aspect_ratio = codec-sample_aspect_ratio; } break; case AVMEDIA_TYPE_SUBTITLE: @@ -2526,7 +2530,11 @@ static int transcode_init(void) (video_sync_method == VSYNC_CFR || (video_sync_method == VSYNC_AUTO !(oc-oformat-flags (AVFMT_NOTIMESTAMPS | AVFMT_VARIABLE_FPS) { -ost-frame_rate = ist-st-r_frame_rate.num ? ist-st-r_frame_rate : (AVRational){25, 1}; +if (ist-st-r_frame_rate.num) { +ost-frame_rate = ist-st-r_frame_rate; +} else { +ost-frame_rate = (AVRational) { 25, 1 }; +} if (ost-enc ost-enc-supported_framerates !ost-force_fps) { int idx = av_find_nearest_q_idx(ost-frame_rate, ost-enc-supported_framerates); ost-frame_rate = ost-enc-supported_framerates[idx]; @@ -4095,9 +4103,13 @@ static int copy_chapters(InputFile *ifile, OutputFile *ofile, int copy_metadata) AVChapter *in_ch = is-chapters[i], *out_ch; int64_t ts_off = av_rescale_q(ofile-start_time - ifile-ts_offset, AV_TIME_BASE_Q, in_ch-time_base); -int64_t rt = (ofile-recording_time == INT64_MAX) ? 
INT64_MAX : - av_rescale_q(ofile-recording_time, AV_TIME_BASE_Q, in_ch-time_base); +int64_t rt; +if (ofile-recording_time == INT64_MAX) { +rt = INT64_MAX; +} else { +rt = av_rescale_q(ofile-recording_time, AV_TIME_BASE_Q
[libav-devel] [PATCH 2/3] lavf/utils.c: expand AVRational literals.
From: Ronald S. Bultje rsbul...@gmail.com This way, the code looks less like spaghetti, and is easier to parse for external preprocessors. --- libavformat/utils.c |9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libavformat/utils.c b/libavformat/utils.c index 4ec70b7..a78b9e7 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c @@ -2205,8 +2205,13 @@ static void compute_chapters_end(AVFormatContext *s) for (i = 0; i s-nb_chapters; i++) if (s-chapters[i]-end == AV_NOPTS_VALUE) { AVChapter *ch = s-chapters[i]; -int64_t end = max_time ? av_rescale_q(max_time, AV_TIME_BASE_Q, ch-time_base) - : INT64_MAX; +int64_t end; + +if (max_time) { +end = av_rescale_q(max_time, AV_TIME_BASE_Q, ch-time_base); +} else { +end = INT64_MAX; +} for (j = 0; j s-nb_chapters; j++) { AVChapter *ch1 = s-chapters[j]; -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 3/3] lavfi: expand AVRational literals.
From: Ronald S. Bultje rsbul...@gmail.com This way, the code looks less like spaghetti, and is easier to parse for external preprocessors. --- libavfilter/avfilter.c | 10 +++--- libavfilter/vsrc_testsrc.c |7 +-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c index d302264..0df26d4 100644 --- a/libavfilter/avfilter.c +++ b/libavfilter/avfilter.c @@ -170,9 +170,13 @@ int avfilter_config_links(AVFilterContext *filter) return ret; } -if (link-time_base.num == 0 link-time_base.den == 0) -link-time_base = link-src link-src-nb_inputs ? -link-src-inputs[0]-time_base : AV_TIME_BASE_Q; +if (link-time_base.num == 0 link-time_base.den == 0) { +if (link-src link-src-nb_inputs) { +link-time_base = link-src-inputs[0]-time_base; +} else { +link-time_base = AV_TIME_BASE_Q; +} +} if (link-type == AVMEDIA_TYPE_VIDEO) { if (!link-sample_aspect_ratio.num !link-sample_aspect_ratio.den) diff --git a/libavfilter/vsrc_testsrc.c b/libavfilter/vsrc_testsrc.c index 42cd58e..12d4985 100644 --- a/libavfilter/vsrc_testsrc.c +++ b/libavfilter/vsrc_testsrc.c @@ -102,8 +102,11 @@ static av_cold int init_common(AVFilterContext *ctx, const char *args) test-time_base.num = frame_rate_q.den; test-time_base.den = frame_rate_q.num; -test-max_pts = duration = 0 ? -av_rescale_q(duration, AV_TIME_BASE_Q, test-time_base) : -1; +if (duration = 0) { +test-max_pts = av_rescale_q(duration, AV_TIME_BASE_Q, test-time_base); +} else { +test-max_pts = -1; +} test-nb_frame = 0; test-pts = 0; -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] h264_ps: declare array of colorspace strings on its own line.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/h264_ps.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libavcodec/h264_ps.c b/libavcodec/h264_ps.c index 3f53af8..7d9d596 100644 --- a/libavcodec/h264_ps.c +++ b/libavcodec/h264_ps.c @@ -431,6 +431,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ sps-sar.den= 1; if(s-avctx-debugFF_DEBUG_PICT_INFO){ +static const char csp[4][5] = { Gray, 420, 422, 444 }; av_log(h-s.avctx, AV_LOG_DEBUG, sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n, sps_id, sps-profile_idc, sps-level_idc, sps-poc_type, @@ -441,7 +442,7 @@ int ff_h264_decode_seq_parameter_set(H264Context *h){ sps-crop_left, sps-crop_right, sps-crop_top, sps-crop_bottom, sps-vui_parameters_present_flag ? VUI : , - ((const char*[]){Gray,420,422,444})[sps-chroma_format_idc], + csp[sps-chroma_format_idc], sps-timing_info_present_flag ? sps-num_units_in_tick : 0, sps-timing_info_present_flag ? sps-time_scale : 0 ); -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] vp3: don't compile mmx IDCT functions on x86-64.
Hi, On Thu, Jul 26, 2012 at 11:40 PM, Luca Barbato lu_z...@gentoo.org wrote: On 07/27/2012 07:16 AM, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com 64-bit CPUs always have SSE2, and a SSE2 version exists, thus the MMX version will never be used. --- libavcodec/x86/vp3dsp.asm|3 +++ libavcodec/x86/vp3dsp_init.c |2 ++ 2 files changed, 5 insertions(+) Fine for me as well, somebody might want to run those by selecting the cpu flags directly. Unlikely, that's only for testing. Since (see commit msg) this never happens in reality, testing for it seems kind of pointless. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/3] dsputil: x86: convert PMINSD, PMAXSD, and CLIPD macros to use cpuflags
Hi, On Fri, Jul 27, 2012 at 8:54 AM, Justin Ruggles justin.rugg...@gmail.com wrote: On 07/21/2012 05:39 PM, Justin Ruggles wrote: --- Updated patch to allow float vs. dword min/max as a parameter to CLIPD instead of using 2 separate macros. libavcodec/x86/dsputil_mmx.c|6 ++-- libavcodec/x86/dsputil_yasm.asm | 66 +++ libavutil/x86/x86util.asm | 34 3 files changed, 56 insertions(+), 50 deletions(-) ping. OK. Can you replace swscale.asm in sws around line 375 with this also? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/3] dsputil: x86: convert some of the SPLATD macros to use cpuflags
Hi, On Sat, Jul 21, 2012 at 2:39 PM, Justin Ruggles justin.rugg...@gmail.com wrote: %macro VECTOR_CLIP_INT32 2 cglobal vector_clip_int32, 5,5,11, dst, src, min, max, len +SPLATD_LOW m4, minm +SPLATD_LOW m5, maxm %if notcpuflag(sse4) cpuflag(sse2) notcpuflag(atom) -cvtsi2ss m4, minm -cvtsi2ss m5, maxm +cvtdq2ps m4, m4 +cvtdq2ps m5, m5 %assign is_float 1 %else Doesn't this add an instruction? There's only one user left of SPLATD (sws), isn't it easier to rewrite that to use this also, and remove SPLATD and rename this to SPLATD again? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] x86/dsputilenc: bury inline asm under HAVE_INLINE_ASM.
Hi, On Thu, Jul 26, 2012 at 6:42 AM, Måns Rullgård m...@mansr.com wrote: Ronald S. Bultje rsbul...@gmail.com writes: Hi, On Thu, Jul 26, 2012 at 2:23 AM, Måns Rullgård m...@mansr.com wrote: Ronald S. Bultje rsbul...@gmail.com writes: From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/dct-test.c |2 +- libavcodec/x86/dsputilenc_mmx.c | 80 +++ libavcodec/x86/fdct_mmx.c |4 ++ libavcodec/x86/motion_est_mmx.c |6 +++ libavcodec/x86/mpegvideo_mmx.c |6 +++ 5 files changed, 64 insertions(+), 34 deletions(-) diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c index 5046544..9e19e0c 100644 --- a/libavcodec/dct-test.c +++ b/libavcodec/dct-test.c @@ -85,7 +85,7 @@ static const struct algo fdct_tab[] = { { IJG-AAN-INT,ff_fdct_ifast, SCALE_PERM }, { IJG-LLM-INT,ff_jpeg_fdct_islow_8, NO_PERM}, -#if HAVE_MMX +#if HAVE_MMX HAVE_INLINE_ASM { MMX,ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX }, This is just as wrong now as it was the first time. Why? Same reason. What do you suggest instead? It's probably quicker if I just show you. I'm still not seeing it. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.
Hi, On Thu, Jul 26, 2012 at 9:46 AM, Ronald S. Bultje rsbul...@gmail.com wrote: On Thu, Jul 26, 2012 at 9:05 AM, Måns Rullgård m...@mansr.com wrote: Ronald S. Bultje rsbul...@gmail.com writes: On Thu, Jul 26, 2012 at 7:30 AM, Martin Storsjö mar...@martin.st wrote: On Thu, 26 Jul 2012, Ronald S. Bultje wrote: On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote: On 07/26/2012 04:27 AM, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com --- libswscale/swscale.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Ok. No, not OK. This is just a repackaged piece of another patch that has review questions that were never answered. Until those questions are settled, this cannot go in. I've looked at all emails in: http://comments.gmane.org/gmane.comp.video.libav.devel/28861 including yours: http://permalink.gmane.org/gmane.comp.video.libav.devel/28871 and Mans': http://permalink.gmane.org/gmane.comp.video.libav.devel/28863 My original mail has the fence part in it (simply ctrl-F in your browser), and neither you nor Mans respond to that particular section. So I'm lost now. What is the specific comment you want me to respond to? http://article.gmane.org/gmane.comp.video.libav.devel/30834 If someone feels like rewriting swscale, I'm all supportive of that effort. For now, sws uses movntq in its inline assembly mmx/3dnow optimizations and we'll have to deal with it until someone changes it not to do that. Doing it in generic code is silly because in practice there is never any advantage to doing movntq. Thus, we should discourage its use. Adding generic versions of sfence does not contribute to that. The whole goal - back when I worked on sws - was to kill all these old mmx/3dnow optimizations and replace with modern sse2/avx, which would mean we don't need a call to sfence anymore anyways. 
I'm still missing an explanation of why sfence is needed here other than movntq somehow being involved. My understanding is that if you use movntq and not sfence, the data may not be in the destination memory pointer by the time swScale() returns. But I didn't write this code. Ping. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] x86inc: clip num_args to 7 on x86-32.
From: Loren Merritt lor...@u.washington.edu This allows us to unconditionally set the cglobal num_args parameter to a bigger value, thus making writing yasm code even easier than before. Signed-off-by: Ronald S. Bultje rsbul...@gmail.com --- libavutil/x86/x86inc.asm |3 +++ 1 file changed, 3 insertions(+) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index b76a10c..dd441b2 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -451,6 +451,9 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... %assign num_args %1 %assign regs_used %2 +%if num_args 7 +%assign num_args 7 +%endif %if regs_used 7 %assign regs_used 7 %endif -- 1.7.9.2 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.
From: Ronald S. Bultje rsbul...@gmail.com This completes the conversion of h264dsp to yasm; note that h264 also uses some dsputil functions, most notably qpel. Performance-wise, the yasm-version is ~10 cycles faster (182-172) on x86-64, and ~8 cycles faster (201-193) on x86-32. --- libavcodec/x86/h264_deblock.asm | 168 +++ libavcodec/x86/h264dsp_mmx.c| 162 ++--- 2 files changed, 175 insertions(+), 155 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 1982dc4..77b25d2 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -27,6 +27,10 @@ %include x86inc.asm %include x86util.asm +SECTION_RODATA + +pb_3_1: times 4 db 3, 1 + SECTION .text cextern pb_0 @@ -911,3 +915,167 @@ ff_chroma_intra_body_mmxext: paddb m1, m5 paddb m2, m6 ret + +;- +; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40], +;int8_t ref[2][40], int16_t mv[2][40][2], +;int bidir,int edges,int step, +;int mask_mv0, int mask_mv1, int field); +; +; bidiris 0 or 1 +; edgesis 1 or 4 +; step is 1 or 2 +; mask_mv0 is 0 or 3 +; mask_mv1 is 0 or 1 +; fieldis 0 or 1 +;- +%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv, +; dir, d_idx, mask_dir, bidir +%define edgesd%1 +%define stepd %2 +%define mask_mvd %3 +%define dir %4 +%define d_idx %5 +%define mask_dir %6 +%define bidir %7 +xor b_idxd, b_idxd ; for (b_idx = 0; b_idx edges; b_idx += step) +.b_idx_loop_ %+ dir %+ _ %+ bidir: +%if mask_dir == 0 +pxor m0, m0 +%endif +test b_idxd, dword mask_mvd +jnz .skip_loop_iter_ %+ dir %+ _ %+ bidir ; if (!(b_idx mask_mv)) +%if bidir == 1 +movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] } +punpckldqm2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] } +pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] } +pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] } +pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] } +psubbm0, m2 ; { ref0[b] != ref0[bn], +; ref0[b] != ref1[bn] } +psubbm1, m3 ; { ref1[b] != ref1[bn], +; ref1[b] 
!= ref0[bn] } + +por m0, m1 +mova m1, [mvq+b_idxq*4+(d_idx+12)*4] +mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] +mova m3, m1 +mova m4, m2 +psubwm1, [mvq+b_idxq*4+12*4] +psubwm2, [mvq+b_idxq*4+12*4+mmsize] +psubwm3, [mvq+b_idxq*4+52*4] +psubwm4, [mvq+b_idxq*4+52*4+mmsize] +packsswb m1, m2 +packsswb m3, m4 +paddbm1, m6 +paddbm3, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +psubusb m3, m5 +packsswb m1, m3 + +por m0, m1 +mova m1, [mvq+b_idxq*4+(d_idx+52)*4] +mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize] +mova m3, m1 +mova m4, m2 +psubwm1, [mvq+b_idxq*4+12*4] +psubwm2, [mvq+b_idxq*4+12*4+mmsize] +psubwm3, [mvq+b_idxq*4+52*4] +psubwm4, [mvq+b_idxq*4+52*4+mmsize] +packsswb m1, m2 +packsswb m3, m4 +paddbm1, m6 +paddbm3, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +psubusb m3, m5 +packsswb m1, m3 + +pshufw m1, m1, 0x4E +por m0, m1 +pshufw m1, m0, 0x4E +pminub m0, m1 +%else ; bidir == 0 +movd m0, [refq+b_idxq+12] +psubbm0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn] + +mova m1, [mvq+b_idxq*4+12*4] +mova m2, [mvq+b_idxq*4+12*4+mmsize] +psubwm1, [mvq+b_idxq*4+(d_idx+12)*4] +psubwm2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] +packsswb m1, m2 +paddbm1, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +packsswb m1, m1 +por m0, m1 +%endif ; bidir == 1/0 + +.skip_loop_iter_ %+ dir %+ _ %+ bidir: +movd m1, [nnzq+b_idxq+12] +por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn] + +pminub m1, m7 +pminub m0, m7 +psllwm1, 1 +pxor m2, m2 +pmaxub
Re: [libav-devel] [PATCH] proresdsp: port x86 assembly to cpuflags.
Hi, On Fri, Jul 27, 2012 at 11:39 AM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 08:38:27PM -0700, Ronald S. Bultje wrote: --- a/libavcodec/x86/proresdsp.asm +++ b/libavcodec/x86/proresdsp.asm @@ -406,27 +405,25 @@ cglobal prores_idct_put_10_%1, 4, 4, %2 -INIT_XMM -%define SIGNEXTEND signextend_sse2 -idct_put_fn sse2, 16 -INIT_XMM -%define SIGNEXTEND signextend_sse4 -idct_put_fn sse4, 16 -INIT_AVX -idct_put_fn avx, 16 +INIT_XMM sse2 +idct_put_fn 16 +INIT_XMM sse4 +idct_put_fn 16 +INIT_XMM avx +idct_put_fn 16 What's with INIT_XMM avx vs. INIT_AVX ? Patch does LGTM otherwise. See x86inc.asm, INIT_AVX is the deprecated method, INIT_XMM avx is the correct method. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/h264_deblock.asm | 126 +++-- libavcodec/x86/h264_deblock_10bit.asm | 77 ++-- libavcodec/x86/h264dsp_mmx.c | 60 3 files changed, 141 insertions(+), 122 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 1982dc4..76a458b 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -282,8 +282,8 @@ cextern pb_A1 ;- ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -%macro DEBLOCK_LUMA 1 -cglobal deblock_v_luma_8_%1, 5,5,10 +%macro DEBLOCK_V_LUMA 0 +cglobal deblock_v_luma_8, 5,5,10 movdm8, [r4] ; tc0 lea r4, [r1*3] dec r2d; alpha-1 @@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10 mova[r4+2*r1], m1 mova[r0], m2 RET +%endmacro ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 5,9 +%macro DEBLOCK_H_LUMA 0 +cglobal deblock_h_luma_8, 5,9 movsxd r7, r1d lear8, [r7+r7*2] lear6, [r0-4] @@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9 %if WIN64 mov[rsp+0x20], r4 %endif -call deblock_v_luma_8_%1 +call deblock_v_luma_8 ; transpose 16x4 - original space (only the middle 4 rows were changed by the filter) addr6, 2 @@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9 RET %endmacro -INIT_XMM -DEBLOCK_LUMA sse2 -INIT_AVX -DEBLOCK_LUMA avx +INIT_XMM sse2 +DEBLOCK_V_LUMA +INIT_MMX sse2 +DEBLOCK_H_LUMA + +INIT_XMM avx +DEBLOCK_V_LUMA +INIT_MMX avx +DEBLOCK_H_LUMA %else -%macro DEBLOCK_LUMA 3 +%macro DEBLOCK_V_LUMA 2 ;- ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -cglobal deblock_%2_luma_8_%1, 5,5 +cglobal deblock_%1_luma_8, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 dec r3 ; beta-1 add r4, r0 ; pix-3*stride -%assign pad 2*%3+12-(stack_offset15) +%assign pad 2*%2+12-(stack_offset15) SUB esp, pad movam0, [r4+r1] ; p1 @@ -415,7 +421,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 movdm4, [r3] ; 
tc0 punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] -mova [esp+%3], m4 ; tc +mova [esp+%2], m4 ; tc pcmpgtb m4, m3 movam3, [r4] ; p2 pandm4, m7 @@ -423,7 +429,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| beta-1 pandm6, m4 -pandm4, [esp+%3] ; tc +pandm4, [esp+%2] ; tc psubb m7, m4, m6 pandm6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -431,7 +437,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 movam4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| beta-1 pandm6, [esp] ; mask -movam5, [esp+%3] ; tc +movam5, [esp+%2] ; tc psubb m7, m6 pandm5, m6 movam3, [r0+r1] @@ -442,12 +448,13 @@ cglobal deblock_%2_luma_8_%1, 5,5 mova[r0], m2 ADD esp, pad RET +%endmacro ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 0,5 +%macro DEBLOCK_H_LUMA 1 +cglobal deblock_h_luma_8, 0,5 movr0, r0mp movr3, r1m lear4, [r3*3] @@ -470,11 +477,11 @@ cglobal deblock_h_luma_8_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 -call deblock_%2_luma_8_%1 -%ifidn %2, v8 +call deblock_%1_luma_8 +%ifidn %1, v8 adddword [esp ], 8 ; pix_tmp+0x38 adddword [esp+16], 2 ; tc0+2 -call deblock_%2_luma_8_%1 +call deblock_%1_luma_8 %endif ADDesp, 20 @@ -501,12 +508,17 @@ cglobal deblock_h_luma_8_%1, 0,5 RET %endmacro ; DEBLOCK_LUMA -INIT_MMX -DEBLOCK_LUMA mmxext, v8, 8 -INIT_XMM -DEBLOCK_LUMA sse2, v, 16 -INIT_AVX -DEBLOCK_LUMA avx, v, 16 +INIT_MMX mmx2 +DEBLOCK_V_LUMA v8, 8 +DEBLOCK_H_LUMA v8 +INIT_XMM sse2 +DEBLOCK_V_LUMA v, 16 +INIT_MMX sse2 +DEBLOCK_H_LUMA v +INIT_XMM avx +DEBLOCK_V_LUMA v, 16 +INIT_MMX avx +DEBLOCK_H_LUMA v %endif ; ARCH @@ -608,7 +620,7 @@ DEBLOCK_LUMA avx, v, 16 %define mask1p mask1q %endmacro
[libav-devel] [PATCH] vp3: port x86 SIMD to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/vp3dsp.asm | 94 ++--- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index af2f60c..5877520 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -102,8 +102,8 @@ SECTION .text mov [r0+r3 -1], r2w %endmacro -INIT_MMX -cglobal vp3_v_loop_filter_mmx2, 3, 4 +INIT_MMX mmx2 +cglobal vp3_v_loop_filter, 3, 4 %if ARCH_X86_64 movsxdr1, r1d %endif @@ -120,7 +120,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4 movq [r0 ], m3 RET -cglobal vp3_h_loop_filter_mmx2, 3, 4 +cglobal vp3_h_loop_filter, 3, 4 %if ARCH_X86_64 movsxdr1, r1d %endif @@ -354,38 +354,6 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 movqI(2), m2 %endmacro -%macro VP3_IDCT_mmx 1 -; eax = quantized input -; ebx = dequantizer matrix -; ecx = IDCT constants -; M(I) = ecx + MaskOffset(0) + I * 8 -; C(I) = ecx + CosineOffset(32) + (I-1) * 8 -; edx = output -; r0..r7 = mm0..mm7 -%define OC_8 [pw_8] -%define C(x) [vp3_idct_data+16*(x-1)] - -; at this point, function has completed dequantization + dezigzag + -; partial transposition; now do the idct itself -%define I(x) [%1+16* x ] -%define J(x) [%1+16*(x-4)+8] -RowIDCT -Transpose - -%define I(x) [%1+16* x +64] -%define J(x) [%1+16*(x-4)+72] -RowIDCT -Transpose - -%define I(x) [%1+16*x] -%define J(x) [%1+16*x] -ColumnIDCT - -%define I(x) [%1+16*x+8] -%define J(x) [%1+16*x+8] -ColumnIDCT -%endmacro - %macro VP3_1D_IDCT_SSE2 0 movdqam2, I(3) ; xmm2 = i3 movdqam6, C(3) ; xmm6 = c3 @@ -501,7 +469,8 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 movdqa O(7), m%8 %endmacro -%macro VP3_IDCT_sse2 1 +%macro VP3_IDCT 1 +%if mmsize == 16 %define I(x) [%1+16*x] %define O(x) [%1+16*x] %define C(x) [vp3_idct_data+16*(x-1)] @@ -519,11 +488,42 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 %define ADD(x) paddsw x, [pw_8] VP3_1D_IDCT_SSE2 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 +%else ; mmsize == 8 +; eax = quantized input +; ebx = dequantizer matrix +; ecx = IDCT 
constants +; M(I) = ecx + MaskOffset(0) + I * 8 +; C(I) = ecx + CosineOffset(32) + (I-1) * 8 +; edx = output +; r0..r7 = mm0..mm7 +%define OC_8 [pw_8] +%define C(x) [vp3_idct_data+16*(x-1)] + +; at this point, function has completed dequantization + dezigzag + +; partial transposition; now do the idct itself +%define I(x) [%1+16* x ] +%define J(x) [%1+16*(x-4)+8] +RowIDCT +Transpose + +%define I(x) [%1+16* x +64] +%define J(x) [%1+16*(x-4)+72] +RowIDCT +Transpose + +%define I(x) [%1+16*x] +%define J(x) [%1+16*x] +ColumnIDCT + +%define I(x) [%1+16*x+8] +%define J(x) [%1+16*x+8] +ColumnIDCT +%endif ; mmsize == 16/8 %endmacro -%macro vp3_idct_funcs 1 -cglobal vp3_idct_put_%1, 3, 4, 9 -VP3_IDCT_%1 r2 +%macro vp3_idct_funcs 0 +cglobal vp3_idct_put, 3, 4, 9 +VP3_IDCT r2 movsxdifnidn r1, r1d mova m4, [pb_80] @@ -565,8 +565,8 @@ cglobal vp3_idct_put_%1, 3, 4, 9 %endrep RET -cglobal vp3_idct_add_%1, 3, 4, 9 -VP3_IDCT_%1 r2 +cglobal vp3_idct_add, 3, 4, 9 +VP3_IDCT r2 mov r3, 4 pxor m4, m4 @@ -607,10 +607,10 @@ cglobal vp3_idct_add_%1, 3, 4, 9 RET %endmacro -INIT_MMX -vp3_idct_funcs mmx -INIT_XMM -vp3_idct_funcs sse2 +INIT_MMX mmx +vp3_idct_funcs +INIT_XMM sse2 +vp3_idct_funcs %macro DC_ADD 0 movq m2, [r0 ] @@ -631,8 +631,8 @@ vp3_idct_funcs sse2 movq [r0+r3 ], m5 %endmacro -INIT_MMX -cglobal vp3_idct_dc_add_mmx2, 3, 4 +INIT_MMX mmx2 +cglobal vp3_idct_dc_add, 3, 4 %if ARCH_X86_64 movsxdr1, r1d %endif -- 1.7.9.2 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264_idct_10bit: port x86 assembly to cpuflags.
Hi, On Fri, Jul 27, 2012 at 2:49 PM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 08:54:30PM -0700, Ronald S. Bultje wrote: --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -72,25 +72,25 @@ SECTION .text ;;; NO FATE SAMPLES TRIGGER THIS -%macro ADD4x4IDCT 1 -add4x4_idct_%1: +%macro ADD4x4IDCT 0 +add4x4_idct_ %+ SUFFIX: add r5, r0 @@ -107,28 +107,28 @@ add4x4_idct_%1: %macro ADD16_OP 3 cmp byte [r4+%3], 0 jz .skipblock%2 mov r5d, [r1+%2*4] -call add4x4_idct_%1 +call add4x4_idct_ %+ SUFFIX You don't need this SUFFIX mangling, same below. We're not using cglobal for the called labels (they're not functions), so function_defined is not set, so call suffix completion doesn't work. Thus, yes, we need the SUFFIX (without _, fixed locally) or else it'll simply not compile. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.
Hi, On Fri, Jul 27, 2012 at 4:45 PM, Diego Biurrun di...@biurrun.de wrote: On Fri, Jul 27, 2012 at 03:08:26PM -0700, Ronald S. Bultje wrote: --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -282,8 +282,8 @@ cextern pb_A1 ;- ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -%macro DEBLOCK_LUMA 1 -cglobal deblock_v_luma_8_%1, 5,5,10 +%macro DEBLOCK_V_LUMA 0 +cglobal deblock_v_luma_8, 5,5,10 movdm8, [r4] ; tc0 lea r4, [r1*3] dec r2d; alpha-1 @@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10 mova[r4+2*r1], m1 mova[r0], m2 RET +%endmacro ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 5,9 +%macro DEBLOCK_H_LUMA 0 +cglobal deblock_h_luma_8, 5,9 movsxd r7, r1d lear8, [r7+r7*2] lear6, [r0-4] @@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9 %if WIN64 mov[rsp+0x20], r4 %endif -call deblock_v_luma_8_%1 +call deblock_v_luma_8 ; transpose 16x4 - original space (only the middle 4 rows were changed by the filter) addr6, 2 @@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9 RET %endmacro -INIT_XMM -DEBLOCK_LUMA sse2 -INIT_AVX -DEBLOCK_LUMA avx +INIT_XMM sse2 +DEBLOCK_V_LUMA +INIT_MMX sse2 +DEBLOCK_H_LUMA + +INIT_XMM avx +DEBLOCK_V_LUMA +INIT_MMX avx +DEBLOCK_H_LUMA I would suggest that you move the DEBLOCK_V_LUMA macro invocations directly below that macro. This is what we do everywhere. Not seeing the invocations directly below the definition is confusing. Same below for the parameterized variants of the macros. That actually has code cache implications. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.
Hi, On Fri, Jul 27, 2012 at 5:04 PM, Diego Biurrun di...@biurrun.de wrote: On Fri, Jul 27, 2012 at 04:49:18PM -0700, Ronald S. Bultje wrote: On Fri, Jul 27, 2012 at 4:45 PM, Diego Biurrun di...@biurrun.de wrote: On Fri, Jul 27, 2012 at 03:08:26PM -0700, Ronald S. Bultje wrote: --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -282,8 +282,8 @@ cextern pb_A1 ;- ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -%macro DEBLOCK_LUMA 1 -cglobal deblock_v_luma_8_%1, 5,5,10 +%macro DEBLOCK_V_LUMA 0 +cglobal deblock_v_luma_8, 5,5,10 movdm8, [r4] ; tc0 lea r4, [r1*3] dec r2d; alpha-1 @@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10 mova[r4+2*r1], m1 mova[r0], m2 RET +%endmacro ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 5,9 +%macro DEBLOCK_H_LUMA 0 +cglobal deblock_h_luma_8, 5,9 movsxd r7, r1d lear8, [r7+r7*2] lear6, [r0-4] @@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9 %if WIN64 mov[rsp+0x20], r4 %endif -call deblock_v_luma_8_%1 +call deblock_v_luma_8 ; transpose 16x4 - original space (only the middle 4 rows were changed by the filter) addr6, 2 @@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9 RET %endmacro -INIT_XMM -DEBLOCK_LUMA sse2 -INIT_AVX -DEBLOCK_LUMA avx +INIT_XMM sse2 +DEBLOCK_V_LUMA +INIT_MMX sse2 +DEBLOCK_H_LUMA + +INIT_XMM avx +DEBLOCK_V_LUMA +INIT_MMX avx +DEBLOCK_H_LUMA I would suggest that you move the DEBLOCK_V_LUMA macro invocations directly below that macro. This is what we do everywhere. Not seeing the invocations directly below the definition is confusing. Same below for the parameterized variants of the macros. That actually has code cache implications. OK, patch fine with me then. One last question: Why did you split the macros into H/V variants? I didn't see Loren's INIT_MMX cpuname suggestion. I can revert that part back. 
Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/h264_deblock.asm | 104 - libavcodec/x86/h264_deblock_10bit.asm | 77 libavcodec/x86/h264dsp_mmx.c | 60 +-- 3 files changed, 120 insertions(+), 121 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 1982dc4..0891ef3 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -282,8 +282,8 @@ cextern pb_A1 ;- ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -%macro DEBLOCK_LUMA 1 -cglobal deblock_v_luma_8_%1, 5,5,10 +%macro DEBLOCK_LUMA 0 +cglobal deblock_v_luma_8, 5,5,10 movdm8, [r4] ; tc0 lea r4, [r1*3] dec r2d; alpha-1 @@ -327,8 +327,8 @@ cglobal deblock_v_luma_8_%1, 5,5,10 ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 5,9 +INIT_MMX cpuname +cglobal deblock_h_luma_8, 5,9 movsxd r7, r1d lear8, [r7+r7*2] lear6, [r0-4] @@ -355,7 +355,7 @@ cglobal deblock_h_luma_8_%1, 5,9 %if WIN64 mov[rsp+0x20], r4 %endif -call deblock_v_luma_8_%1 +call deblock_v_luma_8 ; transpose 16x4 - original space (only the middle 4 rows were changed by the filter) addr6, 2 @@ -384,24 +384,24 @@ cglobal deblock_h_luma_8_%1, 5,9 RET %endmacro -INIT_XMM -DEBLOCK_LUMA sse2 -INIT_AVX -DEBLOCK_LUMA avx +INIT_XMM sse2 +DEBLOCK_LUMA +INIT_XMM avx +DEBLOCK_LUMA %else -%macro DEBLOCK_LUMA 3 +%macro DEBLOCK_LUMA 2 ;- ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -cglobal deblock_%2_luma_8_%1, 5,5 +cglobal deblock_%1_luma_8, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 dec r3 ; beta-1 add r4, r0 ; pix-3*stride -%assign pad 2*%3+12-(stack_offset15) +%assign pad 2*%2+12-(stack_offset15) SUB esp, pad movam0, [r4+r1] ; p1 @@ -415,7 +415,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 movdm4, [r3] ; tc0 punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] -mova [esp+%3], m4 ; tc +mova [esp+%2], m4 ; 
tc pcmpgtb m4, m3 movam3, [r4] ; p2 pandm4, m7 @@ -423,7 +423,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| beta-1 pandm6, m4 -pandm4, [esp+%3] ; tc +pandm4, [esp+%2] ; tc psubb m7, m4, m6 pandm6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -431,7 +431,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 movam4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| beta-1 pandm6, [esp] ; mask -movam5, [esp+%3] ; tc +movam5, [esp+%2] ; tc psubb m7, m6 pandm5, m6 movam3, [r0+r1] @@ -446,8 +446,8 @@ cglobal deblock_%2_luma_8_%1, 5,5 ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 0,5 +INIT_MMX cpuname +cglobal deblock_h_luma_8, 0,5 movr0, r0mp movr3, r1m lear4, [r3*3] @@ -470,11 +470,11 @@ cglobal deblock_h_luma_8_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 -call deblock_%2_luma_8_%1 -%ifidn %2, v8 +call deblock_%1_luma_8 +%ifidn %1, v8 adddword [esp ], 8 ; pix_tmp+0x38 adddword [esp+16], 2 ; tc0+2 -call deblock_%2_luma_8_%1 +call deblock_%1_luma_8 %endif ADDesp, 20 @@ -501,12 +501,12 @@ cglobal deblock_h_luma_8_%1, 0,5 RET %endmacro ; DEBLOCK_LUMA -INIT_MMX -DEBLOCK_LUMA mmxext, v8, 8 -INIT_XMM -DEBLOCK_LUMA sse2, v, 16 -INIT_AVX -DEBLOCK_LUMA avx, v, 16 +INIT_MMX mmx2 +DEBLOCK_LUMA v8, 8 +INIT_XMM sse2 +DEBLOCK_LUMA v, 16 +INIT_XMM avx +DEBLOCK_LUMA v, 16 %endif ; ARCH @@ -608,7 +608,7 @@ DEBLOCK_LUMA avx, v, 16 %define mask1p mask1q %endmacro -%macro DEBLOCK_LUMA_INTRA 2 +%macro DEBLOCK_LUMA_INTRA 1 %define p1 m0 %define p0 m1 %define q0 m2 @@ -643,7 +643,7 @@ DEBLOCK_LUMA avx, v, 16 ;- ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int
Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.
Hi, On Fri, Jul 27, 2012 at 2:43 PM, Måns Rullgård m...@mansr.com wrote: However, the question still remains why it is in generic code. That's hard to say in hindsight, but it seems it was for simplicity so that you don't have to add it to each individual mmx function, thus making the assumption they would all use movntq. See also (directly under the #endif, just outside the context in this patch) its use of EMMS, even if it only called SSE2 functions and thus the MMX state was never clobbered... Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] msvc: check for snprintf
Hi, On Wed, Jul 25, 2012 at 10:32 PM, Luca Barbato lu_z...@gentoo.org wrote: From: Ronald S. Bultje rsbul...@gmail.com --- Here is my initial twist about it, ideally I'd consider moving os_support in libavu and include it automagically from config.h I'm not sure why, we do similar hacks for pretty much all math functions and a few other string-related functions in lavu already. Why is snprintf() different? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] eval: fix printing of NaN in eval fate test.
Hi, On Wed, Jul 25, 2012 at 11:05 PM, Alex Converse alex.conve...@gmail.com wrote: On Wed, Jul 25, 2012 at 8:42 PM, Ronald S. Bultje rsbul...@gmail.com wrote: From: Ronald S. Bultje rsbul...@gmail.com This fixes make fate-eval on MSVC builds. Without this, the test outputs -1.#NaN instead of nan on MSVS 2010. --- libavutil/eval.c |5 + 1 file changed, 5 insertions(+) diff --git a/libavutil/eval.c b/libavutil/eval.c index ef37ad8..6131263 100644 --- a/libavutil/eval.c +++ b/libavutil/eval.c @@ -671,6 +671,11 @@ int main(int argc, char **argv) av_expr_parse_and_eval(d, *expr, const_names, const_values, NULL, NULL, NULL, NULL, NULL, 0, NULL); +#ifdef _MSC_VER +if (isnan(d)) +printf('%s' - nan\n\n, *expr); +else +#endif printf('%s' - %f\n\n, *expr, d); } Funny, when I proposed this without the MSC ifdef, you were wholly against it. And once again I will state that a conformant libc has the freedom to print [-]nan(n-char-sequence). So why not just drop the ifdef? I'll drop the ifdef. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.
Hi, On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote: On 07/26/2012 04:27 AM, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com --- libswscale/swscale.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Ok. No, not OK. This is just a repackaged piece of another patch that has review questions that were never answered. Until those questions are settled, this cannot go in. And that question is ... ? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] x86/dsputilenc: bury inline asm under HAVE_INLINE_ASM.
Hi, On Thu, Jul 26, 2012 at 2:23 AM, Måns Rullgård m...@mansr.com wrote: Ronald S. Bultje rsbul...@gmail.com writes: From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/dct-test.c |2 +- libavcodec/x86/dsputilenc_mmx.c | 80 +++ libavcodec/x86/fdct_mmx.c |4 ++ libavcodec/x86/motion_est_mmx.c |6 +++ libavcodec/x86/mpegvideo_mmx.c |6 +++ 5 files changed, 64 insertions(+), 34 deletions(-) diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c index 5046544..9e19e0c 100644 --- a/libavcodec/dct-test.c +++ b/libavcodec/dct-test.c @@ -85,7 +85,7 @@ static const struct algo fdct_tab[] = { { IJG-AAN-INT,ff_fdct_ifast, SCALE_PERM }, { IJG-LLM-INT,ff_jpeg_fdct_islow_8, NO_PERM}, -#if HAVE_MMX +#if HAVE_MMX HAVE_INLINE_ASM { MMX,ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX }, This is just as wrong now as it was the first time. Why? What do you suggest instead? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.
Hi, On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote: On 07/26/2012 04:27 AM, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com --- libswscale/swscale.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Ok. No, not OK. This is just a repackaged piece of another patch that has review questions that were never answered. Until those questions are settled, this cannot go in. I've looked at all emails in: http://comments.gmane.org/gmane.comp.video.libav.devel/28861 including yours: http://permalink.gmane.org/gmane.comp.video.libav.devel/28871 and Mans': http://permalink.gmane.org/gmane.comp.video.libav.devel/28863 My original mail has the fence part in it (simply ctrl-F in your browser), and neither you nor Mans respond to that particular section. So I'm lost now. What is the specific comment you want me to respond to? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.
Hi, On Thu, Jul 26, 2012 at 7:30 AM, Martin Storsjö mar...@martin.st wrote: On Thu, 26 Jul 2012, Ronald S. Bultje wrote: Hi, On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote: On 07/26/2012 04:27 AM, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com --- libswscale/swscale.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Ok. No, not OK. This is just a repackaged piece of another patch that has review questions that were never answered. Until those questions are settled, this cannot go in. I've looked at all emails in: http://comments.gmane.org/gmane.comp.video.libav.devel/28861 including yours: http://permalink.gmane.org/gmane.comp.video.libav.devel/28871 and Mans': http://permalink.gmane.org/gmane.comp.video.libav.devel/28863 My original mail has the fence part in it (simply ctrl-F in your browser), and neither you nor Mans respond to that particular section. So I'm lost now. What is the specific comment you want me to respond to? http://article.gmane.org/gmane.comp.video.libav.devel/30834 If someone feels like rewriting swscale, I'm all supportive of that effort. For now, sws uses movntq in its inline assembly mmx/3dnow optimizations and we'll have to deal with it until someone changes it not to do that. Doing it in generic code is silly because in practice there is never any advantage to doing movntq. Thus, we should discourage its use. Adding generic versions of sfence does not contribute to that. The whole goal - back when I worked on sws - was to kill all these old mmx/3dnow optimizations and replace with modern sse2/avx, which would mean we don't need a call to sfence anymore anyways. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.
Hi, On Thu, Jul 26, 2012 at 9:05 AM, Måns Rullgård m...@mansr.com wrote: Ronald S. Bultje rsbul...@gmail.com writes: On Thu, Jul 26, 2012 at 7:30 AM, Martin Storsjö mar...@martin.st wrote: On Thu, 26 Jul 2012, Ronald S. Bultje wrote: On Thu, Jul 26, 2012 at 2:06 AM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 05:10:10AM +0200, Luca Barbato wrote: On 07/26/2012 04:27 AM, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com --- libswscale/swscale.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Ok. No, not OK. This is just a repackaged piece of another patch that has review questions that were never answered. Until those questions are settled, this cannot go in. I've looked at all emails in: http://comments.gmane.org/gmane.comp.video.libav.devel/28861 including yours: http://permalink.gmane.org/gmane.comp.video.libav.devel/28871 and Mans': http://permalink.gmane.org/gmane.comp.video.libav.devel/28863 My original mail has the fence part in it (simply ctrl-F in your browser), and neither you nor Mans respond to that particular section. So I'm lost now. What is the specific comment you want me to respond to? http://article.gmane.org/gmane.comp.video.libav.devel/30834 If someone feels like rewriting swscale, I'm all supportive of that effort. For now, sws uses movntq in its inline assembly mmx/3dnow optimizations and we'll have to deal with it until someone changes it not to do that. Doing it in generic code is silly because in practice there is never any advantage to doing movntq. Thus, we should discourage its use. Adding generic versions of sfence does not contribute to that. The whole goal - back when I worked on sws - was to kill all these old mmx/3dnow optimizations and replace with modern sse2/avx, which would mean we don't need a call to sfence anymore anyways. I'm still missing an explanation of why sfence is needed here other than movntq somehow being involved. 
My understanding is that if you use movntq and not sfence, the data may not be at the destination memory location by the time swScale() returns. But I didn't write this code. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [RFC] split HAVE_MMX/MMX2/SSE/SSE2/SSSE3/AVX for inline asm vs. yasm
Hi guys, discussion thread. We currently use HAVE_SSSE3 and related macros to indicate that we want to compile these and that our compiler tools are good enough to know what to do with it. As a result, we currently use HAVE_AVX around all avx code (yasm only - we don't have any avx inline asm), HAVE_SSSE3 around some yasm and all inline asm code that uses ssse3 instructions, and sometimes HAVE_SSE/2 around inline asm using xmm regs. There is no HAVE_SSE4. HAVE_MMX2 is almost never used but does exist. HAVE_MMX is something entirely different and is used as an alternative form of ARCH_X86. In addition to that, we're using inline asm checks to test whether to enable HAVE_SSSE3 and HAVE_SSE2 (line 2850 of configure). Can we split these macros in something for yasm vs something for inline asm? This means e.g. that we can use ssse3 if yasm (but not inline asm) supports it, if inline asm is lacking, etc. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [RFC] split HAVE_MMX/MMX2/SSE/SSE2/SSSE3/AVX for inline asm vs. yasm
Hi, On Thu, Jul 26, 2012 at 2:39 PM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 01:50:17PM -0700, Ronald S. Bultje wrote: discussion thread. We currently use HAVE_SSSE3 and related macros to indicate that we want to compile these and that our compiler tools are good enough to know what to do with it. As a result, we currently use HAVE_AVX around all avx code (yasm only - we don't have any avx inline asm), HAVE_SSSE3 around some yasm and all inline asm code that uses ssse3 instructions, and sometimes HAVE_SSE/2 around inline asm using xmm regs. There is no HAVE_SSE4. HAVE_MMX2 is almost never used but does exist. Do we need HAVE_SSE4? It should be easy enough to add. HAVE_MMX is something entirely different and is used as an alternative form of ARCH_X86. No, HAVE_MMX is just that. True, it's abused in some places where ARCH_X86 would be better (when invoking init functions), but that is an issue that needs to be addressed at some point. In addition to that, we're using inline asm checks to test whether to enable HAVE_SSSE3 and HAVE_SSE2 (line 2850 of configure). Can we split these macros in something for yasm vs something for inline asm? This means e.g. that we can use ssse3 if yasm (but not inline asm) supports it, if inline asm is lacking, etc. What is your goal? Do you want to write something like #if HAVE_INLINE_SSSE3 instead of #if HAVE_SSSE3 HAVE_INLINE_ASM ? Right now, in practice: HAVE_SSSE3 means we support inline ssse3 HAVE_SSE2 means we support inline sse2 HAVE_AVX means we support yasm avx but depends on HAVE_SSSE3 I wonder whether it makes sense to have a generic HAVE_SSSE3 anyway - when would we use it, what would it mean? I think in practice, we probably want a HAVE_INLINE_SSSE3, as you said, because yes, there's compilers that don't support this, but do support HAVE_INLINE_ASM in general. Likewise, HAVE_AVX could be renamed HAVE_YASM_AVX or so. 
Having HAVE_YASM_SSSE3 seems pointless, I don't think we support any yasm/nasm version that doesn't understand ssse3, so it'd always be 1. However, this would make it clear that HAVE_SSSE3 and HAVE_AVX don't and shouldn't depend on each other. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 3/7] vf_hqdn3d: simplify and optimize
Hi, On Thu, Jul 26, 2012 at 3:51 PM, Loren Merritt lor...@u.washington.edu wrote: 14% faster on penryn, 2% on sandybridge, 9% on bulldozer --- libavfilter/vf_hqdn3d.c | 157 +++--- 1 files changed, 51 insertions(+), 106 deletions(-) Looks good. I am going to ask a very stupid question: why is this faster? I see a lot of simplification, which is good, but I'm not quite sure which part actually has a clear speed impact. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 4/7] vf_hqdn3d: reduce intermediate precision
Hi, On Thu, Jul 26, 2012 at 3:51 PM, Loren Merritt lor...@u.washington.edu wrote: 11% faster on penryn, 7% on sandybridge, 5% on bulldozer Negligible change to output. --- libavfilter/vf_hqdn3d.c | 62 -- 1 files changed, 32 insertions(+), 30 deletions(-) Looks good. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 5/7] vf_hqdn3d: support 10bit colordepth
Hi, On Thu, Jul 26, 2012 at 3:51 PM, Loren Merritt lor...@u.washington.edu wrote: --- libavfilter/vf_hqdn3d.c | 68 +- 1 files changed, 49 insertions(+), 19 deletions(-) Can you add 9bpp support also? Not that it's used much, but it'll use the exact same codepath, I think. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [RFC] split HAVE_MMX/MMX2/SSE/SSE2/SSSE3/AVX for inline asm vs. yasm
Hi, On Thu, Jul 26, 2012 at 3:54 PM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 03:42:24PM -0700, Ronald S. Bultje wrote: On Thu, Jul 26, 2012 at 2:39 PM, Diego Biurrun di...@biurrun.de wrote: On Thu, Jul 26, 2012 at 01:50:17PM -0700, Ronald S. Bultje wrote: discussion thread. We currently use HAVE_SSSE3 and related macros to indicate that we want to compile these and that our compiler tools are good enough to know what to do with it. As a result, we currently use HAVE_AVX around all avx code (yasm only - we don't have any avx inline asm), HAVE_SSSE3 around some yasm and all inline asm code that uses ssse3 instructions, and sometimes HAVE_SSE/2 around inline asm using xmm regs. There is no HAVE_SSE4. HAVE_MMX2 is almost never used but does exist. Do we need HAVE_SSE4? It should be easy enough to add. HAVE_MMX is something entirely different and is used as an alternative form of ARCH_X86. No, HAVE_MMX is just that. True, it's abused in some places where ARCH_X86 would be better (when invoking init functions), but that is an issue that needs to be addressed at some point. In addition to that, we're using inline asm checks to test whether to enable HAVE_SSSE3 and HAVE_SSE2 (line 2850 of configure). Can we split these macros in something for yasm vs something for inline asm? This means e.g. that we can use ssse3 if yasm (but not inline asm) supports it, if inline asm is lacking, etc. What is your goal? Do you want to write something like #if HAVE_INLINE_SSSE3 instead of #if HAVE_SSSE3 HAVE_INLINE_ASM ? Right now, in practice: HAVE_SSSE3 means we support inline ssse3 HAVE_SSE2 means we support inline sse2 HAVE_AVX means we support yasm avx but depends on HAVE_SSSE3 I wonder whether it makes sense to have a generic HAVE_SSSE3 anyway - when would we use it, what would it mean? I think in practice, we probably want a HAVE_INLINE_SSSE3, as you said, because yes, there's compilers that don't support this, but do support HAVE_INLINE_ASM in general. 
Likewise, HAVE_AVX could be renamed HAVE_YASM_AVX or so. Having HAVE_YASM_SSSE3 seems pointless, I don't think we support any yasm/nasm version that doesn't understand ssse3, so it'd always be 1. However, this would make it clear that HAVE_SSSE3 and HAVE_AVX don't and shouldn't depend on each other. Try dropping the line avx_deps=ssse3 from configure and see if that works out the way you want it to. I'm still wondering if it makes sense to change the names to reflect what they do, to prevent more misunderstandings. Plus, someone (i.e. me) needs to go over all our x86 simd function pointer inits and make sure we use HAVE_INLINE_SSSE3 only for inline, not yasm. Also, HAVE_SSE2, HAVE_SSE, HAVE_MMX2, HAVE_MMX need such rules (are they inline? yasm? both?) and the same check in init functions. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/6] build: Only compile and run dct-test if AAN DCT tables are enabled
Hi, On Thu, Jul 26, 2012 at 5:15 PM, Diego Biurrun di...@biurrun.de wrote: --- libavcodec/Makefile |2 +- tests/fate/dct.mak |2 +- 2 files changed, 2 insertions(+), 2 deletions(-) This test tests a lot more than just aan dct? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] x86inc: sync to latest version from x264.
From: Ronald S. Bultje rsbul...@gmail.com --- libavutil/x86/x86inc.asm | 216 ++ 1 file changed, 124 insertions(+), 92 deletions(-) diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index b76a10c..23d9d57 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -36,8 +36,8 @@ %define program_name ff -%define UNIX64 0 %define WIN64 0 +%define UNIX64 0 %if ARCH_X86_64 %ifidn __OUTPUT_FORMAT__,win32 %define WIN64 1 @@ -54,11 +54,6 @@ %define mangle(x) x %endif -; FIXME: All of the 64bit asm functions that take a stride as an argument -; via register, assume that the high dword of that register is filled with 0. -; This is true in practice (since we never do any 64bit arithmetic on strides, -; and x264's strides are all positive), but is not guaranteed by the ABI. - ; Name of the .rodata section. ; Kludge: Something on OS X fails to align .rodata even given an align attribute, ; so use a different read-only section. @@ -129,34 +124,38 @@ CPU amdnop ; registers: ; rN and rNq are the native-size register holding function argument N ; rNd, rNw, rNb are dword, word, and byte size +; rNh is the high 8 bits of the word size ; rNm is the original location of arg N (a register or on the stack), dword ; rNmp is native size -%macro DECLARE_REG 5-6 +%macro DECLARE_REG 2-3 %define r%1q %2 -%define r%1d %3 -%define r%1w %4 -%define r%1b %5 -%if %0 == 5 -%define r%1m %3 +%define r%1d %2d +%define r%1w %2w +%define r%1b %2b +%define r%1h %2h +%if %0 == 2 +%define r%1m %2d %define r%1mp %2 %elif ARCH_X86_64 ; memory -%define r%1m [rsp + stack_offset + %6] +%define r%1m [rsp + stack_offset + %3] %define r%1mp qword r %+ %1m %else -%define r%1m [esp + stack_offset + %6] +%define r%1m [esp + stack_offset + %3] %define r%1mp dword r %+ %1m %endif %define r%1 %2 %endmacro -%macro DECLARE_REG_SIZE 2 +%macro DECLARE_REG_SIZE 3 %define r%1q r%1 %define e%1q r%1 %define r%1d e%1 %define e%1d e%1 %define r%1w %1 %define e%1w %1 +%define r%1h %3 +%define e%1h %3 
%define r%1b %2 %define e%1b %2 %if ARCH_X86_64 == 0 @@ -164,13 +163,13 @@ CPU amdnop %endif %endmacro -DECLARE_REG_SIZE ax, al -DECLARE_REG_SIZE bx, bl -DECLARE_REG_SIZE cx, cl -DECLARE_REG_SIZE dx, dl -DECLARE_REG_SIZE si, sil -DECLARE_REG_SIZE di, dil -DECLARE_REG_SIZE bp, bpl +DECLARE_REG_SIZE ax, al, ah +DECLARE_REG_SIZE bx, bl, bh +DECLARE_REG_SIZE cx, cl, ch +DECLARE_REG_SIZE dx, dl, dh +DECLARE_REG_SIZE si, sil, null +DECLARE_REG_SIZE di, dil, null +DECLARE_REG_SIZE bp, bpl, null ; t# defines for when per-arch register allocation is more complex than just function arguments @@ -188,6 +187,7 @@ DECLARE_REG_SIZE bp, bpl %define t%1q t%1 %+ q %define t%1d t%1 %+ d %define t%1w t%1 %+ w +%define t%1h t%1 %+ h %define t%1b t%1 %+ b %rotate 1 %endrep @@ -277,6 +277,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 CAT_UNDEF arg_name %+ %%i, q CAT_UNDEF arg_name %+ %%i, d CAT_UNDEF arg_name %+ %%i, w +CAT_UNDEF arg_name %+ %%i, h CAT_UNDEF arg_name %+ %%i, b CAT_UNDEF arg_name %+ %%i, m CAT_UNDEF arg_name %+ %%i, mp @@ -292,6 +293,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %xdefine %1q r %+ %%i %+ q %xdefine %1d r %+ %%i %+ d %xdefine %1w r %+ %%i %+ w +%xdefine %1h r %+ %%i %+ h %xdefine %1b r %+ %%i %+ b %xdefine %1m r %+ %%i %+ m %xdefine %1mp r %+ %%i %+ mp @@ -305,21 +307,21 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %if WIN64 ; Windows x64 ;= -DECLARE_REG 0, rcx, ecx, cx, cl -DECLARE_REG 1, rdx, edx, dx, dl -DECLARE_REG 2, R8, R8D, R8W, R8B -DECLARE_REG 3, R9, R9D, R9W, R9B -DECLARE_REG 4, R10, R10D, R10W, R10B, 40 -DECLARE_REG 5, R11, R11D, R11W, R11B, 48 -DECLARE_REG 6, rax, eax, ax, al, 56 -DECLARE_REG 7, rdi, edi, di, dil, 64 -DECLARE_REG 8, rsi, esi, si, sil, 72 -DECLARE_REG 9, rbx, ebx, bx, bl, 80 -DECLARE_REG 10, rbp, ebp, bp, bpl, 88 -DECLARE_REG 11, R12, R12D, R12W, R12B, 96 -DECLARE_REG 12, R13, R13D, R13W, R13B, 104 -DECLARE_REG 13, R14, R14D, R14W, R14B, 112 -DECLARE_REG 14, R15, R15D, R15W, R15B, 120 
+DECLARE_REG 0, rcx +DECLARE_REG 1, rdx +DECLARE_REG 2, R8 +DECLARE_REG 3, R9 +DECLARE_REG 4, R10, 40 +DECLARE_REG 5, R11, 48 +DECLARE_REG 6, rax, 56 +DECLARE_REG 7, rdi, 64 +DECLARE_REG 8, rsi, 72 +DECLARE_REG 9, rbx, 80 +DECLARE_REG 10, rbp, 88 +DECLARE_REG 11, R12, 96 +DECLARE_REG 12, R13, 104 +DECLARE_REG 13, R14, 112 +DECLARE_REG 14, R15, 120 %macro PROLOGUE 2-4+ 0 ; #args, #regs
Re: [libav-devel] [PATCH 5/7] vf_hqdn3d: support 9 and 10bit colordepth
Hi, On Thu, Jul 26, 2012 at 6:42 PM, Loren Merritt lor...@u.washington.edu wrote: --- libavfilter/vf_hqdn3d.c | 72 ++ 1 files changed, 53 insertions(+), 19 deletions(-) OK. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] proresdsp: port x86 assembly to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/proresdsp.asm | 39 ++- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm index 9b2e11e..70fd686 100644 --- a/libavcodec/x86/proresdsp.asm +++ b/libavcodec/x86/proresdsp.asm @@ -82,8 +82,7 @@ section .text align=16 ; %1 = row or col (for rounding variable) ; %2 = number of bits to shift at the end -; %3 = optimization -%macro IDCT_1D 3 +%macro IDCT_1D 2 ; a0 = (W4 * row[0]) + (1 (15 - 1)); ; a1 = a0; ; a2 = a0; @@ -330,8 +329,8 @@ section .text align=16 ; void prores_idct_put_10_opt(uint8_t *pixels, int stride, ; DCTELEM *block, const int16_t *qmat); -%macro idct_put_fn 2 -cglobal prores_idct_put_10_%1, 4, 4, %2 +%macro idct_put_fn 1 +cglobal prores_idct_put_10, 4, 4, %1 movsxd r1, r1d pxorm15, m15 ; zero @@ -347,7 +346,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2 pmullw m13,[r3+64] pmullw m12,[r3+96] -IDCT_1D row, 17, %1 +IDCT_1D row, 17 ; transpose for second part of IDCT TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 @@ -362,7 +361,7 @@ cglobal prores_idct_put_10_%1, 4, 4, %2 ; for (i = 0; i 8; i++) ; idctSparseColAdd(dest + i, line_size, block + i); -IDCT_1D col, 20, %1 +IDCT_1D col, 20 ; clip/store movam6, [pw_512] @@ -406,27 +405,25 @@ cglobal prores_idct_put_10_%1, 4, 4, %2 RET %endmacro -%macro signextend_sse2 3 ; dstlow, dsthigh, tmp +%macro SIGNEXTEND 2-3 ; dstlow, dsthigh, tmp +%if cpuflag(sse4) +movhlps %2, %1 +pmovsxwd%1, %1 +pmovsxwd%2, %2 +%else ; sse2 pxor%3, %3 pcmpgtw %3, %1 mova%2, %1 punpcklwd %1, %3 punpckhwd %2, %3 +%endif %endmacro -%macro signextend_sse4 2-3 ; dstlow, dsthigh -movhlps %2, %1 -pmovsxwd%1, %1 -pmovsxwd%2, %2 -%endmacro - -INIT_XMM -%define SIGNEXTEND signextend_sse2 -idct_put_fn sse2, 16 -INIT_XMM -%define SIGNEXTEND signextend_sse4 -idct_put_fn sse4, 16 -INIT_AVX -idct_put_fn avx, 16 +INIT_XMM sse2 +idct_put_fn 16 +INIT_XMM sse4 +idct_put_fn 16 +INIT_XMM avx +idct_put_fn 16 %endif -- 1.7.9.5 ___ 
libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] h264_chromamc_10bit: port x86 simd to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/dsputil_mmx.c | 16 ++--- libavcodec/x86/h264_chromamc_10bit.asm | 40 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index e91ede5..afbb531 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -2117,10 +2117,10 @@ void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \ (uint8_t *dst, uint8_t *src, \ int stride, int h, int x, int y); -CHROMA_MC(put, 2, 10, mmxext) -CHROMA_MC(avg, 2, 10, mmxext) -CHROMA_MC(put, 4, 10, mmxext) -CHROMA_MC(avg, 4, 10, mmxext) +CHROMA_MC(put, 2, 10, mmx2) +CHROMA_MC(avg, 2, 10, mmx2) +CHROMA_MC(put, 4, 10, mmx2) +CHROMA_MC(avg, 4, 10, mmx2) CHROMA_MC(put, 8, 10, sse2) CHROMA_MC(avg, 8, 10, sse2) CHROMA_MC(put, 8, 10, avx) @@ -2740,10 +2740,10 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, c-put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2; } if (bit_depth == 10 CONFIG_H264CHROMA) { -c-put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext; -c-avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext; -c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext; -c-avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext; +c-put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2; +c-avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2; +c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2; +c-avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2; } c-add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm index 3f7c513..370c7b5 100644 --- a/libavcodec/x86/h264_chromamc_10bit.asm +++ b/libavcodec/x86/h264_chromamc_10bit.asm @@ -60,10 +60,10 @@ SECTION .text ;- ; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my) ;- -%macro 
CHROMA_MC8 2 +%macro CHROMA_MC8 1 ; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, ; int stride, int h, int mx, int my) -cglobal %1_h264_chroma_mc8_10_%2, 6,7,8 +cglobal %1_h264_chroma_mc8_10, 6,7,8 movsxdifnidn r2, r2d mov r6d, r5d or r6d, r4d @@ -173,8 +173,8 @@ cglobal %1_h264_chroma_mc8_10_%2, 6,7,8 add r0, r2 %endmacro -%macro CHROMA_MC4 2 -cglobal %1_h264_chroma_mc4_10_%2, 6,6,7 +%macro CHROMA_MC4 1 +cglobal %1_h264_chroma_mc4_10, 6,6,7 movsxdifnidn r2, r2d movd m2, r4m ; x movd m3, r5m ; y @@ -203,8 +203,8 @@ cglobal %1_h264_chroma_mc4_10_%2, 6,6,7 ;- ; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my) ;- -%macro CHROMA_MC2 2 -cglobal %1_h264_chroma_mc2_10_%2, 6,7 +%macro CHROMA_MC2 1 +cglobal %1_h264_chroma_mc2_10, 6,7 movsxdifnidn r2, r2d mov r6d, r4d shl r4d, 16 @@ -250,24 +250,24 @@ cglobal %1_h264_chroma_mc2_10_%2, 6,7 %endmacro %define CHROMAMC_AVG NOTHING -INIT_XMM -CHROMA_MC8 put, sse2 +INIT_XMM sse2 +CHROMA_MC8 put %if HAVE_AVX -INIT_AVX -CHROMA_MC8 put, avx +INIT_XMM avx +CHROMA_MC8 put %endif -INIT_MMX -CHROMA_MC4 put, mmxext -CHROMA_MC2 put, mmxext +INIT_MMX mmx2 +CHROMA_MC4 put +CHROMA_MC2 put %define CHROMAMC_AVG AVG %define PAVG pavgw -INIT_XMM -CHROMA_MC8 avg, sse2 +INIT_XMM sse2 +CHROMA_MC8 avg %if HAVE_AVX -INIT_AVX -CHROMA_MC8 avg, avx +INIT_XMM avx +CHROMA_MC8 avg %endif -INIT_MMX -CHROMA_MC4 avg, mmxext -CHROMA_MC2 avg, mmxext +INIT_MMX mmx2 +CHROMA_MC4 avg +CHROMA_MC2 avg -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] h264_idct_10bit: port x86 assembly to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/h264_idct_10bit.asm | 210 ++-- 1 file changed, 105 insertions(+), 105 deletions(-) diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm index 934a7ff..fd61c98 100644 --- a/libavcodec/x86/h264_idct_10bit.asm +++ b/libavcodec/x86/h264_idct_10bit.asm @@ -72,25 +72,25 @@ SECTION .text STORE_DIFFx2 m2, m3, m4, m5, %1, %3 %endmacro -%macro IDCT_ADD_10 1 -cglobal h264_idct_add_10_%1, 3,3 +%macro IDCT_ADD_10 0 +cglobal h264_idct_add_10, 3,3 IDCT4_ADD_10 r0, r1, r2 RET %endmacro -INIT_XMM -IDCT_ADD_10 sse2 +INIT_XMM sse2 +IDCT_ADD_10 %if HAVE_AVX -INIT_AVX -IDCT_ADD_10 avx +INIT_XMM avx +IDCT_ADD_10 %endif ;- ; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) ;- ;;; NO FATE SAMPLES TRIGGER THIS -%macro ADD4x4IDCT 1 -add4x4_idct_%1: +%macro ADD4x4IDCT 0 +add4x4_idct_ %+ SUFFIX: add r5, r0 mova m0, [r2+ 0] mova m1, [r2+16] @@ -107,28 +107,28 @@ add4x4_idct_%1: ret %endmacro -INIT_XMM +INIT_XMM sse2 ALIGN 16 -ADD4x4IDCT sse2 +ADD4x4IDCT %if HAVE_AVX -INIT_AVX +INIT_XMM avx ALIGN 16 -ADD4x4IDCT avx +ADD4x4IDCT %endif %macro ADD16_OP 3 cmp byte [r4+%3], 0 jz .skipblock%2 mov r5d, [r1+%2*4] -call add4x4_idct_%1 +call add4x4_idct_ %+ SUFFIX .skipblock%2: %if %215 add r2, 64 %endif %endmacro -%macro IDCT_ADD16_10 1 -cglobal h264_idct_add16_10_%1, 5,6 +%macro IDCT_ADD16_10 0 +cglobal h264_idct_add16_10, 5,6 ADD16_OP %1, 0, 4+1*8 ADD16_OP %1, 1, 5+1*8 ADD16_OP %1, 2, 4+2*8 @@ -148,11 +148,11 @@ cglobal h264_idct_add16_10_%1, 5,6 REP_RET %endmacro -INIT_XMM -IDCT_ADD16_10 sse2 +INIT_XMM sse2 +IDCT_ADD16_10 %if HAVE_AVX -INIT_AVX -IDCT_ADD16_10 avx +INIT_XMM avx +IDCT_ADD16_10 %endif ;- @@ -185,8 +185,8 @@ IDCT_ADD16_10 avx mova [%1+%3 ], m4 %endmacro -INIT_MMX -cglobal h264_idct_dc_add_10_mmx2,3,3 +INIT_MMX mmx2 +cglobal h264_idct_dc_add_10,3,3 movd m0, [r1] paddd m0, [pd_32] psrad m0, 6 @@ -199,8 +199,8 @@ cglobal 
h264_idct_dc_add_10_mmx2,3,3 ;- ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) ;- -%macro IDCT8_DC_ADD 1 -cglobal h264_idct8_dc_add_10_%1,3,3,7 +%macro IDCT8_DC_ADD 0 +cglobal h264_idct8_dc_add_10,3,3,7 mov r1d, [r1] add r1, 32 sar r1, 6 @@ -214,45 +214,45 @@ cglobal h264_idct8_dc_add_10_%1,3,3,7 RET %endmacro -INIT_XMM -IDCT8_DC_ADD sse2 +INIT_XMM sse2 +IDCT8_DC_ADD %if HAVE_AVX -INIT_AVX -IDCT8_DC_ADD avx +INIT_XMM avx +IDCT8_DC_ADD %endif ;- ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) ;- -%macro AC 2 -.ac%2 -mov r5d, [r1+(%2+0)*4] -call add4x4_idct_%1 -mov r5d, [r1+(%2+1)*4] +%macro AC 1 +.ac%1 +mov r5d, [r1+(%1+0)*4] +call add4x4_idct_ %+ SUFFIX +mov r5d, [r1+(%1+1)*4] add r2, 64 -call add4x4_idct_%1 +call add4x4_idct_ %+ SUFFIX add r2, 64 -jmp .skipadd%2 +jmp .skipadd%1 %endmacro %assign last_block 16 -%macro ADD16_OP_INTRA 3 -cmp word [r4+%3], 0 -jnz .ac%2 +%macro ADD16_OP_INTRA 2 +cmp word [r4+%2], 0 +jnz .ac%1 mov r5d, [r2+ 0] or r5d, [r2+64] -jz .skipblock%2 -mov r5d, [r1+(%2+0)*4] -call idct_dc_add_%1 -.skipblock%2: -%if %2last_block-2 +jz .skipblock%1 +mov r5d, [r1+(%1+0)*4] +call idct_dc_add_ %+ SUFFIX +.skipblock%1: +%if %1last_block-2 add r2, 128 %endif -.skipadd%2: +.skipadd%1: %endmacro -%macro IDCT_ADD16INTRA_10 1 -idct_dc_add_%1: +%macro IDCT_ADD16INTRA_10 0 +idct_dc_add_ %+ SUFFIX: add r5, r0 movq m0, [r2+ 0] movhpsm0, [r2+64] @@ -265,46 +265,46 @@ idct_dc_add_%1: IDCT_DC_ADD_OP_10 r5, r3, r6 ret -cglobal h264_idct_add16intra_10_%1,5,7,8 -ADD16_OP_INTRA %1, 0, 4+1*8 -ADD16_OP_INTRA %1, 2, 4+2*8 -ADD16_OP_INTRA %1, 4, 6+1*8 -ADD16_OP_INTRA %1, 6, 6+2*8 -ADD16_OP_INTRA %1, 8, 4+3*8 -ADD16_OP_INTRA %1, 10, 4+4*8 -ADD16_OP_INTRA %1, 12, 6+3*8 -ADD16_OP_INTRA %1, 14, 6+4*8 +cglobal h264_idct_add16intra_10,5,7,8 +ADD16_OP_INTRA 0, 4+1*8 +ADD16_OP_INTRA 2, 4+2*8 +ADD16_OP_INTRA 4, 6+1*8 +ADD16_OP_INTRA 6, 6+2*8 +ADD16_OP_INTRA
[libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/h264_deblock.asm | 120 ++- libavcodec/x86/h264dsp_mmx.c| 42 +++--- 2 files changed, 88 insertions(+), 74 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 1982dc4..94ff27b 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -282,8 +282,8 @@ cextern pb_A1 ;- ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -%macro DEBLOCK_LUMA 1 -cglobal deblock_v_luma_8_%1, 5,5,10 +%macro DEBLOCK_V_LUMA 0 +cglobal deblock_v_luma_8, 5,5,10 movdm8, [r4] ; tc0 lea r4, [r1*3] dec r2d; alpha-1 @@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10 mova[r4+2*r1], m1 mova[r0], m2 RET +%endmacro ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 5,9 +%macro DEBLOCK_H_LUMA 0 +cglobal deblock_h_luma_8, 5,9 movsxd r7, r1d lear8, [r7+r7*2] lear6, [r0-4] @@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9 %if WIN64 mov[rsp+0x20], r4 %endif -call deblock_v_luma_8_%1 +call deblock_v_luma_8 %+ SUFFIX ; transpose 16x4 - original space (only the middle 4 rows were changed by the filter) addr6, 2 @@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9 RET %endmacro -INIT_XMM -DEBLOCK_LUMA sse2 -INIT_AVX -DEBLOCK_LUMA avx +INIT_XMM sse2 +DEBLOCK_V_LUMA +INIT_MMX sse2 +DEBLOCK_H_LUMA + +INIT_XMM avx +DEBLOCK_V_LUMA +INIT_MMX avx +DEBLOCK_H_LUMA %else -%macro DEBLOCK_LUMA 3 +%macro DEBLOCK_LUMA 2 ;- ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -cglobal deblock_%2_luma_8_%1, 5,5 +cglobal deblock_%1_luma_8, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 dec r3 ; beta-1 add r4, r0 ; pix-3*stride -%assign pad 2*%3+12-(stack_offset15) +%assign pad 2*%2+12-(stack_offset15) SUB esp, pad movam0, [r4+r1] ; p1 @@ -415,7 +421,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 movdm4, [r3] ; tc0 punpcklbw m4, m4 punpcklbw m4, m4 ; 
tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] -mova [esp+%3], m4 ; tc +mova [esp+%2], m4 ; tc pcmpgtb m4, m3 movam3, [r4] ; p2 pandm4, m7 @@ -423,7 +429,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| beta-1 pandm6, m4 -pandm4, [esp+%3] ; tc +pandm4, [esp+%2] ; tc psubb m7, m4, m6 pandm6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -431,7 +437,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 movam4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| beta-1 pandm6, [esp] ; mask -movam5, [esp+%3] ; tc +movam5, [esp+%2] ; tc psubb m7, m6 pandm5, m6 movam3, [r0+r1] @@ -446,8 +452,8 @@ cglobal deblock_%2_luma_8_%1, 5,5 ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 0,5 +INIT_MMX SUFFIX +cglobal deblock_h_luma_8, 0,5 movr0, r0mp movr3, r1m lear4, [r3*3] @@ -470,11 +476,11 @@ cglobal deblock_h_luma_8_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 -call deblock_%2_luma_8_%1 -%ifidn %2, v8 +call deblock_%1_luma_8_ %+ SUFFIX +%ifidn %1, v8 adddword [esp ], 8 ; pix_tmp+0x38 adddword [esp+16], 2 ; tc0+2 -call deblock_%2_luma_8_%1 +call deblock_%1_luma_8_ %+ SUFFIX %endif ADDesp, 20 @@ -501,12 +507,12 @@ cglobal deblock_h_luma_8_%1, 0,5 RET %endmacro ; DEBLOCK_LUMA -INIT_MMX -DEBLOCK_LUMA mmxext, v8, 8 -INIT_XMM -DEBLOCK_LUMA sse2, v, 16 -INIT_AVX -DEBLOCK_LUMA avx, v, 16 +INIT_MMX mmx2 +DEBLOCK_LUMA v8, 8 +INIT_XMM sse2 +DEBLOCK_LUMA v, 16 +INIT_XMM avx +DEBLOCK_LUMA v, 16 %endif ; ARCH @@ -608,7 +614,7 @@ DEBLOCK_LUMA avx, v, 16 %define mask1p mask1q %endmacro -%macro DEBLOCK_LUMA_INTRA 2 +%macro DEBLOCK_V_LUMA_INTRA 1 %define p1 m0 %define p0 m1 %define q0 m2 @@ -643,7 +649,7 @@ DEBLOCK_LUMA avx, v, 16
[libav-devel] [PATCH] h264_loopfilter: port x86 simd to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/h264_deblock.asm | 124 +++-- libavcodec/x86/h264_deblock_10bit.asm | 77 ++-- libavcodec/x86/h264dsp_mmx.c | 60 3 files changed, 139 insertions(+), 122 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 1982dc4..b5e81e7 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -282,8 +282,8 @@ cextern pb_A1 ;- ; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -%macro DEBLOCK_LUMA 1 -cglobal deblock_v_luma_8_%1, 5,5,10 +%macro DEBLOCK_V_LUMA 0 +cglobal deblock_v_luma_8, 5,5,10 movdm8, [r4] ; tc0 lea r4, [r1*3] dec r2d; alpha-1 @@ -323,12 +323,13 @@ cglobal deblock_v_luma_8_%1, 5,5,10 mova[r4+2*r1], m1 mova[r0], m2 RET +%endmacro ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 5,9 +%macro DEBLOCK_H_LUMA 0 +cglobal deblock_h_luma_8, 5,9 movsxd r7, r1d lear8, [r7+r7*2] lear6, [r0-4] @@ -355,7 +356,7 @@ cglobal deblock_h_luma_8_%1, 5,9 %if WIN64 mov[rsp+0x20], r4 %endif -call deblock_v_luma_8_%1 +call deblock_v_luma_8 %+ SUFFIX ; transpose 16x4 - original space (only the middle 4 rows were changed by the filter) addr6, 2 @@ -384,24 +385,29 @@ cglobal deblock_h_luma_8_%1, 5,9 RET %endmacro -INIT_XMM -DEBLOCK_LUMA sse2 -INIT_AVX -DEBLOCK_LUMA avx +INIT_XMM sse2 +DEBLOCK_V_LUMA +INIT_MMX sse2 +DEBLOCK_H_LUMA + +INIT_XMM avx +DEBLOCK_V_LUMA +INIT_MMX avx +DEBLOCK_H_LUMA %else -%macro DEBLOCK_LUMA 3 +%macro DEBLOCK_V_LUMA 2 ;- ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -cglobal deblock_%2_luma_8_%1, 5,5 +cglobal deblock_%1_luma_8, 5,5 lea r4, [r1*3] dec r2 ; alpha-1 neg r4 dec r3 ; beta-1 add r4, r0 ; pix-3*stride -%assign pad 2*%3+12-(stack_offset15) +%assign pad 2*%2+12-(stack_offset15) SUB esp, pad movam0, [r4+r1] ; p1 @@ -415,7 +421,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 
movdm4, [r3] ; tc0 punpcklbw m4, m4 punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] -mova [esp+%3], m4 ; tc +mova [esp+%2], m4 ; tc pcmpgtb m4, m3 movam3, [r4] ; p2 pandm4, m7 @@ -423,7 +429,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| beta-1 pandm6, m4 -pandm4, [esp+%3] ; tc +pandm4, [esp+%2] ; tc psubb m7, m4, m6 pandm6, m4 LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 @@ -431,7 +437,7 @@ cglobal deblock_%2_luma_8_%1, 5,5 movam4, [r0+2*r1] ; q2 DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| beta-1 pandm6, [esp] ; mask -movam5, [esp+%3] ; tc +movam5, [esp+%2] ; tc psubb m7, m6 pandm5, m6 movam3, [r0+r1] @@ -442,12 +448,13 @@ cglobal deblock_%2_luma_8_%1, 5,5 mova[r0], m2 ADD esp, pad RET +%endmacro ;- ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) ;- -INIT_MMX -cglobal deblock_h_luma_8_%1, 0,5 +%macro DEBLOCK_H_LUMA 1 +cglobal deblock_h_luma_8, 0,5 movr0, r0mp movr3, r1m lear4, [r3*3] @@ -470,11 +477,11 @@ cglobal deblock_h_luma_8_%1, 0,5 PUSH dword r2m PUSH dword 16 PUSH dword r0 -call deblock_%2_luma_8_%1 -%ifidn %2, v8 +call deblock_%1_luma_8 %+ SUFFIX +%ifidn %1, v8 adddword [esp ], 8 ; pix_tmp+0x38 adddword [esp+16], 2 ; tc0+2 -call deblock_%2_luma_8_%1 +call deblock_%1_luma_8 %+ SUFFIX %endif ADDesp, 20 @@ -501,12 +508,15 @@ cglobal deblock_h_luma_8_%1, 0,5 RET %endmacro ; DEBLOCK_LUMA -INIT_MMX -DEBLOCK_LUMA mmxext, v8, 8 -INIT_XMM -DEBLOCK_LUMA sse2, v, 16 -INIT_AVX -DEBLOCK_LUMA avx, v, 16 +INIT_MMX mmx2 +DEBLOCK_V_LUMA v8, 8 +DEBLOCK_H_LUMA v8 +INIT_XMM sse2 +DEBLOCK_V_LUMA v, 16 +DEBLOCK_H_LUMA v +INIT_XMM avx +DEBLOCK_V_LUMA v, 16 +DEBLOCK_H_LUMA h %endif ; ARCH @@ -608,7 +618,7 @@ DEBLOCK_LUMA avx, v, 16 %define mask1p mask1q
[libav-devel] [PATCH] vp56: port x86 simd to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/vp56dsp.asm | 34 +++--- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm index 66a97f1..27a82bc 100644 --- a/libavcodec/x86/vp56dsp.asm +++ b/libavcodec/x86/vp56dsp.asm @@ -27,7 +27,8 @@ cextern pw_64 SECTION .text -%macro DIAG4_MMX 6 +%macro DIAG4 6 +%if mmsize == 8 movq m0, [%1+%2] movq m1, [%1+%3] movq m3, m0 @@ -64,9 +65,7 @@ SECTION .text psraw m3, 7 packuswb m0, m3 movq[%6], m0 -%endmacro - -%macro DIAG4_SSE2 6 +%else ; mmsize == 16 movq m0, [%1+%2] movq m1, [%1+%3] punpcklbw m0, m7 @@ -86,9 +85,11 @@ SECTION .text psraw m0, 7 packuswb m0, m0 movq[%6], m0 +%endif ; mmsize == 8/16 %endmacro -%macro SPLAT4REGS_MMX 0 +%macro SPLAT4REGS 0 +%if mmsize == 8 movq m5, m3 punpcklwdm3, m3 movq m4, m3 @@ -102,9 +103,7 @@ SECTION .text movq [rsp+8*12], m4 movq [rsp+8*13], m5 movq [rsp+8*14], m2 -%endmacro - -%macro SPLAT4REGS_SSE2 0 +%else ; mmsize == 16 pshuflw m4, m3, 0x0 pshuflw m5, m3, 0x55 pshuflw m6, m3, 0xAA @@ -113,15 +112,16 @@ SECTION .text punpcklqdq m5, m5 punpcklqdq m6, m6 punpcklqdq m3, m3 +%endif ; mmsize == 8/16 %endmacro -%macro vp6_filter_diag4 2 +%macro vp6_filter_diag4 0 ; void ff_vp6_filter_diag4_opt(uint8_t *dst, uint8_t *src, int stride, ;const int16_t h_weight[4], const int16_t v_weights[4]) -cglobal vp6_filter_diag4_%1, 5, 7, %2 +cglobal vp6_filter_diag4, 5, 7, 8 mov r5, rsp ; backup stack pointer and rsp, ~(mmsize-1) ; align stack -%ifidn %1, sse2 +%if mmsize == 16 sub rsp, 8*11 %else sub rsp, 8*15 @@ -162,12 +162,8 @@ cglobal vp6_filter_diag4_%1, 5, 7, %2 RET %endmacro -INIT_MMX -%define DIAG4 DIAG4_MMX -%define SPLAT4REGS SPLAT4REGS_MMX -vp6_filter_diag4 mmx, 0 +INIT_MMX mmx +vp6_filter_diag4 -INIT_XMM -%define DIAG4 DIAG4_SSE2 -%define SPLAT4REGS SPLAT4REGS_SSE2 -vp6_filter_diag4 sse2, 8 +INIT_XMM sse2 +vp6_filter_diag4 -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org 
https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] vp56: only compile MMX SIMD on x86-32.
From: Ronald S. Bultje rsbul...@gmail.com All x86-64 CPUs have SSE2, so the MMX version will never be used. This leads to smaller binaries. --- libavcodec/x86/vp56dsp.asm|2 ++ libavcodec/x86/vp56dsp_init.c |2 ++ 2 files changed, 4 insertions(+) diff --git a/libavcodec/x86/vp56dsp.asm b/libavcodec/x86/vp56dsp.asm index 27a82bc..ca4d97e 100644 --- a/libavcodec/x86/vp56dsp.asm +++ b/libavcodec/x86/vp56dsp.asm @@ -162,8 +162,10 @@ cglobal vp6_filter_diag4, 5, 7, 8 RET %endmacro +%if ARCH_X86_32 INIT_MMX mmx vp6_filter_diag4 +%endif INIT_XMM sse2 vp6_filter_diag4 diff --git a/libavcodec/x86/vp56dsp_init.c b/libavcodec/x86/vp56dsp_init.c index 2989281..ae04440 100644 --- a/libavcodec/x86/vp56dsp_init.c +++ b/libavcodec/x86/vp56dsp_init.c @@ -36,9 +36,11 @@ av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum CodecID codec) int mm_flags = av_get_cpu_flags(); if (CONFIG_VP6_DECODER codec == CODEC_ID_VP6) { +#if ARCH_X86_32 if (mm_flags AV_CPU_FLAG_MMX) { c-vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; } +#endif if (mm_flags AV_CPU_FLAG_SSE2) { c-vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] vp3: port x86 SIMD to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/vp3dsp.asm | 36 ++-- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index af2f60c..98b1cb5 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -102,8 +102,8 @@ SECTION .text mov [r0+r3 -1], r2w %endmacro -INIT_MMX -cglobal vp3_v_loop_filter_mmx2, 3, 4 +INIT_MMX mmx2 +cglobal vp3_v_loop_filter, 3, 4 %if ARCH_X86_64 movsxdr1, r1d %endif @@ -120,7 +120,7 @@ cglobal vp3_v_loop_filter_mmx2, 3, 4 movq [r0 ], m3 RET -cglobal vp3_h_loop_filter_mmx2, 3, 4 +cglobal vp3_h_loop_filter, 3, 4 %if ARCH_X86_64 movsxdr1, r1d %endif @@ -521,9 +521,17 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 %endmacro -%macro vp3_idct_funcs 1 -cglobal vp3_idct_put_%1, 3, 4, 9 -VP3_IDCT_%1 r2 +%macro VP3_IDCT 1 +%if mmsize == 8 +VP3_IDCT_mmx %1 +%else +VP3_IDCT_sse2 %1 +%endif +%endmacro + +%macro vp3_idct_funcs 0 +cglobal vp3_idct_put, 3, 4, 9 +VP3_IDCT r2 movsxdifnidn r1, r1d mova m4, [pb_80] @@ -565,8 +573,8 @@ cglobal vp3_idct_put_%1, 3, 4, 9 %endrep RET -cglobal vp3_idct_add_%1, 3, 4, 9 -VP3_IDCT_%1 r2 +cglobal vp3_idct_add, 3, 4, 9 +VP3_IDCT r2 mov r3, 4 pxor m4, m4 @@ -607,10 +615,10 @@ cglobal vp3_idct_add_%1, 3, 4, 9 RET %endmacro -INIT_MMX -vp3_idct_funcs mmx -INIT_XMM -vp3_idct_funcs sse2 +INIT_MMX mmx +vp3_idct_funcs +INIT_XMM sse2 +vp3_idct_funcs %macro DC_ADD 0 movq m2, [r0 ] @@ -631,8 +639,8 @@ vp3_idct_funcs sse2 movq [r0+r3 ], m5 %endmacro -INIT_MMX -cglobal vp3_idct_dc_add_mmx2, 3, 4 +INIT_MMX mmx2 +cglobal vp3_idct_dc_add, 3, 4 %if ARCH_X86_64 movsxdr1, r1d %endif -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] vp3: don't compile mmx IDCT functions on x86-64.
From: Ronald S. Bultje rsbul...@gmail.com 64-bit CPUs always have SSE2, and a SSE2 version exists, thus the MMX version will never be used. --- libavcodec/x86/vp3dsp.asm|3 +++ libavcodec/x86/vp3dsp_init.c |2 ++ 2 files changed, 5 insertions(+) diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index 98b1cb5..0b3eaa0 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -615,8 +615,11 @@ cglobal vp3_idct_add, 3, 4, 9 RET %endmacro +%if ARCH_X86_32 INIT_MMX mmx vp3_idct_funcs +%endif + INIT_XMM sse2 vp3_idct_funcs diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c index cd8e206..704d4a6 100644 --- a/libavcodec/x86/vp3dsp_init.c +++ b/libavcodec/x86/vp3dsp_init.c @@ -41,11 +41,13 @@ av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) #if HAVE_YASM int cpuflags = av_get_cpu_flags(); +#if ARCH_X86_32 if (HAVE_MMX cpuflags AV_CPU_FLAG_MMX) { c-idct_put = ff_vp3_idct_put_mmx; c-idct_add = ff_vp3_idct_add_mmx; c-idct_perm = FF_PARTTRANS_IDCT_PERM; } +#endif if (HAVE_MMX2 cpuflags AV_CPU_FLAG_MMX2) { c-idct_dc_add = ff_vp3_idct_dc_add_mmx2; -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] rv34: port x86 SIMD to cpuflags.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/rv34dsp.asm | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm index 32bcdce..c43b77a 100644 --- a/libavcodec/x86/rv34dsp.asm +++ b/libavcodec/x86/rv34dsp.asm @@ -46,7 +46,7 @@ SECTION .text %endmacro %macro rv34_idct 1 -cglobal rv34_idct_%1_mmx2, 1, 2, 0 +cglobal rv34_idct_%1, 1, 2, 0 movsx r1, word [r0] IDCT_DC r1 movdm0, r1 @@ -58,14 +58,15 @@ cglobal rv34_idct_%1_mmx2, 1, 2, 0 REP_RET %endmacro -INIT_MMX +INIT_MMX mmx2 %define IDCT_DC IDCT_DC_ROUND rv34_idct dc %define IDCT_DC IDCT_DC_NOROUND rv34_idct dc_noround ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc); -cglobal rv34_idct_dc_add_mmx, 3, 3 +INIT_MMX mmx +cglobal rv34_idct_dc_add, 3, 3 ; calculate DC IDCT_DC_ROUND r2 pxor m1, m1 @@ -167,8 +168,8 @@ cglobal rv34_idct_add, 3,3,0, d, s, b ret ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc); -INIT_XMM -cglobal rv34_idct_dc_add_sse4, 3, 3, 6 +INIT_XMM sse4 +cglobal rv34_idct_dc_add, 3, 3, 6 ; load data IDCT_DC_ROUND r2 pxor m1, m1 -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] dsputil_mmx: fix incorrect assembly code
Hi, On Mon, Jul 23, 2012 at 5:30 PM, Derek Buitenhuis derek.buitenh...@gmail.com wrote: From: Yang Wang yang.y.w...@intel.com In ff_put_pixels_clamped_mmx(), there are two assembly code blocks. In the first block (in the unrolled loop), the instructions movq 8%3, %%mm1 \n\t, and so forth, have problems. From above instruction, it is clear what the programmer wants: a load from p + 8. But this assembly code doesn't guarantee that. It only works if the compiler puts p in a register to produce an instruction like this: movq 8(%edi), %mm1. During compiler optimization, it is possible that the compiler will be able to constant propagate into p. Suppose p = x[1]. Then operand 3 can become 1(%edi), where %edi holds x. And the instruction becomes movq 81(%edx). That is, it will stride by 81 instead of 8. This will cause a segmentation fault. This error was fixed in the second block of the assembly code, but not in the unrolled loop. How to reproduce: This error is exposed when we build the ffmpeg using Intel C++ Compiler, IPO+PGO optimization. Crashed when decoding an MJPEG video. 
Signed-off-by: Michael Niedermayer michae...@gmx.at Signed-off-by: Derek Buitenhuis derek.buitenh...@gmail.com --- libavcodec/x86/dsputil_mmx.c | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 5eb4a24..522a565 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -245,14 +245,14 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, pix = pixels; /* unrolled loop */ __asm__ volatile ( -movq%3, %%mm0 \n\t -movq 8%3, %%mm1 \n\t -movq 16%3, %%mm2 \n\t -movq 24%3, %%mm3 \n\t -movq 32%3, %%mm4 \n\t -movq 40%3, %%mm5 \n\t -movq 48%3, %%mm6 \n\t -movq 56%3, %%mm7 \n\t +movq (%3), %%mm0 \n\t +movq 8(%3), %%mm1 \n\t +movq16(%3), %%mm2 \n\t +movq24(%3), %%mm3 \n\t +movq32(%3), %%mm4 \n\t +movq40(%3), %%mm5 \n\t +movq48(%3), %%mm6 \n\t +movq56(%3), %%mm7 \n\t packuswb %%mm1, %%mm0 \n\t packuswb %%mm3, %%mm2 \n\t packuswb %%mm5, %%mm4 \n\t @@ -262,7 +262,7 @@ void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, movq %%mm4, (%0, %1, 2)\n\t movq %%mm6, (%0, %2) \n\t :: r(pix), r((x86_reg)line_size), r((x86_reg)line_size * 3), - m(*p) + r(p) : memory); pix += line_size * 4; p += 32; OK. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] x86: add support for fmaddps fma4 instruction with abstraction to avx/sse
Hi, On Tue, Jul 24, 2012 at 2:03 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- configure|5 + libavutil/x86/x86inc.asm | 16 +++- 2 files changed, 16 insertions(+), 5 deletions(-) OK. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] swscale: bury one more piece of inline asm under HAVE_INLINE_ASM.
From: Ronald S. Bultje rsbul...@gmail.com --- libswscale/swscale.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 5cfa7f2..0f8ef2b 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -661,7 +661,7 @@ static int swScale(SwsContext *c, const uint8_t *src[], if (isPlanar(dstFormat) isALPHA(dstFormat) !alpPixBuf) fillPlane(dst[3], dstStride[3], dstW, dstY - lastDstY, lastDstY, 255); -#if HAVE_MMX2 +#if HAVE_MMX2 HAVE_INLINE_ASM if (av_get_cpu_flags() AV_CPU_FLAG_MMX2) __asm__ volatile (sfence ::: memory); #endif -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] x86inc: automatically insert vzeroupper for YMM functions.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/dct32_sse.asm|2 -- libavcodec/x86/dsputil_yasm.asm | 14 -- libavcodec/x86/fft_mmx.asm |6 -- libavresample/x86/audio_convert.asm | 10 -- libavresample/x86/audio_mix.asm | 10 -- libavutil/x86/float_dsp.asm | 10 -- libavutil/x86/x86inc.asm|5 - 7 files changed, 4 insertions(+), 53 deletions(-) diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm index e3c8a45..351c88d 100644 --- a/libavcodec/x86/dct32_sse.asm +++ b/libavcodec/x86/dct32_sse.asm @@ -278,8 +278,6 @@ cglobal dct32_float_avx, 2,3,8, out, in, tmp vperm2f128 m0, m1, m1, 0x31 vmovaps [outq+96], m1 -vzeroupper - ;pass 6, no SIMD... INIT_XMM PASS6_AND_PERMUTE diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 077f3a0..70a0aa1 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -1158,12 +1158,7 @@ ALIGN 16 add src1q, 2*mmsize sub lenq, 2*mmsize jge .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -1193,12 +1188,7 @@ ALIGN 16 sub lenq, 2*mmsize jge .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -1243,10 +1233,6 @@ cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len %endif add lenq, mmsize jl .loop -%if mmsize == 32 -vzeroupper -RET -%endif .end: REP_RET %endmacro diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 1a430b9..527e215 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -749,9 +749,6 @@ section .text ; The others pass args in registers and don't spill anything. 
cglobal fft_dispatch%2, 2,5,8, z, nbits FFT_DISPATCH fullsuffix, nbits -%if mmsize == 32 -vzeroupper -%endif RET %endmacro ; DECL_FFT @@ -957,9 +954,6 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i %if ARCH_X86_64 == 0 add esp, 12 %endif -%if mmsize == 32 -vzeroupper -%endif RET %endmacro diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 7b3cc22..244c4d1 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -145,12 +145,7 @@ cglobal conv_s32_to_flt, 3,3,3, dst, src, len mova [dstq+lenq+mmsize], m2 add lenq, mmsize*2 jl .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse2 @@ -218,12 +213,7 @@ cglobal conv_flt_to_s32, 3,3,5, dst, src, len mova [dstq+lenq+3*mmsize], m3 add lenq, mmsize*4 jl .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse2 diff --git a/libavresample/x86/audio_mix.asm b/libavresample/x86/audio_mix.asm index 58a4ded..dbfaa69 100644 --- a/libavresample/x86/audio_mix.asm +++ b/libavresample/x86/audio_mix.asm @@ -51,12 +51,7 @@ cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1 addsrcq, mmsize*2 sublend, mmsize*2/4 jg .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -175,12 +170,7 @@ cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1 add src0q, mmsize sublend, mmsize/4 jg .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 66ef093..c4e0c66 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -40,12 +40,7 @@ ALIGN 16 sub lenq, 2*mmsize jge .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -86,12 +81,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len mova [dstq+lenq+mmsize], m2 sublenq, 2*mmsize jge .loop -%if mmsize == 
32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 42ba97a..4b523e9 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -369,11 +369,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 120 %macro RET 0 WIN64_RESTORE_XMM_INTERNAL rsp POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 +vzeroupper +%endif ret %endmacro %macro REP_RET 0 -%if regs_used 7 || xmm_regs_used 6 +%if regs_used 7 || xmm_regs_used 6 || mmsize == 32 RET %else rep ret -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] x86/dsputilenc: bury inline asm under HAVE_INLINE_ASM.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/dct-test.c |2 +- libavcodec/x86/dsputilenc_mmx.c | 80 +++ libavcodec/x86/fdct_mmx.c |4 ++ libavcodec/x86/motion_est_mmx.c |6 +++ libavcodec/x86/mpegvideo_mmx.c |6 +++ 5 files changed, 64 insertions(+), 34 deletions(-) diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c index 5046544..9e19e0c 100644 --- a/libavcodec/dct-test.c +++ b/libavcodec/dct-test.c @@ -85,7 +85,7 @@ static const struct algo fdct_tab[] = { { IJG-AAN-INT,ff_fdct_ifast, SCALE_PERM }, { IJG-LLM-INT,ff_jpeg_fdct_islow_8, NO_PERM}, -#if HAVE_MMX +#if HAVE_MMX HAVE_INLINE_ASM { MMX,ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX }, { MMX2, ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 }, { SSE2, ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 }, diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c index 47fa5ca..3cac979 100644 --- a/libavcodec/x86/dsputilenc_mmx.c +++ b/libavcodec/x86/dsputilenc_mmx.c @@ -30,6 +30,8 @@ #include dsputil_mmx.h +#if HAVE_INLINE_ASM + static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) { __asm__ volatile( @@ -323,8 +325,6 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int return tmp; } -int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); - static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { int tmp; __asm__ volatile ( @@ -925,17 +925,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c paddusw #t, #a \n\t\ movd #a, #dst\n\t\ -#define hadamard_func(cpu) \ -int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \ - int stride, int h); \ -int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \ - int stride, int h); - -hadamard_func(mmx) -hadamard_func(mmx2) -hadamard_func(sse2) -hadamard_func(ssse3) - #define DCT_SAD4(m,mm,o)\ mov#m #o+ 0(%1), #mm2 \n\t\ mov#m #o+16(%1), #mm3 \n\t\ @@ -1094,10 +1083,26 @@ static int ssd_int8_vs_int16_mmx(const 
int8_t *pix1, const int16_t *pix2, int si #undef PHADDD #endif //HAVE_SSSE3 +#endif /* HAVE_INLINE_ASM */ + +int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); + +#define hadamard_func(cpu) \ +int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \ + int stride, int h); \ +int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \ + int stride, int h); + +hadamard_func(mmx) +hadamard_func(mmx2) +hadamard_func(sse2) +hadamard_func(ssse3) void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); + +#if HAVE_INLINE_ASM int bit_depth = avctx-bits_per_raw_sample; if (mm_flags AV_CPU_FLAG_MMX) { @@ -1121,11 +1126,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) c-diff_bytes= diff_bytes_mmx; c-sum_abs_dctelem= sum_abs_dctelem_mmx; -#if HAVE_YASM -c-hadamard8_diff[0]= ff_hadamard8_diff16_mmx; -c-hadamard8_diff[1]= ff_hadamard8_diff_mmx; -#endif - c-pix_norm1 = pix_norm1_mmx; c-sse[0] = sse16_mmx; c-sse[1] = sse8_mmx; @@ -1146,10 +1146,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) if (mm_flags AV_CPU_FLAG_MMX2) { -#if HAVE_YASM -c-hadamard8_diff[0]= ff_hadamard8_diff16_mmx2; -c-hadamard8_diff[1]= ff_hadamard8_diff_mmx2; -#endif c-sum_abs_dctelem= sum_abs_dctelem_mmx2; c-vsad[4]= vsad_intra16_mmx2; @@ -1164,13 +1160,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) if (bit_depth = 8) c-get_pixels = get_pixels_sse2; c-sum_abs_dctelem= sum_abs_dctelem_sse2; -#if HAVE_YASM -c-sse[0] = ff_sse16_sse2; -#if HAVE_ALIGNED_STACK -c-hadamard8_diff[0]= ff_hadamard8_diff16_sse2; -c-hadamard8_diff[1]= ff_hadamard8_diff_sse2; -#endif -#endif } #if HAVE_SSSE3 @@ -1180,10 +1169,6 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) } c-add_8x8basis= add_8x8basis_ssse3; c-sum_abs_dctelem= sum_abs_dctelem_ssse3; -#if HAVE_YASM HAVE_ALIGNED_STACK -c-hadamard8_diff[0]= ff_hadamard8_diff16_ssse3; 
-c-hadamard8_diff[1]= ff_hadamard8_diff_ssse3; -#endif } #endif @@ -1194,6 +1179,35 @@ void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx) c-add_8x8basis= add_8x8basis_3dnow; } } +#endif /* HAVE_INLINE_ASM */ + +#if HAVE_YASM
[libav-devel] [PATCH] mpegaudio: bury inline asm under HAVE_INLINE_ASM.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/mpegaudiodec_mmx.c |5 + 1 file changed, 5 insertions(+) diff --git a/libavcodec/x86/mpegaudiodec_mmx.c b/libavcodec/x86/mpegaudiodec_mmx.c index f51a06d..88a3477 100644 --- a/libavcodec/x86/mpegaudiodec_mmx.c +++ b/libavcodec/x86/mpegaudiodec_mmx.c @@ -36,6 +36,8 @@ void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; +#if HAVE_INLINE_ASM + #define MACS(rt, ra, rb) rt+=(ra)*(rb) #define MLSS(rt, ra, rb) rt-=(ra)*(rb) @@ -178,6 +180,7 @@ static void apply_window_mp3(float *in, float *win, int *unused, float *out, *out = sum; } +#endif /* HAVE_INLINE_ASM */ #define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ @@ -235,9 +238,11 @@ void ff_mpadsp_init_mmx(MPADSPContext *s) } } +#if HAVE_INLINE_ASM if (mm_flags AV_CPU_FLAG_SSE2) { s-apply_window_float = apply_window_mp3; } +#endif /* HAVE_INLINE_ASM */ #if HAVE_YASM if (mm_flags AV_CPU_FLAG_AVX HAVE_AVX) { s-imdct36_blocks_float = imdct36_blocks_avx; -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] eval: add workaround for broken strtod() in MSVS.
From: Ronald S. Bultje rsbul...@gmail.com --- libavutil/eval.c | 35 +++ 1 file changed, 35 insertions(+) diff --git a/libavutil/eval.c b/libavutil/eval.c index ff3191d..ef37ad8 100644 --- a/libavutil/eval.c +++ b/libavutil/eval.c @@ -26,6 +26,7 @@ * see http://joe.hotchkiss.com/programming/eval/eval.html */ +#include avstring.h #include avutil.h #include eval.h #include log.h @@ -77,6 +78,40 @@ double av_strtod(const char *numstr, char **tail) { double d; char *next; +#ifdef _MSC_VER +/* MSVC does not support hexadecimal input, nor does it understand + * strings such as inf[inity] or nan. Support them manually. */ +if (!av_strncasecmp(numstr, inf, 3)) { +d = INFINITY; +next = numstr + 3; +} else if (!av_strncasecmp(numstr, infinity, 8)) { +d = INFINITY; +next = numstr + 8; +} else if (!av_strncasecmp(numstr, +inf, 4)) { +d = INFINITY; +next = numstr + 4; +} else if (!av_strncasecmp(numstr, +infinity, 4)) { +d = INFINITY; +next = numstr + 9; +} else if (!av_strncasecmp(numstr, -inf, 4)) { +d = -INFINITY; +next = numstr + 4; +} else if (!av_strncasecmp(numstr, -infinity, 9)) { +d = -INFINITY; +next = numstr + 9; +} else if (!av_strncasecmp(numstr, nan, 3)) { +d = NAN; +next = numstr + 3; +} else if (!av_strncasecmp(numstr, +nan, 4) || + !av_strncasecmp(numstr, -nan, 4)) { +d = NAN; +next = numstr + 4; +} else if (!av_strncasecmp(numstr, 0x, 2) || + !av_strncasecmp(numstr, -0x, 3) || + !av_strncasecmp(numstr, +0x, 3)) { +d = strtol(numstr, next, 16); +} else +#endif d = strtod(numstr, next); /* if parsing succeeded, check for and interpret postfixes */ if (next!=numstr) { -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] eval: fix printing of NaN in eval fate test.
From: Ronald S. Bultje rsbul...@gmail.com This fixes make fate-eval on MSVC builds. Without this, the test outputs -1.#NaN instead of nan on MSVS 2010. --- libavutil/eval.c |5 + 1 file changed, 5 insertions(+) diff --git a/libavutil/eval.c b/libavutil/eval.c index ef37ad8..6131263 100644 --- a/libavutil/eval.c +++ b/libavutil/eval.c @@ -671,6 +671,11 @@ int main(int argc, char **argv) av_expr_parse_and_eval(d, *expr, const_names, const_values, NULL, NULL, NULL, NULL, NULL, 0, NULL); +#ifdef _MSC_VER +if (isnan(d)) +printf('%s' - nan\n\n, *expr); +else +#endif printf('%s' - %f\n\n, *expr, d); } -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] x86inc: automatically insert vzeroupper for YMM functions.
From: Ronald S. Bultje rsbul...@gmail.com --- libavcodec/x86/dct32_sse.asm|2 -- libavcodec/x86/dsputil_yasm.asm | 14 -- libavcodec/x86/fft_mmx.asm |6 -- libavresample/x86/audio_convert.asm | 10 -- libavresample/x86/audio_mix.asm | 10 -- libavutil/x86/float_dsp.asm | 10 -- libavutil/x86/x86inc.asm| 15 --- 7 files changed, 12 insertions(+), 55 deletions(-) diff --git a/libavcodec/x86/dct32_sse.asm b/libavcodec/x86/dct32_sse.asm index e3c8a45..351c88d 100644 --- a/libavcodec/x86/dct32_sse.asm +++ b/libavcodec/x86/dct32_sse.asm @@ -278,8 +278,6 @@ cglobal dct32_float_avx, 2,3,8, out, in, tmp vperm2f128 m0, m1, m1, 0x31 vmovaps [outq+96], m1 -vzeroupper - ;pass 6, no SIMD... INIT_XMM PASS6_AND_PERMUTE diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm index 077f3a0..70a0aa1 100644 --- a/libavcodec/x86/dsputil_yasm.asm +++ b/libavcodec/x86/dsputil_yasm.asm @@ -1158,12 +1158,7 @@ ALIGN 16 add src1q, 2*mmsize sub lenq, 2*mmsize jge .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -1193,12 +1188,7 @@ ALIGN 16 sub lenq, 2*mmsize jge .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -1243,10 +1233,6 @@ cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len %endif add lenq, mmsize jl .loop -%if mmsize == 32 -vzeroupper -RET -%endif .end: REP_RET %endmacro diff --git a/libavcodec/x86/fft_mmx.asm b/libavcodec/x86/fft_mmx.asm index 1a430b9..527e215 100644 --- a/libavcodec/x86/fft_mmx.asm +++ b/libavcodec/x86/fft_mmx.asm @@ -749,9 +749,6 @@ section .text ; The others pass args in registers and don't spill anything. 
cglobal fft_dispatch%2, 2,5,8, z, nbits FFT_DISPATCH fullsuffix, nbits -%if mmsize == 32 -vzeroupper -%endif RET %endmacro ; DECL_FFT @@ -957,9 +954,6 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i %if ARCH_X86_64 == 0 add esp, 12 %endif -%if mmsize == 32 -vzeroupper -%endif RET %endmacro diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 7b3cc22..244c4d1 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -145,12 +145,7 @@ cglobal conv_s32_to_flt, 3,3,3, dst, src, len mova [dstq+lenq+mmsize], m2 add lenq, mmsize*2 jl .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse2 @@ -218,12 +213,7 @@ cglobal conv_flt_to_s32, 3,3,5, dst, src, len mova [dstq+lenq+3*mmsize], m3 add lenq, mmsize*4 jl .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse2 diff --git a/libavresample/x86/audio_mix.asm b/libavresample/x86/audio_mix.asm index 58a4ded..dbfaa69 100644 --- a/libavresample/x86/audio_mix.asm +++ b/libavresample/x86/audio_mix.asm @@ -51,12 +51,7 @@ cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1 addsrcq, mmsize*2 sublend, mmsize*2/4 jg .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -175,12 +170,7 @@ cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1 add src0q, mmsize sublend, mmsize/4 jg .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse diff --git a/libavutil/x86/float_dsp.asm b/libavutil/x86/float_dsp.asm index 66ef093..c4e0c66 100644 --- a/libavutil/x86/float_dsp.asm +++ b/libavutil/x86/float_dsp.asm @@ -40,12 +40,7 @@ ALIGN 16 sub lenq, 2*mmsize jge .loop -%if mmsize == 32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse @@ -86,12 +81,7 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len mova [dstq+lenq+mmsize], m2 sublenq, 2*mmsize jge .loop -%if mmsize == 
32 -vzeroupper -RET -%else REP_RET -%endif %endmacro INIT_XMM sse diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm index 42ba97a..b76a10c 100644 --- a/libavutil/x86/x86inc.asm +++ b/libavutil/x86/x86inc.asm @@ -369,11 +369,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 120 %macro RET 0 WIN64_RESTORE_XMM_INTERNAL rsp POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 +%if mmsize == 32 +vzeroupper +%endif ret %endmacro %macro REP_RET 0 -%if regs_used 7 || xmm_regs_used 6 +%if regs_used 7 || xmm_regs_used 6 || mmsize == 32 RET %else rep ret @@ -410,11 +413,14 @@ DECLARE_REG 14, R15, R15D, R15W, R15B, 72 %macro RET 0 POP_IF_USED 14, 13, 12, 11, 10, 9 +%if mmsize == 32 +vzeroupper +%endif ret %endmacro
[libav-devel] [PATCH] avprobe/avconv: fix tentative declaration compile errors on MSVS.
From: Ronald S. Bultje rsbul...@gmail.com --- avconv.c |5 +++-- avprobe.c |5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/avconv.c b/avconv.c index 7142ab4..439672a 100644 --- a/avconv.c +++ b/avconv.c @@ -104,7 +104,7 @@ typedef struct MetadataMap { int index; /// stream/chapter/program number } MetadataMap; -static const OptionDef options[]; +static const OptionDef *options; static int video_discard = 0; static int same_quant = 0; @@ -4858,7 +4858,7 @@ static int opt_filter_complex(const char *opt, const char *arg) } #define OFFSET(x) offsetof(OptionsContext, x) -static const OptionDef options[] = { +static const OptionDef real_options[] = { /* main options */ #include cmdutils_common_opts.h { f, HAS_ARG | OPT_STRING | OPT_OFFSET, {.off = OFFSET(format)}, force format, fmt }, @@ -4975,6 +4975,7 @@ int main(int argc, char **argv) OptionsContext o = { 0 }; int64_t ti; +options = real_options; reset_options(o); av_log_set_flags(AV_LOG_SKIP_REPEATED); diff --git a/avprobe.c b/avprobe.c index 8e93d05..aa7dae4 100644 --- a/avprobe.c +++ b/avprobe.c @@ -44,7 +44,7 @@ static int use_byte_value_binary_prefix = 0; static int use_value_sexagesimal_format = 0; /* globals */ -static const OptionDef options[]; +static const OptionDef *options; /* AVprobe context */ static const char *input_filename; @@ -887,7 +887,7 @@ static void opt_pretty(void) use_value_sexagesimal_format = 1; } -static const OptionDef options[] = { +static const OptionDef real_options[] = { #include cmdutils_common_opts.h { f, HAS_ARG, {(void*)opt_format}, force format, format }, { of, HAS_ARG, {(void*)opt_output_format}, output the document either as ini or json, output_format }, @@ -927,6 +927,7 @@ int main(int argc, char **argv) if (!buffer) exit(1); +options = real_options; parse_loglevel(argc, argv, options); av_register_all(); avformat_network_init(); -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] H264: Remove 3dnow qpel code.
Hi, On Tue, Jul 24, 2012 at 7:25 AM, Luca Barbato lu_z...@gentoo.org wrote: On 7/24/12 4:45 AM, Ronald S. Bultje wrote: Hi, On Mon, Jul 23, 2012 at 7:45 PM, Ronald S. Bultje rsbul...@gmail.com wrote: Hi, On Mon, Jul 23, 2012 at 5:37 PM, Daniel Kang daniel.d.k...@gmail.com wrote: On Mon, Jul 23, 2012 at 5:21 PM, Diego Biurrun di...@biurrun.de wrote: On Mon, Jul 23, 2012 at 05:12:23PM -0700, Daniel Kang wrote: From: Daniel Kang daniel.d.k...@gmail.com The only CPUs that have 3dnow and don't have mmxext are 12 years old. --- libavcodec/x86/dsputil_mmx.c |9 - libavcodec/x86/h264_qpel_mmx.c |4 2 files changed, 0 insertions(+), 13 deletions(-) What sort of maintenance burden does this relieve us from? I'm writing this mail on a system fitting the description you mention, my trusty old K6-III. [..] 4. You can probably decode 260p H.264 with a K6-III. Who seriously would use this? This really is the killer. Is there any sort of reasonable expectation that a k6-3 can get useful work done when it comes to H264 decoding? I wouldn't even mind dropping all MMX optimizations (where MMX2 - i.e. SSE - or higher exists) altogether, i.e. going the H264 way .. x264 way. :). Let's discuss a bit or put a news item, the pros is to having a leaner system, the cons is cutting dry systems that might work fine for special purposes now. If we can have some compelling improvement (e.g. yasm) why not? The alternatively that I have suggested to Daniel is to keep the old inline asm code for 3dnow only. The disadvantage of that is that we keep pretty much all code around, just for 3dnow alone, and duplicate it in yasm form for all other optimization types. So it's practically possible, but at a high cost (+ that since it's duplicated, it'll be orphaned and unmaintained; any improvements to the new qpel code will not hit the 3dnow optimizations). Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] Replace x*155/100 by x*101581>>16.
From: Ronald S. Bultje rsbul...@gmail.com Idea stolen from webp (by Pascal Massimino) - because it's Cool. --- libavcodec/vp8.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c index d0e2a0c..e4cfbcb 100644 --- a/libavcodec/vp8.c +++ b/libavcodec/vp8.c @@ -249,12 +249,12 @@ static void get_quants(VP8Context *s) } else base_qi = yac_qi; -s-qmat[i].luma_qmul[0]= vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)]; -s-qmat[i].luma_qmul[1]= vp8_ac_qlookup[av_clip_uintp2(base_qi , 7)]; -s-qmat[i].luma_dc_qmul[0] = 2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)]; -s-qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100; -s-qmat[i].chroma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)]; -s-qmat[i].chroma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)]; +s-qmat[i].luma_qmul[0]= vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)]; +s-qmat[i].luma_qmul[1]= vp8_ac_qlookup[av_clip_uintp2(base_qi , 7)]; +s-qmat[i].luma_dc_qmul[0] = 2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)]; +s-qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) 16; +s-qmat[i].chroma_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)]; +s-qmat[i].chroma_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)]; s-qmat[i].luma_dc_qmul[1] = FFMAX(s-qmat[i].luma_dc_qmul[1], 8); s-qmat[i].chroma_qmul[0] = FFMIN(s-qmat[i].chroma_qmul[0], 132); -- 1.7.9.2 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [libav-commits] x86: fft: convert sse inline asm to yasm
Hi, On Tue, Jul 24, 2012 at 3:05 PM, Jason Garrett-Glaser ja...@x264.com wrote: On Tue, Jul 24, 2012 at 9:02 AM, John Stebbins stebb...@jetheaddev.com wrote: On 07/24/2012 05:53 PM, Jason Garrett-Glaser wrote: On Tue, Jul 24, 2012 at 8:34 AM, Måns Rullgård m...@mansr.com wrote: Jason Garrett-Glaser ja...@x264.com writes: On Tue, Jul 24, 2012 at 8:05 AM, John Stebbins stebb...@jetheaddev.com wrote: On 06/25/2012 02:42 PM, Mans Rullgard wrote: Module: libav Branch: master Commit: 82992604706144910f4a2f875d48cfc66c1b70d7 Author:Mans Rullgard m...@mansr.com Committer: Mans Rullgard m...@mansr.com Date: Sat Jun 23 19:08:11 2012 +0100 x86: fft: convert sse inline asm to yasm --- libavcodec/x86/Makefile|1 - libavcodec/x86/fft_mmx.asm | 139 --- libavcodec/x86/fft_sse.c | 110 -- 3 files changed, 129 insertions(+), 121 deletions(-) Hi, This commit is causing some strange interaction with libx264 in HandBrake under certain conditions. x264 is encoding at about 1/10th it's normal rate after updating to this commit. A little more background. When doing ac3 passthru HandBrake encodes a single packet of silence data to ac3 that is uses for filling any gaps that it detects in the audio. Encoding of this packet happens before any other encoding or decoding starts. For some crazy reason, if we encode this silence, we get the x264 slowdown. If we do not encode the silence, the speed is ok. I ran gprof on the code to see where all the time is being spent and it is all in x264. So it's not like there is some run-away loop somewhere that is bringing everything to it's knees. I'm guessing some cpu state must not be getting cleared or restored properly somewhere. John Could it have anything to do with denormals/NaN? Does x264 use floating-point SSE instructions anywhere? Yes, in macroblock-tree (because floating-point reciprocal is fast and IDIV is slow), and in ratecontrol. I don't know if it is of any help, but here's the top entries from gprof when this slowdown is happening. 
x264 defaults + b-adapt=2 Each sample counts as 0.01 seconds. % cumulative self self total time seconds secondscalls ms/call ms/call name 19.56 26.7126.71 x264_pixel_satd_16x4_internal_avx 17.85 51.0824.37 x264_pixel_satd_8x8_internal_avx 10.22 65.0313.95 x264_sub8x8_dct_avx.skip_prologue 9.11 77.4712.44 x264_hadamard_ac_8x8_avx 9.08 89.8712.40 x264_intra_sa8d_x9_8x8_avx 5.08 96.81 6.94 x264_sub8x8_dct8_avx.skip_prologue 2.96100.85 4.04 x264_pixel_satd_4x4_avx 2.45104.20 3.35 x264_intra_satd_x9_4x4_avx 1.80106.66 2.46 x264_mc_chroma_avx 1.58108.82 2.16 x264_hpel_filter_avx 1.46110.81 1.99 x264_pixel_ssim_4x4x2_core_avx 1.21112.46 1.65 x264_add8x8_idct_avx.skip_prologue 1.09113.95 1.49 x264_pixel_ssd_16x16_avx 1.09115.44 1.49 x264_me_search_ref 1.02116.83 1.39 x264_add8x8_idct8_avx.skip_prologue According to top, all CPUs are fully saturated That's an incredibly distorted profile -- it looks like all the AVX functions are running incredibly slowly. Note that all those functions do not use 256-bit AVX, only 128-bit AVX; Intel hasn't documented any sort of slowdown when mixing 128-bit SSE and 128-bit AVX, which we do without problems. Could the problem be that ffmpeg is doing 256-bit AVX, but then not using vzeroupper afterwards? Which CPU is this anyways? Do the x264 functions sign-extend all their integer arguments? Or put differently, does the problem occur for 32-bit builds also, or only for 64-bit builds? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] lavr: x86: improve non-SSE4 version of S16_TO_S32_SX macro
Hi, On Sat, Jul 14, 2012 at 10:33 AM, Justin Ruggles justin.rugg...@gmail.com wrote: On 06/26/2012 04:55 PM, Justin Ruggles wrote: Removes a false dependency on existing contents of the 2nd dst register, giving better performance for OOE. --- libavresample/x86/util.asm |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/libavresample/x86/util.asm b/libavresample/x86/util.asm index 501f662..ca7fde5 100644 --- a/libavresample/x86/util.asm +++ b/libavresample/x86/util.asm @@ -26,7 +26,8 @@ pmovsxwd m%1, m%1 SWAP %1, %2 %else -punpckhwdm%2, m%1 +mova m%2, m%1 +punpckhwdm%2, m%2 punpcklwdm%1, m%1 psradm%2, 16 psradm%1, 16 ping. Ok. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 01/15] lavr: x86: optimized 2-channel s16p to s16 conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 36 libavresample/x86/audio_convert_init.c | 13 +++ 2 files changed, 49 insertions(+), 0 deletions(-) diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 7b3cc22..0ca562a 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -233,6 +233,42 @@ INIT_YMM avx CONV_FLT_TO_S32 %endif +;-- +; void ff_conv_s16p_to_s16_2ch(int16_t *dst, int16_t *const *src, int len, +; int channels); +;-- + +%macro CONV_S16P_TO_S16_2CH 0 +cglobal conv_s16p_to_s16_2ch, 3,4,5, dst, src, len, src1 +mov src1q, [srcq+gprsize] +movsrcq, [srcq] +sub src1q, srcq +ALIGN 16 +.loop +mova m0, [srcq ] +mova m1, [srcq+src1q ] +mova m2, [srcq +mmsize] +mova m3, [srcq+src1q+mmsize] +SBUTTERFLY2 wd, 0, 1, 4 +SBUTTERFLY2 wd, 2, 3, 4 +mova [dstq ], m0 +mova [dstq+1*mmsize], m1 +mova [dstq+2*mmsize], m2 +mova [dstq+3*mmsize], m3 +addsrcq, mmsize*2 +adddstq, mmsize*4 +sublend, mmsize +jg .loop +REP_RET +%endmacro I'm bored, so... Does: sub dstq, srcq sub dstq, srcq in the init code, and then: mova [dstq+srcq*2+0*mmsize], m0 mova [dstq+srcq*2+1*mmsize], m1 mova [dstq+srcq*2+2*mmsize], m2 mova [dstq+srcq*2+3*mmsize], m3 add srcq, mmsize*2 sub lend, mmsize*2 jg .loop Lead to faster execution? If not, you can also add dstq, lenq (after len sign extend on x86-64), and then neg lenq and use dstq+lenq*2 instead. In both cases, the goal is to get rid of the 2xadd+sub in the inner loop. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 02/15] lavr: x86: optimized 6-channel s16p to s16 conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 62 libavresample/x86/audio_convert_init.c |9 + 2 files changed, 71 insertions(+), 0 deletions(-) diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 0ca562a..fdcea3a 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -269,6 +269,68 @@ INIT_XMM avx CONV_S16P_TO_S16_2CH %endif +;-- +; void ff_conv_s16p_to_s16_6ch(int16_t *dst, int16_t *const *src, int len, +; int channels); +;-- + +%macro CONV_S16P_TO_S16_6CH 0 +cglobal conv_s16p_to_s16_6ch, 2,8,6, dst, src, src1, src2, src3, src4, src5, len +%if ARCH_X86_64 +mov lend, r2d +%else +%define lend dword r2m +%endif Eehw, just do: %if ARCH_X86_64 cglobal ..., 3, 8, 6, dst, src, len, src1, src2, .. %else .. what you do up there .. %endif +movq [dstq ], m1 +movq [dstq+ 8], m0 +movq [dstq+16], m2 +movhps [dstq+24], m1 +movhps [dstq+32], m0 +movhps [dstq+40], m2 +add srcq, mmsize/2 +add dstq, mmsize*3 +sub lend, mmsize/4 +jg .loop +REP_RET +%endmacro Here, too, I think you can use imul lenq, 6, then add that to dstq, neg it and index dstq as [dstq+lenq+0/8/16/..]. Then add lend, mmsize/4 instead of sub, and jl instead of jg, and you can remove the add dstq, mmsize*3 from the inner loop. Does unrolling this by another factor of 2 (and thus being able to use aligned loads/stores) make a performance difference? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 03/15] lavr: x86: optimized 2-channel s16p to flt conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 49 libavresample/x86/audio_convert_init.c |9 ++ 2 files changed, 58 insertions(+), 0 deletions(-) LGTM. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 04/15] lavr: x86: optimized 6-channel s16p to flt conversion
Hi, On Sat, Jul 21, 2012 at 12:12 PM, Justin Ruggles justin.rugg...@gmail.com wrote: +%if cpuflag(ssse3) +pshufb m3, m0, unpack_odd ; m3 = 12, 13, 14, 15 +pshufb m0, unpack_even ; m0 = 0, 1, 2, 3 +pshufb m4, m1, unpack_odd ; m4 = 16, 17, 18, 19 +pshufb m1, unpack_even ; m1 = 4, 5, 6, 7 +pshufb m5, m2, unpack_odd ; m5 = 20, 21, 22, 23 +pshufb m2, unpack_even ; m2 = 8, 9, 10, 11 +%else I'm going to assume you tested vpperm and it was not faster? +mova [dstq ], m0 +mova [dstq+ mmsize], m1 +mova [dstq+2*mmsize], m2 +mova [dstq+3*mmsize], m3 +mova [dstq+4*mmsize], m4 +mova [dstq+5*mmsize], m5 +add srcq, mmsize/2 +add dstq, mmsize*6 +sub lend, mmsize/4 Can you try the pointer munging trick here too (i.e. sign-extend lend; imul lenq, x; add dstq, lenq; neg lenq) so add dstq, mmsize*6 and sub lend, mmsize/4 can be merged and we can remove one from the inner loop? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 05/15] lavr: x86: optimized 2-channel fltp to s16 conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 37 libavresample/x86/audio_convert_init.c |9 +++ 2 files changed, 46 insertions(+), 0 deletions(-) diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index ba6cb60..b241542 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -463,6 +463,43 @@ INIT_XMM avx CONV_S16P_TO_FLT_6CH %endif +;-- +; void ff_conv_fltp_to_s16_2ch(int16_t *dst, float *const *src, int len, +; int channels); +;-- + +%macro CONV_FLTP_TO_S16_2CH 0 +cglobal conv_fltp_to_s16_2ch, 3,4,3, dst, src0, len, src1 +lea lenq, [4*lend] +mov src1q, [src0q+gprsize] +mov src0q, [src0q] +add dstq, lenq +add src0q, lenq +add src1q, lenq +neg lenq +mova m2, [pf_s16_scale] +ALIGN 16 +.loop: +mulps m0, m2, [src0q+lenq] +mulps m1, m2, [src1q+lenq] +cvtps2dq m0, m0 +cvtps2dq m1, m1 +packssdw m0, m1 +movhlpsm1, m0 +punpcklwd m0, m1 You should be able to get slightly better performance (because of smaller dependency chain) by using: packssdw m0, m0 packssdw m1, m1 punpcklwd m0, m1 Please modify it to use that if faster. Otherwise OK. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 06/15] lavr: x86: optimized 6-channel fltp to s16 conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: +%else ; sse +mova xmm0, [srcq ] +mova xmm1, [srcq+src1q] +mova xmm2, [srcq+src2q] +mova xmm3, [srcq+src3q] +mova xmm4, [srcq+src4q] +mova xmm5, [srcq+src5q] +mulps xmm0, xmm6 +mulps xmm1, xmm6 +mulps xmm2, xmm6 +mulps xmm3, xmm6 +mulps xmm4, xmm6 +mulps xmm5, xmm6 +cvtps2pim0, xmm0 +cvtps2pim1, xmm1 +cvtps2pim2, xmm2 +cvtps2pim3, xmm3 +cvtps2pim4, xmm4 +cvtps2pim5, xmm5 +packssdwm0, m3 ; m0 = 0, 6, 3, 9 +packssdwm1, m4 ; m1 = 1, 7, 4, 10 +packssdwm2, m5 ; m2 = 2, 8, 5, 11 +; unpack words +pshufw m3, m0, q1032 ; m3 = 3, 9, 0, 6 +punpcklwd m0, m1 ; m0 = 0, 1, 6, 7 +punpckhwd m1, m2 ; m1 = 4, 5, 10, 11 +punpcklwd m2, m3 ; m2 = 2, 3, 8, 9 +; unpack dwords +pshufw m3, m0, q1032 ; m3 = 6, 7, 0, 1 +punpckldq m0, m2 ; m0 = 0, 1, 2, 3 (final) +punpckhdq m2, m1 ; m2 = 8, 9, 10, 11 (final) +punpckldq m1, m3 ; m1 = 4, 5, 6, 7 (final) +mova [dstq ], m0 +mova [dstq+ 8], m1 +mova [dstq+16], m2 I'd agree with Loren that the use of sse as a function name, but having mX refer to mmx registers, is somewhat confusing. I guess it's OK since it's obvious from the code what is intended, just wanted to note that it's confusing. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 07/15] lavr: x86: optimized 2-channel fltp to flt conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: +mova [dstq ], m0 +mova [dstq+1*mmsize], m1 +mova [dstq+2*mmsize], m2 +mova [dstq+3*mmsize], m3 +add srcq, mmsize*2 +add dstq, mmsize*4 +sub lend, mmsize/2 You can probably munge pointers such that one add suffices. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 08/15] lavr: x86: optimized 2-channel s16 to s16p conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 38 libavresample/x86/audio_convert_init.c | 11 + 2 files changed, 49 insertions(+), 0 deletions(-) diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 9ba7251..70519e1 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -734,3 +734,41 @@ CONV_FLTP_TO_FLT_6CH INIT_XMM avx CONV_FLTP_TO_FLT_6CH %endif + +;-- +; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len, +; int channels); +;-- + +%macro CONV_S16_TO_S16P_2CH 0 +cglobal conv_s16_to_s16p_2ch, 3,4,3, dst0, src, len, dst1 +lea lenq, [2*lend] +mov dst1q, [dst0q+gprsize] +mov dst0q, [dst0q] +lea srcq, [srcq+2*lenq] +add dst0q, lenq +add dst1q, lenq +neg lenq +ALIGN 16 +.loop: +movam0, [srcq+2*lenq ] +movam1, [srcq+2*lenq+mmsize] +pshuflw m0, m0, q3120 +pshufhw m0, m0, q3120 +pshuflw m1, m1, q3120 +pshufhw m1, m1, q3120 +shufps m2, m0, m1, q2020 +shufps m0, m1, q3131 +mova [dst0q+lenq], m2 +mova [dst1q+lenq], m0 The more common way to do this (I believe) is to set up mask reg: pcmpeqb m4, m4 psrlw m4, 8 ; 0x00ff Then mask/shift: mova m0, [srcq+2*lenq+0*mmsize] mova m1, [srcq+2*lenq+1*mmsize] psrlw m2, m0, 8 psrlw m3, m1, 8 pand m0, m4 pand m1, m4 packsswb m0, m1 packsswb m2, m3 mova [dst1q+lenq], m0 mova [dst2q+lenq], m2 However, that's not less instructions, maybe worth checking anyway. Alternatively, a pshufb version: mova m3, [pb_02468ace13579bdf] .loop: mova m0, [srcq+2*lenq+0*mmsize] mova m1, [srcq+2*lenq+1*mmsize] pshufb m0, m3 pshufb m1, m3 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 mova [dst1q+lenq], m2 mova [dst2q+lenq], m0 2 instructions less, and only 2 unpacks as opposed to all the shuffles, so potentially faster (except on Atom where pshufb is dog-slow). Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 08/15] lavr: x86: optimized 2-channel s16 to s16p conversion
Hi, On Tue, Jul 24, 2012 at 9:41 PM, Ronald S. Bultje rsbul...@gmail.com wrote: Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 38 libavresample/x86/audio_convert_init.c | 11 + 2 files changed, 49 insertions(+), 0 deletions(-) diff --git a/libavresample/x86/audio_convert.asm b/libavresample/x86/audio_convert.asm index 9ba7251..70519e1 100644 --- a/libavresample/x86/audio_convert.asm +++ b/libavresample/x86/audio_convert.asm @@ -734,3 +734,41 @@ CONV_FLTP_TO_FLT_6CH INIT_XMM avx CONV_FLTP_TO_FLT_6CH %endif + +;-- +; void ff_conv_s16_to_s16p_2ch(int16_t *const *dst, int16_t *src, int len, +; int channels); +;-- + +%macro CONV_S16_TO_S16P_2CH 0 +cglobal conv_s16_to_s16p_2ch, 3,4,3, dst0, src, len, dst1 +lea lenq, [2*lend] +mov dst1q, [dst0q+gprsize] +mov dst0q, [dst0q] +lea srcq, [srcq+2*lenq] +add dst0q, lenq +add dst1q, lenq +neg lenq +ALIGN 16 +.loop: +movam0, [srcq+2*lenq ] +movam1, [srcq+2*lenq+mmsize] +pshuflw m0, m0, q3120 +pshufhw m0, m0, q3120 +pshuflw m1, m1, q3120 +pshufhw m1, m1, q3120 +shufps m2, m0, m1, q2020 +shufps m0, m1, q3131 +mova [dst0q+lenq], m2 +mova [dst1q+lenq], m0 The more common way to do this (I believe) is to set up mask reg: pcmpeqb m4, m4 psrlw m4, 8 ; 0x00ff Then mask/shift: mova m0, [srcq+2*lenq+0*mmsize] mova m1, [srcq+2*lenq+1*mmsize] psrlw m2, m0, 8 psrlw m3, m1, 8 pand m0, m4 pand m1, m4 packsswb m0, m1 packsswb m2, m3 mova [dst1q+lenq], m0 mova [dst2q+lenq], m2 However, that's not less instructions, maybe worth checking anyway. Alternatively, a pshufb version: mova m3, [pb_02468ace13579bdf] .loop: mova m0, [srcq+2*lenq+0*mmsize] mova m1, [srcq+2*lenq+1*mmsize] pshufb m0, m3 pshufb m1, m3 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 mova [dst1q+lenq], m2 mova [dst2q+lenq], m0 2 instructions less, and only 2 unpacks as opposed to all the shuffles, so potentially faster (except on Atom where pshufb is dog-slow). 
Actually that's all byte-based, but I guess it's obvious what I mean so should be easy to convert to word-speak. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 09/15] lavr: x86: optimized 6-channel s16 to s16p conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: +mova m0, [srcq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 +mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 +movq m3, [srcq+ mmsize+mmsize/2] +movlhpsm3, m2 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 +movhpd m1, [srcq+ mmsize] +movhlpsm1, m0 ; m1 = 4, 5, 6, 7, 8, 9, 10, 11 +psrldq m1, 4; m1 = 6, 7, 8, 9, 10, 11, x, x +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23, x, x +psrldq m1, 4; m1 = 6, 7, 8, 9, 10, 11, x, x +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23, x, x That's ... weird (at least for the AVX version): mova m0, [srcq+0*mmsize] ; 0-7 mova m2, [srcq+1*mmsize] ; 8-15 mova m3, [srcq+2*mmsize] ; 16-23 palignr m1, m0, m2, 12 ; 6-11 palignr m2, m3, 8 ; 12-17 psrldq m3, 4 ; 18-23 +punpcklwd m4, m0, m1 ; m4 = 0, 6, 1, 7, 2, 8, 3, 9 +punpckhwd m0, m1 ; m0 = 4, 10, 5, 11, x, x, x, x +punpcklwd m1, m3, m2 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21 +punpckhwd m3, m2 ; m3 = 16, 22, 17, 23, x, x, x, x +punpckldq m2, m4, m1 ; m2 = 0, 6, 12, 18, 1, 7, 13, 19 +punpckhdq m4, m1 ; m4 = 2, 8, 14, 20, 3, 9, 15, 21 +punpckldq m0, m3 ; m0 = 4, 10, 16, 22, 5, 11, 17, 23 +movq[dstq ], m2 +movhps [dstq+dst1q], m2 +movq[dstq+dst2q], m4 +movhps [dstq+dst3q], m4 +movq[dstq+dst4q], m0 +movhps [dstq+dst5q], m0 +add srcq, mmsize*3 +add dstq, mmsize/2 +sub lend, mmsize/4 Pointer munging should allow getting rid of one sub/add. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 10/15] lavr: x86: optimized 2-channel s16 to fltp conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 41 libavresample/x86/audio_convert_init.c | 13 ++ 2 files changed, 54 insertions(+), 0 deletions(-) LGTM. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 11/15] lavr: x86: optimized 6-channel s16 to fltp conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: +mova m0, [srcq ] ; m0 = 0, 1, 2, 3, 4, 5, 6, 7 +mova m1, [srcq+ mmsize] ; m1 = 8, 9, 10, 11, 12, 13, 14, 15 +mova m2, [srcq+2*mmsize] ; m2 = 16, 17, 18, 19, 20, 21, 22, 23 +movhlpsm3, m1 +movlhpsm3, m2 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 +movlhpsm1, m1 +movhlpsm1, m0 ; m1 = 4, 5, 6, 7, 8, 9, 10, 11 +psrldq m1, 4; m1 = 6, 7, 8, 9, 10, 11, x, x +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23, x, x See 10/15, should be able to do this using palignr x2+psrldqx1 instead. +punpcklwd m4, m0, m1 ; m4 = 0, 6, 1, 7, 2, 8, 3, 9 +punpckhwd m0, m1 ; m0 = 4, 10, 5, 11, x, x, x, x +punpcklwd m1, m3, m2 ; m1 = 12, 18, 13, 19, 14, 20, 15, 21 +punpckhwd m3, m2 ; m3 = 16, 22, 17, 23, x, x, x, x +punpckldq m2, m4, m1 ; m2 = 0, 6, 12, 18, 1, 7, 13, 19 +punpckhdq m4, m1 ; m4 = 2, 8, 14, 20, 3, 9, 15, 21 +punpckldq m0, m3 ; m0 = 4, 10, 16, 22, 5, 11, 17, 23 +movhlpsm3, m2 ; m3 = 1, 7, 13, 19, x, x, x, x +movhlpsm5, m4 ; m5 = 3, 9, 15, 21, x, x, x, x +movhlpsm1, m0 ; m1 = 5, 11, 17, 23, x, x, x, x +PMOVSXWD m0, m0 +PMOVSXWD m1, m1 +PMOVSXWD m2, m2 +PMOVSXWD m3, m3 +PMOVSXWD m4, m4 +PMOVSXWD m5, m5 +cvtdq2ps m0, m0 +cvtdq2ps m1, m1 +cvtdq2ps m2, m2 +cvtdq2ps m3, m3 +cvtdq2ps m4, m4 +cvtdq2ps m5, m5 +mulps m0, m6 +mulps m1, m6 +mulps m2, m6 +mulps m3, m6 +mulps m4, m6 +mulps m5, m6 +mova [dstq ], m2 +mova [dstq+dst1q], m3 +mova [dstq+dst2q], m4 +mova [dstq+dst3q], m5 +mova [dstq+dst4q], m0 +mova [dstq+dst5q], m1 +add srcq, mmsize*3 +add dstq, mmsize +sub lend, mmsize/4 Pointer munging allows to remove one add/sub. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 12/15] lavr: x86: optimized 2-channel flt to s16p conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 49 libavresample/x86/audio_convert_init.c |9 ++ 2 files changed, 58 insertions(+), 0 deletions(-) LGTM. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 13/15] lavr: x86: optimized 6-channel flt to s16p conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: +movhlpsm3, m1 +movlhpsm3, m2 ; m3 = 12, 13, 14, 15, 16, 17, 18, 19 +movlhpsm1, m1 +movhlpsm1, m0 ; m1 = 4, 5, 6, 7, 8, 9, 10, 11 +psrldq m1, 4; m1 = 6, 7, 8, 9, 10, 11, x, x +psrldq m2, 4; m2 = 18, 19, 20, 21, 22, 23, x, x palignrx2+psrldqx1, saves 3 instructions. +add srcq, mmsize*6 +add dstq, mmsize/2 +sub lend, mmsize/4 Pointer munging to remove one add/sub. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 14/15] lavr: x86: optimized 2-channel flt to fltp conversion
Hi, On Sat, Jul 14, 2012 at 9:29 PM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 34 libavresample/x86/audio_convert_init.c |9 2 files changed, 43 insertions(+), 0 deletions(-) OK. (Can this be implemented in YMM with the current instructions available?) Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 15/15] lavr: x86: optimized 6-channel flt to fltp conversion
Hi, On Tue, Jul 17, 2012 at 6:16 AM, Justin Ruggles justin.rugg...@gmail.com wrote: --- libavresample/x86/audio_convert.asm| 63 libavresample/x86/audio_convert_init.c |9 + 2 files changed, 72 insertions(+), 0 deletions(-) (I'm going to assume Loren had no further comments) LGTM. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/7] vf_hqdn3d: simplify and optimize
Hi, On Tue, Jul 24, 2012 at 7:45 PM, Loren Merritt lor...@u.washington.edu wrote: -long x, y; -uint32_t pixel; +uint32_t tmp; -for (y = 0; y < h; y++) { -for (x = 0; x < w; x++) { -pixel = lowpass(frame_ant[x]<<8, src[x]<<16, temporal); -frame_ant[x] = ((pixel+0x107F)>>8); -dst[x]= ((pixel+0x10007FFF)>>16); +for (long y = 0; y < h; y++) { +for (long x = 0; x < w; x++) { Unfortunately, this won't compile on MSVC, please do declare the variables outside the loop. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] vp3: don't use calls to inline asm in yasm code.
Hi, On Mon, Jul 23, 2012 at 7:12 AM, Ronald S. Bultje rsbul...@gmail.com wrote: On Sun, Jul 22, 2012 at 2:38 PM, Ronald S. Bultje rsbul...@gmail.com wrote: From: Ronald S. Bultje rsbul...@gmail.com Mixing yasm and inline asm is a bad idea, since if either yasm or inline asm is not supported by your toolchain, all of the asm stops working. Thus, better to use either one or the other alone. --- libavcodec/x86/vp3dsp.asm | 120 + 1 file changed, 79 insertions(+), 41 deletions(-) Ping. Jason LGTM'ed this over the weekend on IRC, I'll push this if there's no further comments. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] x86/dsputil: put inline asm under HAVE_INLINE_ASM.
Hi, On Sun, Jul 22, 2012 at 3:27 PM, Derek Buitenhuis derek.buitenh...@gmail.com wrote: On 22/07/2012 6:14 PM, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com This allows compiling with compilers that don't support gcc-style inline assembly. --- I think this looks OK, assuming: 1) You've tested every yasm/inline asm enable/disable combination 2) Everybody else is OK with using ifdefs. If there's no further comments, I'll push this tomorrow. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264: refactor NAL decode loop.
Hi, On Mon, Jul 23, 2012 at 2:05 AM, Diego Biurrun di...@biurrun.de wrote: On Sun, Jul 22, 2012 at 08:46:10PM -0700, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com Write out the NAL decoding loops in full so that they are easier to parse for a preprocessor without it having to be aware of macros or other such things in C code. This also makes the code more readable. --- libavcodec/h264.c | 42 +- 1 file changed, 25 insertions(+), 17 deletions(-) LGTM --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -175,42 +175,50 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, +#define STARTCODE_TEST \ +if (i + 2 length src[i + 1] == 0 src[i + 2] = 3) { \ +if (src[i + 2] != 3) { \ +/* startcode, so we must be past the end */ \ +length = i; \ +} \ +break; \ +} #if HAVE_FAST_UNALIGNED +#define FIND_FIRST_ZERO \ +if (i 0 !src[i]) \ +i--; \ +while (src[i]) \ +i++ h264.c is one of the pretty files now, so please align the '\', preferably on column 72. My editor doesn't support that - can you do that? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 2/2] vp3: don't use calls to inline asm in yasm code.
Hi, On Sun, Jul 22, 2012 at 2:38 PM, Ronald S. Bultje rsbul...@gmail.com wrote: From: Ronald S. Bultje rsbul...@gmail.com Mixing yasm and inline asm is a bad idea, since if either yasm or inline asm is not supported by your toolchain, all of the asm stops working. Thus, better to use either one or the other alone. --- libavcodec/x86/vp3dsp.asm | 120 + 1 file changed, 79 insertions(+), 41 deletions(-) Ping. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.
Hi, On Sun, Jul 22, 2012 at 1:16 PM, Ronald S. Bultje rsbul...@gmail.com wrote: From: Ronald S. Bultje rsbul...@gmail.com This completes the conversion of h264dsp to yasm; note that h264 also uses some dsputil functions, most notably qpel. Performance-wise, the yasm-version is ~10 cycles faster (182-172) on x86-64, and ~8 cycles faster (201-193) on x86-32. --- libavcodec/x86/h264_deblock.asm | 168 +++ libavcodec/x86/h264dsp_mmx.c| 162 ++--- 2 files changed, 175 insertions(+), 155 deletions(-) Ping. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264: refactor NAL decode loop.
Hi, On Mon, Jul 23, 2012 at 7:14 AM, Kostya Shishkov kostya.shish...@gmail.com wrote: On Mon, Jul 23, 2012 at 07:11:49AM -0700, Ronald S. Bultje wrote: Hi, On Mon, Jul 23, 2012 at 2:05 AM, Diego Biurrun di...@biurrun.de wrote: On Sun, Jul 22, 2012 at 08:46:10PM -0700, Ronald S. Bultje wrote: From: Ronald S. Bultje rsbul...@gmail.com Write out the NAL decoding loops in full so that they are easier to parse for a preprocessor without it having to be aware of macros or other such things in C code. This also makes the code more readable. --- libavcodec/h264.c | 42 +- 1 file changed, 25 insertions(+), 17 deletions(-) LGTM --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -175,42 +175,50 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, +#define STARTCODE_TEST \ +if (i + 2 length src[i + 1] == 0 src[i + 2] = 3) { \ +if (src[i + 2] != 3) { \ +/* startcode, so we must be past the end */ \ +length = i; \ +} \ +break; \ +} #if HAVE_FAST_UNALIGNED +#define FIND_FIRST_ZERO \ +if (i 0 !src[i]) \ +i--; \ +while (src[i]) \ +i++ h264.c is one of the pretty files now, so please align the '\', preferably on column 72. My editor doesn't support that - can you do that? Mine neither, but pressing spacebar and releasing it after some time does the trick just fine. Hard to say if it's at 72 or not, is it? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264: refactor NAL decode loop
Hi, On Mon, Jul 23, 2012 at 7:29 AM, Luca Barbato lu_z...@gentoo.org wrote: From: Ronald S. Bultje rsbul...@gmail.com Write out the NAL decoding loops in full so that they are easier to parse for a preprocessor without it having to be aware of macros or other such things in C code. This also makes the code more readable. Signed-off-by: Luca Barbato lu_z...@gentoo.org --- libavcodec/h264.c | 42 +- 1 file changed, 25 insertions(+), 17 deletions(-) Thanks, go ahead and commit this. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] H264: Remove 3dnow qpel code.
Hi, On Mon, Jul 23, 2012 at 5:37 PM, Daniel Kang daniel.d.k...@gmail.com wrote: On Mon, Jul 23, 2012 at 5:21 PM, Diego Biurrun di...@biurrun.de wrote: On Mon, Jul 23, 2012 at 05:12:23PM -0700, Daniel Kang wrote: From: Daniel Kang daniel.d.k...@gmail.com The only CPUs that have 3dnow and don't have mmxext are 12 years old. --- libavcodec/x86/dsputil_mmx.c |9 - libavcodec/x86/h264_qpel_mmx.c |4 2 files changed, 0 insertions(+), 13 deletions(-) What sort of maintenance burden does this relieve us from? I'm writing this mail on a system fitting the description you mention, my trusty old K6-III. [..] 4. You can probably decode 260p H.264 with a K6-III. Who seriously would use this? This really is the killer. Is there any sort of reasonable expectation that a k6-3 can get useful work done when it comes to H264 decoding? I wouldn't even mind dropping all MMX optimizations (where MMX2 - i.e. SSE - or higher exists) altogether, i.e. going the H264 way and requiring SSE for useful x86 performance (i.e. non-C-only). Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] H264: Remove 3dnow qpel code.
Hi, On Mon, Jul 23, 2012 at 7:45 PM, Ronald S. Bultje rsbul...@gmail.com wrote: Hi, On Mon, Jul 23, 2012 at 5:37 PM, Daniel Kang daniel.d.k...@gmail.com wrote: On Mon, Jul 23, 2012 at 5:21 PM, Diego Biurrun di...@biurrun.de wrote: On Mon, Jul 23, 2012 at 05:12:23PM -0700, Daniel Kang wrote: From: Daniel Kang daniel.d.k...@gmail.com The only CPUs that have 3dnow and don't have mmxext are 12 years old. --- libavcodec/x86/dsputil_mmx.c |9 - libavcodec/x86/h264_qpel_mmx.c |4 2 files changed, 0 insertions(+), 13 deletions(-) What sort of maintenance burden does this relieve us from? I'm writing this mail on a system fitting the description you mention, my trusty old K6-III. [..] 4. You can probably decode 260p H.264 with a K6-III. Who seriously would use this? This really is the killer. Is there any sort of reasonable expectation that a k6-3 can get useful work done when it comes to H264 decoding? I wouldn't even mind dropping all MMX optimizations (where MMX2 - i.e. SSE - or higher exists) altogether, i.e. going the H264 way .. x264 way. :). Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] lavfi: put inline assembly under HAVE_INLINE_ASM.
Hi, On Sat, Jul 21, 2012 at 5:03 PM, Ronald S. Bultje rsbul...@gmail.com wrote: From: Ronald S. Bultje rsbul...@gmail.com This allows compiling this code using compilers that do not understand gcc-style inline assembly. --- libavfilter/x86/gradfun.c |6 ++ libavfilter/x86/yadif.c |6 ++ 2 files changed, 12 insertions(+) Ping. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264: refactor NAL decoding loop.
Hi, On Sat, Jul 21, 2012 at 5:19 PM, Måns Rullgård m...@mansr.com wrote: Ronald S. Bultje rsbul...@gmail.com writes: From: Ronald S. Bultje rsbul...@gmail.com This removes some code duplication between the 3 different versions, and aligns brackets in such a way that it is now possible to pull this code through a naive pre-processor that doesn't necessarily have to be aware of compiler-macros. --- libavcodec/h264.c | 36 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/libavcodec/h264.c b/libavcodec/h264.c index a4afcc8..20fa7c3 100644 --- a/libavcodec/h264.c +++ b/libavcodec/h264.c @@ -178,30 +178,34 @@ const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, #if HAVE_FAST_UNALIGNED #if HAVE_FAST_64BIT #define RS 7 -for (i = 0; i + 1 length; i += 9) { -if (!((~AV_RN64A(src + i) - (AV_RN64A(src + i) - 0x0100010001000101ULL)) +#define MASKCHECK \ +if (!((~AV_RN64A(src + i) \ + (AV_RN64A(src + i) - 0x0100010001000101ULL)) \ 0x8000800080008080ULL)) #else #define RS 3 -for (i = 0; i + 1 length; i += 5) { -if (!((~AV_RN32A(src + i) - (AV_RN32A(src + i) - 0x01000101U)) +#define MASKCHECK \ +if (!((~AV_RN32A(src + i) \ + (AV_RN32A(src + i) - 0x01000101U)) \ 0x80008080U)) #endif -continue; -if (i 0 !src[i]) -i--; -while (src[i]) -i++; +#define LOOPCHECK \ +MASKCHECK \ +continue; \ +if (i 0 !src[i]) \ +i--; \ +while (src[i]) \ +i++ #else #define RS 0 -for (i = 0; i + 1 length; i += 2) { -if (src[i]) -continue; -if (i 0 src[i - 1] == 0) -i--; +#define LOOPCHECK \ +if (src[i]) \ +continue; \ +if (i 0 src[i - 1] == 0) \ +i-- #endif +for (i = 0; i + 1 length; i += RS + 2) { +LOOPCHECK; if (i + 2 length src[i + 1] == 0 src[i + 2] = 3) { if (src[i + 2] != 3) { /* startcode, so we must be past the end */ -- This manner of splitting things is incredibly weird-looking. 
Instead of trying to unify these rather different fragments, turning the second half of the loop into a macro and writing out separate loops, each calling the macro for the common part, would probably look much more sane. #define LOOP_COMMON_PART \ if (i + 2 length src[i + 1] == 0 src[i + 2] = 3) { \ if (src[i + 2] != 3) { \ /* startcode, so we must be past the end */ \ length = i; \ } \ break; \ } #if HAVE_FAST_UNALIGNED #define CHECK_COMMON_PART \ if (i 0 !src[i]) \ i--; \ while (src[i]) \ i++ #if HAVE_FAST_64BIT for (i = 0; i + 1 length; i += 9) { if (!((~AV_RN64A(src + i) (AV_RN64A(src + i) - 0x0100010001000101ULL)) 0x8000800080008080ULL)) continue; CHECK_COMMON_PART; LOOP_COMMON_PART; i -= 7; } #else for (i = 0; i + 1 length; i += 5) { if (!((~AV_RN32A(src + i) (AV_RN32A(src + i) - 0x01000101U)) 0x80008080U)) continue; CHECK_COMMON_PART; LOOP_COMMON_PART; i -= 3; } #endif #else for (i = 0; i + 1 length; i += 2) { if (src[i]) continue; if (i 0 src[i - 1] == 0) i--; LOOP_COMMON_PART; } #endif Pick your bet and commit whichever is nicer; both work with the preprocessor. (I think the earlier one looks better.) Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] h264: refactor NAL decoding loop.
Hi, On Sun, Jul 22, 2012 at 8:17 AM, Måns Rullgård m...@mansr.com wrote: Ronald S. Bultje rsbul...@gmail.com writes: This manner of splitting things is incredibly weird-looking. Instead of trying to unify these rather different fragments, turning the second half of the loop into a macro and writing out separate loops, each calling the macro for the common part, would probably look much more sane. #define LOOP_COMMON_PART \ if (i + 2 length src[i + 1] == 0 src[i + 2] = 3) { \ if (src[i + 2] != 3) { \ /* startcode, so we must be past the end */ \ length = i; \ } \ break; \ } #if HAVE_FAST_UNALIGNED #define CHECK_COMMON_PART \ if (i 0 !src[i]) \ i--; \ while (src[i]) \ i++ #if HAVE_FAST_64BIT for (i = 0; i + 1 length; i += 9) { if (!((~AV_RN64A(src + i) (AV_RN64A(src + i) - 0x0100010001000101ULL)) 0x8000800080008080ULL)) continue; CHECK_COMMON_PART; LOOP_COMMON_PART; i -= 7; } #else for (i = 0; i + 1 length; i += 5) { if (!((~AV_RN32A(src + i) (AV_RN32A(src + i) - 0x01000101U)) 0x80008080U)) continue; CHECK_COMMON_PART; LOOP_COMMON_PART; i -= 3; } #endif #else for (i = 0; i + 1 length; i += 2) { if (src[i]) continue; if (i 0 src[i - 1] == 0) i--; LOOP_COMMON_PART; } #endif With a bit saner names for the macros, this is way more readable. Given the risk of having to send the patch 10x, why don't you guys suggest some names to make review shorter? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.
From: Ronald S. Bultje rsbul...@gmail.com This completes the conversion of h264dsp to yasm; note that h264 also uses some dsputil functions, most notably qpel. Performance-wise, the yasm-version is ~10 cycles faster (182-172) on x86-64, and ~8 cycles faster (201-193) on x86-32. --- libavcodec/x86/h264_deblock.asm | 168 +++ libavcodec/x86/h264dsp_mmx.c| 160 ++--- 2 files changed, 174 insertions(+), 154 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 1982dc4..43c7b95 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -27,6 +27,10 @@ %include x86inc.asm %include x86util.asm +SECTION_RODATA + +pb_3_1: times 4 db 3, 1 + SECTION .text cextern pb_0 @@ -911,3 +915,167 @@ ff_chroma_intra_body_mmxext: paddb m1, m5 paddb m2, m6 ret + +;- +; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40], +;int8_t ref[2][40], int16_t mv[2][40][2], +;int bidir,int edges,int step, +;int mask_mv0, int mask_mv1, int field); +; +; bidiris 0 or 1 +; edgesis 1 or 4 +; step is 1 or 2 +; mask_mv0 is 0 or 3 +; mask_mv1 is 0 or 1 +; fieldis 0 or 1 +;- +%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv, +; dir, d_idx, mask_dir, bidir +%define edgesm%1 +%define stepm %2 +%define mask_mvm %3 +%define dir %4 +%define d_idx %5 +%define mask_dir %6 +%define bidir %7 +xor b_idxq, b_idxq ; for (b_idx = 0; b_idx edges; b_idx += step) +.b_idx_loop_ %+ dir %+ _ %+ bidir: +%if mask_dir == 0 +pxor m0, m0 +%endif +test b_idxd, dword mask_mvm +jnz .skip_loop_iter_ %+ dir %+ _ %+ bidir ; if (!(b_idx mask_mv)) +%if bidir == 1 +movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] } +punpckldqm2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] } +pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] } +pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] } +pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] } +psubbm0, m2 ; { ref0[b] != ref0[bn], +; ref0[b] != ref1[bn] } +psubbm1, m3 ; { ref1[b] != ref1[bn], +; ref1[b] 
!= ref0[bn] } + +por m0, m1 +mova m1, [mvq+b_idxq*4+(d_idx+12)*4] +mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] +mova m3, m1 +mova m4, m2 +psubwm1, [mvq+b_idxq*4+12*4] +psubwm2, [mvq+b_idxq*4+12*4+mmsize] +psubwm3, [mvq+b_idxq*4+52*4] +psubwm4, [mvq+b_idxq*4+52*4+mmsize] +packsswb m1, m2 +packsswb m3, m4 +paddbm1, m6 +paddbm3, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +psubusb m3, m5 +packsswb m1, m3 + +por m0, m1 +mova m1, [mvq+b_idxq*4+(d_idx+52)*4] +mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize] +mova m3, m1 +mova m4, m2 +psubwm1, [mvq+b_idxq*4+12*4] +psubwm2, [mvq+b_idxq*4+12*4+mmsize] +psubwm3, [mvq+b_idxq*4+52*4] +psubwm4, [mvq+b_idxq*4+52*4+mmsize] +packsswb m1, m2 +packsswb m3, m4 +paddbm1, m6 +paddbm3, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +psubusb m3, m5 +packsswb m1, m3 + +pshufw m1, m1, 0x4E +por m0, m1 +pshufw m1, m0, 0x4E +pminub m0, m1 +%else ; bidir == 0 +movd m0, [refq+b_idxq+12] +psubbm0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn] + +mova m1, [mvq+b_idxq*4+12*4] +mova m2, [mvq+b_idxq*4+12*4+mmsize] +psubwm1, [mvq+b_idxq*4+(d_idx+12)*4] +psubwm2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] +packsswb m1, m2 +paddbm1, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +packsswb m1, m1 +por m0, m1 +%endif ; bidir == 1/0 + +.skip_loop_iter_ %+ dir %+ _ %+ bidir: +movd m1, [nnzq+b_idxq+12] +por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn] + +pminub m1, m7 +pminub m0, m7 +psllwm1, 1 +pxor m2, m2 +pmaxub
[libav-devel] [PATCH] h264: convert loop filter strength dsp function to yasm.
From: Ronald S. Bultje rsbul...@gmail.com This completes the conversion of h264dsp to yasm; note that h264 also uses some dsputil functions, most notably qpel. Performance-wise, the yasm-version is ~10 cycles faster (182-172) on x86-64, and ~8 cycles faster (201-193) on x86-32. --- libavcodec/x86/h264_deblock.asm | 168 +++ libavcodec/x86/h264dsp_mmx.c| 162 ++--- 2 files changed, 175 insertions(+), 155 deletions(-) diff --git a/libavcodec/x86/h264_deblock.asm b/libavcodec/x86/h264_deblock.asm index 1982dc4..43c7b95 100644 --- a/libavcodec/x86/h264_deblock.asm +++ b/libavcodec/x86/h264_deblock.asm @@ -27,6 +27,10 @@ %include x86inc.asm %include x86util.asm +SECTION_RODATA + +pb_3_1: times 4 db 3, 1 + SECTION .text cextern pb_0 @@ -911,3 +915,167 @@ ff_chroma_intra_body_mmxext: paddb m1, m5 paddb m2, m6 ret + +;- +; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40], +;int8_t ref[2][40], int16_t mv[2][40][2], +;int bidir,int edges,int step, +;int mask_mv0, int mask_mv1, int field); +; +; bidiris 0 or 1 +; edgesis 1 or 4 +; step is 1 or 2 +; mask_mv0 is 0 or 3 +; mask_mv1 is 0 or 1 +; fieldis 0 or 1 +;- +%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv, +; dir, d_idx, mask_dir, bidir +%define edgesm%1 +%define stepm %2 +%define mask_mvm %3 +%define dir %4 +%define d_idx %5 +%define mask_dir %6 +%define bidir %7 +xor b_idxq, b_idxq ; for (b_idx = 0; b_idx edges; b_idx += step) +.b_idx_loop_ %+ dir %+ _ %+ bidir: +%if mask_dir == 0 +pxor m0, m0 +%endif +test b_idxd, dword mask_mvm +jnz .skip_loop_iter_ %+ dir %+ _ %+ bidir ; if (!(b_idx mask_mv)) +%if bidir == 1 +movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] } +punpckldqm2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] } +pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] } +pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] } +pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] } +psubbm0, m2 ; { ref0[b] != ref0[bn], +; ref0[b] != ref1[bn] } +psubbm1, m3 ; { ref1[b] != ref1[bn], +; ref1[b] 
!= ref0[bn] } + +por m0, m1 +mova m1, [mvq+b_idxq*4+(d_idx+12)*4] +mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] +mova m3, m1 +mova m4, m2 +psubwm1, [mvq+b_idxq*4+12*4] +psubwm2, [mvq+b_idxq*4+12*4+mmsize] +psubwm3, [mvq+b_idxq*4+52*4] +psubwm4, [mvq+b_idxq*4+52*4+mmsize] +packsswb m1, m2 +packsswb m3, m4 +paddbm1, m6 +paddbm3, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +psubusb m3, m5 +packsswb m1, m3 + +por m0, m1 +mova m1, [mvq+b_idxq*4+(d_idx+52)*4] +mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize] +mova m3, m1 +mova m4, m2 +psubwm1, [mvq+b_idxq*4+12*4] +psubwm2, [mvq+b_idxq*4+12*4+mmsize] +psubwm3, [mvq+b_idxq*4+52*4] +psubwm4, [mvq+b_idxq*4+52*4+mmsize] +packsswb m1, m2 +packsswb m3, m4 +paddbm1, m6 +paddbm3, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +psubusb m3, m5 +packsswb m1, m3 + +pshufw m1, m1, 0x4E +por m0, m1 +pshufw m1, m0, 0x4E +pminub m0, m1 +%else ; bidir == 0 +movd m0, [refq+b_idxq+12] +psubbm0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn] + +mova m1, [mvq+b_idxq*4+12*4] +mova m2, [mvq+b_idxq*4+12*4+mmsize] +psubwm1, [mvq+b_idxq*4+(d_idx+12)*4] +psubwm2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] +packsswb m1, m2 +paddbm1, m6 +psubusb m1, m5 ; abs(mv[b] - mv[bn]) = limit +packsswb m1, m1 +por m0, m1 +%endif ; bidir == 1/0 + +.skip_loop_iter_ %+ dir %+ _ %+ bidir: +movd m1, [nnzq+b_idxq+12] +por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn] + +pminub m1, m7 +pminub m0, m7 +psllwm1, 1 +pxor m2, m2 +pmaxub
[libav-devel] [PATCH] swscale: add missing HAVE_INLINE_ASM check.
From: Ronald S. Bultje rsbul...@gmail.com The function called in this block is under HAVE_INLINE_ASM itself also. --- libswscale/swscale.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libswscale/swscale.c b/libswscale/swscale.c index 7ae5af3..5cfa7f2 100644 --- a/libswscale/swscale.c +++ b/libswscale/swscale.c @@ -518,7 +518,7 @@ static int swScale(SwsContext *c, const uint8_t *src[], if (!enough_lines) break; // we can't output a dstY line so let's try with the next slice -#if HAVE_MMX +#if HAVE_MMX && HAVE_INLINE_ASM updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf); #endif -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] swscale: add fast bilinear scaler under HAVE_INLINE_ASM.
From: Ronald S. Bultje rsbul...@gmail.com --- libswscale/utils.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index d8fee58..a6b5a18 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -576,7 +576,7 @@ fail: return ret; } -#if HAVE_MMX2 +#if HAVE_MMX2 HAVE_INLINE_ASM static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *filter, int32_t *filterPos, int numSplits) { @@ -739,7 +739,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, return fragmentPos + 1; } -#endif /* HAVE_MMX2 */ +#endif /* HAVE_MMX2 HAVE_INLINE_ASM */ static void getSubSampleFactors(int *h, int *v, enum PixelFormat format) { @@ -971,7 +971,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) FF_ALLOC_OR_GOTO(c, c-formatConvBuffer, (FFALIGN(srcW, 16) * 2 * FFALIGN(c-srcBpc, 8) 3) + 16, fail); -if (HAVE_MMX2 cpu_flags AV_CPU_FLAG_MMX2 +if (HAVE_MMX2 HAVE_INLINE_ASM cpu_flags AV_CPU_FLAG_MMX2 c-srcBpc == 8 c-dstBpc = 10) { c-canMMX2BeUsed = (dstW = srcW (dstW 31) == 0 (srcW 15) == 0) ? 1 : 0; @@ -1010,7 +1010,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) /* precalculate horizontal scaler filter coefficients */ { -#if HAVE_MMX2 +#if HAVE_MMX2 HAVE_INLINE_ASM // can't downscale !!! if (c-canMMX2BeUsed (flags SWS_FAST_BILINEAR)) { c-lumMmx2FilterCodeSize = initMMX2HScaler(dstW, c-lumXInc, NULL, @@ -1046,7 +1046,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) mprotect(c-chrMmx2FilterCode, c-chrMmx2FilterCodeSize, PROT_EXEC | PROT_READ); #endif } else -#endif /* HAVE_MMX2 */ +#endif /* HAVE_MMX2 HAVE_INLINE_ASM */ { const int filterAlign = (HAVE_MMX cpu_flags AV_CPU_FLAG_MMX) ? 4 : -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH 1/2] x86/dsputil: put inline asm under HAVE_INLINE_ASM.
From: Ronald S. Bultje rsbul...@gmail.com This allows compiling with compilers that don't support gcc-style inline assembly. --- libavcodec/x86/dsputil_mmx.c | 69 -- libavcodec/x86/h264_qpel_mmx.c |4 ++- libavcodec/x86/idct_mmx.c|4 +++ libavcodec/x86/idct_mmx_xvid.c |4 +++ libavcodec/x86/idct_sse2_xvid.c |4 +++ libavcodec/x86/rv40dsp_init.c|4 +++ libavcodec/x86/simple_idct_mmx.c |4 +++ 7 files changed, 75 insertions(+), 18 deletions(-) diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 5eb4a24..a8b31e4 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -84,6 +84,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; +#if HAVE_INLINE_ASM + #define JUMPALIGN() __asm__ volatile (.p2align 3::) #define MOVQ_ZERO(regd) __asm__ volatile (pxor %%#regd, %%#regd ::) @@ -1836,6 +1838,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride) avg_pixels16_xy2_mmx(dst, src, stride, 16); } +#endif /* HAVE_INLINE_ASM */ + #if HAVE_YASM typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src, x86_reg linesize, x86_reg start_y, @@ -1904,6 +1908,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, } #endif /* HAVE_YASM */ +#if HAVE_INLINE_ASM + typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, int linesize, int block_w, int block_h, int src_x, int src_y, int w, int h); @@ -2073,6 +2079,8 @@ PREFETCH(prefetch_mmx2, prefetcht0) PREFETCH(prefetch_3dnow, prefetch) #undef PREFETCH +#endif /* HAVE_INLINE_ASM */ + #include h264_qpel_mmx.c void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, @@ -2118,6 +2126,8 @@ CHROMA_MC(avg, 8, 10, sse2) CHROMA_MC(put, 8, 10, avx) CHROMA_MC(avg, 8, 10, avx) +#if HAVE_INLINE_ASM + /* CAVS-specific */ void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, 
int stride) { @@ -2476,6 +2486,8 @@ static void vector_clipf_sse(float *dst, const float *src, ); } +#endif /* HAVE_INLINE_ASM */ + int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order); int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, @@ -2588,6 +2600,7 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) { const int high_bit_depth = avctx-bits_per_raw_sample 8; +#if HAVE_INLINE_ASM c-put_pixels_clamped= ff_put_pixels_clamped_mmx; c-put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; c-add_pixels_clamped= ff_add_pixels_clamped_mmx; @@ -2610,10 +2623,6 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) #if ARCH_X86_32 || !HAVE_YASM c-gmc = gmc_mmx; #endif -#if ARCH_X86_32 HAVE_YASM -if (!high_bit_depth) -c-emulated_edge_mc = emulated_edge_mc_mmx; -#endif c-add_bytes = add_bytes_mmx; @@ -2621,8 +2630,14 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) c-h263_v_loop_filter = h263_v_loop_filter_mmx; c-h263_h_loop_filter = h263_h_loop_filter_mmx; } +#endif /* HAVE_INLINE_ASM */ #if HAVE_YASM +#if ARCH_X86_32 +if (!high_bit_depth) +c-emulated_edge_mc = emulated_edge_mc_mmx; +#endif + if (!high_bit_depth CONFIG_H264CHROMA) { c-put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd; c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx; @@ -2639,6 +2654,7 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, const int bit_depth = avctx-bits_per_raw_sample; const int high_bit_depth = bit_depth 8; +#if HAVE_INLINE_ASM c-prefetch = prefetch_mmx2; if (!high_bit_depth) { @@ -2674,22 +2690,27 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, c-put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2; c-put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2; } +#endif /* HAVE_INLINE_ASM */ if (CONFIG_H264QPEL) { +#if HAVE_INLINE_ASM 
SET_QPEL_FUNCS(put_qpel,0, 16, mmx2, ); SET_QPEL_FUNCS(put_qpel,1, 8, mmx2, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, ); SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, ); SET_QPEL_FUNCS(avg_qpel,0, 16, mmx2, ); SET_QPEL_FUNCS(avg_qpel,1, 8, mmx2, ); +#endif /* HAVE_INLINE_ASM */ if (!high_bit_depth) { +#if HAVE_INLINE_ASM SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, ); SET_QPEL_FUNCS(put_h264_qpel, 1
[libav-devel] [PATCH 2/2] vp3: don't use calls to inline asm in yasm code.
From: Ronald S. Bultje rsbul...@gmail.com Mixing yasm and inline asm is a bad idea, since if either yasm or inline asm is not supported by your toolchain, all of the asm stops working. Thus, better to use either one or the other alone. --- libavcodec/x86/vp3dsp.asm | 120 + 1 file changed, 79 insertions(+), 41 deletions(-) diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm index 58fa1f7..af2f60c 100644 --- a/libavcodec/x86/vp3dsp.asm +++ b/libavcodec/x86/vp3dsp.asm @@ -38,13 +38,11 @@ cextern pb_1 cextern pb_3 cextern pb_7 cextern pb_1F +cextern pb_80 cextern pb_81 cextern pw_8 -cextern put_signed_pixels_clamped_mmx -cextern add_pixels_clamped_mmx - SECTION .text ; this is off by one or two for some cases when filter_limit is greater than 63 @@ -523,56 +521,96 @@ cglobal vp3_h_loop_filter_mmx2, 3, 4 PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 %endmacro -%macro vp3_idct_funcs 3 -cglobal vp3_idct_put_%1, 3, %3, %2 +%macro vp3_idct_funcs 1 +cglobal vp3_idct_put_%1, 3, 4, 9 VP3_IDCT_%1 r2 -%if ARCH_X86_64 -mov r3, r2 -mov r2, r1 -mov r1, r0 -mov r0, r3 + +movsxdifnidn r1, r1d +mova m4, [pb_80] +lea r3, [r1*3] +%assign %%i 0 +%rep 16/mmsize +mova m0, [r2+mmsize*0+%%i] +mova m1, [r2+mmsize*2+%%i] +mova m2, [r2+mmsize*4+%%i] +mova m3, [r2+mmsize*6+%%i] +packsswb m0, [r2+mmsize*1+%%i] +packsswb m1, [r2+mmsize*3+%%i] +packsswb m2, [r2+mmsize*5+%%i] +packsswb m3, [r2+mmsize*7+%%i] +paddb m0, m4 +paddb m1, m4 +paddb m2, m4 +paddb m3, m4 +movq [r0 ], m0 +%if mmsize == 8 +movq [r0+r1 ], m1 +movq [r0+r1*2], m2 +movq [r0+r3 ], m3 %else -mov r0m, r2 -mov r1m, r0 -mov r2m, r1 +movhps [r0+r1 ], m0 +movq [r0+r1*2], m1 +movhps [r0+r3 ], m1 %endif -%if WIN64 -call put_signed_pixels_clamped_mmx -RET -%else -jmp put_signed_pixels_clamped_mmx +%if %%i == 0 +lea r0, [r0+r1*4] +%endif +%if mmsize == 16 +movq [r0 ], m2 +movhps [r0+r1 ], m2 +movq [r0+r1*2], m3 +movhps [r0+r3 ], m3 %endif +%assign %%i %%i+64 +%endrep +RET -cglobal vp3_idct_add_%1, 3, %3, %2 +cglobal vp3_idct_add_%1, 3, 4, 
9 VP3_IDCT_%1 r2 -%if ARCH_X86_64 -mov r3, r2 -mov r2, r1 -mov r1, r0 -mov r0, r3 -%else -mov r0m, r2 -mov r1m, r0 -mov r2m, r1 + +mov r3, 4 +pxor m4, m4 +movsxdifnidn r1, r1d +.loop: +movq m0, [r0] +movq m1, [r0+r1] +%if mmsize == 8 +mova m2, m0 +mova m3, m1 %endif -%if WIN64 -call add_pixels_clamped_mmx -RET -%else -jmp add_pixels_clamped_mmx +punpcklbw m0, m4 +punpcklbw m1, m4 +%if mmsize == 8 +punpckhbw m2, m4 +punpckhbw m3, m4 +%endif +paddswm0, [r2+ 0] +paddswm1, [r2+16] +%if mmsize == 8 +paddswm2, [r2+ 8] +paddswm3, [r2+24] +packuswb m0, m2 +packuswb m1, m3 +%else ; mmsize == 16 +packuswb m0, m1 %endif +movq [r0 ], m0 +%if mmsize == 8 +movq [r0+r1], m1 +%else ; mmsize == 16 +movhps [r0+r1], m0 +%endif +lea r0, [r0+r1*2] +add r2, 32 +dec r3 +jg .loop +RET %endmacro -%if ARCH_X86_64 -%define REGS 4 -%else -%define REGS 3 -%endif INIT_MMX -vp3_idct_funcs mmx, 0, REGS +vp3_idct_funcs mmx INIT_XMM -vp3_idct_funcs sse2, 9, REGS -%undef REGS +vp3_idct_funcs sse2 %macro DC_ADD 0 movq m2, [r0 ] -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
[libav-devel] [PATCH] x86/dsputil: put inline asm under HAVE_INLINE_ASM.
From: Ronald S. Bultje rsbul...@gmail.com This allows compiling with compilers that don't support gcc-style inline assembly. --- libavcodec/dct-test.c|2 +- libavcodec/x86/dsputil_mmx.c | 69 -- libavcodec/x86/h264_qpel_mmx.c |4 ++- libavcodec/x86/idct_mmx.c|4 +++ libavcodec/x86/idct_mmx_xvid.c |4 +++ libavcodec/x86/idct_sse2_xvid.c |4 +++ libavcodec/x86/rv40dsp_init.c|4 +++ libavcodec/x86/simple_idct_mmx.c |4 +++ 8 files changed, 76 insertions(+), 19 deletions(-) diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c index ceff448..5046544 100644 --- a/libavcodec/dct-test.c +++ b/libavcodec/dct-test.c @@ -108,7 +108,7 @@ static const struct algo idct_tab[] = { { INT,ff_j_rev_dct, MMX_PERM }, { SIMPLE-C, ff_simple_idct_8, NO_PERM }, -#if HAVE_MMX +#if HAVE_MMX HAVE_INLINE_ASM #if CONFIG_GPL { LIBMPEG2-MMX, ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 }, { LIBMPEG2-MMX2, ff_mmxext_idct,MMX_PERM, AV_CPU_FLAG_MMX2, 1 }, diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 5eb4a24..a8b31e4 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -84,6 +84,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; +#if HAVE_INLINE_ASM + #define JUMPALIGN() __asm__ volatile (.p2align 3::) #define MOVQ_ZERO(regd) __asm__ volatile (pxor %%#regd, %%#regd ::) @@ -1836,6 +1838,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride) avg_pixels16_xy2_mmx(dst, src, stride, 16); } +#endif /* HAVE_INLINE_ASM */ + #if HAVE_YASM typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src, x86_reg linesize, x86_reg start_y, @@ -1904,6 +1908,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, } #endif /* HAVE_YASM */ +#if HAVE_INLINE_ASM + typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, int linesize, int block_w, 
int block_h, int src_x, int src_y, int w, int h); @@ -2073,6 +2079,8 @@ PREFETCH(prefetch_mmx2, prefetcht0) PREFETCH(prefetch_3dnow, prefetch) #undef PREFETCH +#endif /* HAVE_INLINE_ASM */ + #include h264_qpel_mmx.c void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, @@ -2118,6 +2126,8 @@ CHROMA_MC(avg, 8, 10, sse2) CHROMA_MC(put, 8, 10, avx) CHROMA_MC(avg, 8, 10, avx) +#if HAVE_INLINE_ASM + /* CAVS-specific */ void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { @@ -2476,6 +2486,8 @@ static void vector_clipf_sse(float *dst, const float *src, ); } +#endif /* HAVE_INLINE_ASM */ + int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order); int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, @@ -2588,6 +2600,7 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) { const int high_bit_depth = avctx-bits_per_raw_sample 8; +#if HAVE_INLINE_ASM c-put_pixels_clamped= ff_put_pixels_clamped_mmx; c-put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; c-add_pixels_clamped= ff_add_pixels_clamped_mmx; @@ -2610,10 +2623,6 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) #if ARCH_X86_32 || !HAVE_YASM c-gmc = gmc_mmx; #endif -#if ARCH_X86_32 HAVE_YASM -if (!high_bit_depth) -c-emulated_edge_mc = emulated_edge_mc_mmx; -#endif c-add_bytes = add_bytes_mmx; @@ -2621,8 +2630,14 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) c-h263_v_loop_filter = h263_v_loop_filter_mmx; c-h263_h_loop_filter = h263_h_loop_filter_mmx; } +#endif /* HAVE_INLINE_ASM */ #if HAVE_YASM +#if ARCH_X86_32 +if (!high_bit_depth) +c-emulated_edge_mc = emulated_edge_mc_mmx; +#endif + if (!high_bit_depth CONFIG_H264CHROMA) { c-put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd; c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx; @@ -2639,6 +2654,7 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx, const int bit_depth = avctx->bits_per_raw_sample; const int high_bit_depth = bit_depth > 8; +#if HAVE_INLINE_ASM c->prefetch = prefetch_mmx2; if (!high_bit_depth) { @@ -2674,22 +2690,27 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2; c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2; } +#endif
[libav-devel] [PATCH] x86/dsputil: put inline asm under HAVE_INLINE_ASM.
From: Ronald S. Bultje rsbul...@gmail.com This allows compiling with compilers that don't support gcc-style inline assembly. --- libavcodec/dct-test.c|2 +- libavcodec/x86/dsputil_mmx.c | 69 -- libavcodec/x86/h264_qpel_mmx.c |4 ++- libavcodec/x86/idct_mmx.c|4 +++ libavcodec/x86/idct_mmx_xvid.c |4 +++ libavcodec/x86/idct_sse2_xvid.c |4 +++ libavcodec/x86/rv40dsp_init.c|2 ++ libavcodec/x86/simple_idct_mmx.c |4 +++ 8 files changed, 74 insertions(+), 19 deletions(-) diff --git a/libavcodec/dct-test.c b/libavcodec/dct-test.c index ceff448..5046544 100644 --- a/libavcodec/dct-test.c +++ b/libavcodec/dct-test.c @@ -108,7 +108,7 @@ static const struct algo idct_tab[] = { { INT,ff_j_rev_dct, MMX_PERM }, { SIMPLE-C, ff_simple_idct_8, NO_PERM }, -#if HAVE_MMX +#if HAVE_MMX HAVE_INLINE_ASM #if CONFIG_GPL { LIBMPEG2-MMX, ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 }, { LIBMPEG2-MMX2, ff_mmxext_idct,MMX_PERM, AV_CPU_FLAG_MMX2, 1 }, diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 5eb4a24..a8b31e4 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -84,6 +84,8 @@ DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 }; DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 }; +#if HAVE_INLINE_ASM + #define JUMPALIGN() __asm__ volatile (.p2align 3::) #define MOVQ_ZERO(regd) __asm__ volatile (pxor %%#regd, %%#regd ::) @@ -1836,6 +1838,8 @@ void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride) avg_pixels16_xy2_mmx(dst, src, stride, 16); } +#endif /* HAVE_INLINE_ASM */ + #if HAVE_YASM typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src, x86_reg linesize, x86_reg start_y, @@ -1904,6 +1908,8 @@ static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, } #endif /* HAVE_YASM */ +#if HAVE_INLINE_ASM + typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src, int linesize, int block_w, 
int block_h, int src_x, int src_y, int w, int h); @@ -2073,6 +2079,8 @@ PREFETCH(prefetch_mmx2, prefetcht0) PREFETCH(prefetch_3dnow, prefetch) #undef PREFETCH +#endif /* HAVE_INLINE_ASM */ + #include h264_qpel_mmx.c void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src, @@ -2118,6 +2126,8 @@ CHROMA_MC(avg, 8, 10, sse2) CHROMA_MC(put, 8, 10, avx) CHROMA_MC(avg, 8, 10, avx) +#if HAVE_INLINE_ASM + /* CAVS-specific */ void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { @@ -2476,6 +2486,8 @@ static void vector_clipf_sse(float *dst, const float *src, ); } +#endif /* HAVE_INLINE_ASM */ + int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order); int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, @@ -2588,6 +2600,7 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) { const int high_bit_depth = avctx-bits_per_raw_sample 8; +#if HAVE_INLINE_ASM c-put_pixels_clamped= ff_put_pixels_clamped_mmx; c-put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; c-add_pixels_clamped= ff_add_pixels_clamped_mmx; @@ -2610,10 +2623,6 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) #if ARCH_X86_32 || !HAVE_YASM c-gmc = gmc_mmx; #endif -#if ARCH_X86_32 HAVE_YASM -if (!high_bit_depth) -c-emulated_edge_mc = emulated_edge_mc_mmx; -#endif c-add_bytes = add_bytes_mmx; @@ -2621,8 +2630,14 @@ static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags) c-h263_v_loop_filter = h263_v_loop_filter_mmx; c-h263_h_loop_filter = h263_h_loop_filter_mmx; } +#endif /* HAVE_INLINE_ASM */ #if HAVE_YASM +#if ARCH_X86_32 +if (!high_bit_depth) +c-emulated_edge_mc = emulated_edge_mc_mmx; +#endif + if (!high_bit_depth CONFIG_H264CHROMA) { c-put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd; c-put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx; @@ -2639,6 +2654,7 @@ static void dsputil_init_mmx2(DSPContext *c, 
AVCodecContext *avctx, const int bit_depth = avctx->bits_per_raw_sample; const int high_bit_depth = bit_depth > 8; +#if HAVE_INLINE_ASM c->prefetch = prefetch_mmx2; if (!high_bit_depth) { @@ -2674,22 +2690,27 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2; c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2; } +#endif
[libav-devel] [PATCH] swscale: place inline assembly bilinear scaler under HAVE_INLINE_ASM.
From: Ronald S. Bultje rsbul...@gmail.com --- libswscale/utils.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index d8fee58..a6b5a18 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -576,7 +576,7 @@ fail: return ret; } -#if HAVE_MMX2 +#if HAVE_MMX2 HAVE_INLINE_ASM static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, int16_t *filter, int32_t *filterPos, int numSplits) { @@ -739,7 +739,7 @@ static int initMMX2HScaler(int dstW, int xInc, uint8_t *filterCode, return fragmentPos + 1; } -#endif /* HAVE_MMX2 */ +#endif /* HAVE_MMX2 HAVE_INLINE_ASM */ static void getSubSampleFactors(int *h, int *v, enum PixelFormat format) { @@ -971,7 +971,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) FF_ALLOC_OR_GOTO(c, c-formatConvBuffer, (FFALIGN(srcW, 16) * 2 * FFALIGN(c-srcBpc, 8) 3) + 16, fail); -if (HAVE_MMX2 cpu_flags AV_CPU_FLAG_MMX2 +if (HAVE_MMX2 HAVE_INLINE_ASM cpu_flags AV_CPU_FLAG_MMX2 c-srcBpc == 8 c-dstBpc = 10) { c-canMMX2BeUsed = (dstW = srcW (dstW 31) == 0 (srcW 15) == 0) ? 1 : 0; @@ -1010,7 +1010,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) /* precalculate horizontal scaler filter coefficients */ { -#if HAVE_MMX2 +#if HAVE_MMX2 HAVE_INLINE_ASM // can't downscale !!! if (c-canMMX2BeUsed (flags SWS_FAST_BILINEAR)) { c-lumMmx2FilterCodeSize = initMMX2HScaler(dstW, c-lumXInc, NULL, @@ -1046,7 +1046,7 @@ int sws_init_context(SwsContext *c, SwsFilter *srcFilter, SwsFilter *dstFilter) mprotect(c-chrMmx2FilterCode, c-chrMmx2FilterCodeSize, PROT_EXEC | PROT_READ); #endif } else -#endif /* HAVE_MMX2 */ +#endif /* HAVE_MMX2 HAVE_INLINE_ASM */ { const int filterAlign = (HAVE_MMX cpu_flags AV_CPU_FLAG_MMX) ? 4 : -- 1.7.9.5 ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH] swscale: Mark all init functions as av_cold
Hi, On Sun, Jul 22, 2012 at 3:30 PM, Diego Biurrun di...@biurrun.de wrote: --- libswscale/output.c | 15 --- libswscale/ppc/swscale_altivec.c |3 ++- libswscale/ppc/yuv2rgb_altivec.c | 11 +++ libswscale/rgb2rgb.c |3 ++- libswscale/sparc/yuv2rgb_vis.c |3 ++- libswscale/utils.c |4 +++- libswscale/x86/rgb2rgb.c |3 ++- libswscale/x86/swscale.c |3 ++- libswscale/x86/yuv2rgb.c |3 ++- 9 files changed, 30 insertions(+), 18 deletions(-) OK. Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
Re: [libav-devel] [PATCH 1/2] build: support non-standard replacements for -E flag
Hi, On Sun, Jul 22, 2012 at 5:35 PM, Måns Rullgård m...@mansr.com wrote: Diego Biurrun di...@biurrun.de writes: On Mon, Jul 23, 2012 at 01:16:07AM +0100, Måns Rullgård wrote: Diego Biurrun di...@biurrun.de writes: On Mon, Jul 23, 2012 at 12:16:41AM +0100, Mans Rullgard wrote: This allows using non-standard flags for running the C preprocessor. The -o flag must be included in this setting due to strange syntax required by some compilers. --- a/configure +++ b/configure @@ -632,7 +636,7 @@ check_cpp(){ log_file $TMPC -check_cmd $cc $CPPFLAGS $CFLAGS $@ -E -o $TMPO $TMPC +check_cmd $cc $CPPFLAGS $CFLAGS $@ $(cc_e $TMPO) $TMPC } @@ -1724,6 +1728,7 @@ cflags_filter=echo AS_O='-o $@' +CC_E='-E -o $@' CC_O='-o $@' LD_O='-o $@' HOSTCC_O='-o $@' @@ -2042,7 +2047,8 @@ probe_cc(){ -unset _type _ident _cc_o _flags _cflags _ldflags _depflags _DEPCMD _DEPFLAGS +unset _type _ident _cc_e _cc_o _flags _cflags _ldflags +unset _depflags _DEPCMD _DEPFLAGS _flags_filter=echo @@ -2105,6 +2111,7 @@ probe_cc(){ _flags='--gcc --abi=eabi -me' _cflags='-D__gnuc_va_list=va_list -D__USER_LABEL_PREFIX__=' +_cc_e='-ppl -fe=$@' _cc_o='-fe=$@' Why not set CC_E (or whatever) to -E, -ppl and combine it with the cc_o function we already have? Something like this: for gcc (default): CC_E='-E' for that strange beast: CC_E='-ppl' -check_cmd $cc $CPPFLAGS $CFLAGS $@ -E -o $TMPO $TMPC +check_cmd $cc $CPPFLAGS $CFLAGS $@ $CC_E $(cc_o $TMPO) $TMPC Because there are even stranger beasts in the compiler jungle. For example, the IAR compiler needs CC_E='--preprocess=n $@' (note the lack of -o). It's probably not the only one. CC_E='--preprocess=n' _cc_o='$@' Why would something like that not work? That would break normal compilation, because *then* it wants -o. How do you intend to use this with MSVC? 
See: http://msdn.microsoft.com/en-us/library/8z9z0bx6(v=vs.71).aspx and http://msdn.microsoft.com/en-us/library/3xkfswhy(v=vs.71) Note how the second writes to stdout (leading to HUUGE config.log files), and the first unfortunately does not allow to specify an output file, instead writing to `pwd`/inputfilebasename.i. Using -P is probably useless unless it can be combined with a mv, but maybe -E can be redirected if our configure script supports that? Ronald ___ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel