vlc | branch: master | Lyndon Brown <jnq...@gmail.com> | Sun Mar 17 00:55:19 2019 +0000| [464e2d440348d74904c53c42aeb366dce78e3394] | committer: Thomas Guillem

sepia: fix lack of Y adjustment

The SIMD accelerated PlanarI420 implementation contained mistakes that meant
that actually Y was being written untouched.

Signed-off-by: Thomas Guillem <tho...@gllm.fr>

> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=464e2d440348d74904c53c42aeb366dce78e3394
---

 modules/video_filter/sepia.c | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/modules/video_filter/sepia.c b/modules/video_filter/sepia.c
index 5f4a4c61fb..9ca567c614 100644
--- a/modules/video_filter/sepia.c
+++ b/modules/video_filter/sepia.c
@@ -201,24 +201,22 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
  *****************************************************************************/
 VLC_SSE
 static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
-                               int i_intensity_spread)
+                               int i_intensity_shifted_pair)
 {
     __asm__ volatile (
         // y = y - y / 4 + i_intensity / 4
         "movq (%1), %%xmm1\n"
-        "punpcklbw %%xmm7, %%xmm1\n"
-        "movq (%1), %%xmm2\n" // store bytes as words with 0s in between
-        "punpcklbw %%xmm7, %%xmm2\n"
+        "punpcklbw %%xmm7, %%xmm1\n" // zero-extend bytes to words
+        "movdqa %%xmm1, %%xmm2\n" // copy it
         "movd %2, %%xmm3\n"
         "pshufd $0, %%xmm3, %%xmm3\n"
-        "psrlw $2, %%xmm2\n" // rotate right 2
-        "psubusb %%xmm1, %%xmm2\n" // subtract
-        "psrlw $2, %%xmm3\n"
-        "paddsb %%xmm1, %%xmm3\n" // add
-        "packuswb %%xmm2, %%xmm1\n" // pack back to bytes
-        "movq %%xmm1, (%0) \n" // load to dest
+        "psrlw $2, %%xmm2\n" // get 1/4 of it
+        "psubusb %%xmm2, %%xmm1\n"
+        "paddusb %%xmm3, %%xmm1\n"
+        "packuswb %%xmm1, %%xmm1\n" // pack back to bytes
+        "movq %%xmm1, (%0) \n"
         :
-        :"r" (dst), "r"(src), "r"(i_intensity_spread)
+        :"r" (dst), "r"(src), "r"(i_intensity_shifted_pair)
         :"memory", "xmm1", "xmm2", "xmm3");
 }
 
@@ -230,11 +228,9 @@ static void PlanarI420SepiaSSE( picture_t *p_pic, picture_t *p_outpic,
     const uint8_t filling_const_8u = 128 - i_intensity / 6;
     const uint8_t filling_const_8v = 128 + i_intensity / 14;
     /* prepared value for faster broadcasting in xmm register */
-    int i_intensity_spread = 0x10001 * (uint8_t) i_intensity;
+    int i_intensity_shifted_pair = 0x10001 * (((uint8_t) i_intensity) >> 2);
 
-    __asm__ volatile(
-        "pxor %%xmm7, %%xmm7\n"
-        ::: "xmm7");
+    __asm__ volatile("pxor %%xmm7, %%xmm7\n" ::: "xmm7");
 
     /* iterate for every two visible line in the frame */
     for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
@@ -250,16 +246,16 @@ static void PlanarI420SepiaSSE( picture_t *p_pic, picture_t *p_outpic,
             /* Compute yellow channel values with asm function */
             Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
                         &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
-                        i_intensity_spread );
+                        i_intensity_shifted_pair );
             Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
                         &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
-                        i_intensity_spread );
+                        i_intensity_shifted_pair );
             Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
                         &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
-                        i_intensity_spread );
+                        i_intensity_shifted_pair );
             Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
                         &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
-                        i_intensity_spread );
+                        i_intensity_shifted_pair );
             /* Copy precomputed values to destination memory location */
             memset(&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
                    filling_const_8u, 8 );
@@ -363,7 +359,7 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
 /*****************************************************************************
  * PackedYUVSepia: Applies sepia to one frame of the packed YUV video
  *****************************************************************************
- * This function applies sepia effext to one frame of the video by iterating
+ * This function applies sepia effect to one frame of the video by iterating
  * through video lines. In every pass, we calculate new values for pixels
  * (UYVY, VYUY, YUYV and YVYU formats are supported)
  *****************************************************************************/

_______________________________________________
vlc-commits mailing list
vlc-commits@videolan.org
https://mailman.videolan.org/listinfo/vlc-commits