Am 11.04.2016 um 21:05 schrieb Matt Turner:
> This will make adding SSE2 code a lot cleaner.
> ---
>  src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 42 +++++++++++++++-----------
>  1 file changed, 24 insertions(+), 18 deletions(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index fa5ec75..5d58530 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -85,6 +85,22 @@ rgba8_copy(void *dst, const void *src, size_t bytes)
>  #ifdef __SSSE3__
>  static const uint8_t rgba8_permutation[16] =
>     { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
> +
> +static inline void
> +rgba8_copy_16_aligned_dst(void *dst, const void *src)
> +{
> +   _mm_store_si128(dst,
> +                   _mm_shuffle_epi8(_mm_loadu_si128(src),
> +                                    *(__m128i *)rgba8_permutation));
> +}
> +
> +static inline void
> +rgba8_copy_16_aligned_src(void *dst, const void *src)
> +{
> +   _mm_storeu_si128(dst,
> +                    _mm_shuffle_epi8(_mm_load_si128(src),
> +                                     *(__m128i *)rgba8_permutation));
> +}
>  #endif
>  
>  /**
> @@ -93,23 +109,18 @@ static const uint8_t rgba8_permutation[16] =
>  static inline void *
>  rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
>  {
> -   uint8_t *d = dst;
> -   uint8_t const *s = src;
> -
>     assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));
>  
>  #ifdef __SSSE3__
>     while (bytes >= 16) {
> -      _mm_store_si128((__m128i *)d,
> -                      _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)s),
> -                                       *(__m128i *) rgba8_permutation));
> -      s += 16;
> -      d += 16;
> +      rgba8_copy_16_aligned_dst(dst, src);
> +      src += 16;
> +      dst += 16;
>        bytes -= 16;
>     }
>  #endif
>  
> -   rgba8_copy(d, s, bytes);
> +   rgba8_copy(dst, src, bytes);
>  
>     return dst;
>  }
> @@ -120,23 +131,18 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
>  static inline void *
>  rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
>  {
> -   uint8_t *d = dst;
> -   uint8_t const *s = src;
> -
>     assert(bytes == 0 || !(((uintptr_t)src) & 0xf));
>  
>  #ifdef __SSSE3__
>     while (bytes >= 16) {
> -      _mm_storeu_si128((__m128i *)d,
> -                       _mm_shuffle_epi8(_mm_load_si128((__m128i *)s),
> -                                        *(__m128i *) rgba8_permutation));
> -      s += 16;
> -      d += 16;
> +      rgba8_copy_16_aligned_src(dst, src);
> +      src += 16;
> +      dst += 16;
>        bytes -= 16;
>     }
>  #endif
>  
> -   rgba8_copy(d, s, bytes);
> +   rgba8_copy(dst, src, bytes);
>  
>     return dst;
>  }
> 

I thought arithmetic on void pointers is illegal in standard C (it only
works as a gcc extension), so maybe we should avoid that?

Roland


_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to