Am 11.04.2016 um 21:05 schrieb Matt Turner: > This will make adding SSE2 code a lot cleaner. > --- > src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 42 > +++++++++++++++----------- > 1 file changed, 24 insertions(+), 18 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c > b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c > index fa5ec75..5d58530 100644 > --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c > +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c > @@ -85,6 +85,22 @@ rgba8_copy(void *dst, const void *src, size_t bytes) > #ifdef __SSSE3__ > static const uint8_t rgba8_permutation[16] = > { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 }; > + > +static inline void > +rgba8_copy_16_aligned_dst(void *dst, const void *src) > +{ > + _mm_store_si128(dst, > + _mm_shuffle_epi8(_mm_loadu_si128(src), > + *(__m128i *)rgba8_permutation)); > +} > + > +static inline void > +rgba8_copy_16_aligned_src(void *dst, const void *src) > +{ > + _mm_storeu_si128(dst, > + _mm_shuffle_epi8(_mm_load_si128(src), > + *(__m128i *)rgba8_permutation)); > +} > #endif > > /** > @@ -93,23 +109,18 @@ static const uint8_t rgba8_permutation[16] = > static inline void * > rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes) > { > - uint8_t *d = dst; > - uint8_t const *s = src; > - > assert(bytes == 0 || !(((uintptr_t)dst) & 0xf)); > > #ifdef __SSSE3__ > while (bytes >= 16) { > - _mm_store_si128((__m128i *)d, > - _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)s), > - *(__m128i *) rgba8_permutation)); > - s += 16; > - d += 16; > + rgba8_copy_16_aligned_dst(dst, src); > + src += 16; > + dst += 16; > bytes -= 16; > } > #endif > > - rgba8_copy(d, s, bytes); > + rgba8_copy(dst, src, bytes); > > return dst; > } > @@ -120,23 +131,18 @@ rgba8_copy_aligned_dst(void *dst, const void *src, > size_t bytes) > static inline void * > rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes) > { > - uint8_t *d = dst; > - uint8_t const *s = src; > - > assert(bytes == 0 || 
!(((uintptr_t)src) & 0xf)); > > #ifdef __SSSE3__ > while (bytes >= 16) { > - _mm_storeu_si128((__m128i *)d, > - _mm_shuffle_epi8(_mm_load_si128((__m128i *)s), > - *(__m128i *) rgba8_permutation)); > - s += 16; > - d += 16; > + rgba8_copy_16_aligned_src(dst, src); > + src += 16; > + dst += 16; > bytes -= 16; > } > #endif > > - rgba8_copy(d, s, bytes); > + rgba8_copy(dst, src, bytes); > > return dst; > } >
I thought arithmetic on void pointers (the new `src += 16` / `dst += 16` on the `void *` parameters) is illegal in standard C and only works as a GCC extension, so maybe we should avoid it? Roland _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev