Tell the compiler that the tiled-side pointers passed to mem_copy are aligned to the tile span. This should help GCC choose a better aligned variant when it inlines memcpy. It should also fix potential performance issues in the next commit. --- src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 30 +++++++++++++++++++------- 1 file changed, 22 insertions(+), 8 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c index 3135458..19079d0 100644 --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c @@ -56,6 +56,8 @@ static const uint32_t ytile_width = 128; static const uint32_t ytile_height = 32; static const uint32_t ytile_span = 16; +#define assume_aligned(x, n) __builtin_assume_aligned(x, n) + #ifdef __SSSE3__ static const uint8_t rgba8_permutation[16] = { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 }; @@ -200,10 +202,12 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0); for (xo = x1; xo < x2; xo += xtile_span) { - mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span); + mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), xtile_span), + src + xo, xtile_span); } - mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); + mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), xtile_span), + src + x2, x3 - x2); src += src_pitch; } @@ -259,12 +263,14 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, * at each step so we don't need to calculate it explicitly. 
*/ for (x = x1; x < x2; x += ytile_span) { - mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span); + mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), ytile_span), + src + x, ytile_span); xo += bytes_per_column; swizzle ^= swizzle_bit; } - mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); + mem_copy(assume_aligned(dst + ((xo + yo) ^ swizzle), ytile_span), + src + x2, x3 - x2); src += src_pitch; } @@ -302,10 +308,14 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0); for (xo = x1; xo < x2; xo += xtile_span) { - mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span); + mem_copy(dst + xo, + assume_aligned(src + ((xo + yo) ^ swizzle), xtile_span), + xtile_span); } - mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); + mem_copy(dst + x2, + assume_aligned(src + ((xo + yo) ^ swizzle), xtile_span), + x3 - x2); dst += dst_pitch; } @@ -361,12 +371,16 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, * at each step so we don't need to calculate it explicitly. */ for (x = x1; x < x2; x += ytile_span) { - mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span); + mem_copy(dst + x, + assume_aligned(src + ((xo + yo) ^ swizzle), ytile_span), + ytile_span); xo += bytes_per_column; swizzle ^= swizzle_bit; } - mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); + mem_copy(dst + x2, + assume_aligned(src + ((xo + yo) ^ swizzle), ytile_span), + x3 - x2); dst += dst_pitch; } -- 2.5.0.400.gff86faf _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev