The OCD in me is seeing a couple more places you could micro-optimize. Before I actually point them out,
Reviewed-by: Jason Ekstrand <ja...@jlekstrand.net>

On Thu, Jan 25, 2018 at 8:23 AM, Scott D Phillips <scott.d.phill...@intel.com> wrote:
> TileY's low 6 address bits are: v1 v0 u3 u2 u1 u0
> Thus a cache line in the tiled surface is composed of a 2d area of
> 16x4 bytes of the linear surface.
>
> Add a special case where the area being copied is 4-line aligned
> and a multiple of 4-lines so that entire cache lines will be
> written at a time.
>
> On Apollolake, this increases tiling throughput to wc maps by
> 84.8512% +/- 0.935379%
>

Nice!

> v2: Split [y0, y1) and [y2, y3) loops apart for clarity (Jason Ekstrand)
> ---
>  src/mesa/drivers/dri/i965/intel_tiled_memcpy.c | 80 +++++++++++++++++++++++---
>  1 file changed, 72 insertions(+), 8 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> index 53a5679691..9e6bafa4b4 100644
> --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
> @@ -287,8 +287,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>   */
>  static inline void
>  linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
> -                 uint32_t y0, uint32_t y1,
> -                 char *dst, const char *src,
> +                 uint32_t y0, uint32_t y3,
> +                 char *dst, const char *src0,
>                   int32_t src_pitch,
>                   uint32_t swizzle_bit,
>                   mem_copy_fn mem_copy,
> @@ -306,6 +306,9 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>     const uint32_t column_width = ytile_span;
>     const uint32_t bytes_per_column = column_width * ytile_height;
>
> +   uint32_t y1 = ALIGN_UP(y0, 4);
> +   uint32_t y2 = ALIGN_DOWN(y3, 4);
> +
>     uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
>     uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
>
> @@ -319,26 +322,87 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
>
>     uint32_t x, yo;
>
> -   src += (ptrdiff_t)y0 * src_pitch;
> +   const char *src = src0 + (ptrdiff_t)y0 * src_pitch;
>
> -   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
> +   if (y0 != y1) {
> +      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
> +         uint32_t xo = xo1;
> +         uint32_t swizzle = swizzle1;
> +
> +         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
> +
> +         /* Step by spans/columns.  As it happens, the swizzle bit flips
> +          * at each step so we don't need to calculate it explicitly.
> +          */
> +         for (x = x1; x < x2; x += ytile_span) {
> +            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
> +            xo += bytes_per_column;
> +            swizzle ^= swizzle_bit;
> +         }
> +
> +         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
> +
> +         src += src_pitch;
> +      }
> +   }
> +
> +   src = src0 + (ptrdiff_t)y1 * src_pitch;
> +
> +   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
>        uint32_t xo = xo1;
>        uint32_t swizzle = swizzle1;
>
> -      mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
> +      if (x0 != x1) {
> +         mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
> +         mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
> +         mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
> +         mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
> +      }
>
>        /* Step by spans/columns.  As it happens, the swizzle bit flips
>         * at each step so we don't need to calculate it explicitly.
>         */
>        for (x = x1; x < x2; x += ytile_span) {
> -         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
> +         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
> +         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
> +         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
> +         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
>           xo += bytes_per_column;
>           swizzle ^= swizzle_bit;
>        }
>
> -      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
> +      if (x2 != x3) {
> +         mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
> +         mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
> +         mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
> +         mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
> +      }
>
> -      src += src_pitch;
> +      src += 4 * src_pitch;
> +   }
> +
> +   if (y2 != y3) {
> +      src = src0 + (ptrdiff_t)y2 * src_pitch;
> +
> +      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
> +         uint32_t xo = xo1;
> +         uint32_t swizzle = swizzle1;
> +
> +         mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
> +
> +         /* Step by spans/columns.  As it happens, the swizzle bit flips
> +          * at each step so we don't need to calculate it explicitly.
> +          */
> +         for (x = x1; x < x2; x += ytile_span) {
> +            mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
> +            xo += bytes_per_column;
> +            swizzle ^= swizzle_bit;
> +         }
> +
> +         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
> +
> +         src += src_pitch;
> +      }
>     }
>  }
>
> --
> 2.16.1
>
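Since the 16x4 fact is the whole trick here, a quick illustration for anyone following along. This is my own sketch, not code from the patch; ytile_offset() is a made-up helper, with ytile_span = 16 and ytile_height = 32 as in intel_tiled_memcpy.c:

   /* Byte offset of byte x-offset u, row v within one Y tile, ignoring
    * the bit-6 swizzle: column base + row within column + byte in span.
    */
   static uint32_t
   ytile_offset(uint32_t u, uint32_t v)
   {
      const uint32_t span = 16;    /* ytile_span */
      const uint32_t height = 32;  /* ytile_height */

      return (u / span) * span * height +  /* bytes_per_column = 512 */
             v * span +                    /* row within the column  */
             u % span;                     /* byte within the span   */
   }

The low 6 bits of that offset are ((v & 3) << 4) | (u & 15), i.e. v1 v0 u3 u2 u1 u0, so one 64-byte cache line in the tile covers 16 consecutive bytes of each of four consecutive rows: exactly the 16x4 linear region that the unrolled [y1, y2) loop now writes in one go.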
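And to make the ALIGN_UP/ALIGN_DOWN split concrete, a hypothetical example using the same macros the patch uses (the numbers are mine, not from the patch):

   /* Say we're copying rows [3, 14) of the surface. */
   uint32_t y0 = 3, y3 = 14;
   uint32_t y1 = ALIGN_UP(y0, 4);    /* 4:  first 4-row boundary >= y0 */
   uint32_t y2 = ALIGN_DOWN(y3, 4);  /* 12: last 4-row boundary <= y3  */

   /* [y0, y1) = row 3       -> head loop, one span at a time
    * [y1, y2) = rows 4..11  -> two passes of the new cache-line loop
    * [y2, y3) = rows 12..13 -> tail loop, one span at a time
    */

So only the single head row and the two tail rows still take the old span-at-a-time path; everything in between is written a full cache line at a time.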