-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 07/18/2011 01:54 PM, Chad Versace wrote: > On 07/18/2011 11:49 AM, Ian Romanick wrote: >> On 07/18/2011 12:55 AM, Chad Versace wrote: >>> Until now, the stencil buffer was allocated as a Y tiled buffer, because >>> in several locations the PRM states that it is. However, it is actually >>> W tiled. From the PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section >>> 4.5.2.1 W-Major Format: >>> W-Major Tile Format is used for separate stencil. >> >>> The GTT is incapable of W fencing, so we allocate the stencil buffer with >>> I915_TILING_NONE and decode the tile's layout in software. >> >>> This fix touches the following portions of code: >>> - In intel_allocate_renderbuffer_storage(), allocate the stencil >>> buffer with I915_TILING_NONE. >>> - In intel_verify_dri2_has_hiz(), verify that the stencil buffer is >>> not tiled. >>> - In the stencil buffer's span functions, the tile's layout must be >>> decoded in software. >> >>> This commit mutually depends on the xf86-video-intel commit >>> dri: Do not tile stencil buffer >>> Author: Chad Versace <c...@chad-versace.us> >>> Date: Mon Jul 18 00:38:00 2011 -0700 >> >>> On Gen6 with separate stencil enabled, fixes the following Piglit tests: >>> bugs/fdo23670-drawpix_stencil >>> general/stencil-drawpixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-copypixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-drawpixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX16-readpixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-copypixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-drawpixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX1-readpixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-copypixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-drawpixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX4-readpixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-copypixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-drawpixels >>> spec/EXT_framebuffer_object/fbo-stencil-GL_STENCIL_INDEX8-readpixels >>> spec/EXT_packed_depth_stencil/fbo-stencil-GL_DEPTH24_STENCIL8-copypixels >>> spec/EXT_packed_depth_stencil/fbo-stencil-GL_DEPTH24_STENCIL8-readpixels >>> spec/EXT_packed_depth_stencil/readpixels-24_8 >> >>> Note: This is a candidate for the 7.11 branch. >> >>> CC: Eric Anholt <e...@anholt.net> >>> CC: Kenneth Graunke <kenn...@whitecape.org> >>> Signed-off-by: Chad Versace <c...@chad-versace.us> >>> --- >>> src/mesa/drivers/dri/intel/intel_clear.c | 6 ++ >>> src/mesa/drivers/dri/intel/intel_context.c | 9 ++- >>> src/mesa/drivers/dri/intel/intel_fbo.c | 13 +++-- >>> src/mesa/drivers/dri/intel/intel_screen.h | 9 ++- >>> src/mesa/drivers/dri/intel/intel_span.c | 85 >>> ++++++++++++++++++++-------- >>> 5 files changed, 89 insertions(+), 33 deletions(-) >> >>> diff --git a/src/mesa/drivers/dri/intel/intel_clear.c >>> b/src/mesa/drivers/dri/intel/intel_clear.c >>> index dfca03c..5ab9873 100644 >>> --- a/src/mesa/drivers/dri/intel/intel_clear.c >>> +++ b/src/mesa/drivers/dri/intel/intel_clear.c >>> @@ -143,6 +143,12 @@ intelClear(struct gl_context *ctx, GLbitfield mask) >>> */ >>> tri_mask |= BUFFER_BIT_STENCIL; >>> } >>> + else if (intel->has_separate_stencil && >>> + stencilRegion->tiling == I915_TILING_NONE) { >>> + /* The stencil buffer is actually W tiled, which the hardware >>> + * cannot blit to. */ >>> + tri_mask |= BUFFER_BIT_STENCIL; >>> + } >>> else { >>> /* clearing all stencil bits, use blitting */ >>> blit_mask |= BUFFER_BIT_STENCIL; >>> diff --git a/src/mesa/drivers/dri/intel/intel_context.c >>> b/src/mesa/drivers/dri/intel/intel_context.c >>> index 2ba1363..fe8be08 100644 >>> --- a/src/mesa/drivers/dri/intel/intel_context.c >>> +++ b/src/mesa/drivers/dri/intel/intel_context.c >>> @@ -1439,7 +1439,12 @@ intel_verify_dri2_has_hiz(struct intel_context >>> *intel, >>> assert(stencil_rb->Base.Format == MESA_FORMAT_S8); >>> assert(depth_rb && depth_rb->Base.Format == MESA_FORMAT_X8_Z24); >> >>> - if (stencil_rb->region->tiling == I915_TILING_Y) { >>> + if (stencil_rb->region->tiling == I915_TILING_NONE) { >>> + /* >>> + * The stencil buffer is actually W tiled. The region's tiling is >>> + * I915_TILING_NONE, however, because the GTT is incapable of W >>> + * fencing. >>> + */ >>> intel->intelScreen->dri2_has_hiz = INTEL_DRI2_HAS_HIZ_TRUE; >>> return; >>> } else { >>> @@ -1527,7 +1532,7 @@ intel_verify_dri2_has_hiz(struct intel_context *intel, >>> * Presently, however, no verification or clean up is necessary, and >>> * execution should not reach here. If the framebuffer still has a >>> hiz >>> * region, then we have already set dri2_has_hiz to true after >>> - * confirming above that the stencil buffer is Y tiled. >>> + * confirming above that the stencil buffer is W tiled. >>> */ >>> assert(0); >>> } >>> diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c >>> b/src/mesa/drivers/dri/intel/intel_fbo.c >>> index 1669af2..507cc33 100644 >>> --- a/src/mesa/drivers/dri/intel/intel_fbo.c >>> +++ b/src/mesa/drivers/dri/intel/intel_fbo.c >>> @@ -173,6 +173,9 @@ intel_alloc_renderbuffer_storage(struct gl_context * >>> ctx, struct gl_renderbuffer >> >>> if (irb->Base.Format == MESA_FORMAT_S8) { >>> /* >>> + * The stencil buffer is W tiled. However, we request from the >>> kernel a >>> + * non-tiled buffer because the GTT is incapable of W fencing. >>> + * >>> * The stencil buffer has quirky pitch requirements. From Vol 2a, >>> * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch": >>> * The pitch must be set to 2x the value computed based on width, >>> as >>> @@ -180,14 +183,14 @@ intel_alloc_renderbuffer_storage(struct gl_context * >>> ctx, struct gl_renderbuffer >>> * To accomplish this, we resort to the nasty hack of doubling the >>> drm >>> * region's cpp and halving its height. >>> * >>> - * If we neglect to double the pitch, then drm_intel_gem_bo_map_gtt() >>> - * maps the memory incorrectly. >>> + * If we neglect to double the pitch, then render corruption occurs. >>> */ >>> irb->region = intel_region_alloc(intel->intelScreen, >>> - I915_TILING_Y, >>> + I915_TILING_NONE, >>> cpp * 2, >>> - width, >>> - height / 2, >>> + ALIGN(width, 64), >>> + /* XXX: Maybe align to 128? */ >>> + ALIGN(height / 2, 64), >>> GL_TRUE); >>> if (!irb->region) >>> return false; >>> diff --git a/src/mesa/drivers/dri/intel/intel_screen.h >>> b/src/mesa/drivers/dri/intel/intel_screen.h >>> index b2013af..9dd6a52 100644 >>> --- a/src/mesa/drivers/dri/intel/intel_screen.h >>> +++ b/src/mesa/drivers/dri/intel/intel_screen.h >>> @@ -63,9 +63,12 @@ >>> * x8_z24 and s8). >>> * >>> * Eventually, intel_update_renderbuffers() makes a DRI2 request for >>> - * DRI2BufferStencil and DRI2BufferHiz. If the returned buffers are Y >>> tiled, >>> - * then we joyfully set intel_screen.dri2_has_hiz to true and continue as >>> if >>> - * nothing happend. >>> + * DRI2BufferStencil and DRI2BufferHiz. If the stencil buffer's tiling is >>> + * I915_TILING_NONE [1], then we joyfully set intel_screen.dri2_has_hiz to >>> + * true and continue as if nothing happend. >>> + * >>> + * [1] The stencil buffer is actually W tiled. However, we request from the >>> + * kernel a non-tiled buffer because the GTT is incapable of W fencing. >>> * >>> * If the buffers are X tiled, however, the handshake has failed and we >>> must >>> * clean up. >>> diff --git a/src/mesa/drivers/dri/intel/intel_span.c >>> b/src/mesa/drivers/dri/intel/intel_span.c >>> index 153803f..d306432 100644 >>> --- a/src/mesa/drivers/dri/intel/intel_span.c >>> +++ b/src/mesa/drivers/dri/intel/intel_span.c >>> @@ -131,38 +131,77 @@ intel_set_span_functions(struct intel_context *intel, >>> int miny = 0; \ >>> int maxx = rb->Width; \ >>> int maxy = rb->Height; \ >>> - int stride = rb->RowStride; >>> \ >>> - uint8_t *buf = rb->Data; >>> \ >>> + \ >>> + /* >>> \ >>> + * Here we ignore rb->Data and rb->RowStride as set by \ >>> + * intelSpanRenderStart. Since intel_offset_S8 decodes the W tile >>> \ >>> + * manually, the region's *real* base address and stride is >>> \ >>> + * required. >>> \ >>> + */ >>> \ >>> + struct intel_renderbuffer *irb = intel_renderbuffer(rb); >>> \ >>> + uint8_t *buf = irb->region->buffer->virtual; >>> \ >>> + unsigned stride = irb->region->pitch; \ >>> + unsigned height = 2 * irb->region->height; >>> \ >>> + bool flip = rb->Name == 0; >>> \ >> >>> -/* Don't flip y. */ >>> #undef Y_FLIP >>> -#define Y_FLIP(y) y >>> +#define Y_FLIP(y) ((1 - flip) * y + flip * (height - 1 - y)) >> >>> /** >>> * \brief Get pointer offset into stencil buffer. >>> * >>> - * The stencil buffer interleaves two rows into one. Yay for crazy >>> hardware. >>> - * The table below demonstrates how the pointer arithmetic behaves for a >>> buffer >>> - * with positive stride (s=stride). >>> - * >>> - * x | y | byte offset >>> - * -------------------------- >>> - * 0 | 0 | 0 >>> - * 0 | 1 | 1 >>> - * 1 | 0 | 2 >>> - * 1 | 1 | 3 >>> - * ... | ... | ... >>> - * 0 | 2 | s >>> - * 0 | 3 | s + 1 >>> - * 1 | 2 | s + 2 >>> - * 1 | 3 | s + 3 >>> - * >>> + * The stencil buffer is W tiled. Since the GTT is incapable of W fencing, >>> we >>> + * must decode the tile's layout in software. >>> * >>> + * See >>> + * - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.2.1 W-Major >>> Tile >>> + * Format. >>> + * - PRM, 2011 Sandy Bridge, Volume 1, Part 2, Section 4.5.3 Tiling >>> Algorithm >>> */ >>> -static inline intptr_t >>> -intel_offset_S8(int stride, GLint x, GLint y) >>> +static inline uintptr_t >> >> Why the type change? > > Two reasons. > > 1. I redeclared the parameters as unsigned so to generate better code. > Since x, y, and stride are unsigned, the division and modulo operators > generate shift and bit-logic instructions rather than the slower arithmetic > instructions.
Is stride always unsigned now? Will it always be in the future? This is the problem that we encountered with bug #37351 (fixed by e8b1c6d). > Below is a comparison of x % 64, signed versus unsigned, compiled with gcc > -O3. > In f, the % generates 5 instructions. In g, only one instruction. > > int f(int x) { return x % 64; } > > f: > movl %edi, %edx > sarl $31, %edx > shrl $26, %edx > leal (%rdi,%rdx), %eax > andl $63, %eax > subl %edx, %eax > ret > > unsigned g(unsigned x) { return x % 64); > > g: > movl %edi, %eax > andl $63, %eax > ret > > 2. I redeclared the return type as unsigned to make it explicit that it does > not return a negative offset. > >>> +intel_offset_S8(uint32_t stride, uint32_t x, uint32_t y) >>> { >>> - return 2 * ((y / 2) * stride + x) + y % 2; >>> + uint32_t tile_size = 4096; >>> + uint32_t tile_width = 64; >>> + uint32_t tile_height = 64; >>> + uint32_t row_size = 64 * stride; >>> + >>> + uint32_t tile_x = x / tile_width; >>> + uint32_t tile_y = y / tile_height; >>> + >>> + /* The byte's address relative to the tile's base addres. */ >>> + uint32_t byte_x = x % tile_width; >>> + uint32_t byte_y = y % tile_height; >>> + >>> + uintptr_t u = tile_y * row_size >>> + + tile_x * tile_size >>> + + 512 * (byte_x / 8) >>> + + 64 * (byte_y / 8) >>> + + 32 * ((byte_y / 4) % 2) >>> + + 16 * ((byte_x / 4) % 2) >>> + + 8 * ((byte_y / 2) % 2) >>> + + 4 * ((byte_x / 2) % 2) >>> + + 2 * (byte_y % 2) >>> + + 1 * (byte_x % 2); >>> + >>> + /* >>> + * Errata for Gen5: >>> + * >>> + * An additional offset is needed which is not documented in the PRM. >>> + * >>> + * if ((byte_x / 8) % 2 == 1) { >>> + * if ((byte_y / 8) % 2) == 0) { >>> + * u += 64; >>> + * } else { >>> + * u -= 64; >>> + * } >>> + * } >>> + * >>> + * The offset is expressed more tersely as >>> + * u += ((int) x & 0x8) * (8 - (((int) y & 0x8) << 1)); >>> + */ >>> + >>> + return u; >>> } >> >>> #define WRITE_STENCIL(x, y, src) buf[intel_offset_S8(stride, x, y)] = src; -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.11 (GNU/Linux) Comment: Using GnuPG with Fedora - http://enigmail.mozdev.org/ iEYEARECAAYFAk4kn08ACgkQX1gOwKyEAw/jIQCfYBpmMYlhRAWQa6lKs+z68HJo aXQAn2DGTu/NU92jUfR4wP19PWU+zdOL =AQiH -----END PGP SIGNATURE----- _______________________________________________ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/intel-gfx