From: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c  | 29 ++++++++++++-----------------
 src/gallium/drivers/radeonsi/si_pipe.h  | 23 +++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_state.c | 29 ++++++++++++-----------------
 3 files changed, 47 insertions(+), 34 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index caa4c3c..ae7f809 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -377,43 +377,43 @@ si_decompress_depth(struct si_context *sctx, if (!tex->tc_compatible_htile) { si_blit_decompress_zs_in_place( sctx, tex, levels_z, levels_s, first_layer, last_layer); } /* Only in-place decompression needs to flush DB caches, or * when we don't decompress but TC-compatible planes are dirty. */ - sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_VMEM_L1; + si_make_DB_shader_coherent(sctx, tex->resource.b.b.nr_samples, + inplace_planes & PIPE_MASK_S); /* If we flush DB caches for TC-compatible depth, the dirty * state becomes 0 for the whole mipmap tree and all planes. * (there is nothing else to flush) */ if (tex->tc_compatible_htile) { - if (r600_can_sample_zs(tex, false)) + /* Only clear the mask that we are flushing, because + * si_make_DB_shader_coherent() can treat depth and + * stencil differently. + */ + if (inplace_planes & PIPE_MASK_Z) tex->dirty_level_mask = 0; - if (r600_can_sample_zs(tex, true)) + if (inplace_planes & PIPE_MASK_S) tex->stencil_dirty_level_mask = 0; } } /* set_framebuffer_state takes care of coherency for single-sample. * The DB->CB copy uses CB for the final writes. 
*/ - if (copy_planes && tex->resource.b.b.nr_samples > 1) { - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_FLUSH_AND_INV_CB; - } + if (copy_planes && tex->resource.b.b.nr_samples > 1) + si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples); } static void si_decompress_sampler_depth_textures(struct si_context *sctx, struct si_textures_info *textures) { unsigned i; unsigned mask = textures->needs_depth_decompress_mask; while (mask) { @@ -504,24 +504,21 @@ static void si_blit_decompress_color(struct pipe_context *ctx, } /* The texture will always be dirty if some layers aren't flushed. * I don't think this case occurs often though. */ if (first_layer == 0 && last_layer >= max_layer) { rtex->dirty_level_mask &= ~(1 << level); } } sctx->decompression_enabled = false; - - sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_VMEM_L1; + si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples); } static void si_decompress_color_texture(struct si_context *sctx, struct r600_texture *tex, unsigned first_level, unsigned last_level) { /* CMASK or DCC can be discarded and we can still end up here. */ if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset) return; @@ -1196,23 +1193,21 @@ static void si_do_CB_resolve(struct si_context *sctx, si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND)); util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z, info->src.resource, info->src.box.z, ~0, sctx->custom_blend_resolve, format); si_blitter_end(&sctx->b.b); /* Flush caches for possible texturing. 
*/ - sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_INV_VMEM_L1; + si_make_CB_shader_coherent(sctx, 1); } static bool do_hardware_msaa_resolve(struct pipe_context *ctx, const struct pipe_blit_info *info) { struct si_context *sctx = (struct si_context*)ctx; struct r600_texture *src = (struct r600_texture*)info->src.resource; struct r600_texture *dst = (struct r600_texture*)info->dst.resource; MAYBE_UNUSED struct r600_texture *rtmp; unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index bb5e189..671c488 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -587,11 +587,34 @@ si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size) /* If the upload size is less than the cache line size (e.g. 16, 32), * the whole thing will fit into a cache line if we align it to its size. * The idea is that multiple small uploads can share a cache line. * If the upload size is greater, align it to the cache line size. */ alignment = util_next_power_of_two(upload_size); tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size; return MIN2(alignment, tcc_cache_line_size); } +static inline void +si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples) +{ + sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB | + SI_CONTEXT_INV_VMEM_L1; + + /* Single-sample color is coherent with shaders on GFX9. */ + if (sctx->b.chip_class <= VI || num_samples >= 2) + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; +} + +static inline void +si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples, + bool include_stencil) +{ + sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB | + SI_CONTEXT_INV_VMEM_L1; + + /* Single-sample depth (not stencil) is coherent with shaders on GFX9. 
*/ + if (sctx->b.chip_class <= VI || num_samples >= 2 || include_stencil) + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; +} + #endif diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 8010df6..d116c07 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2565,38 +2565,33 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * DB caches are flushed on demand (using si_decompress_textures). * * When MSAA is enabled, CB and TC caches are flushed on demand * (after FMASK decompression). Shader write -> FB read transitions * cannot happen for MSAA textures, because MSAA shader images are * not supported. * * Only flush and wait for CB if there is actually a bound color buffer. */ if (sctx->framebuffer.nr_samples <= 1 && - sctx->framebuffer.state.nr_cbufs) { - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_FLUSH_AND_INV_CB; - } + sctx->framebuffer.state.nr_cbufs) + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples); + sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH; /* u_blitter doesn't invoke depth decompression when it does multiple * blits in a row, but the only case when it matters for DB is when * doing generate_mipmap. So here we flush DB manually between * individual generate_mipmap blits. * Note that lower mipmap levels aren't compressed. */ - if (sctx->generate_mipmap_for_depth) { - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_FLUSH_AND_INV_DB; - } + if (sctx->generate_mipmap_for_depth) + si_make_DB_shader_coherent(sctx, 1, false); /* Take the maximum of the old and new count. If the new count is lower, * dirtying is needed to disable the unbound colorbuffers. 
*/ sctx->framebuffer.dirty_cbufs |= (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1; sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf; si_dec_framebuffer_counters(&sctx->framebuffer.state); util_copy_framebuffer_state(&sctx->framebuffer.state, state); @@ -4019,25 +4014,22 @@ static void si_set_tess_state(struct pipe_context *ctx, } static void si_texture_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; si_update_fb_dirtiness_after_rendering(sctx); /* Multisample surfaces are flushed in si_decompress_textures. */ if (sctx->framebuffer.nr_samples <= 1 && - sctx->framebuffer.state.nr_cbufs) { - sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | - SI_CONTEXT_INV_GLOBAL_L2 | - SI_CONTEXT_FLUSH_AND_INV_CB; - } + sctx->framebuffer.state.nr_cbufs) + si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples); } /* This only ensures coherency for shader image/buffer stores. */ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) { struct si_context *sctx = (struct si_context *)ctx; /* Subsequent commands must wait for all shader invocations to * complete. */ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | @@ -4066,22 +4058,25 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) if (sctx->screen->b.chip_class <= CIK) sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; } /* MSAA color, any depth and any stencil are flushed in * si_decompress_textures when needed. */ if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.nr_samples <= 1 && sctx->framebuffer.state.nr_cbufs) { - sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_WRITEBACK_GLOBAL_L2; + sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB; + + /* Single-sample color is coherent with TC on GFX9. */ + if (sctx->screen->b.chip_class <= VI) + sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; } /* Indirect buffers use TC L2 on GFX9, but not older hw. 
*/ if (sctx->screen->b.chip_class <= VI && flags & PIPE_BARRIER_INDIRECT_BUFFER) sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) { -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev