From: Marek Olšák <marek.ol...@amd.com>

Use the mechanism of si_decompress_textures, but instead of doing the actual decompression, just flag the DB cache flush there.
This removes a lot of unnecessary DB cache flushes. --- src/gallium/drivers/radeonsi/si_blit.c | 36 +++++++++++++++++++++------ src/gallium/drivers/radeonsi/si_descriptors.c | 17 +++++++------ src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_state.c | 21 ++++++++++++---- src/gallium/drivers/radeonsi/si_state_draw.c | 6 ++--- 5 files changed, 56 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 9c38ae9..f0abfdc 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -337,24 +337,20 @@ si_decompress_depth(struct si_context *sctx, levels_s = level_mask & tex->stencil_dirty_level_mask; if (levels_s) { if (r600_can_sample_zs(tex, true)) inplace_planes |= PIPE_MASK_S; else copy_planes |= PIPE_MASK_S; } } - assert(!tex->tc_compatible_htile || levels_z == 0); - assert(!tex->tc_compatible_htile || levels_s == 0 || - !r600_can_sample_zs(tex, true)); - /* We may have to allocate the flushed texture here when called from * si_decompress_subresource. 
*/ if (copy_planes && (tex->flushed_depth_texture || r600_init_flushed_depth_texture(&sctx->b.b, &tex->resource.b.b, NULL))) { struct r600_texture *dst = tex->flushed_depth_texture; unsigned fully_copied_levels; unsigned levels = 0; @@ -377,24 +373,44 @@ si_decompress_depth(struct si_context *sctx, first_layer, last_layer, 0, u_max_sample(&tex->resource.b.b)); if (copy_planes & PIPE_MASK_Z) tex->dirty_level_mask &= ~fully_copied_levels; if (copy_planes & PIPE_MASK_S) tex->stencil_dirty_level_mask &= ~fully_copied_levels; } if (inplace_planes) { - si_blit_decompress_zs_in_place( - sctx, tex, - levels_z, levels_s, - first_layer, last_layer); + if (!tex->tc_compatible_htile) { + si_blit_decompress_zs_in_place( + sctx, tex, + levels_z, levels_s, + first_layer, last_layer); + } + + /* Only in-place decompression needs to flush DB caches, or + * when we don't decompress but TC-compatible planes are dirty. + */ + sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB | + SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_INV_VMEM_L1; + + /* If we flush DB caches for TC-compatible depth, the dirty + * state becomes 0 for the whole mipmap tree and all planes. + * (there is nothing else to flush) + */ + if (tex->tc_compatible_htile) { + if (r600_can_sample_zs(tex, false)) + tex->dirty_level_mask = 0; + if (r600_can_sample_zs(tex, true)) + tex->stencil_dirty_level_mask = 0; + } } } static void si_decompress_sampler_depth_textures(struct si_context *sctx, struct si_textures_info *textures) { unsigned i; unsigned mask = textures->needs_depth_decompress_mask; @@ -1343,25 +1359,29 @@ static boolean si_generate_mipmap(struct pipe_context *ctx, vi_disable_dcc_if_incompatible_format(&sctx->b, tex, base_level, format); si_decompress_subresource(ctx, tex, PIPE_MASK_RGBAZS, base_level, first_layer, last_layer); /* Clear dirty_level_mask for the levels that will be overwritten. 
*/ assert(base_level < last_level); rtex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1, last_level - base_level); + sctx->generate_mipmap_for_depth = rtex->is_depth; + si_blitter_begin(ctx, SI_BLIT | SI_DISABLE_RENDER_COND); util_blitter_generate_mipmap(sctx->blitter, tex, format, base_level, last_level, first_layer, last_layer); si_blitter_end(ctx); + + sctx->generate_mipmap_for_depth = false; return true; } static void si_flush_resource(struct pipe_context *ctx, struct pipe_resource *res) { struct r600_texture *rtex = (struct r600_texture*)res; assert(res->target != PIPE_BUFFER); assert(!rtex->dcc_separate_buffer || rtex->dcc_gather_statistics); diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 375bcae..ee2e80a 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -577,26 +577,28 @@ static void si_set_sampler_view(struct si_context *sctx, sctx->descriptors_dirty |= 1u << si_sampler_and_image_descriptors_idx(shader); } static bool color_needs_decompression(struct r600_texture *rtex) { return rtex->fmask.size || (rtex->dirty_level_mask && (rtex->cmask.size || rtex->dcc_offset)); } -static bool depth_needs_decompression(struct r600_texture *rtex, - struct si_sampler_view *sview) +static bool depth_needs_decompression(struct r600_texture *rtex) { - return rtex->db_compatible && - (!rtex->tc_compatible_htile || - !r600_can_sample_zs(rtex, sview->is_stencil_sampler)); + /* If the depth/stencil texture is TC-compatible, no decompression + * will be done. The decompression function will only flush DB caches + * to make it coherent with shaders. That's necessary because the driver + * doesn't flush DB caches in any other case. 
+ */ + return rtex->db_compatible; } static void si_update_shader_needs_decompress_mask(struct si_context *sctx, unsigned shader) { struct si_textures_info *samplers = &sctx->samplers[shader]; unsigned shader_bit = 1 << shader; if (samplers->needs_depth_decompress_mask || samplers->needs_color_decompress_mask || @@ -626,23 +628,22 @@ static void si_set_sampler_views(struct pipe_context *ctx, samplers->needs_color_decompress_mask &= ~(1u << slot); si_set_sampler_view(sctx, shader, slot, NULL, false); continue; } si_set_sampler_view(sctx, shader, slot, views[i], false); if (views[i]->texture && views[i]->texture->target != PIPE_BUFFER) { struct r600_texture *rtex = (struct r600_texture*)views[i]->texture; - struct si_sampler_view *rview = (struct si_sampler_view *)views[i]; - if (depth_needs_decompression(rtex, rview)) { + if (depth_needs_decompression(rtex)) { samplers->needs_depth_decompress_mask |= 1u << slot; } else { samplers->needs_depth_decompress_mask &= ~(1u << slot); } if (color_needs_decompression(rtex)) { samplers->needs_color_decompress_mask |= 1u << slot; } else { samplers->needs_color_decompress_mask &= ~(1u << slot); } @@ -2373,21 +2374,21 @@ static void si_make_texture_handle_resident(struct pipe_context *ctx, tex_handle = (struct si_texture_handle *)entry->data; sview = (struct si_sampler_view *)tex_handle->view; if (resident) { if (sview->base.texture->target != PIPE_BUFFER) { struct r600_texture *rtex = (struct r600_texture *)sview->base.texture; tex_handle->needs_depth_decompress = - depth_needs_decompression(rtex, sview); + depth_needs_decompression(rtex); tex_handle->needs_color_decompress = color_needs_decompression(rtex); if (rtex->dcc_offset && p_atomic_read(&rtex->framebuffers_bound)) sctx->need_check_render_feedback = true; } else { si_invalidate_bindless_buf_desc(sctx, tex_handle->desc, sview->base.texture, sview->base.u.buf.offset); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 
427ac1c..e734595 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -357,20 +357,21 @@ struct si_context { unsigned dbcb_copy_sample; bool dbcb_depth_copy_enabled:1; bool dbcb_stencil_copy_enabled:1; bool db_flush_depth_inplace:1; bool db_flush_stencil_inplace:1; bool db_depth_clear:1; bool db_depth_disable_expclear:1; bool db_stencil_clear:1; bool db_stencil_disable_expclear:1; bool occlusion_queries_disabled:1; + bool generate_mipmap_for_depth:1; /* Emitted draw state. */ bool gs_tri_strip_adj_fix:1; int last_index_size; int last_base_vertex; int last_start_instance; int last_drawid; int last_sh_base_reg; int last_primitive_restart_en; int last_restart_index; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 1cd1f91..44e5f1c 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2518,29 +2518,40 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, if (!r600_texture_disable_dcc(&sctx->b, rtex)) sctx->b.decompress_dcc(ctx, rtex); surf->dcc_incompatible = false; } /* Only flush TC when changing the framebuffer state, because * the only client not using TC that can change textures is * the framebuffer. * - * Flush all CB and DB caches here because all buffers can be used - * for write by both TC (with shader image stores) and CB/DB. + * Wait for compute shaders because of possible transitions: + * - FB write -> shader read + * - shader write -> FB read + * + * DB caches are flushed on demand (using si_decompress_textures). */ sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB | SI_CONTEXT_CS_PARTIAL_FLUSH; + /* u_blitter doesn't invoke depth decompression when it does multiple + * blits in a row, but the only case when it matters for DB is when + * doing generate_mipmap. 
So here we flush DB manually between + * individual generate_mipmap blits. + * Note that lower mipmap levels aren't compressed. + */ + if (sctx->generate_mipmap_for_depth) + sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB; + /* Take the maximum of the old and new count. If the new count is lower, * dirtying is needed to disable the unbound colorbuffers. */ sctx->framebuffer.dirty_cbufs |= (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1; sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf; si_dec_framebuffer_counters(&sctx->framebuffer.state); util_copy_framebuffer_state(&sctx->framebuffer.state, state); @@ -3977,23 +3988,23 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) } if (flags & PIPE_BARRIER_INDEX_BUFFER) { /* Indices are read through TC L2 since VI. * L1 isn't used. */ if (sctx->screen->b.chip_class <= CIK) sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; } + /* Depth and stencil are flushed in si_decompress_textures when needed. */ if (flags & PIPE_BARRIER_FRAMEBUFFER) - sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB | - SI_CONTEXT_FLUSH_AND_INV_DB; + sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB; if (flags & (PIPE_BARRIER_FRAMEBUFFER | PIPE_BARRIER_INDIRECT_BUFFER)) sctx->b.flags |= SI_CONTEXT_WRITEBACK_GLOBAL_L2; } static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) { struct pipe_blend_state blend; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index d039e01..d13c8b7 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -1395,25 +1395,23 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) r600_get_strmout_en(&sctx->b)) { sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; } if (sctx->framebuffer.do_update_surf_dirtiness) { /* Set the depth buffer as dirty. 
*/ if (sctx->framebuffer.state.zsbuf) { struct pipe_surface *surf = sctx->framebuffer.state.zsbuf; struct r600_texture *rtex = (struct r600_texture *)surf->texture; - if (!rtex->tc_compatible_htile) - rtex->dirty_level_mask |= 1 << surf->u.tex.level; + rtex->dirty_level_mask |= 1 << surf->u.tex.level; - if (rtex->surface.flags & RADEON_SURF_SBUFFER && - (!rtex->tc_compatible_htile || !rtex->can_sample_s)) + if (rtex->surface.flags & RADEON_SURF_SBUFFER) rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level; } if (sctx->framebuffer.compressed_cb_mask) { struct pipe_surface *surf; struct r600_texture *rtex; unsigned mask = sctx->framebuffer.compressed_cb_mask; do { unsigned i = u_bit_scan(&mask); surf = sctx->framebuffer.state.cbufs[i]; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev