From: Marek Olšák <marek.ol...@amd.com>
---
src/gallium/drivers/radeonsi/si_blit.c | 8 +++++---
src/gallium/drivers/radeonsi/si_pipe.h | 23 +++++++++++++++++++----
src/gallium/drivers/radeonsi/si_state.c | 19 +++++++++++++++----
src/gallium/drivers/radeonsi/si_state_draw.c | 11 +++++------
4 files changed, 44 insertions(+), 17 deletions(-)
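Not part of the patch: below is a minimal, self-contained sketch of the cache-flush decision this change introduces, for readers who want to see the GFX9 vs. SI-CI-VI behavior in isolation. The types, flag values, and function names (mock_si_context, make_CB_shader_coherent, the SI_CONTEXT_* constants) are simplified stand-ins for illustration, not the real radeonsi definitions.

/* Simplified model of the new si_make_CB_shader_coherent() logic: which
 * cache-invalidation flags get requested after CB rendering, depending on
 * chip generation, sample count, and whether shaders will read compression
 * metadata (DCC/CMASK). */
#include <stdbool.h>
#include <stdio.h>

enum chip_class { SI, CIK, VI, GFX9 };

#define SI_CONTEXT_FLUSH_AND_INV_CB (1u << 0)
#define SI_CONTEXT_INV_VMEM_L1      (1u << 1)
#define SI_CONTEXT_INV_GLOBAL_L2    (1u << 2)
#define SI_CONTEXT_INV_L2_METADATA  (1u << 3)

struct mock_si_context {
	enum chip_class chip_class;
	unsigned flags;
};

static void make_CB_shader_coherent(struct mock_si_context *sctx,
				    unsigned num_samples,
				    bool shaders_read_metadata)
{
	sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
		       SI_CONTEXT_INV_VMEM_L1;

	if (sctx->chip_class >= GFX9) {
		/* Single-sample color is coherent with shaders on GFX9, but
		 * the L2 metadata cache must still be flushed if shaders
		 * read DCC/CMASK. */
		if (num_samples >= 2)
			sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
		else if (shaders_read_metadata)
			sctx->flags |= SI_CONTEXT_INV_L2_METADATA;
	} else {
		/* SI-CI-VI always need a full L2 invalidation. */
		sctx->flags |= SI_CONTEXT_INV_GLOBAL_L2;
	}
}

int main(void)
{
	struct mock_si_context gfx9 = { GFX9, 0 };
	make_CB_shader_coherent(&gfx9, 1, true);  /* single-sample, DCC on */
	printf("GFX9 flags: 0x%x\n", gfx9.flags); /* has INV_L2_METADATA   */

	struct mock_si_context vi = { VI, 0 };
	make_CB_shader_coherent(&vi, 1, true);
	printf("VI flags:   0x%x\n", vi.flags);   /* has INV_GLOBAL_L2     */
	return 0;
}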
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index ae7f809..3228933 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -399,21 +399,22 @@ si_decompress_depth(struct si_context *sctx,
if (inplace_planes & PIPE_MASK_Z)
tex->dirty_level_mask = 0;
if (inplace_planes & PIPE_MASK_S)
tex->stencil_dirty_level_mask = 0;
}
}
/* set_framebuffer_state takes care of coherency for single-sample.
* The DB->CB copy uses CB for the final writes.
*/
if (copy_planes && tex->resource.b.b.nr_samples > 1)
- si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples);
+ si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples,
+ false);
}
static void
si_decompress_sampler_depth_textures(struct si_context *sctx,
struct si_textures_info *textures)
{
unsigned i;
unsigned mask = textures->needs_depth_decompress_mask;
while (mask) {
@@ -504,21 +505,22 @@ static void si_blit_decompress_color(struct pipe_context *ctx,
}
/* The texture will always be dirty if some layers aren't flushed.
* I don't think this case occurs often though. */
if (first_layer == 0 && last_layer >= max_layer) {
rtex->dirty_level_mask &= ~(1 << level);
}
}
sctx->decompression_enabled = false;
- si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples);
+ si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples,
+ vi_dcc_enabled(rtex, first_level));
}
static void
si_decompress_color_texture(struct si_context *sctx, struct r600_texture *tex,
unsigned first_level, unsigned last_level)
{
/* CMASK or DCC can be discarded and we can still end up here. */
if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
return;
@@ -1193,21 +1195,21 @@ static void si_do_CB_resolve(struct si_context *sctx,
si_blitter_begin(&sctx->b.b, SI_COLOR_RESOLVE |
(info->render_condition_enable ? 0 :
SI_DISABLE_RENDER_COND));
util_blitter_custom_resolve_color(sctx->blitter, dst, dst_level, dst_z,
info->src.resource, info->src.box.z,
~0, sctx->custom_blend_resolve,
format);
si_blitter_end(&sctx->b.b);
/* Flush caches for possible texturing. */
- si_make_CB_shader_coherent(sctx, 1);
+ si_make_CB_shader_coherent(sctx, 1, false);
}
static bool do_hardware_msaa_resolve(struct pipe_context *ctx,
const struct pipe_blit_info *info)
{
struct si_context *sctx = (struct si_context*)ctx;
struct r600_texture *src = (struct r600_texture*)info->src.resource;
struct r600_texture *dst = (struct r600_texture*)info->dst.resource;
MAYBE_UNUSED struct r600_texture *rtmp;
unsigned dst_width = u_minify(info->dst.resource->width0,
info->dst.level);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 671c488..3e59e21 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -50,21 +50,24 @@
#define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0)
/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
#define SI_CONTEXT_INV_SMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 1)
/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
#define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 2)
/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
#define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3)
/* Write dirty L2 lines back to memory (shader and CP DMA stores), but don't
* invalidate L2. SI-CIK can't do it, so they will do complete invalidation.
*/
#define SI_CONTEXT_WRITEBACK_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 4)
-/* gaps */
+/* Writeback & invalidate the L2 metadata cache. It can only be coupled with
+ * a CB or DB flush. */
+#define SI_CONTEXT_INV_L2_METADATA (R600_CONTEXT_PRIVATE_FLAG << 5)
+/* gap */
/* Framebuffer caches. */
#define SI_CONTEXT_FLUSH_AND_INV_DB (R600_CONTEXT_PRIVATE_FLAG << 7)
#define SI_CONTEXT_FLUSH_AND_INV_CB (R600_CONTEXT_PRIVATE_FLAG << 8)
/* Engine synchronization. */
#define SI_CONTEXT_VS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 9)
#define SI_CONTEXT_PS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 10)
#define SI_CONTEXT_CS_PARTIAL_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 11)
#define SI_CONTEXT_VGT_FLUSH (R600_CONTEXT_PRIVATE_FLAG << 12)
#define SI_CONTEXT_VGT_STREAMOUT_SYNC (R600_CONTEXT_PRIVATE_FLAG << 13)
@@ -190,20 +193,21 @@ struct si_framebuffer {
unsigned spi_shader_col_format_blend;
unsigned spi_shader_col_format_blend_alpha;
ubyte nr_samples:5; /* at most 16xAA */
ubyte log_samples:3; /* at most 4 = 16xAA */
ubyte compressed_cb_mask;
ubyte color_is_int8;
ubyte color_is_int10;
ubyte dirty_cbufs;
bool dirty_zsbuf;
bool any_dst_linear;
+ bool CB_has_shader_readable_metadata;
};
struct si_clip_state {
struct r600_atom atom;
struct pipe_clip_state state;
bool any_nonzeros;
};
struct si_sample_locs {
struct r600_atom atom;
@@ -588,28 +592,39 @@ si_optimal_tcc_alignment(struct si_context *sctx, unsigned upload_size)
* the whole thing will fit into a cache line if we align it to its size.
* The idea is that multiple small uploads can share a cache line.
* If the upload size is greater, align it to the cache line size.
*/
alignment = util_next_power_of_two(upload_size);
tcc_cache_line_size = sctx->screen->b.info.tcc_cache_line_size;
return MIN2(alignment, tcc_cache_line_size);
}
static inline void
-si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples)
+si_make_CB_shader_coherent(struct si_context *sctx, unsigned num_samples,
+ bool shaders_read_metadata)
{
sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB |
SI_CONTEXT_INV_VMEM_L1;
- /* Single-sample color is coherent with shaders on GFX9. */
- if (sctx->b.chip_class <= VI || num_samples >= 2)
+ if (sctx->b.chip_class >= GFX9) {
+ /* Single-sample color is coherent with shaders on GFX9, but
+ * L2 metadata must be flushed if shaders read metadata.
+ * (DCC, CMASK).
+ */
+ if (num_samples >= 2)
+ sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+ else if (shaders_read_metadata)
+ sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA;
+ } else {
+ /* SI-CI-VI */
sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+ }
}
static inline void
si_make_DB_shader_coherent(struct si_context *sctx, unsigned num_samples,
bool include_stencil)
{
sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_DB |
SI_CONTEXT_INV_VMEM_L1;
/* Single-sample depth (not stencil) is coherent with shaders on GFX9. */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index d116c07..e5d8d21 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2566,21 +2566,22 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
*
* When MSAA is enabled, CB and TC caches are flushed on demand
* (after FMASK decompression). Shader write -> FB read transitions
* cannot happen for MSAA textures, because MSAA shader images are
* not supported.
*
* Only flush and wait for CB if there is actually a bound color buffer.
*/
if (sctx->framebuffer.nr_samples <= 1 &&
sctx->framebuffer.state.nr_cbufs)
- si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples);
+ si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
+ sctx->framebuffer.CB_has_shader_readable_metadata);
sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
/* u_blitter doesn't invoke depth decompression when it does multiple
* blits in a row, but the only case when it matters for DB is when
* doing generate_mipmap. So here we flush DB manually between
* individual generate_mipmap blits.
* Note that lower mipmap levels aren't compressed.
*/
if (sctx->generate_mipmap_for_depth)
@@ -2601,20 +2602,21 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
sctx->framebuffer.spi_shader_col_format_alpha = 0;
sctx->framebuffer.spi_shader_col_format_blend = 0;
sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
sctx->framebuffer.color_is_int8 = 0;
sctx->framebuffer.color_is_int10 = 0;
sctx->framebuffer.compressed_cb_mask = 0;
sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
sctx->framebuffer.any_dst_linear = false;
+ sctx->framebuffer.CB_has_shader_readable_metadata = false;
for (i = 0; i < state->nr_cbufs; i++) {
if (!state->cbufs[i])
continue;
surf = (struct r600_surface*)state->cbufs[i];
rtex = (struct r600_texture*)surf->base.texture;
if (!surf->color_initialized) {
si_initialize_color_surface(sctx, surf);
@@ -2635,20 +2637,23 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
if (surf->color_is_int10)
sctx->framebuffer.color_is_int10 |= 1 << i;
if (rtex->fmask.size) {
sctx->framebuffer.compressed_cb_mask |= 1 << i;
}
if (rtex->surface.is_linear)
sctx->framebuffer.any_dst_linear = true;
+ if (vi_dcc_enabled(rtex, surf->base.u.tex.level))
+ sctx->framebuffer.CB_has_shader_readable_metadata = true;
+
r600_context_add_resource_size(ctx, surf->base.texture);
p_atomic_inc(&rtex->framebuffers_bound);
if (rtex->dcc_gather_statistics) {
/* Dirty tracking must be enabled for DCC usage analysis. */
sctx->framebuffer.compressed_cb_mask |= 1 << i;
vi_separate_dcc_start_query(ctx, rtex);
}
}
@@ -4015,21 +4020,22 @@ static void si_set_tess_state(struct pipe_context *ctx,
static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
{
struct si_context *sctx = (struct si_context *)ctx;
si_update_fb_dirtiness_after_rendering(sctx);
/* Multisample surfaces are flushed in si_decompress_textures. */
if (sctx->framebuffer.nr_samples <= 1 &&
sctx->framebuffer.state.nr_cbufs)
- si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples);
+ si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
+ sctx->framebuffer.CB_has_shader_readable_metadata);
}
/* This only ensures coherency for shader image/buffer stores. */
static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
{
struct si_context *sctx = (struct si_context *)ctx;
/* Subsequent commands must wait for all shader invocations to
* complete. */
sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
@@ -4060,23 +4066,28 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
}
/* MSAA color, any depth and any stencil are flushed in
* si_decompress_textures when needed.
*/
if (flags & PIPE_BARRIER_FRAMEBUFFER &&
sctx->framebuffer.nr_samples <= 1 &&
sctx->framebuffer.state.nr_cbufs) {
sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
- /* Single-sample color is coherent with TC on GFX9. */
- if (sctx->screen->b.chip_class <= VI)
+ if (sctx->b.chip_class >= GFX9) {
+ /* Single-sample color is coherent with TC on GFX9. */
+ if (sctx->framebuffer.CB_has_shader_readable_metadata)
+ sctx->b.flags |= SI_CONTEXT_INV_L2_METADATA;