From: Marek Olšák <marek.ol...@amd.com>

Call r600_dma_emit_wait_idle only when there is a possibility of
a read-after-write hazard. Buffers not yet used by the SDMA IB don't
have to wait.
---
 src/gallium/drivers/r600/evergreen_hw_context.c |  1 -
 src/gallium/drivers/r600/evergreen_state.c      |  1 -
 src/gallium/drivers/r600/r600_hw_context.c      |  1 -
 src/gallium/drivers/r600/r600_state.c           |  1 -
 src/gallium/drivers/radeon/r600_pipe_common.c   | 48 ++++++++++++++-----------
 src/gallium/drivers/radeon/r600_pipe_common.h   |  1 -
 src/gallium/drivers/radeonsi/cik_sdma.c         |  8 -----
 src/gallium/drivers/radeonsi/si_dma.c           |  2 --
 8 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index 06f0348..5352dc0 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -70,21 +70,20 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
                                      RADEON_PRIO_SDMA_BUFFER);
                radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize));
                radeon_emit(cs, dst_offset & 0xffffffff);
                radeon_emit(cs, src_offset & 0xffffffff);
                radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
                radeon_emit(cs, (src_offset >> 32UL) & 0xff);
                dst_offset += csize << shift;
                src_offset += csize << shift;
                size -= csize;
        }
-       r600_dma_emit_wait_idle(&rctx->b);
 }
 
 /* The max number of bytes to copy per packet. */
 #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
 
 void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
                                   struct pipe_resource *dst, uint64_t offset,
                                   unsigned size, uint32_t clear_value,
                                   enum r600_coherency coher)
 {
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 015ff02..c5dd9f7 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -3446,21 +3446,20 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
                radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
                radeon_emit(cs, (slice_tile_max << 0));
                radeon_emit(cs, (x << 0) | (z << 18));
                radeon_emit(cs, (y << 0) | (tile_split << 21) | (nbanks << 25) | (non_disp_tiling << 28));
                radeon_emit(cs, addr & 0xfffffffc);
                radeon_emit(cs, (addr >> 32UL) & 0xff);
                copy_height -= cheight;
                addr += cheight * pitch;
                y += cheight;
        }
-       r600_dma_emit_wait_idle(&rctx->b);
 }
 
 static void evergreen_dma_copy(struct pipe_context *ctx,
                               struct pipe_resource *dst,
                               unsigned dst_level,
                               unsigned dstx, unsigned dsty, unsigned dstz,
                               struct pipe_resource *src,
                               unsigned src_level,
                               const struct pipe_box *src_box)
 {
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index bc6217a..4663d99 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -548,12 +548,11 @@ void r600_dma_copy_buffer(struct r600_context *rctx,
                                      RADEON_PRIO_SDMA_BUFFER);
                radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize));
                radeon_emit(cs, dst_offset & 0xfffffffc);
                radeon_emit(cs, src_offset & 0xfffffffc);
                radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
                radeon_emit(cs, (src_offset >> 32UL) & 0xff);
                dst_offset += csize << 2;
                src_offset += csize << 2;
                size -= csize;
        }
-       r600_dma_emit_wait_idle(&rctx->b);
 }
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index ba97490..006bb62 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -2897,21 +2897,20 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
                                (lbpp << 24) | ((height - 1) << 10) |
                                pitch_tile_max);
                radeon_emit(cs, (slice_tile_max << 12) | (z << 0));
                radeon_emit(cs, (x << 3) | (y << 17));
                radeon_emit(cs, addr & 0xfffffffc);
                radeon_emit(cs, (addr >> 32UL) & 0xff);
                copy_height -= cheight;
                addr += cheight * pitch;
                y += cheight;
        }
-       r600_dma_emit_wait_idle(&rctx->b);
        return TRUE;
 }
 
 static void r600_dma_copy(struct pipe_context *ctx,
                          struct pipe_resource *dst,
                          unsigned dst_level,
                          unsigned dstx, unsigned dsty, unsigned dstz,
                          struct pipe_resource *src,
                          unsigned src_level,
                          const struct pipe_box *src_box)
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 6b7bbaf..4d8bb74 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -217,20 +217,35 @@ void r600_draw_rectangle(struct blitter_context *blitter,
                memcpy(vb+12, attrib->f, sizeof(float)*4);
                memcpy(vb+20, attrib->f, sizeof(float)*4);
        }
 
        /* draw */
        util_draw_vertex_buffer(&rctx->b, NULL, buf, blitter->vb_slot, offset,
                                R600_PRIM_RECTANGLE_LIST, 3, 2);
        pipe_resource_reference(&buf, NULL);
 }
 
+static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
+{
+       struct radeon_winsys_cs *cs = rctx->dma.cs;
+
+       /* NOP waits for idle on Evergreen and later. */
+       if (rctx->chip_class >= CIK)
+               radeon_emit(cs, 0x00000000); /* NOP */
+       else if (rctx->chip_class >= EVERGREEN)
+               radeon_emit(cs, 0xf0000000); /* NOP */
+       else {
+               /* TODO: R600-R700 should use the FENCE packet.
+                * CS checker support is required. */
+       }
+}
+
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                          struct r600_resource *dst, struct r600_resource *src)
 {
        uint64_t vram = ctx->dma.cs->used_vram;
        uint64_t gtt = ctx->dma.cs->used_gart;
 
        if (dst) {
                vram += dst->vram_usage;
                gtt += dst->gart_usage;
        }
@@ -254,66 +269,57 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
         *
         * IBs using too little memory are limited by the IB submission overhead.
         * IBs using too much memory are limited by the kernel/TTM overhead.
         * Too long IBs create CPU-GPU pipeline bubbles and add latency.
         *
         * This heuristic makes sure that DMA requests are executed
         * very soon after the call is made and lowers memory usage.
         * It improves texture upload performance by keeping the DMA
         * engine busy while uploads are being submitted.
         */
+       num_dw++; /* for emit_wait_idle below */
        if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) ||
            ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 ||
            !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) {
                ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
                assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw);
        }
 
+       /* Wait for idle if either buffer has been used in the IB before to
+        * prevent read-after-write hazards.
+        */
+       if ((dst &&
+            ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf,
+                                             RADEON_USAGE_READWRITE)) ||
+           (src &&
+            ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf,
+                                             RADEON_USAGE_WRITE)))
+               r600_dma_emit_wait_idle(ctx);
+
        /* If GPUVM is not supported, the CS checker needs 2 entries
         * in the buffer list per packet, which has to be done manually.
         */
        if (ctx->screen->info.has_virtual_memory) {
                if (dst)
                        radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
                                                  RADEON_USAGE_WRITE,
                                                  RADEON_PRIO_SDMA_BUFFER);
                if (src)
                        radeon_add_to_buffer_list(ctx, &ctx->dma, src,
                                                  RADEON_USAGE_READ,
                                                  RADEON_PRIO_SDMA_BUFFER);
        }
 
        /* this function is called before all DMA calls, so increment this. */
        ctx->num_dma_calls++;
 }
 
-/* This is required to prevent read-after-write hazards. */
-void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
-{
-       struct radeon_winsys_cs *cs = rctx->dma.cs;
-
-       r600_need_dma_space(rctx, 1, NULL, NULL);
-
-       if (!radeon_emitted(cs, 0)) /* empty queue */
-               return;
-
-       /* NOP waits for idle on Evergreen and later. */
-       if (rctx->chip_class >= CIK)
-               radeon_emit(cs, 0x00000000); /* NOP */
-       else if (rctx->chip_class >= EVERGREEN)
-               radeon_emit(cs, 0xf0000000); /* NOP */
-       else {
-               /* TODO: R600-R700 should use the FENCE packet.
-                * CS checker support is required. */
-       }
-}
-
 static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 {
 }
 
 void r600_preflush_suspend_features(struct r600_common_context *ctx)
 {
        /* suspend queries */
        if (!LIST_IS_EMPTY(&ctx->active_queries))
                r600_suspend_queries(ctx);
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 917059c..74f86dc 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -725,21 +725,20 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen,
                          unsigned processor);
 bool r600_extra_shader_checks(struct r600_common_screen *rscreen,
                              unsigned processor);
 void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
                              uint64_t offset, uint64_t size, unsigned value);
 struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
                                                  const struct pipe_resource *templ);
 const char *r600_get_llvm_processor_name(enum radeon_family family);
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
                         struct r600_resource *dst, struct r600_resource *src);
-void r600_dma_emit_wait_idle(struct r600_common_context *rctx);
 void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
                    struct radeon_saved_cs *saved);
 void radeon_clear_saved_cs(struct radeon_saved_cs *saved);
 bool r600_check_device_reset(struct r600_common_context *rctx);
 
 /* r600_gpu_load.c */
 void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen);
 uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen);
 unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin);
 
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 648b1ca..bee35cd 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -60,21 +60,20 @@ static void cik_sdma_copy_buffer(struct si_context *ctx,
                radeon_emit(cs, csize);
                radeon_emit(cs, 0); /* src/dst endian swap */
                radeon_emit(cs, src_offset);
                radeon_emit(cs, src_offset >> 32);
                radeon_emit(cs, dst_offset);
                radeon_emit(cs, dst_offset >> 32);
                dst_offset += csize;
                src_offset += csize;
                size -= csize;
        }
-       r600_dma_emit_wait_idle(&ctx->b);
 }
 
 static void cik_sdma_clear_buffer(struct pipe_context *ctx,
                                  struct pipe_resource *dst,
                                  uint64_t offset,
                                  uint64_t size,
                                  unsigned clear_value)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct radeon_winsys_cs *cs = sctx->b.dma.cs;
@@ -101,21 +100,20 @@ static void cik_sdma_clear_buffer(struct pipe_context *ctx,
                csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE);
                radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0,
                                                0x8000 /* dword copy */));
                radeon_emit(cs, offset);
                radeon_emit(cs, offset >> 32);
                radeon_emit(cs, clear_value);
                radeon_emit(cs, csize);
                offset += csize;
                size -= csize;
        }
-       r600_dma_emit_wait_idle(&sctx->b);
 }
 
 static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
 {
        width = u_minify(width, level);
        return DIV_ROUND_UP(width, blk_w);
 }
 
 static unsigned encode_tile_info(struct si_context *sctx,
                                 struct r600_texture *tex, unsigned level,
@@ -244,22 +242,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
                radeon_emit(cs, dstx | (dsty << 16));
                radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
                radeon_emit(cs, dst_slice_pitch - 1);
                if (sctx->b.chip_class == CIK) {
                        radeon_emit(cs, copy_width | (copy_height << 16));
                        radeon_emit(cs, copy_depth);
                } else {
                        radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
                        radeon_emit(cs, (copy_depth - 1));
                }
-
-               r600_dma_emit_wait_idle(&sctx->b);
                return true;
        }
 
        /* Tiled <-> linear sub-window copy. */
        if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
                struct r600_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? rsrc : rdst;
                struct r600_texture *linear = tiled == rsrc ? rdst : rsrc;
                unsigned tiled_level =  tiled   == rsrc ? src_level : dst_level;
                unsigned linear_level = linear  == rsrc ? src_level : dst_level;
                unsigned tiled_x =      tiled   == rsrc ? srcx : dstx;
@@ -410,22 +406,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
                        radeon_emit(cs, linear_x | (linear_y << 16));
                        radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
                        radeon_emit(cs, linear_slice_pitch - 1);
                        if (sctx->b.chip_class == CIK) {
                                radeon_emit(cs, copy_width_aligned | (copy_height << 16));
                                radeon_emit(cs, copy_depth);
                        } else {
                                radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
                                radeon_emit(cs, (copy_depth - 1));
                        }
-
-                       r600_dma_emit_wait_idle(&sctx->b);
                        return true;
                }
        }
 
        /* Tiled -> Tiled sub-window copy. */
        if (dst_mode >= RADEON_SURF_MODE_1D &&
            src_mode >= RADEON_SURF_MODE_1D &&
            /* check if these fit into the bitfields */
            src_address % 256 == 0 &&
            dst_address % 256 == 0 &&
@@ -508,22 +502,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx,
                        radeon_emit(cs, encode_tile_info(sctx, rdst, dst_level, false));
                        if (sctx->b.chip_class == CIK) {
                                radeon_emit(cs, copy_width_aligned |
                                                (copy_height_aligned << 16));
                                radeon_emit(cs, copy_depth);
                        } else {
                                radeon_emit(cs, (copy_width_aligned - 8) |
                                                ((copy_height_aligned - 8) << 16));
                                radeon_emit(cs, (copy_depth - 1));
                        }
-
-                       r600_dma_emit_wait_idle(&sctx->b);
                        return true;
                }
        }
 
        return false;
 }
 
 static void cik_sdma_copy(struct pipe_context *ctx,
                          struct pipe_resource *dst,
                          unsigned dst_level,
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 8d186c3..1009bb2 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -69,21 +69,20 @@ static void si_dma_copy_buffer(struct si_context *ctx,
                csize = size < max_csize ? size : max_csize;
                radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize));
                radeon_emit(cs, dst_offset);
                radeon_emit(cs, src_offset);
                radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
                radeon_emit(cs, (src_offset >> 32UL) & 0xff);
                dst_offset += csize << shift;
                src_offset += csize << shift;
                size -= csize;
        }
-       r600_dma_emit_wait_idle(&ctx->b);
 }
 
 static void si_dma_copy_tile(struct si_context *ctx,
                             struct pipe_resource *dst,
                             unsigned dst_level,
                             unsigned dst_x,
                             unsigned dst_y,
                             unsigned dst_z,
                             struct pipe_resource *src,
                             unsigned src_level,
@@ -170,21 +169,20 @@ static void si_dma_copy_tile(struct si_context *ctx,
                radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16));
                radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26));
                radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18));
                radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27));
                radeon_emit(cs, addr & 0xfffffffc);
                radeon_emit(cs, (addr >> 32UL) & 0xff);
                copy_height -= cheight;
                addr += cheight * pitch;
                tiled_y += cheight;
        }
-       r600_dma_emit_wait_idle(&ctx->b);
 }
 
 static void si_dma_copy(struct pipe_context *ctx,
                        struct pipe_resource *dst,
                        unsigned dst_level,
                        unsigned dstx, unsigned dsty, unsigned dstz,
                        struct pipe_resource *src,
                        unsigned src_level,
                        const struct pipe_box *src_box)
 {
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to