From: Marek Olšák <marek.ol...@amd.com>

Add a new IB flag, AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE, that makes the IB
fence do the L2 writeback without invalidating any shader caches.
Full invalidation is unnecessary for RELEASE_MEM and doesn't make sense
when draw calls from two adjacent gfx IBs run in parallel. This will be
the new default for Mesa.

Signed-off-by: Marek Olšák <marek.ol...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c    |  8 ++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  4 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c     | 11 +++++++----
 drivers/gpu/drm/amd/amdgpu/soc15d.h       |  1 +
 include/uapi/drm/amdgpu_drm.h             |  4 ++++
 7 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 97449e06a242..d09fcab2398f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -124,39 +124,40 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
 
 /**
  * amdgpu_fence_emit - emit a fence on the requested ring
  *
  * @ring: ring the fence is associated with
  * @f: resulting fence object
  *
  * Emits a fence command on the requested ring (all asics).
  * Returns 0 on success, -ENOMEM on failure.
  */
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f)
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
+                     unsigned flags)
 {
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_fence *fence;
        struct dma_fence *old, **ptr;
        uint32_t seq;
 
        fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
        if (fence == NULL)
                return -ENOMEM;
 
        seq = ++ring->fence_drv.sync_seq;
        fence->ring = ring;
        dma_fence_init(&fence->base, &amdgpu_fence_ops,
                       &ring->fence_drv.lock,
                       adev->fence_context + ring->idx,
                       seq);
        amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
-                              seq, AMDGPU_FENCE_FLAG_INT);
+                              seq, flags | AMDGPU_FENCE_FLAG_INT);
 
        ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
        /* This function can't be called concurrently anyway, otherwise
         * emitting the fence would mess up the hardware ring buffer.
         */
        old = rcu_dereference_protected(*ptr, 1);
        if (old && !dma_fence_is_signaled(old)) {
                DRM_INFO("rcu slot is busy\n");
                dma_fence_wait(old, false);
        }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 311589e02d17..f70eeed9ed76 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -120,20 +120,21 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
                       struct dma_fence **f)
 {
        struct amdgpu_device *adev = ring->adev;
        struct amdgpu_ib *ib = &ibs[0];
        struct dma_fence *tmp = NULL;
        bool skip_preamble, need_ctx_switch;
        unsigned patch_offset = ~0;
        struct amdgpu_vm *vm;
        uint64_t fence_ctx;
        uint32_t status = 0, alloc_size;
+       unsigned fence_flags = 0;
 
        unsigned i;
        int r = 0;
        bool need_pipe_sync = false;
 
        if (num_ibs == 0)
                return -EINVAL;
 
        /* ring tests don't use a job */
        if (job) {
@@ -220,36 +221,39 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
        }
 
        if (ring->funcs->emit_tmz)
                amdgpu_ring_emit_tmz(ring, false);
 
 #ifdef CONFIG_X86_64
        if (!(adev->flags & AMD_IS_APU))
 #endif
                amdgpu_asic_invalidate_hdp(adev, ring);
 
-       r = amdgpu_fence_emit(ring, f);
+       if (ib->flags & AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE)
+               fence_flags |= AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+
+       r = amdgpu_fence_emit(ring, f, fence_flags);
        if (r) {
                dev_err(adev->dev, "failed to emit fence (%d)\n", r);
                if (job && job->vmid)
                        amdgpu_vmid_reset(adev, ring->funcs->vmhub, job->vmid);
                amdgpu_ring_undo(ring);
                return r;
        }
 
        if (ring->funcs->insert_end)
                ring->funcs->insert_end(ring);
 
        /* wrap the last IB with fence */
        if (job && job->uf_addr) {
                amdgpu_ring_emit_fence(ring, job->uf_addr, job->uf_sequence,
-                                      AMDGPU_FENCE_FLAG_64BIT);
+                                      fence_flags | AMDGPU_FENCE_FLAG_64BIT);
        }
 
        if (patch_offset != ~0 && ring->funcs->patch_cond_exec)
                amdgpu_ring_patch_cond_exec(ring, patch_offset);
 
        ring->current_ctx = fence_ctx;
        if (vm && ring->funcs->emit_switch_buffer)
                amdgpu_ring_emit_switch_buffer(ring);
        amdgpu_ring_commit(ring);
        return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 1d0d250cbfdf..222052daedd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -33,20 +33,21 @@
 #define AMDGPU_MAX_COMPUTE_RINGS       8
 #define AMDGPU_MAX_VCE_RINGS           3
 #define AMDGPU_MAX_UVD_ENC_RINGS       2
 
 /* some special values for the owner field */
 #define AMDGPU_FENCE_OWNER_UNDEFINED   ((void*)0ul)
 #define AMDGPU_FENCE_OWNER_VM          ((void*)1ul)
 
 #define AMDGPU_FENCE_FLAG_64BIT         (1 << 0)
 #define AMDGPU_FENCE_FLAG_INT           (1 << 1)
+#define AMDGPU_FENCE_FLAG_TC_WB_ONLY    (1 << 2)
 
 enum amdgpu_ring_type {
        AMDGPU_RING_TYPE_GFX,
        AMDGPU_RING_TYPE_COMPUTE,
        AMDGPU_RING_TYPE_SDMA,
        AMDGPU_RING_TYPE_UVD,
        AMDGPU_RING_TYPE_VCE,
        AMDGPU_RING_TYPE_KIQ,
        AMDGPU_RING_TYPE_UVD_ENC,
        AMDGPU_RING_TYPE_VCN_DEC,
@@ -81,21 +82,22 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev);
 void amdgpu_fence_driver_fini(struct amdgpu_device *adev);
 void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
 
 int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
                                  unsigned num_hw_submission);
 int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
                                   struct amdgpu_irq_src *irq_src,
                                   unsigned irq_type);
 void amdgpu_fence_driver_suspend(struct amdgpu_device *adev);
 void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
-int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence);
+int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
+                     unsigned flags);
 int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
 void amdgpu_fence_process(struct amdgpu_ring *ring);
 int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
 signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring,
                                      uint32_t wait_seq,
                                      signed long timeout);
 unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring);
 
 /*
  * Rings.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 24474294c92a..fe05351ea4d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -620,21 +620,21 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, bool need_
 
        if (vm_flush_needed) {
                trace_amdgpu_vm_flush(ring, job->vmid, job->vm_pd_addr);
                amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
        }
 
        if (pasid_mapping_needed)
                amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
 
        if (vm_flush_needed || pasid_mapping_needed) {
-               r = amdgpu_fence_emit(ring, &fence);
+               r = amdgpu_fence_emit(ring, &fence, 0);
                if (r)
                        return r;
        }
 
        if (vm_flush_needed) {
                mutex_lock(&id_mgr->lock);
                dma_fence_put(id->last_flush);
                id->last_flush = dma_fence_get(fence);
                id->current_gpu_reset_count =
                        atomic_read(&adev->gpu_reset_counter);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 9d39fd5b1822..5dea0d4c0af4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3767,27 +3767,30 @@ static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
                                 lower_32_bits(ib->gpu_addr));
         amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
         amdgpu_ring_write(ring, control);
 }
 
 static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
                                     u64 seq, unsigned flags)
 {
        bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
        bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
+       bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
 
        /* RELEASE_MEM - flush caches, send int */
        amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
-       amdgpu_ring_write(ring, (EOP_TCL1_ACTION_EN |
-                                EOP_TC_ACTION_EN |
-                                EOP_TC_WB_ACTION_EN |
-                                EOP_TC_MD_ACTION_EN |
+       amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
+                                              EOP_TC_NC_ACTION_EN) :
+                                             (EOP_TCL1_ACTION_EN |
+                                              EOP_TC_ACTION_EN |
+                                              EOP_TC_WB_ACTION_EN |
+                                              EOP_TC_MD_ACTION_EN)) |
                                 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
                                 EVENT_INDEX(5)));
        amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel 
? 2 : 0));
 
        /*
         * the address should be Qword aligned if 64bit write, Dword
         * aligned if only send 32bit data low (discard data high)
         */
        if (write64bit)
                BUG_ON(addr & 0x7);
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15d.h b/drivers/gpu/drm/amd/amdgpu/soc15d.h
index 7f408f85fdb6..839a144c1645 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15d.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15d.h
@@ -152,20 +152,21 @@
                 * 4 - *S_PARTIAL_FLUSH
                 */
 #define        PACKET3_RELEASE_MEM                             0x49
 #define                EVENT_TYPE(x)                           ((x) << 0)
 #define                EVENT_INDEX(x)                          ((x) << 8)
 #define                EOP_TCL1_VOL_ACTION_EN                  (1 << 12)
 #define                EOP_TC_VOL_ACTION_EN                    (1 << 13) /* L2 
*/
 #define                EOP_TC_WB_ACTION_EN                     (1 << 15) /* L2 
*/
 #define                EOP_TCL1_ACTION_EN                      (1 << 16)
 #define                EOP_TC_ACTION_EN                        (1 << 17) /* L2 
*/
+#define                EOP_TC_NC_ACTION_EN                     (1 << 19)
 #define                EOP_TC_MD_ACTION_EN                     (1 << 21) /* L2 
metadata */
 
 #define                DATA_SEL(x)                             ((x) << 29)
                /* 0 - discard
                 * 1 - send low 32bit data
                 * 2 - send 64bit data
                 * 3 - send 64bit GPU counter value
                 * 4 - send 64bit sys counter value
                 */
 #define                INT_SEL(x)                              ((x) << 24)
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index 0087799962cf..f5901bd9c7d8 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -516,20 +516,24 @@ union drm_amdgpu_cs {
 
 /* This IB should be submitted to CE */
 #define AMDGPU_IB_FLAG_CE      (1<<0)
 
 /* Preamble flag, which means the IB could be dropped if no context switch */
 #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
 
 /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
 #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
 
+/* The IB fence should do the L2 writeback but not invalidate any shader
+ * caches (L2/vL1/sL1/I$). */
+#define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
+
 struct drm_amdgpu_cs_chunk_ib {
        __u32 _pad;
        /** AMDGPU_IB_FLAG_* */
        __u32 flags;
        /** Virtual address to begin IB execution */
        __u64 va_start;
        /** Size of submission */
        __u32 ib_bytes;
        /** HW IP to submit to */
        __u32 ip_type;
-- 
2.15.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to