Re: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

2022-08-12 Thread Christian König

Hi Jiadong,

Yeah, the bug fixes do indeed sound like something we would want to have.
Just drop part 3 for now.


Regards,
Christian.

Am 11.08.22 um 05:18 schrieb Zhu, Jiadong:

[AMD Official Use Only - General]

Hi Christian,

Thank you for the reply. I will update the patch to fix the style issues.

The patch has several changes:
1. Change the unmap packet for MCBP, which is not correct in
gfx_v9_0_kiq_unmap_queues.
2. Change the emitted CE/DE metadata used for preempted IBs.
3. Add the function gfx_v9_0_ring_preempt_ib, used for the debugfs case.

Although part 3 may be removed in the future, the functions from parts 1 and 2
could still be used by some projects, such as virtualization.

Thanks,
Jiadong


-Original Message-
From: Christian König 
Sent: Thursday, August 11, 2022 12:06 AM
To: Zhu, Jiadong ; amd-gfx@lists.freedesktop.org
Cc: Huang, Ray ; Liu, Aaron 
Subject: Re: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

[CAUTION: External Email]

Hi, Jiadong,

First of all, your patches have major style issues. Please run the checkpatch.pl
script before sending them out.

Apart from that, as discussed on our call on Monday, MCBP is not something we
will implement on Linux. So we will probably remove the existing debugfs test
sooner or later.

Regards,
Christian.

Am 09.08.22 um 11:21 schrieb Zhu, Jiadong:

[AMD Official Use Only - General]

Hi,

This patch corrects the MCBP packet for gfx9, which is the basic function
used for debugfs.
There is no logic about when to trigger MCBP.
Shall we get this reviewed?

Thanks,
Jiadong

-Original Message-
From: Zhu, Jiadong 
Sent: Tuesday, August 9, 2022 5:15 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Aaron ; Huang, Ray ;
Zhu, Jiadong 
Subject: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

From: "Jiadong.Zhu" 

1. Use the unmap_queues packet to trigger preemption on gfx9.
 Add a trailing fence to track when the preemption is done.
2. Modify the emit_ce_meta and emit_de_meta functions
 for the resumed IBs.
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c| 159 ---
   drivers/gpu/drm/amd/amdgpu/soc15d.h  |   2 +
   3 files changed, 141 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 82c178a9033a..ca626f0ad7b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
   #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
   #define AMDGPU_FENCE_FLAG_INT   (1 << 1)
   #define AMDGPU_FENCE_FLAG_TC_WB_ONLY(1 << 2)
+#define AMDGPU_FENCE_FLAG_EXEC  (1 << 3)

   #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring,
sched)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 5332899642dc..0b7cb4cf13c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -751,7 +751,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device 
*adev);  static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
  struct amdgpu_cu_info *cu_info);
   static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device
*adev); -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring
*ring);
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool
+resume);
   static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);  static 
void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status); @@
-824,9 +824,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring 
*kiq_ring,

PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));

  if (action == PREEMPT_QUEUES_NO_UNMAP) {
-   amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
-   amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
-   amdgpu_ring_write(kiq_ring, seq);
+   amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & 
ring->buf_mask));
+   amdgpu_ring_write(kiq_ring, 0);
+   amdgpu_ring_write(kiq_ring, 0);
+
  } else {
  amdgpu_ring_write(kiq_ring, 0);
  amdgpu_ring_write(kiq_ring, 0); @@ -5446,11 +5447,15
@@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,

  control |= ib->length_dw | (vmid << 24);

-   if (amdgpu_sriov_vf(ring->adev) && (ib->flags & 
AMDGPU_IB_FLAG_PREEMPT)) {
+   if ((amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp) && (ib->flags
+&
+AMDGPU_IB_FLAG_PREEMPT)) {
  control |= INDIRECT_BUFFER_PRE_ENB(1);

+   if (flags & AMDGPU_IB_PREEMPTED)
+   control |=

RE: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

2022-08-10 Thread Zhu, Jiadong
[AMD Official Use Only - General]

Hi Christian,

Thank you for the reply. I will update the patch to fix the style issues.

The patch has several changes:
1. Change the unmap packet for MCBP, which is not correct in
gfx_v9_0_kiq_unmap_queues.
2. Change the emitted CE/DE metadata used for preempted IBs.
3. Add the function gfx_v9_0_ring_preempt_ib, used for the debugfs case.

Although part 3 may be removed in the future, the functions from parts 1 and 2
could still be used by some projects, such as virtualization.

Thanks,
Jiadong


-Original Message-
From: Christian König 
Sent: Thursday, August 11, 2022 12:06 AM
To: Zhu, Jiadong ; amd-gfx@lists.freedesktop.org
Cc: Huang, Ray ; Liu, Aaron 
Subject: Re: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

[CAUTION: External Email]

Hi, Jiadong,

First of all, your patches have major style issues. Please run the checkpatch.pl
script before sending them out.

Apart from that, as discussed on our call on Monday, MCBP is not something we
will implement on Linux. So we will probably remove the existing debugfs test
sooner or later.

Regards,
Christian.

Am 09.08.22 um 11:21 schrieb Zhu, Jiadong:
> [AMD Official Use Only - General]
>
> Hi,
>
> This patch is to correct the mcbp package for gfx9, which is the basic 
> function used for debugfs.
> There are no logic about when to trigger mcbp.
> Shall we get this reviewed?
>
> Thanks,
> Jiadong
>
> -Original Message-
> From: Zhu, Jiadong 
> Sent: Tuesday, August 9, 2022 5:15 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Aaron ; Huang, Ray ;
> Zhu, Jiadong 
> Subject: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)
>
> From: "Jiadong.Zhu" 
>
> 1. Use unmap_queue package to trigger preemption on gfx9
> Add trailing fence to track the preemption done.
> 2. Modify emit_ce_meta emit_de_meta functions
> for the resumed ibs.
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c| 159 ---
>   drivers/gpu/drm/amd/amdgpu/soc15d.h  |   2 +
>   3 files changed, 141 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 82c178a9033a..ca626f0ad7b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
>   #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
>   #define AMDGPU_FENCE_FLAG_INT   (1 << 1)
>   #define AMDGPU_FENCE_FLAG_TC_WB_ONLY(1 << 2)
> +#define AMDGPU_FENCE_FLAG_EXEC  (1 << 3)
>
>   #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring,
> sched)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 5332899642dc..0b7cb4cf13c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -751,7 +751,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device 
> *adev);  static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
>  struct amdgpu_cu_info *cu_info);
>   static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device
> *adev); -static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring
> *ring);
> +static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool
> +resume);
>   static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);  
> static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
>void *ras_error_status); @@
> -824,9 +824,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring 
> *kiq_ring,
>
> PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
>
>  if (action == PREEMPT_QUEUES_NO_UNMAP) {
> -   amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
> -   amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
> -   amdgpu_ring_write(kiq_ring, seq);
> +   amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & 
> ring->buf_mask));
> +   amdgpu_ring_write(kiq_ring, 0);
> +   amdgpu_ring_write(kiq_ring, 0);
> +
>  } else {
>  amdgpu_ring_write(kiq_ring, 0);
>  amdgpu_ring_write(kiq_ring, 0); @@ -5446,11 +5447,15
> @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>
>  control |= ib->length_dw | (vmid << 24);
>
> -   if (amdgpu_sriov_vf(ring->adev) && (ib->flags & 
> AMDGPU_IB_FLAG_PREEMPT)) {
> +   if ((amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp) && (ib->

Re: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

2022-08-10 Thread Christian König

Hi, Jiadong,

First of all, your patches have major style issues. Please run the
checkpatch.pl script before sending them out.


Apart from that, as discussed on our call on Monday, MCBP is not something
we will implement on Linux. So we will probably remove the existing
debugfs test sooner or later.


Regards,
Christian.

Am 09.08.22 um 11:21 schrieb Zhu, Jiadong:

[AMD Official Use Only - General]

Hi,

This patch corrects the MCBP packet for gfx9, which is the basic function
used for debugfs.
There is no logic about when to trigger MCBP.
Shall we get this reviewed?

Thanks,
Jiadong

-Original Message-
From: Zhu, Jiadong 
Sent: Tuesday, August 9, 2022 5:15 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Aaron ; Huang, Ray ; Zhu, Jiadong 

Subject: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

From: "Jiadong.Zhu" 

1. Use the unmap_queues packet to trigger preemption on gfx9.
Add a trailing fence to track when the preemption is done.
2. Modify the emit_ce_meta and emit_de_meta functions
for the resumed IBs.
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c| 159 ---
  drivers/gpu/drm/amd/amdgpu/soc15d.h  |   2 +
  3 files changed, 141 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 82c178a9033a..ca626f0ad7b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
  #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
  #define AMDGPU_FENCE_FLAG_INT   (1 << 1)
  #define AMDGPU_FENCE_FLAG_TC_WB_ONLY(1 << 2)
+#define AMDGPU_FENCE_FLAG_EXEC  (1 << 3)

  #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 5332899642dc..0b7cb4cf13c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -751,7 +751,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device 
*adev);  static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
 struct amdgpu_cu_info *cu_info);
  static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev); 
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool
+resume);
  static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);  static 
void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
   void *ras_error_status);
@@ -824,9 +824,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring 
*kiq_ring,
 
PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));

 if (action == PREEMPT_QUEUES_NO_UNMAP) {
-   amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
-   amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
-   amdgpu_ring_write(kiq_ring, seq);
+   amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & 
ring->buf_mask));
+   amdgpu_ring_write(kiq_ring, 0);
+   amdgpu_ring_write(kiq_ring, 0);
+
 } else {
 amdgpu_ring_write(kiq_ring, 0);
 amdgpu_ring_write(kiq_ring, 0);
@@ -5446,11 +5447,15 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct 
amdgpu_ring *ring,

 control |= ib->length_dw | (vmid << 24);

-   if (amdgpu_sriov_vf(ring->adev) && (ib->flags & 
AMDGPU_IB_FLAG_PREEMPT)) {
+   if ((amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp) && (ib->flags &
+AMDGPU_IB_FLAG_PREEMPT)) {
 control |= INDIRECT_BUFFER_PRE_ENB(1);

+   if (flags & AMDGPU_IB_PREEMPTED)
+   control |= INDIRECT_BUFFER_PRE_RESUME(1);
+
 if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
-   gfx_v9_0_ring_emit_de_meta(ring);
+   gfx_v9_0_ring_emit_de_meta(ring,
+(!amdgpu_sriov_vf(ring->adev) && flags & 
AMDGPU_IB_PREEMPTED) ?
+true : false);
 }

 amdgpu_ring_write(ring, header);
@@ -5505,6 +5510,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring 
*ring, u64 addr,
 bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
 bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
 bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+   bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;

 /* RELEASE_MEM - flush caches, send int */
 amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6)); @@ -5515,6 
+5521,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 
addr,
EOP_TC_WB_ACTION_EN |
EOP_TC_MD_ACTION_EN)) |
  EVEN

RE: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

2022-08-09 Thread Zhu, Jiadong
[AMD Official Use Only - General]

Hi,

This patch corrects the MCBP packet for gfx9, which is the basic function
used for debugfs.
There is no logic about when to trigger MCBP.
Shall we get this reviewed?

Thanks,
Jiadong

-Original Message-
From: Zhu, Jiadong 
Sent: Tuesday, August 9, 2022 5:15 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Aaron ; Huang, Ray ; Zhu, 
Jiadong 
Subject: [PATCH 1/2] drm/amdgpu: modify mcbp implement for gfx9(v2)

From: "Jiadong.Zhu" 

1. Use the unmap_queues packet to trigger preemption on gfx9.
   Add a trailing fence to track when the preemption is done.
2. Modify the emit_ce_meta and emit_de_meta functions
   for the resumed IBs.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c| 159 ---
 drivers/gpu/drm/amd/amdgpu/soc15d.h  |   2 +
 3 files changed, 141 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 82c178a9033a..ca626f0ad7b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -59,6 +59,7 @@ enum amdgpu_ring_priority_level {
 #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
 #define AMDGPU_FENCE_FLAG_INT   (1 << 1)
 #define AMDGPU_FENCE_FLAG_TC_WB_ONLY(1 << 2)
+#define AMDGPU_FENCE_FLAG_EXEC  (1 << 3)

 #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 5332899642dc..0b7cb4cf13c4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -751,7 +751,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device 
*adev);  static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
struct amdgpu_cu_info *cu_info);
 static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev); 
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool
+resume);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);  static 
void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
  void *ras_error_status);
@@ -824,9 +824,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring 
*kiq_ring,

PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));

if (action == PREEMPT_QUEUES_NO_UNMAP) {
-   amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
-   amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
-   amdgpu_ring_write(kiq_ring, seq);
+   amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & 
ring->buf_mask));
+   amdgpu_ring_write(kiq_ring, 0);
+   amdgpu_ring_write(kiq_ring, 0);
+
} else {
amdgpu_ring_write(kiq_ring, 0);
amdgpu_ring_write(kiq_ring, 0);
@@ -5446,11 +5447,15 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct 
amdgpu_ring *ring,

control |= ib->length_dw | (vmid << 24);

-   if (amdgpu_sriov_vf(ring->adev) && (ib->flags & 
AMDGPU_IB_FLAG_PREEMPT)) {
+   if ((amdgpu_sriov_vf(ring->adev) || amdgpu_mcbp) && (ib->flags &
+AMDGPU_IB_FLAG_PREEMPT)) {
control |= INDIRECT_BUFFER_PRE_ENB(1);

+   if (flags & AMDGPU_IB_PREEMPTED)
+   control |= INDIRECT_BUFFER_PRE_RESUME(1);
+
if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
-   gfx_v9_0_ring_emit_de_meta(ring);
+   gfx_v9_0_ring_emit_de_meta(ring,
+(!amdgpu_sriov_vf(ring->adev) && flags & 
AMDGPU_IB_PREEMPTED) ?
+true : false);
}

amdgpu_ring_write(ring, header);
@@ -5505,6 +5510,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring 
*ring, u64 addr,
bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+   bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;

/* RELEASE_MEM - flush caches, send int */
amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6)); @@ -5515,6 
+5521,7 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 
addr,
   EOP_TC_WB_ACTION_EN |
   EOP_TC_MD_ACTION_EN)) |
 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
+(exec ? EOP_EXEC : 0x0) |
 EVENT_INDEX(5)));
amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel 
? 2 : 0));

@@ -5620,33 +5627,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring 
*ring)
amdgpu_ring_write(ring, 0);
 }

-static void gfx_v9_0_ring_emit_ce_met