Re: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status registers

2019-11-05 Thread Koenig, Christian
Am 05.11.19 um 11:21 schrieb Zhu, Changfeng:
> Hi Chris,
>
> Maybe it's better to use amdgpu_ring_emit_reg_wait(ring, reg0, 0, 0); to 
> replace
> amdgpu_ring_emit_reg_wait(ring, reg1, 0, 0); ?

Good point. I've mixed up request and acknowledge register.

Important is that you need 0 as mask and value, or otherwise we could 
potentially wait forever.

Regards,
Christian.

>
> http://ontrack-internal.amd.com/browse/SWDEV-192660
> Jira ticket recommends to read VM_INVALIDATE_ENG*_REQ.
>
> BR,
> Changfeng.
>
> -Original Message-
> From: Koenig, Christian 
> Sent: Tuesday, November 5, 2019 5:13 PM
> To: Zhu, Changfeng ; amd-gfx@lists.freedesktop.org; 
> Tuikov, Luben ; Huang, Ray ; Huang, 
> Shimmer 
> Subject: Re: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM 
> status registers
>
> Am 05.11.19 um 07:32 schrieb Zhu, Changfeng:
>> From: changzhu 
>>
>> The GRBM register interface is now capable of bursting 1 cycle per
>> register wr->wr, wr->rd much faster than previous muticycle per
>> transaction done interface.  This has caused a problem where status
>> registers requiring HW to update have a 1 cycle delay, due to the
>> register update having to go through GRBM.
>>
>> For cp ucode, it has realized dummy read in cp firmware.It covers the
>> use of WAIT_REG_MEM operation 1 case only.So it needs to call
>> gfx_v10_0_wait_reg_mem in gfx10. Besides it also needs to add warning
>> to update firmware in case firmware is too old to have function to
>> realize dummy read in cp firmware.
>>
>> For sdma ucode, it hasn't realized dummy read in sdma firmware. sdma
>> is moved to gfxhub in gfx10. So it needs to add dummy read in driver
>> between amdgpu_ring_emit_wreg and amdgpu_ring_emit_reg_wait for sdma_v5_0.
> First of all thanks for getting your environment setup properly, we are 
> finally making progress with that issue.
>
> A bunch of nice to have comments below and two major bugs/typos which really 
> needs to be fixed.
>
>> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
>> Signed-off-by: changzhu 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  1 +
>>drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 50 +
>>drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  7 
>>drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  8 ++--
>>drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c  | 13 ++-
>>5 files changed, 73 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> index 459aa9059542..a74ecd449775 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
>> @@ -267,6 +267,7 @@ struct amdgpu_gfx {
>>  uint32_tmec2_feature_version;
>>  boolmec_fw_write_wait;
>>  boolme_fw_write_wait;
>> +boolcp_fw_write_wait;
>>  struct amdgpu_ring  gfx_ring[AMDGPU_MAX_GFX_RINGS];
>>  unsignednum_gfx_rings;
>>  struct amdgpu_ring  compute_ring[AMDGPU_MAX_COMPUTE_RINGS];
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 17a5cbfd0024..814764723c26 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -561,6 +561,32 @@ static void gfx_v10_0_free_microcode(struct 
>> amdgpu_device *adev)
>>  kfree(adev->gfx.rlc.register_list_format);
>>}
>>
>> +static void gfx_v10_0_check_fw_write_wait(struct amdgpu_device *adev)
>> +{
>> +adev->gfx.cp_fw_write_wait = false;
>> +
>> +switch (adev->asic_type) {
>> +case CHIP_NAVI10:
>> +case CHIP_NAVI12:
>> +case CHIP_NAVI14:
>> +if ((adev->gfx.me_fw_version >= 0x0046) &&
>> +(adev->gfx.me_feature_version >= 27) &&
>> +(adev->gfx.pfp_fw_version >= 0x0068) &&
>> +(adev->gfx.pfp_feature_version >= 27) &&
>> +(adev->gfx.mec_fw_version >= 0x005b) &&
>> +(adev->gfx.mec_feature_version >= 27))
>> +adev->gfx.cp_fw_write_wait = true;
>> +break;
>> +default:
>> +break;
>> +}
>> +
>> +if (adev->gfx.cp_fw_write_wait == false)
&g

Re: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status registers

2019-11-05 Thread Koenig, Christian
Am 05.11.19 um 07:32 schrieb Zhu, Changfeng:
> From: changzhu 
>
> The GRBM register interface is now capable of bursting 1 cycle per
> register wr->wr, wr->rd much faster than previous muticycle per
> transaction done interface.  This has caused a problem where
> status registers requiring HW to update have a 1 cycle delay, due
> to the register update having to go through GRBM.
>
> For cp ucode, it has realized dummy read in cp firmware.It covers
> the use of WAIT_REG_MEM operation 1 case only.So it needs to call
> gfx_v10_0_wait_reg_mem in gfx10. Besides it also needs to add warning to
> update firmware in case firmware is too old to have function to realize
> dummy read in cp firmware.
>
> For sdma ucode, it hasn't realized dummy read in sdma firmware. sdma is
> moved to gfxhub in gfx10. So it needs to add dummy read in driver
> between amdgpu_ring_emit_wreg and amdgpu_ring_emit_reg_wait for sdma_v5_0.

First of all thanks for getting your environment setup properly, we are 
finally making progress with that issue.

A bunch of nice to have comments below and two major bugs/typos which 
really needs to be fixed.

>
> Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
> Signed-off-by: changzhu 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 50 +
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  7 
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  8 ++--
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c  | 13 ++-
>   5 files changed, 73 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 459aa9059542..a74ecd449775 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -267,6 +267,7 @@ struct amdgpu_gfx {
>   uint32_tmec2_feature_version;
>   boolmec_fw_write_wait;
>   boolme_fw_write_wait;
> + boolcp_fw_write_wait;
>   struct amdgpu_ring  gfx_ring[AMDGPU_MAX_GFX_RINGS];
>   unsignednum_gfx_rings;
>   struct amdgpu_ring  compute_ring[AMDGPU_MAX_COMPUTE_RINGS];
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 17a5cbfd0024..814764723c26 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -561,6 +561,32 @@ static void gfx_v10_0_free_microcode(struct 
> amdgpu_device *adev)
>   kfree(adev->gfx.rlc.register_list_format);
>   }
>   
> +static void gfx_v10_0_check_fw_write_wait(struct amdgpu_device *adev)
> +{
> + adev->gfx.cp_fw_write_wait = false;
> +
> + switch (adev->asic_type) {
> + case CHIP_NAVI10:
> + case CHIP_NAVI12:
> + case CHIP_NAVI14:
> + if ((adev->gfx.me_fw_version >= 0x0046) &&
> + (adev->gfx.me_feature_version >= 27) &&
> + (adev->gfx.pfp_fw_version >= 0x0068) &&
> + (adev->gfx.pfp_feature_version >= 27) &&
> + (adev->gfx.mec_fw_version >= 0x005b) &&
> + (adev->gfx.mec_feature_version >= 27))
> + adev->gfx.cp_fw_write_wait = true;
> + break;
> + default:
> + break;
> + }
> +
> + if (adev->gfx.cp_fw_write_wait == false)
> + DRM_WARN_ONCE("Warning: check cp_fw_version and update it to 
> realize \
> +   GRBM requires 1-cycle delay in cp 
> firmware\n");
> +}
> +
> +
>   static void gfx_v10_0_init_rlc_ext_microcode(struct amdgpu_device *adev)
>   {
>   const struct rlc_firmware_header_v2_1 *rlc_hdr;
> @@ -4768,6 +4794,28 @@ static void gfx_v10_0_ring_emit_reg_wait(struct 
> amdgpu_ring *ring, uint32_t reg,
>   gfx_v10_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
>   }
>   
> +static void gfx_v10_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
> +   uint32_t reg0, uint32_t reg1,
> +   uint32_t ref, uint32_t mask)
> +{
> + int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
> + struct amdgpu_device *adev = ring->adev;
> + bool fw_version_ok = false;
> +
> + gfx_v10_0_check_fw_write_wait(adev);
> +
> + if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
> + ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)

That check is probably superfluous. A few lines below you are using the 
function in the gfx_v10_0_ring_funcs_gfx and 
gfx_v10_0_ring_funcs_compute, so the ring->funcs->type is always constant.

> + fw_version_ok = adev->gfx.cp_fw_write_wait;
> +
> + if (fw_version_ok)
> + gfx_v10_0_wait_reg_mem(ring, usepfp, 0, 1, reg0, reg1,
> +   ref, mask, 0x20);
> + else
> + 

[PATCH] drm/amdgpu: add dummy read by engines for some GCVM status registers

2019-11-04 Thread Zhu, Changfeng
From: changzhu 

The GRBM register interface is now capable of bursting 1 cycle per
register wr->wr, wr->rd much faster than previous muticycle per
transaction done interface.  This has caused a problem where
status registers requiring HW to update have a 1 cycle delay, due
to the register update having to go through GRBM.

For cp ucode, it has realized dummy read in cp firmware.It covers
the use of WAIT_REG_MEM operation 1 case only.So it needs to call
gfx_v10_0_wait_reg_mem in gfx10. Besides it also needs to add warning to
update firmware in case firmware is too old to have function to realize
dummy read in cp firmware.

For sdma ucode, it hasn't realized dummy read in sdma firmware. sdma is
moved to gfxhub in gfx10. So it needs to add dummy read in driver
between amdgpu_ring_emit_wreg and amdgpu_ring_emit_reg_wait for sdma_v5_0.

Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
Signed-off-by: changzhu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 50 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  7 
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  8 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c  | 13 ++-
 5 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 459aa9059542..a74ecd449775 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -267,6 +267,7 @@ struct amdgpu_gfx {
uint32_tmec2_feature_version;
boolmec_fw_write_wait;
boolme_fw_write_wait;
+   boolcp_fw_write_wait;
struct amdgpu_ring  gfx_ring[AMDGPU_MAX_GFX_RINGS];
unsignednum_gfx_rings;
struct amdgpu_ring  compute_ring[AMDGPU_MAX_COMPUTE_RINGS];
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 17a5cbfd0024..814764723c26 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -561,6 +561,32 @@ static void gfx_v10_0_free_microcode(struct amdgpu_device 
*adev)
kfree(adev->gfx.rlc.register_list_format);
 }
 
+static void gfx_v10_0_check_fw_write_wait(struct amdgpu_device *adev)
+{
+   adev->gfx.cp_fw_write_wait = false;
+
+   switch (adev->asic_type) {
+   case CHIP_NAVI10:
+   case CHIP_NAVI12:
+   case CHIP_NAVI14:
+   if ((adev->gfx.me_fw_version >= 0x0046) &&
+   (adev->gfx.me_feature_version >= 27) &&
+   (adev->gfx.pfp_fw_version >= 0x0068) &&
+   (adev->gfx.pfp_feature_version >= 27) &&
+   (adev->gfx.mec_fw_version >= 0x005b) &&
+   (adev->gfx.mec_feature_version >= 27))
+   adev->gfx.cp_fw_write_wait = true;
+   break;
+   default:
+   break;
+   }
+
+   if (adev->gfx.cp_fw_write_wait == false)
+   DRM_WARN_ONCE("Warning: check cp_fw_version and update it to 
realize \
+ GRBM requires 1-cycle delay in cp 
firmware\n");
+}
+
+
 static void gfx_v10_0_init_rlc_ext_microcode(struct amdgpu_device *adev)
 {
const struct rlc_firmware_header_v2_1 *rlc_hdr;
@@ -4768,6 +4794,28 @@ static void gfx_v10_0_ring_emit_reg_wait(struct 
amdgpu_ring *ring, uint32_t reg,
gfx_v10_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
 }
 
+static void gfx_v10_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
+ uint32_t reg0, uint32_t reg1,
+ uint32_t ref, uint32_t mask)
+{
+   int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
+   struct amdgpu_device *adev = ring->adev;
+   bool fw_version_ok = false;
+
+   gfx_v10_0_check_fw_write_wait(adev);
+
+   if (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+   ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
+   fw_version_ok = adev->gfx.cp_fw_write_wait;
+
+   if (fw_version_ok)
+   gfx_v10_0_wait_reg_mem(ring, usepfp, 0, 1, reg0, reg1,
+ ref, mask, 0x20);
+   else
+   amdgpu_ring_emit_reg_write_reg_wait_helper(ring, reg0, reg1,
+  ref, mask);
+}
+
 static void
 gfx_v10_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
  uint32_t me, uint32_t pipe,
@@ -5158,6 +5206,7 @@ static const struct amdgpu_ring_funcs 
gfx_v10_0_ring_funcs_gfx = {
.emit_tmz = gfx_v10_0_ring_emit_tmz,
.emit_wreg = gfx_v10_0_ring_emit_wreg,
.emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
+   .emit_reg_write_reg_wait =