Re: [PATCH 1/6] drm/amdkfd: Update parameter type of pasid to uint16_t

2019-09-30 Thread Zhao, Yong
I will drop this one, because I found many other functions use 32 bits 
as well and it seems to be convenient.

Regards,

Yong

On 2019-09-30 11:54 a.m., Kuehling, Felix wrote:
> If you want to make this interface consistent, you should make the vmid
> parameter uint8_t at the same time. That said, you don't really save any
> resources, because 8-bit and 16-bit ints still consume 32-bits on the
> call stack.
>
> Regards,
>     Felix
>
> On 2019-09-27 11:41 p.m., Zhao, Yong wrote:
>> This is consistent with other code and registers in the code.
>>
>> Change-Id: I04dd12bdb465a43cfcd8936ed0f227a6546830e8
>> Signed-off-by: Yong Zhao 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 4 ++--
>>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 4 ++--
>>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 4 ++--
>>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 2 +-
>>drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 2 +-
>>drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++--
>>drivers/gpu/drm/amd/include/kgd_kfd_interface.h   | 2 +-
>>7 files changed, 11 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> index 122698f8dd1e..33cbf1d073d3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
>> @@ -59,7 +59,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
>> *kgd, uint32_t vmid,
>>  uint32_t sh_mem_config,
>>  uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
>>  uint32_t sh_mem_bases);
>> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
>> pasid,
>> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>>  unsigned int vmid);
>>static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
>>static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
>> @@ -232,7 +232,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
>> *kgd, uint32_t vmid,
>>  unlock_srbm(kgd);
>>}
>>
>> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
>> pasid,
>> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>>  unsigned int vmid)
>>{
>>  struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> index f77ddf7dba2b..0210d791dea1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
>> @@ -94,7 +94,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
>> *kgd, uint32_t vmid,
>>  uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
>>  uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
>>
>> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
>> pasid,
>> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>>  unsigned int vmid);
>>
>>static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
>> @@ -256,7 +256,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
>> *kgd, uint32_t vmid,
>>  unlock_srbm(kgd);
>>}
>>
>> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
>> pasid,
>> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>>  unsigned int vmid)
>>{
>>  struct amdgpu_device *adev = get_amdgpu_device(kgd);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
>> index 7478caf096ad..7a4c762e1209 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
>> @@ -52,7 +52,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
>> *kgd, uint32_t vmid,
>>  uint32_t sh_mem_config,
>>  uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
>>  uint32_t sh_mem_bases);
>> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
>> pasid,
>> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>>  unsigned int vmid);
>>static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
>>static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
>> @@ -210,7 +210,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
>> *kgd, uint32_t vmid,
>>  unlock_srbm(kgd);
>>}
>>
>> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
>> pasid,
>> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>> 

[PATCH] drm/amdkfd: Improve KFD IOCTL printing

2019-09-30 Thread Zhao, Yong
The code use hex define, so should the printing.

Change-Id: Ia7cc7690553bb043915b3d8c0157216c64421a60
Signed-off-by: Yong Zhao 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index c28ba0c1d7ac..d9e36dbf13d5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1840,7 +1840,7 @@ static long kfd_ioctl(struct file *filep, unsigned int 
cmd, unsigned long arg)
} else
goto err_i1;
 
-   dev_dbg(kfd_device, "ioctl cmd 0x%x (#%d), arg 0x%lx\n", cmd, nr, arg);
+   dev_dbg(kfd_device, "ioctl cmd 0x%x (#0x%x), arg 0x%lx\n", cmd, nr, 
arg);
 
process = kfd_get_process(current);
if (IS_ERR(process)) {
@@ -1895,7 +1895,8 @@ static long kfd_ioctl(struct file *filep, unsigned int 
cmd, unsigned long arg)
kfree(kdata);
 
if (retcode)
-   dev_dbg(kfd_device, "ret = %d\n", retcode);
+   dev_dbg(kfd_device, "ioctl cmd (#0x%x), arg 0x%lx, ret = %d\n",
+   nr, arg, retcode);
 
return retcode;
 }
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v2] drm/amdgpu: fix multiple memory leaks

2019-09-30 Thread Navid Emamdoost
On Thu, Sep 19, 2019 at 3:03 AM Koenig, Christian
 wrote:
>
> Am 18.09.19 um 21:05 schrieb Navid Emamdoost:
> > In acp_hw_init there are some allocations that needs to be released in
> > case of failure:
> >
> > 1- adev->acp.acp_genpd should be released if any allocation attemp for
> > adev->acp.acp_cell, adev->acp.acp_res or i2s_pdata fails.
> > 2- all of those allocations should be released if pm_genpd_add_device
> > fails.
> >
> > v2: moved the released into goto error handlings
> >
> > Signed-off-by: Navid Emamdoost 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c | 30 +
> >   1 file changed, 21 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c
> > index eba42c752bca..c0db75b71859 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c
> > @@ -184,7 +184,7 @@ static struct device *get_mfd_cell_dev(const char 
> > *device_name, int r)
> >*/
> >   static int acp_hw_init(void *handle)
> >   {
> > - int r, i;
> > + int r, i, ret;
> >   uint64_t acp_base;
> >   u32 val = 0;
> >   u32 count = 0;
> > @@ -231,20 +231,21 @@ static int acp_hw_init(void *handle)
> >   adev->acp.acp_cell = kcalloc(ACP_DEVS, sizeof(struct mfd_cell),
> >   GFP_KERNEL);
> >
> > - if (adev->acp.acp_cell == NULL)
> > - return -ENOMEM;
> > + if (adev->acp.acp_cell == NULL) {
> > + ret = -ENOMEM;
> > + goto out1;
> > + }
> >
> >   adev->acp.acp_res = kcalloc(5, sizeof(struct resource), GFP_KERNEL);
> >   if (adev->acp.acp_res == NULL) {
> > - kfree(adev->acp.acp_cell);
> > - return -ENOMEM;
> > + ret = -ENOMEM;
> > + goto out2;
> >   }
> >
> >   i2s_pdata = kcalloc(3, sizeof(struct i2s_platform_data), GFP_KERNEL);
> >   if (i2s_pdata == NULL) {
> > - kfree(adev->acp.acp_res);
> > - kfree(adev->acp.acp_cell);
> > - return -ENOMEM;
> > + ret = -ENOMEM;
> > + goto out3;
> >   }
> >
> >   switch (adev->asic_type) {
> > @@ -348,7 +349,8 @@ static int acp_hw_init(void *handle)
> >   r = pm_genpd_add_device(>acp.acp_genpd->gpd, dev);
> >   if (r) {
> >   dev_err(dev, "Failed to add dev to genpd\n");
> > - return r;
> > + ret = r;
> > + goto out4;
> >   }
> >   }
> >
> > @@ -393,6 +395,16 @@ static int acp_hw_init(void *handle)
> >   val &= ~ACP_SOFT_RESET__SoftResetAud_MASK;
> >   cgs_write_register(adev->acp.cgs_device, mmACP_SOFT_RESET, val);
> >   return 0;
> > +
> > +out4:
> > + kfree(i2s_pdata);
> > +out3:
> > + kfree(adev->acp.acp_res);
> > +out2:
> > + kfree(adev->acp.acp_cell);
> > +out1:
> > + kfree(adev->acp.acp_genpd);
>
> kfree on a NULL pointer is harmless, so a single error label should be
> sufficient.
>
I fixed this by just using failure label.

> Christian.
>
> > + return ret;
> >   }
> >
> >   /**
>

In addition to previous cases, I covered 3 more error handling cases
that seemed need to goto failure. One where mfd_add_hotplug_devices
fails and the other two cases where time out values expire.


-- 
Navid.


[PATCH v3] drm/amdgpu: fix multiple memory leaks in acp_hw_init

2019-09-30 Thread Navid Emamdoost
In acp_hw_init there are some allocations that needs to be released in
case of failure:

1- adev->acp.acp_genpd should be released if any allocation attemp for
adev->acp.acp_cell, adev->acp.acp_res or i2s_pdata fails.
2- all of those allocations should be released if
mfd_add_hotplug_devices or pm_genpd_add_device fail.
3- Release is needed in case of time out values expire.

Signed-off-by: Navid Emamdoost 
---
Changes in v2:
-- moved the releases under goto

Changes in v3:
-- fixed multiple goto issue
-- added goto for 3 other failure cases: one when
mfd_add_hotplug_devices fails, and two when time out values expires.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c | 41 -
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c
index eba42c752bca..7809745ec0f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acp.c
@@ -184,12 +184,12 @@ static struct device *get_mfd_cell_dev(const char 
*device_name, int r)
  */
 static int acp_hw_init(void *handle)
 {
-   int r, i;
+   int r, i, ret;
uint64_t acp_base;
u32 val = 0;
u32 count = 0;
struct device *dev;
-   struct i2s_platform_data *i2s_pdata;
+   struct i2s_platform_data *i2s_pdata = NULL;
 
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
@@ -231,20 +231,21 @@ static int acp_hw_init(void *handle)
adev->acp.acp_cell = kcalloc(ACP_DEVS, sizeof(struct mfd_cell),
GFP_KERNEL);
 
-   if (adev->acp.acp_cell == NULL)
-   return -ENOMEM;
+   if (adev->acp.acp_cell == NULL) {
+   ret = -ENOMEM;
+   goto failure;
+   }
 
adev->acp.acp_res = kcalloc(5, sizeof(struct resource), GFP_KERNEL);
if (adev->acp.acp_res == NULL) {
-   kfree(adev->acp.acp_cell);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto failure;
}
 
i2s_pdata = kcalloc(3, sizeof(struct i2s_platform_data), GFP_KERNEL);
if (i2s_pdata == NULL) {
-   kfree(adev->acp.acp_res);
-   kfree(adev->acp.acp_cell);
-   return -ENOMEM;
+   ret = -ENOMEM;
+   goto failure;
}
 
switch (adev->asic_type) {
@@ -340,15 +341,18 @@ static int acp_hw_init(void *handle)
 
r = mfd_add_hotplug_devices(adev->acp.parent, adev->acp.acp_cell,
ACP_DEVS);
-   if (r)
-   return r;
+   if (r) {
+   ret = r;
+   goto failure;
+   }
 
for (i = 0; i < ACP_DEVS ; i++) {
dev = get_mfd_cell_dev(adev->acp.acp_cell[i].name, i);
r = pm_genpd_add_device(>acp.acp_genpd->gpd, dev);
if (r) {
dev_err(dev, "Failed to add dev to genpd\n");
-   return r;
+   ret = r;
+   goto failure;
}
}
 
@@ -367,7 +371,8 @@ static int acp_hw_init(void *handle)
break;
if (--count == 0) {
dev_err(>pdev->dev, "Failed to reset ACP\n");
-   return -ETIMEDOUT;
+   ret = -ETIMEDOUT;
+   goto failure;
}
udelay(100);
}
@@ -384,7 +389,8 @@ static int acp_hw_init(void *handle)
break;
if (--count == 0) {
dev_err(>pdev->dev, "Failed to reset ACP\n");
-   return -ETIMEDOUT;
+   ret = -ETIMEDOUT;
+   goto failure;
}
udelay(100);
}
@@ -393,6 +399,13 @@ static int acp_hw_init(void *handle)
val &= ~ACP_SOFT_RESET__SoftResetAud_MASK;
cgs_write_register(adev->acp.cgs_device, mmACP_SOFT_RESET, val);
return 0;
+
+failure:
+   kfree(i2s_pdata);
+   kfree(adev->acp.acp_res);
+   kfree(adev->acp.acp_cell);
+   kfree(adev->acp.acp_genpd);
+   return ret;
 }
 
 /**
-- 
2.17.1



[PATCH] drm/amdgpu: Initialize variable before use

2019-09-30 Thread Siqueira, Rodrigo
The 'debug_data' variable gets printed in debug statements without a
prior initialization in the function
hubbub1_verify_allow_pstate_change_high, as reported when building with
gcc 9.1.0:

warning: ‘debug_data’ may be used uninitialized in this function 
[-Wmaybe-uninitialized]
  290 |  printk##once(KERN_##level "[" DRM_NAME "] " fmt, ##__VA_ARGS__)
  |  ^~
dc/dcn10/dcn10_hubbub.c:134:15: note: ‘debug_data’ was declared here
  134 |  unsigned int debug_data;

Note that initialize debug_data with 0, in this case, is safe because we
have a loop in a few lines below that will initialize this variable with
the proper value.

Signed-off-by: Rodrigo Siqueira 
---
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubbub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubbub.c 
b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubbub.c
index a780057e2dbc..b6967a7e6c7b 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubbub.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hubbub.c
@@ -133,7 +133,7 @@ bool hubbub1_verify_allow_pstate_change_high(
static unsigned int max_sampled_pstate_wait_us; /* data collection */
static bool forced_pstate_allow; /* help with revert wa */
 
-   unsigned int debug_data;
+   unsigned int debug_data = 0;
unsigned int i;
 
if (forced_pstate_allow) {
-- 
2.23.0


signature.asc
Description: PGP signature
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 5/6] drm/amdgpu: Add the HDP flush support for Navi

2019-09-30 Thread Zhao, Yong
Not much relationship between them, except that this functional change is to 
fix a IOCTL error printing.

Yong

From: Kuehling, Felix 
Sent: Monday, September 30, 2019 11:57 AM
To: Zhao, Yong ; amd-gfx@lists.freedesktop.org 

Subject: Re: [PATCH 5/6] drm/amdgpu: Add the HDP flush support for Navi

As far as I can tell, this is the only patch with functional changes in
the patch series. The rest are purely clean-up. Any relation I'm missing?

Anyway, patches 2,3,5 are

Reviewed-by: Felix Kuehling 

On 2019-09-27 11:41 p.m., Zhao, Yong wrote:
> The HDP flush support code was missing in the nbio and nv files.
>
> Change-Id: I046ff52567676b56bf16dc1728b02481233acb61
> Signed-off-by: Yong Zhao 
> ---
>   drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c | 16 +---
>   drivers/gpu/drm/amd/amdgpu/nv.c|  9 +
>   2 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c 
> b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
> index e7e36fb6113d..c699cbfe015a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
> @@ -27,11 +27,21 @@
>   #include "nbio/nbio_2_3_default.h"
>   #include "nbio/nbio_2_3_offset.h"
>   #include "nbio/nbio_2_3_sh_mask.h"
> +#include 
>
>   #define smnPCIE_CONFIG_CNTL 0x11180044
>   #define smnCPM_CONTROL  0x11180460
>   #define smnPCIE_CNTL2   0x11180070
>
> +
> +static void nbio_v2_3_remap_hdp_registers(struct amdgpu_device *adev)
> +{
> + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
> + adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL);
> + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL,
> + adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL);
> +}
> +
>   static u32 nbio_v2_3_get_rev_id(struct amdgpu_device *adev)
>   {
>u32 tmp = RREG32_SOC15(NBIO, 0, mmRCC_DEV0_EPF0_STRAP0);
> @@ -56,10 +66,9 @@ static void nbio_v2_3_hdp_flush(struct amdgpu_device *adev,
>struct amdgpu_ring *ring)
>   {
>if (!ring || !ring->funcs->emit_wreg)
> - WREG32_SOC15_NO_KIQ(NBIO, 0, 
> mmBIF_BX_PF_HDP_MEM_COHERENCY_FLUSH_CNTL, 0);
> + WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
>else
> - amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(
> - NBIO, 0, mmBIF_BX_PF_HDP_MEM_COHERENCY_FLUSH_CNTL), 0);
> + amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
>   }
>
>   static u32 nbio_v2_3_get_memsize(struct amdgpu_device *adev)
> @@ -330,4 +339,5 @@ const struct amdgpu_nbio_funcs nbio_v2_3_funcs = {
>.ih_control = nbio_v2_3_ih_control,
>.init_registers = nbio_v2_3_init_registers,
>.detect_hw_virt = nbio_v2_3_detect_hw_virt,
> + .remap_hdp_registers = nbio_v7_4_remap_hdp_registers,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
> index b3e7756fcc4b..6699a45b88ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
> @@ -587,8 +587,11 @@ static const struct amdgpu_asic_funcs nv_asic_funcs =
>
>   static int nv_common_early_init(void *handle)
>   {
> +#define MMIO_REG_HOLE_OFFSET (0x8 - PAGE_SIZE)
>struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>
> + adev->rmmio_remap.reg_offset = MMIO_REG_HOLE_OFFSET;
> + adev->rmmio_remap.bus_addr = adev->rmmio_base + MMIO_REG_HOLE_OFFSET;
>adev->smc_rreg = NULL;
>adev->smc_wreg = NULL;
>adev->pcie_rreg = _pcie_rreg;
> @@ -714,6 +717,12 @@ static int nv_common_hw_init(void *handle)
>nv_program_aspm(adev);
>/* setup nbio registers */
>adev->nbio.funcs->init_registers(adev);
> + /* remap HDP registers to a hole in mmio space,
> +  * for the purpose of expose those registers
> +  * to process space
> +  */
> + if (adev->nbio.funcs->remap_hdp_registers)
> + adev->nbio.funcs->remap_hdp_registers(adev);
>/* enable the doorbell aperture */
>nv_enable_doorbell_aperture(adev, true);
>
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 6/6] drm/amdkfd: Improve KFD IOCTL printing

2019-09-30 Thread Zhao, Yong
Okay, I will change dev_err back to dev_dbg. The hex printing is still very 
useful.

Yong

From: Kuehling, Felix 
Sent: Monday, September 30, 2019 11:47 AM
To: Zhao, Yong ; amd-gfx@lists.freedesktop.org 

Subject: Re: [PATCH 6/6] drm/amdkfd: Improve KFD IOCTL printing

On 2019-09-27 11:41 p.m., Zhao, Yong wrote:
> The code use hex define, so should the printing. Also, printf a message
> if there is a failure.
>
> Change-Id: Ia7cc7690553bb043915b3d8c0157216c64421a60
> Signed-off-by: Yong Zhao 
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +++--
>   1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index c28ba0c1d7ac..d1ab09c0f522 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1840,7 +1840,7 @@ static long kfd_ioctl(struct file *filep, unsigned int 
> cmd, unsigned long arg)
>} else
>goto err_i1;
>
> - dev_dbg(kfd_device, "ioctl cmd 0x%x (#%d), arg 0x%lx\n", cmd, nr, arg);
> + dev_dbg(kfd_device, "ioctl cmd 0x%x (#0x%x), arg 0x%lx\n", cmd, nr, 
> arg);
>
>process = kfd_get_process(current);
>if (IS_ERR(process)) {
> @@ -1895,7 +1895,8 @@ static long kfd_ioctl(struct file *filep, unsigned int 
> cmd, unsigned long arg)
>kfree(kdata);
>
>if (retcode)
> - dev_dbg(kfd_device, "ret = %d\n", retcode);
> + dev_err(kfd_device, "ioctl cmd (#0x%x), arg 0x%lx, ret = %d\n",
> + nr, arg, retcode);

NAK. We don't want to spam the kernel log with cryptic error messages
every time ioctl functions fail. Please leave this as a dev_dbg message.
Failing ioctl functions could be perfectly normal for a number of
reasons (system call interrupted by signal, running out of event slots,
timeouts on event waiting, etc). But every bug report will incorrectly
blame any unrelated problem on those messages if they happen to appear
in the kernel log.

Regards,
   Felix


>
>return retcode;
>   }
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 4/6] drm/amdkfd: Use array to probe kfd2kgd_calls

2019-09-30 Thread Zhao, Yong
Thanks. Will check that and fix the missing const.

Yong

From: Kuehling, Felix 
Sent: Monday, September 30, 2019 11:42 AM
To: Zhao, Yong ; amd-gfx@lists.freedesktop.org 

Subject: Re: [PATCH 4/6] drm/amdkfd: Use array to probe kfd2kgd_calls

On 2019-09-27 11:41 p.m., Zhao, Yong wrote:
> This is the same idea as the kfd device info probe and move all the
> probe control together for easy maintenance.
>
> Change-Id: I85c98bb08eb2a4a1a80c3b913c32691cc74602d1
> Signed-off-by: Yong Zhao 

Nice clean-up. See one comment inline.

Also, please check that this doesn't break the build if CONFIG_HSA_AMD
is undefined.

With that fixed and checked, this patch is

Reviewed-by: Felix Kuehling 


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 65 +--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  7 --
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  8 +--
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  7 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c |  7 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c |  7 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  7 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c   | 39 +--
>   8 files changed, 41 insertions(+), 106 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 92666b197f6c..8c531793fe17 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -63,47 +63,10 @@ void amdgpu_amdkfd_fini(void)
>
>   void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
>   {
> - const struct kfd2kgd_calls *kfd2kgd;
>bool vf = amdgpu_sriov_vf(adev);
>
> - switch (adev->asic_type) {
> -#ifdef CONFIG_DRM_AMDGPU_CIK
> - case CHIP_KAVERI:
> - case CHIP_HAWAII:
> - kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions();
> - break;
> -#endif
> - case CHIP_CARRIZO:
> - case CHIP_TONGA:
> - case CHIP_FIJI:
> - case CHIP_POLARIS10:
> - case CHIP_POLARIS11:
> - case CHIP_POLARIS12:
> - case CHIP_VEGAM:
> - kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
> - break;
> - case CHIP_VEGA10:
> - case CHIP_VEGA12:
> - case CHIP_VEGA20:
> - case CHIP_RAVEN:
> - case CHIP_RENOIR:
> - kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
> - break;
> - case CHIP_ARCTURUS:
> - kfd2kgd = amdgpu_amdkfd_arcturus_get_functions();
> - break;
> - case CHIP_NAVI10:
> - case CHIP_NAVI14:
> - case CHIP_NAVI12:
> - kfd2kgd = amdgpu_amdkfd_gfx_10_0_get_functions();
> - break;
> - default:
> - dev_info(adev->dev, "kfd not supported on this ASIC\n");
> - return;
> - }
> -
>adev->kfd.dev = kgd2kfd_probe((struct kgd_dev *)adev,
> -   adev->pdev, kfd2kgd, adev->asic_type, vf);
> +   adev->pdev, adev->asic_type, vf);
>
>if (adev->kfd.dev)
>amdgpu_amdkfd_total_mem_size += adev->gmc.real_vram_size;
> @@ -711,33 +674,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, 
> struct mm_struct *mm)
>return 0;
>   }
>
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
> -{
> - return NULL;
> -}
> -
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void)
> -{
> - return NULL;
> -}
> -
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void)
> -{
> - return NULL;
> -}
> -
> -struct kfd2kgd_calls *amdgpu_amdkfd_arcturus_get_functions(void)
> -{
> - return NULL;
> -}
> -
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_10_0_get_functions(void)
> -{
> - return NULL;
> -}
> -
>   struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev,
> -   const struct kfd2kgd_calls *f2g,
>  unsigned int asic_type, bool vf)
>   {
>return NULL;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 4eb2fb85de26..069d5d230810 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -137,12 +137,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
> kgd_engine_type engine,
>   void amdgpu_amdkfd_set_compute_idle(struct kgd_dev *kgd, bool idle);
>   bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd);
>
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
> -struct kfd2kgd_calls *amdgpu_amdkfd_arcturus_get_functions(void);
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_10_0_get_functions(void);
> -
>   bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
>
>   int 

Re: [PATCH 5/6] drm/amdgpu: Add the HDP flush support for Navi

2019-09-30 Thread Kuehling, Felix
As far as I can tell, this is the only patch with functional changes in 
the patch series. The rest are purely clean-up. Any relation I'm missing?

Anyway, patches 2,3,5 are

Reviewed-by: Felix Kuehling 

On 2019-09-27 11:41 p.m., Zhao, Yong wrote:
> The HDP flush support code was missing in the nbio and nv files.
>
> Change-Id: I046ff52567676b56bf16dc1728b02481233acb61
> Signed-off-by: Yong Zhao 
> ---
>   drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c | 16 +---
>   drivers/gpu/drm/amd/amdgpu/nv.c|  9 +
>   2 files changed, 22 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c 
> b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
> index e7e36fb6113d..c699cbfe015a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c
> @@ -27,11 +27,21 @@
>   #include "nbio/nbio_2_3_default.h"
>   #include "nbio/nbio_2_3_offset.h"
>   #include "nbio/nbio_2_3_sh_mask.h"
> +#include 
>   
>   #define smnPCIE_CONFIG_CNTL 0x11180044
>   #define smnCPM_CONTROL  0x11180460
>   #define smnPCIE_CNTL2   0x11180070
>   
> +
> +static void nbio_v2_3_remap_hdp_registers(struct amdgpu_device *adev)
> +{
> + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_MEM_FLUSH_CNTL,
> + adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL);
> + WREG32_SOC15(NBIO, 0, mmREMAP_HDP_REG_FLUSH_CNTL,
> + adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_REG_FLUSH_CNTL);
> +}
> +
>   static u32 nbio_v2_3_get_rev_id(struct amdgpu_device *adev)
>   {
>   u32 tmp = RREG32_SOC15(NBIO, 0, mmRCC_DEV0_EPF0_STRAP0);
> @@ -56,10 +66,9 @@ static void nbio_v2_3_hdp_flush(struct amdgpu_device *adev,
>   struct amdgpu_ring *ring)
>   {
>   if (!ring || !ring->funcs->emit_wreg)
> - WREG32_SOC15_NO_KIQ(NBIO, 0, 
> mmBIF_BX_PF_HDP_MEM_COHERENCY_FLUSH_CNTL, 0);
> + WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
>   else
> - amdgpu_ring_emit_wreg(ring, SOC15_REG_OFFSET(
> - NBIO, 0, mmBIF_BX_PF_HDP_MEM_COHERENCY_FLUSH_CNTL), 0);
> + amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + 
> KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0);
>   }
>   
>   static u32 nbio_v2_3_get_memsize(struct amdgpu_device *adev)
> @@ -330,4 +339,5 @@ const struct amdgpu_nbio_funcs nbio_v2_3_funcs = {
>   .ih_control = nbio_v2_3_ih_control,
>   .init_registers = nbio_v2_3_init_registers,
>   .detect_hw_virt = nbio_v2_3_detect_hw_virt,
> + .remap_hdp_registers = nbio_v7_4_remap_hdp_registers,
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
> index b3e7756fcc4b..6699a45b88ec 100644
> --- a/drivers/gpu/drm/amd/amdgpu/nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/nv.c
> @@ -587,8 +587,11 @@ static const struct amdgpu_asic_funcs nv_asic_funcs =
>   
>   static int nv_common_early_init(void *handle)
>   {
> +#define MMIO_REG_HOLE_OFFSET (0x8 - PAGE_SIZE)
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
> + adev->rmmio_remap.reg_offset = MMIO_REG_HOLE_OFFSET;
> + adev->rmmio_remap.bus_addr = adev->rmmio_base + MMIO_REG_HOLE_OFFSET;
>   adev->smc_rreg = NULL;
>   adev->smc_wreg = NULL;
>   adev->pcie_rreg = _pcie_rreg;
> @@ -714,6 +717,12 @@ static int nv_common_hw_init(void *handle)
>   nv_program_aspm(adev);
>   /* setup nbio registers */
>   adev->nbio.funcs->init_registers(adev);
> + /* remap HDP registers to a hole in mmio space,
> +  * for the purpose of expose those registers
> +  * to process space
> +  */
> + if (adev->nbio.funcs->remap_hdp_registers)
> + adev->nbio.funcs->remap_hdp_registers(adev);
>   /* enable the doorbell aperture */
>   nv_enable_doorbell_aperture(adev, true);
>   
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/6] drm/amdkfd: Update parameter type of pasid to uint16_t

2019-09-30 Thread Kuehling, Felix
If you want to make this interface consistent, you should make the vmid 
parameter uint8_t at the same time. That said, you don't really save any 
resources, because 8-bit and 16-bit ints still consume 32-bits on the 
call stack.

Regards,
   Felix

On 2019-09-27 11:41 p.m., Zhao, Yong wrote:
> This is consistent with other code and registers in the code.
>
> Change-Id: I04dd12bdb465a43cfcd8936ed0f227a6546830e8
> Signed-off-by: Yong Zhao 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c| 4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 2 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 2 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 4 ++--
>   drivers/gpu/drm/amd/include/kgd_kfd_interface.h   | 2 +-
>   7 files changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 122698f8dd1e..33cbf1d073d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -59,7 +59,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
> *kgd, uint32_t vmid,
>   uint32_t sh_mem_config,
>   uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
>   uint32_t sh_mem_bases);
> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
> pasid,
> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>   unsigned int vmid);
>   static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
>   static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
> @@ -232,7 +232,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
> *kgd, uint32_t vmid,
>   unlock_srbm(kgd);
>   }
>   
> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
> pasid,
> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>   unsigned int vmid)
>   {
>   struct amdgpu_device *adev = get_amdgpu_device(kgd);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> index f77ddf7dba2b..0210d791dea1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> @@ -94,7 +94,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
> *kgd, uint32_t vmid,
>   uint32_t sh_mem_config, uint32_t sh_mem_ape1_base,
>   uint32_t sh_mem_ape1_limit, uint32_t sh_mem_bases);
>   
> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
> pasid,
> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>   unsigned int vmid);
>   
>   static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
> @@ -256,7 +256,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
> *kgd, uint32_t vmid,
>   unlock_srbm(kgd);
>   }
>   
> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
> pasid,
> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>   unsigned int vmid)
>   {
>   struct amdgpu_device *adev = get_amdgpu_device(kgd);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> index 7478caf096ad..7a4c762e1209 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> @@ -52,7 +52,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
> *kgd, uint32_t vmid,
>   uint32_t sh_mem_config,
>   uint32_t sh_mem_ape1_base, uint32_t sh_mem_ape1_limit,
>   uint32_t sh_mem_bases);
> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
> pasid,
> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>   unsigned int vmid);
>   static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id);
>   static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
> @@ -210,7 +210,7 @@ static void kgd_program_sh_mem_settings(struct kgd_dev 
> *kgd, uint32_t vmid,
>   unlock_srbm(kgd);
>   }
>   
> -static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int 
> pasid,
> +static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, uint16_t pasid,
>   unsigned int vmid)
>   {
>   struct amdgpu_device *adev = get_amdgpu_device(kgd);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index 50f885576bbe..6be6061c5554 100644
> 

Re: [PATCH 6/6] drm/amdkfd: Improve KFD IOCTL printing

2019-09-30 Thread Kuehling, Felix
On 2019-09-27 11:41 p.m., Zhao, Yong wrote:
> The code use hex define, so should the printing. Also, printf a message
> if there is a failure.
>
> Change-Id: Ia7cc7690553bb043915b3d8c0157216c64421a60
> Signed-off-by: Yong Zhao 
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 5 +++--
>   1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index c28ba0c1d7ac..d1ab09c0f522 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -1840,7 +1840,7 @@ static long kfd_ioctl(struct file *filep, unsigned int 
> cmd, unsigned long arg)
>   } else
>   goto err_i1;
>   
> - dev_dbg(kfd_device, "ioctl cmd 0x%x (#%d), arg 0x%lx\n", cmd, nr, arg);
> + dev_dbg(kfd_device, "ioctl cmd 0x%x (#0x%x), arg 0x%lx\n", cmd, nr, 
> arg);
>   
>   process = kfd_get_process(current);
>   if (IS_ERR(process)) {
> @@ -1895,7 +1895,8 @@ static long kfd_ioctl(struct file *filep, unsigned int 
> cmd, unsigned long arg)
>   kfree(kdata);
>   
>   if (retcode)
> - dev_dbg(kfd_device, "ret = %d\n", retcode);
> + dev_err(kfd_device, "ioctl cmd (#0x%x), arg 0x%lx, ret = %d\n",
> + nr, arg, retcode);

NAK. We don't want to spam the kernel log with cryptic error messages 
every time ioctl functions fail. Please leave this as a dev_dbg message. 
Failing ioctl functions could be perfectly normal for a number of 
reasons (system call interrupted by signal, running out of event slots, 
timeouts on event waiting, etc). But every bug report will incorrectly 
blame any unrelated problem on those messages if they happen to appear 
in the kernel log.

Regards,
   Felix


>   
>   return retcode;
>   }
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 4/6] drm/amdkfd: Use array to probe kfd2kgd_calls

2019-09-30 Thread Kuehling, Felix
On 2019-09-27 11:41 p.m., Zhao, Yong wrote:
> This is the same idea as the kfd device info probe and move all the
> probe control together for easy maintenance.
>
> Change-Id: I85c98bb08eb2a4a1a80c3b913c32691cc74602d1
> Signed-off-by: Yong Zhao 

Nice clean-up. See one comment inline.

Also, please check that this doesn't break the build if CONFIG_HSA_AMD 
is undefined.

With that fixed and checked, this patch is

Reviewed-by: Felix Kuehling 


> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c| 65 +--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  7 --
>   .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c   |  8 +--
>   .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c|  7 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c |  7 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c |  7 +-
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  7 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c   | 39 +--
>   8 files changed, 41 insertions(+), 106 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 92666b197f6c..8c531793fe17 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -63,47 +63,10 @@ void amdgpu_amdkfd_fini(void)
>   
>   void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev)
>   {
> - const struct kfd2kgd_calls *kfd2kgd;
>   bool vf = amdgpu_sriov_vf(adev);
>   
> - switch (adev->asic_type) {
> -#ifdef CONFIG_DRM_AMDGPU_CIK
> - case CHIP_KAVERI:
> - case CHIP_HAWAII:
> - kfd2kgd = amdgpu_amdkfd_gfx_7_get_functions();
> - break;
> -#endif
> - case CHIP_CARRIZO:
> - case CHIP_TONGA:
> - case CHIP_FIJI:
> - case CHIP_POLARIS10:
> - case CHIP_POLARIS11:
> - case CHIP_POLARIS12:
> - case CHIP_VEGAM:
> - kfd2kgd = amdgpu_amdkfd_gfx_8_0_get_functions();
> - break;
> - case CHIP_VEGA10:
> - case CHIP_VEGA12:
> - case CHIP_VEGA20:
> - case CHIP_RAVEN:
> - case CHIP_RENOIR:
> - kfd2kgd = amdgpu_amdkfd_gfx_9_0_get_functions();
> - break;
> - case CHIP_ARCTURUS:
> - kfd2kgd = amdgpu_amdkfd_arcturus_get_functions();
> - break;
> - case CHIP_NAVI10:
> - case CHIP_NAVI14:
> - case CHIP_NAVI12:
> - kfd2kgd = amdgpu_amdkfd_gfx_10_0_get_functions();
> - break;
> - default:
> - dev_info(adev->dev, "kfd not supported on this ASIC\n");
> - return;
> - }
> -
>   adev->kfd.dev = kgd2kfd_probe((struct kgd_dev *)adev,
> -   adev->pdev, kfd2kgd, adev->asic_type, vf);
> +   adev->pdev, adev->asic_type, vf);
>   
>   if (adev->kfd.dev)
>   amdgpu_amdkfd_total_mem_size += adev->gmc.real_vram_size;
> @@ -711,33 +674,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, 
> struct mm_struct *mm)
>   return 0;
>   }
>   
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void)
> -{
> - return NULL;
> -}
> -
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void)
> -{
> - return NULL;
> -}
> -
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void)
> -{
> - return NULL;
> -}
> -
> -struct kfd2kgd_calls *amdgpu_amdkfd_arcturus_get_functions(void)
> -{
> - return NULL;
> -}
> -
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_10_0_get_functions(void)
> -{
> - return NULL;
> -}
> -
>   struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev,
> -   const struct kfd2kgd_calls *f2g,
> unsigned int asic_type, bool vf)
>   {
>   return NULL;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 4eb2fb85de26..069d5d230810 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -137,12 +137,6 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum 
> kgd_engine_type engine,
>   void amdgpu_amdkfd_set_compute_idle(struct kgd_dev *kgd, bool idle);
>   bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd);
>   
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_7_get_functions(void);
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_8_0_get_functions(void);
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_9_0_get_functions(void);
> -struct kfd2kgd_calls *amdgpu_amdkfd_arcturus_get_functions(void);
> -struct kfd2kgd_calls *amdgpu_amdkfd_gfx_10_0_get_functions(void);
> -
>   bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
>   
>   int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
> @@ -248,7 +242,6 @@ void amdgpu_amdkfd_unreserve_memory_limit(struct 
> amdgpu_bo *bo);
>   int kgd2kfd_init(void);
>   void kgd2kfd_exit(void);
>   struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev,
> -   

Re: [RFC PATCH] drm:- Add a modifier to denote 'protected' framebuffer

2019-09-30 Thread Ayan Halder
On Mon, Sep 30, 2019 at 09:51:35AM +, Brian Starkey wrote:
> Hi,
> 
> On Tue, Sep 17, 2019 at 07:36:45PM +0200, Daniel Vetter wrote:
> > On Tue, Sep 17, 2019 at 6:15 PM Neil Armstrong  
> > wrote:
> > >
> > > Hi,
> > >
> > > On 17/09/2019 18:07, Liviu Dudau wrote:
> > > > On Tue, Sep 17, 2019 at 02:53:01PM +0200, Daniel Vetter wrote:
> > > >> On Mon, Sep 09, 2019 at 01:42:53PM +, Ayan Halder wrote:
> > > >>> Add a modifier 'DRM_FORMAT_MOD_ARM_PROTECTED' which denotes that the 
> > > >>> framebuffer
> > > >>> is allocated in a protected system memory.
> > > >>> Essentially, we want to support EGL_EXT_protected_content in our 
> > > >>> komeda driver.
> > > >>>
> > > >>> Signed-off-by: Ayan Kumar Halder 
> > > >>>
> > > >>> /-- Note to reviewer
> > > >>> Komeda driver is capable of rendering DRM (Digital Rights Management) 
> > > >>> protected
> > > >>> content. The DRM content is stored in a framebuffer allocated in 
> > > >>> system memory
> > > >>> (which needs some special hardware signals for access).
> > > >>>
> > > >>> Let us ignore how the protected system memory is allocated and for 
> > > >>> the scope of
> > > >>> this discussion, we want to figure out the best way possible for the 
> > > >>> userspace
> > > >>> to communicate to the drm driver to turn the protected mode on (for 
> > > >>> accessing the
> > > >>> framebuffer with the DRM content) or off.
> > > >>>
> > > >>> The possible ways by which the userspace could achieve this is via:-
> > > >>>
> > > >>> 1. Modifiers :- This looks to me the best way by which the userspace 
> > > >>> can
> > > >>> communicate to the kernel to turn the protected mode on for the 
> > > >>> komeda driver
> > > >>> as it is going to access one of the protected framebuffers. The only 
> > > >>> problem is
> > > >>> that the current modifiers describe the tiling/compression format. 
> > > >>> However, it
> > > >>> does not hurt to extend the meaning of modifiers to denote other 
> > > >>> attributes of
> > > >>> the framebuffer as well.
> > > >>>
> > > >>> The other reason is that on Android, we get an info from Gralloc
> > > >>> (GRALLOC_USAGE_PROTECTED) which tells us that the buffer is 
> > > >>> protected. This can
> > > >>> be used to set up the modifier/s (AddFB2) during framebuffer creation.
> > > >>
> > > >> How does this mesh with other modifiers, like AFBC? That's where I see 
> > > >> the
> > > >> issue here.
> > > >
> > > > AFBC modifiers are currently under Arm's namespace, the thought behind 
> > > > the DRM
> > > > modifiers would be to have it as a "generic" modifier.
> > 
> > But if it's a generic flag, how do you combine that with other
> > modifiers? Like if you have a tiled buffer, but also encrypted? Or
> > afbc compressed, or whatever else. I'd expect for your hw encryption
> > is orthogonal to the buffer/tiling/compression format used?
> 
> This bit doesn't overlap with any of the other AFBC modifiers, so as
> you say it'd be orthogonal, and could be set on AFBC buffers (if we
> went that route).
> 
> > 
> > > >>> 2. Framebuffer flags :- As of today, this can be one of the two values
> > > >>> ie (DRM_MODE_FB_INTERLACED/DRM_MODE_FB_MODIFIERS). Unlike modifiers, 
> > > >>> the drm
> > > >>> framebuffer flags are generic to the drm subsystem and ideally we 
> > > >>> should not
> > > >>> introduce any driver specific constraint/feature.
> > > >>>
> > > >>> 3. Connector property:- I could see the following properties used for 
> > > >>> DRM
> > > >>> protected content:-
> > > >>> DRM_MODE_CONTENT_PROTECTION_DESIRED / ENABLED :- "This property is 
> > > >>> used by
> > > >>> userspace to request the kernel protect future content communicated 
> > > >>> over
> > > >>> the link". Clearly, we are not concerned with the protection 
> > > >>> attributes of the
> > > >>> transmitter. So, we cannot use this property for our case.
> > > >>>
> > > >>> 4. DRM plane property:- Again, we want to communicate that the 
> > > >>> framebuffer(which
> > > >>> can be attached to any plane) is protected. So introducing a new 
> > > >>> plane property
> > > >>> does not help.
> > > >>>
> > > >>> 5. DRM crtc property:- For the same reason as above, introducing a 
> > > >>> new crtc
> > > >>> property does not help.
> > > >>
> > > >> 6. Just track this as part of buffer allocation, i.e. I think it does
> > > >> matter how you allocate these protected buffers. We could add a "is
> > > >> protected buffer" flag at the dma_buf level for this.
> 
> I also like this approach. The protected-ness is a property of the
> allocation, so makes sense to store it with the allocation IMO.
> 
> > > >>
> > > >> So yeah for this stuff here I think we do want the full userspace side,
> > > >> from allocator to rendering something into this protected buffers (no 
> > > >> need
> > > >> to also have the entire "decode a protected bitstream part" imo, since
> > > >> that will freak people out). Unfortunately, in my experience, that 
> > > >> kills
> > > >> it for upstream :-/ 

Re: [PATCH 1/3] drm/amdgpu: recreate retired page's bo if vram get lost in gpu reset

2019-09-30 Thread Christian König

Am 30.09.19 um 11:06 schrieb Zhou1, Tao:

Hi Christian:

So the allocation information of a bo's vram page could be reserved after vram 
lost?


Yes, exactly. VRAM lost just means that we are returning an error code 
to userspace when they try to use a context created before VRAM was lost.



In fact, this series could be dropped if the result of 
amdgpu_bo_create_reserved is guaranteed to be failed after vram lost (protect 
bad pages from reallocating).


The BOs are not freed in any way, so I think Andrey meant something else.

Regards,
Christian.



Hi Andrey:

A bad page's allocation information will not be lost in gpu reset according to 
Christian's comments. Do you have any other concern?

Regards,
Tao


-Original Message-
From: Christian König 
Sent: 2019年9月30日 16:35
To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org;
Grodzovsky, Andrey ; Chen, Guchun
; Zhang, Hawking 
Subject: Re: [PATCH 1/3] drm/amdgpu: recreate retired page's bo if vram get
lost in gpu reset

Am 30.09.19 um 05:19 schrieb Zhou1, Tao:

the info of retired page's bo may be lost if vram lost is encountered
in gpu reset (gpu page table in vram may be lost), force to recreate
all bos

v2: simplify NULL pointer check
  add more comments

Repeating on the v2 of the patch set, this is complete nonsense.

When VRAM is lost the BOs and their reservation still exits, only the content
is lost because the MC is has been resetted.

Regards,
Christian.


Signed-off-by: Tao Zhou 
Suggested-by: Andrey Grodzovsky 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 48

++

   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
   3 files changed, 50 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62955156653c..a89f6d053b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3675,6 +3675,7 @@ static int amdgpu_do_asic_reset(struct

amdgpu_hive_info *hive,

if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU

reset!\n");

amdgpu_inc_vram_lost(tmp_adev);
+

amdgpu_ras_recovery_reset(tmp_adev);

}

r = amdgpu_gtt_mgr_recover(
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 486568ded6d6..3f2a2f13e4c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1573,6 +1573,54 @@ static int amdgpu_ras_recovery_fini(struct
amdgpu_device *adev)

return 0;
   }
+
+/*
+ * the info of retired page's bo may be lost if vram lost is
+encountered
+ * in gpu reset (gpu page table in vram may be lost), force to
+recreate
+ * all bos
+ */
+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   uint64_t bp;
+   struct amdgpu_bo *bo = NULL;
+   int i;
+
+   if (!con || !con->eh_data)
+   return;
+
+   /* Note: It's called in gpu reset, recovery_lock may be acquired

before

+* gpu reset. The following code is out of protect of con-
recovery_lock
+* in case of deadlock and bps_bo should be recreated (if successfully)
+* whether recovery_lock is locked or not.
+* We assume there is no other ras recovery operation simultaneous

during

+* gpu reset.
+*/
+   data = con->eh_data;
+   /* force to reserve all retired page again */
+   data->last_reserved = 0;
+
+   for (i = data->last_reserved; i < data->count; i++) {
+   bp = data->bps[i].retired_page;
+
+   /* There are three cases of reserve error should be ignored:
+* 1) a ras bad page has been allocated (used by someone);
+* 2) a ras bad page has been reserved (duplicate error

injection

+*for one page);
+* 3) bo info isn't lost in gpu reset
+*/
+   if (amdgpu_bo_create_kernel_at(adev, bp <<

AMDGPU_GPU_PAGE_SHIFT,

+  AMDGPU_GPU_PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  , NULL))
+   DRM_NOTE("RAS NOTE: recreate bo for retired page

0x%llx fail\n", bp);

+   else
+   data->bps_bo[i] = bo;
+   data->last_reserved = i + 1;
+   bo = NULL;
+   }
+}
   /* recovery end */

   /* return 0 if ras will reset gpu and repost.*/ diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd3428c98..7a606d3be806 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ 

RE: [PATCH 1/3] drm/amdgpu: recreate retired page's bo if vram get lost in gpu reset

2019-09-30 Thread Zhou1, Tao
Hi Christian:

So the allocation information of a bo's vram page could be reserved after vram 
lost?
In fact, this series could be dropped if the result of 
amdgpu_bo_create_reserved is guaranteed to be failed after vram lost (protect 
bad pages from reallocating).

Hi Andrey:

A bad page's allocation information will not be lost in gpu reset according to 
Christian's comments. Do you have any other concern?

Regards,
Tao

> -Original Message-
> From: Christian König 
> Sent: 2019年9月30日 16:35
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org;
> Grodzovsky, Andrey ; Chen, Guchun
> ; Zhang, Hawking 
> Subject: Re: [PATCH 1/3] drm/amdgpu: recreate retired page's bo if vram get
> lost in gpu reset
> 
> Am 30.09.19 um 05:19 schrieb Zhou1, Tao:
> > the info of retired page's bo may be lost if vram lost is encountered
> > in gpu reset (gpu page table in vram may be lost), force to recreate
> > all bos
> >
> > v2: simplify NULL pointer check
> >  add more comments
> 
> Repeating on the v2 of the patch set, this is complete nonsense.
> 
> When VRAM is lost the BOs and their reservation still exits, only the content
> is lost because the MC is has been resetted.
> 
> Regards,
> Christian.
> 
> >
> > Signed-off-by: Tao Zhou 
> > Suggested-by: Andrey Grodzovsky 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 48
> ++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
> >   3 files changed, 50 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 62955156653c..a89f6d053b3f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -3675,6 +3675,7 @@ static int amdgpu_do_asic_reset(struct
> amdgpu_hive_info *hive,
> > if (vram_lost) {
> > DRM_INFO("VRAM is lost due to GPU
> reset!\n");
> > amdgpu_inc_vram_lost(tmp_adev);
> > +
>   amdgpu_ras_recovery_reset(tmp_adev);
> > }
> >
> > r = amdgpu_gtt_mgr_recover(
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 486568ded6d6..3f2a2f13e4c6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -1573,6 +1573,54 @@ static int amdgpu_ras_recovery_fini(struct
> > amdgpu_device *adev)
> >
> > return 0;
> >   }
> > +
> > +/*
> > + * the info of retired page's bo may be lost if vram lost is
> > +encountered
> > + * in gpu reset (gpu page table in vram may be lost), force to
> > +recreate
> > + * all bos
> > + */
> > +void amdgpu_ras_recovery_reset(struct amdgpu_device *adev) {
> > +   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> > +   struct ras_err_handler_data *data;
> > +   uint64_t bp;
> > +   struct amdgpu_bo *bo = NULL;
> > +   int i;
> > +
> > +   if (!con || !con->eh_data)
> > +   return;
> > +
> > +   /* Note: It's called in gpu reset, recovery_lock may be acquired
> before
> > +* gpu reset. The following code is out of protect of con-
> >recovery_lock
> > +* in case of deadlock and bps_bo should be recreated (if successfully)
> > +* whether recovery_lock is locked or not.
> > +* We assume there is no other ras recovery operation simultaneous
> during
> > +* gpu reset.
> > +*/
> > +   data = con->eh_data;
> > +   /* force to reserve all retired page again */
> > +   data->last_reserved = 0;
> > +
> > +   for (i = data->last_reserved; i < data->count; i++) {
> > +   bp = data->bps[i].retired_page;
> > +
> > +   /* There are three cases of reserve error should be ignored:
> > +* 1) a ras bad page has been allocated (used by someone);
> > +* 2) a ras bad page has been reserved (duplicate error
> injection
> > +*for one page);
> > +* 3) bo info isn't lost in gpu reset
> > +*/
> > +   if (amdgpu_bo_create_kernel_at(adev, bp <<
> AMDGPU_GPU_PAGE_SHIFT,
> > +  AMDGPU_GPU_PAGE_SIZE,
> > +  AMDGPU_GEM_DOMAIN_VRAM,
> > +  , NULL))
> > +   DRM_NOTE("RAS NOTE: recreate bo for retired page
> 0x%llx fail\n", bp);
> > +   else
> > +   data->bps_bo[i] = bo;
> > +   data->last_reserved = i + 1;
> > +   bo = NULL;
> > +   }
> > +}
> >   /* recovery end */
> >
> >   /* return 0 if ras will reset gpu and repost.*/ diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index f80fd3428c98..7a606d3be806 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ 

Re: [PATCH 1/3] drm/amdgpu: recreate retired page's bo if vram get lost in gpu reset

2019-09-30 Thread Christian König

Am 30.09.19 um 05:19 schrieb Zhou1, Tao:

the info of retired page's bo may be lost if vram lost is encountered
in gpu reset (gpu page table in vram may be lost), force to recreate
all bos

v2: simplify NULL pointer check
 add more comments


Repeating on the v2 of the patch set, this is complete nonsense.

When VRAM is lost the BOs and their reservation still exits, only the 
content is lost because the MC is has been resetted.


Regards,
Christian.



Signed-off-by: Tao Zhou 
Suggested-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 48 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
  3 files changed, 50 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62955156653c..a89f6d053b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3675,6 +3675,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU 
reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
+   amdgpu_ras_recovery_reset(tmp_adev);
}
  
  r = amdgpu_gtt_mgr_recover(

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 486568ded6d6..3f2a2f13e4c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1573,6 +1573,54 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
  
  	return 0;

  }
+
+/*
+ * the info of retired page's bo may be lost if vram lost is encountered
+ * in gpu reset (gpu page table in vram may be lost), force to recreate
+ * all bos
+ */
+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   uint64_t bp;
+   struct amdgpu_bo *bo = NULL;
+   int i;
+
+   if (!con || !con->eh_data)
+   return;
+
+   /* Note: It's called in gpu reset, recovery_lock may be acquired before
+* gpu reset. The following code is out of protect of con->recovery_lock
+* in case of deadlock and bps_bo should be recreated (if successfully)
+* whether recovery_lock is locked or not.
+* We assume there is no other ras recovery operation simultaneous 
during
+* gpu reset.
+*/
+   data = con->eh_data;
+   /* force to reserve all retired page again */
+   data->last_reserved = 0;
+
+   for (i = data->last_reserved; i < data->count; i++) {
+   bp = data->bps[i].retired_page;
+
+   /* There are three cases of reserve error should be ignored:
+* 1) a ras bad page has been allocated (used by someone);
+* 2) a ras bad page has been reserved (duplicate error 
injection
+*for one page);
+* 3) bo info isn't lost in gpu reset
+*/
+   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
+  AMDGPU_GPU_PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  , NULL))
+   DRM_NOTE("RAS NOTE: recreate bo for retired page 0x%llx 
fail\n", bp);
+   else
+   data->bps_bo[i] = bo;
+   data->last_reserved = i + 1;
+   bo = NULL;
+   }
+}
  /* recovery end */
  
  /* return 0 if ras will reset gpu and repost.*/

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd3428c98..7a606d3be806 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -479,6 +479,7 @@ static inline int amdgpu_ras_is_supported(struct 
amdgpu_device *adev,
  }
  
  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);

+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev);
  int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
unsigned int block);
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v4 10/11] drm/amdgpu: job is secure iff CS is secure (v5)

2019-09-30 Thread Christian König

Am 29.09.19 um 09:40 schrieb Huang, Ray:

Mark a job as secure, if and only if the command
submission flag has the secure flag set.

v2: fix the null job pointer while in vmid 0
submission.
v3: Context --> Command submission.
v4: filling cs parser with cs->in.flags
v5: move the job secure flag setting out of amdgpu_cs_submit()

Signed-off-by: Huang Rui 
Co-developed-by: Luben Tuikov 
Signed-off-by: Luben Tuikov 
Reviewed-by: Alex Deucher 


At some point we need to cleanup struct amdgpu_job, but for now that is 
perfectly ok.


Patch is Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c  | 4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
  3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 49b767b..c18a153 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -231,6 +231,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser 
*p, union drm_amdgpu_cs
if (ret)
goto free_all_kdata;
  
+	p->job->secure = cs->in.flags & AMDGPU_CS_FLAGS_SECURE;

+
if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
ret = -ECANCELED;
goto free_all_kdata;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 9a6dbf3..6e0f97a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -213,7 +213,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned 
num_ibs,
if (job && ring->funcs->emit_cntxcntl) {
status |= job->preamble_status;
status |= job->preemption_status;
-   amdgpu_ring_emit_cntxcntl(ring, status, false);
+   amdgpu_ring_emit_cntxcntl(ring, status, job->secure);
}
  
  	for (i = 0; i < num_ibs; ++i) {

@@ -232,7 +232,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned 
num_ibs,
}
  
  	if (ring->funcs->emit_tmz)

-   amdgpu_ring_emit_tmz(ring, false, false);
+   amdgpu_ring_emit_tmz(ring, false, job ? job->secure : false);
  
  #ifdef CONFIG_X86_64

if (!(adev->flags & AMD_IS_APU))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index dc7ee93..aa0e375 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -63,6 +63,8 @@ struct amdgpu_job {
uint64_tuf_addr;
uint64_tuf_sequence;
  
+	/* the job is due to a secure command submission */

+   boolsecure;
  };
  
  int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdgpu: recreate retired page's bo if vram get lost in gpu reset

2019-09-30 Thread Christian König

Am 29.09.19 um 08:35 schrieb Zhou1, Tao:

the info of retired page's bo may be lost if vram lost is encountered
in gpu reset (gpu page table in vram may be lost), force to recreate
all bos


NAK, that is complete nonsense.

When VRAM is lost the content of BOs in VRAM is lost, but the BOs itself 
are still there.


Regards,
Christian.



Signed-off-by: Tao Zhou 
Suggested-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 44 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
  3 files changed, 46 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62955156653c..a89f6d053b3f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3675,6 +3675,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU 
reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
+   amdgpu_ras_recovery_reset(tmp_adev);
}
  
  r = amdgpu_gtt_mgr_recover(

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 486568ded6d6..1b3b597aa973 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1573,6 +1573,50 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
  
  	return 0;

  }
+
+/*
+ * the info of retired page's bo may be lost if vram lost is encountered
+ * in gpu reset (gpu page table in vram may be lost), force to recreate
+ * all bos
+ */
+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   uint64_t bp;
+   struct amdgpu_bo *bo = NULL;
+   int i;
+
+   if (!con)
+   return;
+
+   data = con->eh_data;
+   if (!data)
+   return;
+
+   /* force to reserve all retired page again */
+   data->last_reserved = 0;
+
+   for (i = data->last_reserved; i < data->count; i++) {
+   bp = data->bps[i].retired_page;
+
+   /* There are three cases of reserve error should be ignored:
+* 1) a ras bad page has been allocated (used by someone);
+* 2) a ras bad page has been reserved (duplicate error 
injection
+*for one page);
+* 3) bo info isn't lost in gpu reset
+*/
+   if (amdgpu_bo_create_kernel_at(adev, bp << 
AMDGPU_GPU_PAGE_SHIFT,
+  AMDGPU_GPU_PAGE_SIZE,
+  AMDGPU_GEM_DOMAIN_VRAM,
+  , NULL))
+   DRM_NOTE("RAS NOTE: recreate bo for retired page 0x%llx 
fail\n", bp);
+   else
+   data->bps_bo[i] = bo;
+   data->last_reserved = i + 1;
+   bo = NULL;
+   }
+}
  /* recovery end */
  
  /* return 0 if ras will reset gpu and repost.*/

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd3428c98..7a606d3be806 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -479,6 +479,7 @@ static inline int amdgpu_ras_is_supported(struct 
amdgpu_device *adev,
  }
  
  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);

+void amdgpu_ras_recovery_reset(struct amdgpu_device *adev);
  int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
unsigned int block);
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH] drm/amdgpu: avoid ras error injection for retired page

2019-09-30 Thread Zhou1, Tao


> -Original Message-
> From: Chen, Guchun 
> Sent: 2019年9月30日 15:14
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org;
> Zhang, Hawking 
> Subject: RE: [PATCH] drm/amdgpu: avoid ras error injection for retired page
> 
> 
> 
> 
> Regards,
> Guchun
> 
> -Original Message-
> From: Zhou1, Tao 
> Sent: Monday, September 30, 2019 2:58 PM
> To: amd-gfx@lists.freedesktop.org; Chen, Guchun
> ; Zhang, Hawking 
> Cc: Zhou1, Tao 
> Subject: [PATCH] drm/amdgpu: avoid ras error injection for retired page
> 
> check whether a page is bad page before error injection
> 
> Signed-off-by: Tao Zhou 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38
> +
>  1 file changed, 38 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index fe3a57e567c8..d50e565b0b20 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -71,6 +71,9 @@ const char *ras_block_string[] = {
> 
>  atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
> 
> +static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
> + uint64_t addr);
> +
>  static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
>   size_t size, loff_t *pos)
>  {
> @@ -290,6 +293,13 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct
> file *f, const char __user *
>   break;
>   }
> 
> + /* ce/ue error injection for a bad page is not allowed */
> + if (amdgpu_ras_check_bad_page(adev, data.inject.address))
> {
> + DRM_WARN("DRM WARN: 0x%llx has been marked
> as bad before error injection!\n",
> + data.inject.address);
> + break;
> + }
> +
>   /* data.inject.address is offset instead of absolute gpu
> address */
>   ret = amdgpu_ras_error_inject(adev, );
>   break;
> @@ -1430,6 +1440,34 @@ static int amdgpu_ras_load_bad_pages(struct
> amdgpu_device *adev)
>   return ret;
>  }
> 
> +/* check if an address belongs to bad page */ static bool
> +amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
> + uint64_t addr)
> +{
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_err_handler_data *data;
> + int i, ret = false;
> [Guchun]It's better to use bool type for the ret variable, to keep consistent
> with function return type?
> Apart from that, this patch is: Reviewed-by: Guchun Chen
> 

[Tao] Thanks, I'll correct it before submit.

> 
> +
> + if (!con || !con->eh_data)
> + return ret;
> +
> + mutex_lock(>recovery_lock);
> + data = con->eh_data;
> + if (!data)
> + goto out;
> +
> + addr >>= AMDGPU_GPU_PAGE_SHIFT;
> + for (i = 0; i < data->count; i++)
> + if (addr == data->bps[i].retired_page) {
> + ret = true;
> + goto out;
> + }
> +
> +out:
> + mutex_unlock(>recovery_lock);
> + return ret;
> +}
> +
>  static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
> {
>   /* Note: the caller should guarantee con and data are not NULL */
> --
> 2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH] drm/amdgpu: avoid ras error injection for retired page

2019-09-30 Thread Chen, Guchun



Regards,
Guchun

-Original Message-
From: Zhou1, Tao  
Sent: Monday, September 30, 2019 2:58 PM
To: amd-gfx@lists.freedesktop.org; Chen, Guchun ; Zhang, 
Hawking 
Cc: Zhou1, Tao 
Subject: [PATCH] drm/amdgpu: avoid ras error injection for retired page

check whether a page is bad page before error injection

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fe3a57e567c8..d50e565b0b20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -71,6 +71,9 @@ const char *ras_block_string[] = {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+   uint64_t addr);
+
 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
 {
@@ -290,6 +293,13 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
break;
}
 
+   /* ce/ue error injection for a bad page is not allowed */
+   if (amdgpu_ras_check_bad_page(adev, data.inject.address)) {
+   DRM_WARN("DRM WARN: 0x%llx has been marked as bad 
before error injection!\n",
+   data.inject.address);
+   break;
+   }
+
/* data.inject.address is offset instead of absolute gpu 
address */
ret = amdgpu_ras_error_inject(adev, );
break;
@@ -1430,6 +1440,34 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
return ret;
 }
 
+/* check if an address belongs to bad page */ static bool 
+amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+   uint64_t addr)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   int i, ret = false;
[Guchun]It's better to use bool type for the ret variable, to keep consistent 
with function return type?
Apart from that, this patch is: Reviewed-by: Guchun Chen 

+
+   if (!con || !con->eh_data)
+   return ret;
+
+   mutex_lock(>recovery_lock);
+   data = con->eh_data;
+   if (!data)
+   goto out;
+
+   addr >>= AMDGPU_GPU_PAGE_SHIFT;
+   for (i = 0; i < data->count; i++)
+   if (addr == data->bps[i].retired_page) {
+   ret = true;
+   goto out;
+   }
+
+out:
+   mutex_unlock(>recovery_lock);
+   return ret;
+}
+
 static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)  {
/* Note: the caller should guarantee con and data are not NULL */
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] gpu: drm: amd: amdgpu: Remove call to memset after dma_alloc_coherent

2019-09-30 Thread Saurav Girepunje
dma_alloc_coherent has already zeroed the memory.
So memset is not needed.

Signed-off-by: Saurav Girepunje 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
index 6d8f05511aba..111a301ce878 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
@@ -66,7 +66,6 @@ int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct 
amdgpu_ih_ring *ih,
if (ih->ring == NULL)
return -ENOMEM;
 
-   memset((void *)ih->ring, 0, ih->ring_size + 8);
ih->gpu_addr = dma_addr;
ih->wptr_addr = dma_addr + ih->ring_size;
ih->wptr_cpu = >ring[ih->ring_size / 4];
-- 
2.20.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amdgpu: avoid ras error injection for retired page

2019-09-30 Thread Zhou1, Tao
check whether a page is bad page before error injection

Signed-off-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 +
 1 file changed, 38 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fe3a57e567c8..d50e565b0b20 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -71,6 +71,9 @@ const char *ras_block_string[] = {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+   uint64_t addr);
+
 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos)
 {
@@ -290,6 +293,13 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
break;
}
 
+   /* ce/ue error injection for a bad page is not allowed */
+   if (amdgpu_ras_check_bad_page(adev, data.inject.address)) {
+   DRM_WARN("DRM WARN: 0x%llx has been marked as bad 
before error injection!\n",
+   data.inject.address);
+   break;
+   }
+
/* data.inject.address is offset instead of absolute gpu 
address */
ret = amdgpu_ras_error_inject(adev, );
break;
@@ -1430,6 +1440,34 @@ static int amdgpu_ras_load_bad_pages(struct 
amdgpu_device *adev)
return ret;
 }
 
+/* check if an address belongs to bad page */
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+   uint64_t addr)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_err_handler_data *data;
+   int i, ret = false;
+
+   if (!con || !con->eh_data)
+   return ret;
+
+   mutex_lock(>recovery_lock);
+   data = con->eh_data;
+   if (!data)
+   goto out;
+
+   addr >>= AMDGPU_GPU_PAGE_SHIFT;
+   for (i = 0; i < data->count; i++)
+   if (addr == data->bps[i].retired_page) {
+   ret = true;
+   goto out;
+   }
+
+out:
+   mutex_unlock(>recovery_lock);
+   return ret;
+}
+
 static void amdgpu_ras_create_bad_pages_bo(struct amdgpu_device *adev)
 {
/* Note: the caller should guarantee con and data are not NULL */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx