Re: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V4

2019-04-29 Thread Alex Deucher
On Mon, Apr 29, 2019 at 11:16 PM Evan Quan  wrote:
>
> Every ring type can have its own timeout setting.
>
>  - V2: update lockup_timeout parameter format and cosmetic fixes
>  - V3: invalidate 0 and negative values
>  - V4: update lockup_timeout parameter format
>
> Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
> Signed-off-by: Evan Quan 

Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 79 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 --
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
>  5 files changed, 121 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f6965b9403eb..c9b44b8c1969 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;
>  extern int amdgpu_hw_i2c;
>  extern int amdgpu_pcie_gen2;
>  extern int amdgpu_msi;
> -extern int amdgpu_lockup_timeout;
>  extern int amdgpu_dpm;
>  extern int amdgpu_fw_load_type;
>  extern int amdgpu_aspm;
> @@ -428,6 +427,7 @@ struct amdgpu_fpriv {
>  };
>
>  int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
> +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev);
>
>  int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   unsigned size, struct amdgpu_ib *ib);
> @@ -1001,6 +1001,11 @@ struct amdgpu_device {
> struct work_struct  xgmi_reset_work;
>
> boolin_baco_reset;
> +
> +   longgfx_timeout;
> +   longsdma_timeout;
> +   longvideo_timeout;
> +   longcompute_timeout;
>  };
>
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device 
> *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 80bf604019b1..b11af38a0238 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -912,8 +912,10 @@ static void 
> amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
>   * Validates certain module parameters and updates
>   * the associated values used by the driver (all asics).
>   */
> -static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
> +static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
>  {
> +   int ret = 0;
> +
> if (amdgpu_sched_jobs < 4) {
> dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
>  amdgpu_sched_jobs);
> @@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct 
> amdgpu_device *adev)
> amdgpu_vram_page_split = 1024;
> }
>
> -   if (amdgpu_lockup_timeout == 0) {
> -   dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 
> 1\n");
> -   amdgpu_lockup_timeout = 1;
> +   ret = amdgpu_device_get_job_timeout_settings(adev);
> +   if (ret) {
> +   dev_err(adev->dev, "invalid lockup_timeout parameter 
> syntax\n");
> +   return ret;
> }
>
> adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, 
> amdgpu_fw_load_type);
> amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96);
> +
> +   return ret;
>  }
>
>  /**
> @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> mutex_init(&adev->lock_reset);
> mutex_init(&adev->virt.dpm_mutex);
>
> -   amdgpu_device_check_arguments(adev);
> +   r = amdgpu_device_check_arguments(adev);
> +   if (r)
> +   return r;
>
> spin_lock_init(&adev->mmio_idx_lock);
> spin_lock_init(&adev->smc_idx_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 71df27cd03de..609c7af8a3f0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -83,6 +83,8 @@
>
>  #define AMDGPU_VERSION "19.10.9.418"
>
> +#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256
> +
>  int amdgpu_vram_limit = 0;
>  int amdgpu_vis_vram_limit = 0;
>  int amdgpu_gart_size = -1; /* auto */
> @@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0;
>  int amdgpu_hw_i2c = 0;
>  int amdgpu_pcie_gen2 = -1;
>  int amdgpu_msi = -1;
> -int amdgpu_lockup_timeout = 1;
> +char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH];
>  int amdgpu_dpm = -1;
>  int amdgpu_fw_load_type = -1;
>  int amdgpu_aspm = -1;
> @@ -232,12 +234,21 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = 
> disable, -1 = auto)");
>  module_param_named(msi, amdgpu_msi, int, 0444);
>
>  /**
> - * DOC: lockup_timeout (int)
> - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be adjusted to 1.
> - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default is 1.
> + * DOC: lockup_timeout (string)
> + * Set GPU scheduler timeout value in ms.
> + *
> + * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video].

RE: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V3

2019-04-29 Thread Quan, Evan
Fixed in V4.

Regards,
Evan
> -----Original Message-----
> From: Alex Deucher 
> Sent: April 30, 2019 9:34
> To: Quan, Evan 
> Cc: amd-gfx list ; Deucher, Alexander
> ; Lou, Wentao ;
> Daenzer, Michel ; Koenig, Christian
> 
> Subject: Re: [PATCH] drm/amdgpu: enable separate timeout setting for
> every ring type V3
> 
> On Mon, Apr 29, 2019 at 9:29 PM Quan, Evan  wrote:
> >
> > For the old design, lockup_timeout affects non-compute IPs only (e.g.
> > amdgpu.lockup_timeout=1000). Compute jobs always use
> > MAX_SCHEDULE_TIMEOUT.
> > For the new design, I would like it to be backward compatible. With the
> > [Non-Compute]:[Compute] format, the "amdgpu.lockup_timeout=1000"
> > setting has the same effect as the old design.
> >
> > > * The format can be [Global] or [GFX,Compute,SDMA,Video].
> > I think using this format will break backward compatibility, since the
> > "amdgpu.lockup_timeout=1000" setting would affect all IPs (both
> > non-compute and compute).
> 
> Ah, ok.  I forgot we always used a different timeout for compute.  In that
> case, how about
> * The format can be [non-Compute] or [GFX,Compute,SDMA,Video].
> 
> Alex
> 
> >
> > Regards,
> > Evan
> > > -----Original Message-----
> > > From: Alex Deucher 
> > > Sent: April 29, 2019 22:27
> > > To: Quan, Evan 
> > > Cc: amd-gfx list ; Deucher, Alexander
> > > ; Lou, Wentao
> ;
> > > Daenzer, Michel ; Koenig, Christian
> > > 
> > > Subject: Re: [PATCH] drm/amdgpu: enable separate timeout setting for
> > > every ring type V3
> > >
> > > On Mon, Apr 29, 2019 at 9:51 AM Evan Quan 
> wrote:
> > > >
> > > > Every ring type can have its own timeout setting.
> > > >
> > > >  - V2: update lockup_timeout parameter format and cosmetic fixes
> > > >  - V3: invalidate 0 and negative values
> > > >
> > > > Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
> > > > Signed-off-by: Evan Quan 
> > > > ---
> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++--
> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 90
> > > --
> > > >  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 +++--
> > > >  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
> > > >  5 files changed, 132 insertions(+), 19 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > index f6965b9403eb..c9b44b8c1969 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > > @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;  extern int
> > > > amdgpu_hw_i2c;  extern int amdgpu_pcie_gen2;  extern int
> > > > amdgpu_msi; -extern int amdgpu_lockup_timeout;  extern int
> > > > amdgpu_dpm;  extern int amdgpu_fw_load_type;  extern int
> > > > amdgpu_aspm; @@ -428,6 +427,7 @@ struct amdgpu_fpriv {  };
> > > >
> > > >  int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv
> > > > **fpriv);
> > > > +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device
> > > > +*adev);
> > > >
> > > >  int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm
> *vm,
> > > >   unsigned size, struct amdgpu_ib *ib); @@ -1001,6
> > > > +1001,11 @@ struct amdgpu_device {
> > > > struct work_struct  xgmi_reset_work;
> > > >
> > > > boolin_baco_reset;
> > > > +
> > > > +   longgfx_timeout;
> > > > +   longsdma_timeout;
> > > > +   longvideo_timeout;
> > > > +   longcompute_timeout;
> > > >  };
> > > >
> > > >  static inline struct amdgpu_device *amdgpu_ttm_adev(struct
> > > > ttm_bo_device *bdev) diff --git
> > > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > > index 80bf604019b1..b11af38a0238 100644
> > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > > @@ -912,8 +912,10 @@ static void
> > > amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device
> *adev)
> > > >   * Validates certain module parameters and updates
> > > >   * the associated values used by the driver (all asics).
> > > >   */
> > > > -static void amdgpu_device_check_arguments(struct amdgpu_device
> > > *adev)
> > > > +static int amdgpu_device_check_arguments(struct amdgpu_device
> > > > +*adev)
> > > >  {
> > > > +   int ret = 0;
> > > > +
> > > > if (amdgpu_sched_jobs < 4) {
> > > > dev_warn(adev->dev, "sched jobs (%d) must be at least 
> > > > 4\n",
> > > >  amdgpu_sched_jobs); @@ -958,13 +960,16 @@
> > > > static void amdgpu_device_check_arguments(struct amdgpu_device
> > > *adev)
> > > > amdgpu_vram_page_split = 1024;
> > > > }
> > > >
> > > > -   if (amdgpu_lockup_timeout == 0) {
> > > > -

[PATCH] drm/amdgpu: enable separate timeout setting for every ring type V4

2019-04-29 Thread Evan Quan
Every ring type can have its own timeout setting.

 - V2: update lockup_timeout parameter format and cosmetic fixes
 - V3: invalidate 0 and negative values
 - V4: update lockup_timeout parameter format

Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 79 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 --
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
 5 files changed, 121 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f6965b9403eb..c9b44b8c1969 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;
 extern int amdgpu_hw_i2c;
 extern int amdgpu_pcie_gen2;
 extern int amdgpu_msi;
-extern int amdgpu_lockup_timeout;
 extern int amdgpu_dpm;
 extern int amdgpu_fw_load_type;
 extern int amdgpu_aspm;
@@ -428,6 +427,7 @@ struct amdgpu_fpriv {
 };
 
 int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
+int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev);
 
 int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
  unsigned size, struct amdgpu_ib *ib);
@@ -1001,6 +1001,11 @@ struct amdgpu_device {
struct work_struct  xgmi_reset_work;
 
boolin_baco_reset;
+
+   longgfx_timeout;
+   longsdma_timeout;
+   longvideo_timeout;
+   longcompute_timeout;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 80bf604019b1..b11af38a0238 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -912,8 +912,10 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct 
amdgpu_device *adev)
  * Validates certain module parameters and updates
  * the associated values used by the driver (all asics).
  */
-static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
+static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 {
+   int ret = 0;
+
if (amdgpu_sched_jobs < 4) {
dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
 amdgpu_sched_jobs);
@@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct 
amdgpu_device *adev)
amdgpu_vram_page_split = 1024;
}
 
-   if (amdgpu_lockup_timeout == 0) {
-   dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 
1\n");
-   amdgpu_lockup_timeout = 1;
+   ret = amdgpu_device_get_job_timeout_settings(adev);
+   if (ret) {
+   dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
+   return ret;
}
 
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, 
amdgpu_fw_load_type);
amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96);
+
+   return ret;
 }
 
 /**
@@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
	mutex_init(&adev->lock_reset);
	mutex_init(&adev->virt.dpm_mutex);
 
-   amdgpu_device_check_arguments(adev);
+   r = amdgpu_device_check_arguments(adev);
+   if (r)
+   return r;
 
	spin_lock_init(&adev->mmio_idx_lock);
	spin_lock_init(&adev->smc_idx_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 71df27cd03de..609c7af8a3f0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -83,6 +83,8 @@
 
 #define AMDGPU_VERSION "19.10.9.418"
 
+#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256
+
 int amdgpu_vram_limit = 0;
 int amdgpu_vis_vram_limit = 0;
 int amdgpu_gart_size = -1; /* auto */
@@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0;
 int amdgpu_hw_i2c = 0;
 int amdgpu_pcie_gen2 = -1;
 int amdgpu_msi = -1;
-int amdgpu_lockup_timeout = 1;
+char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH];
 int amdgpu_dpm = -1;
 int amdgpu_fw_load_type = -1;
 int amdgpu_aspm = -1;
@@ -232,12 +234,21 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = 
disable, -1 = auto)");
 module_param_named(msi, amdgpu_msi, int, 0444);
 
 /**
- * DOC: lockup_timeout (int)
- * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be 
adjusted to 1.
- * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default is 
1.
+ * DOC: lockup_timeout (string)
+ * Set GPU scheduler timeout value in ms.
+ *
+ * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. 
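The rest of the DOC comment and the amdgpu_drv.c/amdgpu_fence.c hunks are cut off above. As a rough illustration of the direction, a parser along the lines the commit message describes could look like this (a sketch only: the function name and the four adev fields come from the hunks above, while the defaults, separator handling, and error behaviour are assumptions rather than the actual V4 code):

	int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
	{
		char *input = amdgpu_lockup_timeout;
		char *timeout_setting;
		int index = 0;
		long timeout;
		int ret = 0;

		/* Assumed defaults, mirroring the old behaviour: a finite
		 * timeout for non-compute rings, infinite for compute. */
		adev->gfx_timeout = msecs_to_jiffies(10000);
		adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;

		while ((timeout_setting = strsep(&input, ",")) &&
		       strlen(timeout_setting)) {
			ret = kstrtol(timeout_setting, 0, &timeout);
			if (ret)
				return ret;
			if (timeout <= 0)	/* V3: 0/negative are invalid */
				return -EINVAL;

			switch (index++) {
			case 0:	/* one value: all non-compute rings */
				adev->gfx_timeout = msecs_to_jiffies(timeout);
				adev->sdma_timeout = msecs_to_jiffies(timeout);
				adev->video_timeout = msecs_to_jiffies(timeout);
				break;
			case 1:
				adev->compute_timeout = msecs_to_jiffies(timeout);
				break;
			case 2:
				adev->sdma_timeout = msecs_to_jiffies(timeout);
				break;
			case 3:
				adev->video_timeout = msecs_to_jiffies(timeout);
				break;
			default:
				return -EINVAL;
			}
		}

		return ret;
	}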

RE: [PATCH] drm/amdgpu: Update latest xgmi topology info after each device is enumerated

2019-04-29 Thread Quan, Evan
Acked-by: Evan Quan 

> -----Original Message-----
> From: amd-gfx  On Behalf Of Liu,
> Shaoyun
> Sent: April 30, 2019 3:20
> To: amd-gfx@lists.freedesktop.org
> Cc: Liu, Shaoyun 
> Subject: [PATCH] drm/amdgpu: Update latest xgmi topology info after each
> device is enumerated
> 
> Adjust the sequence of set/get xgmi topology, so the driver can have the latest
> XGMI topology info for future usage.
> 
> Change-Id: I627814f82459a6c9c3d72469f81309488b2a9133
> Signed-off-by: shaoyunl 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 32
> 
>  1 file changed, 20 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 04dfc8b..e48e939 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -301,30 +301,41 @@ int amdgpu_xgmi_add_device(struct
> amdgpu_device *adev)
> list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
> list_for_each_entry(entry, &hive->device_list, head)
> top_info->nodes[count++].node_id = entry->node_id;
> +   top_info->num_nodes = count;
> hive->number_devices = count;
> 
> -   /* Each psp need to get the latest topology */
> list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> -   ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
> top_info);
> +   /* update node list for other device in the hive */
> +   if (tmp_adev != adev) {
> +   top_info = &tmp_adev->psp.xgmi_context.top_info;
> +   top_info->nodes[count - 1].node_id = 
> adev->gmc.xgmi.node_id;
> +   top_info->num_nodes = count;
> +   }
> +   ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
> +   if (ret)
> +   goto exit;
> +   }
> +
> +   /* get latest topology info for each device from psp */
> +   list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> +   ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
> +   &tmp_adev->psp.xgmi_context.top_info);
> if (ret) {
> dev_err(tmp_adev->dev,
> "XGMI: Get topology failure on device %llx, 
> hive %llx, ret %d",
> tmp_adev->gmc.xgmi.node_id,
> tmp_adev->gmc.xgmi.hive_id, ret);
> /* To do : continue with some node failed or disable 
> the whole
> hive */
> -   break;
> +   goto exit;
> }
> }
> 
> -   list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
> -   ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
> -   if (ret)
> -   break;
> -   }
> -
> if (!ret)
> ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
> 
> +
> +   mutex_unlock(&hive->hive_lock);
> +exit:
> if (!ret)
> dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
>  adev->gmc.xgmi.physical_node_id, 
> adev->gmc.xgmi.hive_id);
> @@ -333,9 +344,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
> *adev)
> adev->gmc.xgmi.physical_node_id, 
> adev->gmc.xgmi.hive_id,
> ret);
> 
> -
> -   mutex_unlock(&hive->hive_lock);
> -exit:
> return ret;
>  }
> 
> --
> 2.7.4
> 

Re: [PATCH] drm/amdgpu: remove ATPX_DGPU_REQ_POWER_FOR_DISPLAYS check when hotplug-in

2019-04-29 Thread Alex Deucher
On Mon, Apr 29, 2019 at 9:58 PM Aaron Liu  wrote:
>
> In amdgpu_atif_handler, when a hotplug event is received, remove the
> ATPX_DGPU_REQ_POWER_FOR_DISPLAYS check. Checking this bit can cause the
> system to miss resuming.
>
> Change-Id: Ic9a55fd44b721e59348a7768daeb41d414f21366
> Signed-off-by: Aaron Liu 

Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
> index 4376b17..56f8ca2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
> @@ -464,8 +464,7 @@ static int amdgpu_atif_handler(struct amdgpu_device *adev,
> }
> }
> if (req.pending & ATIF_DGPU_DISPLAY_EVENT) {
> -   if ((adev->flags & AMD_IS_PX) &&
> -   amdgpu_atpx_dgpu_req_power_for_displays()) {
> +   if (adev->flags & AMD_IS_PX) {
> pm_runtime_get_sync(adev->ddev->dev);
> /* Just fire off a uevent and let userspace 
> tell us what to do */
> drm_helper_hpd_irq_event(adev->ddev);
> --
> 2.7.4
>

[PATCH] drm/amdgpu: remove ATPX_DGPU_REQ_POWER_FOR_DISPLAYS check when hotplug-in

2019-04-29 Thread Aaron Liu
In amdgpu_atif_handler, when a hotplug event is received, remove the
ATPX_DGPU_REQ_POWER_FOR_DISPLAYS check. Checking this bit can cause the
system to miss resuming.

Change-Id: Ic9a55fd44b721e59348a7768daeb41d414f21366
Signed-off-by: Aaron Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
index 4376b17..56f8ca2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c
@@ -464,8 +464,7 @@ static int amdgpu_atif_handler(struct amdgpu_device *adev,
}
}
if (req.pending & ATIF_DGPU_DISPLAY_EVENT) {
-   if ((adev->flags & AMD_IS_PX) &&
-   amdgpu_atpx_dgpu_req_power_for_displays()) {
+   if (adev->flags & AMD_IS_PX) {
pm_runtime_get_sync(adev->ddev->dev);
/* Just fire off a uevent and let userspace 
tell us what to do */
drm_helper_hpd_irq_event(adev->ddev);
-- 
2.7.4


Re: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V3

2019-04-29 Thread Alex Deucher
On Mon, Apr 29, 2019 at 9:29 PM Quan, Evan  wrote:
>
> For the old design, lockup_timeout affects non-compute IPs only (e.g.
> amdgpu.lockup_timeout=1000). Compute jobs always use MAX_SCHEDULE_TIMEOUT.
> For the new design, I would like it to be backward compatible. With the
> [Non-Compute]:[Compute] format, the "amdgpu.lockup_timeout=1000" setting has
> the same effect as the old design.
>
> > * The format can be [Global] or [GFX,Compute,SDMA,Video].
> I think using this format will break backward compatibility, since the
> "amdgpu.lockup_timeout=1000" setting would affect all IPs (both non-compute
> and compute).

Ah, ok.  I forgot we always used a different timeout for compute.  In
that case, how about
* The format can be [non-Compute] or [GFX,Compute,SDMA,Video].

Alex

>
> Regards,
> Evan
> > -----Original Message-----
> > From: Alex Deucher 
> > Sent: April 29, 2019 22:27
> > To: Quan, Evan 
> > Cc: amd-gfx list ; Deucher, Alexander
> > ; Lou, Wentao ;
> > Daenzer, Michel ; Koenig, Christian
> > 
> > Subject: Re: [PATCH] drm/amdgpu: enable separate timeout setting for
> > every ring type V3
> >
> > On Mon, Apr 29, 2019 at 9:51 AM Evan Quan  wrote:
> > >
> > > Every ring type can have its own timeout setting.
> > >
> > >  - V2: update lockup_timeout parameter format and cosmetic fixes
> > >  - V3: invalidate 0 and negative values
> > >
> > > Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
> > > Signed-off-by: Evan Quan 
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++--
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 90
> > --
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 +++--
> > >  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
> > >  5 files changed, 132 insertions(+), 19 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > index f6965b9403eb..c9b44b8c1969 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > > @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;  extern int
> > > amdgpu_hw_i2c;  extern int amdgpu_pcie_gen2;  extern int amdgpu_msi;
> > > -extern int amdgpu_lockup_timeout;  extern int amdgpu_dpm;  extern int
> > > amdgpu_fw_load_type;  extern int amdgpu_aspm; @@ -428,6 +427,7 @@
> > > struct amdgpu_fpriv {  };
> > >
> > >  int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv
> > > **fpriv);
> > > +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device
> > > +*adev);
> > >
> > >  int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> > >   unsigned size, struct amdgpu_ib *ib); @@ -1001,6
> > > +1001,11 @@ struct amdgpu_device {
> > > struct work_struct  xgmi_reset_work;
> > >
> > > boolin_baco_reset;
> > > +
> > > +   longgfx_timeout;
> > > +   longsdma_timeout;
> > > +   longvideo_timeout;
> > > +   longcompute_timeout;
> > >  };
> > >
> > >  static inline struct amdgpu_device *amdgpu_ttm_adev(struct
> > > ttm_bo_device *bdev) diff --git
> > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > index 80bf604019b1..b11af38a0238 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > > @@ -912,8 +912,10 @@ static void
> > amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
> > >   * Validates certain module parameters and updates
> > >   * the associated values used by the driver (all asics).
> > >   */
> > > -static void amdgpu_device_check_arguments(struct amdgpu_device
> > *adev)
> > > +static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
> > >  {
> > > +   int ret = 0;
> > > +
> > > if (amdgpu_sched_jobs < 4) {
> > > dev_warn(adev->dev, "sched jobs (%d) must be at least 
> > > 4\n",
> > >  amdgpu_sched_jobs); @@ -958,13 +960,16 @@
> > > static void amdgpu_device_check_arguments(struct amdgpu_device
> > *adev)
> > > amdgpu_vram_page_split = 1024;
> > > }
> > >
> > > -   if (amdgpu_lockup_timeout == 0) {
> > > -   dev_warn(adev->dev, "lockup_timeout msut be > 0, 
> > > adjusting to
> > 1\n");
> > > -   amdgpu_lockup_timeout = 1;
> > > +   ret = amdgpu_device_get_job_timeout_settings(adev);
> > > +   if (ret) {
> > > +   dev_err(adev->dev, "invalid lockup_timeout parameter
> > syntax\n");
> > > +   return ret;
> > > }
> > >
> > > adev->firmware.load_type = amdgpu_ucode_get_load_type(adev,
> > amdgpu_fw_load_type);
> > > amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96);
> > > +
> > > 
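With per-ring-type timeouts, the fence code has to pick a timeout by ring type when it sets up each ring's scheduler. The amdgpu_fence.c hunk never appears in this thread; a sketch of such a selection, assuming the four new amdgpu_device fields (the helper name is hypothetical, not from the patch):

	static long amdgpu_fence_timeout_for_ring(struct amdgpu_ring *ring)
	{
		struct amdgpu_device *adev = ring->adev;

		switch (ring->funcs->type) {
		case AMDGPU_RING_TYPE_GFX:
			return adev->gfx_timeout;
		case AMDGPU_RING_TYPE_COMPUTE:
			return adev->compute_timeout;
		case AMDGPU_RING_TYPE_SDMA:
			return adev->sdma_timeout;
		default:
			/* UVD/VCE/VCN rings fall under the "video" bucket */
			return adev->video_timeout;
		}
	}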

RE: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V3

2019-04-29 Thread Quan, Evan
For the old design, lockup_timeout affects non-compute IPs only (e.g.
amdgpu.lockup_timeout=1000). Compute jobs always use MAX_SCHEDULE_TIMEOUT.
For the new design, I would like it to be backward compatible. With the
[Non-Compute]:[Compute] format, the "amdgpu.lockup_timeout=1000" setting has
the same effect as the old design.

> * The format can be [Global] or [GFX,Compute,SDMA,Video].
I think using this format will break backward compatibility, since the
"amdgpu.lockup_timeout=1000" setting would affect all IPs (both non-compute
and compute).

Regards,
Evan
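Concretely, the parameter would then be used along these lines (examples inferred from this discussion; the exact accepted syntax is whatever the final V4 patch implements):

	amdgpu.lockup_timeout=1000                 /* 1000 ms for all non-compute rings */
	amdgpu.lockup_timeout=1000,5000            /* non-compute 1000 ms, compute 5000 ms */
	amdgpu.lockup_timeout=1000,5000,2000,3000  /* GFX, Compute, SDMA, Video */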
> -----Original Message-----
> From: Alex Deucher 
> Sent: April 29, 2019 22:27
> To: Quan, Evan 
> Cc: amd-gfx list ; Deucher, Alexander
> ; Lou, Wentao ;
> Daenzer, Michel ; Koenig, Christian
> 
> Subject: Re: [PATCH] drm/amdgpu: enable separate timeout setting for
> every ring type V3
> 
> On Mon, Apr 29, 2019 at 9:51 AM Evan Quan  wrote:
> >
> > Every ring type can have its own timeout setting.
> >
> >  - V2: update lockup_timeout parameter format and cosmetic fixes
> >  - V3: invalidate 0 and negative values
> >
> > Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
> > Signed-off-by: Evan Quan 
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++--
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 90
> --
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 +++--
> >  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
> >  5 files changed, 132 insertions(+), 19 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index f6965b9403eb..c9b44b8c1969 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;  extern int
> > amdgpu_hw_i2c;  extern int amdgpu_pcie_gen2;  extern int amdgpu_msi;
> > -extern int amdgpu_lockup_timeout;  extern int amdgpu_dpm;  extern int
> > amdgpu_fw_load_type;  extern int amdgpu_aspm; @@ -428,6 +427,7 @@
> > struct amdgpu_fpriv {  };
> >
> >  int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv
> > **fpriv);
> > +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device
> > +*adev);
> >
> >  int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> >   unsigned size, struct amdgpu_ib *ib); @@ -1001,6
> > +1001,11 @@ struct amdgpu_device {
> > struct work_struct  xgmi_reset_work;
> >
> > boolin_baco_reset;
> > +
> > +   longgfx_timeout;
> > +   longsdma_timeout;
> > +   longvideo_timeout;
> > +   longcompute_timeout;
> >  };
> >
> >  static inline struct amdgpu_device *amdgpu_ttm_adev(struct
> > ttm_bo_device *bdev) diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 80bf604019b1..b11af38a0238 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -912,8 +912,10 @@ static void
> amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
> >   * Validates certain module parameters and updates
> >   * the associated values used by the driver (all asics).
> >   */
> > -static void amdgpu_device_check_arguments(struct amdgpu_device
> *adev)
> > +static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
> >  {
> > +   int ret = 0;
> > +
> > if (amdgpu_sched_jobs < 4) {
> > dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
> >  amdgpu_sched_jobs); @@ -958,13 +960,16 @@
> > static void amdgpu_device_check_arguments(struct amdgpu_device
> *adev)
> > amdgpu_vram_page_split = 1024;
> > }
> >
> > -   if (amdgpu_lockup_timeout == 0) {
> > -   dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting 
> > to
> 1\n");
> > -   amdgpu_lockup_timeout = 1;
> > +   ret = amdgpu_device_get_job_timeout_settings(adev);
> > +   if (ret) {
> > +   dev_err(adev->dev, "invalid lockup_timeout parameter
> syntax\n");
> > +   return ret;
> > }
> >
> > adev->firmware.load_type = amdgpu_ucode_get_load_type(adev,
> amdgpu_fw_load_type);
> > amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96);
> > +
> > +   return ret;
> >  }
> >
> >  /**
> > @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device
> *adev,
> > mutex_init(&adev->lock_reset);
> > mutex_init(&adev->virt.dpm_mutex);
> >
> > -   amdgpu_device_check_arguments(adev);
> > +   r = amdgpu_device_check_arguments(adev);
> > +   if (r)
> > +   return r;
> >
> > spin_lock_init(&adev->mmio_idx_lock);
> > spin_lock_init(&adev->smc_idx_lock);
> > 

Re: [PATCH] drm/amdgpu: Unmap CSA under SR-IOV in KFD path

2019-04-29 Thread Kuehling, Felix
I remember a past discussion to change the CSA allocation/mapping scheme 
to avoid this issue in the first place. Can adding the CSA to the VM be 
delayed a little to a point after the VM gets converted to a compute VM? 
Maybe the first command submission?

Regards,
   Felix

On 2019-04-28 6:25 a.m., Trigger Huang wrote:
> In the amdgpu open path, the CSA is mapped in the VM, so when opening
> KFD, calling amdgpu_vm_make_compute will fail because it finds that this
> VM is not a clean VM (it already has some mappings); as a result, creating
> the process VM object fails.
>
> The fix is to try to unmap the CSA; the CSA is actually not needed in
> the compute VF world switch.
>
> Signed-off-by: Trigger Huang 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 10 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c  |  2 +-
>   2 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 697b8ef..e0bc457 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -956,6 +956,16 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct 
> kgd_dev *kgd,
>   if (avm->process_info)
>   return -EINVAL;
>   
> + /* Delete CSA mapping to make sure this VM is a clean VM  before
> +  *  converting VM
> +  */
> + if (amdgpu_sriov_vf(adev) && drv_priv->csa_va) {
> + amdgpu_bo_reserve(adev->virt.csa_obj, true);
> + amdgpu_vm_bo_rmv(adev, drv_priv->csa_va);
> + drv_priv->csa_va = NULL;
> + amdgpu_bo_unreserve(adev->virt.csa_obj);
> + }
> +
>   /* Convert VM into a compute VM */
>   ret = amdgpu_vm_make_compute(adev, avm, pasid);
>   if (ret)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index da7b4fe..361c2e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -1069,7 +1069,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>   
>   amdgpu_vm_bo_rmv(adev, fpriv->prt_va);
>   
> - if (amdgpu_sriov_vf(adev)) {
> + if (amdgpu_sriov_vf(adev) && fpriv->csa_va) {
>   /* TODO: how to handle reserve failure */
>   BUG_ON(amdgpu_bo_reserve(adev->virt.csa_obj, true));
>   amdgpu_vm_bo_rmv(adev, fpriv->csa_va);

Re: [PATCH 00/27] KFD upstreaming

2019-04-29 Thread Kuehling, Felix
I'll drop patch 27 from this series until the potential OOM issues can 
be sorted out.

The remaining patches are all KFD specific and shouldn't cause any 
trouble for graphics apps. If there are no other objections I'm planning 
to push patches 1-26 to amd-staging-drm-next tomorrow.

Thanks,
   Felix

On 2019-04-28 3:44 a.m., Kuehling, Felix wrote:
> Assorted KFD changes that have been accumulating on amd-kfd-staging. New
> features and fixes included:
> * Support for VegaM
> * Support for systems with multiple PCI domains
> * New SDMA queue type that's optimized for XGMI links
> * SDMA MQD allocation changes to support future ASICs with more SDMA queues
> * Fix for compute profile switching at process termination
> * Fix for a circular lock dependency in MMU notifiers
> * Fix for TLB flushing bug with XGMI enabled
> * Fix for artificial GTT system memory limitation
> * Trap handler updates
>
> Amber Lin (1):
>drm/amdkfd: Add domain number into gpu_id
>
> Felix Kuehling (1):
>drm/amdkfd: Fix a circular lock dependency
>
> Harish Kasiviswanathan (1):
>drm/amdkfd: Fix compute profile switching
>
> Jay Cornwall (4):
>drm/amdkfd: Fix gfx8 MEM_VIOL exception handler
>drm/amdkfd: Preserve wave state after instruction fetch MEM_VIOL
>drm/amdkfd: Fix gfx9 XNACK state save/restore
>drm/amdkfd: Preserve ttmp[4:5] instead of ttmp[14:15]
>
> Kent Russell (2):
>drm/amdkfd: Add VegaM support
>drm/amdgpu: Fix GTT size calculation
>
> Oak Zeng (16):
>drm/amdkfd: Use 64 bit sdma_bitmap
>drm/amdkfd: Add sdma allocation debug message
>drm/amdkfd: Differentiate b/t sdma_id and sdma_queue_id
>drm/amdkfd: Shift sdma_engine_id and sdma_queue_id in mqd
>drm/amdkfd: Fix a potential memory leak
>drm/amdkfd: Introduce asic-specific mqd_manager_init function
>drm/amdkfd: Introduce DIQ type mqd manager
>drm/amdkfd: Init mqd managers in device queue manager init
>drm/amdkfd: Add mqd size in mqd manager struct
>drm/amdkfd: Allocate MQD trunk for HIQ and SDMA
>drm/amdkfd: Move non-sdma mqd allocation out of init_mqd
>drm/amdkfd: Allocate hiq and sdma mqd from mqd trunk
>drm/amdkfd: Fix sdma queue map issue
>drm/amdkfd: Introduce XGMI SDMA queue type
>drm/amdkfd: Expose sdma engine numbers to topology
>drm/amdkfd: Delete alloc_format field from map_queue struct
>
> Yong Zhao (1):
>drm/amdkfd: Move sdma_queue_id calculation into allocate_sdma_queue()
>
> shaoyunl (1):
>drm/amdgpu: Use heavy weight for tlb invalidation on xgmi
>  configuration
>
>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c |  53 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |   9 +-
>   .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h| 483 +-
>   .../drm/amd/amdkfd/cwsr_trap_handler_gfx8.asm |  13 -
>   .../drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm |  63 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_crat.c |   5 +
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c   |  51 ++
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 354 -
>   .../drm/amd/amdkfd/kfd_device_queue_manager.h |  14 +-
>   .../amd/amdkfd/kfd_device_queue_manager_cik.c |   2 +
>   .../amd/amdkfd/kfd_device_queue_manager_v9.c  |   1 +
>   .../amd/amdkfd/kfd_device_queue_manager_vi.c  |   2 +
>   drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c  |   1 +
>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |   6 +-
>   .../gpu/drm/amd/amdkfd/kfd_kernel_queue_v9.c  |   4 +-
>   .../gpu/drm/amd/amdkfd/kfd_kernel_queue_vi.c  |   4 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c  |  70 ++-
>   drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |   8 +
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  53 +-
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  85 +--
>   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  53 +-
>   .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |   4 +-
>   .../gpu/drm/amd/amdkfd/kfd_pm4_headers_ai.h   |   7 +-
>   .../gpu/drm/amd/amdkfd/kfd_pm4_headers_vi.h   |   7 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  14 +-
>   .../amd/amdkfd/kfd_process_queue_manager.c|  14 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.c |  13 +-
>   drivers/gpu/drm/amd/amdkfd/kfd_topology.h |   2 +
>   drivers/gpu/drm/amd/include/cik_structs.h |   3 +-
>   drivers/gpu/drm/amd/include/v9_structs.h  |   3 +-
>   drivers/gpu/drm/amd/include/vi_structs.h  |   3 +-
>   include/uapi/linux/kfd_ioctl.h|   7 +-
>   33 files changed, 826 insertions(+), 587 deletions(-)
>

Re: [PATCH 27/27] drm/amdgpu: Fix GTT size calculation

2019-04-29 Thread Kuehling, Felix
On 2019-04-29 8:34 a.m., Christian König wrote:
> On 28.04.19 at 09:44, Kuehling, Felix wrote:
>> From: Kent Russell 
>>
>> GTT size is currently limited to the minimum of VRAM size or 3/4 of
>> system memory. This severely limits the quantity of system memory
>> that can be used by ROCm applications.
>>
>> Increase GTT size to the maximum of VRAM size or system memory size.
>
> Well, NAK.
>
> This limit was done on purpose because otherwise max-texture-size
> allocations would crash the system, as the OOM killer would cause
> a system panic.
>
> Using more than 75% of system memory by the GPU at the same time makes 
> the system unstable and so we can't allow that by default.

Like we discussed, the current implementation is too limiting. On a Fiji 
system with 4GB VRAM and 32GB system memory, it limits system memory 
allocations to 4GB. I think this workaround was fixed once before and 
reverted because it broke a CZ system with 1GB system memory. So I 
suspect that this is an issue affecting small memory systems where maybe 
the 1/2 system memory limit in TTM isn't sufficient to protect from OOM 
panics.

The OOM killer problem is a more general problem that potentially 
affects other drivers too. Keeping this GTT limit broken in AMDGPU is an 
inadequate workaround at best. I'd like to look for a better solution, 
probably some adjustment of the TTM system memory limits on systems with 
small memory, to avoid OOM panics on such systems.

Regards,
   Felix


>
> What could maybe work is to reduce the amount of system memory by a fixed 
> factor, but offhand I don't see a way of fixing this in general.
>
> Regards,
> Christian.
>
>>
>> Signed-off-by: Kent Russell 
>> Reviewed-by: Felix Kuehling 
>> Signed-off-by: Felix Kuehling 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 9 -
>>   1 file changed, 4 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> index c14198737dcd..e9ecc3953673 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>> @@ -1740,11 +1740,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>>   struct sysinfo si;
>>     si_meminfo(&si);
>> -    gtt_size = min(max((AMDGPU_DEFAULT_GTT_SIZE_MB << 20),
>> -   adev->gmc.mc_vram_size),
>> -   ((uint64_t)si.totalram * si.mem_unit * 3/4));
>> -    }
>> -    else
>> +    gtt_size = max3((AMDGPU_DEFAULT_GTT_SIZE_MB << 20),
>> +    adev->gmc.mc_vram_size,
>> +    ((uint64_t)si.totalram * si.mem_unit));
>> +    } else
>>   gtt_size = (uint64_t)amdgpu_gtt_size << 20;
>>     /* Initialize GTT memory pool */
>
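To make the before/after concrete for the Fiji case above (4 GB VRAM, 32 GB system memory, AMDGPU_DEFAULT_GTT_SIZE_MB taken as 3072; values assumed for illustration), a small standalone check of the two formulas:

	#include <stdint.h>
	#include <stdio.h>

	#define GIB (1024ULL * 1024 * 1024)

	int main(void)
	{
		uint64_t def = 3072ULL << 20;	/* AMDGPU_DEFAULT_GTT_SIZE_MB */
		uint64_t vram = 4 * GIB, ram = 32 * GIB;

		/* old: min(max(default, vram), 3/4 of system memory) */
		uint64_t old_gtt = def > vram ? def : vram;
		if (old_gtt > ram * 3 / 4)
			old_gtt = ram * 3 / 4;

		/* new: max3(default, vram, system memory) */
		uint64_t new_gtt = def;
		if (vram > new_gtt) new_gtt = vram;
		if (ram > new_gtt) new_gtt = ram;

		printf("old GTT limit: %llu GiB\n",
		       (unsigned long long)(old_gtt / GIB));	/* 4  */
		printf("new GTT limit: %llu GiB\n",
		       (unsigned long long)(new_gtt / GIB));	/* 32 */
		return 0;
	}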

Re: [PATCH v2 0/9] PCI: add helper pci_dev_id

2019-04-29 Thread Bjorn Helgaas
On Wed, Apr 24, 2019 at 09:10:21PM +0200, Heiner Kallweit wrote:
> In several places in the kernel we find PCI_DEVID used like this:
> PCI_DEVID(dev->bus->number, dev->devfn). Therefore create a helper
> for it.
> 
> v2:
> - apply the change to all affected places in the kernel
> 
> Heiner Kallweit (9):
>   PCI: add helper pci_dev_id
>   PCI: use helper pci_dev_id
>   r8169: use new helper pci_dev_id
>   powerpc/powernv/npu: use helper pci_dev_id
>   drm/amdkfd: use helper pci_dev_id
>   iommu/amd: use helper pci_dev_id
>   iommu/vt-d: use helper pci_dev_id
>   stmmac: pci: use helper pci_dev_id
>   platform/chrome: chromeos_laptop: use helper pci_dev_id
> 
>  arch/powerpc/platforms/powernv/npu-dma.c | 14 ++
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c|  3 +--
>  drivers/iommu/amd_iommu.c|  2 +-
>  drivers/iommu/intel-iommu.c  |  2 +-
>  drivers/iommu/intel_irq_remapping.c  |  2 +-
>  drivers/net/ethernet/realtek/r8169.c |  3 +--
>  drivers/net/ethernet/stmicro/stmmac/stmmac_pci.c |  2 +-
>  drivers/pci/msi.c|  6 +++---
>  drivers/pci/search.c | 10 +++---
>  drivers/platform/chrome/chromeos_laptop.c|  2 +-
>  include/linux/pci.h  |  5 +
>  11 files changed, 24 insertions(+), 27 deletions(-)

Applied with acks/reviewed-by from Benson, Joerg, Christian, Alexey, and
David to pci/misc for v5.2, thanks!

Re: [PATCH][next] drm/amd/display: fix incorrect null check on pointer

2019-04-29 Thread Alex Deucher
On Fri, Apr 26, 2019 at 5:48 PM Colin King  wrote:
>
> From: Colin Ian King 
>
> Currently an allocation is being made but the allocation failure
> check is being performed on another pointer. Fix this by checking
> the correct pointer. Also use the normal kernel idiom for null
> pointer checks.
>
> Addresses-Coverity: ("Resource leak")
> Fixes: 43e3ac8389ef ("drm/amd/display: Add function to copy DC streams")
> Signed-off-by: Colin Ian King 

Applied.  Thanks!

Alex

> ---
>  drivers/gpu/drm/amd/display/dc/core/dc_stream.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c 
> b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c
> index 6200df3edcd0..96e97d25d639 100644
> --- a/drivers/gpu/drm/amd/display/dc/core/dc_stream.c
> +++ b/drivers/gpu/drm/amd/display/dc/core/dc_stream.c
> @@ -168,7 +168,7 @@ struct dc_stream_state *dc_copy_stream(const struct 
> dc_stream_state *stream)
> struct dc_stream_state *new_stream;
>
> new_stream = kzalloc(sizeof(struct dc_stream_state), GFP_KERNEL);
> -   if (stream == NULL)
> +   if (!new_stream)
> return NULL;
>
> memcpy(new_stream, stream, sizeof(struct dc_stream_state));
> --
> 2.20.1
>

[PATCH] drm/amdgpu: Update latest xgmi topology info after each device is enumerated

2019-04-29 Thread Liu, Shaoyun
Adjust the sequence of set/get xgmi topology, so the driver can have the latest
XGMI topology info for future usage.

Change-Id: I627814f82459a6c9c3d72469f81309488b2a9133
Signed-off-by: shaoyunl 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 32 
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 04dfc8b..e48e939 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -301,30 +301,41 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
	list_for_each_entry(entry, &hive->device_list, head)
top_info->nodes[count++].node_id = entry->node_id;
+   top_info->num_nodes = count;
hive->number_devices = count;
 
-   /* Each psp need to get the latest topology */
	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-   ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, 
top_info);
+   /* update node list for other device in the hive */
+   if (tmp_adev != adev) {
+   top_info = &tmp_adev->psp.xgmi_context.top_info;
+   top_info->nodes[count - 1].node_id = 
adev->gmc.xgmi.node_id;
+   top_info->num_nodes = count;
+   }
+   ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
+   if (ret)
+   goto exit;
+   }
+
+   /* get latest topology info for each device from psp */
+   list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+   ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
+   &tmp_adev->psp.xgmi_context.top_info);
if (ret) {
dev_err(tmp_adev->dev,
"XGMI: Get topology failure on device %llx, 
hive %llx, ret %d",
tmp_adev->gmc.xgmi.node_id,
tmp_adev->gmc.xgmi.hive_id, ret);
/* To do : continue with some node failed or disable 
the whole hive */
-   break;
+   goto exit;
}
}
 
-   list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-   ret = amdgpu_xgmi_update_topology(hive, tmp_adev);
-   if (ret)
-   break;
-   }
-
if (!ret)
ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
 
+
+   mutex_unlock(&hive->hive_lock);
+exit:
if (!ret)
dev_info(adev->dev, "XGMI: Add node %d, hive 0x%llx.\n",
 adev->gmc.xgmi.physical_node_id, 
adev->gmc.xgmi.hive_id);
@@ -333,9 +344,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
adev->gmc.xgmi.physical_node_id, adev->gmc.xgmi.hive_id,
ret);
 
-
-   mutex_unlock(&hive->hive_lock);
-exit:
return ret;
 }
 
-- 
2.7.4


Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.

2019-04-29 Thread Christian König
I would clean them up further, but that's only moving code around so 
feel free to add my rb to those.


Christian.

On 29.04.19 at 16:14, Grodzovsky, Andrey wrote:


Thanks David, with that only patches 5 and 6 are left for the series 
to be reviewed.


Christian, any more comments on those patches ?

Andrey

On 4/27/19 10:56 PM, Zhou, David(ChunMing) wrote:


Sorry, I can only put my Acked-by: Chunming Zhou
on patch #3.


I cannot fully judge patch #4, #5, #6.

-David

From: amd-gfx  On Behalf Of Grodzovsky, Andrey
Sent: Friday, April 26, 2019 10:09 PM
To: Koenig, Christian ; Zhou, David(ChunMing) ;
dri-de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org;
e...@anholt.net; etna...@lists.freedesktop.org
Cc: Kazlauskas, Nicholas ; Liu, Monk 
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.


Ping (mostly David and Monk).

Andrey

On 4/24/19 3:09 AM, Christian König wrote:

On 24.04.19 at 05:02, Zhou, David(ChunMing) wrote:

>> - drm_sched_stop(&ring->sched, &job->base);
>> -
>>   /* after all hw jobs are reset, hw fence is
meaningless, so force_completion */
>> amdgpu_fence_driver_force_completion(ring);
>>   }

HW fences are already force-completed, so we can just
disable IRQ fence processing and ignore HW fence signals
while we are trying to do the GPU reset, I think. Otherwise
the logic will become much more complex.

If this situation happens because of long execution times, we
can increase the timeout of reset detection.


You are not thinking widely enough; forcing the HW fence to
complete can trigger other parts of the system to start further activity.

We first need to stop everything and make sure that we don't do
any processing any more and then start with our reset procedure
including forcing all hw fences to complete.

Christian.


-David

From: amd-gfx  On Behalf Of Grodzovsky, Andrey
Sent: Wednesday, April 24, 2019 12:00 AM
To: Zhou, David(ChunMing) ; dri-de...@lists.freedesktop.org;
amd-gfx@lists.freedesktop.org; e...@anholt.net; etna...@lists.freedesktop.org;
ckoenig.leichtzumer...@gmail.com
Cc: Kazlauskas, Nicholas ; Liu, Monk 
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.

No, I mean the actual HW fence which signals when the job
finished execution on the HW.

Andrey

On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:

do you mean fence timer? why not stop it as well when
stopping sched for the reason of hw reset?

-------- Original Message --------
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if
guilty job already signaled.
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)"

,dri-de...@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,e...@anholt.net,etna...@lists.freedesktop.org,ckoenig.leichtzumer...@gmail.com


CC: "Kazlauskas, Nicholas" ,"Liu, Monk"


On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SR-IOV, so a
> virtualization guy needs to take a look.
>
> But out of curiosity, why can the guilty job still signal
> if the job is already
> set to guilty? Was it set wrongly?
>
>
> -David


It's possible that the job completes at a later time
than when its
timeout handler started processing, so in this patch we
try to protect
against this by rechecking the HW fence after stopping
all SW
schedulers. We do it BEFORE marking guilty on the job's
sched_entity, so
at the point we check, the guilty flag is not set yet.

Andrey
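Condensed, the protection described above has roughly this shape (a sketch assuming the HW fence is reachable as job->base.s_fence->parent, as in the DRM scheduler of that era; not the literal patch):

	bool job_signaled = false;

	/* Stop all SW schedulers in the device/hive first so nothing new
	 * gets queued, then look at the HW fence. */
	drm_sched_stop(&ring->sched, &job->base);

	if (job && job->base.s_fence->parent &&
	    dma_fence_is_signaled(job->base.s_fence->parent))
		job_signaled = true;

	if (job_signaled) {
		/* Job finished after the timeout fired: skip the HW reset,
		 * just restart the schedulers. */
	} else {
		/* Mark the entity guilty and do the full HW reset. */
	}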


>
> On 2019/4/18 23:00, Andrey Grodzovsky wrote:
>> Also reject TDRs if another one is already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive
before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper functions to keep
all the 

Re: [PATCH v13 16/20] IB/mlx4, arm64: untag user pointers in mlx4_get_umem_mr

2019-04-29 Thread Leon Romanovsky
On Wed, Mar 20, 2019 at 03:51:30PM +0100, Andrey Konovalov wrote:
> This patch is a part of a series that extends arm64 kernel ABI to allow to
> pass tagged user pointers (with the top byte set to something else other
> than 0x00) as syscall arguments.
>
> mlx4_get_umem_mr() uses provided user pointers for vma lookups, which can
> only by done with untagged pointers.
>
> Untag user pointers in this function.
>
> Signed-off-by: Andrey Konovalov 
> ---
>  drivers/infiniband/hw/mlx4/mr.c | 7 ---
>  1 file changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
> index 395379a480cb..9a35ed2c6a6f 100644
> --- a/drivers/infiniband/hw/mlx4/mr.c
> +++ b/drivers/infiniband/hw/mlx4/mr.c
> @@ -378,6 +378,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_udata 
> *udata, u64 start,
>* again
>*/
>   if (!ib_access_writable(access_flags)) {
> + unsigned long untagged_start = untagged_addr(start);
>   struct vm_area_struct *vma;
>
>   down_read(&current->mm->mmap_sem);
> @@ -386,9 +387,9 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_udata 
> *udata, u64 start,
>* cover the memory, but for now it requires a single vma to
>* entirely cover the MR to support RO mappings.
>*/
> - vma = find_vma(current->mm, start);
> - if (vma && vma->vm_end >= start + length &&
> - vma->vm_start <= start) {
> + vma = find_vma(current->mm, untagged_start);
> + if (vma && vma->vm_end >= untagged_start + length &&
> + vma->vm_start <= untagged_start) {
>   if (vma->vm_flags & VM_WRITE)
>   access_flags |= IB_ACCESS_LOCAL_WRITE;
>   } else {
> --

Thanks,
Reviewed-by: Leon Romanovsky 

Interesting, the follow-up question is why mlx4 is the only driver in IB which
needs such code in umem_mr. I'll take a look at it.

Thanks
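For reference, untagged_addr() used in the hunk above strips a potential top-byte tag on arm64 by sign-extending the address from bit 55 (TBI); a sketch consistent with the series description, with the authoritative definition living in the arm64 headers:

	#define untagged_addr(addr) \
		((__typeof__(addr))sign_extend64((u64)(addr), 55))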

Re: [PATCH v13 10/20] kernel, arm64: untag user pointers in prctl_set_mm*

2019-04-29 Thread Andrey Konovalov
On Fri, Apr 26, 2019 at 4:50 PM Catalin Marinas  wrote:
>
> On Mon, Apr 01, 2019 at 06:44:34PM +0200, Andrey Konovalov wrote:
> > On Fri, Mar 22, 2019 at 4:41 PM Catalin Marinas  
> > wrote:
> > > On Wed, Mar 20, 2019 at 03:51:24PM +0100, Andrey Konovalov wrote:
> > > > @@ -2120,13 +2135,14 @@ static int prctl_set_mm(int opt, unsigned long 
> > > > addr,
> > > >   if (opt == PR_SET_MM_AUXV)
> > > >   return prctl_set_auxv(mm, addr, arg4);
> > > >
> > > > - if (addr >= TASK_SIZE || addr < mmap_min_addr)
> > > > + if (untagged_addr(addr) >= TASK_SIZE ||
> > > > + untagged_addr(addr) < mmap_min_addr)
> > > >   return -EINVAL;
> > > >
> > > >   error = -EINVAL;
> > > >
> > > >   down_write(&mm->mmap_sem);
> > > > - vma = find_vma(mm, addr);
> > > > + vma = find_vma(mm, untagged_addr(addr));
> > > >
> > > >   prctl_map.start_code= mm->start_code;
> > > >   prctl_map.end_code  = mm->end_code;
> > >
> > > Does this mean that we are left with tagged addresses for the
> > > mm->start_code etc. values? I really don't think we should allow this,
> > > I'm not sure what the implications are in other parts of the kernel.
> > >
> > > Arguably, these are not even pointer values but some address ranges. I
> > > know we decided to relax this notion for mmap/mprotect/madvise() since
> > > the user function prototypes take pointer as arguments but it feels like
> > > we are overdoing it here (struct prctl_mm_map doesn't even have
> > > pointers).
> > >
> > > What is the use-case for allowing tagged addresses here? Can user space
> > > handle untagging?
> >
> > I don't know any use cases for this. I did it because it seems to be
> > covered by the relaxed ABI. I'm not entirely sure what to do here,
> > should I just drop this patch?
>
> If we allow tagged addresses to be passed here, we'd have to untag them
> before they end up in the mm->start_code etc. members.
>
> I know we are trying to relax the ABI here w.r.t. address ranges but
> mostly because we couldn't figure out a way to document unambiguously
> the difference between a user pointer that may be dereferenced by the
> kernel (tags allowed) and an address typically used for managing the
> address space layout. Suggestions welcomed.
>
> I'd say just drop this patch and capture it in the ABI document.

OK, will do in v14.

Vincenzo, could you add a note about this into tour patchset?

>
> --
> Catalin

Re: [PATCH 3/3] drm/amd/display: Compensate for pre-DCE12 BTR-VRR hw limitations. (v3)

2019-04-29 Thread Mario Kleiner
On Mon, Apr 29, 2019 at 2:51 PM Kazlauskas, Nicholas
 wrote:
>
> On 4/26/19 5:40 PM, Mario Kleiner wrote:
> > Pre-DCE12 needs special treatment for BTR / low framerate
> > compensation for more stable behaviour:
> >
> > According to comments in the code and some testing on DCE-8
> > and DCE-11, DCE-11 and earlier only apply VTOTAL_MIN/MAX
> > programming with a lag of one frame, so the special BTR hw
> > programming for intermediate fixed duration frames must be
> > done inside the current frame at flip submission in atomic
> > commit tail, ie. one vblank earlier, and the fixed refresh
> > intermediate frame mode must be also terminated one vblank
> > earlier on pre-DCE12 display engines.
> >
> > To achieve proper termination on < DCE-12 shift the point
> > when the switch-back from fixed vblank duration to variable
> > vblank duration happens from the start of VBLANK (vblank irq,
> > as done on DCE-12+) to back-porch or end of VBLANK (handled
> > by vupdate irq handler). We must leave the switch-back code
> > inside VBLANK irq for DCE12+, as before.
> >
> > Doing this, we get much better behaviour of BTR for up-sweeps,
> > ie. going from short to long frame durations (~high to low fps)
> > and for constant framerate flips, as tested on DCE-8 and
> > DCE-11. Behaviour is still not quite as good as on DCN-1
> > though.
> >
> > On down-sweeps, going from long to short frame durations
> > (low fps to high fps) < DCE-12 is a little bit improved,
> > although by far not as much as for up-sweeps and constant
> > fps.
> >
> > v2: Fix some wrong locking, as pointed out by Nicholas.
> > v3: Simplify if-condition in vupdate-irq - nit by Nicholas.
> > Signed-off-by: Mario Kleiner 
>
> Reviewed-by: Nicholas Kazlauskas 
>

Thanks.

> I'd like to push patches 1+3 if you're okay with this.
>

Yes please! I'd love to have these in Linux 5.2, so that would be the
first kernel well suited to target for my specific VRR use cases.

> I can see the value in the 2nd patch from the graphs and test
> application you've posted but it'll likely take some time to do thorough
> review and testing on this.
>

I understand.

> The question that needs to be examined with the second is what the
> optimal margin is, if any, for typical usecases across common monitor
> ranges. Not all monitors that support BTR have the same range, and many
> have a lower bound of ~48Hz. To me ~53Hz sounds rather early to be
> entering, but I'm not sure how it affects the user experience overall.
>

That's indeed a tricky tuning problem and i didn't spend much thought
on that. Just took the BTR_EXIT_MARGIN already defined as 2 msecs,
because it was conveniently there for the exit path and this way it
was nicely symmetric to the BTR exit path. 1 msec or less might just
be as good, or something more fancy and adaptive. But most value comes
from patch 3/3 anyway.

Atm. I don't have a FreeSync-capable display at all, so I don't know
how well this patch works out wrt. flicker etc.; I can only look at my
plots and the output of the kms_vrr igt test. I just hacked the driver
to accept any DP or HDMI display as VRR capable, so i can run the
synthetic timing tests blindly (A G-Sync display mostly goes blank or
tells me a sad "Out of range signal" on its OSD, a HDMI->VGA connected
CRT monitor cycles between "Out of range", black, and some heavily
distorted image). The G-Sync display has a 30 - 144 Hz range and my
faking patch even set a range 25 - 144 Hz for most tests.

I also thought about sending my "fake VRR display" patch after
cleaning it up. I could add some amdgpu module parameter, e.g., a bool
amdgpu.fakevrroutput, that if set to 1 would allow the driver to
accept any DP/HDMI display as VRR capable, parsing the VRR range from
the EDID if present (that works for G-Sync displays) or just faking a
range like 30 - 144 Hz or whatever is realistic (rough sketch below).
A blanked out display is boring to look at, but at least it allows
running timing tests or the IGT kms_vrr tests without the need for a
dedicated FreeSync display.
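A minimal sketch of what that could look like (purely hypothetical -
neither the parameter nor such a knob exists in the driver today):

	static bool amdgpu_fakevrroutput; /* hypothetical module parameter */
	module_param_named(fakevrroutput, amdgpu_fakevrroutput, bool, 0444);
	MODULE_PARM_DESC(fakevrroutput,
		"Treat any DP/HDMI sink as VRR capable (0 = off (default), 1 = on)");

and wherever the driver parses the sink's VRR capability, something
like:

	if (amdgpu_fakevrroutput && !vrr_capable) {
		vrr_capable = true;
		/* Fall back to a plausible range if the EDID has none. */
		min_refresh_hz = 30;
		max_refresh_hz = 144;
	}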

From reading on the web I understand that "FreeSync2 HDR" displays are
also HDR capable and go through some testing and certification at AMD
wrt. minimal flicker? Would you have some recommendation or pointers
to a display I could buy if I needed a large VRR range, minimal
flicker and also good HDR capabilities - ideally with a locally
dimmable backlight or such? I will probably have the money to buy one
FreeSync display, but it should also be usable for working on HDR
related stuff.

Thanks,
-mario

> Nicholas Kazlauskas
>
> > ---
> >   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 48 +--
> >   1 file changed, 44 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index 76b6e621793f..92b3c2cec7dd 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ 

Re: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type V3

2019-04-29 Thread Alex Deucher
On Mon, Apr 29, 2019 at 9:51 AM Evan Quan  wrote:
>
> Every ring type can have its own timeout setting.
>
>  - V2: update lockup_timeout parameter format and cosmetic fixes
>  - V3: invalidate 0 and negative values
>
> Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
> Signed-off-by: Evan Quan 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 90 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 +++--
>  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
>  5 files changed, 132 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f6965b9403eb..c9b44b8c1969 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;
>  extern int amdgpu_hw_i2c;
>  extern int amdgpu_pcie_gen2;
>  extern int amdgpu_msi;
> -extern int amdgpu_lockup_timeout;
>  extern int amdgpu_dpm;
>  extern int amdgpu_fw_load_type;
>  extern int amdgpu_aspm;
> @@ -428,6 +427,7 @@ struct amdgpu_fpriv {
>  };
>
>  int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
> +int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev);
>
>  int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   unsigned size, struct amdgpu_ib *ib);
> @@ -1001,6 +1001,11 @@ struct amdgpu_device {
> struct work_struct  xgmi_reset_work;
>
> bool    in_baco_reset;
> +
> +   long    gfx_timeout;
> +   long    sdma_timeout;
> +   long    video_timeout;
> +   long    compute_timeout;
>  };
>
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device 
> *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 80bf604019b1..b11af38a0238 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -912,8 +912,10 @@ static void 
> amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
>   * Validates certain module parameters and updates
>   * the associated values used by the driver (all asics).
>   */
> -static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
> +static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
>  {
> +   int ret = 0;
> +
> if (amdgpu_sched_jobs < 4) {
> dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
>  amdgpu_sched_jobs);
> @@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct 
> amdgpu_device *adev)
> amdgpu_vram_page_split = 1024;
> }
>
> -   if (amdgpu_lockup_timeout == 0) {
> -   dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 
> 1\n");
> -   amdgpu_lockup_timeout = 1;
> +   ret = amdgpu_device_get_job_timeout_settings(adev);
> +   if (ret) {
> +   dev_err(adev->dev, "invalid lockup_timeout parameter 
> syntax\n");
> +   return ret;
> }
>
> adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, 
> amdgpu_fw_load_type);
> amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96);
> +
> +   return ret;
>  }
>
>  /**
> @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> mutex_init(&adev->lock_reset);
> mutex_init(&adev->virt.dpm_mutex);
>
> -   amdgpu_device_check_arguments(adev);
> +   r = amdgpu_device_check_arguments(adev);
> +   if (r)
> +   return r;
>
> spin_lock_init(&adev->mmio_idx_lock);
> spin_lock_init(&adev->smc_idx_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 71df27cd03de..c936e0735a0a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -83,6 +83,8 @@
>
>  #define AMDGPU_VERSION "19.10.9.418"
>
> +#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256
> +
>  int amdgpu_vram_limit = 0;
>  int amdgpu_vis_vram_limit = 0;
>  int amdgpu_gart_size = -1; /* auto */
> @@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0;
>  int amdgpu_hw_i2c = 0;
>  int amdgpu_pcie_gen2 = -1;
>  int amdgpu_msi = -1;
> -int amdgpu_lockup_timeout = 1;
> +char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH];
>  int amdgpu_dpm = -1;
>  int amdgpu_fw_load_type = -1;
>  int amdgpu_aspm = -1;
> @@ -232,12 +234,25 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = 
> disable, -1 = auto)");
>  module_param_named(msi, amdgpu_msi, int, 0444);
>
>  /**
> - * DOC: lockup_timeout (int)
> - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be 
> adjusted 

Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already signaled.

2019-04-29 Thread Grodzovsky, Andrey
Thanks David, with that only patches 5 and 6 are left for the series to be 
reviewed.

Christian, any more comments on those patches ?

Andrey

On 4/27/19 10:56 PM, Zhou, David(ChunMing) wrote:
Sorry, I can only put my Acked-by: Chunming Zhou on patch #3.

I cannot fully judge patches #4, #5 and #6.

-David

From: amd-gfx 

 On Behalf Of Grodzovsky, Andrey
Sent: Friday, April 26, 2019 10:09 PM
To: Koenig, Christian 
; Zhou, 
David(ChunMing) ; 
dri-de...@lists.freedesktop.org; 
amd-gfx@lists.freedesktop.org; 
e...@anholt.net; 
etna...@lists.freedesktop.org
Cc: Kazlauskas, Nicholas 
; Liu, Monk 

Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already 
signaled.


Ping (mostly David and Monk).

Andrey
On 4/24/19 3:09 AM, Christian König wrote:
On 24.04.19 at 05:02, Zhou, David(ChunMing) wrote:
>> -drm_sched_stop(&ring->sched, &job->base);
>> -
>>   /* after all hw jobs are reset, hw fence is meaningless, so 
>> force_completion */
>>   amdgpu_fence_driver_force_completion(ring);
>>   }

HW fences are already force-completed, so I think we can just disable
irq fence processing and ignore hw fence signals while we are trying
to do the GPU reset; otherwise the logic will become much more complex.
If this situation happens because of long execution times, we can
increase the timeout of the reset detection.

You are not thinking widely enough: forcing the hw fence to complete
can trigger others to start other activity in the system.

We first need to stop everything and make sure that we don't do any
processing any more, and only then start with our reset procedure,
including forcing all hw fences to complete.

Christian.



-David

From: amd-gfx 

 On Behalf Of Grodzovsky, Andrey
Sent: Wednesday, April 24, 2019 12:00 AM
To: Zhou, David(ChunMing) ; 
dri-de...@lists.freedesktop.org; 
amd-gfx@lists.freedesktop.org; 
e...@anholt.net; 
etna...@lists.freedesktop.org; 
ckoenig.leichtzumer...@gmail.com
Cc: Kazlauskas, Nicholas 
; Liu, Monk 

Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already 
signaled.


No, i mean the actual HW fence which signals when the job finished execution on 
the HW.

Andrey
On 4/23/19 11:19 AM, Zhou, David(ChunMing) wrote:
Do you mean the fence timer? Why not stop it as well when stopping the
scheduler for the hw reset?

 Original Message 
Subject: Re: [PATCH v5 6/6] drm/amdgpu: Avoid HW reset if guilty job already 
signaled.
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" 
,dri-de...@lists.freedesktop.org,amd-gfx@lists.freedesktop.org,e...@anholt.net,etna...@lists.freedesktop.org,ckoenig.leichtzumer...@gmail.com
CC: "Kazlauskas, Nicholas" ,"Liu, Monk"

On 4/22/19 9:09 AM, Zhou, David(ChunMing) wrote:
> +Monk.
>
> GPU reset is used widely in SRIOV, so need virtulizatino guy take a look.
>
> But out of curiosity, why can the guilty job still signal if it is
> already set to guilty? Was it set wrongly?
>
>
> -David


It's possible that the job completes at a later time than when its
timeout handler started processing, so in this patch we try to protect
against this by rechecking the HW fence after stopping all SW
schedulers. We do it BEFORE marking guilty on the job's sched_entity,
so at the point we check, the guilty flag is not set yet.
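In rough kernel C, the ordering is (a sketch of the idea under
discussion, not the actual patch):

	int i;
	bool job_signaled;

	/* 1. Stop all SW schedulers across the device / XGMI hive first. */
	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ring *ring = adev->rings[i];

		if (!ring || !ring->sched.thread)
			continue;
		drm_sched_stop(&ring->sched, &job->base);
	}

	/* 2. Only now recheck the HW fence; the job may have completed
	 * after its timeout handler already started running.
	 */
	job_signaled = job && dma_fence_is_signaled(job->base.s_fence->parent);

	/* 3. Mark the entity guilty and do the HW reset only if the
	 * fence really did not signal.
	 */
	if (!job_signaled)
		drm_sched_increase_karma(&job->base);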

Andrey


>
> On 2019/4/18 23:00, Andrey Grodzovsky wrote:
>> Also reject TDRs if another one already running.
>>
>> v2:
>> Stop all schedulers across device and entire XGMI hive before
>> force signaling HW fences.
>> Avoid passing job_signaled to helper fnctions to keep all the decision
>> making about skipping HW reset in one place.
>>
>> v3:
>> Fix SW sched. hang after non HW reset. sched.hw_rq_count has to be balanced
>> against its decrement in drm_sched_stop in the non HW reset case.
>> v4: rebase
>> v5: Revert v3 as we do it now in scheduler code.
>>
>> Signed-off-by: Andrey Grodzovsky 
>> 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 143 
>> +++--
>>1 file changed, 95 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 

Re: [PATCH] drm/amd/powerplay: enable ppfeaturemask module parameter support on Vega20

2019-04-29 Thread Deucher, Alexander
Reviewed-by: Alex Deucher 

From: amd-gfx  on behalf of Evan Quan 

Sent: Monday, April 29, 2019 4:56 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander; Quan, Evan
Subject: [PATCH] drm/amd/powerplay: enable ppfeaturemask module parameter 
support on Vega20

Support DPM/DS/ULV related bitmasks of ppfeaturemask module parameter.

Change-Id: I6b75becf8d39105189b30be41b58ec7d4425f356
Signed-off-by: Evan Quan 
---
 .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c| 21 +++
 1 file changed, 21 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c 
b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
index 91e26f8b3758..d7873df484a4 100644
--- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
+++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
@@ -97,6 +97,27 @@ static void vega20_set_default_registry_data(struct pp_hwmgr 
*hwmgr)
 if (hwmgr->smu_version < 0x282100)
 data->registry_data.disallowed_features |= FEATURE_ECC_MASK;

+   if (!(hwmgr->feature_mask & PP_PCIE_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_LINK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_SCLK_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_GFXCLK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_SOCCLK_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_SOCCLK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_MCLK_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_UCLK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_DCEFCLK_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_DCEFCLK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_ULV_MASK))
+   data->registry_data.disallowed_features |= FEATURE_ULV_MASK;
+
+   if (!(hwmgr->feature_mask & PP_SCLK_DEEP_SLEEP_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DS_GFXCLK_MASK;
+
 data->registry_data.od_state_in_dc_support = 0;
 data->registry_data.thermal_support = 1;
 data->registry_data.skip_baco_hardware = 0;
--
2.21.0


[PATCH] drm/amdgpu: enable separate timeout setting for every ring type V3

2019-04-29 Thread Evan Quan
Every ring type can have its own timeout setting.

 - V2: update lockup_timeout parameter format and cosmetic fixes
 - V3: invalidate 0 and negative values

Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 90 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 +++--
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
 5 files changed, 132 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f6965b9403eb..c9b44b8c1969 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;
 extern int amdgpu_hw_i2c;
 extern int amdgpu_pcie_gen2;
 extern int amdgpu_msi;
-extern int amdgpu_lockup_timeout;
 extern int amdgpu_dpm;
 extern int amdgpu_fw_load_type;
 extern int amdgpu_aspm;
@@ -428,6 +427,7 @@ struct amdgpu_fpriv {
 };
 
 int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
+int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev);
 
 int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
  unsigned size, struct amdgpu_ib *ib);
@@ -1001,6 +1001,11 @@ struct amdgpu_device {
struct work_struct  xgmi_reset_work;
 
bool    in_baco_reset;
+
+   long    gfx_timeout;
+   long    sdma_timeout;
+   long    video_timeout;
+   long    compute_timeout;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 80bf604019b1..b11af38a0238 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -912,8 +912,10 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct 
amdgpu_device *adev)
  * Validates certain module parameters and updates
  * the associated values used by the driver (all asics).
  */
-static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
+static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 {
+   int ret = 0;
+
if (amdgpu_sched_jobs < 4) {
dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
 amdgpu_sched_jobs);
@@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct 
amdgpu_device *adev)
amdgpu_vram_page_split = 1024;
}
 
-   if (amdgpu_lockup_timeout == 0) {
-   dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 
1\n");
-   amdgpu_lockup_timeout = 1;
+   ret = amdgpu_device_get_job_timeout_settings(adev);
+   if (ret) {
+   dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
+   return ret;
}
 
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, 
amdgpu_fw_load_type);
amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96);
+
+   return ret;
 }
 
 /**
@@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->lock_reset);
mutex_init(&adev->virt.dpm_mutex);
 
-   amdgpu_device_check_arguments(adev);
+   r = amdgpu_device_check_arguments(adev);
+   if (r)
+   return r;
 
spin_lock_init(&adev->mmio_idx_lock);
spin_lock_init(&adev->smc_idx_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 71df27cd03de..c936e0735a0a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -83,6 +83,8 @@
 
 #define AMDGPU_VERSION "19.10.9.418"
 
+#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256
+
 int amdgpu_vram_limit = 0;
 int amdgpu_vis_vram_limit = 0;
 int amdgpu_gart_size = -1; /* auto */
@@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0;
 int amdgpu_hw_i2c = 0;
 int amdgpu_pcie_gen2 = -1;
 int amdgpu_msi = -1;
-int amdgpu_lockup_timeout = 1;
+char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH];
 int amdgpu_dpm = -1;
 int amdgpu_fw_load_type = -1;
 int amdgpu_aspm = -1;
@@ -232,12 +234,25 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = 
disable, -1 = auto)");
 module_param_named(msi, amdgpu_msi, int, 0444);
 
 /**
- * DOC: lockup_timeout (int)
- * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be 
adjusted to 1.
- * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default is 
1.
+ * DOC: lockup_timeout (string)
+ * Set GPU scheduler timeout value in ms.
+ *
+ * The format can be [Non-Compute]:[Compute] or [GFX,SDMA,[Video]]:[Compute].
+ *   - There can be one or multiple 

Re: print firmware versions on amdgpu sysfs

2019-04-29 Thread Lin, Amber
Thank you Alex and Christian for the feedback. Ori will try to add
ioctl support to the SMI CLI to retrieve firmware versions.

Amber

On 2019-04-29 8:22 a.m., Christian König wrote:
We just need to keep in mind that sysfs has more restrictions than debugfs.

E.g. one value per file, backward compatibility etc...

Apart from that I don't see any reason to not do it in sysfs.

Christian.

On 26.04.19 at 20:35, Russell, Kent wrote:
The main reasoning and use case for sysfs would be the SMI CLI, since 
unfortunately there is no ioctl support in there.

Kent

KENT RUSSELL
Sr. Software Engineer | Linux Compute Kernel
1 Commerce Valley Drive East
Markham, ON L3T 7X6
O +(1) 289-695-2122 | Ext 72122

From: amd-gfx 

 on behalf of Deucher, Alexander 

Sent: Friday, April 26, 2019 12:18:22 PM
To: Lin, Amber; 
amd-gfx@lists.freedesktop.org
Cc: Freehill, Chris
Subject: Re: print firmware versions on amdgpu sysfs

We also expose the firmware versions via ioctl, which is what the UMDs
use. If you'd prefer it in sysfs, we could do that too.
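For reference, that ioctl path is reachable from userspace via libdrm -
a rough sketch (error handling omitted; dev is an already initialized
amdgpu_device_handle):

	#include <amdgpu.h>
	#include <amdgpu_drm.h>

	uint32_t version, feature;

	/* DRM_AMDGPU_INFO query for the SMC firmware version. */
	amdgpu_query_firmware_version(dev, AMDGPU_INFO_FW_SMC,
				      0 /* ip instance */, 0 /* index */,
				      &version, &feature);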

Alex


From: amd-gfx 

 on behalf of Lin, Amber 
Sent: Friday, April 26, 2019 10:14 AM
To: amd-gfx@lists.freedesktop.org
Cc: Freehill, Chris
Subject: print firmware versions on amdgpu sysfs

Hi Alex,

Currently we retrieve firmware versions from /sys/kernel/debug but this
requires debugfs being enabled and superuser privilege. Do you see a
concern we add firmware versions to amdgpu sysfs at
/sys/class/drm/cardX/device like vbios_version?

Regards,
Amber

Re: [PATCH 3/3] drm/amd/display: Compensate for pre-DCE12 BTR-VRR hw limitations. (v3)

2019-04-29 Thread Kazlauskas, Nicholas
On 4/26/19 5:40 PM, Mario Kleiner wrote:
> Pre-DCE12 needs special treatment for BTR / low framerate
> compensation for more stable behaviour:
> 
> According to comments in the code and some testing on DCE-8
> and DCE-11, DCE-11 and earlier only apply VTOTAL_MIN/MAX
> programming with a lag of one frame, so the special BTR hw
> programming for intermediate fixed duration frames must be
> done inside the current frame at flip submission in atomic
> commit tail, ie. one vblank earlier, and the fixed refresh
> intermediate frame mode must be also terminated one vblank
> earlier on pre-DCE12 display engines.
> 
> To achieve proper termination on < DCE-12 shift the point
> when the switch-back from fixed vblank duration to variable
> vblank duration happens from the start of VBLANK (vblank irq,
> as done on DCE-12+) to back-porch or end of VBLANK (handled
> by vupdate irq handler). We must leave the switch-back code
> inside VBLANK irq for DCE12+, as before.
> 
> Doing this, we get much better behaviour of BTR for up-sweeps,
> ie. going from short to long frame durations (~high to low fps)
> and for constant framerate flips, as tested on DCE-8 and
> DCE-11. Behaviour is still not quite as good as on DCN-1
> though.
> 
> On down-sweeps, going from long to short frame durations
> (low fps to high fps) < DCE-12 is a little bit improved,
> although by far not as much as for up-sweeps and constant
> fps.
> 
> v2: Fix some wrong locking, as pointed out by Nicholas.
> v3: Simplify if-condition in vupdate-irq - nit by Nicholas.
> Signed-off-by: Mario Kleiner 

Reviewed-by: Nicholas Kazlauskas 

I'd like to push patches 1+3 if you're okay with this.

I can see the value in the 2nd patch from the graphs and test 
application you've posted but it'll likely take some time to do thorough 
review and testing on this.

The question that needs to be examined with the second is what the 
optimal margin is, if any, for typical usecases across common monitor 
ranges. Not all monitors that support BTR have the same range, and many 
have a lower bound of ~48Hz. To me ~53Hz sounds rather early to be 
entering, but I'm not sure how it affects the user experience overall.
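(For a rough feel for that number: with a 48 Hz lower bound the maximum
frame duration is 1000/48 = ~20.8 ms, so the 2 ms entry margin reused
from BTR_EXIT_MARGIN starts BTR once frame durations exceed
20.8 - 2 = 18.8 ms, i.e. below 1000/18.8 = ~53 Hz - which is where the
~53 Hz figure comes from.)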

Nicholas Kazlauskas

> ---
>   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 48 +--
>   1 file changed, 44 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 76b6e621793f..92b3c2cec7dd 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -364,6 +364,7 @@ static void dm_vupdate_high_irq(void *interrupt_params)
>   struct amdgpu_device *adev = irq_params->adev;
>   struct amdgpu_crtc *acrtc;
>   struct dm_crtc_state *acrtc_state;
> + unsigned long flags;
>   
>   acrtc = get_crtc_by_otg_inst(adev, irq_params->irq_src - 
> IRQ_TYPE_VUPDATE);
>   
> @@ -379,8 +380,25 @@ static void dm_vupdate_high_irq(void *interrupt_params)
>* page-flip completion events that have been queued to us
>* if a pageflip happened inside front-porch.
>*/
> - if (amdgpu_dm_vrr_active(acrtc_state))
> + if (amdgpu_dm_vrr_active(acrtc_state)) {
>   drm_crtc_handle_vblank(&acrtc->base);
> +
> + /* BTR processing for pre-DCE12 ASICs */
> + if (acrtc_state->stream &&
> + adev->family < AMDGPU_FAMILY_AI) {
> + spin_lock_irqsave(&adev->ddev->event_lock, 
> flags);
> + mod_freesync_handle_v_update(
> + adev->dm.freesync_module,
> + acrtc_state->stream,
> + &acrtc_state->vrr_params);
> +
> + dc_stream_adjust_vmin_vmax(
> + adev->dm.dc,
> + acrtc_state->stream,
> + &acrtc_state->vrr_params.adjust);
> + spin_unlock_irqrestore(&adev->ddev->event_lock, 
> flags);
> + }
> + }
>   }
>   }
>   
> @@ -390,6 +408,7 @@ static void dm_crtc_high_irq(void *interrupt_params)
>   struct amdgpu_device *adev = irq_params->adev;
>   struct amdgpu_crtc *acrtc;
>   struct dm_crtc_state *acrtc_state;
> + unsigned long flags;
>   
>   acrtc = get_crtc_by_otg_inst(adev, irq_params->irq_src - 
> IRQ_TYPE_VBLANK);
>   
> @@ -412,9 +431,10 @@ static void dm_crtc_high_irq(void *interrupt_params)
>*/
>   amdgpu_dm_crtc_handle_crc_irq(&acrtc->base);
>   
> - if (acrtc_state->stream &&
> + if (acrtc_state->stream && adev->family >= AMDGPU_FAMILY_AI &&
>   acrtc_state->vrr_params.supported &&
>   acrtc_state->freesync_config.state == 
> VRR_STATE_ACTIVE_VARIABLE) 

Re: [PATCH 27/27] drm/amdgpu: Fix GTT size calculation

2019-04-29 Thread Christian König

On 28.04.19 at 09:44, Kuehling, Felix wrote:

From: Kent Russell 

GTT size is currently limited to the minimum of VRAM size or 3/4 of
system memory. This severely limits the quantity of system memory
that can be used by ROCm applications.

Increase GTT size to the maximum of VRAM size or system memory size.


Well, NAK.

This limit was done on purpose because otherwise max-texture-size
allocations would crash the system: the OOM killer would cause a
system panic.


Using more than 75% of system memory for the GPU at the same time
makes the system unstable, so we can't allow that by default.


What could maybe work is to reduce the amount of system memory by a
fixed factor, but off hand I don't see a way of fixing this in
general.
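To put numbers on it (taking the driver's 3 GB AMDGPU_DEFAULT_GTT_SIZE_MB
default, and assuming a machine with 16 GB VRAM and 32 GB system RAM):

	old: gtt_size = min(max(3 GB, 16 GB), 3/4 * 32 GB) = min(16 GB, 24 GB) = 16 GB
	new: gtt_size = max3(3 GB, 16 GB, 32 GB) = 32 GB

i.e. with the patch GTT may cover all of system memory, which is exactly
the over-75% situation described above.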


Regards,
Christian.



Signed-off-by: Kent Russell 
Reviewed-by: Felix Kuehling 
Signed-off-by: Felix Kuehling 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 9 -
  1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index c14198737dcd..e9ecc3953673 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1740,11 +1740,10 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
struct sysinfo si;
  
		si_meminfo(&si);

-   gtt_size = min(max((AMDGPU_DEFAULT_GTT_SIZE_MB << 20),
-  adev->gmc.mc_vram_size),
-  ((uint64_t)si.totalram * si.mem_unit * 3/4));
-   }
-   else
+   gtt_size = max3((AMDGPU_DEFAULT_GTT_SIZE_MB << 20),
+   adev->gmc.mc_vram_size,
+   ((uint64_t)si.totalram * si.mem_unit));
+   } else
gtt_size = (uint64_t)amdgpu_gtt_size << 20;
  
  	/* Initialize GTT memory pool */



Re: [PATCH] drm/amdgpu: Unmap CSA under SR-IOV in KFD path

2019-04-29 Thread Christian König

On 28.04.19 at 12:25, Trigger Huang wrote:

In the amdgpu open path, the CSA will be mapped in the VM, so when
opening KFD, calling amdgpu_vm_make_compute will fail because it finds
that this VM is not a clean VM (it already has mappings); as a result,
creating the process VM object fails.

The fix is to unmap the CSA; the CSA is actually not needed in the
compute VF world switch.


Well, we should rather try to avoid that.

The plan with that was that one VM can be used by both compute and
gfx.


I would rather relax the check in amdgpu_vm_make_compute() to make
sure that we don't have page tables filled, instead of checking for a
mapping.
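Roughly like this (a hypothetical sketch of the suggestion;
amdgpu_vm_has_filled_page_tables() is a made-up name, not an existing
function):

	/* In amdgpu_vm_make_compute(): tolerate mappings such as the CSA,
	 * but still reject a VM whose page tables already contain entries.
	 */
	if (amdgpu_vm_has_filled_page_tables(adev, avm))
		return -EINVAL;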


Christian.



Signed-off-by: Trigger Huang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 10 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c  |  2 +-
  2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 697b8ef..e0bc457 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -956,6 +956,16 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev 
*kgd,
if (avm->process_info)
return -EINVAL;
  
+	/* Delete CSA mapping to make sure this VM is a clean VM before
+	 * converting VM
+	 */
+   if (amdgpu_sriov_vf(adev) && drv_priv->csa_va) {
+   amdgpu_bo_reserve(adev->virt.csa_obj, true);
+   amdgpu_vm_bo_rmv(adev, drv_priv->csa_va);
+   drv_priv->csa_va = NULL;
+   amdgpu_bo_unreserve(adev->virt.csa_obj);
+   }
+
/* Convert VM into a compute VM */
ret = amdgpu_vm_make_compute(adev, avm, pasid);
if (ret)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index da7b4fe..361c2e5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1069,7 +1069,7 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
  
  	amdgpu_vm_bo_rmv(adev, fpriv->prt_va);
  
-	if (amdgpu_sriov_vf(adev)) {

+   if (amdgpu_sriov_vf(adev) && fpriv->csa_va) {
/* TODO: how to handle reserve failure */
BUG_ON(amdgpu_bo_reserve(adev->virt.csa_obj, true));
amdgpu_vm_bo_rmv(adev, fpriv->csa_va);



Re: print firmware versions on amdgpu sysfs

2019-04-29 Thread Christian König

We just need to keep in mind that sysfs has more restrictions than debugfs.

E.g. one value per file, backward compatibility etc...

Apart from that I don't see any reason to not do it in sysfs.
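Respecting the one-value-per-file rule, such an attribute could look
roughly like this (sketch only; the name and the use of
adev->pm.fw_version are illustrative):

static ssize_t smc_fw_version_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = ddev->dev_private;

	/* One firmware version per file, next to vbios_version. */
	return snprintf(buf, PAGE_SIZE, "0x%08x\n", adev->pm.fw_version);
}
static DEVICE_ATTR_RO(smc_fw_version);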

Christian.

On 26.04.19 at 20:35, Russell, Kent wrote:
The main reasoning and use case for sysfs would be the SMI CLI, since 
unfortunately there is no ioctl support in there.


Kent

KENT RUSSELL
Sr. Software Engineer | Linux Compute Kernel
1 Commerce Valley Drive East
Markham, ON L3T 7X6
O +(1) 289-695-2122 | Ext 72122

From: amd-gfx  on behalf of 
Deucher, Alexander 

Sent: Friday, April 26, 2019 12:18:22 PM
To: Lin, Amber; amd-gfx@lists.freedesktop.org
Cc: Freehill, Chris
Subject: Re: print firmware versions on amdgpu sysfs
We also expose the firmware versions via ioctl, which is what the UMDs
use. If you'd prefer it in sysfs, we could do that too.


Alex


From: amd-gfx  on behalf of 
Lin, Amber 

Sent: Friday, April 26, 2019 10:14 AM
To: amd-gfx@lists.freedesktop.org
Cc: Freehill, Chris
Subject: print firmware versions on amdgpu sysfs
Hi Alex,

Currently we retrieve firmware versions from /sys/kernel/debug but this
requires debugfs being enabled and superuser privilege. Do you see a
concern we add firmware versions to amdgpu sysfs at
/sys/class/drm/cardX/device like vbios_version?

Regards,
Amber

RE: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type

2019-04-29 Thread Quan, Evan
Thanks for reviewing. Will update them in V2.

Regards,
Evan
> -Original Message-
> From: Michel Dänzer 
> Sent: Monday, April 29, 2019 6:17 PM
> To: Quan, Evan 
> Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> ; Lou, Wentao ;
> Koenig, Christian 
> Subject: Re: [PATCH] drm/amdgpu: enable separate timeout setting for
> every ring type
> 
> On 2019-04-29 10:57 a.m., Evan Quan wrote:
> > Every ring type can have its own timeout setting.
> >
> > Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
> > Signed-off-by: Evan Quan 
> 
> This is going in a good direction, but there are still some minor/cosmetic
> issues.
> 
> 
> > @@ -958,13 +960,16 @@ static void
> amdgpu_device_check_arguments(struct amdgpu_device *adev)
> > amdgpu_vram_page_split = 1024;
> > }
> >
> > -   if (amdgpu_lockup_timeout == 0) {
> > -   dev_warn(adev->dev, "lockup_timeout msut be > 0,
> adjusting to 1\n");
> > -   amdgpu_lockup_timeout = 1;
> > +   ret = amdgpu_device_get_job_timeout(adev);
> > +   if (ret) {
> > +   dev_err(adev->dev, "invalid job timeout setting\n");
> > +   return ret;
> > }
> 
> The error message should still explicitly mention lockup_timeout, or it may
> not be clear to the user what it's about. E.g. "Invalid lockup_timeout
> parameter syntax".
> 
> 
> > @@ -232,12 +234,20 @@ MODULE_PARM_DESC(msi, "MSI support (1 =
> enable,
> > 0 = disable, -1 = auto)");  module_param_named(msi, amdgpu_msi, int,
> > 0444);
> >
> >  /**
> > - * DOC: lockup_timeout (int)
> > - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be
> adjusted to 1.
> > - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The
> default is 1.
> > + * DOC: lockup_timeout (string)
> > + * Set GPU scheduler timeout value in ms.
> > + *
> > + * The format is non-compute[.gfx.sdma.video][:compute].
> > + * With no "non-compute[.gfx.sdma.video]" timeout specified, the
> default timeout for non-compute_job is 1.
> > + * The "non-compute" timeout setting applys to all non compute
> > + IPs(gfx, sdma and video). Unless
> > + * the timeout for this IP is specified separately(by "[.gfx.sdma.video]").
> 
> A dot is a bit weird as a separator, comma (or maybe semicolon) would be
> better.
> 
> Also, instead of requiring a general non-compute value (which is unused if all
> 3 engine specific values are specified) before the engine specific ones, how
> about: If only one non-compute value is specified, it applies to all non-
> compute engines. If multiple values are specified, the first one is for GFX,
> second one for SDMA, third one for video.
> 
> 
> > @@ -1307,6 +1317,66 @@ int amdgpu_file_to_fpriv(struct file *filp, struct
> amdgpu_fpriv **fpriv)
> > return 0;
> >  }
> >
> > +int amdgpu_device_get_job_timeout(struct amdgpu_device *adev) {
> > +   char *input = amdgpu_lockup_timeout;
> > +   char *non_compute_setting = NULL;
> > +   char *timeout_setting = NULL;
> > +   int index = 0;
> > +   int ret = 0;
> > +
> > +   /* Default timeout for non compute job is 1 */
> > +   adev->gfx_timeout =
> > +   adev->sdma_timeout =
> > +   adev->video_timeout = 1;
> 
> This is a bit weird formatting. :) Maybe it can be one or two lines, otherwise
> the second continuation line should have the same indentation as the first
> one.
> 
> 
> > +   /* Enforce no timeout on compute job at default */
> 
> "by default" (same in amdgpu_fence_driver_init_ring).
> 
> 
> --
> Earthling Michel Dänzer   |  https://www.amd.com
> Libre software enthusiast | Mesa and X developer

Re: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type

2019-04-29 Thread Michel Dänzer
On 2019-04-29 10:57 a.m., Evan Quan wrote:
> Every ring type can have its own timeout setting.
> 
> Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
> Signed-off-by: Evan Quan 

This is going in a good direction, but there are still some
minor/cosmetic issues.


> @@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct 
> amdgpu_device *adev)
>   amdgpu_vram_page_split = 1024;
>   }
>  
> - if (amdgpu_lockup_timeout == 0) {
> - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 
> 1\n");
> - amdgpu_lockup_timeout = 1;
> + ret = amdgpu_device_get_job_timeout(adev);
> + if (ret) {
> + dev_err(adev->dev, "invalid job timeout setting\n");
> + return ret;
>   }

The error message should still explicitly mention lockup_timeout, or it
may not be clear to the user what it's about. E.g. "Invalid
lockup_timeout parameter syntax".


> @@ -232,12 +234,20 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = 
> disable, -1 = auto)");
>  module_param_named(msi, amdgpu_msi, int, 0444);
>  
>  /**
> - * DOC: lockup_timeout (int)
> - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be 
> adjusted to 1.
> - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default 
> is 1.
> + * DOC: lockup_timeout (string)
> + * Set GPU scheduler timeout value in ms.
> + *
> + * The format is non-compute[.gfx.sdma.video][:compute].
> + * With no "non-compute[.gfx.sdma.video]" timeout specified, the default 
> timeout for non-compute_job is 1.
> + * The "non-compute" timeout setting applys to all non compute IPs(gfx, sdma 
> and video). Unless
> + * the timeout for this IP is specified separately(by "[.gfx.sdma.video]").

A dot is a bit weird as a separator, comma (or maybe semicolon) would be
better.

Also, instead of requiring a general non-compute value (which is unused
if all 3 engine specific values are specified) before the engine
specific ones, how about: If only one non-compute value is specified, it
applies to all non-compute engines. If multiple values are specified,
the first one is for GFX, second one for SDMA, third one for video.
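For illustration, under that scheme (values are hypothetical):

	lockup_timeout=10000 - all non-compute engines use 10000 ms
	lockup_timeout=10000,20000,30000 - GFX 10000 ms, SDMA 20000 ms, video 30000 ms
	lockup_timeout=10000:60000 - non-compute 10000 ms, compute 60000 ms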


> @@ -1307,6 +1317,66 @@ int amdgpu_file_to_fpriv(struct file *filp, struct 
> amdgpu_fpriv **fpriv)
>   return 0;
>  }
>  
> +int amdgpu_device_get_job_timeout(struct amdgpu_device *adev)
> +{
> + char *input = amdgpu_lockup_timeout;
> + char *non_compute_setting = NULL;
> + char *timeout_setting = NULL;
> + int index = 0;
> + int ret = 0;
> +
> + /* Default timeout for non compute job is 1 */
> + adev->gfx_timeout =
> + adev->sdma_timeout =
> + adev->video_timeout = 1;

This is a bit weird formatting. :) Maybe it can be one or two lines,
otherwise the second continuation line should have the same indentation
as the first one.
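Concretely, either

	adev->gfx_timeout = adev->sdma_timeout = adev->video_timeout = timeout;

or

	adev->gfx_timeout =
		adev->sdma_timeout =
		adev->video_timeout = timeout;

(with timeout standing in for the default value).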


> + /* Enforce no timeout on compute job at default */

"by default" (same in amdgpu_fence_driver_init_ring).


-- 
Earthling Michel Dänzer   |  https://www.amd.com
Libre software enthusiast | Mesa and X developer

Re: [PATCH] drm/amdgpu: enable separate timeout setting for every ring type

2019-04-29 Thread Koenig, Christian
On 29.04.19 at 10:57, Evan Quan wrote:
> Every ring type can have its own timeout setting.
>
> Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
> Signed-off-by: Evan Quan 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 82 --
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 +++--
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
>   5 files changed, 124 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index f6965b9403eb..9d7bf9fbf2ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;
>   extern int amdgpu_hw_i2c;
>   extern int amdgpu_pcie_gen2;
>   extern int amdgpu_msi;
> -extern int amdgpu_lockup_timeout;
>   extern int amdgpu_dpm;
>   extern int amdgpu_fw_load_type;
>   extern int amdgpu_aspm;
> @@ -428,6 +427,7 @@ struct amdgpu_fpriv {
>   };
>   
>   int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
> +int amdgpu_device_get_job_timeout(struct amdgpu_device *adev);
>   
>   int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> unsigned size, struct amdgpu_ib *ib);
> @@ -1001,6 +1001,11 @@ struct amdgpu_device {
>   struct work_struct  xgmi_reset_work;
>   
>   bool    in_baco_reset;
> +
> + long    gfx_timeout;
> + long    sdma_timeout;
> + long    video_timeout;
> + long    compute_timeout;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device 
> *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 80bf604019b1..bb111c05949d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -912,8 +912,10 @@ static void 
> amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
>* Validates certain module parameters and updates
>* the associated values used by the driver (all asics).
>*/
> -static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
> +static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
>   {
> + int ret = 0;
> +
>   if (amdgpu_sched_jobs < 4) {
>   dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
>amdgpu_sched_jobs);
> @@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct 
> amdgpu_device *adev)
>   amdgpu_vram_page_split = 1024;
>   }
>   
> - if (amdgpu_lockup_timeout == 0) {
> - dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 
> 1\n");
> - amdgpu_lockup_timeout = 1;
> + ret = amdgpu_device_get_job_timeout(adev);
> + if (ret) {
> + dev_err(adev->dev, "invalid job timeout setting\n");
> + return ret;
>   }
>   
>   adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, 
> amdgpu_fw_load_type);
>   amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96);
> +
> + return ret;
>   }
>   
>   /**
> @@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>   mutex_init(&adev->lock_reset);
>   mutex_init(&adev->virt.dpm_mutex);
>   
> - amdgpu_device_check_arguments(adev);
> + ret = amdgpu_device_check_arguments(adev);
> + if (ret)
> + return ret;
>   
>   spin_lock_init(&adev->mmio_idx_lock);
>   spin_lock_init(&adev->smc_idx_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 71df27cd03de..4694e36d1bf1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -83,6 +83,8 @@
>   
>   #define AMDGPU_VERSION  "19.10.9.418"
>   
> +#define MAX_PARAM_LENGTH 256

This needs a better name; at least put an AMDGPU_ prefix in front of it.
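E.g. something like:

	#define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH 256

(the V3 revision of this patch indeed went that way, as
AMDGPU_MAX_TIMEOUT_PARAM_LENTH).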

> +
>   int amdgpu_vram_limit = 0;
>   int amdgpu_vis_vram_limit = 0;
>   int amdgpu_gart_size = -1; /* auto */
> @@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0;
>   int amdgpu_hw_i2c = 0;
>   int amdgpu_pcie_gen2 = -1;
>   int amdgpu_msi = -1;
> -int amdgpu_lockup_timeout = 1;
> +char amdgpu_lockup_timeout[MAX_PARAM_LENGTH];
>   int amdgpu_dpm = -1;
>   int amdgpu_fw_load_type = -1;
>   int amdgpu_aspm = -1;
> @@ -232,12 +234,20 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = 
> disable, -1 = auto)");
>   module_param_named(msi, amdgpu_msi, int, 0444);
>   
>   /**
> - * DOC: lockup_timeout (int)
> - * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be 
> adjusted to 1.
> - * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The 

[PATCH] drm/amdgpu: enable separate timeout setting for every ring type

2019-04-29 Thread Evan Quan
Every ring type can have its own timeout setting.

Change-Id: I992f224f36bb33acd560162bffd2c3e987840a7e
Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  7 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 82 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 35 +++--
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c  |  2 +-
 5 files changed, 124 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f6965b9403eb..9d7bf9fbf2ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -118,7 +118,6 @@ extern int amdgpu_disp_priority;
 extern int amdgpu_hw_i2c;
 extern int amdgpu_pcie_gen2;
 extern int amdgpu_msi;
-extern int amdgpu_lockup_timeout;
 extern int amdgpu_dpm;
 extern int amdgpu_fw_load_type;
 extern int amdgpu_aspm;
@@ -428,6 +427,7 @@ struct amdgpu_fpriv {
 };
 
 int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv);
+int amdgpu_device_get_job_timeout(struct amdgpu_device *adev);
 
 int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm,
  unsigned size, struct amdgpu_ib *ib);
@@ -1001,6 +1001,11 @@ struct amdgpu_device {
struct work_struct  xgmi_reset_work;
 
bool    in_baco_reset;
+
+   long    gfx_timeout;
+   long    sdma_timeout;
+   long    video_timeout;
+   long    compute_timeout;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 80bf604019b1..bb111c05949d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -912,8 +912,10 @@ static void amdgpu_device_check_smu_prv_buffer_size(struct 
amdgpu_device *adev)
  * Validates certain module parameters and updates
  * the associated values used by the driver (all asics).
  */
-static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
+static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 {
+   int ret = 0;
+
if (amdgpu_sched_jobs < 4) {
dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
 amdgpu_sched_jobs);
@@ -958,13 +960,16 @@ static void amdgpu_device_check_arguments(struct 
amdgpu_device *adev)
amdgpu_vram_page_split = 1024;
}
 
-   if (amdgpu_lockup_timeout == 0) {
-   dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 
1\n");
-   amdgpu_lockup_timeout = 1;
+   ret = amdgpu_device_get_job_timeout(adev);
+   if (ret) {
+   dev_err(adev->dev, "invalid job timeout setting\n");
+   return ret;
}
 
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, 
amdgpu_fw_load_type);
amdgpu_direct_gma_size = min(amdgpu_direct_gma_size, 96);
+
+   return ret;
 }
 
 /**
@@ -2468,7 +2473,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->lock_reset);
mutex_init(&adev->virt.dpm_mutex);
 
-   amdgpu_device_check_arguments(adev);
+   ret = amdgpu_device_check_arguments(adev);
+   if (ret)
+   return ret;
 
spin_lock_init(&adev->mmio_idx_lock);
spin_lock_init(&adev->smc_idx_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 71df27cd03de..4694e36d1bf1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -83,6 +83,8 @@
 
 #define AMDGPU_VERSION "19.10.9.418"
 
+#define MAX_PARAM_LENGTH   256
+
 int amdgpu_vram_limit = 0;
 int amdgpu_vis_vram_limit = 0;
 int amdgpu_gart_size = -1; /* auto */
@@ -95,7 +97,7 @@ int amdgpu_disp_priority = 0;
 int amdgpu_hw_i2c = 0;
 int amdgpu_pcie_gen2 = -1;
 int amdgpu_msi = -1;
-int amdgpu_lockup_timeout = 1;
+char amdgpu_lockup_timeout[MAX_PARAM_LENGTH];
 int amdgpu_dpm = -1;
 int amdgpu_fw_load_type = -1;
 int amdgpu_aspm = -1;
@@ -232,12 +234,20 @@ MODULE_PARM_DESC(msi, "MSI support (1 = enable, 0 = 
disable, -1 = auto)");
 module_param_named(msi, amdgpu_msi, int, 0444);
 
 /**
- * DOC: lockup_timeout (int)
- * Set GPU scheduler timeout value in ms. Value 0 is invalidated, will be 
adjusted to 1.
- * Negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET). The default is 
1.
+ * DOC: lockup_timeout (string)
+ * Set GPU scheduler timeout value in ms.
+ *
+ * The format is non-compute[.gfx.sdma.video][:compute].
+ * With no "non-compute[.gfx.sdma.video]" timeout specified, the default 
timeout for non-compute_job is 1.
+ * The "non-compute" timeout setting applys to all non compute IPs(gfx, sdma 
and video). 

[PATCH] drm/amd/powerplay: enable ppfeaturemask module parameter support on Vega20

2019-04-29 Thread Evan Quan
Support DPM/DS/ULV related bitmasks of ppfeaturemask module parameter.

Change-Id: I6b75becf8d39105189b30be41b58ec7d4425f356
Signed-off-by: Evan Quan 
---
 .../drm/amd/powerplay/hwmgr/vega20_hwmgr.c| 21 +++
 1 file changed, 21 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c 
b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
index 91e26f8b3758..d7873df484a4 100644
--- a/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
+++ b/drivers/gpu/drm/amd/powerplay/hwmgr/vega20_hwmgr.c
@@ -97,6 +97,27 @@ static void vega20_set_default_registry_data(struct pp_hwmgr 
*hwmgr)
if (hwmgr->smu_version < 0x282100)
data->registry_data.disallowed_features |= FEATURE_ECC_MASK;
 
+   if (!(hwmgr->feature_mask & PP_PCIE_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_LINK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_SCLK_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_GFXCLK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_SOCCLK_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_SOCCLK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_MCLK_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_UCLK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_DCEFCLK_DPM_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DPM_DCEFCLK_MASK;
+
+   if (!(hwmgr->feature_mask & PP_ULV_MASK))
+   data->registry_data.disallowed_features |= FEATURE_ULV_MASK;
+
+   if (!(hwmgr->feature_mask & PP_SCLK_DEEP_SLEEP_MASK))
+   data->registry_data.disallowed_features |= 
FEATURE_DS_GFXCLK_MASK;
+
data->registry_data.od_state_in_dc_support = 0;
data->registry_data.thermal_support = 1;
data->registry_data.skip_baco_hardware = 0;
-- 
2.21.0


[PATCH] drm/amd/powerplay: add helper function to get smu firmware & if version

2019-04-29 Thread Wang, Kevin(Yang)
Add this helper function to get the SMC firmware and driver interface versions.

Change-Id: I6b06470cefd10fafcf06df8a5e8cb03bf79622c0
Signed-off-by: Kevin Wang 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c| 30 +++
 .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  1 +
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 23 +-
 3 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 25d2ba2b9018..c021c05727e8 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -29,6 +29,36 @@
 #include "smu_v11_0.h"
 #include "atom.h"
 
+int smu_get_smc_version(struct smu_context *smu, uint32_t *if_version, 
uint32_t *smu_version)
+{
+   int ret = 0;
+
+   if (!if_version && !smu_version)
+   return -EINVAL;
+
+   if (if_version) {
+   ret = smu_send_smc_msg(smu, SMU_MSG_GetDriverIfVersion);
+   if (ret)
+   return ret;
+
+   ret = smu_read_smc_arg(smu, if_version);
+   if (ret)
+   return ret;
+   }
+
+   if (smu_version) {
+   ret = smu_send_smc_msg(smu, SMU_MSG_GetSmuVersion);
+   if (ret)
+   return ret;
+
+   ret = smu_read_smc_arg(smu, smu_version);
+   if (ret)
+   return ret;
+   }
+
+   return ret;
+}
+
 int smu_set_soft_freq_range(struct smu_context *smu, enum smu_clk_type 
clk_type,
uint32_t min, uint32_t max)
 {
diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
index e958d4cb5baf..435727c8ee21 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
@@ -931,4 +931,5 @@ int smu_set_soft_freq_range(struct smu_context *smu, enum 
smu_clk_type clk_type,
uint32_t min, uint32_t max);
 int smu_set_hard_freq_range(struct smu_context *smu, enum smu_clk_type 
clk_type,
uint32_t min, uint32_t max);
+int smu_get_smc_version(struct smu_context *smu, uint32_t *if_version, 
uint32_t *smu_version);
 #endif
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index ab89c79b4358..c2fe00f51b2b 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -249,20 +249,27 @@ static int smu_v11_0_check_fw_status(struct smu_context 
*smu)
 
 static int smu_v11_0_check_fw_version(struct smu_context *smu)
 {
-   uint32_t smu_version = 0xff;
+   uint32_t if_version = 0xff, smu_version = 0xff;
+   uint16_t smu_major;
+   uint8_t smu_minor, smu_debug;
int ret = 0;
 
-   ret = smu_send_smc_msg(smu, SMU_MSG_GetDriverIfVersion);
+   ret = smu_get_smc_version(smu, &if_version, &smu_version);
if (ret)
-   goto err;
+   return ret;
 
-   ret = smu_read_smc_arg(smu, &smu_version);
-   if (ret)
-   goto err;
+   smu_major = (smu_version >> 16) & 0xffff;
+   smu_minor = (smu_version >> 8) & 0xff;
+   smu_debug = (smu_version >> 0) & 0xff;
+
+   pr_info("SMU Driver IF Version = 0x%08x, SMU FW Version = 0x%08x 
(%d.%d.%d)\n",
+   if_version, smu_version, smu_major, smu_minor, smu_debug);
 
-   if (smu_version != smu->smc_if_version)
+   if (if_version != smu->smc_if_version) {
+   pr_err("SMU driver if version not matched\n");
ret = -EINVAL;
-err:
+   }
+
return ret;
 }
 
-- 
2.21.0


RE: [PATCH] drm/amd/powerplay: add helper function to get smu firmware & if version

2019-04-29 Thread Huang, Ray
> -Original Message-
> From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf
> Of Wang, Kevin(Yang)
> Sent: Monday, April 29, 2019 3:05 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Huang, Ray ; Wang, Kevin(Yang)
> 
> Subject: [PATCH] drm/amd/powerplay: add helper function to get smu
> firmware & if version
> 
> add this helper function to get smc version.
> 
> Change-Id: I6b06470cefd10fafcf06df8a5e8cb03bf79622c0
> Signed-off-by: Kevin Wang 

Reviewed-by: Huang Rui 

> ---
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c| 30
> +++
>  .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  1 +
>  drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 23 +-
>  3 files changed, 46 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index 25d2ba2b9018..c021c05727e8 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -29,6 +29,36 @@
>  #include "smu_v11_0.h"
>  #include "atom.h"
> 
> +int smu_get_smc_version(struct smu_context *smu, uint32_t *if_version,
> +uint32_t *smu_version) {
> + int ret = 0;
> +
> + if (!if_version && !smu_version)
> + return -EINVAL;
> +
> + if (if_version) {
> + ret = smu_send_smc_msg(smu,
> SMU_MSG_GetDriverIfVersion);
> + if (ret)
> + return ret;
> +
> + ret = smu_read_smc_arg(smu, if_version);
> + if (ret)
> + return ret;
> + }
> +
> + if (smu_version) {
> + ret = smu_send_smc_msg(smu,
> SMU_MSG_GetSmuVersion);
> + if (ret)
> + return ret;
> +
> + ret = smu_read_smc_arg(smu, smu_version);
> + if (ret)
> + return ret;
> + }
> +
> + return ret;
> +}
> +
>  int smu_set_soft_freq_range(struct smu_context *smu, enum
> smu_clk_type clk_type,
>   uint32_t min, uint32_t max)
>  {
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index e958d4cb5baf..435727c8ee21 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -931,4 +931,5 @@ int smu_set_soft_freq_range(struct smu_context
> *smu, enum smu_clk_type clk_type,
>   uint32_t min, uint32_t max);
>  int smu_set_hard_freq_range(struct smu_context *smu, enum
> smu_clk_type clk_type,
>   uint32_t min, uint32_t max);
> +int smu_get_smc_version(struct smu_context *smu, uint32_t *if_version,
> +uint32_t *smu_version);
>  #endif
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index ab89c79b4358..c2fe00f51b2b 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -249,20 +249,27 @@ static int smu_v11_0_check_fw_status(struct
> smu_context *smu)
> 
>  static int smu_v11_0_check_fw_version(struct smu_context *smu)  {
> - uint32_t smu_version = 0xff;
> + uint32_t if_version = 0xff, smu_version = 0xff;
> + uint16_t smu_major;
> + uint8_t smu_minor, smu_debug;
>   int ret = 0;
> 
> - ret = smu_send_smc_msg(smu, SMU_MSG_GetDriverIfVersion);
> + ret = smu_get_smc_version(smu, &if_version, &smu_version);
>   if (ret)
> - goto err;
> + return ret;
> 
> - ret = smu_read_smc_arg(smu, &smu_version);
> - if (ret)
> - goto err;
> + smu_major = (smu_version >> 16) & 0xffff;
> + smu_minor = (smu_version >> 8) & 0xff;
> + smu_debug = (smu_version >> 0) & 0xff;
> +
> + pr_info("SMU Driver IF Version = 0x%08x, SMU FW Version = 0x%08x
> (%d.%d.%d)\n",
> + if_version, smu_version, smu_major, smu_minor,
> smu_debug);
> 
> - if (smu_version != smu->smc_if_version)
> + if (if_version != smu->smc_if_version) {
> + pr_err("SMU driver if version not matched\n");
>   ret = -EINVAL;
> -err:
> + }
> +
>   return ret;
>  }
> 
> --
> 2.21.0
> 