date:20200506

Re: [PATCH] MAINTAINERS: Remove me from amdgpu maintainers

2020-05-06 Thread Christian König


Am 07.05.20 um 04:02 schrieb Chunming Zhou:

Glad to spend time on kernel driver in past years.
I've moved to new focus in umd and couldn't commit
enough time to discussions.

Signed-off-by: Chunming Zhou 


So Long, and Thanks for All the Fish :)

Reviewed-by: Christian König 


---
  MAINTAINERS | 1 -
  1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 938316092634..4ca508bd4c9e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14066,7 +14066,6 @@ F:  drivers/net/wireless/quantenna
  RADEON and AMDGPU DRM DRIVERS
  M:Alex Deucher 
  M:Christian König 
-M: David (ChunMing) Zhou 
  L:amd-gfx@lists.freedesktop.org
  S:Supported
  T:git git://people.freedesktop.org/~agd5f/linux


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdgpu: allocate large structures dynamically

2020-05-06 Thread Christian König


Am 06.05.20 um 21:01 schrieb Joe Perches:

On Tue, 2020-05-05 at 16:01 +0200, Arnd Bergmann wrote:

After the structure was padded to 1024 bytes, it is no longer
suitable for being a local variable, as the function surpasses
the warning limit for 32-bit architectures:

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:587:5: error: stack frame size of 1072 
bytes in function 'amdgpu_ras_feature_enable' [-Werror,-Wframe-larger-than=]
int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 ^

Use kzalloc() instead to get it from the heap.

[]

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

[]

@@ -588,19 +588,23 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
struct ras_common_if *head, bool enable)
  {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-   union ta_ras_cmd_input info;
+   union ta_ras_cmd_input *info;
int ret;
  
  	if (!con)

return -EINVAL;
  
+info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);

Spaces were used for indentation here not a tab.
It might be useful to run your proposed patches through checkpatch

Is this an actual bug fix as the previous use didn't
zero unused info members?


+   if (!info)
+   return -ENOMEM;
+
if (!enable) {
-   info.disable_features = (struct ta_ras_disable_features_input) {
+   info->disable_features = (struct ta_ras_disable_features_input) 
{
.block_id =  amdgpu_ras_block_to_ta(head->block),
.error_type = amdgpu_ras_error_to_ta(head->type),
};
} else {
-   info.enable_features = (struct ta_ras_enable_features_input) {
+   info->enable_features = (struct ta_ras_enable_features_input) {
.block_id =  amdgpu_ras_block_to_ta(head->block),
.error_type = amdgpu_ras_error_to_ta(head->type),
};
@@ -609,26 +613,33 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Do not enable if it is not allowed. */
WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
/* Are we alerady in that state we are going to set? */
-   if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
-   return 0;
+   if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {

And trivia:

The !! uses with bool seem unnecessary and it's probably better
to make amdgpu_ras_is_feature_enabled to return bool.

Maybe something like:
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 538895cfd862..05c4eaf0ddfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -526,16 +526,16 @@ void amdgpu_ras_parse_status_code(struct amdgpu_device* 
adev,
  }
  
  /* feature ctl begin */

-static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
-   struct ras_common_if *head)
+static bool amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
+ struct ras_common_if *head)
  {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
  
  	return con->hw_supported & BIT(head->block);

  }
  
-static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,

-   struct ras_common_if *head)
+static bool amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
+ struct ras_common_if *head)
  {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
  
@@ -560,7 +560,7 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,

 */
if (!amdgpu_ras_is_feature_allowed(adev, head))
return 0;
-   if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
+   if (!(enable ^ amdgpu_ras_is_feature_enabled(adev, head)))


And while we are at improving coding style I think that writing this as 
"if (enabled == amdgpu_ras_is_feature_enabled(adev, head))" would be 
much more readable.


Christian.


return 0;
  
  	if (enable) {

@@ -609,7 +609,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Do not enable if it is not allowed. */
WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
/* Are we alerady in that state we are going to set? */
-   if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
+   if (!(enable ^ amdgpu_ras_is_feature_enabled(adev, head)))
return 0;
  
  	if (!amdgpu_ras_intr_triggered()) {




___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


___
amd

[PATCH] MAINTAINERS: Remove me from amdgpu maintainers

2020-05-06 Thread Chunming Zhou

Glad to spend time on kernel driver in past years.
I've moved to new focus in umd and couldn't commit
enough time to discussions.

Signed-off-by: Chunming Zhou 
---
 MAINTAINERS | 1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 938316092634..4ca508bd4c9e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14066,7 +14066,6 @@ F:  drivers/net/wireless/quantenna
 RADEON and AMDGPU DRM DRIVERS
 M: Alex Deucher 
 M: Christian König 
-M: David (ChunMing) Zhou 
 L: amd-gfx@lists.freedesktop.org
 S: Supported
 T: git git://people.freedesktop.org/~agd5f/linux
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[pull] amdgpu drm-fixes-5.7

2020-05-06 Thread Alex Deucher

Hi Dave, Daniel,

Fixes for 5.7.

The following changes since commit e3dcd86b3b4c045a4db17c02340138a4c514fe20:

  Merge tag 'amd-drm-fixes-5.7-2020-04-29' of 
git://people.freedesktop.org/~agd5f/linux into drm-fixes (2020-05-01 11:19:55 
+1000)

are available in the Git repository at:

  git://people.freedesktop.org/~agd5f/linux tags/amd-drm-fixes-5.7-2020-05-06

for you to fetch changes up to e6142dd511425cb827b5db869f489eb81f5f994d:

  drm/amd/display: Prevent dpcd reads with passive dongles (2020-05-05 16:13:57 
-0400)


amd-drm-fixes-5.7-2020-05-06:

amdgpu:
- Runtime PM fixes
- DC fix for PPC
- Misc DC fixes


Aurabindo Pillai (1):
  drm/amd/display: Prevent dpcd reads with passive dongles

Daniel Kolesa (1):
  drm/amd/display: work around fp code being emitted outside of 
DC_FP_START/END

Evan Quan (2):
  drm/amdgpu: move kfd suspend after ip_suspend_phase1
  drm/amdgpu: drop redundant cg/pg ungate on runpm enter

Michel Dänzer (1):
  drm/amdgpu/dc: Use WARN_ON_ONCE for ASSERT

Roman Li (1):
  drm/amd/display: fix counter in wait_for_no_pipes_pending

Sung Lee (1):
  drm/amd/display: Update DCN2.1 DV Code Revision

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  7 ++---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c  | 17 +++-
 drivers/gpu/drm/amd/display/dc/core/dc.c   |  5 ++--
 .../gpu/drm/amd/display/dc/dcn20/dcn20_resource.c  | 31 --
 .../display/dc/dml/dcn21/display_rq_dlg_calc_21.c  |  8 +++---
 drivers/gpu/drm/amd/display/dc/os_types.h  |  2 +-
 6 files changed, 43 insertions(+), 27 deletions(-)
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH -next] drm/amdgpu/navi10: fix unsigned comparison with 0

2020-05-06 Thread Alex Deucher

On Wed, May 6, 2020 at 3:03 AM ChenTao  wrote:
>
> Fixes warning because size is uint32_t and can never be negtative
>
> drivers/gpu/drm/amd/amdgpu/../powerplay/navi10_ppt.c:1296:12: warning:
> comparison of unsigned expression < 0 is always false [-Wtype-limits]
>if (size < 0)
>
> Reported-by: Hulk Robot 
> Signed-off-by: ChenTao 

Applied.  Thanks!

Alex

> ---
>  drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> index 2184d247a9f7..0c9be864d072 100644
> --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> @@ -1293,8 +1293,6 @@ static int navi10_set_power_profile_mode(struct 
> smu_context *smu, long *input, u
> }
>
> if (smu->power_profile_mode == PP_SMC_POWER_PROFILE_CUSTOM) {
> -   if (size < 0)
> -   return -EINVAL;
>
> ret = smu_update_table(smu,
>SMU_TABLE_ACTIVITY_MONITOR_COEFF, 
> WORKLOAD_PPLIB_CUSTOM_BIT,
> --
> 2.22.0
>
> ___
> dri-devel mailing list
> dri-de...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amd/display: Fix vblank and pageflip event handling for FreeSync

2020-05-06 Thread Nicholas Kazlauskas

[Why]
We're the drm vblank event a frame too early in the case where the
pageflip happens close to VUPDATE and ends up blocking the signal.

The implementation in DM was previously correct *before* we started
sending vblank events from VSTARTUP unconditionally to handle cases
where HUBP was off, OTG was ON and userspace was still requesting some
DRM planes enabled. As part of that patch series we dropped VUPDATE
since it was deemed close enough to VSTARTUP, but there's a key
difference betweeen VSTARTUP and VUPDATE - the VUPDATE signal can be
blocked if we're holding the pipe lock.

There was a fix recently to revert the unconditional behavior for the
DCN VSTARTUP vblank event since it was sending the pageflip event on
the wrong frame - once again, due to blocking VUPDATE and having the
address start scanning out two frames later.

The problem with this fix is it didn't update the logic that calls
drm_crtc_handle_vblank(), so the timestamps are totally bogus now.

[How]
Essentially reverts most of the original VSTARTUP series but retains
the behavior to send back events when active planes == 0.

Some refactoring/cleanup was done to not have duplicated code in both
the handlers.

Fixes: 16f17eda8bad ("drm/amd/display: Send vblank and user events at vsartup 
for DCN")
Fixes: 3a2ce8d66a4b ("drm/amd/display: Disable VUpdate interrupt for DCN 
hardware")
Fixes: 2b5aed9ac3f7 ("drm/amd/display: Fix pageflip event race condition for 
DCN.")

Cc: Leo Li 
Cc: Mario Kleiner 
Signed-off-by: Nicholas Kazlauskas 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 137 +++---
 1 file changed, 55 insertions(+), 82 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 59f1d4a94f12..30ce28f7c444 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -441,7 +441,7 @@ static void dm_vupdate_high_irq(void *interrupt_params)
 
 /**
  * dm_crtc_high_irq() - Handles CRTC interrupt
- * @interrupt_params: ignored
+ * @interrupt_params: used for determining the CRTC instance
  *
  * Handles the CRTC/VSYNC interrupt by notfying DRM's VBLANK
  * event handler.
@@ -455,70 +455,6 @@ static void dm_crtc_high_irq(void *interrupt_params)
unsigned long flags;
 
acrtc = get_crtc_by_otg_inst(adev, irq_params->irq_src - 
IRQ_TYPE_VBLANK);
-
-   if (acrtc) {
-   acrtc_state = to_dm_crtc_state(acrtc->base.state);
-
-   DRM_DEBUG_VBL("crtc:%d, vupdate-vrr:%d\n",
- acrtc->crtc_id,
- amdgpu_dm_vrr_active(acrtc_state));
-
-   /* Core vblank handling at start of front-porch is only possible
-* in non-vrr mode, as only there vblank timestamping will give
-* valid results while done in front-porch. Otherwise defer it
-* to dm_vupdate_high_irq after end of front-porch.
-*/
-   if (!amdgpu_dm_vrr_active(acrtc_state))
-   drm_crtc_handle_vblank(&acrtc->base);
-
-   /* Following stuff must happen at start of vblank, for crc
-* computation and below-the-range btr support in vrr mode.
-*/
-   amdgpu_dm_crtc_handle_crc_irq(&acrtc->base);
-
-   if (acrtc_state->stream && adev->family >= AMDGPU_FAMILY_AI &&
-   acrtc_state->vrr_params.supported &&
-   acrtc_state->freesync_config.state == 
VRR_STATE_ACTIVE_VARIABLE) {
-   spin_lock_irqsave(&adev->ddev->event_lock, flags);
-   mod_freesync_handle_v_update(
-   adev->dm.freesync_module,
-   acrtc_state->stream,
-   &acrtc_state->vrr_params);
-
-   dc_stream_adjust_vmin_vmax(
-   adev->dm.dc,
-   acrtc_state->stream,
-   &acrtc_state->vrr_params.adjust);
-   spin_unlock_irqrestore(&adev->ddev->event_lock, flags);
-   }
-   }
-}
-
-#if defined(CONFIG_DRM_AMD_DC_DCN)
-/**
- * dm_dcn_crtc_high_irq() - Handles VStartup interrupt for DCN generation ASICs
- * @interrupt params - interrupt parameters
- *
- * Notify DRM's vblank event handler at VSTARTUP
- *
- * Unlike DCE hardware, we trigger the handler at VSTARTUP. at which:
- * * We are close enough to VUPDATE - the point of no return for hw
- * * We are in the fixed portion of variable front porch when vrr is enabled
- * * We are before VUPDATE, where double-buffered vrr registers are swapped
- *
- * It is therefore the correct place to signal vblank, send user flip events,
- * and update VRR.
- */
-static void dm_dcn_crtc_high_irq(void *interrupt_params)
-{
-   struct common_irq_params *irq_params = interrupt_params;
-   struct amdgpu_dev

Re: [PATCH hmm v2 0/5] Adjust hmm_range_fault() API

2020-05-06 Thread Jason Gunthorpe

On Fri, May 01, 2020 at 03:20:43PM -0300, Jason Gunthorpe wrote:
> From: Jason Gunthorpe 
> 
> The API is a bit complicated for the uses we actually have, and
> disucssions for simplifying have come up a number of times.
> 
> This small series removes the customizable pfn format and simplifies the
> return code of hmm_range_fault()
> 
> All the drivers are adjusted to process in the simplified format.
> I would appreciated tested-by's for the two drivers, thanks!
> 
> v2:
>  - Move call chain to commit message
>  - Fix typo of HMM_PFN_REQ_FAULT
>  - Move nouveau_hmm_convert_pfn() to nouveau_svm.c
>  - Add acks and tested-bys
> v1: 
> https://lore.kernel.org/r/0-v1-4eb72686de3c+5062-hmm_no_flags_...@mellanox.com
> 
> Cc: Christoph Hellwig 
> Cc: John Hubbard 
> Cc: Jérôme Glisse 
> Cc: Ben Skeggs 
> To: Ralph Campbell 
> Cc: nouv...@lists.freedesktop.org
> Cc: Niranjana Vishwanathapura 
> Cc: intel-...@lists.freedesktop.org
> Cc: "Kuehling, Felix" 
> Cc: Alex Deucher 
> Cc: "Christian König" 
> Cc: "David (ChunMing) Zhou" 
> Cc: amd-gfx@lists.freedesktop.org
> Cc: dri-de...@lists.freedesktop.org
> Cc: linux-ker...@vger.kernel.org
> Cc: "Yang, Philip" 
> To: linux...@kvack.org
> 
> Jason Gunthorpe (5):
>   mm/hmm: make CONFIG_DEVICE_PRIVATE into a select
>   mm/hmm: make hmm_range_fault return 0 or -1
>   drm/amdgpu: remove dead code after hmm_range_fault()
>   mm/hmm: remove HMM_PFN_SPECIAL
>   mm/hmm: remove the customizable pfn format from hmm_range_fault

Applied to hmm.git, thanks

Jason
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdgpu: allocate large structures dynamically

2020-05-06 Thread Joe Perches

On Tue, 2020-05-05 at 16:01 +0200, Arnd Bergmann wrote:
> After the structure was padded to 1024 bytes, it is no longer
> suitable for being a local variable, as the function surpasses
> the warning limit for 32-bit architectures:
> 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:587:5: error: stack frame size of 
> 1072 bytes in function 'amdgpu_ras_feature_enable' 
> [-Werror,-Wframe-larger-than=]
> int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
> ^
> 
> Use kzalloc() instead to get it from the heap.
[]
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
[]
> @@ -588,19 +588,23 @@ int amdgpu_ras_feature_enable(struct amdgpu_device 
> *adev,
>   struct ras_common_if *head, bool enable)
>  {
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> - union ta_ras_cmd_input info;
> + union ta_ras_cmd_input *info;
>   int ret;
>  
>   if (!con)
>   return -EINVAL;
>  
> +info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);

Spaces were used for indentation here not a tab.
It might be useful to run your proposed patches through checkpatch

Is this an actual bug fix as the previous use didn't
zero unused info members?

> + if (!info)
> + return -ENOMEM;
> +
>   if (!enable) {
> - info.disable_features = (struct ta_ras_disable_features_input) {
> + info->disable_features = (struct ta_ras_disable_features_input) 
> {
>   .block_id =  amdgpu_ras_block_to_ta(head->block),
>   .error_type = amdgpu_ras_error_to_ta(head->type),
>   };
>   } else {
> - info.enable_features = (struct ta_ras_enable_features_input) {
> + info->enable_features = (struct ta_ras_enable_features_input) {
>   .block_id =  amdgpu_ras_block_to_ta(head->block),
>   .error_type = amdgpu_ras_error_to_ta(head->type),
>   };
> @@ -609,26 +613,33 @@ int amdgpu_ras_feature_enable(struct amdgpu_device 
> *adev,
>   /* Do not enable if it is not allowed. */
>   WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
>   /* Are we alerady in that state we are going to set? */
> - if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
> - return 0;
> + if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) {

And trivia:

The !! uses with bool seem unnecessary and it's probably better
to make amdgpu_ras_is_feature_enabled to return bool.

Maybe something like:
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 538895cfd862..05c4eaf0ddfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -526,16 +526,16 @@ void amdgpu_ras_parse_status_code(struct amdgpu_device* 
adev,
 }
 
 /* feature ctl begin */
-static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
-   struct ras_common_if *head)
+static bool amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
+ struct ras_common_if *head)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
return con->hw_supported & BIT(head->block);
 }
 
-static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
-   struct ras_common_if *head)
+static bool amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
+ struct ras_common_if *head)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 
@@ -560,7 +560,7 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device 
*adev,
 */
if (!amdgpu_ras_is_feature_allowed(adev, head))
return 0;
-   if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
+   if (!(enable ^ amdgpu_ras_is_feature_enabled(adev, head)))
return 0;
 
if (enable) {
@@ -609,7 +609,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Do not enable if it is not allowed. */
WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
/* Are we alerady in that state we are going to set? */
-   if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
+   if (!(enable ^ amdgpu_ras_is_feature_enabled(adev, head)))
return 0;
 
if (!amdgpu_ras_intr_triggered()) {



___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 1/1] drm/amdgpu: Use GEM obj reference for KFD BOs

2020-05-06 Thread Sierra Guiza, Alejandro (Alex)

[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Alex Sierra 

-Original Message-
From: amd-gfx  On Behalf Of Christian 
König
Sent: Wednesday, May 6, 2020 3:12 AM
To: Kuehling, Felix ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/1] drm/amdgpu: Use GEM obj reference for KFD BOs

Am 06.05.20 um 02:59 schrieb Felix Kuehling:
> Releasing the AMDGPU BO ref directly leads to problems when BOs were 
> exported as DMA bufs. Releasing the GEM reference makes sure that the 
> AMDGPU/TTM BO is not freed too early.
>
> Also take a GEM reference when importing BOs from DMABufs to keep 
> references to imported BOs balances properly.
>
> Signed-off-by: Felix Kuehling 
> Tested-by: Alex Sierra 

Acked-by: Christian König 

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +++--
>   1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> index 1247938b1ec1..da8b31a53291 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
> @@ -1354,7 +1354,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
>   }
>   
>   /* Free the BO*/
> - amdgpu_bo_unref(&mem->bo);
> + drm_gem_object_put_unlocked(&mem->bo->tbo.base);
>   mutex_destroy(&mem->lock);
>   kfree(mem);
>   
> @@ -1699,7 +1699,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev 
> *kgd,
>   | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE
>   | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
>   
> - (*mem)->bo = amdgpu_bo_ref(bo);
> + drm_gem_object_get(&bo->tbo.base);
> + (*mem)->bo = bo;
>   (*mem)->va = va;
>   (*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
>   AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Calex.sierra%40amd.com%7C14fc955a4c63411b3fe808d7f1952092%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637243495134045678&sdata=3QrUtuE4LU8V1xGRPjaQJ9QeAv9hKhFTZ7GLCuskVeM%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 00/14] drm/radeon: remove comparison to bool

2020-05-06 Thread Koenig, Christian



Am 06.05.2020 18:00 schrieb Alex Deucher :
On Wed, May 6, 2020 at 10:27 AM Zheng Bin  wrote:
>
> Zheng Bin (14):
>   drm/radeon: remove comparison to bool in btc_dpm.c
>   drm/radeon: remove comparison to bool in ci_dpm.c
>   drm/radeon: remove comparison to bool in ni_dpm.c
>   drm/radeon: remove comparison to bool in radeon_atpx_handler.c
>   drm/radeon: remove comparison to bool in radeon_object.c
>   drm/radeon: remove comparison to bool in radeon_ttm.c
>   drm/radeon: remove comparison to bool in r100.c
>   drm/radeon: remove comparison to bool in r300.c
>   drm/radeon: remove comparison to bool in r600.c
>   drm/radeon: remove comparison to bool in rs600.c
>   drm/radeon: remove comparison to bool in rs690.c
>   drm/radeon: remove comparison to bool in rv6xx_dpm.c
>   drm/radeon: remove comparison to bool in rv515.c
>   drm/radeon: remove comparison to bool in si_dpm.c

Does the checker need to be fixed?  All of these are comparing boolean
variables to true/false.  Seems like needless code churn to me.

We should probably make sure that no new code like this leaks in, but I also 
don't see that this is necessary for the old driver stack.

Christian.


Alex

>
>  drivers/gpu/drm/radeon/btc_dpm.c | 2 +-
>  drivers/gpu/drm/radeon/ci_dpm.c  | 4 ++--
>  drivers/gpu/drm/radeon/ni_dpm.c  | 6 +++---
>  drivers/gpu/drm/radeon/r100.c| 2 +-
>  drivers/gpu/drm/radeon/r300.c| 2 +-
>  drivers/gpu/drm/radeon/r600.c| 3 ++-
>  drivers/gpu/drm/radeon/radeon_atpx_handler.c | 4 ++--
>  drivers/gpu/drm/radeon/radeon_object.c   | 2 +-
>  drivers/gpu/drm/radeon/radeon_ttm.c  | 2 +-
>  drivers/gpu/drm/radeon/rs600.c   | 2 +-
>  drivers/gpu/drm/radeon/rs690.c   | 3 ++-
>  drivers/gpu/drm/radeon/rv515.c   | 2 +-
>  drivers/gpu/drm/radeon/rv6xx_dpm.c   | 2 +-
>  drivers/gpu/drm/radeon/si_dpm.c  | 6 +++---
>  14 files changed, 22 insertions(+), 20 deletions(-)
>
> --
> 2.26.0.106.g9fadedd
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cchristian.koenig%40amd.com%7C10c2a90728574bb20ef208d7f1d69e2b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637243776401264275&sdata=Z6alCS8hPA7rWNKHimpkc6zBldtBagK0dGpX8mTOEZA%3D&reserved=0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 0/1] Fiji GPU audio register timeout when in BACO state

2020-05-06 Thread Nicholas Johnson

On Sat, May 02, 2020 at 12:09:13PM +0200, Takashi Iwai wrote:
> On Sat, 02 May 2020 09:27:31 +0200,
> Takashi Iwai wrote:
> > 
> > On Sat, 02 May 2020 09:17:28 +0200,
> > Lukas Wunner wrote:
> > > 
> > > On Sat, May 02, 2020 at 09:11:58AM +0200, Takashi Iwai wrote:
> > > > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > > > @@ -673,6 +673,12 @@ static int amdgpu_dm_audio_component_bind(struct 
> > > > device *kdev,
> > > > struct amdgpu_device *adev = dev->dev_private;
> > > > struct drm_audio_component *acomp = data;
> > > >  
> > > > +   if (!device_link_add(hda_kdev, kdev, DL_FLAG_STATELESS |
> > > > +DL_FLAG_PM_RUNTIME)) {
> > > > +   DRM_ERROR("DM: cannot add device link to audio 
> > > > device\n");
> > > > +   return -ENOMEM;
> > > > +   }
> > > > +
> > > 
> > > Doesn't this duplicate drivers/pci/quirks.c:quirk_gpu_hda() ?
> > 
> > Gah, you're right, that was the place I overlooked.
> > It was a typical "false Eureka right-after-wakeup" phenomenon :)
> > Need a vaccine aka coffee...
> > 
> > So the runtime PM dependency must be already placed there, and the
> > problem is not the lack of the dependency tree but the really other
> > timing issue.  Back to square.
> 
> One interesting test is to open the stream while the mode isn't set
> yet and see whether the same problem appears.
> Namely, after the monitor is connected but no mode is set, run
> directly like
>aplay -Dhdmi:1,0 foo.wav
> You might need to wrap the command with pasuspender if PA is active.
I could not figure out how to get the interface for aplay set other than 
not specifying it and having it find the default device (which can 
change). I even used aplay -L and aplay -l to show devices. I could not 
get it working.

Is there anything else I can try? I did not apply the last patch when it 
was pointed out that it is already a quirk.

Regards,
Nicholas
> 
> 
> Takashi
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH hmm v2 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

2020-05-06 Thread Jason Gunthorpe

On Mon, May 04, 2020 at 06:30:00PM -0700, John Hubbard wrote:
> On 2020-05-01 11:20, Jason Gunthorpe wrote:
> > From: Jason Gunthorpe 
> > 
> > Presumably the intent here was that hmm_range_fault() could put the data
> > into some HW specific format and thus avoid some work. However, nothing
> > actually does that, and it isn't clear how anything actually could do that
> > as hmm_range_fault() provides CPU addresses which must be DMA mapped.
> > 
> > Perhaps there is some special HW that does not need DMA mapping, but we
> > don't have any examples of this, and the theoretical performance win of
> > avoiding an extra scan over the pfns array doesn't seem worth the
> > complexity. Plus pfns needs to be scanned anyhow to sort out any
> > DEVICE_PRIVATE pages.
> > 
> > This version replaces the uint64_t with an usigned long containing a pfn
> > and fixed flags. On input flags is filled with the HMM_PFN_REQ_* values,
> > on successful output it is filled with HMM_PFN_* values, describing the
> > state of the pages.
> > 
> 
> Just some minor stuff below. I wasn't able to spot any errors in the code,
> though, so these are just documentation nits.
> 
> 
> ...
> 
> > 
> > diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
> > index 9924f2caa0184c..c9f2329113a47f 100644
> > +++ b/Documentation/vm/hmm.rst
> > @@ -185,9 +185,6 @@ The usage pattern is::
> > range.start = ...;
> > range.end = ...;
> > range.pfns = ...;
> 
> That should be:
> 
>   range.hmm_pfns = ...;

Yep

> 
> > -  range.flags = ...;
> > -  range.values = ...;
> > -  range.pfn_shift = ...;
> > if (!mmget_not_zero(interval_sub->notifier.mm))
> > return -EFAULT;
> > @@ -229,15 +226,10 @@ The hmm_range struct has 2 fields, default_flags and 
> > pfn_flags_mask, that specif
> >   fault or snapshot policy for the whole range instead of having to set them
> >   for each entry in the pfns array.
> > -For instance, if the device flags for range.flags are::
> > +For instance if the device driver wants pages for a range with at least 
> > read
> > +permission, it sets::
> > -range.flags[HMM_PFN_VALID] = (1 << 63);
> > -range.flags[HMM_PFN_WRITE] = (1 << 62);
> > -
> > -and the device driver wants pages for a range with at least read 
> > permission,
> > -it sets::
> > -
> > -range->default_flags = (1 << 63);
> > +range->default_flags = HMM_PFN_REQ_FAULT;
> >   range->pfn_flags_mask = 0;
> >   and calls hmm_range_fault() as described above. This will fill fault all 
> > pages
> > @@ -246,18 +238,18 @@ in the range with at least read permission.
> >   Now let's say the driver wants to do the same except for one page in the 
> > range for
> >   which it wants to have write permission. Now driver set::
> > -range->default_flags = (1 << 63);
> > -range->pfn_flags_mask = (1 << 62);
> > -range->pfns[index_of_write] = (1 << 62);
> > +range->default_flags = HMM_PFN_REQ_FAULT;
> > +range->pfn_flags_mask = HMM_PFN_REQ_WRITE;
> > +range->pfns[index_of_write] = HMM_PFN_REQ_WRITE;
> 
> 
> All these choices for _WRITE behavior make it slightly confusing. I mean, it's
> better than it was, but there are default flags, a mask, and an index as well,
> and it looks like maybe we have a little more power and flexibility than
> desirable? Nouveau for example is now just setting the mask only:

The example is showing how to fault all pages but request write for
only certain pages, ie it shows how to use default_flags and pfn_flags
together in probably the only way that could make any sense

> > @@ -542,12 +564,15 @@ static int nouveau_range_fault(struct nouveau_svmm 
> > *svmm,
> > return -EBUSY;
> > range.notifier_seq = mmu_interval_read_begin(range.notifier);
> > -   range.default_flags = 0;
> > -   range.pfn_flags_mask = -1UL;
> > down_read(&mm->mmap_sem);
> > ret = hmm_range_fault(&range);
> > up_read(&mm->mmap_sem);
> > if (ret) {
> > +   /*
> > +* FIXME: the input PFN_REQ flags are destroyed on
> > +* -EBUSY, we need to regenerate them, also for the
> > +* other continue below
> > +*/
> 
> How serious is this FIXME? It seems like we could get stuck in a loop here,
> if we're not issuing a new REQ, right?

Serious enough someone should fix it and not copy it into other
drivers..
 
> > if (ret == -EBUSY)
> > continue;
> > return ret;
> > @@ -562,7 +587,7 @@ static int nouveau_range_fault(struct nouveau_svmm 
> > *svmm,
> > break;
> > }
> > -   nouveau_dmem_convert_pfn(drm, &range);
> > +   nouveau_hmm_convert_pfn(drm, &range, ioctl_addr);
> > svmm->vmm->vmm.object.client->super = true;
> > ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL);
> > @@ -589,6 +614,7 @@

Re: [PATCH hmm v2 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

2020-05-06 Thread Jason Gunthorpe

On Fri, May 01, 2020 at 05:53:26PM -0700, Ralph Campbell wrote:
> > Acked-by: Felix Kuehling 
> > Tested-by: Ralph Campbell 
> > Signed-off-by: Jason Gunthorpe 
> > Signed-off-by: Christoph Hellwig 
> >   Documentation/vm/hmm.rst|  26 ++--
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  35 ++
> >   drivers/gpu/drm/nouveau/nouveau_dmem.c  |  27 +---
> >   drivers/gpu/drm/nouveau/nouveau_dmem.h  |   3 +-
> >   drivers/gpu/drm/nouveau/nouveau_svm.c   |  87 -
> >   include/linux/hmm.h |  99 ++-
> >   mm/hmm.c| 160 +++-
> >   7 files changed, 192 insertions(+), 245 deletions(-)
> > 
> 
> ...snip...
> 
> > +static void nouveau_hmm_convert_pfn(struct nouveau_drm *drm,
> > +   struct hmm_range *range, u64 *ioctl_addr)
> > +{
> > +   unsigned long i, npages;
> > +
> > +   /*
> > +* The ioctl_addr prepared here is passed through nvif_object_ioctl()
> > +* to an eventual DMA map in something like gp100_vmm_pgt_pfn()
> > +*
> > +* This is all just encoding the internal hmm reprensetation into a
> 
> s/reprensetation/representation/
> 
> Looks good and still tests OK with nouveau.

Got it, thanks

Jason
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 00/14] drm/radeon: remove comparison to bool

2020-05-06 Thread Alex Deucher

On Wed, May 6, 2020 at 10:27 AM Zheng Bin  wrote:
>
> Zheng Bin (14):
>   drm/radeon: remove comparison to bool in btc_dpm.c
>   drm/radeon: remove comparison to bool in ci_dpm.c
>   drm/radeon: remove comparison to bool in ni_dpm.c
>   drm/radeon: remove comparison to bool in radeon_atpx_handler.c
>   drm/radeon: remove comparison to bool in radeon_object.c
>   drm/radeon: remove comparison to bool in radeon_ttm.c
>   drm/radeon: remove comparison to bool in r100.c
>   drm/radeon: remove comparison to bool in r300.c
>   drm/radeon: remove comparison to bool in r600.c
>   drm/radeon: remove comparison to bool in rs600.c
>   drm/radeon: remove comparison to bool in rs690.c
>   drm/radeon: remove comparison to bool in rv6xx_dpm.c
>   drm/radeon: remove comparison to bool in rv515.c
>   drm/radeon: remove comparison to bool in si_dpm.c

Does the checker need to be fixed?  All of these are comparing boolean
variables to true/false.  Seems like needless code churn to me.

Alex

>
>  drivers/gpu/drm/radeon/btc_dpm.c | 2 +-
>  drivers/gpu/drm/radeon/ci_dpm.c  | 4 ++--
>  drivers/gpu/drm/radeon/ni_dpm.c  | 6 +++---
>  drivers/gpu/drm/radeon/r100.c| 2 +-
>  drivers/gpu/drm/radeon/r300.c| 2 +-
>  drivers/gpu/drm/radeon/r600.c| 3 ++-
>  drivers/gpu/drm/radeon/radeon_atpx_handler.c | 4 ++--
>  drivers/gpu/drm/radeon/radeon_object.c   | 2 +-
>  drivers/gpu/drm/radeon/radeon_ttm.c  | 2 +-
>  drivers/gpu/drm/radeon/rs600.c   | 2 +-
>  drivers/gpu/drm/radeon/rs690.c   | 3 ++-
>  drivers/gpu/drm/radeon/rv515.c   | 2 +-
>  drivers/gpu/drm/radeon/rv6xx_dpm.c   | 2 +-
>  drivers/gpu/drm/radeon/si_dpm.c  | 6 +++---
>  14 files changed, 22 insertions(+), 20 deletions(-)
>
> --
> 2.26.0.106.g9fadedd
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code

2020-05-06 Thread Deucher, Alexander

[AMD Official Use Only - Internal Distribution Only]

Perhaps it's too much churn for this patch set, but I'd like to unify the pp 
func callbacks between powerplay and swsmu so we can drop all of the 
is_swsmu_supported() and function pointer checks sprinkled all through the code.

Alex

From: Wang, Kevin(Yang) 
Sent: Wednesday, May 6, 2020 7:04 AM
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org 

Cc: Deucher, Alexander ; Liu, Monk 
; Feng, Kenneth 
Subject: Re: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code


[AMD Official Use Only - Internal Distribution Only]



From: Zhang, Hawking 
Sent: Wednesday, May 6, 2020 5:26 PM
To: Wang, Kevin(Yang) ; amd-gfx@lists.freedesktop.org 

Cc: Deucher, Alexander ; Liu, Monk 
; Feng, Kenneth 
Subject: RE: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code

[AMD Official Use Only - Internal Distribution Only]

Hi Kelvin,

Thanks for the series that remove the duplicated one_vf mode check in all the 
amdgpu_dpm functions.

Can we split the patch into two? One for amdgpu device sysfs attr code refine, 
with the new dev_attr structures, the other for retiring all the unnecessary 
one_vf mode support.

thanks your comment.
[kevin]: Q1, agree, i will split it into two patch.

+enum amdgpu_device_attr_states {
+   ATTR_STATE_UNSUPPORT = 0,
+   ATTR_STATE_SUPPORT,
+   ATTR_STATE_DEAD,
+   ATTR_STATE_ALIVE,
+};
+
The attr_states seems unnecessary to me. You need a flag to mark whether a 
particular attribute is supported by specific ASIC or not, right? Then just a 
bool variable should be good enough for this purpose, Like attr->supported. I' 
d like to understand the use case for DEAD and ALIVE. Accordingly, you can 
simplify the logic that only remove the supported ones.

[kevin]: Q2, the origin idea, it is used to store sysfs file state, but for 
this case, we can try to drop DEAD & ALIVE state,
because the origin code logic will exit directly when create file fail.

If we have to introduce more complicated flags to indicate different status, 
I'd prefer to go directly to initialize one_vf mode attr sets and bare-metal 
attr sets directly.
[kevin]: Q3, i'd like to keep this patch code,  in fact, not all sysfs devices 
need to be created on bare-metal mode.
the driver must check it at runtime. eg: is_sw_smu_support(), if (asic_chip == 
XXX), etc..

In addition, the function naming like default_attr_perform also confusing me. 
Would it be the function that used to update attr status?
+static int default_attr_perform(struct amdgpu_device *adev, struct 
amdgpu_device_attr *attr,
+   uint32_t mask)

[kevin]: Q4, yes, the function is used to update attr status according to asic 
information at runtime.
maybe rename to "attr_update" is better.

Best Regards,
Kevin

Regards,
Hawking

-Original Message-
From: Wang, Kevin(Yang) 
Sent: Wednesday, May 6, 2020 14:23
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Liu, Monk ; Feng, Kenneth 
; Wang, Kevin(Yang) 
Subject: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code

unified amdgpu device attribute node functions:
1. add some helper functions to create amdgpu device attribute node.
2. create device node according to device attr flags on different VF mode.
3. rename some functions name to adapt a new interface.
4. remove unneccessary virt mode check in inernal functions (xxx_show, 
xxx_store).

Signed-off-by: Kevin Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 577 ++---  
drivers/gpu/drm/amd/amdgpu/amdgpu_pm.h |  48 ++
 2 files changed, 271 insertions(+), 354 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index c762deb5abc7..367ac79418b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -154,18 +154,15 @@ int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, 
enum amd_pp_sensors senso
  *
  */

-static ssize_t amdgpu_get_dpm_state(struct device *dev,
-   struct device_attribute *attr,
-   char *buf)
+static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
 {
 struct drm_device *ddev = dev_get_drvdata(dev);
 struct amdgpu_device *adev = ddev->dev_private;
 enum amd_pm_state_type pm;
 int ret;

-   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
-   return 0;
-
 ret = pm_runtime_get_sync(ddev->dev);
 if (ret < 0)
 return ret;
@@ -189,19 +186,16 @@ static ssize_t amdgpu_get_dpm_state(struct device *dev,
 (pm == POWER_STATE_TYPE_BALANCED) ? "balanced" : 
"performance");  }

-static ssize_t amdgpu_set_dpm_state(st

Re: [PATCH] drm/amdgpu: force fbdev into vram

2020-05-06 Thread Christian König


Am 06.05.20 um 16:14 schrieb Alex Deucher:

We set the fb smem pointer to the offset into the BAR, so keep
the fbdev bo in vram.

Bug: https://bugzilla.kernel.org/show_bug.cgi?id=207581
Fixes: 6c8d74caa2fa33 ("drm/amdgpu: Enable scatter gather display support")
Signed-off-by: Alex Deucher 


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
index 9ae7b61f696a..25ddb482466a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
@@ -133,8 +133,7 @@ static int amdgpufb_create_pinned_object(struct 
amdgpu_fbdev *rfbdev,
u32 cpp;
u64 flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
-  AMDGPU_GEM_CREATE_VRAM_CLEARED|
-  AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+  AMDGPU_GEM_CREATE_VRAM_CLEARED;
  
  	info = drm_get_format_info(adev->ddev, mode_cmd);

cpp = info->cpp[0];


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 03/14] drm/radeon: remove comparison to bool in ni_dpm.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/ni_dpm.c:807:5-26: WARNING: Comparison to bool
drivers/gpu/drm/radeon/ni_dpm.c:2466:5-36: WARNING: Comparison to boo
drivers/gpu/drm/radeon/ni_dpm.c:3146:5-22: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/ni_dpm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/ni_dpm.c b/drivers/gpu/drm/radeon/ni_dpm.c
index b57c37ddd164..66c48ce107a5 100644
--- a/drivers/gpu/drm/radeon/ni_dpm.c
+++ b/drivers/gpu/drm/radeon/ni_dpm.c
@@ -804,7 +804,7 @@ static void ni_apply_state_adjust_rules(struct 
radeon_device *rdev,
else
max_limits = &rdev->pm.dpm.dyn_state.max_clock_voltage_on_dc;

-   if (rdev->pm.dpm.ac_power == false) {
+   if (!rdev->pm.dpm.ac_power) {
for (i = 0; i < ps->performance_level_count; i++) {
if (ps->performance_levels[i].mclk > max_limits->mclk)
ps->performance_levels[i].mclk = 
max_limits->mclk;
@@ -2463,7 +2463,7 @@ static int ni_populate_power_containment_values(struct 
radeon_device *rdev,
u32 power_boost_limit;
u8 max_ps_percent;

-   if (ni_pi->enable_power_containment == false)
+   if (!ni_pi->enable_power_containment)
return 0;

if (state->performance_level_count == 0)
@@ -3143,7 +3143,7 @@ static int ni_initialize_smc_cac_tables(struct 
radeon_device *rdev)
int i, ret;
u32 reg;

-   if (ni_pi->enable_cac == false)
+   if (!ni_pi->enable_cac)
return 0;

cac_tables = kzalloc(sizeof(PP_NIslands_CACTABLES), GFP_KERNEL);
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 10/14] drm/radeon: remove comparison to bool in rs600.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/rs600.c:1132:5-37: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/rs600.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c
index c88b4906f7bc..a7ff0609a3eb 100644
--- a/drivers/gpu/drm/radeon/rs600.c
+++ b/drivers/gpu/drm/radeon/rs600.c
@@ -1129,7 +1129,7 @@ int rs600_init(struct radeon_device *rdev)
RREG32(R_0007C0_CP_STAT));
}
/* check if cards are posted or not */
-   if (radeon_boot_test_post_card(rdev) == false)
+   if (!radeon_boot_test_post_card(rdev))
return -EINVAL;

/* Initialize clocks */
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 02/14] drm/radeon: remove comparison to bool in ci_dpm.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/ci_dpm.c:814:5-26: WARNING: Comparison to bool
drivers/gpu/drm/radeon/ci_dpm.c:2916:6-21: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/ci_dpm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/radeon/ci_dpm.c b/drivers/gpu/drm/radeon/ci_dpm.c
index 134aa2b01f90..c77ca911a8b6 100644
--- a/drivers/gpu/drm/radeon/ci_dpm.c
+++ b/drivers/gpu/drm/radeon/ci_dpm.c
@@ -811,7 +811,7 @@ static void ci_apply_state_adjust_rules(struct 
radeon_device *rdev,
else
max_limits = &rdev->pm.dpm.dyn_state.max_clock_voltage_on_dc;

-   if (rdev->pm.dpm.ac_power == false) {
+   if (!rdev->pm.dpm.ac_power) {
for (i = 0; i < ps->performance_level_count; i++) {
if (ps->performance_levels[i].mclk > max_limits->mclk)
ps->performance_levels[i].mclk = 
max_limits->mclk;
@@ -2913,7 +2913,7 @@ static int ci_populate_single_memory_level(struct 
radeon_device *rdev,

if (pi->mclk_stutter_mode_threshold &&
(memory_clock <= pi->mclk_stutter_mode_threshold) &&
-   (pi->uvd_enabled == false) &&
+   !pi->uvd_enabled &&
(RREG32(DPG_PIPE_STUTTER_CONTROL) & STUTTER_ENABLE) &&
(rdev->pm.dpm.new_active_crtc_count <= 2))
memory_level->StutterEnable = true;
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 01/14] drm/radeon: remove comparison to bool in btc_dpm.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/btc_dpm.c:2115:5-26: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/btc_dpm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/btc_dpm.c b/drivers/gpu/drm/radeon/btc_dpm.c
index d1d8aaf8323c..60b32eba6f46 100644
--- a/drivers/gpu/drm/radeon/btc_dpm.c
+++ b/drivers/gpu/drm/radeon/btc_dpm.c
@@ -2112,7 +2112,7 @@ static void btc_apply_state_adjust_rules(struct 
radeon_device *rdev,
else
max_limits = &rdev->pm.dpm.dyn_state.max_clock_voltage_on_dc;

-   if (rdev->pm.dpm.ac_power == false) {
+   if (!rdev->pm.dpm.ac_power) {
if (ps->high.mclk > max_limits->mclk)
ps->high.mclk = max_limits->mclk;
if (ps->high.sclk > max_limits->sclk)
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 11/14] drm/radeon: remove comparison to bool in rs690.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/rs690.c:190:6-35: WARNING: Comparison to bool
drivers/gpu/drm/radeon/rs690.c:844:5-37: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/rs690.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/rs690.c b/drivers/gpu/drm/radeon/rs690.c
index c296f94f9700..ddc3bfbb557c 100644
--- a/drivers/gpu/drm/radeon/rs690.c
+++ b/drivers/gpu/drm/radeon/rs690.c
@@ -187,7 +187,8 @@ static void rs690_mc_init(struct radeon_device *rdev)
/* FastFB shall be used with UMA memory. Here it is simply 
disabled when sideport
 * memory is present.
 */
-   if (rdev->mc.igp_sideport_enabled == false && radeon_fastfb == 
1) {
+   if (!rdev->mc.igp_sideport_enabled &&
+   radeon_fastfb == 1) {
DRM_INFO("Direct mapping: aper base at 0x%llx, replaced 
by direct mapping base 0x%llx.\n",
(unsigned long long)rdev->mc.aper_base, 
k8_addr);
rdev->mc.aper_base = (resource_size_t)k8_addr;
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 06/14] drm/radeon: remove comparison to bool in radeon_ttm.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/radeon_ttm.c:141:6-62: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/radeon_ttm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c 
b/drivers/gpu/drm/radeon/radeon_ttm.c
index 5d50c9edbe80..d1fcb5f995b0 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -138,7 +138,7 @@ static void radeon_evict_flags(struct ttm_buffer_object *bo,
rbo = container_of(bo, struct radeon_bo, tbo);
switch (bo->mem.mem_type) {
case TTM_PL_VRAM:
-   if (rbo->rdev->ring[radeon_copy_ring_index(rbo->rdev)].ready == 
false)
+   if (!rbo->rdev->ring[radeon_copy_ring_index(rbo->rdev)].ready)
radeon_ttm_placement_from_domain(rbo, 
RADEON_GEM_DOMAIN_CPU);
else if (rbo->rdev->mc.visible_vram_size < 
rbo->rdev->mc.real_vram_size &&
 bo->mem.start < (rbo->rdev->mc.visible_vram_size >> 
PAGE_SHIFT)) {
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 13/14] drm/radeon: remove comparison to bool in rv515.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/rv515.c:666:5-37: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/rv515.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c
index 147e5cf8348d..77e6b9dcdb69 100644
--- a/drivers/gpu/drm/radeon/rv515.c
+++ b/drivers/gpu/drm/radeon/rv515.c
@@ -663,7 +663,7 @@ int rv515_init(struct radeon_device *rdev)
RREG32(R_0007C0_CP_STAT));
}
/* check if cards are posted or not */
-   if (radeon_boot_test_post_card(rdev) == false)
+   if (!radeon_boot_test_post_card(rdev))
return -EINVAL;
/* Initialize clocks */
radeon_get_clock_info(rdev->ddev);
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 04/14] drm/radeon: remove comparison to bool in radeon_atpx_handler.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/radeon_atpx_handler.c:561:15-49: WARNING: Comparison to 
bool
drivers/gpu/drm/radeon/radeon_atpx_handler.c:571:15-49: WARNING: Comparison to 
bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/radeon_atpx_handler.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_atpx_handler.c 
b/drivers/gpu/drm/radeon/radeon_atpx_handler.c
index 6f93f54bf651..6131917322b4 100644
--- a/drivers/gpu/drm/radeon/radeon_atpx_handler.c
+++ b/drivers/gpu/drm/radeon/radeon_atpx_handler.c
@@ -558,7 +558,7 @@ static bool radeon_atpx_detect(void)
while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, pdev)) != 
NULL) {
vga_count++;

-   has_atpx |= (radeon_atpx_pci_probe_handle(pdev) == true);
+   has_atpx |= radeon_atpx_pci_probe_handle(pdev);

parent_pdev = pci_upstream_bridge(pdev);
d3_supported |= parent_pdev && parent_pdev->bridge_d3;
@@ -568,7 +568,7 @@ static bool radeon_atpx_detect(void)
while ((pdev = pci_get_class(PCI_CLASS_DISPLAY_OTHER << 8, pdev)) != 
NULL) {
vga_count++;

-   has_atpx |= (radeon_atpx_pci_probe_handle(pdev) == true);
+   has_atpx |= radeon_atpx_pci_probe_handle(pdev);

parent_pdev = pci_upstream_bridge(pdev);
d3_supported |= parent_pdev && parent_pdev->bridge_d3;
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 05/14] drm/radeon: remove comparison to bool in radeon_object.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/radeon_object.c:427:6-35: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/radeon_object.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/radeon_object.c 
b/drivers/gpu/drm/radeon/radeon_object.c
index 140d94cc080d..f06c5e9dc72c 100644
--- a/drivers/gpu/drm/radeon/radeon_object.c
+++ b/drivers/gpu/drm/radeon/radeon_object.c
@@ -424,7 +424,7 @@ int radeon_bo_evict_vram(struct radeon_device *rdev)
/* late 2.6.33 fix IGP hibernate - we need pm ops to do this correct */
 #ifndef CONFIG_HIBERNATION
if (rdev->flags & RADEON_IS_IGP) {
-   if (rdev->mc.igp_sideport_enabled == false)
+   if (!rdev->mc.igp_sideport_enabled)
/* Useless to evict on IGP chips */
return 0;
}
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 14/14] drm/radeon: remove comparison to bool in si_dpm.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/si_dpm.c:1885:7-44: WARNING: Comparison to bool
drivers/gpu/drm/radeon/si_dpm.c:2463:5-22: WARNING: Comparison to bool
drivers/gpu/drm/radeon/si_dpm.c:3015:5-26: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/si_dpm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c
index a167e1c36d24..98e288e5d8c9 100644
--- a/drivers/gpu/drm/radeon/si_dpm.c
+++ b/drivers/gpu/drm/radeon/si_dpm.c
@@ -1882,7 +1882,7 @@ static void si_initialize_powertune_defaults(struct 
radeon_device *rdev)
update_dte_from_pl2 = true;
break;
default:
-   if (si_pi->dte_data.enable_dte_by_default == true)
+   if (si_pi->dte_data.enable_dte_by_default)
DRM_ERROR("DTE is not enabled!\n");
break;
}
@@ -2460,7 +2460,7 @@ static int si_initialize_smc_dte_tables(struct 
radeon_device *rdev)
if (dte_data == NULL)
si_pi->enable_dte = false;

-   if (si_pi->enable_dte == false)
+   if (!si_pi->enable_dte)
return 0;

if (dte_data->k <= 0)
@@ -3012,7 +3012,7 @@ static void si_apply_state_adjust_rules(struct 
radeon_device *rdev,
if (ps->performance_levels[i].vddc > 
ps->performance_levels[i+1].vddc)
ps->performance_levels[i].vddc = 
ps->performance_levels[i+1].vddc;
}
-   if (rdev->pm.dpm.ac_power == false) {
+   if (!rdev->pm.dpm.ac_power) {
for (i = 0; i < ps->performance_level_count; i++) {
if (ps->performance_levels[i].mclk > max_limits->mclk)
ps->performance_levels[i].mclk = 
max_limits->mclk;
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 00/14] drm/radeon: remove comparison to bool

2020-05-06 Thread Zheng Bin

Zheng Bin (14):
  drm/radeon: remove comparison to bool in btc_dpm.c
  drm/radeon: remove comparison to bool in ci_dpm.c
  drm/radeon: remove comparison to bool in ni_dpm.c
  drm/radeon: remove comparison to bool in radeon_atpx_handler.c
  drm/radeon: remove comparison to bool in radeon_object.c
  drm/radeon: remove comparison to bool in radeon_ttm.c
  drm/radeon: remove comparison to bool in r100.c
  drm/radeon: remove comparison to bool in r300.c
  drm/radeon: remove comparison to bool in r600.c
  drm/radeon: remove comparison to bool in rs600.c
  drm/radeon: remove comparison to bool in rs690.c
  drm/radeon: remove comparison to bool in rv6xx_dpm.c
  drm/radeon: remove comparison to bool in rv515.c
  drm/radeon: remove comparison to bool in si_dpm.c

 drivers/gpu/drm/radeon/btc_dpm.c | 2 +-
 drivers/gpu/drm/radeon/ci_dpm.c  | 4 ++--
 drivers/gpu/drm/radeon/ni_dpm.c  | 6 +++---
 drivers/gpu/drm/radeon/r100.c| 2 +-
 drivers/gpu/drm/radeon/r300.c| 2 +-
 drivers/gpu/drm/radeon/r600.c| 3 ++-
 drivers/gpu/drm/radeon/radeon_atpx_handler.c | 4 ++--
 drivers/gpu/drm/radeon/radeon_object.c   | 2 +-
 drivers/gpu/drm/radeon/radeon_ttm.c  | 2 +-
 drivers/gpu/drm/radeon/rs600.c   | 2 +-
 drivers/gpu/drm/radeon/rs690.c   | 3 ++-
 drivers/gpu/drm/radeon/rv515.c   | 2 +-
 drivers/gpu/drm/radeon/rv6xx_dpm.c   | 2 +-
 drivers/gpu/drm/radeon/si_dpm.c  | 6 +++---
 14 files changed, 22 insertions(+), 20 deletions(-)

--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 07/14] drm/radeon: remove comparison to bool in r100.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/r100.c:4065:5-37: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/r100.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 24c8db673931..298a9c22074a 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -4062,7 +4062,7 @@ int r100_init(struct radeon_device *rdev)
RREG32(R_0007C0_CP_STAT));
}
/* check if cards are posted or not */
-   if (radeon_boot_test_post_card(rdev) == false)
+   if (!radeon_boot_test_post_card(rdev))
return -EINVAL;
/* Set asic errata */
r100_errata(rdev);
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 12/14] drm/radeon: remove comparison to bool in rv6xx_dpm.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/rv6xx_dpm.c:1571:5-20: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/rv6xx_dpm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/rv6xx_dpm.c 
b/drivers/gpu/drm/radeon/rv6xx_dpm.c
index 69d380fff22a..ebdb937730c2 100644
--- a/drivers/gpu/drm/radeon/rv6xx_dpm.c
+++ b/drivers/gpu/drm/radeon/rv6xx_dpm.c
@@ -1568,7 +1568,7 @@ int rv6xx_dpm_enable(struct radeon_device *rdev)
rv6xx_program_engine_speed_parameters(rdev);

rv6xx_enable_display_gap(rdev, true);
-   if (pi->display_gap == false)
+   if (!pi->display_gap)
rv6xx_enable_display_gap(rdev, false);

rv6xx_program_power_level_enter_state(rdev);
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 08/14] drm/radeon: remove comparison to bool in r300.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/r300.c:1544:5-37: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/r300.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 3b7ead5be5bf..26448b6e97e6 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -1541,7 +1541,7 @@ int r300_init(struct radeon_device *rdev)
RREG32(R_0007C0_CP_STAT));
}
/* check if cards are posted or not */
-   if (radeon_boot_test_post_card(rdev) == false)
+   if (!radeon_boot_test_post_card(rdev))
return -EINVAL;
/* Set asic errata */
r300_errata(rdev);
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 09/14] drm/radeon: remove comparison to bool in r600.c

2020-05-06 Thread Zheng Bin

Fixes coccicheck warning:

drivers/gpu/drm/radeon/r600.c:1494:8-37: WARNING: Comparison to bool

Reported-by: Hulk Robot 
Signed-off-by: Zheng Bin 
---
 drivers/gpu/drm/radeon/r600.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index d9a33ca768f3..a37f50907107 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -1491,7 +1491,8 @@ static int r600_mc_init(struct radeon_device *rdev)
/* FastFB shall be used with UMA memory. Here 
it is simply disabled when sideport
* memory is present.
*/
-   if (rdev->mc.igp_sideport_enabled == false && 
radeon_fastfb == 1) {
+   if (!rdev->mc.igp_sideport_enabled &&
+   radeon_fastfb == 1) {
DRM_INFO("Direct mapping: aper base at 
0x%llx, replaced by direct mapping base 0x%llx.\n",
(unsigned long 
long)rdev->mc.aper_base, k8_addr);
rdev->mc.aper_base = 
(resource_size_t)k8_addr;
--
2.26.0.106.g9fadedd

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amdgpu: force fbdev into vram

2020-05-06 Thread Alex Deucher

We set the fb smem pointer to the offset into the BAR, so keep
the fbdev bo in vram.

Bug: https://bugzilla.kernel.org/show_bug.cgi?id=207581
Fixes: 6c8d74caa2fa33 ("drm/amdgpu: Enable scatter gather display support")
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
index 9ae7b61f696a..25ddb482466a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fb.c
@@ -133,8 +133,7 @@ static int amdgpufb_create_pinned_object(struct 
amdgpu_fbdev *rfbdev,
u32 cpp;
u64 flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
-  AMDGPU_GEM_CREATE_VRAM_CLEARED|
-  AMDGPU_GEM_CREATE_CPU_GTT_USWC;
+  AMDGPU_GEM_CREATE_VRAM_CLEARED;
 
info = drm_get_format_info(adev->ddev, mode_cmd);
cpp = info->cpp[0];
-- 
2.25.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [bug report] drm/amdgpu: add amdgpu_ras.c to support ras (v2)

2020-05-06 Thread Dan Carpenter

On Wed, May 06, 2020 at 10:10:56AM +, Pan, Xinhui wrote:
> [AMD Official Use Only - Internal Distribution Only]
> 
> no.  below function checks if block is valid or not.
> I think you need check your code_checker. or you were checking on a very old 
> codebase?
> 
> /* check if ras is supported on block, say, sdma, gfx */
> static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
> unsigned int block)

Ah!  That's right.  Thanks.

What happens here is that Smatch thinks amdgpu_ras_is_supported() always
returns false because there is a bug in how it tracks ras->supported.
I will fix this.

regards,
dan carpenter

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code

2020-05-06 Thread Wang, Kevin(Yang)

[AMD Official Use Only - Internal Distribution Only]

From: Zhang, Hawking 
Sent: Wednesday, May 6, 2020 5:26 PM
To: Wang, Kevin(Yang) ; amd-gfx@lists.freedesktop.org 

Cc: Deucher, Alexander ; Liu, Monk 
; Feng, Kenneth 
Subject: RE: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code

[AMD Official Use Only - Internal Distribution Only]

Hi Kelvin,

Thanks for the series that remove the duplicated one_vf mode check in all the 
amdgpu_dpm functions.

Can we split the patch into two? One for amdgpu device sysfs attr code refine, 
with the new dev_attr structures, the other for retiring all the unnecessary 
one_vf mode support.

thanks your comment.
[kevin]: Q1, agree, i will split it into two patch.

+enum amdgpu_device_attr_states {
+   ATTR_STATE_UNSUPPORT = 0,
+   ATTR_STATE_SUPPORT,
+   ATTR_STATE_DEAD,
+   ATTR_STATE_ALIVE,
+};
+
The attr_states seems unnecessary to me. You need a flag to mark whether a 
particular attribute is supported by specific ASIC or not, right? Then just a 
bool variable should be good enough for this purpose, Like attr->supported. I' 
d like to understand the use case for DEAD and ALIVE. Accordingly, you can 
simplify the logic that only remove the supported ones.

[kevin]: Q2, the origin idea, it is used to store sysfs file state, but for 
this case, we can try to drop DEAD & ALIVE state,
because the origin code logic will exit directly when create file fail.

If we have to introduce more complicated flags to indicate different status, 
I'd prefer to go directly to initialize one_vf mode attr sets and bare-metal 
attr sets directly.
[kevin]: Q3, i'd like to keep this patch code,  in fact, not all sysfs devices 
need to be created on bare-metal mode.
the driver must check it at runtime. eg: is_sw_smu_support(), if (asic_chip == 
XXX), etc..

In addition, the function naming like default_attr_perform also confusing me. 
Would it be the function that used to update attr status?
+static int default_attr_perform(struct amdgpu_device *adev, struct 
amdgpu_device_attr *attr,
+   uint32_t mask)

[kevin]: Q4, yes, the function is used to update attr status according to asic 
information at runtime.
maybe rename to "attr_update" is better.

Best Regards,
Kevin

Regards,
Hawking

-Original Message-
From: Wang, Kevin(Yang) 
Sent: Wednesday, May 6, 2020 14:23
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Liu, Monk ; Feng, Kenneth 
; Wang, Kevin(Yang) 
Subject: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code

unified amdgpu device attribute node functions:
1. add some helper functions to create amdgpu device attribute node.
2. create device node according to device attr flags on different VF mode.
3. rename some functions name to adapt a new interface.
4. remove unneccessary virt mode check in inernal functions (xxx_show, 
xxx_store).

Signed-off-by: Kevin Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 577 ++---  
drivers/gpu/drm/amd/amdgpu/amdgpu_pm.h |  48 ++
 2 files changed, 271 insertions(+), 354 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index c762deb5abc7..367ac79418b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -154,18 +154,15 @@ int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, 
enum amd_pp_sensors senso
  *
  */

-static ssize_t amdgpu_get_dpm_state(struct device *dev,
-   struct device_attribute *attr,
-   char *buf)
+static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
 {
 struct drm_device *ddev = dev_get_drvdata(dev);
 struct amdgpu_device *adev = ddev->dev_private;
 enum amd_pm_state_type pm;
 int ret;

-   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
-   return 0;
-
 ret = pm_runtime_get_sync(ddev->dev);
 if (ret < 0)
 return ret;
@@ -189,19 +186,16 @@ static ssize_t amdgpu_get_dpm_state(struct device *dev,
 (pm == POWER_STATE_TYPE_BALANCED) ? "balanced" : 
"performance");  }

-static ssize_t amdgpu_set_dpm_state(struct device *dev,
-   struct device_attribute *attr,
-   const char *buf,
-   size_t count)
+static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
 {
 struct drm_device *ddev = dev_get_drvdata(dev);
 struct amdgpu_device *adev = ddev->dev_private;
 en

Re: [PATCH] drm/amdgpu: avoid clearing freed bo with sdma in gpu reset

2020-05-06 Thread Christian König


Yes, exactly that one.

Regards,
Christian.

Am 06.05.20 um 12:35 schrieb Zhou, Tiecheng:

[AMD Official Use Only - Internal Distribution Only]

Thanks, Christian,

Is this the fix that you are mentioning:

commit 1675c3a24d075d484377003789245f48c2114a0b
Author: Christian König 
Date:   Fri Feb 21 15:10:31 2020 +0100

 drm/amdgpu: stop disable the scheduler during HW fini

 When we stop the HW for example for GPU reset we should not stop the
 front-end scheduler. Otherwise we run into intermediate failures during
 command submission.

 The scheduler should only be stopped in very few cases:
 1. We can't get the hardware working in ring or IB test after a GPU reset.
 2. The KIQ scheduler is not used in the front-end and should be disabled 
during GPU reset.
 3. In amdgpu_ring_fini() when the driver unloads.

 Signed-off-by: Christian König 
 Reviewed-by: Alex Deucher 
 Acked-by: Nirmoy Das 
 Test-by: Dennis Li 
 Signed-off-by: Alex Deucher 

Thanks
Tiecheng


-Original Message-
From: Christian König 
Sent: Wednesday, May 6, 2020 5:44 PM
To: Zhou, Tiecheng ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: avoid clearing freed bo with sdma in gpu reset

NAK, the fundamental problem was that we disabled the SDMA paging queue during 
reset:

[  885.694682] [drm] schedpage0 is not ready, skipping [  885.694682]
[drm] schedpage1 is not ready, skipping

This is fixed by now, so the problem should not happen any more.

Regards,
Christian.


Am 06.05.20 um 11:36 schrieb Tiecheng Zhou:

WHY:
For V320 passthrough and "modprobe amdgpu lockup_timeout=500", there
will be kernel NULL pointer when using quark ~ BACO reset, for instance:
hang_vm_compute0_bad_cs_dispatch.lua
hang_vm_dma0_corrupted_header.lua
etc.
-
[  884.792885] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring
comp_1.0.0 timeout, signaled seq=3, emitted seq=4 [  884.793772]
[drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information:
process quark pid 16939 thread quark pid 16940 [  884.859979] amdgpu:
[powerplay] set virtualization GFX DPM policy success [  884.861003]
amdgpu: [powerplay] activate virtualization GFX DPM policy success [  
884.861065] amdgpu: [powerplay] set virtualization VCE DPM policy success [  
885.693554] [drm:amdgpu_cs_ioctl [amdgpu]] *ERROR* Failed to initialize parser 
-125!
[  885.694682] [drm] schedpage0 is not ready, skipping [  885.694682]
[drm] schedpage1 is not ready, skipping [  885.694720]
[drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA (-2)
[  885.695328] BUG: unable to handle kernel NULL pointer dereference
at 0008 [  885.695909] PGD 0 P4D 0 [  885.696104] Oops:
 [#1] SMP PTI
[  885.696368] CPU: 2 PID: 16940 Comm: quark Tainted: G   OE 
4.19.52+ #6
[  885.696945] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.10.2-1 04/01/2014 [  885.697593] RIP:
0010:amdgpu_vm_sdma_commit+0x59/0x130 [amdgpu] ...
[  885.705042] Call Trace:
[  885.705251]  ? amdgpu_vm_bo_update_mapping+0xdf/0xf0 [amdgpu] [
885.705696]  ? amdgpu_vm_clear_freed+0xcc/0x1b0 [amdgpu] [
885.706112]  ? amdgpu_gem_va_ioctl+0x4a1/0x510 [amdgpu] [  885.706493]
? __radix_tree_delete+0x7e/0xa0 [  885.706822]  ?
amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu] [  885.707220]  ?
drm_ioctl_kernel+0xaa/0xf0 [drm] [  885.707568]  ?
amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu] [  885.707962]  ?
drm_ioctl_kernel+0xaa/0xf0 [drm] [  885.708294]  ?
drm_ioctl+0x3a7/0x3f0 [drm] [  885.708632]  ?
amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu] [  885.709032]  ?
unmap_region+0xd9/0x120 [  885.709328]  ? amdgpu_drm_ioctl+0x49/0x80
[amdgpu] [  885.709684]  ? do_vfs_ioctl+0xa1/0x620 [  885.709971]  ?
do_munmap+0x32e/0x430 [  885.710232]  ? ksys_ioctl+0x66/0x70 [
885.710513]  ? __x64_sys_ioctl+0x16/0x20 [  885.710806]  ?
do_syscall_64+0x55/0x100 [  885.711092]  ?
entry_SYSCALL_64_after_hwframe+0x44/0xa9
...
[  885.719408] ---[ end trace 7ee3180f42e9f572 ]--- [  885.719766]
RIP: 0010:amdgpu_vm_sdma_commit+0x59/0x130 [amdgpu] ...
-

the NULL pointer (entity->rq == NULL in amdgpu_vm_sdma_commit()) as follows:
1. quark sends bad job that triggers job timeout; 2. guest KMD detects
the job timeout and goes to gpu recovery, and it goes to
 ip_suspend for SDMA, and it sets sdma[].sched.ready to false; 3.
quark sends UNMAP operation through amdgpu_gem_va_ioctl, and guest KMD goes
 through amdgpu_gem_va_update_vm and finally goes to amdgpu_vm_sdma_commit,
 it goes to amdgpu_job_submit to drm_sched_job_init 4.
drm_sched_job_init fails at drm_sched_pick_best() since
 sdma[].sched.ready is set to false; in the meanwhile entity->rq
becomes NULL; 5. quark sends other UNMAP operations through 
amdgpu_gem_va_ioctl, while this time
 there will be NULL pointer because entity->rq is NULL;

the above sequence occurs only when "modprobe amdgpu lockup_timeout=500".
it does not occur when lockup_time

RE: [PATCH] drm/amdgpu: avoid clearing freed bo with sdma in gpu reset

2020-05-06 Thread Zhou, Tiecheng

[AMD Official Use Only - Internal Distribution Only]

Thanks, Christian,

Is this the fix that you are mentioning:

commit 1675c3a24d075d484377003789245f48c2114a0b
Author: Christian König 
Date:   Fri Feb 21 15:10:31 2020 +0100

drm/amdgpu: stop disable the scheduler during HW fini

When we stop the HW for example for GPU reset we should not stop the
front-end scheduler. Otherwise we run into intermediate failures during
command submission.

The scheduler should only be stopped in very few cases:
1. We can't get the hardware working in ring or IB test after a GPU reset.
2. The KIQ scheduler is not used in the front-end and should be disabled 
during GPU reset.
3. In amdgpu_ring_fini() when the driver unloads.

Signed-off-by: Christian König 
Reviewed-by: Alex Deucher 
Acked-by: Nirmoy Das 
Test-by: Dennis Li 
Signed-off-by: Alex Deucher 

Thanks
Tiecheng


-Original Message-
From: Christian König  
Sent: Wednesday, May 6, 2020 5:44 PM
To: Zhou, Tiecheng ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: avoid clearing freed bo with sdma in gpu reset

NAK, the fundamental problem was that we disabled the SDMA paging queue during 
reset:
> [  885.694682] [drm] schedpage0 is not ready, skipping [  885.694682] 
> [drm] schedpage1 is not ready, skipping

This is fixed by now, so the problem should not happen any more.

Regards,
Christian.


Am 06.05.20 um 11:36 schrieb Tiecheng Zhou:
> WHY:
> For V320 passthrough and "modprobe amdgpu lockup_timeout=500", there 
> will be kernel NULL pointer when using quark ~ BACO reset, for instance:
>hang_vm_compute0_bad_cs_dispatch.lua
>hang_vm_dma0_corrupted_header.lua
>etc.
> -
> [  884.792885] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring 
> comp_1.0.0 timeout, signaled seq=3, emitted seq=4 [  884.793772] 
> [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: 
> process quark pid 16939 thread quark pid 16940 [  884.859979] amdgpu: 
> [powerplay] set virtualization GFX DPM policy success [  884.861003] 
> amdgpu: [powerplay] activate virtualization GFX DPM policy success [  
> 884.861065] amdgpu: [powerplay] set virtualization VCE DPM policy success [  
> 885.693554] [drm:amdgpu_cs_ioctl [amdgpu]] *ERROR* Failed to initialize 
> parser -125!
> [  885.694682] [drm] schedpage0 is not ready, skipping [  885.694682] 
> [drm] schedpage1 is not ready, skipping [  885.694720] 
> [drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA (-2) 
> [  885.695328] BUG: unable to handle kernel NULL pointer dereference 
> at 0008 [  885.695909] PGD 0 P4D 0 [  885.696104] Oops: 
>  [#1] SMP PTI
> [  885.696368] CPU: 2 PID: 16940 Comm: quark Tainted: G   OE 
> 4.19.52+ #6
> [  885.696945] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
> BIOS 1.10.2-1 04/01/2014 [  885.697593] RIP: 
> 0010:amdgpu_vm_sdma_commit+0x59/0x130 [amdgpu] ...
> [  885.705042] Call Trace:
> [  885.705251]  ? amdgpu_vm_bo_update_mapping+0xdf/0xf0 [amdgpu] [  
> 885.705696]  ? amdgpu_vm_clear_freed+0xcc/0x1b0 [amdgpu] [  
> 885.706112]  ? amdgpu_gem_va_ioctl+0x4a1/0x510 [amdgpu] [  885.706493]  
> ? __radix_tree_delete+0x7e/0xa0 [  885.706822]  ? 
> amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu] [  885.707220]  ? 
> drm_ioctl_kernel+0xaa/0xf0 [drm] [  885.707568]  ? 
> amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu] [  885.707962]  ? 
> drm_ioctl_kernel+0xaa/0xf0 [drm] [  885.708294]  ? 
> drm_ioctl+0x3a7/0x3f0 [drm] [  885.708632]  ? 
> amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu] [  885.709032]  ? 
> unmap_region+0xd9/0x120 [  885.709328]  ? amdgpu_drm_ioctl+0x49/0x80 
> [amdgpu] [  885.709684]  ? do_vfs_ioctl+0xa1/0x620 [  885.709971]  ? 
> do_munmap+0x32e/0x430 [  885.710232]  ? ksys_ioctl+0x66/0x70 [  
> 885.710513]  ? __x64_sys_ioctl+0x16/0x20 [  885.710806]  ? 
> do_syscall_64+0x55/0x100 [  885.711092]  ? 
> entry_SYSCALL_64_after_hwframe+0x44/0xa9
> ...
> [  885.719408] ---[ end trace 7ee3180f42e9f572 ]--- [  885.719766] 
> RIP: 0010:amdgpu_vm_sdma_commit+0x59/0x130 [amdgpu] ...
> -
>
> the NULL pointer (entity->rq == NULL in amdgpu_vm_sdma_commit()) as follows:
> 1. quark sends bad job that triggers job timeout; 2. guest KMD detects 
> the job timeout and goes to gpu recovery, and it goes to
> ip_suspend for SDMA, and it sets sdma[].sched.ready to false; 3. 
> quark sends UNMAP operation through amdgpu_gem_va_ioctl, and guest KMD goes
> through amdgpu_gem_va_update_vm and finally goes to amdgpu_vm_sdma_commit,
> it goes to amdgpu_job_submit to drm_sched_job_init 4. 
> drm_sched_job_init fails at drm_sched_pick_best() since
> sdma[].sched.ready is set to false; in the meanwhile entity->rq 
> becomes NULL; 5. quark sends other UNMAP operations through 
> amdgpu_gem_va_ioctl, while this time
> there will be NULL pointer because entity->rq is NULL;
>
> the above sequence occurs only when "modprobe amdgpu loc

Re: [bug report] drm/amdgpu: add amdgpu_ras.c to support ras (v2)

2020-05-06 Thread Pan, Xinhui

[AMD Official Use Only - Internal Distribution Only]

no.  below function checks if block is valid or not.
I think you need check your code_checker. or you were checking on a very old 
codebase?

/* check if ras is supported on block, say, sdma, gfx */
static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
unsigned int block)

From: Dan Carpenter 
Sent: Wednesday, May 6, 2020 5:17:34 PM
To: Zhou1, Tao 
Cc: Pan, Xinhui ; amd-gfx@lists.freedesktop.org 

Subject: Re: [bug report] drm/amdgpu: add amdgpu_ras.c to support ras (v2)

On Wed, May 06, 2020 at 07:26:16AM +, Zhou1, Tao wrote:
> [AMD Public Use]
>
> Hi Dan:
>
> Please check the following piece of code in 
> amdgpu_ras_debugfs_ctrl_parse_data:
>
>if (op != -1) {
>if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
>return -EINVAL;
>
>data->head.block = block_id;
>
> amdgpu_ras_find_block_id_by_name will return error directly if someone try to 
> provide an invalid block_name intentionally via debugfs.
>

No.  It's the line after that which are the problem.

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
   147  static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
   148  const char __user *buf, size_t size,
   149  loff_t *pos, struct ras_debug_if *data)
   150  {
   151  ssize_t s = min_t(u64, 64, size);
   152  char str[65];
   153  char block_name[33];
   154  char err[9] = "ue";
   155  int op = -1;
   156  int block_id;
   157  uint32_t sub_block;
   158  u64 address, value;
   159
   160  if (*pos)
   161  return -EINVAL;
   162  *pos = size;
   163
   164  memset(str, 0, sizeof(str));
   165  memset(data, 0, sizeof(*data));
   166
   167  if (copy_from_user(str, buf, s))
   168  return -EINVAL;
   169
   170  if (sscanf(str, "disable %32s", block_name) == 1)
   171  op = 0;
   172  else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
   173  op = 1;
   174  else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
   175  op = 2;
   176  else if (str[0] && str[1] && str[2] && str[3])
   177  /* ascii string, but commands are not matched. */

Say we don't write an ascii string.

   178  return -EINVAL;
   179
   180  if (op != -1) {
   181  if (amdgpu_ras_find_block_id_by_name(block_name, 
&block_id))
   182  return -EINVAL;
   183
   184  data->head.block = block_id;
   185  /* only ue and ce errors are supported */
   186  if (!memcmp("ue", err, 2))
   187  data->head.type = 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
   188  else if (!memcmp("ce", err, 2))
   189  data->head.type = 
AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
   190  else
   191  return -EINVAL;
   192
   193  data->op = op;
   194
   195  if (op == 2) {
   196  if (sscanf(str, "%*s %*s %*s %u %llu %llu",
   197  &sub_block, &address, 
&value) != 3)
   198  if (sscanf(str, "%*s %*s %*s 0x%x 
0x%llx 0x%llx",
   199  &sub_block, 
&address, &value) != 3)
   200  return -EINVAL;
   201  data->head.sub_block_index = sub_block;
   202  data->inject.address = address;
   203  data->inject.value = value;
   204  }
   205  } else {
   206  if (size < sizeof(*data))
   207  return -EINVAL;
   208
   209  if (copy_from_user(data, buf, sizeof(*data)))
^^^
This lets us set the data->head.block to whatever we want.  Premusably
there is a trusted app which knows how to write the correct values.
But if it has a bug that will cause a crash and we'll have to find a
way to disable it in the kernel for kernel lock down mode etc so either
way we'll need to do a bit of work.

   210  return -EINVAL;
   211  }
   212
   213  return 0;
   214  }

regards,
dan carpenter

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdgpu: avoid clearing freed bo with sdma in gpu reset

2020-05-06 Thread Christian König

NAK, the fundamental problem was that we disabled the SDMA paging queue 
during reset:

[  885.694682] [drm] schedpage0 is not ready, skipping
[  885.694682] [drm] schedpage1 is not ready, skipping


This is fixed by now, so the problem should not happen any more.

Regards,
Christian.


Am 06.05.20 um 11:36 schrieb Tiecheng Zhou:

WHY:
For V320 passthrough and "modprobe amdgpu lockup_timeout=500", there will be
kernel NULL pointer when using quark ~ BACO reset, for instance:
   hang_vm_compute0_bad_cs_dispatch.lua
   hang_vm_dma0_corrupted_header.lua
   etc.
-
[  884.792885] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 
timeout, signaled seq=3, emitted seq=4
[  884.793772] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: 
process quark pid 16939 thread quark pid 16940
[  884.859979] amdgpu: [powerplay] set virtualization GFX DPM policy success
[  884.861003] amdgpu: [powerplay] activate virtualization GFX DPM policy 
success
[  884.861065] amdgpu: [powerplay] set virtualization VCE DPM policy success
[  885.693554] [drm:amdgpu_cs_ioctl [amdgpu]] *ERROR* Failed to initialize 
parser -125!
[  885.694682] [drm] schedpage0 is not ready, skipping
[  885.694682] [drm] schedpage1 is not ready, skipping
[  885.694720] [drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA 
(-2)
[  885.695328] BUG: unable to handle kernel NULL pointer dereference at 
0008
[  885.695909] PGD 0 P4D 0
[  885.696104] Oops:  [#1] SMP PTI
[  885.696368] CPU: 2 PID: 16940 Comm: quark Tainted: G   OE 
4.19.52+ #6
[  885.696945] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1 04/01/2014
[  885.697593] RIP: 0010:amdgpu_vm_sdma_commit+0x59/0x130 [amdgpu]
...
[  885.705042] Call Trace:
[  885.705251]  ? amdgpu_vm_bo_update_mapping+0xdf/0xf0 [amdgpu]
[  885.705696]  ? amdgpu_vm_clear_freed+0xcc/0x1b0 [amdgpu]
[  885.706112]  ? amdgpu_gem_va_ioctl+0x4a1/0x510 [amdgpu]
[  885.706493]  ? __radix_tree_delete+0x7e/0xa0
[  885.706822]  ? amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu]
[  885.707220]  ? drm_ioctl_kernel+0xaa/0xf0 [drm]
[  885.707568]  ? amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu]
[  885.707962]  ? drm_ioctl_kernel+0xaa/0xf0 [drm]
[  885.708294]  ? drm_ioctl+0x3a7/0x3f0 [drm]
[  885.708632]  ? amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu]
[  885.709032]  ? unmap_region+0xd9/0x120
[  885.709328]  ? amdgpu_drm_ioctl+0x49/0x80 [amdgpu]
[  885.709684]  ? do_vfs_ioctl+0xa1/0x620
[  885.709971]  ? do_munmap+0x32e/0x430
[  885.710232]  ? ksys_ioctl+0x66/0x70
[  885.710513]  ? __x64_sys_ioctl+0x16/0x20
[  885.710806]  ? do_syscall_64+0x55/0x100
[  885.711092]  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
...
[  885.719408] ---[ end trace 7ee3180f42e9f572 ]---
[  885.719766] RIP: 0010:amdgpu_vm_sdma_commit+0x59/0x130 [amdgpu]
...
-

the NULL pointer (entity->rq == NULL in amdgpu_vm_sdma_commit()) as follows:
1. quark sends bad job that triggers job timeout;
2. guest KMD detects the job timeout and goes to gpu recovery, and it goes to
ip_suspend for SDMA, and it sets sdma[].sched.ready to false;
3. quark sends UNMAP operation through amdgpu_gem_va_ioctl, and guest KMD goes
through amdgpu_gem_va_update_vm and finally goes to amdgpu_vm_sdma_commit,
it goes to amdgpu_job_submit to drm_sched_job_init
4. drm_sched_job_init fails at drm_sched_pick_best() since
sdma[].sched.ready is set to false; in the meanwhile entity->rq becomes 
NULL;
5. quark sends other UNMAP operations through amdgpu_gem_va_ioctl, while this 
time
there will be NULL pointer because entity->rq is NULL;

the above sequence occurs only when "modprobe amdgpu lockup_timeout=500".
it does not occur when lockup_timeout=1 (default) because step 2. KMD 
detects
job timeout will be sometime after quark sends UNMAP operations; i.e. quark 
UNMAP
opeartions are finished before sdma ip suspend.

HOW:
here is to add mutex_lock to wait to avoid using sdma during gpu reset.

Signed-off-by: Tiecheng Zhou 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index e205ecc75a21..018b88f3b6da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2047,6 +2047,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
struct dma_fence *f = NULL;
int r;
  
+	mutex_lock(&adev->lock_reset);

+
while (!list_empty(&vm->freed)) {
mapping = list_first_entry(&vm->freed,
struct amdgpu_bo_va_mapping, list);
@@ -2062,6 +2064,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
amdgpu_vm_free_mapping(adev, vm, mapping, f);
if (r) {
dma_fence_put(f);
+   mutex_unlock(&adev->lock_reset);
return r;
}

[PATCH] drm/amdgpu: avoid clearing freed bo with sdma in gpu reset

2020-05-06 Thread Tiecheng Zhou

WHY:
For V320 passthrough and "modprobe amdgpu lockup_timeout=500", there will be
kernel NULL pointer when using quark ~ BACO reset, for instance:
  hang_vm_compute0_bad_cs_dispatch.lua
  hang_vm_dma0_corrupted_header.lua
  etc.
-
[  884.792885] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 
timeout, signaled seq=3, emitted seq=4
[  884.793772] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process information: 
process quark pid 16939 thread quark pid 16940
[  884.859979] amdgpu: [powerplay] set virtualization GFX DPM policy success
[  884.861003] amdgpu: [powerplay] activate virtualization GFX DPM policy 
success
[  884.861065] amdgpu: [powerplay] set virtualization VCE DPM policy success
[  885.693554] [drm:amdgpu_cs_ioctl [amdgpu]] *ERROR* Failed to initialize 
parser -125!
[  885.694682] [drm] schedpage0 is not ready, skipping
[  885.694682] [drm] schedpage1 is not ready, skipping
[  885.694720] [drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA 
(-2)
[  885.695328] BUG: unable to handle kernel NULL pointer dereference at 
0008
[  885.695909] PGD 0 P4D 0
[  885.696104] Oops:  [#1] SMP PTI
[  885.696368] CPU: 2 PID: 16940 Comm: quark Tainted: G   OE 
4.19.52+ #6
[  885.696945] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-1 04/01/2014
[  885.697593] RIP: 0010:amdgpu_vm_sdma_commit+0x59/0x130 [amdgpu]
...
[  885.705042] Call Trace:
[  885.705251]  ? amdgpu_vm_bo_update_mapping+0xdf/0xf0 [amdgpu]
[  885.705696]  ? amdgpu_vm_clear_freed+0xcc/0x1b0 [amdgpu]
[  885.706112]  ? amdgpu_gem_va_ioctl+0x4a1/0x510 [amdgpu]
[  885.706493]  ? __radix_tree_delete+0x7e/0xa0
[  885.706822]  ? amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu]
[  885.707220]  ? drm_ioctl_kernel+0xaa/0xf0 [drm]
[  885.707568]  ? amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu]
[  885.707962]  ? drm_ioctl_kernel+0xaa/0xf0 [drm]
[  885.708294]  ? drm_ioctl+0x3a7/0x3f0 [drm]
[  885.708632]  ? amdgpu_gem_va_map_flags+0x70/0x70 [amdgpu]
[  885.709032]  ? unmap_region+0xd9/0x120
[  885.709328]  ? amdgpu_drm_ioctl+0x49/0x80 [amdgpu]
[  885.709684]  ? do_vfs_ioctl+0xa1/0x620
[  885.709971]  ? do_munmap+0x32e/0x430
[  885.710232]  ? ksys_ioctl+0x66/0x70
[  885.710513]  ? __x64_sys_ioctl+0x16/0x20
[  885.710806]  ? do_syscall_64+0x55/0x100
[  885.711092]  ? entry_SYSCALL_64_after_hwframe+0x44/0xa9
...
[  885.719408] ---[ end trace 7ee3180f42e9f572 ]---
[  885.719766] RIP: 0010:amdgpu_vm_sdma_commit+0x59/0x130 [amdgpu]
...
-

the NULL pointer (entity->rq == NULL in amdgpu_vm_sdma_commit()) as follows:
1. quark sends bad job that triggers job timeout;
2. guest KMD detects the job timeout and goes to gpu recovery, and it goes to
   ip_suspend for SDMA, and it sets sdma[].sched.ready to false;
3. quark sends UNMAP operation through amdgpu_gem_va_ioctl, and guest KMD goes
   through amdgpu_gem_va_update_vm and finally goes to amdgpu_vm_sdma_commit,
   it goes to amdgpu_job_submit to drm_sched_job_init
4. drm_sched_job_init fails at drm_sched_pick_best() since
   sdma[].sched.ready is set to false; in the meanwhile entity->rq becomes NULL;
5. quark sends other UNMAP operations through amdgpu_gem_va_ioctl, while this 
time
   there will be NULL pointer because entity->rq is NULL;

the above sequence occurs only when "modprobe amdgpu lockup_timeout=500".
it does not occur when lockup_timeout=1 (default) because step 2. KMD 
detects
job timeout will be sometime after quark sends UNMAP operations; i.e. quark 
UNMAP
opeartions are finished before sdma ip suspend.

HOW:
here is to add mutex_lock to wait to avoid using sdma during gpu reset.

Signed-off-by: Tiecheng Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index e205ecc75a21..018b88f3b6da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2047,6 +2047,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
struct dma_fence *f = NULL;
int r;
 
+   mutex_lock(&adev->lock_reset);
+
while (!list_empty(&vm->freed)) {
mapping = list_first_entry(&vm->freed,
struct amdgpu_bo_va_mapping, list);
@@ -2062,6 +2064,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
amdgpu_vm_free_mapping(adev, vm, mapping, f);
if (r) {
dma_fence_put(f);
+   mutex_unlock(&adev->lock_reset);
return r;
}
}
@@ -2073,6 +2076,7 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
dma_fence_put(f);
}
 
+   mutex_unlock(&adev->lock_reset);
return 0;
 
 }
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman

RE: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code

2020-05-06 Thread Zhang, Hawking

[AMD Official Use Only - Internal Distribution Only]

Hi Kelvin,

Thanks for the series that remove the duplicated one_vf mode check in all the 
amdgpu_dpm functions.

Can we split the patch into two? One for amdgpu device sysfs attr code refine, 
with the new dev_attr structures, the other for retiring all the unnecessary 
one_vf mode support.

+enum amdgpu_device_attr_states {
+   ATTR_STATE_UNSUPPORT = 0,
+   ATTR_STATE_SUPPORT,
+   ATTR_STATE_DEAD,
+   ATTR_STATE_ALIVE,
+};
+
The attr_states seems unnecessary to me. You need a flag to mark whether a 
particular attribute is supported by specific ASIC or not, right? Then just a 
bool variable should be good enough for this purpose, Like attr->supported. I' 
d like to understand the use case for DEAD and ALIVE. Accordingly, you can 
simplify the logic that only remove the supported ones.

If we have to introduce more complicated flags to indicate different status, 
I'd prefer to go directly to initialize one_vf mode attr sets and bare-metal 
attr sets directly.

In addition, the function naming like default_attr_perform also confusing me. 
Would it be the function that used to update attr status? 
+static int default_attr_perform(struct amdgpu_device *adev, struct 
amdgpu_device_attr *attr,
+   uint32_t mask)

Regards,
Hawking

-Original Message-
From: Wang, Kevin(Yang)  
Sent: Wednesday, May 6, 2020 14:23
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Deucher, Alexander 
; Liu, Monk ; Feng, Kenneth 
; Wang, Kevin(Yang) 
Subject: [PATCH 2/3] drm/amdgpu: optimize amdgpu device attribute code

unified amdgpu device attribute node functions:
1. add some helper functions to create amdgpu device attribute node.
2. create device node according to device attr flags on different VF mode.
3. rename some functions name to adapt a new interface.
4. remove unneccessary virt mode check in inernal functions (xxx_show, 
xxx_store).

Signed-off-by: Kevin Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 577 ++---  
drivers/gpu/drm/amd/amdgpu/amdgpu_pm.h |  48 ++
 2 files changed, 271 insertions(+), 354 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index c762deb5abc7..367ac79418b9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -154,18 +154,15 @@ int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, 
enum amd_pp_sensors senso
  *
  */
 
-static ssize_t amdgpu_get_dpm_state(struct device *dev,
-   struct device_attribute *attr,
-   char *buf)
+static ssize_t amdgpu_get_power_dpm_state(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
 {
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = ddev->dev_private;
enum amd_pm_state_type pm;
int ret;
 
-   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
-   return 0;
-
ret = pm_runtime_get_sync(ddev->dev);
if (ret < 0)
return ret;
@@ -189,19 +186,16 @@ static ssize_t amdgpu_get_dpm_state(struct device *dev,
(pm == POWER_STATE_TYPE_BALANCED) ? "balanced" : 
"performance");  }
 
-static ssize_t amdgpu_set_dpm_state(struct device *dev,
-   struct device_attribute *attr,
-   const char *buf,
-   size_t count)
+static ssize_t amdgpu_set_power_dpm_state(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
 {
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = ddev->dev_private;
enum amd_pm_state_type  state;
int ret;
 
-   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
-   return -EINVAL;
-
if (strncmp("battery", buf, strlen("battery")) == 0)
state = POWER_STATE_TYPE_BATTERY;
else if (strncmp("balanced", buf, strlen("balanced")) == 0) @@ -294,18 
+288,15 @@ static ssize_t amdgpu_set_dpm_state(struct device *dev,
  *
  */
 
-static ssize_t amdgpu_get_dpm_forced_performance_level(struct device *dev,
-   struct device_attribute *attr,
-   char *buf)
+static ssize_t amdgpu_get_power_dpm_force_performance_level(struct device *dev,
+   struct 
device_attribute *attr,
+   char *buf)
 {
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev

Re: [bug report] drm/amdgpu: add amdgpu_ras.c to support ras (v2)

2020-05-06 Thread Dan Carpenter

On Wed, May 06, 2020 at 07:26:16AM +, Zhou1, Tao wrote:
> [AMD Public Use]
> 
> Hi Dan:
> 
> Please check the following piece of code in 
> amdgpu_ras_debugfs_ctrl_parse_data:
> 
>   if (op != -1) {
>   if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
>   return -EINVAL;
> 
>   data->head.block = block_id;
> 
> amdgpu_ras_find_block_id_by_name will return error directly if someone try to 
> provide an invalid block_name intentionally via debugfs.
> 

No.  It's the line after that which are the problem.

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
   147  static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
   148  const char __user *buf, size_t size,
   149  loff_t *pos, struct ras_debug_if *data)
   150  {
   151  ssize_t s = min_t(u64, 64, size);
   152  char str[65];
   153  char block_name[33];
   154  char err[9] = "ue";
   155  int op = -1;
   156  int block_id;
   157  uint32_t sub_block;
   158  u64 address, value;
   159  
   160  if (*pos)
   161  return -EINVAL;
   162  *pos = size;
   163  
   164  memset(str, 0, sizeof(str));
   165  memset(data, 0, sizeof(*data));
   166  
   167  if (copy_from_user(str, buf, s))
   168  return -EINVAL;
   169  
   170  if (sscanf(str, "disable %32s", block_name) == 1)
   171  op = 0;
   172  else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
   173  op = 1;
   174  else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
   175  op = 2;
   176  else if (str[0] && str[1] && str[2] && str[3])
   177  /* ascii string, but commands are not matched. */

Say we don't write an ascii string.

   178  return -EINVAL;
   179  
   180  if (op != -1) {
   181  if (amdgpu_ras_find_block_id_by_name(block_name, 
&block_id))
   182  return -EINVAL;
   183  
   184  data->head.block = block_id;
   185  /* only ue and ce errors are supported */
   186  if (!memcmp("ue", err, 2))
   187  data->head.type = 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
   188  else if (!memcmp("ce", err, 2))
   189  data->head.type = 
AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
   190  else
   191  return -EINVAL;
   192  
   193  data->op = op;
   194  
   195  if (op == 2) {
   196  if (sscanf(str, "%*s %*s %*s %u %llu %llu",
   197  &sub_block, &address, 
&value) != 3)
   198  if (sscanf(str, "%*s %*s %*s 0x%x 
0x%llx 0x%llx",
   199  &sub_block, 
&address, &value) != 3)
   200  return -EINVAL;
   201  data->head.sub_block_index = sub_block;
   202  data->inject.address = address;
   203  data->inject.value = value;
   204  }
   205  } else {
   206  if (size < sizeof(*data))
   207  return -EINVAL;
   208  
   209  if (copy_from_user(data, buf, sizeof(*data)))
^^^
This lets us set the data->head.block to whatever we want.  Premusably
there is a trusted app which knows how to write the correct values.
But if it has a bug that will cause a crash and we'll have to find a
way to disable it in the kernel for kernel lock down mode etc so either
way we'll need to do a bit of work.

   210  return -EINVAL;
   211  }
   212  
   213  return 0;
   214  }

regards,
dan carpenter

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 1/4] drm/amdgpu: switch to common xgmi ta helpers

2020-05-06 Thread Clements, John

[AMD Public Use]

Reviewed-by: John Clements 

-Original Message-
From: Zhou1, Tao  
Sent: Wednesday, May 6, 2020 4:39 PM
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; 
Deucher, Alexander ; Clements, John 
; Gao, Likun ; Chen, Guchun 
; Li, Dennis 
Cc: Zhang, Hawking 
Subject: RE: [PATCH 1/4] drm/amdgpu: switch to common xgmi ta helpers

[AMD Public Use]

The series is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Hawking Zhang 
> Sent: 2020年5月6日 14:39
> To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
> ; Clements, John ; 
> Gao, Likun ; Chen, Guchun ; 
> Li, Dennis ; Zhou1, Tao 
> Cc: Zhang, Hawking 
> Subject: [PATCH 1/4] drm/amdgpu: switch to common xgmi ta helpers
> 
> get_hive_id/get_node_id/get_topology_info/set_topology_info
> are common xgmi command supported by TA for all the ASICs that support 
> xgmi link. They should be implemented as common helper functions to 
> avoid duplicated code per IP generation
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 115
> ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  24 +++ 
> drivers/gpu/drm/amd/amdgpu/psp_v11_0.c  | 121 
> 
>  3 files changed, 123 insertions(+), 137 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index f061ad6..bb5b510 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -664,6 +664,121 @@ int psp_xgmi_initialize(struct psp_context *psp)
>   return ret;
>  }
> 
> +int psp_xgmi_get_hive_id(struct psp_context *psp, uint64_t *hive_id) {
> + struct ta_xgmi_shared_memory *xgmi_cmd;
> + int ret;
> +
> + xgmi_cmd = (struct ta_xgmi_shared_memory*)psp-
> >xgmi_context.xgmi_shared_buf;
> + memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
> +
> + xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_HIVE_ID;
> +
> + /* Invoke xgmi ta to get hive id */
> + ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
> + if (ret)
> + return ret;
> +
> + *hive_id = xgmi_cmd->xgmi_out_message.get_hive_id.hive_id;
> +
> + return 0;
> +}
> +
> +int psp_xgmi_get_node_id(struct psp_context *psp, uint64_t *node_id) {
> + struct ta_xgmi_shared_memory *xgmi_cmd;
> + int ret;
> +
> + xgmi_cmd = (struct ta_xgmi_shared_memory*)psp-
> >xgmi_context.xgmi_shared_buf;
> + memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
> +
> + xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_NODE_ID;
> +
> + /* Invoke xgmi ta to get the node id */
> + ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
> + if (ret)
> + return ret;
> +
> + *node_id = xgmi_cmd->xgmi_out_message.get_node_id.node_id;
> +
> + return 0;
> +}
> +
> +int psp_xgmi_get_topology_info(struct psp_context *psp,
> +int number_devices,
> +struct psp_xgmi_topology_info *topology) {
> + struct ta_xgmi_shared_memory *xgmi_cmd;
> + struct ta_xgmi_cmd_get_topology_info_input *topology_info_input;
> + struct ta_xgmi_cmd_get_topology_info_output
> *topology_info_output;
> + int i;
> + int ret;
> +
> + if (!topology || topology->num_nodes >
> TA_XGMI__MAX_CONNECTED_NODES)
> + return -EINVAL;
> +
> + xgmi_cmd = (struct ta_xgmi_shared_memory*)psp-
> >xgmi_context.xgmi_shared_buf;
> + memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
> +
> + /* Fill in the shared memory with topology information as input */
> + topology_info_input = &xgmi_cmd-
> >xgmi_in_message.get_topology_info;
> + xgmi_cmd->cmd_id =
> TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO;
> + topology_info_input->num_nodes = number_devices;
> +
> + for (i = 0; i < topology_info_input->num_nodes; i++) {
> + topology_info_input->nodes[i].node_id = topology-
> >nodes[i].node_id;
> + topology_info_input->nodes[i].num_hops = topology-
> >nodes[i].num_hops;
> + topology_info_input->nodes[i].is_sharing_enabled =
> topology->nodes[i].is_sharing_enabled;
> + topology_info_input->nodes[i].sdma_engine = topology-
> >nodes[i].sdma_engine;
> + }
> +
> + /* Invoke xgmi ta to get the topology information */
> + ret = psp_xgmi_invoke(psp,
> TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO);
> + if (ret)
> + return ret;
> +
> + /* Read the output topology information from the shared memory */
> + topology_info_output = &xgmi_cmd-
> >xgmi_out_message.get_topology_info;
> + topology->num_nodes = xgmi_cmd-
> >xgmi_out_message.get_topology_info.num_nodes;
> + for (i = 0; i < topology->num_nodes; i++) {
> + topology->nodes[i].node_id = topology_info_output-
> >nodes[i].node_id;
> + topology->nodes[i].num_hops = topology_info_output-
> >nodes[i].num_hops;
> + topology->nodes[i].is_sharing_enabled =
> topology_

RE: [PATCH] drm/amdgpu: use node_id and node_size to calcualte dram_base_address

2020-05-06 Thread Clements, John

[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: John Clements 

-Original Message-
From: Hawking Zhang  
Sent: Wednesday, May 6, 2020 2:42 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Clements, John 
Cc: Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: use node_id and node_size to calcualte 
dram_base_address

physical_node_id * node_segment_size should be the dram_base_address for 
current gpu node in xgmi config

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_df.h   |  3 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 27 ++--
 drivers/gpu/drm/amd/amdgpu/df_v3_6.c | 54 
 3 files changed, 2 insertions(+), 82 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h
index 057f6ea..61a26c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h
@@ -52,9 +52,6 @@ struct amdgpu_df_funcs {
uint64_t (*get_fica)(struct amdgpu_device *adev, uint32_t ficaa_val);
void (*set_fica)(struct amdgpu_device *adev, uint32_t ficaa_val,
 uint32_t ficadl_val, uint32_t ficadh_val);
-   uint64_t (*get_dram_base_addr)(struct amdgpu_device *adev,
-  uint32_t df_inst);
-   uint32_t (*get_df_inst_id)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_df {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 48c0ce1..90610b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -649,31 +649,8 @@ void amdgpu_xgmi_ras_fini(struct amdgpu_device *adev)  
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
   uint64_t addr)
 {
-   uint32_t df_inst_id;
-   uint64_t dram_base_addr = 0;
-   const struct amdgpu_df_funcs *df_funcs = adev->df.funcs;
-
-   if ((!df_funcs) ||
-   (!df_funcs->get_df_inst_id) ||
-   (!df_funcs->get_dram_base_addr)) {
-   dev_warn(adev->dev,
-"XGMI: relative phy_addr algorithm is not 
supported\n");
-   return addr;
-   }
-
-   if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) {
-   dev_warn(adev->dev,
-"failed to disable DF-Cstate, DF register may not be 
accessible\n");
-   return addr;
-   }
-
-   df_inst_id = df_funcs->get_df_inst_id(adev);
-   dram_base_addr = df_funcs->get_dram_base_addr(adev, df_inst_id);
-
-   if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
-   dev_warn(adev->dev, "failed to enable DF-Cstate\n");
-
-   return addr + dram_base_addr;
+   struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
+   return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
 }
 
 static void pcs_clear_status(struct amdgpu_device *adev, uint32_t 
pcs_status_reg) diff --git a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c 
b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c
index 5a1bd8e..a7b8292 100644
--- a/drivers/gpu/drm/amd/amdgpu/df_v3_6.c
+++ b/drivers/gpu/drm/amd/amdgpu/df_v3_6.c
@@ -686,58 +686,6 @@ static void df_v3_6_pmc_get_count(struct amdgpu_device 
*adev,
}
 }
 
-static uint64_t df_v3_6_get_dram_base_addr(struct amdgpu_device *adev,
-  uint32_t df_inst)
-{
-   uint32_t base_addr_reg_val  = 0;
-   uint64_t base_addr  = 0;
-
-   base_addr_reg_val = RREG32_PCIE(smnDF_CS_UMC_AON0_DramBaseAddress0 +
-   df_inst * DF_3_6_SMN_REG_INST_DIST);
-
-   if (REG_GET_FIELD(base_addr_reg_val,
- DF_CS_UMC_AON0_DramBaseAddress0,
- AddrRngVal) == 0) {
-   DRM_WARN("address range not valid");
-   return 0;
-   }
-
-   base_addr = REG_GET_FIELD(base_addr_reg_val,
- DF_CS_UMC_AON0_DramBaseAddress0,
- DramBaseAddr);
-
-   return base_addr << 28;
-}
-
-static uint32_t df_v3_6_get_df_inst_id(struct amdgpu_device *adev) -{
-   uint32_t xgmi_node_id   = 0;
-   uint32_t df_inst_id = 0;
-
-   /* Walk through DF dst nodes to find current XGMI node */
-   for (df_inst_id = 0; df_inst_id < DF_3_6_INST_CNT; df_inst_id++) {
-
-   xgmi_node_id = RREG32_PCIE(smnDF_CS_UMC_AON0_DramLimitAddress0 +
-  df_inst_id * 
DF_3_6_SMN_REG_INST_DIST);
-   xgmi_node_id = REG_GET_FIELD(xgmi_node_id,
-DF_CS_UMC_AON0_DramLimitAddress0,
-DstFabricID);
-
-   /* TODO: establish reason dest fabric id is offset by 7 */
-   xgmi_node_id = xgmi_node_id >> 7;
-
-   if (adev->gmc.xgmi.physical_

RE: [PATCH 1/4] drm/amdgpu: switch to common xgmi ta helpers

2020-05-06 Thread Zhou1, Tao

[AMD Public Use]

The series is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Hawking Zhang 
> Sent: 2020年5月6日 14:39
> To: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> ; Clements, John
> ; Gao, Likun ; Chen,
> Guchun ; Li, Dennis ;
> Zhou1, Tao 
> Cc: Zhang, Hawking 
> Subject: [PATCH 1/4] drm/amdgpu: switch to common xgmi ta helpers
> 
> get_hive_id/get_node_id/get_topology_info/set_topology_info
> are common xgmi command supported by TA for all the ASICs that support
> xgmi link. They should be implemented as common helper functions to avoid
> duplicated code per IP generation
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 115
> ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  24 +++
> drivers/gpu/drm/amd/amdgpu/psp_v11_0.c  | 121 
>  3 files changed, 123 insertions(+), 137 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index f061ad6..bb5b510 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -664,6 +664,121 @@ int psp_xgmi_initialize(struct psp_context *psp)
>   return ret;
>  }
> 
> +int psp_xgmi_get_hive_id(struct psp_context *psp, uint64_t *hive_id) {
> + struct ta_xgmi_shared_memory *xgmi_cmd;
> + int ret;
> +
> + xgmi_cmd = (struct ta_xgmi_shared_memory*)psp-
> >xgmi_context.xgmi_shared_buf;
> + memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
> +
> + xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_HIVE_ID;
> +
> + /* Invoke xgmi ta to get hive id */
> + ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
> + if (ret)
> + return ret;
> +
> + *hive_id = xgmi_cmd->xgmi_out_message.get_hive_id.hive_id;
> +
> + return 0;
> +}
> +
> +int psp_xgmi_get_node_id(struct psp_context *psp, uint64_t *node_id) {
> + struct ta_xgmi_shared_memory *xgmi_cmd;
> + int ret;
> +
> + xgmi_cmd = (struct ta_xgmi_shared_memory*)psp-
> >xgmi_context.xgmi_shared_buf;
> + memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
> +
> + xgmi_cmd->cmd_id = TA_COMMAND_XGMI__GET_NODE_ID;
> +
> + /* Invoke xgmi ta to get the node id */
> + ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
> + if (ret)
> + return ret;
> +
> + *node_id = xgmi_cmd->xgmi_out_message.get_node_id.node_id;
> +
> + return 0;
> +}
> +
> +int psp_xgmi_get_topology_info(struct psp_context *psp,
> +int number_devices,
> +struct psp_xgmi_topology_info *topology) {
> + struct ta_xgmi_shared_memory *xgmi_cmd;
> + struct ta_xgmi_cmd_get_topology_info_input *topology_info_input;
> + struct ta_xgmi_cmd_get_topology_info_output
> *topology_info_output;
> + int i;
> + int ret;
> +
> + if (!topology || topology->num_nodes >
> TA_XGMI__MAX_CONNECTED_NODES)
> + return -EINVAL;
> +
> + xgmi_cmd = (struct ta_xgmi_shared_memory*)psp-
> >xgmi_context.xgmi_shared_buf;
> + memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
> +
> + /* Fill in the shared memory with topology information as input */
> + topology_info_input = &xgmi_cmd-
> >xgmi_in_message.get_topology_info;
> + xgmi_cmd->cmd_id =
> TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO;
> + topology_info_input->num_nodes = number_devices;
> +
> + for (i = 0; i < topology_info_input->num_nodes; i++) {
> + topology_info_input->nodes[i].node_id = topology-
> >nodes[i].node_id;
> + topology_info_input->nodes[i].num_hops = topology-
> >nodes[i].num_hops;
> + topology_info_input->nodes[i].is_sharing_enabled =
> topology->nodes[i].is_sharing_enabled;
> + topology_info_input->nodes[i].sdma_engine = topology-
> >nodes[i].sdma_engine;
> + }
> +
> + /* Invoke xgmi ta to get the topology information */
> + ret = psp_xgmi_invoke(psp,
> TA_COMMAND_XGMI__GET_GET_TOPOLOGY_INFO);
> + if (ret)
> + return ret;
> +
> + /* Read the output topology information from the shared memory */
> + topology_info_output = &xgmi_cmd-
> >xgmi_out_message.get_topology_info;
> + topology->num_nodes = xgmi_cmd-
> >xgmi_out_message.get_topology_info.num_nodes;
> + for (i = 0; i < topology->num_nodes; i++) {
> + topology->nodes[i].node_id = topology_info_output-
> >nodes[i].node_id;
> + topology->nodes[i].num_hops = topology_info_output-
> >nodes[i].num_hops;
> + topology->nodes[i].is_sharing_enabled =
> topology_info_output->nodes[i].is_sharing_enabled;
> + topology->nodes[i].sdma_engine = topology_info_output-
> >nodes[i].sdma_engine;
> + }
> +
> + return 0;
> +}
> +
> +int psp_xgmi_set_topology_info(struct psp_context *psp,
> +int number_devices,
> +struct psp_xgmi_topology_info *topology)

Re: [PATCH 1/1] drm/amdgpu: Use GEM obj reference for KFD BOs

2020-05-06 Thread Christian König


Am 06.05.20 um 02:59 schrieb Felix Kuehling:

Releasing the AMDGPU BO ref directly leads to problems when BOs were
exported as DMA bufs. Releasing the GEM reference makes sure that the
AMDGPU/TTM BO is not freed too early.

Also take a GEM reference when importing BOs from DMABufs to keep
references to imported BOs balances properly.

Signed-off-by: Felix Kuehling 
Tested-by: Alex Sierra 


Acked-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 1247938b1ec1..da8b31a53291 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1354,7 +1354,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
}
  
  	/* Free the BO*/

-   amdgpu_bo_unref(&mem->bo);
+   drm_gem_object_put_unlocked(&mem->bo->tbo.base);
mutex_destroy(&mem->lock);
kfree(mem);
  
@@ -1699,7 +1699,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev *kgd,

| KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE
| KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
  
-	(*mem)->bo = amdgpu_bo_ref(bo);

+   drm_gem_object_get(&bo->tbo.base);
+   (*mem)->bo = bo;
(*mem)->va = va;
(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

2020-05-06 Thread Christian König


Am 06.05.20 um 05:45 schrieb Zhao, Jiange:

[AMD Official Use Only - Internal Distribution Only]

Hi Christian,


Hi Jiange, well that looks correct to me, but seems to be a bit to complicated. 
What exactly was wrong with version 3?

(1) If you open amdgpu_autodump, use it and close it, then you can't open it 
again, because wait_for_completion_interruptible_timeout() would decrement 
amdgpu_autodump.dumping.done to 0, then .open() would always return failure 
except the first time.


In this case we should probably just use complete_all() instead of just 
complete(). So that the struct complete stays in the completed state.



(2) reset lock is not optimal in this case. Because usermode app would take any 
operation at any time and there are so many race conditions to solve. A 
dedicated lock would be simpler and clearer.


I don't think that this will work. Using the reset lock is mandatory 
here or otherwise we always race between a new process opening the file 
and an ongoing GPU reset.


Just imagine what happens when the process which waited for the GPU 
reset event doesn't do a dump, but just closes and immediately reopens 
the file while the last reset is still ongoing.


What we could do here is using mutex_trylock() on the reset lock and 
return -EBUSY when a reset is ongoing. Or maybe better 
mutex_lock_interruptible().



Please completely drop this extra check. Waking up the queue and waiting for 
completion should always work when done right.

This check is very necessary, because if there is no usermode app listening, 
the following wait_for_completion_interruptible_timeout() would wait until 
timeout anyway, which is 10 minutes for nothing. This is not what we wanted.


See the wait_event_* documentation, exactly that's what you should never do.

Instead just signal the struct completion with complete_all() directly 
after it is created. This way the wakeup is a no-op and waiting for the 
struct completion continues immediately.


Regards,
Christian.



Jiange

-Original Message-
From: Koenig, Christian 
Sent: Wednesday, April 29, 2020 10:09 PM
To: Pelloux-prayer, Pierre-eric ; Zhao, Jiange 
; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Deucher, Alexander 
; Liu, Monk ; Zhang, Hawking 

Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset v4

Am 29.04.20 um 16:04 schrieb Pierre-Eric Pelloux-Prayer:

Hi Jiange,

This version seems to work fine.

Tested-by: Pierre-Eric Pelloux-Prayer



On 29/04/2020 07:08, Zhao, Jiange wrote:

[AMD Official Use Only - Internal Distribution Only]


Hi all,

I worked out the race condition and here is version 5. Please have a look.

Jiange
-
-
-
-
-
-
-
-
-
-
-
-
-
-

*From:* Zhao, Jiange 
*Sent:* Wednesday, April 29, 2020 1:06 PM
*To:* amd-gfx@lists.freedesktop.org 
*Cc:* Koenig, Christian ; Kuehling, Felix
; Pelloux-prayer, Pierre-eric
; Deucher, Alexander
; Zhang, Hawking ;
Liu, Monk ; Zhao, Jiange 
*Subject:* [PATCH] drm/amdgpu: Add autodump debugfs node for gpu
reset v4
   
From: Jiange Zhao 


When GPU got timeout, it would notify an interested part of an
opportunity to dump info before actual GPU reset.

A usermode app would open 'autodump' node under debugfs system and
poll() for readable/writable. When a GPU reset is due, amdgpu would
notify usermode app through wait_queue_head and give it 10 minutes to
dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through the
completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

v2: (1) changed 'registered' to 'app_listening'
      (2) add a mutex in open() to prevent race condition

v3 (chk): grab the reset lock to avoid race in autodump_open,
    rename debugfs file to amdgpu_autodump,
    provide autodump_read as well,
    style and code cleanups

v4

RE: [bug report] drm/amdgpu: add amdgpu_ras.c to support ras (v2)

2020-05-06 Thread Zhou1, Tao

[AMD Public Use]

Hi Dan:

Please check the following piece of code in amdgpu_ras_debugfs_ctrl_parse_data:

if (op != -1) {
if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
return -EINVAL;

data->head.block = block_id;

amdgpu_ras_find_block_id_by_name will return error directly if someone try to 
provide an invalid block_name intentionally via debugfs.

Regards,
Tao

> -Original Message-
> From: amd-gfx  On Behalf Of Dan
> Carpenter
> Sent: 2020年5月5日 17:13
> To: Pan, Xinhui 
> Cc: amd-gfx@lists.freedesktop.org
> Subject: [bug report] drm/amdgpu: add amdgpu_ras.c to support ras (v2)
> 
> Hello xinhui pan,
> 
> The patch c030f2e4166c: "drm/amdgpu: add amdgpu_ras.c to support ras
> (v2)" from Oct 31, 2018, leads to the following static checker
> warning:
> 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c:620
> amdgpu_ras_feature_enable()
>   warn: uncapped user index 'ras_block_string[head->block]'
> 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>587  int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
>588  struct ras_common_if *head, bool enable)
>589  {
>590  struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>591  union ta_ras_cmd_input info;
>592  int ret;
>593
>594  if (!con)
>595  return -EINVAL;
>596
>597  if (!enable) {
>598  info.disable_features = (struct 
> ta_ras_disable_features_input)
> {
>599  .block_id =  
> amdgpu_ras_block_to_ta(head->block),
>600  .error_type = 
> amdgpu_ras_error_to_ta(head->type),
>601  };
>602  } else {
>603  info.enable_features = (struct 
> ta_ras_enable_features_input)
> {
>604  .block_id =  
> amdgpu_ras_block_to_ta(head->block),
>605  .error_type = 
> amdgpu_ras_error_to_ta(head->type),
>606  };
>607  }
>608
>609  /* Do not enable if it is not allowed. */
>610  WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev,
> head));
>611  /* Are we alerady in that state we are going to set? */
>612  if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
>613  return 0;
>614
>615  if (!amdgpu_ras_intr_triggered()) {
>616  ret = psp_ras_enable_features(&adev->psp, &info, 
> enable);
>617  if (ret) {
>618  amdgpu_ras_parse_status_code(adev,
>619   enable ? 
> "enable":"disable",
>620   
> ras_block_str(head->block),
>
> ^^^ The head->block
> value can be set to anything using debugfs.  That's a problem because it
> could easily lead to a kernel panic (which is
> annoying) and also because these days we try to restrict what root can do.
> 
>621  (enum 
> ta_ras_status)ret);
>622  if (ret == TA_RAS_STATUS__RESET_NEEDED)
>623  return -EAGAIN;
>624  return -EINVAL;
>625  }
>626  }
>627
>628  /* setup the obj */
>629  __amdgpu_ras_feature_enable(adev, head, enable);
>630
>631  return 0;
>632  }
> 
> regards,
> dan carpenter
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.f
> reedesktop.org%2Fmailman%2Flistinfo%2Famd-
> gfx&data=02%7C01%7Ctao.zhou1%40amd.com%7Cd9925eca763149180
> ea508d7f0d47cfe%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C6
> 37242667754892396&sdata=k%2FMTgz9D1jIJGRFBu2Yuu6wuHP%2BPbR
> vEcNJp87slIJE%3D&reserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/1] drm/amdgpu: Take a reference to an exported BO

2020-05-06 Thread Christian König


Am 05.05.20 um 19:29 schrieb Felix Kuehling:


What we could probably do to detect this is adding a BUG_ON() in TTMs 
release function and checking if the GEM reference count is really dead.


The problem is, that we have to guess whether there are still any 
dmabuf references to the GEM BO. There is no way amdgpu can know that. 
You can't make amdgpu responsible for keeping a reference to the TTM 
BO while the GEM BO is still referenced by entities completely out of 
the control of amdgpu.




That sounds like you don't understand how this works on the graphics 
side, so here is a brief overview once more.


The last object still around is always the TTM BO, which can even be 
resurrected from the dead (kref_init() called again) if necessary for 
delayed delete.


Then we have the GEM object which holds a reference to the TTM BO. This 
reference gets dropped when the GEM object gets destroyed.


Then we optionally have the DMA-buf object for exported BOs which holds 
a reference to the GEM object and the driver.



This construct guarantees that the GEM object is never destroyed nor the 
driver unloaded before the DMA-buf object referencing it is destroyed.


Additional to that the reference from the GEM object to the TTM BO 
guarantees that the TTM BO is never destroyed before the GEM object is 
destroyed.


Another weird thing I see is that amdgpu_gem_free_object calls 
amdgpu_bo_unref. That implies that the GEM object conceptually holds a 
reference to the amdpu/TTM BO. But that is not really the case. Amdgpu 
never takes that reference that GEM is supposed to own. If it did, we 
would leak all our memory because nobody would ever drop that reference. 


See amdgpu_bo_do_create(), the GEM object is initialized with 
drm_gem_private_object_init() with a reference count of 1. Then the TTM 
BO is initialized with ttm_bo_init_reserved() with a reference count of 1.


In general there are two use cases here, the first one is userspace 
allocations with a GEM object handle. In this case the initial reference 
is owned by the GEM object and dropped in amdgpu_gem_free_object().


The other use case are kernel internal allocations like page tables and 
other general buffers. In this case the GEM object is ignored and the 
last TTM BO reference is dropped by calling amdgpu_bo_unref().


I think the correct solution is for amdgpu_bo_ref/unref to delegate 
its reference counting to drm_gem_object_get/put instead of 
ttm_bo_get/put. The amdgpu BO would hold one token reference to the 
TTM BO, which it can drop when the GEM BO refcount drops to 0. 
Finally, the amdgpu BO should only be freed once the TTM BO refcount 
also becomes 0.


Just the other way around, but yes the long term plan should probably 
be to merge the two.


I need a short term solution. Because I have a bug that causes a 
kernel oops with applications that are valid and correct, as far as I 
can tell.


I'm thinking a solution that doesn't require major changes to the way 
TTM and GEM interact would put amdgpu in charge of coordinating the 
two. Unfortunately that would mean adding a third reference count in 
amdgpu_bo, in addition to the ones in TTM and GEM. The amdgpu BO would 
hold one token reference to each of the GEM and TTM BO. When amdgpu 
refcount goes to 0 it releases that GEM BO token reference. When the 
GEM BO refcount goes  to 0, we get a callback amdgpu_gem_object_free. 
There we can drop the token reference to the TTM BO. Once the TTM BO 
reference goes to 0 we free the memory.


Does this sound feasible?



Well of hand it sounds like it makes the whole thing much more 
complicated without any gain. I probably still haven't understood what 
the core problem here is.


Regards,
Christian.


Regards,
  Felix




The difficult is currently we have a mismatch what locks could be 
taken when we drop the references.


Regards,
Christian.


Regards,
  Felix




Regards,
Christian.

Am 01.05.20 um 16:44 schrieb Felix Kuehling:


[dropping my gmail address]

We saw this backtrace showing the call chain while investigating a 
kernel oops caused by this issue on the DKMS branch with the KFD 
IPC API. It happens after a dma-buf file is released with fput:


[ 1255.049330] BUG: kernel NULL pointer dereference, address: 051e
[ 1255.049727] #PF: supervisor read access in kernel mode
[ 1255.050092] #PF: error_code(0x) - not-present page
[ 1255.050416] PGD 0 P4D 0
[ 1255.050736] Oops:  [#1] SMP PTI
[ 1255.051060] CPU: 27 PID: 2292 Comm: kworker/27:2 Tainted: G   OE 
5.3.0-46-generic #38~18.04.1-Ubuntu
[ 1255.051400] Hardware name: Supermicro SYS-4029GP-TRT2/X11DPG-OT-CPU, BIOS 
3.0a 02/26/2019
[ 1255.051752] Workqueue: events delayed_fput
[ 1255.052111] RIP: 0010:drm_gem_object_put_unlocked+0x1c/0x70 [drm]
[ 1255.052465] Code: 4d 80 c8 ee 0f 0b eb d8 66 0f 1f 44 00 00 0f 1f 44 00 00 48 85 
ff 74 34 55 48 89 e5 41 54 53 48 89 fb 48 8b 7f 08 48 8b 47 20 <48> 83 b8 a0 00 
00 00 00 74 1a 4c 8d 67 68 48 89 df 4c 89 e6

[PATCH -next] drm/amdgpu/navi10: fix unsigned comparison with 0

2020-05-06 Thread ChenTao

Fixes warning because size is uint32_t and can never be negtative

drivers/gpu/drm/amd/amdgpu/../powerplay/navi10_ppt.c:1296:12: warning:
comparison of unsigned expression < 0 is always false [-Wtype-limits]
   if (size < 0)

Reported-by: Hulk Robot 
Signed-off-by: ChenTao 
---
 drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
index 2184d247a9f7..0c9be864d072 100644
--- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
@@ -1293,8 +1293,6 @@ static int navi10_set_power_profile_mode(struct 
smu_context *smu, long *input, u
}
 
if (smu->power_profile_mode == PP_SMC_POWER_PROFILE_CUSTOM) {
-   if (size < 0)
-   return -EINVAL;
 
ret = smu_update_table(smu,
   SMU_TABLE_ACTIVITY_MONITOR_COEFF, 
WORKLOAD_PPLIB_CUSTOM_BIT,
-- 
2.22.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 4/4] drm/amdgpu: switch to common rlc_autoload helper

2020-05-06 Thread Chen, Guchun

[AMD Public Use]

One spelling typo in the commit message, 'functon' should be 'function'. Apart 
from this, series is: Reviewed-by: Guchun Chen 

' drop IP specific psp functon for rlc autoload since the autoload_supported '

Regards,
Guchun

-Original Message-
From: Hawking Zhang  
Sent: Wednesday, May 6, 2020 2:39 PM
To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
; Clements, John ; Gao, Likun 
; Chen, Guchun ; Li, Dennis 
; Zhou1, Tao 
Cc: Zhang, Hawking 
Subject: [PATCH 4/4] drm/amdgpu: switch to common rlc_autoload helper

drop IP specific psp functon for rlc autoload since the autoload_supported was 
introduced to mark ASICs that support rlc_autoload

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 3 ---  
drivers/gpu/drm/amd/amdgpu/psp_v11_0.c  | 6 --
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 19f09b4..5146542 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1646,7 +1646,7 @@ static int psp_np_fw_load(struct psp_context *psp)
/* Start rlc autoload after psp recieved all the gfx firmware */
if (psp->autoload_supported && ucode->ucode_id == 
(amdgpu_sriov_vf(adev) ?
AMDGPU_UCODE_ID_CP_MEC2 : AMDGPU_UCODE_ID_RLC_G)) {
-   ret = psp_rlc_autoload(psp);
+   ret = psp_rlc_autoload_start(psp);
if (ret) {
DRM_ERROR("Failed to start rlc autoload\n");
return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 46bd85f..2a56ad9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -95,7 +95,6 @@ struct psp_funcs
enum psp_ring_type ring_type);
bool (*smu_reload_quirk)(struct psp_context *psp);
int (*mode1_reset)(struct psp_context *psp);
-   int (*rlc_autoload_start)(struct psp_context *psp);
int (*mem_training_init)(struct psp_context *psp);
void (*mem_training_fini)(struct psp_context *psp);
int (*mem_training)(struct psp_context *psp, uint32_t ops); @@ -307,8 
+306,6 @@ struct amdgpu_psp_funcs {
((psp)->funcs->smu_reload_quirk ? 
(psp)->funcs->smu_reload_quirk((psp)) : false)  #define psp_mode1_reset(psp) \
((psp)->funcs->mode1_reset ? (psp)->funcs->mode1_reset((psp)) : 
false) -#define psp_rlc_autoload(psp) \
-   ((psp)->funcs->rlc_autoload_start ? 
(psp)->funcs->rlc_autoload_start((psp)) : 0)
 #define psp_mem_training_init(psp) \
((psp)->funcs->mem_training_init ? 
(psp)->funcs->mem_training_init((psp)) : 0)  #define psp_mem_training_fini(psp) 
\ diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
index cfbf30a..1de89cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
@@ -524,11 +524,6 @@ static int psp_v11_0_mode1_reset(struct psp_context *psp)
return 0;
 }
 
-static int psp_v11_0_rlc_autoload_start(struct psp_context *psp) -{
-   return psp_rlc_autoload_start(psp);
-}
-
 static int psp_v11_0_memory_training_send_msg(struct psp_context *psp, int 
msg)  {
int ret;
@@ -825,7 +820,6 @@ static const struct psp_funcs psp_v11_0_funcs = {
.ring_stop = psp_v11_0_ring_stop,
.ring_destroy = psp_v11_0_ring_destroy,
.mode1_reset = psp_v11_0_mode1_reset,
-   .rlc_autoload_start = psp_v11_0_rlc_autoload_start,
.mem_training_init = psp_v11_0_memory_training_init,
.mem_training_fini = psp_v11_0_memory_training_fini,
.mem_training = psp_v11_0_memory_training,
--
2.7.4
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

50 matches

Mail list logo