Re: [PATCH] drm/amdgpu: Avoid another list of reset devices

2022-08-03 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking

Get Outlook for iOS

From: Lazar, Lijo 
Sent: Wednesday, August 3, 2022 7:36:20 PM
To: amd-gfx@lists.freedesktop.org 
Cc: Zhang, Hawking ; Deucher, Alexander 

Subject: [PATCH] drm/amdgpu: Avoid another list of reset devices

A list of devices to be reset are already created in
amdgpu_device_gpu_recover function. Creating another list with the
same nodes is incorrect and not supported in list_head. Instead, pass
the device list as part of reset context.

Fixes: 9e08564727fc (drm/amdgpu: Refactor mode2 reset logic for v13.0.2)
Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 45 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
 3 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index c6cc493a5486..2b97b8a96fb4 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -148,30 +148,22 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,
   struct amdgpu_reset_context *reset_context)
 {
 struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+   struct list_head *reset_device_list = reset_context->reset_device_list;
 struct amdgpu_device *tmp_adev = NULL;
-   struct list_head reset_device_list;
 int r = 0;

 dev_dbg(adev->dev, "aldebaran perform hw reset\n");
+
+   if (reset_device_list == NULL)
+   return -EINVAL;
+
 if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
 reset_context->hive == NULL) {
 /* Wrong context, return error */
 return -EINVAL;
 }

-   INIT_LIST_HEAD(_device_list);
-   if (reset_context->hive) {
-   list_for_each_entry (tmp_adev,
-_context->hive->device_list,
-gmc.xgmi.head)
-   list_add_tail(_adev->reset_list,
- _device_list);
-   } else {
-   list_add_tail(_context->reset_req_dev->reset_list,
- _device_list);
-   }
-
-   list_for_each_entry (tmp_adev, _device_list, reset_list) {
+   list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
 mutex_lock(_adev->reset_cntl->reset_lock);
 tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
 }
@@ -179,7 +171,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,
  * Mode2 reset doesn't need any sync between nodes in XGMI hive, 
instead launch
  * them together so that they can be completed asynchronously on 
multiple nodes
  */
-   list_for_each_entry (tmp_adev, _device_list, reset_list) {
+   list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
 /* For XGMI run all resets in parallel to speed up the process 
*/
 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
 if (!queue_work(system_unbound_wq,
@@ -197,7 +189,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,

 /* For XGMI wait for all resets to complete before proceed */
 if (!r) {
-   list_for_each_entry (tmp_adev, _device_list, reset_list) {
+   list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
 flush_work(_adev->reset_cntl->reset_work);
 r = tmp_adev->asic_reset_res;
@@ -207,7 +199,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,
 }
 }

-   list_for_each_entry (tmp_adev, _device_list, reset_list) {
+   list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
 mutex_unlock(_adev->reset_cntl->reset_lock);
 tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
 }
@@ -339,10 +331,13 @@ static int
 aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
   struct amdgpu_reset_context *reset_context)
 {
+   struct list_head *reset_device_list = reset_context->reset_device_list;
 struct amdgpu_device *tmp_adev = NULL;
-   struct list_head reset_device_list;
 int r;

+   if (reset_device_list == NULL)
+   return -EINVAL;
+
 if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
 IP_VERSION(13, 0, 2) &&
 reset_context->hive == NULL) {
@@ -350,19 +345,7 @@ aldebaran_mode2_restore_hwcontext(struct 

Re: [PATCH] drm/amd/display: set panel orientation before drm_dev_register

2022-08-03 Thread Harry Wentland



On 2022-08-03 12:24, Melissa Wen wrote:
> To set the panel orientation property with quirk, we need the mode size
> provided by EDID. This info is available after EDID is read by 
> dc_link_detect()
> and updated by amdgpu_dm_update_connector_after_detect(). The detection
> happens at driver load in amdgpu_dm_initialize_drm_device() and,
> therefore, we can get modes and set panel orientation before
> drm_dev_register() to avoid DRM warns on creating the connector property
> after device registration:
> 
> [2.563969] [ cut here ]
> [2.563971] WARNING: CPU: 6 PID: 325 at 
> drivers/gpu/drm/drm_mode_object.c:45 drm_mode_object_add+0x72/0x80 [drm]
> [2.563997] Modules linked in: btusb btrtl btbcm btintel btmtk bluetooth 
> rfkill ecdh_generic ecc usbhid crc16 amdgpu(+) drm_ttm_helper ttm agpgart 
> gpu_sched i2c_algo_bit drm_display_helper drm_kms_helper syscopyarea 
> sysfillrect sysimgblt fb_sys_fops drm serio_raw sdhci_pci atkbd libps2 cqhci 
> vivaldi_fmap ccp sdhci i8042 crct10dif_pclmul crc32_pclmul hid_multitouch 
> ghash_clmulni_intel aesni_intel crypto_simd cryptd wdat_wdt mmc_core cec 
> xhci_pci sp5100_tco rng_core xhci_pci_renesas serio 8250_dw i2c_hid_acpi 
> i2c_hid btrfs blake2b_generic libcrc32c crc32c_generic crc32c_intel xor 
> raid6_pq dm_mirror dm_region_hash dm_log dm_mod pkcs8_key_parser crypto_user
> [2.564032] CPU: 6 PID: 325 Comm: systemd-udevd Not tainted 
> 5.18.0-amd-staging-drm-next+ #67
> [2.564034] Hardware name: Valve Jupiter/Jupiter, BIOS F7A0105 03/21/2022
> [2.564036] RIP: 0010:drm_mode_object_add+0x72/0x80 [drm]
> [2.564053] Code: f0 89 c3 85 c0 78 07 89 45 00 44 89 65 04 4c 89 ef e8 e2 
> 99 04 f1 31 c0 85 db 0f 4e c3 5b 5d 41 5c 41 5d c3 80 7f 50 00 74 ac <0f> 0b 
> eb a8 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 54 4c
> [2.564055] RSP: 0018:b2e880413860 EFLAGS: 00010202
> [2.564056] RAX: c0ba1440 RBX: 99508a860010 RCX: 
> 0001
> [2.564057] RDX: b0b0b0b0 RSI: 99508c050110 RDI: 
> 99508a860010
> [2.564058] RBP: 99508c050110 R08: 0020 R09: 
> 99508c292c20
> [2.564059] R10:  R11: 99508c0507d8 R12: 
> b0b0b0b0
> [2.564060] R13: 0004 R14: c068a4b6 R15: 
> c068a47f
> [2.564061] FS:  7fc69b5f1a40() GS:9953aff8() 
> knlGS:
> [2.564063] CS:  0010 DS:  ES:  CR0: 80050033
> [2.564063] CR2: 7f9506804000 CR3: 000107f92000 CR4: 
> 00350ee0
> [2.564065] Call Trace:
> [2.564068]  
> [2.564070]  drm_property_create+0xc9/0x170 [drm]
> [2.564088]  drm_property_create_enum+0x1f/0x70 [drm]
> [2.564105]  drm_connector_set_panel_orientation_with_quirk+0x96/0xc0 [drm]
> [2.564123]  get_modes+0x4fb/0x530 [amdgpu]
> [2.564378]  drm_helper_probe_single_connector_modes+0x1ad/0x850 
> [drm_kms_helper]
> [2.564390]  drm_client_modeset_probe+0x229/0x1400 [drm]
> [2.564411]  ? xas_store+0x52/0x5e0
> [2.564416]  ? kmem_cache_alloc_trace+0x177/0x2c0
> [2.564420]  __drm_fb_helper_initial_config_and_unlock+0x44/0x4e0 
> [drm_kms_helper]
> [2.564430]  drm_fbdev_client_hotplug+0x173/0x210 [drm_kms_helper]
> [2.564438]  drm_fbdev_generic_setup+0xa5/0x166 [drm_kms_helper]
> [2.564446]  amdgpu_pci_probe+0x35e/0x370 [amdgpu]
> [2.564621]  local_pci_probe+0x45/0x80
> [2.564625]  ? pci_match_device+0xd7/0x130
> [2.564627]  pci_device_probe+0xbf/0x220
> [2.564629]  ? sysfs_do_create_link_sd+0x69/0xd0
> [2.564633]  really_probe+0x19c/0x380
> [2.564637]  __driver_probe_device+0xfe/0x180
> [2.564639]  driver_probe_device+0x1e/0x90
> [2.564641]  __driver_attach+0xc0/0x1c0
> [2.564643]  ? __device_attach_driver+0xe0/0xe0
> [2.564644]  ? __device_attach_driver+0xe0/0xe0
> [2.564646]  bus_for_each_dev+0x78/0xc0
> [2.564648]  bus_add_driver+0x149/0x1e0
> [2.564650]  driver_register+0x8f/0xe0
> [2.564652]  ? 0xc1023000
> [2.564654]  do_one_initcall+0x44/0x200
> [2.564657]  ? kmem_cache_alloc_trace+0x177/0x2c0
> [2.564659]  do_init_module+0x4c/0x250
> [2.564663]  __do_sys_init_module+0x12e/0x1b0
> [2.564666]  do_syscall_64+0x3b/0x90
> [2.564670]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [2.564673] RIP: 0033:0x7fc69bff232e
> [2.564674] Code: 48 8b 0d 45 0b 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 
> 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 
> 01 f0 ff ff 73 01 c3 48 8b 0d 12 0b 0c 00 f7 d8 64 89 01 48
> [2.564676] RSP: 002b:7ffe872ba3e8 EFLAGS: 0246 ORIG_RAX: 
> 00af
> [2.564677] RAX: ffda RBX: 55873f797820 RCX: 
> 7fc69bff232e
> [2.564678] RDX: 55873f7bf390 RSI: 01155e81 RDI: 
> 7fc699e4d010
> [2.564679] RBP: 7fc699e4d010 R08: 55873f7bfe20 R09: 
> 01155e90
> [

Re: [PATCH v2] drm/amd/display: Fix vblank refcount in vrr transition

2022-08-03 Thread Harry Wentland
On 2022-07-28 09:49, Yunxiang Li wrote:
> manage_dm_interrupts disable/enable vblank using drm_crtc_vblank_off/on
> which causes drm_crtc_vblank_get in vrr_transition to fail, and later
> when drm_crtc_vblank_put is called the refcount on vblank will be messed
> up. Therefore move the call to after manage_dm_interrupts.
> 
> Unchecked calls to drm_crtc_vblank_get seems to be common in other
> drivers as well so it may make sense to let get always succeed during
> modset, see
> https://lists.freedesktop.org/archives/dri-devel/2022-July/365589.html
> 

Thanks for this patch.

This change makes sense to me but I would love Mario and/or Nick to take
a look as well, if possible. Would prefer if we hold off on the merge and
give them a chance to chime in.

Reviewed-by: Harry Wentland 

Harry

> Signed-off-by: Yunxiang Li 
> ---
> v2: check the return code for calls that might fail and warn on them  
> 
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 34 ---
>  1 file changed, 14 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 92470a0e0262..9f3fdb92e7a4 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -7480,15 +7480,15 @@ static void amdgpu_dm_handle_vrr_transition(struct 
> dm_crtc_state *old_state,
>* We also need vupdate irq for the actual core vblank handling
>* at end of vblank.
>*/
> - dm_set_vupdate_irq(new_state->base.crtc, true);
> - drm_crtc_vblank_get(new_state->base.crtc);
> + WARN_ON(dm_set_vupdate_irq(new_state->base.crtc, true) != 0);
> + WARN_ON(drm_crtc_vblank_get(new_state->base.crtc) != 0);
>   DRM_DEBUG_DRIVER("%s: crtc=%u VRR off->on: Get vblank ref\n",
>__func__, new_state->base.crtc->base.id);
>   } else if (old_vrr_active && !new_vrr_active) {
>   /* Transition VRR active -> inactive:
>* Allow vblank irq disable again for fixed refresh rate.
>*/
> - dm_set_vupdate_irq(new_state->base.crtc, false);
> + WARN_ON(dm_set_vupdate_irq(new_state->base.crtc, false) != 0);
>   drm_crtc_vblank_put(new_state->base.crtc);
>   DRM_DEBUG_DRIVER("%s: crtc=%u VRR on->off: Drop vblank ref\n",
>__func__, new_state->base.crtc->base.id);
> @@ -8252,23 +8252,6 @@ static void amdgpu_dm_atomic_commit_tail(struct 
> drm_atomic_state *state)
>   mutex_unlock(>dc_lock);
>   }
>  
> - /* Count number of newly disabled CRTCs for dropping PM refs later. */
> - for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state,
> -   new_crtc_state, i) {
> - if (old_crtc_state->active && !new_crtc_state->active)
> - crtc_disable_count++;
> -
> - dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
> - dm_old_crtc_state = to_dm_crtc_state(old_crtc_state);
> -
> - /* For freesync config update on crtc state and params for irq 
> */
> - update_stream_irq_parameters(dm, dm_new_crtc_state);
> -
> - /* Handle vrr on->off / off->on transitions */
> - amdgpu_dm_handle_vrr_transition(dm_old_crtc_state,
> - dm_new_crtc_state);
> - }
> -
>   /**
>* Enable interrupts for CRTCs that are newly enabled or went through
>* a modeset. It was intentionally deferred until after the front end
> @@ -8287,7 +8270,15 @@ static void amdgpu_dm_atomic_commit_tail(struct 
> drm_atomic_state *state)
>   cur_crc_src = acrtc->dm_irq_params.crc_src;
>   spin_unlock_irqrestore(_to_drm(adev)->event_lock, flags);
>  #endif
> + /* Count number of newly disabled CRTCs for dropping PM refs 
> later. */
> + if (old_crtc_state->active && !new_crtc_state->active)
> + crtc_disable_count++;
> +
>   dm_new_crtc_state = to_dm_crtc_state(new_crtc_state);
> + dm_old_crtc_state = to_dm_crtc_state(old_crtc_state);
> +
> + /* For freesync config update on crtc state and params for irq 
> */
> + update_stream_irq_parameters(dm, dm_new_crtc_state);
>  
>   if (new_crtc_state->active &&
>   (!old_crtc_state->active ||
> @@ -8324,6 +8315,9 @@ static void amdgpu_dm_atomic_commit_tail(struct 
> drm_atomic_state *state)
>   DRM_DEBUG_DRIVER("Failed to configure 
> crc source");
>  #endif
>   }
> +
> + /* Handle vrr on->off / off->on transitions */
> + amdgpu_dm_handle_vrr_transition(dm_old_crtc_state, 
> dm_new_crtc_state);
>   }
>  
>   

Re: [RESEND RFC 18/18] drm/display/dp_mst: Move all payload info into the atomic state

2022-08-03 Thread Lyude Paul
On Tue, 2022-07-05 at 09:10 +, Lin, Wayne wrote:
> > +struct drm_dp_mst_port;
> > +
> >   /* DP MST stream allocation (payload bandwidth number) */
> >   struct dc_dp_mst_stream_allocation {
> >    uint8_t vcp_id;
> >    /* number of slots required for the DP stream in
> >    * transport packet */
> >    uint8_t slot_count;
> > + /* The MST port this is on, this is used to associate DC MST payloads
> > with their
> > + * respective DRM payloads allocations, and can be ignored on non-
> > Linux.
> > + */
> 
> Is it necessary for adding this new member? Since this is for setting the DC
> HW and not relating to drm.

I don't entirely know, honestly. The reasons I did it:

 * Mapping things from DRM to DC and from DC to DRM is really confusing for
   outside contributors like myself, so it wasn't even really clear to me if
   there was another way to reconstruct the DRM context from the spots where
   we call from DC up to DM (not a typo, see next point).
 * These DC structs for MST are already layer mixing as far as I can tell,
   just not in an immediately obvious way. While this struct itself is for DC,
   there's multiple spots where we pass the DC payload structs down from DM to
   DC, then pass them back up from DC to DM and have to figure out how to
   reconstruct the DRM context that we actually need to use the MST helpers
   from that. So, that kind of further complicates the confusion of where
   layers should be separated.
 * As far as I'm aware with C there shouldn't be any issue with adding a
   pointer to a struct whose contents are undefined. IMHO, this is also
   preferable to just using void* because then at least you get some hint as
   to the actual type of the data and avoid the possibility of casting it to
   the wrong type. So tl;dr, on any platform even outside of Linux with a
   reasonably compliant compiler this should still build just fine. It'll even
   give you the added bonus of warning people if they try to access the
   contents of this member in DC on non-Linux platforms. If void* is preferred
   though I'm fine with switching it to that.

-- 
Cheers, Lyude Paul (she/her) Software Engineer at Red Hat



Re: [PATCH v4 00/41] DYNDBG: opt-in class'd debug for modules, use in drm.

2022-08-03 Thread jim . cromie
On Wed, Jul 20, 2022 at 9:32 AM Jim Cromie  wrote:
>

> Hi Jason, Greg, DRM-folk,
>
> This adds 'typed' "class FOO" support to dynamic-debug, where 'typed'
> means either DISJOINT (like drm debug categories), or VERBOSE (like
> nouveau debug-levels).  Use it in DRM modules: core, helpers, and in
> drivers i915, amdgpu, nouveau.
>

This revision fell over, on a conflict with something in drm-MUMBLE

Error: patch 
https://patchwork.freedesktop.org/api/1.0/series/106427/revisions/2/mbox/
not applied
Applying: dyndbg: fix static_branch manipulation
Applying: dyndbg: fix module.dyndbg handling
Applying: dyndbg: show both old and new in change-info
Applying: dyndbg: reverse module walk in cat control
Applying: dyndbg: reverse module.callsite walk in cat control
Applying: dyndbg: use ESCAPE_SPACE for cat control
Applying: dyndbg: let query-modname override actual module name
Applying: dyndbg: add test_dynamic_debug module
Applying: dyndbg: drop EXPORTed dynamic_debug_exec_queries

Jason,
those above are decent maintenance patches, particularly the drop export.
It would be nice to trim this unused api this cycle.

Applying: dyndbg: add class_id to pr_debug callsites
Applying: dyndbg: add __pr_debug_cls for testing
Applying: dyndbg: add DECLARE_DYNDBG_CLASSMAP
Applying: kernel/module: add __dyndbg_classes section
Applying: dyndbg: add ddebug_attach_module_classes
Applying: dyndbg: validate class FOO by checking with module
Applying: dyndbg: add drm.debug style bitmap support
Applying: dyndbg: test DECLARE_DYNDBG_CLASSMAP, sysfs nodes
Applying: doc-dyndbg: describe "class CLASS_NAME" query support
Applying: doc-dyndbg: edit dynamic-debug-howto for brevity, audience
Applying: drm_print: condense enum drm_debug_category
Applying: drm: POC drm on dyndbg - use in core, 2 helpers, 3 drivers.
Applying: drm_print: interpose drm_*dbg with forwarding macros
Applying: drm_print: wrap drm_*_dbg in dyndbg descriptor factory macro
Using index info to reconstruct a base tree...
M drivers/gpu/drm/Kconfig
M drivers/gpu/drm/Makefile
Falling back to patching base and 3-way merge...
Auto-merging drivers/gpu/drm/Makefile
Auto-merging drivers/gpu/drm/Kconfig
CONFLICT (content): Merge conflict in drivers/gpu/drm/Kconfig
error: Failed to merge in the changes.


Before I resend, I should sort out that possible conflict
which tree is patchwork applied upon ?

or was it just transient ? after 5.19 I rebased a copy onto drm-next/drm-next,
and there was nothing to fix - I will revisit presently..


[PATCH] drm/amdkfd: Allocate doorbells only when needed

2022-08-03 Thread Felix Kuehling
Only allocate doorbells when the first queue is created on a GPU or the
doorbells need to be mapped into CPU or GPU virtual address space. This
avoids allocating doorbells unnecessarily and can allow more processes
to use KFD on multi-GPU systems.

Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 13 +
 drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c |  9 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  5 -
 3 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 2b3d8bc8f0aa..907f4711abce 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -327,6 +327,12 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
goto err_bind_process;
}
 
+   if (!pdd->doorbell_index &&
+   kfd_alloc_process_doorbells(dev, >doorbell_index) < 0) {
+   err = -ENOMEM;
+   goto err_alloc_doorbells;
+   }
+
/* Starting with GFX11, wptr BOs must be mapped to GART for MES to 
determine work
 * on unmapped queues for usermode queue oversubscription (no 
aggregated doorbell)
 */
@@ -404,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
if (wptr_bo)
amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
 err_wptr_map_gart:
+err_alloc_doorbells:
 err_bind_process:
 err_pdd:
mutex_unlock(>mutex);
@@ -1092,6 +1099,10 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file 
*filep,
goto err_unlock;
}
offset = kfd_get_process_doorbells(pdd);
+   if (!offset) {
+   err = -ENOMEM;
+   goto err_unlock;
+   }
} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
if (args->size != PAGE_SIZE) {
err = -EINVAL;
@@ -2173,6 +2184,8 @@ static int criu_restore_memory_of_gpu(struct 
kfd_process_device *pdd,
return -EINVAL;
 
offset = kfd_get_process_doorbells(pdd);
+   if (!offset)
+   return -ENOMEM;
} else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) 
{
/* MMIO BOs need remapped bus address */
if (bo_bucket->size != PAGE_SIZE) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
index cb3d2ccc5100..b33798f89ef0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c
@@ -157,6 +157,8 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct 
kfd_process *process,
 
/* Calculate physical address of doorbell */
address = kfd_get_process_doorbells(pdd);
+   if (!address)
+   return -ENOMEM;
vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
VM_DONTDUMP | VM_PFNMAP;
 
@@ -275,6 +277,13 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
 
 phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd)
 {
+   if (!pdd->doorbell_index) {
+   int r = kfd_alloc_process_doorbells(pdd->dev,
+   >doorbell_index);
+   if (r)
+   return 0;
+   }
+
return pdd->dev->doorbell_base +
pdd->doorbell_index * kfd_doorbell_process_slice(pdd->dev);
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 6c83a519b3a1..951b63677248 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1499,11 +1499,6 @@ struct kfd_process_device 
*kfd_create_process_device_data(struct kfd_dev *dev,
if (!pdd)
return NULL;
 
-   if (kfd_alloc_process_doorbells(dev, >doorbell_index) < 0) {
-   pr_err("Failed to alloc doorbell for pdd\n");
-   goto err_free_pdd;
-   }
-
if (init_doorbell_bitmap(>qpd, dev)) {
pr_err("Failed to init doorbell for process\n");
goto err_free_pdd;
-- 
2.32.0



Re: [PATCH] drm/amd/display: set panel orientation before drm_dev_register

2022-08-03 Thread Melissa Wen
On 08/03, Melissa Wen wrote:
> To set the panel orientation property with quirk, we need the mode size
> provided by EDID. This info is available after EDID is read by 
> dc_link_detect()
> and updated by amdgpu_dm_update_connector_after_detect(). The detection
> happens at driver load in amdgpu_dm_initialize_drm_device() and,
> therefore, we can get modes and set panel orientation before
> drm_dev_register() to avoid DRM warns on creating the connector property
> after device registration:

+ cc'ing: Simon Ser

> 
> [2.563969] [ cut here ]
> [2.563971] WARNING: CPU: 6 PID: 325 at 
> drivers/gpu/drm/drm_mode_object.c:45 drm_mode_object_add+0x72/0x80 [drm]
> [2.563997] Modules linked in: btusb btrtl btbcm btintel btmtk bluetooth 
> rfkill ecdh_generic ecc usbhid crc16 amdgpu(+) drm_ttm_helper ttm agpgart 
> gpu_sched i2c_algo_bit drm_display_helper drm_kms_helper syscopyarea 
> sysfillrect sysimgblt fb_sys_fops drm serio_raw sdhci_pci atkbd libps2 cqhci 
> vivaldi_fmap ccp sdhci i8042 crct10dif_pclmul crc32_pclmul hid_multitouch 
> ghash_clmulni_intel aesni_intel crypto_simd cryptd wdat_wdt mmc_core cec 
> xhci_pci sp5100_tco rng_core xhci_pci_renesas serio 8250_dw i2c_hid_acpi 
> i2c_hid btrfs blake2b_generic libcrc32c crc32c_generic crc32c_intel xor 
> raid6_pq dm_mirror dm_region_hash dm_log dm_mod pkcs8_key_parser crypto_user
> [2.564032] CPU: 6 PID: 325 Comm: systemd-udevd Not tainted 
> 5.18.0-amd-staging-drm-next+ #67
> [2.564034] Hardware name: Valve Jupiter/Jupiter, BIOS F7A0105 03/21/2022
> [2.564036] RIP: 0010:drm_mode_object_add+0x72/0x80 [drm]
> [2.564053] Code: f0 89 c3 85 c0 78 07 89 45 00 44 89 65 04 4c 89 ef e8 e2 
> 99 04 f1 31 c0 85 db 0f 4e c3 5b 5d 41 5c 41 5d c3 80 7f 50 00 74 ac <0f> 0b 
> eb a8 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 54 4c
> [2.564055] RSP: 0018:b2e880413860 EFLAGS: 00010202
> [2.564056] RAX: c0ba1440 RBX: 99508a860010 RCX: 
> 0001
> [2.564057] RDX: b0b0b0b0 RSI: 99508c050110 RDI: 
> 99508a860010
> [2.564058] RBP: 99508c050110 R08: 0020 R09: 
> 99508c292c20
> [2.564059] R10:  R11: 99508c0507d8 R12: 
> b0b0b0b0
> [2.564060] R13: 0004 R14: c068a4b6 R15: 
> c068a47f
> [2.564061] FS:  7fc69b5f1a40() GS:9953aff8() 
> knlGS:
> [2.564063] CS:  0010 DS:  ES:  CR0: 80050033
> [2.564063] CR2: 7f9506804000 CR3: 000107f92000 CR4: 
> 00350ee0
> [2.564065] Call Trace:
> [2.564068]  
> [2.564070]  drm_property_create+0xc9/0x170 [drm]
> [2.564088]  drm_property_create_enum+0x1f/0x70 [drm]
> [2.564105]  drm_connector_set_panel_orientation_with_quirk+0x96/0xc0 [drm]
> [2.564123]  get_modes+0x4fb/0x530 [amdgpu]
> [2.564378]  drm_helper_probe_single_connector_modes+0x1ad/0x850 
> [drm_kms_helper]
> [2.564390]  drm_client_modeset_probe+0x229/0x1400 [drm]
> [2.564411]  ? xas_store+0x52/0x5e0
> [2.564416]  ? kmem_cache_alloc_trace+0x177/0x2c0
> [2.564420]  __drm_fb_helper_initial_config_and_unlock+0x44/0x4e0 
> [drm_kms_helper]
> [2.564430]  drm_fbdev_client_hotplug+0x173/0x210 [drm_kms_helper]
> [2.564438]  drm_fbdev_generic_setup+0xa5/0x166 [drm_kms_helper]
> [2.564446]  amdgpu_pci_probe+0x35e/0x370 [amdgpu]
> [2.564621]  local_pci_probe+0x45/0x80
> [2.564625]  ? pci_match_device+0xd7/0x130
> [2.564627]  pci_device_probe+0xbf/0x220
> [2.564629]  ? sysfs_do_create_link_sd+0x69/0xd0
> [2.564633]  really_probe+0x19c/0x380
> [2.564637]  __driver_probe_device+0xfe/0x180
> [2.564639]  driver_probe_device+0x1e/0x90
> [2.564641]  __driver_attach+0xc0/0x1c0
> [2.564643]  ? __device_attach_driver+0xe0/0xe0
> [2.564644]  ? __device_attach_driver+0xe0/0xe0
> [2.564646]  bus_for_each_dev+0x78/0xc0
> [2.564648]  bus_add_driver+0x149/0x1e0
> [2.564650]  driver_register+0x8f/0xe0
> [2.564652]  ? 0xc1023000
> [2.564654]  do_one_initcall+0x44/0x200
> [2.564657]  ? kmem_cache_alloc_trace+0x177/0x2c0
> [2.564659]  do_init_module+0x4c/0x250
> [2.564663]  __do_sys_init_module+0x12e/0x1b0
> [2.564666]  do_syscall_64+0x3b/0x90
> [2.564670]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [2.564673] RIP: 0033:0x7fc69bff232e
> [2.564674] Code: 48 8b 0d 45 0b 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 
> 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 
> 01 f0 ff ff 73 01 c3 48 8b 0d 12 0b 0c 00 f7 d8 64 89 01 48
> [2.564676] RSP: 002b:7ffe872ba3e8 EFLAGS: 0246 ORIG_RAX: 
> 00af
> [2.564677] RAX: ffda RBX: 55873f797820 RCX: 
> 7fc69bff232e
> [2.564678] RDX: 55873f7bf390 RSI: 01155e81 RDI: 
> 7fc699e4d010
> [2.564679] RBP: 7fc699e4d010 R08: 55873f7bfe20 R09: 
> 01155e90
> [

[PATCH] drm/amd/display: set panel orientation before drm_dev_register

2022-08-03 Thread Melissa Wen
To set the panel orientation property with quirk, we need the mode size
provided by EDID. This info is available after EDID is read by dc_link_detect()
and updated by amdgpu_dm_update_connector_after_detect(). The detection
happens at driver load in amdgpu_dm_initialize_drm_device() and,
therefore, we can get modes and set panel orientation before
drm_dev_register() to avoid DRM warns on creating the connector property
after device registration:

[2.563969] [ cut here ]
[2.563971] WARNING: CPU: 6 PID: 325 at drivers/gpu/drm/drm_mode_object.c:45 
drm_mode_object_add+0x72/0x80 [drm]
[2.563997] Modules linked in: btusb btrtl btbcm btintel btmtk bluetooth 
rfkill ecdh_generic ecc usbhid crc16 amdgpu(+) drm_ttm_helper ttm agpgart 
gpu_sched i2c_algo_bit drm_display_helper drm_kms_helper syscopyarea 
sysfillrect sysimgblt fb_sys_fops drm serio_raw sdhci_pci atkbd libps2 cqhci 
vivaldi_fmap ccp sdhci i8042 crct10dif_pclmul crc32_pclmul hid_multitouch 
ghash_clmulni_intel aesni_intel crypto_simd cryptd wdat_wdt mmc_core cec 
xhci_pci sp5100_tco rng_core xhci_pci_renesas serio 8250_dw i2c_hid_acpi 
i2c_hid btrfs blake2b_generic libcrc32c crc32c_generic crc32c_intel xor 
raid6_pq dm_mirror dm_region_hash dm_log dm_mod pkcs8_key_parser crypto_user
[2.564032] CPU: 6 PID: 325 Comm: systemd-udevd Not tainted 
5.18.0-amd-staging-drm-next+ #67
[2.564034] Hardware name: Valve Jupiter/Jupiter, BIOS F7A0105 03/21/2022
[2.564036] RIP: 0010:drm_mode_object_add+0x72/0x80 [drm]
[2.564053] Code: f0 89 c3 85 c0 78 07 89 45 00 44 89 65 04 4c 89 ef e8 e2 
99 04 f1 31 c0 85 db 0f 4e c3 5b 5d 41 5c 41 5d c3 80 7f 50 00 74 ac <0f> 0b eb 
a8 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 54 4c
[2.564055] RSP: 0018:b2e880413860 EFLAGS: 00010202
[2.564056] RAX: c0ba1440 RBX: 99508a860010 RCX: 0001
[2.564057] RDX: b0b0b0b0 RSI: 99508c050110 RDI: 99508a860010
[2.564058] RBP: 99508c050110 R08: 0020 R09: 99508c292c20
[2.564059] R10:  R11: 99508c0507d8 R12: b0b0b0b0
[2.564060] R13: 0004 R14: c068a4b6 R15: c068a47f
[2.564061] FS:  7fc69b5f1a40() GS:9953aff8() 
knlGS:
[2.564063] CS:  0010 DS:  ES:  CR0: 80050033
[2.564063] CR2: 7f9506804000 CR3: 000107f92000 CR4: 00350ee0
[2.564065] Call Trace:
[2.564068]  
[2.564070]  drm_property_create+0xc9/0x170 [drm]
[2.564088]  drm_property_create_enum+0x1f/0x70 [drm]
[2.564105]  drm_connector_set_panel_orientation_with_quirk+0x96/0xc0 [drm]
[2.564123]  get_modes+0x4fb/0x530 [amdgpu]
[2.564378]  drm_helper_probe_single_connector_modes+0x1ad/0x850 
[drm_kms_helper]
[2.564390]  drm_client_modeset_probe+0x229/0x1400 [drm]
[2.564411]  ? xas_store+0x52/0x5e0
[2.564416]  ? kmem_cache_alloc_trace+0x177/0x2c0
[2.564420]  __drm_fb_helper_initial_config_and_unlock+0x44/0x4e0 
[drm_kms_helper]
[2.564430]  drm_fbdev_client_hotplug+0x173/0x210 [drm_kms_helper]
[2.564438]  drm_fbdev_generic_setup+0xa5/0x166 [drm_kms_helper]
[2.564446]  amdgpu_pci_probe+0x35e/0x370 [amdgpu]
[2.564621]  local_pci_probe+0x45/0x80
[2.564625]  ? pci_match_device+0xd7/0x130
[2.564627]  pci_device_probe+0xbf/0x220
[2.564629]  ? sysfs_do_create_link_sd+0x69/0xd0
[2.564633]  really_probe+0x19c/0x380
[2.564637]  __driver_probe_device+0xfe/0x180
[2.564639]  driver_probe_device+0x1e/0x90
[2.564641]  __driver_attach+0xc0/0x1c0
[2.564643]  ? __device_attach_driver+0xe0/0xe0
[2.564644]  ? __device_attach_driver+0xe0/0xe0
[2.564646]  bus_for_each_dev+0x78/0xc0
[2.564648]  bus_add_driver+0x149/0x1e0
[2.564650]  driver_register+0x8f/0xe0
[2.564652]  ? 0xc1023000
[2.564654]  do_one_initcall+0x44/0x200
[2.564657]  ? kmem_cache_alloc_trace+0x177/0x2c0
[2.564659]  do_init_module+0x4c/0x250
[2.564663]  __do_sys_init_module+0x12e/0x1b0
[2.564666]  do_syscall_64+0x3b/0x90
[2.564670]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[2.564673] RIP: 0033:0x7fc69bff232e
[2.564674] Code: 48 8b 0d 45 0b 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 
0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 af 00 00 00 0f 05 <48> 3d 01 
f0 ff ff 73 01 c3 48 8b 0d 12 0b 0c 00 f7 d8 64 89 01 48
[2.564676] RSP: 002b:7ffe872ba3e8 EFLAGS: 0246 ORIG_RAX: 
00af
[2.564677] RAX: ffda RBX: 55873f797820 RCX: 7fc69bff232e
[2.564678] RDX: 55873f7bf390 RSI: 01155e81 RDI: 7fc699e4d010
[2.564679] RBP: 7fc699e4d010 R08: 55873f7bfe20 R09: 01155e90
[2.564680] R10: 00055873f7bf R11: 0246 R12: 55873f7bf390
[2.564681] R13: 000d R14: 55873f7c4cb0 R15: 55873f797820
[2.564683]  
[2.564683] ---[ end trace  ]---
[2.564696] 

RE: [PATCH] Increase tlb flush timeout for sriov

2022-08-03 Thread Wong, Alice
[AMD Official Use Only - General]


-Original Message-
From: amd-gfx  On Behalf Of Dusica 
Milinkovic
Sent: August 3, 2022 5:02 AM
To: amd-gfx@lists.freedesktop.org
Cc: Milinkovic, Dusica 
Subject: [PATCH] Increase tlb flush timeout for sriov

Signed-off-by: Dusica Milinkovic 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +-  
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 9ae8cdaa033e..6ab7d329916f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -419,6 +419,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
uint32_t seq;
uint16_t queried_pasid;
bool ret;
+   uint32_t sriov_usec_timeout = 600;  /* wait for 12 * 500ms for 
+SRIOV */
struct amdgpu_ring *ring = >gfx.kiq.ring;
struct amdgpu_kiq *kiq = >gfx.kiq;
 
@@ -437,7 +438,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
 
amdgpu_ring_commit(ring);
spin_unlock(>gfx.kiq.ring_lock);
-   r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+   if (amdgpu_sriov_vf(adev))
+   r = amdgpu_fence_wait_polling(ring, seq, 
sriov_usec_timeout);
+   else
+   r = amdgpu_fence_wait_polling(ring, seq, 
adev->usec_timeout);
if (r < 1) {
dev_err(adev->dev, "wait for kiq fence error: %ld.\n", 
r);
return -ETIME;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 22761a3bb818..941a6b52fa72 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -896,6 +896,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
uint32_t seq;
uint16_t queried_pasid;
bool ret;
+   uint32_t sriov_usec_timeout = 600;  /* wait for 12 * 500ms for 
+SRIOV */
struct amdgpu_ring *ring = >gfx.kiq.ring;
struct amdgpu_kiq *kiq = >gfx.kiq;
 
@@ -935,7 +936,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
 
amdgpu_ring_commit(ring);
spin_unlock(>gfx.kiq.ring_lock);
-   r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+   if (amdgpu_sriov_vf(adev))
+   r = amdgpu_fence_wait_polling(ring, seq, 
sriov_usec_timeout);
+   else
+   r = amdgpu_fence_wait_polling(ring, seq, 
adev->usec_timeout);
if (r < 1) {
dev_err(adev->dev, "wait for kiq fence error: %ld.\n", 
r);
up_read(>reset_domain->sem);
--
2.25.1


Re: [PATCH v3 1/6] drm/amdgpu: add mode2 reset for sienna_cichlid

2022-08-03 Thread Andrey Grodzovsky

Series is Acked-by: Andrey Grodzovsky 

Andrey

On 2022-08-01 00:07, Victor Zhao wrote:

To meet the requirement for multi container usecase which needs
a quicker reset and not causing VRAM lost, adding the Mode2
reset handler for sienna_cichlid.

v2: move skip mode2 flag part separately

v3: remove the use of asic_reset_res

Signed-off-by: Victor Zhao 
---
  drivers/gpu/drm/amd/amdgpu/Makefile   |   2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |   7 +
  drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c   | 296 ++
  drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h   |  32 ++
  .../pm/swsmu/inc/pmfw_if/smu_v11_0_7_ppsmc.h  |   4 +-
  drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |   3 +-
  .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   |  54 
  7 files changed, 394 insertions(+), 4 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
  create mode 100644 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index c7d0cd15b5ef..7030ac2d7d2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -75,7 +75,7 @@ amdgpu-y += \
vi.o mxgpu_vi.o nbio_v6_1.o soc15.o emu_soc.o mxgpu_ai.o nbio_v7_0.o 
vega10_reg_init.o \
vega20_reg_init.o nbio_v7_4.o nbio_v2_3.o nv.o arct_reg_init.o 
mxgpu_nv.o \
nbio_v7_2.o hdp_v4_0.o hdp_v5_0.o aldebaran_reg_init.o aldebaran.o 
soc21.o \
-   nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o lsdma_v6_0.o
+   sienna_cichlid.o nbio_v4_3.o hdp_v6_0.o nbio_v7_7.o hdp_v5_2.o 
lsdma_v6_0.o
  
  # add DF block

  amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 32c86a0b145c..f778466bb9db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -23,6 +23,7 @@
  
  #include "amdgpu_reset.h"

  #include "aldebaran.h"
+#include "sienna_cichlid.h"
  
  int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,

 struct amdgpu_reset_handler *handler)
@@ -40,6 +41,9 @@ int amdgpu_reset_init(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 2):
ret = aldebaran_reset_init(adev);
break;
+   case IP_VERSION(11, 0, 7):
+   ret = sienna_cichlid_reset_init(adev);
+   break;
default:
break;
}
@@ -55,6 +59,9 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 2):
ret = aldebaran_reset_fini(adev);
break;
+   case IP_VERSION(11, 0, 7):
+   ret = sienna_cichlid_reset_fini(adev);
+   break;
default:
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c 
b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
new file mode 100644
index ..b61a8ddec7ef
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "sienna_cichlid.h"
+#include "amdgpu_reset.h"
+#include "amdgpu_amdkfd.h"
+#include "amdgpu_dpm.h"
+#include "amdgpu_job.h"
+#include "amdgpu_ring.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_psp.h"
+#include "amdgpu_xgmi.h"
+
+static struct amdgpu_reset_handler *
+sienna_cichlid_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
+   struct amdgpu_reset_context *reset_context)
+{
+   struct amdgpu_reset_handler *handler;
+   struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+
+   if (reset_context->method != AMD_RESET_METHOD_NONE) {
+   list_for_each_entry(handler, _ctl->reset_handlers,
+handler_list) {
+   if 

[PATCH] drm/amdgpu: Avoid another list of reset devices

2022-08-03 Thread Lijo Lazar
A list of devices to be reset are already created in
amdgpu_device_gpu_recover function. Creating another list with the
same nodes is incorrect and not supported in list_head. Instead, pass
the device list as part of reset context.

Fixes: 9e08564727fc (drm/amdgpu: Refactor mode2 reset logic for v13.0.2)
Signed-off-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/aldebaran.c | 45 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
 3 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/aldebaran.c 
b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
index c6cc493a5486..2b97b8a96fb4 100644
--- a/drivers/gpu/drm/amd/amdgpu/aldebaran.c
+++ b/drivers/gpu/drm/amd/amdgpu/aldebaran.c
@@ -148,30 +148,22 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,
  struct amdgpu_reset_context *reset_context)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
+   struct list_head *reset_device_list = reset_context->reset_device_list;
struct amdgpu_device *tmp_adev = NULL;
-   struct list_head reset_device_list;
int r = 0;
 
dev_dbg(adev->dev, "aldebaran perform hw reset\n");
+
+   if (reset_device_list == NULL)
+   return -EINVAL;
+
if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
reset_context->hive == NULL) {
/* Wrong context, return error */
return -EINVAL;
}
 
-   INIT_LIST_HEAD(_device_list);
-   if (reset_context->hive) {
-   list_for_each_entry (tmp_adev,
-_context->hive->device_list,
-gmc.xgmi.head)
-   list_add_tail(_adev->reset_list,
- _device_list);
-   } else {
-   list_add_tail(_context->reset_req_dev->reset_list,
- _device_list);
-   }
-
-   list_for_each_entry (tmp_adev, _device_list, reset_list) {
+   list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
mutex_lock(_adev->reset_cntl->reset_lock);
tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
}
@@ -179,7 +171,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,
 * Mode2 reset doesn't need any sync between nodes in XGMI hive, 
instead launch
 * them together so that they can be completed asynchronously on 
multiple nodes
 */
-   list_for_each_entry (tmp_adev, _device_list, reset_list) {
+   list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
/* For XGMI run all resets in parallel to speed up the process 
*/
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
if (!queue_work(system_unbound_wq,
@@ -197,7 +189,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,
 
/* For XGMI wait for all resets to complete before proceed */
if (!r) {
-   list_for_each_entry (tmp_adev, _device_list, reset_list) {
+   list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
flush_work(_adev->reset_cntl->reset_work);
r = tmp_adev->asic_reset_res;
@@ -207,7 +199,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control 
*reset_ctl,
}
}
 
-   list_for_each_entry (tmp_adev, _device_list, reset_list) {
+   list_for_each_entry(tmp_adev, reset_device_list, reset_list) {
mutex_unlock(_adev->reset_cntl->reset_lock);
tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
}
@@ -339,10 +331,13 @@ static int
 aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
  struct amdgpu_reset_context *reset_context)
 {
+   struct list_head *reset_device_list = reset_context->reset_device_list;
struct amdgpu_device *tmp_adev = NULL;
-   struct list_head reset_device_list;
int r;
 
+   if (reset_device_list == NULL)
+   return -EINVAL;
+
if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
IP_VERSION(13, 0, 2) &&
reset_context->hive == NULL) {
@@ -350,19 +345,7 @@ aldebaran_mode2_restore_hwcontext(struct 
amdgpu_reset_control *reset_ctl,
return -EINVAL;
}
 
-   INIT_LIST_HEAD(_device_list);
-   if (reset_context->hive) {
-   list_for_each_entry (tmp_adev,
-_context->hive->device_list,
-gmc.xgmi.head)
-   list_add_tail(_adev->reset_list,
-  

RE: [PATCH 2/2] drm/amd/pm: Fix a potential gpu_metrics_table memory leak

2022-08-03 Thread Quan, Evan
[AMD Official Use Only - General]

Thanks for the fixes! The series is reviewed-by: Evan Quan 

Evan
> -Original Message-
> From: Zhen Ni 
> Sent: Wednesday, August 3, 2022 5:20 PM
> To: airl...@linux.ie; dan...@ffwll.ch; Quan, Evan ;
> Deucher, Alexander ; Koenig, Christian
> ; Pan, Xinhui 
> Cc: amd-gfx@lists.freedesktop.org; dri-de...@lists.freedesktop.org; linux-
> ker...@vger.kernel.org; Zhen Ni 
> Subject: [PATCH 2/2] drm/amd/pm: Fix a potential gpu_metrics_table
> memory leak
> 
> Memory is allocated for gpu_metrics_table in
> smu_v13_0_5_init_smc_tables(), but not freed in
> smu_v13_0_5_fini_smc_tables(). This may cause memory leaks, fix it.
> 
> Signed-off-by: Zhen Ni 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c
> index b81711c4ff33..267c9c43a010 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_5_ppt.c
> @@ -167,6 +167,9 @@ static int smu_v13_0_5_fini_smc_tables(struct
> smu_context *smu)
>   kfree(smu_table->watermarks_table);
>   smu_table->watermarks_table = NULL;
> 
> + kfree(smu_table->gpu_metrics_table);
> + smu_table->gpu_metrics_table = NULL;
> +
>   return 0;
>  }
> 
> --
> 2.20.1
> 
> 


[PATCH] Increase tlb flush timeout for sriov

2022-08-03 Thread Dusica Milinkovic
Signed-off-by: Dusica Milinkovic 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 6 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 6 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 9ae8cdaa033e..6ab7d329916f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -419,6 +419,7 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
uint32_t seq;
uint16_t queried_pasid;
bool ret;
+   uint32_t sriov_usec_timeout = 600;  /* wait for 12 * 500ms for 
SRIOV */
struct amdgpu_ring *ring = >gfx.kiq.ring;
struct amdgpu_kiq *kiq = >gfx.kiq;
 
@@ -437,7 +438,10 @@ static int gmc_v10_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
 
amdgpu_ring_commit(ring);
spin_unlock(>gfx.kiq.ring_lock);
-   r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+   if (amdgpu_sriov_vf(adev))
+   r = amdgpu_fence_wait_polling(ring, seq, 
sriov_usec_timeout);
+   else
+   r = amdgpu_fence_wait_polling(ring, seq, 
adev->usec_timeout);
if (r < 1) {
dev_err(adev->dev, "wait for kiq fence error: %ld.\n", 
r);
return -ETIME;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 22761a3bb818..941a6b52fa72 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -896,6 +896,7 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
uint32_t seq;
uint16_t queried_pasid;
bool ret;
+   uint32_t sriov_usec_timeout = 600;  /* wait for 12 * 500ms for 
SRIOV */
struct amdgpu_ring *ring = >gfx.kiq.ring;
struct amdgpu_kiq *kiq = >gfx.kiq;
 
@@ -935,7 +936,10 @@ static int gmc_v9_0_flush_gpu_tlb_pasid(struct 
amdgpu_device *adev,
 
amdgpu_ring_commit(ring);
spin_unlock(>gfx.kiq.ring_lock);
-   r = amdgpu_fence_wait_polling(ring, seq, adev->usec_timeout);
+   if (amdgpu_sriov_vf(adev))
+   r = amdgpu_fence_wait_polling(ring, seq, 
sriov_usec_timeout);
+   else
+   r = amdgpu_fence_wait_polling(ring, seq, 
adev->usec_timeout);
if (r < 1) {
dev_err(adev->dev, "wait for kiq fence error: %ld.\n", 
r);
up_read(>reset_domain->sem);
-- 
2.25.1



Re: [PATCH v2 01/29] ACPI: video: Add acpi_video_backlight_use_native() helper

2022-08-03 Thread Daniel Dadap

On 8/2/22 06:31, Hans de Goede wrote:

Hi Daniel,

On 7/21/22 23:30, Daniel Dadap wrote:

On 7/21/22 16:24, Daniel Dadap wrote:

On 7/12/22 14:38, Hans de Goede wrote:

ATM on x86 laptops where we want userspace to use the acpi_video backlight
device we often register both the GPU's native backlight device and
acpi_video's firmware acpi_video# backlight device. This relies on
userspace preferring firmware type backlight devices over native ones, but
registering 2 backlight devices for a single display really is undesirable.

On x86 laptops where the native GPU backlight device should be used,
the registering of other backlight devices is avoided by their drivers
using acpi_video_get_backlight_type() and only registering their backlight
if the return value matches their type.

acpi_video_get_backlight_type() uses
backlight_device_get_by_type(BACKLIGHT_RAW) to determine if a native
driver is available and will never return native if this returns
false. This means that the GPU's native backlight registering code
cannot just call acpi_video_get_backlight_type() to determine if it
should register its backlight, since acpi_video_get_backlight_type() will
never return native until the native backlight has already registered.

To fix this add a new internal native function parameter to
acpi_video_get_backlight_type(), which when set to true will make
acpi_video_get_backlight_type() behave as if a native backlight has
already been registered.

And add a new acpi_video_backlight_use_native() helper, which sets this
to true, for use in native GPU backlight code.

Changes in v2:
- Replace adding a native parameter to acpi_video_get_backlight_type() with
    adding a new acpi_video_backlight_use_native() helper.

Signed-off-by: Hans de Goede 
---
   drivers/acpi/video_detect.c | 24 
   include/acpi/video.h    |  5 +
   2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index becc198e4c22..4346c990022d 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -17,8 +17,9 @@
    * Otherwise vendor specific drivers like thinkpad_acpi, asus-laptop,
    * sony_acpi,... can take care about backlight brightness.
    *
- * Backlight drivers can use acpi_video_get_backlight_type() to determine
- * which driver should handle the backlight.
+ * Backlight drivers can use acpi_video_get_backlight_type() to determine which
+ * driver should handle the backlight. RAW/GPU-driver backlight drivers must
+ * use the acpi_video_backlight_use_native() helper for this.
    *
    * If CONFIG_ACPI_VIDEO is neither set as "compiled in" (y) nor as a module 
(m)
    * this file will not be compiled and acpi_video_get_backlight_type() will
@@ -548,9 +549,10 @@ static int acpi_video_backlight_notify(struct 
notifier_block *nb,
    * Arguably the native on win8 check should be done first, but that would
    * be a behavior change, which may causes issues.
    */
-enum acpi_backlight_type acpi_video_get_backlight_type(void)
+static enum acpi_backlight_type __acpi_video_get_backlight_type(bool native)
   {
   static DEFINE_MUTEX(init_mutex);
+    static bool native_available;
   static bool init_done;
   static long video_caps;
   @@ -570,6 +572,8 @@ enum acpi_backlight_type 
acpi_video_get_backlight_type(void)
   backlight_notifier_registered = true;
   init_done = true;
   }
+    if (native)
+    native_available = true;
   mutex_unlock(_mutex);
     if (acpi_backlight_cmdline != acpi_backlight_undef)
@@ -581,13 +585,25 @@ enum acpi_backlight_type 
acpi_video_get_backlight_type(void)
   if (!(video_caps & ACPI_VIDEO_BACKLIGHT))
   return acpi_backlight_vendor;
   -    if (acpi_osi_is_win8() && backlight_device_get_by_type(BACKLIGHT_RAW))
+    if (acpi_osi_is_win8() &&
+    (native_available || backlight_device_get_by_type(BACKLIGHT_RAW)))
   return acpi_backlight_native;
     return acpi_backlight_video;


So I ran into a minor problem when testing the NVIDIA proprietary driver 
against this change set, after checking acpi_video_backlight_use_native() 
before registering the NVIDIA proprietary driver's backlight handler. Namely, 
for the case where a user installs the NVIDIA proprietary driver after the 
video.ko has already registered its backlight handler, we end up with both the 
firmware and native handlers registered simultaneously, since the ACPI video 
driver no longer unregisters its backlight handler. In this state, desktop 
environments end up preferring the registered but non-functional firmware 
handler from video.ko. (Manually twiddling the sysfs interface for the native 
NVIDIA handler works fine.) When rebooting the system after installing the 
NVIDIA proprietary driver, it is able to register its native handler before the 
delayed work to register the ACPI video backlight handler fires, so we end up 
with only one (native) handler, and 

Re: [PATCH v2 01/29] ACPI: video: Add acpi_video_backlight_use_native() helper

2022-08-03 Thread Hans de Goede
Hi Daniel,

On 7/21/22 23:30, Daniel Dadap wrote:
> 
> On 7/21/22 16:24, Daniel Dadap wrote:
>>
>> On 7/12/22 14:38, Hans de Goede wrote:
>>> ATM on x86 laptops where we want userspace to use the acpi_video backlight
>>> device we often register both the GPU's native backlight device and
>>> acpi_video's firmware acpi_video# backlight device. This relies on
>>> userspace preferring firmware type backlight devices over native ones, but
>>> registering 2 backlight devices for a single display really is undesirable.
>>>
>>> On x86 laptops where the native GPU backlight device should be used,
>>> the registering of other backlight devices is avoided by their drivers
>>> using acpi_video_get_backlight_type() and only registering their backlight
>>> if the return value matches their type.
>>>
>>> acpi_video_get_backlight_type() uses
>>> backlight_device_get_by_type(BACKLIGHT_RAW) to determine if a native
>>> driver is available and will never return native if this returns
>>> false. This means that the GPU's native backlight registering code
>>> cannot just call acpi_video_get_backlight_type() to determine if it
>>> should register its backlight, since acpi_video_get_backlight_type() will
>>> never return native until the native backlight has already registered.
>>>
>>> To fix this add a new internal native function parameter to
>>> acpi_video_get_backlight_type(), which when set to true will make
>>> acpi_video_get_backlight_type() behave as if a native backlight has
>>> already been registered.
>>>
>>> And add a new acpi_video_backlight_use_native() helper, which sets this
>>> to true, for use in native GPU backlight code.
>>>
>>> Changes in v2:
>>> - Replace adding a native parameter to acpi_video_get_backlight_type() with
>>>    adding a new acpi_video_backlight_use_native() helper.
>>>
>>> Signed-off-by: Hans de Goede 
>>> ---
>>>   drivers/acpi/video_detect.c | 24 
>>>   include/acpi/video.h    |  5 +
>>>   2 files changed, 25 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
>>> index becc198e4c22..4346c990022d 100644
>>> --- a/drivers/acpi/video_detect.c
>>> +++ b/drivers/acpi/video_detect.c
>>> @@ -17,8 +17,9 @@
>>>    * Otherwise vendor specific drivers like thinkpad_acpi, asus-laptop,
>>>    * sony_acpi,... can take care about backlight brightness.
>>>    *
>>> - * Backlight drivers can use acpi_video_get_backlight_type() to determine
>>> - * which driver should handle the backlight.
>>> + * Backlight drivers can use acpi_video_get_backlight_type() to determine 
>>> which
>>> + * driver should handle the backlight. RAW/GPU-driver backlight drivers 
>>> must
>>> + * use the acpi_video_backlight_use_native() helper for this.
>>>    *
>>>    * If CONFIG_ACPI_VIDEO is neither set as "compiled in" (y) nor as a 
>>> module (m)
>>>    * this file will not be compiled and acpi_video_get_backlight_type() will
>>> @@ -548,9 +549,10 @@ static int acpi_video_backlight_notify(struct 
>>> notifier_block *nb,
>>>    * Arguably the native on win8 check should be done first, but that would
>>>    * be a behavior change, which may causes issues.
>>>    */
>>> -enum acpi_backlight_type acpi_video_get_backlight_type(void)
>>> +static enum acpi_backlight_type __acpi_video_get_backlight_type(bool 
>>> native)
>>>   {
>>>   static DEFINE_MUTEX(init_mutex);
>>> +    static bool native_available;
>>>   static bool init_done;
>>>   static long video_caps;
>>>   @@ -570,6 +572,8 @@ enum acpi_backlight_type 
>>> acpi_video_get_backlight_type(void)
>>>   backlight_notifier_registered = true;
>>>   init_done = true;
>>>   }
>>> +    if (native)
>>> +    native_available = true;
>>>   mutex_unlock(_mutex);
>>>     if (acpi_backlight_cmdline != acpi_backlight_undef)
>>> @@ -581,13 +585,25 @@ enum acpi_backlight_type 
>>> acpi_video_get_backlight_type(void)
>>>   if (!(video_caps & ACPI_VIDEO_BACKLIGHT))
>>>   return acpi_backlight_vendor;
>>>   -    if (acpi_osi_is_win8() && 
>>> backlight_device_get_by_type(BACKLIGHT_RAW))
>>> +    if (acpi_osi_is_win8() &&
>>> +    (native_available || backlight_device_get_by_type(BACKLIGHT_RAW)))
>>>   return acpi_backlight_native;
>>>     return acpi_backlight_video;
>>
>>
>> So I ran into a minor problem when testing the NVIDIA proprietary driver 
>> against this change set, after checking acpi_video_backlight_use_native() 
>> before registering the NVIDIA proprietary driver's backlight handler. 
>> Namely, for the case where a user installs the NVIDIA proprietary driver 
>> after the video.ko has already registered its backlight handler, we end up 
>> with both the firmware and native handlers registered simultaneously, since 
>> the ACPI video driver no longer unregisters its backlight handler. In this 
>> state, desktop environments end up preferring the registered but 
>> non-functional firmware handler from video.ko.