RE: [PATCH V3 12/12] drm/amdgpu: Removed redundant ras code

2022-01-03 Thread Clements, John
[AMD Official Use Only]

Thank you Thomas,

This series looks good to me too.

Reviewed-by: John Clements 

-Original Message-
From: Zhang, Hawking  
Sent: Wednesday, December 29, 2021 3:55 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org; Zhou1, 
Tao ; Clements, John ; Yang, Stanley 

Cc: Chai, Thomas ; Chai, Thomas 
Subject: RE: [PATCH V3 12/12] drm/amdgpu: Removed redundant ras code

[AMD Official Use Only]

Thank you Thomas. V3 looks good to me. @Zhou1, Tao/@Clements, John/@Yang, 
Stanley please also take a look and raise any concerns you have.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of yipechai
Sent: Wednesday, December 29, 2021 14:32
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas ; 
Chai, Thomas 
Subject: [PATCH V3 12/12] drm/amdgpu: Removed redundant ras code

Removed redundant ras code.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 82 ++---
 1 file changed, 20 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 21765e05b003..17de79be6d8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -910,51 +910,23 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
return -EINVAL;

block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n",
+   get_ras_block_str(&info->head));
+   return -EINVAL;
+   }

-   switch (info->head.block) {
-   case AMDGPU_RAS_BLOCK__UMC:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(&info->head));
-   return -EINVAL;
-   }
+   if (block_obj->hw_ops->query_ras_error_count)
+   block_obj->hw_ops->query_ras_error_count(adev, &err_data);

-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
&err_data);
-   /* umc query_ras_error_address is also responsible for clearing
-* error status
-*/
-   if (block_obj->hw_ops->query_ras_error_address)
-   block_obj->hw_ops->query_ras_error_address(adev, 
&err_data);
-   break;
-   case AMDGPU_RAS_BLOCK__SDMA:
-   case AMDGPU_RAS_BLOCK__GFX:
-   case AMDGPU_RAS_BLOCK__MMHUB:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(&info->head));
-   return -EINVAL;
-   }
-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
&err_data);
+   if (info->head.block == AMDGPU_RAS_BLOCK__UMC)
+   block_obj->hw_ops->query_ras_error_address(adev, &err_data);

+   if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+   (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+   (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
if (block_obj->hw_ops->query_ras_error_status)
block_obj->hw_ops->query_ras_error_status(adev);
-   break;
-   case AMDGPU_RAS_BLOCK__PCIE_BIF:
-   case AMDGPU_RAS_BLOCK__XGMI_WAFL:
-   case AMDGPU_RAS_BLOCK__HDP:
-   case AMDGPU_RAS_BLOCK__MCA:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function \n",
-   get_ras_block_str(&info->head));
-   return -EINVAL;
-   }
-   if (block_obj->hw_ops->query_ras_error_count)
-   block_obj->hw_ops->query_ras_error_count(adev, 
&err_data);
-   break;
-   default:
-   break;
}

obj->err_data.ue_count += err_data.ue_count; @@ -1016,32 +988,18 @@ int 
amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
if (!amdgpu_ras_is_supported(adev, block))
return -EINVAL;

-   switch (block) {
-   case AMDGPU_RAS_BLOCK__GFX:
-   case AMDGPU_RAS_BLOCK__MMHUB:
-   if (!block_obj || !block_obj->hw_ops)   {
-   dev_info(adev->dev, "%s doesn't config ras function 
\n", ras_block_str(block));
-   return -EINVAL;
-   }
+   if (!block_obj || !block_obj->hw_ops)   {
+   dev_info(adev->dev, "%s doesn't config ras function \n", 
ras_block_str(block));
+   return -EINVAL;
+   }

-   if (block_obj->hw_ops->reset_ras_error_count)
-   

RE: [PATCH 2/2] drm/amdgpu: don't set s3 and s0ix at the same time

2022-01-03 Thread Quan, Evan
[AMD Official Use Only]

Series is acked-by: Evan Quan 

> -Original Message-
> From: amd-gfx  On Behalf Of
> Mario Limonciello
> Sent: Monday, January 3, 2022 11:23 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Limonciello, Mario 
> Subject: [PATCH 2/2] drm/amdgpu: don't set s3 and s0ix at the same time
> 
> This makes it clearer which codepaths are in use specifically in
> one state or the other.
> 
> Signed-off-by: Mario Limonciello 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index db2a9dfd5918..413fecc89e6c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2165,9 +2165,9 @@ static int amdgpu_pmops_suspend(struct device
> *dev)
> 
>   if (amdgpu_acpi_is_s0ix_active(adev))
>   adev->in_s0ix = true;
> - adev->in_s3 = true;
> + else
> + adev->in_s3 = true;
>   r = amdgpu_device_suspend(drm_dev, true);
> - adev->in_s3 = false;
>   if (r)
>   return r;
>   if (!adev->in_s0ix)
> @@ -2188,6 +2188,8 @@ static int amdgpu_pmops_resume(struct device
> *dev)
>   r = amdgpu_device_resume(drm_dev, true);
>   if (amdgpu_acpi_is_s0ix_active(adev))
>   adev->in_s0ix = false;
> + else
> + adev->in_s3 = false;
>   return r;
>  }
> 
> --
> 2.25.1


[PATCH] drm/amdgpu: Delay unmapping MMIO VRAM to amdgpu_ttm_fini() in GPU initialization failure

2022-01-03 Thread Leslie Shi
If driver loading fails during hw_init(), delay unmapping MMIO VRAM until
amdgpu_ttm_fini().
This prevents access to an invalid memory address in vcn_v3_0_sw_fini().

Signed-off-by: Leslie Shi 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c|  4 
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ce93a304292c..d6006de57af5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3830,7 +3830,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return r;
 }
 
-static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
+static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev, bool 
unmap_mmio_vram)
 {
/* Clear all CPU mappings pointing to this device */
unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
@@ -3840,9 +3840,12 @@ static void amdgpu_device_unmap_mmio(struct 
amdgpu_device *adev)
 
iounmap(adev->rmmio);
adev->rmmio = NULL;
-   if (adev->mman.aper_base_kaddr)
-   iounmap(adev->mman.aper_base_kaddr);
-   adev->mman.aper_base_kaddr = NULL;
+
+   if (unmap_mmio_vram) {
+   if (adev->mman.aper_base_kaddr)
+   iounmap(adev->mman.aper_base_kaddr);
+   adev->mman.aper_base_kaddr = NULL;
+   }
 
/* Memory manager related */
if (!adev->gmc.xgmi.connected_to_cpu) {
@@ -3905,8 +3908,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 
amdgpu_gart_dummy_page_fini(adev);
 
-   if (drm_dev_is_unplugged(adev_to_drm(adev)))
-   amdgpu_device_unmap_mmio(adev);
+   amdgpu_device_unmap_mmio(adev, drm_dev_is_unplugged(adev_to_drm(adev)));
 
 }
 
@@ -5727,7 +5729,7 @@ void amdgpu_device_halt(struct amdgpu_device *adev)
 
adev->no_hw_access = true;
 
-   amdgpu_device_unmap_mmio(adev);
+   amdgpu_device_unmap_mmio(adev, true);
 
pci_disable_device(pdev);
pci_wait_for_pending_transaction(pdev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 367abed1d6e6..67cd12caf019 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1815,6 +1815,10 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
  NULL, NULL);
amdgpu_ttm_fw_reserve_vram_fini(adev);
 
+if (adev->mman.aper_base_kaddr)
+iounmap(adev->mman.aper_base_kaddr);
+adev->mman.aper_base_kaddr = NULL;
+
amdgpu_vram_mgr_fini(adev);
amdgpu_gtt_mgr_fini(adev);
amdgpu_preempt_mgr_fini(adev);
-- 
2.25.1



Re: softlockup in v5.15.12 in dcn20_post_unlock_program_front_end

2022-01-03 Thread Jeff Layton
On Sun, 2022-01-02 at 09:30 -0500, Jeff Layton wrote:
> I'm seeing a reproducible softlockup on amdgpu on v5.15.12:
> 
> [  861.656146] [drm:dc_dmub_srv_wait_idle [amdgpu]] *ERROR* Error waiting for 
> DMUB idle: status=3
> [  861.914848] [drm:dc_dmub_srv_wait_idle [amdgpu]] *ERROR* Error waiting for 
> DMUB idle: status=3
> [  862.173368] [drm:dc_dmub_srv_wait_idle [amdgpu]] *ERROR* Error waiting for 
> DMUB idle: status=3
> [  862.381635] [drm] enabling link 0 failed: 15
> [  862.640908] [drm:dc_dmub_srv_wait_idle [amdgpu]] *ERROR* Error waiting for 
> DMUB idle: status=3
> [  862.743704] [drm:dcn20_wait_for_blank_complete [amdgpu]] *ERROR* DC: 
> failed to blank crtc!
> [  863.002846] [drm:dc_dmub_srv_wait_idle [amdgpu]] *ERROR* Error waiting for 
> DMUB idle: status=3
> [  863.261451] [drm:dc_dmub_srv_wait_idle [amdgpu]] *ERROR* Error waiting for 
> DMUB idle: status=3
> [  863.262090] [drm] REG_WAIT timeout 1us * 10 tries - optc3_lock line:112
> [  863.532231] [drm] REG_WAIT timeout 1us * 10 tries - 
> optc1_wait_for_state line:835
> [  888.900914] watchdog: BUG: soft lockup - CPU#11 stuck for 26s! 
> [gnome-shell:2306]
> [  888.900921] Modules linked in: uinput rfcomm snd_seq_dummy snd_hrtimer 
> rpcrdma rdma_cm iw_cm ib_cm ib_core nft_objref nf_conntrack_netbios_ns 
> nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib 
> nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat 
> bridge stp llc ip6table_nat ip6table_mangle ip6table_raw ip6table_security 
> iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle 
> iptable_raw iptable_security ip_set nf_tables nfnetlink ip6table_filter 
> ip6_tables iptable_filter qrtr ns bnep vfat fat snd_hda_codec_realtek 
> intel_rapl_msr snd_hda_codec_generic intel_rapl_common ledtrig_audio 
> snd_hda_codec_hdmi snd_hda_intel snd_intel_dspcfg edac_mce_amd 
> snd_intel_sdw_acpi snd_usb_audio snd_hda_codec kvm_amd snd_hda_core btusb 
> snd_usbmidi_lib btrtl snd_rawmidi snd_hwdep btbcm ppdev kvm snd_seq btintel 
> uvcvideo snd_seq_device videobuf2_vmalloc videobuf2_memops bluetooth 
> videobuf2_v4l2 snd_pcm videobuf2_common irqbypass wmi_bmof mxm_wmi
> [  888.900963]  pcspkr snd_timer rapl k10temp i2c_piix4 videodev snd 
> ecdh_generic rfkill joydev soundcore mc parport_pc parport gpio_amdpt 
> gpio_generic acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc zram 
> ip_tables amdgpu drm_ttm_helper ttm iommu_v2 gpu_sched i2c_algo_bit 
> drm_kms_helper cec drm crct10dif_pclmul crc32_pclmul crc32c_intel uas ccp 
> ghash_clmulni_intel sp5100_tco usb_storage r8169 nvme nvme_core wmi 
> ipmi_devintf ipmi_msghandler fuse
> [  888.900989] CPU: 11 PID: 2306 Comm: gnome-shell Not tainted 
> 5.15.12-200.fc35.x86_64 #1
> [  888.900992] Hardware name: Micro-Star International Co., Ltd. MS-7A33/X370 
> SLI PLUS (MS-7A33), BIOS 3.JR 11/29/2019
> [  888.900993] RIP: 0010:delay_halt_mwaitx+0x39/0x40
> [  888.900999] Code: 03 05 cb b6 95 4d 31 d2 48 89 d1 0f 01 fa b8 ff ff ff ff 
> b9 02 00 00 00 48 39 c6 48 0f 46 c6 48 89 c3 b8 f0 00 00 00 0f 01 fb <5b> c3 
> 0f 1f 44 00 00 0f 1f 44 00 00 48 8b 05 9c 2f 03 01 e9 7f 47
> [  888.901001] RSP: 0018:b7f243e63878 EFLAGS: 0293
> [  888.901003] RAX: 00f0 RBX: 002dc50a RCX: 
> 0002
> [  888.901005] RDX:  RSI: 002dc50a RDI: 
> 027b5712e506
> [  888.901006] RBP: 002dc50a R08: b7f243e63824 R09: 
> 0001
> [  888.901007] R10: b7f243e63660 R11: 000d R12: 
> 917bd719
> [  888.901009] R13: 917dd450 R14: 917dd45006a0 R15: 
> 917bd541fc00
> [  888.901010] FS:  7f2912683d80() GS:918a9ecc() 
> knlGS:
> [  888.901011] CS:  0010 DS:  ES:  CR0: 80050033
> [  888.901013] CR2: 33910fce CR3: 000105b22000 CR4: 
> 003506e0
> [  888.901014] Call Trace:
> [  888.901016]  
> [  888.901018]  delay_halt+0x3b/0x60
> [  888.901021]  dcn20_post_unlock_program_front_end+0xf4/0x2c0 [amdgpu]
> [  888.901209]  dc_commit_state+0x4b6/0xa50 [amdgpu]
> [  888.901382]  amdgpu_dm_atomic_commit_tail+0x55c/0x2610 [amdgpu]
> [  888.901557]  ? dcn20_calculate_dlg_params+0x4f4/0x540 [amdgpu]
> [  888.901735]  ? dcn20_calculate_dlg_params+0x4f4/0x540 [amdgpu]
> [  888.901916]  ? dcn30_calculate_wm_and_dlg_fp+0x707/0x8a0 [amdgpu]
> [  888.902090]  ? dcn30_validate_bandwidth+0x10f/0x240 [amdgpu]
> [  888.902261]  ? kfree+0xaa/0x3f0
> [  888.902265]  ? dcn30_validate_bandwidth+0x10f/0x240 [amdgpu]
> [  888.902435]  ? dc_validate_global_state+0x31f/0x3c0 [amdgpu]
> [  888.902604]  ? ttm_bo_mem_compat+0x2c/0x90 [ttm]
> [  888.902609]  ? ttm_bo_validate+0x42/0x100 [ttm]
> [  888.902614]  ? __raw_callee_save___native_queued_spin_unlock+0x11/0x1e
> [  888.902619]  ? amdgpu_bo_destroy+0x70/0x70 [amdgpu]
> [  888.902746]  ? dm_plane_helper_prepare_fb+0x1f4/0x260 [amdgpu]
> [  888.902924]  ? __cond_resched+0x16/0x40
> [  

Re: [PATCH v2] drm/amd/display: move calcs folder into DML

2022-01-03 Thread Isabella Basso
Hello, Jasdeep,

On 2021-12-23 17:44, Dhillon, Jasdeep wrote:
> [AMD Official Use Only]
> 
>  Hi Isabbasso,  
> 
>  The patch fails to compile when there is No DCN because the calc
> object files fail to generate since dml depends on the
> CONFIG_DRM_AMD_DC_DCN being enabled (Makefile inside dc folder): 
> 
>  ifdef CONFIG_DRM_AMD_DC_DCN 
> DC_LIBS += dcn20 
> DC_LIBS += dsc 
> DC_LIBS += dcn10 dml 
> DC_LIBS += dcn21 
> DC_LIBS += dcn30 
> DC_LIBS += dcn301 
> DC_LIBS += dcn302 
> DC_LIBS += dcn303 endif
> 
>  A few changes need to be made to the patch, which are: 
> 
>  -The Makefile in dc needs the line: DC_LIBS+= dml/calcs 
>  -the Makefile in the calcs folder that the patch deletes can be
> placed inside of dc/dml/calcs instead of adding it to the Makefiles in
> dc/dml  
> 

Siqueira had warned me of this issue just before you emailed me, so I'd
already prepared a v3 with no calcs Makefile but actually using the dml one
for everything, as it's pretty much set up for working with or without the
DRM_AMD_DC_DCN config turned on. Would this be alright, or would you rather
have it as you suggested?

From what I tested locally, both work pretty much the same. I think my
solution is a little harder to test but ends up being a little more compact,
which might translate into being more maintainable (maybe?).

Please let me know what you think :).
Cheers,
--
Isabella Basso

>  Could you revise your patch based on these changes.  
> 
>  Regards,  
>  Jasdeep 
> 
> -
> 
> From: isabba...@riseup.net 
> Sent: December 20, 2021 6:23 PM
> To: Deucher, Alexander ; Koenig, Christian
> ; dan...@ffwll.ch ;
> Wentland, Harry ; Siqueira, Rodrigo
> ; Li, Sun peng (Leo) ;
> Pan, Xinhui ; Zhuo, Qingqing (Lillian)
> ; Dhillon, Jasdeep ;
> m...@igalia.com 
> Cc: amd-gfx@lists.freedesktop.org ;
> ~lkcamp/patc...@lists.sr.ht <~lkcamp/patc...@lists.sr.ht>
> Subject: Re: [PATCH v2] drm/amd/display: move calcs folder into DML 
> 
> On 2021-12-20 20:20, Isabella Basso wrote:
>> The calcs folder has FPU code on it, which should be isolated inside
> the
>> DML folder as per
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fseries%2F93042%2Fdata=04%7C01%7Cjasdeep.dhillon%40amd.com%7C01959e019f6e45e25a6208d9c40fc233%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637756394247493762%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=bB4zotGMArbsTzZNDr0u2uw3VBD4jxNornJMol9YJlA%3Dreserved=0
> [1].
>>
>> This commit aims single-handedly to correct the location of such FPU
>> code and does not refactor any functions.
>>
>> Signed-off-by: Isabella Basso 
>> ---
>>  drivers/gpu/drm/amd/display/dc/Makefile   |  2 +-
>>  drivers/gpu/drm/amd/display/dc/calcs/Makefile | 68
> ---
>>  drivers/gpu/drm/amd/display/dc/dml/Makefile   | 13 +++-
>>  .../amd/display/dc/{ => dml}/calcs/bw_fixed.c |  0
>>  .../display/dc/{ => dml}/calcs/calcs_logger.h |  0
>>  .../display/dc/{ => dml}/calcs/custom_float.c |  0
>>  .../display/dc/{ => dml}/calcs/dce_calcs.c|  0
>>  .../dc/{ => dml}/calcs/dcn_calc_auto.c|  0
>>  .../dc/{ => dml}/calcs/dcn_calc_auto.h|  0
>>  .../dc/{ => dml}/calcs/dcn_calc_math.c|  0
>>  .../display/dc/{ => dml}/calcs/dcn_calcs.c|  0
>>  11 files changed, 13 insertions(+), 70 deletions(-)
>>  delete mode 100644 drivers/gpu/drm/amd/display/dc/calcs/Makefile
>>  rename drivers/gpu/drm/amd/display/dc/{ => dml}/calcs/bw_fixed.c
> (100%)
>>  rename drivers/gpu/drm/amd/display/dc/{ =>
> dml}/calcs/calcs_logger.h (100%)
>>  rename drivers/gpu/drm/amd/display/dc/{ =>
> dml}/calcs/custom_float.c (100%)
>>  rename drivers/gpu/drm/amd/display/dc/{ => dml}/calcs/dce_calcs.c
> (100%)
>>  rename drivers/gpu/drm/amd/display/dc/{ =>
> dml}/calcs/dcn_calc_auto.c (100%)
>>  rename drivers/gpu/drm/amd/display/dc/{ =>
> dml}/calcs/dcn_calc_auto.h (100%)
>>  rename drivers/gpu/drm/amd/display/dc/{ =>
> dml}/calcs/dcn_calc_math.c (100%)
>>  rename drivers/gpu/drm/amd/display/dc/{ => dml}/calcs/dcn_calcs.c
> (100%)
>>
>> diff --git a/drivers/gpu/drm/amd/display/dc/Makefile
>> b/drivers/gpu/drm/amd/display/dc/Makefile
>> index b1f0d6260226..1872adc96a00 100644
>> --- a/drivers/gpu/drm/amd/display/dc/Makefile
>> +++ b/drivers/gpu/drm/amd/display/dc/Makefile
>> @@ -23,7 +23,7 @@
>>  # Makefile for Display Core (dc) component.
>>  #
>>
>> -DC_LIBS = basics bios calcs clk_mgr dce gpio irq virtual
>> +DC_LIBS = basics bios clk_mgr dce gpio irq virtual
>>
>>  ifdef CONFIG_DRM_AMD_DC_DCN
>>  DC_LIBS += dcn20
>> diff --git a/drivers/gpu/drm/amd/display/dc/calcs/Makefile
>> b/drivers/gpu/drm/amd/display/dc/calcs/Makefile
>> deleted file mode 100644
>> index f3c00f479e1c..
>> --- a/drivers/gpu/drm/amd/display/dc/calcs/Makefile
>> +++ /dev/null
>> @@ -1,68 +0,0 @@
>> -#
>> -# Copyright 2017 Advanced Micro Devices, Inc.
>> -# Copyright 2019 Raptor Engineering, LLC
>> -#
>> -# Permission is hereby 

[PATCH AUTOSEL 5.10 7/8] drm/amd/display: Added power down for DCN10

2022-01-03 Thread Sasha Levin
From: "Lai, Derek" 

[ Upstream commit d97e631af2db84c8c9d63abf68d487d0bb559e4c ]

[Why]
The change that sets a 10-second timer callback on boot still works; it
just lacked power down for DCN10.

[How]
Added power down for DCN10.

Tested-by: Daniel Wheeler 
Reviewed-by: Anthony Koo 
Acked-by: Rodrigo Siqueira 
Signed-off-by: Derek Lai 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c 
b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c
index b24c8ae8b1ece..7e228c181b298 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c
@@ -77,6 +77,7 @@ static const struct hw_sequencer_funcs dcn10_funcs = {
.get_clock = dcn10_get_clock,
.get_vupdate_offset_from_vsync = dcn10_get_vupdate_offset_from_vsync,
.calc_vupdate_position = dcn10_calc_vupdate_position,
+   .power_down = dce110_power_down,
.set_backlight_level = dce110_set_backlight_level,
.set_abm_immediate_disable = dce110_set_abm_immediate_disable,
.set_pipe = dce110_set_pipe,
-- 
2.34.1



[PATCH AUTOSEL 5.15 14/16] drm/amd/display: Added power down for DCN10

2022-01-03 Thread Sasha Levin
From: "Lai, Derek" 

[ Upstream commit d97e631af2db84c8c9d63abf68d487d0bb559e4c ]

[Why]
The change that sets a 10-second timer callback on boot still works; it
just lacked power down for DCN10.

[How]
Added power down for DCN10.

Tested-by: Daniel Wheeler 
Reviewed-by: Anthony Koo 
Acked-by: Rodrigo Siqueira 
Signed-off-by: Derek Lai 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c 
b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c
index 34001a30d449a..10e613ec7d24f 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_init.c
@@ -78,6 +78,7 @@ static const struct hw_sequencer_funcs dcn10_funcs = {
.get_clock = dcn10_get_clock,
.get_vupdate_offset_from_vsync = dcn10_get_vupdate_offset_from_vsync,
.calc_vupdate_position = dcn10_calc_vupdate_position,
+   .power_down = dce110_power_down,
.set_backlight_level = dce110_set_backlight_level,
.set_abm_immediate_disable = dce110_set_abm_immediate_disable,
.set_pipe = dce110_set_pipe,
-- 
2.34.1



[PATCH AUTOSEL 5.15 13/16] drm/amd/display: fix B0 TMDS deepcolor no dislay issue

2022-01-03 Thread Sasha Levin
From: Charlene Liu 

[ Upstream commit 2eb82577a16d4c8eb31e4ed520649850bb95b223 ]

[why]
On B0, PHY C maps to F and D maps to G. The driver uses the logical instance
and dmub does the remap, but the driver still needs to use the right PHY
instance to access the right HW.

[how]
Use the physical instance when programming PHY registers.

[note]
could move resync_control programming to dmub next.

Tested-by: Daniel Wheeler 
Reviewed-by: Dmytro Laktyushkin 
Reviewed-by: Jun Lei 
Acked-by: Rodrigo Siqueira 
Signed-off-by: Charlene Liu 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 .../drm/amd/display/dc/dcn31/dcn31_resource.c | 25 +--
 .../drm/amd/display/dc/dcn31/dcn31_resource.h | 31 +++
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c 
b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c
index 79e92ecca96c1..0c0fe3fb70e19 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c
+++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c
@@ -352,6 +352,14 @@ static const struct dce110_clk_src_regs clk_src_regs[] = {
clk_src_regs(3, D),
clk_src_regs(4, E)
 };
+/*pll_id being rempped in dmub, in driver it is logical instance*/
+static const struct dce110_clk_src_regs clk_src_regs_b0[] = {
+   clk_src_regs(0, A),
+   clk_src_regs(1, B),
+   clk_src_regs(2, F),
+   clk_src_regs(3, G),
+   clk_src_regs(4, E)
+};
 
 static const struct dce110_clk_src_shift cs_shift = {
CS_COMMON_MASK_SH_LIST_DCN2_0(__SHIFT)
@@ -2019,14 +2027,27 @@ static bool dcn31_resource_construct(
dcn30_clock_source_create(ctx, ctx->dc_bios,
CLOCK_SOURCE_COMBO_PHY_PLL1,
_src_regs[1], false);
-   pool->base.clock_sources[DCN31_CLK_SRC_PLL2] =
+   /*move phypllx_pixclk_resync to dmub next*/
+   if (dc->ctx->asic_id.hw_internal_rev == YELLOW_CARP_B0) {
+   pool->base.clock_sources[DCN31_CLK_SRC_PLL2] =
+   dcn30_clock_source_create(ctx, ctx->dc_bios,
+   CLOCK_SOURCE_COMBO_PHY_PLL2,
+   &clk_src_regs_b0[2], false);
+   pool->base.clock_sources[DCN31_CLK_SRC_PLL3] =
+   dcn30_clock_source_create(ctx, ctx->dc_bios,
+   CLOCK_SOURCE_COMBO_PHY_PLL3,
+   &clk_src_regs_b0[3], false);
+   } else {
+   pool->base.clock_sources[DCN31_CLK_SRC_PLL2] =
dcn30_clock_source_create(ctx, ctx->dc_bios,
CLOCK_SOURCE_COMBO_PHY_PLL2,
_src_regs[2], false);
-   pool->base.clock_sources[DCN31_CLK_SRC_PLL3] =
+   pool->base.clock_sources[DCN31_CLK_SRC_PLL3] =
dcn30_clock_source_create(ctx, ctx->dc_bios,
CLOCK_SOURCE_COMBO_PHY_PLL3,
_src_regs[3], false);
+   }
+
pool->base.clock_sources[DCN31_CLK_SRC_PLL4] =
dcn30_clock_source_create(ctx, ctx->dc_bios,
CLOCK_SOURCE_COMBO_PHY_PLL4,
diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.h 
b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.h
index 93571c9769967..cc4bed675588c 100644
--- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.h
+++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.h
@@ -39,4 +39,35 @@ struct resource_pool *dcn31_create_resource_pool(
const struct dc_init_data *init_data,
struct dc *dc);
 
+/*temp: B0 specific before switch to dcn313 headers*/
+#ifndef regPHYPLLF_PIXCLK_RESYNC_CNTL
+#define regPHYPLLF_PIXCLK_RESYNC_CNTL 0x007e
+#define regPHYPLLF_PIXCLK_RESYNC_CNTL_BASE_IDX 1
+#define regPHYPLLG_PIXCLK_RESYNC_CNTL 0x005f
+#define regPHYPLLG_PIXCLK_RESYNC_CNTL_BASE_IDX 1
+
+//PHYPLLF_PIXCLK_RESYNC_CNTL
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_PIXCLK_RESYNC_ENABLE__SHIFT 0x0
+#define 
PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_DEEP_COLOR_DTO_ENABLE_STATUS__SHIFT 0x1
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_DCCG_DEEP_COLOR_CNTL__SHIFT 0x4
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_PIXCLK_ENABLE__SHIFT 0x8
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_PIXCLK_DOUBLE_RATE_ENABLE__SHIFT 
0x9
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_PIXCLK_RESYNC_ENABLE_MASK 
0x0001L
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_DEEP_COLOR_DTO_ENABLE_STATUS_MASK 
0x0002L
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_DCCG_DEEP_COLOR_CNTL_MASK 
0x0030L
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_PIXCLK_ENABLE_MASK 0x0100L
+#define PHYPLLF_PIXCLK_RESYNC_CNTL__PHYPLLF_PIXCLK_DOUBLE_RATE_ENABLE_MASK 
0x0200L
+
+//PHYPLLG_PIXCLK_RESYNC_CNTL
+#define PHYPLLG_PIXCLK_RESYNC_CNTL__PHYPLLG_PIXCLK_RESYNC_ENABLE__SHIFT 0x0
+#define 

[PATCH AUTOSEL 5.15 11/16] drm/amdgpu: put SMU into proper state on runpm suspending for BOCO capable platform

2022-01-03 Thread Sasha Levin
From: Evan Quan 

[ Upstream commit 7be3be2b027c12e84833b3dc9597d3bb7e4c5464 ]

By setting mp1_state as PP_MP1_STATE_UNLOAD, MP1 will do some proper cleanups
and put itself into a state ready for PNP. That can work around some random
resume failures observed on BOCO capable platforms.

Signed-off-by: Evan Quan 
Acked-by: Alex Deucher 
Reviewed-by: Guchun Chen 
Reviewed-by: Lijo Lazar 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ada083fbc052b..6e682bf8c2d6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1578,12 +1578,27 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
if (amdgpu_device_supports_px(drm_dev))
drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 
+   /*
+* By setting mp1_state as PP_MP1_STATE_UNLOAD, MP1 will do some
+* proper cleanups and put itself into a state ready for PNP. That
+* can address some random resuming failure observed on BOCO capable
+* platforms.
+* TODO: this may be also needed for PX capable platform.
+*/
+   if (amdgpu_device_supports_boco(drm_dev))
+   adev->mp1_state = PP_MP1_STATE_UNLOAD;
+
ret = amdgpu_device_suspend(drm_dev, false);
if (ret) {
adev->in_runpm = false;
+   if (amdgpu_device_supports_boco(drm_dev))
+   adev->mp1_state = PP_MP1_STATE_NONE;
return ret;
}
 
+   if (amdgpu_device_supports_boco(drm_dev))
+   adev->mp1_state = PP_MP1_STATE_NONE;
+
if (amdgpu_device_supports_px(drm_dev)) {
/* Only need to handle PCI state in the driver for ATPX
 * PCI core handles it for _PR3.
-- 
2.34.1



[PATCH AUTOSEL 5.15 10/16] drm/amdgpu: always reset the asic in suspend (v2)

2022-01-03 Thread Sasha Levin
From: Alex Deucher 

[ Upstream commit daf8de0874ab5b74b38a38726fdd3d07ef98a7ee ]

If the platform suspend happens to fail and the power rail
is not turned off, the GPU will be in an unknown state on
resume, so reset the asic so that it will be in a known
good state on resume even if the platform suspend failed.

v2: handle s0ix

Acked-by: Luben Tuikov 
Acked-by: Evan Quan 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index f18240f873878..ada083fbc052b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1498,7 +1498,10 @@ static int amdgpu_pmops_suspend(struct device *dev)
adev->in_s3 = true;
r = amdgpu_device_suspend(drm_dev, true);
adev->in_s3 = false;
-
+   if (r)
+   return r;
+   if (!adev->in_s0ix)
+   r = amdgpu_asic_reset(adev);
return r;
 }
 
-- 
2.34.1



[PATCH AUTOSEL 5.15 09/16] drm/amd/pm: skip setting gfx cgpg in the s0ix suspend-resume

2022-01-03 Thread Sasha Levin
From: Prike Liang 

[ Upstream commit 8c45096c60d6ce6341c374636100ed1b2c1c33a1 ]

On s0ix entry, gfx needs to be retained in the gfxoff state, so there is no
need to set gfx cgpg in the S0ix suspend-resume process. Moreover, moving the
S0ix check into SMU12 simplifies the condition check in the code.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1712
Signed-off-by: Prike Liang 
Reviewed-by: Evan Quan 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c  | 7 ++-
 drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c | 3 ++-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 04863a7971155..30ee8819587e2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -1536,9 +1536,7 @@ static int smu_suspend(void *handle)
 
smu->watermarks_bitmap &= ~(WATERMARKS_LOADED);
 
-   /* skip CGPG when in S0ix */
-   if (smu->is_apu && !adev->in_s0ix)
-   smu_set_gfx_cgpg(&adev->smu, false);
+   smu_set_gfx_cgpg(&adev->smu, false);
 
return 0;
 }
@@ -1569,8 +1567,7 @@ static int smu_resume(void *handle)
return ret;
}
 
-   if (smu->is_apu)
-   smu_set_gfx_cgpg(&adev->smu, true);
+   smu_set_gfx_cgpg(&adev->smu, true);
 
smu->disable_uclk_switch = 0;
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c
index 43028f2cd28b5..9c91e79c955fb 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c
@@ -120,7 +120,8 @@ int smu_v12_0_powergate_sdma(struct smu_context *smu, bool 
gate)
 
 int smu_v12_0_set_gfx_cgpg(struct smu_context *smu, bool enable)
 {
-   if (!(smu->adev->pg_flags & AMD_PG_SUPPORT_GFX_PG))
+   /* Until now the SMU12 only implemented for Renoir series so here 
neen't do APU check. */
+   if (!(smu->adev->pg_flags & AMD_PG_SUPPORT_GFX_PG) || 
smu->adev->in_s0ix)
return 0;
 
return smu_cmn_send_smc_msg_with_param(smu,
-- 
2.34.1



[PATCH AUTOSEL 5.15 03/16] drm/amd/pm: Fix xgmi link control on aldebaran

2022-01-03 Thread Sasha Levin
From: Lijo Lazar 

[ Upstream commit 19e66d512e4182a0461530fa3159638e0f55d97e ]

Fix the message argument.
0: Allow power down
1: Disallow power down

Signed-off-by: Lijo Lazar 
Reviewed-by: Hawking Zhang 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index 5019903db492a..c9cfeb094750d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -1619,7 +1619,7 @@ static int aldebaran_allow_xgmi_power_down(struct 
smu_context *smu, bool en)
 {
return smu_cmn_send_smc_msg_with_param(smu,
   SMU_MSG_GmiPwrDnControl,
-  en ? 1 : 0,
+  en ? 0 : 1,
   NULL);
 }
 
-- 
2.34.1



[PATCH AUTOSEL 5.15 02/16] drm/amdgpu: fix dropped backing store handling in amdgpu_dma_buf_move_notify

2022-01-03 Thread Sasha Levin
From: Christian König 

[ Upstream commit fc74881c28d314b10efac016ef49df4ff40b8b97 ]

bo->tbo.resource can now be NULL.

Signed-off-by: Christian König 
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1811
Acked-by: Alex Deucher 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20211210083927.1754-1-christian.koe...@amd.com
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
index ae6ab93c868b8..784a12bf8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -384,7 +384,7 @@ amdgpu_dma_buf_move_notify(struct dma_buf_attachment 
*attach)
struct amdgpu_vm_bo_base *bo_base;
int r;
 
-   if (bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
+   if (!bo->tbo.resource || bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
return;
 
r = ttm_bo_validate(>tbo, , );
-- 
2.34.1



[PATCH 1/2] drm/amdgpu: explicitly check for s0ix when evicting resources

2022-01-03 Thread Mario Limonciello
This codepath should be running in both s0ix and s3, but it currently
only does so because s3 and s0ix are both set in the s0ix case.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ce93a304292c..412f377f80b1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3956,8 +3956,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
  */
 static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
 {
-   /* No need to evict vram on APUs for suspend to ram */
-   if (adev->in_s3 && (adev->flags & AMD_IS_APU))
+   /* No need to evict vram on APUs for suspend to ram or s2idle */
+   if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
return;
 
if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
-- 
2.25.1



[PATCH 2/2] drm/amdgpu: don't set s3 and s0ix at the same time

2022-01-03 Thread Mario Limonciello
This makes it clearer which codepaths are in use specifically in
one state or the other.

Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index db2a9dfd5918..413fecc89e6c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2165,9 +2165,9 @@ static int amdgpu_pmops_suspend(struct device *dev)
 
if (amdgpu_acpi_is_s0ix_active(adev))
adev->in_s0ix = true;
-   adev->in_s3 = true;
+   else
+   adev->in_s3 = true;
r = amdgpu_device_suspend(drm_dev, true);
-   adev->in_s3 = false;
if (r)
return r;
if (!adev->in_s0ix)
@@ -2188,6 +2188,8 @@ static int amdgpu_pmops_resume(struct device *dev)
r = amdgpu_device_resume(drm_dev, true);
if (amdgpu_acpi_is_s0ix_active(adev))
adev->in_s0ix = false;
+   else
+   adev->in_s3 = false;
return r;
 }
 
-- 
2.25.1



Re: [PATCH] radeon: add a force flush to delay work when radeon suspend

2022-01-03 Thread Christian König

Am 25.12.21 um 03:56 schrieb 周雪梅:
Although the radeon card fences and waits for the GPU to finish processing 
the current batch of rings,
there is still a corner case where the radeon lockup work queue may not be 
fully flushed,
while the radeon_suspend_kms() function has already called 
pci_set_power_state() to

put the device in the D3hot state.

Per PCI spec rev 4.0 on 5.3.1.4.1 D3hot State.
> Configuration and Message requests are the only TLPs accepted by a 
Function in
> the D3hot state. All other received Requests must be handled as 
Unsupported Requests,
> and all received Completions may optionally be handled as Unexpected 
Completions.


Well, first of all, this is completely the wrong place for this. The flush 
belongs in the fence code and not here.


Then I don't think that this is a good idea since it might cause deadlocks.

Christian.




This issue shows up in the following logs:

1Unable to handle kernel paging request at virtual address 
8800e0008010

CPU 0 kworker/0:3(131): Oops 0
pc = []  ra = []  ps =  
Tainted: G        W

pc is at si_gpu_check_soft_reset+0x3c/0x240
ra is at si_dma_is_lockup+0x34/0xd0
v0 =   t0 = fff08800e0008010  t1 = 0001
t2 = 8010  t3 = fff7e3c0  t4 = fff7e3c00258
t5 =   t6 = 0001  t7 = fff7ef078000
s0 = fff7e3c016e8  s1 = fff7e3c0  s2 = fff7e3c00018
s3 = fff7e3c0  s4 = fff7fff59d80  s5 = 
s6 = fff7ef07bd98
a0 = fff7e3c0  a1 = fff7e3c016e8  a2 = 0008
a3 = 0001  a4 = 8f5c28f5c28f5c29  a5 = 810f4338
t8 = 0275  t9 = 809b66f8 t10 = ff6769c5d964b800
t11= b886  pv = 811bea20  at = 
gp = 81d89690  sp = aa814126
4Disabling lock debugging due to kernel taint
Trace:
[] si_dma_is_lockup+0x34/0xd0
[] radeon_fence_check_lockup+0xd0/0x290
[] process_one_work+0x280/0x550
[] worker_thread+0x70/0x7c0
[] worker_thread+0x130/0x7c0
[] kthread+0x200/0x210
[] worker_thread+0x0/0x7c0
[] kthread+0x14c/0x210
[] ret_from_kernel_thread+0x18/0x20
[] kthread+0x0/0x210

 Code: ad3e0008  43f0074a  ad7e0018  ad9e0020 8c3001e8  40230101
 <8821> 4821ed21

So force lockup work queue flush to fix this problem.

Reviewed-by: Su Weiqiang 
Reviewed-by: Zhou Xuemei 
Signed-off-by: Xu Chenjiao 
---
 drivers/gpu/drm/radeon/radeon_device.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/radeon/radeon_device.c 
b/drivers/gpu/drm/radeon/radeon_device.c

index 59c8a6647ff2..cc1c07963116 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -1625,6 +1625,9 @@ int radeon_suspend_kms(struct drm_device *dev, 
bool suspend,

if (r) {
/* delay GPU reset to resume */
radeon_fence_driver_force_completion(rdev, i);
+} else {
+/* finish executing delayed work */
+flush_delayed_work(>fence_drv[i].lockup_work);
}
}
--
2.17.1







Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset protection for SRIOV

2022-01-03 Thread Christian König

Please don't. This patch is vital to the cleanup of the reset procedure.

If SRIOV doesn't work with that, we need to change SRIOV and not the driver.

Christian.

Am 30.12.21 um 19:45 schrieb Andrey Grodzovsky:

Sure, I guess I can drop this patch then.

Andrey

On 2021-12-24 4:57 a.m., JingWen Chen wrote:
I do agree with Shaoyun: if the host finds that the GPU engine hangs first 
and does the FLR, the guest-side thread may not know this and may still try to 
access the HW (e.g. kfd uses amdgpu_in_reset and reset_sem a lot to 
identify the reset status). This may lead to very bad results.


On 2021/12/24 下午4:58, Deng, Emily wrote:
These patches look good to me. JingWen will pull these patches, 
do some basic TDR tests in an SRIOV environment, and give feedback.


Best wishes
Emily Deng




-Original Message-
From: Liu, Monk 
Sent: Thursday, December 23, 2021 6:14 PM
To: Koenig, Christian ; Grodzovsky, Andrey
; dri-de...@lists.freedesktop.org; amd-
g...@lists.freedesktop.org; Chen, Horace ; Chen,
JingWen ; Deng, Emily 
Cc: dan...@ffwll.ch
Subject: RE: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
protection

for SRIOV

[AMD Official Use Only]

@Chen, Horace @Chen, JingWen @Deng, Emily

Please review Andrey's patch.

Thanks
---
Monk Liu | Cloud GPU & Virtualization Solution | AMD
---
we are hiring software manager for CVS core team
---

-Original Message-
From: Koenig, Christian 
Sent: Thursday, December 23, 2021 4:42 PM
To: Grodzovsky, Andrey ; dri-
de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Cc: dan...@ffwll.ch; Liu, Monk ; Chen, Horace

Subject: Re: [RFC v2 8/8] drm/amd/virt: Drop concurrent GPU reset 
protection

for SRIOV

Am 22.12.21 um 23:14 schrieb Andrey Grodzovsky:

Since flr work is now serialized against GPU resets, there is no need
for this.

Signed-off-by: Andrey Grodzovsky 

Acked-by: Christian König 


---
   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 11 ---
   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 11 ---
   2 files changed, 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 487cd654b69e..7d59a66e3988 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -248,15 +248,7 @@ static void xgpu_ai_mailbox_flr_work(struct

work_struct *work)

   struct amdgpu_device *adev = container_of(virt, struct

amdgpu_device, virt);

   int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;

-    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
- * otherwise the mailbox msg will be ruined/reseted by
- * the VF FLR.
- */
-    if (!down_write_trylock(>reset_sem))
-    return;
-
   amdgpu_virt_fini_data_exchange(adev);
-    atomic_set(>in_gpu_reset, 1);

   xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);

@@ -269,9 +261,6 @@ static void xgpu_ai_mailbox_flr_work(struct

work_struct *work)

   } while (timeout > 1);

   flr_done:
-    atomic_set(>in_gpu_reset, 0);
-    up_write(>reset_sem);
-
   /* Trigger recovery for world switch failure if no TDR */
   if (amdgpu_device_should_recover_gpu(adev)
   && (!amdgpu_device_has_job_running(adev) || diff --git
a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index e3869067a31d..f82c066c8e8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -277,15 +277,7 @@ static void xgpu_nv_mailbox_flr_work(struct

work_struct *work)

   struct amdgpu_device *adev = container_of(virt, struct

amdgpu_device, virt);

   int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;

-    /* block amdgpu_gpu_recover till msg FLR COMPLETE received,
- * otherwise the mailbox msg will be ruined/reseted by
- * the VF FLR.
- */
-    if (!down_write_trylock(>reset_sem))
-    return;
-
   amdgpu_virt_fini_data_exchange(adev);
-    atomic_set(>in_gpu_reset, 1);

   xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);

@@ -298,9 +290,6 @@ static void xgpu_nv_mailbox_flr_work(struct

work_struct *work)

   } while (timeout > 1);

   flr_done:
-    atomic_set(>in_gpu_reset, 0);
-    up_write(>reset_sem);
-
   /* Trigger recovery for world switch failure if no TDR */
   if (amdgpu_device_should_recover_gpu(adev)
   && (!amdgpu_device_has_job_running(adev) ||