RE: [PATCH 01/17] drm/amd/powerplay: move the API shared by SMU v11 to smu_v11_0.c

2020-07-15 Thread Quan, Evan
[AMD Official Use Only - Internal Distribution Only]

Updated in the new patch.

BR,
Evan
-Original Message-
From: Alex Deucher 
Sent: Thursday, July 16, 2020 1:58 AM
To: Quan, Evan 
Cc: amd-gfx list ; Deucher, Alexander 

Subject: Re: [PATCH 01/17] drm/amd/powerplay: move the API shared by SMU v11 to 
smu_v11_0.c

On Tue, Jul 14, 2020 at 4:04 AM Evan Quan  wrote:
>
> To fit our original design and to help maintain a clear code
> layering.
>
> Change-Id: Id89476c14709b5676bbf043371a27f27b94a58ed
> Signed-off-by: Evan Quan 
> ---
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c| 16 ---
>  drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  2 +-
>  .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  4 
>  drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h |  4 
>  drivers/gpu/drm/amd/powerplay/navi10_ppt.c|  4 ++--
>  .../drm/amd/powerplay/sienna_cichlid_ppt.c|  2 +-
>  drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 20 +--
>  7 files changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index 16ff64644e2e..0daea412d0a0 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -675,22 +675,6 @@ static int smu_late_init(void *handle)
> return 0;
>  }
>
> -int smu_get_atom_data_table(struct smu_context *smu, uint32_t table,
> -   uint16_t *size, uint8_t *frev, uint8_t *crev,
> -   uint8_t **addr)
> -{
> -   struct amdgpu_device *adev = smu->adev;
> -   uint16_t data_start;
> -
> -   if (!amdgpu_atom_parse_data_header(adev->mode_info.atom_context, 
> table,
> -  size, frev, crev, &data_start))
> -   return -EINVAL;
> -
> -   *addr = (uint8_t *)adev->mode_info.atom_context->bios + data_start;
> -
> -   return 0;
> -}
> -
>  static int smu_init_fb_allocations(struct smu_context *smu)
>  {
> struct amdgpu_device *adev = smu->adev;
> diff --git
> a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> index 56dc20a617fd..03361d0194fe 100644
> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> @@ -488,7 +488,7 @@ static int arcturus_append_powerplay_table(struct 
> smu_context *smu)
> index = 
> get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
>smc_dpm_info);
>
> -   ret = smu_get_atom_data_table(smu, index, NULL, NULL, NULL,
> +   ret = smu_v11_0_get_atom_data_table(smu, index, NULL, NULL,
> + NULL,
>   (uint8_t **)&smc_dpm_table);
> if (ret)
> return ret;
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index 52e5603dcc97..28894b8bab67 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -674,10 +674,6 @@ int smu_baco_exit(struct smu_context *smu);
>
>  int smu_mode2_reset(struct smu_context *smu);
>
> -extern int smu_get_atom_data_table(struct smu_context *smu, uint32_t table,
> -  uint16_t *size, uint8_t *frev, uint8_t 
> *crev,
> -  uint8_t **addr);
> -
>  extern const struct amd_ip_funcs smu_ip_funcs;
>
>  extern const struct amdgpu_ip_block_version smu_v11_0_ip_block;
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> index 9b724e4aceaa..8a4053d8eb8c 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> @@ -175,6 +175,10 @@ int smu_v11_0_fini_power(struct smu_context
> *smu);
>
>  int smu_v11_0_check_fw_status(struct smu_context *smu);
>
> +int smu_v11_0_get_atom_data_table(struct smu_context *smu, uint32_t table,
> +   uint16_t *size, uint8_t *frev, uint8_t *crev,
> +   uint8_t **addr);
> +
>  int smu_v11_0_setup_pptable(struct smu_context *smu);
>
>  int smu_v11_0_get_vbios_bootup_values(struct smu_context *smu);
> diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> index 41bd6d157271..ff717b800086 100644
> --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> @@ -467,7 +467,7 @@ static int navi10_append_powerplay_table(struct 
> smu_context *smu)
> index = 
> get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
>smc_dpm_info);
>
> -   ret = smu_get_atom_data_table(smu, index, NULL, NULL, NULL,
> +   ret = smu_v11_0_get_atom_data_table(smu, index, NULL, NULL,
> + NULL,
>   (uint8_t 

[PATCH] drm/amd/powerplay: widely share the API for data table retrieving

2020-07-15 Thread Evan Quan
Since data table retrieval can be shared more widely,
amdgpu_atombios.c is the right place for it.

Change-Id: Id89476c14709b5676bbf043371a27f27b94a58ed
Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c| 17 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h|  7 +++
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c  | 16 
 drivers/gpu/drm/amd/powerplay/arcturus_ppt.c|  3 ++-
 drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h  |  4 
 drivers/gpu/drm/amd/powerplay/navi10_ppt.c  |  5 +++--
 .../gpu/drm/amd/powerplay/sienna_cichlid_ppt.c  |  3 ++-
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c   |  5 +++--
 8 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
index c687432da426..29f767e026e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.c
@@ -2036,3 +2036,20 @@ int amdgpu_atombios_init(struct amdgpu_device *adev)
return 0;
 }
 
+int amdgpu_atombios_get_data_table(struct amdgpu_device *adev,
+  uint32_t table,
+  uint16_t *size,
+  uint8_t *frev,
+  uint8_t *crev,
+  uint8_t **addr)
+{
+   uint16_t data_start;
+
+   if (!amdgpu_atom_parse_data_header(adev->mode_info.atom_context, table,
+  size, frev, crev, &data_start))
+   return -EINVAL;
+
+   *addr = (uint8_t *)adev->mode_info.atom_context->bios + data_start;
+
+   return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h
index fd8f18074f7a..1321ec09c734 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atombios.h
@@ -216,6 +216,13 @@ int amdgpu_atombios_get_svi2_info(struct amdgpu_device 
*adev,
  u8 voltage_type,
  u8 *svd_gpio_id, u8 *svc_gpio_id);
 
+int amdgpu_atombios_get_data_table(struct amdgpu_device *adev,
+  uint32_t table,
+  uint16_t *size,
+  uint8_t *frev,
+  uint8_t *crev,
+  uint8_t **addr);
+
 void amdgpu_atombios_fini(struct amdgpu_device *adev);
 int amdgpu_atombios_init(struct amdgpu_device *adev);
 
diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 03125c8a2145..01d669a36e1f 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -676,22 +676,6 @@ static int smu_late_init(void *handle)
return 0;
 }
 
-int smu_get_atom_data_table(struct smu_context *smu, uint32_t table,
-   uint16_t *size, uint8_t *frev, uint8_t *crev,
-   uint8_t **addr)
-{
-   struct amdgpu_device *adev = smu->adev;
-   uint16_t data_start;
-
-   if (!amdgpu_atom_parse_data_header(adev->mode_info.atom_context, table,
-  size, frev, crev, &data_start))
-   return -EINVAL;
-
-   *addr = (uint8_t *)adev->mode_info.atom_context->bios + data_start;
-
-   return 0;
-}
-
 static int smu_init_fb_allocations(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c 
b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
index 56dc20a617fd..578c50e294c7 100644
--- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
@@ -27,6 +27,7 @@
 #include "smu_internal.h"
 #include "atomfirmware.h"
 #include "amdgpu_atomfirmware.h"
+#include "amdgpu_atombios.h"
 #include "smu_v11_0.h"
 #include "smu11_driver_if_arcturus.h"
 #include "soc15_common.h"
@@ -488,7 +489,7 @@ static int arcturus_append_powerplay_table(struct 
smu_context *smu)
index = 
get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
   smc_dpm_info);
 
-   ret = smu_get_atom_data_table(smu, index, NULL, NULL, NULL,
+   ret = amdgpu_atombios_get_data_table(smu->adev, index, NULL, NULL, NULL,
  (uint8_t **)&smc_dpm_table);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
index 70181ba7ee0c..ba9beffb887d 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
@@ -678,10 +678,6 @@ bool smu_mode1_reset_is_support(struct smu_context *smu);
 int smu_mode1_reset(struct smu_context *smu);
 int smu_mode2_reset(struct smu_context *smu);
 
-extern int 

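(For readers following the refactor: a minimal sketch of how a caller outside
the SMU stack can use the relocated helper. The table choice, struct, and
function name below are illustrative only, not part of the patch.)

/* Illustrative sketch: fetch the firmware_info data table through the
 * new shared API.  Assumes atomfirmware.h for the master-table helpers;
 * firmwareinfo is just an example table.
 */
static int example_read_firmware_info(struct amdgpu_device *adev)
{
	struct atom_firmware_info_v3_1 *fw_info;
	uint16_t size;
	uint8_t frev, crev;
	int index, ret;

	index = get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
					    firmwareinfo);

	ret = amdgpu_atombios_get_data_table(adev, index, &size, &frev, &crev,
					     (uint8_t **)&fw_info);
	if (ret)
		return ret;

	/* fw_info points into the VBIOS image held by atom_context;
	 * treat it as read-only.
	 */
	dev_dbg(adev->dev, "firmware_info v%u.%u, %u bytes\n", frev, crev, size);
	return 0;
}
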
Re: [PATCH 10/10] drm/amd/display: handle failed allocation during stream construction

2020-07-15 Thread Sasha Levin
Hi

[This is an automated email]

This commit has been processed because it contains a -stable tag.
The stable tag indicates that it's relevant for the following trees: all

The bot has tested the following trees: v5.7.8, v5.4.51, v4.19.132, v4.14.188, 
v4.9.230, v4.4.230.

v5.7.8: Build OK!
v5.4.51: Failed to apply! Possible dependencies:
d9e32672a1285 ("drm/amd/display: cleanup of construct and destruct funcs")

v4.19.132: Failed to apply! Possible dependencies:
0e3d73f1a440e ("drm/amd/display: Add Raven2 definitions in dc")
1e7e86c43f38d ("drm/amd/display: decouple front and backend pgm using 
dpms_off as backend enable flag")
21e471f0850de ("drm/amd/display: Set dispclk and dprefclock directly")
24f7dd7ea98dc ("drm/amd/display: move pplib/smu notification to dccg block")
4e60536d093f4 ("drm/amd/display: Set DFS bypass flags for dce110")
5a83c93249098 ("drm/amd/display: Add support for toggling DFS bypass")
76d981a9fe823 ("Revert "drm/amd/display: make clk_mgr call enable_pme_wa"")
7ed4e6352c16f ("drm/amd/display: Add DCN2 HW Sequencer and Resource")
84e7fc05a9270 ("drm/amd/display: rename dccg to clk_mgr")
8c3db1284a016 ("drm/amdgpu: fill in 
amdgpu_dm_remove_sink_from_freesync_module")
98e6436d3af5f ("drm/amd/display: Refactor FreeSync module")
ad908423ef86f ("drm/amd/display: support 48 MHZ refclk off")
d9673c920c035 ("drm/amd/display: Pass init_data into DCN resource creation")
d9e32672a1285 ("drm/amd/display: cleanup of construct and destruct funcs")

v4.14.188: Failed to apply! Possible dependencies:
1b0c0f9dc5ca6 ("drm/amdgpu: move userptr BOs to CPU domain during CS v2")
1ed3d2567c800 ("drm/amdgpu: keep the MMU lock until the update ends v4")
3fe89771cb0a6 ("drm/amdgpu: stop reserving the BO in the MMU callback v3")
4562236b3bc0a ("drm/amd/dc: Add dc display driver (v2)")
60de1c1740f39 ("drm/amdgpu: use a rw_semaphore for MMU notifiers")
9a18999640fa6 ("drm/amdgpu: move MMU notifier related defines to 
amdgpu_mn.h")
9cca0b8e5df0a ("drm/amdgpu: move amdgpu_cs_sysvm_access_required into 
find_mapping")
a216ab09955d6 ("drm/amdgpu: fix userptr put_page handling")
b72cf4fca2bb7 ("drm/amdgpu: move taking mmap_sem into get_user_pages v2")
ca666a3c298f8 ("drm/amdgpu: stop using BO status for user pages")

v4.9.230: Failed to apply! Possible dependencies:
1cec20f0ea0e3 ("dma-buf: Restart reservation_object_wait_timeout_rcu() 
after writes")
3fe89771cb0a6 ("drm/amdgpu: stop reserving the BO in the MMU callback v3")
4562236b3bc0a ("drm/amd/dc: Add dc display driver (v2)")
4df654d293c64 ("drm/amdgpu: move amdgpu_uvd structure to uvd header")
5e5681788befb ("drm/amdgpu: move amdgpu_vce structure to vce header")
660e855813f78 ("amdgpu: use drm sync objects for shared semaphores (v6)")
78010cd9736ec ("dma-buf/fence: add an lockdep_assert_held()")
95aa13f6b196d ("drm/amdgpu: move amdgpu_vcn structure to vcn header")
95d0906f85065 ("drm/amdgpu: add initial vcn support and decode tests")
9a18999640fa6 ("drm/amdgpu: move MMU notifier related defines to 
amdgpu_mn.h")
b636922553ee2 ("drm/amdgpu: only move VM BOs in the LRU during validation 
v2")
b72cf4fca2bb7 ("drm/amdgpu: move taking mmap_sem into get_user_pages v2")
f54d1867005c3 ("dma-buf: Rename struct fence to dma_fence")
fedf54132d241 ("dma-buf: Restart reservation_object_get_fences_rcu() after 
writes")

v4.4.230: Failed to apply! Possible dependencies:
1f7371b2a5faf ("drm/amd/powerplay: add basic powerplay framework")
288912cb95d15 ("drm/amdgpu: use $(src) in Makefile (v2)")
37cd0ca204a55 ("drm/amdgpu: unify AMDGPU_CTX_MAX_CS_PENDING and 
amdgpu_sched_jobs")
3c0eea6c35d93 ("drm/amdgpu: put VM page tables directly into duplicates 
list")
3f99dd814a6fd ("drm/amdgpu: save and restore UVD context with suspend and 
resume")
4325198180e5a ("drm/amdgpu: remove GART page addr array")
4562236b3bc0a ("drm/amd/dc: Add dc display driver (v2)")
4acabfe3793eb ("drm/amdgpu: fix num_ibs check")
4df654d293c64 ("drm/amdgpu: move amdgpu_uvd structure to uvd header")
50838c8cc413d ("drm/amdgpu: add proper job alloc/free functions")
56467ebfb2548 ("drm/amdgpu: split VM PD and PT handling during CS")
5e5681788befb ("drm/amdgpu: move amdgpu_vce structure to vce header")
7270f8391df1a ("drm/amdgpu: add amdgpu_set_ib_value helper (v2)")
95aa13f6b196d ("drm/amdgpu: move amdgpu_vcn structure to vcn header")
9a18999640fa6 ("drm/amdgpu: move MMU notifier related defines to 
amdgpu_mn.h")
a1d29476d666f ("drm/amdgpu: optionally enable GART debugfs file")
a8fe58cec351c ("drm/amd: add ACP driver support")
c036554170fcc ("drm/amdgpu: handle more than 10 UVD sessions (v2)")
c3cca41e6249e ("drm/amdgpu: cleanup amdgpu_cs_parser structure")
cadf97b196a1e ("drm/amdgpu: clean up non-scheduler code path (v2)")
cd75dc6887f1e ("drm/amdgpu: separate pushing CS to 

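(Background on these automated reports: the bot keys off a stable tag in the
commit message. Per the stable-kernel rules, the usual form is a trailer like
the following; the hash and subject in the Fixes line are invented here for
illustration, and the version annotation is optional.)

Fixes: 123456789abc ("drm/amd/display: some earlier change")
Cc: stable@vger.kernel.org # v4.19+
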
Re: [PATCH 04/10] drm/amd/display: OLED panel backlight adjust not work with external display connected

2020-07-15 Thread Sasha Levin
Hi

[This is an automated email]

This commit has been processed because it contains a -stable tag.
The stable tag indicates that it's relevant for the following trees: all

The bot has tested the following trees: v5.7.8, v5.4.51, v4.19.132, v4.14.188, 
v4.9.230, v4.4.230.

v5.7.8: Build OK!
v5.4.51: Failed to apply! Possible dependencies:
945628101be55 ("drm/amd/display: Add backlight support via AUX")

v4.19.132: Failed to apply! Possible dependencies:
0cafc82fae415 ("drm/amd/display: set backlight level limit to 1")
11c3ee48bd7c2 ("drm/amdgpu/display: add support for LVDS (v5)")
1e7e86c43f38d ("drm/amd/display: decouple front and backend pgm using 
dpms_off as backend enable flag")
206bbafe00dca ("drm/amd: Query and use ACPI backlight caps")
262485a50fd45 ("drm/amd/display: Expand dc to use 16.16 bit backlight")
694d0775ca94b ("drm/amd: Don't fail on backlight = 0")
8c3db1284a016 ("drm/amdgpu: fill in 
amdgpu_dm_remove_sink_from_freesync_module")
945628101be55 ("drm/amd/display: Add backlight support via AUX")
98e6436d3af5f ("drm/amd/display: Refactor FreeSync module")
aa9c4abe466ac ("drm/amd/display: Refactor FPGA-specific link setup")

v4.14.188: Failed to apply! Possible dependencies:
1b0c0f9dc5ca6 ("drm/amdgpu: move userptr BOs to CPU domain during CS v2")
1ed3d2567c800 ("drm/amdgpu: keep the MMU lock until the update ends v4")
3fe89771cb0a6 ("drm/amdgpu: stop reserving the BO in the MMU callback v3")
4562236b3bc0a ("drm/amd/dc: Add dc display driver (v2)")
60de1c1740f39 ("drm/amdgpu: use a rw_semaphore for MMU notifiers")
945628101be55 ("drm/amd/display: Add backlight support via AUX")
9a18999640fa6 ("drm/amdgpu: move MMU notifier related defines to 
amdgpu_mn.h")
9cca0b8e5df0a ("drm/amdgpu: move amdgpu_cs_sysvm_access_required into 
find_mapping")
a216ab09955d6 ("drm/amdgpu: fix userptr put_page handling")
b72cf4fca2bb7 ("drm/amdgpu: move taking mmap_sem into get_user_pages v2")
ca666a3c298f8 ("drm/amdgpu: stop using BO status for user pages")

v4.9.230: Failed to apply! Possible dependencies:
1cec20f0ea0e3 ("dma-buf: Restart reservation_object_wait_timeout_rcu() 
after writes")
3fe89771cb0a6 ("drm/amdgpu: stop reserving the BO in the MMU callback v3")
4562236b3bc0a ("drm/amd/dc: Add dc display driver (v2)")
4df654d293c64 ("drm/amdgpu: move amdgpu_uvd structure to uvd header")
5e5681788befb ("drm/amdgpu: move amdgpu_vce structure to vce header")
660e855813f78 ("amdgpu: use drm sync objects for shared semaphores (v6)")
78010cd9736ec ("dma-buf/fence: add an lockdep_assert_held()")
945628101be55 ("drm/amd/display: Add backlight support via AUX")
95aa13f6b196d ("drm/amdgpu: move amdgpu_vcn structure to vcn header")
95d0906f85065 ("drm/amdgpu: add initial vcn support and decode tests")
9a18999640fa6 ("drm/amdgpu: move MMU notifier related defines to 
amdgpu_mn.h")
b636922553ee2 ("drm/amdgpu: only move VM BOs in the LRU during validation 
v2")
b72cf4fca2bb7 ("drm/amdgpu: move taking mmap_sem into get_user_pages v2")
f54d1867005c3 ("dma-buf: Rename struct fence to dma_fence")
fedf54132d241 ("dma-buf: Restart reservation_object_get_fences_rcu() after 
writes")

v4.4.230: Failed to apply! Possible dependencies:
1f7371b2a5faf ("drm/amd/powerplay: add basic powerplay framework")
288912cb95d15 ("drm/amdgpu: use $(src) in Makefile (v2)")
37cd0ca204a55 ("drm/amdgpu: unify AMDGPU_CTX_MAX_CS_PENDING and 
amdgpu_sched_jobs")
3c0eea6c35d93 ("drm/amdgpu: put VM page tables directly into duplicates 
list")
3f99dd814a6fd ("drm/amdgpu: save and restore UVD context with suspend and 
resume")
4325198180e5a ("drm/amdgpu: remove GART page addr array")
4562236b3bc0a ("drm/amd/dc: Add dc display driver (v2)")
4acabfe3793eb ("drm/amdgpu: fix num_ibs check")
4df654d293c64 ("drm/amdgpu: move amdgpu_uvd structure to uvd header")
50838c8cc413d ("drm/amdgpu: add proper job alloc/free functions")
56467ebfb2548 ("drm/amdgpu: split VM PD and PT handling during CS")
5e5681788befb ("drm/amdgpu: move amdgpu_vce structure to vce header")
7270f8391df1a ("drm/amdgpu: add amdgpu_set_ib_value helper (v2)")
945628101be55 ("drm/amd/display: Add backlight support via AUX")
95aa13f6b196d ("drm/amdgpu: move amdgpu_vcn structure to vcn header")
9a18999640fa6 ("drm/amdgpu: move MMU notifier related defines to 
amdgpu_mn.h")
a1d29476d666f ("drm/amdgpu: optionally enable GART debugfs file")
a8fe58cec351c ("drm/amd: add ACP driver support")
c036554170fcc ("drm/amdgpu: handle more than 10 UVD sessions (v2)")
c3cca41e6249e ("drm/amdgpu: cleanup amdgpu_cs_parser structure")
cadf97b196a1e ("drm/amdgpu: clean up non-scheduler code path (v2)")
cd75dc6887f1e ("drm/amdgpu: separate pushing CS to scheduler")
d71518b5aa7c9 ("drm/amdgpu: cleanup in kernel job submission")
d7af97dbccf01 ("drm/amdgpu: send UVD IB tests 

[pull] amdgpu drm-fixes-5.8

2020-07-15 Thread Alex Deucher
Hi Dave, Daniel,

Fixes for 5.8.

The following changes since commit 38794a5465b752118098e36cf95c59083f9f1f88:

  Merge tag 'amd-drm-fixes-5.8-2020-07-09' of 
git://people.freedesktop.org/~agd5f/linux into drm-fixes (2020-07-10 07:02:02 
+1000)

are available in the Git repository at:

  git://people.freedesktop.org/~agd5f/linux tags/amd-drm-fixes-5.8-2020-07-15

for you to fetch changes up to 05051496b2622e4d12e2036b35165969aa502f89:

  drm/amdgpu/sdma5: fix wptr overwritten in ->get_wptr() (2020-07-14 15:42:17 
-0400)


amd-drm-fixes-5.8-2020-07-15:

amdgpu:
- Fix a race condition with KIQ
- Preemption fix
- Fix handling of fake MST encoders
- OLED panel fix
- Handle allocation failure in stream construction
- Renoir SMC fix
- SDMA 5.x fix


Alex Deucher (1):
  drm/amdgpu/display: create fake mst encoders ahead of time (v4)

Jack Xiao (2):
  drm/amdgpu/gfx10: fix race condition for kiq
  drm/amdgpu: fix preemption unit test

Josip Pavic (1):
  drm/amd/display: handle failed allocation during stream construction

Xiaojie Yuan (1):
  drm/amdgpu/sdma5: fix wptr overwritten in ->get_wptr()

chen gong (1):
  drm/amdgpu/powerplay: Modify SMC message name for setting power profile 
mode

hersen wu (1):
  drm/amd/display: OLED panel backlight adjust not work with external 
display connected

 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c| 20 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c |  9 +++-
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 26 ---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c  | 14 ++
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h  | 11 -
 .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.c| 53 +++---
 .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.h|  3 ++
 drivers/gpu/drm/amd/display/dc/core/dc_stream.c| 19 ++--
 drivers/gpu/drm/amd/powerplay/renoir_ppt.c |  2 +-
 9 files changed, 101 insertions(+), 56 deletions(-)


Re: [PATCH 01/17] drm/amd/powerplay: move the API shared by SMU v11 to smu_v11_0.c

2020-07-15 Thread Alex Deucher
On Tue, Jul 14, 2020 at 4:04 AM Evan Quan  wrote:
>
> To fit our original design and to help maintain
> a clear code layering.
>
> Change-Id: Id89476c14709b5676bbf043371a27f27b94a58ed
> Signed-off-by: Evan Quan 
> ---
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c| 16 ---
>  drivers/gpu/drm/amd/powerplay/arcturus_ppt.c  |  2 +-
>  .../gpu/drm/amd/powerplay/inc/amdgpu_smu.h|  4 
>  drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h |  4 
>  drivers/gpu/drm/amd/powerplay/navi10_ppt.c|  4 ++--
>  .../drm/amd/powerplay/sienna_cichlid_ppt.c|  2 +-
>  drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 20 +--
>  7 files changed, 26 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
> b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index 16ff64644e2e..0daea412d0a0 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -675,22 +675,6 @@ static int smu_late_init(void *handle)
> return 0;
>  }
>
> -int smu_get_atom_data_table(struct smu_context *smu, uint32_t table,
> -   uint16_t *size, uint8_t *frev, uint8_t *crev,
> -   uint8_t **addr)
> -{
> -   struct amdgpu_device *adev = smu->adev;
> -   uint16_t data_start;
> -
> -   if (!amdgpu_atom_parse_data_header(adev->mode_info.atom_context, 
> table,
> -  size, frev, crev, &data_start))
> -   return -EINVAL;
> -
> -   *addr = (uint8_t *)adev->mode_info.atom_context->bios + data_start;
> -
> -   return 0;
> -}
> -
>  static int smu_init_fb_allocations(struct smu_context *smu)
>  {
> struct amdgpu_device *adev = smu->adev;
> diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> index 56dc20a617fd..03361d0194fe 100644
> --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
> @@ -488,7 +488,7 @@ static int arcturus_append_powerplay_table(struct 
> smu_context *smu)
> index = 
> get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
>smc_dpm_info);
>
> -   ret = smu_get_atom_data_table(smu, index, NULL, NULL, NULL,
> +   ret = smu_v11_0_get_atom_data_table(smu, index, NULL, NULL, NULL,
>   (uint8_t **)&smc_dpm_table);
> if (ret)
> return ret;
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h 
> b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index 52e5603dcc97..28894b8bab67 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -674,10 +674,6 @@ int smu_baco_exit(struct smu_context *smu);
>
>  int smu_mode2_reset(struct smu_context *smu);
>
> -extern int smu_get_atom_data_table(struct smu_context *smu, uint32_t table,
> -  uint16_t *size, uint8_t *frev, uint8_t 
> *crev,
> -  uint8_t **addr);
> -
>  extern const struct amd_ip_funcs smu_ip_funcs;
>
>  extern const struct amdgpu_ip_block_version smu_v11_0_ip_block;
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h 
> b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> index 9b724e4aceaa..8a4053d8eb8c 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/smu_v11_0.h
> @@ -175,6 +175,10 @@ int smu_v11_0_fini_power(struct smu_context *smu);
>
>  int smu_v11_0_check_fw_status(struct smu_context *smu);
>
> +int smu_v11_0_get_atom_data_table(struct smu_context *smu, uint32_t table,
> +   uint16_t *size, uint8_t *frev, uint8_t *crev,
> +   uint8_t **addr);
> +
>  int smu_v11_0_setup_pptable(struct smu_context *smu);
>
>  int smu_v11_0_get_vbios_bootup_values(struct smu_context *smu);
> diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> index 41bd6d157271..ff717b800086 100644
> --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
> @@ -467,7 +467,7 @@ static int navi10_append_powerplay_table(struct 
> smu_context *smu)
> index = 
> get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
>smc_dpm_info);
>
> -   ret = smu_get_atom_data_table(smu, index, NULL, NULL, NULL,
> +   ret = smu_v11_0_get_atom_data_table(smu, index, NULL, NULL, NULL,
>   (uint8_t **)&smc_dpm_table);
> if (ret)
> return ret;
> @@ -487,7 +487,7 @@ static int navi10_append_powerplay_table(struct 
> smu_context *smu)
> sizeof(*smc_dpm_table) - 
> sizeof(smc_dpm_table->table_header));
> break;
> case 7: /* nv12 */
> -   ret = 

Re: [PATCH 01/42] drm/amdgpu: expand to add multiple trap event irq id

2020-07-15 Thread Alex Deucher
On Wed, Jul 15, 2020 at 9:24 AM Alex Deucher  wrote:
>
> On Wed, Jul 15, 2020 at 5:21 AM Christian König
>  wrote:
> >
> > On 14.07.20 at 20:23, Alex Deucher wrote:
> > > From: Huang Rui 
> > >
> > > Sienna_cichlid has four sdma instances, but other chips don't.
> > > So we need to expand to add multiple trap event irq id in sdma
> > > v5.2.
> > >
> > > Signed-off-by: Huang Rui 
> > > Reviewed-by: Alex Deucher 
> > > Signed-off-by: Alex Deucher 
> >
> > Reviewed-by: Christian König 
> >
> > But side question why do we have the _Sienna_Cichlid postfix on the define?
>
> I suspect when it was originally added it was specific to sienna
> cichlid, but it should be dropped since it's generic.

Just checked and it's specific to this family of asics.  Other asics
use a different client id for SDMA3.  See soc15_ih_clientid.h.

Alex

>
> Alex
>
>
> >
> > Christian.
> >
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 67 --
> > >   1 file changed, 41 insertions(+), 26 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
> > > b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > > index 824f3e23c3d9..de8342283fdb 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > > @@ -1165,6 +1165,40 @@ static int sdma_v5_2_early_init(void *handle)
> > >   return 0;
> > >   }
> > >
> > > +static unsigned sdma_v5_2_seq_to_irq_id(int seq_num)
> > > +{
> > > + switch (seq_num) {
> > > + case 0:
> > > + return SOC15_IH_CLIENTID_SDMA0;
> > > + case 1:
> > > + return SOC15_IH_CLIENTID_SDMA1;
> > > + case 2:
> > > + return SOC15_IH_CLIENTID_SDMA2;
> > > + case 3:
> > > + return SOC15_IH_CLIENTID_SDMA3_Sienna_Cichlid;
> > > + default:
> > > + break;
> > > + }
> > > + return -EINVAL;
> > > +}
> > > +
> > > +static unsigned sdma_v5_2_seq_to_trap_id(int seq_num)
> > > +{
> > > + switch (seq_num) {
> > > + case 0:
> > > + return SDMA0_5_0__SRCID__SDMA_TRAP;
> > > + case 1:
> > > + return SDMA1_5_0__SRCID__SDMA_TRAP;
> > > + case 2:
> > > + return SDMA2_5_0__SRCID__SDMA_TRAP;
> > > + case 3:
> > > + return SDMA3_5_0__SRCID__SDMA_TRAP;
> > > + default:
> > > + break;
> > > + }
> > > + return -EINVAL;
> > > +}
> > > +
> > >   static int sdma_v5_2_sw_init(void *handle)
> > >   {
> > >   struct amdgpu_ring *ring;
> > > @@ -1172,32 +1206,13 @@ static int sdma_v5_2_sw_init(void *handle)
> > >   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > >
> > >   /* SDMA trap event */
> > > - r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA0,
> > > -   SDMA0_5_0__SRCID__SDMA_TRAP,
> > > -   &adev->sdma.trap_irq);
> > > - if (r)
> > > - return r;
> > > -
> > > - /* SDMA trap event */
> > > - r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA1,
> > > -   SDMA1_5_0__SRCID__SDMA_TRAP,
> > > -   &adev->sdma.trap_irq);
> > > - if (r)
> > > - return r;
> > > -
> > > - /* SDMA trap event */
> > > - r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA2,
> > > -   SDMA2_5_0__SRCID__SDMA_TRAP,
> > > -   &adev->sdma.trap_irq);
> > > - if (r)
> > > - return r;
> > > -
> > > - /* SDMA trap event */
> > > - r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA3_Sienna_Cichlid,
> > > -   SDMA3_5_0__SRCID__SDMA_TRAP,
> > > -   &adev->sdma.trap_irq);
> > > - if (r)
> > > - return r;
> > > + for (i = 0; i < adev->sdma.num_instances; i++) {
> > > + r = amdgpu_irq_add_id(adev, sdma_v5_2_seq_to_irq_id(i),
> > > +   sdma_v5_2_seq_to_trap_id(i),
> > > +   &adev->sdma.trap_irq);
> > > + if (r)
> > > + return r;
> > > + }
> > >
> > >   r = sdma_v5_2_init_microcode(adev);
> > >   if (r) {
> >


Re: [PATCH] drm/amd/powerplay: suppress compile error around BUG_ON

2020-07-15 Thread Deucher, Alexander
[AMD Public Use]

Acked-by: Alex Deucher 

From: Quan, Evan 
Sent: Wednesday, July 15, 2020 2:52 AM
To: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Quan, Evan 

Subject: [PATCH] drm/amd/powerplay: suppress compile error around BUG_ON

To suppress the compile error below for "ARCH=arc".
   drivers/gpu/drm/amd/amdgpu/../powerplay/arcturus_ppt.c: In function 
'arcturus_fill_eeprom_i2c_req':
>> arch/arc/include/asm/bug.h:22:2: error: implicit declaration of function 
>> 'pr_warn'; did you mean 'pci_warn'? [-Werror=implicit-function-declaration]
  22 |  pr_warn("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, 
__func__); \
 |  ^~~
   include/asm-generic/bug.h:62:57: note: in expansion of macro 'BUG'
  62 | #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } 
while (0)
 | ^~~
   drivers/gpu/drm/amd/amdgpu/../powerplay/arcturus_ppt.c:2157:2: note: in 
expansion of macro 'BUG_ON'
2157 |  BUG_ON(numbytes > MAX_SW_I2C_COMMANDS);

Change-Id: I314b0d08197398a04b5439bce6546c4c68ca5dff
Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c 
b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
index fde6a8242565..0784a97bd67b 100644
--- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
@@ -1881,8 +1881,6 @@ static void arcturus_fill_eeprom_i2c_req(SwI2cRequest_t  
*req, bool write,
 {
 int i;

-   BUG_ON(numbytes > MAX_SW_I2C_COMMANDS);
-
 req->I2CcontrollerPort = 0;
 req->I2CSpeed = 2;
 req->SlaveAddress = address;
@@ -1920,6 +1918,12 @@ static int arcturus_i2c_eeprom_read_data(struct 
i2c_adapter *control,
 struct smu_table_context *smu_table = &adev->smu.smu_table;
 struct smu_table *table = &smu_table->driver_table;

+   if (numbytes > MAX_SW_I2C_COMMANDS) {
+   dev_err(adev->dev, "numbytes requested %d is over max allowed 
%d\n",
+   numbytes, MAX_SW_I2C_COMMANDS);
+   return -EINVAL;
+   }
+
 memset(&req, 0, sizeof(req));
 arcturus_fill_eeprom_i2c_req(&req, false, address, numbytes, data);

@@ -1956,6 +1960,12 @@ static int arcturus_i2c_eeprom_write_data(struct 
i2c_adapter *control,
 SwI2cRequest_t req;
 struct amdgpu_device *adev = to_amdgpu_device(control);

+   if (numbytes > MAX_SW_I2C_COMMANDS) {
+   dev_err(adev->dev, "numbytes requested %d is over max allowed 
%d\n",
+   numbytes, MAX_SW_I2C_COMMANDS);
+   return -EINVAL;
+   }
+
 memset(&req, 0, sizeof(req));
 arcturus_fill_eeprom_i2c_req(&req, true, address, numbytes, data);

--
2.27.0



Re: Failed to find memory space for buffer eviction

2020-07-15 Thread Felix Kuehling

On 2020-07-15 at 5:28 a.m., Christian König wrote:
> On 15.07.20 at 04:49, Felix Kuehling wrote:
>> On 2020-07-14 at 4:28 a.m., Christian König wrote:
>>> Hi Felix,
>>>
>>> yes I already stumbled over this as well quite recently.
>>>
>>> See the following patch which I pushed to drm-misc-next just yesterday:
>>>
>>> commit e04be2310b5eac683ec03b096c0e22c4c2e23593
>>> Author: Christian König 
>>> Date:   Mon Jul 6 17:32:55 2020 +0200
>>>
>>>  drm/ttm: further cleanup ttm_mem_reg handling
>>>
>>>  Stop touching the backend private pointer altogether and
>>>  make sure we never put the same mem twice by.
>>>
>>>  Signed-off-by: Christian König 
>>>  Reviewed-by: Madhav Chauhan 
>>>  Link:
>>> https://patchwork.freedesktop.org/patch/375613/
>>>
>>>
>>> But this shouldn't have been problematic since we used a dummy value
>>> for mem->mm_node in this case.
>> Hmm, yeah, I was reading the code wrong. It's possible that I was really
>> just out of GTT space. But see below.
>
> It looks like it yes.

I checked. I don't see a general GTT space leak. During the eviction
test the GTT usage spikes, but after finishing the test, GTT usage goes
back down to 7MB.


>
>>> What could be problematic and result in an overrun is that TTM was
>>> buggy and called put_node twice for the same memory.
>>>
>>> So I've seen that the code needs fixing as well, but I'm not 100% sure
>>> how you ran into your problem.
>> This is in the KFD eviction test, which deliberately overcommits VRAM in
>> order to trigger lots of evictions. It will use some GTT space while BOs
>> are evicted. But shouldn't it move them further out of GTT and into
>> SYSTEM to free up GTT space?
>
> Yes, exactly that should happen.
>
> But for some reason it couldn't find a candidate to evict and the
> 14371 pages left are just a bit too small for the buffer.

That would be a nested eviction. A VRAM to GTT eviction requires a GTT
to SYSTEM eviction to make space in GTT. Is that even possible?

Regards,
  Felix


>
> Regards,
> Christian.
>
>> Your change "further cleanup ttm_mem_reg handling" removes a
>> mem->mm_node = NULL in ttm_bo_handle_move_mem in exactly the case where
>> a BO is moved from GTT to SYSTEM. I think that leads to a later put_node
>> call not happening or amdgpu_gtt_mgr_del returning before incrementing
>> mgr->available.
>>
>> I can try if cherry-picking your two fixes will help with the
>> eviction test.
>>
>> Regards,
>>    Felix
>>
>>
>>> Regards,
>>> Christian.
>>>
>>> Am 14.07.20 um 02:44 schrieb Felix Kuehling:
 I'm running into this problem with the KFD EvictionTest. The log
 snippet
 below looks like it ran out of GTT space for the eviction of a 64MB
 buffer. But then it dumps the used and free space and shows plenty of
 free space.

 As I understand it, the per-page breakdown of used and free space
 shown
 by TTM is the GART space. So it's not very meaningful.

 What matters more is the GTT space managed by amdgpu_gtt_mgr.c. And
 that's where the problem is. It keeps track of available GTT space
 with
 an atomic counter in amdgpu_gtt_mgr.available. It gets decremented in
 amdgpu_gtt_mgr_new and incremented in amdgpu_gtt_mgr_del. The trouble
 is, that TTM doesn't call the latter for ttm_mem_regs that don't
 have an
 mm_node:

> void ttm_bo_mem_put(struct ttm_buffer_object *bo, struct ttm_mem_reg
> *mem)
> {
>   struct ttm_mem_type_manager *man =
>   &bo->bdev->man[mem->mem_type];
>
>   if (mem->mm_node)
>   (*man->func->put_node)(man, mem);
> }
 GTT BOs that don't have GART space allocated, don't have an
 mm_node. So
 the amdgpu_gtt_mgr.available counter doesn't get incremented when an
 unmapped GTT BO is freed, and eventually runs out of space.

 Now I know what the problem is, but I don't know how to fix it.
 Maybe a
 dummy-mm_node for unmapped GTT BOs, to trick TTM into calling our
 put_node callback? Or a change in TTM to call put_node
 unconditionally?

 Regards,
     Felix


 [  360.082552] [TTM] Failed to find memory space for buffer
 0x264c823c eviction
 [  360.090331] [TTM]  No space for 264c823c (16384 pages,
 65536K, 64M)
 [  360.090334] [TTM]    placement[0]=0x00010002 (1)
 [  360.090336] [TTM]  has_type: 1
 [  360.090337] [TTM]  use_type: 1
 [  360.090339] [TTM]  flags: 0x000A
 [  360.090341] [TTM]  gpu_offset: 0xFF
 [  360.090342] [TTM]  size: 1048576
 [  360.090344] [TTM]  available_caching: 0x0007
 [  360.090346] [TTM]  

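(A rough sketch of the second option Felix floats above, i.e. calling
put_node unconditionally. This assumes every manager's put_node, like
amdgpu_gtt_mgr_del, can cope with a mem that never received an mm_node,
which would need auditing; it is not the actual fix that landed.)

/* Sketch: let the manager see every put, even for GTT BOs that never
 * got a GART range, so driver-side accounting (e.g. the atomic
 * amdgpu_gtt_mgr.available counter) can be corrected.
 */
void ttm_bo_mem_put(struct ttm_buffer_object *bo, struct ttm_mem_reg *mem)
{
	struct ttm_mem_type_manager *man = &bo->bdev->man[mem->mem_type];

	/* System memory has no func; other managers must tolerate a
	 * NULL mm_node here.
	 */
	if (man->func && man->func->put_node)
		(*man->func->put_node)(man, mem);

	mem->mm_node = NULL;
}
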
Re: Failed to find memory space for buffer eviction

2020-07-15 Thread Deucher, Alexander
[AMD Public Use]

Maybe we should re-test the problematic piglit test and if it's no longer an 
issue, revert:

commit 24562523688bebc7ec17a88271b4e8c3fc337b74
Author: Andrey Grodzovsky 
Date:   Fri Dec 15 12:09:16 2017 -0500

Revert "drm/amd/amdgpu: set gtt size according to system memory size only"

This reverts commit ba851eed895c76be0eb4260bdbeb7e26f9ccfaa2.
With that change piglit max size tests (running with -t max.*size) are 
causing
OOM and hard hang on my CZ with 1GB RAM.

Signed-off-by: Andrey Grodzovsky 
Acked-by: Alex Deucher 
Reviewed-by: Christian König 
Reviewed-by: Roger He 
Signed-off-by: Alex Deucher 


From: amd-gfx  on behalf of Christian 
König 
Sent: Wednesday, July 15, 2020 5:28 AM
To: Kuehling, Felix ; Koenig, Christian 
; amd-gfx list 
Subject: Re: Failed to find memory space for buffer eviction

On 15.07.20 at 04:49, Felix Kuehling wrote:
> On 2020-07-14 at 4:28 a.m., Christian König wrote:
>> Hi Felix,
>>
>> yes I already stumbled over this as well quite recently.
>>
>> See the following patch which I pushed to drm-misc-next just yesterday:
>>
>> commit e04be2310b5eac683ec03b096c0e22c4c2e23593
>> Author: Christian König 
>> Date:   Mon Jul 6 17:32:55 2020 +0200
>>
>>  drm/ttm: further cleanup ttm_mem_reg handling
>>
>>  Stop touching the backend private pointer altogether and
>>  make sure we never put the same mem twice by.
>>
>>  Signed-off-by: Christian König 
>>  Reviewed-by: Madhav Chauhan 
>>  Link: 
>> https://patchwork.freedesktop.org/patch/375613/
>>
>>
>> But this shouldn't have been problematic since we used a dummy value
>> for mem->mm_node in this case.
> Hmm, yeah, I was reading the code wrong. It's possible that I was really
> just out of GTT space. But see below.

It looks like it yes.

>> What could be problematic and result in an overrun is that TTM was
>> buggy and called put_node twice for the same memory.
>>
>> So I've seen that the code needs fixing as well, but I'm not 100% sure
>> how you ran into your problem.
> This is in the KFD eviction test, which deliberately overcommits VRAM in
> order to trigger lots of evictions. It will use some GTT space while BOs
> are evicted. But shouldn't it move them further out of GTT and into
> SYSTEM to free up GTT space?

Yes, exactly that should happen.

But for some reason it couldn't find a candidate to evict and the 14371
pages left are just a bit too small for the buffer.

Regards,
Christian.

> Your change "further cleanup ttm_mem_reg handling" removes a
> mem->mm_node = NULL in ttm_bo_handle_move_mem in exactly the case where
> a BO is moved from GTT to SYSTEM. I think that leads to a later put_node
> call not happening or amdgpu_gtt_mgr_del returning before incrementing
> mgr->available.
>
> I can try if cherry-picking your two fixes will help with the eviction test.
>
> Regards,
>Felix
>
>
>> Regards,
>> Christian.
>>
>> Am 14.07.20 um 02:44 schrieb Felix Kuehling:
>>> I'm running into this problem with the KFD EvictionTest. The log snippet
>>> below looks like it ran out of GTT space for the eviction of a 64MB
>>> buffer. But then it dumps the used and free space and shows plenty of
>>> free space.
>>>
>>> As I understand it, the per-page breakdown of used and free space shown
>>> by TTM is the GART space. So it's not very meaningful.
>>>
>>> What matters more is the GTT space managed by amdgpu_gtt_mgr.c. And
>>> that's where the problem is. It keeps track of available GTT space with
>>> an atomic counter in amdgpu_gtt_mgr.available. It gets decremented in
>>> amdgpu_gtt_mgr_new and incremented in amdgpu_gtt_mgr_del. The trouble
>>> is, that TTM doesn't call the latter for ttm_mem_regs that don't have an
>>> mm_node:
>>>
 void ttm_bo_mem_put(struct ttm_buffer_object *bo, struct ttm_mem_reg
 *mem)
 {
   struct ttm_mem_type_manager *man =
  &bo->bdev->man[mem->mem_type];

   if (mem->mm_node)
   (*man->func->put_node)(man, mem);
 }
>>> GTT BOs that don't have GART space allocated, don't have an mm_node. So
>>> the amdgpu_gtt_mgr.available counter doesn't get incremented when an
>>> unmapped GTT BO is freed, and eventually runs out of space.
>>>
>>> Now I know what the problem is, but I don't know how to fix it. Maybe a
>>> dummy-mm_node for unmapped GTT BOs, to trick TTM into calling our
>>> put_node callback? Or a change in TTM to call put_node unconditionally?
>>>
>>> Regards,
>>> Felix
>>>
>>>
>>> [  360.082552] [TTM] Failed to find memory space for buffer
>>> 0x264c823c eviction
>>> [  360.090331] [TTM]  No space for 264c823c (16384 pages,
>>> 65536K, 64M)
>>> [  

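(For anyone re-running that piglit scenario: the reverted commit's idea was to
size GTT from system memory rather than a fixed default. A rough illustrative
sketch of that idea follows; the helper name and the 3/4 factor are
assumptions here, not the original commit's values.)

/* Illustrative only: derive the GTT domain size from system RAM so a
 * 1GB-RAM box cannot overcommit GTT.  si_meminfo() is from linux/mm.h.
 */
static u64 example_pick_gtt_size(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/* Cap GTT at roughly 3/4 of system RAM (factor is an assumption). */
	return (u64)si.totalram * si.mem_unit * 3 / 4;
}
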
RE: [PATCH 5/5] drm/amd/sriov skip vcn powergating and dec_ring_test

2020-07-15 Thread Liu, Leo
Reviewed-by: Leo Liu 

-Original Message-
From: Jack Zhang  
Sent: July 15, 2020 1:50 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Jack (Jian) ; Zhang, Hawking 
; Liu, Leo 
Subject: [PATCH 5/5] drm/amd/sriov skip vcn powergating and dec_ring_test

1. Skip the decode_ring test in VF, because VCN in SRIOV does not support
direct register read/write.

2. Skip the powergating configuration in hw_fini because
VCN 3.0 SRIOV doesn't support powergating.

V2: delete unnecessary blank lines and refine the implementation.

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c |  4 
 drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c   | 21 -
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 15ff30c53e24..92a55e40bc48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -421,6 +421,10 @@ int amdgpu_vcn_dec_ring_test_ring(struct amdgpu_ring *ring)
unsigned i;
int r;
 
+   /* VCN in SRIOV does not support direct register read/write */
+   if (amdgpu_sriov_vf(adev))
+   return 0;
+
WREG32(adev->vcn.inst[ring->me].external.scratch9, 0xCAFEDEAD);
r = amdgpu_ring_alloc(ring, 3);
if (r)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
index 0a0ca10bf55b..910a4a32ff78 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v3_0.c
@@ -354,11 +354,13 @@ static int vcn_v3_0_hw_fini(void *handle)
 
	ring = &adev->vcn.inst[i].ring_dec;
 
-   if ((adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) ||
-   (adev->vcn.cur_state != AMD_PG_STATE_GATE &&
-   RREG32_SOC15(VCN, i, mmUVD_STATUS)))
-   vcn_v3_0_set_powergating_state(adev, AMD_PG_STATE_GATE);
-
+   if (!amdgpu_sriov_vf(adev)) {
+   if ((adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) ||
+   (adev->vcn.cur_state != 
AMD_PG_STATE_GATE &&
+RREG32_SOC15(VCN, i, mmUVD_STATUS))) {
+   vcn_v3_0_set_powergating_state(adev, 
AMD_PG_STATE_GATE);
+   }
+   }
ring->sched.ready = false;
 
for (j = 0; j < adev->vcn.num_enc_rings; ++j) { @@ -1861,6 
+1863,15 @@ static int vcn_v3_0_set_powergating_state(void *handle,
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
int ret;
 
+   /* for SRIOV, guest should not control VCN Power-gating
+* MMSCH FW should control Power-gating and clock-gating
+* guest should avoid touching CGC and PG
+*/
+   if (amdgpu_sriov_vf(adev)) {
+   adev->vcn.cur_state = AMD_PG_STATE_UNGATE;
+   return 0;
+   }
+
if(state == adev->vcn.cur_state)
return 0;
 
--
2.17.1



Re: [PATCH 01/42] drm/amdgpu: expand to add multiple trap event irq id

2020-07-15 Thread Alex Deucher
On Wed, Jul 15, 2020 at 5:21 AM Christian König
 wrote:
>
> On 14.07.20 at 20:23, Alex Deucher wrote:
> > From: Huang Rui 
> >
> > Sienna_cichlid has four sdma instances, but other chips don't.
> > So we need to expand to add multiple trap event irq id in sdma
> > v5.2.
> >
> > Signed-off-by: Huang Rui 
> > Reviewed-by: Alex Deucher 
> > Signed-off-by: Alex Deucher 
>
> Reviewed-by: Christian König 
>
> But side question why do we have the _Sienna_Cichlid postfix on the define?

I suspect when it was originally added it was specific to sienna
cichlid, but it should be dropped since it's generic.

Alex


>
> Christian.
>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 67 --
> >   1 file changed, 41 insertions(+), 26 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
> > b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > index 824f3e23c3d9..de8342283fdb 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
> > @@ -1165,6 +1165,40 @@ static int sdma_v5_2_early_init(void *handle)
> >   return 0;
> >   }
> >
> > +static unsigned sdma_v5_2_seq_to_irq_id(int seq_num)
> > +{
> > + switch (seq_num) {
> > + case 0:
> > + return SOC15_IH_CLIENTID_SDMA0;
> > + case 1:
> > + return SOC15_IH_CLIENTID_SDMA1;
> > + case 2:
> > + return SOC15_IH_CLIENTID_SDMA2;
> > + case 3:
> > + return SOC15_IH_CLIENTID_SDMA3_Sienna_Cichlid;
> > + default:
> > + break;
> > + }
> > + return -EINVAL;
> > +}
> > +
> > +static unsigned sdma_v5_2_seq_to_trap_id(int seq_num)
> > +{
> > + switch (seq_num) {
> > + case 0:
> > + return SDMA0_5_0__SRCID__SDMA_TRAP;
> > + case 1:
> > + return SDMA1_5_0__SRCID__SDMA_TRAP;
> > + case 2:
> > + return SDMA2_5_0__SRCID__SDMA_TRAP;
> > + case 3:
> > + return SDMA3_5_0__SRCID__SDMA_TRAP;
> > + default:
> > + break;
> > + }
> > + return -EINVAL;
> > +}
> > +
> >   static int sdma_v5_2_sw_init(void *handle)
> >   {
> >   struct amdgpu_ring *ring;
> > @@ -1172,32 +1206,13 @@ static int sdma_v5_2_sw_init(void *handle)
> >   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> >
> >   /* SDMA trap event */
> > - r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA0,
> > -   SDMA0_5_0__SRCID__SDMA_TRAP,
> > -   &adev->sdma.trap_irq);
> > - if (r)
> > - return r;
> > -
> > - /* SDMA trap event */
> > - r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA1,
> > -   SDMA1_5_0__SRCID__SDMA_TRAP,
> > -   &adev->sdma.trap_irq);
> > - if (r)
> > - return r;
> > -
> > - /* SDMA trap event */
> > - r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA2,
> > -   SDMA2_5_0__SRCID__SDMA_TRAP,
> > -   &adev->sdma.trap_irq);
> > - if (r)
> > - return r;
> > -
> > - /* SDMA trap event */
> > - r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA3_Sienna_Cichlid,
> > -   SDMA3_5_0__SRCID__SDMA_TRAP,
> > -   &adev->sdma.trap_irq);
> > - if (r)
> > - return r;
> > + for (i = 0; i < adev->sdma.num_instances; i++) {
> > + r = amdgpu_irq_add_id(adev, sdma_v5_2_seq_to_irq_id(i),
> > +   sdma_v5_2_seq_to_trap_id(i),
> > +   &adev->sdma.trap_irq);
> > + if (r)
> > + return r;
> > + }
> >
> >   r = sdma_v5_2_init_microcode(adev);
> >   if (r) {
>


Re: [PATCH 19/25] drm/amdgpu: s/GFP_KERNEL/GFP_ATOMIC in scheduler code

2020-07-15 Thread Daniel Vetter
On Wed, Jul 15, 2020 at 11:17 AM Christian König
 wrote:
>
> On 14.07.20 at 16:31, Daniel Vetter wrote:
> > On Tue, Jul 14, 2020 at 01:40:11PM +0200, Christian König wrote:
> >> On 14.07.20 at 12:49, Daniel Vetter wrote:
> >>> On Tue, Jul 07, 2020 at 10:12:23PM +0200, Daniel Vetter wrote:
>  My dma-fence lockdep annotations caught an inversion because we
>  allocate memory where we really shouldn't:
> 
> kmem_cache_alloc+0x2b/0x6d0
> amdgpu_fence_emit+0x30/0x330 [amdgpu]
> amdgpu_ib_schedule+0x306/0x550 [amdgpu]
> amdgpu_job_run+0x10f/0x260 [amdgpu]
> drm_sched_main+0x1b9/0x490 [gpu_sched]
> kthread+0x12e/0x150
> 
>  Trouble right now is that lockdep only validates against GFP_FS, which
>  would be good enough for shrinkers. But for mmu_notifiers we actually
>  need !GFP_ATOMIC, since they can be called from any page laundering,
>  even if GFP_NOFS or GFP_NOIO are set.
> 
>  I guess we should improve the lockdep annotations for
>  fs_reclaim_acquire/release.
> 
>  Ofc real fix is to properly preallocate this fence and stuff it into
>  the amdgpu job structure. But GFP_ATOMIC gets the lockdep splat out of
>  the way.
> 
>  v2: Two more allocations in scheduler paths.
> 
>  Frist one:
> 
> __kmalloc+0x58/0x720
> amdgpu_vmid_grab+0x100/0xca0 [amdgpu]
> amdgpu_job_dependency+0xf9/0x120 [amdgpu]
> drm_sched_entity_pop_job+0x3f/0x440 [gpu_sched]
> drm_sched_main+0xf9/0x490 [gpu_sched]
> 
>  Second one:
> 
> kmem_cache_alloc+0x2b/0x6d0
> amdgpu_sync_fence+0x7e/0x110 [amdgpu]
> amdgpu_vmid_grab+0x86b/0xca0 [amdgpu]
> amdgpu_job_dependency+0xf9/0x120 [amdgpu]
> drm_sched_entity_pop_job+0x3f/0x440 [gpu_sched]
> drm_sched_main+0xf9/0x490 [gpu_sched]
> 
>  Cc: linux-me...@vger.kernel.org
>  Cc: linaro-mm-...@lists.linaro.org
>  Cc: linux-r...@vger.kernel.org
>  Cc: amd-gfx@lists.freedesktop.org
>  Cc: intel-...@lists.freedesktop.org
>  Cc: Chris Wilson 
>  Cc: Maarten Lankhorst 
>  Cc: Christian König 
>  Signed-off-by: Daniel Vetter 
> >>> Has anyone from amd side started looking into how to fix this properly?
> >> Yeah I checked both and neither are any real problem.
> > I'm confused ... do you mean "no real problem fixing them" or "not
> > actually a real problem"?
>
> Both, at least the VMID stuff is trivial to avoid.
>
> And the fence allocation is extremely unlikely. E.g. when we allocate a
> new one we previously most likely just freed one already.

Yeah, I think we can avoid the debugging case: just stop debugging if
things get hung up like that. So a mempool for the hw fences should be
perfectly fine.

The vmid stuff I don't really understand enough, but the hw fence
stuff I think I grok, plus other scheduler users need that too from a
quick look. I might be tackling that one (maybe put the mempool
outright into drm_scheduler code as a helper), except if you have
patches already in the works. vmid I'll leave to you guys :-)

-Daniel

>
> >
> >>> I looked a bit into fixing this with mempool, and the big guarantee we
> >>> need is that
> >>> - there's a hard upper limit on how many allocations we minimally need to
> >>> guarantee forward progress. And the entire vmid allocation and
> >>> amdgpu_sync_fence stuff kinda makes me question that's a valid
> >>> assumption.
> >> We do have hard upper limits for those.
> >>
> >> The VMID allocation could as well just return the fence instead of putting
> >> it into the sync object IIRC. So that just needs some cleanup and can avoid
> >> the allocation entirely.
> > Yeah embedding should be simplest solution of all.
> >
> >> The hardware fence is limited by the number of submissions we can have
> >> concurrently on the ring buffers, so also not a problem at all.
> > Ok that sounds good. Wrt releasing the memory again, is that also done
> > without any of the allocation-side locks held? I've seen some vmid manager
> > somewhere ...
>
> Well that's the issue. We can't guarantee that for the hardware fence
> memory since it could be that we hold another reference during debugging
> IIRC.
>
> Still looking if and how we could fix this. But as I said this problem
> is so extremely unlikely.
>
> Christian.
>
> > -Daniel
> >
> >> Regards,
> >> Christian.
> >>
> >>> - mempool_free must be called without any locks in the way which are held
> >>> while we call mempool_alloc. Otherwise we again have a nice deadlock
> >>> with no forward progress. I tried auditing that, but got lost in 
> >>> amdgpu
> >>> and scheduler code. Some lockdep annotations for mempool.c might help,
> >>> but they're not going to catch everything. Plus it would be again 
> >>> manual
> >>> annotations because this is yet another cross-release issue. So not 
> >>> sure
> >>> that helps at all.
> >>>
> >>> iow, not 

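(To make the mempool idea above concrete: a minimal sketch with invented
helper names. The real work would live in drm_scheduler or amdgpu_fence.c,
and the pool size would be tied to the per-ring submission limit Christian
mentions; this is not the actual amdgpu code.)

#include <linux/dma-fence.h>
#include <linux/mempool.h>
#include <linux/slab.h>

static struct kmem_cache *hw_fence_cache;
static mempool_t *hw_fence_pool;

/* min_nr is bounded by the number of submissions a ring can have in
 * flight, which is what guarantees forward progress.
 */
static int example_fence_pool_init(unsigned int max_in_flight)
{
	hw_fence_cache = KMEM_CACHE(dma_fence, 0);
	if (!hw_fence_cache)
		return -ENOMEM;

	hw_fence_pool = mempool_create_slab_pool(max_in_flight, hw_fence_cache);
	if (!hw_fence_pool) {
		kmem_cache_destroy(hw_fence_cache);
		return -ENOMEM;
	}
	return 0;
}

static struct dma_fence *example_fence_alloc(void)
{
	/* GFP_NOWAIT: draw from the preallocated reserve instead of
	 * recursing into reclaim from the scheduler thread.
	 */
	return mempool_alloc(hw_fence_pool, GFP_NOWAIT);
}

static void example_fence_free(struct dma_fence *fence)
{
	/* Must be called without the allocation-side locks held, per the
	 * deadlock concern discussed above.
	 */
	mempool_free(fence, hw_fence_pool);
}
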
Re: Failed to find memory space for buffer eviction

2020-07-15 Thread Christian König

Am 15.07.20 um 04:49 schrieb Felix Kuehling:

Am 2020-07-14 um 4:28 a.m. schrieb Christian König:

Hi Felix,

yes I already stumbled over this as well quite recently.

See the following patch which I pushed to drm-misc-next just yesterday:

commit e04be2310b5eac683ec03b096c0e22c4c2e23593
Author: Christian König 
Date:   Mon Jul 6 17:32:55 2020 +0200

     drm/ttm: further cleanup ttm_mem_reg handling

     Stop touching the backend private pointer altogether and
     make sure we never put the same mem twice by.

     Signed-off-by: Christian König 
     Reviewed-by: Madhav Chauhan 
     Link: https://patchwork.freedesktop.org/patch/375613/


But this shouldn't have been problematic since we used a dummy value
for mem->mm_node in this case.

Hmm, yeah, I was reading the code wrong. It's possible that I was really
just out of GTT space. But see below.


It looks like it yes.


What could be problematic and result in an overrun is that TTM was
buggy and called put_node twice for the same memory.

So I've seen that the code needs fixing as well, but I'm not 100% sure
how you ran into your problem.

This is in the KFD eviction test, which deliberately overcommits VRAM in
order to trigger lots of evictions. It will use some GTT space while BOs
are evicted. But shouldn't it move them further out of GTT and into
SYSTEM to free up GTT space?


Yes, exactly that should happen.

But for some reason it couldn't find a candidate to evict and the 14371 
pages left are just a bit too small for the buffer.


Regards,
Christian.


Your change "further cleanup ttm_mem_reg handling" removes a
mem->mm_node = NULL in ttm_bo_handle_move_mem in exactly the case where
a BO is moved from GTT to SYSTEM. I think that leads to a later put_node
call not happening or amdgpu_gtt_mgr_del returning before incrementing
mgr->available.

I can try if cherry-picking your two fixes will help with the eviction test.

Regards,
   Felix



Regards,
Christian.

Am 14.07.20 um 02:44 schrieb Felix Kuehling:

I'm running into this problem with the KFD EvictionTest. The log snippet
below looks like it ran out of GTT space for the eviction of a 64MB
buffer. But then it dumps the used and free space and shows plenty of
free space.

As I understand it, the per-page breakdown of used and free space shown
by TTM is the GART space. So it's not very meaningful.

What matters more is the GTT space managed by amdgpu_gtt_mgr.c. And
that's where the problem is. It keeps track of available GTT space with
an atomic counter in amdgpu_gtt_mgr.available. It gets decremented in
amdgpu_gtt_mgr_new and incremented in amdgpu_gtt_mgr_del. The trouble
is, that TTM doesn't call the latter for ttm_mem_regs that don't have an
mm_node:


void ttm_bo_mem_put(struct ttm_buffer_object *bo, struct ttm_mem_reg
*mem)
{
  struct ttm_mem_type_manager *man =
  &bo->bdev->man[mem->mem_type];

  if (mem->mm_node)
  (*man->func->put_node)(man, mem);
}

GTT BOs that don't have GART space allocated, don't have an mm_node. So
the amdgpu_gtt_mgr.available counter doesn't get incremented when an
unmapped GTT BO is freed, and eventually runs out of space.

Now I know what the problem is, but I don't know how to fix it. Maybe a
dummy-mm_node for unmapped GTT BOs, to trick TTM into calling our
put_node callback? Or a change in TTM to call put_node unconditionally?

Regards,
    Felix


[  360.082552] [TTM] Failed to find memory space for buffer
0x264c823c eviction
[  360.090331] [TTM]  No space for 264c823c (16384 pages,
65536K, 64M)
[  360.090334] [TTM]    placement[0]=0x00010002 (1)
[  360.090336] [TTM]  has_type: 1
[  360.090337] [TTM]  use_type: 1
[  360.090339] [TTM]  flags: 0x000A
[  360.090341] [TTM]  gpu_offset: 0xFF
[  360.090342] [TTM]  size: 1048576
[  360.090344] [TTM]  available_caching: 0x0007
[  360.090346] [TTM]  default_caching: 0x0001
[  360.090349] [TTM]  0x0400-0x0402: 2: used
[  360.090352] [TTM]  0x0402-0x0404: 2: used
[  360.090354] [TTM]  0x0404-0x0406: 2: used
[  360.090355] [TTM]  0x0406-0x0408: 2: used
[  360.090357] [TTM]  0x0408-0x040a: 2: used
[  360.090359] [TTM]  0x040a-0x040c: 2: used
[  360.090361] [TTM]  0x040c-0x040e: 2: used
[  360.090363] [TTM]  0x040e-0x0410: 2: used
[  360.090365] [TTM]  0x0410-0x0412: 2: used
[  360.090367] [TTM]  0x0412-0x0414: 2: used
[  360.090368] [TTM]  0x0414-0x0415: 1: used
[  360.090370] [TTM]  0x0415-0x0515: 256: used
[  360.090372] [TTM]  0x0515-0x0516: 1: used
[  360.090374] [TTM]  0x0516-0x0517: 1: used
[  360.090376] [TTM]  0x0517-0x0518: 1: used
[  360.090378] [TTM]  

Re: [PATCH 01/42] drm/amdgpu: expand to add multiple trap event irq id

2020-07-15 Thread Christian König

On 14.07.20 at 20:23, Alex Deucher wrote:

From: Huang Rui 

Sienna_cichlid has four SDMA instances, but other chips don't.
So we need to expand the code to add multiple trap event irq
ids in SDMA v5.2.

Signed-off-by: Huang Rui 
Reviewed-by: Alex Deucher 
Signed-off-by: Alex Deucher 


Reviewed-by: Christian König 

But as a side question: why do we have the _Sienna_Cichlid postfix on the define?

Christian.


---
  drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 67 --
  1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 824f3e23c3d9..de8342283fdb 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -1165,6 +1165,40 @@ static int sdma_v5_2_early_init(void *handle)
return 0;
  }
  
+static unsigned sdma_v5_2_seq_to_irq_id(int seq_num)
+{
+   switch (seq_num) {
+   case 0:
+   return SOC15_IH_CLIENTID_SDMA0;
+   case 1:
+   return SOC15_IH_CLIENTID_SDMA1;
+   case 2:
+   return SOC15_IH_CLIENTID_SDMA2;
+   case 3:
+   return SOC15_IH_CLIENTID_SDMA3_Sienna_Cichlid;
+   default:
+   break;
+   }
+   return -EINVAL;
+}
+
+static unsigned sdma_v5_2_seq_to_trap_id(int seq_num)
+{
+   switch (seq_num) {
+   case 0:
+   return SDMA0_5_0__SRCID__SDMA_TRAP;
+   case 1:
+   return SDMA1_5_0__SRCID__SDMA_TRAP;
+   case 2:
+   return SDMA2_5_0__SRCID__SDMA_TRAP;
+   case 3:
+   return SDMA3_5_0__SRCID__SDMA_TRAP;
+   default:
+   break;
+   }
+   return -EINVAL;
+}
+
  static int sdma_v5_2_sw_init(void *handle)
  {
struct amdgpu_ring *ring;
@@ -1172,32 +1206,13 @@ static int sdma_v5_2_sw_init(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
  
 	/* SDMA trap event */
-	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA0,
-			      SDMA0_5_0__SRCID__SDMA_TRAP,
-			      &adev->sdma.trap_irq);
-	if (r)
-		return r;
-
-	/* SDMA trap event */
-	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA1,
-			      SDMA1_5_0__SRCID__SDMA_TRAP,
-			      &adev->sdma.trap_irq);
-	if (r)
-		return r;
-
-	/* SDMA trap event */
-	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA2,
-			      SDMA2_5_0__SRCID__SDMA_TRAP,
-			      &adev->sdma.trap_irq);
-	if (r)
-		return r;
-
-	/* SDMA trap event */
-	r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_SDMA3_Sienna_Cichlid,
-			      SDMA3_5_0__SRCID__SDMA_TRAP,
-			      &adev->sdma.trap_irq);
-	if (r)
-		return r;
+	for (i = 0; i < adev->sdma.num_instances; i++) {
+		r = amdgpu_irq_add_id(adev, sdma_v5_2_seq_to_irq_id(i),
+				      sdma_v5_2_seq_to_trap_id(i),
+				      &adev->sdma.trap_irq);
+		if (r)
+			return r;
+	}
  
	r = sdma_v5_2_init_microcode(adev);
	if (r) {




Re: [PATCH 19/25] drm/amdgpu: s/GFP_KERNEL/GFP_ATOMIC in scheduler code

2020-07-15 Thread Christian König

On 14.07.20 at 16:31, Daniel Vetter wrote:

On Tue, Jul 14, 2020 at 01:40:11PM +0200, Christian König wrote:

Am 14.07.20 um 12:49 schrieb Daniel Vetter:

On Tue, Jul 07, 2020 at 10:12:23PM +0200, Daniel Vetter wrote:

My dma-fence lockdep annotations caught an inversion because we
allocate memory where we really shouldn't:

kmem_cache_alloc+0x2b/0x6d0
amdgpu_fence_emit+0x30/0x330 [amdgpu]
amdgpu_ib_schedule+0x306/0x550 [amdgpu]
amdgpu_job_run+0x10f/0x260 [amdgpu]
drm_sched_main+0x1b9/0x490 [gpu_sched]
kthread+0x12e/0x150

Trouble right now is that lockdep only validates against GFP_FS, which
would be good enough for shrinkers. But for mmu_notifiers we actually
need !GFP_ATOMIC, since they can be called from any page laundering,
even if GFP_NOFS or GFP_NOIO are set.

I guess we should improve the lockdep annotations for
fs_reclaim_acquire/release.

Ofc real fix is to properly preallocate this fence and stuff it into
the amdgpu job structure. But GFP_ATOMIC gets the lockdep splat out of
the way.

v2: Two more allocations in scheduler paths.

First one:

__kmalloc+0x58/0x720
amdgpu_vmid_grab+0x100/0xca0 [amdgpu]
amdgpu_job_dependency+0xf9/0x120 [amdgpu]
drm_sched_entity_pop_job+0x3f/0x440 [gpu_sched]
drm_sched_main+0xf9/0x490 [gpu_sched]

Second one:

kmem_cache_alloc+0x2b/0x6d0
amdgpu_sync_fence+0x7e/0x110 [amdgpu]
amdgpu_vmid_grab+0x86b/0xca0 [amdgpu]
amdgpu_job_dependency+0xf9/0x120 [amdgpu]
drm_sched_entity_pop_job+0x3f/0x440 [gpu_sched]
drm_sched_main+0xf9/0x490 [gpu_sched]

Cc: linux-me...@vger.kernel.org
Cc: linaro-mm-...@lists.linaro.org
Cc: linux-r...@vger.kernel.org
Cc: amd-gfx@lists.freedesktop.org
Cc: intel-...@lists.freedesktop.org
Cc: Chris Wilson 
Cc: Maarten Lankhorst 
Cc: Christian König 
Signed-off-by: Daniel Vetter 

Has anyone from amd side started looking into how to fix this properly?

Yeah, I checked both and neither is a real problem.

I'm confused ... do you mean "no real problem fixing them" or "not
actually a real problem"?


Both, at least the VMID stuff is trivial to avoid.

And the fence allocation is extremely unlikely to be a problem. E.g. when
we allocate a new one, we most likely just freed one shortly before.

I looked a bit into fixing this with mempool, and the big guarantee we
need is that
- there's a hard upper limit on how many allocations we minimally need to
guarantee forward progress. And the entire vmid allocation and
amdgpu_sync_fence stuff kinda makes me question whether that's a valid
assumption.

We do have hard upper limits for those.

The VMID allocation could as well just return the fence instead of putting
it into the sync object IIRC. So that just needs some cleanup and can avoid
the allocation entirely.

Yeah, embedding should be the simplest solution of all.
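
For illustration, a rough sketch of the embedding idea (hypothetical
names; not the actual amdgpu job and fence structures):

#include <linux/dma-fence.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_fence_lock);
extern const struct dma_fence_ops my_fence_ops;	/* hypothetical ops */

struct my_job {
	struct dma_fence hw_fence;	/* preallocated together with the job */
	u64 fence_ctx;
	u64 seq;
};

/* Runs in the dma-fence signalling critical path: it only initializes
 * memory the job already owns, no kmem_cache_alloc() needed. */
static struct dma_fence *my_job_emit_fence(struct my_job *job)
{
	dma_fence_init(&job->hw_fence, &my_fence_ops, &my_fence_lock,
		       job->fence_ctx, ++job->seq);
	return dma_fence_get(&job->hw_fence);
}

The catch is exactly the lifetime question below: the fence now lives
and dies with the job, so any externally held fence reference has to
keep the whole job alive.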


The hardware fence is limited by the number of submissions we can have
concurrently on the ring buffers, so also not a problem at all.

Ok that sounds good. Wrt releasing the memory again, is that also done
without any of the allocation-side locks held? I've seen some vmid manager
somewhere ...


Well that's the issue. We can't guarantee that for the hardware fence 
memory since it could be that we hold another reference during debugging 
IIRC.


Still looking into whether and how we could fix this. But as I said, this
problem is extremely unlikely to hit.


Christian.


-Daniel


Regards,
Christian.


- mempool_free must be called without holding any of the locks that are
held while we call mempool_alloc. Otherwise we again have a nice deadlock
with no forward progress. I tried auditing that, but got lost in amdgpu
and scheduler code. Some lockdep annotations for mempool.c might help,
but they're not going to catch everything. Plus it would be again manual
annotations because this is yet another cross-release issue. So not sure
that helps at all.
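
To make the constraint concrete, a minimal sketch of the mempool pattern
(hypothetical names; picking the reserve size is the hard part, not the
API):

#include <linux/dma-fence.h>
#include <linux/mempool.h>
#include <linux/slab.h>

/* Hypothetical worst-case number of fences needed for forward
 * progress -- the very bound being questioned above. */
#define MY_FENCE_POOL_MIN_NR	16

static struct kmem_cache *my_fence_cache;
static mempool_t *my_fence_pool;

static int my_fence_pool_init(void)
{
	my_fence_cache = KMEM_CACHE(dma_fence, 0);
	if (!my_fence_cache)
		return -ENOMEM;

	/* Preallocates MIN_NR objects; mempool_alloc() can then dip into
	 * that reserve when the page allocator would have to reclaim. */
	my_fence_pool = mempool_create_slab_pool(MY_FENCE_POOL_MIN_NR,
						 my_fence_cache);
	if (!my_fence_pool) {
		kmem_cache_destroy(my_fence_cache);
		return -ENOMEM;
	}
	return 0;
}

And per the point above, mempool_free() must not run under any lock held
around mempool_alloc(), or the reserve can never be refilled.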

iow, not sure what to do here. Ideas?

Cheers, Daniel


---
   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c   | 2 +-
   drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c  | 2 +-
   3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 8d84975885cd..a089a827fdfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -143,7 +143,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f,
uint32_t seq;
int r;
-   fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
+   fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
if (fence == NULL)
return -ENOMEM;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
index 267fa45ddb66..a333ca2d4ddd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
+++ 

Re: [PATCH 4/4] drm/amdgpu: add module parameter choose reset mode

2020-07-15 Thread Christian König

In the second case, both conditions could be true at the same time, so when
we want to indicate that the program should choose either one or none of
the options, the first one is much cleaner and better.


When you check two independent conditions which just happen to use the 
same value? Well that sounds like a rather bad idea to me.


"else if" should be used when you can simplify cond2 because part of it 
already contains !cond1.


If both conditions are truly independent, using "else if" just confuses
the reader and is rather error prone.
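
For illustration, the two shapes under discussion (the AMD_RESET_METHOD_*
values are from this series; the helpers are hypothetical):

/* Variant 1: "else if" ties the checks together as one decision over
 * mutually exclusive alternatives. */
static int pick_reset_elseif(int method)
{
	if (method == AMD_RESET_METHOD_LEGACY)
		return do_legacy_reset();	/* hypothetical helper */
	else if (method == AMD_RESET_METHOD_BACO)
		return do_baco_reset();		/* hypothetical helper */
	return -EINVAL;
}

/* Variant 2: independent "if"s. With a return in the first branch the
 * behavior is identical, and the missing "else" makes explicit that the
 * branches never converge again -- the shape argued for above. */
static int pick_reset_plain(int method)
{
	if (method == AMD_RESET_METHOD_LEGACY)
		return do_legacy_reset();
	if (method == AMD_RESET_METHOD_BACO)
		return do_baco_reset();
	return -EINVAL;
}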


Regards,
Christian.

On 15.07.20 at 05:06, Sheng, Wenhui wrote:

[AMD Official Use Only - Internal Distribution Only]


What do you mean by that? Just to keep the two "if"s more closely together?

I mean that generally, when we use an if to check value A, we do it like:
if (cond1(A))
	;
else if (cond2(A))
	;
  or
if (cond1(A))
	;
if (cond2(A))
	;

In the second case, both conditions could be true at the same time, so when
we want to indicate that the program should choose either one or none of
the options, the first one is much cleaner and better.


Brs
Wenhui
-Original Message-
From: Christian König 
Sent: Tuesday, July 14, 2020 7:46 PM
To: Sheng, Wenhui ; Koenig, Christian 
; amd-gfx@lists.freedesktop.org
Cc: Gao, Likun ; Zhang, Hawking 
Subject: Re: [PATCH 4/4] drm/amdgpu: add module parameter choose reset mode


I use "else if" because I don't want to break the reset_method checking 
context, it will be much easier for code reviewing later I think.

What do you mean by that? Just to keep the two "if"s more closely together?

See, we usually avoid an extra "else" when there is a return in the if, to
make sure not to confuse the reader.

E.g. an "else" usually implies that both branches converge again after the
if, and that's not the case if you have a return or goto.

Regards,
Christian.

On 14.07.20 at 11:58, Sheng, Wenhui wrote:

[AMD Official Use Only - Internal Distribution Only]

Well, dropping the else will make the code much cleaner.
I use "else if" because I don't want to break the reset_method checking
context; I think it will make code review easier later.

Will refine it anyway :)

Thanks
Wenhui
-Original Message-
From: Christian König 
Sent: Tuesday, July 14, 2020 4:36 PM
To: Sheng, Wenhui ;
amd-gfx@lists.freedesktop.org
Cc: Gao, Likun ; Zhang, Hawking

Subject: Re: [PATCH 4/4] drm/amdgpu: add module parameter choose reset
mode

On 14.07.20 at 04:29, Wenhui Sheng wrote:

Default value is auto, which doesn't change the original reset method logic.

v2: change to use parameter reset_method
v3: add warn msg if specified mode isn't supported

Signed-off-by: Likun Gao 
Signed-off-by: Wenhui Sheng 
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 8 
drivers/gpu/drm/amd/amdgpu/cik.c| 7 +++
drivers/gpu/drm/amd/amdgpu/nv.c | 7 +++
drivers/gpu/drm/amd/amdgpu/si.c | 5 +
drivers/gpu/drm/amd/amdgpu/soc15.c  | 8 
drivers/gpu/drm/amd/amdgpu/vi.c | 7 +++
7 files changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4de93cef79b9..06bfb8658dec 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -196,6 +196,7 @@ static const bool debug_evictions; /* = false */
#endif

extern int amdgpu_tmz;
+extern int amdgpu_reset_method;

#ifdef CONFIG_DRM_AMDGPU_SI
extern int amdgpu_si_support;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 94c83a9d4987..581d5fcac361 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -154,6 +154,7 @@ int amdgpu_mes = 0;
int amdgpu_noretry = 1;
int amdgpu_force_asic_type = -1;
int amdgpu_tmz = 0;
+int amdgpu_reset_method = -1; /* auto */

struct amdgpu_mgpu_info mgpu_info = {
	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
@@ -793,6 +794,13 @@ module_param_named(abmlevel, amdgpu_dm_abm_level, uint, 0444);
 MODULE_PARM_DESC(tmz, "Enable TMZ feature (-1 = auto, 0 = off (default), 1 = on)");
 module_param_named(tmz, amdgpu_tmz, int, 0444);

+/**
+ * DOC: reset_method (int)
+ * GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)
+ */
+MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legacy, 1 = mode0, 2 = mode1, 3 = mode2, 4 = baco)");
+module_param_named(reset_method, amdgpu_reset_method, int, 0444);
+
static const struct pci_device_id pciidlist[] = {
#ifdef  CONFIG_DRM_AMDGPU_SI
{0x1002, 0x6780, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TAHITI},
diff --git a/drivers/gpu/drm/amd/amdgpu/cik.c b/drivers/gpu/drm/amd/amdgpu/cik.c
index 

RE: [PATCH] drm/amdgpu: resolve bug in ta microcode init

2020-07-15 Thread Chen, Guchun
[AMD Public Use]

Hi John,

The patch is good. However, maybe we can improve the patch commit message
title/body a bit.

Generally, the original title 'resolve bug...' is not very clear to the
audience. If possible, we can change the title to 'correct fw start
address' and, in the message body, illustrate why the patch is needed.

With the comment addressed, the patch is:
Reviewed-by: Guchun Chen 

Regards,
Guchun

From: Clements, John 
Sent: Wednesday, July 15, 2020 2:44 PM
To: amd-gfx list ; Lakha, Bhawanpreet 
; Zhang, Hawking ; Chen, 
Guchun 
Subject: [PATCH] drm/amdgpu: resolve bug in ta microcode init


[AMD Public Use]

Submitting a patch for review to resolve a bug in calculating the FW offset
within binaries with a PSP TA v2 header.

Thank you,
John Clements


[PATCH] drm/amd/powerplay: suppress compile error around BUG_ON

2020-07-15 Thread Evan Quan
To suppress the compile error below for "ARCH=arc".
   drivers/gpu/drm/amd/amdgpu/../powerplay/arcturus_ppt.c: In function 
'arcturus_fill_eeprom_i2c_req':
>> arch/arc/include/asm/bug.h:22:2: error: implicit declaration of function 
>> 'pr_warn'; did you mean 'pci_warn'? [-Werror=implicit-function-declaration]
  22 |  pr_warn("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, 
__func__); \
 |  ^~~
   include/asm-generic/bug.h:62:57: note: in expansion of macro 'BUG'
  62 | #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } 
while (0)
 | ^~~
   drivers/gpu/drm/amd/amdgpu/../powerplay/arcturus_ppt.c:2157:2: note: in 
expansion of macro 'BUG_ON'
2157 |  BUG_ON(numbytes > MAX_SW_I2C_COMMANDS);

Change-Id: I314b0d08197398a04b5439bce6546c4c68ca5dff
Signed-off-by: Evan Quan 
---
 drivers/gpu/drm/amd/powerplay/arcturus_ppt.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
index fde6a8242565..0784a97bd67b 100644
--- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
@@ -1881,8 +1881,6 @@ static void arcturus_fill_eeprom_i2c_req(SwI2cRequest_t *req, bool write,
 {
int i;
 
-   BUG_ON(numbytes > MAX_SW_I2C_COMMANDS);
-
req->I2CcontrollerPort = 0;
req->I2CSpeed = 2;
req->SlaveAddress = address;
@@ -1920,6 +1918,12 @@ static int arcturus_i2c_eeprom_read_data(struct i2c_adapter *control,
	struct smu_table_context *smu_table = &adev->smu.smu_table;
	struct smu_table *table = &smu_table->driver_table;
 
+	if (numbytes > MAX_SW_I2C_COMMANDS) {
+		dev_err(adev->dev, "numbytes requested %d is over max allowed %d\n",
+			numbytes, MAX_SW_I2C_COMMANDS);
+		return -EINVAL;
+	}
+
	memset(&req, 0, sizeof(req));
	arcturus_fill_eeprom_i2c_req(&req, false, address, numbytes, data);
 
@@ -1956,6 +1960,12 @@ static int arcturus_i2c_eeprom_write_data(struct i2c_adapter *control,
SwI2cRequest_t req;
struct amdgpu_device *adev = to_amdgpu_device(control);
 
+	if (numbytes > MAX_SW_I2C_COMMANDS) {
+		dev_err(adev->dev, "numbytes requested %d is over max allowed %d\n",
+			numbytes, MAX_SW_I2C_COMMANDS);
+		return -EINVAL;
+	}
+
	memset(&req, 0, sizeof(req));
	arcturus_fill_eeprom_i2c_req(&req, true, address, numbytes, data);
 
-- 
2.27.0



[PATCH] drm/amdgpu: resolve bug in ta microcode init

2020-07-15 Thread Clements, John
[AMD Public Use]

Submitting a patch for review to resolve a bug in calculating the FW offset
within binaries with a PSP TA v2 header.

Thank you,
John Clements


0001-drm-amdgpu-resolve-bug-in-ta-microcode-init.patch
Description: 0001-drm-amdgpu-resolve-bug-in-ta-microcode-init.patch