[PATCH 2/2] drm/amdgpu: fix no interrupt issue for renoir emu (v2)

2019-08-29 Thread Aaron Liu

In Renoir's vega10_ih model, there is a security change in the mmIH_CHICKEN
register that limits IH's ability to use physical addresses (FBPA, GPA)
directly. Those chicken bits need to be programmed first.

Signed-off-by: Aaron Liu 
Reviewed-by: Huang Rui 
Reviewed-by: Hawking Zhang 
Acked-by: Alex Deucher 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/vega10_ih.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
index f19268a..b273eb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
+++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
@@ -232,7 +232,13 @@ static int vega10_ih_irq_init(struct amdgpu_device *adev)
 	WREG32_SOC15(OSSSYS, 0, mmIH_RB_BASE_HI, (ih->gpu_addr >> 40) & 0xff);
 
 	ih_rb_cntl = RREG32_SOC15(OSSSYS, 0, mmIH_RB_CNTL);
+	ih_chicken = RREG32_SOC15(OSSSYS, 0, mmIH_CHICKEN);
 	ih_rb_cntl = vega10_ih_rb_cntl(ih, ih_rb_cntl);
+	if (adev->irq.ih.use_bus_addr) {
+		ih_chicken = REG_SET_FIELD(ih_chicken, IH_CHICKEN, MC_SPACE_GPA_ENABLE, 1);
+	} else {
+		ih_chicken = REG_SET_FIELD(ih_chicken, IH_CHICKEN, MC_SPACE_FBPA_ENABLE, 1);
+	}
 	ih_rb_cntl = REG_SET_FIELD(ih_rb_cntl, IH_RB_CNTL, RPTR_REARM,
    !!adev->irq.msi_enabled);
 
@@ -245,14 +251,10 @@ static int vega10_ih_irq_init(struct amdgpu_device *adev)
 		WREG32_SOC15(OSSSYS, 0, mmIH_RB_CNTL, ih_rb_cntl);
 	}
 
-	if ((adev->asic_type == CHIP_ARCTURUS || adev->asic_type == CHIP_RENOIR) &&
-		adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT) {
-		if (adev->irq.ih.use_bus_addr) {
-			ih_chicken = RREG32_SOC15(OSSSYS, 0, mmIH_CHICKEN);
-			ih_chicken |= 0x0010;
-			WREG32_SOC15(OSSSYS, 0, mmIH_CHICKEN, ih_chicken);
-		}
-	}
+	if ((adev->asic_type == CHIP_ARCTURUS
+		&& adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT)
+		|| adev->asic_type == CHIP_RENOIR)
+		WREG32_SOC15(OSSSYS, 0, mmIH_CHICKEN, ih_chicken);
 
 	/* set the writeback address whether it's enabled or not */
 	WREG32_SOC15(OSSSYS, 0, mmIH_RB_WPTR_ADDR_LO,
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 1/2] drm/amdgpu: update IH_CHICKEN in oss 4.0 IP header for VG/RV series

2019-08-29 Thread Aaron Liu

In Renoir's emulator, those chicken bits need to be programmed.

Signed-off-by: Aaron Liu 
Reviewed-by: Huang Rui 
Reviewed-by: Hawking Zhang 
Acked-by: Alex Deucher 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/include/asic_reg/oss/osssys_4_0_sh_mask.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/oss/osssys_4_0_sh_mask.h b/drivers/gpu/drm/amd/include/asic_reg/oss/osssys_4_0_sh_mask.h
index 1ee3a23..dc9895a 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/oss/osssys_4_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/oss/osssys_4_0_sh_mask.h
@@ -1109,7 +1109,11 @@
 #define IH_CID_REMAP_DATA__CLIENT_ID_REMAP_MASK   0x00FFL
 //IH_CHICKEN
 #define IH_CHICKEN__ACTIVE_FCN_ID_PROT_ENABLE__SHIFT  0x0
+#define IH_CHICKEN__MC_SPACE_FBPA_ENABLE__SHIFT   0x3
+#define IH_CHICKEN__MC_SPACE_GPA_ENABLE__SHIFT0x4
 #define IH_CHICKEN__ACTIVE_FCN_ID_PROT_ENABLE_MASK0x0001L
+#define IH_CHICKEN__MC_SPACE_FBPA_ENABLE_MASK 0x0008L
+#define IH_CHICKEN__MC_SPACE_GPA_ENABLE_MASK  0x0010L
 //IH_MMHUB_CNTL
 #define IH_MMHUB_CNTL__UNITID__SHIFT  0x0
 #define IH_MMHUB_CNTL__IV_TLVL__SHIFT 0x8
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
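
A note connecting the two patches: the SHIFT/MASK pairs added in patch 1/2
are consumed by the REG_SET_FIELD() calls in patch 2/2. For readers
unfamiliar with the pattern, here is a simplified sketch of how that macro
family expands; it mirrors the definitions in
drivers/gpu/drm/amd/amdgpu/amdgpu.h but is reproduced from memory, so treat
it as illustrative rather than authoritative:

	/* Token-pasting helpers: REG_FIELD_SHIFT(IH_CHICKEN, MC_SPACE_GPA_ENABLE)
	 * expands to IH_CHICKEN__MC_SPACE_GPA_ENABLE__SHIFT, i.e. exactly the
	 * macros the header patch above adds. */
	#define REG_FIELD_SHIFT(reg, field) reg##__##field##__SHIFT
	#define REG_FIELD_MASK(reg, field)  reg##__##field##_MASK

	/* Clear the field in orig_val, then OR in the new value, shifted into
	 * place and clamped to the field's mask. */
	#define REG_SET_FIELD(orig_val, reg, field, field_val)              \
		(((orig_val) & ~REG_FIELD_MASK(reg, field)) |               \
		 (REG_FIELD_MASK(reg, field) &                              \
		  ((field_val) << REG_FIELD_SHIFT(reg, field))))

	/* So patch 2/2's
	 *     REG_SET_FIELD(ih_chicken, IH_CHICKEN, MC_SPACE_GPA_ENABLE, 1)
	 * sets bit 4 (mask 0x0010), replacing the magic "ih_chicken |= 0x0010"
	 * that the same patch removes. */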

[PATCH 2/2] drm/amdgpu: Disable page faults while reading user wptrs

2019-08-29 Thread Kuehling, Felix
These wptrs must be pinned and GPU accessible when this is called
from hqd_load functions. So they should never fault. This resolves
a circular lock dependency issue involving four locks including the
DQM lock and mmap_sem.

Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 1af8f83f7e02..c003d9275837 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -179,10 +179,17 @@ uint64_t amdgpu_amdkfd_get_mmio_remap_phys_addr(struct 
kgd_dev *kgd);
 uint32_t amdgpu_amdkfd_get_num_gws(struct kgd_dev *kgd);
 uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev *dst, struct kgd_dev 
*src);
 
+/* Read user wptr from a specified user address space with page fault
+ * disabled. The memory must be pinned and mapped to the hardware when
+ * this is called in hqd_load functions, so it should never fault in
+ * the first place. This resolves a circular lock dependency involving
+ * four locks, including the DQM lock and mmap_sem.
+ */
 #define read_user_wptr(mmptr, wptr, dst)   \
({  \
bool valid = false; \
if ((mmptr) && (wptr)) {\
+   pagefault_disable();\
if ((mmptr) == current->mm) {   \
valid = !get_user((dst), (wptr));   \
} else if (current->mm == NULL) {   \
@@ -190,6 +197,7 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev 
*dst, struct kgd_dev *s
valid = !get_user((dst), (wptr));   \
unuse_mm(mmptr);\
}   \
+   pagefault_enable(); \
}   \
valid;  \
})
-- 
2.17.1
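
For context, a sketch of the kind of hqd_load call site this macro serves.
The function and register names below are simplified stand-ins for the real
kgd_gfx_* implementations, so treat everything here as illustrative:

	/* Illustrative caller (not the exact upstream code): hqd_load copies
	 * the user-space write pointer into the HQD registers when it can be
	 * read. With pagefault_disable() inside read_user_wptr(), this read
	 * can no longer fault and recurse into mmap_sem while the DQM lock is
	 * held; since the wptr BO is pinned and GPU-accessible here, the
	 * non-faulting path is expected to succeed. */
	static void example_hqd_load_wptr(struct mm_struct *mm,
					  uint32_t __user *wptr)
	{
		uint32_t data;

		if (read_user_wptr(mm, wptr, data))
			WREG32(mmCP_HQD_PQ_WPTR_EXAMPLE, data); /* placeholder reg */
		/* else: keep the previously programmed write pointer */
	}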

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 1/2] drm/amdgpu: Remove unnecessary TLB workaround

2019-08-29 Thread Kuehling, Felix
This workaround is better handled in user mode in a way that doesn't
require allocating extra memory and breaking userptr BOs.

The TLB bug is a performance bug, not a functional or security bug.
Hence it is safe to remove this kernel part of the workaround to
allow a better workaround using only virtual address alignments in
user mode.

Signed-off-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 42d209f5fd18..2c73ea7c425c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1110,7 +1110,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
uint64_t user_addr = 0;
struct amdgpu_bo *bo;
struct amdgpu_bo_param bp;
-   int byte_align;
u32 domain, alloc_domain;
u64 alloc_flags;
int ret;
@@ -1165,15 +1164,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
if ((*mem)->aql_queue)
size = size >> 1;
 
-   /* Workaround for TLB bug on older VI chips */
-   byte_align = (adev->family == AMDGPU_FAMILY_VI &&
-   adev->asic_type != CHIP_FIJI &&
-   adev->asic_type != CHIP_POLARIS10 &&
-   adev->asic_type != CHIP_POLARIS11 &&
-   adev->asic_type != CHIP_POLARIS12 &&
-   adev->asic_type != CHIP_VEGAM) ?
-   VI_BO_SIZE_ALIGN : 1;
-
(*mem)->alloc_flags = flags;
 
amdgpu_sync_create(&(*mem)->sync);
@@ -1189,7 +1179,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
 
	memset(&bp, 0, sizeof(bp));
bp.size = size;
-   bp.byte_align = byte_align;
+   bp.byte_align = 1;
bp.domain = alloc_domain;
bp.flags = alloc_flags;
bp.type = bo_type;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
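
A sketch of the user-mode replacement the commit message alludes to: instead
of the kernel padding every allocation to VI_BO_SIZE_ALIGN, a user-mode
allocator can keep the TLB-friendly behavior purely through virtual address
alignment. The constant and helper below are hypothetical placeholders, not
part of any real allocator:

	/* Hypothetical user-mode allocator logic: align only the GPU virtual
	 * address of large buffers to the TLB fragment granularity. No extra
	 * backing memory is allocated, so userptr BOs keep their exact size,
	 * which is what the kernel-side workaround could not guarantee. */
	#define TLB_FRAG_ALIGN (64ull * 1024)	/* assumed value, ASIC-dependent */

	static uint64_t choose_gpu_va(uint64_t va_hint, uint64_t size)
	{
		if (size >= TLB_FRAG_ALIGN)
			return (va_hint + TLB_FRAG_ALIGN - 1) &
			       ~(TLB_FRAG_ALIGN - 1);
		return va_hint;
	}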

RE: [PATCH 7/7] drm/amdgpu: switch to amdgpu_ras_late_init for nbio v7_4 (v2)

2019-08-29 Thread Zhou1, Tao
With the two points in patch #1 and patch #5 fixed, the series is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: amd-gfx  On Behalf Of
> Hawking Zhang
> Sent: August 29, 2019 21:31
> To: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> ; Zhou1, Tao ; Chen,
> Guchun 
> Cc: Zhang, Hawking 
> Subject: [PATCH 7/7] drm/amdgpu: switch to amdgpu_ras_late_init for nbio
> v7_4 (v2)
> 
> call helper function in late init phase to handle ras init for nbio ip block
> 
> v2: init local var r to 0 so the function does not return failure on
> asics that don't have a ras_late_init implementation
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/soc15.c | 13 -
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c
> b/drivers/gpu/drm/amd/amdgpu/soc15.c
> index e791ac3..c6ff225c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/soc15.c
> +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
> @@ -1208,11 +1208,15 @@ static int soc15_common_early_init(void *handle)
>  static int soc15_common_late_init(void *handle)
>  {
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> + int r = 0;
> 
>   if (amdgpu_sriov_vf(adev))
>   xgpu_ai_mailbox_get_irq(adev);
> 
> - return 0;
> + if (adev->nbio.funcs->ras_late_init)
> + r = adev->nbio.funcs->ras_late_init(adev);
> +
> + return r;
>  }
> 
>  static int soc15_common_sw_init(void *handle)
> @@ -1289,6 +1293,13 @@ static int soc15_common_hw_fini(void *handle)
>   if (amdgpu_sriov_vf(adev))
>   xgpu_ai_mailbox_put_irq(adev);
> 
> + if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
> + if (adev->nbio.funcs->init_ras_controller_interrupt)
> + amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
> + if (adev->nbio.funcs->init_ras_err_event_athub_interrupt)
> + amdgpu_irq_put(adev, &adev->nbio.ras_err_event_athub_irq, 0);
> + }
> +
>   return 0;
>  }
> 
> --
> 2.7.4
> 
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback function (v2)

2019-08-29 Thread Zhou1, Tao


> -Original Message-
> From: Hawking Zhang 
> Sent: August 29, 2019 21:31
> To: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> ; Zhou1, Tao ; Chen,
> Guchun 
> Cc: Zhang, Hawking 
> Subject: [PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback
> function (v2)
> 
> The function will be called in late init phase to do mmhub ras init
> 
> v2: check ras_late_init function pointer before invoking the function
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 26 --
>  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c   | 28
> 
>  3 files changed, 33 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> index 2d75ecf..df04c71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
> @@ -23,6 +23,7 @@
> 
>  struct amdgpu_mmhub_funcs {
>   void (*ras_init)(struct amdgpu_device *adev);
> + int (*ras_late_init)(struct amdgpu_device *adev);
>   void (*query_ras_error_count)(struct amdgpu_device *adev,
>   void *ras_error_status);
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 8a7a56a..70a05e3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -762,7 +762,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)  {
>   int r;
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> - struct ras_ih_if mmhub_ih_info;
>   struct ras_fs_if umc_fs_info = {
>   .sysfs_name = "umc_err_count",
>   .debugfs_name = "umc_err_inject",
> @@ -770,10 +769,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)
>   struct ras_ih_if umc_ih_info = {
>   .cb = gmc_v9_0_process_ras_data_cb,
>   };
> - struct ras_fs_if mmhub_fs_info = {
> - .sysfs_name = "mmhub_err_count",
> - .debugfs_name = "mmhub_err_inject",
> - };
> 
>   if (!adev->gmc.umc_ras_if) {
> adev->gmc.umc_ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> @@ -797,29 +792,16 @@ static int gmc_v9_0_ecc_late_init(void *handle)
>   goto umc_late_fini;
>   }
> 
> - if (!adev->gmc.mmhub_ras_if) {
> - adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> - if (!adev->gmc.mmhub_ras_if)
> - return -ENOMEM;
> - adev->gmc.mmhub_ras_if->block = AMDGPU_RAS_BLOCK__MMHUB;
> - adev->gmc.mmhub_ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> - adev->gmc.mmhub_ras_if->sub_block_index = 0;
> - strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
> + if (adev->mmhub_funcs->ras_late_init) {
> + r = adev->mmhub_funcs->ras_late_init(adev);
> + if (r)
> + return r;
>   }
> - mmhub_ih_info.head = mmhub_fs_info.head = *adev->gmc.mmhub_ras_if;
[Tao] mmhub_ih_info.cb = NULL is recommended in case of random value

> - r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
> -  &mmhub_fs_info, &mmhub_ih_info);
> - if (r)
> - goto mmhub_late_fini;
> -
>   return 0;
> -mmhub_late_fini:
> - amdgpu_ras_late_fini(adev, adev->gmc.mmhub_ras_if, &mmhub_ih_info);
>  umc_late_fini:
>   amdgpu_ras_late_fini(adev, adev->gmc.umc_ras_if, &umc_ih_info);
>  free:
>   kfree(adev->gmc.umc_ras_if);
> - kfree(adev->gmc.mmhub_ras_if);
>   return r;
>  }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> index 04cd4b6..9f7d5d1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
> @@ -31,6 +31,7 @@
>  #include "vega10_enum.h"
> 
>  #include "soc15_common.h"
> +#include "amdgpu_ras.h"
> 
>  #define mmDAGB0_CNTL_MISC2_RV 0x008f
>  #define mmDAGB0_CNTL_MISC2_RV_BASE_IDX 0 @@ -615,6 +616,33 @@
> static void mmhub_v1_0_query_ras_error_count(struct amdgpu_device
> *adev,
>   }
>  }
> 
> +static int mmhub_v1_0_ras_late_init(struct amdgpu_device *adev)
> +{
> + int r;
> + struct ras_ih_if mmhub_ih_info;
> + struct ras_fs_if mmhub_fs_info = {
> + .sysfs_name = "mmhub_err_count",
> + .debugfs_name = "mmhub_err_inject",
> + };
> +
> + if (!adev->gmc.mmhub_ras_if) {
> + adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
> + if (!adev->gmc.mmhub_ras_if)
> + return -ENOMEM;
> + adev->gmc.mmhub_ras_if->block = AMDGPU_RAS_BLOCK__MMHUB;
> + adev->gmc.mmhub_ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> + adev->gmc.mmhub_ras_if->sub_block_index = 0;
> +

RE: [PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init/fini (v3)

2019-08-29 Thread Zhou1, Tao


> -Original Message-
> From: Hawking Zhang 
> Sent: August 29, 2019 21:30
> To: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> ; Zhou1, Tao ; Chen,
> Guchun 
> Cc: Zhang, Hawking 
> Subject: [PATCH 1/7] drm/amdgpu: add helper function to do common
> ras_late_init/fini (v3)
> 
> In late_init for ras, the helper function will be used to:
> 1) disable the ras feature if the IP block is masked as disabled
> 2) send the enable feature command if the IP block was masked as enabled
> 3) create a debugfs/sysfs node per IP block
> 4) register the interrupt handler
> 
> v2: check ih_info.cb to decide whether to add an interrupt handler
> 
> v3: add ras_late_fini for cleaning up all the ras fs nodes and removing the
> interrupt handler
> 
> Signed-off-by: Hawking Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 72
> +
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  7 
>  2 files changed, 79 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 230f7e6..2b930fa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1564,6 +1564,78 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   return -EINVAL;
>  }
> 
> +/* helper function to handle common stuff in ip late init phase */
> +int amdgpu_ras_late_init(struct amdgpu_device *adev,
> +  struct ras_common_if *ras_block,
> +  struct ras_fs_if *fs_info,
> +  struct ras_ih_if *ih_info)
> +{
> + int r;
> +
> + /* disable RAS feature per IP block if it is not supported */
> + if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
> + amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
> + return 0;
> + }
> +
> + r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
> + if (r) {
> + if (r == -EAGAIN) {
> + /* request gpu reset. will run again */
> + amdgpu_ras_request_reset_on_boot(adev,
> + ras_block->block);
> + return 0;
> + } else if (adev->in_suspend || adev->in_gpu_reset) {
> + /* in resume phase, if fail to enable ras,
> +  * clean up all ras fs nodes, and disable ras */
> + goto cleanup;
> + } else
> + return r;
> + }
> +
> + /* in resume phase, no need to create ras fs node */
> + if (adev->in_suspend || adev->in_gpu_reset)
> + return 0;
> +
> + if(ih_info->cb) {
[Tao] need a space between "if" and "("

> + r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
> + if (r)
> + goto interrupt;
> + }
> +
> + amdgpu_ras_debugfs_create(adev, fs_info);
> +
> + r = amdgpu_ras_sysfs_create(adev, fs_info);
> + if (r)
> + goto sysfs;
> +
> + return 0;
> +cleanup:
> + amdgpu_ras_sysfs_remove(adev, ras_block);
> +sysfs:
> + amdgpu_ras_debugfs_remove(adev, ras_block);
> + if (ih_info->cb)
> + amdgpu_ras_interrupt_remove_handler(adev, ih_info);
> +interrupt:
> + amdgpu_ras_feature_enable(adev, ras_block, 0);
> + return r;
> +}
> +
> +/* helper function to remove ras fs node and interrupt handler */
> +void amdgpu_ras_late_fini(struct amdgpu_device *adev,
> +   struct ras_common_if *ras_block,
> +   struct ras_ih_if *ih_info)
> +{
> + if (!ras_block || !ih_info)
> + return;
> +
> + amdgpu_ras_sysfs_remove(adev, ras_block);
> + amdgpu_ras_debugfs_remove(adev, ras_block);
> + if (ih_info->cb)
> + amdgpu_ras_interrupt_remove_handler(adev, ih_info);
> + amdgpu_ras_feature_enable(adev, ras_block, 0);
> +}
> +
>  /* do some init work after IP late init as dependence.
>   * and it runs in resume/gpu reset/booting up cases.
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 6c76bb2..66b7152 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -566,6 +566,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
>  int amdgpu_ras_init(struct amdgpu_device *adev);
>  int amdgpu_ras_fini(struct amdgpu_device *adev);
>  int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
> +int amdgpu_ras_late_init(struct amdgpu_device *adev,
> +  struct ras_common_if *ras_block,
> +  struct ras_fs_if *fs_info,
> +  struct ras_ih_if *ih_info);
> +void amdgpu_ras_late_fini(struct amdgpu_device *adev,
> +   struct ras_common_if *ras_block,
> +   struct ras_ih_if *ih_info);
> 
>  int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
>   struct ras_common_if *head, bool enable);
> --
> 
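
For reference, the call pattern this helper expects from an IP block,
condensed from the real mmhub caller in patch 5/7 above. The "example" names
are placeholders, and the explicit NULL callback follows Tao's review note on
that patch:

	/* Sketch of an IP block's ras_late_init using the new helper. fs_info
	 * names the sysfs/debugfs nodes; ih_info.cb is NULL when the block has
	 * no dedicated RAS interrupt handler, so amdgpu_ras_late_init() skips
	 * interrupt registration. */
	static int example_ip_ras_late_init(struct amdgpu_device *adev,
					    struct ras_common_if *ras_if)
	{
		struct ras_fs_if fs_info = {
			.sysfs_name = "example_err_count",
			.debugfs_name = "example_err_inject",
		};
		struct ras_ih_if ih_info = {
			.cb = NULL,	/* explicitly initialized, per the review */
		};

		fs_info.head = ih_info.head = *ras_if;
		return amdgpu_ras_late_init(adev, ras_if, &fs_info, &ih_info);
	}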

Re: [PATCH v3 00/39] put_user_pages(): miscellaneous call sites

2019-08-29 Thread John Hubbard

On 8/29/2019 6:29 PM, Mike Marshall wrote:

> Hi John...
>
> I added this patch series on top of Linux 5.3rc6 and ran
> xfstests with no regressions...
>
> Acked-by: Mike Marshall 



Hi Mike (and I hope Ira and others are reading as well, because
I'm making a bunch of claims further down),

That's great news, thanks for running that test suite and for
the report and the ACK.

There is an interesting pause right now, due to the fact that
we've made some tentative decisions about gup pinning, that affect
the call sites. A key decision is that only pages that were
requested via FOLL_PIN, will require put_user_page*() to release
them. There are 4 main cases, which were first explained by Jan
Kara and Vlastimil Babka, and are now written up in my FOLL_PIN
patch [1].

So, what that means for this series is that:

1. Some call sites (mlock.c for example, and a lot of the mm/ files
in fact, and more) will not be converted: some of these patches will
get dropped, especially in mm/.

2. Call sites that do DirectIO or RDMA will need to set FOLL_PIN, and
will also need to call put_user_page().

3. Call sites that do RDMA will need to set FOLL_LONGTERM *and* FOLL_PIN,

   3.a. ...and will at least in some cases need to provide a link to a
   vaddr_pin object, and thus back to a struct file*...maybe. Still
   under discussion.

4. It's desirable to keep FOLL_* flags (or at least FOLL_PIN) internal
to the gup() calls. That implies using a wrapper call such as Ira's
vaddr_pin_[user]_pages(), instead of gup(), and vaddr_unpin_[user]_pages()
instead of put_user_page*().

5. We don't want to churn the call sites unnecessarily.

With that in mind, I've taken another pass through all these patches
and narrowed it down to:

a) 12 call sites that I'd like to convert soon, but even those
   really look cleaner with a full conversion to a wrapper call
   similar to (identical to?) vaddr_pin_[user]_pages(), probably
   just the FOLL_PIN only variant (not FOLL_LONGTERM). That
   wrapper call is not ready yet, though.

b) Some more call sites that require both FOLL_PIN and FOLL_LONGTERM.
   Definitely will wait to use the wrapper calls for these, because
   they may also require hooking up to a struct file*.

c) A few more that were already applied, which is fine, because they
   show where to convert, and simplify a few sites anyway. But they'll
   need follow-on changes to, one way or another, set FOLL_PIN.

d) And of course a few sites whose patches get dropped, as mentioned
   above.

[1] https://lore.kernel.org/r/20190821040727.19650-3-jhubb...@nvidia.com

thanks,
--
John Hubbard
NVIDIA
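
For readers outside the gup() discussion: the call-site rule being converged
on above is symmetry between pinning and release. A sketch of the release
side only, since the pinning wrapper names (vaddr_pin_[user]_pages) are still
tentative; put_user_pages() and the three-argument
put_user_pages_dirty_lock() are the forms introduced by this series:

	/* Release pages that were obtained for DirectIO/RDMA-style access.
	 * Pages pinned this way must go back through put_user_page*(), never
	 * plain put_page(); the dirty variant also marks them dirty under
	 * the proper lock before release. */
	static void release_pinned_pages(struct page **pages,
					 unsigned long npages, bool make_dirty)
	{
		if (make_dirty)
			put_user_pages_dirty_lock(pages, npages, true);
		else
			put_user_pages(pages, npages);
	}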


RE: [PATCH V2] drm/amd/powerplay: SMU_MSG_OverridePcieParameters is unsupported for APU

2019-08-29 Thread Quan, Evan
Reviewed-by: Evan Quan 

-Original Message-
From: amd-gfx  On Behalf Of Aaron Liu
Sent: Friday, August 30, 2019 10:10 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Huang, Ray 
; Liu, Aaron 
Subject: [PATCH V2] drm/amd/powerplay: SMU_MSG_OverridePcieParameters is unsupported for APU

For APU, SMU_MSG_OverridePcieParameters is unsupported.
So return directly in the smu_override_pcie_parameters function.

Signed-off-by: Aaron Liu 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 8c61778..b726565 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1001,6 +1001,9 @@ static int smu_override_pcie_parameters(struct 
smu_context *smu)
uint32_t pcie_gen = 0, pcie_width = 0, smu_pcie_arg;
int ret;
 
+   if (adev->flags & AMD_IS_APU)
+   return 0;
+
if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
pcie_gen = 3;
else if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH V2] drm/amd/powerplay: SMU_MSG_OverridePcieParameters is unsupported for APU

2019-08-29 Thread Aaron Liu
For APU, SMU_MSG_OverridePcieParameters is unsupported.
So return directly in the smu_override_pcie_parameters function.

Signed-off-by: Aaron Liu 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 8c61778..b726565 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1001,6 +1001,9 @@ static int smu_override_pcie_parameters(struct 
smu_context *smu)
uint32_t pcie_gen = 0, pcie_width = 0, smu_pcie_arg;
int ret;
 
+   if (adev->flags & AMD_IS_APU)
+   return 0;
+
if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
pcie_gen = 3;
else if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH] drm/amd/powerplay: SMU_MSG_OverridePcieParameters is unsupported for APU

2019-08-29 Thread Aaron Liu

For APU, SMU_MSG_OverridePcieParameters is unsupported.
So return directly in the smu_override_pcie_parameters function.

Signed-off-by: Aaron Liu 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 8c61778..b726565 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1001,6 +1001,9 @@ static int smu_override_pcie_parameters(struct smu_context *smu)
 	uint32_t pcie_gen = 0, pcie_width = 0, smu_pcie_arg;
 	int ret;
 
+	if (adev->flags & AMD_IS_APU)
+		return 0;
+
 	if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4)
 		pcie_gen = 3;
 	else if (adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3)
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH v2 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Zhou1, Tao


> -Original Message-
> From: Andrey Grodzovsky 
> Sent: August 30, 2019 8:54
> To: amd-gfx@lists.freedesktop.org
> Cc: alexdeuc...@gmail.com; Zhang, Hawking ;
> ckoenig.leichtzumer...@gmail.com; Zhou1, Tao ;
> Grodzovsky, Andrey 
> Subject: [PATCH v2 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.
> 
> Problem:
> Under certain conditions, when some IP bocks take a RAS error, we can get

[Tao] typo: "dmr/amdgpu" -> "drm/amdgpu", "IP bocks" -> "IP blocks"

> into a situation where a GPU reset is not possible due to issues in RAS in
> SMU/PSP.
> 
> Temporary fix until proper solution in PSP/SMU is ready:
> When an uncorrectable error happens, the DF will unconditionally broadcast
> error event packets to all its clients/slaves upon receiving the fatal error
> event and freeze all its outbound queues; the err_event_athub interrupt will
> be triggered. In such a case we use this interrupt to issue a GPU reset. The
> GPU reset code is modified for such a case to avoid a HW reset: it only stops
> the schedulers, detaches all in-progress and not-yet-scheduled jobs' fences,
> sets an error code on them and signals them.
> Also reject any new incoming job submissions from user space.
> All this is done to notify the applications of the problem.
> 
> v2:
> Extract amdgpu_amdkfd_pre/post_reset from
> amdgpu_device_lock/unlock_adev Move
> amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c Remove print param
> from amdgpu_ras_query_error_count
> 
> Signed-off-by: Andrey Grodzovsky 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46
> +++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  5 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 38
> 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|  3 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  6 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 22 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 10 +++
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 24 +---
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++--
>  12 files changed, 163 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 9da681e..300adb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -38,6 +38,7 @@
>  #include "amdgpu_gmc.h"
>  #include "amdgpu_gem.h"
>  #include "amdgpu_display.h"
> +#include "amdgpu_ras.h"
> 
>  #if defined(HAVE_DRM_FREE_LARGE)
>  #define kvfree drm_free_large
> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void
> *data, struct drm_file *filp)
>   bool reserved_buffers = false;
>   int i, r;
> 
> + if (amdgpu_ras_intr_triggered())
> + return -EHWPOISON;
> +
>   if (!adev->accel_working)
>   return -EBUSY;
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a5daccc..d3a078b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3727,25 +3727,18 @@ static bool amdgpu_device_lock_adev(struct
> amdgpu_device *adev, bool trylock)
>   adev->mp1_state = PP_MP1_STATE_NONE;
>   break;
>   }
> - /* Block kfd: SRIOV would do it separately */
> - if (!amdgpu_sriov_vf(adev))
> -amdgpu_amdkfd_pre_reset(adev);
> 
>   return true;
>  }
> 
>  static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)  {
> - /*unlock kfd: SRIOV would do it separately */
> - if (!amdgpu_sriov_vf(adev))
> -amdgpu_amdkfd_post_reset(adev);
>   amdgpu_vf_error_trans_all(adev);
>   adev->mp1_state = PP_MP1_STATE_NONE;
>   adev->in_gpu_reset = 0;
>   mutex_unlock(&adev->lock_reset);
>  }
> 
> -
>  /**
>   * amdgpu_device_gpu_recover - reset the asic and recover scheduler
>   *
> @@ -3765,11 +3758,12 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
>   struct amdgpu_hive_info *hive = NULL;
>   struct amdgpu_device *tmp_adev = NULL;
>   int i, r = 0;
> + bool in_ras_intr = amdgpu_ras_intr_triggered();
> 
>   need_full_reset = job_signaled = false;
>   INIT_LIST_HEAD(&device_list);
> 
> - dev_info(adev->dev, "GPU reset begin!\n");
> + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
> 
>   cancel_delayed_work_sync(&adev->delayed_init_work);
> 
> @@ -3796,9 +3790,16 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
>   return 0;
>   }
> 
> + /* Block kfd: SRIOV would do it separately */
> + if (!amdgpu_sriov_vf(adev))
> +amdgpu_amdkfd_pre_reset(adev);
> +
>   /* Build list of devices to reset */
>   if  (adev->gmc.xgmi.num_physical_nodes 

Re: [PATCH v3 00/39] put_user_pages(): miscellaneous call sites

2019-08-29 Thread Mike Marshall
Hi John...

I added this patch series on top of Linux 5.3rc6 and ran
xfstests with no regressions...

Acked-by: Mike Marshall 

-Mike

On Tue, Aug 6, 2019 at 9:50 PM John Hubbard  wrote:
>
> On 8/6/19 6:32 PM, john.hubb...@gmail.com wrote:
> > From: John Hubbard 
> > ...
> >
> > John Hubbard (38):
> >   mm/gup: add make_dirty arg to put_user_pages_dirty_lock()
> ...
> >  54 files changed, 191 insertions(+), 323 deletions(-)
> >
> ahem, yes, apparently this is what happens if I add a few patches while 
> editing
> the cover letter... :)
>
> The subject line should read "00/41", and the list of files affected here is
> therefore under-reported in this cover letter. However, the patch series 
> itself is
> intact and ready for submission.
>
> thanks,
> --
> John Hubbard
> NVIDIA


Re: [PATCH v2 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Kuehling, Felix

On 2019-08-29 8:53 p.m., Andrey Grodzovsky wrote:
> Problem:
> Under certain conditions, when some IP bocks take a RAS error,
> we can get into a situation where a GPU reset is not possible
> due to issues in RAS in SMU/PSP.
>
> Temporary fix until proper solution in PSP/SMU is ready:
> When an uncorrectable error happens, the DF will unconditionally
> broadcast error event packets to all its clients/slaves upon
> receiving the fatal error event and freeze all its outbound queues;
> the err_event_athub interrupt will be triggered.
> In such a case we use this interrupt
> to issue a GPU reset. The GPU reset code is modified for such a case to avoid a HW
> reset: it only stops the schedulers, detaches all in-progress and not-yet-scheduled
> jobs' fences, sets an error code on them and signals them.
> Also reject any new incoming job submissions from user space.
> All this is done to notify the applications of the problem.
>
> v2:
> Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev
> Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c
> Remove print param from amdgpu_ras_query_error_count
>
> Signed-off-by: Andrey Grodzovsky 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 
> +++---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  5 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 38 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  6 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 22 --
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 10 +++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 24 +---
>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 
>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++--
>   12 files changed, 163 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 9da681e..300adb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -38,6 +38,7 @@
>   #include "amdgpu_gmc.h"
>   #include "amdgpu_gem.h"
>   #include "amdgpu_display.h"
> +#include "amdgpu_ras.h"
>   
>   #if defined(HAVE_DRM_FREE_LARGE)
>   #define kvfree drm_free_large
> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
> struct drm_file *filp)
>   bool reserved_buffers = false;
>   int i, r;
>   
> + if (amdgpu_ras_intr_triggered())
> + return -EHWPOISON;
> +
>   if (!adev->accel_working)
>   return -EBUSY;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a5daccc..d3a078b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3727,25 +3727,18 @@ static bool amdgpu_device_lock_adev(struct 
> amdgpu_device *adev, bool trylock)
>   adev->mp1_state = PP_MP1_STATE_NONE;
>   break;
>   }
> - /* Block kfd: SRIOV would do it separately */
> - if (!amdgpu_sriov_vf(adev))
> -amdgpu_amdkfd_pre_reset(adev);
>   
>   return true;
>   }
>   
>   static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>   {
> - /*unlock kfd: SRIOV would do it separately */
> - if (!amdgpu_sriov_vf(adev))
> -amdgpu_amdkfd_post_reset(adev);
>   amdgpu_vf_error_trans_all(adev);
>   adev->mp1_state = PP_MP1_STATE_NONE;
>   adev->in_gpu_reset = 0;
>   mutex_unlock(&adev->lock_reset);
>   }
>   
> -
>   /**
>* amdgpu_device_gpu_recover - reset the asic and recover scheduler
>*
> @@ -3765,11 +3758,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>   struct amdgpu_hive_info *hive = NULL;
>   struct amdgpu_device *tmp_adev = NULL;
>   int i, r = 0;
> + bool in_ras_intr = amdgpu_ras_intr_triggered();
>   
>   need_full_reset = job_signaled = false;
>   INIT_LIST_HEAD(&device_list);
>   
> - dev_info(adev->dev, "GPU reset begin!\n");
> + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
>   
>   cancel_delayed_work_sync(&adev->delayed_init_work);
>   
> @@ -3796,9 +3790,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>   return 0;
>   }
>   
> + /* Block kfd: SRIOV would do it separately */
> + if (!amdgpu_sriov_vf(adev))
> +amdgpu_amdkfd_pre_reset(adev);
> +
>   /* Build list of devices to reset */
>   if  (adev->gmc.xgmi.num_physical_nodes > 1) {
>   if (!hive) {
> + /*unlock kfd: SRIOV would do it separately */
> + if (!amdgpu_sriov_vf(adev))
> + amdgpu_amdkfd_post_reset(adev);
>   

[PATCH v2 2/2] dmr/amdgpu: Add system auto reboot to RAS.

2019-08-29 Thread Andrey Grodzovsky
In case of a RAS error, allow the user to configure an automatic
system reboot through ras_ctrl.
This is also part of the temporary workaround for the RAS
hang problem.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d3a078b..2586e8e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3760,6 +3760,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool in_ras_intr = amdgpu_ras_intr_triggered();
 
+   /*
+* Flush RAM to disk so that after reboot
+* the user can read the log and see why the system rebooted.
+*
+* Using user mode app call instead of kernel APIs such as
+* ksys_sync_helper for backward compatibility with earlier
+* kernels into which this is also intended.
+*/
+   if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+   char *envp[] = { "HOME=/", NULL };
+   char *argv[] = { "/bin/sync", NULL };
+
+   DRM_WARN("Emergency reboot.");
+
+   call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+   emergency_restart();
+   }
+
need_full_reset = job_signaled = false;
	INIT_LIST_HEAD(&device_list);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7b00ac6..038b0a6f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -30,6 +30,7 @@
 #include "amdgpu_ras.h"
 #include "amdgpu_atomfirmware.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include <linux/reboot.h>
 
 const char *ras_error_string[] = {
"none",
@@ -154,6 +155,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file 
*f,
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
+   else if (sscanf(str, "reboot %32s", block_name) == 1)
+   op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;
@@ -287,6 +290,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
/* data.inject.address is offset instead of absolute gpu 
address */
	ret = amdgpu_ras_error_inject(adev, &data.inject);
break;
+   case 3:
+   amdgpu_ras_get_context(adev)->reboot = true;
+   break;
default:
ret = -EINVAL;
break;
@@ -1733,6 +1739,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
-		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
+		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
+
+   amdgpu_ras_reset_gpu(adev, false);
}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cf5ffb6..45c0dab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -333,6 +333,7 @@ struct amdgpu_ras {
struct mutex recovery_lock;
 
uint32_t flags;
+   bool reboot;
 };
 
 struct ras_fs_data {
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
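
A usage sketch for the new control: arming the auto-reboot from user space by
writing the new "reboot" command into the ras_ctrl debugfs file. The debugfs
path below is an assumption based on the usual amdgpu RAS layout; adjust it
for the actual card index:

	/* Hypothetical user-space helper: writes "reboot <block>" into
	 * ras_ctrl, which the new sscanf(str, "reboot %32s", block_name)
	 * case parses and turns into
	 * amdgpu_ras_get_context(adev)->reboot = true. */
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *cmd = "reboot umc";	/* block name is illustrative */
		/* path is an assumption: dri/<minor>/ras/ras_ctrl */
		int fd = open("/sys/kernel/debug/dri/0/ras/ras_ctrl", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, cmd, strlen(cmd)) < 0) {
			close(fd);
			return 1;
		}
		close(fd);
		return 0;
	}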

[PATCH v2 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Andrey Grodzovsky
Problem:
Under certain conditions, when some IP bocks take a RAS error,
we can get into a situation where a GPU reset is not possible
due to issues in RAS in SMU/PSP.

Temporary fix until proper solution in PSP/SMU is ready:
When an uncorrectable error happens, the DF will unconditionally
broadcast error event packets to all its clients/slaves upon
receiving the fatal error event and freeze all its outbound queues;
the err_event_athub interrupt will be triggered.
In such a case we use this interrupt
to issue a GPU reset. The GPU reset code is modified for such a case to avoid a HW
reset: it only stops the schedulers, detaches all in-progress and not-yet-scheduled
jobs' fences, sets an error code on them and signals them.
Also reject any new incoming job submissions from user space.
All this is done to notify the applications of the problem.

v2:
Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev
Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c
Remove print param from amdgpu_ras_query_error_count

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  5 
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c| 38 
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h|  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  6 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 22 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 10 +++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 ---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 24 +---
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++--
 12 files changed, 163 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 9da681e..300adb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -38,6 +38,7 @@
 #include "amdgpu_gmc.h"
 #include "amdgpu_gem.h"
 #include "amdgpu_display.h"
+#include "amdgpu_ras.h"
 
 #if defined(HAVE_DRM_FREE_LARGE)
 #define kvfree drm_free_large
@@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
bool reserved_buffers = false;
int i, r;
 
+   if (amdgpu_ras_intr_triggered())
+   return -EHWPOISON;
+
if (!adev->accel_working)
return -EBUSY;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a5daccc..d3a078b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3727,25 +3727,18 @@ static bool amdgpu_device_lock_adev(struct 
amdgpu_device *adev, bool trylock)
adev->mp1_state = PP_MP1_STATE_NONE;
break;
}
-   /* Block kfd: SRIOV would do it separately */
-   if (!amdgpu_sriov_vf(adev))
-amdgpu_amdkfd_pre_reset(adev);
 
return true;
 }
 
 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 {
-   /*unlock kfd: SRIOV would do it separately */
-   if (!amdgpu_sriov_vf(adev))
-amdgpu_amdkfd_post_reset(adev);
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
adev->in_gpu_reset = 0;
	mutex_unlock(&adev->lock_reset);
 }
 
-
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -3765,11 +3758,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
*adev,
struct amdgpu_hive_info *hive = NULL;
struct amdgpu_device *tmp_adev = NULL;
int i, r = 0;
+   bool in_ras_intr = amdgpu_ras_intr_triggered();
 
need_full_reset = job_signaled = false;
	INIT_LIST_HEAD(&device_list);
 
-   dev_info(adev->dev, "GPU reset begin!\n");
+   dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
 
	cancel_delayed_work_sync(&adev->delayed_init_work);
 
@@ -3796,9 +3790,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
return 0;
}
 
+   /* Block kfd: SRIOV would do it separately */
+   if (!amdgpu_sriov_vf(adev))
+amdgpu_amdkfd_pre_reset(adev);
+
/* Build list of devices to reset */
if  (adev->gmc.xgmi.num_physical_nodes > 1) {
if (!hive) {
+   /*unlock kfd: SRIOV would do it separately */
+   if (!amdgpu_sriov_vf(adev))
+   amdgpu_amdkfd_post_reset(adev);
amdgpu_device_unlock_adev(adev);
return -ENODEV;
}
@@ -3824,7 +3825,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, 

[PATCH 3/4] drm/amdgpu: Determing PTE flags separately for each mapping (v3)

2019-08-29 Thread Kuehling, Felix
The same BO can be mapped with different PTE flags by different GPUs.
Therefore determine the PTE flags separately for each mapping instead
of storing them in the KFD buffer object.

Add a helper function to determine the PTE flags to be extended with
ASIC and memory-type-specific logic in subsequent commits.

v2: Split Arcturus-specific MTYPE changes into separate commit
v3: Fix return type of get_pte_flags to uint64_t

Signed-off-by: Felix Kuehling 
Acked-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  2 +-
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 39 +++
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index e519df3fd2b6..1af8f83f7e02 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -57,7 +57,7 @@ struct kgd_mem {
unsigned int mapped_to_gpu_memory;
uint64_t va;
 
-   uint32_t mapping_flags;
+   uint32_t alloc_flags;
 
atomic_t invalid;
struct amdkfd_process_info *process_info;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 44a52b09cc58..aae19d221f42 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -355,6 +355,23 @@ static int vm_update_pds(struct amdgpu_vm *vm, struct 
amdgpu_sync *sync)
return amdgpu_sync_fence(NULL, sync, vm->last_update, false);
 }
 
+static uint64_t get_pte_flags(struct amdgpu_device *adev, struct kgd_mem *mem)
+{
+   bool coherent = mem->alloc_flags & ALLOC_MEM_FLAGS_COHERENT;
+   uint32_t mapping_flags;
+
+   mapping_flags = AMDGPU_VM_PAGE_READABLE;
+   if (mem->alloc_flags & ALLOC_MEM_FLAGS_WRITABLE)
+   mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;
+   if (mem->alloc_flags & ALLOC_MEM_FLAGS_EXECUTABLE)
+   mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
+
+   mapping_flags |= coherent ?
+   AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
+
+   return amdgpu_gmc_get_pte_flags(adev, mapping_flags);
+}
+
 /* add_bo_to_vm - Add a BO to a VM
  *
  * Everything that needs to bo done only once when a BO is first added
@@ -403,8 +420,7 @@ static int add_bo_to_vm(struct amdgpu_device *adev, struct 
kgd_mem *mem,
}
 
bo_va_entry->va = va;
-   bo_va_entry->pte_flags = amdgpu_gmc_get_pte_flags(adev,
-mem->mapping_flags);
+   bo_va_entry->pte_flags = get_pte_flags(adev, mem);
bo_va_entry->kgd_dev = (void *)adev;
list_add(_va_entry->bo_list, list_bo_va);
 
@@ -1081,7 +1097,6 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
int byte_align;
u32 domain, alloc_domain;
u64 alloc_flags;
-   uint32_t mapping_flags;
int ret;
 
/*
@@ -1143,16 +1158,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
adev->asic_type != CHIP_VEGAM) ?
VI_BO_SIZE_ALIGN : 1;
 
-   mapping_flags = AMDGPU_VM_PAGE_READABLE;
-   if (flags & ALLOC_MEM_FLAGS_WRITABLE)
-   mapping_flags |= AMDGPU_VM_PAGE_WRITEABLE;
-   if (flags & ALLOC_MEM_FLAGS_EXECUTABLE)
-   mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
-   if (flags & ALLOC_MEM_FLAGS_COHERENT)
-   mapping_flags |= AMDGPU_VM_MTYPE_UC;
-   else
-   mapping_flags |= AMDGPU_VM_MTYPE_NC;
-   (*mem)->mapping_flags = mapping_flags;
+   (*mem)->alloc_flags = flags;
 
amdgpu_sync_create(&(*mem)->sync);
 
@@ -1625,9 +1631,10 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct kgd_dev 
*kgd,
 
INIT_LIST_HEAD(&(*mem)->bo_va_list);
mutex_init(&(*mem)->lock);
-   (*mem)->mapping_flags =
-   AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
-   AMDGPU_VM_PAGE_EXECUTABLE | AMDGPU_VM_MTYPE_NC;
+   (*mem)->alloc_flags =
+   ((bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
+ALLOC_MEM_FLAGS_VRAM : ALLOC_MEM_FLAGS_GTT) |
+   ALLOC_MEM_FLAGS_WRITABLE | ALLOC_MEM_FLAGS_EXECUTABLE;
 
(*mem)->bo = amdgpu_bo_ref(bo);
(*mem)->va = va;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: gnome-shell stuck because of amdgpu driver [5.3 RC5]

2019-08-29 Thread mikhail.v.gavrilov
On Sun, Aug 25, 2019 at 10:13:05PM +0800, Hillf Danton wrote:
> Can we try to add the fallback timer manually?
> 
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -322,6 +322,10 @@ int amdgpu_fence_wait_empty(struct amdgp
> }
> rcu_read_unlock();
>  
> +   if (!timer_pending(&ring->fence_drv.fallback_timer))
> +   mod_timer(&ring->fence_drv.fallback_timer,
> +   jiffies + (AMDGPU_FENCE_JIFFIES_TIMEOUT << 1));
> +
> r = dma_fence_wait(fence, false);
> dma_fence_put(fence);
> return r;
> --
> 
> Or simply wait with an ear on signal and timeout if adding timer
> seems to go a bit too far?
> 
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -322,7 +322,12 @@ int amdgpu_fence_wait_empty(struct amdgp
> }
> rcu_read_unlock();
>  
> -   r = dma_fence_wait(fence, false);
> +   if (0 < dma_fence_wait_timeout(fence, true,
> +   AMDGPU_FENCE_JIFFIES_TIMEOUT +
> +   (AMDGPU_FENCE_JIFFIES_TIMEOUT >> 3)))
> +   r = 0;
> +   else
> +   r = -EINVAL;
> dma_fence_put(fence);
> return r;
>  }

I tested both patches on top of 5.3 RC6. I tested each patch for more
than 24 hours and did not see any regressions or problems with them.


On Mon, 2019-08-26 at 11:24 +0200, Daniel Vetter wrote:
> 
> This will paper over the issue, but won't fix it. dma_fences have to
> complete, at least for normal operations, otherwise your desktop will
> start feeling like the gpu hangs all the time.
> 
> I think it would be much more interesting to dump which fence isn't
> completing here in time, i.e. not just the timeout, but lots of debug
> printks.
> -Daniel

As I understand it, none of these patches can be merged because
they do not fix the root cause, they only eliminate the consequences?
Does eliminating the consequences have any negative effects? And we will never
know the root cause because we don't have enough debugging information.
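
A sketch of the extra instrumentation Daniel is asking for: instead of only
timing out, identify the fence that failed to signal. The message format is
arbitrary and the snippet assumes the amdgpu_fence_wait_empty() context from
Hillf's second patch above:

	/* Illustrative debugging aid, not a proposed fix: report which fence
	 * did not complete. dma_fence_wait_timeout() returns 0 on timeout,
	 * and every dma_fence carries a context/seqno pair plus a timeline
	 * name via its ops. */
	long r = dma_fence_wait_timeout(fence, false,
					AMDGPU_FENCE_JIFFIES_TIMEOUT);
	if (r == 0)
		DRM_ERROR("fence ctx %llu seqno %llu (%s) did not signal\n",
			  (unsigned long long)fence->context,
			  (unsigned long long)fence->seqno,
			  fence->ops->get_timeline_name(fence));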



Re: [PATCH] drm/amdgpu: Handle job is NULL use case in amdgpu_device_gpu_recover

2019-08-29 Thread Grodzovsky, Andrey

On 8/28/19 4:58 AM, Ernst Sjöstrand wrote:
> On Tue, Aug 27, 2019 at 20:17, Andrey Grodzovsky wrote:
>> This should be checked at all places job is accessed.
>>
>> Signed-off-by: Andrey Grodzovsky 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 
>>   1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 02b3e7d..190d9a3 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3790,14 +3790,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>> *adev,
>>
>>  if (hive && !mutex_trylock(&hive->reset_lock)) {
>>  DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as 
>> another already in progress",
>> -job->base.id, hive->hive_id);
>> + job ? job->base.id : -1, hive->hive_id);
>>  return 0;
>>  }
>>
>>  /* Start with adev pre asic reset first for soft reset check.*/
>>  if (!amdgpu_device_lock_adev(adev, !hive)) {
>>  DRM_INFO("Bailing on TDR for s_job:%llx, as another already 
>> in progress",
>> -job->base.id);
>> + job ? job->base.id : -1);
>>  return 0;
>>  }
>>
>> @@ -3838,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>> *adev,
>>  if (!ring || !ring->sched.thread)
>>  continue;
>>
>> -   drm_sched_stop(&ring->sched, &job->base);
>> +   drm_sched_stop(&ring->sched, job ? &job->base : NULL);
>>  }
>>  }
>>
>> @@ -3864,7 +3864,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>> *adev,
>>
>>  /* Guilty job will be freed after this*/
>>  r = amdgpu_device_pre_asic_reset(adev,
>> -job,
>> +job ? job : NULL,
> This check looks redundant.


Agree. Will remove.

Andrey


>
>>   &need_full_reset);
>>  if (r) {
>>  /*TODO Should we stop ?*/
>> --
>> 2.7.4
>>
>> ___
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> Regards
> //Ernst
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v2] drm/amdgpu: Default disable GDS for compute+gfx

2019-08-29 Thread Marek Olšák
If you decide to add it back, use this instead, it's simpler:
https://patchwork.freedesktop.org/patch/318391/?series=63775&rev=1

Maybe remove OA reservation if you don't need it.

Marek

On Thu, Aug 29, 2019 at 5:06 AM zhoucm1  wrote:

>
On 2019/8/29 3:22 PM, Christian König wrote:
>
On 29.08.19 at 07:55, zhoucm1 wrote:
>
>
On 2019/8/29 1:08 AM, Marek Olšák wrote:
>
> It can't break an older driver, because there is no older driver that
> requires the static allocation.
>
> Note that closed source drivers don't count, because they don't need
> backward compatibility.
>
Yes, I agree, we don't need to take care of the closed source stack.

But AMDVLK is indeed an open source stack, many fans are using it, we need
to keep its compatibility, don't we?
>
>
> Actually that is still under discussion.
>
> But AMDVLK should have never ever used the static GDS space in the first
> place. We only added that for a transition time for old OpenGL and it
> shouldn't have leaked into the upstream driver.
>
> Not sure what's the best approach here. We could revert "[PATCH]
> drm/amdgpu: remove static GDS, GWS and OA", but that would break KFD. So we
> can only choose between two evils here.
>
> Only alternative I can see which would work for both would be to still
> allocate the static GDS, GWS and OA space, but make it somehow dynamic so
> that the KFD can swap it out again.
>
> Agree with you.
>
> -David
>
>
> Christian.
>
> -David
>
>
> Marek
>
> On Wed, Aug 28, 2019 at 2:44 AM zhoucm1  wrote:
>
>>
>> On 2019/7/23 3:08 AM, Christian König wrote:
>> > On 22.07.19 at 17:34, Greathouse, Joseph wrote:
>> >> Units in the GDS block default to allowing all VMIDs access to all
>> >> entries. Disable shader access to the GDS, GWS, and OA blocks from all
>> >> compute and gfx VMIDs by default. For compute, HWS firmware will set
>> >> up the access bits for the appropriate VMID when a compute queue
>> >> requires access to these blocks.
>> >> The driver will handle enabling access on-demand for graphics VMIDs.
>>
>> gds_switch is depending on job->gds/gws/oa/_base/size.
>>
>> "[PATCH] drm/amdgpu: remove static GDS, GWS and OA allocation", the
>> default allocations in kernel were removed. If some UMD stacks don't
>> pass gds/gws/oa allocation to bo_list, then kernel will not enable
>> access of them, that will break previous driver.
>>
>> do we need to revert "[PATCH] drm/amdgpu: remove static GDS, GWS and OA
>> allocation" ?
>>
>> -David
>>
>> >>
>> >> Leaving VMID0 with full access because otherwise HWS cannot save or
>> >> restore values during task switch.
>> >>
>> >> v2: Fixed code and comment styling.
>> >>
>> >> Change-Id: I3d768a96935d2ed1dff09b02c995090f4fbfa539
>> >> Signed-off-by: Joseph Greathouse 
>> >
>> > Reviewed-by: Christian König 
>> >
>> >> ---
>> >>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 25 ++---
>> >>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c  | 24 +---
>> >>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 24 +---
>> >>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 24 +---
>> >>   4 files changed, 69 insertions(+), 28 deletions(-)
>> >>
>> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> >> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> >> index 73dcb632a3ce..2a9692bc34b4 100644
>> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> >> @@ -1516,17 +1516,27 @@ static void
>> >> gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>> >>   }
>> >>   nv_grbm_select(adev, 0, 0, 0, 0);
>> >>   mutex_unlock(>srbm_mutex);
>> >> +}
>> >>   -/* Initialize all compute VMIDs to have no GDS, GWS, or OA
>> >> -   acccess. These should be enabled by FW for target VMIDs. */
>> >> -for (i = FIRST_COMPUTE_VMID; i < LAST_COMPUTE_VMID; i++) {
>> >> -WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_BASE, 2 * i, 0);
>> >> -WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_SIZE, 2 * i, 0);
>> >> -WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
>> >> -WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
>> >> +static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
>> >> +{
>> >> +int vmid;
>> >> +
>> >> +/*
>> >> + * Initialize all compute and user-gfx VMIDs to have no GDS,
>> >> GWS, or OA
>> >> + * access. Compute VMIDs should be enabled by FW for target VMIDs,
>> >> + * the driver can enable them for graphics. VMID0 should maintain
>> >> + * access so that HWS firmware can save/restore entries.
>> >> + */
>> >> +for (vmid = 1; vmid < 16; vmid++) {
>> >> +WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_BASE, 2 * vmid, 0);
>> >> +WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_SIZE, 2 * vmid, 0);
>> >> +WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, vmid, 0);
>> >> +WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, vmid, 0);
>> >>   }
>> >>   }
>> >>   +
>> >>   static void gfx_v10_0_tcp_harvest(struct amdgpu_device 

Re: [PATCH 00/23] Add DC support for Renoir

2019-08-29 Thread Harry Wentland
Patches are
Acked-by: Harry Wentland 

Harry

On 2019-08-28 3:56 p.m., Alex Deucher wrote:
> This patch set adds initial DC display support for
> Renoir.  Renoir is a new APU.
> 
> I have omitted the register patch due to size.  The
> full tree is available here:
> https://cgit.freedesktop.org/~agd5f/linux/log/?h=amd-staging-drm-next-renoir-dc
> 
> 
> Bhawanpreet Lakha (20):
>   drm/amd/display: Add Renoir registers (v3)
>   drm/amd/display: Add Renoir clock registers list
>   drm/amd/display: Add Renoir hw_seq register list
>   drm/amd/display: Add pp_smu functions for Renoir
>   drm/amd/display: Add Renoir irq_services
>   drm/amd/display: Add hubp block for Renoir (v2)
>   drm/amd/display: Add Renoir hubbub registers list
>   drm/amd/display: Add Renoir Hubbub (v2)
>   drm/amd/display: Add Renoir clock manager
>   drm/amd/display: Add Renoir resource (v2)
>   drm/amd/display: Add Renoir GPIO
>   drm/amd/display: Add Renoir DML
>   drm/amd/display: Fix register names
>   drm/amd/display: Handle Renoir in DC
>   drm/amd/display: Handle Renoir in amdgpu_dm (v2)
>   drm/amd/display: call update_bw_bounding_box
>   drm/amd/display: add dal_asic_id for renoir
>   drm/amd/display: add dcn21 core DC changes
>   drm/amd/display: build dcn21 blocks
>   drm/amd/display: add Renoir to kconfig
> 
> Roman Li (3):
>   drm/amd/display: Correct order of RV family clk managers for Renoir
>   drm/amd/display: Add DCN2.1 changes to DML
>   drm/amdgpu: Enable DC on Renoir
> 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 3 +
>  drivers/gpu/drm/amd/amdgpu/soc15.c| 6 +
>  drivers/gpu/drm/amd/display/Kconfig   | 8 +
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |14 +
>  drivers/gpu/drm/amd/display/dc/Makefile   | 3 +
>  .../display/dc/bios/command_table_helper2.c   | 5 +
>  .../gpu/drm/amd/display/dc/clk_mgr/Makefile   |10 +
>  .../gpu/drm/amd/display/dc/clk_mgr/clk_mgr.c  | 9 +
>  .../amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.c |   590 +
>  .../amd/display/dc/clk_mgr/dcn21/rn_clk_mgr.h |39 +
>  .../dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.c   |   200 +
>  .../dc/clk_mgr/dcn21/rn_clk_mgr_vbios_smu.h   |40 +
>  drivers/gpu/drm/amd/display/dc/core/dc.c  | 5 +
>  .../gpu/drm/amd/display/dc/core/dc_resource.c |12 +
>  drivers/gpu/drm/amd/display/dc/dc.h   | 3 +
>  .../drm/amd/display/dc/dce/dce_clock_source.h |17 +
>  .../gpu/drm/amd/display/dc/dce/dce_hwseq.h|97 +
>  .../drm/amd/display/dc/dcn10/dcn10_hubbub.h   |73 +
>  .../drm/amd/display/dc/dcn20/dcn20_hubbub.h   |10 +
>  .../gpu/drm/amd/display/dc/dcn20/dcn20_hubp.h |35 +
>  drivers/gpu/drm/amd/display/dc/dcn21/Makefile |10 +
>  .../drm/amd/display/dc/dcn21/dcn21_hubbub.c   |   595 +
>  .../drm/amd/display/dc/dcn21/dcn21_hubbub.h   |   132 +
>  .../gpu/drm/amd/display/dc/dcn21/dcn21_hubp.c |   244 +
>  .../gpu/drm/amd/display/dc/dcn21/dcn21_hubp.h |   133 +
>  .../drm/amd/display/dc/dcn21/dcn21_resource.c |  1680 +
>  .../drm/amd/display/dc/dcn21/dcn21_resource.h |45 +
>  drivers/gpu/drm/amd/display/dc/dm_pp_smu.h|47 +
>  drivers/gpu/drm/amd/display/dc/dml/Makefile   | 8 +
>  .../dc/dml/dcn21/display_mode_vba_21.c|  6123 ++
>  .../dc/dml/dcn21/display_mode_vba_21.h|32 +
>  .../dc/dml/dcn21/display_rq_dlg_calc_21.c |  1823 +
>  .../dc/dml/dcn21/display_rq_dlg_calc_21.h |73 +
>  .../drm/amd/display/dc/dml/display_mode_lib.c |19 +
>  .../drm/amd/display/dc/dml/display_mode_lib.h | 3 +
>  drivers/gpu/drm/amd/display/dc/gpio/Makefile  | 7 +
>  .../display/dc/gpio/dcn21/hw_factory_dcn21.c  |   210 +
>  .../display/dc/gpio/dcn21/hw_factory_dcn21.h  |33 +
>  .../dc/gpio/dcn21/hw_translate_dcn21.c|   386 +
>  .../dc/gpio/dcn21/hw_translate_dcn21.h|35 +
>  .../gpu/drm/amd/display/dc/gpio/hw_factory.c  | 8 +
>  .../drm/amd/display/dc/gpio/hw_translate.c| 8 +
>  .../gpu/drm/amd/display/dc/inc/core_types.h   | 8 +
>  .../gpu/drm/amd/display/dc/inc/hw/clk_mgr.h   |   125 +
>  .../gpu/drm/amd/display/dc/inc/hw/mem_input.h | 4 +
>  .../gpu/drm/amd/display/dc/inc/hw_sequencer.h | 1 +
>  drivers/gpu/drm/amd/display/dc/irq/Makefile   |10 +
>  .../display/dc/irq/dcn21/irq_service_dcn21.c  |   372 +
>  .../display/dc/irq/dcn21/irq_service_dcn21.h  |34 +
>  .../gpu/drm/amd/display/include/dal_asic_id.h | 5 +
>  .../gpu/drm/amd/display/include/dal_types.h   | 3 +
>  .../include/asic_reg/clk/clk_10_0_2_offset.h  |56 +
>  .../include/asic_reg/clk/clk_10_0_2_sh_mask.h |73 +
>  .../include/asic_reg/dcn/dcn_2_1_0_offset.h   | 13862 
>  .../include/asic_reg/dcn/dcn_2_1_0_sh_mask.h  | 56638 
>  .../include/asic_reg/dcn/dpcs_2_1_0_offset.h  |   565 +
>  .../include/asic_reg/dcn/dpcs_2_1_0_sh_mask.h |  3430 +
>  .../gpu/drm/amd/include/renoir_ip_offset.h|  1364 +
>  58 files changed, 89383 insertions(+)
>  create mode 100644 

Re: Couldn't read Speaker Allocation Data Block/SADs

2019-08-29 Thread Alex Deucher
On Thu, Aug 29, 2019 at 9:11 AM Jean Delvare  wrote:
>
> Hi all,
>
> Since I connected my Dell display to my Radeon R5 240 (Oland) card over
> DisplayPort instead of VGA, I get the following error messages logged at 
> every boot:
>
> [drm:dce_v6_0_encoder_mode_set [amdgpu]] *ERROR* Couldn't read Speaker 
> Allocation Data Block: -2
> [drm:dce_v6_0_encoder_mode_set [amdgpu]] *ERROR* Couldn't read SADs: -2
>
> I also see them each time the display wakes up and also on VT change.
> This is with kernel 5.2.9.
>
> This was also reported as bug #107825 by Paul Menzel:
> https://bugs.freedesktop.org/show_bug.cgi?id=107825
>
> Error -2 is ENOENT (No such file or directory). The driver queries the
> display for audio-related information, while my display does not have
> speakers nor headset connector.
>
> I suspect that the "error" is pretty much expected in this case and the
> driver is being too verbose about it. Either the calling code should
> consider -ENOENT as a non-error (11 calling sites to fix), or the
> helper functions should simply return 0 when no audio-related data is
> available from the display (2 functions to fix; calling sites may have
> to be inspected as well, since some also treat 0 as an error, which seems
> incorrect to me).
>
> Option 1 seems cleaner to me, but I don't know if there could be
> legitimate reasons to distinguish between no audio information block
> from display and empty audio information from display in the future.
>
> What do you think?

Feel free to remove the message or make it debug only.
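
A minimal sketch of the debug-only option, assuming the call site follows the
usual pattern around drm_edid_to_speaker_allocation() (the variable handling
here is illustrative, not the exact dce_v6_0 code):

    sad_count = drm_edid_to_speaker_allocation(amdgpu_connector_edid(connector), &sadb);
    if (sad_count < 0) {
            if (sad_count == -ENOENT)
                    /* no speakers is a valid configuration, not an error */
                    DRM_DEBUG("No Speaker Allocation Data Block found\n");
            else
                    DRM_ERROR("Couldn't read Speaker Allocation Data Block: %d\n",
                              sad_count);
            sad_count = 0;
    }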

Thanks!

Alex

>
> Thanks,
> --
> Jean Delvare
> SUSE L3 Support
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Kuehling, Felix
On 2019-08-29 1:21 p.m., Grodzovsky, Andrey wrote:
> On 8/29/19 12:18 PM, Kuehling, Felix wrote:
>> On 2019-08-29 10:08 a.m., Grodzovsky, Andrey wrote:
>>> Agree, the placement of amdgpu_amdkfd_pre/post_reset in
>>> amdgpu_device_lock/unlock_adev is a bit weird.
>>>
>> amdgpu_device_reset_sriov already calls amdgpu_amdkfd_pre/post_reset
>> itself while it has exclusive access to the GPU.
> So in that case amdgpu_amdkfd_pre/post_reset gets called twice - once
> from amdgpu_device_lock/unlock_adev and a second time from
> amdgpu_device_reset_sriov, no? Why is that?

No, it's not called twice, because the bare-metal case guards those calls
with if (!amdgpu_sriov_vf(adev)) conditions. If you don't move the
amdgpu_amdkfd_pre/post_reset calls into a bare-metal-specific code path
(such as amdgpu_do_asic_reset), you'll need to keep those conditions.
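
For reference, the guard in question looks roughly like this in the
bare-metal recovery path (simplified sketch; exact placement in
amdgpu_device_gpu_recover may differ):

    /* KFD is quiesced here only on bare metal; the SR-IOV reset path
     * calls amdgpu_amdkfd_pre/post_reset itself while it holds
     * exclusive access to the GPU.
     */
    if (!amdgpu_sriov_vf(adev))
            amdgpu_amdkfd_pre_reset(adev);

    /* ... ASIC reset ... */

    if (!amdgpu_sriov_vf(adev))
            amdgpu_amdkfd_post_reset(adev);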


>
>
>> It would make sense to
>> move the same calls into amdgpu_do_asic_reset for the bare-metal case.
>
> Problem is I am skipping amdgpu_do_asic_reset entirely in this case, as
> there is no HW reset here, so I will just extract it from
> amdgpu_device_lock/unlock_adev.

OK.

Regards,
   Felix


>
> Andrey
>
>
>> Regards,
>>  Felix
>>
>>
>>> Andrey
>>>
>>> On 8/29/19 10:06 AM, Koenig, Christian wrote:
> Felix advised that the way to stop all KFD activity is simply to NOT
> call amdgpu_amdkfd_post_reset so that why I added this. Do you mean you
> prefer amdgpu_amdkfd_post_reset to be outside of 
> amdgpu_device_unlock_adev ?
 Yes, exactly. It doesn't seems to be related to the unlock operation in
 the first place, but rather only signals the KFD that the reset is
 completed.

 Christian.

>>> ___
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH AUTOSEL 5.2 65/76] drm/amdgpu: prevent memory leaks in AMDGPU_CS ioctl

2019-08-29 Thread Sasha Levin
From: Nicolai Hähnle 

[ Upstream commit 1a701ea924815b0518733aa8d5d05c1f6fa87062 ]

Error out if the AMDGPU_CS ioctl is called with multiple SYNCOBJ_OUT and/or
TIMELINE_SIGNAL chunks, since otherwise the last chunk wins while the
allocated array as well as the reference counts of sync objects are leaked.

Signed-off-by: Nicolai Hähnle 
Reviewed-by: Christian König 
Signed-off-by: Alex Deucher 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index fe028561dc0e6..bc40d6eabce7d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1192,6 +1192,9 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p,
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_sem);
 
+   if (p->post_deps)
+   return -EINVAL;
+
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
 GFP_KERNEL);
p->num_post_deps = 0;
@@ -1215,8 +1218,7 @@ static int amdgpu_cs_process_syncobj_out_dep(struct amdgpu_cs_parser *p,
 
 
 static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p,
-						      struct amdgpu_cs_chunk
-						      *chunk)
+						      struct amdgpu_cs_chunk *chunk)
 {
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps;
unsigned num_deps;
@@ -1226,6 +1228,9 @@ static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p
num_deps = chunk->length_dw * 4 /
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
 
+   if (p->post_deps)
+   return -EINVAL;
+
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
 GFP_KERNEL);
p->num_post_deps = 0;
-- 
2.20.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Grodzovsky, Andrey

On 8/29/19 12:18 PM, Kuehling, Felix wrote:
> On 2019-08-29 10:08 a.m., Grodzovsky, Andrey wrote:
>> Agree, the placement of amdgpu_amdkfd_pre/post_reset in
>> amdgpu_device_lock/unlock_adev is a bit weird.
>>
> amdgpu_device_reset_sriov already calls amdgpu_amdkfd_pre/post_reset
> itself while it has exclusive access to the GPU.

So in that case amdgpu_amdkfd_pre/post_reset gets called twice - once
from amdgpu_device_lock/unlock_adev and a second time from
amdgpu_device_reset_sriov, no? Why is that?


> It would make sense to
> move the same calls into amdgpu_do_asic_reset for the bare-metal case.


Problem is I am skipping amdgpu_do_asic_reset entirely in this case, as
there is no HW reset here, so I will just extract it from
amdgpu_device_lock/unlock_adev.

Andrey


>
> Regards,
>     Felix
>
>
>> Andrey
>>
>> On 8/29/19 10:06 AM, Koenig, Christian wrote:
 Felix advised that the way to stop all KFD activity is simply to NOT
 call amdgpu_amdkfd_post_reset so that why I added this. Do you mean you
 prefer amdgpu_amdkfd_post_reset to be outside of amdgpu_device_unlock_adev 
 ?
>>> Yes, exactly. It doesn't seems to be related to the unlock operation in
>>> the first place, but rather only signals the KFD that the reset is
>>> completed.
>>>
>>> Christian.
>>>
>> ___
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 20/20] drm/amd/display: Add hdcp to Kconfig

2019-08-29 Thread Bhawanpreet Lakha
[Why]
HDCP is not fully finished, so we need to be able to
build and run the driver without it.

[How]
Add a Kconfig to toggle it
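
For reference, enabling it in a kernel config fragment would look like
(illustrative):

    CONFIG_DRM_AMD_DC=y
    CONFIG_DRM_AMD_DC_HDCP=y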

Signed-off-by: Bhawanpreet Lakha 
---
 drivers/gpu/drm/amd/display/Kconfig | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/Kconfig 
b/drivers/gpu/drm/amd/display/Kconfig
index 48c7423e92da..3e39c0c2ee7e 100644
--- a/drivers/gpu/drm/amd/display/Kconfig
+++ b/drivers/gpu/drm/amd/display/Kconfig
@@ -35,6 +35,14 @@ config DRM_AMD_DC_DSC_SUPPORT
Choose this option if you want to have
Dynamic Stream Compression support
 
+config DRM_AMD_DC_HDCP
+	bool "Enable HDCP support in DC"
+	depends on DRM_AMD_DC
+	help
+	 Choose this option if you want to support
+	 HDCP authentication
+
 config DEBUG_KERNEL_DC
bool "Enable kgdb break in DC"
depends on DRM_AMD_DC
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 18/20] drm/amd/display: Update CP property based on HW query

2019-08-29 Thread Bhawanpreet Lakha
[Why]
We need to use the HW state to set content protection to ENABLED.
This way we know that the link is encrypted from the HW side.

[How]
Create a workqueue that queries the HW every ~2 seconds, and sets the
property to ENABLED or DESIRED based on the result from the hardware.
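
A minimal sketch of that polling loop, using the names this patch introduces
(simplified; the HW query itself is elided):

    static void event_property_validate(struct work_struct *work)
    {
            struct hdcp_workqueue *hdcp_work =
                    container_of(to_delayed_work(work), struct hdcp_workqueue,
                                 property_validate_dwork);

            /* ... query the HW/PSP encryption status here and schedule
             * property_update_work when it changed ...
             */

            /* re-arm so the HW is sampled every DRM_HDCP_CHECK_PERIOD_MS */
            schedule_delayed_work(&hdcp_work->property_validate_dwork,
                                  msecs_to_jiffies(DRM_HDCP_CHECK_PERIOD_MS));
    }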

Change-Id: Ide8dbbb5877c83c4aac576bb4bd3e0b9cbd9f63e
Signed-off-by: Bhawanpreet Lakha 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 16 +
 .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.c| 65 ++-
 .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.h|  7 +-
 3 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 8cb48cf257a6..e3f547490b0e 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -5370,19 +5370,9 @@ static void update_content_protection(struct 
drm_connector_state *state, const s
 {
struct amdgpu_dm_connector *aconnector = 
to_amdgpu_dm_connector(connector);
 
-   if (state->content_protection == DRM_MODE_CONTENT_PROTECTION_DESIRED) {
-   hdcp_add_display(hdcp_w, aconnector->dc_link->link_index);
-
-   /*
-* TODO: ENABLED should be verified using psp, it is planned 
later.
-* Just set this to ENABLED for now
-*/
-   state->content_protection = DRM_MODE_CONTENT_PROTECTION_ENABLED;
-
-   return;
-   }
-
-   if (state->content_protection == DRM_MODE_CONTENT_PROTECTION_UNDESIRED)
+	if (state->content_protection == DRM_MODE_CONTENT_PROTECTION_DESIRED)
+		hdcp_add_display(hdcp_w, aconnector->dc_link->link_index, aconnector);
+	else if (state->content_protection == DRM_MODE_CONTENT_PROTECTION_UNDESIRED)
		hdcp_remove_display(hdcp_w, aconnector->dc_link->link_index, aconnector->base.index);
 
 }
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
index 9d11d7695508..2443c238c188 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
@@ -27,6 +27,7 @@
 #include "amdgpu.h"
 #include "amdgpu_dm.h"
 #include "dm_helpers.h"
+#include <drm/drm_hdcp.h>
 
 bool lp_write_i2c(void *handle, uint32_t address, const uint8_t *data, 
uint32_t size)
 {
@@ -82,16 +83,19 @@ static void process_output(struct hdcp_workqueue *hdcp_work)
 
 }
 
-void hdcp_add_display(struct hdcp_workqueue *hdcp_work, unsigned int link_index)
+void hdcp_add_display(struct hdcp_workqueue *hdcp_work, unsigned int link_index, struct amdgpu_dm_connector *aconnector)
 {
	struct hdcp_workqueue *hdcp_w = &hdcp_work[link_index];
	struct mod_hdcp_display *display = &hdcp_work[link_index].display;
	struct mod_hdcp_link *link = &hdcp_work[link_index].link;
 
	mutex_lock(&hdcp_w->mutex);
+	hdcp_w->aconnector = aconnector;
 
	mod_hdcp_add_display(&hdcp_w->hdcp, link, display, &hdcp_w->output);
 
+	schedule_delayed_work(&hdcp_w->property_validate_dwork, msecs_to_jiffies(DRM_HDCP_CHECK_PERIOD_MS));
+
	process_output(hdcp_w);
 
	mutex_unlock(&hdcp_w->mutex);
@@ -106,6 +110,9 @@ void hdcp_remove_display(struct hdcp_workqueue *hdcp_work, unsigned int link_ind
 
	mod_hdcp_remove_display(&hdcp_w->hdcp, display_index, &hdcp_w->output);
 
+	cancel_delayed_work(&hdcp_w->property_validate_dwork);
+	hdcp_w->encryption_status = MOD_HDCP_ENCRYPTION_STATUS_HDCP_OFF;
+
	process_output(hdcp_w);
 
	mutex_unlock(&hdcp_w->mutex);
@@ -120,6 +127,9 @@ void hdcp_reset_display(struct hdcp_workqueue *hdcp_work, unsigned int link_inde
 
	mod_hdcp_reset_connection(&hdcp_w->hdcp, &hdcp_w->output);
 
+	cancel_delayed_work(&hdcp_w->property_validate_dwork);
+	hdcp_w->encryption_status = MOD_HDCP_ENCRYPTION_STATUS_HDCP_OFF;
+
	process_output(hdcp_w);
 
	mutex_unlock(&hdcp_w->mutex);
@@ -155,7 +165,58 @@ static void event_callback(struct work_struct *work)
 
 
 }
+static void event_property_update(struct work_struct *work)
+{
+
+	struct hdcp_workqueue *hdcp_work = container_of(work, struct hdcp_workqueue, property_update_work);
+	struct amdgpu_dm_connector *aconnector = hdcp_work->aconnector;
+	struct drm_device *dev = hdcp_work->aconnector->base.dev;
+	long ret;
+
+	drm_modeset_lock(&dev->mode_config.connection_mutex, NULL);
+	mutex_lock(&hdcp_work->mutex);
+
+
+	if (aconnector->base.state->commit) {
+		ret = wait_for_completion_interruptible_timeout(&aconnector->base.state->commit->hw_done, 10 * HZ);
+
+		if (ret == 0) {
+			DRM_ERROR("HDCP state unknown! Setting it to DESIRED");
+			hdcp_work->encryption_status = MOD_HDCP_ENCRYPTION_STATUS_HDCP_OFF;
+		}
+	}
+
+	if (hdcp_work->encryption_status == MOD_HDCP_ENCRYPTION_STATUS_HDCP1_ON)
+   

[PATCH 12/20] drm/amd/display: Update hdcp display config

2019-08-29 Thread Bhawanpreet Lakha
[Why]
We need to update the hdcp display parameter whenever the link is
updated, so the next time there is an update to hdcp we have the
latest display info

[How]
Create a callback, and use this anytime there is a change in the link. This will
be used later by the dm.
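
As a rough sketch of the DM-side wiring this enables (shapes taken from later
patches in this series; simplified):

    struct dc_callback_init init_params = {};

    /* hdcp_create_workqueue() fills in the cp_psp handle and callbacks,
     * then DC gets them so any link change reaches the HDCP module.
     */
    adev->dm.hdcp_workqueue =
            hdcp_create_workqueue(&adev->psp, &init_params.cp_psp, adev->dm.dc);
    dc_init_callbacks(adev->dm.dc, &init_params);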

Signed-off-by: Bhawanpreet Lakha 
---
 drivers/gpu/drm/amd/display/dc/core/dc.c  | 10 
 drivers/gpu/drm/amd/display/dc/core/dc_link.c | 31 
 drivers/gpu/drm/amd/display/dc/dc.h   |  5 ++
 drivers/gpu/drm/amd/display/dc/dc_types.h |  7 +++
 drivers/gpu/drm/amd/display/dc/dm_cp_psp.h| 49 +++
 .../gpu/drm/amd/display/dc/inc/core_types.h   |  4 +-
 6 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/display/dc/dm_cp_psp.h

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c 
b/drivers/gpu/drm/amd/display/dc/core/dc.c
index e46aaff55fb9..7e0b42476496 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -815,6 +815,16 @@ struct dc *dc_create(const struct dc_init_data 
*init_params)
 void dc_init_callbacks(struct dc *dc,
const struct dc_callback_init *init_params)
 {
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   dc->ctx->cp_psp = init_params->cp_psp;
+#endif
+}
+
+void dc_deinit_callbacks(struct dc *dc)
+{
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+	memset(&dc->ctx->cp_psp, 0, sizeof(dc->ctx->cp_psp));
+#endif
 }
 
 void dc_destroy(struct dc **dc)
diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c 
b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
index a13b497ae49c..1c86d1702bd8 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
@@ -2665,6 +2665,24 @@ static enum dc_status deallocate_mst_payload(struct 
pipe_ctx *pipe_ctx)
 
return DC_OK;
 }
+#if defined(CONFIG_DRM_AMD_DC_HDCP)
+static void update_psp_stream_config(struct pipe_ctx *pipe_ctx, bool dpms_off)
+{
+	struct cp_psp *cp_psp = &pipe_ctx->stream->ctx->cp_psp;
+	if (cp_psp && cp_psp->funcs.update_stream_config) {
+		struct cp_psp_stream_config config;
+
+		memset(&config, 0, sizeof(config));
+
+		config.otg_inst = (uint8_t) pipe_ctx->stream_res.tg->inst;
+		config.stream_enc_inst = (uint8_t) pipe_ctx->stream_res.stream_enc->id;
+		config.link_enc_inst = pipe_ctx->stream->link->link_enc_hw_inst;
+		config.dpms_off = dpms_off;
+		config.dm_stream_ctx = pipe_ctx->stream->dm_stream_context;
+		cp_psp->funcs.update_stream_config(cp_psp->handle, &config);
+	}
+}
+#endif
 
 void core_link_enable_stream(
struct dc_state *state,
@@ -2725,6 +2743,9 @@ void core_link_enable_stream(
/* Do not touch link on seamless boot optimization. */
if (pipe_ctx->stream->apply_seamless_boot_optimization) {
pipe_ctx->stream->dpms_off = false;
+#if defined(CONFIG_DRM_AMD_DC_HDCP)
+   update_psp_stream_config(pipe_ctx, false);
+#endif
return;
}
 
@@ -2732,6 +2753,9 @@ void core_link_enable_stream(
if (pipe_ctx->stream->signal == SIGNAL_TYPE_EDP &&
apply_edp_fast_boot_optimization) {
pipe_ctx->stream->dpms_off = false;
+#if defined(CONFIG_DRM_AMD_DC_HDCP)
+   update_psp_stream_config(pipe_ctx, false);
+#endif
return;
}
 
@@ -2791,6 +2815,9 @@ void core_link_enable_stream(
 
if (dc_is_dp_signal(pipe_ctx->stream->signal))
enable_stream_features(pipe_ctx);
+#if defined(CONFIG_DRM_AMD_DC_HDCP)
+   update_psp_stream_config(pipe_ctx, false);
+#endif
}
 #ifdef CONFIG_DRM_AMD_DC_DSC_SUPPORT
else { // if (IS_FPGA_MAXIMUS_DC(core_dc->ctx->dce_environment))
@@ -2808,6 +2835,10 @@ void core_link_disable_stream(struct pipe_ctx *pipe_ctx)
struct dc_stream_state *stream = pipe_ctx->stream;
struct dc_link *link = stream->sink->link;
 
+#if defined(CONFIG_DRM_AMD_DC_HDCP)
+   update_psp_stream_config(pipe_ctx, true);
+#endif
+
core_dc->hwss.blank_stream(pipe_ctx);
 
if (pipe_ctx->stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST)
diff --git a/drivers/gpu/drm/amd/display/dc/dc.h 
b/drivers/gpu/drm/amd/display/dc/dc.h
index 42b6a6e41c0b..1d364d8819d2 100644
--- a/drivers/gpu/drm/amd/display/dc/dc.h
+++ b/drivers/gpu/drm/amd/display/dc/dc.h
@@ -550,7 +550,11 @@ struct dc_init_data {
 };
 
 struct dc_callback_init {
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   struct cp_psp cp_psp;
+#else
uint8_t reserved;
+#endif
 };
 
 struct dc *dc_create(const struct dc_init_data *init_params);
@@ -562,6 +566,7 @@ int dc_setup_system_context(struct dc *dc, struct 
dc_phy_addr_space_config *pa_c
 #endif
 void dc_init_callbacks(struct dc *dc,
const struct 

[PATCH 16/20] drm/amd/display: Handle Content protection property changes

2019-08-29 Thread Bhawanpreet Lakha
[Why]
We need to manage the content protection property changes for
different use cases; once CP is DESIRED we need to maintain the
ENABLED/DESIRED status for the different cases.

[How]
1. Attach the content_protection property

2. HDCP enable (UNDESIRED -> DESIRED)
call into the module with the correct parameters to start
hdcp. Set cp to ENABLED

3. HDCP disable (ENABLED -> UNDESIRED)
Call the module to disable hdcp.

4. Handle special cases (hotplug, S3, headless S3, DPMS)
If already ENABLED: set to DESIRED on unplug/suspend/dpms,
and disable hdcp

Then on plugin/resume/dpms: enable HDCP (see the sketch below)
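
A rough sketch of how the atomic commit path is expected to consume these
rules (caller shape and the new_con_state/old_con_state names are assumed;
the two helpers are the ones added in this patch):

    /* only kick the HDCP module when the property state really changed */
    if (is_content_protection_different(new_con_state, old_con_state,
                                        connector, adev->dm.hdcp_workqueue))
            update_content_protection(new_con_state, connector,
                                      adev->dm.hdcp_workqueue);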

Signed-off-by: Bhawanpreet Lakha 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 96 +++
 1 file changed, 96 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index bac9cf5be473..b9281e77752f 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #if defined(CONFIG_DRM_AMD_DC_DCN1_0)
 #include "ivsrcid/dcn/irqsrcs_dcn_1_0.h"
@@ -1465,6 +1466,11 @@ amdgpu_dm_update_connector_after_detect(struct 
amdgpu_dm_connector *aconnector)
dc_sink_release(aconnector->dc_sink);
aconnector->dc_sink = NULL;
aconnector->edid = NULL;
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+	/* Set CP to DESIRED if it was ENABLED, so we can re-enable it again on hotplug */
+	if (connector->state->content_protection == DRM_MODE_CONTENT_PROTECTION_ENABLED)
+		connector->state->content_protection = DRM_MODE_CONTENT_PROTECTION_DESIRED;
+#endif
}
 
mutex_unlock(>mode_config.mutex);
@@ -1479,6 +1485,9 @@ static void handle_hpd_irq(void *param)
struct drm_connector *connector = >base;
struct drm_device *dev = connector->dev;
enum dc_connection_type new_connection_type = dc_connection_none;
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   struct amdgpu_device *adev = dev->dev_private;
+#endif
 
/*
 * In case of failure or MST no need to update connector status or 
notify the OS
@@ -1486,6 +1495,9 @@ static void handle_hpd_irq(void *param)
 */
mutex_lock(>hpd_lock);
 
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+	hdcp_reset_display(adev->dm.hdcp_workqueue, aconnector->dc_link->link_index);
+#endif
if (aconnector->fake_enable)
aconnector->fake_enable = false;
 
@@ -5058,6 +5070,9 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm,
				adev->mode_info.freesync_property, 0);
	drm_object_attach_property(&aconnector->base.base,
				adev->mode_info.freesync_capable_property, 0);
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+	drm_connector_attach_content_protection_property(&aconnector->base, false);
+#endif
}
 }
 
@@ -5300,6 +5315,63 @@ is_scaling_state_different(const struct 
dm_connector_state *dm_state,
return false;
 }
 
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+static bool is_content_protection_different(struct drm_connector_state *state,
+					    const struct drm_connector_state *old_state,
+					    const struct drm_connector *connector, struct hdcp_workqueue *hdcp_w)
+{
+	struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector);
+
+	/* CP is being re enabled, ignore this */
+	if (old_state->content_protection == DRM_MODE_CONTENT_PROTECTION_ENABLED &&
+	    state->content_protection == DRM_MODE_CONTENT_PROTECTION_DESIRED) {
+		state->content_protection = DRM_MODE_CONTENT_PROTECTION_ENABLED;
+		return false;
+	}
+
+	/* S3 resume case, since old state will always be 0 (UNDESIRED) and the restored state will be ENABLED */
+	if (old_state->content_protection == DRM_MODE_CONTENT_PROTECTION_UNDESIRED &&
+	    state->content_protection == DRM_MODE_CONTENT_PROTECTION_ENABLED)
+		state->content_protection = DRM_MODE_CONTENT_PROTECTION_DESIRED;
+
+	/* Check if something is connected/enabled, otherwise we start hdcp but nothing is connected/enabled
+	 * hot-plug, headless s3, dpms
+	 */
+	if (state->content_protection == DRM_MODE_CONTENT_PROTECTION_DESIRED && connector->dpms == DRM_MODE_DPMS_ON &&
+	    aconnector->dc_sink != NULL)
+		return true;
+
+	if (old_state->content_protection == state->content_protection)
+		return false;
+
+	if (state->content_protection == DRM_MODE_CONTENT_PROTECTION_UNDESIRED)
+		return true;
+
+	return false;
+}
+
+static void update_content_protection(struct drm_connector_state *state, const struct drm_connector *connector,
+

[PATCH 17/20] drm/amd/display: handle DP cpirq

2019-08-29 Thread Bhawanpreet Lakha
[Why]
This is needed for DP as DP can send us info using irq.

[How]
Check if irq bit is set on short pulse and call the
function that handles cpirq in amdgpu_dm_hdcp

Signed-off-by: Bhawanpreet Lakha 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index b9281e77752f..8cb48cf257a6 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1616,6 +1616,12 @@ static void handle_hpd_rx_irq(void *param)
struct dc_link *dc_link = aconnector->dc_link;
bool is_mst_root_connector = aconnector->mst_mgr.mst_state;
enum dc_connection_type new_connection_type = dc_connection_none;
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   union hpd_irq_data hpd_irq_data;
+   struct amdgpu_device *adev = dev->dev_private;
+
+	memset(&hpd_irq_data, 0, sizeof(hpd_irq_data));
+#endif
 
/*
 * TODO:Temporary add mutex to protect hpd interrupt not have a gpio
@@ -1625,7 +1631,12 @@ static void handle_hpd_rx_irq(void *param)
if (dc_link->type != dc_connection_mst_branch)
mutex_lock(>hpd_lock);
 
+
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+	if (dc_link_handle_hpd_rx_irq(dc_link, &hpd_irq_data, NULL) &&
+#else
if (dc_link_handle_hpd_rx_irq(dc_link, NULL, NULL) &&
+#endif
!is_mst_root_connector) {
/* Downstream Port status changed. */
		if (!dc_link_detect_sink(dc_link, &new_connection_type))
@@ -1660,6 +1671,10 @@ static void handle_hpd_rx_irq(void *param)
drm_kms_helper_hotplug_event(dev);
}
}
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+	if (hpd_irq_data.bytes.device_service_irq.bits.CP_IRQ)
+		hdcp_handle_cpirq(adev->dm.hdcp_workqueue, aconnector->base.index);
+#endif
if ((dc_link->cur_link_settings.lane_count != LANE_COUNT_UNKNOWN) ||
(dc_link->type == dc_connection_mst_branch))
dm_handle_hpd_rx_irq(aconnector);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 02/20] drm: generic fn converting be24 to cpu and vice versa

2019-08-29 Thread Bhawanpreet Lakha
From: Ramalingam C 

Existing functions for converting a 3bytes(be24) of big endian value
into u32 of little endian and vice versa are renamed as

s/drm_hdcp2_seq_num_to_u32/drm_hdcp_be24_to_cpu
s/drm_hdcp2_u32_to_seq_num/drm_hdcp_cpu_to_be24
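
A quick worked example of the renamed helpers (value illustrative):

    u8 seq_num[HDCP_2_2_SEQ_NUM_LEN];

    /* 0x012345 packs to the big endian bytes {0x01, 0x23, 0x45} */
    drm_hdcp_cpu_to_be24(seq_num, 0x012345);
    WARN_ON(drm_hdcp_be24_to_cpu(seq_num) != 0x012345);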

Signed-off-by: Ramalingam C 
Suggested-by: Daniel Vetter 
cc: Tomas Winkler 
Acked-by: Dave Airlie 
Signed-off-by: Daniel Vetter 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20190507162745.25600-4-ramalinga...@intel.com
---
 drivers/gpu/drm/i915/intel_hdcp.c | 5 +++--
 drivers/misc/mei/hdcp/mei_hdcp.c  | 2 +-
 include/drm/drm_hdcp.h| 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_hdcp.c 
b/drivers/gpu/drm/i915/intel_hdcp.c
index 99b007169c49..536cddc74d22 100644
--- a/drivers/gpu/drm/i915/intel_hdcp.c
+++ b/drivers/gpu/drm/i915/intel_hdcp.c
@@ -1306,7 +1306,7 @@ int hdcp2_propagate_stream_management_info(struct 
intel_connector *connector)
 
/* Prepare RepeaterAuth_Stream_Manage msg */
msgs.stream_manage.msg_id = HDCP_2_2_REP_STREAM_MANAGE;
-   drm_hdcp2_u32_to_seq_num(msgs.stream_manage.seq_num_m, hdcp->seq_num_m);
+   drm_hdcp_cpu_to_be24(msgs.stream_manage.seq_num_m, hdcp->seq_num_m);
 
/* K no of streams is fixed as 1. Stored as big-endian. */
msgs.stream_manage.k = cpu_to_be16(1);
@@ -1371,7 +1371,8 @@ int hdcp2_authenticate_repeater_topology(struct 
intel_connector *connector)
}
 
/* Converting and Storing the seq_num_v to local variable as DWORD */
-   seq_num_v = drm_hdcp2_seq_num_to_u32(msgs.recvid_list.seq_num_v);
+   seq_num_v =
+   drm_hdcp_be24_to_cpu((const u8 *)msgs.recvid_list.seq_num_v);
 
if (seq_num_v < hdcp->seq_num_v) {
/* Roll over of the seq_num_v from repeater. Reauthenticate. */
diff --git a/drivers/misc/mei/hdcp/mei_hdcp.c b/drivers/misc/mei/hdcp/mei_hdcp.c
index b07000202d4a..417865129407 100644
--- a/drivers/misc/mei/hdcp/mei_hdcp.c
+++ b/drivers/misc/mei/hdcp/mei_hdcp.c
@@ -576,7 +576,7 @@ static int mei_hdcp_verify_mprime(struct device *dev,
 
memcpy(verify_mprime_in.m_prime, stream_ready->m_prime,
   HDCP_2_2_MPRIME_LEN);
-   drm_hdcp2_u32_to_seq_num(verify_mprime_in.seq_num_m, data->seq_num_m);
+   drm_hdcp_cpu_to_be24(verify_mprime_in.seq_num_m, data->seq_num_m);
memcpy(verify_mprime_in.streams, data->streams,
   (data->k * sizeof(struct hdcp2_streamid_type)));
 
diff --git a/include/drm/drm_hdcp.h b/include/drm/drm_hdcp.h
index f243408ecf26..1cc66df05a43 100644
--- a/include/drm/drm_hdcp.h
+++ b/include/drm/drm_hdcp.h
@@ -252,13 +252,13 @@ struct hdcp2_rep_stream_ready {
  * host format and back
  */
 static inline
-u32 drm_hdcp2_seq_num_to_u32(u8 seq_num[HDCP_2_2_SEQ_NUM_LEN])
+u32 drm_hdcp_be24_to_cpu(const u8 seq_num[HDCP_2_2_SEQ_NUM_LEN])
 {
return (u32)(seq_num[2] | seq_num[1] << 8 | seq_num[0] << 16);
 }
 
 static inline
-void drm_hdcp2_u32_to_seq_num(u8 seq_num[HDCP_2_2_SEQ_NUM_LEN], u32 val)
+void drm_hdcp_cpu_to_be24(u8 seq_num[HDCP_2_2_SEQ_NUM_LEN], u32 val)
 {
seq_num[0] = val >> 16;
seq_num[1] = val >> 8;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 08/20] drm/amdgpu: psp HDCP init

2019-08-29 Thread Bhawanpreet Lakha
This patch adds
-Loading the firmware
-The functions and definitions for communication with the firmware
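
The expected call flow from the HDCP module side is roughly (sketch; the
concrete ta_cmd_id values come from the TA interface header and are not
shown here):

    ret = psp_hdcp_invoke(&adev->psp, ta_cmd_id);
    if (ret)
            DRM_ERROR("HDCP TA invoke failed (%d)\n", ret);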

Signed-off-by: Bhawanpreet Lakha 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 188 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  17 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |   3 +
 drivers/gpu/drm/amd/amdgpu/psp_v10_0.c|  31 +++-
 4 files changed, 237 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 9f7cc5bcc037..ccce1b506a12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -769,6 +769,179 @@ static int psp_ras_initialize(struct psp_context *psp)
 }
 // ras end
 
+// HDCP start
+static void psp_prep_hdcp_ta_load_cmd_buf(struct psp_gfx_cmd_resp *cmd,
+   uint64_t hdcp_ta_mc, uint64_t hdcp_mc_shared,
+   uint32_t hdcp_ta_size, uint32_t shared_size)
+{
+   cmd->cmd_id = GFX_CMD_ID_LOAD_TA;
+   cmd->cmd.cmd_load_ta.app_phy_addr_lo = lower_32_bits(hdcp_ta_mc);
+   cmd->cmd.cmd_load_ta.app_phy_addr_hi = upper_32_bits(hdcp_ta_mc);
+   cmd->cmd.cmd_load_ta.app_len = hdcp_ta_size;
+
+   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_lo = 
lower_32_bits(hdcp_mc_shared);
+   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_hi = 
upper_32_bits(hdcp_mc_shared);
+   cmd->cmd.cmd_load_ta.cmd_buf_len = shared_size;
+}
+
+static int psp_hdcp_init_shared_buf(struct psp_context *psp)
+{
+   int ret;
+
+   /*
+* Allocate 16k memory aligned to 4k from Frame Buffer (local
+* physical) for hdcp ta <-> Driver
+*/
+   ret = amdgpu_bo_create_kernel(psp->adev, PSP_HDCP_SHARED_MEM_SIZE,
+   PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
+			&psp->hdcp_context.hdcp_shared_bo,
+			&psp->hdcp_context.hdcp_shared_mc_addr,
+			&psp->hdcp_context.hdcp_shared_buf);
+
+   return ret;
+}
+
+static int psp_hdcp_load(struct psp_context *psp)
+{
+   int ret;
+   struct psp_gfx_cmd_resp *cmd;
+
+   /*
+* TODO: bypass the loading in sriov for now
+*/
+   if (amdgpu_sriov_vf(psp->adev))
+   return 0;
+
+   cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
+   if (!cmd)
+   return -ENOMEM;
+
+   memset(psp->fw_pri_buf, 0, PSP_1_MEG);
+   memcpy(psp->fw_pri_buf, psp->ta_hdcp_start_addr, 
psp->ta_hdcp_ucode_size);
+
+   psp_prep_hdcp_ta_load_cmd_buf(cmd, psp->fw_pri_mc_addr,
+   psp->hdcp_context.hdcp_shared_mc_addr,
+   psp->ta_hdcp_ucode_size, PSP_HDCP_SHARED_MEM_SIZE);
+
+   ret = psp_cmd_submit_buf(psp, NULL, cmd,
+   psp->fence_buf_mc_addr);
+
+   if (!ret) {
+   psp->hdcp_context.hdcp_initialized = 1;
+   psp->hdcp_context.session_id = cmd->resp.session_id;
+   }
+
+   kfree(cmd);
+
+   return ret;
+}
+static int psp_hdcp_initialize(struct psp_context *psp)
+{
+   int ret;
+
+   if (!psp->hdcp_context.hdcp_initialized) {
+   ret = psp_hdcp_init_shared_buf(psp);
+   if (ret)
+   return ret;
+   }
+
+   ret = psp_hdcp_load(psp);
+   if (ret)
+   return ret;
+
+   return 0;
+}
+static void psp_prep_hdcp_ta_unload_cmd_buf(struct psp_gfx_cmd_resp *cmd,
+   uint32_t hdcp_session_id)
+{
+   cmd->cmd_id = GFX_CMD_ID_UNLOAD_TA;
+   cmd->cmd.cmd_unload_ta.session_id = hdcp_session_id;
+}
+
+static int psp_hdcp_unload(struct psp_context *psp)
+{
+   int ret;
+   struct psp_gfx_cmd_resp *cmd;
+
+   /*
+* TODO: bypass the unloading in sriov for now
+*/
+   if (amdgpu_sriov_vf(psp->adev))
+   return 0;
+
+   cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
+   if (!cmd)
+   return -ENOMEM;
+
+   psp_prep_hdcp_ta_unload_cmd_buf(cmd, psp->hdcp_context.session_id);
+
+   ret = psp_cmd_submit_buf(psp, NULL, cmd,
+   psp->fence_buf_mc_addr);
+
+   kfree(cmd);
+
+   return ret;
+}
+
+static void psp_prep_hdcp_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd,
+   uint32_t ta_cmd_id,
+   uint32_t hdcp_session_id)
+{
+   cmd->cmd_id = GFX_CMD_ID_INVOKE_CMD;
+   cmd->cmd.cmd_invoke_cmd.session_id = hdcp_session_id;
+   cmd->cmd.cmd_invoke_cmd.ta_cmd_id = ta_cmd_id;
+   /* Note: cmd_invoke_cmd.buf is not used for now */
+}
+
+int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
+{
+   int ret;
+   struct psp_gfx_cmd_resp *cmd;
+
+   /*
+* TODO: bypass the loading in sriov for now
+*/
+   if (amdgpu_sriov_vf(psp->adev))
+   return 0;
+
+   cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
+   if (!cmd)
+   return -ENOMEM;
+
+  

[PATCH 13/20] drm/amd/display: Create amdgpu_dm_hdcp

2019-08-29 Thread Bhawanpreet Lakha
[Why]
We need to interact with the hdcp module from the DM, and the module
is driven entirely by events.

[How]
Create the files needed for linux hdcp. These files manage the events
needed for the dm to interact with the hdcp module.

We use the kernel work queue to process the events needed for
the module (see the sketch below).
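
For example, the DM is expected to feed events into the queue like this
(signature from this patch; caller shape assumed):

    /* on a newly detected link, start HDCP event processing */
    hdcp_add_display(adev->dm.hdcp_workqueue, aconnector->dc_link->link_index);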

Signed-off-by: Bhawanpreet Lakha 
---
 .../gpu/drm/amd/display/amdgpu_dm/Makefile|   4 +
 .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.c| 241 ++
 .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.h|  61 +
 3 files changed, 306 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
 create mode 100644 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.h

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/Makefile 
b/drivers/gpu/drm/amd/display/amdgpu_dm/Makefile
index 94911871eb9b..9a3b7bf8ab0b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/Makefile
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/Makefile
@@ -31,6 +31,10 @@ ifneq ($(CONFIG_DRM_AMD_DC),)
 AMDGPUDM += amdgpu_dm_services.o amdgpu_dm_helpers.o amdgpu_dm_pp_smu.o
 endif
 
+ifdef CONFIG_DRM_AMD_DC_HDCP
+AMDGPUDM += amdgpu_dm_hdcp.o
+endif
+
 ifneq ($(CONFIG_DEBUG_FS),)
 AMDGPUDM += amdgpu_dm_crc.o amdgpu_dm_debugfs.o
 endif
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
new file mode 100644
index ..004b6e8e9ed5
--- /dev/null
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: AMD
+ *
+ */
+
+#include "amdgpu_dm_hdcp.h"
+#include "amdgpu.h"
+#include "amdgpu_dm.h"
+
+static void process_output(struct hdcp_workqueue *hdcp_work)
+{
+   struct mod_hdcp_output output = hdcp_work->output;
+
+   if (output.callback_stop)
+		cancel_delayed_work(&hdcp_work->callback_dwork);
+
+	if (output.callback_needed)
+		schedule_delayed_work(&hdcp_work->callback_dwork,
+				      msecs_to_jiffies(output.callback_delay));
+
+	if (output.watchdog_timer_stop)
+		cancel_delayed_work(&hdcp_work->watchdog_timer_dwork);
+
+	if (output.watchdog_timer_needed)
+		schedule_delayed_work(&hdcp_work->watchdog_timer_dwork,
+				      msecs_to_jiffies(output.watchdog_timer_delay));
+
+}
+
+void hdcp_add_display(struct hdcp_workqueue *hdcp_work, unsigned int link_index)
+{
+	struct hdcp_workqueue *hdcp_w = &hdcp_work[link_index];
+	struct mod_hdcp_display *display = &hdcp_work[link_index].display;
+	struct mod_hdcp_link *link = &hdcp_work[link_index].link;
+
+	mutex_lock(&hdcp_w->mutex);
+
+	mod_hdcp_add_display(&hdcp_w->hdcp, link, display, &hdcp_w->output);
+
+	process_output(hdcp_w);
+
+	mutex_unlock(&hdcp_w->mutex);
+
+}
+
+void hdcp_remove_display(struct hdcp_workqueue *hdcp_work, unsigned int link_index,  unsigned int display_index)
+{
+	struct hdcp_workqueue *hdcp_w = &hdcp_work[link_index];
+
+	mutex_lock(&hdcp_w->mutex);
+
+	mod_hdcp_remove_display(&hdcp_w->hdcp, display_index, &hdcp_w->output);
+
+	process_output(hdcp_w);
+
+	mutex_unlock(&hdcp_w->mutex);
+
+}
+
+void hdcp_reset_display(struct hdcp_workqueue *hdcp_work, unsigned int link_index)
+{
+	struct hdcp_workqueue *hdcp_w = &hdcp_work[link_index];
+
+	mutex_lock(&hdcp_w->mutex);
+
+	mod_hdcp_reset_connection(&hdcp_w->hdcp, &hdcp_w->output);
+
+	process_output(hdcp_w);
+
+	mutex_unlock(&hdcp_w->mutex);
+}
+
+void hdcp_handle_cpirq(struct hdcp_workqueue *hdcp_work, unsigned int link_index)
+{
+	struct hdcp_workqueue *hdcp_w = &hdcp_work[link_index];
+
+	schedule_work(&hdcp_w->cpirq_work);
+}
+
+
+
+
+static void event_callback(struct work_struct *work)
+{
+   struct hdcp_workqueue *hdcp_work;
+
+   hdcp_work = container_of(to_delayed_work(work), 

[PATCH 05/20] drm: Add Content protection type property

2019-08-29 Thread Bhawanpreet Lakha
From: Ramalingam C 

This patch adds a DRM ENUM property to the selected connectors.
This property is used for mentioning the protected content's type
from userspace to kernel HDCP authentication.

Type of the stream is decided by the protected content providers.
Type 0 content can be rendered on any HDCP protected display wires.
But Type 1 content can be rendered only on HDCP2.2 protected paths.

So when userspace sets this property to Type 1 and starts the HDCP
enable, the kernel will honour it only if HDCP2.2 authentication succeeds
for Type 1. Otherwise the HDCP enable will fail.
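
A hypothetical userspace sequence (libdrm; the property ids must be
discovered at runtime, and the numeric values map to the enum names above):

    /* "HDCP Content Type" = "HDCP Type1" (1) */
    drmModeObjectSetProperty(fd, connector_id, DRM_MODE_OBJECT_CONNECTOR,
                             hdcp_content_type_prop_id, 1);
    /* "Content Protection" = "Desired" (1) */
    drmModeObjectSetProperty(fd, connector_id, DRM_MODE_OBJECT_CONNECTOR,
                             content_protection_prop_id, 1);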

Pekka has completed the Weston DRM-backend review in
https://gitlab.freedesktop.org/wayland/weston/merge_requests/48
and the UAPI for HDCP 2.2 looks good.

The userspace is accepted in Weston.

v2:
  cp_content_type is replaced with content_protection_type [daniel]
  check at atomic_set_property is removed [Maarten]
v3:
  %s/content_protection_type/hdcp_content_type [Pekka]
v4:
  property is created for the first requested connector and then reused.
[Danvet]
v5:
  kernel doc nits addressed [Daniel]
  Rebased as part of patch reordering.
v6:
  Kernel docs are modified [pekka]
v7:
  More details in Kernel docs. [pekka]
v8:
  Few more clarification into kernel doc of content type [pekka]
v9:
  Small fixes in coding style.
v10:
  Moving DRM_MODE_HDCP_CONTENT_TYPEx definition to drm_hdcp.h [pekka]

Signed-off-by: Ramalingam C 
Reviewed-by: Daniel Vetter 
Acked-by: Pekka Paalanen 
Acked-by: Jani Nikula 
Link: https://patchwork.freedesktop.org/patch/320957/?series=57232=14
---
 drivers/gpu/drm/drm_atomic_uapi.c |  4 +++
 drivers/gpu/drm/drm_connector.c   | 51 +++
 drivers/gpu/drm/drm_hdcp.c| 36 +-
 drivers/gpu/drm/i915/intel_hdcp.c |  4 ++-
 include/drm/drm_connector.h   |  7 +
 include/drm/drm_hdcp.h|  7 -
 include/drm/drm_mode_config.h |  6 
 7 files changed, 112 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c 
b/drivers/gpu/drm/drm_atomic_uapi.c
index 4131e669785a..a85f3ccfe699 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -738,6 +738,8 @@ static int drm_atomic_connector_set_property(struct 
drm_connector *connector,
return -EINVAL;
}
state->content_protection = val;
+   } else if (property == config->hdcp_content_type_property) {
+   state->hdcp_content_type = val;
} else if (property == connector->colorspace_property) {
state->colorspace = val;
} else if (property == config->writeback_fb_id_property) {
@@ -816,6 +818,8 @@ drm_atomic_connector_get_property(struct drm_connector 
*connector,
*val = state->scaling_mode;
} else if (property == config->content_protection_property) {
*val = state->content_protection;
+   } else if (property == config->hdcp_content_type_property) {
+   *val = state->hdcp_content_type;
} else if (property == config->writeback_fb_id_property) {
/* Writeback framebuffer is one-shot, write and forget */
*val = 0;
diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 11fcd25bc640..3b0910b36ef5 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -956,6 +956,57 @@ static const struct drm_prop_enum_list hdmi_colorspaces[] 
= {
  *   is no longer protected and userspace should take appropriate action
  *   (whatever that might be).
  *
+ * HDCP Content Type:
+ * This Enum property is used by the userspace to declare the content type
+ * of the display stream, to kernel. Here display stream stands for any
+ * display content that userspace intended to display through HDCP
+ * encryption.
+ *
+ * Content Type of a stream is decided by the owner of the stream, as
+ * "HDCP Type0" or "HDCP Type1".
+ *
+ * The value of the property can be one of the below:
+ *   - "HDCP Type0": DRM_MODE_HDCP_CONTENT_TYPE0 = 0
+ *   - "HDCP Type1": DRM_MODE_HDCP_CONTENT_TYPE1 = 1
+ *
+ * When kernel starts the HDCP authentication (see "Content Protection"
+ * for details), it uses the content type in "HDCP Content Type"
+ * for performing the HDCP authentication with the display sink.
+ *
+ * Please note that across HDCP spec versions, a link can be authenticated
+ * with HDCP2.2 for Content Type 0/Content Type 1, whereas a link can be
+ * authenticated with HDCP1.4 only for Content Type 0 (though implicitly,
+ * as there is no reference to Content Type in HDCP1.4).
+ *
+ * HDCP2.2 authentication protocol itself takes the "Content Type" as a
+ * parameter, which is a input for the DP HDCP2.2 encryption algo.
+ *
+ * In case of Type 0 content protection request, kernel driver can choose
+ * either of HDCP spec 

[PATCH 19/20] drm/amd/display: only enable HDCP for DCN+

2019-08-29 Thread Bhawanpreet Lakha
[Why]
We don't support HDCP for pre RAVEN asics

[How]
Check if we are RAVEN+. Use this to attach the content_protection
property, this way usermode can't try to enable HDCP on pre DCN asics.

Also we need to update the module on hpd so guard it aswell

Change-Id: I1f425bca6eb1139a4b0fe808c455df148ca0925e
Signed-off-by: Bhawanpreet Lakha 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 20 +++
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index e3f547490b0e..2b4eead187f6 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -733,14 +733,16 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
amdgpu_dm_init_color_mod();
 
 #ifdef CONFIG_DRM_AMD_DC_HDCP
-	adev->dm.hdcp_workqueue = hdcp_create_workqueue(&adev->psp, &init_params.cp_psp, adev->dm.dc);
+	if (adev->asic_type >= CHIP_RAVEN) {
+		adev->dm.hdcp_workqueue = hdcp_create_workqueue(&adev->psp, &init_params.cp_psp, adev->dm.dc);
 
-	if (!adev->dm.hdcp_workqueue)
-		DRM_ERROR("amdgpu: failed to initialize hdcp_workqueue.\n");
-	else
-		DRM_DEBUG_DRIVER("amdgpu: hdcp_workqueue init done %p.\n", adev->dm.hdcp_workqueue);
+		if (!adev->dm.hdcp_workqueue)
+			DRM_ERROR("amdgpu: failed to initialize hdcp_workqueue.\n");
+		else
+			DRM_DEBUG_DRIVER("amdgpu: hdcp_workqueue init done %p.\n", adev->dm.hdcp_workqueue);
 
-	dc_init_callbacks(adev->dm.dc, &init_params);
+		dc_init_callbacks(adev->dm.dc, &init_params);
+   }
 #endif
if (amdgpu_dm_initialize_drm_device(adev)) {
DRM_ERROR(
@@ -1496,7 +1498,8 @@ static void handle_hpd_irq(void *param)
mutex_lock(>hpd_lock);
 
 #ifdef CONFIG_DRM_AMD_DC_HDCP
-	hdcp_reset_display(adev->dm.hdcp_workqueue, aconnector->dc_link->link_index);
+	if (adev->asic_type >= CHIP_RAVEN)
+		hdcp_reset_display(adev->dm.hdcp_workqueue, aconnector->dc_link->link_index);
 #endif
if (aconnector->fake_enable)
aconnector->fake_enable = false;
@@ -5086,7 +5089,8 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm,
	drm_object_attach_property(&aconnector->base.base,
		adev->mode_info.freesync_capable_property, 0);
 #ifdef CONFIG_DRM_AMD_DC_HDCP
-	drm_connector_attach_content_protection_property(&aconnector->base, false);
+	if (adev->asic_type >= CHIP_RAVEN)
+		drm_connector_attach_content_protection_property(&aconnector->base, false);
 #endif
}
 }
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 03/20] drm: revocation check at drm subsystem

2019-08-29 Thread Bhawanpreet Lakha
From: Ramalingam C 

On every HDCP revocation check request the SRM is read from the firmware file
/lib/firmware/display_hdcp_srm.bin

The SRM table is parsed and stored in drm_hdcp.c, with functions exported
to provide revocation-check services to the drivers (which
implement the HDCP authentication).

This patch handles the HDCP1.4 and 2.2 versions of SRM table.
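
Driver-side usage is expected to look roughly like this (sketch; the
signature is the one exported by this patch, error handling simplified):

    /* reject the receiver if its BKSV shows up in the revoked list */
    if (drm_hdcp_check_ksvs_revoked(dev, bksv, 1)) {
            DRM_ERROR("BKSV is revoked\n");
            return -EPERM;
    }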

v2:
  moved the uAPI to request_firmware_direct() [Daniel]
v3:
  kdoc added. [Daniel]
  srm_header unified and bit field definitions are removed. [Daniel]
  locking improved. [Daniel]
  vrl length violation is fixed. [Daniel]
v4:
  s/__swab16/be16_to_cpu [Daniel]
  be24_to_cpu is done through a global func [Daniel]
  Unused variables are removed. [Daniel]
  unchecked return values are dropped from static funcs [Daniel]

Signed-off-by: Ramalingam C 
Acked-by: Satyeshwar Singh 
Reviewed-by: Daniel Vetter 
Acked-by: Dave Airlie 
Signed-off-by: Daniel Vetter 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20190507162745.25600-5-ramalinga...@intel.com
---
 Documentation/gpu/drm-kms-helpers.rst |   6 +
 drivers/gpu/drm/Makefile  |   2 +-
 drivers/gpu/drm/drm_hdcp.c| 333 ++
 drivers/gpu/drm/drm_internal.h|   4 +
 drivers/gpu/drm/drm_sysfs.c   |   2 +
 include/drm/drm_hdcp.h|  24 ++
 6 files changed, 370 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/drm_hdcp.c

diff --git a/Documentation/gpu/drm-kms-helpers.rst 
b/Documentation/gpu/drm-kms-helpers.rst
index 14102ae035dc..0fe726a6ee67 100644
--- a/Documentation/gpu/drm-kms-helpers.rst
+++ b/Documentation/gpu/drm-kms-helpers.rst
@@ -181,6 +181,12 @@ Panel Helper Reference
 .. kernel-doc:: drivers/gpu/drm/drm_panel_orientation_quirks.c
:export:
 
+HDCP Helper Functions Reference
+===
+
+.. kernel-doc:: drivers/gpu/drm/drm_hdcp.c
+   :export:
+
 Display Port Helper Functions Reference
 ===
 
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index f204830669e2..7fa09ea00ffd 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -17,7 +17,7 @@ drm-y   :=drm_auth.o drm_cache.o \
drm_plane.o drm_color_mgmt.o drm_print.o \
drm_dumb_buffers.o drm_mode_config.o drm_vblank.o \
drm_syncobj.o drm_lease.o drm_writeback.o drm_client.o \
-   drm_atomic_uapi.o
+   drm_atomic_uapi.o drm_hdcp.o
 
 drm-$(CONFIG_DRM_LEGACY) += drm_legacy_misc.o drm_bufs.o drm_context.o 
drm_dma.o drm_scatter.o drm_lock.o
 drm-$(CONFIG_DRM_LIB_RANDOM) += lib/drm_random.o
diff --git a/drivers/gpu/drm/drm_hdcp.c b/drivers/gpu/drm/drm_hdcp.c
new file mode 100644
index ..5e5409505c31
--- /dev/null
+++ b/drivers/gpu/drm/drm_hdcp.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Intel Corporation.
+ *
+ * Authors:
+ * Ramalingam C 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+struct hdcp_srm {
+   u32 revoked_ksv_cnt;
+   u8 *revoked_ksv_list;
+
+   /* Mutex to protect above struct member */
+   struct mutex mutex;
+} *srm_data;
+
+static inline void drm_hdcp_print_ksv(const u8 *ksv)
+{
+   DRM_DEBUG("\t%#02x, %#02x, %#02x, %#02x, %#02x\n",
+ ksv[0], ksv[1], ksv[2], ksv[3], ksv[4]);
+}
+
+static u32 drm_hdcp_get_revoked_ksv_count(const u8 *buf, u32 vrls_length)
+{
+   u32 parsed_bytes = 0, ksv_count = 0, vrl_ksv_cnt, vrl_sz;
+
+   while (parsed_bytes < vrls_length) {
+   vrl_ksv_cnt = *buf;
+   ksv_count += vrl_ksv_cnt;
+
+   vrl_sz = (vrl_ksv_cnt * DRM_HDCP_KSV_LEN) + 1;
+   buf += vrl_sz;
+   parsed_bytes += vrl_sz;
+   }
+
+   /*
+* When vrls are not valid, ksvs are not considered.
+* Hence SRM will be discarded.
+*/
+   if (parsed_bytes != vrls_length)
+   ksv_count = 0;
+
+   return ksv_count;
+}
+
+static u32 drm_hdcp_get_revoked_ksvs(const u8 *buf, u8 *revoked_ksv_list,
+u32 vrls_length)
+{
+   u32 parsed_bytes = 0, ksv_count = 0;
+   u32 vrl_ksv_cnt, vrl_ksv_sz, vrl_idx = 0;
+
+   do {
+   vrl_ksv_cnt = *buf;
+   vrl_ksv_sz = vrl_ksv_cnt * DRM_HDCP_KSV_LEN;
+
+   buf++;
+
+   DRM_DEBUG("vrl: %d, Revoked KSVs: %d\n", vrl_idx++,
+ vrl_ksv_cnt);
+   memcpy(revoked_ksv_list, buf, vrl_ksv_sz);
+
+   ksv_count += vrl_ksv_cnt;
+   revoked_ksv_list += vrl_ksv_sz;
+   buf += vrl_ksv_sz;
+
+   parsed_bytes += (vrl_ksv_sz + 1);
+   } while (parsed_bytes < vrls_length);
+
+   return ksv_count;
+}
+
+static inline u32 get_vrl_length(const u8 *buf)
+{
+   return drm_hdcp_be24_to_cpu(buf);
+}
+
+static int 

[PATCH 14/20] drm/amd/display: Create dpcd and i2c packing functions

2019-08-29 Thread Bhawanpreet Lakha
[Why]
We need to read and write specific i2c and dpcd messages.

[How]
Created static functions for packing the dpcd and i2c messages for hdcp.
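
The read helper uses the classic two-payload I2C transaction; as a standalone
sketch of just the packing:

    /* first payload writes the register offset, second reads the data */
    struct i2c_payload payloads[] = {
            { true,  address, 1,    &offset },
            { false, address, size, data    },
    };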

Signed-off-by: Bhawanpreet Lakha 
---
 .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.c| 40 ++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
index 004b6e8e9ed5..9d11d7695508 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
@@ -26,6 +26,41 @@
 #include "amdgpu_dm_hdcp.h"
 #include "amdgpu.h"
 #include "amdgpu_dm.h"
+#include "dm_helpers.h"
+
+bool lp_write_i2c(void *handle, uint32_t address, const uint8_t *data, uint32_t size)
+{
+
+	struct dc_link *link = handle;
+	struct i2c_payload i2c_payloads[] = {{true, address, size, (void *)data} };
+	struct i2c_command cmd = {i2c_payloads, 1, I2C_COMMAND_ENGINE_HW, link->dc->caps.i2c_speed_in_khz};
+
+	return dm_helpers_submit_i2c(link->ctx, link, &cmd);
+}
+
+bool lp_read_i2c(void *handle, uint32_t address, uint8_t offset, uint8_t *data, uint32_t size)
+{
+	struct dc_link *link = handle;
+
+	struct i2c_payload i2c_payloads[] = {{true, address, 1, &offset}, {false, address, size, data} };
+	struct i2c_command cmd = {i2c_payloads, 2, I2C_COMMAND_ENGINE_HW, link->dc->caps.i2c_speed_in_khz};
+
+	return dm_helpers_submit_i2c(link->ctx, link, &cmd);
+}
+
+bool lp_write_dpcd(void *handle, uint32_t address, const uint8_t *data, uint32_t size)
+{
+   struct dc_link *link = handle;
+
+   return dm_helpers_dp_write_dpcd(link->ctx, link, address, data, size);
+}
+
+bool lp_read_dpcd(void *handle, uint32_t address, uint8_t *data, uint32_t size)
+{
+   struct dc_link *link = handle;
+
+   return dm_helpers_dp_read_dpcd(link->ctx, link, address, data, size);
+}
 
 static void process_output(struct hdcp_workqueue *hdcp_work)
 {
@@ -220,7 +255,10 @@ struct hdcp_workqueue *hdcp_create_workqueue(void 
*psp_context, struct cp_psp *c
 
hdcp_work[i].hdcp.config.psp.handle =  psp_context;
hdcp_work[i].hdcp.config.ddc.handle = dc_get_link_at_index(dc, 
i);
-
+   hdcp_work[i].hdcp.config.ddc.funcs.write_i2c = lp_write_i2c;
+   hdcp_work[i].hdcp.config.ddc.funcs.read_i2c = lp_read_i2c;
+   hdcp_work[i].hdcp.config.ddc.funcs.write_dpcd = lp_write_dpcd;
+   hdcp_work[i].hdcp.config.ddc.funcs.read_dpcd = lp_read_dpcd;
}
 
cp_psp->funcs.update_stream_config = update_config;
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 07/20] drm/hdcp: update content protection property with uevent

2019-08-29 Thread Bhawanpreet Lakha
From: Ramalingam C 

A drm function is defined and exported to update a connector's
content protection property state and to generate a uevent along
with it.

Pekka has completed the Weston DRM-backend review in
https://gitlab.freedesktop.org/wayland/weston/merge_requests/48
and the UAPI for HDCP 2.2 looks good.

The userspace is accepted in Weston.
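
A hypothetical userspace listener (libudev, heavily simplified) that reacts
to the uevent instead of polling:

    struct udev *udev = udev_new();
    struct udev_monitor *mon = udev_monitor_new_from_netlink(udev, "udev");

    udev_monitor_filter_add_match_subsystem_devtype(mon, "drm", NULL);
    udev_monitor_enable_receiving(mon);
    /* poll() on udev_monitor_get_fd(mon), then
     * udev_monitor_receive_device() and re-read the property
     */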

v2:
  Update only when state is different from old one.
v3:
  KDoc is added [Daniel]
v4:
  KDoc is extended bit more [pekka]
v5:
  Uevent usage is documented at kdoc of "Content Protection" also
  [pekka]

Signed-off-by: Ramalingam C 
Reviewed-by: Daniel Vetter 
Acked-by: Pekka Paalanen 
Acked-by: Jani Nikula 
Link: https://patchwork.freedesktop.org/patch/320963/?series=57232=14
---
 drivers/gpu/drm/drm_connector.c | 17 +
 drivers/gpu/drm/drm_hdcp.c  | 34 +
 include/drm/drm_hdcp.h  |  2 ++
 3 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 3b0910b36ef5..3a0cacb71235 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -951,10 +951,19 @@ static const struct drm_prop_enum_list hdmi_colorspaces[] 
= {
  * - If the state is DESIRED, kernel should attempt to re-authenticate the
  *   link whenever possible. This includes across disable/enable, dpms,
  *   hotplug, downstream device changes, link status failures, etc..
- * - Userspace is responsible for polling the property to determine when
- *   the value transitions from ENABLED to DESIRED. This signifies the link
- *   is no longer protected and userspace should take appropriate action
- *   (whatever that might be).
+ * - The kernel sends a uevent with the connector id and property id through
+ *   @drm_hdcp_update_content_protection, upon the following kernel-triggered
+ *   scenarios:
+ * DESIRED -> ENABLED  (authentication success)
+ * ENABLED -> DESIRED  (termination of authentication)
+ * - Please note that no uevents are sent for userspace-triggered property
+ *   state changes, which cannot fail, such as
+ * DESIRED/ENABLED -> UNDESIRED
+ * UNDESIRED -> DESIRED
+ * - Userspace is responsible for polling the property or listening to uevents
+ *   to determine when the value transitions from ENABLED to DESIRED.
+ *   This signifies the link is no longer protected and userspace should
+ *   take appropriate action (whatever that might be).
  *
  * HDCP Content Type:
  * This Enum property is used by the userspace to declare the content type
diff --git a/drivers/gpu/drm/drm_hdcp.c b/drivers/gpu/drm/drm_hdcp.c
index 75402463466b..1e2a50bcab7e 100644
--- a/drivers/gpu/drm/drm_hdcp.c
+++ b/drivers/gpu/drm/drm_hdcp.c
@@ -372,6 +372,10 @@ DRM_ENUM_NAME_FN(drm_get_hdcp_content_type_name,
  *
 * The content protection will be set to &drm_connector_state.content_protection
  *
+ * When the kernel triggers a content protection state change such as
+ * DESIRED->ENABLED or ENABLED->DESIRED, it uses
+ * drm_hdcp_update_content_protection() to update the content protection
+ * state of the connector.
+ *
  * Returns:
  * Zero on success, negative errno on failure.
  */
@@ -412,3 +416,33 @@ int drm_connector_attach_content_protection_property(
return 0;
 }
 EXPORT_SYMBOL(drm_connector_attach_content_protection_property);
+
+/**
+ * drm_hdcp_update_content_protection - Updates the content protection state
+ * of a connector
+ *
+ * @connector: drm_connector on which content protection state needs an update
+ * @val: New state of the content protection property
+ *
+ * This function can be used by display drivers to update kernel-triggered
+ * content protection state changes of a drm_connector, such as
+ * DESIRED->ENABLED and ENABLED->DESIRED. No uevent is sent for
+ * DESIRED->UNDESIRED or ENABLED->UNDESIRED, as userspace triggers such state
+ * changes and the kernel performs them without fail. This function updates
+ * the new state of the property in the connector's state and generates a
+ * uevent to notify userspace.
+ */
+void drm_hdcp_update_content_protection(struct drm_connector *connector,
+   u64 val)
+{
+   struct drm_device *dev = connector->dev;
+   struct drm_connector_state *state = connector->state;
+
+   WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex));
+   if (state->content_protection == val)
+   return;
+
+   state->content_protection = val;
+   drm_sysfs_connector_status_event(connector,
+dev->mode_config.content_protection_property);
+}
+EXPORT_SYMBOL(drm_hdcp_update_content_protection);
diff --git a/include/drm/drm_hdcp.h b/include/drm/drm_hdcp.h
index 82447af98aa2..06a11202a097 100644
--- a/include/drm/drm_hdcp.h
+++ b/include/drm/drm_hdcp.h
@@ -292,6 +292,8 @@ bool drm_hdcp_check_ksvs_revoked(struct drm_device *dev,
 

[PATCH 15/20] drm/amd/display: Initialize HDCP work queue

2019-08-29 Thread Bhawanpreet Lakha
[Why]
We need this to enable HDCP on Linux, as we need events to interact
with the hdcp module

[How]
Add work queue to display manager and handle the creation and destruction
of the queue

Signed-off-by: Bhawanpreet Lakha 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 30 +++
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  3 ++
 2 files changed, 33 insertions(+)
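
For orientation, the registration flow added below reduces to the following
sketch (Kconfig guards and error paths trimmed; all names are from this
patch):

    /* init: create the work queue and hand the PSP callbacks to DC */
    struct dc_callback_init init_params;

    memset(&init_params, 0, sizeof(init_params));
    adev->dm.hdcp_workqueue = hdcp_create_workqueue(&adev->psp,
                                                    &init_params.cp_psp,
                                                    adev->dm.dc);
    dc_init_callbacks(adev->dm.dc, &init_params);

    /* fini: tear down in reverse order */
    hdcp_destroy(adev->dm.hdcp_workqueue);
    adev->dm.hdcp_workqueue = NULL;
    dc_deinit_callbacks(adev->dm.dc);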

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 160af0c8b40c..bac9cf5be473 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -37,6 +37,9 @@
 #include "amdgpu_ucode.h"
 #include "atom.h"
 #include "amdgpu_dm.h"
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+#include "amdgpu_dm_hdcp.h"
+#endif
 #include "amdgpu_pm.h"
 
 #include "amd_shared.h"
@@ -644,11 +647,18 @@ void amdgpu_dm_audio_eld_notify(struct amdgpu_device 
*adev, int pin)
 static int amdgpu_dm_init(struct amdgpu_device *adev)
 {
struct dc_init_data init_data;
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   struct dc_callback_init init_params;
+#endif
+
adev->dm.ddev = adev->ddev;
adev->dm.adev = adev;
 
/* Zero all the fields */
memset(&init_data, 0, sizeof(init_data));
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   memset(&init_params, 0, sizeof(init_params));
+#endif
 
mutex_init(&adev->dm.dc_lock);
mutex_init(&adev->dm.audio_lock);
@@ -721,6 +731,16 @@ static int amdgpu_dm_init(struct amdgpu_device *adev)
 
amdgpu_dm_init_color_mod();
 
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   adev->dm.hdcp_workqueue = hdcp_create_workqueue(&adev->psp, 
&init_params.cp_psp, adev->dm.dc);
+
+   if (!adev->dm.hdcp_workqueue)
+   DRM_ERROR("amdgpu: failed to initialize hdcp_workqueue.\n");
+   else
+   DRM_DEBUG_DRIVER("amdgpu: hdcp_workqueue init done %p.\n", 
adev->dm.hdcp_workqueue);
+
+   dc_init_callbacks(adev->dm.dc, &init_params);
+#endif
if (amdgpu_dm_initialize_drm_device(adev)) {
DRM_ERROR(
"amdgpu: failed to initialize sw for display support.\n");
@@ -762,6 +782,16 @@ static void amdgpu_dm_fini(struct amdgpu_device *adev)
 
amdgpu_dm_destroy_drm_device(&adev->dm);
 
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   if (adev->dm.hdcp_workqueue) {
+   hdcp_destroy(adev->dm.hdcp_workqueue);
+   adev->dm.hdcp_workqueue = NULL;
+   }
+
+   if (adev->dm.dc)
+   dc_deinit_callbacks(adev->dm.dc);
+#endif
+
/* DC Destroy TODO: Replace destroy DAL */
if (adev->dm.dc)
dc_destroy(&adev->dm.dc);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index cbd6608f58e6..7a34eca12dab 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
@@ -222,6 +222,9 @@ struct amdgpu_display_manager {
struct amdgpu_dm_backlight_caps backlight_caps;
 
struct mod_freesync *freesync_module;
+#ifdef CONFIG_DRM_AMD_DC_HDCP
+   struct hdcp_workqueue *hdcp_workqueue;
+#endif
 
struct drm_atomic_state *cached_state;
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 11/20] drm/amd/display: add PSP block to verify hdcp steps

2019-08-29 Thread Bhawanpreet Lakha
[Why]
All the HDCP transactions should be verified using PSP.

[How]
This patch calls psp with the correct inputs to verify the steps
of authentication.

Signed-off-by: Bhawanpreet Lakha 
---
 .../drm/amd/display/modules/hdcp/hdcp_psp.c   | 328 ++
 .../drm/amd/display/modules/hdcp/hdcp_psp.h   | 272 +++
 2 files changed, 600 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
 create mode 100644 drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.h
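
As context for the new files: the HDCP state machine brackets authentication
with the two topology helpers added here. A minimal sketch of a caller (error
handling trimmed; the real call sites live in the module's execution code):

    enum mod_hdcp_status status;

    /* register the active displays with the DTM TA first */
    status = mod_hdcp_add_display_topology(hdcp);
    if (status != MOD_HDCP_STATUS_SUCCESS)
            return status;

    /* ... HDCP 1.4 authentication steps, each verified through PSP ... */

    /* drop the displays from the topology on teardown */
    status = mod_hdcp_remove_display_topology(hdcp);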

diff --git a/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c 
b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
new file mode 100644
index ..646d909bbc37
--- /dev/null
+++ b/drivers/gpu/drm/amd/display/modules/hdcp/hdcp_psp.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: AMD
+ *
+ */
+
+#define MAX_NUM_DISPLAYS 24
+
+
+#include "hdcp.h"
+
+#include "amdgpu.h"
+#include "hdcp_psp.h"
+
+enum mod_hdcp_status mod_hdcp_remove_display_topology(struct mod_hdcp *hdcp)
+{
+
+   struct psp_context *psp = hdcp->config.psp.handle;
+   struct ta_dtm_shared_memory *dtm_cmd;
+   struct mod_hdcp_display *display = NULL;
+   uint8_t i;
+
+   dtm_cmd = (struct ta_dtm_shared_memory 
*)psp->dtm_context.dtm_shared_buf;
+
+   for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++) {
+   if (hdcp->connection.displays[i].state == 
MOD_HDCP_DISPLAY_ACTIVE_AND_ADDED) {
+
+   memset(dtm_cmd, 0, sizeof(struct ta_dtm_shared_memory));
+
+   display = &hdcp->connection.displays[i];
+
+   dtm_cmd->cmd_id = TA_DTM_COMMAND__TOPOLOGY_UPDATE_V2;
+   
dtm_cmd->dtm_in_message.topology_update_v2.display_handle = display->index;
+   dtm_cmd->dtm_in_message.topology_update_v2.is_active = 
0;
+   dtm_cmd->dtm_status = TA_DTM_STATUS__GENERIC_FAILURE;
+
+   psp_dtm_invoke(psp, dtm_cmd->cmd_id);
+
+   if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS)
+   return MOD_HDCP_STATUS_UPDATE_TOPOLOGY_FAILURE;
+
+   display->state = MOD_HDCP_DISPLAY_ACTIVE;
+   HDCP_TOP_REMOVE_DISPLAY_TRACE(hdcp, display->index);
+   }
+   }
+
+   return MOD_HDCP_STATUS_SUCCESS;
+}
+
+enum mod_hdcp_status mod_hdcp_add_display_topology(struct mod_hdcp *hdcp)
+{
+   struct psp_context *psp = hdcp->config.psp.handle;
+   struct ta_dtm_shared_memory *dtm_cmd;
+   struct mod_hdcp_display *display = NULL;
+   struct mod_hdcp_link *link = &hdcp->connection.link;
+   uint8_t i;
+
+   if (!psp->dtm_context.dtm_initialized) {
+   DRM_ERROR("Failed to add display topology, DTM TA is not 
initialized.");
+   return MOD_HDCP_STATUS_FAILURE;
+   }
+
+   dtm_cmd = (struct ta_dtm_shared_memory 
*)psp->dtm_context.dtm_shared_buf;
+
+   for (i = 0; i < MAX_NUM_OF_DISPLAYS; i++) {
+   if (hdcp->connection.displays[i].state == 
MOD_HDCP_DISPLAY_ACTIVE) {
+   display = &hdcp->connection.displays[i];
+
+   memset(dtm_cmd, 0, sizeof(struct ta_dtm_shared_memory));
+
+   dtm_cmd->cmd_id = TA_DTM_COMMAND__TOPOLOGY_UPDATE_V2;
+   
dtm_cmd->dtm_in_message.topology_update_v2.display_handle = display->index;
+   dtm_cmd->dtm_in_message.topology_update_v2.is_active = 
1;
+   dtm_cmd->dtm_in_message.topology_update_v2.controller = 
display->controller;
+   dtm_cmd->dtm_in_message.topology_update_v2.ddc_line = 
link->ddc_line;
+   dtm_cmd->dtm_in_message.topology_update_v2.dig_be = 
link->dig_be;
+   dtm_cmd->dtm_in_message.topology_update_v2.dig_fe = 
display->dig_fe;
+  

[PATCH 09/20] drm/amdgpu: psp DTM init

2019-08-29 Thread Bhawanpreet Lakha
DTM is the display topology manager. This is needed to communicate with
psp about the display configurations.

This patch adds
-Loading the firmware
-The functions and definitions for communication with the firmware

Signed-off-by: Bhawanpreet Lakha 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 153 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  15 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |   3 +
 drivers/gpu/drm/amd/amdgpu/psp_v10_0.c|   4 +
 4 files changed, 175 insertions(+)
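
For reference, a client drives the DTM TA by filling the shared buffer and
calling psp_dtm_invoke(). A minimal sketch using only names from this patch
and the later display-side patches in this series (error handling trimmed):

    struct ta_dtm_shared_memory *dtm_cmd =
            (struct ta_dtm_shared_memory *)psp->dtm_context.dtm_shared_buf;

    memset(dtm_cmd, 0, sizeof(*dtm_cmd));
    dtm_cmd->cmd_id = TA_DTM_COMMAND__TOPOLOGY_UPDATE_V2;
    dtm_cmd->dtm_status = TA_DTM_STATUS__GENERIC_FAILURE;
    /* ... fill dtm_in_message for the display being updated ... */

    psp_dtm_invoke(psp, dtm_cmd->cmd_id);
    if (dtm_cmd->dtm_status != TA_DTM_STATUS__SUCCESS)
            return -EINVAL; /* the TA rejected the update */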

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index ccce1b506a12..7192e7fba6dc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -942,6 +942,149 @@ static int psp_hdcp_terminate(struct psp_context *psp)
 }
 // HDCP end
 
+// DTM start
+static void psp_prep_dtm_ta_load_cmd_buf(struct psp_gfx_cmd_resp *cmd,
+   uint64_t dtm_ta_mc, uint64_t dtm_mc_shared,
+   uint32_t dtm_ta_size, uint32_t shared_size)
+{
+   cmd->cmd_id = GFX_CMD_ID_LOAD_TA;
+   cmd->cmd.cmd_load_ta.app_phy_addr_lo = lower_32_bits(dtm_ta_mc);
+   cmd->cmd.cmd_load_ta.app_phy_addr_hi = upper_32_bits(dtm_ta_mc);
+   cmd->cmd.cmd_load_ta.app_len = dtm_ta_size;
+
+   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_lo = lower_32_bits(dtm_mc_shared);
+   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_hi = upper_32_bits(dtm_mc_shared);
+   cmd->cmd.cmd_load_ta.cmd_buf_len = shared_size;
+}
+
+static int psp_dtm_init_shared_buf(struct psp_context *psp)
+{
+   int ret;
+
+   /*
+* Allocate 16k memory aligned to 4k from Frame Buffer (local
+* physical) for dtm ta <-> Driver
+*/
+   ret = amdgpu_bo_create_kernel(psp->adev, PSP_DTM_SHARED_MEM_SIZE,
+   PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
+   &psp->dtm_context.dtm_shared_bo,
+   &psp->dtm_context.dtm_shared_mc_addr,
+   &psp->dtm_context.dtm_shared_buf);
+
+   return ret;
+}
+
+static int psp_dtm_load(struct psp_context *psp)
+{
+   int ret;
+   struct psp_gfx_cmd_resp *cmd;
+
+   /*
+* TODO: bypass the loading in sriov for now
+*/
+   if (amdgpu_sriov_vf(psp->adev))
+   return 0;
+
+   cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
+   if (!cmd)
+   return -ENOMEM;
+
+   memset(psp->fw_pri_buf, 0, PSP_1_MEG);
+   memcpy(psp->fw_pri_buf, psp->ta_dtm_start_addr, psp->ta_dtm_ucode_size);
+
+   psp_prep_dtm_ta_load_cmd_buf(cmd, psp->fw_pri_mc_addr,
+   psp->dtm_context.dtm_shared_mc_addr,
+   psp->ta_dtm_ucode_size, PSP_DTM_SHARED_MEM_SIZE);
+
+   ret = psp_cmd_submit_buf(psp, NULL, cmd,
+   psp->fence_buf_mc_addr);
+
+   if (!ret) {
+   printk("LOADEDDE dtm !1");
+   psp->dtm_context.dtm_initialized = 1;
+   psp->dtm_context.session_id = cmd->resp.session_id;
+   }
+
+   kfree(cmd);
+
+   return ret;
+}
+
+static int psp_dtm_initialize(struct psp_context *psp)
+{
+   int ret;
+
+   if (!psp->dtm_context.dtm_initialized) {
+   ret = psp_dtm_init_shared_buf(psp);
+   if (ret)
+   return ret;
+   }
+
+   ret = psp_dtm_load(psp);
+   if (ret)
+   return ret;
+
+   return 0;
+}
+
+static void psp_prep_dtm_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd,
+   uint32_t ta_cmd_id,
+   uint32_t dtm_session_id)
+{
+   cmd->cmd_id = GFX_CMD_ID_INVOKE_CMD;
+   cmd->cmd.cmd_invoke_cmd.session_id = dtm_session_id;
+   cmd->cmd.cmd_invoke_cmd.ta_cmd_id = ta_cmd_id;
+   /* Note: cmd_invoke_cmd.buf is not used for now */
+}
+
+int psp_dtm_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
+{
+   int ret;
+   struct psp_gfx_cmd_resp *cmd;
+
+   /*
+* TODO: bypass the loading in sriov for now
+*/
+   if (amdgpu_sriov_vf(psp->adev))
+   return 0;
+
+   cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
+   if (!cmd)
+   return -ENOMEM;
+
+   psp_prep_dtm_ta_invoke_cmd_buf(cmd, ta_cmd_id,
+   psp->dtm_context.session_id);
+
+   ret = psp_cmd_submit_buf(psp, NULL, cmd,
+   psp->fence_buf_mc_addr);
+
+   kfree(cmd);
+
+   return ret;
+}
+
+static int psp_dtm_terminate(struct psp_context *psp)
+{
+   int ret;
+
+   if (!psp->dtm_context.dtm_initialized)
+   return 0;
+
+   ret = psp_dtm_unload(psp);
+   if (ret)
+   return ret;
+
+   psp->dtm_context.dtm_initialized = 0;
+
+   /* free dtm shared memory */
+   amdgpu_bo_free_kernel(&psp->dtm_context.dtm_shared_bo,
+   &psp->dtm_context.dtm_shared_mc_addr,
+   &psp->dtm_context.dtm_shared_buf);
+
+   

[PATCH 06/20] drm: uevent for connector status change

2019-08-29 Thread Bhawanpreet Lakha
From: Ramalingam C 

DRM API for generating a uevent for a status change of a connector's
property.

This uevent will have the following details related to the status change:

  HOTPLUG=1, CONNECTOR=<id of the connector> and PROPERTY=<id of the property>

Pekka have completed the Weston DRM-backend review in
https://gitlab.freedesktop.org/wayland/weston/merge_requests/48
and the UAPI for HDCP 2.2 looks good.

The userspace is accepted in Weston.

v2:
  Minor fixes at KDoc comments [Daniel]
v3:
  Check the property is really attached with connector [Daniel]
v4:
  Typos and string length suggestions are addressed [Sean]

Signed-off-by: Ramalingam C 
Reviewed-by: Daniel Vetter 
Reviewed-by: Sean Paul 
Acked-by: Pekka Paalanen 
Acked-by: Jani Nikula 
Link: https://patchwork.freedesktop.org/patch/320961/?series=57232&rev=14
---
 drivers/gpu/drm/drm_sysfs.c | 35 +++
 include/drm/drm_sysfs.h |  5 -
 2 files changed, 39 insertions(+), 1 deletion(-)
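
On the receiving end, a compositor can pick these uevents up with libudev.
A minimal, illustrative sketch (not part of the patch; a real client would
integrate the monitor fd into its own event loop):

    #include <libudev.h>
    #include <poll.h>
    #include <stdio.h>

    int main(void)
    {
            struct udev *udev = udev_new();
            struct udev_monitor *mon =
                    udev_monitor_new_from_netlink(udev, "udev");
            struct pollfd pfd;

            udev_monitor_filter_add_match_subsystem_devtype(mon, "drm", NULL);
            udev_monitor_enable_receiving(mon);
            pfd.fd = udev_monitor_get_fd(mon);
            pfd.events = POLLIN;

            while (poll(&pfd, 1, -1) > 0) {
                    struct udev_device *dev = udev_monitor_receive_device(mon);
                    const char *conn, *prop;

                    if (!dev)
                            continue;
                    conn = udev_device_get_property_value(dev, "CONNECTOR");
                    prop = udev_device_get_property_value(dev, "PROPERTY");
                    if (conn && prop)
                            printf("status change: connector %s property %s\n",
                                   conn, prop);
                    udev_device_unref(dev);
            }
            return 0;
    }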

diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c
index 18b1ac442997..9f0ccec44a04 100644
--- a/drivers/gpu/drm/drm_sysfs.c
+++ b/drivers/gpu/drm/drm_sysfs.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include "drm_internal.h"
+#include "drm_crtc_internal.h"
 
 #define to_drm_minor(d) dev_get_drvdata(d)
 #define to_drm_connector(d) dev_get_drvdata(d)
@@ -320,6 +321,9 @@ void drm_sysfs_lease_event(struct drm_device *dev)
  * Send a uevent for the DRM device specified by @dev.  Currently we only
  * set HOTPLUG=1 in the uevent environment, but this could be expanded to
  * deal with other types of events.
+ *
+ * Any new uapi should be using the drm_sysfs_connector_status_event()
+ * for uevents on connector status change.
  */
 void drm_sysfs_hotplug_event(struct drm_device *dev)
 {
@@ -332,6 +336,37 @@ void drm_sysfs_hotplug_event(struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_sysfs_hotplug_event);
 
+/**
+ * drm_sysfs_connector_status_event - generate a DRM uevent for connector
+ * property status change
+ * @connector: connector on which property status changed
+ * @property: connector property whose status changed.
+ *
+ * Send a uevent on the DRM device of @connector.  Currently we set
+ * HOTPLUG=1 and the connector id along with the attached property id
+ * related to the status change.
+ */
+void drm_sysfs_connector_status_event(struct drm_connector *connector,
+ struct drm_property *property)
+{
+   struct drm_device *dev = connector->dev;
+   char hotplug_str[] = "HOTPLUG=1", conn_id[21], prop_id[21];
+   char *envp[4] = { hotplug_str, conn_id, prop_id, NULL };
+
+   WARN_ON(!drm_mode_obj_find_prop_id(&connector->base,
+  property->base.id));
+
+   snprintf(conn_id, ARRAY_SIZE(conn_id),
+"CONNECTOR=%u", connector->base.id);
+   snprintf(prop_id, ARRAY_SIZE(prop_id),
+"PROPERTY=%u", property->base.id);
+
+   DRM_DEBUG("generating connector status event\n");
+
+   kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
+}
+EXPORT_SYMBOL(drm_sysfs_connector_status_event);
+
 static void drm_sysfs_release(struct device *dev)
 {
kfree(dev);
diff --git a/include/drm/drm_sysfs.h b/include/drm/drm_sysfs.h
index 4f311e836cdc..d454ef617b2c 100644
--- a/include/drm/drm_sysfs.h
+++ b/include/drm/drm_sysfs.h
@@ -4,10 +4,13 @@
 
 struct drm_device;
 struct device;
+struct drm_connector;
+struct drm_property;
 
 int drm_class_device_register(struct device *dev);
 void drm_class_device_unregister(struct device *dev);
 
 void drm_sysfs_hotplug_event(struct drm_device *dev);
-
+void drm_sysfs_connector_status_event(struct drm_connector *connector,
+ struct drm_property *property);
 #endif
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 04/20] drm/hdcp: gathering hdcp related code into drm_hdcp.c

2019-08-29 Thread Bhawanpreet Lakha
From: Ramalingam C 

Considering the significant size of hdcp related code in drm, all
hdcp related codes are moved into separate file called drm_hdcp.c.

v2:
  Rebased.
v2:
  Rebased.

Signed-off-by: Ramalingam C 
Suggested-by: Daniel Vetter 
Reviewed-by: Daniel Vetter 
Acked-by: Dave Airlie 
Signed-off-by: Daniel Vetter 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20190507162745.25600-7-ramalinga...@intel.com
---
 drivers/gpu/drm/drm_connector.c | 44 --
 drivers/gpu/drm/drm_hdcp.c  | 47 +
 include/drm/drm_connector.h |  2 --
 include/drm/drm_hdcp.h  |  3 +++
 4 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index 0490c204122d..11fcd25bc640 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -823,13 +823,6 @@ static const struct drm_prop_enum_list 
drm_tv_subconnector_enum_list[] = {
 DRM_ENUM_NAME_FN(drm_get_tv_subconnector_name,
 drm_tv_subconnector_enum_list)
 
-static struct drm_prop_enum_list drm_cp_enum_list[] = {
-   { DRM_MODE_CONTENT_PROTECTION_UNDESIRED, "Undesired" },
-   { DRM_MODE_CONTENT_PROTECTION_DESIRED, "Desired" },
-   { DRM_MODE_CONTENT_PROTECTION_ENABLED, "Enabled" },
-};
-DRM_ENUM_NAME_FN(drm_get_content_protection_name, drm_cp_enum_list)
-
 static const struct drm_prop_enum_list hdmi_colorspaces[] = {
/* For Default case, driver will set the colorspace */
{ DRM_MODE_COLORIMETRY_DEFAULT, "Default" },
@@ -1509,43 +1502,6 @@ int drm_connector_attach_scaling_mode_property(struct 
drm_connector *connector,
 }
 EXPORT_SYMBOL(drm_connector_attach_scaling_mode_property);
 
-/**
- * drm_connector_attach_content_protection_property - attach content protection
- * property
- *
- * @connector: connector to attach CP property on.
- *
- * This is used to add support for content protection on select connectors.
- * Content Protection is intentionally vague to allow for different underlying
- * technologies, however it is most implemented by HDCP.
- *
- * The content protection will be set to &drm_connector_state.content_protection
- *
- * Returns:
- * Zero on success, negative errno on failure.
- */
-int drm_connector_attach_content_protection_property(
-   struct drm_connector *connector)
-{
-   struct drm_device *dev = connector->dev;
-   struct drm_property *prop =
-   dev->mode_config.content_protection_property;
-
-   if (!prop)
-   prop = drm_property_create_enum(dev, 0, "Content Protection",
-   drm_cp_enum_list,
-   ARRAY_SIZE(drm_cp_enum_list));
-   if (!prop)
-   return -ENOMEM;
-
-   drm_object_attach_property(&connector->base, prop,
-  DRM_MODE_CONTENT_PROTECTION_UNDESIRED);
-   dev->mode_config.content_protection_property = prop;
-
-   return 0;
-}
-EXPORT_SYMBOL(drm_connector_attach_content_protection_property);
-
 /**
  * drm_mode_create_aspect_ratio_property - create aspect ratio property
  * @dev: DRM device
diff --git a/drivers/gpu/drm/drm_hdcp.c b/drivers/gpu/drm/drm_hdcp.c
index 5e5409505c31..0da7b3718bad 100644
--- a/drivers/gpu/drm/drm_hdcp.c
+++ b/drivers/gpu/drm/drm_hdcp.c
@@ -17,6 +17,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 struct hdcp_srm {
u32 revoked_ksv_cnt;
@@ -331,3 +334,47 @@ void drm_teardown_hdcp_srm(struct class *drm_class)
kfree(srm_data);
}
 }
+
+static struct drm_prop_enum_list drm_cp_enum_list[] = {
+   { DRM_MODE_CONTENT_PROTECTION_UNDESIRED, "Undesired" },
+   { DRM_MODE_CONTENT_PROTECTION_DESIRED, "Desired" },
+   { DRM_MODE_CONTENT_PROTECTION_ENABLED, "Enabled" },
+};
+DRM_ENUM_NAME_FN(drm_get_content_protection_name, drm_cp_enum_list)
+
+/**
+ * drm_connector_attach_content_protection_property - attach content protection
+ * property
+ *
+ * @connector: connector to attach CP property on.
+ *
+ * This is used to add support for content protection on select connectors.
+ * Content Protection is intentionally vague to allow for different underlying
+ * technologies, however it is most implemented by HDCP.
+ *
+ * The content protection will be set to &drm_connector_state.content_protection
+ *
+ * Returns:
+ * Zero on success, negative errno on failure.
+ */
+int drm_connector_attach_content_protection_property(
+   struct drm_connector *connector)
+{
+   struct drm_device *dev = connector->dev;
+   struct drm_property *prop =
+   dev->mode_config.content_protection_property;
+
+   if (!prop)
+   prop = drm_property_create_enum(dev, 0, "Content Protection",
+   drm_cp_enum_list,
+   ARRAY_SIZE(drm_cp_enum_list));

[PATCH 01/20] drm: move content protection property to mode_config

2019-08-29 Thread Bhawanpreet Lakha
From: Ramalingam C 

Content protection property is created once and stored in
drm_mode_config. And attached to all HDCP capable connectors.

Signed-off-by: Ramalingam C 
Reviewed-by: Daniel Vetter 
Acked-by: Dave Airlie 
Signed-off-by: Daniel Vetter 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20190507162745.25600-2-ramalinga...@intel.com
---
 drivers/gpu/drm/drm_atomic_uapi.c |  4 ++--
 drivers/gpu/drm/drm_connector.c   | 13 +++--
 include/drm/drm_connector.h   |  6 --
 include/drm/drm_mode_config.h |  6 ++
 4 files changed, 15 insertions(+), 14 deletions(-)
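
The call site for drivers is unchanged; the difference is that the enum
property object is now created once and shared. A short, illustrative sketch
of connector setup code:

    /* The first HDCP-capable connector creates the property; later
     * connectors attach the same object from dev->mode_config. */
    int ret = drm_connector_attach_content_protection_property(connector);

    if (ret)
            return ret;

Storing the property in drm_mode_config is also what lets a later patch
reference a single dev->mode_config.content_protection_property when
generating uevents.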

diff --git a/drivers/gpu/drm/drm_atomic_uapi.c 
b/drivers/gpu/drm/drm_atomic_uapi.c
index 428d82662dc4..4131e669785a 100644
--- a/drivers/gpu/drm/drm_atomic_uapi.c
+++ b/drivers/gpu/drm/drm_atomic_uapi.c
@@ -732,7 +732,7 @@ static int drm_atomic_connector_set_property(struct 
drm_connector *connector,
state->content_type = val;
} else if (property == connector->scaling_mode_property) {
state->scaling_mode = val;
-   } else if (property == connector->content_protection_property) {
+   } else if (property == config->content_protection_property) {
if (val == DRM_MODE_CONTENT_PROTECTION_ENABLED) {
DRM_DEBUG_KMS("only drivers can set CP Enabled\n");
return -EINVAL;
@@ -814,7 +814,7 @@ drm_atomic_connector_get_property(struct drm_connector 
*connector,
*val = state->colorspace;
} else if (property == connector->scaling_mode_property) {
*val = state->scaling_mode;
-   } else if (property == connector->content_protection_property) {
+   } else if (property == config->content_protection_property) {
*val = state->content_protection;
} else if (property == config->writeback_fb_id_property) {
/* Writeback framebuffer is one-shot, write and forget */
diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c
index b34c3d38bf15..0490c204122d 100644
--- a/drivers/gpu/drm/drm_connector.c
+++ b/drivers/gpu/drm/drm_connector.c
@@ -1528,18 +1528,19 @@ int drm_connector_attach_content_protection_property(
struct drm_connector *connector)
 {
struct drm_device *dev = connector->dev;
-   struct drm_property *prop;
+   struct drm_property *prop =
+   dev->mode_config.content_protection_property;
 
-   prop = drm_property_create_enum(dev, 0, "Content Protection",
-   drm_cp_enum_list,
-   ARRAY_SIZE(drm_cp_enum_list));
+   if (!prop)
+   prop = drm_property_create_enum(dev, 0, "Content Protection",
+   drm_cp_enum_list,
+   ARRAY_SIZE(drm_cp_enum_list));
if (!prop)
return -ENOMEM;
 
drm_object_attach_property(&connector->base, prop,
   DRM_MODE_CONTENT_PROTECTION_UNDESIRED);
-
-   connector->content_protection_property = prop;
+   dev->mode_config.content_protection_property = prop;
 
return 0;
 }
diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
index 02a131202add..5e41942e5679 100644
--- a/include/drm/drm_connector.h
+++ b/include/drm/drm_connector.h
@@ -1061,12 +1061,6 @@ struct drm_connector {
 */
struct drm_property *vrr_capable_property;
 
-   /**
-* @content_protection_property: DRM ENUM property for content
-* protection. See drm_connector_attach_content_protection_property().
-*/
-   struct drm_property *content_protection_property;
-
/**
 * @colorspace_property: Connector property to set the suitable
 * colorspace supported by the sink.
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 7f60e8eb269a..5764ee3c7453 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -836,6 +836,12 @@ struct drm_mode_config {
 */
struct drm_property *writeback_out_fence_ptr_property;
 
+   /**
+* @content_protection_property: DRM ENUM property for content
+* protection. See drm_connector_attach_content_protection_property().
+*/
+   struct drm_property *content_protection_property;
+
/* dumb ioctl parameters */
uint32_t preferred_depth, prefer_shadow;
 
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 00/20] HDCP 1.4 Content Protection

2019-08-29 Thread Bhawanpreet Lakha
This patch set introduces HDCP 1.4 capability to ASICs starting with
Raven (DCN 1.0).

This only introduces the ability to authenticate and encrypt the link. These
patches by themselves don't constitute a complete and compliant
HDCP content protection solution but are a requirement for such a solution.

NOTE: The 7 patches by Ramalingam have already been merged to drm-misc
but are required to apply the HDCP patches on amd-staging-drm-next

Bhawanpreet Lakha (13):
  drm/amdgpu: psp HDCP init
  drm/amdgpu: psp DTM init
  drm/amd/display: Add HDCP module
  drm/amd/display: add PSP block to verify hdcp steps
  drm/amd/display: Update hdcp display config
  drm/amd/display: Create amdgpu_dm_hdcp
  drm/amd/display: Create dpcd and i2c packing functions
  drm/amd/display: Initialize HDCP work queue
  drm/amd/display: Handle Content protection property changes
  drm/amd/display: handle DP cpirq
  drm/amd/display: Update CP property based on HW query
  drm/amd/display: only enable HDCP for DCN+
  drm/amd/display: Add hdcp to Kconfig

Ramalingam C (7):
  drm: move content protection property to mode_config
  drm: generic fn converting be24 to cpu and vice versa
  drm: revocation check at drm subsystem
  drm/hdcp: gathering hdcp related code into drm_hdcp.c
  drm: Add Content protection type property
  drm: uevent for connector status change
  drm/hdcp: update content protection property with uevent

 Documentation/gpu/drm-kms-helpers.rst |   6 +
 drivers/gpu/drm/Makefile  |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 341 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  32 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |   6 +
 drivers/gpu/drm/amd/amdgpu/psp_v10_0.c|  35 +-
 drivers/gpu/drm/amd/display/Kconfig   |   8 +
 drivers/gpu/drm/amd/display/Makefile  |   7 +
 .../gpu/drm/amd/display/amdgpu_dm/Makefile|   4 +
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 135 +
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |   3 +
 .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.c| 342 +++
 .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.h|  66 +++
 drivers/gpu/drm/amd/display/dc/Makefile   |   4 +
 drivers/gpu/drm/amd/display/dc/core/dc.c  |  10 +
 drivers/gpu/drm/amd/display/dc/core/dc_link.c |  31 +
 drivers/gpu/drm/amd/display/dc/dc.h   |   5 +
 drivers/gpu/drm/amd/display/dc/dc_types.h |   7 +
 drivers/gpu/drm/amd/display/dc/dm_cp_psp.h|  49 ++
 drivers/gpu/drm/amd/display/dc/hdcp/Makefile  |  28 +
 .../gpu/drm/amd/display/dc/hdcp/hdcp_msg.c| 324 +++
 .../gpu/drm/amd/display/dc/inc/core_types.h   |   4 +-
 .../gpu/drm/amd/display/include/hdcp_types.h  |  96 
 .../gpu/drm/amd/display/modules/hdcp/Makefile |  32 ++
 .../gpu/drm/amd/display/modules/hdcp/hdcp.c   | 426 ++
 .../gpu/drm/amd/display/modules/hdcp/hdcp.h   | 442 +++
 .../display/modules/hdcp/hdcp1_execution.c| 531 ++
 .../display/modules/hdcp/hdcp1_transition.c   | 307 ++
 .../drm/amd/display/modules/hdcp/hdcp_ddc.c   | 305 ++
 .../drm/amd/display/modules/hdcp/hdcp_log.c   | 163 ++
 .../drm/amd/display/modules/hdcp/hdcp_log.h   | 139 +
 .../drm/amd/display/modules/hdcp/hdcp_psp.c   | 328 +++
 .../drm/amd/display/modules/hdcp/hdcp_psp.h   | 272 +
 .../drm/amd/display/modules/inc/mod_hdcp.h| 297 ++
 drivers/gpu/drm/drm_atomic_uapi.c |   8 +-
 drivers/gpu/drm/drm_connector.c   | 111 ++--
 drivers/gpu/drm/drm_hdcp.c| 448 +++
 drivers/gpu/drm/drm_internal.h|   4 +
 drivers/gpu/drm/drm_sysfs.c   |  37 ++
 drivers/gpu/drm/i915/intel_hdcp.c |   9 +-
 drivers/misc/mei/hdcp/mei_hdcp.c  |   2 +-
 include/drm/drm_connector.h   |  15 +-
 include/drm/drm_hdcp.h|  38 +-
 include/drm/drm_mode_config.h |  12 +
 include/drm/drm_sysfs.h   |   5 +-
 45 files changed, 5408 insertions(+), 68 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.c
 create mode 100644 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_hdcp.h
 create mode 100644 drivers/gpu/drm/amd/display/dc/dm_cp_psp.h
 create mode 100644 drivers/gpu/drm/amd/display/dc/hdcp/Makefile
 create mode 100644 drivers/gpu/drm/amd/display/dc/hdcp/hdcp_msg.c
 create mode 100644 drivers/gpu/drm/amd/display/include/hdcp_types.h
 create mode 100644 drivers/gpu/drm/amd/display/modules/hdcp/Makefile
 create mode 100644 drivers/gpu/drm/amd/display/modules/hdcp/hdcp.c
 create mode 100644 drivers/gpu/drm/amd/display/modules/hdcp/hdcp.h
 create mode 100644 drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c
 create mode 100644 drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_transition.c
 create mode 100644 drivers/gpu/drm/amd/display/modules/hdcp/hdcp_ddc.c
 create mode 100644 

Re: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Kuehling, Felix
On 2019-08-29 10:08 a.m., Grodzovsky, Andrey wrote:
>
> Agree, the placement of amdgpu_amdkfd_pre/post_reset in 
> amdgpu_device_lock/unlock_adev is a bit weird.
>
amdgpu_device_reset_sriov already calls amdgpu_amdkfd_pre/post_reset 
itself while it has exclusive access to the GPU. It would make sense to 
move the same calls into amdgpu_do_asic_reset for the bare-metal case.

Regards,
   Felix


> Andrey
>
> On 8/29/19 10:06 AM, Koenig, Christian wrote:
>>> Felix advised that the way to stop all KFD activity is simply to NOT
>>> call amdgpu_amdkfd_post_reset, so that's why I added this. Do you mean you
>>> prefer amdgpu_amdkfd_post_reset to be outside of amdgpu_device_unlock_adev?
>> Yes, exactly. It doesn't seem to be related to the unlock operation in
>> the first place, but rather only signals the KFD that the reset is
>> completed.
>>
>> Christian.
>>
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH V3] drm: Add LTTPR defines for DP 1.4a

2019-08-29 Thread Harry Wentland
On 2019-08-28 3:52 p.m., Siqueira, Rodrigo wrote:
> DP 1.4a specification defines Link Training Tunable PHY Repeater (LTTPR)
> which is required to add support for systems with Thunderbolt or other
> repeater devices.
> 
> Changes since V2:
> - Drop the kernel-doc comment
> - Reorder LTTPR according to register offset
> Changes since V1:
> - Adjusts registers names to be aligned with spec and the rest of the
>   file
> - Update spec comment from 1.4 to 1.4a
> 
> Cc: Abdoulaye Berthe 
> Cc: Harry Wentland 
> Cc: Leo Li 
> Cc: Jani Nikula 
> Signed-off-by: Rodrigo Siqueira 
> Signed-off-by: Abdoulaye Berthe 

Reviewed-by: Harry Wentland 

Harry

> ---
>  include/drm/drm_dp_helper.h | 25 +
>  1 file changed, 25 insertions(+)
> 
> diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
> index 8364502f92cf..67f9121d781d 100644
> --- a/include/drm/drm_dp_helper.h
> +++ b/include/drm/drm_dp_helper.h
> @@ -989,6 +989,31 @@
>  #define HDCP_2_2_DP_RXSTATUS_REAUTH_REQ(x)   ((x) & BIT(3))
>  #define HDCP_2_2_DP_RXSTATUS_LINK_FAILED(x)  ((x) & BIT(4))
>  
> +/* Link Training (LT)-tunable Physical Repeaters - DP 1.4a */
> +#define DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV 0xf0000
> +#define DP_MAX_LINK_RATE_PHY_REPEATER   0xf0001
> +#define DP_PHY_REPEATER_CNT 0xf0002
> +#define DP_PHY_REPEATER_MODE0xf0003
> +#define DP_MAX_LANE_COUNT_PHY_REPEATER  0xf0004
> +#define DP_PHY_REPEATER_EXTENDED_WAIT_TIMEOUT   0xf0005
> +#define DP_TRAINING_PATTERN_SET_PHY_REPEATER1   0xf0010
> +#define DP_TRAINING_LANE0_SET_PHY_REPEATER1 0xf0011
> +#define DP_TRAINING_LANE1_SET_PHY_REPEATER1 0xf0012
> +#define DP_TRAINING_LANE2_SET_PHY_REPEATER1 0xf0013
> +#define DP_TRAINING_LANE3_SET_PHY_REPEATER1 0xf0014
> +#define DP_TRAINING_AUX_RD_INTERVAL_PHY_REPEATER1   0xf0020
> +#define DP_TRANSMITTER_CAPABILITY_PHY_REPEATER1 0xf0021
> +#define DP_LANE0_1_STATUS_PHY_REPEATER1 0xf0030
> +#define DP_LANE2_3_STATUS_PHY_REPEATER1 0xf0031
> +#define DP_LANE_ALIGN_STATUS_UPDATED_PHY_REPEATER1  0xf0032
> +#define DP_ADJUST_REQUEST_LANE0_1_PHY_REPEATER1 0xf0033
> +#define DP_ADJUST_REQUEST_LANE2_3_PHY_REPEATER1 0xf0034
> +#define DP_SYMBOL_ERROR_COUNT_LANE0_PHY_REPEATER1   0xf0035
> +#define DP_SYMBOL_ERROR_COUNT_LANE1_PHY_REPEATER1   0xf0037
> +#define DP_SYMBOL_ERROR_COUNT_LANE2_PHY_REPEATER1   0xf0039
> +#define DP_SYMBOL_ERROR_COUNT_LANE3_PHY_REPEATER1   0xf003b
> +#define DP_FEC_STATUS_PHY_REPEATER1 0xf0290
> +
>  /* DP 1.2 Sideband message defines */
>  /* peer device type - DP 1.2a Table 2-92 */
>  #define DP_PEER_DEVICE_NONE  0x0
> 
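
For illustration, a driver would probe these registers over AUX with the
usual DPCD helper; a sketch assuming drm_dp_dpcd_read(), with the 8-byte
capability block layout per the DP 1.4a LTTPR section:

    u8 caps[8];

    if (drm_dp_dpcd_read(aux,
                         DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV,
                         caps, sizeof(caps)) == sizeof(caps)) {
            u8 rev = caps[0];
            u8 repeater_cnt = caps[DP_PHY_REPEATER_CNT -
                    DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV];

            /* decide between transparent and non-transparent LT mode */
    }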



___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH RFC v4 13/16] drm, cgroup: Allow more aggressive memory reclaim

2019-08-29 Thread Kenny Ho
Yes, and I think it has quite a lot of coupling with mm's page and
pressure mechanisms.  My current thought is to just copy the API but
have a separate implementation of "ttm_shrinker" and
"ttm_shrinker_control" or something like that.  I am certainly happy
to listen to additional feedback and suggestions.

Regards,
Kenny
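
For readers following along, the core shrinker interface under discussion is
roughly the following (the standard kernel API of this era; the ttm_* names
are illustrative, not the proposed ttm_shrinker):

    static unsigned long ttm_shrink_count(struct shrinker *s,
                                          struct shrink_control *sc)
    {
            unsigned long nr_evictable = 0;

            /* walk the LRU lists and count what could be freed */
            return nr_evictable;
    }

    static unsigned long ttm_shrink_scan(struct shrinker *s,
                                         struct shrink_control *sc)
    {
            unsigned long freed = 0;

            /* evict up to sc->nr_to_scan objects, return number freed */
            return freed;
    }

    static struct shrinker ttm_shrinker = {
            .count_objects = ttm_shrink_count,
            .scan_objects  = ttm_shrink_scan,
            .seeks         = DEFAULT_SEEKS,
    };

    /* registered once at init time */
    register_shrinker(&ttm_shrinker);

As Christian notes in the quoted discussion, this interface has no notion of
which memory domain (VRAM, GTT, system) is under pressure.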


On Thu, Aug 29, 2019 at 10:12 AM Koenig, Christian
 wrote:
>
> Yeah, that's also a really good idea as well.
>
> The problem with the shrinker API is that it only applies to system memory 
> currently.
>
> So you won't have a distinction which domain you need to evict stuff from.
>
> Regards,
> Christian.
>
> Am 29.08.19 um 16:07 schrieb Kenny Ho:
>
> Thanks for the feedback Christian.  I am still digging into this one.  Daniel 
> suggested leveraging the Shrinker API for the functionality of this commit in 
> RFC v3 but I am still trying to figure out how/if ttm fits with the shrinker 
> (though the idea behind the shrinker API seems fairly straightforward as far 
> as I understand it currently.)
>
> Regards,
> Kenny
>
> On Thu, Aug 29, 2019 at 3:08 AM Koenig, Christian  
> wrote:
>>
>> On 29.08.19 at 08:05, Kenny Ho wrote:
>> > Allow DRM TTM memory manager to register a work_struct, such that, when
>> > a drmcgrp is under memory pressure, memory reclaiming can be triggered
>> > immediately.
>> >
>> > Change-Id: I25ac04e2db9c19ff12652b88ebff18b44b2706d8
>> > Signed-off-by: Kenny Ho 
>> > ---
>> >   drivers/gpu/drm/ttm/ttm_bo.c| 49 +
>> >   include/drm/drm_cgroup.h| 16 +++
>> >   include/drm/ttm/ttm_bo_driver.h |  2 ++
>> >   kernel/cgroup/drm.c | 30 
>> >   4 files changed, 97 insertions(+)
>> >
>> > diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
>> > index d7e3d3128ebb..72efae694b7e 100644
>> > --- a/drivers/gpu/drm/ttm/ttm_bo.c
>> > +++ b/drivers/gpu/drm/ttm/ttm_bo.c
>> > @@ -1590,6 +1590,46 @@ int ttm_bo_evict_mm(struct ttm_bo_device *bdev, 
>> > unsigned mem_type)
>> >   }
>> >   EXPORT_SYMBOL(ttm_bo_evict_mm);
>> >
>> > +static void ttm_bo_reclaim_wq(struct work_struct *work)
>> > +{
>> > + struct ttm_operation_ctx ctx = {
>> > + .interruptible = false,
>> > + .no_wait_gpu = false,
>> > + .flags = TTM_OPT_FLAG_FORCE_ALLOC
>> > + };
>> > + struct ttm_mem_type_manager *man =
>> > + container_of(work, struct ttm_mem_type_manager, reclaim_wq);
>> > + struct ttm_bo_device *bdev = man->bdev;
>> > + struct dma_fence *fence;
>> > + int mem_type;
>> > + int ret;
>> > +
>> > + for (mem_type = 0; mem_type < TTM_NUM_MEM_TYPES; mem_type++)
>> > + if (&bdev->man[mem_type] == man)
>> > + break;
>> > +
>> > + WARN_ON(mem_type >= TTM_NUM_MEM_TYPES);
>> > + if (mem_type >= TTM_NUM_MEM_TYPES)
>> > + return;
>> > +
>> > + if (!drmcg_mem_pressure_scan(bdev, mem_type))
>> > + return;
>> > +
>> > + ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx, NULL);
>> > + if (ret)
>> > + return;
>> > +
>> > + spin_lock(&man->move_lock);
>> > + fence = dma_fence_get(man->move);
>> > + spin_unlock(&man->move_lock);
>> > +
>> > + if (fence) {
>> > + ret = dma_fence_wait(fence, false);
>> > + dma_fence_put(fence);
>> > + }
>>
>> Why do you want to block for the fence here? That is a rather bad idea
>> and would break pipe-lining.
>>
>> Apart from that I don't think we should put that into TTM.
>>
>> Instead drmcg_register_device_mm() should get a function pointer which
>> is called from a work item when the group is under pressure.
>>
>> TTM can then provides the function which can be called, but the actually
>> registration is job of the device and not TTM.
>>
>> Regards,
>> Christian.
>>
>> > +
>> > +}
>> > +
>> >   int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type,
>> >   unsigned long p_size)
>> >   {
>> > @@ -1624,6 +1664,13 @@ int ttm_bo_init_mm(struct ttm_bo_device *bdev, 
>> > unsigned type,
>> >   INIT_LIST_HEAD(&man->lru[i]);
>> >   man->move = NULL;
>> >
>> > + pr_err("drmcg %p type %d\n", bdev->ddev, type);
>> > +
>> > + if (type <= TTM_PL_VRAM) {
>> > + INIT_WORK(&man->reclaim_wq, ttm_bo_reclaim_wq);
>> > + drmcg_register_device_mm(bdev->ddev, type, &man->reclaim_wq);
>> > + }
>> > +
>> >   return 0;
>> >   }
>> >   EXPORT_SYMBOL(ttm_bo_init_mm);
>> > @@ -1701,6 +1748,8 @@ int ttm_bo_device_release(struct ttm_bo_device *bdev)
>> >   man = &bdev->man[i];
>> >   if (man->has_type) {
>> >   man->use_type = false;
>> > + drmcg_unregister_device_mm(bdev->ddev, i);
>> > + cancel_work_sync(&man->reclaim_wq);
>> >   if ((i != TTM_PL_SYSTEM) && ttm_bo_clean_mm(bdev, 
>> > i)) {
>> >   ret = -EBUSY;
>> >  

Re: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Grodzovsky, Andrey

On 8/29/19 3:56 AM, Zhou1, Tao wrote:
>
>> -Original Message-
>> From: amd-gfx  On Behalf Of
>> Andrey Grodzovsky
>> Sent: 2019年8月29日 4:00
>> To: amd-gfx@lists.freedesktop.org
>> Cc: alexdeuc...@gmail.com; ckoenig.leichtzumer...@gmail.com;
>> Grodzovsky, Andrey ; Zhang, Hawking
>> 
>> Subject: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.
>>
>> Problem:
>> Under certain conditions, when some IP bocks take a RAS error, we can get
> [Tao] typo: "dmr/amdgpu" -> "drm/amdgpu", "IP bocks" -> "IP blocks"
>
>> into a situation where a GPU reset is not possible due to issues in RAS in
>> SMU/PSP.
>>
>> Temporary fix until proper solution in PSP/SMU is ready:
>> When uncorrectable error happens the DF will unconditionally broadcast
>> error event packets to all its clients/slave upon receiving fatal error 
>> event and
>> freeze all its outbound queues, err_event_athub interrupt  will be triggered.
>> In such case and we use this interrupt
>> to issue GPU reset. THe GPU reset code is modified for such case to avoid HW
>> reset, only stops schedulers, deatches all in progress and not yet scheduled
>> job's fences, set error code on them and signals.
>> Also reject any new incoming job submissions from user space.
>> All this is done to notify the applications of the problem.
>>
>> Signed-off-by: Andrey Grodzovsky 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98
>> ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  5 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  6 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 30 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 12 +++-
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 +--
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 24 
>>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 ++
>>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +-
>>   10 files changed, 164 insertions(+), 62 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 9da681e..300adb8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -38,6 +38,7 @@
>>   #include "amdgpu_gmc.h"
>>   #include "amdgpu_gem.h"
>>   #include "amdgpu_display.h"
>> +#include "amdgpu_ras.h"
>>
>>   #if defined(HAVE_DRM_FREE_LARGE)
>>   #define kvfree drm_free_large
>> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void
>> *data, struct drm_file *filp)
>>  bool reserved_buffers = false;
>>  int i, r;
>>
>> +if (amdgpu_ras_intr_triggered())
>> +return -EHWPOISON;
>> +
>>  if (!adev->accel_working)
>>  return -EBUSY;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 07a4ba0..3ecee10 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct
>> amdgpu_device *adev, bool trylock)
>>  return true;
>>   }
>>
>> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, bool
>> +skip_kfd)
>>   {
>>  /*unlock kfd: SRIOV would do it separately */
>> -if (!amdgpu_sriov_vf(adev))
>> +if (!amdgpu_sriov_vf(adev) && !skip_kfd)
>>   amdgpu_amdkfd_post_reset(adev);
>>  amdgpu_vf_error_trans_all(adev);
>>  adev->mp1_state = PP_MP1_STATE_NONE;
>> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct
>> amdgpu_device *adev)  }
>>
>>
>> +#define to_drm_sched_job(sched_job) \
>> +container_of((sched_job), struct drm_sched_job,
>> queue_node)
>> +
>> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>> +*sched) {
>> +struct drm_sched_job *s_job;
>> +struct drm_sched_entity *s_entity = NULL;
>> +int i;
>> +
>> +/* Signal all jobs not yet scheduled */
>> +for (i = DRM_SCHED_PRIORITY_MAX - 1; i >=
>> DRM_SCHED_PRIORITY_MIN; i--) {
>> +struct drm_sched_rq *rq = &sched->sched_rq[i];
>> +
>> +if (!rq)
>> +continue;
>> +
>> +spin_lock(&rq->lock);
>> +list_for_each_entry(s_entity, &rq->entities, list) {
>> +while ((s_job =
>> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
>> +struct drm_sched_fence *s_fence = s_job->s_fence;
>> +
>> +dma_fence_signal(&s_fence->scheduled);
>> +dma_fence_set_error(&s_fence->finished, -
>> EHWPOISON);
>> +dma_fence_signal(&s_fence->finished);
>> +}
>> +}
>> +spin_unlock(&rq->lock);
>> +}
>> +
>> +/* Signal all jobs already scheduled to HW */
>> +list_for_each_entry(s_job, 

Re: [PATCH RFC v4 13/16] drm, cgroup: Allow more aggressive memory reclaim

2019-08-29 Thread Koenig, Christian
Yeah, that's also a really good idea as well.

The problem with the shrinker API is that it only applies to system memory 
currently.

So you won't have a distinction which domain you need to evict stuff from.

Regards,
Christian.

On 29.08.19 at 16:07, Kenny Ho wrote:
Thanks for the feedback Christian.  I am still digging into this one.  Daniel 
suggested leveraging the Shrinker API for the functionality of this commit in 
RFC v3 but I am still trying to figure out how/if ttm fits with the shrinker 
(though the idea behind the shrinker API seems fairly straightforward as far as 
I understand it currently.)

Regards,
Kenny

On Thu, Aug 29, 2019 at 3:08 AM Koenig, Christian 
<christian.koe...@amd.com> wrote:
On 29.08.19 at 08:05, Kenny Ho wrote:
> Allow DRM TTM memory manager to register a work_struct, such that, when
> a drmcgrp is under memory pressure, memory reclaiming can be triggered
> immediately.
>
> Change-Id: I25ac04e2db9c19ff12652b88ebff18b44b2706d8
> Signed-off-by: Kenny Ho <kenny...@amd.com>
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c| 49 +
>   include/drm/drm_cgroup.h| 16 +++
>   include/drm/ttm/ttm_bo_driver.h |  2 ++
>   kernel/cgroup/drm.c | 30 
>   4 files changed, 97 insertions(+)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index d7e3d3128ebb..72efae694b7e 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -1590,6 +1590,46 @@ int ttm_bo_evict_mm(struct ttm_bo_device *bdev, 
> unsigned mem_type)
>   }
>   EXPORT_SYMBOL(ttm_bo_evict_mm);
>
> +static void ttm_bo_reclaim_wq(struct work_struct *work)
> +{
> + struct ttm_operation_ctx ctx = {
> + .interruptible = false,
> + .no_wait_gpu = false,
> + .flags = TTM_OPT_FLAG_FORCE_ALLOC
> + };
> + struct ttm_mem_type_manager *man =
> + container_of(work, struct ttm_mem_type_manager, reclaim_wq);
> + struct ttm_bo_device *bdev = man->bdev;
> + struct dma_fence *fence;
> + int mem_type;
> + int ret;
> +
> + for (mem_type = 0; mem_type < TTM_NUM_MEM_TYPES; mem_type++)
> + if (&bdev->man[mem_type] == man)
> + break;
> +
> + WARN_ON(mem_type >= TTM_NUM_MEM_TYPES);
> + if (mem_type >= TTM_NUM_MEM_TYPES)
> + return;
> +
> + if (!drmcg_mem_pressure_scan(bdev, mem_type))
> + return;
> +
> + ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx, NULL);
> + if (ret)
> + return;
> +
> + spin_lock(&man->move_lock);
> + fence = dma_fence_get(man->move);
> + spin_unlock(&man->move_lock);
> +
> + if (fence) {
> + ret = dma_fence_wait(fence, false);
> + dma_fence_put(fence);
> + }

Why do you want to block for the fence here? That is a rather bad idea
and would break pipe-lining.

Apart from that I don't think we should put that into TTM.

Instead drmcg_register_device_mm() should get a function pointer which
is called from a work item when the group is under pressure.

TTM can then provides the function which can be called, but the actually
registration is job of the device and not TTM.

Regards,
Christian.

> +
> +}
> +
>   int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type,
>   unsigned long p_size)
>   {
> @@ -1624,6 +1664,13 @@ int ttm_bo_init_mm(struct ttm_bo_device *bdev, 
> unsigned type,
>   INIT_LIST_HEAD(&man->lru[i]);
>   man->move = NULL;
>
> + pr_err("drmcg %p type %d\n", bdev->ddev, type);
> +
> + if (type <= TTM_PL_VRAM) {
> + INIT_WORK(&man->reclaim_wq, ttm_bo_reclaim_wq);
> + drmcg_register_device_mm(bdev->ddev, type, &man->reclaim_wq);
> + }
> +
>   return 0;
>   }
>   EXPORT_SYMBOL(ttm_bo_init_mm);
> @@ -1701,6 +1748,8 @@ int ttm_bo_device_release(struct ttm_bo_device *bdev)
>   man = &bdev->man[i];
>   if (man->has_type) {
>   man->use_type = false;
> + drmcg_unregister_device_mm(bdev->ddev, i);
> + cancel_work_sync(&man->reclaim_wq);
>   if ((i != TTM_PL_SYSTEM) && ttm_bo_clean_mm(bdev, i)) {
>   ret = -EBUSY;
>   pr_err("DRM memory manager type %d is not 
> clean\n",
> diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
> index c11df388fdf2..6d9707e1eb72 100644
> --- a/include/drm/drm_cgroup.h
> +++ b/include/drm/drm_cgroup.h
> @@ -5,6 +5,7 @@
>   #define __DRM_CGROUP_H__
>
>   #include 
> +#include 
>   #include 
>   #include 
>
> @@ -25,12 +26,17 @@ struct drmcg_props {
>   s64 mem_bw_avg_bytes_per_us_default;
>
>   s64 mem_highs_default[TTM_PL_PRIV+1];
> +
> + struct work_struct  *mem_reclaim_wq[TTM_PL_PRIV];
>   };
>
>   #ifdef CONFIG_CGROUP_DRM
>
>   void drmcg_device_update(struct drm_device *device);
>   

Re: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Grodzovsky, Andrey
Agree, the placement of amdgpu_amdkfd_pre/post_reset in 
amdgpu_device_lock/unlock_adev is a bit weird.

Andrey

On 8/29/19 10:06 AM, Koenig, Christian wrote:

Felix advised that the way to stop all KFD activity is simply to NOT
call amdgpu_amdkfd_post_reset, so that's why I added this. Do you mean you
prefer amdgpu_amdkfd_post_reset to be outside of amdgpu_device_unlock_adev?


Yes, exactly. It doesn't seem to be related to the unlock operation in
the first place, but rather only signals the KFD that the reset is
completed.

Christian.


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH RFC v4 13/16] drm, cgroup: Allow more aggressive memory reclaim

2019-08-29 Thread Kenny Ho
Thanks for the feedback Christian.  I am still digging into this one.
Daniel suggested leveraging the Shrinker API for the functionality of this
commit in RFC v3 but I am still trying to figure out how/if ttm fits with
the shrinker (though the idea behind the shrinker API seems fairly
straightforward as far as I understand it currently.)

Regards,
Kenny

On Thu, Aug 29, 2019 at 3:08 AM Koenig, Christian 
wrote:

> On 29.08.19 at 08:05, Kenny Ho wrote:
> > Allow DRM TTM memory manager to register a work_struct, such that, when
> > a drmcgrp is under memory pressure, memory reclaiming can be triggered
> > immediately.
> >
> > Change-Id: I25ac04e2db9c19ff12652b88ebff18b44b2706d8
> > Signed-off-by: Kenny Ho 
> > ---
> >   drivers/gpu/drm/ttm/ttm_bo.c| 49 +
> >   include/drm/drm_cgroup.h| 16 +++
> >   include/drm/ttm/ttm_bo_driver.h |  2 ++
> >   kernel/cgroup/drm.c | 30 
> >   4 files changed, 97 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> > index d7e3d3128ebb..72efae694b7e 100644
> > --- a/drivers/gpu/drm/ttm/ttm_bo.c
> > +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> > @@ -1590,6 +1590,46 @@ int ttm_bo_evict_mm(struct ttm_bo_device *bdev,
> unsigned mem_type)
> >   }
> >   EXPORT_SYMBOL(ttm_bo_evict_mm);
> >
> > +static void ttm_bo_reclaim_wq(struct work_struct *work)
> > +{
> > + struct ttm_operation_ctx ctx = {
> > + .interruptible = false,
> > + .no_wait_gpu = false,
> > + .flags = TTM_OPT_FLAG_FORCE_ALLOC
> > + };
> > + struct ttm_mem_type_manager *man =
> > + container_of(work, struct ttm_mem_type_manager, reclaim_wq);
> > + struct ttm_bo_device *bdev = man->bdev;
> > + struct dma_fence *fence;
> > + int mem_type;
> > + int ret;
> > +
> > + for (mem_type = 0; mem_type < TTM_NUM_MEM_TYPES; mem_type++)
> > + if (&bdev->man[mem_type] == man)
> > + break;
> > +
> > + WARN_ON(mem_type >= TTM_NUM_MEM_TYPES);
> > + if (mem_type >= TTM_NUM_MEM_TYPES)
> > + return;
> > +
> > + if (!drmcg_mem_pressure_scan(bdev, mem_type))
> > + return;
> > +
> > + ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx, NULL);
> > + if (ret)
> > + return;
> > +
> > + spin_lock(&man->move_lock);
> > + fence = dma_fence_get(man->move);
> > + spin_unlock(&man->move_lock);
> > +
> > + if (fence) {
> > + ret = dma_fence_wait(fence, false);
> > + dma_fence_put(fence);
> > + }
>
> Why do you want to block for the fence here? That is a rather bad idea
> and would break pipe-lining.
>
> Apart from that I don't think we should put that into TTM.
>
> Instead drmcg_register_device_mm() should get a function pointer which
> is called from a work item when the group is under pressure.
>
> TTM can then provides the function which can be called, but the actually
> registration is job of the device and not TTM.
>
> Regards,
> Christian.
>
> > +
> > +}
> > +
> >   int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type,
> >   unsigned long p_size)
> >   {
> > @@ -1624,6 +1664,13 @@ int ttm_bo_init_mm(struct ttm_bo_device *bdev,
> unsigned type,
> >   INIT_LIST_HEAD(&man->lru[i]);
> >   man->move = NULL;
> >
> > + pr_err("drmcg %p type %d\n", bdev->ddev, type);
> > +
> > + if (type <= TTM_PL_VRAM) {
> > + INIT_WORK(&man->reclaim_wq, ttm_bo_reclaim_wq);
> > + drmcg_register_device_mm(bdev->ddev, type,
> > &man->reclaim_wq);
> > + }
> > +
> >   return 0;
> >   }
> >   EXPORT_SYMBOL(ttm_bo_init_mm);
> > @@ -1701,6 +1748,8 @@ int ttm_bo_device_release(struct ttm_bo_device
> *bdev)
> >   man = &bdev->man[i];
> >   if (man->has_type) {
> >   man->use_type = false;
> > + drmcg_unregister_device_mm(bdev->ddev, i);
> > + cancel_work_sync(&man->reclaim_wq);
> >   if ((i != TTM_PL_SYSTEM) && ttm_bo_clean_mm(bdev,
> i)) {
> >   ret = -EBUSY;
> >   pr_err("DRM memory manager type %d is not
> clean\n",
> > diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
> > index c11df388fdf2..6d9707e1eb72 100644
> > --- a/include/drm/drm_cgroup.h
> > +++ b/include/drm/drm_cgroup.h
> > @@ -5,6 +5,7 @@
> >   #define __DRM_CGROUP_H__
> >
> >   #include 
> > +#include 
> >   #include 
> >   #include 
> >
> > @@ -25,12 +26,17 @@ struct drmcg_props {
> >   s64 mem_bw_avg_bytes_per_us_default;
> >
> >   s64 mem_highs_default[TTM_PL_PRIV+1];
> > +
> > + struct work_struct  *mem_reclaim_wq[TTM_PL_PRIV];
> >   };
> >
> >   #ifdef CONFIG_CGROUP_DRM
> >
> >   void drmcg_device_update(struct drm_device *device);
> >   void drmcg_device_early_init(struct drm_device *device);
> > +void 

Re: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Koenig, Christian
On 29.08.19 at 16:03, Grodzovsky, Andrey wrote:
> On 8/29/19 3:30 AM, Christian König wrote:
>> On 28.08.19 at 22:00, Andrey Grodzovsky wrote:
>>> Problem:
>>> Under certain conditions, when some IP bocks take a RAS error,
>>> we can get into a situation where a GPU reset is not possible
>>> due to issues in RAS in SMU/PSP.
>>>
>>> Temporary fix until proper solution in PSP/SMU is ready:
>>> When uncorrectable error happens the DF will unconditionally
>>> broadcast error event packets to all its clients/slave upon
>>> receiving fatal error event and freeze all its outbound queues,
>>> err_event_athub interrupt  will be triggered.
>>> In such case and we use this interrupt
>>> to issue GPU reset. THe GPU reset code is modified for such case to
>>> avoid HW
>>> reset, only stops schedulers, deatches all in progress and not yet
>>> scheduled
>>> job's fences, set error code on them and signals.
>>> Also reject any new incoming job submissions from user space.
>>> All this is done to notify the applications of the problem.
>>>
>>> Signed-off-by: Andrey Grodzovsky 
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98
>>> ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 30 +++--
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 12 +++-
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 +--
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 24 
>>>    drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 ++
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +-
>>>    10 files changed, 164 insertions(+), 62 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 9da681e..300adb8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -38,6 +38,7 @@
>>>    #include "amdgpu_gmc.h"
>>>    #include "amdgpu_gem.h"
>>>    #include "amdgpu_display.h"
>>> +#include "amdgpu_ras.h"
>>>      #if defined(HAVE_DRM_FREE_LARGE)
>>>    #define kvfree drm_free_large
>>> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev,
>>> void *data, struct drm_file *filp)
>>>    bool reserved_buffers = false;
>>>    int i, r;
>>>    +    if (amdgpu_ras_intr_triggered())
>>> +    return -EHWPOISON;
>>> +
>>>    if (!adev->accel_working)
>>>    return -EBUSY;
>>>    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 07a4ba0..3ecee10 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct
>>> amdgpu_device *adev, bool trylock)
>>>    return true;
>>>    }
>>>    -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>>> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev,
>>> bool skip_kfd)
>>>    {
>>>    /*unlock kfd: SRIOV would do it separately */
>>> -    if (!amdgpu_sriov_vf(adev))
>>> +    if (!amdgpu_sriov_vf(adev) && !skip_kfd)
>>>    amdgpu_amdkfd_post_reset(adev);
>> It's most likely better to completely remove the call to
>> amdgpu_amdkfd_post_reset() here.
>
> Felix advised that the way to stop all KFD activity is simply to NOT
> call amdgpu_amdkfd_post_reset, so that's why I added this. Do you mean you
> prefer amdgpu_amdkfd_post_reset to be outside of amdgpu_device_unlock_adev?

Yes, exactly. It doesn't seem to be related to the unlock operation in 
the first place, but rather only signals the KFD that the reset is 
completed.

Christian.

>
>
>>>    amdgpu_vf_error_trans_all(adev);
>>>    adev->mp1_state = PP_MP1_STATE_NONE;
>>> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct
>>> amdgpu_device *adev)
>>>    }
>>>      +#define to_drm_sched_job(sched_job)    \
>>> +    container_of((sched_job), struct drm_sched_job, queue_node)
>>> +
>>> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler
>>> *sched)
>>> +{
>>> +    struct drm_sched_job *s_job;
>>> +    struct drm_sched_entity *s_entity = NULL;
>>> +    int i;
>>> +
>>> +    /* Signal all jobs not yet scheduled */
>>> +    for (i = DRM_SCHED_PRIORITY_MAX - 1; i >=
>>> DRM_SCHED_PRIORITY_MIN; i--) {
>>> +    struct drm_sched_rq *rq = &sched->sched_rq[i];
>>> +
>>> +    if (!rq)
>>> +    continue;
>>> +
>>> +    spin_lock(&rq->lock);
>>> +    list_for_each_entry(s_entity, &rq->entities, list) {
>>> +    while ((s_job =
>>> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
>>> +    struct drm_sched_fence *s_fence = s_job->s_fence;
>>> +
>>> +    dma_fence_signal(&s_fence->scheduled);
>>> +    dma_fence_set_error(&s_fence->finished, -EHWPOISON);

Re: [PATCH 1/2] drm/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Grodzovsky, Andrey

On 8/29/19 3:30 AM, Christian König wrote:
> On 28.08.19 at 22:00, Andrey Grodzovsky wrote:
>> Problem:
>> Under certain conditions, when some IP blocks take a RAS error,
>> we can get into a situation where a GPU reset is not possible
>> due to issues in RAS in SMU/PSP.
>>
>> Temporary fix until a proper solution in PSP/SMU is ready:
>> When an uncorrectable error happens, the DF will unconditionally
>> broadcast error event packets to all its clients/slaves upon
>> receiving the fatal error event, freeze all its outbound queues,
>> and trigger the err_event_athub interrupt.
>> In such a case we use this interrupt to issue a GPU reset.
>> The GPU reset code is modified for this case to avoid a HW reset:
>> it only stops the schedulers, detaches all in-progress and not yet
>> scheduled job fences, sets an error code on them and signals them.
>> Also reject any new incoming job submissions from user space.
>> All this is done to notify applications of the problem.
>>
>> Signed-off-by: Andrey Grodzovsky 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 
>> ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 30 +++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 12 +++-
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 +--
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 24 
>>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 ++
>>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +-
>>   10 files changed, 164 insertions(+), 62 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 9da681e..300adb8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -38,6 +38,7 @@
>>   #include "amdgpu_gmc.h"
>>   #include "amdgpu_gem.h"
>>   #include "amdgpu_display.h"
>> +#include "amdgpu_ras.h"
>>     #if defined(HAVE_DRM_FREE_LARGE)
>>   #define kvfree drm_free_large
>> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, 
>> void *data, struct drm_file *filp)
>>   bool reserved_buffers = false;
>>   int i, r;
>>   +    if (amdgpu_ras_intr_triggered())
>> +    return -EHWPOISON;
>> +
>>   if (!adev->accel_working)
>>   return -EBUSY;
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 07a4ba0..3ecee10 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct 
>> amdgpu_device *adev, bool trylock)
>>   return true;
>>   }
>>   -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, 
>> bool skip_kfd)
>>   {
>>   /*unlock kfd: SRIOV would do it separately */
>> -    if (!amdgpu_sriov_vf(adev))
>> +    if (!amdgpu_sriov_vf(adev) && !skip_kfd)
>>   amdgpu_amdkfd_post_reset(adev);
>
> It's most likely better to completely remove the call to 
> amdgpu_amdkfd_post_reset() here.


Felix advised that the way to stop all KFD activity is simply to NOT 
call amdgpu_amdkfd_post_reset, so that's why I added this. Do you mean you 
prefer amdgpu_amdkfd_post_reset to be outside of amdgpu_device_unlock_adev?


>
>>   amdgpu_vf_error_trans_all(adev);
>>   adev->mp1_state = PP_MP1_STATE_NONE;
>> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct 
>> amdgpu_device *adev)
>>   }
>>     +#define to_drm_sched_job(sched_job)    \
>> +    container_of((sched_job), struct drm_sched_job, queue_node)
>> +
>> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler 
>> *sched)
>> +{
>> +    struct drm_sched_job *s_job;
>> +    struct drm_sched_entity *s_entity = NULL;
>> +    int i;
>> +
>> +    /* Signal all jobs not yet scheduled */
>> +    for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= 
>> DRM_SCHED_PRIORITY_MIN; i--) {
>> +    struct drm_sched_rq *rq = &sched->sched_rq[i];
>> +
>> +    if (!rq)
>> +    continue;
>> +
>> +    spin_lock(&rq->lock);
>> +    list_for_each_entry(s_entity, &rq->entities, list) {
>> +    while ((s_job = 
>> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
>> +    struct drm_sched_fence *s_fence = s_job->s_fence;
>> +
>> +    dma_fence_signal(&s_fence->scheduled);
>> +    dma_fence_set_error(&s_fence->finished, -EHWPOISON);
>> +    dma_fence_signal(_fence->finished);
>> +    }
>> +    }
>> +    spin_unlock(>lock);
>> +    }
>> +
>> +    /* Signal all jobs already scheduled to HW */
>> +    list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
>> +    struct drm_sched_fence *s_fence = s_job->s_fence;
>> +
>> +    

[PATCH 6/7] drm/amdgpu: add ras_late_init callback function for nbio v7_4 (v3)

2019-08-29 Thread Hawking Zhang
The ras_late_init callback function will be used to do common ras
init in the late init phase.

v2: call ras_late_fini to do cleanup when enabling the interrupt fails

v3: rename sysfs/debugfs node name to pcie_bif_xxx

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 45 
 2 files changed, 47 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
index a04c5ea..51078da6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_nbio.h
@@ -81,12 +81,14 @@ struct amdgpu_nbio_funcs {
void (*handle_ras_err_event_athub_intr_no_bifring)(struct amdgpu_device 
*adev);
int (*init_ras_controller_interrupt)(struct amdgpu_device *adev);
int (*init_ras_err_event_athub_interrupt)(struct amdgpu_device *adev);
+   int (*ras_late_init)(struct amdgpu_device *adev);
 };
 
 struct amdgpu_nbio {
const struct nbio_hdp_flush_reg *hdp_flush_reg;
struct amdgpu_irq_src ras_controller_irq;
struct amdgpu_irq_src ras_err_event_athub_irq;
+   struct ras_common_if *ras_if;
const struct amdgpu_nbio_funcs *funcs;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index faf9300..5e784bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -23,6 +23,7 @@
 #include "amdgpu.h"
 #include "amdgpu_atombios.h"
 #include "nbio_v7_4.h"
+#include "amdgpu_ras.h"
 
 #include "nbio/nbio_7_4_offset.h"
 #include "nbio/nbio_7_4_sh_mask.h"
@@ -468,6 +469,49 @@ static int nbio_v7_4_init_ras_err_event_athub_interrupt 
(struct amdgpu_device *a
return 0;
 }
 
+static int nbio_v7_4_ras_late_init(struct amdgpu_device *adev)
+{
+   int r;
+   struct ras_ih_if ih_info = {
+   .cb = NULL,
+   };
+   struct ras_fs_if fs_info = {
+   .sysfs_name = "pcie_bif_err_count",
+   .debugfs_name = "pcie_bif_err_inject",
+   };
+
+   if (!adev->nbio.ras_if) {
+   adev->nbio.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
+   if (!adev->nbio.ras_if)
+   return -ENOMEM;
+   adev->nbio.ras_if->block = AMDGPU_RAS_BLOCK__PCIE_BIF;
+   adev->nbio.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->nbio.ras_if->sub_block_index = 0;
+   strcpy(adev->nbio.ras_if->name, "pcie_bif");
+   }
+   ih_info.head = fs_info.head = *adev->nbio.ras_if;
+   r = amdgpu_ras_late_init(adev, adev->nbio.ras_if,
+    &fs_info, &ih_info);
+   if (r)
+   goto free;
+
+   if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+   r = amdgpu_irq_get(adev, &adev->nbio.ras_controller_irq, 0);
+   if (r)
+   goto late_fini;
+   r = amdgpu_irq_get(adev, &adev->nbio.ras_err_event_athub_irq, 
0);
+   if (r)
+   goto late_fini;
+   }
+
+   return 0;
+late_fini:
+   amdgpu_ras_late_fini(adev, adev->nbio.ras_if, &ih_info);
+free:
+   kfree(adev->nbio.ras_if);
+   return r;
+}
+
 const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
.get_hdp_flush_req_offset = nbio_v7_4_get_hdp_flush_req_offset,
.get_hdp_flush_done_offset = nbio_v7_4_get_hdp_flush_done_offset,
@@ -493,4 +537,5 @@ const struct amdgpu_nbio_funcs nbio_v7_4_funcs = {
.handle_ras_err_event_athub_intr_no_bifring = 
nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring,
.init_ras_controller_interrupt = 
nbio_v7_4_init_ras_controller_interrupt,
.init_ras_err_event_athub_interrupt = 
nbio_v7_4_init_ras_err_event_athub_interrupt,
+   .ras_late_init = nbio_v7_4_ras_late_init,
 };
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 7/7] drm/amdgpu: switch to amdgpu_ras_late_init for nbio v7_4 (v2)

2019-08-29 Thread Hawking Zhang
call the helper function in the late init phase to handle ras init
for the nbio ip block

v2: init local var r to 0 in case the function returns failure
on ASICs that don't have a ras_late_init implementation

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/soc15.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c 
b/drivers/gpu/drm/amd/amdgpu/soc15.c
index e791ac3..c6ff225c 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1208,11 +1208,15 @@ static int soc15_common_early_init(void *handle)
 static int soc15_common_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+   int r = 0;
 
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_get_irq(adev);
 
-   return 0;
+   if (adev->nbio.funcs->ras_late_init)
+   r = adev->nbio.funcs->ras_late_init(adev);
+
+   return r;
 }
 
 static int soc15_common_sw_init(void *handle)
@@ -1289,6 +1293,13 @@ static int soc15_common_hw_fini(void *handle)
if (amdgpu_sriov_vf(adev))
xgpu_ai_mailbox_put_irq(adev);
 
+   if (amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
+   if (adev->nbio.funcs->init_ras_controller_interrupt)
+   amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
+   if (adev->nbio.funcs->init_ras_err_event_athub_interrupt)
+   amdgpu_irq_put(adev, 
&adev->nbio.ras_err_event_athub_irq, 0);
+   }
+
return 0;
 }
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 5/7] drm/amdgpu: add mmhub ras_late_init callback function (v2)

2019-08-29 Thread Hawking Zhang
The function will be called in the late init phase to do mmhub
ras init.

v2: check ras_late_init function pointer before invoking the
function

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h |  1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 26 --
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c   | 28 
 3 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
index 2d75ecf..df04c71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h
@@ -23,6 +23,7 @@
 
 struct amdgpu_mmhub_funcs {
void (*ras_init)(struct amdgpu_device *adev);
+   int (*ras_late_init)(struct amdgpu_device *adev);
void (*query_ras_error_count)(struct amdgpu_device *adev,
void *ras_error_status);
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 8a7a56a..70a05e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -762,7 +762,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)
 {
int r;
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-   struct ras_ih_if mmhub_ih_info;
struct ras_fs_if umc_fs_info = {
.sysfs_name = "umc_err_count",
.debugfs_name = "umc_err_inject",
@@ -770,10 +769,6 @@ static int gmc_v9_0_ecc_late_init(void *handle)
struct ras_ih_if umc_ih_info = {
.cb = gmc_v9_0_process_ras_data_cb,
};
-   struct ras_fs_if mmhub_fs_info = {
-   .sysfs_name = "mmhub_err_count",
-   .debugfs_name = "mmhub_err_inject",
-   };
 
if (!adev->gmc.umc_ras_if) {
adev->gmc.umc_ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
@@ -797,29 +792,16 @@ static int gmc_v9_0_ecc_late_init(void *handle)
goto umc_late_fini;
}
 
-   if (!adev->gmc.mmhub_ras_if) {
-   adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
-   if (!adev->gmc.mmhub_ras_if)
-   return -ENOMEM;
-   adev->gmc.mmhub_ras_if->block = AMDGPU_RAS_BLOCK__MMHUB;
-   adev->gmc.mmhub_ras_if->type = 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
-   adev->gmc.mmhub_ras_if->sub_block_index = 0;
-   strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
+   if (adev->mmhub_funcs->ras_late_init) {
+   r = adev->mmhub_funcs->ras_late_init(adev);
+   if (r)
+   return r;
}
-   mmhub_ih_info.head = mmhub_fs_info.head = *adev->gmc.mmhub_ras_if;
-   r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
-    &mmhub_fs_info, &mmhub_ih_info);
-   if (r)
-   goto mmhub_late_fini;
-
return 0;
-mmhub_late_fini:
-   amdgpu_ras_late_fini(adev, adev->gmc.mmhub_ras_if, &mmhub_ih_info);
 umc_late_fini:
amdgpu_ras_late_fini(adev, adev->gmc.umc_ras_if, &umc_ih_info);
 free:
kfree(adev->gmc.umc_ras_if);
-   kfree(adev->gmc.mmhub_ras_if);
return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c 
b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 04cd4b6..9f7d5d1 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -31,6 +31,7 @@
 #include "vega10_enum.h"
 
 #include "soc15_common.h"
+#include "amdgpu_ras.h"
 
 #define mmDAGB0_CNTL_MISC2_RV 0x008f
 #define mmDAGB0_CNTL_MISC2_RV_BASE_IDX 0
@@ -615,6 +616,33 @@ static void mmhub_v1_0_query_ras_error_count(struct 
amdgpu_device *adev,
}
 }
 
+static int mmhub_v1_0_ras_late_init(struct amdgpu_device *adev)
+{
+   int r;
+   struct ras_ih_if mmhub_ih_info;
+   struct ras_fs_if mmhub_fs_info = {
+   .sysfs_name = "mmhub_err_count",
+   .debugfs_name = "mmhub_err_inject",
+   };
+
+   if (!adev->gmc.mmhub_ras_if) {
+   adev->gmc.mmhub_ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
+   if (!adev->gmc.mmhub_ras_if)
+   return -ENOMEM;
+   adev->gmc.mmhub_ras_if->block = AMDGPU_RAS_BLOCK__MMHUB;
+   adev->gmc.mmhub_ras_if->type = 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->gmc.mmhub_ras_if->sub_block_index = 0;
+   strcpy(adev->gmc.mmhub_ras_if->name, "mmhub");
+   }
+   mmhub_ih_info.head = mmhub_fs_info.head = *adev->gmc.mmhub_ras_if;
+   r = amdgpu_ras_late_init(adev, adev->gmc.mmhub_ras_if,
+    &mmhub_fs_info, &mmhub_ih_info);
+   if (r)
+   kfree(adev->gmc.mmhub_ras_if);
+   return r;
+}
+
 const struct amdgpu_mmhub_funcs mmhub_v1_0_funcs = {
+   .ras_late_init = mmhub_v1_0_ras_late_init,

[PATCH 2/7] drm/amdgpu: switch to amdgpu_ras_late_init for gfx v9 block (v2)

2019-08-29 Thread Hawking Zhang
call the helper function in the late init phase to handle ras init
for the gfx ip block

v2: call ras_late_fini to do cleanup when enabling the interrupt fails

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 92 ---
 1 file changed, 21 insertions(+), 71 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 90b900f..5b18642 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4389,7 +4389,6 @@ static int gfx_v9_0_process_ras_data_cb(struct 
amdgpu_device *adev,
 static int gfx_v9_0_ecc_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-   struct ras_common_if **ras_if = >gfx.ras_if;
struct ras_ih_if ih_info = {
.cb = gfx_v9_0_process_ras_data_cb,
};
@@ -4397,18 +4396,18 @@ static int gfx_v9_0_ecc_late_init(void *handle)
.sysfs_name = "gfx_err_count",
.debugfs_name = "gfx_err_inject",
};
-   struct ras_common_if ras_block = {
-   .block = AMDGPU_RAS_BLOCK__GFX,
-   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-   .sub_block_index = 0,
-   .name = "gfx",
-   };
int r;
 
-   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
-   amdgpu_ras_feature_enable_on_boot(adev, &ras_block, 0);
-   return 0;
+   if (!adev->gfx.ras_if) {
+   adev->gfx.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
+   if (!adev->gfx.ras_if)
+   return -ENOMEM;
+   adev->gfx.ras_if->block = AMDGPU_RAS_BLOCK__GFX;
+   adev->gfx.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->gfx.ras_if->sub_block_index = 0;
+   strcpy(adev->gfx.ras_if->name, "gfx");
}
+   fs_info.head = ih_info.head = *adev->gfx.ras_if;
 
r = gfx_v9_0_do_edc_gds_workarounds(adev);
if (r)
@@ -4419,71 +4418,22 @@ static int gfx_v9_0_ecc_late_init(void *handle)
if (r)
return r;
 
-   /* handle resume path. */
-   if (*ras_if) {
-   /* resend ras TA enable cmd during resume.
-* prepare to handle failure.
-*/
-   ih_info.head = **ras_if;
-   r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-   if (r) {
-   if (r == -EAGAIN) {
-   /* request a gpu reset. will run again. */
-   amdgpu_ras_request_reset_on_boot(adev,
-   AMDGPU_RAS_BLOCK__GFX);
-   return 0;
-   }
-   /* fail to enable ras, cleanup all. */
-   goto irq;
-   }
-   /* enable successfully. continue. */
-   goto resume;
-   }
-
-   *ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
-   if (!*ras_if)
-   return -ENOMEM;
-
-   **ras_if = ras_block;
-
-   r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-   if (r) {
-   if (r == -EAGAIN) {
-   amdgpu_ras_request_reset_on_boot(adev,
-   AMDGPU_RAS_BLOCK__GFX);
-   r = 0;
-   }
-   goto feature;
-   }
-
-   ih_info.head = **ras_if;
-   fs_info.head = **ras_if;
-
-   r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
+   r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
+    &fs_info, &ih_info);
if (r)
-   goto interrupt;
+   goto free;
 
-   amdgpu_ras_debugfs_create(adev, &fs_info);
-
-   r = amdgpu_ras_sysfs_create(adev, &fs_info);
-   if (r)
-   goto sysfs;
-resume:
-   r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
-   if (r)
-   goto irq;
+   if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
+   r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
+   if (r)
+   goto late_fini;
+   }
 
return 0;
-irq:
-   amdgpu_ras_sysfs_remove(adev, *ras_if);
-sysfs:
-   amdgpu_ras_debugfs_remove(adev, *ras_if);
-   amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
-interrupt:
-   amdgpu_ras_feature_enable(adev, *ras_if, 0);
-feature:
-   kfree(*ras_if);
-   *ras_if = NULL;
+late_fini:
+   amdgpu_ras_late_fini(adev, adev->gfx.ras_if, &ih_info);
+free:
+   kfree(adev->gfx.ras_if);
return r;
 }
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH 4/7] drm/amdgpu: switch to amdgpu_ras_late_init for gmc v9 block (v2)

2019-08-29 Thread Hawking Zhang
call the helper function in the late init phase to handle ras init
for the gmc ip block

v2: call ras_late_fini to do cleanup when enabling the interrupt fails

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 159 ++
 1 file changed, 47 insertions(+), 112 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 950ac61..8a7a56a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -758,133 +758,68 @@ static int gmc_v9_0_allocate_vm_inv_eng(struct 
amdgpu_device *adev)
return 0;
 }
 
-static int gmc_v9_0_ecc_ras_block_late_init(void *handle,
-   struct ras_fs_if *fs_info, struct ras_common_if 
*ras_block)
-{
-   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-   struct ras_common_if **ras_if = NULL;
-   struct ras_ih_if ih_info = {
-   .cb = gmc_v9_0_process_ras_data_cb,
-   };
-   int r;
-
-   if (ras_block->block == AMDGPU_RAS_BLOCK__UMC)
-   ras_if = &adev->gmc.umc_ras_if;
-   else if (ras_block->block == AMDGPU_RAS_BLOCK__MMHUB)
-   ras_if = &adev->gmc.mmhub_ras_if;
-   else
-   BUG();
-
-   if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
-   amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
-   return 0;
-   }
-
-   /* handle resume path. */
-   if (*ras_if) {
-   /* resend ras TA enable cmd during resume.
-* prepare to handle failure.
-*/
-   ih_info.head = **ras_if;
-   r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-   if (r) {
-   if (r == -EAGAIN) {
-   /* request a gpu reset. will run again. */
-   amdgpu_ras_request_reset_on_boot(adev,
-   ras_block->block);
-   return 0;
-   }
-   /* fail to enable ras, cleanup all. */
-   goto irq;
-   }
-   /* enable successfully. continue. */
-   goto resume;
-   }
-
-   *ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
-   if (!*ras_if)
-   return -ENOMEM;
-
-   **ras_if = *ras_block;
-
-   r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-   if (r) {
-   if (r == -EAGAIN) {
-   amdgpu_ras_request_reset_on_boot(adev,
-   ras_block->block);
-   r = 0;
-   }
-   goto feature;
-   }
-
-   ih_info.head = **ras_if;
-   fs_info->head = **ras_if;
-
-   if (ras_block->block == AMDGPU_RAS_BLOCK__UMC) {
-   r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
-   if (r)
-   goto interrupt;
-   }
-
-   amdgpu_ras_debugfs_create(adev, fs_info);
-
-   r = amdgpu_ras_sysfs_create(adev, fs_info);
-   if (r)
-   goto sysfs;
-resume:
-   if (ras_block->block == AMDGPU_RAS_BLOCK__UMC) {
-   r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
-   if (r)
-   goto irq;
-   }
-
-   return 0;
-irq:
-   amdgpu_ras_sysfs_remove(adev, *ras_if);
-sysfs:
-   amdgpu_ras_debugfs_remove(adev, *ras_if);
-   if (ras_block->block == AMDGPU_RAS_BLOCK__UMC)
-   amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
-interrupt:
-   amdgpu_ras_feature_enable(adev, *ras_if, 0);
-feature:
-   kfree(*ras_if);
-   *ras_if = NULL;
-   return r;
-}
-
 static int gmc_v9_0_ecc_late_init(void *handle)
 {
int r;
-
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+   struct ras_ih_if mmhub_ih_info;
struct ras_fs_if umc_fs_info = {
.sysfs_name = "umc_err_count",
.debugfs_name = "umc_err_inject",
};
-   struct ras_common_if umc_ras_block = {
-   .block = AMDGPU_RAS_BLOCK__UMC,
-   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-   .sub_block_index = 0,
-   .name = "umc",
+   struct ras_ih_if umc_ih_info = {
+   .cb = gmc_v9_0_process_ras_data_cb,
};
struct ras_fs_if mmhub_fs_info = {
.sysfs_name = "mmhub_err_count",
.debugfs_name = "mmhub_err_inject",
};
-   struct ras_common_if mmhub_ras_block = {
-   .block = AMDGPU_RAS_BLOCK__MMHUB,
-   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-   .sub_block_index = 0,
-   .name = "mmhub",
-   };
 
-   r = gmc_v9_0_ecc_ras_block_late_init(handle,
-    &umc_fs_info, &umc_ras_block);
+   if (!adev->gmc.umc_ras_if) {
+   

[PATCH 3/7] drm/amdgpu: switch to amdgpu_ras_late_init for sdma v4 block (v2)

2019-08-29 Thread Hawking Zhang
call the helper function in the late init phase to handle ras init
for the sdma ip block

v2: call ras_late_fini to do cleanup when enabling the interrupt fails

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 98 +-
 1 file changed, 24 insertions(+), 74 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 256d381..f8d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1692,7 +1692,6 @@ static int sdma_v4_0_process_ras_data_cb(struct 
amdgpu_device *adev,
 static int sdma_v4_0_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-   struct ras_common_if **ras_if = >sdma.ras_if;
struct ras_ih_if ih_info = {
.cb = sdma_v4_0_process_ras_data_cb,
};
@@ -1700,87 +1699,38 @@ static int sdma_v4_0_late_init(void *handle)
.sysfs_name = "sdma_err_count",
.debugfs_name = "sdma_err_inject",
};
-   struct ras_common_if ras_block = {
-   .block = AMDGPU_RAS_BLOCK__SDMA,
-   .type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
-   .sub_block_index = 0,
-   .name = "sdma",
-   };
int r, i;
 
-   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
-   amdgpu_ras_feature_enable_on_boot(adev, &ras_block, 0);
-   return 0;
-   }
-
-   /* handle resume path. */
-   if (*ras_if) {
-   /* resend ras TA enable cmd during resume.
-* prepare to handle failure.
-*/
-   ih_info.head = **ras_if;
-   r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-   if (r) {
-   if (r == -EAGAIN) {
-   /* request a gpu reset. will run again. */
-   amdgpu_ras_request_reset_on_boot(adev,
-   AMDGPU_RAS_BLOCK__SDMA);
-   return 0;
-   }
-   /* fail to enable ras, cleanup all. */
-   goto irq;
-   }
-   /* enable successfully. continue. */
-   goto resume;
-   }
-
-   *ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
-   if (!*ras_if)
-   return -ENOMEM;
-
-   **ras_if = ras_block;
-
-   r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
-   if (r) {
-   if (r == -EAGAIN) {
-   amdgpu_ras_request_reset_on_boot(adev,
-   AMDGPU_RAS_BLOCK__SDMA);
-   r = 0;
-   }
-   goto feature;
+   if (!adev->sdma.ras_if) {
+   adev->sdma.ras_if = kmalloc(sizeof(struct ras_common_if), 
GFP_KERNEL);
+   if (!adev->sdma.ras_if)
+   return -ENOMEM;
+   adev->sdma.ras_if->block = AMDGPU_RAS_BLOCK__SDMA;
+   adev->sdma.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->sdma.ras_if->sub_block_index = 0;
+   strcpy(adev->sdma.ras_if->name, "sdma");
}
+   fs_info.head = ih_info.head = *adev->sdma.ras_if;
 
-   ih_info.head = **ras_if;
-   fs_info.head = **ras_if;
-
-   r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
+   r = amdgpu_ras_late_init(adev, adev->sdma.ras_if,
+    &fs_info, &ih_info);
if (r)
-   goto interrupt;
-
-   amdgpu_ras_debugfs_create(adev, &fs_info);
+   goto free;
 
-   r = amdgpu_ras_sysfs_create(adev, &fs_info);
-   if (r)
-   goto sysfs;
-resume:
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   r = amdgpu_irq_get(adev, &adev->sdma.ecc_irq,
-  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
-   if (r)
-   goto irq;
+   if (amdgpu_ras_is_supported(adev, adev->sdma.ras_if->block)) {
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   r = amdgpu_irq_get(adev, &adev->sdma.ecc_irq,
+   AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   if (r)
+   goto late_fini;
+   }
}
 
-   return 0;
-irq:
-   amdgpu_ras_sysfs_remove(adev, *ras_if);
-sysfs:
-   amdgpu_ras_debugfs_remove(adev, *ras_if);
-   amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
-interrupt:
-   amdgpu_ras_feature_enable(adev, *ras_if, 0);
-feature:
-   kfree(*ras_if);
-   *ras_if = NULL;
+   return 0;
+late_fini:
+   amdgpu_ras_late_fini(adev, adev->sdma.ras_if, &ih_info);
+free:
+   kfree(adev->sdma.ras_if);
return r;
 }
 
-- 
2.7.4

___
amd-gfx mailing list

[PATCH 1/7] drm/amdgpu: add helper function to do common ras_late_init/fini (v3)

2019-08-29 Thread Hawking Zhang
In late_init for ras, the helper function will be used to
1) disable the ras feature if the IP block is masked as disabled
2) send the enable feature command if the IP block was masked as enabled
3) create the debugfs/sysfs nodes per IP block
4) register the interrupt handler
(see the usage sketch below)

v2: check ih_info.cb to decide whether to add an interrupt handler or not

v3: add ras_late_fini to clean up all the ras fs nodes and remove the
interrupt handler
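
For reference, a condensed sketch of the per-IP calling pattern this
helper enables, adapted from the gfx v9 patch in this series (error
labels abbreviated; not a verbatim copy of any one caller):

        r = amdgpu_ras_late_init(adev, adev->gfx.ras_if,
                                 &fs_info, &ih_info);
        if (r)
                goto free;      /* kfree() the ras_if allocation */

        if (amdgpu_ras_is_supported(adev, adev->gfx.ras_if->block)) {
                r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
                if (r)
                        goto late_fini; /* amdgpu_ras_late_fini() + kfree() */
        }
        return 0;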

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 72 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  7 
 2 files changed, 79 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 230f7e6..2b930fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1564,6 +1564,78 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
return -EINVAL;
 }
 
+/* helper function to handle common stuff in ip late init phase */
+int amdgpu_ras_late_init(struct amdgpu_device *adev,
+struct ras_common_if *ras_block,
+struct ras_fs_if *fs_info,
+struct ras_ih_if *ih_info)
+{
+   int r;
+
+   /* disable RAS feature per IP block if it is not supported */
+   if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
+   amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
+   return 0;
+   }
+
+   r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
+   if (r) {
+   if (r == -EAGAIN) {
+   /* request gpu reset. will run again */
+   amdgpu_ras_request_reset_on_boot(adev,
+   ras_block->block);
+   return 0;
+   } else if (adev->in_suspend || adev->in_gpu_reset) {
+   /* in resume phase, if fail to enable ras,
+* clean up all ras fs nodes, and disable ras */
+   goto cleanup;
+   } else
+   return r;
+   }
+
+   /* in resume phase, no need to create ras fs node */
+   if (adev->in_suspend || adev->in_gpu_reset)
+   return 0;
+
+   if(ih_info->cb) {
+   r = amdgpu_ras_interrupt_add_handler(adev, ih_info);
+   if (r)
+   goto interrupt;
+   }
+
+   amdgpu_ras_debugfs_create(adev, fs_info);
+
+   r = amdgpu_ras_sysfs_create(adev, fs_info);
+   if (r)
+   goto sysfs;
+
+   return 0;
+cleanup:
+   amdgpu_ras_sysfs_remove(adev, ras_block);
+sysfs:
+   amdgpu_ras_debugfs_remove(adev, ras_block);
+   if (ih_info->cb)
+   amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+interrupt:
+   amdgpu_ras_feature_enable(adev, ras_block, 0);
+   return r;
+}
+
+/* helper function to remove ras fs node and interrupt handler */
+void amdgpu_ras_late_fini(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block,
+ struct ras_ih_if *ih_info)
+{
+   if (!ras_block || !ih_info)
+   return;
+
+   amdgpu_ras_sysfs_remove(adev, ras_block);
+   amdgpu_ras_debugfs_remove(adev, ras_block);
+   if (ih_info->cb)
+   amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+   amdgpu_ras_feature_enable(adev, ras_block, 0);
+}
+
 /* do some init work after IP late init as dependence.
  * and it runs in resume/gpu reset/booting up cases.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 6c76bb2..66b7152 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -566,6 +566,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) {
 int amdgpu_ras_init(struct amdgpu_device *adev);
 int amdgpu_ras_fini(struct amdgpu_device *adev);
 int amdgpu_ras_pre_fini(struct amdgpu_device *adev);
+int amdgpu_ras_late_init(struct amdgpu_device *adev,
+struct ras_common_if *ras_block,
+struct ras_fs_if *fs_info,
+struct ras_ih_if *ih_info);
+void amdgpu_ras_late_fini(struct amdgpu_device *adev,
+ struct ras_common_if *ras_block,
+ struct ras_ih_if *ih_info);
 
 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
struct ras_common_if *head, bool enable);
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Couldn't read Speaker Allocation Data Block/SADs

2019-08-29 Thread Jean Delvare
Hi all,

Since I connected my Dell display on my Radeon R5 240 (Oland) card over
DisplayPort instead of VGA, I get the following error messages logged at every 
boot:

[drm:dce_v6_0_encoder_mode_set [amdgpu]] *ERROR* Couldn't read Speaker 
Allocation Data Block: -2
[drm:dce_v6_0_encoder_mode_set [amdgpu]] *ERROR* Couldn't read SADs: -2

I also see them each time the display wakes up and also on VT change.
This is with kernel 5.2.9.

This was also reported as bug #107825 by Paul Menzel:
https://bugs.freedesktop.org/show_bug.cgi?id=107825

Error -2 is ENOENT (No such file or directory). The driver queries the
display for audio-related information, while my display has neither
speakers nor a headset connector.

I suspect that the "error" is pretty much expected in this case and the
driver is being too verbose about it. Either the calling code should
consider -ENOENT as a non-error (11 calling sites to fix), or the
helper functions should simply return 0 when no audio-related data is
available from the display (2 functions to fix; calling sites may have
to be inspected as well, since some also treat 0 as an error, which
seems incorrect to me).
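
For the first option, a minimal sketch of what each calling site could
look like (drm_edid_to_speaker_allocation() is the real helper; the
surrounding code is an assumption for illustration, not the actual
dce_v6_0 source):

        sadb_count = drm_edid_to_speaker_allocation(edid, &sadb);
        if (sadb_count < 0) {
                /* A missing Speaker Allocation Data Block is not an
                 * error: proceed as if the display has no speakers. */
                if (sadb_count != -ENOENT)
                        DRM_ERROR("Couldn't read Speaker Allocation Data Block: %d\n",
                                  sadb_count);
                sadb_count = 0;
        }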

Option 1 seems cleaner to me, but I don't know if there could be
legitimate reasons to distinguish between no audio information block
from the display and empty audio information from the display in the
future.

What do you think?

Thanks,
-- 
Jean Delvare
SUSE L3 Support
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v3 6/7] drm/amdgpu: utilize subconnector property for DP through atombios

2019-08-29 Thread Alex Deucher
On Mon, Aug 26, 2019 at 9:22 AM Oleg Vasilev  wrote:
>
> Since DP-specific information is stored in the driver's structures, every
> driver needs to implement the subconnector property by itself.
>
> Reviewed-by: Emil Velikov 
> Signed-off-by: Oleg Vasilev 
> Cc: Alex Deucher 
> Cc: Christian König 
> Cc: David (ChunMing) Zhou 
> Cc: amd-gfx@lists.freedesktop.org

Similar to Ilia's sentiments, do these make sense for amd drivers?  We
expose the physical connectors only.  So physical DP ports show up as
DP drm connectors and if you connect a passive DP to HDMI/DVI dongle,
the driver just does the right thing.  We don't expose multiple drm
connectors for the same physical connector.

Alex

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c | 10 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/atombios_dp.c   | 18 +-
>  3 files changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
> index ece55c8fa673..348ed9e46bae 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
> @@ -26,6 +26,7 @@
>
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include "amdgpu.h"
> @@ -1407,6 +1408,10 @@ amdgpu_connector_dp_detect(struct drm_connector 
> *connector, bool force)
> pm_runtime_put_autosuspend(connector->dev->dev);
> }
>
> +   drm_dp_set_subconnector_property(&amdgpu_connector->base,
> +ret,
> +amdgpu_dig_connector->dpcd,
> +
> amdgpu_dig_connector->downstream_ports);
> return ret;
>  }
>
> @@ -1934,6 +1939,11 @@ amdgpu_connector_add(struct amdgpu_device *adev,
> if (has_aux)
> amdgpu_atombios_dp_aux_init(amdgpu_connector);
>
> +   if (connector_type == DRM_MODE_CONNECTOR_DisplayPort ||
> +   connector_type == DRM_MODE_CONNECTOR_eDP) {
> +   
> drm_mode_add_dp_subconnector_property(&amdgpu_connector->base);
> +   }
> +
> return;
>
>  failed:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
> index eb9975f4decb..cb360b44371c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
> @@ -469,6 +469,7 @@ struct amdgpu_encoder {
>  struct amdgpu_connector_atom_dig {
> /* displayport */
> u8 dpcd[DP_RECEIVER_CAP_SIZE];
> +   u8 downstream_ports[DP_MAX_DOWNSTREAM_PORTS];
> u8 dp_sink_type;
> int dp_clock;
> int dp_lane_count;
> diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c 
> b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
> index 6858cde9fc5d..b0d414553e71 100644
> --- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
> @@ -334,6 +334,22 @@ static void amdgpu_atombios_dp_probe_oui(struct 
> amdgpu_connector *amdgpu_connect
>   buf[0], buf[1], buf[2]);
>  }
>
> +static void amdgpu_atombios_dp_ds_ports(struct amdgpu_connector 
> *amdgpu_connector)
> +{
> +   struct amdgpu_connector_atom_dig *dig_connector = 
> amdgpu_connector->con_priv;
> +   int ret;
> +
> +   if (dig_connector->dpcd[DP_DPCD_REV] > 0x10) {
> +   ret = drm_dp_dpcd_read(&amdgpu_connector->ddc_bus->aux,
> +  DP_DOWNSTREAM_PORT_0,
> +  dig_connector->downstream_ports,
> +  DP_MAX_DOWNSTREAM_PORTS);
> +   if (ret)
> +   memset(dig_connector->downstream_ports, 0,
> +  DP_MAX_DOWNSTREAM_PORTS);
> +   }
> +}
> +
>  int amdgpu_atombios_dp_get_dpcd(struct amdgpu_connector *amdgpu_connector)
>  {
> struct amdgpu_connector_atom_dig *dig_connector = 
> amdgpu_connector->con_priv;
> @@ -349,7 +365,7 @@ int amdgpu_atombios_dp_get_dpcd(struct amdgpu_connector 
> *amdgpu_connector)
>   dig_connector->dpcd);
>
> amdgpu_atombios_dp_probe_oui(amdgpu_connector);
> -
> +   amdgpu_atombios_dp_ds_ports(amdgpu_connector);
> return 0;
> }
>
> --
> 2.23.0
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 14/17] drm/amd/display: Isolate DSC module from driver dependencies

2019-08-29 Thread Kazlauskas, Nicholas
On 2019-08-29 1:38 a.m., Dave Airlie wrote:
> On Thu, 29 Aug 2019 at 07:04, Bhawanpreet Lakha
>  wrote:
>>
>> From: Bayan Zabihiyan 
>>
>> [Why]
>> Edid Utility wishes to include DSC module from driver instead
>> of doing its own logic, which will need to be updated every time
>> someone modifies the driver logic.
>>
>> [How]
>> Modify some functions such that we don't need to pass the entire
>> DC structure as a parameter.
>> -Remove DC inclusion from module.
>> -Filter out problematic types and inclusions
> 
> Do we really want the ifdef stuff upstream? The EDID utility isn't
> shipped with the kernel, is it?
> 
> Dave.

It's not, and this isn't a kernel configurable option anyway.

So this really should be dropped or split out in a way that we don't 
need to use this.

Nicholas Kazlauskas

> 
>>
>> Signed-off-by: Bayan Zabihiyan 
>> Reviewed-by: Jun Lei 
>> Acked-by: Bhawanpreet Lakha 
>> ---
>>   .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  3 +-
>>   drivers/gpu/drm/amd/display/dc/dc_dsc.h   | 14 +++-
>>   drivers/gpu/drm/amd/display/dc/dc_hw_types.h  | 57 --
>>   drivers/gpu/drm/amd/display/dc/dc_types.h |  9 +++
>>   drivers/gpu/drm/amd/display/dc/dsc/dc_dsc.c   | 75 ---
>>   drivers/gpu/drm/amd/display/dc/inc/hw/dsc.h   | 12 ++-
>>   6 files changed, 125 insertions(+), 45 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> index 654679c4fded..82ea8cf8563e 100644
>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> @@ -3677,8 +3677,9 @@ create_stream_for_sink(struct amdgpu_dm_connector 
>> *aconnector,
>>   
>> dc_link_get_link_cap(aconnector->dc_link));
>>
>>  if (dsc_caps.is_dsc_supported)
>> -   if 
>> (dc_dsc_compute_config(aconnector->dc_link->ctx->dc,
>> +   if 
>> (dc_dsc_compute_config(aconnector->dc_link->ctx->dc->res_pool->dscs[0],
>> &dsc_caps,
>> + 
>> aconnector->dc_link->ctx->dc->debug.dsc_min_slice_height_override,
>> link_bandwidth_kbps,
>> &stream->timing,
>> &stream->timing.dsc_cfg))
>> diff --git a/drivers/gpu/drm/amd/display/dc/dc_dsc.h 
>> b/drivers/gpu/drm/amd/display/dc/dc_dsc.h
>> index 6e42209f0e20..0ed2962add5a 100644
>> --- a/drivers/gpu/drm/amd/display/dc/dc_dsc.h
>> +++ b/drivers/gpu/drm/amd/display/dc/dc_dsc.h
>> @@ -30,6 +30,7 @@
>>   #define DP_DSC_BRANCH_OVERALL_THROUGHPUT_0  0x0a0   /* DP 1.4a SCR */
>>   #define DP_DSC_BRANCH_OVERALL_THROUGHPUT_1  0x0a1
>>   #define DP_DSC_BRANCH_MAX_LINE_WIDTH0x0a2
>> +#include "dc_types.h"
>>
>>   struct dc_dsc_bw_range {
>>  uint32_t min_kbps; /* Bandwidth if min_target_bpp_x16 is used */
>> @@ -39,13 +40,21 @@ struct dc_dsc_bw_range {
>>  uint32_t stream_kbps; /* Uncompressed stream bandwidth */
>>   };
>>
>> +struct display_stream_compressor {
>> +   const struct dsc_funcs *funcs;
>> +#ifndef AMD_EDID_UTILITY
>> +   struct dc_context *ctx;
>> +   int inst;
>> +#endif
>> +};
>>
>>   bool dc_dsc_parse_dsc_dpcd(const uint8_t *dpcd_dsc_basic_data,
>>  const uint8_t *dpcd_dsc_ext_data,
>>  struct dsc_dec_dpcd_caps *dsc_sink_caps);
>>
>>   bool dc_dsc_compute_bandwidth_range(
>> -   const struct dc *dc,
>> +   const struct display_stream_compressor *dsc,
>> +   const uint32_t dsc_min_slice_height_override,
>>  const uint32_t min_kbps,
>>  const uint32_t max_kbps,
>>  const struct dsc_dec_dpcd_caps *dsc_sink_caps,
>> @@ -53,8 +62,9 @@ bool dc_dsc_compute_bandwidth_range(
>>  struct dc_dsc_bw_range *range);
>>
>>   bool dc_dsc_compute_config(
>> -   const struct dc *dc,
>> +   const struct display_stream_compressor *dsc,
>>  const struct dsc_dec_dpcd_caps *dsc_sink_caps,
>> +   const uint32_t dsc_min_slice_height_override,
>>  uint32_t target_bandwidth_kbps,
>>  const struct dc_crtc_timing *timing,
>>  struct dc_dsc_config *dsc_cfg);
>> diff --git a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h 
>> b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
>> index dafc19a7b699..2869b26d966a 100644
>> --- a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
>> +++ b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h
>> @@ -26,6 +26,8 @@
>>   #ifndef DC_HW_TYPES_H
>>   #define DC_HW_TYPES_H
>>
>> +#ifndef AMD_EDID_UTILITY
>> +
>>   #include "os_types.h"
>>   #include "fixed31_32.h"
>>   #include "signal_types.h"
>> @@ -587,6 +589,8 @@ struct scaling_taps {
>> 

[PATCH v4 6/7] drm/amdgpu: utilize subconnector property for DP through atombios

2019-08-29 Thread Oleg Vasilev
Since DP-specific information is stored in the driver's structures, every
driver needs to implement the subconnector property by itself.

Reviewed-by: Emil Velikov 
Signed-off-by: Oleg Vasilev 
Cc: Alex Deucher 
Cc: Christian König 
Cc: David (ChunMing) Zhou 
Cc: amd-gfx@lists.freedesktop.org
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c | 10 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h   |  1 +
 drivers/gpu/drm/amd/amdgpu/atombios_dp.c   | 18 +-
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
index ece55c8fa673..348ed9e46bae 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c
@@ -26,6 +26,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include "amdgpu.h"
@@ -1407,6 +1408,10 @@ amdgpu_connector_dp_detect(struct drm_connector 
*connector, bool force)
pm_runtime_put_autosuspend(connector->dev->dev);
}
 
+   drm_dp_set_subconnector_property(&amdgpu_connector->base,
+ret,
+amdgpu_dig_connector->dpcd,
+
amdgpu_dig_connector->downstream_ports);
return ret;
 }
 
@@ -1934,6 +1939,11 @@ amdgpu_connector_add(struct amdgpu_device *adev,
if (has_aux)
amdgpu_atombios_dp_aux_init(amdgpu_connector);
 
+   if (connector_type == DRM_MODE_CONNECTOR_DisplayPort ||
+   connector_type == DRM_MODE_CONNECTOR_eDP) {
+   drm_mode_add_dp_subconnector_property(&amdgpu_connector->base);
+   }
+
return;
 
 failed:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
index eb9975f4decb..cb360b44371c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
@@ -469,6 +469,7 @@ struct amdgpu_encoder {
 struct amdgpu_connector_atom_dig {
/* displayport */
u8 dpcd[DP_RECEIVER_CAP_SIZE];
+   u8 downstream_ports[DP_MAX_DOWNSTREAM_PORTS];
u8 dp_sink_type;
int dp_clock;
int dp_lane_count;
diff --git a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c 
b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
index 6858cde9fc5d..b0d414553e71 100644
--- a/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
+++ b/drivers/gpu/drm/amd/amdgpu/atombios_dp.c
@@ -334,6 +334,22 @@ static void amdgpu_atombios_dp_probe_oui(struct 
amdgpu_connector *amdgpu_connect
  buf[0], buf[1], buf[2]);
 }
 
+static void amdgpu_atombios_dp_ds_ports(struct amdgpu_connector 
*amdgpu_connector)
+{
+   struct amdgpu_connector_atom_dig *dig_connector = 
amdgpu_connector->con_priv;
+   int ret;
+
+   if (dig_connector->dpcd[DP_DPCD_REV] > 0x10) {
+   ret = drm_dp_dpcd_read(&amdgpu_connector->ddc_bus->aux,
+  DP_DOWNSTREAM_PORT_0,
+  dig_connector->downstream_ports,
+  DP_MAX_DOWNSTREAM_PORTS);
+   if (ret)
+   memset(dig_connector->downstream_ports, 0,
+  DP_MAX_DOWNSTREAM_PORTS);
+   }
+}
+
 int amdgpu_atombios_dp_get_dpcd(struct amdgpu_connector *amdgpu_connector)
 {
struct amdgpu_connector_atom_dig *dig_connector = 
amdgpu_connector->con_priv;
@@ -349,7 +365,7 @@ int amdgpu_atombios_dp_get_dpcd(struct amdgpu_connector 
*amdgpu_connector)
  dig_connector->dpcd);
 
amdgpu_atombios_dp_probe_oui(amdgpu_connector);
-
+   amdgpu_atombios_dp_ds_ports(amdgpu_connector);
return 0;
}
 
-- 
2.23.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH v4 7/7] drm/amdgpu: utilize subconnector property for DP through DisplayManager

2019-08-29 Thread Oleg Vasilev
Since DP-specific information is stored in the driver's structures, every
driver needs to implement the subconnector property by itself. Display
Core already has the subconnector information; we only need to
expose it through a DRM property.

Signed-off-by: Oleg Vasilev 
Tested-by: Oleg Vasilev 
Cc: Alex Deucher 
Cc: Christian König 
Cc: David (ChunMing) Zhou 
Cc: amd-gfx@lists.freedesktop.org
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 41 ++-
 .../display/amdgpu_dm/amdgpu_dm_mst_types.c   |  3 ++
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 028a710c1b46..6c03831f02f2 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -143,6 +143,42 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
 static void handle_cursor_update(struct drm_plane *plane,
 struct drm_plane_state *old_plane_state);
 
+static enum drm_mode_subconnector get_subconnector_type(struct dc_link *link)
+{
+   switch (link->dpcd_caps.dongle_type) {
+   case DISPLAY_DONGLE_NONE:
+   return DRM_MODE_SUBCONNECTOR_Native;
+   case DISPLAY_DONGLE_DP_VGA_CONVERTER:
+   return DRM_MODE_SUBCONNECTOR_VGA;
+   case DISPLAY_DONGLE_DP_DVI_CONVERTER:
+   case DISPLAY_DONGLE_DP_DVI_DONGLE:
+   return DRM_MODE_SUBCONNECTOR_DVID;
+   case DISPLAY_DONGLE_DP_HDMI_CONVERTER:
+   case DISPLAY_DONGLE_DP_HDMI_DONGLE:
+   return DRM_MODE_SUBCONNECTOR_HDMIA;
+   case DISPLAY_DONGLE_DP_HDMI_MISMATCHED_DONGLE:
+   default:
+   return DRM_MODE_SUBCONNECTOR_Unknown;
+   }
+}
+
+static void update_subconnector_property(struct amdgpu_dm_connector 
*aconnector)
+{
+   struct dc_link *link = aconnector->dc_link;
+   struct drm_connector *connector = >base;
+   enum drm_mode_subconnector subconnector = DRM_MODE_SUBCONNECTOR_Unknown;
+
+   if (connector->connector_type != DRM_MODE_CONNECTOR_DisplayPort)
+   return;
+
+   if (aconnector->dc_sink)
+   subconnector = get_subconnector_type(link);
+
+   drm_object_property_set_value(&connector->base,
+   connector->dev->mode_config.dp_subconnector_property,
+   subconnector);
+}
+
 /*
  * dm_vblank_get_counter
  *
@@ -1322,7 +1358,6 @@ amdgpu_dm_update_connector_after_detect(struct 
amdgpu_dm_connector *aconnector)
if (aconnector->mst_mgr.mst_state == true)
return;
 
-
sink = aconnector->dc_link->local_sink;
if (sink)
dc_sink_retain(sink);
@@ -1438,6 +1473,8 @@ amdgpu_dm_update_connector_after_detect(struct 
amdgpu_dm_connector *aconnector)
 
mutex_unlock(>mode_config.mutex);
 
+   update_subconnector_property(aconnector);
+
if (sink)
dc_sink_release(sink);
 }
@@ -3774,6 +3811,8 @@ amdgpu_dm_connector_detect(struct drm_connector 
*connector, bool force)
else
connected = (aconnector->base.force == DRM_FORCE_ON);
 
+   update_subconnector_property(aconnector);
+
return (connected ? connector_status_connected :
connector_status_disconnected);
 }
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index 16218a202b59..f4088914e34f 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
@@ -25,6 +25,7 @@
 
 #include 
 #include 
+#include 
 #include "dm_services.h"
 #include "amdgpu.h"
 #include "amdgpu_dm.h"
@@ -425,5 +426,7 @@ void amdgpu_dm_initialize_dp_connector(struct 
amdgpu_display_manager *dm,
16,
4,
aconnector->connector_id);
+
+   drm_mode_add_dp_subconnector_property(&aconnector->base);
 }
 
-- 
2.23.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH libdrm 0/4] amdgpu: new approach for ras inject test

2019-08-29 Thread Zhou1, Tao
The series is:

Reviewed-by: Tao Zhou 

> -Original Message-
> From: Guchun Chen 
> Sent: August 29, 2019 16:59
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> ; Li, Dennis ; Koenig,
> Christian ; Deucher, Alexander
> ; Zhou1, Tao 
> Cc: Li, Candice ; Chen, Guchun
> 
> Subject: [PATCH libdrm 0/4] amdgpu: new approach for ras inject test
> 
> These patches are to remove additional external lib-jsonc dependence, and
> to put all test configurations into C code.
> 
> Guchun Chen (4):
>   amdgpu: remove json package dependence
>   amdgpu: delete test configuration file
>   amdgpu: add ras inject unit test
>   amdgpu: add ras feature capability check in inject test
> 
>  configure.ac |  18 ---
>  data/amdgpu_ras.json | 267 --
>  meson.build  |   1 -
>  tests/amdgpu/Makefile.am |   5 +-
>  tests/amdgpu/meson.build |  16 +-
>  tests/amdgpu/ras_tests.c | 305 ++-
>  6 files changed, 116 insertions(+), 496 deletions(-)  delete mode 100644
> data/amdgpu_ras.json
> 
> --
> 2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH v2] drm/amdgpu: Default disable GDS for compute+gfx

2019-08-29 Thread zhoucm1


On 2019/8/29 3:22 PM, Christian König wrote:

On 29.08.19 at 07:55, zhoucm1 wrote:



On 2019/8/29 1:08 AM, Marek Olšák wrote:
It can't break an older driver, because there is no older driver 
that requires the static allocation.


Note that closed source drivers don't count, because they don't need 
backward compatibility.


Yes, I agree, we don't need to take care of the closed source stack.

But AMDVLK is indeed an open source stack, many fans are using it, so we
need to keep its compatibility, don't we?




Actually that is still under discussion.

But AMDVLK should have never ever used the static GDS space in the 
first place. We only added that for a transition time for old OpenGL 
and it shouldn't have leaked into the upstream driver.


Not sure what's the best approach here. We could revert "[PATCH] 
drm/amdgpu: remove static GDS, GWS and OA", but that would break KFD. 
So we can only choose between two evils here.


The only alternative I can see which would work for both would be to
still allocate the static GDS, GWS and OA space, but make it somehow
dynamic so that the KFD can swap it out again.


Agree with you.

-David



Christian.


-David



Marek

On Wed, Aug 28, 2019 at 2:44 AM zhoucm1 wrote:



On 2019/7/23 3:08 AM, Christian König wrote:
> On 22.07.19 at 17:34, Greathouse, Joseph wrote:
>> Units in the GDS block default to allowing all VMIDs access
to all
>> entries. Disable shader access to the GDS, GWS, and OA blocks
from all
>> compute and gfx VMIDs by default. For compute, HWS firmware
will set
>> up the access bits for the appropriate VMID when a compute queue
>> requires access to these blocks.
>> The driver will handle enabling access on-demand for graphics
VMIDs.

gds_switch depends on job->gds/gws/oa/_base/size.

"[PATCH] drm/amdgpu: remove static GDS, GWS and OA allocation", the
default allocations in kernel were removed. If some UMD stacks
don't
pass gds/gws/oa allocation to bo_list, then kernel will not enable
access of them, that will break previous driver.

do we need to revert "[PATCH] drm/amdgpu: remove static GDS, GWS and OA
allocation"?

-David

>>
>> Leaving VMID0 with full access because otherwise HWS cannot
save or
>> restore values during task switch.
>>
>> v2: Fixed code and comment styling.
>>
>> Change-Id: I3d768a96935d2ed1dff09b02c995090f4fbfa539
>> Signed-off-by: Joseph Greathouse <joseph.greatho...@amd.com>
>
> Reviewed-by: Christian König <christian.koe...@amd.com>
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 25
++---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c  | 24
+---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 24
+---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 24
+---
>>   4 files changed, 69 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 73dcb632a3ce..2a9692bc34b4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -1516,17 +1516,27 @@ static void
>> gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>>   }
>>   nv_grbm_select(adev, 0, 0, 0, 0);
>>   mutex_unlock(&adev->srbm_mutex);
>> +}
>>   -    /* Initialize all compute VMIDs to have no GDS, GWS, or OA
>> -   acccess. These should be enabled by FW for target
VMIDs. */
>> -    for (i = FIRST_COMPUTE_VMID; i < LAST_COMPUTE_VMID; i++) {
>> -    WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_BASE, 2 * i, 0);
>> -    WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_SIZE, 2 * i, 0);
>> -    WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
>> -    WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
>> +static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
>> +{
>> +    int vmid;
>> +
>> +    /*
>> + * Initialize all compute and user-gfx VMIDs to have no
GDS,
>> GWS, or OA
>> + * access. Compute VMIDs should be enabled by FW for
target VMIDs,
>> + * the driver can enable them for graphics. VMID0 should
maintain
>> + * access so that HWS firmware can save/restore entries.
>> + */
>> +    for (vmid = 1; vmid < 16; vmid++) {
>> +    WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_BASE, 2 *
vmid, 0);
>> +    WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_SIZE, 2 *
vmid, 0);
>> +    WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, vmid, 0);
>> +    WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, vmid, 0);
>>   }
>>   }
>>   +
>>   static void gfx_v10_0_tcp_harvest(struct amdgpu_device *adev)
>>   {
>>   int i, j, k;
>> @@ -1629,6 +1639,7 

Re: [PATCH libdrm 0/4] amdgpu: new approach for ras inject test

2019-08-29 Thread Christian König

Only skimmed over the patches, but in general looks good to me.

Feel free to add an Acked-by: Christian König  
to the whole series.


But somebody with more ras knowledge than I have should probably take a 
look as well.


Christian.

On 29.08.19 at 10:59, Guchun Chen wrote:

These patches remove the additional external lib-jsonc dependency,
and put all test configurations into C code.

Guchun Chen (4):
   amdgpu: remove json package dependence
   amdgpu: delete test configuration file
   amdgpu: add ras inject unit test
   amdgpu: add ras feature capability check in inject test

  configure.ac |  18 ---
  data/amdgpu_ras.json | 267 --
  meson.build  |   1 -
  tests/amdgpu/Makefile.am |   5 +-
  tests/amdgpu/meson.build |  16 +-
  tests/amdgpu/ras_tests.c | 305 ++-
  6 files changed, 116 insertions(+), 496 deletions(-)
  delete mode 100644 data/amdgpu_ras.json




[PATCH libdrm 4/4] amdgpu: add ras feature capability check in inject test

2019-08-29 Thread Guchun Chen
When running the RAS inject test, it needs to be aligned
with the kernel's RAS enablement.
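
(Aside: a minimal user-space sketch of such a check, assuming the kernel
exposes the RAS feature bitmask via a "ras/features" sysfs node; the path
and print format below are assumptions for illustration, not the test's
actual implementation.)

#include <stdio.h>

static int ras_block_is_enabled(unsigned int block)
{
	unsigned long features = 0;
	FILE *f = fopen("/sys/class/drm/card0/device/ras/features", "r");

	if (!f)
		return -1;
	/* assumed format: "feature mask: 0x..." */
	if (fscanf(f, "feature mask: 0x%lx", &features) != 1)
		features = 0;
	fclose(f);

	return !!(features & (1UL << block));
}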

Change-Id: I7e69a1a3f6ab7a0053f67f7f1dd3fb9af64f478f
Signed-off-by: Guchun Chen 
---
 tests/amdgpu/ras_tests.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/tests/amdgpu/ras_tests.c b/tests/amdgpu/ras_tests.c
index d510b644..c1c543c1 100644
--- a/tests/amdgpu/ras_tests.c
+++ b/tests/amdgpu/ras_tests.c
@@ -861,6 +861,10 @@ static void __amdgpu_ras_ip_inject_test(const struct 
ras_inject_test_config *ip_
if (block == ARRAY_SIZE(ras_block_string))
break;
 
+   /* Ensure RAS feature for the IP block is enabled by kernel */
+   if (amdgpu_ras_is_feature_supported(block) <= 0)
+   break;
+
	ret = amdgpu_ras_query_err_count(block, &old_ue, &old_ce);
CU_ASSERT_EQUAL(ret, 0);
if (ret)
-- 
2.17.1


[PATCH libdrm 3/4] amdgpu: add ras inject unit test

2019-08-29 Thread Guchun Chen
Both UMC and GFX ras single_correctable
inject tests are added.

Change-Id: I46c29b8761294122fc9acb620441a7aace6509e4
Signed-off-by: Guchun Chen 
---
 tests/amdgpu/ras_tests.c | 144 +--
 1 file changed, 107 insertions(+), 37 deletions(-)

diff --git a/tests/amdgpu/ras_tests.c b/tests/amdgpu/ras_tests.c
index 86b4137b..d510b644 100644
--- a/tests/amdgpu/ras_tests.c
+++ b/tests/amdgpu/ras_tests.c
@@ -30,7 +30,8 @@
 #include 
 #include 
 #include "xf86drm.h"
-#include "stdlib.h"
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
 
 const char *ras_block_string[] = {
"umc",
@@ -311,11 +312,10 @@ enum amdgpu_ras_error_type {
AMDGPU_RAS_ERROR__POISON= 8,
 };
 
-struct ras_test_item {
+struct ras_inject_test_config {
char name[64];
-   int block;
+   char block[32];
int sub_block;
-   char error_type_str[64];
enum amdgpu_ras_error_type type;
uint64_t address;
uint64_t value;
@@ -390,12 +390,78 @@ struct ras_DID_test_mask{
DEFAULT_RAS_BLOCK_MASK_BASIC\
 }
 
+static const struct ras_inject_test_config umc_ras_inject_test[] = {
+   {"ras_umc.1.0", "umc", 0, AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+};
+
+static const struct ras_inject_test_config gfx_ras_inject_test[] = {
+   {"ras_gfx.2.0", "gfx", AMDGPU_RAS_BLOCK__GFX_CPC_UCODE,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.1", "gfx", AMDGPU_RAS_BLOCK__GFX_CPF_TAG,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.2", "gfx", AMDGPU_RAS_BLOCK__GFX_CPG_TAG,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.3", "gfx", AMDGPU_RAS_BLOCK__GFX_SQ_LDS_D,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.4", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.5", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKA_TAG_RAM,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.6", "gfx", AMDGPU_RAS_BLOCK__GFX_SQC_INST_BANKB_TAG_RAM,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.7", "gfx", AMDGPU_RAS_BLOCK__GFX_TA_FS_DFIFO,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.8", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.9", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_0_1,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.10", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_0,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.11", "gfx", AMDGPU_RAS_BLOCK__GFX_TCC_CACHE_DATA_BANK_1_1,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.12", "gfx", AMDGPU_RAS_BLOCK__GFX_TCP_CACHE_RAM,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.13", "gfx", AMDGPU_RAS_BLOCK__GFX_TD_SS_FIFO_LO,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+   {"ras_gfx.2.14", "gfx", AMDGPU_RAS_BLOCK__GFX_EA_DRAMRD_CMDMEM,
+   AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE, 0, 0},
+};
+
 static const struct ras_DID_test_mask ras_DID_array[] = {
{0x66a1, 0x00, RAS_BLOCK_MASK_ALL},
{0x66a1, 0x01, RAS_BLOCK_MASK_ALL},
{0x66a1, 0x04, RAS_BLOCK_MASK_ALL},
 };
 
+static uint32_t amdgpu_ras_find_block_id_by_name(const char *name)
+{
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
+   if (strcmp(name, ras_block_string[i]) == 0)
+   return i;
+   }
+
+   return ARRAY_SIZE(ras_block_string);
+}
+
+static char *amdgpu_ras_get_error_type_id(enum amdgpu_ras_error_type type)
+{
+   switch (type) {
+   case AMDGPU_RAS_ERROR__PARITY:
+   return "parity";
+   case AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE:
+   return "single_correctable";
+   case AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE:
+   return "multi_uncorrectable";
+   case AMDGPU_RAS_ERROR__POISON:
+   return "poison";
+   case AMDGPU_RAS_ERROR__NONE:
+   default:
+   return NULL;
+   }
+}
+
 static struct ras_test_mask amdgpu_ras_get_test_mask(drmDevicePtr device)
 {
int i;
@@ -775,43 +841,36 @@ static void amdgpu_ras_enable_test(void)
}
 }
 
-static int amdgpu_ras_get_test_items(struct ras_test_item **pitems, int *size)
+static void __amdgpu_ras_ip_inject_test(const struct ras_inject_test_config 
*ip_test,
+   uint32_t size)
 {
-   *pitems = NULL;
-   *size = 0;
-
-   return 0;
-}
-
-static void __amdgpu_ras_inject_test(void)
-{
-   struct ras_test_item *items = NULL;
-   int i, size;
-   int ret;
+   int i, ret;
unsigned long 

[PATCH libdrm 2/4] amdgpu: delete test configuration file

2019-08-29 Thread Guchun Chen
The json package dependency is removed from amdgpu_test,
so this json configuration file is not needed any more.

Change-Id: Ibd64c30244c5ae894928d9de5460f1c776408054
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 data/amdgpu_ras.json | 267 ---
 1 file changed, 267 deletions(-)
 delete mode 100644 data/amdgpu_ras.json

diff --git a/data/amdgpu_ras.json b/data/amdgpu_ras.json
deleted file mode 100644
index 26fd9465..
--- a/data/amdgpu_ras.json
+++ /dev/null
@@ -1,267 +0,0 @@
-{
-"version": "0.0.1",
-"block": {
-"umc": {
-"index": 0
-},
-"gfx": {
-"index": 2,
-"subblock": {
-"gfx_cpc_scratch": 0,
-"gfx_cpc_ucode": 1,
-"gfx_dc_state_me1": 2,
-"gfx_dc_csinvoc_me1": 3,
-"gfx_dc_restore_me1": 4,
-"gfx_dc_state_me2": 5,
-"gfx_dc_csinvoc_me2": 6,
-"gfx_dc_restore_me2": 7,
-"gfx_cpf_roq_me2": 8,
-"gfx_cpf_roq_me1": 9,
-"gfx_cpf_tag": 10,
-"gfx_cpg_dma_roq": 11,
-"gfx_cpg_dma_tag": 12,
-"gfx_cpg_tag": 13,
-"gfx_gds_mem": 14,
-"gfx_gds_input_queue": 15,
-"gfx_gds_oa_phy_cmd_ram_mem": 16,
-"gfx_gds_oa_phy_data_ram_mem": 17,
-"gfx_gds_oa_pipe_mem": 18,
-"gfx_spi_sr_mem": 19,
-"gfx_sq_sgpr": 20,
-"gfx_sq_lds_d": 21,
-"gfx_sq_lds_i": 22,
-"gfx_sq_vgpr": 23,
-"gfx_sqc_inst_utcl1_lfifo": 24,
-"gfx_sqc_data_cu0_write_data_buf": 25,
-"gfx_sqc_data_cu0_utcl1_lfifo": 26,
-"gfx_sqc_data_cu1_write_data_buf": 27,
-"gfx_sqc_data_cu1_utcl1_lfifo": 28,
-"gfx_sqc_data_cu2_write_data_buf": 29,
-"gfx_sqc_data_cu2_utcl1_lfifo": 30,
-"gfx_sqc_inst_banka_tag_ram": 31,
-"gfx_sqc_inst_banka_utcl1_miss_fifo": 32,
-"gfx_sqc_inst_banka_miss_fifo": 33,
-"gfx_sqc_inst_banka_bank_ram": 34,
-"gfx_sqc_data_banka_tag_ram": 35,
-"gfx_sqc_data_banka_hit_fifo": 36,
-"gfx_sqc_data_banka_miss_fifo": 37,
-"gfx_sqc_data_banka_dirty_bit_ram": 38,
-"gfx_sqc_data_banka_bank_ram": 39,
-"gfx_sqc_inst_bankb_tag_ram": 40,
-"gfx_sqc_inst_bankb_utcl1_miss_fifo": 41,
-"gfx_sqc_inst_bankb_miss_fifo": 42,
-"gfx_sqc_inst_bankb_bank_ram": 43,
-"gfx_sqc_data_bankb_tag_ram": 44,
-"gfx_sqc_data_bankb_hit_fifo": 45,
-"gfx_sqc_data_bankb_miss_fifo": 46,
-"gfx_sqc_data_bankb_dirty_bit_ram": 47,
-"gfx_sqc_data_bankb_bank_ram": 48,
-"gfx_ta_fs_dfifo": 49,
-"gfx_ta_fs_afifo": 50,
-"gfx_ta_fl_lfifo": 51,
-"gfx_ta_fx_lfifo": 52,
-"gfx_ta_fs_cfifo": 53,
-"gfx_tca_hole_fifo": 54,
-"gfx_tca_req_fifo": 55,
-"gfx_tcc_cache_data": 56,
-"gfx_tcc_cache_data_bank_0_1": 57,
-"gfx_tcc_cache_data_bank_1_0": 58,
-"gfx_tcc_cache_data_bank_1_1": 59,
-"gfx_tcc_cache_dirty_bank_0": 60,
-"gfx_tcc_cache_dirty_bank_1": 61,
-"gfx_tcc_high_rate_tag": 62,
-"gfx_tcc_low_rate_tag": 63,
-"gfx_tcc_in_use_dec": 64,
-"gfx_tcc_in_use_transfer": 65,
-"gfx_tcc_return_data": 66,
-"gfx_tcc_return_control": 67,
-"gfx_tcc_uc_atomic_fifo": 68,
-"gfx_tcc_write_return": 69,
-"gfx_tcc_write_cache_read": 70,
-"gfx_tcc_src_fifo": 71,
-"gfx_tcc_src_fifo_next_ram": 72,
-"gfx_tcc_cache_tag_probe_fifo": 73,
-"gfx_tcc_latency_fifo": 74,
-"gfx_tcc_latency_fifo_next_ram": 75,
-"gfx_tcc_wrret_tag_write_return": 76,
-"gfx_tcc_atomic_return_buffer": 77,
-"gfx_tci_write_ram": 78,
-"gfx_tcp_cache_ram": 79,
-"gfx_tcp_lfifo_ram": 80,
-"gfx_tcp_cmd_fifo": 81,
-"gfx_tcp_vm_fifo": 82,
-"gfx_tcp_db_ram": 83,
-"gfx_tcp_utcl1_lfifo0": 84,
-"gfx_tcp_utcl1_lfifo1": 85,
-"gfx_td_ss_fifo_lo": 86,
-"gfx_td_ss_fifo_hi": 87,
-"gfx_td_cs_fifo": 88,
-"gfx_ea_dramrd_cmdmem": 89,
-"gfx_ea_dramwr_cmdmem": 90,
-"gfx_ea_dramwr_datamem": 91,
-"gfx_ea_rret_tagmem": 92,
- 

[PATCH libdrm 0/4] amdgpu: new approach for ras inject test

2019-08-29 Thread Guchun Chen
These patches remove the additional external lib-jsonc
dependency and put all test configurations into C code.

Guchun Chen (4):
  amdgpu: remove json package dependence
  amdgpu: delete test configuration file
  amdgpu: add ras inject unit test
  amdgpu: add ras feature capability check in inject test

 configure.ac |  18 ---
 data/amdgpu_ras.json | 267 --
 meson.build  |   1 -
 tests/amdgpu/Makefile.am |   5 +-
 tests/amdgpu/meson.build |  16 +-
 tests/amdgpu/ras_tests.c | 305 ++-
 6 files changed, 116 insertions(+), 496 deletions(-)
 delete mode 100644 data/amdgpu_ras.json

-- 
2.17.1


[PATCH libdrm 1/4] amdgpu: remove json package dependence

2019-08-29 Thread Guchun Chen
Except for the CUnit library, no additional external
library should be needed when compiling amdgpu_test.
This keeps the binary self-contained.

Change-Id: Id1935ef4431a0674c69391a67813370a3e9348e6
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 configure.ac |  18 -
 meson.build  |   1 -
 tests/amdgpu/Makefile.am |   5 +-
 tests/amdgpu/meson.build |  16 +---
 tests/amdgpu/ras_tests.c | 165 +--
 5 files changed, 9 insertions(+), 196 deletions(-)

diff --git a/configure.ac b/configure.ac
index 983b4371..1cf91347 100644
--- a/configure.ac
+++ b/configure.ac
@@ -430,24 +430,10 @@ if test "x$AMDGPU" != xno; then
AC_SUBST([CUNIT_CFLAGS])
fi
fi
-
-   # Detect json-c library
-   PKG_CHECK_MODULES([JSONC], [json-c >= 0.10.1], [have_jsonc=yes], 
[have_jsonc=no])
-   if test "x${have_jsonc}" = "xno"; then
-   AC_CHECK_LIB([json-c], [json_object_object_get], 
[have_jsonc=yes], [have_jsonc=no])
-   if test "x${have_jsonc}" = "xyes"; then
-   JSONC_LIBS="-ljson-c"
-   JSONC_CFLAGS=""
-   AC_SUBST([JSONC_LIBS])
-   AC_SUBST([JSONC_CFLAGS])
-   fi
-   fi
 else
have_cunit=no
-   have_jsonc=no
 fi
 AM_CONDITIONAL(HAVE_CUNIT, [test "x$have_cunit" != "xno"])
-AM_CONDITIONAL(HAVE_JSONC, [test "x$have_jsonc" != "xno"])
 
 AM_CONDITIONAL(HAVE_AMDGPU, [test "x$AMDGPU" = xyes])
 if test "x$AMDGPU" = xyes; then
@@ -456,10 +442,6 @@ if test "x$AMDGPU" = xyes; then
if test "x$have_cunit" = "xno"; then
AC_MSG_WARN([Could not find cunit library. Disabling amdgpu 
tests])
fi
-
-   if test "x$have_jsonc" = "xno"; then
-   AC_MSG_WARN([Could not find json-c library. Disabling amdgpu 
tests])
-   fi
 else
AC_DEFINE(HAVE_AMDGPU, 0)
 fi
diff --git a/meson.build b/meson.build
index bc5cfc58..e292554a 100644
--- a/meson.build
+++ b/meson.build
@@ -217,7 +217,6 @@ libdrm_c_args = warn_c_args + ['-fvisibility=hidden']
 
 dep_pciaccess = dependency('pciaccess', version : '>= 0.10', required : 
with_intel)
 dep_cunit = dependency('cunit', version : '>= 2.1', required : false)
-dep_json = dependency('json-c', version : '>= 0.10.1', required : false)
 _cairo_tests = get_option('cairo-tests')
 if _cairo_tests != 'false'
   dep_cairo = dependency('cairo', required : _cairo_tests == 'true')
diff --git a/tests/amdgpu/Makefile.am b/tests/amdgpu/Makefile.am
index 339bb0a9..920882d0 100644
--- a/tests/amdgpu/Makefile.am
+++ b/tests/amdgpu/Makefile.am
@@ -7,8 +7,7 @@ AM_CFLAGS = \
 
 LDADD = $(top_builddir)/libdrm.la \
$(top_builddir)/amdgpu/libdrm_amdgpu.la \
-   $(CUNIT_LIBS) \
-   $(JSONC_LIBS)
+   $(CUNIT_LIBS)
 
 if HAVE_INSTALL_TESTS
 bin_PROGRAMS = \
@@ -18,7 +17,7 @@ noinst_PROGRAMS = \
amdgpu_test
 endif
 
-amdgpu_test_CPPFLAGS = $(CUNIT_CFLAGS) $(JSONC_CFLAGS)
+amdgpu_test_CPPFLAGS = $(CUNIT_CFLAGS)
 
 amdgpu_test_SOURCES = \
amdgpu_test.c \
diff --git a/tests/amdgpu/meson.build b/tests/amdgpu/meson.build
index 4307295e..1726cb43 100644
--- a/tests/amdgpu/meson.build
+++ b/tests/amdgpu/meson.build
@@ -18,7 +18,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-if dep_cunit.found() and dep_json.found()
+if dep_cunit.found()
   amdgpu_test = executable(
 'amdgpu_test',
 files(
@@ -26,19 +26,9 @@ if dep_cunit.found() and dep_json.found()
   'vce_tests.c', 'uvd_enc_tests.c', 'vcn_tests.c', 'deadlock_tests.c',
   'vm_tests.c', 'ras_tests.c', 'syncobj_tests.c',
 ),
-dependencies : [dep_cunit, dep_json, dep_threads],
+dependencies : [dep_cunit, dep_threads],
 include_directories : [inc_root, inc_drm, 
include_directories('../../amdgpu')],
 link_with : [libdrm, libdrm_amdgpu],
 install : with_install_tests,
   )
-
-  configure_file(input : '../../data/amdgpu_ras.json',
-output : 'amdgpu_ras.json',
-configuration : configuration_data())
-
-  install_data(
-'../../data/amdgpu_ras.json',
-install_mode : 'rw-r--r--',
-install_dir : datadir_amdgpu,
-  )
-endif
\ No newline at end of file
+endif
diff --git a/tests/amdgpu/ras_tests.c b/tests/amdgpu/ras_tests.c
index 5309bf64..86b4137b 100644
--- a/tests/amdgpu/ras_tests.c
+++ b/tests/amdgpu/ras_tests.c
@@ -30,7 +30,7 @@
 #include 
 #include 
 #include "xf86drm.h"
-#include "json.h"
+#include "stdlib.h"
 
 const char *ras_block_string[] = {
"umc",
@@ -775,169 +775,12 @@ static void amdgpu_ras_enable_test(void)
}
 }
 
-static int _json_get_block_id(json_object *block_obj, const char *name)
-{
-   json_object *item_obj, *index_obj;
-
-   if (!json_object_object_get_ex(block_obj, name, &item_obj))
-   return -1;
-
-   if (!json_object_object_get_ex(item_obj, "index", &index_obj))
-   return -1;
-
-   

RE: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Zhou1, Tao


> -Original Message-
> From: amd-gfx  On Behalf Of
> Andrey Grodzovsky
> Sent: August 29, 2019 4:00
> To: amd-gfx@lists.freedesktop.org
> Cc: alexdeuc...@gmail.com; ckoenig.leichtzumer...@gmail.com;
> Grodzovsky, Andrey ; Zhang, Hawking
> 
> Subject: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.
> 
> Problem:
> Under certain conditions, when some IP bocks take a RAS error, we can get
[Tao] typo: "dmr/amdgpu" -> "drm/amdgpu", "IP bocks" -> "IP blocks"

> into a situation where a GPU reset is not possible due to issues in RAS in
> SMU/PSP.
> 
> Temporary fix until proper solution in PSP/SMU is ready:
> When uncorrectable error happens the DF will unconditionally broadcast
> error event packets to all its clients/slave upon receiving fatal error event 
> and
> freeze all its outbound queues, err_event_athub interrupt  will be triggered.
> In such case and we use this interrupt
> to issue GPU reset. THe GPU reset code is modified for such case to avoid HW
> reset, only stops schedulers, deatches all in progress and not yet scheduled
> job's fences, set error code on them and signals.
> Also reject any new incoming job submissions from user space.
> All this is done to notify the applications of the problem.
> 
> Signed-off-by: Andrey Grodzovsky 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98
> ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  5 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  6 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 30 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 12 +++-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 +--
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 24 
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 ++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +-
>  10 files changed, 164 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 9da681e..300adb8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -38,6 +38,7 @@
>  #include "amdgpu_gmc.h"
>  #include "amdgpu_gem.h"
>  #include "amdgpu_display.h"
> +#include "amdgpu_ras.h"
> 
>  #if defined(HAVE_DRM_FREE_LARGE)
>  #define kvfree drm_free_large
> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void
> *data, struct drm_file *filp)
>   bool reserved_buffers = false;
>   int i, r;
> 
> + if (amdgpu_ras_intr_triggered())
> + return -EHWPOISON;
> +
>   if (!adev->accel_working)
>   return -EBUSY;
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 07a4ba0..3ecee10 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct
> amdgpu_device *adev, bool trylock)
>   return true;
>  }
> 
> -static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> +static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, bool
> +skip_kfd)
>  {
>   /*unlock kfd: SRIOV would do it separately */
> - if (!amdgpu_sriov_vf(adev))
> + if (!amdgpu_sriov_vf(adev) && !skip_kfd)
>  amdgpu_amdkfd_post_reset(adev);
>   amdgpu_vf_error_trans_all(adev);
>   adev->mp1_state = PP_MP1_STATE_NONE;
> @@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct
> amdgpu_device *adev)  }
> 
> 
> +#define to_drm_sched_job(sched_job)  \
> + container_of((sched_job), struct drm_sched_job,
> queue_node)
> +
> +static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler
> +*sched) {
> + struct drm_sched_job *s_job;
> + struct drm_sched_entity *s_entity = NULL;
> + int i;
> +
> + /* Signal all jobs not yet scheduled */
> + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >=
> DRM_SCHED_PRIORITY_MIN; i--) {
> + struct drm_sched_rq *rq = &sched->sched_rq[i];
> +
> + if (!rq)
> + continue;
> +
> + spin_lock(&rq->lock);
> + list_for_each_entry(s_entity, &rq->entities, list) {
> + while ((s_job =
> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
> + struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> + dma_fence_signal(&s_fence->scheduled);
> + dma_fence_set_error(&s_fence->finished, -EHWPOISON);
> + dma_fence_signal(&s_fence->finished);
> + }
> + }
> + spin_unlock(&rq->lock);
> + }
> +
> + /* Signal all jobs already scheduled to HW */
> + list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
> + struct drm_sched_fence *s_fence = s_job->s_fence;
> +
> + 

Re: [PATCH 2/2] dmr/amdgpu: Add system auto reboot to RAS.

2019-08-29 Thread Christian König

On 28.08.19 at 22:00, Andrey Grodzovsky wrote:

In case of a RAS error, allow the user to configure an automatic system
reboot through ras_ctrl.
This is also part of the temporary workaround for the RAS
hang problem.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
  3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3ecee10..f1cff47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3805,6 +3805,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
int i, r = 0;
bool in_ras_intr = amdgpu_ras_intr_triggered();
  
+	/*

+* Flush RAM to disk so that after reboot
+* the user can read log and see why the system rebooted.
+*
+* Using user mode app call instead of kernel APIs such as
+* ksys_sync_helper for backward compatibility with earlier
+* kernels into which this is also intended.
+*/
+   if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+   char *envp[] = { "HOME=/", NULL };
+   char *argv[] = { "/bin/sync", NULL };
+
+   DRM_WARN("Emergency reboot.");
+
+   call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+   emergency_restart();
+   }
+
need_full_reset = job_signaled = false;
INIT_LIST_HEAD(_list);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

index 086e6df..423a1ba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -30,6 +30,7 @@
  #include "amdgpu_ras.h"
  #include "amdgpu_atomfirmware.h"
  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include <linux/reboot.h>
  
  const char *ras_error_string[] = {

"none",
@@ -154,6 +155,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file 
*f,
op = 1;
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
op = 2;
+   else if (sscanf(str, "reboot %32s", block_name) == 1)
+   op = 3;
else if (str[0] && str[1] && str[2] && str[3])
/* ascii string, but commands are not matched. */
return -EINVAL;


This is actually becoming quite a mess. We should consider removing the 
parsing in the long term and using separate debugfs files for each action.
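
(A rough sketch of that direction, purely illustrative -- the handler and
registration helpers below are made up, not existing amdgpu functions:)

static ssize_t ras_reboot_write(struct file *f, const char __user *buf,
				size_t size, loff_t *pos)
{
	struct amdgpu_device *adev = file_inode(f)->i_private;

	/* no string parsing: writing anything arms the reboot flag */
	amdgpu_ras_get_context(adev)->reboot = true;
	return size;
}

static const struct file_operations ras_reboot_ops = {
	.owner = THIS_MODULE,
	.write = ras_reboot_write,
};

static void ras_debugfs_add_ctrl_nodes(struct amdgpu_device *adev,
				       struct dentry *dir)
{
	debugfs_create_file("ras_reboot", 0200, dir, adev, &ras_reboot_ops);
	/* likewise one node each for disable/enable/inject */
}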


Christian.


@@ -287,6 +290,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f, const char __user *
/* data.inject.address is offset instead of absolute gpu 
address */
	ret = amdgpu_ras_error_inject(adev, &data.inject);
break;
+   case 3:
+   amdgpu_ras_get_context(adev)->reboot = true;
+   break;
default:
ret = -EINVAL;
break;
@@ -1733,6 +1739,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
  void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
  {
	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
-   DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! 
Stopping all GPU jobs.\n");
+   DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT 
detected!\n");
+
+   amdgpu_ras_reset_gpu(adev, false);
}
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index c0e22af..e3f0764 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -333,6 +333,7 @@ struct amdgpu_ras {
struct mutex recovery_lock;
  
  	uint32_t flags;

+   bool reboot;
  };
  
  struct ras_fs_data {



Re: [PATCH 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

2019-08-29 Thread Christian König

On 28.08.19 at 22:00, Andrey Grodzovsky wrote:

Problem:
Under certain conditions, when some IP bocks take a RAS error,
we can get into a situation where a GPU reset is not possible
due to issues in RAS in SMU/PSP.

Temporary fix until proper solution in PSP/SMU is ready:
When uncorrectable error happens the DF will unconditionally
broadcast error event packets to all its clients/slave upon
receiving fatal error event and freeze all its outbound queues,
err_event_athub interrupt  will be triggered.
In such case and we use this interrupt
to issue GPU reset. THe GPU reset code is modified for such case to avoid HW
reset, only stops schedulers, deatches all in progress and not yet scheduled
job's fences, set error code on them and signals.
Also reject any new incoming job submissions from user space.
All this is done to notify the applications of the problem.

Signed-off-by: Andrey Grodzovsky 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c |  4 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 98 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|  5 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c|  6 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 30 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h| 12 +++-
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 10 +--
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 24 
  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  5 ++
  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +-
  10 files changed, 164 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 9da681e..300adb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -38,6 +38,7 @@
  #include "amdgpu_gmc.h"
  #include "amdgpu_gem.h"
  #include "amdgpu_display.h"
+#include "amdgpu_ras.h"
  
  #if defined(HAVE_DRM_FREE_LARGE)

  #define kvfree drm_free_large
@@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
bool reserved_buffers = false;
int i, r;
  
+	if (amdgpu_ras_intr_triggered())

+   return -EHWPOISON;
+
if (!adev->accel_working)
return -EBUSY;
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 07a4ba0..3ecee10 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3734,10 +3734,10 @@ static bool amdgpu_device_lock_adev(struct 
amdgpu_device *adev, bool trylock)
return true;
  }
  
-static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)

+static void amdgpu_device_unlock_adev(struct amdgpu_device *adev, bool 
skip_kfd)
  {
/*unlock kfd: SRIOV would do it separately */
-   if (!amdgpu_sriov_vf(adev))
+   if (!amdgpu_sriov_vf(adev) && !skip_kfd)
  amdgpu_amdkfd_post_reset(adev);


It's most likely better to completely remove the call to 
amdgpu_amdkfd_post_reset() here.



amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
@@ -3746,6 +3746,44 @@ static void amdgpu_device_unlock_adev(struct 
amdgpu_device *adev)
  }
  
  
+#define to_drm_sched_job(sched_job)		\

+   container_of((sched_job), struct drm_sched_job, queue_node)
+
+static void amdgpu_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
+{
+   struct drm_sched_job *s_job;
+   struct drm_sched_entity *s_entity = NULL;
+   int i;
+
+   /* Signal all jobs not yet scheduled */
+   for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
+   struct drm_sched_rq *rq = &sched->sched_rq[i];
+
+   if (!rq)
+   continue;
+
+   spin_lock(&rq->lock);
+   list_for_each_entry(s_entity, &rq->entities, list) {
+   while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
+   struct drm_sched_fence *s_fence = s_job->s_fence;
+
+   dma_fence_signal(&s_fence->scheduled);
+   dma_fence_set_error(&s_fence->finished, -EHWPOISON);
+   dma_fence_signal(&s_fence->finished);
+   }
+   }
+   spin_unlock(&rq->lock);
+   }
+
+   /* Signal all jobs already scheduled to HW */
+   list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
+   struct drm_sched_fence *s_fence = s_job->s_fence;
+
+   dma_fence_set_error(&s_fence->finished, -EHWPOISON);
+   dma_fence_signal(&s_fence->finished);
+   }
+}


That might be better put into amdgpu_job.c.

And I assume this is called only during GPU reset, with the scheduler 
fully stopped?



+
  /**
   * amdgpu_device_gpu_recover - reset the asic and recover scheduler
   *
@@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct 

Re: [PATCH v2] drm/amdgpu: Default disable GDS for compute+gfx

2019-08-29 Thread Christian König

On 29.08.19 at 07:55, zhoucm1 wrote:



On 2019/8/29 1:08 AM, Marek Olšák wrote:
It can't break an older driver, because there is no older driver that 
requires the static allocation.


Note that closed source drivers don't count, because they don't need 
backward compatibility.


Yes, I agree, we don't need to take care of the closed source stack.

But AMDVLK is indeed an open source stack, and many fans are using it; we 
need to keep its compatibility, don't we?




Actually that is still under discussion.

But AMDVLK should have never ever used the static GDS space in the first 
place. We only added that for a transition time for old OpenGL and it 
shouldn't have leaked into the upstream driver.


Not sure what the best approach is here. We could revert "[PATCH] 
drm/amdgpu: remove static GDS, GWS and OA", but that would break KFD. So 
we can only choose between two evils here.


The only alternative I can see which would work for both would be to still 
allocate the static GDS, GWS and OA space, but make it somehow dynamic 
so that KFD can swap it out again.
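
(Very loose sketch of that idea, every name below hypothetical: keep the
static allocation, but track ownership so KFD can swap it out and the
graphics side can bring it back later.)

struct amdgpu_static_gds {
	struct amdgpu_bo *bo;	/* static GDS/GWS/OA backing */
	bool owned_by_kfd;	/* true while swapped out for KFD */
};

static int amdgpu_gds_acquire_for_kfd(struct amdgpu_static_gds *gds)
{
	if (gds->owned_by_kfd)
		return -EBUSY;
	/* park/evict graphics users of the blocks here, then: */
	gds->owned_by_kfd = true;
	return 0;
}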


Christian.


-David



Marek

On Wed, Aug 28, 2019 at 2:44 AM zhoucm1 wrote:



On 2019/7/23 3:08 AM, Christian König wrote:
> Am 22.07.19 um 17:34 schrieb Greathouse, Joseph:
>> Units in the GDS block default to allowing all VMIDs access to all
>> entries. Disable shader access to the GDS, GWS, and OA blocks
from all
>> compute and gfx VMIDs by default. For compute, HWS firmware
will set
>> up the access bits for the appropriate VMID when a compute queue
>> requires access to these blocks.
>> The driver will handle enabling access on-demand for graphics
VMIDs.

gds_switch depends on the job->gds/gws/oa _base/_size fields.

With "[PATCH] drm/amdgpu: remove static GDS, GWS and OA allocation", the
default allocations in the kernel were removed. If some UMD stacks don't
pass a gds/gws/oa allocation in the bo_list, the kernel will not enable
access to them, which will break previously working driver stacks.

do we need to revert "[PATCH] drm/amdgpu: remove static GDS, GWS and OA
allocation" ?

-David

>>
>> Leaving VMID0 with full access because otherwise HWS cannot
save or
>> restore values during task switch.
>>
>> v2: Fixed code and comment styling.
>>
>> Change-Id: I3d768a96935d2ed1dff09b02c995090f4fbfa539
>> Signed-off-by: Joseph Greathouse <joseph.greatho...@amd.com>
>
> Reviewed-by: Christian König <christian.koe...@amd.com>
>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 25
++---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c  | 24
+---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c  | 24
+---
>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 24
+---
>>   4 files changed, 69 insertions(+), 28 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 73dcb632a3ce..2a9692bc34b4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -1516,17 +1516,27 @@ static void
>> gfx_v10_0_init_compute_vmid(struct amdgpu_device *adev)
>>   }
>>   nv_grbm_select(adev, 0, 0, 0, 0);
>>   mutex_unlock(&adev->srbm_mutex);
>> +}
>>   -    /* Initialize all compute VMIDs to have no GDS, GWS, or OA
>> -   acccess. These should be enabled by FW for target
VMIDs. */
>> -    for (i = FIRST_COMPUTE_VMID; i < LAST_COMPUTE_VMID; i++) {
>> -    WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_BASE, 2 * i, 0);
>> -    WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_SIZE, 2 * i, 0);
>> -    WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, i, 0);
>> -    WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, i, 0);
>> +static void gfx_v10_0_init_gds_vmid(struct amdgpu_device *adev)
>> +{
>> +    int vmid;
>> +
>> +    /*
>> + * Initialize all compute and user-gfx VMIDs to have no GDS,
>> GWS, or OA
>> + * access. Compute VMIDs should be enabled by FW for
target VMIDs,
>> + * the driver can enable them for graphics. VMID0 should
maintain
>> + * access so that HWS firmware can save/restore entries.
>> + */
>> +    for (vmid = 1; vmid < 16; vmid++) {
>> +    WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_BASE, 2 *
vmid, 0);
>> +    WREG32_SOC15_OFFSET(GC, 0, mmGDS_VMID0_SIZE, 2 *
vmid, 0);
>> +    WREG32_SOC15_OFFSET(GC, 0, mmGDS_GWS_VMID0, vmid, 0);
>> +    WREG32_SOC15_OFFSET(GC, 0, mmGDS_OA_VMID0, vmid, 0);
>>   }
>>   }
>>   +
>>   static void gfx_v10_0_tcp_harvest(struct amdgpu_device *adev)
>>   {
>>   int i, j, k;
>> @@ -1629,6 +1639,7 @@ static void gfx_v10_0_constants_init(struct
>> amdgpu_device *adev)
>>   

Re: [PATCH 2/2] drm/amdgpu: keep the stolen memory in visible vram region

2019-08-29 Thread Yin, Tianci (Rico)
Ok, I'll fix that, thanks!

From: Christian König 
Sent: Thursday, August 29, 2019 15:13
To: Yin, Tianci (Rico) ; amd-gfx@lists.freedesktop.org 

Cc: Xu, Feifei ; Ma, Le ; Xiao, Jack 
; Zhang, Hawking 
Subject: Re: [PATCH 2/2] drm/amdgpu: keep the stolen memory in visible vram 
region

On 29.08.19 at 05:05, Tianci Yin wrote:
> From: "Tianci.Yin" 
>
> stolen memory should be pinned in the CPU-visible region.
>
> Change-Id: Icbbbd39fd113e93423aad8d2555f4073c08020e5
> Signed-off-by: Tianci.Yin 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 --
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 3 ++-
>   2 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 801f912..dcd32d0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -1733,6 +1733,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>uint64_t gtt_size;
>int r;
>u64 vis_vram_limit;
> + void *stolen_vga_buf;
>
> mutex_init(&adev->mman.gtt_window_lock);
>
> @@ -1787,7 +1788,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>r = amdgpu_bo_create_kernel(adev, adev->gmc.stolen_size, PAGE_SIZE,
>AMDGPU_GEM_DOMAIN_VRAM,
> &adev->stolen_vga_memory,
> - NULL, NULL);
> + NULL, &stolen_vga_buf);
>if (r)
>return r;
>DRM_INFO("amdgpu: %uM of VRAM memory ready\n",
> @@ -1851,8 +1852,9 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>*/
>   void amdgpu_ttm_late_init(struct amdgpu_device *adev)
>   {
> + void *stolen_vga_buf;

Coding style says we should add a new line between declaration and code.

Apart from that the series is Reviewed-by: Christian König
.

>/* return the VGA stolen memory (if any) back to VRAM */
> - amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, NULL);
> + amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
>   }
>
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index f77138b..ab43ae2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1248,6 +1248,7 @@ static int gmc_v9_0_sw_init(void *handle)
>   static int gmc_v9_0_sw_fini(void *handle)
>   {
>struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> + void *stolen_vga_buf;
>
>if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
>adev->gmc.umc_ras_if) {
> @@ -1280,7 +1281,7 @@ static int gmc_v9_0_sw_fini(void *handle)
>amdgpu_vm_manager_fini(adev);
>
>if (gmc_v9_0_keep_stolen_memory(adev))
> - amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, NULL);
> + amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
>
>amdgpu_gart_table_vram_free(adev);
>amdgpu_bo_fini(adev);


Re: [PATCH 2/2] drm/amdgpu: keep the stolen memory in visible vram region

2019-08-29 Thread Christian König

On 29.08.19 at 05:05, Tianci Yin wrote:

From: "Tianci.Yin" 

stolen memory should be pinned in the CPU-visible region.

Change-Id: Icbbbd39fd113e93423aad8d2555f4073c08020e5
Signed-off-by: Tianci.Yin 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 6 --
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 3 ++-
  2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 801f912..dcd32d0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1733,6 +1733,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
uint64_t gtt_size;
int r;
u64 vis_vram_limit;
+   void *stolen_vga_buf;
  
	mutex_init(&adev->mman.gtt_window_lock);
  
@@ -1787,7 +1788,7 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)

r = amdgpu_bo_create_kernel(adev, adev->gmc.stolen_size, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_VRAM,
&adev->stolen_vga_memory,
-   NULL, NULL);
+   NULL, &stolen_vga_buf);
if (r)
return r;
DRM_INFO("amdgpu: %uM of VRAM memory ready\n",
@@ -1851,8 +1852,9 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
   */
  void amdgpu_ttm_late_init(struct amdgpu_device *adev)
  {
+   void *stolen_vga_buf;


Coding style says we should add a new line between declaration and code.

Apart from that the series is Reviewed-by: Christian König 
.



/* return the VGA stolen memory (if any) back to VRAM */
-   amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, NULL);
+   amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
  }
  
  /**

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index f77138b..ab43ae2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1248,6 +1248,7 @@ static int gmc_v9_0_sw_init(void *handle)
  static int gmc_v9_0_sw_fini(void *handle)
  {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+   void *stolen_vga_buf;
  
  	if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&

adev->gmc.umc_ras_if) {
@@ -1280,7 +1281,7 @@ static int gmc_v9_0_sw_fini(void *handle)
amdgpu_vm_manager_fini(adev);
  
  	if (gmc_v9_0_keep_stolen_memory(adev))

-   amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, NULL);
+   amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
  
  	amdgpu_gart_table_vram_free(adev);

amdgpu_bo_fini(adev);



Re: [PATCH] drm/amdgpu: fix gfx ib test failed in sriov

2019-08-29 Thread Christian König

Hi Eric,

Yin has already proposed patches for fixing this a few days ago. Please 
help to review those instead.


Thanks,
Christian

On 28.08.19 at 16:59, Huang, JinHuiEric wrote:

It partially reverts the regression introduced by

commit e4a67e6cf14c258619f
("drm/amdgpu/psp: move TMR to cpu invisible vram region")

which causes gfx ib test failed when driver loading
in sriov system.

Signed-off-by: Eric Huang 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 16 
  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  1 +
  2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 9f7cc5b..9f91ced 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -261,9 +261,14 @@ static int psp_tmr_init(struct psp_context *psp)
}
}
  
-	ret = amdgpu_bo_create_kernel(psp->adev, tmr_size, PSP_TMR_SIZE,

- AMDGPU_GEM_DOMAIN_VRAM,
- &psp->tmr_bo, &psp->tmr_mc_addr, NULL);
+   if (amdgpu_sriov_vf(psp->adev))
+   ret = amdgpu_bo_create_kernel(psp->adev, tmr_size, PSP_TMR_SIZE,
+ AMDGPU_GEM_DOMAIN_VRAM,
+ &psp->tmr_bo, &psp->tmr_mc_addr, &psp->tmr_buf);
+   else
+   ret = amdgpu_bo_create_kernel(psp->adev, tmr_size, PSP_TMR_SIZE,
+ AMDGPU_GEM_DOMAIN_VRAM,
+ &psp->tmr_bo, &psp->tmr_mc_addr, NULL);
  
  	return ret;

  }
@@ -1216,7 +1221,10 @@ static int psp_hw_fini(void *handle)
  
  	psp_ring_destroy(psp, PSP_RING_TYPE__KM);
  
-	amdgpu_bo_free_kernel(&psp->tmr_bo, &psp->tmr_mc_addr, NULL);

+   if (amdgpu_sriov_vf(adev))
+   amdgpu_bo_free_kernel(&psp->tmr_bo, &psp->tmr_mc_addr, &psp->tmr_buf);
+   else
+   amdgpu_bo_free_kernel(&psp->tmr_bo, &psp->tmr_mc_addr, NULL);
 amdgpu_bo_free_kernel(&psp->fw_pri_bo,
   &psp->fw_pri_mc_addr, &psp->fw_pri_buf);
 amdgpu_bo_free_kernel(&psp->fence_buf_bo,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index bc0947f..b73d4aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -171,6 +171,7 @@ struct psp_context
/* tmr buffer */
struct amdgpu_bo*tmr_bo;
uint64_ttmr_mc_addr;
+   void*tmr_buf;
  
  	/* asd firmware and buffer */

const struct firmware   *asd_fw;



Re: [PATCH RFC v4 13/16] drm, cgroup: Allow more aggressive memory reclaim

2019-08-29 Thread Koenig, Christian
On 29.08.19 at 08:05, Kenny Ho wrote:
> Allow DRM TTM memory manager to register a work_struct, such that, when
> a drmcgrp is under memory pressure, memory reclaiming can be triggered
> immediately.
>
> Change-Id: I25ac04e2db9c19ff12652b88ebff18b44b2706d8
> Signed-off-by: Kenny Ho 
> ---
>   drivers/gpu/drm/ttm/ttm_bo.c| 49 +
>   include/drm/drm_cgroup.h| 16 +++
>   include/drm/ttm/ttm_bo_driver.h |  2 ++
>   kernel/cgroup/drm.c | 30 
>   4 files changed, 97 insertions(+)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
> index d7e3d3128ebb..72efae694b7e 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo.c
> @@ -1590,6 +1590,46 @@ int ttm_bo_evict_mm(struct ttm_bo_device *bdev, 
> unsigned mem_type)
>   }
>   EXPORT_SYMBOL(ttm_bo_evict_mm);
>   
> +static void ttm_bo_reclaim_wq(struct work_struct *work)
> +{
> + struct ttm_operation_ctx ctx = {
> + .interruptible = false,
> + .no_wait_gpu = false,
> + .flags = TTM_OPT_FLAG_FORCE_ALLOC
> + };
> + struct ttm_mem_type_manager *man =
> + container_of(work, struct ttm_mem_type_manager, reclaim_wq);
> + struct ttm_bo_device *bdev = man->bdev;
> + struct dma_fence *fence;
> + int mem_type;
> + int ret;
> +
> + for (mem_type = 0; mem_type < TTM_NUM_MEM_TYPES; mem_type++)
> + if (&bdev->man[mem_type] == man)
> + break;
> +
> + WARN_ON(mem_type >= TTM_NUM_MEM_TYPES);
> + if (mem_type >= TTM_NUM_MEM_TYPES)
> + return;
> +
> + if (!drmcg_mem_pressure_scan(bdev, mem_type))
> + return;
> +
> + ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx, NULL);
> + if (ret)
> + return;
> +
> + spin_lock(&man->move_lock);
> + fence = dma_fence_get(man->move);
> + spin_unlock(&man->move_lock);
> +
> + if (fence) {
> + ret = dma_fence_wait(fence, false);
> + dma_fence_put(fence);
> + }

Why do you want to block for the fence here? That is a rather bad idea 
and would break pipe-lining.

Apart from that I don't think we should put that into TTM.

Instead drmcg_register_device_mm() should get a function pointer which 
is called from a work item when the group is under pressure.

TTM can then provide the function which can be called, but the actual 
registration is the job of the device and not TTM.
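
Something like this, as a sketch (signatures are illustrative only, not
the series' actual interface):

/* the cgroup core owns the work item and calls back under pressure */
typedef void (*drmcg_mem_pressure_fn)(struct drm_device *dev,
				      unsigned int type);

void drmcg_register_device_mm(struct drm_device *dev, unsigned int type,
			      drmcg_mem_pressure_fn fn);

/* TTM provides a generic helper the driver can pass in */
void ttm_bo_reclaim_mem_type(struct drm_device *dev, unsigned int type);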

Regards,
Christian.

> +
> +}
> +
>   int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type,
>   unsigned long p_size)
>   {
> @@ -1624,6 +1664,13 @@ int ttm_bo_init_mm(struct ttm_bo_device *bdev, 
> unsigned type,
>   INIT_LIST_HEAD(&man->lru[i]);
>   man->move = NULL;
>   
> + pr_err("drmcg %p type %d\n", bdev->ddev, type);
> +
> + if (type <= TTM_PL_VRAM) {
> + INIT_WORK(&man->reclaim_wq, ttm_bo_reclaim_wq);
> + drmcg_register_device_mm(bdev->ddev, type, &man->reclaim_wq);
> + }
> +
>   return 0;
>   }
>   EXPORT_SYMBOL(ttm_bo_init_mm);
> @@ -1701,6 +1748,8 @@ int ttm_bo_device_release(struct ttm_bo_device *bdev)
>   man = &bdev->man[i];
>   if (man->has_type) {
>   man->use_type = false;
> + drmcg_unregister_device_mm(bdev->ddev, i);
> + cancel_work_sync(&man->reclaim_wq);
>   if ((i != TTM_PL_SYSTEM) && ttm_bo_clean_mm(bdev, i)) {
>   ret = -EBUSY;
>   pr_err("DRM memory manager type %d is not 
> clean\n",
> diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
> index c11df388fdf2..6d9707e1eb72 100644
> --- a/include/drm/drm_cgroup.h
> +++ b/include/drm/drm_cgroup.h
> @@ -5,6 +5,7 @@
>   #define __DRM_CGROUP_H__
>   
>   #include 
> +#include 
>   #include 
>   #include 
>   
> @@ -25,12 +26,17 @@ struct drmcg_props {
>   s64 mem_bw_avg_bytes_per_us_default;
>   
>   s64 mem_highs_default[TTM_PL_PRIV+1];
> +
> + struct work_struct  *mem_reclaim_wq[TTM_PL_PRIV];
>   };
>   
>   #ifdef CONFIG_CGROUP_DRM
>   
>   void drmcg_device_update(struct drm_device *device);
>   void drmcg_device_early_init(struct drm_device *device);
> +void drmcg_register_device_mm(struct drm_device *dev, unsigned int type,
> + struct work_struct *wq);
> +void drmcg_unregister_device_mm(struct drm_device *dev, unsigned int type);
>   bool drmcg_try_chg_bo_alloc(struct drmcg *drmcg, struct drm_device *dev,
>   size_t size);
>   void drmcg_unchg_bo_alloc(struct drmcg *drmcg, struct drm_device *dev,
> @@ -53,6 +59,16 @@ static inline void drmcg_device_early_init(struct 
> drm_device *device)
>   {
>   }
>   
> +static inline void drmcg_register_device_mm(struct drm_device *dev,
> + unsigned int type, struct work_struct *wq)
> +{
> +}
> +
> +static inline void 

[PATCH RFC v4 14/16] drm, cgroup: Introduce lgpu as DRM cgroup resource

2019-08-29 Thread Kenny Ho
drm.lgpu
A read-write nested-keyed file which exists on all cgroups.
Each entry is keyed by the DRM device's major:minor.

lgpu stands for logical GPU, it is an abstraction used to
subdivide a physical DRM device for the purpose of resource
management.

The lgpu is a discrete quantity that is device specific (i.e.
some DRM devices may have 64 lgpus while others may have 100
lgpus.)  The lgpu is a single quantity with two representations
denoted by the following nested keys.

  ===== ========================================
  count Representing lgpu as anonymous resource
  list  Representing lgpu as named resource
  ===== ========================================

For example:
226:0 count=256 list=0-255
226:1 count=4 list=0,2,4,6
226:2 count=32 list=32-63

lgpu is represented by a bitmap and uses the bitmap_parselist
kernel function so the list key input format is a
comma-separated list of decimal numbers and ranges.

Consecutively set bits are shown as two hyphen-separated decimal
numbers, the smallest and largest bit numbers set in the range.
Optionally each range can be postfixed to denote that only parts
of it should be set.  The range will be divided into groups of a
specific size.
Syntax: range:used_size/group_size
Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769

The count key is the hamming weight / hweight of the bitmap.

Both count and list accept the max and default keywords.

Some DRM devices may only support lgpu as anonymous resources.
In such case, the significance of the position of the set bits
in list will be ignored.

This lgpu resource supports the 'allocation' resource
distribution model.
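
(Illustrative sketch, not part of the patch: exercising the list syntax
above with the in-kernel parser; MAX_LGPU is a made-up bound.)

#include <linux/bitmap.h>

#define MAX_LGPU 1024

static int lgpu_parse_list(const char *buf, unsigned long *mask)
{
	int ret = bitmap_parselist(buf, mask, MAX_LGPU);

	if (ret)
		return ret;	/* -EINVAL on malformed input */

	/* "count" is simply the hamming weight of the parsed mask */
	return bitmap_weight(mask, MAX_LGPU);
}

/* e.g. lgpu_parse_list("0-1023:2/256", mask) sets bits
 * 0,1,256,257,512,513,768,769 and returns 8 */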

Change-Id: I1afcacf356770930c7f925df043e51ad06ceb98e
Signed-off-by: Kenny Ho 
---
 Documentation/admin-guide/cgroup-v2.rst |  46 
 include/drm/drm_cgroup.h|   4 +
 include/linux/cgroup_drm.h  |   6 ++
 kernel/cgroup/drm.c | 135 
 4 files changed, 191 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 87a195133eaa..57f18469bd76 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1958,6 +1958,52 @@ DRM Interface Files
Set largest allocation for /dev/dri/card1 to 4MB
echo "226:1 4m" > drm.buffer.peak.max
 
+  drm.lgpu
+   A read-write nested-keyed file which exists on all cgroups.
+   Each entry is keyed by the DRM device's major:minor.
+
+   lgpu stands for logical GPU, it is an abstraction used to
+   subdivide a physical DRM device for the purpose of resource
+   management.
+
+   The lgpu is a discrete quantity that is device specific (i.e.
+   some DRM devices may have 64 lgpus while others may have 100
+   lgpus.)  The lgpu is a single quantity with two representations
+   denoted by the following nested keys.
+
+ ===== ========================================
+ count Representing lgpu as anonymous resource
+ list  Representing lgpu as named resource
+ ===== ========================================
+
+   For example:
+   226:0 count=256 list=0-255
+   226:1 count=4 list=0,2,4,6
+   226:2 count=32 list=32-63
+
+   lgpu is represented by a bitmap and uses the bitmap_parselist
+   kernel function so the list key input format is a
+   comma-separated list of decimal numbers and ranges.
+
+   Consecutively set bits are shown as two hyphen-separated decimal
+   numbers, the smallest and largest bit numbers set in the range.
+   Optionally each range can be postfixed to denote that only parts
+   of it should be set.  The range will be divided into groups of a
+   specific size.
+   Syntax: range:used_size/group_size
+   Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769
+
+   The count key is the hamming weight / hweight of the bitmap.
+
+   Both count and list accept the max and default keywords.
+
+   Some DRM devices may only support lgpu as anonymous resources.
+   In such case, the significance of the position of the set bits
+   in list will be ignored.
+
+   This lgpu resource supports the 'allocation' resource
+   distribution model.
+
 GEM Buffer Ownership
 
 
diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
index 6d9707e1eb72..a8d6be0b075b 100644
--- a/include/drm/drm_cgroup.h
+++ b/include/drm/drm_cgroup.h
@@ -6,6 +6,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -28,6 +29,9 @@ struct drmcg_props {
s64 mem_highs_default[TTM_PL_PRIV+1];
 
struct 

[PATCH RFC v4 11/16] drm, cgroup: Add per cgroup bw measure and control

2019-08-29 Thread Kenny Ho
The bandwidth is measured by keeping track of the amount of bytes moved
by ttm within a time period.  We define two types of bandwidth: burst
and average.  Average bandwidth is calculated by dividing the total
amount of bytes moved within a cgroup by the lifetime of the cgroup.
Burst bandwidth is similar except that the byte and time measurement is
reset after a user configurable period.

The bandwidth control is best effort since it is done on a per move
basis instead of per byte.  The bandwidth is limited by delaying the
move of a buffer.  The bandwidth limit can be exceeded when the next
move is larger than the remaining allowance.
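
(Sketch of the accounting described above; the struct and helper names
are illustrative, the real fields live in the drmcg device resource.
Plain 64-bit division is used for brevity; a kernel build for 32-bit
targets would want div64_u64().)

#include <linux/types.h>

struct bw_sketch {
	u64 moved_byte;		/* bytes moved in current period */
	u64 accum_us;		/* time accumulated in current period */
	u64 total_moved_byte;	/* bytes moved over cgroup lifetime */
	u64 total_accum_us;	/* cgroup lifetime in us */
};

static u64 burst_bytes_per_us(const struct bw_sketch *bw)
{
	return bw->accum_us ? bw->moved_byte / bw->accum_us : 0;
}

static u64 avg_bytes_per_us(const struct bw_sketch *bw)
{
	return bw->total_accum_us ?
		bw->total_moved_byte / bw->total_accum_us : 0;
}

/* burst counters reset at the end of each configurable period */
static void bw_new_period(struct bw_sketch *bw)
{
	bw->moved_byte = 0;
	bw->accum_us = 0;
}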

drm.burst_bw_period_in_us
A read-write flat-keyed file which exists on the root cgroup.
Each entry is keyed by the drm device's major:minor.

Length of a period use to measure burst bandwidth in us.
One period per device.

drm.burst_bw_period_in_us.default
A read-only flat-keyed file which exists on the root cgroup.
Each entry is keyed by the drm device's major:minor.

Default length of a period in us (one per device.)

drm.bandwidth.stats
A read-only nested-keyed file which exists on all cgroups.
Each entry is keyed by the drm device's major:minor.  The
following nested keys are defined.

  ================= ======================================
  burst_byte_per_us Burst bandwidth
  avg_bytes_per_us  Average bandwidth
  moved_byte        Amount of byte moved within a period
  accum_us          Amount of time accumulated in a period
  total_moved_byte  Byte moved within the cgroup lifetime
  total_accum_us    Cgroup lifetime in us
  byte_credit       Available byte credit to limit avg bw
  ================= ======================================

Reading returns the following::
226:1 burst_byte_per_us=23 avg_bytes_per_us=0 moved_byte=2244608
accum_us=95575 total_moved_byte=45899776 total_accum_us=201634590
byte_credit=13214278590464
226:2 burst_byte_per_us=10 avg_bytes_per_us=219 moved_byte=430080
accum_us=39350 total_moved_byte=65518026752 total_accum_us=298337721
byte_credit=9223372036854644735

drm.bandwidth.high
A read-write nested-keyed file which exists on all cgroups.
Each entry is keyed by the drm device's major:minor.  The
following nested keys are defined.

  ================  ========================================
  bytes_in_period   Burst limit per period in byte
  avg_bytes_per_us  Average bandwidth limit in bytes per us
  ================  ========================================

Reading returns the following::

226:1 bytes_in_period=9223372036854775807 avg_bytes_per_us=65536
226:2 bytes_in_period=9223372036854775807 avg_bytes_per_us=65536

drm.bandwidth.default
A read-only nested-keyed file which exists on the root cgroup.
Each entry is keyed by the drm device's major:minor.  The
following nested keys are defined.

  ================  =========================================
  bytes_in_period   Default burst limit per period in byte
  avg_bytes_per_us  Default average bw limit in bytes per us
  ================  =========================================

Reading returns the following::

226:1 bytes_in_period=9223372036854775807 avg_bytes_per_us=65536
226:2 bytes_in_period=9223372036854775807 avg_bytes_per_us=65536

Change-Id: Ie573491325ccc16535bb943e7857f43bd0962add
Signed-off-by: Kenny Ho 
---
 drivers/gpu/drm/ttm/ttm_bo.c |   7 +
 include/drm/drm_cgroup.h |  19 +++
 include/linux/cgroup_drm.h   |  16 ++
 kernel/cgroup/drm.c  | 319 ++-
 4 files changed, 359 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index a0e9ce46baf3..32eee85f3641 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include <linux/delay.h>
 #include 
 #include 
 #include 
@@ -1256,6 +1257,12 @@ int ttm_bo_validate(struct ttm_buffer_object *bo,
 * Check whether we need to move buffer.
 */
	if (!ttm_bo_mem_compat(placement, &bo->mem, &new_flags)) {
+   unsigned int move_delay = drmcg_get_mem_bw_period_in_us(bo);
+
+   move_delay /= 2000; /* check every half period in ms*/
+   while (bo->bdev->ddev != NULL && !drmcg_mem_can_move(bo))
+   msleep(move_delay);
+
ret = ttm_bo_move_buffer(bo, placement, ctx);
if (ret)
return ret;
diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
index 7d63f73a5375..9ce0d54e6bd8 100644
--- a/include/drm/drm_cgroup.h
+++ b/include/drm/drm_cgroup.h
@@ -16,6 +16,12 @@ 

[PATCH RFC v4 09/16] drm, cgroup: Add TTM buffer allocation stats

2019-08-29 Thread Kenny Ho
The drm resource being measured is the TTM (Translation Table Manager)
buffers.  TTM manages different types of memory that a GPU might access.
These memory types include dedicated Video RAM (VRAM) and host/system
memory accessible through IOMMU (GART/GTT).  TTM is currently used by
multiple drm drivers (amd, ast, bochs, cirrus, hisilicon, maga200,
nouveau, qxl, virtio, vmwgfx.)

drm.memory.stats
A read-only nested-keyed file which exists on all cgroups.
Each entry is keyed by the drm device's major:minor.  The
following nested keys are defined.

  ====== =============================================
  system Host/system memory
  tt     Host memory used by the drm device (GTT/GART)
  vram   Video RAM used by the drm device
  priv   Other drm device, vendor specific memory
  ====== =============================================

Reading returns the following::

226:0 system=0 tt=0 vram=0 priv=0
226:1 system=0 tt=9035776 vram=17768448 priv=16809984
226:2 system=0 tt=9035776 vram=17768448 priv=16809984

drm.memory.evict.stats
A read-only flat-keyed file which exists on all cgroups.  Each
entry is keyed by the drm device's major:minor.

Total number of evictions.
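
(Illustrative sketch of the per-move accounting; the series' real hook
is drmcg_mem_track_move() and its fields differ.  TTM_NUM_MEM_TYPES
comes from <drm/ttm/ttm_bo_driver.h>.)

struct mem_stats_sketch {
	s64 mem[TTM_NUM_MEM_TYPES];	/* bytes per memory type */
	u64 evicts;			/* drm.memory.evict.stats */
};

static void track_move_sketch(struct mem_stats_sketch *s, bool evict,
			      int old_type, int new_type, s64 size)
{
	s->mem[old_type] -= size;	/* uncharge the old placement */
	s->mem[new_type] += size;	/* charge the new placement */
	if (evict)
		s->evicts++;
}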

Change-Id: Ice2c4cc845051229549bebeb6aa2d7d6153bdf6a
Signed-off-by: Kenny Ho 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |   3 +-
 drivers/gpu/drm/ttm/ttm_bo.c|  30 +++
 drivers/gpu/drm/ttm/ttm_bo_util.c   |   4 +
 include/drm/drm_cgroup.h|  19 +
 include/drm/ttm/ttm_bo_api.h|   2 +
 include/drm/ttm/ttm_bo_driver.h |   8 ++
 include/linux/cgroup_drm.h  |   6 ++
 kernel/cgroup/drm.c | 108 
 8 files changed, 179 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index cfcbbdc39656..463e015e8694 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1720,8 +1720,9 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
	mutex_init(&adev->mman.gtt_window_lock);
 
/* No others user of address space so set it to 0 */
-   r = ttm_bo_device_init(&adev->mman.bdev,
+   r = ttm_bo_device_init_tmp(&adev->mman.bdev,
   &amdgpu_bo_driver,
+  adev->ddev,
   adev->ddev->anon_inode->i_mapping,
   adev->need_dma32);
if (r) {
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 58c403eda04e..a0e9ce46baf3 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -42,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static void ttm_bo_global_kobj_release(struct kobject *kobj);
 
@@ -151,6 +153,10 @@ static void ttm_bo_release_list(struct kref *list_kref)
struct ttm_bo_device *bdev = bo->bdev;
size_t acc_size = bo->acc_size;
 
+   if (bo->bdev->ddev != NULL) // TODO: remove after ddev initialized for 
all
+   drmcg_unchg_mem(bo);
+   drmcg_put(bo->drmcg);
+
	BUG_ON(kref_read(&bo->list_kref));
	BUG_ON(kref_read(&bo->kref));
	BUG_ON(atomic_read(&bo->cpu_writers));
@@ -360,6 +366,8 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object 
*bo,
if (bo->mem.mem_type == TTM_PL_SYSTEM) {
if (bdev->driver->move_notify)
bdev->driver->move_notify(bo, evict, mem);
+   if (bo->bdev->ddev != NULL) // TODO: remove after ddev 
initialized for all
+   drmcg_mem_track_move(bo, evict, mem);
bo->mem = *mem;
mem->mm_node = NULL;
goto moved;
@@ -368,6 +376,8 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object 
*bo,
 
if (bdev->driver->move_notify)
bdev->driver->move_notify(bo, evict, mem);
+   if (bo->bdev->ddev != NULL) // TODO: remove after ddev initialized for 
all
+   drmcg_mem_track_move(bo, evict, mem);
 
if (!(old_man->flags & TTM_MEMTYPE_FLAG_FIXED) &&
!(new_man->flags & TTM_MEMTYPE_FLAG_FIXED))
@@ -381,6 +391,8 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object 
*bo,
if (bdev->driver->move_notify) {
swap(*mem, bo->mem);
bdev->driver->move_notify(bo, false, mem);
+   if (bo->bdev->ddev != NULL) // TODO: remove after ddev 
initialized for all
+   drmcg_mem_track_move(bo, evict, mem);
swap(*mem, bo->mem);
}
 
@@ -1355,6 +1367,10 @@ int 

[PATCH RFC v4 05/16] drm, cgroup: Add peak GEM buffer allocation stats

2019-08-29 Thread Kenny Ho
drm.buffer.peak.stats
A read-only flat-keyed file which exists on all cgroups.  Each
entry is keyed by the drm device's major:minor.

Largest (high water mark) GEM buffer allocated in bytes.

Change-Id: I79e56222151a3d33a76a61ba0097fe93ebb3449f
Signed-off-by: Kenny Ho 
---
 Documentation/admin-guide/cgroup-v2.rst |  6 ++
 include/linux/cgroup_drm.h  |  3 +++
 kernel/cgroup/drm.c | 12 
 3 files changed, 21 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 0e29d136e2f9..8588a0ffc69d 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1907,6 +1907,12 @@ DRM Interface Files
 
Total GEM buffer allocation in bytes.
 
+  drm.buffer.peak.stats
+   A read-only flat-keyed file which exists on all cgroups.  Each
+   entry is keyed by the drm device's major:minor.
+
+   Largest (high water mark) GEM buffer allocated in bytes.
+
 GEM Buffer Ownership
 
 
diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
index 1d8a7f2cdb4e..974d390cfa4f 100644
--- a/include/linux/cgroup_drm.h
+++ b/include/linux/cgroup_drm.h
@@ -15,6 +15,7 @@
 
 enum drmcg_res_type {
DRMCG_TYPE_BO_TOTAL,
+   DRMCG_TYPE_BO_PEAK,
__DRMCG_TYPE_LAST,
 };
 
@@ -24,6 +25,8 @@ enum drmcg_res_type {
 struct drmcg_device_resource {
/* for per device stats */
s64 bo_stats_total_allocated;
+
+   s64 bo_stats_peak_allocated;
 };
 
 /**
diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
index 87ae9164d8d8..0bf5b95668c4 100644
--- a/kernel/cgroup/drm.c
+++ b/kernel/cgroup/drm.c
@@ -129,6 +129,9 @@ static void drmcg_print_stats(struct drmcg_device_resource 
*ddr,
case DRMCG_TYPE_BO_TOTAL:
seq_printf(sf, "%lld\n", ddr->bo_stats_total_allocated);
break;
+   case DRMCG_TYPE_BO_PEAK:
+   seq_printf(sf, "%lld\n", ddr->bo_stats_peak_allocated);
+   break;
default:
seq_puts(sf, "\n");
break;
@@ -177,6 +180,12 @@ struct cftype files[] = {
.private = DRMCG_CTF_PRIV(DRMCG_TYPE_BO_TOTAL,
DRMCG_FTYPE_STATS),
},
+   {
+   .name = "buffer.peak.stats",
+   .seq_show = drmcg_seq_show,
+   .private = DRMCG_CTF_PRIV(DRMCG_TYPE_BO_PEAK,
+   DRMCG_FTYPE_STATS),
+   },
{ } /* terminate */
 };
 
@@ -260,6 +269,9 @@ void drmcg_chg_bo_alloc(struct drmcg *drmcg, struct 
drm_device *dev,
ddr = drmcg->dev_resources[devIdx];
 
ddr->bo_stats_total_allocated += (s64)size;
+
+   if (ddr->bo_stats_peak_allocated < (s64)size)
+   ddr->bo_stats_peak_allocated = (s64)size;
}
	mutex_unlock(&dev->drmcg_mutex);
 }
-- 
2.22.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH RFC v4 15/16] drm, cgroup: add update trigger after limit change

2019-08-29 Thread Kenny Ho
Before this commit, drmcg limits are updated but enforcement is delayed
until the next time the driver checks against the new limit.  While this
is sufficient for certain resources, a more proactive enforcement may be
needed for other resources.

This commit introduces an optional drmcg_limit_updated callback for DRM
drivers.  When defined, it will be called in two scenarios:
1) When limits are updated for a particular cgroup, the callback will be
triggered for each task in the updated cgroup.
2) When a task is migrated from one cgroup to another, the callback will
be triggered for each resource type for the migrated task.
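
As a minimal sketch of the driver side (the mydrv_* name is
hypothetical; only the callback signature added below is defined by
this patch), a driver could react to a limit change like so::

	static void mydrv_drmcg_limit_updated(struct drm_device *dev,
			struct task_struct *task,
			struct drmcg_device_resource *ddr,
			enum drmcg_res_type res_type)
	{
		switch (res_type) {
		case DRMCG_TYPE_BO_TOTAL:
			/* e.g. re-validate the task's outstanding
			 * allocations against the new limit in ddr */
			break;
		default:
			break;
		}
	}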

Change-Id: I68187a72818b855b5f295aefcb241cda8ab63b00
Signed-off-by: Kenny Ho 
---
 include/drm/drm_drv.h | 10 
 kernel/cgroup/drm.c   | 57 +++
 2 files changed, 67 insertions(+)

diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index c8a37a08d98d..7e588b874a27 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -669,6 +669,16 @@ struct drm_driver {
void (*drmcg_custom_init)(struct drm_device *dev,
struct drmcg_props *props);
 
+   /**
+* @drmcg_limit_updated
+*
+* Optional callback
+*/
+   void (*drmcg_limit_updated)(struct drm_device *dev,
+   struct task_struct *task,
+   struct drmcg_device_resource *ddr,
+   enum drmcg_res_type res_type);
+
/**
 * @gem_vm_ops: Driver private ops for this object
 */
diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
index 18c4368e2c29..99772e5d9ccc 100644
--- a/kernel/cgroup/drm.c
+++ b/kernel/cgroup/drm.c
@@ -621,6 +621,23 @@ static void drmcg_nested_limit_parse(struct 
kernfs_open_file *of,
}
 }
 
+static void drmcg_limit_updated(struct drm_device *dev, struct drmcg *drmcg,
+   enum drmcg_res_type res_type)
+{
+   struct drmcg_device_resource *ddr =
+   drmcg->dev_resources[dev->primary->index];
+   struct css_task_iter it;
+   struct task_struct *task;
+
+   css_task_iter_start(&drmcg->css.cgroup->self,
+   CSS_TASK_ITER_PROCS, &it);
+   while ((task = css_task_iter_next(&it))) {
+   dev->driver->drmcg_limit_updated(dev, task,
+   ddr, res_type);
+   }
+   css_task_iter_end(&it);
+}
+
 static ssize_t drmcg_limit_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
 {
@@ -726,6 +743,10 @@ static ssize_t drmcg_limit_write(struct kernfs_open_file 
*of, char *buf,
default:
break;
}
+
+   if (dm->dev->driver->drmcg_limit_updated)
+   drmcg_limit_updated(dm->dev, drmcg, type);
+
drm_dev_put(dm->dev); /* release from drm_minor_acquire */
}
 
@@ -863,9 +884,45 @@ struct cftype files[] = {
{ } /* terminate */
 };
 
+static int drmcg_attach_fn(int id, void *ptr, void *data)
+{
+   struct drm_minor *minor = ptr;
+   struct task_struct *task = data;
+   struct drm_device *dev;
+
+   if (minor->type != DRM_MINOR_PRIMARY)
+   return 0;
+
+   dev = minor->dev;
+
+   if (dev->driver->drmcg_limit_updated) {
+   struct drmcg *drmcg = drmcg_get(task);
+   struct drmcg_device_resource *ddr =
+   drmcg->dev_resources[minor->index];
+   enum drmcg_res_type type;
+
+   for (type = 0; type < __DRMCG_TYPE_LAST; type++)
+   dev->driver->drmcg_limit_updated(dev, task, ddr, type);
+
+   drmcg_put(drmcg);
+   }
+
+   return 0;
+}
+
+static void drmcg_attach(struct cgroup_taskset *tset)
+{
+   struct task_struct *task;
+   struct cgroup_subsys_state *css;
+
+   cgroup_taskset_for_each(task, css, tset)
+   drm_minor_for_each(_attach_fn, task);
+}
+
 struct cgroup_subsys drm_cgrp_subsys = {
.css_alloc  = drmcg_css_alloc,
.css_free   = drmcg_css_free,
+   .attach = drmcg_attach,
.early_init = false,
.legacy_cftypes = files,
.dfl_cftypes= files,
-- 
2.22.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH RFC v4 08/16] drm, cgroup: Add peak GEM buffer allocation limit

2019-08-29 Thread Kenny Ho
drm.buffer.peak.default
A read-only flat-keyed file which exists on the root cgroup.
Each entry is keyed by the drm device's major:minor.

Default limits on the largest GEM buffer allocation in bytes.

drm.buffer.peak.max
A read-write flat-keyed file which exists on all cgroups.  Each
entry is keyed by the drm device's major:minor.

Per device limits on the largest GEM buffer allocation in bytes.
This is a hard limit.  Attempts to allocate beyond the cgroup
limit will result in ENOMEM.  Shorthand understood by memparse
(such as k, m, g) can be used.

Set largest allocation for /dev/dri/card1 to 4MB
echo "226:1 4m" > drm.buffer.peak.max

Change-Id: I0830d56775568e1cf215b56cc892d5e7945e9f25
Signed-off-by: Kenny Ho 
---
 Documentation/admin-guide/cgroup-v2.rst | 18 ++
 include/drm/drm_cgroup.h|  1 +
 include/linux/cgroup_drm.h  |  1 +
 kernel/cgroup/drm.c | 48 +
 4 files changed, 68 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index e8fac2684179..87a195133eaa 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1940,6 +1940,24 @@ DRM Interface Files
Set allocation limit for /dev/dri/card0 to 512MB
echo "226:0 512m" > drm.buffer.total.max
 
+  drm.buffer.peak.default
+   A read-only flat-keyed file which exists on the root cgroup.
+   Each entry is keyed by the drm device's major:minor.
+
+   Default limits on the largest GEM buffer allocation in bytes.
+
+  drm.buffer.peak.max
+   A read-write flat-keyed file which exists on all cgroups.  Each
+   entry is keyed by the drm device's major:minor.
+
+   Per device limits on the largest GEM buffer allocation in bytes.
+   This is a hard limit.  Attempts to allocate beyond the cgroup
+   limit will result in ENOMEM.  Shorthand understood by memparse
+   (such as k, m, g) can be used.
+
+   Set largest allocation for /dev/dri/card1 to 4MB
+   echo "226:1 4m" > drm.buffer.peak.max
+
 GEM Buffer Ownership
 
 
diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
index 49c5d35ff6e1..d61b90beded5 100644
--- a/include/drm/drm_cgroup.h
+++ b/include/drm/drm_cgroup.h
@@ -14,6 +14,7 @@ struct drmcg_props {
boollimit_enforced;
 
s64 bo_limits_total_allocated_default;
+   s64 bo_limits_peak_allocated_default;
 };
 
 #ifdef CONFIG_CGROUP_DRM
diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
index eb54e56f20ae..87a2566c9fdd 100644
--- a/include/linux/cgroup_drm.h
+++ b/include/linux/cgroup_drm.h
@@ -29,6 +29,7 @@ struct drmcg_device_resource {
s64 bo_limits_total_allocated;
 
s64 bo_stats_peak_allocated;
+   s64 bo_limits_peak_allocated;
 
s64 bo_stats_count_allocated;
 };
diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
index 7161fa40e156..2f54bff291e5 100644
--- a/kernel/cgroup/drm.c
+++ b/kernel/cgroup/drm.c
@@ -75,6 +75,9 @@ static inline int init_drmcg_single(struct drmcg *drmcg, 
struct drm_device *dev)
ddr->bo_limits_total_allocated =
dev->drmcg_props.bo_limits_total_allocated_default;
 
+   ddr->bo_limits_peak_allocated =
+   dev->drmcg_props.bo_limits_peak_allocated_default;
+
	mutex_unlock(&dev->drmcg_mutex);
return 0;
 }
@@ -157,6 +160,9 @@ static void drmcg_print_limits(struct drmcg_device_resource 
*ddr,
case DRMCG_TYPE_BO_TOTAL:
seq_printf(sf, "%lld\n", ddr->bo_limits_total_allocated);
break;
+   case DRMCG_TYPE_BO_PEAK:
+   seq_printf(sf, "%lld\n", ddr->bo_limits_peak_allocated);
+   break;
default:
seq_puts(sf, "\n");
break;
@@ -171,6 +177,10 @@ static void drmcg_print_default(struct drmcg_props *props,
seq_printf(sf, "%lld\n",
props->bo_limits_total_allocated_default);
break;
+   case DRMCG_TYPE_BO_PEAK:
+   seq_printf(sf, "%lld\n",
+   props->bo_limits_peak_allocated_default);
+   break;
default:
seq_puts(sf, "\n");
break;
@@ -327,6 +337,24 @@ static ssize_t drmcg_limit_write(struct kernfs_open_file 
*of, char *buf,
drmcg_value_apply(dm->dev,
>bo_limits_total_allocated, val);
break;
+   case DRMCG_TYPE_BO_PEAK:
+   p_max = parent == NULL ? S64_MAX :
+   parent->dev_resources[minor]->
+   

[PATCH RFC v4 04/16] drm, cgroup: Add total GEM buffer allocation stats

2019-08-29 Thread Kenny Ho
The drm resources being measured here are the GEM buffer objects.  User
applications allocate and free these buffers.  In addition, a process
can allocate a buffer and share it with another process.  The consumer
of a shared buffer can also outlive the allocator of the buffer.

For the purpose of cgroup accounting and limiting, ownership of the
buffer is deemed to belong to the cgroup to which the allocating process
belongs.  There is one set of cgroup stats per drm device.  Each
allocation is charged to the owning cgroup as well as all its ancestors.

Similar to the memory cgroup, migrating a process to a different cgroup
does not move the GEM buffer usage that the process accumulated while in
its previous cgroup over to the new cgroup.

The following is an example to illustrate some of the operations.  Given
the following cgroup hierarchy (The letters are cgroup names with R
being the root cgroup.  The numbers in brackets are processes.  The
processes are placed with cgroup's 'No Internal Process Constraint' in
mind, so no process is placed in cgroup B.)

R (4, 5) ------ A (6)
 \
  B ---- C (7,8)
   \
    D (9)

Here is a list of operations and the associated effects on the sizes
tracked by the cgroups (for simplicity, each buffer is 1 unit in size.)

==  ==  ==  ==  ==  ===
R   A   B   C   D   Ops
==  ==  ==  ==  ==  ===
1   0   0   0   0   4 allocated a buffer
1   0   0   0   0   4 shared a buffer with 5
1   0   0   0   0   4 shared a buffer with 9
2   0   1   0   1   9 allocated a buffer
3   0   2   1   1   7 allocated a buffer
3   0   2   1   1   7 shared a buffer with 8
3   0   2   1   1   7 sharing with 9
3   0   2   1   1   7 release a buffer
3   0   2   1   1   7 migrate to cgroup D
3   0   2   1   1   9 release a buffer from 7
2   0   1   0   1   8 release a buffer from 7 (last ref to shared buf)
==  ==  ==  ==  ==  ===
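
The charging walk implied by the table above is a climb from the owning
cgroup up to the root; a sketch matching the loop style visible in the
drmcg_unchg_bo_alloc hunk later in this series::

	for ( ; drmcg != NULL; drmcg = drmcg_parent(drmcg))
		drmcg->dev_resources[devIdx]->bo_stats_total_allocated
			+= (s64)size;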

drm.buffer.stats
A read-only flat-keyed file which exists on all cgroups.  Each
entry is keyed by the drm device's major:minor.

Total GEM buffer allocation in bytes.

Change-Id: I9d662ec50d64bb40a37dbf47f018b2f3a1c033ad
Signed-off-by: Kenny Ho 
---
 Documentation/admin-guide/cgroup-v2.rst |  50 +-
 drivers/gpu/drm/drm_gem.c   |   9 ++
 include/drm/drm_cgroup.h|  16 +++
 include/drm/drm_gem.h   |  11 +++
 include/linux/cgroup_drm.h  |   6 ++
 kernel/cgroup/drm.c | 126 
 6 files changed, 217 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 2936423a3fd5..0e29d136e2f9 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,6 +63,7 @@ v1 is available under Documentation/cgroup-v1/.
5-7-1. RDMA Interface Files
  5-8. DRM
5-8-1. DRM Interface Files
+   5-8-2. GEM Buffer Ownership
  5-9. Misc
5-9-1. perf_event
  5-N. Non-normative information
@@ -1900,7 +1901,54 @@ of DRM (Direct Rendering Manager) and GPU-related 
resources.
 DRM Interface Files
 
 
-TODO
+  drm.buffer.stats
+   A read-only flat-keyed file which exists on all cgroups.  Each
+   entry is keyed by the drm device's major:minor.
+
+   Total GEM buffer allocation in bytes.
+
+GEM Buffer Ownership
+~~~~~~~~~~~~~~~~~~~~
+
+For the purpose of cgroup accounting and limiting, ownership of the
+buffer is deemed to belong to the cgroup to which the allocating process
+belongs.  There is one set of cgroup stats per drm device.  Each
+allocation is charged to the owning cgroup as well as all its ancestors.
+
+Similar to the memory cgroup, migrating a process to a different cgroup
+does not move the GEM buffer usage that the process accumulated while in
+its previous cgroup over to the new cgroup.
+
+The following is an example to illustrate some of the operations.  Given
+the following cgroup hierarchy (The letters are cgroup names with R
+being the root cgroup.  The numbers in brackets are processes.  The
+processes are placed with cgroup's 'No Internal Process Constraint' in
+mind, so no process is placed in cgroup B.)
+
+R (4, 5) ------ A (6)
+ \
+  B ---- C (7,8)
+   \
+    D (9)
+
+Here is a list of operations and the associated effects on the sizes
+tracked by the cgroups (for simplicity, each buffer is 1 unit in size.)
+
+==  ==  ==  ==  ==  ===
+R   A   B   C   D   Ops
+==  ==  ==  ==  ==  ===
+1   0   0   0   0   4 allocated a buffer
+1   0   0   0   0   4 shared a buffer with 5
+1   0   0   0   0   4 shared a buffer with 9
+2   0   1   0   1   9 allocated a buffer
+3   0   2   1   1   7 allocated a buffer
+3   0   2   1   1   7 shared a buffer with 8
+3   0   2   1   1   7 sharing with 9
+3   0   2   1   1   

[PATCH RFC v4 06/16] drm, cgroup: Add GEM buffer allocation count stats

2019-08-29 Thread Kenny Ho
drm.buffer.count.stats
A read-only flat-keyed file which exists on all cgroups.  Each
entry is keyed by the drm device's major:minor.

Total number of GEM buffers allocated.

Change-Id: Id3e1809d5fee8562e47a7d2b961688956d844ec6
Signed-off-by: Kenny Ho 
---
 Documentation/admin-guide/cgroup-v2.rst |  6 ++
 include/linux/cgroup_drm.h  |  3 +++
 kernel/cgroup/drm.c | 22 +++---
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 8588a0ffc69d..4dc72339a9b6 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1913,6 +1913,12 @@ DRM Interface Files
 
Largest (high water mark) GEM buffer allocated in bytes.
 
+  drm.buffer.count.stats
+   A read-only flat-keyed file which exists on all cgroups.  Each
+   entry is keyed by the drm device's major:minor.
+
+   Total number of GEM buffers allocated.
+
 GEM Buffer Ownership
 
 
diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
index 974d390cfa4f..972f7aa975b5 100644
--- a/include/linux/cgroup_drm.h
+++ b/include/linux/cgroup_drm.h
@@ -16,6 +16,7 @@
 enum drmcg_res_type {
DRMCG_TYPE_BO_TOTAL,
DRMCG_TYPE_BO_PEAK,
+   DRMCG_TYPE_BO_COUNT,
__DRMCG_TYPE_LAST,
 };
 
@@ -27,6 +28,8 @@ struct drmcg_device_resource {
s64 bo_stats_total_allocated;
 
s64 bo_stats_peak_allocated;
+
+   s64 bo_stats_count_allocated;
 };
 
 /**
diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
index 0bf5b95668c4..85e46ece4a82 100644
--- a/kernel/cgroup/drm.c
+++ b/kernel/cgroup/drm.c
@@ -132,6 +132,9 @@ static void drmcg_print_stats(struct drmcg_device_resource 
*ddr,
case DRMCG_TYPE_BO_PEAK:
seq_printf(sf, "%lld\n", ddr->bo_stats_peak_allocated);
break;
+   case DRMCG_TYPE_BO_COUNT:
+   seq_printf(sf, "%lld\n", ddr->bo_stats_count_allocated);
+   break;
default:
seq_puts(sf, "\n");
break;
@@ -186,6 +189,12 @@ struct cftype files[] = {
.private = DRMCG_CTF_PRIV(DRMCG_TYPE_BO_PEAK,
DRMCG_FTYPE_STATS),
},
+   {
+   .name = "buffer.count.stats",
+   .seq_show = drmcg_seq_show,
+   .private = DRMCG_CTF_PRIV(DRMCG_TYPE_BO_COUNT,
+   DRMCG_FTYPE_STATS),
+   },
{ } /* terminate */
 };
 
@@ -272,6 +281,8 @@ void drmcg_chg_bo_alloc(struct drmcg *drmcg, struct 
drm_device *dev,
 
if (ddr->bo_stats_peak_allocated < (s64)size)
ddr->bo_stats_peak_allocated = (s64)size;
+
+   ddr->bo_stats_count_allocated++;
}
	mutex_unlock(&dev->drmcg_mutex);
 }
@@ -289,15 +300,20 @@ EXPORT_SYMBOL(drmcg_chg_bo_alloc);
 void drmcg_unchg_bo_alloc(struct drmcg *drmcg, struct drm_device *dev,
size_t size)
 {
+   struct drmcg_device_resource *ddr;
int devIdx = dev->primary->index;
 
if (drmcg == NULL)
return;
 
	mutex_lock(&dev->drmcg_mutex);
-   for ( ; drmcg != NULL; drmcg = drmcg_parent(drmcg))
-   drmcg->dev_resources[devIdx]->bo_stats_total_allocated
-   -= (s64)size;
+   for ( ; drmcg != NULL; drmcg = drmcg_parent(drmcg)) {
+   ddr = drmcg->dev_resources[devIdx];
+
+   ddr->bo_stats_total_allocated -= (s64)size;
+
+   ddr->bo_stats_count_allocated--;
+   }
	mutex_unlock(&dev->drmcg_mutex);
 }
 EXPORT_SYMBOL(drmcg_unchg_bo_alloc);
-- 
2.22.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH RFC v4 12/16] drm, cgroup: Add soft VRAM limit

2019-08-29 Thread Kenny Ho
The drm resource being limited is the TTM (Translation Table Manager)
buffers.  TTM manages different types of memory that a GPU might access.
These memory types include dedicated Video RAM (VRAM) and host/system
memory accessible through IOMMU (GART/GTT).  TTM is currently used by
multiple drm drivers (amd, ast, bochs, cirrus, hisilicon, mgag200,
nouveau, qxl, virtio, vmwgfx.)

TTM buffers belonging to drm cgroups under memory pressure will be
selected to be evicted first.

drm.memory.high
A read-write nested-keyed file which exists on all cgroups.
Each entry is keyed by the drm device's major:minor.  The
following nested keys are defined.

  ====  ==============================================
  vram  Video RAM soft limit for a drm device in bytes
  ====  ==============================================

Reading returns the following::

226:0 vram=0
226:1 vram=17768448
226:2 vram=17768448
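
Writing presumably follows the same nested-key syntax as the other
limit files in this series (the value below is hypothetical)::

	echo "226:1 vram=268435456" > drm.memory.high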

drm.memory.default
A read-only nested-keyed file which exists on the root cgroup.
Each entry is keyed by the drm device's major:minor.  The
following nested keys are defined.

  ====  ================================
  vram  Video RAM default limit in bytes
  ====  ================================

Reading returns the following::

226:0 vram=0
226:1 vram=17768448
226:2 vram=17768448

Change-Id: I7988e28a453b53140b40a28c176239acbc81d491
Signed-off-by: Kenny Ho 
---
 drivers/gpu/drm/ttm/ttm_bo.c |   7 ++
 include/drm/drm_cgroup.h |  17 +
 include/linux/cgroup_drm.h   |   2 +
 kernel/cgroup/drm.c  | 135 +++
 4 files changed, 161 insertions(+)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 32eee85f3641..d7e3d3128ebb 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -853,14 +853,21 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev,
struct ttm_bo_global *glob = bdev->glob;
	struct ttm_mem_type_manager *man = &bdev->man[mem_type];
bool locked = false;
+   bool check_drmcg;
unsigned i;
int ret;
 
+   check_drmcg = drmcg_mem_pressure_scan(bdev, mem_type);
+
	spin_lock(&glob->lru_lock);
for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) {
		list_for_each_entry(bo, &man->lru[i], lru) {
bool busy;
 
+   if (check_drmcg &&
+   !drmcg_mem_should_evict(bo, mem_type))
+   continue;
+
			if (!ttm_bo_evict_swapout_allowable(bo, ctx, &locked,
							    &busy)) {
if (busy && !busy_bo &&
diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
index 9ce0d54e6bd8..c11df388fdf2 100644
--- a/include/drm/drm_cgroup.h
+++ b/include/drm/drm_cgroup.h
@@ -6,6 +6,7 @@
 
 #include 
 #include 
+#include <drm/ttm/ttm_placement.h>
 
 /**
  * Per DRM device properties for DRM cgroup controller for the purpose
@@ -22,6 +23,8 @@ struct drmcg_props {
 
s64 mem_bw_bytes_in_period_default;
s64 mem_bw_avg_bytes_per_us_default;
+
+   s64 mem_highs_default[TTM_PL_PRIV+1];
 };
 
 #ifdef CONFIG_CGROUP_DRM
@@ -38,6 +41,8 @@ void drmcg_mem_track_move(struct ttm_buffer_object *old_bo, 
bool evict,
struct ttm_mem_reg *new_mem);
 unsigned int drmcg_get_mem_bw_period_in_us(struct ttm_buffer_object *tbo);
 bool drmcg_mem_can_move(struct ttm_buffer_object *tbo);
+bool drmcg_mem_pressure_scan(struct ttm_bo_device *bdev, unsigned int type);
+bool drmcg_mem_should_evict(struct ttm_buffer_object *tbo, unsigned int type);
 
 #else
 static inline void drmcg_device_update(struct drm_device *device)
@@ -81,5 +86,17 @@ static inline bool drmcg_mem_can_move(struct 
ttm_buffer_object *tbo)
 {
return true;
 }
+
+static inline bool drmcg_mem_pressure_scan(struct ttm_bo_device *bdev,
+   unsigned int type)
+{
+   return false;
+}
+
+static inline bool drmcg_mem_should_evict(struct ttm_buffer_object *tbo,
+   unsigned int type)
+{
+   return true;
+}
 #endif /* CONFIG_CGROUP_DRM */
 #endif /* __DRM_CGROUP_H__ */
diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
index 27809a583bf2..c56cfe74d1a6 100644
--- a/include/linux/cgroup_drm.h
+++ b/include/linux/cgroup_drm.h
@@ -50,6 +50,8 @@ struct drmcg_device_resource {
 
s64 mem_stats[TTM_PL_PRIV+1];
s64 mem_peaks[TTM_PL_PRIV+1];
+   s64 mem_highs[TTM_PL_PRIV+1];
+   boolmem_pressure[TTM_PL_PRIV+1];
s64 mem_stats_evict;
 
s64 mem_bw_stats_last_update_us;
diff --git a/kernel/cgroup/drm.c 

[PATCH RFC v4 16/16] drm/amdgpu: Integrate with DRM cgroup

2019-08-29 Thread Kenny Ho
The number of logical gpus (lgpu) is defined to be the number of compute
units (CU) for a device.  The lgpu allocation limit only applies to
compute workload for the moment (enforced via kfd queue creation.)  Any
cu_mask update is validated against the availability of the compute unit
as defined by the drmcg the kfd process belongs to.
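
Since the pqm_drmcg_lgpu_validate hunk is truncated in this digest, here
is a sketch of the validation idea only (the helper name is an
assumption): a requested CU mask is permitted when it is a subset of the
CUs the drmcg has allocated to the process::

	static bool lgpu_cu_mask_permitted(const unsigned long *requested,
					   const unsigned long *allocated,
					   unsigned int nbits)
	{
		/* every requested CU must be in the allocated set */
		return bitmap_subset(requested, allocated, nbits);
	}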

Change-Id: I69a57452c549173a1cd623c30dc57195b3b6563e
Signed-off-by: Kenny Ho 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   4 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   |  21 +++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   6 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   3 +
 .../amd/amdkfd/kfd_process_queue_manager.c| 140 ++
 5 files changed, 174 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 55cb1b2094fd..369915337213 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -198,6 +198,10 @@ uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct kgd_dev 
*dst, struct kgd_dev *s
valid;  \
})
 
+int amdgpu_amdkfd_update_cu_mask_for_process(struct task_struct *task,
+   struct amdgpu_device *adev, unsigned long *lgpu_bitmap,
+   unsigned int nbits);
+
 /* GPUVM API */
 int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, unsigned int 
pasid,
void **vm, void **process_info,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 163a4fbf0611..8abeffdd2e5b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1398,9 +1398,29 @@ amdgpu_get_crtc_scanout_position(struct drm_device *dev, 
unsigned int pipe,
 static void amdgpu_drmcg_custom_init(struct drm_device *dev,
struct drmcg_props *props)
 {
+   struct amdgpu_device *adev = dev->dev_private;
+
+   props->lgpu_capacity = adev->gfx.cu_info.number;
+
props->limit_enforced = true;
 }
 
+static void amdgpu_drmcg_limit_updated(struct drm_device *dev,
+   struct task_struct *task, struct drmcg_device_resource *ddr,
+   enum drmcg_res_type res_type)
+{
+   struct amdgpu_device *adev = dev->dev_private;
+
+   switch (res_type) {
+   case DRMCG_TYPE_LGPU:
+   amdgpu_amdkfd_update_cu_mask_for_process(task, adev,
+ddr->lgpu_allocated, dev->drmcg_props.lgpu_capacity);
+   break;
+   default:
+   break;
+   }
+}
+
 static struct drm_driver kms_driver = {
.driver_features =
DRIVER_USE_AGP | DRIVER_ATOMIC |
@@ -1438,6 +1458,7 @@ static struct drm_driver kms_driver = {
.gem_prime_mmap = amdgpu_gem_prime_mmap,
 
.drmcg_custom_init = amdgpu_drmcg_custom_init,
+   .drmcg_limit_updated = amdgpu_drmcg_limit_updated,
 
.name = DRIVER_NAME,
.desc = DRIVER_DESC,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 138c70454e2b..fa765b803f97 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -450,6 +450,12 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct 
kfd_process *p,
return -EFAULT;
}
 
+   if (!pqm_drmcg_lgpu_validate(p, args->queue_id, properties.cu_mask,
+			cu_mask_size)) {
+   pr_debug("CU mask not permitted by DRM Cgroup");
+   kfree(properties.cu_mask);
+   return -EACCES;
+   }
+
	mutex_lock(&p->mutex);
 
	retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 8b0eee5b3521..1bec7550 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1038,6 +1038,9 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
   u32 *ctl_stack_used_size,
   u32 *save_area_used_size);
 
+bool pqm_drmcg_lgpu_validate(struct kfd_process *p, int qid, u32 *cu_mask,
+   unsigned int cu_mask_size);
+
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
unsigned int fence_value,
unsigned int timeout_ms);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 7e6c3ee82f5b..a896de290307 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -23,9 +23,11 @@
 
 #include 
 #include 
+#include 
 #include "kfd_device_queue_manager.h"
 #include "kfd_priv.h"
 #include "kfd_kernel_queue.h"
+#include "amdgpu.h"
 #include "amdgpu_amdkfd.h"
 
 static inline struct process_queue_node 

[PATCH RFC v4 03/16] drm, cgroup: Initialize drmcg properties

2019-08-29 Thread Kenny Ho
drmcg initialization involves allocating a per cgroup, per device data
structure and setting the defaults.  There are two entry points for
drmcg init:

1) When struct drmcg is created via css_alloc, initialization is done
for each device

2) When DRM devices are created after drmcgs are created
  a) Per device drmcg data structure is allocated at the beginning of
  DRM device creation such that drmcg can begin tracking usage
  statistics
  b) At the end of DRM device creation, drmcg_device_update is called in
  case device specific defaults need to be applied.

Entry point #2 usually applies to the root cgroup since it can be
created before DRM devices are available.  The drmcg controller will go
through all existing drm cgroups and initialize them with the new device
accordingly.
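
A sketch of entry point #1 (init_drmcg_single appears in a later hunk of
this series and drm_minor_for_each comes from patch 01; the iterator
callback name is an assumption)::

	static int init_drmcg_fn(int id, void *ptr, void *data)
	{
		struct drm_minor *minor = ptr;
		struct drmcg *drmcg = data;

		if (minor->type != DRM_MINOR_PRIMARY)
			return 0;

		return init_drmcg_single(drmcg, minor->dev);
	}

	/* called from css_alloc for each existing DRM device */
	drm_minor_for_each(&init_drmcg_fn, drmcg);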

Change-Id: I908ee6975ea0585e4c30eafde4599f87094d8c65
Signed-off-by: Kenny Ho 
---
 drivers/gpu/drm/drm_drv.c  |   7 +++
 include/drm/drm_cgroup.h   |  27 
 include/drm/drm_device.h   |   7 +++
 include/drm/drm_drv.h  |   9 +++
 include/linux/cgroup_drm.h |  13 
 kernel/cgroup/drm.c| 123 +
 6 files changed, 186 insertions(+)
 create mode 100644 include/drm/drm_cgroup.h

diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 000cddabd970..94265eba68ca 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include <drm/drm_cgroup.h>
 
 #include "drm_crtc_internal.h"
 #include "drm_legacy.h"
@@ -672,6 +673,7 @@ int drm_dev_init(struct drm_device *dev,
mutex_init(>filelist_mutex);
mutex_init(>clientlist_mutex);
mutex_init(>master_mutex);
+   mutex_init(&dev->drmcg_mutex);
 
dev->anon_inode = drm_fs_inode_new();
if (IS_ERR(dev->anon_inode)) {
@@ -708,6 +710,7 @@ int drm_dev_init(struct drm_device *dev,
if (ret)
goto err_setunique;
 
+   drmcg_device_early_init(dev);
return 0;
 
 err_setunique:
@@ -722,6 +725,7 @@ int drm_dev_init(struct drm_device *dev,
drm_fs_inode_free(dev->anon_inode);
 err_free:
put_device(dev->dev);
+   mutex_destroy(&dev->drmcg_mutex);
	mutex_destroy(&dev->master_mutex);
	mutex_destroy(&dev->clientlist_mutex);
	mutex_destroy(&dev->filelist_mutex);
@@ -798,6 +802,7 @@ void drm_dev_fini(struct drm_device *dev)
 
put_device(dev->dev);
 
+   mutex_destroy(&dev->drmcg_mutex);
	mutex_destroy(&dev->master_mutex);
	mutex_destroy(&dev->clientlist_mutex);
	mutex_destroy(&dev->filelist_mutex);
@@ -1008,6 +1013,8 @@ int drm_dev_register(struct drm_device *dev, unsigned 
long flags)
 dev->dev ? dev_name(dev->dev) : "virtual device",
 dev->primary->index);
 
+   drmcg_device_update(dev);
+
goto out_unlock;
 
 err_minors:
diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
new file mode 100644
index ..bef9f9245924
--- /dev/null
+++ b/include/drm/drm_cgroup.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: MIT
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ */
+#ifndef __DRM_CGROUP_H__
+#define __DRM_CGROUP_H__
+
+/**
+ * Per DRM device properties for DRM cgroup controller for the purpose
+ * of storing per device defaults
+ */
+struct drmcg_props {
+};
+
+#ifdef CONFIG_CGROUP_DRM
+
+void drmcg_device_update(struct drm_device *device);
+void drmcg_device_early_init(struct drm_device *device);
+#else
+static inline void drmcg_device_update(struct drm_device *device)
+{
+}
+
+static inline void drmcg_device_early_init(struct drm_device *device)
+{
+}
+#endif /* CONFIG_CGROUP_DRM */
+#endif /* __DRM_CGROUP_H__ */
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index 7f9ef709b2b6..5d7d779a5083 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -8,6 +8,7 @@
 
 #include 
 #include 
+#include <drm/drm_cgroup.h>
 
 struct drm_driver;
 struct drm_minor;
@@ -304,6 +305,12 @@ struct drm_device {
 */
struct drm_fb_helper *fb_helper;
 
+/** \name DRM Cgroup */
+   /*@{ */
+   struct mutex drmcg_mutex;
+   struct drmcg_props drmcg_props;
+   /*@} */
+
/* Everything below here is for legacy driver, never use! */
/* private: */
 #if IS_ENABLED(CONFIG_DRM_LEGACY)
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index 24f8d054c570..c8a37a08d98d 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -660,6 +660,15 @@ struct drm_driver {
struct drm_device *dev,
uint32_t handle);
 
+   /**
+* @drmcg_custom_init
+*
+* Optional callback used to initialize drm cgroup per device properties
+* such as resource limit defaults.
+*/
+   void (*drmcg_custom_init)(struct drm_device *dev,
+   struct drmcg_props *props);
+
/**
 * @gem_vm_ops: Driver private ops for this object
 */
diff --git 

[PATCH RFC v4 13/16] drm, cgroup: Allow more aggressive memory reclaim

2019-08-29 Thread Kenny Ho
Allow the DRM TTM memory manager to register a work_struct so that, when
a drmcg is under memory pressure, memory reclaim can be triggered
immediately.
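
The cgroup-side trigger is not shown in full below (the
kernel/cgroup/drm.c hunk is truncated in this digest), but the intended
flow is roughly the following sketch, using only names that appear
elsewhere in this series::

	/* on a charge that pushes usage past the soft limit, kick the
	 * reclaim worker registered for that TTM memory type */
	if (ddr->mem_stats[type] > ddr->mem_highs[type] &&
	    props->mem_reclaim_wq[type] != NULL)
		schedule_work(props->mem_reclaim_wq[type]);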

Change-Id: I25ac04e2db9c19ff12652b88ebff18b44b2706d8
Signed-off-by: Kenny Ho 
---
 drivers/gpu/drm/ttm/ttm_bo.c| 49 +
 include/drm/drm_cgroup.h| 16 +++
 include/drm/ttm/ttm_bo_driver.h |  2 ++
 kernel/cgroup/drm.c | 30 
 4 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index d7e3d3128ebb..72efae694b7e 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1590,6 +1590,46 @@ int ttm_bo_evict_mm(struct ttm_bo_device *bdev, unsigned 
mem_type)
 }
 EXPORT_SYMBOL(ttm_bo_evict_mm);
 
+static void ttm_bo_reclaim_wq(struct work_struct *work)
+{
+   struct ttm_operation_ctx ctx = {
+   .interruptible = false,
+   .no_wait_gpu = false,
+   .flags = TTM_OPT_FLAG_FORCE_ALLOC
+   };
+   struct ttm_mem_type_manager *man =
+   container_of(work, struct ttm_mem_type_manager, reclaim_wq);
+   struct ttm_bo_device *bdev = man->bdev;
+   struct dma_fence *fence;
+   int mem_type;
+   int ret;
+
+   for (mem_type = 0; mem_type < TTM_NUM_MEM_TYPES; mem_type++)
+   if (&bdev->man[mem_type] == man)
+   break;
+
+   WARN_ON(mem_type >= TTM_NUM_MEM_TYPES);
+   if (mem_type >= TTM_NUM_MEM_TYPES)
+   return;
+
+   if (!drmcg_mem_pressure_scan(bdev, mem_type))
+   return;
+
+   ret = ttm_mem_evict_first(bdev, mem_type, NULL, &ctx, NULL);
+   if (ret)
+   return;
+
+   spin_lock(&man->move_lock);
+   fence = dma_fence_get(man->move);
+   spin_unlock(&man->move_lock);
+
+   if (fence) {
+   ret = dma_fence_wait(fence, false);
+   dma_fence_put(fence);
+   }
+
+}
+
 int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned type,
unsigned long p_size)
 {
@@ -1624,6 +1664,13 @@ int ttm_bo_init_mm(struct ttm_bo_device *bdev, unsigned 
type,
	INIT_LIST_HEAD(&man->lru[i]);
man->move = NULL;
 
+   pr_err("drmcg %p type %d\n", bdev->ddev, type);
+
+   if (type <= TTM_PL_VRAM) {
+   INIT_WORK(&man->reclaim_wq, ttm_bo_reclaim_wq);
+   drmcg_register_device_mm(bdev->ddev, type, &man->reclaim_wq);
+   }
+
return 0;
 }
 EXPORT_SYMBOL(ttm_bo_init_mm);
@@ -1701,6 +1748,8 @@ int ttm_bo_device_release(struct ttm_bo_device *bdev)
	man = &bdev->man[i];
if (man->has_type) {
man->use_type = false;
+   drmcg_unregister_device_mm(bdev->ddev, i);
+   cancel_work_sync(&man->reclaim_wq);
if ((i != TTM_PL_SYSTEM) && ttm_bo_clean_mm(bdev, i)) {
ret = -EBUSY;
pr_err("DRM memory manager type %d is not 
clean\n",
diff --git a/include/drm/drm_cgroup.h b/include/drm/drm_cgroup.h
index c11df388fdf2..6d9707e1eb72 100644
--- a/include/drm/drm_cgroup.h
+++ b/include/drm/drm_cgroup.h
@@ -5,6 +5,7 @@
 #define __DRM_CGROUP_H__
 
 #include 
+#include <linux/workqueue.h>
 #include 
 #include 
 
@@ -25,12 +26,17 @@ struct drmcg_props {
s64 mem_bw_avg_bytes_per_us_default;
 
s64 mem_highs_default[TTM_PL_PRIV+1];
+
+   struct work_struct  *mem_reclaim_wq[TTM_PL_PRIV];
 };
 
 #ifdef CONFIG_CGROUP_DRM
 
 void drmcg_device_update(struct drm_device *device);
 void drmcg_device_early_init(struct drm_device *device);
+void drmcg_register_device_mm(struct drm_device *dev, unsigned int type,
+   struct work_struct *wq);
+void drmcg_unregister_device_mm(struct drm_device *dev, unsigned int type);
 bool drmcg_try_chg_bo_alloc(struct drmcg *drmcg, struct drm_device *dev,
size_t size);
 void drmcg_unchg_bo_alloc(struct drmcg *drmcg, struct drm_device *dev,
@@ -53,6 +59,16 @@ static inline void drmcg_device_early_init(struct drm_device 
*device)
 {
 }
 
+static inline void drmcg_register_device_mm(struct drm_device *dev,
+   unsigned int type, struct work_struct *wq)
+{
+}
+
+static inline void drmcg_unregister_device_mm(struct drm_device *dev,
+   unsigned int type)
+{
+}
+
 static inline void drmcg_try_chg_bo_alloc(struct drmcg *drmcg,
struct drm_device *dev, size_t size)
 {
diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h
index e1a805d65b83..529cef92bcf6 100644
--- a/include/drm/ttm/ttm_bo_driver.h
+++ b/include/drm/ttm/ttm_bo_driver.h
@@ -205,6 +205,8 @@ struct ttm_mem_type_manager {
 * Protected by @move_lock.
 */
struct dma_fence *move;
+
+   struct work_struct reclaim_wq;
 };
 
 /**
diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
index 04fb9a398740..0ea7f0619e25 

[PATCH RFC v4 00/16] new cgroup controller for gpu/drm subsystem

2019-08-29 Thread Kenny Ho
This is a follow up to the RFC I made previously to introduce a cgroup
controller for the GPU/DRM subsystem [v1,v2,v3].  The goal is to be able to
provide resource management for GPU resources using things like containers.

With this RFC v4, I am hoping to have some consensus on a merge plan.  I believe
the GEM related resources (drm.buffer.*) introduced in previous RFC and,
hopefully, the logical GPU concept (drm.lgpu.*) introduced in this RFC are
uncontroversial and ready to move out of RFC and into a more formal review.  I
will continue to work on the memory backend resources (drm.memory.*).

The cover letter from v1 is copied below for reference.

[v1]: https://lists.freedesktop.org/archives/dri-devel/2018-November/197106.html
[v2]: https://www.spinics.net/lists/cgroups/msg22074.html
[v3]: https://lists.freedesktop.org/archives/amd-gfx/2019-June/036026.html

v4:
Unchanged (no review needed)
* drm.memory.*/ttm resources (Patch 9-13, I am still working on memory bandwidth
and shrinker)
Base on feedbacks on v3:
* update nomenclature to drmcg
* embed per device drmcg properties into drm_device
* split GEM buffer related commits into stats and limit
* rename function name to align with convention
* combined buffer accounting and check into a try_charge function
* support buffer stats without limit enforcement
* removed GEM buffer sharing limitation
* updated documentations
New features:
* introducing logical GPU concept
* example implementation with AMD KFD

v3:
Base on feedbacks on v2:
* removed .help type file from v2
* conform to cgroup convention for default and max handling
* conform to cgroup convention for addressing device specific limits (with 
major:minor)
New function:
* adopted memparse for memory size related attributes
* added macro to marshall drmcgrp cftype private  (DRMCG_CTF_PRIV, etc.)
* added ttm buffer usage stats (per cgroup, for system, tt, vram.)
* added ttm buffer usage limit (per cgroup, for vram.)
* added per cgroup bandwidth stats and limiting (burst and average bandwidth)

v2:
* Removed the vendoring concepts
* Add limit to total buffer allocation
* Add limit to the maximum size of a buffer allocation

v1: cover letter

The purpose of this patch series is to start a discussion for a generic cgroup
controller for the drm subsystem.  The design proposed here is a very early one.
We are hoping to engage the community as we develop the idea.


Backgrounds
===========
Control Groups/cgroup provide a mechanism for aggregating/partitioning sets of
tasks, and all their future children, into hierarchical groups with specialized
behaviour, such as accounting/limiting the resources which processes in a cgroup
can access[1].  Weights, limits, protections, allocations are the main resource
distribution models.  Existing cgroup controllers includes cpu, memory, io,
rdma, and more.  cgroup is one of the foundational technologies that enables the
popular container application deployment and management method.

Direct Rendering Manager/drm contains code intended to support the needs of
complex graphics devices. Graphics drivers in the kernel may make use of DRM
functions to make tasks like memory management, interrupt handling and DMA
easier, and provide a uniform interface to applications.  The DRM has also
developed beyond traditional graphics applications to support compute/GPGPU
applications.


Motivations
===========
As GPU grow beyond the realm of desktop/workstation graphics into areas like
data center clusters and IoT, there are increasing needs to monitor and regulate
GPU as a resource like cpu, memory and io.

Matt Roper from Intel began working on similar idea in early 2018 [2] for the
purpose of managing GPU priority using the cgroup hierarchy.  While that
particular use case may not warrant a standalone drm cgroup controller, there
are other use cases where having one can be useful [3].  Monitoring GPU
resources such as VRAM and buffers, CU (compute unit [AMD's nomenclature])/EU
(execution unit [Intel's nomenclature]), GPU job scheduling [4] can help
sysadmins get a better understanding of the applications usage profile.  Further
usage regulations of the aforementioned resources can also help sysadmins
optimize workload deployment on limited GPU resources.

With the increased importance of machine learning, data science and other
cloud-based applications, GPUs are already in production use in data centers
today [5,6,7].  Existing GPU resource management is very coarse-grained, however,
as sysadmins are only able to distribute workload on a per-GPU basis [8].  An
alternative is to use GPU virtualization (with or without SRIOV) but it
generally acts on the entire GPU instead of the specific resources in a GPU.
With a drm cgroup controller, we can enable alternate, fine-grain, sub-GPU
resource management (in addition to what may be available via GPU
virtualization.)

In addition to production use, the DRM cgroup can also help with testing
graphics application robustness by providing a mean 

[PATCH RFC v4 01/16] drm: Add drm_minor_for_each

2019-08-29 Thread Kenny Ho
To allow other subsystems to iterate through all stored DRM minors and
act upon them.

Also exposes drm_minor_acquire and drm_minor_release for other subsystem
to handle drm_minor.  The DRM cgroup controller is the initial consumer of
these new features.
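
A hedged usage sketch (callback and variable names are hypothetical; the
callback contract matches idr_for_each, as the implementation below
shows)::

	static int count_primary_fn(int id, void *p, void *data)
	{
		struct drm_minor *minor = p;
		int *count = data;

		if (minor->type == DRM_MINOR_PRIMARY)
			(*count)++;
		return 0;
	}

	/* count all primary (card) minors currently registered */
	int n = 0;
	drm_minor_for_each(&count_primary_fn, &n);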

Change-Id: I7c4b67ce6b31f06d1037b03435386ff5b8144ca5
Signed-off-by: Kenny Ho 
---
 drivers/gpu/drm/drm_drv.c  | 19 +++
 drivers/gpu/drm/drm_internal.h |  4 
 include/drm/drm_drv.h  |  4 
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 862621494a93..000cddabd970 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -254,11 +254,13 @@ struct drm_minor *drm_minor_acquire(unsigned int minor_id)
 
return minor;
 }
+EXPORT_SYMBOL(drm_minor_acquire);
 
 void drm_minor_release(struct drm_minor *minor)
 {
drm_dev_put(minor->dev);
 }
+EXPORT_SYMBOL(drm_minor_release);
 
 /**
  * DOC: driver instance overview
@@ -1078,6 +1080,23 @@ int drm_dev_set_unique(struct drm_device *dev, const 
char *name)
 }
 EXPORT_SYMBOL(drm_dev_set_unique);
 
+/**
+ * drm_minor_for_each - Iterate through all stored DRM minors
+ * @fn: Function to be called for each pointer.
+ * @data: Data passed to callback function.
+ *
+ * The callback function will be called for each @drm_minor entry, passing
+ * the minor, the entry and @data.
+ *
+ * If @fn returns anything other than %0, the iteration stops and that
+ * value is returned from this function.
+ */
+int drm_minor_for_each(int (*fn)(int id, void *p, void *data), void *data)
+{
	return idr_for_each(&drm_minors_idr, fn, data);
+}
+EXPORT_SYMBOL(drm_minor_for_each);
+
 /*
  * DRM Core
  * The DRM core module initializes all global DRM objects and makes them
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index e19ac7ca602d..6bfad76f8e78 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -54,10 +54,6 @@ void drm_prime_destroy_file_private(struct 
drm_prime_file_private *prime_fpriv);
 void drm_prime_remove_buf_handle_locked(struct drm_prime_file_private 
*prime_fpriv,
struct dma_buf *dma_buf);
 
-/* drm_drv.c */
-struct drm_minor *drm_minor_acquire(unsigned int minor_id);
-void drm_minor_release(struct drm_minor *minor);
-
 /* drm_vblank.c */
 void drm_vblank_disable_and_save(struct drm_device *dev, unsigned int pipe);
 void drm_vblank_cleanup(struct drm_device *dev);
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index 68ca736c548d..24f8d054c570 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -799,5 +799,9 @@ static inline bool drm_drv_uses_atomic_modeset(struct 
drm_device *dev)
 
 int drm_dev_set_unique(struct drm_device *dev, const char *name);
 
+int drm_minor_for_each(int (*fn)(int id, void *p, void *data), void *data);
+
+struct drm_minor *drm_minor_acquire(unsigned int minor_id);
+void drm_minor_release(struct drm_minor *minor);
 
 #endif
-- 
2.22.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

[PATCH RFC v4 07/16] drm, cgroup: Add total GEM buffer allocation limit

2019-08-29 Thread Kenny Ho
The drm resources being limited here are the GEM buffer objects.  User
applications allocate and free these buffers.  In addition, a process
can allocate a buffer and share it with another process.  The consumer
of a shared buffer can also outlive the allocator of the buffer.

For the purpose of cgroup accounting and limiting, ownership of the
buffer is deemed to belong to the cgroup to which the allocating process
belongs.  There is one cgroup limit per drm device.

The limiting functionality is added to the previous stats collection
function.  drm_gem_private_object_init is modified to return a value so
that an allocation can fail when it exceeds the cgroup limit.

The try_chg function only fails if the DRM cgroup properties have
limit_enforced set to true for the DRM device.  This is to allow the DRM
cgroup controller to collect usage stats without enforcing the limits.
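
A simplified sketch of the limit check described above (field names are
from this series; the full try_chg hunk is truncated in this digest)::

	/* return false when the charge would exceed the hard limit,
	 * in which case the caller fails the allocation with -ENOMEM */
	if (ddr->bo_stats_total_allocated + (s64)size >
			ddr->bo_limits_total_allocated)
		return false;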

drm.buffer.default
A read-only flat-keyed file which exists on the root cgroup.
Each entry is keyed by the drm device's major:minor.

Default limits on the total GEM buffer allocation in bytes.

drm.buffer.max
A read-write flat-keyed file which exists on all cgroups.  Each
entry is keyed by the drm device's major:minor.

Per device limits on the total GEM buffer allocation in bytes.
This is a hard limit.  Attempts to allocate beyond the cgroup
limit will result in ENOMEM.  Shorthand understood by memparse
(such as k, m, g) can be used.

Set allocation limit for /dev/dri/card1 to 1GB
echo "226:1 1g" > drm.buffer.total.max

Set allocation limit for /dev/dri/card0 to 512MB
echo "226:0 512m" > drm.buffer.total.max

Change-Id: I96e0b7add4d331ed8bb267b3c9243d360c6e9903
Signed-off-by: Kenny Ho 
---
 Documentation/admin-guide/cgroup-v2.rst|  21 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c|   8 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |   6 +-
 drivers/gpu/drm/drm_gem.c  |  11 +-
 include/drm/drm_cgroup.h   |   7 +-
 include/drm/drm_gem.h  |   2 +-
 include/linux/cgroup_drm.h |   1 +
 kernel/cgroup/drm.c| 221 -
 8 files changed, 260 insertions(+), 17 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 4dc72339a9b6..e8fac2684179 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1919,6 +1919,27 @@ DRM Interface Files
 
 Total number of GEM buffers allocated.
 
+  drm.buffer.default
+   A read-only flat-keyed file which exists on the root cgroup.
+   Each entry is keyed by the drm device's major:minor.
+
+   Default limits on the total GEM buffer allocation in bytes.
+
+  drm.buffer.max
+   A read-write flat-keyed file which exists on all cgroups.  Each
+   entry is keyed by the drm device's major:minor.
+
+   Per device limits on the total GEM buffer allocation in bytes.
+   This is a hard limit.  Attempts to allocate beyond the cgroup
+   limit will result in ENOMEM.  Shorthand understood by memparse
+   (such as k, m, g) can be used.
+
+   Set allocation limit for /dev/dri/card1 to 1GB
+   echo "226:1 1g" > drm.buffer.total.max
+
+   Set allocation limit for /dev/dri/card0 to 512MB
+   echo "226:0 512m" > drm.buffer.total.max
+
 GEM Buffer Ownership
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c0bbd3aa0558..163a4fbf0611 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1395,6 +1395,12 @@ amdgpu_get_crtc_scanout_position(struct drm_device *dev, 
unsigned int pipe,
  stime, etime, mode);
 }
 
+static void amdgpu_drmcg_custom_init(struct drm_device *dev,
+   struct drmcg_props *props)
+{
+   props->limit_enforced = true;
+}
+
 static struct drm_driver kms_driver = {
.driver_features =
DRIVER_USE_AGP | DRIVER_ATOMIC |
@@ -1431,6 +1437,8 @@ static struct drm_driver kms_driver = {
.gem_prime_vunmap = amdgpu_gem_prime_vunmap,
.gem_prime_mmap = amdgpu_gem_prime_mmap,
 
+   .drmcg_custom_init = amdgpu_drmcg_custom_init,
+
.name = DRIVER_NAME,
.desc = DRIVER_DESC,
.date = DRIVER_DATE,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 989b7b55cb2e..b1bd66be3e1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include <drm/drm_cgroup.h>
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
@@ -454,7 +455,10 @@ static int amdgpu_bo_do_create(struct amdgpu_device *adev,
bo = kzalloc(sizeof(struct amdgpu_bo), GFP_KERNEL);
if (bo == NULL)

[PATCH RFC v4 02/16] cgroup: Introduce cgroup for drm subsystem

2019-08-29 Thread Kenny Ho
With the increased importance of machine learning, data science and
other cloud-based applications, GPUs are already in production use in
data centers today.  Existing GPU resource management is very
coarse-grained, however, as sysadmins are only able to distribute
workload on a
per-GPU basis.  An alternative is to use GPU virtualization (with or
without SRIOV) but it generally acts on the entire GPU instead of the
specific resources in a GPU.  With a drm cgroup controller, we can
enable alternate, fine-grain, sub-GPU resource management (in addition
to what may be available via GPU virtualization.)
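
Typical usage of the reference helpers introduced below, as a sketch of
the charge paths used later in the series::

	struct drmcg *drmcg = drmcg_get(current);

	/* ... account the resource against drmcg and its ancestors ... */

	drmcg_put(drmcg);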

Change-Id: I6830d3990f63f0c13abeba29b1d330cf28882831
Signed-off-by: Kenny Ho 
---
 Documentation/admin-guide/cgroup-v2.rst | 18 -
 Documentation/cgroup-v1/drm.rst |  1 +
 include/linux/cgroup_drm.h  | 92 +
 include/linux/cgroup_subsys.h   |  4 ++
 init/Kconfig|  5 ++
 kernel/cgroup/Makefile  |  1 +
 kernel/cgroup/drm.c | 42 +++
 7 files changed, 161 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/cgroup-v1/drm.rst
 create mode 100644 include/linux/cgroup_drm.h
 create mode 100644 kernel/cgroup/drm.c

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 88e746074252..2936423a3fd5 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -61,8 +61,10 @@ v1 is available under Documentation/cgroup-v1/.
  5-6. Device
  5-7. RDMA
5-7-1. RDMA Interface Files
- 5-8. Misc
-   5-8-1. perf_event
+ 5-8. DRM
+   5-8-1. DRM Interface Files
+ 5-9. Misc
+   5-9-1. perf_event
  5-N. Non-normative information
5-N-1. CPU controller root cgroup process behaviour
5-N-2. IO controller root cgroup process behaviour
@@ -1889,6 +1891,18 @@ RDMA Interface Files
  ocrdma1 hca_handle=1 hca_object=23
 
 
+DRM
+---
+
+The "drm" controller regulates the distribution and accounting of
+of DRM (Direct Rendering Manager) and GPU-related resources.
+
+DRM Interface Files
+~~~~~~~~~~~~~~~~~~~
+
+TODO
+
+
 Misc
 
 
diff --git a/Documentation/cgroup-v1/drm.rst b/Documentation/cgroup-v1/drm.rst
new file mode 100644
index ..5f5658e1f5ed
--- /dev/null
+++ b/Documentation/cgroup-v1/drm.rst
@@ -0,0 +1 @@
+Please see ../cgroup-v2.rst for details
diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
new file mode 100644
index ..971166f9dd78
--- /dev/null
+++ b/include/linux/cgroup_drm.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: MIT
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ */
+#ifndef _CGROUP_DRM_H
+#define _CGROUP_DRM_H
+
+#ifdef CONFIG_CGROUP_DRM
+
+#include 
+
+/**
+ * The DRM cgroup controller data structure.
+ */
+struct drmcg {
+   struct cgroup_subsys_state  css;
+};
+
+/**
+ * css_to_drmcg - get the corresponding drmcg ref from a cgroup_subsys_state
+ * @css: the target cgroup_subsys_state
+ *
+ * Return: DRM cgroup that contains the @css
+ */
+static inline struct drmcg *css_to_drmcg(struct cgroup_subsys_state *css)
+{
+   return css ? container_of(css, struct drmcg, css) : NULL;
+}
+
+/**
+ * drmcg_get - get the drmcg reference that a task belongs to
+ * @task: the target task
+ *
+ * This increases the reference count of the css that the @task belongs to
+ *
+ * Return: reference to the DRM cgroup the task belongs to
+ */
+static inline struct drmcg *drmcg_get(struct task_struct *task)
+{
+   return css_to_drmcg(task_get_css(task, drm_cgrp_id));
+}
+
+/**
+ * drmcg_put - put a drmcg reference
+ * @drmcg: the target drmcg
+ *
+ * Put a reference obtained via drmcg_get
+ */
+static inline void drmcg_put(struct drmcg *drmcg)
+{
+   if (drmcg)
+   css_put(&drmcg->css);
+}
+
+/**
+ * drmcg_parent - find the parent of a drm cgroup
+ * @cg: the target drmcg
+ *
+ * This does not increase the reference count of the parent cgroup
+ *
+ * Return: parent DRM cgroup of @cg
+ */
+static inline struct drmcg *drmcg_parent(struct drmcg *cg)
+{
+   return css_to_drmcg(cg->css.parent);
+}
+
+#else /* CONFIG_CGROUP_DRM */
+
+struct drmcg {
+};
+
+static inline struct drmcg *css_to_drmcg(struct cgroup_subsys_state *css)
+{
+   return NULL;
+}
+
+static inline struct drmcg *drmcg_get(struct task_struct *task)
+{
+   return NULL;
+}
+
+static inline void drmcg_put(struct drmcg *drmcg)
+{
+}
+
+static inline struct drmcg *drmcg_parent(struct drmcg *cg)
+{
+   return NULL;
+}
+
+#endif /* CONFIG_CGROUP_DRM */
+#endif /* _CGROUP_DRM_H */
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..ddedad809e8b 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,10 @@ SUBSYS(pids)
 SUBSYS(rdma)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_DRM)
+SUBSYS(drm)
+#endif
+
 /*
  * The following subsystems are not 

[PATCH RFC v4 10/16] drm, cgroup: Add TTM buffer peak usage stats

2019-08-29 Thread Kenny Ho
drm.memory.peak.stats
A read-only nested-keyed file which exists on all cgroups.
Each entry is keyed by the drm device's major:minor.  The
following nested keys are defined.

  ======  ==============================================
  system  Peak host memory used
  tt      Peak host memory used by the device (GTT/GART)
  vram    Peak Video RAM used by the drm device
  priv    Other drm device specific memory peak usage
  ======  ==============================================

Reading returns the following::

226:0 system=0 tt=0 vram=0 priv=0
226:1 system=0 tt=9035776 vram=17768448 priv=16809984
226:2 system=0 tt=9035776 vram=17768448 priv=16809984

Change-Id: I986e44533848f66411465bdd52105e78105a709a
Signed-off-by: Kenny Ho 
---
 include/linux/cgroup_drm.h |  2 ++
 kernel/cgroup/drm.c| 19 +++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h
index 4c2794c9333d..9579e2a0b71d 100644
--- a/include/linux/cgroup_drm.h
+++ b/include/linux/cgroup_drm.h
@@ -20,6 +20,7 @@ enum drmcg_res_type {
DRMCG_TYPE_BO_COUNT,
DRMCG_TYPE_MEM,
DRMCG_TYPE_MEM_EVICT,
+   DRMCG_TYPE_MEM_PEAK,
__DRMCG_TYPE_LAST,
 };
 
@@ -37,6 +38,7 @@ struct drmcg_device_resource {
s64 bo_stats_count_allocated;
 
s64 mem_stats[TTM_PL_PRIV+1];
+   s64 mem_peaks[TTM_PL_PRIV+1];
s64 mem_stats_evict;
 };
 
diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c
index 4960a8d1e8f4..899dc44722c3 100644
--- a/kernel/cgroup/drm.c
+++ b/kernel/cgroup/drm.c
@@ -162,6 +162,13 @@ static void drmcg_print_stats(struct drmcg_device_resource 
*ddr,
case DRMCG_TYPE_MEM_EVICT:
seq_printf(sf, "%lld\n", ddr->mem_stats_evict);
break;
+   case DRMCG_TYPE_MEM_PEAK:
+   for (i = 0; i <= TTM_PL_PRIV; i++) {
+   seq_printf(sf, "%s=%lld ", ttm_placement_names[i],
+   ddr->mem_peaks[i]);
+   }
+   seq_puts(sf, "\n");
+   break;
default:
seq_puts(sf, "\n");
break;
@@ -443,6 +450,12 @@ struct cftype files[] = {
.private = DRMCG_CTF_PRIV(DRMCG_TYPE_MEM_EVICT,
DRMCG_FTYPE_STATS),
},
+   {
+   .name = "memory.peaks.stats",
+   .seq_show = drmcg_seq_show,
+   .private = DRMCG_CTF_PRIV(DRMCG_TYPE_MEM_PEAK,
+   DRMCG_FTYPE_STATS),
+   },
{ } /* terminate */
 };
 
@@ -617,6 +630,8 @@ void drmcg_chg_mem(struct ttm_buffer_object *tbo)
for ( ; drmcg != NULL; drmcg = drmcg_parent(drmcg)) {
ddr = drmcg->dev_resources[devIdx];
ddr->mem_stats[mem_type] += size;
+   ddr->mem_peaks[mem_type] = max(ddr->mem_peaks[mem_type],
+   ddr->mem_stats[mem_type]);
}
mutex_unlock(>drmcg_mutex);
 }
@@ -668,6 +683,10 @@ void drmcg_mem_track_move(struct ttm_buffer_object 
*old_bo, bool evict,
ddr->mem_stats[old_mem_type] -= move_in_bytes;
ddr->mem_stats[new_mem_type] += move_in_bytes;
 
+   ddr->mem_peaks[new_mem_type] = max(
+   ddr->mem_peaks[new_mem_type],
+   ddr->mem_stats[new_mem_type]);
+
if (evict)
ddr->mem_stats_evict++;
}
-- 
2.22.0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx