[PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset

2017-09-18 Thread Monk Liu
GPU reset will require all hw doing hw_init thus
ucode_init_bo will be invoked again, which lead to
memory leak

skip the fw_buf allocation during sriov gpu reset to avoid
memory leak.

Change-Id: I31131eda1bd45ea2f5bdc50c5da5fc5a9fe9027d
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 64 +++
 2 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6ff2959..3d0c633 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1185,6 +1185,9 @@ struct amdgpu_firmware {
 
/* gpu info firmware data pointer */
const struct firmware *gpu_info_fw;
+
+   void *fw_buf_ptr;
+   uint64_t fw_buf_mc;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
index f306374..6564902 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
@@ -360,8 +360,6 @@ static int amdgpu_ucode_patch_jt(struct 
amdgpu_firmware_info *ucode,
 int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
 {
struct amdgpu_bo **bo = &adev->firmware.fw_buf;
-   uint64_t fw_mc_addr;
-   void *fw_buf_ptr = NULL;
uint64_t fw_offset = 0;
int i, err;
struct amdgpu_firmware_info *ucode = NULL;
@@ -372,37 +370,39 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
return 0;
}
 
-   err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
-   amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM 
: AMDGPU_GEM_DOMAIN_GTT,
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
-   NULL, NULL, 0, bo);
-   if (err) {
-   dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", 
err);
-   goto failed;
-   }
+   if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
+   err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, 
true,
+   amdgpu_sriov_vf(adev) ? 
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
+   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
+   NULL, NULL, 0, bo);
+   if (err) {
+   dev_err(adev->dev, "(%d) Firmware buffer allocate 
failed\n", err);
+   goto failed;
+   }
 
-   err = amdgpu_bo_reserve(*bo, false);
-   if (err) {
-   dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", 
err);
-   goto failed_reserve;
-   }
+   err = amdgpu_bo_reserve(*bo, false);
+   if (err) {
+   dev_err(adev->dev, "(%d) Firmware buffer reserve 
failed\n", err);
+   goto failed_reserve;
+   }
 
-   err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM 
: AMDGPU_GEM_DOMAIN_GTT,
-   &fw_mc_addr);
-   if (err) {
-   dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
-   goto failed_pin;
-   }
+   err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? 
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
+   &adev->firmware.fw_buf_mc);
+   if (err) {
+   dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", 
err);
+   goto failed_pin;
+   }
 
-   err = amdgpu_bo_kmap(*bo, &fw_buf_ptr);
-   if (err) {
-   dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
-   goto failed_kmap;
-   }
+   err = amdgpu_bo_kmap(*bo, &adev->firmware.fw_buf_ptr);
+   if (err) {
+   dev_err(adev->dev, "(%d) Firmware buffer kmap 
failed\n", err);
+   goto failed_kmap;
+   }
 
-   amdgpu_bo_unreserve(*bo);
+   amdgpu_bo_unreserve(*bo);
+   }
 
-   memset(fw_buf_ptr, 0, adev->firmware.fw_size);
+   memset(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size);
 
/*
 * if SMU loaded firmware, it needn't add SMC, UVD, and VCE
@@ -421,14 +421,14 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
ucode = &adev->firmware.ucode[i];
if (ucode->fw) {
header = (const struct common_firmware_header 
*)ucode->fw->data;
-   amdgpu_ucode_init_single_fw(adev, ucode, fw_mc_addr + 
fw_offset,
-   (void *)((uint8_t 
*)fw_buf_ptr + fw_offset));
+   amdgpu_ucode_init_single_fw(adev, ucode, 
adev->firmware.fw_buf_mc + fw_offset,
+   adev->firmwa

Re: [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset

2017-09-19 Thread Christian König

Am 19.09.2017 um 08:41 schrieb Monk Liu:

GPU reset will require all hw doing hw_init thus
ucode_init_bo will be invoked again, which lead to
memory leak

skip the fw_buf allocation during sriov gpu reset to avoid
memory leak.

Change-Id: I31131eda1bd45ea2f5bdc50c5da5fc5a9fe9027d
Signed-off-by: Monk Liu 


Acked-by: Christian König  for now.

But we should really clean this up and do the allocation during SW init.

Christian.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 64 +++
  2 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6ff2959..3d0c633 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1185,6 +1185,9 @@ struct amdgpu_firmware {
  
  	/* gpu info firmware data pointer */

const struct firmware *gpu_info_fw;
+
+   void *fw_buf_ptr;
+   uint64_t fw_buf_mc;
  };
  
  /*

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
index f306374..6564902 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
@@ -360,8 +360,6 @@ static int amdgpu_ucode_patch_jt(struct 
amdgpu_firmware_info *ucode,
  int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
  {
struct amdgpu_bo **bo = &adev->firmware.fw_buf;
-   uint64_t fw_mc_addr;
-   void *fw_buf_ptr = NULL;
uint64_t fw_offset = 0;
int i, err;
struct amdgpu_firmware_info *ucode = NULL;
@@ -372,37 +370,39 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
return 0;
}
  
-	err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,

-   amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM 
: AMDGPU_GEM_DOMAIN_GTT,
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
-   NULL, NULL, 0, bo);
-   if (err) {
-   dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", 
err);
-   goto failed;
-   }
+   if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
+   err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, 
true,
+   amdgpu_sriov_vf(adev) ? 
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
+   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
+   NULL, NULL, 0, bo);
+   if (err) {
+   dev_err(adev->dev, "(%d) Firmware buffer allocate 
failed\n", err);
+   goto failed;
+   }
  
-	err = amdgpu_bo_reserve(*bo, false);

-   if (err) {
-   dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", 
err);
-   goto failed_reserve;
-   }
+   err = amdgpu_bo_reserve(*bo, false);
+   if (err) {
+   dev_err(adev->dev, "(%d) Firmware buffer reserve 
failed\n", err);
+   goto failed_reserve;
+   }
  
-	err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,

-   &fw_mc_addr);
-   if (err) {
-   dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
-   goto failed_pin;
-   }
+   err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? 
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
+   &adev->firmware.fw_buf_mc);
+   if (err) {
+   dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", 
err);
+   goto failed_pin;
+   }
  
-	err = amdgpu_bo_kmap(*bo, &fw_buf_ptr);

-   if (err) {
-   dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
-   goto failed_kmap;
-   }
+   err = amdgpu_bo_kmap(*bo, &adev->firmware.fw_buf_ptr);
+   if (err) {
+   dev_err(adev->dev, "(%d) Firmware buffer kmap 
failed\n", err);
+   goto failed_kmap;
+   }
  
-	amdgpu_bo_unreserve(*bo);

+   amdgpu_bo_unreserve(*bo);
+   }
  
-	memset(fw_buf_ptr, 0, adev->firmware.fw_size);

+   memset(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size);
  
  	/*

 * if SMU loaded firmware, it needn't add SMC, UVD, and VCE
@@ -421,14 +421,14 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
ucode = &adev->firmware.ucode[i];
if (ucode->fw) {
header = (const struct common_firmware_header 
*)ucode->fw->data;
-   amdgpu_ucode_init_single_fw(adev, ucode, fw_mc_addr + 
fw_offset,
-   (void *)((uint8_t 
*)fw_buf_ptr + fw_offset));
+

RE: [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset

2017-09-19 Thread Liu, Monk
No ,we cannot do this in SW init, because PSP need all other component finish 
their HW init and get the fw_size, before it can call ucode_init, so if we put 
this in SW init, the fw_size is still 0


BR Monk

-Original Message-
From: Christian König [mailto:ckoenig.leichtzumer...@gmail.com] 
Sent: 2017年9月19日 19:44
To: Liu, Monk ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset

Am 19.09.2017 um 08:41 schrieb Monk Liu:
> GPU reset will require all hw doing hw_init thus ucode_init_bo will be 
> invoked again, which lead to memory leak
>
> skip the fw_buf allocation during sriov gpu reset to avoid memory 
> leak.
>
> Change-Id: I31131eda1bd45ea2f5bdc50c5da5fc5a9fe9027d
> Signed-off-by: Monk Liu 

Acked-by: Christian König  for now.

But we should really clean this up and do the allocation during SW init.

Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 64 
> +++
>   2 files changed, 35 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6ff2959..3d0c633 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1185,6 +1185,9 @@ struct amdgpu_firmware {
>   
>   /* gpu info firmware data pointer */
>   const struct firmware *gpu_info_fw;
> +
> + void *fw_buf_ptr;
> + uint64_t fw_buf_mc;
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> index f306374..6564902 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> @@ -360,8 +360,6 @@ static int amdgpu_ucode_patch_jt(struct 
> amdgpu_firmware_info *ucode,
>   int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   {
>   struct amdgpu_bo **bo = &adev->firmware.fw_buf;
> - uint64_t fw_mc_addr;
> - void *fw_buf_ptr = NULL;
>   uint64_t fw_offset = 0;
>   int i, err;
>   struct amdgpu_firmware_info *ucode = NULL; @@ -372,37 +370,39 @@ 
> int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   return 0;
>   }
>   
> - err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
> - amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM 
> : AMDGPU_GEM_DOMAIN_GTT,
> - AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> - NULL, NULL, 0, bo);
> - if (err) {
> - dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", 
> err);
> - goto failed;
> - }
> + if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
> + err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, 
> true,
> + amdgpu_sriov_vf(adev) ? 
> AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> + AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> + NULL, NULL, 0, bo);
> + if (err) {
> + dev_err(adev->dev, "(%d) Firmware buffer allocate 
> failed\n", err);
> + goto failed;
> + }
>   
> - err = amdgpu_bo_reserve(*bo, false);
> - if (err) {
> - dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", 
> err);
> - goto failed_reserve;
> - }
> + err = amdgpu_bo_reserve(*bo, false);
> + if (err) {
> + dev_err(adev->dev, "(%d) Firmware buffer reserve 
> failed\n", err);
> + goto failed_reserve;
> + }
>   
> - err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM 
> : AMDGPU_GEM_DOMAIN_GTT,
> - &fw_mc_addr);
> - if (err) {
> - dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
> - goto failed_pin;
> - }
> + err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? 
> AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> + &adev->firmware.fw_buf_mc);
> + if (err) {
> + dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", 
> err);
> + goto failed_pin;
> + }
>   
> - err = amdgpu_bo_kmap(*bo, &fw_buf_ptr);
> - if (err) {
> - dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
> - goto failed_