RE: [PATCH] drm/amdkfd: Separate pinned BOs destruction from general routine

2021-10-14 Thread Yu, Lang
[AMD Official Use Only]



>-Original Message-
>From: Kuehling, Felix 
>Sent: Wednesday, October 13, 2021 11:25 PM
>To: Yu, Lang ; amd-gfx@lists.freedesktop.org
>Cc: Koenig, Christian ; Deucher, Alexander
>; Huang, Ray 
>Subject: Re: [PATCH] drm/amdkfd: Separate pinned BOs destruction from
>general routine
>
>Am 2021-10-11 um 4:58 a.m. schrieb Lang Yu:
>> Currently, all kfd BOs use same destruction routine. But pinned BOs
>> are not unpinned properly. Separate them from general routine.
>>
>> Signed-off-by: Lang Yu 
>
>I think the general idea is right. However, we need another safeguard for the
>signal BO, which is allocated by user mode and can be freed by user mode at
>any time. We can solve this in one of two ways:
>
> 1. Add special handling for the signal BO in
>kfd_ioctl_free_memory_of_gpu to kunmap the BO and make sure the
>signal handling code is aware of it
> 2. Fail kfd_ioctl_free_memory_of_gpu for signal BOs and only allow them
>to be destroyed at process termination
>
>I think #2 is easier, and is consistent with what current user mode does.

Will add a safeguard to prevent that, according to #2.
 
>
>A few more comment inline ...
>
>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   2 +
>>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  10 ++
>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   3 +
>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   3 +
>>  drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 125 ++---
>-
>>  5 files changed, 114 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 69de31754907..751557af09bb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -279,6 +279,8 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>>  struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);  int
>> amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
>>  struct kgd_mem *mem, void **kptr, uint64_t *size);
>> +void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct
>kgd_dev
>> +*kgd, struct kgd_mem *mem);
>> +
>>  int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
>>  struct dma_fence **ef);
>>  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, diff
>> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index 054c1a224def..6acc78b02bdc 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1871,6 +1871,16 @@ int
>amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
>>  return ret;
>>  }
>>
>> +void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct
>kgd_dev
>> +*kgd, struct kgd_mem *mem) {
>> +struct amdgpu_bo *bo = mem->bo;
>> +
>> +amdgpu_bo_reserve(bo, true);
>> +amdgpu_bo_kunmap(bo);
>> +amdgpu_bo_unpin(bo);
>> +amdgpu_bo_unreserve(bo);
>> +}
>> +
>>  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
>>struct kfd_vm_fault_info *mem)
>{ diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index f1e7edeb4e6b..0db48ac10fde 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -1051,6 +1051,9 @@ static int kfd_ioctl_create_event(struct file *filp,
>struct kfd_process *p,
>>  pr_err("Failed to set event page\n");
>
>Need to kunmap the signal BO here.

Will kunmap it here.

>
>>  return err;
>>  }
>> +
>> +p->signal_handle = args->event_page_offset;
>> +
>>  }
>>
>>  err = kfd_event_create(filp, p, args->event_type, diff --git
>> a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 6d8f9bb2d905..30f08f1606bb 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -608,12 +608,14 @@ struct qcm_process_device {
>>  uint32_t sh_hidden_private_base;
>>
>>  /* CWSR memory */
>> +struct kgd_mem *cwsr_mem;
>>  void *cwsr_kaddr;
>>  uint64_t cwsr_base;
>>  uint64_t tba_addr;
>>  uint64_t tma_addr;
>>
>>  /* IB memory */
>> +struct kgd_mem *ib_mem;
>>  uint64_t ib_base;
>>  void *ib_kaddr;
>>
>> @@ -808,6 +810,7 @@ struct kfd_process {
>>  /* Event ID allocator and lookup */
>>  struct idr event_idr;
>>  /* Event page */
>> +u64 signal_handle;
>>  struct kfd_signal_page *signal_page;
>>  size_t signal_mapped_size;
>>  size_t signal_event_count;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 21ec8a18cad2..c024f2e2efaa 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drive

Re: bf756fb833cb ("drm/amdgpu: add missing cleanups for Polaris12 UVD/VCE on suspend")

2021-10-14 Thread Borislav Petkov
On Thu, Oct 14, 2021 at 02:02:48AM +, Quan, Evan wrote:
> [Quan, Evan] Yes, but not(apply them) at the same time. One by one as you did 
> before.
> - try the patch1 first

Ok, first patch worked fine.

> - undo the changes of patch1 and try patch2

Did that, worked fine too except after the first resume cycle, the video
didn't continue playing.

Then I restarted the video and did a couple more suspend cycles to see
if it would not continue again. In the subsequent tries it would resume
fine and the video would continue playing too.

So I'm going to chalk that single case of halted video with the second
patch to a resume glitch or so.

Btw, I don't have pm-suspend on that box but I did suspend to RAM
assuming this is what you wanted, which is done as root with two
commands:

# echo "suspend" > /sys/power/disk
# echo "mem" > /sys/power/state

If you want me to do more extensive testing, just shoot.

HTH.

-- 
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette


[pull] amdgpu, amdkfd drm-next-5.16

2021-10-14 Thread Alex Deucher
Hi Dave, Daniel,

Bug fixes for 5.16.

The following changes since commit 1176d15f0f6e556d54ced510ac4a91694960332b:

  Merge tag 'drm-intel-gt-next-2021-10-08' of 
git://anongit.freedesktop.org/drm/drm-intel into drm-next (2021-10-11 18:09:39 
+1000)

are available in the Git repository at:

  https://gitlab.freedesktop.org/agd5f/linux.git 
tags/amd-drm-next-5.16-2021-10-14

for you to fetch changes up to 48737ac4d70faffeb516e2a9847e24f9a7eee05f:

  drm/amdgpu/psp: add some missing cases to 
psp_check_pmfw_centralized_cstate_management (2021-10-13 22:51:41 -0400)


amd-drm-next-5.16-2021-10-14:

amdgpu:
- Cyan Skillfish fixes
- HDP flush fixes for asics with more than 2 SDMA instances
- IP discovery enumeration fixes
- Display fixes
- RAS fixes for Aldebaran
- Power limit fixes
- IOMMU fixes for Raven
- Fix potential out of bounds write in debugfs

amdkfd:
- SVM fixes


Alex Deucher (5):
  drm/amdgpu/nbio7.4: don't use GPU_HDP_FLUSH bit 12
  drm/amdgpu/nbio2.3: don't use GPU_HDP_FLUSH bit 12
  drm/amdgpu/smu11: fix firmware version check for vangogh
  drm/amdgpu/swsmu: fix is_support_sw_smu() for VEGA20
  drm/amdgpu/psp: add some missing cases to 
psp_check_pmfw_centralized_cstate_management

Alex Sierra (2):
  drm/amdkfd: avoid conflicting address mappings
  amd/amdkfd: remove svms declaration to avoid werror

Aurabindo Pillai (1):
  drm/amd/display: fix null pointer deref when plugging in display

Darren Powell (2):
  amdgpu/pm: (v2) add limit_type to (pptable_funcs)->set_power_limit 
signature
  drm/amd/pm: Fix incorrect power limit readback in smu11 if POWER_SOURCE_DC

Harry Wentland (1):
  MAINTAINERS: Add Siqueira for AMD DC

Lang Yu (2):
  drm/amdgpu: query default sclk from smu for cyan_skillfish
  drm/amdgpu: enable display for cyan skillfish

Mukul Joshi (2):
  drm/amdgpu: Enable RAS error injection after mode2 reset on Aldebaran
  drm/amdgpu: Fix RAS page retirement with mode2 reset on Aldebaran

Nicholas Kazlauskas (2):
  drm/amd/display: Enable PSR by default on newer DCN
  drm/amd/display: Fix surface optimization regression on Carrizo

Philip Yang (3):
  drm/amdkfd: ratelimited svm debug messages
  drm/amdkfd: handle svm partial migration cpages 0
  drm/amdkfd: unregistered svm range not overlap with TTM range

Simon Ser (1):
  amd/display: check cursor plane matches underlying plane

Thelford Williams (1):
  drm/amdgpu: fix out of bounds write

Yifan Zhang (4):
  drm/amdkfd: export svm_range_list_lock_and_flush_work
  drm/amdkfd: fix KFDSVMRangeTest.PartialUnmapSysMemTest fails
  drm/amdkfd: fix boot failure when iommu is disabled in Picasso.
  drm/amdkfd: fix resume error when iommu disabled in Picasso

 MAINTAINERS|   1 +
 drivers/gpu/drm/amd/amdgpu/aldebaran.c |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   4 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c  |   8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c|   3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c|  33 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c|   4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h|   2 +-
 drivers/gpu/drm/amd/amdgpu/nbio_v2_3.c |  31 
 drivers/gpu/drm/amd/amdgpu/nbio_v2_3.h |   1 +
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c |  21 ++-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  17 ++
 drivers/gpu/drm/amd/amdkfd/kfd_device.c|   4 +
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c   | 120 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 185 ++---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |   1 +
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c  |  66 ++--
 .../drm/amd/display/amdgpu_dm/amdgpu_dm_debugfs.c  |   2 +-
 drivers/gpu/drm/amd/display/dc/core/dc.c   |  15 +-
 drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c |   3 +-
 drivers/gpu/drm/amd/include/amd_shared.h   |   5 +-
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h|   4 +-
 drivers/gpu/drm/amd/pm/inc/smu_v11_0.h |   4 +-
 drivers/gpu/drm/amd/pm/inc/smu_v13_0.h |   4 +-
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c  |   9 +-
 .../drm/amd/pm/swsmu/smu11/cyan_skillfish_ppt.c|  17 +-
 drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c |  20 ++-
 drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c   |   7 +-
 drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c |   6 +-
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c |  11 +-
 30 files changed, 458 insertions(+), 152 deletions(-)


Re: [PATCH v5] amd/display: only require overlay plane to cover whole CRTC on ChromeOS

2021-10-14 Thread Simon Ser
On Tuesday, October 12th, 2021 at 23:03, Alex Deucher  
wrote:

> > > @Harry Wentland, @Simon Ser Do you have a preference on whether we
> > > apply this patch or revert ddab8bd788f5? I'm fine with either.

I'd prefer to revert because (1) the ChromeOS team seems to be okay with that
(2) they can remove it more easily once they have fixed their userspace and
(3) this avoids adding workarounds in the kernel.

Will send a patch for this soon!


[PATCH] amd/display: remove ChromeOS workaround

2021-10-14 Thread Simon Ser
This reverts commits ddab8bd788f5 ("drm/amd/display: Fix two cursor duplication
when using overlay") and e7d9560aeae5 ("Revert "drm/amd/display: Fix overlay
validation by considering cursors"").

tl;dr ChromeOS uses the atomic interface for everything except the cursor. This
is incorrect and forces amdgpu to disable some hardware features. Let's revert
the ChromeOS-specific workaround in mainline and allow the Chrome team to keep
it internally in their own tree.

See [1] for more details. This patch is an alternative to [2], which added
ChromeOS detection.

[1]: 
https://lore.kernel.org/amd-gfx/JIQ_93_cHcshiIDsrMU1huBzx9P9LVQxucx8hQArpQu7Wk5DrCl_vTXj_Q20m_L-8C8A5dSpNcSJ8ehfcCrsQpfB5QG_Spn14EYkH9chtg0=@emersion.fr/
[2]: 
https://lore.kernel.org/amd-gfx/20211011151609.452132-1-cont...@emersion.fr/

Signed-off-by: Simon Ser 
Cc: Alex Deucher 
Cc: Harry Wentland 
Cc: Nicholas Kazlauskas 
Cc: Bas Nieuwenhuizen 
Cc: Rodrigo Siqueira 
Cc: Sean Paul 
Fixes: ddab8bd788f5 ("drm/amd/display: Fix two cursor duplication when using 
overlay")
Fixes: e7d9560aeae5 ("Revert "drm/amd/display: Fix overlay validation by 
considering cursors"")
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 51 ---
 1 file changed, 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 20065a145851..014c5a9fe461 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -10628,53 +10628,6 @@ static int add_affected_mst_dsc_crtcs(struct 
drm_atomic_state *state, struct drm
 }
 #endif
 
-static int validate_overlay(struct drm_atomic_state *state)
-{
-   int i;
-   struct drm_plane *plane;
-   struct drm_plane_state *new_plane_state;
-   struct drm_plane_state *primary_state, *overlay_state = NULL;
-
-   /* Check if primary plane is contained inside overlay */
-   for_each_new_plane_in_state_reverse(state, plane, new_plane_state, i) {
-   if (plane->type == DRM_PLANE_TYPE_OVERLAY) {
-   if (drm_atomic_plane_disabling(plane->state, 
new_plane_state))
-   return 0;
-
-   overlay_state = new_plane_state;
-   continue;
-   }
-   }
-
-   /* check if we're making changes to the overlay plane */
-   if (!overlay_state)
-   return 0;
-
-   /* check if overlay plane is enabled */
-   if (!overlay_state->crtc)
-   return 0;
-
-   /* find the primary plane for the CRTC that the overlay is enabled on */
-   primary_state = drm_atomic_get_plane_state(state, 
overlay_state->crtc->primary);
-   if (IS_ERR(primary_state))
-   return PTR_ERR(primary_state);
-
-   /* check if primary plane is enabled */
-   if (!primary_state->crtc)
-   return 0;
-
-   /* Perform the bounds check to ensure the overlay plane covers the 
primary */
-   if (primary_state->crtc_x < overlay_state->crtc_x ||
-   primary_state->crtc_y < overlay_state->crtc_y ||
-   primary_state->crtc_x + primary_state->crtc_w > 
overlay_state->crtc_x + overlay_state->crtc_w ||
-   primary_state->crtc_y + primary_state->crtc_h > 
overlay_state->crtc_y + overlay_state->crtc_h) {
-   DRM_DEBUG_ATOMIC("Overlay plane is enabled with hardware cursor 
but does not fully cover primary plane\n");
-   return -EINVAL;
-   }
-
-   return 0;
-}
-
 /**
  * amdgpu_dm_atomic_check() - Atomic check implementation for AMDgpu DM.
  * @dev: The DRM device
@@ -10856,10 +10809,6 @@ static int amdgpu_dm_atomic_check(struct drm_device 
*dev,
goto fail;
}
 
-   ret = validate_overlay(state);
-   if (ret)
-   goto fail;
-
/* Add new/modified planes */
for_each_oldnew_plane_in_state_reverse(state, plane, old_plane_state, 
new_plane_state, i) {
ret = dm_update_plane_state(dc, state, plane,
-- 
2.33.0




[PATCH v1 1/2] ext4/xfs: add page refcount helper

2021-10-14 Thread Alex Sierra
From: Ralph Campbell 

There are several places where ZONE_DEVICE struct pages assume a reference
count == 1 means the page is idle and free. Instead of open coding this,
add a helper function to hide this detail.

Signed-off-by: Ralph Campbell 
Signed-off-by: Alex Sierra 
Reviewed-by: Christoph Hellwig 
Acked-by: Theodore Ts'o 
Acked-by: Darrick J. Wong 
---
v3:
[AS]: rename dax_layout_is_idle_page func to dax_page_unused

v4:
[AS]: This ref count functionality was missing on fuse/dax.c.
---
 fs/dax.c|  4 ++--
 fs/ext4/inode.c |  5 +
 fs/fuse/dax.c   |  4 +---
 fs/xfs/xfs_file.c   |  4 +---
 include/linux/dax.h | 10 ++
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 62352cbcf0f4..c387d09e3e5a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -369,7 +369,7 @@ static void dax_disassociate_entry(void *entry, struct 
address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
 
-   WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+   WARN_ON_ONCE(trunc && !dax_page_unused(page));
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
page->mapping = NULL;
page->index = 0;
@@ -383,7 +383,7 @@ static struct page *dax_busy_page(void *entry)
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
 
-   if (page_ref_count(page) > 1)
+   if (!dax_page_unused(page))
return page;
}
return NULL;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fe6045a46599..05ffe6875cb1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3971,10 +3971,7 @@ int ext4_break_layouts(struct inode *inode)
if (!page)
return 0;
 
-   error = ___wait_var_event(&page->_refcount,
-   atomic_read(&page->_refcount) == 1,
-   TASK_INTERRUPTIBLE, 0, 0,
-   ext4_wait_dax_page(ei));
+   error = dax_wait_page(ei, page, ext4_wait_dax_page);
} while (error == 0);
 
return error;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index ff99ab2a3c43..2b1f190ba78a 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -677,9 +677,7 @@ static int __fuse_dax_break_layouts(struct inode *inode, 
bool *retry,
return 0;
 
*retry = true;
-   return ___wait_var_event(&page->_refcount,
-   atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-   0, 0, fuse_wait_dax_page(inode));
+   return dax_wait_page(inode, page, fuse_wait_dax_page);
 }
 
 /* dmap_end == 0 leads to unmapping of whole file */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 396ef36dcd0a..182057281086 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -840,9 +840,7 @@ xfs_break_dax_layouts(
return 0;
 
*retry = true;
-   return ___wait_var_event(&page->_refcount,
-   atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-   0, 0, xfs_wait_dax_page(inode));
+   return dax_wait_page(inode, page, xfs_wait_dax_page);
 }
 
 int
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b52f084aa643..8b5da1d60dbc 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -243,6 +243,16 @@ static inline bool dax_mapping(struct address_space 
*mapping)
return mapping->host && IS_DAX(mapping->host);
 }
 
+static inline bool dax_page_unused(struct page *page)
+{
+   return page_ref_count(page) == 1;
+}
+
+#define dax_wait_page(_inode, _page, _wait_cb) \
+   ___wait_var_event(&(_page)->_refcount,  \
+   dax_page_unused(_page), \
+   TASK_INTERRUPTIBLE, 0, 0, _wait_cb(_inode))
+
 #ifdef CONFIG_DEV_DAX_HMEM_DEVICES
 void hmem_register_device(int target_nid, struct resource *r);
 #else
-- 
2.32.0



[PATCH v1 0/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Alex Sierra
This patch cleans up ZONE_DEVICE page refcounting. Quoting Matthew
Wilcox, "it removes a ton of cruft from every call to put_page()"
This work was originally done by Ralph Campbell and submitted as RFC.
As of today, it has been ack by Theodore Ts'o / Darrick J. Wong, and
reviewed by Christoph Hellwig.
https://lore.kernel.org/linux-mm/20201001181715.17416-1-rcampb...@nvidia.com/

"MEMORY_DEVICE_COHERENT for CPU-accessible coherent device memory"
patchset depends on this patchset.
https://lore.kernel.org/linux-mm/20211012171247.2861-1-alex.sie...@amd.com/

Ralph Campbell (2):
  ext4/xfs: add page refcount helper
  mm: remove extra ZONE_DEVICE struct page refcount

 arch/powerpc/kvm/book3s_hv_uvmem.c |  2 +-
 drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
 fs/dax.c   |  8 +--
 fs/ext4/inode.c|  5 +-
 fs/fuse/dax.c  |  4 +-
 fs/xfs/xfs_file.c  |  4 +-
 include/linux/dax.h| 10 
 include/linux/memremap.h   |  7 +--
 include/linux/mm.h | 11 
 lib/test_hmm.c |  2 +-
 mm/internal.h  |  8 +++
 mm/memcontrol.c|  6 +--
 mm/memremap.c  | 69 +++---
 mm/migrate.c   |  5 --
 mm/page_alloc.c|  3 ++
 mm/swap.c  | 45 ++---
 16 files changed, 60 insertions(+), 131 deletions(-)

-- 
2.32.0



[PATCH v1 2/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Alex Sierra
From: Ralph Campbell 

ZONE_DEVICE struct pages have an extra reference count that complicates the
code for put_page() and several places in the kernel that need to check the
reference count to see that a page is not being used (gup, compaction,
migration, etc.). Clean up the code so the reference count doesn't need to
be treated specially for ZONE_DEVICE.

Signed-off-by: Ralph Campbell 
Signed-off-by: Alex Sierra 
Reviewed-by: Christoph Hellwig 
---
v2:
AS: merged this patch in linux 5.11 version

v5:
AS: add condition at try_grab_page to check for the zone device type, while
page ref counter is checked less/equal to zero. In case of device zone, pages
ref counter are initialized to zero.

v7:
AS: fix condition at try_grab_page added at v5, is invalid. It supposed
to fix xfstests/generic/413 test, however, there's a known issue on
this test where DAX mapped area DIO to non-DAX expect to fail.
https://patchwork.kernel.org/project/fstests/patch/1489463960-3579-1-git-send-email-xz...@redhat.com
This condition was removed after rebase over patch series
https://lore.kernel.org/r/20210813044133.1536842-4-jhubb...@nvidia.com
---
 arch/powerpc/kvm/book3s_hv_uvmem.c |  2 +-
 drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
 fs/dax.c   |  4 +-
 include/linux/dax.h|  2 +-
 include/linux/memremap.h   |  7 +--
 include/linux/mm.h | 11 
 lib/test_hmm.c |  2 +-
 mm/internal.h  |  8 +++
 mm/memcontrol.c|  6 +--
 mm/memremap.c  | 69 +++---
 mm/migrate.c   |  5 --
 mm/page_alloc.c|  3 ++
 mm/swap.c  | 45 ++---
 13 files changed, 46 insertions(+), 120 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 84e5a2dc8be5..acee67710620 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -711,7 +711,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long 
gpa, struct kvm *kvm)
 
dpage = pfn_to_page(uvmem_pfn);
dpage->zone_device_data = pvt;
-   get_page(dpage);
+   init_page_count(dpage);
lock_page(dpage);
return dpage;
 out_clear:
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 92987daa5e17..8bc7120e1216 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -324,7 +324,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
return NULL;
}
 
-   get_page(page);
+   init_page_count(page);
lock_page(page);
return page;
 }
diff --git a/fs/dax.c b/fs/dax.c
index c387d09e3e5a..1166630b7190 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -571,14 +571,14 @@ static void *grab_mapping_entry(struct xa_state *xas,
 
 /**
  * dax_layout_busy_page_range - find first pinned page in @mapping
- * @mapping: address space to scan for a page with ref count > 1
+ * @mapping: address space to scan for a page with ref count > 0
  * @start: Starting offset. Page containing 'start' is included.
  * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
  *   pages from 'start' till the end of file are included.
  *
  * DAX requires ZONE_DEVICE mapped pages. These pages are never
  * 'onlined' to the page allocator so they are considered idle when
- * page->count == 1. A filesystem uses this interface to determine if
+ * page->count == 0. A filesystem uses this interface to determine if
  * any page in the mapping is busy, i.e. for DMA, or other
  * get_user_pages() usages.
  *
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8b5da1d60dbc..05fc982ce153 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -245,7 +245,7 @@ static inline bool dax_mapping(struct address_space 
*mapping)
 
 static inline bool dax_page_unused(struct page *page)
 {
-   return page_ref_count(page) == 1;
+   return page_ref_count(page) == 0;
 }
 
 #define dax_wait_page(_inode, _page, _wait_cb) \
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 45a79da89c5f..77ff5fd0685f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -66,9 +66,10 @@ enum memory_type {
 
 struct dev_pagemap_ops {
/*
-* Called once the page refcount reaches 1.  (ZONE_DEVICE pages never
-* reach 0 refcount unless there is a refcount bug. This allows the
-* device driver to implement its own memory management.)
+* Called once the page refcount reaches 0. The reference count
+* should be reset to one with init_page_count(page) before reusing
+* the page. This allows the device driver to implement its own
+* memory management.
 */
void (*page_free)(str

Re: [PATCH] drm/amdkfd: Separate pinned BOs destruction from general routine

2021-10-14 Thread Christian König

Am 14.10.21 um 12:14 schrieb Yu, Lang:

[AMD Official Use Only]




-Original Message-
From: Kuehling, Felix 
Sent: Wednesday, October 13, 2021 11:25 PM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian ; Deucher, Alexander
; Huang, Ray 
Subject: Re: [PATCH] drm/amdkfd: Separate pinned BOs destruction from
general routine

Am 2021-10-11 um 4:58 a.m. schrieb Lang Yu:

Currently, all kfd BOs use same destruction routine. But pinned BOs
are not unpinned properly. Separate them from general routine.

Signed-off-by: Lang Yu 

I think the general idea is right. However, we need another safeguard for the
signal BO, which is allocated by user mode and can be freed by user mode at
any time. We can solve this in one of two ways:

1. Add special handling for the signal BO in
kfd_ioctl_free_memory_of_gpu to kunmap the BO and make sure the
signal handling code is aware of it
2. Fail kfd_ioctl_free_memory_of_gpu for signal BOs and only allow them
to be destroyed at process termination

I think #2 is easier, and is consistent with what current user mode does.

Will add safeguard to prevent that according to #2.


Well, exactly that are the things why upstream people insisted on this :)

Sounds like the best solution to me as well.

Thanks for taking care of this,
Christian.

  

A few more comment inline ...



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   2 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  10 ++
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   3 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   3 +
  drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 125 ++---

-

  5 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 69de31754907..751557af09bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -279,6 +279,8 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);  int
amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
struct kgd_mem *mem, void **kptr, uint64_t *size);
+void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct

kgd_dev

+*kgd, struct kgd_mem *mem);
+
  int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
struct dma_fence **ef);
  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, diff
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 054c1a224def..6acc78b02bdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1871,6 +1871,16 @@ int

amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,

return ret;
  }

+void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct

kgd_dev

+*kgd, struct kgd_mem *mem) {
+   struct amdgpu_bo *bo = mem->bo;
+
+   amdgpu_bo_reserve(bo, true);
+   amdgpu_bo_kunmap(bo);
+   amdgpu_bo_unpin(bo);
+   amdgpu_bo_unreserve(bo);
+}
+
  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
  struct kfd_vm_fault_info *mem)

{ diff --git

a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f1e7edeb4e6b..0db48ac10fde 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1051,6 +1051,9 @@ static int kfd_ioctl_create_event(struct file *filp,

struct kfd_process *p,

pr_err("Failed to set event page\n");

Need to kunmap the signal BO here.

Will kunmap it here.


return err;
}
+
+   p->signal_handle = args->event_page_offset;
+
}

err = kfd_event_create(filp, p, args->event_type, diff --git
a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6d8f9bb2d905..30f08f1606bb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -608,12 +608,14 @@ struct qcm_process_device {
uint32_t sh_hidden_private_base;

/* CWSR memory */
+   struct kgd_mem *cwsr_mem;
void *cwsr_kaddr;
uint64_t cwsr_base;
uint64_t tba_addr;
uint64_t tma_addr;

/* IB memory */
+   struct kgd_mem *ib_mem;
uint64_t ib_base;
void *ib_kaddr;

@@ -808,6 +810,7 @@ struct kfd_process {
/* Event ID allocator and lookup */
struct idr event_idr;
/* Event page */
+   u64 signal_handle;
struct kfd_signal_page *signal_page;
size_t signal_mapped_size;
size_t signal_event_count;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 21ec8a18cad2..c024f2e2efaa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pro

[PATCH] drm/amdgpu: Warn when bad pages approaches threshold

2021-10-14 Thread Kent Russell
Currently dmesg doesn't warn when the number of bad pages approaches the
threshold for page retirement. WARN when the number of bad pages
is at 90% or greater, for easier checks and planning, instead of waiting
until the GPU is full of bad pages.

Signed-off-by: Kent Russell 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 98732518543e..8270aad23a06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1077,6 +1077,16 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
if (res)
DRM_ERROR("RAS table incorrect checksum or error:%d\n",
  res);
+
+   /* threshold = -1 is automatic, threshold = 0 means that page
+* retirement is disabled.
+*/
+   if (amdgpu_bad_page_threshold > 0 &&
+   control->ras_num_recs >= 0 &&
+   control->ras_num_recs >= (amdgpu_bad_page_threshold * 9 / 
10))
+   DRM_WARN("RAS records:%u approaching threshold:%d",
+   control->ras_num_recs,
+   amdgpu_bad_page_threshold);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
   amdgpu_bad_page_threshold != 0) {
res = __verify_ras_table_checksum(control);
-- 
2.25.1



Re: [PATCH v1 2/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Ralph Campbell



On 10/14/21 10:06 AM, Jason Gunthorpe wrote:

On Thu, Oct 14, 2021 at 10:39:28AM -0500, Alex Sierra wrote:

From: Ralph Campbell 

ZONE_DEVICE struct pages have an extra reference count that complicates the
code for put_page() and several places in the kernel that need to check the
reference count to see that a page is not being used (gup, compaction,
migration, etc.). Clean up the code so the reference count doesn't need to
be treated specially for ZONE_DEVICE.

Signed-off-by: Ralph Campbell 
Signed-off-by: Alex Sierra 
Reviewed-by: Christoph Hellwig 
---
v2:
AS: merged this patch in linux 5.11 version

v5:
AS: add condition at try_grab_page to check for the zone device type, while
page ref counter is checked less/equal to zero. In case of device zone, pages
ref counter are initialized to zero.

v7:
AS: fix condition at try_grab_page added at v5, is invalid. It supposed
to fix xfstests/generic/413 test, however, there's a known issue on
this test where DAX mapped area DIO to non-DAX expect to fail.
https://patchwork.kernel.org/project/fstests/patch/1489463960-3579-1-git-send-email-xz...@redhat.com
This condition was removed after rebase over patch series
https://lore.kernel.org/r/20210813044133.1536842-4-jhubb...@nvidia.com
---
  arch/powerpc/kvm/book3s_hv_uvmem.c |  2 +-
  drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
  fs/dax.c   |  4 +-
  include/linux/dax.h|  2 +-
  include/linux/memremap.h   |  7 +--
  include/linux/mm.h | 11 
  lib/test_hmm.c |  2 +-
  mm/internal.h  |  8 +++
  mm/memcontrol.c|  6 +--
  mm/memremap.c  | 69 +++---
  mm/migrate.c   |  5 --
  mm/page_alloc.c|  3 ++
  mm/swap.c  | 45 ++---
  13 files changed, 46 insertions(+), 120 deletions(-)

Has anyone tested this with FSDAX? Does get_user_pages() on fsdax
backed memory still work?


I ran xfstests-dev using the kernel boot option to "fake" a pmem device
when I first posted this patch. The tests ran OK (or at least the same
tests passed with and without my patch). However, I could never really
convince myself the changes were "OK" for fsdax since I didn't understand
the code that well. I would still like to see a xfsdax maintainer or
expert ACK this change.

https://git.kernel.org/pub/scm/fs/xfs/xfstests-dev.git


What refcount value does the struct pages have when they are installed
in the PTEs? Remember a 0 refcount will make all the get_user_pages()
fail.

I'm looking at the call path starting in ext4_punch_hole() and I would
expect to see something manipulating the page ref count before
the ext4_break_layouts() call path gets to the dax_page_unused() test.

All I see is we go into unmap_mapping_pages() - that would normally
put back the page references held by PTEs but insert_pfn() has this:

if (pfn_t_devmap(pfn))
entry = pte_mkdevmap(pfn_t_pte(pfn, prot));

And:

static inline pte_t pte_mkdevmap(pte_t pte)
{
return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
}

Which interacts with vm_normal_page():

if (pte_devmap(pte))
return NULL;

To disable that refcounting?

So... I have a feeling this will have PTEs pointing to 0 refcount
pages? Unless FSDAX is !pte_devmap which is not the case, right?

This seems further confirmed by this comment:

/*
 * If we race get_user_pages_fast() here either we'll see the
 * elevated page count in the iteration and wait, or
 * get_user_pages_fast() will see that the page it took a reference
 * against is no longer mapped in the page tables and bail to the
 * get_user_pages() slow path.  The slow path is protected by
 * pte_lock() and pmd_lock(). New references are not taken without
 * holding those locks, and unmap_mapping_pages() will not zero the
 * pte or pmd without holding the respective lock, so we are
 * guaranteed to either see new references or prevent new
 * references from being established.
 */

Which seems to explain this scheme relies on unmap_mapping_pages() to
fence GUP_fast, not on GUP_fast observing 0 refcounts when it should
stop.

This seems like it would be properly fixed by using normal page
refcounting for PTEs - ie stop using special for these pages?

Does anyone know why devmap is pte_special anyhow?


+void free_zone_device_page(struct page *page)
+{
+   switch (page->pgmap->type) {
+   case MEMORY_DEVICE_PRIVATE:
+   free_device_page(page);
+   return;
+   case MEMORY_DEVICE_FS_DAX:
+   /* notify page idle */
+   wake_up_var(&page->_refcount);
+   return;

It is not for this series, but I wonder if we should just always call
ops->page_free 

[PATCH] drm/amdkfd: map gpu hive id to xgmi connected cpu

2021-10-14 Thread Jonathan Kim
ROCr needs to be able to identify all devices that have direct access to
fine grain memory, which should include CPUs that are connected to GPUs
over xGMI. The GPU hive ID can be mapped onto the CPU hive ID since the
CPU is part of the hive.

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 98cca5f2b27f..d04c48dfd72b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1296,6 +1296,27 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 
proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
 
+   adev = (struct amdgpu_device *)(gpu->kgd);
+
+   /* Include the CPU in xGMI hive if xGMI connected by assigning it the 
hive ID. */
+   if (gpu->hive_id && adev->gmc.xgmi.connected_to_cpu) {
+   int i;
+
+   for (i = 0; i < proximity_domain; i++) {
+   struct kfd_topology_device *to_dev =
+   
kfd_topology_device_by_proximity_domain(i);
+
+   if (!to_dev)
+   continue;
+
+   if (to_dev->gpu)
+   break;
+
+   to_dev->node_props.hive_id = gpu->hive_id;
+   break;
+   }
+   }
+
/* Check to see if this gpu device exists in the topology_device_list.
 * If so, assign the gpu to that device,
 * else create a Virtual CRAT for this gpu device and then parse that
@@ -1457,7 +1478,6 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev->node_props.max_waves_per_simd = 10;
}
 
-   adev = (struct amdgpu_device *)(dev->gpu->kgd);
/* kfd only concerns sram ecc on GFX and HBM ecc on UMC */
dev->node_props.capability |=
((adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
-- 
2.25.1



Re: [PATCH] drm/amdkfd: map gpu hive id to xgmi connected cpu

2021-10-14 Thread Felix Kuehling
Am 2021-10-14 um 1:44 p.m. schrieb Jonathan Kim:
> ROCr needs to be able to identify all devices that have direct access to
> fine grain memory, which should include CPUs that are connected to GPUs
> over xGMI. The GPU hive ID can be mapped onto the CPU hive ID since the
> CPU is part of the hive.
>
> Signed-off-by: Jonathan Kim 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 22 +-
>  1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 98cca5f2b27f..d04c48dfd72b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1296,6 +1296,27 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>  
>   proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
>  
> + adev = (struct amdgpu_device *)(gpu->kgd);
> +
> + /* Include the CPU in xGMI hive if xGMI connected by assigning it the 
> hive ID. */
> + if (gpu->hive_id && adev->gmc.xgmi.connected_to_cpu) {
> + int i;
> +
> + for (i = 0; i < proximity_domain; i++) {
> + struct kfd_topology_device *to_dev =
> + 
> kfd_topology_device_by_proximity_domain(i);
> +
> + if (!to_dev)
> + continue;
> +
> + if (to_dev->gpu)
> + break;
> +
> + to_dev->node_props.hive_id = gpu->hive_id;
> + break;

On a NUMA system there will be multiple CPU nodes (e.g. in NPS-4 mode).
The "break" statement here means, you'll only update the hive ID on the
first NUMA node.

Other than that, this change makes sense.

Regards,
  Felix


> + }
> + }
> +
>   /* Check to see if this gpu device exists in the topology_device_list.
>* If so, assign the gpu to that device,
>* else create a Virtual CRAT for this gpu device and then parse that
> @@ -1457,7 +1478,6 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>   dev->node_props.max_waves_per_simd = 10;
>   }
>  
> - adev = (struct amdgpu_device *)(dev->gpu->kgd);
>   /* kfd only concerns sram ecc on GFX and HBM ecc on UMC */
>   dev->node_props.capability |=
>   ((adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?


[PATCH] drm/amdkfd: map gpu hive id to xgmi connected cpu

2021-10-14 Thread Jonathan Kim
ROCr needs to be able to identify all devices that have direct access to
fine grain memory, which should include CPUs that are connected to GPUs
over xGMI. The GPU hive ID can be mapped onto the CPU hive ID since the
CPU is part of the hive.

v2: fixup to ensure all numa nodes get the hive id mapped

Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 98cca5f2b27f..9fda4ee03813 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -1296,6 +1296,26 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 
proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
 
+   adev = (struct amdgpu_device *)(gpu->kgd);
+
+   /* Include the CPU in xGMI hive if xGMI connected by assigning it the 
hive ID. */
+   if (gpu->hive_id && adev->gmc.xgmi.connected_to_cpu) {
+   int i;
+
+   for (i = 0; i < proximity_domain; i++) {
+   struct kfd_topology_device *to_dev =
+   
kfd_topology_device_by_proximity_domain(i);
+
+   if (!to_dev)
+   continue;
+
+   if (to_dev->gpu)
+   break;
+
+   to_dev->node_props.hive_id = gpu->hive_id;
+   }
+   }
+
/* Check to see if this gpu device exists in the topology_device_list.
 * If so, assign the gpu to that device,
 * else create a Virtual CRAT for this gpu device and then parse that
@@ -1457,7 +1477,6 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev->node_props.max_waves_per_simd = 10;
}
 
-   adev = (struct amdgpu_device *)(dev->gpu->kgd);
/* kfd only concerns sram ecc on GFX and HBM ecc on UMC */
dev->node_props.capability |=
((adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?
-- 
2.25.1



Re: [PATCH v1 2/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Matthew Wilcox


It would probably help if you cc'd Dan on this.
As far as I know he's the only person left who cares about GUP on DAX.

On Thu, Oct 14, 2021 at 02:06:34PM -0300, Jason Gunthorpe wrote:
> On Thu, Oct 14, 2021 at 10:39:28AM -0500, Alex Sierra wrote:
> > From: Ralph Campbell 
> > 
> > ZONE_DEVICE struct pages have an extra reference count that complicates the
> > code for put_page() and several places in the kernel that need to check the
> > reference count to see that a page is not being used (gup, compaction,
> > migration, etc.). Clean up the code so the reference count doesn't need to
> > be treated specially for ZONE_DEVICE.
> > 
> > Signed-off-by: Ralph Campbell 
> > Signed-off-by: Alex Sierra 
> > Reviewed-by: Christoph Hellwig 
> > ---
> > v2:
> > AS: merged this patch in linux 5.11 version
> > 
> > v5:
> > AS: add condition at try_grab_page to check for the zone device type, while
> > page ref counter is checked less/equal to zero. In case of device zone, 
> > pages
> > ref counter are initialized to zero.
> > 
> > v7:
> > AS: fix condition at try_grab_page added at v5, is invalid. It supposed
> > to fix xfstests/generic/413 test, however, there's a known issue on
> > this test where DAX mapped area DIO to non-DAX expect to fail.
> > https://patchwork.kernel.org/project/fstests/patch/1489463960-3579-1-git-send-email-xz...@redhat.com
> > This condition was removed after rebase over patch series
> > https://lore.kernel.org/r/20210813044133.1536842-4-jhubb...@nvidia.com
> > ---
> >  arch/powerpc/kvm/book3s_hv_uvmem.c |  2 +-
> >  drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
> >  fs/dax.c   |  4 +-
> >  include/linux/dax.h|  2 +-
> >  include/linux/memremap.h   |  7 +--
> >  include/linux/mm.h | 11 
> >  lib/test_hmm.c |  2 +-
> >  mm/internal.h  |  8 +++
> >  mm/memcontrol.c|  6 +--
> >  mm/memremap.c  | 69 +++---
> >  mm/migrate.c   |  5 --
> >  mm/page_alloc.c|  3 ++
> >  mm/swap.c  | 45 ++---
> >  13 files changed, 46 insertions(+), 120 deletions(-)
> 
> Has anyone tested this with FSDAX? Does get_user_pages() on fsdax
> backed memory still work?
> 
> What refcount value does the struct pages have when they are installed
> in the PTEs? Remember a 0 refcount will make all the get_user_pages()
> fail.
> 
> I'm looking at the call path starting in ext4_punch_hole() and I would
> expect to see something manipulating the page ref count before
> the ext4_break_layouts() call path gets to the dax_page_unused() test.
> 
> All I see is we go into unmap_mapping_pages() - that would normally
> put back the page references held by PTEs but insert_pfn() has this:
> 
>   if (pfn_t_devmap(pfn))
>   entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
> 
> And:
> 
> static inline pte_t pte_mkdevmap(pte_t pte)
> {
>   return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
> }
> 
> Which interacts with vm_normal_page():
> 
>   if (pte_devmap(pte))
>   return NULL;
> 
> To disable that refcounting?
> 
> So... I have a feeling this will have PTEs pointing to 0 refcount
> pages? Unless FSDAX is !pte_devmap which is not the case, right?
> 
> This seems further confirmed by this comment:
> 
>   /*
>* If we race get_user_pages_fast() here either we'll see the
>* elevated page count in the iteration and wait, or
>* get_user_pages_fast() will see that the page it took a reference
>* against is no longer mapped in the page tables and bail to the
>* get_user_pages() slow path.  The slow path is protected by
>* pte_lock() and pmd_lock(). New references are not taken without
>* holding those locks, and unmap_mapping_pages() will not zero the
>* pte or pmd without holding the respective lock, so we are
>* guaranteed to either see new references or prevent new
>* references from being established.
>*/
> 
> Which seems to explain this scheme relies on unmap_mapping_pages() to
> fence GUP_fast, not on GUP_fast observing 0 refcounts when it should
> stop.
> 
> This seems like it would be properly fixed by using normal page
> refcounting for PTEs - ie stop using special for these pages?
> 
> Does anyone know why devmap is pte_special anyhow?
> 
> > +void free_zone_device_page(struct page *page)
> > +{
> > +   switch (page->pgmap->type) {
> > +   case MEMORY_DEVICE_PRIVATE:
> > +   free_device_page(page);
> > +   return;
> > +   case MEMORY_DEVICE_FS_DAX:
> > +   /* notify page idle */
> > +   wake_up_var(&page->_refcount);
> > +   return;
> 
> It is not for this series, but I wonder if we should just always call
> ops->page_free and have free_device_page() logic in that call

Re: [PATCH] drm/amdkfd: map gpu hive id to xgmi connected cpu

2021-10-14 Thread Felix Kuehling
Am 2021-10-14 um 2:12 p.m. schrieb Jonathan Kim:
> ROCr needs to be able to identify all devices that have direct access to
> fine grain memory, which should include CPUs that are connected to GPUs
> over xGMI. The GPU hive ID can be mapped onto the CPU hive ID since the
> CPU is part of the hive.
>
> v2: fixup to ensure all numa nodes get the hive id mapped
>
> Signed-off-by: Jonathan Kim 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 21 -
>  1 file changed, 20 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 98cca5f2b27f..9fda4ee03813 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -1296,6 +1296,26 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>  
>   proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
>  
> + adev = (struct amdgpu_device *)(gpu->kgd);
> +
> + /* Include the CPU in xGMI hive if xGMI connected by assigning it the 
> hive ID. */
> + if (gpu->hive_id && adev->gmc.xgmi.connected_to_cpu) {
> + int i;
> +
> + for (i = 0; i < proximity_domain; i++) {
> + struct kfd_topology_device *to_dev =
> + 
> kfd_topology_device_by_proximity_domain(i);

Sorry, one more nit-pick. This loop is pretty inefficient (0(n^2))
because kfd_topolody_device_by_proximity_domain does a linear search
itself. It would be more efficient to just loop over the
topology_device_list directly here (while holding the read lock):

>     down_read(&topology_lock);
>
>     list_for_each_entry(top_dev, &topology_device_list, list) {
>                 ...
Regards,
  Felix


> +
> + if (!to_dev)
> + continue;
> +
> + if (to_dev->gpu)
> + break;
> +
> + to_dev->node_props.hive_id = gpu->hive_id;
> + }
> + }
> +
>   /* Check to see if this gpu device exists in the topology_device_list.
>* If so, assign the gpu to that device,
>* else create a Virtual CRAT for this gpu device and then parse that
> @@ -1457,7 +1477,6 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
>   dev->node_props.max_waves_per_simd = 10;
>   }
>  
> - adev = (struct amdgpu_device *)(dev->gpu->kgd);
>   /* kfd only concerns sram ecc on GFX and HBM ecc on UMC */
>   dev->node_props.capability |=
>   ((adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ?


Re: [PATCH v1 2/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Dan Williams
On Thu, Oct 14, 2021 at 11:45 AM Matthew Wilcox  wrote:
>
>
> It would probably help if you cc'd Dan on this.

Thanks.

[..]
>
> On Thu, Oct 14, 2021 at 02:06:34PM -0300, Jason Gunthorpe wrote:
> > On Thu, Oct 14, 2021 at 10:39:28AM -0500, Alex Sierra wrote:
> > > From: Ralph Campbell 
> > >
> > > ZONE_DEVICE struct pages have an extra reference count that complicates 
> > > the
> > > code for put_page() and several places in the kernel that need to check 
> > > the
> > > reference count to see that a page is not being used (gup, compaction,
> > > migration, etc.). Clean up the code so the reference count doesn't need to
> > > be treated specially for ZONE_DEVICE.
> > >
> > > Signed-off-by: Ralph Campbell 
> > > Signed-off-by: Alex Sierra 
> > > Reviewed-by: Christoph Hellwig 
> > > ---
> > > v2:
> > > AS: merged this patch in linux 5.11 version
> > >
> > > v5:
> > > AS: add condition at try_grab_page to check for the zone device type, 
> > > while
> > > page ref counter is checked less/equal to zero. In case of device zone, 
> > > pages
> > > ref counter are initialized to zero.
> > >
> > > v7:
> > > AS: fix condition at try_grab_page added at v5, is invalid. It supposed
> > > to fix xfstests/generic/413 test, however, there's a known issue on
> > > this test where DAX mapped area DIO to non-DAX expect to fail.
> > > https://patchwork.kernel.org/project/fstests/patch/1489463960-3579-1-git-send-email-xz...@redhat.com
> > > This condition was removed after rebase over patch series
> > > https://lore.kernel.org/r/20210813044133.1536842-4-jhubb...@nvidia.com
> > > ---
> > >  arch/powerpc/kvm/book3s_hv_uvmem.c |  2 +-
> > >  drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
> > >  fs/dax.c   |  4 +-
> > >  include/linux/dax.h|  2 +-
> > >  include/linux/memremap.h   |  7 +--
> > >  include/linux/mm.h | 11 
> > >  lib/test_hmm.c |  2 +-
> > >  mm/internal.h  |  8 +++
> > >  mm/memcontrol.c|  6 +--
> > >  mm/memremap.c  | 69 +++---
> > >  mm/migrate.c   |  5 --
> > >  mm/page_alloc.c|  3 ++
> > >  mm/swap.c  | 45 ++---
> > >  13 files changed, 46 insertions(+), 120 deletions(-)
> >
> > Has anyone tested this with FSDAX? Does get_user_pages() on fsdax
> > backed memory still work?
> >
> > What refcount value does the struct pages have when they are installed
> > in the PTEs? Remember a 0 refcount will make all the get_user_pages()
> > fail.
> >
> > I'm looking at the call path starting in ext4_punch_hole() and I would
> > expect to see something manipulating the page ref count before
> > the ext4_break_layouts() call path gets to the dax_page_unused() test.
> >
> > All I see is we go into unmap_mapping_pages() - that would normally
> > put back the page references held by PTEs but insert_pfn() has this:
> >
> >   if (pfn_t_devmap(pfn))
> >   entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
> >
> > And:
> >
> > static inline pte_t pte_mkdevmap(pte_t pte)
> > {
> >   return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP);
> > }
> >
> > Which interacts with vm_normal_page():
> >
> >   if (pte_devmap(pte))
> >   return NULL;
> >
> > To disable that refcounting?
> >
> > So... I have a feeling this will have PTEs pointing to 0 refcount
> > pages? Unless FSDAX is !pte_devmap which is not the case, right?
> >
> > This seems further confirmed by this comment:
> >
> >   /*
> >* If we race get_user_pages_fast() here either we'll see the
> >* elevated page count in the iteration and wait, or
> >* get_user_pages_fast() will see that the page it took a reference
> >* against is no longer mapped in the page tables and bail to the
> >* get_user_pages() slow path.  The slow path is protected by
> >* pte_lock() and pmd_lock(). New references are not taken without
> >* holding those locks, and unmap_mapping_pages() will not zero the
> >* pte or pmd without holding the respective lock, so we are
> >* guaranteed to either see new references or prevent new
> >* references from being established.
> >*/
> >
> > Which seems to explain this scheme relies on unmap_mapping_pages() to
> > fence GUP_fast, not on GUP_fast observing 0 refcounts when it should
> > stop.
> >
> > This seems like it would be properly fixed by using normal page
> > refcounting for PTEs - ie stop using special for these pages?
> >
> > Does anyone know why devmap is pte_special anyhow?

It does not need to be special as mentioned here:

https://lore.kernel.org/all/CAPcyv4iFeVDVPn6uc=aksyuvkiu3-fk-n16ijvzq3n8ot00...@mail.gmail.com/

The refcount dependencies also go away after this...

https://lore.kernel.org/all/161604050866.1463742.7759521510383551

Re: [PATCH v1 2/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Ralph Campbell



On 10/14/21 11:01 AM, Jason Gunthorpe wrote:

On Thu, Oct 14, 2021 at 10:35:27AM -0700, Ralph Campbell wrote:


I ran xfstests-dev using the kernel boot option to "fake" a pmem device
when I first posted this patch. The tests ran OK (or at least the same
tests passed with and without my patch).

Hmm. I know nothing of xfstests but

tests/generic/413

Looks kind of like it might cover this situation?

Did it run for you?

Jason


I don't remember. I'll have to rerun the test which might take a day or two
to set up again.



Re: [PATCH v1 2/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Jason Gunthorpe
On Thu, Oct 14, 2021 at 12:01:14PM -0700, Dan Williams wrote:
> > > Does anyone know why devmap is pte_special anyhow?
> 
> It does not need to be special as mentioned here:
> 
> https://lore.kernel.org/all/CAPcyv4iFeVDVPn6uc=aksyuvkiu3-fk-n16ijvzq3n8ot00...@mail.gmail.com/

I added a remark there

Not special means more to me, it means devmap should do the refcounts
properly like normal memory pages.

It means vm_normal_page should return !NULL and it means insert_page,
not insert_pfn should be used to install them in the PTE. VMAs should
not be MIXED MAP, but normal struct page maps.

I think this change alone would fix all the refcount problems
everwhere in DAX and devmap.

> The refcount dependencies also go away after this...
> 
> https://lore.kernel.org/all/161604050866.1463742.7759521510383551055.st...@dwillia2-desk3.amr.corp.intel.com/
>
> ...but you can see that patches 1 and 2 in that series depend on being
> able to guarantee that all mappings are invalidated when the underlying
> device that owns the pgmap goes away.

If I have put everything together right this is because of what I
pointed to here. FS-DAX is installing 0 refcount pages into PTEs and
expecting that to work sanely. 

This means the page map cannot be removed until all the PTEs are fully
flushed, which buggily doesn't happen because of the missing unplug.

However, this is all because nobody incremented a refcount to represent the
reference in the PTE, and since this meant that 0 refcount pages were
wrongly stuffed into PTEs, devmap used the refcount == 1 hack to
unbreak GUP?

So.. Is there some reason why devmap pages are trying so hard to avoid
sane refcounting???

If the PTE itself holds the refcount (by not being special) then there
is no need for the pagemap stuff in GUP. pagemap already waits for
refs to go to 0 so the missing shootdown during nvdimm unplug will
cause pagemap to block until the address spaces are invalidated. IMHO
this is already better than the current buggy situation of allowing
continued PTE reference to memory that is now removed from the system.

> For that to happen there needs to be communication back to the FS for
> device-gone / failure events. That work is in progress via this
> series:
> 
> https://lore.kernel.org/all/20210924130959.2695749-1-ruansy.f...@fujitsu.com/

This is fine, but I don't think it should block fixing the mm side -
the end result here still cannot be 0 ref count pages installed in
PTEs.

Fixing that does not depend on shootdown during device removal, right?

It requires holding refcounts while pages are installed into address
spaces - and this lack is a direct cause of making the PTEs all
special and using insert_pfn and MIXED_MAP.

Thanks,
Jason



Re: [PATCH 2/2] amd/amdgpu_dm: Verify Gamma and Degamma LUT sizes using DRM Core check

2021-10-14 Thread Sean Paul
On Wed, Oct 13, 2021 at 2:12 PM Mark Yacoub  wrote:
>
> From: Mark Yacoub 
>
> [Why]
> drm_atomic_helper_check_crtc now verifies both legacy and non-legacy LUT
> sizes. There is no need to check it within amdgpu_dm_atomic_check.
>
> [How]
> Remove the local call to verify LUT sizes and use DRM Core function
> instead.
>
> Tested on ChromeOS Zork.
>
> v1:
> Remove amdgpu_dm_verify_lut_sizes everywhere.
>

Reviewed-by: Sean Paul 

> Signed-off-by: Mark Yacoub 
> ---
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  8 ++---
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h |  1 -
>  .../amd/display/amdgpu_dm/amdgpu_dm_color.c   | 35 ---
>  3 files changed, 4 insertions(+), 40 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index f74663b6b046e..47f8de1cfc3a5 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -10244,6 +10244,10 @@ static int amdgpu_dm_atomic_check(struct drm_device 
> *dev,
> }
> }
>  #endif
> +   ret = drm_atomic_helper_check_crtcs(state);
> +   if (ret)
> +   return ret;
> +
> for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, 
> new_crtc_state, i) {
> dm_old_crtc_state = to_dm_crtc_state(old_crtc_state);
>
> @@ -10253,10 +10257,6 @@ static int amdgpu_dm_atomic_check(struct drm_device 
> *dev,
> dm_old_crtc_state->dsc_force_changed == false)
> continue;
>
> -   ret = amdgpu_dm_verify_lut_sizes(new_crtc_state);
> -   if (ret)
> -   goto fail;
> -
> if (!new_crtc_state->enable)
> continue;
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> index fcb9c4a629c32..22730e5542092 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
> @@ -617,7 +617,6 @@ void amdgpu_dm_trigger_timing_sync(struct drm_device 
> *dev);
>  #define MAX_COLOR_LEGACY_LUT_ENTRIES 256
>
>  void amdgpu_dm_init_color_mod(void);
> -int amdgpu_dm_verify_lut_sizes(const struct drm_crtc_state *crtc_state);
>  int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc);
>  int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc,
>   struct dc_plane_state *dc_plane_state);
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
> index a022e5bb30a5c..319f8a8a89835 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
> @@ -284,37 +284,6 @@ static int __set_input_tf(struct dc_transfer_func *func,
> return res ? 0 : -ENOMEM;
>  }
>
> -/**
> - * Verifies that the Degamma and Gamma LUTs attached to the |crtc_state| are 
> of
> - * the expected size.
> - * Returns 0 on success.
> - */
> -int amdgpu_dm_verify_lut_sizes(const struct drm_crtc_state *crtc_state)
> -{
> -   const struct drm_color_lut *lut = NULL;
> -   uint32_t size = 0;
> -
> -   lut = __extract_blob_lut(crtc_state->degamma_lut, &size);
> -   if (lut && size != MAX_COLOR_LUT_ENTRIES) {
> -   DRM_DEBUG_DRIVER(
> -   "Invalid Degamma LUT size. Should be %u but got 
> %u.\n",
> -   MAX_COLOR_LUT_ENTRIES, size);
> -   return -EINVAL;
> -   }
> -
> -   lut = __extract_blob_lut(crtc_state->gamma_lut, &size);
> -   if (lut && size != MAX_COLOR_LUT_ENTRIES &&
> -   size != MAX_COLOR_LEGACY_LUT_ENTRIES) {
> -   DRM_DEBUG_DRIVER(
> -   "Invalid Gamma LUT size. Should be %u (or %u for 
> legacy) but got %u.\n",
> -   MAX_COLOR_LUT_ENTRIES, MAX_COLOR_LEGACY_LUT_ENTRIES,
> -   size);
> -   return -EINVAL;
> -   }
> -
> -   return 0;
> -}
> -
>  /**
>   * amdgpu_dm_update_crtc_color_mgmt: Maps DRM color management to DC stream.
>   * @crtc: amdgpu_dm crtc state
> @@ -348,10 +317,6 @@ int amdgpu_dm_update_crtc_color_mgmt(struct 
> dm_crtc_state *crtc)
> bool is_legacy;
> int r;
>
> -   r = amdgpu_dm_verify_lut_sizes(&crtc->base);
> -   if (r)
> -   return r;
> -
> degamma_lut = __extract_blob_lut(crtc->base.degamma_lut, 
> °amma_size);
> regamma_lut = __extract_blob_lut(crtc->base.gamma_lut, ®amma_size);
>
> --
> 2.33.0.882.g93a45727a2-goog
>


[PATCH 0/5] 0 MHz is not a valid current frequency (v3)

2021-10-14 Thread Luben Tuikov
Some ASICs support low-power functionality for the whole ASIC or just
an IP block. When in such low-power mode, some sysfs interfaces would
report a frequency of 0, e.g.,

$cat /sys/class/drm/card0/device/pp_dpm_sclk
0: 500Mhz 
1: 0Mhz *
2: 2200Mhz 
$_

An operating frequency of 0 MHz doesn't make sense, and this interface
is designed to report only operating clock frequencies, i.e. non-zero,
and possibly the current one.

When in this low-power state, round to the smallest
operating frequency, for this interface, as follows,

$cat /sys/class/drm/card0/device/pp_dpm_sclk
0: 500Mhz *
1: 2200Mhz 
$_

v2: Fix description to reflect change in patch 1--add an 's'.
v3: Don't tag a frequency as current if the current frequency is 0.

Luben Tuikov (5):
  drm/amd/pm: Slight function rename (v2)
  drm/amd/pm: Rename cur_value to curr_value
  drm/amd/pm: Rename freq_values --> freq_value
  dpm/amd/pm: Sienna: 0 MHz is not a current clock frequency (v2)
  dpm/amd/pm: Navi10: 0 MHz is not a current clock frequency (v2)

 .../gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c   | 57 --
 .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   | 74 ---
 2 files changed, 83 insertions(+), 48 deletions(-)


base-commit: b81c53cdbe1482b1f4013ba7a41bca2174cde109
-- 
2.33.1.558.g2bd2f258f4



[PATCH 3/5] drm/amd/pm: Rename freq_values --> freq_value

2021-10-14 Thread Luben Tuikov
By usage: read freq_values[x] to freq_value[x].

Cc: Alex Deucher 
Signed-off-by: Luben Tuikov 
---
 .../gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c| 16 
 .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c| 18 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
index f810549df493d5..646e9bbf8af42a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
@@ -1268,7 +1268,7 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
uint16_t *curve_settings;
int i, size = 0, ret = 0;
uint32_t curr_value = 0, value = 0, count = 0;
-   uint32_t freq_values[3] = {0};
+   uint32_t freq_value[3] = {0, 0, 0};
uint32_t mark_index = 0;
struct smu_table_context *table_context = &smu->smu_table;
uint32_t gen_speed, lane_width;
@@ -1310,21 +1310,21 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
curr_value == value ? "*" : "");
}
} else {
-   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0, 
&freq_values[0]);
+   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0, 
&freq_value[0]);
if (ret)
return size;
-   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 
count - 1, &freq_values[2]);
+   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 
count - 1, &freq_value[2]);
if (ret)
return size;
 
-   freq_values[1] = curr_value;
-   mark_index = curr_value == freq_values[0] ? 0 :
-curr_value == freq_values[2] ? 2 : 1;
+   freq_value[1] = curr_value;
+   mark_index = curr_value == freq_value[0] ? 0 :
+curr_value == freq_value[2] ? 2 : 1;
if (mark_index != 1)
-   freq_values[1] = (freq_values[0] + 
freq_values[2]) / 2;
+   freq_value[1] = (freq_value[0] + freq_value[2]) 
/ 2;
 
for (i = 0; i < 3; i++) {
-   size += sysfs_emit_at(buf, size, "%d: %uMhz 
%s\n", i, freq_values[i],
+   size += sysfs_emit_at(buf, size, "%d: %uMhz 
%s\n", i, freq_value[i],
i == mark_index ? "*" : "");
}
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 3ebded3a99b5f2..f630d5e928ccfe 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -1053,7 +1053,7 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
(OverDriveTable_t *)table_context->overdrive_table;
int i, size = 0, ret = 0;
uint32_t curr_value = 0, value = 0, count = 0;
-   uint32_t freq_values[3] = {0};
+   uint32_t freq_value[3] = {0, 0, 0};
uint32_t mark_index = 0;
uint32_t gen_speed, lane_width;
uint32_t min_value, max_value;
@@ -1096,26 +1096,26 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
curr_value == value ? "*" : "");
}
} else {
-   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0, 
&freq_values[0]);
+   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0, 
&freq_value[0]);
if (ret)
goto print_clk_out;
-   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 
count - 1, &freq_values[2]);
+   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 
count - 1, &freq_value[2]);
if (ret)
goto print_clk_out;
 
-   freq_values[1] = curr_value;
-   mark_index = curr_value == freq_values[0] ? 0 :
-curr_value == freq_values[2] ? 2 : 1;
+   freq_value[1] = curr_value;
+   mark_index = curr_value == freq_value[0] ? 0 :
+curr_value == freq_value[2] ? 2 : 1;
 
count = 3;
if (mark_index != 1) {
count = 2;
-   freq_values[1] = freq_values[2];
+   freq_value[1] = freq_value[2];
}
 
 

[PATCH 1/5] drm/amd/pm: Slight function rename (v2)

2021-10-14 Thread Luben Tuikov
Rename
sienna_cichlid_is_support_fine_grained_dpm() to
sienna_cichlid_supports_fine_grained_dpm().

Rename
navi10_is_support_fine_grained_dpm() to
navi10_supports_fine_grained_dpm().

v2: Fix function name in commit message to reflect
the change being done: add a missing 's'.

Cc: Alex Deucher 
Signed-off-by: Luben Tuikov 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 7 ---
 drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 7 ---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
index 71161f6b78fea9..0fe9790f67f5af 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
@@ -1231,7 +1231,8 @@ static int navi10_get_current_clk_freq_by_table(struct 
smu_context *smu,
   value);
 }
 
-static bool navi10_is_support_fine_grained_dpm(struct smu_context *smu, enum 
smu_clk_type clk_type)
+static bool navi10_supports_fine_grained_dpm(struct smu_context *smu,
+enum smu_clk_type clk_type)
 {
PPTable_t *pptable = smu->smu_table.driver_pptable;
DpmDescriptor_t *dpm_desc = NULL;
@@ -1299,7 +1300,7 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
if (ret)
return size;
 
-   if (!navi10_is_support_fine_grained_dpm(smu, clk_type)) {
+   if (!navi10_supports_fine_grained_dpm(smu, clk_type)) {
for (i = 0; i < count; i++) {
ret = smu_v11_0_get_dpm_freq_by_index(smu, 
clk_type, i, &value);
if (ret)
@@ -1465,7 +1466,7 @@ static int navi10_force_clk_levels(struct smu_context 
*smu,
case SMU_UCLK:
case SMU_FCLK:
/* There is only 2 levels for fine grained DPM */
-   if (navi10_is_support_fine_grained_dpm(smu, clk_type)) {
+   if (navi10_supports_fine_grained_dpm(smu, clk_type)) {
soft_max_level = (soft_max_level >= 1 ? 1 : 0);
soft_min_level = (soft_min_level >= 1 ? 1 : 0);
}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 15e66e1912de33..3f5721baa5ff50 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -1006,7 +1006,8 @@ static int 
sienna_cichlid_get_current_clk_freq_by_table(struct smu_context *smu,
 
 }
 
-static bool sienna_cichlid_is_support_fine_grained_dpm(struct smu_context 
*smu, enum smu_clk_type clk_type)
+static bool sienna_cichlid_supports_fine_grained_dpm(struct smu_context *smu,
+enum smu_clk_type clk_type)
 {
DpmDescriptor_t *dpm_desc = NULL;
DpmDescriptor_t *table_member;
@@ -1084,7 +1085,7 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
if (ret)
goto print_clk_out;
 
-   if (!sienna_cichlid_is_support_fine_grained_dpm(smu, clk_type)) 
{
+   if (!sienna_cichlid_supports_fine_grained_dpm(smu, clk_type)) {
for (i = 0; i < count; i++) {
ret = smu_v11_0_get_dpm_freq_by_index(smu, 
clk_type, i, &value);
if (ret)
@@ -1235,7 +1236,7 @@ static int sienna_cichlid_force_clk_levels(struct 
smu_context *smu,
case SMU_UCLK:
case SMU_FCLK:
/* There is only 2 levels for fine grained DPM */
-   if (sienna_cichlid_is_support_fine_grained_dpm(smu, clk_type)) {
+   if (sienna_cichlid_supports_fine_grained_dpm(smu, clk_type)) {
soft_max_level = (soft_max_level >= 1 ? 1 : 0);
soft_min_level = (soft_min_level >= 1 ? 1 : 0);
}
-- 
2.33.1.558.g2bd2f258f4



[PATCH 4/5] dpm/amd/pm: Sienna: 0 MHz is not a current clock frequency (v2)

2021-10-14 Thread Luben Tuikov
A current value of a clock frequency of 0, means
that the IP block is in some kind of low power
state. Ignore it and don't report it here. Here we
only report the possible operating (non-zero)
frequencies of the block requested. So, if the
current clock value is 0, then print the DPM
frequencies, but don't report a current value.

v2: Don't report the minimum one as the current
one when reported one is 0, i.e. don't add an
asterisk (Lijo). LT: It is conceivable that this
may confuse user-mode tools if they scan and look
for a current one, i.e. look for an asterisk, but
they'll have to adapt and use other methods for
finding power states of the chip--we can't report
0 as current.

Cc: Alex Deucher 
Cc: Lijo Lazar 
Signed-off-by: Luben Tuikov 
---
 .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   | 60 ---
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index f630d5e928ccfe..6fe792be77dbbb 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -1040,7 +1040,8 @@ static void sienna_cichlid_get_od_setting_range(struct 
smu_11_0_7_overdrive_tabl
 }
 
 static int sienna_cichlid_print_clk_levels(struct smu_context *smu,
-   enum smu_clk_type clk_type, char *buf)
+  enum smu_clk_type clk_type,
+  char *buf)
 {
struct amdgpu_device *adev = smu->adev;
struct smu_table_context *table_context = &smu->smu_table;
@@ -1052,12 +1053,12 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
OverDriveTable_t *od_table =
(OverDriveTable_t *)table_context->overdrive_table;
int i, size = 0, ret = 0;
-   uint32_t curr_value = 0, value = 0, count = 0;
+   uint32_t curr_value, value, count;
uint32_t freq_value[3] = {0, 0, 0};
-   uint32_t mark_index = 0;
uint32_t gen_speed, lane_width;
uint32_t min_value, max_value;
uint32_t smu_version;
+   bool fine_grained;
 
smu_cmn_get_sysfs_buf(&buf, &size);
 
@@ -1077,6 +1078,20 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
if (ret)
goto print_clk_out;
 
+   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0,
+ &freq_value[0]);
+   if (ret)
+   goto print_clk_out;
+
+   /* A current value of a clock frequency of 0, means
+* that the IP block is in some kind of low power
+* state. Ignore it and don't report it here. Here we
+* only report the possible operating (non-zero)
+* frequencies of the block requested. So, if the
+* current clock value is 0, then we don't report a
+* "current" value from the DPM states, i.e. we don't
+* add an asterisk.
+*/
 
/* no need to disable gfxoff when retrieving the current gfxclk 
*/
if ((clk_type == SMU_GFXCLK) || (clk_type == SMU_SCLK))
@@ -1086,38 +1101,43 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
if (ret)
goto print_clk_out;
 
-   if (!sienna_cichlid_supports_fine_grained_dpm(smu, clk_type)) {
-   for (i = 0; i < count; i++) {
+   fine_grained = sienna_cichlid_supports_fine_grained_dpm(smu, 
clk_type);
+   if (!fine_grained) {
+   /* We already got the 0-th index--print it
+* here and continue thereafter.
+*/
+   size += sysfs_emit_at(buf, size, "%d: %uMhz %s\n", 0, 
freq_value[0],
+ curr_value == freq_value[0] ? "*" 
: "");
+   for (i = 1; i < count; i++) {
ret = smu_v11_0_get_dpm_freq_by_index(smu, 
clk_type, i, &value);
if (ret)
goto print_clk_out;
-
size += sysfs_emit_at(buf, size, "%d: %uMhz 
%s\n", i, value,
curr_value == value ? "*" : "");
}
} else {
-   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0, 
&freq_value[0]);
-   if (ret)
-   goto print_clk_out;
+   freq_value[1] = curr_value ?: freq_value[0];
ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 
count - 1, &freq_value[2]);
if (ret)
goto print_clk_out

[PATCH 2/5] drm/amd/pm: Rename cur_value to curr_value

2021-10-14 Thread Luben Tuikov
Rename "cur_value", which stands for "cursor
value" to "curr_value", which stands for "current
value".

Cc: Alex Deucher 
Signed-off-by: Luben Tuikov 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c   | 12 ++--
 .../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   | 15 ---
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
index 0fe9790f67f5af..f810549df493d5 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
@@ -1267,7 +1267,7 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
 {
uint16_t *curve_settings;
int i, size = 0, ret = 0;
-   uint32_t cur_value = 0, value = 0, count = 0;
+   uint32_t curr_value = 0, value = 0, count = 0;
uint32_t freq_values[3] = {0};
uint32_t mark_index = 0;
struct smu_table_context *table_context = &smu->smu_table;
@@ -1292,7 +1292,7 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
case SMU_VCLK:
case SMU_DCLK:
case SMU_DCEFCLK:
-   ret = navi10_get_current_clk_freq_by_table(smu, clk_type, 
&cur_value);
+   ret = navi10_get_current_clk_freq_by_table(smu, clk_type, 
&curr_value);
if (ret)
return size;
 
@@ -1307,7 +1307,7 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
return size;
 
size += sysfs_emit_at(buf, size, "%d: %uMhz 
%s\n", i, value,
-   cur_value == value ? "*" : "");
+   curr_value == value ? "*" : "");
}
} else {
ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0, 
&freq_values[0]);
@@ -1317,9 +1317,9 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
if (ret)
return size;
 
-   freq_values[1] = cur_value;
-   mark_index = cur_value == freq_values[0] ? 0 :
-cur_value == freq_values[2] ? 2 : 1;
+   freq_values[1] = curr_value;
+   mark_index = curr_value == freq_values[0] ? 0 :
+curr_value == freq_values[2] ? 2 : 1;
if (mark_index != 1)
freq_values[1] = (freq_values[0] + 
freq_values[2]) / 2;
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 3f5721baa5ff50..3ebded3a99b5f2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -1052,7 +1052,7 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
OverDriveTable_t *od_table =
(OverDriveTable_t *)table_context->overdrive_table;
int i, size = 0, ret = 0;
-   uint32_t cur_value = 0, value = 0, count = 0;
+   uint32_t curr_value = 0, value = 0, count = 0;
uint32_t freq_values[3] = {0};
uint32_t mark_index = 0;
uint32_t gen_speed, lane_width;
@@ -1073,10 +1073,11 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
case SMU_DCLK:
case SMU_DCLK1:
case SMU_DCEFCLK:
-   ret = sienna_cichlid_get_current_clk_freq_by_table(smu, 
clk_type, &cur_value);
+   ret = sienna_cichlid_get_current_clk_freq_by_table(smu, 
clk_type, &curr_value);
if (ret)
goto print_clk_out;
 
+
/* no need to disable gfxoff when retrieving the current gfxclk 
*/
if ((clk_type == SMU_GFXCLK) || (clk_type == SMU_SCLK))
amdgpu_gfx_off_ctrl(adev, false);
@@ -1092,7 +1093,7 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
goto print_clk_out;
 
size += sysfs_emit_at(buf, size, "%d: %uMhz 
%s\n", i, value,
-   cur_value == value ? "*" : "");
+   curr_value == value ? "*" : "");
}
} else {
ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0, 
&freq_values[0]);
@@ -1102,9 +1103,9 @@ static int sienna_cichlid_print_clk_levels(struct 
smu_context *smu,
if (ret)
goto print_clk_out;
 
-   freq_values[1] = cur_value;
-   mark_index = cur_value == freq_values[0] ? 0 :
-cur_value == freq_values[2] ? 2 : 1;
+  

[PATCH 5/5] dpm/amd/pm: Navi10: 0 MHz is not a current clock frequency (v2)

2021-10-14 Thread Luben Tuikov
A current value of a clock frequency of 0, means
that the IP block is in some kind of low power
state. Ignore it and don't report it here. Here we
only report the possible operating (non-zero)
frequencies of the block requested. So, if the
current clock value is 0, then print the DPM
frequencies, but don't report a current value.

v2: Don't report the minimum one as the current
one when reported one is 0, i.e. don't add an
asterisk (Lijo). LT: It is conceivable that this
may confuse user-mode tools if they scan and look
for a current one, i.e. look for an asterisk, but
they'll have to adapt and use other methods for
finding power states of the chip--we can't report
0 as current.

Cc: Alex Deucher 
Cc: Lijo Lazar 
Signed-off-by: Luben Tuikov 
---
 .../gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c   | 44 ---
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
index 646e9bbf8af42a..2af6fd336352aa 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c
@@ -1267,9 +1267,8 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
 {
uint16_t *curve_settings;
int i, size = 0, ret = 0;
-   uint32_t curr_value = 0, value = 0, count = 0;
+   uint32_t curr_value, value, count;
uint32_t freq_value[3] = {0, 0, 0};
-   uint32_t mark_index = 0;
struct smu_table_context *table_context = &smu->smu_table;
uint32_t gen_speed, lane_width;
struct smu_dpm_context *smu_dpm = &smu->smu_dpm;
@@ -1279,6 +1278,7 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
(OverDriveTable_t *)table_context->overdrive_table;
struct smu_11_0_overdrive_table *od_settings = smu->od_settings;
uint32_t min_value, max_value;
+   bool fine_grained;
 
smu_cmn_get_sysfs_buf(&buf, &size);
 
@@ -1296,12 +1296,20 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
if (ret)
return size;
 
+   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0,
+ &freq_value[0]);
+   if (ret)
+   return size;
+
ret = smu_v11_0_get_dpm_level_count(smu, clk_type, &count);
if (ret)
return size;
 
-   if (!navi10_supports_fine_grained_dpm(smu, clk_type)) {
-   for (i = 0; i < count; i++) {
+   fine_grained = navi10_supports_fine_grained_dpm(smu, clk_type);
+   if (!fine_grained) {
+   size += sysfs_emit_at(buf, size, "%d: %uMhz %s\n", 0, 
freq_value[0],
+ curr_value == freq_value[0] ? "*" 
: "");
+   for (i = 1; i < count; i++) {
ret = smu_v11_0_get_dpm_freq_by_index(smu, 
clk_type, i, &value);
if (ret)
return size;
@@ -1310,24 +1318,28 @@ static int navi10_print_clk_levels(struct smu_context 
*smu,
curr_value == value ? "*" : "");
}
} else {
-   ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 0, 
&freq_value[0]);
-   if (ret)
-   return size;
+   freq_value[1] = curr_value ?: freq_value[0];
ret = smu_v11_0_get_dpm_freq_by_index(smu, clk_type, 
count - 1, &freq_value[2]);
if (ret)
return size;
 
-   freq_value[1] = curr_value;
-   mark_index = curr_value == freq_value[0] ? 0 :
-curr_value == freq_value[2] ? 2 : 1;
-   if (mark_index != 1)
-   freq_value[1] = (freq_value[0] + freq_value[2]) 
/ 2;
-
-   for (i = 0; i < 3; i++) {
-   size += sysfs_emit_at(buf, size, "%d: %uMhz 
%s\n", i, freq_value[i],
-   i == mark_index ? "*" : "");
+   if (freq_value[1] == freq_value[0]) {
+   i = 1;
+   count = 3;
+   } else if (freq_value[1] == freq_value[2]) {
+   i = 0;
+   count = 2;
+   } else {
+   i = 0;
+   count = 3;
}
 
+   for ( ; i < count; i++) {
+   size += sysfs_emit_at(buf, size,
+ "%d: %uMhz %s\n",
+

Re: [PATCH v1 2/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Dan Williams
On Thu, Oct 14, 2021 at 4:06 PM Jason Gunthorpe  wrote:
>
> On Thu, Oct 14, 2021 at 12:01:14PM -0700, Dan Williams wrote:
> > > > Does anyone know why devmap is pte_special anyhow?
> >
> > It does not need to be special as mentioned here:
> >
> > https://lore.kernel.org/all/CAPcyv4iFeVDVPn6uc=aksyuvkiu3-fk-n16ijvzq3n8ot00...@mail.gmail.com/
>
> I added a remark there
>
> Not special means more to me, it means devmap should do the refcounts
> properly like normal memory pages.
>
> It means vm_normal_page should return !NULL and it means insert_page,
> not insert_pfn should be used to install them in the PTE. VMAs should
> not be MIXED MAP, but normal struct page maps.
>
> I think this change alone would fix all the refcount problems
> everywhere in DAX and devmap.
>
> > The refcount dependencies also go away after this...
> >
> > https://lore.kernel.org/all/161604050866.1463742.7759521510383551055.st...@dwillia2-desk3.amr.corp.intel.com/
> >
> > ...but you can see that patches 1 and 2 in that series depend on being
> > able to guarantee that all mappings are invalidated when the undelying
> > device that owns the pgmap goes away.
>
> If I have put everything together right this is because of what I
> pointed to here. FS-DAX is installing 0 refcount pages into PTEs and
> expecting that to work sanely.
>
> This means the page map cannot be removed until all the PTEs are fully
> flushed, which buggily doesn't happen because of the missing unplug.
>
> However, this is all because nobody incremented a refcount to represent the
> reference in the PTE and since this meant that 0 refcount pages were
> wrongly stuffed into PTEs then devmap used the refcount == 1 hack to
> unbreak GUP?
>
> So.. Is there some reason why devmap pages are trying so hard to avoid
> sane refcounting???

I wouldn't put it that way. It's more that the original sin of
ZONE_DEVICE that sought to reuse the lru field space, by never having
a zero refcount, then got layered upon and calcified in malignant ways.
In the meantime surrounding infrastructure got decrustified. Work like
the 'struct page' cleanup among other things, made it clearer and
clearer over time that the original design choice needed to be fixed.

> If the PTE itself holds the refcount (by not being special) then there
> is no need for the pagemap stuff in GUP. pagemap already waits for
> refs to go to 0 so the missing shootdown during nvdimm unplug will
> cause pagemap to block until the address spaces are invalidated. IMHO
> this is already better than the current buggy situation of allowing
> continued PTE reference to memory that is now removed from the system.
>
> > For that to happen there needs to be communication back to the FS for
> > device-gone / failure events. That work is in progress via this
> > series:
> >
> > https://lore.kernel.org/all/20210924130959.2695749-1-ruansy.f...@fujitsu.com/
>
> This is fine, but I don't think it should block fixing the mm side -
> the end result here still cannot be 0 ref count pages installed in
> PTEs.
>
> Fixing that does not depend on shootdown during device removal, right?
>
> It requires holding refcounts while pages are installed into address
> spaces - and this lack is a direct cause of making the PTEs all
> special and using insert_pfn and MIXED_MAP.

The MIXED_MAP and insert_pfn were a holdover from page-less DAX, but
now that we have page-available DAX, yes, we can skip the FS
notification and just rely on typical refcounting and hanging until
the FS has a chance to uninstall the PTEs. You're right, the FS
notification is an improvement to the conversion, not a requirement.

However, there still needs to be something in the gup-fast path to
indicate that GUP_LONGTERM is not possible because the PTE represents
a pfn that can not support typical page-cache behavior for truncate
which is to just disconnect the page from the file and keep the page
pinned indefinitely. I think the "no longterm" caveat would be the
only remaining utility of PTE_DEVMAP after the above conversion to use
typical page refcounts throughout DAX.



RE: [PATCH] drm/amdgpu/gfx10: fix typo in gfx_v10_0_update_gfx_clock_gating()

2021-10-14 Thread Quan, Evan
[AMD Official Use Only]

Reviewed-by: Evan Quan 

> -Original Message-
> From: amd-gfx  On Behalf Of Alex
> Deucher
> Sent: Wednesday, October 13, 2021 12:40 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander 
> Subject: [PATCH] drm/amdgpu/gfx10: fix typo in
> gfx_v10_0_update_gfx_clock_gating()
> 
> Check was incorrectly converted to IP version checking.
> 
> Fixes: 4b0ad8425498ba ("drm/amdgpu/gfx10: convert to IP version checking")
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 71bb3c0dc1da..8cec03949835 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -8238,8 +8238,9 @@ static int
> gfx_v10_0_update_gfx_clock_gating(struct amdgpu_device *adev,
>   /* ===  CGCG + CGLS === */
>   gfx_v10_0_update_coarse_grain_clock_gating(adev, enable);
> 
> - if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(10, 1,
> 10)) &&
> -  (adev->ip_versions[GC_HWIP][0] <= IP_VERSION(10, 1, 2)))
> + if ((adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1,
> 10)) ||
> + (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 1))
> ||
> + (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 1, 2)))
> 
>   gfx_v10_0_apply_medium_grain_clock_gating_workaround(adev);
>   } else {
>   /* CGCG/CGLS should be disabled before MGCG/MGLS
> --
> 2.31.1


RE: [PATCH 0/5] 0 MHz is not a valid current frequency

2021-10-14 Thread Quan, Evan
[AMD Official Use Only]

+Kent who maintains the Rocm tool

From: amd-gfx  On Behalf Of Lazar, Lijo
Sent: Thursday, October 14, 2021 1:07 AM
To: Tuikov, Luben ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: Re: [PATCH 0/5] 0 MHz is not a valid current frequency


[AMD Official Use Only]


[AMD Official Use Only]

>Or maybe just a list without default hint, i.e. no asterisk?

I think this is also fine meaning we are having trouble in determining the 
current frequency or DPM level. Evan/Alex? Don't know if this will crash the 
tools.

Thanks,
Lijo

From: Tuikov, Luben mailto:luben.tui...@amd.com>>
Sent: Wednesday, October 13, 2021 9:52:09 PM
To: Lazar, Lijo mailto:lijo.la...@amd.com>>; 
amd-gfx@lists.freedesktop.org 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Deucher, Alexander 
mailto:alexander.deuc...@amd.com>>
Subject: Re: [PATCH 0/5] 0 MHz is not a valid current frequency

On 2021-10-13 00:14, Lazar, Lijo wrote:
>
> On 10/13/2021 8:40 AM, Luben Tuikov wrote:
>> Some ASIC support low-power functionality for the whole ASIC or just
>> an IP block. When in such low-power mode, some sysfs interfaces would
>> report a frequency of 0, e.g.,
>>
>> $cat /sys/class/drm/card0/device/pp_dpm_sclk
>> 0: 500Mhz
>> 1: 0Mhz *
>> 2: 2200Mhz
>> $_
>>
>> An operating frequency of 0 MHz doesn't make sense, and this interface
>> is designed to report only operating clock frequencies, i.e. non-zero,
>> and possibly the current one.
>>
>> When in this low-power state, round to the smallest
>> operating frequency, for this interface, as follows,
>>
> Would rather avoid this -
>
> 1) It is manipulating FW reported value. If at all there is an uncaught
> issue in FW reporting of frequency values, that is masked here.
> 2) Otherwise, if 0MHz is described as GFX power gated case, this
> provides a convenient interface to check if GFX is power gated.
>
> If seeing a '0' is not pleasing, consider changing to something like
>"NA" - not available (frequency cannot be fetched at the moment).

There's a ROCm tool which literally asserts if the values are not ordered in 
increasing order. Now since 0 < 550, but 0 is listed as the second entry, the 
tool simply asserts and crashes.

It is not clear what you'd rather see here:

$cat /sys/class/drm/card0/device/pp_dpm_sclk
0: 550Mhz
1: N/A *
2: 2200MHz
$_

Is this what you want to see? (That'll crash other tools which expect %uMhz.)

Or maybe just a list without default hint, i.e. no asterisk?

$cat /sys/class/drm/card0/device/pp_dpm_sclk
0: 550Mhz
1: 2200MHz
$_

What should the output be?

We want to avoid showing 0, but still show numbers.

Regards,
Luben

>
> Thanks,
> Lijo
>
>> $cat /sys/class/drm/card0/device/pp_dpm_sclk
>> 0: 500Mhz *
>> 1: 2200Mhz
>> $_
>>
>> Luben Tuikov (5):
>>drm/amd/pm: Slight function rename
>>drm/amd/pm: Rename cur_value to curr_value
>>drm/amd/pm: Rename freq_values --> freq_value
>>dpm/amd/pm: Sienna: 0 MHz is not a current clock frequency
>>dpm/amd/pm: Navi10: 0 MHz is not a current clock frequency
>>
>>   .../gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c   | 60 +--
>>   .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   | 73 ---
>>   2 files changed, 86 insertions(+), 47 deletions(-)
>>


RE: [PATCH 1/5] drm/amdkfd: protect hawaii_device_info with CONFIG_DRM_AMDGPU_CIK

2021-10-14 Thread Quan, Evan
[AMD Official Use Only]

Patch1 & 2 are reviewed-by: Evan Quan 
Patch 3 - 5 are acked-by: Evan Quan 

> -Original Message-
> From: amd-gfx  On Behalf Of Alex
> Deucher
> Sent: Thursday, October 14, 2021 2:21 AM
> To: Deucher, Alexander 
> Cc: amd-gfx list 
> Subject: Re: [PATCH 1/5] drm/amdkfd: protect hawaii_device_info with
> CONFIG_DRM_AMDGPU_CIK
> 
> Ping on this series.
> 
> On Mon, Oct 11, 2021 at 11:02 AM Alex Deucher
>  wrote:
> >
> > hawaii_device_info is not used when CONFIG_DRM_AMDGPU_CIK is not
> set.
> >
> > Signed-off-by: Alex Deucher 
> > ---
> >  drivers/gpu/drm/amd/amdkfd/kfd_device.c | 2 ++
> >  1 file changed, 2 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > index 064d42acd54e..31e255ba15ed 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> > @@ -114,6 +114,7 @@ static const struct kfd_device_info
> raven_device_info = {
> > .num_sdma_queues_per_engine = 2,  };
> >
> > +#ifdef CONFIG_DRM_AMDGPU_CIK
> >  static const struct kfd_device_info hawaii_device_info = {
> > .asic_family = CHIP_HAWAII,
> > .asic_name = "hawaii",
> > @@ -133,6 +134,7 @@ static const struct kfd_device_info
> hawaii_device_info = {
> > .num_xgmi_sdma_engines = 0,
> > .num_sdma_queues_per_engine = 2,  };
> > +#endif
> >
> >  static const struct kfd_device_info tonga_device_info = {
> > .asic_family = CHIP_TONGA,
> > --
> > 2.31.1
> >


Re: [PATCH v1 2/2] mm: remove extra ZONE_DEVICE struct page refcount

2021-10-14 Thread Sierra Guiza, Alejandro (Alex)



On 10/14/2021 3:57 PM, Ralph Campbell wrote:


On 10/14/21 11:01 AM, Jason Gunthorpe wrote:

On Thu, Oct 14, 2021 at 10:35:27AM -0700, Ralph Campbell wrote:


I ran xfstests-dev using the kernel boot option to "fake" a pmem device
when I first posted this patch. The tests ran OK (or at least the same
tests passed with and without my patch).

Hmm. I know nothing of xfstests but

tests/generic/413

Looks kind of like it might cover this situation?

Did it run for you?

Jason


I don't remember. I'll have to rerun the test which might take a day 
or two

to set up again.


I just ran this generic/413 on my side using pmem fake device. It does fail.
I remember we proposed a fix on this patch before try_get_page was removed.
@@ -1186,7 +1153,7 @@ bool __must_check try_grab_page(struct page *page, 
unsigned int flags);

 static inline __must_check bool try_get_page(struct page *page)
 {
    page = compound_head(page);
-   if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+   if (WARN_ON_ONCE(page_ref_count(page) < 
(int)!is_zone_device_page(page)))

    return false;
    page_ref_inc(page);
    return true;

Alex



[PATCH v2] drm/amdkfd: Separate pinned BOs destruction from general routine

2021-10-14 Thread Lang Yu
Currently, all kfd BOs use same destruction routine. But pinned
BOs are not unpinned properly. Separate them from general routine.

v2 (Felix):
Add safeguard to prevent user space from freeing signal BO.
Kunmap signal BO in the event of setting event page error.
Just kunmap signal BO to avoid duplicating the code.

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|   2 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  10 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  31 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   3 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 110 +-
 5 files changed, 119 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 69de31754907..751557af09bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -279,6 +279,8 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
struct kgd_dev *kgd, struct kgd_mem *mem, bool intr);
 int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd,
struct kgd_mem *mem, void **kptr, uint64_t *size);
+void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_dev *kgd, struct 
kgd_mem *mem);
+
 int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
struct dma_fence **ef);
 int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index cdf46bd0d8d5..4969763c2e47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1871,6 +1871,16 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct 
kgd_dev *kgd,
return ret;
 }
 
+void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_dev *kgd, struct 
kgd_mem *mem)
+{
+   struct amdgpu_bo *bo = mem->bo;
+
+   amdgpu_bo_reserve(bo, true);
+   amdgpu_bo_kunmap(bo);
+   amdgpu_bo_unpin(bo);
+   amdgpu_bo_unreserve(bo);
+}
+
 int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd,
  struct kfd_vm_fault_info *mem)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f1e7edeb4e6b..9317a2e238d0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1011,11 +1011,6 @@ static int kfd_ioctl_create_event(struct file *filp, 
struct kfd_process *p,
void *mem, *kern_addr;
uint64_t size;
 
-   if (p->signal_page) {
-   pr_err("Event page is already set\n");
-   return -EINVAL;
-   }
-
kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset));
if (!kfd) {
pr_err("Getting device by id failed in %s\n", __func__);
@@ -1023,6 +1018,13 @@ static int kfd_ioctl_create_event(struct file *filp, 
struct kfd_process *p,
}
 
mutex_lock(&p->mutex);
+
+   if (p->signal_page) {
+   pr_err("Event page is already set\n");
+   err = -EINVAL;
+   goto out_unlock;
+   }
+
pdd = kfd_bind_process_to_device(kfd, p);
if (IS_ERR(pdd)) {
err = PTR_ERR(pdd);
@@ -1037,20 +1039,24 @@ static int kfd_ioctl_create_event(struct file *filp, 
struct kfd_process *p,
err = -EINVAL;
goto out_unlock;
}
-   mutex_unlock(&p->mutex);
 
err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kfd->kgd,
mem, &kern_addr, &size);
if (err) {
pr_err("Failed to map event page to kernel\n");
-   return err;
+   goto out_unlock;
}
 
err = kfd_event_page_set(p, kern_addr, size);
if (err) {
pr_err("Failed to set event page\n");
-   return err;
+   amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(kfd->kgd, 
mem);
+   goto out_unlock;
}
+
+   p->signal_handle = args->event_page_offset;
+
+   mutex_unlock(&p->mutex);
}
 
err = kfd_event_create(filp, p, args->event_type,
@@ -1368,6 +1374,15 @@ static int kfd_ioctl_free_memory_of_gpu(struct file 
*filep,
return -EINVAL;
 
mutex_lock(&p->mutex);
+   /*
+* Safeguard to prevent user space from freeing signal BO.
+* It will be freed at process termination.
+*/
+   if (p->signal_handle && (p->signal_handle == args->handle)) {
+