On Tuesday, 16 November 2021 6:30:18 AM AEDT Alex Sierra wrote:
> Device memory that is cache coherent from device and CPU point of view.
> This is used on platforms that have an advanced system bus (like CAPI
> or CXL). Any page of a process can be migrated to such memory. However,
> no one should be allowed to pin such memory so that it can always be
> evicted.
> 
> Signed-off-by: Alex Sierra <alex.sie...@amd.com>
> ---
>  include/linux/memremap.h |  8 ++++++++
>  include/linux/mm.h       |  9 +++++++++
>  mm/memcontrol.c          |  6 +++---
>  mm/memory-failure.c      |  6 +++++-
>  mm/memremap.c            |  5 ++++-
>  mm/migrate.c             | 21 +++++++++++++--------
>  6 files changed, 42 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index c0e9d35889e8..ff4d398edf35 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -39,6 +39,13 @@ struct vmem_altmap {
>   * A more complete discussion of unaddressable memory may be found in
>   * include/linux/hmm.h and Documentation/vm/hmm.rst.
>   *
> + * MEMORY_DEVICE_COHERENT:
> + * Device memory that is cache coherent from device and CPU point of view. This
> + * is used on platforms that have an advanced system bus (like CAPI or CXL). A
> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
> + * type. Any page of a process can be migrated to such memory. However no one
> + * should be allowed to pin such memory so that it can always be evicted.
> + *
>   * MEMORY_DEVICE_FS_DAX:
>   * Host memory that has similar access semantics as System RAM i.e. DMA
>   * coherent and supports page pinning. In support of coordinating page
> @@ -59,6 +66,7 @@ struct vmem_altmap {
>  enum memory_type {
>       /* 0 is reserved to catch uninitialized type fields */
>       MEMORY_DEVICE_PRIVATE = 1,
> +     MEMORY_DEVICE_COHERENT,
>       MEMORY_DEVICE_FS_DAX,
>       MEMORY_DEVICE_GENERIC,
>       MEMORY_DEVICE_PCI_P2PDMA,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 73a52aba448f..d23b6ebd2177 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1162,6 +1162,7 @@ static inline bool page_is_devmap_managed(struct page *page)
>               return false;
>       switch (page->pgmap->type) {
>       case MEMORY_DEVICE_PRIVATE:
> +     case MEMORY_DEVICE_COHERENT:
>       case MEMORY_DEVICE_FS_DAX:
>               return true;
>       default:
> @@ -1191,6 +1192,14 @@ static inline bool is_device_private_page(const struct page *page)
>               page->pgmap->type == MEMORY_DEVICE_PRIVATE;
>  }
>  
> +static inline bool is_device_page(const struct page *page)
> +{
> +     return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> +             is_zone_device_page(page) &&
> +             (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +             page->pgmap->type == MEMORY_DEVICE_COHERENT);
> +}
> +
>  static inline bool is_pci_p2pdma_page(const struct page *page)
>  {
>       return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 6da5020a8656..e0275ebe1198 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
>   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>   *     target for charge migration. if @target is not NULL, the entry is stored
>   *     in target->ent.
> - *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
> - *     (so ZONE_DEVICE page and thus not on the lru).
> + *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_COHERENT
> + *     or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
>   *     For now we such page is charge like a regular page would be as for all
>   *     intent and purposes it is just special memory taking the place of a
>   *     regular page.
> @@ -5730,7 +5730,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>                */
>               if (page_memcg(page) == mc.from) {
>                       ret = MC_TARGET_PAGE;
> -                     if (is_device_private_page(page))
> +                     if (is_device_page(page))
>                               ret = MC_TARGET_DEVICE;
>                       if (target)
>                               target->page = page;
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3e6449f2102a..51b55fc5172c 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1554,12 +1554,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>               goto unlock;
>       }
>  
> -     if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
> +     switch (pgmap->type) {
> +     case MEMORY_DEVICE_PRIVATE:
> +     case MEMORY_DEVICE_COHERENT:
>               /*
>                * TODO: Handle HMM pages which may need coordination
>                * with device-side memory.
>                */
>               goto unlock;
> +     default:
> +             break;
>       }
>  
>       /*
> diff --git a/mm/memremap.c b/mm/memremap.c
> index ed593bf87109..94d6a1e01d42 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
>  static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>  {
>       if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +         pgmap->type == MEMORY_DEVICE_COHERENT ||
>           pgmap->type == MEMORY_DEVICE_FS_DAX)
>               static_branch_dec(&devmap_managed_key);
>  }
> @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>  static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
>  {
>       if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +         pgmap->type == MEMORY_DEVICE_COHERENT ||
>           pgmap->type == MEMORY_DEVICE_FS_DAX)
>               static_branch_inc(&devmap_managed_key);
>  }
> @@ -328,6 +330,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>  
>       switch (pgmap->type) {
>       case MEMORY_DEVICE_PRIVATE:
> +     case MEMORY_DEVICE_COHERENT:
>               if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
>                       WARN(1, "Device private memory not supported\n");
>                       return ERR_PTR(-EINVAL);
> @@ -498,7 +501,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>  void free_devmap_managed_page(struct page *page)
>  {
>       /* notify page idle for dax */
> -     if (!is_device_private_page(page)) {
> +     if (!is_device_page(page)) {
>               wake_up_var(&page->_refcount);
>               return;
>       }
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 1852d787e6ab..f74422a42192 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
>        * Device private pages have an extra refcount as they are
>        * ZONE_DEVICE pages.
>        */
> -     expected_count += is_device_private_page(page);
> +     expected_count += is_device_page(page);
>       if (mapping)
>               expected_count += thp_nr_pages(page) + page_has_private(page);
>  
> @@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
>                * FIXME proper solution is to rework migration_entry_wait() so
>                * it does not need to take a reference on page.
>                */

Note that I have posted a patch to fix this - see
https://lore.kernel.org/all/20211118020754.954425-1-apop...@nvidia.com/. This
looks ok for now assuming coherent pages can never be pinned.

However that raises a question - what happens when something calls
get_user_pages() on a pfn pointing to a coherent device page? I can't see
anything in this series that prevents pinning of coherent device pages, so we
can't just assume they aren't pinned.

In the case of device-private pages this is enforced by the fact that they never
have present PTEs, so any attempt to GUP them results in a fault. But if I'm
understanding this series correctly that won't be the case for coherent device
pages, right?

> -             return is_device_private_page(page);
> +             return is_device_page(page);
>       }
>  
>       /* For file back page */
> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
>   *     handle_pte_fault()
>   *       do_anonymous_page()
>   * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
> - * private page.
> + * private or coherent page.
>   */
>  static void migrate_vma_insert_page(struct migrate_vma *migrate,
>                                   unsigned long addr,
> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>                               swp_entry = make_readable_device_private_entry(
>                                                       page_to_pfn(page));
>                       entry = swp_entry_to_pte(swp_entry);
> +             } else if (is_device_page(page)) {

How about adding an explicit `is_device_coherent_page()` helper? It would make
the test more explicit that this is expected to handle just coherent pages and
I bet there will be future changes that need to differentiate between private
and coherent pages anyway.

> +                     entry = pte_mkold(mk_pte(page,
> +                                              READ_ONCE(vma->vm_page_prot)));
> +                     if (vma->vm_flags & VM_WRITE)
> +                             entry = pte_mkwrite(pte_mkdirty(entry));
>               } else {
>                       /*
> -                      * For now we only support migrating to un-addressable
> -                      * device memory.
> +                      * We support migrating to private and coherent types
> +                      * for device zone memory.
>                        */
>                       pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
>                       goto abort;
> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>               mapping = page_mapping(page);
>  
>               if (is_zone_device_page(newpage)) {
> -                     if (is_device_private_page(newpage)) {
> +                     if (is_device_page(newpage)) {
>                               /*
> -                              * For now only support private anonymous when
> -                              * migrating to un-addressable device memory.
> +                              * For now only support private and coherent
> +                              * anonymous when migrating to device memory.
>                                */
>                               if (mapping) {
>                                       migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> 




Reply via email to