Alistair Popple wrote: > Zone device pages are used to represent various type of device memory > managed by device drivers. Currently compound zone device pages are > not supported. This is because MEMORY_DEVICE_FS_DAX pages are the only > user of higher order zone device pages and have their own page > reference counting. > > A future change will unify FS DAX reference counting with normal page > reference counting rules and remove the special FS DAX reference > counting. Supporting that requires compound zone device pages. > > Supporting compound zone device pages requires compound_head() to > distinguish between head and tail pages whilst still preserving the > special struct page fields that are specific to zone device pages. > > A tail page is distinguished by having bit zero being set in > page->compound_head, with the remaining bits pointing to the head > page. For zone device pages page->compound_head is shared with > page->pgmap. > > The page->pgmap field is common to all pages within a memory section. > Therefore pgmap is the same for both head and tail pages and can be > moved into the folio and we can use the standard scheme to find > compound_head from a tail page. > > Signed-off-by: Alistair Popple <apop...@nvidia.com> > Reviewed-by: Jason Gunthorpe <j...@nvidia.com> > > --- > > Changes since v1: > > - Move pgmap to the folio as suggested by Matthew Wilcox > --- > drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++- > drivers/pci/p2pdma.c | 6 +++--- > include/linux/memremap.h | 6 +++--- > include/linux/migrate.h | 4 ++-- > include/linux/mm_types.h | 9 +++++++-- > include/linux/mmzone.h | 8 +++++++- > lib/test_hmm.c | 3 ++- > mm/hmm.c | 2 +- > mm/memory.c | 4 +++- > mm/memremap.c | 14 +++++++------- > mm/migrate_device.c | 7 +++++-- > mm/mm_init.c | 2 +- > 12 files changed, 43 insertions(+), 25 deletions(-) > > diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c > b/drivers/gpu/drm/nouveau/nouveau_dmem.c > index 6fb65b0..58d308c 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c > +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c > @@ -88,7 +88,8 @@ struct nouveau_dmem { > > static struct nouveau_dmem_chunk *nouveau_page_to_chunk(struct page *page) > { > - return container_of(page->pgmap, struct nouveau_dmem_chunk, pagemap); > + return container_of(page_dev_pagemap(page), struct nouveau_dmem_chunk,
page_dev_pagemap() feels like a mouthful. I would be ok with page_pgmap() since that is the most common idenifier for struct struct dev_pagemap instances. > + pagemap); > } > > static struct nouveau_drm *page_to_drm(struct page *page) > diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c > index 210b9f4..a58f2c1 100644 > --- a/drivers/pci/p2pdma.c > +++ b/drivers/pci/p2pdma.c > @@ -199,7 +199,7 @@ static const struct attribute_group p2pmem_group = { > > static void p2pdma_page_free(struct page *page) > { > - struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap); > + struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_dev_pagemap(page)); > /* safe to dereference while a reference is held to the percpu ref */ > struct pci_p2pdma *p2pdma = > rcu_dereference_protected(pgmap->provider->p2pdma, 1); > @@ -1022,8 +1022,8 @@ enum pci_p2pdma_map_type > pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state, struct device > *dev, > struct scatterlist *sg) > { > - if (state->pgmap != sg_page(sg)->pgmap) { > - state->pgmap = sg_page(sg)->pgmap; > + if (state->pgmap != page_dev_pagemap(sg_page(sg))) { > + state->pgmap = page_dev_pagemap(sg_page(sg)); > state->map = pci_p2pdma_map_type(state->pgmap, dev); > state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset; > } > diff --git a/include/linux/memremap.h b/include/linux/memremap.h > index 3f7143a..14273e6 100644 > --- a/include/linux/memremap.h > +++ b/include/linux/memremap.h > @@ -161,7 +161,7 @@ static inline bool is_device_private_page(const struct > page *page) > { > return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && > is_zone_device_page(page) && > - page->pgmap->type == MEMORY_DEVICE_PRIVATE; > + page_dev_pagemap(page)->type == MEMORY_DEVICE_PRIVATE; > } > > static inline bool folio_is_device_private(const struct folio *folio) > @@ -173,13 +173,13 @@ static inline bool is_pci_p2pdma_page(const struct page > *page) > { > return IS_ENABLED(CONFIG_PCI_P2PDMA) && > is_zone_device_page(page) && > - page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; > + page_dev_pagemap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; > } > > static inline bool is_device_coherent_page(const struct page *page) > { > return is_zone_device_page(page) && > - page->pgmap->type == MEMORY_DEVICE_COHERENT; > + page_dev_pagemap(page)->type == MEMORY_DEVICE_COHERENT; > } > > static inline bool folio_is_device_coherent(const struct folio *folio) > diff --git a/include/linux/migrate.h b/include/linux/migrate.h > index 002e49b..9a85a82 100644 > --- a/include/linux/migrate.h > +++ b/include/linux/migrate.h > @@ -207,8 +207,8 @@ struct migrate_vma { > unsigned long end; > > /* > - * Set to the owner value also stored in page->pgmap->owner for > - * migrating out of device private memory. The flags also need to > + * Set to the owner value also stored in page_dev_pagemap(page)->owner > + * for migrating out of device private memory. The flags also need to > * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE. > * The caller should always set this field when using mmu notifier > * callbacks to avoid device MMU invalidations for device private > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 6e3bdf8..c2f1d53 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -129,8 +129,11 @@ struct page { > unsigned long compound_head; /* Bit zero is set */ > }; > struct { /* ZONE_DEVICE pages */ > - /** @pgmap: Points to the hosting device page map. */ > - struct dev_pagemap *pgmap; > + /* > + * The first word is used for compound_head or folio > + * pgmap > + */ > + void *_unused; I would feel better with "_unused_pgmap_compound_head", similar to how _unused_slab_obj_exts in 'struct foio' indicates the placeholer contents. > void *zone_device_data; > /* > * ZONE_DEVICE private pages are counted as being > @@ -299,6 +302,7 @@ typedef struct { > * @_refcount: Do not access this member directly. Use folio_ref_count() > * to find how many references there are to this folio. > * @memcg_data: Memory Control Group data. > + * @pgmap: Metadata for ZONE_DEVICE mappings > * @virtual: Virtual address in the kernel direct map. > * @_last_cpupid: IDs of last CPU and last process that accessed the folio. > * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). > @@ -337,6 +341,7 @@ struct folio { > /* private: */ > }; > /* public: */ > + struct dev_pagemap *pgmap; > }; > struct address_space *mapping; > pgoff_t index; > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 17506e4..e191434 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -1134,6 +1134,12 @@ static inline bool is_zone_device_page(const struct > page *page) > return page_zonenum(page) == ZONE_DEVICE; > } > > +static inline struct dev_pagemap *page_dev_pagemap(const struct page *page) > +{ > + WARN_ON(!is_zone_device_page(page)); VM_WARN_ON()? With the above fixups: Reviewed-by: Dan Williams <dan.j.willi...@intel.com>