Shivank Garg <shiva...@amd.com> writes: > Previously, guest-memfd allocations followed local NUMA node id in absence > of process mempolicy, resulting in arbitrary memory allocation. > Moreover, mbind() couldn't be used since memory wasn't mapped to userspace > in the VMM. > > Enable NUMA policy support by implementing vm_ops for guest-memfd mmap > operation. This allows the VMM to map the memory and use mbind() to set the > desired NUMA policy. The policy is stored in the inode structure via > kvm_gmem_inode_info, as memory policy is a property of the memory (struct > inode) itself. The policy is then retrieved via mpol_shared_policy_lookup() > and passed to filemap_grab_folio_mpol() to ensure that allocations follow > the specified memory policy. > > This enables the VMM to control guest memory NUMA placement by calling > mbind() on the mapped memory regions, providing fine-grained control over > guest memory allocation across NUMA nodes. > > The policy change only affect future allocations and does not migrate > existing memory. This matches mbind(2)'s default behavior which affects > only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL > flags, which are not supported for guest_memfd as it is unmovable. > > Suggested-by: David Hildenbrand <da...@redhat.com> > Signed-off-by: Shivank Garg <shiva...@amd.com> > --- > virt/kvm/guest_memfd.c | 75 ++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 73 insertions(+), 2 deletions(-) > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > index 0ccbb152483a..233d3fd5781c 100644 > --- a/virt/kvm/guest_memfd.c > +++ b/virt/kvm/guest_memfd.c > @@ -4,6 +4,7 @@ > #include <linux/backing-dev.h> > #include <linux/falloc.h> > #include <linux/kvm_host.h> > +#include <linux/mempolicy.h> > #include <linux/pseudo_fs.h> > #include <linux/pagemap.h> > #include <linux/anon_inodes.h> > @@ -19,6 +20,7 @@ struct kvm_gmem { > }; > > struct kvm_gmem_inode_info { > + struct shared_policy policy; > struct inode vfs_inode; > };
What are the pros and cons that you see of storing struct shared_policy in a containing struct kvm_gmem_inode_info, as opposed to storing it in inode->i_private? I've just been using inode->i_private for sharability and hugetlb metadata and didn't consider this option. Could one reason be that struct shared_policy is a requirement for all inodes (not a CONFIG flag) but sharability and hugetlb metadata are both configurable, possibly at runtime? > > @@ -27,6 +29,9 @@ static inline struct kvm_gmem_inode_info *KVM_GMEM_I(struct > inode *inode) > return container_of(inode, struct kvm_gmem_inode_info, vfs_inode); > } > > +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct > kvm_gmem_inode_info *info, > + pgoff_t index); > + > /** > * folio_file_pfn - like folio_file_page, but return a pfn. > * @folio: The folio which contains this index. > @@ -113,7 +118,24 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, > struct kvm_memory_slot *slot, > static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) > { > /* TODO: Support huge pages. */ > - return filemap_grab_folio(inode->i_mapping, index); > + struct mempolicy *policy; > + struct folio *folio; > + > + /* > + * Fast-path: See if folio is already present in mapping to avoid > + * policy_lookup. > + */ > + folio = __filemap_get_folio(inode->i_mapping, index, > + FGP_LOCK | FGP_ACCESSED, 0); > + if (!IS_ERR(folio)) > + return folio; > + > + policy = kvm_gmem_get_pgoff_policy(KVM_GMEM_I(inode), index); > + folio = filemap_grab_folio_mpol(inode->i_mapping, index, policy, > + NO_INTERLEAVE_INDEX); > + mpol_cond_put(policy); > + > + return folio; > } > > static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, > @@ -336,12 +358,14 @@ static struct inode *kvm_gmem_alloc_inode(struct > super_block *sb) > if (!info) > return NULL; > > + mpol_shared_policy_init(&info->policy, NULL); > + > return &info->vfs_inode; > } > > static void kvm_gmem_destroy_inode(struct inode *inode) > { > - > + mpol_free_shared_policy(&KVM_GMEM_I(inode)->policy); > } > > static void kvm_gmem_free_inode(struct inode *inode) > @@ -384,7 +408,54 @@ static void kvm_gmem_init_mount(void) > kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; > } > > +#ifdef CONFIG_NUMA > +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy > *mpol) > +{ > + struct inode *inode = file_inode(vma->vm_file); > + > + return mpol_set_shared_policy(&KVM_GMEM_I(inode)->policy, vma, mpol); > +} > + > +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma, > + unsigned long addr, pgoff_t *pgoff) > +{ > + struct inode *inode = file_inode(vma->vm_file); > + > + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); > + return mpol_shared_policy_lookup(&KVM_GMEM_I(inode)->policy, *pgoff); > +} > + > +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct > kvm_gmem_inode_info *info, > + pgoff_t index) > +{ > + struct mempolicy *mpol; > + > + mpol = mpol_shared_policy_lookup(&info->policy, index); > + return mpol ? mpol : get_task_policy(current); > +} > +#else > +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct > kvm_gmem_inode_info *info, > + pgoff_t index) > +{ > + return NULL; > +} > +#endif /* CONFIG_NUMA */ > + > +static const struct vm_operations_struct kvm_gmem_vm_ops = { > +#ifdef CONFIG_NUMA > + .get_policy = kvm_gmem_get_policy, > + .set_policy = kvm_gmem_set_policy, > +#endif > +}; > + > +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) > +{ > + vma->vm_ops = &kvm_gmem_vm_ops; > + return 0; > +} > + > static struct file_operations kvm_gmem_fops = { > + .mmap = kvm_gmem_mmap, > .open = generic_file_open, > .release = kvm_gmem_release, > .fallocate = kvm_gmem_fallocate, > -- > 2.34.1