Re: [PATCH v7] mm/memblock: Add memblock_alloc_or_panic interface
On Sun, Dec 22, 2024 at 07:15:37PM +0800, Guo Weikang wrote: > Before SLUB initialization, various subsystems used memblock_alloc to > allocate memory. In most cases, when memory allocation fails, an immediate > panic is required. To simplify this behavior and reduce repetitive checks, > introduce `memblock_alloc_or_panic`. This function ensures that memory > allocation failures result in a panic automatically, improving code > readability and consistency across subsystems that require this behavior. > > Changelog: > -- > v1: initial version > v2: add __memblock_alloc_or_panic support panic output caller > v3: panic output phys_addr_t use printk's %pap > v4: make __memblock_alloc_or_panic out-of-line, move to memblock.c > v6: Fix CI compile error > Links to CI: > https://lore.kernel.org/oe-kbuild-all/202412221000.r1nzxjuo-...@intel.com/ > v6: Fix CI compile warnings > Links to CI: > https://lore.kernel.org/oe-kbuild-all/202412221259.jugnaucq-...@intel.com/ > v7: add changelog and adjust function declaration alignment format > -- > > Signed-off-by: Guo Weikang > Reviewed-by: Andrew Morton > Reviewed-by: Geert Uytterhoeven > Reviewed-by: Mike Rapoport (Microsoft) > Acked-by: Xi Ruoyao If people commented on your patch, it does not mean you should add Reviewed-by or Acked-by tags for them. Wait for explicit tags from the reviewers. And don't respin that often, "Reviewers are busy people and may not get to your patch right away" [1]. [1] https://docs.kernel.org/process/submitting-patches.html -- Sincerely yours, Mike.
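For context, the repetitive pattern the new helper is meant to replace looks roughly like this at a typical early-boot call site (the "table" variable and its size below are illustrative, not taken from the patch):

	/* Illustrative fragment of a typical early-boot caller (names made up): */
	table = memblock_alloc(table_size, SMP_CACHE_BYTES);
	if (!table)
		panic("%s: Failed to allocate %zu bytes\n", __func__, table_size);

	/* With the new helper the same call site collapses to: */
	table = memblock_alloc_or_panic(table_size, SMP_CACHE_BYTES);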
Re: [PATCH v6] mm/memblock: Add memblock_alloc_or_panic interface
On Sun, Dec 22, 2024 at 01:43:31PM +0800, Guo Weikang wrote: > Before SLUB initialization, various subsystems used memblock_alloc to > allocate memory. In most cases, when memory allocation fails, an immediate > panic is required. To simplify this behavior and reduce repetitive checks, > introduce `memblock_alloc_or_panic`. This function ensures that memory > allocation failures result in a panic automatically, improving code > readability and consistency across subsystems that require this behavior. > > Signed-off-by: Guo Weikang > --- ... > diff --git a/include/linux/memblock.h b/include/linux/memblock.h > index 673d5cae7c81..73af7ca3fa1c 100644 > --- a/include/linux/memblock.h > +++ b/include/linux/memblock.h > @@ -417,6 +417,12 @@ static __always_inline void *memblock_alloc(phys_addr_t > size, phys_addr_t align) > MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); > } > > +void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, > +const char *func); Please align this line with the first parameter to the function. Other than that Acked-by: Mike Rapoport (Microsoft) > + > +#define memblock_alloc_or_panic(size, align)\ > + __memblock_alloc_or_panic(size, align, __func__) > + > static inline void *memblock_alloc_raw(phys_addr_t size, > phys_addr_t align) > { -- Sincerely yours, Mike.
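For readers following along, a minimal sketch of the out-of-line helper in mm/memblock.c that the macro above expands to, per the changelog (panic on failure, report the caller, print the phys_addr_t size with %pap); the exact message wording is an assumption:

void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
				const char *func)
{
	void *addr = memblock_alloc(size, align);

	/* assumed wording; the point is the unconditional panic on failure */
	if (unlikely(!addr))
		panic("%s: Failed to allocate %pap bytes\n", func, &size);
	return addr;
}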
Re: [PATCH v1 2/3] mm/hugetlb: enforce that PMD PT sharing has split PMD PT locks
On Fri, Jul 26, 2024 at 05:07:27PM +0200, David Hildenbrand wrote: > Sharing page tables between processes but falling back to per-MM page > table locks cannot possibly work. > > So, let's make sure that we do have split PMD locks by adding a new > Kconfig option and letting that depend on CONFIG_SPLIT_PMD_PTLOCKS. > > Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) > --- > fs/Kconfig | 4 > include/linux/hugetlb.h | 5 ++--- > mm/hugetlb.c| 8 > 3 files changed, 10 insertions(+), 7 deletions(-) > > diff --git a/fs/Kconfig b/fs/Kconfig > index a46b0cbc4d8f6..0e4efec1d92e6 100644 > --- a/fs/Kconfig > +++ b/fs/Kconfig > @@ -288,6 +288,10 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP > depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP > depends on SPARSEMEM_VMEMMAP > > +config HUGETLB_PMD_PAGE_TABLE_SHARING > + def_bool HUGETLB_PAGE > + depends on ARCH_WANT_HUGE_PMD_SHARE && SPLIT_PMD_PTLOCKS > + > config ARCH_HAS_GIGANTIC_PAGE > bool > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index da800e56fe590..4d2f3224ff027 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -1243,7 +1243,7 @@ static inline __init void hugetlb_cma_reserve(int order) > } > #endif > > -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE > +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING > static inline bool hugetlb_pmd_shared(pte_t *pte) > { > return page_count(virt_to_page(pte)) > 1; > @@ -1279,8 +1279,7 @@ bool __vma_private_lock(struct vm_area_struct *vma); > static inline pte_t * > hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long > sz) > { > -#if defined(CONFIG_HUGETLB_PAGE) && \ > - defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) > +#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > /* > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index 0858a18272073..c4d94e122c41f 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -7211,7 +7211,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long > start, long end, > return 0; > } > > -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE > +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING > static unsigned long page_table_shareable(struct vm_area_struct *svma, > struct vm_area_struct *vma, > unsigned long addr, pgoff_t idx) > @@ -7373,7 +7373,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct > vm_area_struct *vma, > return 1; > } > > -#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ > +#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ > > pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, > unsigned long addr, pud_t *pud) > @@ -7396,7 +7396,7 @@ bool want_pmd_share(struct vm_area_struct *vma, > unsigned long addr) > { > return false; > } > -#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ > +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ > > #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB > pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, > @@ -7494,7 +7494,7 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) > /* See description above. Architectures can provide their own version. */ > __weak unsigned long hugetlb_mask_last_page(struct hstate *h) > { > -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE > +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING > if (huge_page_size(h) == PMD_SIZE) > return PUD_SIZE - PMD_SIZE; > #endif > -- > 2.45.2 > > -- Sincerely yours, Mike.
Re: [PATCH v1 1/3] mm: turn USE_SPLIT_PTE_PTLOCKS / USE_SPLIT_PMD_PTLOCKS into Kconfig options
On Fri, Jul 26, 2024 at 05:07:26PM +0200, David Hildenbrand wrote: > Let's clean that up a bit and prepare for depending on > CONFIG_SPLIT_PMD_PTLOCKS in other Kconfig options. > > More cleanups would be reasonable (like the arch-specific "depends on" > for CONFIG_SPLIT_PTE_PTLOCKS), but we'll leave that for another day. > > Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) > --- > arch/arm/mm/fault-armv.c | 6 +++--- > arch/x86/xen/mmu_pv.c | 7 --- > include/linux/mm.h| 8 > include/linux/mm_types.h | 2 +- > include/linux/mm_types_task.h | 3 --- > kernel/fork.c | 4 ++-- > mm/Kconfig| 18 +++--- > mm/memory.c | 2 +- > 8 files changed, 26 insertions(+), 24 deletions(-) > > diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c > index 2286c2ea60ec4..831793cd6ff94 100644 > --- a/arch/arm/mm/fault-armv.c > +++ b/arch/arm/mm/fault-armv.c > @@ -61,7 +61,7 @@ static int do_adjust_pte(struct vm_area_struct *vma, > unsigned long address, > return ret; > } > > -#if USE_SPLIT_PTE_PTLOCKS > +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) > /* > * If we are using split PTE locks, then we need to take the page > * lock here. Otherwise we are using shared mm->page_table_lock > @@ -80,10 +80,10 @@ static inline void do_pte_unlock(spinlock_t *ptl) > { > spin_unlock(ptl); > } > -#else /* !USE_SPLIT_PTE_PTLOCKS */ > +#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ > static inline void do_pte_lock(spinlock_t *ptl) {} > static inline void do_pte_unlock(spinlock_t *ptl) {} > -#endif /* USE_SPLIT_PTE_PTLOCKS */ > +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ > > static int adjust_pte(struct vm_area_struct *vma, unsigned long address, > unsigned long pfn) > diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c > index f1ce39d6d32cb..f4a316894bbb4 100644 > --- a/arch/x86/xen/mmu_pv.c > +++ b/arch/x86/xen/mmu_pv.c > @@ -665,7 +665,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct > mm_struct *mm) > { > spinlock_t *ptl = NULL; > > -#if USE_SPLIT_PTE_PTLOCKS > +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) > ptl = ptlock_ptr(page_ptdesc(page)); > spin_lock_nest_lock(ptl, &mm->page_table_lock); > #endif > @@ -1553,7 +1553,8 @@ static inline void xen_alloc_ptpage(struct mm_struct > *mm, unsigned long pfn, > > __set_pfn_prot(pfn, PAGE_KERNEL_RO); > > - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) > + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) && > + !pinned) > __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); > > xen_mc_issue(XEN_LAZY_MMU); > @@ -1581,7 +1582,7 @@ static inline void xen_release_ptpage(unsigned long > pfn, unsigned level) > if (pinned) { > xen_mc_batch(); > > - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) > + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS)) > __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); > > __set_pfn_prot(pfn, PAGE_KERNEL); > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 0472a5090b180..dff43101572ec 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2843,7 +2843,7 @@ static inline void pagetable_free(struct ptdesc *pt) > __free_pages(page, compound_order(page)); > } > > -#if USE_SPLIT_PTE_PTLOCKS > +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) > #if ALLOC_SPLIT_PTLOCKS > void __init ptlock_cache_init(void); > bool ptlock_alloc(struct ptdesc *ptdesc); > @@ -2895,7 +2895,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) > return true; > } > > -#else/* !USE_SPLIT_PTE_PTLOCKS */ > +#else/* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ > /* > * We use mm->page_table_lock to guard all pagetable pages 
of the mm. > */ > @@ -2906,7 +2906,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct > *mm, pte_t *pte) > static inline void ptlock_cache_init(void) {} > static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } > static inline void ptlock_free(struct ptdesc *ptdesc) {} > -#endif /* USE_SPLIT_PTE_PTLOCKS */ > +#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ > > static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) > { > @@ -2966,7 +2966,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, > pmd_t *pmd, > ((unlikely(pmd_none(*(pmd))) && __pte_
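The two forms used in this conversion are the standard Kconfig idioms: preprocessor checks remove code at compile time, while IS_ENABLED() can be used in ordinary C expressions and relies on the compiler to drop the dead branch. A generic illustration (CONFIG_FOO and the helpers are placeholders, not part of the patch):

#ifdef CONFIG_FOO			/* block exists only when FOO=y */
static inline void foo_lock(spinlock_t *ptl) { spin_lock(ptl); }
static inline void foo_unlock(spinlock_t *ptl) { spin_unlock(ptl); }
#else
static inline void foo_lock(spinlock_t *ptl) { }
static inline void foo_unlock(spinlock_t *ptl) { }
#endif

void foo_example(spinlock_t *ptl)
{
	if (IS_ENABLED(CONFIG_FOO))	/* constant 0/1, dead branch eliminated */
		pr_debug("split locks enabled\n");
	foo_lock(ptl);
	foo_unlock(ptl);
}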
Re: [PATCH v2 bpf-next 2/3] mm, xen: Separate xen use cases from ioremap.
On Fri, Feb 23, 2024 at 03:57:27PM -0800, Alexei Starovoitov wrote: > From: Alexei Starovoitov > > xen grant table and xenbus ring are not ioremap the way arch specific code is > using it, > so let's add VM_XEN flag to separate them from VM_IOREMAP users. > xen will not and should not be calling ioremap_page_range() on that range. > /proc/vmallocinfo will print such region as "xen" instead of "ioremap" as > well. > > Signed-off-by: Alexei Starovoitov > --- > arch/x86/xen/grant-table.c | 2 +- > drivers/xen/xenbus/xenbus_client.c | 2 +- > include/linux/vmalloc.h| 1 + > mm/vmalloc.c | 7 +-- > 4 files changed, 8 insertions(+), 4 deletions(-) > > diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c > index 1e681bf62561..b816db0349c4 100644 > --- a/arch/x86/xen/grant-table.c > +++ b/arch/x86/xen/grant-table.c > @@ -104,7 +104,7 @@ static int arch_gnttab_valloc(struct gnttab_vm_area > *area, unsigned nr_frames) > area->ptes = kmalloc_array(nr_frames, sizeof(*area->ptes), GFP_KERNEL); > if (area->ptes == NULL) > return -ENOMEM; > - area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_IOREMAP); > + area->area = get_vm_area(PAGE_SIZE * nr_frames, VM_XEN); > if (!area->area) > goto out_free_ptes; > if (apply_to_page_range(&init_mm, (unsigned long)area->area->addr, > diff --git a/drivers/xen/xenbus/xenbus_client.c > b/drivers/xen/xenbus/xenbus_client.c > index 32835b4b9bc5..b9c81a2d578b 100644 > --- a/drivers/xen/xenbus/xenbus_client.c > +++ b/drivers/xen/xenbus/xenbus_client.c > @@ -758,7 +758,7 @@ static int xenbus_map_ring_pv(struct xenbus_device *dev, > bool leaked = false; > int err = -ENOMEM; > > - area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_IOREMAP); > + area = get_vm_area(XEN_PAGE_SIZE * nr_grefs, VM_XEN); > if (!area) > return -ENOMEM; > if (apply_to_page_range(&init_mm, (unsigned long)area->addr, > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h > index c720be70c8dd..223e51c243bc 100644 > --- a/include/linux/vmalloc.h > +++ b/include/linux/vmalloc.h > @@ -28,6 +28,7 @@ struct iov_iter;/* in uio.h */ > #define VM_FLUSH_RESET_PERMS 0x0100 /* reset direct map and flush > TLB on unmap, can't be freed in atomic context */ > #define VM_MAP_PUT_PAGES 0x0200 /* put pages and free array in > vfree */ > #define VM_ALLOW_HUGE_VMAP 0x0400 /* Allow for huge pages on > archs with HAVE_ARCH_HUGE_VMALLOC */ > +#define VM_XEN 0x0800 /* xen use cases */ > > #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ > !defined(CONFIG_KASAN_VMALLOC) There's also VM_DEFER_KMEMLEAK a line below: #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ !defined(CONFIG_KASAN_VMALLOC) #define VM_DEFER_KMEMLEAK 0x0801 /* defer kmemleak object creation */ #else #define VM_DEFER_KMEMLEAK 0 #endif It should be adjusted as well. I think it makes sense to use an enumeration for vm_flags, just like as Suren did for GFP (https://lore.kernel.org/linux-mm/20240224015800.2569851-1-sur...@google.com/) -- Sincerely yours, Mike.
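A rough sketch of the enumeration approach being suggested, in the spirit of the GFP rework referenced above (bit names and ordering are illustrative only, not a proposal from the thread):

/* Illustrative sketch only: define each bit number once and derive the
 * masks from it, so a new flag cannot silently overlap an existing one
 * the way VM_XEN (0x0800) overlaps VM_DEFER_KMEMLEAK. */
enum vm_flag_bits {
	___VM_IOREMAP_BIT,
	___VM_XEN_BIT,
	___VM_DEFER_KMEMLEAK_BIT,
};

#define VM_IOREMAP		BIT(___VM_IOREMAP_BIT)
#define VM_XEN			BIT(___VM_XEN_BIT)
#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
	!defined(CONFIG_KASAN_VMALLOC)
#define VM_DEFER_KMEMLEAK	BIT(___VM_DEFER_KMEMLEAK_BIT)
#else
#define VM_DEFER_KMEMLEAK	0
#endif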
Re: [PATCH v4 34/34] mm: Remove pgtable_{pmd, pte}_page_{ctor, dtor}() wrappers
On Mon, Jun 12, 2023 at 02:04:23PM -0700, Vishal Moola (Oracle) wrote: > These functions are no longer necessary. Remove them and cleanup > Documentation referencing them. > > Signed-off-by: Vishal Moola (Oracle) I've found one stale reference in riscv: $ git grep -n pgtable_pmd_page_ctor arch/riscv/mm/init.c:440: BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page(vaddr))); Otherwise Acked-by: Mike Rapoport (IBM) > --- > Documentation/mm/split_page_table_lock.rst| 12 +-- > .../zh_CN/mm/split_page_table_lock.rst| 14 ++--- > include/linux/mm.h| 20 --- > 3 files changed, 13 insertions(+), 33 deletions(-) > > diff --git a/Documentation/mm/split_page_table_lock.rst > b/Documentation/mm/split_page_table_lock.rst > index 50ee0dfc95be..4bffec728340 100644 > --- a/Documentation/mm/split_page_table_lock.rst > +++ b/Documentation/mm/split_page_table_lock.rst > @@ -53,7 +53,7 @@ Support of split page table lock by an architecture > === > > There's no need in special enabling of PTE split page table lock: everything > -required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), > which > +required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which > must be called on PTE table allocation / freeing. > > Make sure the architecture doesn't use slab allocator for page table > @@ -63,8 +63,8 @@ This field shares storage with page->ptl. > PMD split lock only makes sense if you have more than two page table > levels. > > -PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table > -allocation and pgtable_pmd_page_dtor() on freeing. > +PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table > +allocation and pagetable_pmd_dtor() on freeing. > > Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and > pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing > @@ -72,7 +72,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). > > With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. > > -NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must > +NOTE: pagetable_pte_ctor() and pagetable_pmd_ctor() can fail -- it must > be handled properly. > > page->ptl > @@ -92,7 +92,7 @@ trick: > split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs > one more cache line for indirect access; > > -The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in > -pgtable_pmd_page_ctor() for PMD table. > +The spinlock_t allocated in pagetable_pte_ctor() for PTE table and in > +pagetable_pmd_ctor() for PMD table. > > Please, never access page->ptl directly -- use appropriate helper. 
> diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst > b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst > index 4fb7aa666037..a2c288670a24 100644 > --- a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst > +++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst > @@ -56,16 +56,16 @@ Hugetlb特定的辅助函数: > 架构对分页表锁的支持 > > > -没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor() > -和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。 > +没有必要特别启用PTE分页表锁:所有需要的东西都由pagetable_pte_ctor() > +和pagetable_pte_dtor()完成,它们必须在PTE表分配/释放时被调用。 > > 确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页 > 面。这个区域与page->ptl共享存储。 > > PMD分页锁只有在你有两个以上的页表级别时才有意义。 > > -启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调 > -用pgtable_pmd_page_dtor()。 > +启用PMD分页锁需要在PMD表分配时调用pagetable_pmd_ctor(),在释放时调 > +用pagetable_pmd_dtor()。 > > 分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb() > 中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE在pgd_alloc()中预先 > @@ -73,7 +73,7 @@ PMD分页锁只有在你有两个以上的页表级别时才有意义。 > > 一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。 > > -注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必 > +注意:pagetable_pte_ctor()和pagetable_pmd_ctor()可能失败--必 > 须正确处理。 > > page->ptl > @@ -90,7 +90,7 @@ page->ptl用于访问分割页表锁,其中'page'是包含该表的页面struc > 的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的 > 情况下使用分页锁,但由于间接访问而多花了一个缓存行。 > > -PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t > -分配在pgtable_pmd_page_ctor()中。 > +PTE表的spinlock_t分配在pagetable_pte_ctor()中,PMD表的spinlock_t > +分配在pagetable_pmd_ctor()中。 > > 请不要直接访问page->ptl - -使用适当的辅助函数。 > diff --git a/include/linux/mm.h b/include/linux/mm.h > index dc211c43610b..6d83483cf186 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2897,11 +2897,6 @@ sta
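The rule the updated documentation states (constructor on allocation, destructor on free, and the constructor can fail) boils down to the pattern below; this is a generic sketch that mirrors what the per-arch patches later in this series do, not code from the patch itself:

static pte_t *example_pte_alloc(void)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);

	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(ptdesc)) {	/* the ctor can fail: handle it */
		pagetable_free(ptdesc);
		return NULL;
	}
	return ptdesc_address(ptdesc);
}

static void example_pte_free(pte_t *pte)
{
	struct ptdesc *ptdesc = virt_to_ptdesc(pte);

	pagetable_pte_dtor(ptdesc);
	pagetable_free(ptdesc);
}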
Re: [PATCH v4 33/34] um: Convert {pmd, pte}_free_tlb() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:22PM -0700, Vishal Moola (Oracle) wrote: > Part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents. Also cleans up some spacing issues. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/um/include/asm/pgalloc.h | 18 +- > 1 file changed, 9 insertions(+), 9 deletions(-) > > diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h > index 8ec7cd46dd96..de5e31c64793 100644 > --- a/arch/um/include/asm/pgalloc.h > +++ b/arch/um/include/asm/pgalloc.h > @@ -25,19 +25,19 @@ > */ > extern pgd_t *pgd_alloc(struct mm_struct *); > > -#define __pte_free_tlb(tlb,pte, address) \ > -do { \ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page((tlb),(pte)); \ > +#define __pte_free_tlb(tlb, pte, address)\ > +do { \ > + pagetable_pte_dtor(page_ptdesc(pte)); \ > + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ > } while (0) > > #ifdef CONFIG_3_LEVEL_PGTABLES > > -#define __pmd_free_tlb(tlb, pmd, address)\ > -do { \ > - pgtable_pmd_page_dtor(virt_to_page(pmd)); \ > - tlb_remove_page((tlb),virt_to_page(pmd)); \ > -} while (0) \ > +#define __pmd_free_tlb(tlb, pmd, address)\ > +do { \ > + pagetable_pmd_dtor(virt_to_ptdesc(pmd));\ > + tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ > +} while (0) > > #endif > > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 32/34] sparc: Convert pgtable_pte_page_{ctor, dtor}() to ptdesc equivalents
On Mon, Jun 12, 2023 at 02:04:21PM -0700, Vishal Moola (Oracle) wrote: > Part of the conversions to replace pgtable pte constructor/destructors with > ptdesc equivalents. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/sparc/mm/srmmu.c | 5 +++-- > 1 file changed, 3 insertions(+), 2 deletions(-) > > diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c > index 13f027afc875..8393faa3e596 100644 > --- a/arch/sparc/mm/srmmu.c > +++ b/arch/sparc/mm/srmmu.c > @@ -355,7 +355,8 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) > return NULL; > page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); > spin_lock(&mm->page_table_lock); > - if (page_ref_inc_return(page) == 2 && !pgtable_pte_page_ctor(page)) { > + if (page_ref_inc_return(page) == 2 && > + !pagetable_pte_ctor(page_ptdesc(page))) { > page_ref_dec(page); > ptep = NULL; > } > @@ -371,7 +372,7 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep) > page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); > spin_lock(&mm->page_table_lock); > if (page_ref_dec_return(page) == 1) > - pgtable_pte_page_dtor(page); > + pagetable_pte_dtor(page_ptdesc(page)); > spin_unlock(&mm->page_table_lock); > > srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE); > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 31/34] sparc64: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:20PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/sparc/mm/init_64.c | 17 + > 1 file changed, 9 insertions(+), 8 deletions(-) > > diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c > index 04f9db0c3111..105915cd2eee 100644 > --- a/arch/sparc/mm/init_64.c > +++ b/arch/sparc/mm/init_64.c > @@ -2893,14 +2893,15 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm) > > pgtable_t pte_alloc_one(struct mm_struct *mm) > { > - struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); > - if (!page) > + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0); > + > + if (!ptdesc) > return NULL; > - if (!pgtable_pte_page_ctor(page)) { > - __free_page(page); > + if (!pagetable_pte_ctor(ptdesc)) { > + pagetable_free(ptdesc); > return NULL; > } > - return (pte_t *) page_address(page); > + return ptdesc_address(ptdesc); > } > > void pte_free_kernel(struct mm_struct *mm, pte_t *pte) > @@ -2910,10 +2911,10 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte) > > static void __pte_free(pgtable_t pte) > { > - struct page *page = virt_to_page(pte); > + struct ptdesc *ptdesc = virt_to_ptdesc(pte); > > - pgtable_pte_page_dtor(page); > - __free_page(page); > + pagetable_pte_dtor(ptdesc); > + pagetable_free(ptdesc); > } > > void pte_free(struct mm_struct *mm, pgtable_t pte) > -- > 2.40.1 > > > ___ > linux-riscv mailing list > linux-ri...@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-riscv -- Sincerely yours, Mike.
Re: [PATCH v4 30/34] sh: Convert pte_free_tlb() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:19PM -0700, Vishal Moola (Oracle) wrote: > Part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents. Also cleans up some spacing issues. > > Signed-off-by: Vishal Moola (Oracle) > Reviewed-by: Geert Uytterhoeven > Acked-by: John Paul Adrian Glaubitz Acked-by: Mike Rapoport (IBM) > --- > arch/sh/include/asm/pgalloc.h | 9 + > 1 file changed, 5 insertions(+), 4 deletions(-) > > diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h > index a9e98233c4d4..5d8577ab1591 100644 > --- a/arch/sh/include/asm/pgalloc.h > +++ b/arch/sh/include/asm/pgalloc.h > @@ -2,6 +2,7 @@ > #ifndef __ASM_SH_PGALLOC_H > #define __ASM_SH_PGALLOC_H > > +#include > #include > > #define __HAVE_ARCH_PMD_ALLOC_ONE > @@ -31,10 +32,10 @@ static inline void pmd_populate(struct mm_struct *mm, > pmd_t *pmd, > set_pmd(pmd, __pmd((unsigned long)page_address(pte))); > } > > -#define __pte_free_tlb(tlb,pte,addr) \ > -do { \ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page((tlb), (pte)); \ > +#define __pte_free_tlb(tlb, pte, addr) \ > +do { \ > + pagetable_pte_dtor(page_ptdesc(pte)); \ > + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ > } while (0) > > #endif /* __ASM_SH_PGALLOC_H */ > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 29/34] riscv: Convert alloc_{pmd, pte}_late() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:18PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > Some of the functions use the *get*page*() helper functions. Convert > these to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. > > Signed-off-by: Vishal Moola (Oracle) > Acked-by: Palmer Dabbelt Acked-by: Mike Rapoport (IBM) > --- > arch/riscv/include/asm/pgalloc.h | 8 > arch/riscv/mm/init.c | 16 ++-- > 2 files changed, 10 insertions(+), 14 deletions(-) > > diff --git a/arch/riscv/include/asm/pgalloc.h > b/arch/riscv/include/asm/pgalloc.h > index 59dc12b5b7e8..d169a4f41a2e 100644 > --- a/arch/riscv/include/asm/pgalloc.h > +++ b/arch/riscv/include/asm/pgalloc.h > @@ -153,10 +153,10 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) > > #endif /* __PAGETABLE_PMD_FOLDED */ > > -#define __pte_free_tlb(tlb, pte, buf) \ > -do {\ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page((tlb), pte);\ > +#define __pte_free_tlb(tlb, pte, buf)\ > +do { \ > + pagetable_pte_dtor(page_ptdesc(pte)); \ > + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\ > } while (0) > #endif /* CONFIG_MMU */ > > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > index 3d689ffb2072..6bfeec80bf4e 100644 > --- a/arch/riscv/mm/init.c > +++ b/arch/riscv/mm/init.c > @@ -354,12 +354,10 @@ static inline phys_addr_t __init > alloc_pte_fixmap(uintptr_t va) > > static phys_addr_t __init alloc_pte_late(uintptr_t va) > { > - unsigned long vaddr; > - > - vaddr = __get_free_page(GFP_KERNEL); > - BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page((void *)vaddr))); > + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); > > - return __pa(vaddr); > + BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc)); > + return __pa((pte_t *)ptdesc_address(ptdesc)); > } > > static void __init create_pte_mapping(pte_t *ptep, > @@ -437,12 +435,10 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) > > static phys_addr_t __init alloc_pmd_late(uintptr_t va) > { > - unsigned long vaddr; > - > - vaddr = __get_free_page(GFP_KERNEL); > - BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page((void *)vaddr))); > + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); > > - return __pa(vaddr); > + BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc)); > + return __pa((pmd_t *)ptdesc_address(ptdesc)); > } > > static void __init create_pmd_mapping(pmd_t *pmdp, > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 28/34] openrisc: Convert __pte_free_tlb() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:17PM -0700, Vishal Moola (Oracle) wrote: > Part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/openrisc/include/asm/pgalloc.h | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/openrisc/include/asm/pgalloc.h > b/arch/openrisc/include/asm/pgalloc.h > index b7b2b8d16fad..c6a73772a546 100644 > --- a/arch/openrisc/include/asm/pgalloc.h > +++ b/arch/openrisc/include/asm/pgalloc.h > @@ -66,10 +66,10 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm) > > extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); > > -#define __pte_free_tlb(tlb, pte, addr) \ > -do { \ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page((tlb), (pte)); \ > +#define __pte_free_tlb(tlb, pte, addr) \ > +do { \ > + pagetable_pte_dtor(page_ptdesc(pte)); \ > + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ > } while (0) > > #endif > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 27/34] nios2: Convert __pte_free_tlb() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:16PM -0700, Vishal Moola (Oracle) wrote: > Part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/nios2/include/asm/pgalloc.h | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/nios2/include/asm/pgalloc.h > b/arch/nios2/include/asm/pgalloc.h > index ecd1657bb2ce..ce6bb8e74271 100644 > --- a/arch/nios2/include/asm/pgalloc.h > +++ b/arch/nios2/include/asm/pgalloc.h > @@ -28,10 +28,10 @@ static inline void pmd_populate(struct mm_struct *mm, > pmd_t *pmd, > > extern pgd_t *pgd_alloc(struct mm_struct *mm); > > -#define __pte_free_tlb(tlb, pte, addr) \ > - do {\ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page((tlb), (pte)); \ > +#define __pte_free_tlb(tlb, pte, addr) > \ > + do {\ > + pagetable_pte_dtor(page_ptdesc(pte)); \ > + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ > } while (0) > > #endif /* _ASM_NIOS2_PGALLOC_H */ > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 26/34] mips: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:15PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > Some of the functions use the *get*page*() helper functions. Convert > these to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/mips/include/asm/pgalloc.h | 31 +-- > arch/mips/mm/pgtable.c | 7 --- > 2 files changed, 21 insertions(+), 17 deletions(-) > > diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h > index f72e737dda21..6940e5536664 100644 > --- a/arch/mips/include/asm/pgalloc.h > +++ b/arch/mips/include/asm/pgalloc.h > @@ -51,13 +51,13 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); > > static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) > { > - free_pages((unsigned long)pgd, PGD_TABLE_ORDER); > + pagetable_free(virt_to_ptdesc(pgd)); > } > > -#define __pte_free_tlb(tlb,pte,address) \ > -do { \ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page((tlb), pte);\ > +#define __pte_free_tlb(tlb, pte, address)\ > +do { \ > + pagetable_pte_dtor(page_ptdesc(pte)); \ > + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\ > } while (0) > > #ifndef __PAGETABLE_PMD_FOLDED > @@ -65,18 +65,18 @@ do { > \ > static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long > address) > { > pmd_t *pmd; > - struct page *pg; > + struct ptdesc *ptdesc; > > - pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER); > - if (!pg) > + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER); > + if (!ptdesc) > return NULL; > > - if (!pgtable_pmd_page_ctor(pg)) { > - __free_pages(pg, PMD_TABLE_ORDER); > + if (!pagetable_pmd_ctor(ptdesc)) { > + pagetable_free(ptdesc); > return NULL; > } > > - pmd = (pmd_t *)page_address(pg); > + pmd = ptdesc_address(ptdesc); > pmd_init(pmd); > return pmd; > } > @@ -90,10 +90,13 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, > unsigned long address) > static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long > address) > { > pud_t *pud; > + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, PUD_TABLE_ORDER); > > - pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER); > - if (pud) > - pud_init(pud); > + if (!ptdesc) > + return NULL; > + pud = ptdesc_address(ptdesc); > + > + pud_init(pud); > return pud; > } > > diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c > index b13314be5d0e..729258ff4e3b 100644 > --- a/arch/mips/mm/pgtable.c > +++ b/arch/mips/mm/pgtable.c > @@ -10,10 +10,11 @@ > > pgd_t *pgd_alloc(struct mm_struct *mm) > { > - pgd_t *ret, *init; > + pgd_t *init, *ret = NULL; > + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, PGD_TABLE_ORDER); > > - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); > - if (ret) { > + if (ptdesc) { > + ret = ptdesc_address(ptdesc); > init = pgd_offset(&init_mm, 0UL); > pgd_init(ret); > memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 25/34] m68k: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:14PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > Some of the functions use the *get*page*() helper functions. Convert > these to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) One comment below > --- > arch/m68k/include/asm/mcf_pgalloc.h | 41 ++-- > arch/m68k/include/asm/sun3_pgalloc.h | 8 +++--- > arch/m68k/mm/motorola.c | 4 +-- > 3 files changed, 27 insertions(+), 26 deletions(-) > > diff --git a/arch/m68k/include/asm/mcf_pgalloc.h > b/arch/m68k/include/asm/mcf_pgalloc.h > index 5c2c0a864524..857949ac9431 100644 > --- a/arch/m68k/include/asm/mcf_pgalloc.h > +++ b/arch/m68k/include/asm/mcf_pgalloc.h > @@ -7,20 +7,19 @@ > > extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) > { > - free_page((unsigned long) pte); > + pagetable_free(virt_to_ptdesc(pte)); > } > > extern const char bad_pmd_string[]; > > extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) > { > - unsigned long page = __get_free_page(GFP_DMA); > + struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA | __GFP_ZERO, 0); > > - if (!page) > + if (!ptdesc) > return NULL; > > - memset((void *)page, 0, PAGE_SIZE); > - return (pte_t *) (page); > + return ptdesc_address(ptdesc); > } > > extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) > @@ -35,36 +34,36 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, > unsigned long address) > static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable, > unsigned long address) > { > - struct page *page = virt_to_page(pgtable); > + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); > > - pgtable_pte_page_dtor(page); > - __free_page(page); > + pagetable_pte_dtor(ptdesc); > + pagetable_free(ptdesc); > } > > static inline pgtable_t pte_alloc_one(struct mm_struct *mm) > { > - struct page *page = alloc_pages(GFP_DMA, 0); > + struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA, 0); You can add __GFP_ZERO here and drop pagetable_clear() below > pte_t *pte; > > - if (!page) > + if (!ptdesc) > return NULL; > - if (!pgtable_pte_page_ctor(page)) { > - __free_page(page); > + if (!pagetable_pte_ctor(ptdesc)) { > + pagetable_free(ptdesc); > return NULL; > } > > - pte = page_address(page); > - clear_page(pte); > + pte = ptdesc_address(ptdesc); > + pagetable_clear(pte); > > return pte; > } > > static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) > { > - struct page *page = virt_to_page(pgtable); > + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); > > - pgtable_pte_page_dtor(page); > - __free_page(page); > + pagetable_pte_dtor(ptdesc); > + pagetable_free(ptdesc); > } > > /* > @@ -75,16 +74,18 @@ static inline void pte_free(struct mm_struct *mm, > pgtable_t pgtable) > > static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) > { > - free_page((unsigned long) pgd); > + pagetable_free(virt_to_ptdesc(pgd)); > } > > static inline pgd_t *pgd_alloc(struct mm_struct *mm) > { > pgd_t *new_pgd; > + struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA | GFP_NOWARN, 0); > > - new_pgd = (pgd_t *)__get_free_page(GFP_DMA | __GFP_NOWARN); > - if (!new_pgd) > + if (!ptdesc) > return NULL; > + new_pgd = ptdesc_address(ptdesc); > + > memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t)); > memset(new_pgd, 0, PAGE_OFFSET >> PGDIR_SHIFT); > return new_pgd; > diff 
--git a/arch/m68k/include/asm/sun3_pgalloc.h > b/arch/m68k/include/asm/sun3_pgalloc.h > index 198036aff519..ff48573db2c0 100644 > --- a/arch/m68k/include/asm/sun3_pgalloc.h > +++ b/arch/m68k/include/asm/sun3_pgalloc.h > @@ -17,10 +17,10 @@ > > extern const char bad_pmd_string[]; > > -#define __pte_free_tlb(tlb,pte,addr) \ > -do { \ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page((tlb), pte);\ > +#define __pte_free_tlb(tlb, pte, addr) \ > +do {
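Folding in the __GFP_ZERO suggestion above, the ColdFire pte_alloc_one() would end up roughly like this (a sketch of the suggested change, not the posted patch):

static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_DMA | __GFP_ZERO, 0);

	if (!ptdesc)
		return NULL;
	if (!pagetable_pte_ctor(ptdesc)) {
		pagetable_free(ptdesc);
		return NULL;
	}
	/* no pagetable_clear() needed: __GFP_ZERO already zeroed the page */
	return ptdesc_address(ptdesc);
}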
Re: [PATCH v4 24/34] loongarch: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:13PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > Some of the functions use the *get*page*() helper functions. Convert > these to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/loongarch/include/asm/pgalloc.h | 27 +++ > arch/loongarch/mm/pgtable.c | 7 --- > 2 files changed, 19 insertions(+), 15 deletions(-) > > diff --git a/arch/loongarch/include/asm/pgalloc.h > b/arch/loongarch/include/asm/pgalloc.h > index af1d1e4a6965..70bb3bdd201e 100644 > --- a/arch/loongarch/include/asm/pgalloc.h > +++ b/arch/loongarch/include/asm/pgalloc.h > @@ -45,9 +45,9 @@ extern void pagetable_init(void); > extern pgd_t *pgd_alloc(struct mm_struct *mm); > > #define __pte_free_tlb(tlb, pte, address)\ > -do { \ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page((tlb), pte);\ > +do { \ > + pagetable_pte_dtor(page_ptdesc(pte)); \ > + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\ > } while (0) > > #ifndef __PAGETABLE_PMD_FOLDED > @@ -55,18 +55,18 @@ do { > \ > static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long > address) > { > pmd_t *pmd; > - struct page *pg; > + struct ptdesc *ptdesc; > > - pg = alloc_page(GFP_KERNEL_ACCOUNT); > - if (!pg) > + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0); > + if (!ptdesc) > return NULL; > > - if (!pgtable_pmd_page_ctor(pg)) { > - __free_page(pg); > + if (!pagetable_pmd_ctor(ptdesc)) { > + pagetable_free(ptdesc); > return NULL; > } > > - pmd = (pmd_t *)page_address(pg); > + pmd = ptdesc_address(ptdesc); > pmd_init(pmd); > return pmd; > } > @@ -80,10 +80,13 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, > unsigned long address) > static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long > address) > { > pud_t *pud; > + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); > > - pud = (pud_t *) __get_free_page(GFP_KERNEL); > - if (pud) > - pud_init(pud); > + if (!ptdesc) > + return NULL; > + pud = ptdesc_address(ptdesc); > + > + pud_init(pud); > return pud; > } > > diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c > index 36a6dc0148ae..cdba10ffc0df 100644 > --- a/arch/loongarch/mm/pgtable.c > +++ b/arch/loongarch/mm/pgtable.c > @@ -11,10 +11,11 @@ > > pgd_t *pgd_alloc(struct mm_struct *mm) > { > - pgd_t *ret, *init; > + pgd_t *init, *ret = NULL; > + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); > > - ret = (pgd_t *) __get_free_page(GFP_KERNEL); > - if (ret) { > + if (ptdesc) { > + ret = (pgd_t *)ptdesc_address(ptdesc); > init = pgd_offset(&init_mm, 0UL); > pgd_init(ret); > memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 23/34] hexagon: Convert __pte_free_tlb() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:12PM -0700, Vishal Moola (Oracle) wrote: > Part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/hexagon/include/asm/pgalloc.h | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/arch/hexagon/include/asm/pgalloc.h > b/arch/hexagon/include/asm/pgalloc.h > index f0c47e6a7427..55988625e6fb 100644 > --- a/arch/hexagon/include/asm/pgalloc.h > +++ b/arch/hexagon/include/asm/pgalloc.h > @@ -87,10 +87,10 @@ static inline void pmd_populate_kernel(struct mm_struct > *mm, pmd_t *pmd, > max_kernel_seg = pmdindex; > } > > -#define __pte_free_tlb(tlb, pte, addr) \ > -do { \ > - pgtable_pte_page_dtor((pte)); \ > - tlb_remove_page((tlb), (pte)); \ > +#define __pte_free_tlb(tlb, pte, addr) \ > +do { \ > + pagetable_pte_dtor((page_ptdesc(pte))); \ > + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ > } while (0) > > #endif > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 22/34] csky: Convert __pte_free_tlb() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:11PM -0700, Vishal Moola (Oracle) wrote: > Part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents. > > Signed-off-by: Vishal Moola (Oracle) > Acked-by: Guo Ren Acked-by: Mike Rapoport (IBM) > --- > arch/csky/include/asm/pgalloc.h | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h > index 7d57e5da0914..9c84c9012e53 100644 > --- a/arch/csky/include/asm/pgalloc.h > +++ b/arch/csky/include/asm/pgalloc.h > @@ -63,8 +63,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) > > #define __pte_free_tlb(tlb, pte, address)\ > do { \ > - pgtable_pte_page_dtor(pte); \ > - tlb_remove_page(tlb, pte); \ > + pagetable_pte_dtor(page_ptdesc(pte)); \ > + tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \ > } while (0) > > extern void pagetable_init(void); > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 21/34] arm64: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:10PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/arm64/include/asm/tlb.h | 14 -- > arch/arm64/mm/mmu.c | 7 --- > 2 files changed, 12 insertions(+), 9 deletions(-) > > diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h > index c995d1f4594f..2c29239d05c3 100644 > --- a/arch/arm64/include/asm/tlb.h > +++ b/arch/arm64/include/asm/tlb.h > @@ -75,18 +75,20 @@ static inline void tlb_flush(struct mmu_gather *tlb) > static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, > unsigned long addr) > { > - pgtable_pte_page_dtor(pte); > - tlb_remove_table(tlb, pte); > + struct ptdesc *ptdesc = page_ptdesc(pte); > + > + pagetable_pte_dtor(ptdesc); > + tlb_remove_ptdesc(tlb, ptdesc); > } > > #if CONFIG_PGTABLE_LEVELS > 2 > static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, > unsigned long addr) > { > - struct page *page = virt_to_page(pmdp); > + struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); > > - pgtable_pmd_page_dtor(page); > - tlb_remove_table(tlb, page); > + pagetable_pmd_dtor(ptdesc); > + tlb_remove_ptdesc(tlb, ptdesc); > } > #endif > > @@ -94,7 +96,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, > pmd_t *pmdp, > static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, > unsigned long addr) > { > - tlb_remove_table(tlb, virt_to_page(pudp)); > + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp)); > } > #endif > > diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c > index af6bc8403ee4..5867a0e917b9 100644 > --- a/arch/arm64/mm/mmu.c > +++ b/arch/arm64/mm/mmu.c > @@ -426,6 +426,7 @@ static phys_addr_t __pgd_pgtable_alloc(int shift) > static phys_addr_t pgd_pgtable_alloc(int shift) > { > phys_addr_t pa = __pgd_pgtable_alloc(shift); > + struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); > > /* >* Call proper page table ctor in case later we need to > @@ -433,12 +434,12 @@ static phys_addr_t pgd_pgtable_alloc(int shift) >* this pre-allocated page table. >* >* We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is > - * folded, and if so pgtable_pmd_page_ctor() becomes nop. > + * folded, and if so pagetable_pte_ctor() becomes nop. >*/ > if (shift == PAGE_SHIFT) > - BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa))); > + BUG_ON(!pagetable_pte_ctor(ptdesc)); > else if (shift == PMD_SHIFT) > - BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); > + BUG_ON(!pagetable_pmd_ctor(ptdesc)); > > return pa; > } > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 20/34] arm: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:09PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > late_alloc() also uses the __get_free_pages() helper function. Convert > this to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) One comment below. > --- > arch/arm/include/asm/tlb.h | 12 +++- > arch/arm/mm/mmu.c | 6 +++--- > 2 files changed, 10 insertions(+), 8 deletions(-) > > diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h > index b8cbe03ad260..f40d06ad5d2a 100644 > --- a/arch/arm/include/asm/tlb.h > +++ b/arch/arm/include/asm/tlb.h > @@ -39,7 +39,9 @@ static inline void __tlb_remove_table(void *_table) > static inline void > __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) > { > - pgtable_pte_page_dtor(pte); > + struct ptdesc *ptdesc = page_ptdesc(pte); > + > + pagetable_pte_dtor(ptdesc); > > #ifndef CONFIG_ARM_LPAE > /* > @@ -50,17 +52,17 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, > unsigned long addr) > __tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE); > #endif > > - tlb_remove_table(tlb, pte); > + tlb_remove_ptdesc(tlb, ptdesc); > } > > static inline void > __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) > { > #ifdef CONFIG_ARM_LPAE > - struct page *page = virt_to_page(pmdp); > + struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); > > - pgtable_pmd_page_dtor(page); > - tlb_remove_table(tlb, page); > + pagetable_pmd_dtor(ptdesc); > + tlb_remove_ptdesc(tlb, ptdesc); > #endif > } > > diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c > index 22292cf3381c..294518fd0240 100644 > --- a/arch/arm/mm/mmu.c > +++ b/arch/arm/mm/mmu.c > @@ -737,11 +737,11 @@ static void __init *early_alloc(unsigned long sz) > > static void *__init late_alloc(unsigned long sz) > { > - void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); > + void *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL, get_order(sz)); > > - if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr))) > + if (!ptdesc || !pagetable_pte_ctor(ptdesc)) > BUG(); > - return ptr; > + return ptdesc; should be return ptdesc_to_virt(ptdesc); > } > > static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr, > -- > 2.40.1 > > -- Sincerely yours, Mike.
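With the one-line fix noted above folded in, late_alloc() would read roughly as follows (a sketch of the corrected hunk):

static void *__init late_alloc(unsigned long sz)
{
	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL, get_order(sz));

	if (!ptdesc || !pagetable_pte_ctor(ptdesc))
		BUG();
	/* callers expect a kernel virtual address, not the ptdesc itself */
	return ptdesc_to_virt(ptdesc);
}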
Re: [PATCH v4 19/34] pgalloc: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:08PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > Some of the functions use the *get*page*() helper functions. Convert > these to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. > > Signed-off-by: Vishal Moola (Oracle) > --- > include/asm-generic/pgalloc.h | 62 +-- > 1 file changed, 37 insertions(+), 25 deletions(-) > > diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h > index a7cf825befae..3fd6ce79e654 100644 > --- a/include/asm-generic/pgalloc.h > +++ b/include/asm-generic/pgalloc.h > @@ -18,7 +18,11 @@ > */ > static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm) > { > - return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL); > + struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL, 0); > + > + if (!ptdesc) > + return NULL; > + return ptdesc_address(ptdesc); > } > > #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL > @@ -41,7 +45,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct > *mm) > */ > static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) > { > - free_page((unsigned long)pte); > + pagetable_free(virt_to_ptdesc(pte)); > } > > /** > @@ -49,7 +53,7 @@ static inline void pte_free_kernel(struct mm_struct *mm, > pte_t *pte) > * @mm: the mm_struct of the current context > * @gfp: GFP flags to use for the allocation > * > - * Allocates a page and runs the pgtable_pte_page_ctor(). > + * Allocates a ptdesc and runs the pagetable_pte_ctor(). Allocates memory for page table and ptdesc > * > * This function is intended for architectures that need > * anything beyond simple page allocation or must have custom GFP flags. The Return: description here should be fixed up > @@ -58,17 +62,17 @@ static inline void pte_free_kernel(struct mm_struct *mm, > pte_t *pte) > */ > static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) > { > - struct page *pte; > + struct ptdesc *ptdesc; > > - pte = alloc_page(gfp); > - if (!pte) > + ptdesc = pagetable_alloc(gfp, 0); > + if (!ptdesc) > return NULL; > - if (!pgtable_pte_page_ctor(pte)) { > - __free_page(pte); > + if (!pagetable_pte_ctor(ptdesc)) { > + pagetable_free(ptdesc); > return NULL; > } > > - return pte; > + return ptdesc_page(ptdesc); > } > > #ifndef __HAVE_ARCH_PTE_ALLOC_ONE > @@ -76,7 +80,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct > *mm, gfp_t gfp) > * pte_alloc_one - allocate a page for PTE-level user page table > * @mm: the mm_struct of the current context > * > - * Allocates a page and runs the pgtable_pte_page_ctor(). > + * Allocates a ptdesc and runs the pagetable_pte_ctor(). Allocates memory for page table and ptdesc > * > * Return: `struct page` initialized as page table or %NULL on error Return: ptdesc ... > */ > @@ -98,8 +102,10 @@ static inline pgtable_t pte_alloc_one(struct mm_struct > *mm) > */ > static inline void pte_free(struct mm_struct *mm, struct page *pte_page) > { > - pgtable_pte_page_dtor(pte_page); > - __free_page(pte_page); > + struct ptdesc *ptdesc = page_ptdesc(pte_page); > + > + pagetable_pte_dtor(ptdesc); > + pagetable_free(ptdesc); > } > > > @@ -110,7 +116,7 @@ static inline void pte_free(struct mm_struct *mm, struct > page *pte_page) > * pmd_alloc_one - allocate a page for PMD-level page table > * @mm: the mm_struct of the current context > * > - * Allocates a page and runs the pgtable_pmd_page_ctor(). 
> + * Allocates a ptdesc and runs the pagetable_pmd_ctor(). Allocate memory for page table and ptdesc > * Allocations use %GFP_PGTABLE_USER in user context and > * %GFP_PGTABLE_KERNEL in kernel context. > * > @@ -118,28 +124,30 @@ static inline void pte_free(struct mm_struct *mm, > struct page *pte_page) > */ > static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) > { > - struct page *page; > + struct ptdesc *ptdesc; > gfp_t gfp = GFP_PGTABLE_USER; > > if (mm == &init_mm) > gfp = GFP_PGTABLE_KERNEL; > - page = alloc_page(gfp); > - if (!page) > + ptdesc = pagetable_alloc(gfp, 0); > + if (!ptdesc) > return NULL; > - if (!pgtable_pmd_page_ctor(page)) { > - __free_page(page); > + if (!pagetable_pmd_ctor(ptdesc)) { > + pagetable_free(ptdesc); > return NULL; > } > - return (pmd_t *)page_address(page); > + return ptdesc_address(ptdesc); > } > #endif > > #ifndef __HAVE_ARCH_PMD_FREE > static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) > { > + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); >
Re: [PATCH v4 18/34] mm: Remove page table members from struct page
On Mon, Jun 12, 2023 at 02:04:07PM -0700, Vishal Moola (Oracle) wrote: > The page table members are now split out into their own ptdesc struct. > Remove them from struct page. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm_types.h | 14 -- > include/linux/pgtable.h | 3 --- > 2 files changed, 17 deletions(-) > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 6161fe1ae5b8..31ffa1be21d0 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -141,20 +141,6 @@ struct page { > struct {/* Tail pages of compound page */ > unsigned long compound_head;/* Bit zero is set */ > }; > - struct {/* Page table pages */ > - unsigned long _pt_pad_1;/* compound_head */ > - pgtable_t pmd_huge_pte; /* protected by page->ptl */ > - unsigned long _pt_s390_gaddr; /* mapping */ > - union { > - struct mm_struct *pt_mm; /* x86 pgds only */ > - atomic_t pt_frag_refcount; /* powerpc */ > - }; > -#if ALLOC_SPLIT_PTLOCKS > - spinlock_t *ptl; > -#else > - spinlock_t ptl; > -#endif > - }; > struct {/* ZONE_DEVICE pages */ > /** @pgmap: Points to the hosting device page map. */ > struct dev_pagemap *pgmap; > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > index c405f74d3875..33cc19d752b3 100644 > --- a/include/linux/pgtable.h > +++ b/include/linux/pgtable.h > @@ -1019,10 +1019,7 @@ struct ptdesc { > TABLE_MATCH(flags, __page_flags); > TABLE_MATCH(compound_head, pt_list); > TABLE_MATCH(compound_head, _pt_pad_1); > -TABLE_MATCH(pmd_huge_pte, pmd_huge_pte); > TABLE_MATCH(mapping, _pt_s390_gaddr); > -TABLE_MATCH(pt_mm, pt_mm); > -TABLE_MATCH(ptl, ptl); > #undef TABLE_MATCH > static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); > > -- > 2.40.1 > > -- Sincerely yours, Mike.
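For readers who have not seen the earlier patches in the series: TABLE_MATCH is a build-time check that a ptdesc field sits at the same offset as the struct page field it overlays, along the lines of the sketch below (the exact definition lives in the patch that introduces struct ptdesc):

#define TABLE_MATCH(pg, pt)						\
	static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt))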
Re: [PATCH v4 17/34] s390: Convert various pgalloc functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:06PM -0700, Vishal Moola (Oracle) wrote: > As part of the conversions to replace pgtable constructor/destructors with > ptdesc equivalents, convert various page table functions to use ptdescs. > > Some of the functions use the *get*page*() helper functions. Convert > these to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/s390/include/asm/pgalloc.h | 4 +- > arch/s390/include/asm/tlb.h | 4 +- > arch/s390/mm/pgalloc.c | 108 > 3 files changed, 59 insertions(+), 57 deletions(-) > > diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h > index 17eb618f1348..00ad9b88fda9 100644 > --- a/arch/s390/include/asm/pgalloc.h > +++ b/arch/s390/include/asm/pgalloc.h > @@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, > unsigned long vmaddr) > if (!table) > return NULL; > crst_table_init(table, _SEGMENT_ENTRY_EMPTY); > - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { > + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { > crst_table_free(mm, table); > return NULL; > } > @@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t > *pmd) > { > if (mm_pmd_folded(mm)) > return; > - pgtable_pmd_page_dtor(virt_to_page(pmd)); > + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); > crst_table_free(mm, (unsigned long *) pmd); > } > > diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h > index b91f4a9b044c..383b1f91442c 100644 > --- a/arch/s390/include/asm/tlb.h > +++ b/arch/s390/include/asm/tlb.h > @@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, > pmd_t *pmd, > { > if (mm_pmd_folded(tlb->mm)) > return; > - pgtable_pmd_page_dtor(virt_to_page(pmd)); > + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); > __tlb_adjust_range(tlb, address, PAGE_SIZE); > tlb->mm->context.flush_mm = 1; > tlb->freed_tables = 1; > tlb->cleared_puds = 1; > - tlb_remove_table(tlb, pmd); > + tlb_remove_ptdesc(tlb, pmd); > } > > /* > diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c > index 6b99932abc66..eeb7c95b98cf 100644 > --- a/arch/s390/mm/pgalloc.c > +++ b/arch/s390/mm/pgalloc.c > @@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl); > > unsigned long *crst_table_alloc(struct mm_struct *mm) > { > - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); > + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); > > - if (!page) > + if (!ptdesc) > return NULL; > - arch_set_page_dat(page, CRST_ALLOC_ORDER); > - return (unsigned long *) page_to_virt(page); > + arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER); > + return (unsigned long *) ptdesc_to_virt(ptdesc); > } > > void crst_table_free(struct mm_struct *mm, unsigned long *table) > { > - free_pages((unsigned long)table, CRST_ALLOC_ORDER); > + pagetable_free(virt_to_ptdesc(table)); > } > > static void __crst_table_upgrade(void *arg) > @@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, > unsigned int bits) > > struct page *page_table_alloc_pgste(struct mm_struct *mm) > { > - struct page *page; > + struct ptdesc *ptdesc; > u64 *table; > > - page = alloc_page(GFP_KERNEL); > - if (page) { > - table = (u64 *)page_to_virt(page); > + ptdesc = pagetable_alloc(GFP_KERNEL, 0); > + if (ptdesc) { > + table = (u64 *)ptdesc_to_virt(ptdesc); > memset64(table, _PAGE_INVALID, PTRS_PER_PTE); > memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); > } > - return page; > + return 
ptdesc_page(ptdesc); > } > > void page_table_free_pgste(struct page *page) > { > - __free_page(page); > + pagetable_free(page_ptdesc(page)); > } > > #endif /* CONFIG_PGSTE */ > @@ -230,7 +230,7 @@ void page_table_free_pgste(struct page *page) > unsigned long *page_table_alloc(struct mm_struct *mm) > { > unsigned long *table; > - struct page *page; > + struct ptdesc *ptdesc; > unsigned int mask, bit; > > /* Try to get a fragment of a 4K page as a 2K page table */ > @@ -238,9 +238,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) > table = NULL; >
Re: [PATCH v4 16/34] s390: Convert various gmap functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:05PM -0700, Vishal Moola (Oracle) wrote: > In order to split struct ptdesc from struct page, convert various > functions to use ptdescs. > > Some of the functions use the *get*page*() helper functions. Convert > these to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. > > Signed-off-by: Vishal Moola (Oracle) With folding ptdesc->_pt_s390_gaddr = 0; into pagetable_free() Acked-by: Mike Rapoport (IBM) > --- > arch/s390/mm/gmap.c | 230 > 1 file changed, 128 insertions(+), 102 deletions(-) > > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c > index 81c683426b49..010e87df7299 100644 > --- a/arch/s390/mm/gmap.c > +++ b/arch/s390/mm/gmap.c > @@ -34,7 +34,7 @@ > static struct gmap *gmap_alloc(unsigned long limit) > { > struct gmap *gmap; > - struct page *page; > + struct ptdesc *ptdesc; > unsigned long *table; > unsigned long etype, atype; > > @@ -67,12 +67,12 @@ static struct gmap *gmap_alloc(unsigned long limit) > spin_lock_init(&gmap->guest_table_lock); > spin_lock_init(&gmap->shadow_lock); > refcount_set(&gmap->ref_count, 1); > - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); > - if (!page) > + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); > + if (!ptdesc) > goto out_free; > - page->_pt_s390_gaddr = 0; > - list_add(&page->lru, &gmap->crst_list); > - table = page_to_virt(page); > + ptdesc->_pt_s390_gaddr = 0; > + list_add(&ptdesc->pt_list, &gmap->crst_list); > + table = ptdesc_to_virt(ptdesc); > crst_table_init(table, etype); > gmap->table = table; > gmap->asce = atype | _ASCE_TABLE_LENGTH | > @@ -181,25 +181,25 @@ static void gmap_rmap_radix_tree_free(struct > radix_tree_root *root) > */ > static void gmap_free(struct gmap *gmap) > { > - struct page *page, *next; > + struct ptdesc *ptdesc, *next; > > /* Flush tlb of all gmaps (if not already done for shadows) */ > if (!(gmap_is_shadow(gmap) && gmap->removed)) > gmap_flush_tlb(gmap); > /* Free all segment & region tables. */ > - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) { > - page->_pt_s390_gaddr = 0; > - __free_pages(page, CRST_ALLOC_ORDER); > + list_for_each_entry_safe(ptdesc, next, &gmap->crst_list, pt_list) { > + ptdesc->_pt_s390_gaddr = 0; > + pagetable_free(ptdesc); > } > gmap_radix_tree_free(&gmap->guest_to_host); > gmap_radix_tree_free(&gmap->host_to_guest); > > /* Free additional data for a shadow gmap */ > if (gmap_is_shadow(gmap)) { > - /* Free all page tables. */ > - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) { > - page->_pt_s390_gaddr = 0; > - page_table_free_pgste(page); > + /* Free all ptdesc tables. 
*/ > + list_for_each_entry_safe(ptdesc, next, &gmap->pt_list, pt_list) > { > + ptdesc->_pt_s390_gaddr = 0; > + page_table_free_pgste(ptdesc_page(ptdesc)); > } > gmap_rmap_radix_tree_free(&gmap->host_to_rmap); > /* Release reference to the parent */ > @@ -308,27 +308,27 @@ EXPORT_SYMBOL_GPL(gmap_get_enabled); > static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, > unsigned long init, unsigned long gaddr) > { > - struct page *page; > + struct ptdesc *ptdesc; > unsigned long *new; > > /* since we dont free the gmap table until gmap_free we can unlock */ > - page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); > - if (!page) > + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); > + if (!ptdesc) > return -ENOMEM; > - new = page_to_virt(page); > + new = ptdesc_to_virt(ptdesc); > crst_table_init(new, init); > spin_lock(&gmap->guest_table_lock); > if (*table & _REGION_ENTRY_INVALID) { > - list_add(&page->lru, &gmap->crst_list); > + list_add(&ptdesc->pt_list, &gmap->crst_list); > *table = __pa(new) | _REGION_ENTRY_LENGTH | > (*table & _REGION_ENTRY_TYPE_MASK); > - page->_pt_s390_gaddr = gaddr; > - page = NULL; > +
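A minimal sketch of the fold mentioned in the ack above, assuming the v4 pagetable_free() from patch 5 and the _pt_s390_gaddr field from patch 4; this only illustrates the suggestion and is not code from the posted series:

static inline void pagetable_free(struct ptdesc *pt)
{
	struct page *page = ptdesc_page(pt);

	/* _pt_s390_gaddr aliases page->mapping, so clear it before freeing */
	pt->_pt_s390_gaddr = 0;
	__free_pages(page, compound_order(page));
}

With that in place, loops such as the ones in gmap_free() shrink to a plain pagetable_free(ptdesc) call, with no explicit zeroing at each call site.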
Re: [PATCH v4 15/34] x86: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:04PM -0700, Vishal Moola (Oracle) wrote: > In order to split struct ptdesc from struct page, convert various > functions to use ptdescs. > > Some of the functions use the *get*page*() helper functions. Convert Nit: *get_free_page*() > these to use pagetable_alloc() and ptdesc_address() instead to help > standardize page tables further. More importantly, get_free_pages() ensures a page won't be allocated from HIGHMEM, and for 32-bits this is a must. > Signed-off-by: Vishal Moola (Oracle) > --- > arch/x86/mm/pgtable.c | 46 +-- > 1 file changed, 27 insertions(+), 19 deletions(-) > > diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c > index 15a8009a4480..6da7fd5d4782 100644 > --- a/arch/x86/mm/pgtable.c > +++ b/arch/x86/mm/pgtable.c > @@ -52,7 +52,7 @@ early_param("userpte", setup_userpte); > > void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) > { > - pgtable_pte_page_dtor(pte); > + pagetable_pte_dtor(page_ptdesc(pte)); > paravirt_release_pte(page_to_pfn(pte)); > paravirt_tlb_remove_table(tlb, pte); > } > @@ -60,7 +60,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page > *pte) > #if CONFIG_PGTABLE_LEVELS > 2 > void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) > { > - struct page *page = virt_to_page(pmd); > + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); > paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); > /* >* NOTE! For PAE, any changes to the top page-directory-pointer-table > @@ -69,8 +69,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) > #ifdef CONFIG_X86_PAE > tlb->need_flush_all = 1; > #endif > - pgtable_pmd_page_dtor(page); > - paravirt_tlb_remove_table(tlb, page); > + pagetable_pmd_dtor(ptdesc); > + paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); > } > > #if CONFIG_PGTABLE_LEVELS > 3 > @@ -92,16 +92,16 @@ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) > > static inline void pgd_list_add(pgd_t *pgd) > { > - struct page *page = virt_to_page(pgd); > + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); > > - list_add(&page->lru, &pgd_list); > + list_add(&ptdesc->pt_list, &pgd_list); > } > > static inline void pgd_list_del(pgd_t *pgd) > { > - struct page *page = virt_to_page(pgd); > + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); > > - list_del(&page->lru); > + list_del(&ptdesc->pt_list); > } > > #define UNSHARED_PTRS_PER_PGD\ > @@ -112,12 +112,12 @@ static inline void pgd_list_del(pgd_t *pgd) > > static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) > { > - virt_to_page(pgd)->pt_mm = mm; > + virt_to_ptdesc(pgd)->pt_mm = mm; > } > > struct mm_struct *pgd_page_get_mm(struct page *page) > { > - return page->pt_mm; > + return page_ptdesc(page)->pt_mm; > } > > static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) > @@ -213,11 +213,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, > pmd_t *pmd) > static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) > { > int i; > + struct ptdesc *ptdesc; > > for (i = 0; i < count; i++) > if (pmds[i]) { > - pgtable_pmd_page_dtor(virt_to_page(pmds[i])); > - free_page((unsigned long)pmds[i]); > + ptdesc = virt_to_ptdesc(pmds[i]); > + > + pagetable_pmd_dtor(ptdesc); > + pagetable_free(ptdesc); > mm_dec_nr_pmds(mm); > } > } > @@ -232,16 +235,21 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t > *pmds[], int count) > gfp &= ~__GFP_ACCOUNT; > > for (i = 0; i < count; i++) { > - pmd_t *pmd = (pmd_t *)__get_free_page(gfp); > - if (!pmd) > + pmd_t *pmd = NULL; > + struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); > + > + if (!ptdesc) > failed = 
true; > - if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { > - free_page((unsigned long)pmd); > - pmd = NULL; > + if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { > + pagetable_free(ptdesc); > + ptdesc = NULL; > failed = true; > } > - if (pmd) > + if (ptdesc) { > mm_inc_nr_pmds(mm); > + pmd = ptdesc_address(ptdesc); > + } > + > pmds[i] = pmd; > } > > @@ -830,7 +838,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) > > free_page((unsigned long)pmd_sv); > > - pgtable_pmd_page_dtor(virt_to_page(pmd)); > + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); > free_page((unsigned long)pmd); > > return 1; > -- > 2.40.1 > >
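To make the HIGHMEM remark concrete: __get_free_page() hands back a kernel virtual address and therefore can never return highmem, whereas pagetable_alloc() returns whatever the gfp mask allows. A hedged sketch of the distinction (example function name and flags only, not the actual x86 code):

static pmd_t *pmd_table_alloc_lowmem_example(void)
{
	/* GFP_KERNEL pages always have a kernel mapping, so this is safe */
	struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0);

	if (!ptdesc)
		return NULL;
	/*
	 * Had the mask included __GFP_HIGHMEM, ptdesc_address() could point
	 * at a page with no permanent kernel mapping on 32-bit, which is
	 * exactly what the old __get_free_page() call ruled out.
	 */
	return ptdesc_address(ptdesc);
}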
Re: [PATCH v4 14/34] powerpc: Convert various functions to use ptdescs
On Mon, Jun 12, 2023 at 02:04:03PM -0700, Vishal Moola (Oracle) wrote: > In order to split struct ptdesc from struct page, convert various > functions to use ptdescs. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/powerpc/mm/book3s64/mmu_context.c | 10 +++--- > arch/powerpc/mm/book3s64/pgtable.c | 32 +- > arch/powerpc/mm/pgtable-frag.c | 46 +- > 3 files changed, 44 insertions(+), 44 deletions(-) > > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c > b/arch/powerpc/mm/book3s64/mmu_context.c > index c766e4c26e42..1715b07c630c 100644 > --- a/arch/powerpc/mm/book3s64/mmu_context.c > +++ b/arch/powerpc/mm/book3s64/mmu_context.c > @@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx) > static void pmd_frag_destroy(void *pmd_frag) > { > int count; > - struct page *page; > + struct ptdesc *ptdesc; > > - page = virt_to_page(pmd_frag); > + ptdesc = virt_to_ptdesc(pmd_frag); > /* drop all the pending references */ > count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; > /* We allow PTE_FRAG_NR fragments from a PTE page */ > - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { > - pgtable_pmd_page_dtor(page); > - __free_page(page); > + if (atomic_sub_and_test(PMD_FRAG_NR - count, > &ptdesc->pt_frag_refcount)) { > + pagetable_pmd_dtor(ptdesc); > + pagetable_free(ptdesc); > } > } > > diff --git a/arch/powerpc/mm/book3s64/pgtable.c > b/arch/powerpc/mm/book3s64/pgtable.c > index 85c84e89e3ea..1212deeabe15 100644 > --- a/arch/powerpc/mm/book3s64/pgtable.c > +++ b/arch/powerpc/mm/book3s64/pgtable.c > @@ -306,22 +306,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm) > static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) > { > void *ret = NULL; > - struct page *page; > + struct ptdesc *ptdesc; > gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; > > if (mm == &init_mm) > gfp &= ~__GFP_ACCOUNT; > - page = alloc_page(gfp); > - if (!page) > + ptdesc = pagetable_alloc(gfp, 0); > + if (!ptdesc) > return NULL; > - if (!pgtable_pmd_page_ctor(page)) { > - __free_pages(page, 0); > + if (!pagetable_pmd_ctor(ptdesc)) { > + pagetable_free(ptdesc); > return NULL; > } > > - atomic_set(&page->pt_frag_refcount, 1); > + atomic_set(&ptdesc->pt_frag_refcount, 1); > > - ret = page_address(page); > + ret = ptdesc_address(ptdesc); > /* >* if we support only one fragment just return the >* allocated page. > @@ -331,12 +331,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) > > spin_lock(&mm->page_table_lock); > /* > - * If we find pgtable_page set, we return > + * If we find ptdesc_page set, we return >* the allocated page with single fragment >* count. 
>*/ > if (likely(!mm->context.pmd_frag)) { > - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); > + atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR); > mm->context.pmd_frag = ret + PMD_FRAG_SIZE; > } > spin_unlock(&mm->page_table_lock); > @@ -357,15 +357,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, > unsigned long vmaddr) > > void pmd_fragment_free(unsigned long *pmd) > { > - struct page *page = virt_to_page(pmd); > + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); > > - if (PageReserved(page)) > - return free_reserved_page(page); > + if (pagetable_is_reserved(ptdesc)) > + return free_reserved_ptdesc(ptdesc); > > - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); > - if (atomic_dec_and_test(&page->pt_frag_refcount)) { > - pgtable_pmd_page_dtor(page); > - __free_page(page); > + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); > + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { > + pagetable_pmd_dtor(ptdesc); > + pagetable_free(ptdesc); > } > } > > diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c > index 20652daa1d7e..8961f1540209 100644 > --- a/arch/powerpc/mm/pgtable-frag.c > +++ b/arch/powerpc/mm/pgtable-frag.c > @@ -18,15 +18,15 @@ > void pte_frag_destroy(void *pte_frag) > { > int count; > - struct
Re: [PATCH v4 13/34] mm: Create ptdesc equivalents for pgtable_{pte,pmd}_page_{ctor,dtor}
On Mon, Jun 12, 2023 at 02:04:02PM -0700, Vishal Moola (Oracle) wrote: > Creates pagetable_pte_ctor(), pagetable_pmd_ctor(), pagetable_pte_dtor(), > and pagetable_pmd_dtor() and make the original pgtable > constructor/destructors wrappers. Nit: either "creates ... makes" or "create ... make" I like the second form more. > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 56 ++ > 1 file changed, 42 insertions(+), 14 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index a1af7983e1bd..dc211c43610b 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2886,20 +2886,34 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) > { return true; } > static inline void ptlock_free(struct ptdesc *ptdesc) {} > #endif /* USE_SPLIT_PTE_PTLOCKS */ > > -static inline bool pgtable_pte_page_ctor(struct page *page) > +static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) > { > - if (!ptlock_init(page_ptdesc(page))) > + struct folio *folio = ptdesc_folio(ptdesc); > + > + if (!ptlock_init(ptdesc)) > return false; > - __SetPageTable(page); > - inc_lruvec_page_state(page, NR_PAGETABLE); > + __folio_set_table(folio); This comment is more to patch 1 ("mm: Add PAGE_TYPE_OP folio functions") It would be better to have _pgtable here, as "table" does not necessary mean page table. With PageType SetPageTable was fine, but with folio I think it should be more explicit. I'd add a third parameter to PAGE_TYPE_OPS for that. > + lruvec_stat_add_folio(folio, NR_PAGETABLE); > return true; > } > > +static inline bool pgtable_pte_page_ctor(struct page *page) > +{ > + return pagetable_pte_ctor(page_ptdesc(page)); > +} > + > +static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) > +{ > + struct folio *folio = ptdesc_folio(ptdesc); > + > + ptlock_free(ptdesc); > + __folio_clear_table(folio); > + lruvec_stat_sub_folio(folio, NR_PAGETABLE); > +} > + > static inline void pgtable_pte_page_dtor(struct page *page) > { > - ptlock_free(page_ptdesc(page)); > - __ClearPageTable(page); > - dec_lruvec_page_state(page, NR_PAGETABLE); > + pagetable_pte_dtor(page_ptdesc(page)); > } > > #define pte_offset_map_lock(mm, pmd, address, ptlp) \ > @@ -2981,20 +2995,34 @@ static inline spinlock_t *pmd_lock(struct mm_struct > *mm, pmd_t *pmd) > return ptl; > } > > -static inline bool pgtable_pmd_page_ctor(struct page *page) > +static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) > { > - if (!pmd_ptlock_init(page_ptdesc(page))) > + struct folio *folio = ptdesc_folio(ptdesc); > + > + if (!pmd_ptlock_init(ptdesc)) > return false; > - __SetPageTable(page); > - inc_lruvec_page_state(page, NR_PAGETABLE); > + __folio_set_table(folio); > + lruvec_stat_add_folio(folio, NR_PAGETABLE); > return true; > } > > +static inline bool pgtable_pmd_page_ctor(struct page *page) > +{ > + return pagetable_pmd_ctor(page_ptdesc(page)); > +} > + > +static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) > +{ > + struct folio *folio = ptdesc_folio(ptdesc); > + > + pmd_ptlock_free(ptdesc); > + __folio_clear_table(folio); > + lruvec_stat_sub_folio(folio, NR_PAGETABLE); > +} > + > static inline void pgtable_pmd_page_dtor(struct page *page) > { > - pmd_ptlock_free(page_ptdesc(page)); > - __ClearPageTable(page); > - dec_lruvec_page_state(page, NR_PAGETABLE); > + pagetable_pmd_dtor(page_ptdesc(page)); > } > > /* > -- > 2.40.1 > > -- Sincerely yours, Mike.
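A rough sketch of the suggested PAGE_TYPE_OPS() change, based on the macro as it appears in patch 1 (quoted later in this thread); the third "fname" parameter is the hypothetical addition that lets page tables generate folio_test_pgtable() and friends:

#define PAGE_TYPE_OPS(uname, lname, fname)				\
static __always_inline int Page##uname(const struct page *page)	\
{									\
	return PageType(page, PG_##lname);				\
}									\
static __always_inline int folio_test_##fname(const struct folio *folio)\
{									\
	return folio_test_type(folio, PG_##lname);			\
}									\
static __always_inline void __SetPage##uname(struct page *page)	\
{									\
	VM_BUG_ON_PAGE(!PageType(page, 0), page);			\
	page->page_type &= ~PG_##lname;					\
}									\
static __always_inline void __folio_set_##fname(struct folio *folio)	\
{									\
	VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio);		\
	folio->page.page_type &= ~PG_##lname;				\
}									\
static __always_inline void __ClearPage##uname(struct page *page)	\
{									\
	VM_BUG_ON_PAGE(!Page##uname(page), page);			\
	page->page_type |= PG_##lname;					\
}									\
static __always_inline void __folio_clear_##fname(struct folio *folio)	\
{									\
	VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio);		\
	folio->page.page_type |= PG_##lname;				\
}

Page tables would then use PAGE_TYPE_OPS(Table, table, pgtable), and the ctor/dtor helpers above become __folio_set_pgtable()/__folio_clear_pgtable(), with no ambiguity about what kind of "table" is meant.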
Re: [PATCH v4 12/34] mm: Convert ptlock_free() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:01PM -0700, Vishal Moola (Oracle) wrote: > This removes some direct accesses to struct page, working towards > splitting out struct ptdesc from struct page. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 10 +- > mm/memory.c| 4 ++-- > 2 files changed, 7 insertions(+), 7 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 3b54bb4c9753..a1af7983e1bd 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2826,7 +2826,7 @@ static inline void pagetable_clear(void *x) > #if ALLOC_SPLIT_PTLOCKS > void __init ptlock_cache_init(void); > bool ptlock_alloc(struct ptdesc *ptdesc); > -extern void ptlock_free(struct page *page); > +void ptlock_free(struct ptdesc *ptdesc); > > static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) > { > @@ -2842,7 +2842,7 @@ static inline bool ptlock_alloc(struct ptdesc *ptdesc) > return true; > } > > -static inline void ptlock_free(struct page *page) > +static inline void ptlock_free(struct ptdesc *ptdesc) > { > } > > @@ -2883,7 +2883,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct > *mm, pmd_t *pmd) > } > static inline void ptlock_cache_init(void) {} > static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } > -static inline void ptlock_free(struct page *page) {} > +static inline void ptlock_free(struct ptdesc *ptdesc) {} > #endif /* USE_SPLIT_PTE_PTLOCKS */ > > static inline bool pgtable_pte_page_ctor(struct page *page) > @@ -2897,7 +2897,7 @@ static inline bool pgtable_pte_page_ctor(struct page > *page) > > static inline void pgtable_pte_page_dtor(struct page *page) > { > - ptlock_free(page); > + ptlock_free(page_ptdesc(page)); > __ClearPageTable(page); > dec_lruvec_page_state(page, NR_PAGETABLE); > } > @@ -2955,7 +2955,7 @@ static inline void pmd_ptlock_free(struct ptdesc > *ptdesc) > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); > #endif > - ptlock_free(ptdesc_page(ptdesc)); > + ptlock_free(ptdesc); > } > > #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) > diff --git a/mm/memory.c b/mm/memory.c > index ba9579117686..d4d2ea5cf0fd 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -5945,8 +5945,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) > return true; > } > > -void ptlock_free(struct page *page) > +void ptlock_free(struct ptdesc *ptdesc) > { > - kmem_cache_free(page_ptl_cachep, page->ptl); > + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); > } > #endif > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 11/34] mm: Convert pmd_ptlock_free() to use ptdescs
On Mon, Jun 12, 2023 at 02:04:00PM -0700, Vishal Moola (Oracle) wrote: > This removes some direct accesses to struct page, working towards > splitting out struct ptdesc from struct page. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index f48e626d9c98..3b54bb4c9753 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2950,12 +2950,12 @@ static inline bool pmd_ptlock_init(struct ptdesc > *ptdesc) > return ptlock_init(ptdesc); > } > > -static inline void pmd_ptlock_free(struct page *page) > +static inline void pmd_ptlock_free(struct ptdesc *ptdesc) > { > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > - VM_BUG_ON_PAGE(page->pmd_huge_pte, page); > + VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); > #endif > - ptlock_free(page); > + ptlock_free(ptdesc_page(ptdesc)); > } > > #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) > @@ -2968,7 +2968,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct > *mm, pmd_t *pmd) > } > > static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } > -static inline void pmd_ptlock_free(struct page *page) {} > +static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} > > #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) > > @@ -2992,7 +2992,7 @@ static inline bool pgtable_pmd_page_ctor(struct page > *page) > > static inline void pgtable_pmd_page_dtor(struct page *page) > { > - pmd_ptlock_free(page); > + pmd_ptlock_free(page_ptdesc(page)); > __ClearPageTable(page); > dec_lruvec_page_state(page, NR_PAGETABLE); > } > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 10/34] mm: Convert ptlock_init() to use ptdescs
On Mon, Jun 12, 2023 at 02:03:59PM -0700, Vishal Moola (Oracle) wrote: > This removes some direct accesses to struct page, working towards > splitting out struct ptdesc from struct page. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 14 +++--- > 1 file changed, 7 insertions(+), 7 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index daecf1db6cf1..f48e626d9c98 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2857,7 +2857,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct > *mm, pmd_t *pmd) > return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); > } > > -static inline bool ptlock_init(struct page *page) > +static inline bool ptlock_init(struct ptdesc *ptdesc) > { > /* >* prep_new_page() initialize page->private (and therefore page->ptl) > @@ -2866,10 +2866,10 @@ static inline bool ptlock_init(struct page *page) >* It can happen if arch try to use slab for page table allocation: >* slab code uses page->slab_cache, which share storage with page->ptl. >*/ > - VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); > - if (!ptlock_alloc(page_ptdesc(page))) > + VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); > + if (!ptlock_alloc(ptdesc)) > return false; > - spin_lock_init(ptlock_ptr(page_ptdesc(page))); > + spin_lock_init(ptlock_ptr(ptdesc)); > return true; > } > > @@ -2882,13 +2882,13 @@ static inline spinlock_t *pte_lockptr(struct > mm_struct *mm, pmd_t *pmd) > return &mm->page_table_lock; > } > static inline void ptlock_cache_init(void) {} > -static inline bool ptlock_init(struct page *page) { return true; } > +static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } > static inline void ptlock_free(struct page *page) {} > #endif /* USE_SPLIT_PTE_PTLOCKS */ > > static inline bool pgtable_pte_page_ctor(struct page *page) > { > - if (!ptlock_init(page)) > + if (!ptlock_init(page_ptdesc(page))) > return false; > __SetPageTable(page); > inc_lruvec_page_state(page, NR_PAGETABLE); > @@ -2947,7 +2947,7 @@ static inline bool pmd_ptlock_init(struct ptdesc > *ptdesc) > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > ptdesc->pmd_huge_pte = NULL; > #endif > - return ptlock_init(ptdesc_page(ptdesc)); > + return ptlock_init(ptdesc); > } > > static inline void pmd_ptlock_free(struct page *page) > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 09/34] mm: Convert pmd_ptlock_init() to use ptdescs
On Mon, Jun 12, 2023 at 02:03:58PM -0700, Vishal Moola (Oracle) wrote: > This removes some direct accesses to struct page, working towards > splitting out struct ptdesc from struct page. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index bb934d51390f..daecf1db6cf1 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2942,12 +2942,12 @@ static inline spinlock_t *pmd_lockptr(struct > mm_struct *mm, pmd_t *pmd) > return ptlock_ptr(pmd_ptdesc(pmd)); > } > > -static inline bool pmd_ptlock_init(struct page *page) > +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) > { > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > - page->pmd_huge_pte = NULL; > + ptdesc->pmd_huge_pte = NULL; > #endif > - return ptlock_init(page); > + return ptlock_init(ptdesc_page(ptdesc)); > } > > static inline void pmd_ptlock_free(struct page *page) > @@ -2967,7 +2967,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct > *mm, pmd_t *pmd) > return &mm->page_table_lock; > } > > -static inline bool pmd_ptlock_init(struct page *page) { return true; } > +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } > static inline void pmd_ptlock_free(struct page *page) {} > > #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) > @@ -2983,7 +2983,7 @@ static inline spinlock_t *pmd_lock(struct mm_struct > *mm, pmd_t *pmd) > > static inline bool pgtable_pmd_page_ctor(struct page *page) > { > - if (!pmd_ptlock_init(page)) > + if (!pmd_ptlock_init(page_ptdesc(page))) > return false; > __SetPageTable(page); > inc_lruvec_page_state(page, NR_PAGETABLE); > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 08/34] mm: Convert ptlock_ptr() to use ptdescs
On Mon, Jun 12, 2023 at 02:03:57PM -0700, Vishal Moola (Oracle) wrote: > This removes some direct accesses to struct page, working towards > splitting out struct ptdesc from struct page. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > arch/x86/xen/mmu_pv.c | 2 +- > include/linux/mm.h| 14 +++--- > 2 files changed, 8 insertions(+), 8 deletions(-) > > diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c > index b3b8d289b9ab..f469862e3ef4 100644 > --- a/arch/x86/xen/mmu_pv.c > +++ b/arch/x86/xen/mmu_pv.c > @@ -651,7 +651,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct > mm_struct *mm) > spinlock_t *ptl = NULL; > > #if USE_SPLIT_PTE_PTLOCKS > - ptl = ptlock_ptr(page); > + ptl = ptlock_ptr(page_ptdesc(page)); > spin_lock_nest_lock(ptl, &mm->page_table_lock); > #endif > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index e6f1be2a405e..bb934d51390f 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2828,9 +2828,9 @@ void __init ptlock_cache_init(void); > bool ptlock_alloc(struct ptdesc *ptdesc); > extern void ptlock_free(struct page *page); > > -static inline spinlock_t *ptlock_ptr(struct page *page) > +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) > { > - return page->ptl; > + return ptdesc->ptl; > } > #else /* ALLOC_SPLIT_PTLOCKS */ > static inline void ptlock_cache_init(void) > @@ -2846,15 +2846,15 @@ static inline void ptlock_free(struct page *page) > { > } > > -static inline spinlock_t *ptlock_ptr(struct page *page) > +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) > { > - return &page->ptl; > + return &ptdesc->ptl; > } > #endif /* ALLOC_SPLIT_PTLOCKS */ > > static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) > { > - return ptlock_ptr(pmd_page(*pmd)); > + return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); > } > > static inline bool ptlock_init(struct page *page) > @@ -2869,7 +2869,7 @@ static inline bool ptlock_init(struct page *page) > VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); > if (!ptlock_alloc(page_ptdesc(page))) > return false; > - spin_lock_init(ptlock_ptr(page)); > + spin_lock_init(ptlock_ptr(page_ptdesc(page))); > return true; > } > > @@ -2939,7 +2939,7 @@ static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) > > static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > { > - return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd))); > + return ptlock_ptr(pmd_ptdesc(pmd)); > } > > static inline bool pmd_ptlock_init(struct page *page) > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 07/34] mm: Convert ptlock_alloc() to use ptdescs
On Mon, Jun 12, 2023 at 02:03:56PM -0700, Vishal Moola (Oracle) wrote: > This removes some direct accesses to struct page, working towards > splitting out struct ptdesc from struct page. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 6 +++--- > mm/memory.c| 4 ++-- > 2 files changed, 5 insertions(+), 5 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 088b7664f897..e6f1be2a405e 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2825,7 +2825,7 @@ static inline void pagetable_clear(void *x) > #if USE_SPLIT_PTE_PTLOCKS > #if ALLOC_SPLIT_PTLOCKS > void __init ptlock_cache_init(void); > -extern bool ptlock_alloc(struct page *page); > +bool ptlock_alloc(struct ptdesc *ptdesc); > extern void ptlock_free(struct page *page); > > static inline spinlock_t *ptlock_ptr(struct page *page) > @@ -2837,7 +2837,7 @@ static inline void ptlock_cache_init(void) > { > } > > -static inline bool ptlock_alloc(struct page *page) > +static inline bool ptlock_alloc(struct ptdesc *ptdesc) > { > return true; > } > @@ -2867,7 +2867,7 @@ static inline bool ptlock_init(struct page *page) >* slab code uses page->slab_cache, which share storage with page->ptl. >*/ > VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); > - if (!ptlock_alloc(page)) > + if (!ptlock_alloc(page_ptdesc(page))) > return false; > spin_lock_init(ptlock_ptr(page)); > return true; > diff --git a/mm/memory.c b/mm/memory.c > index 80ce9dda2779..ba9579117686 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -5934,14 +5934,14 @@ void __init ptlock_cache_init(void) > SLAB_PANIC, NULL); > } > > -bool ptlock_alloc(struct page *page) > +bool ptlock_alloc(struct ptdesc *ptdesc) > { > spinlock_t *ptl; > > ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); > if (!ptl) > return false; > - page->ptl = ptl; > + ptdesc->ptl = ptl; > return true; > } > > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 06/34] mm: Convert pmd_pgtable_page() to pmd_ptdesc()
On Mon, Jun 12, 2023 at 02:03:55PM -0700, Vishal Moola (Oracle) wrote: > Converts pmd_pgtable_page() to pmd_ptdesc() and all its callers. This > removes some direct accesses to struct page, working towards splitting > out struct ptdesc from struct page. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index f184f1eba85d..088b7664f897 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2931,15 +2931,15 @@ static inline void pgtable_pte_page_dtor(struct page > *page) > > #if USE_SPLIT_PMD_PTLOCKS > > -static inline struct page *pmd_pgtable_page(pmd_t *pmd) > +static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) > { > unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); > - return virt_to_page((void *)((unsigned long) pmd & mask)); > + return virt_to_ptdesc((void *)((unsigned long) pmd & mask)); > } > > static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) > { > - return ptlock_ptr(pmd_pgtable_page(pmd)); > + return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd))); > } > > static inline bool pmd_ptlock_init(struct page *page) > @@ -2958,7 +2958,7 @@ static inline void pmd_ptlock_free(struct page *page) > ptlock_free(page); > } > > -#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte) > +#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) > > #else > > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v4 05/34] mm: add utility functions for ptdesc
On Mon, Jun 12, 2023 at 02:03:54PM -0700, Vishal Moola (Oracle) wrote: > Introduce utility functions setting the foundation for ptdescs. These > will also assist in the splitting out of ptdesc from struct page. > > Functions that focus on the descriptor are prefixed with ptdesc_* while > functions that focus on the pagetable are prefixed with pagetable_*. > > pagetable_alloc() is defined to allocate new ptdesc pages as compound > pages. This is to standardize ptdescs by allowing for one allocation > and one free function, in contrast to 2 allocation and 2 free functions. > > Signed-off-by: Vishal Moola (Oracle) > --- > include/asm-generic/tlb.h | 11 +++ > include/linux/mm.h| 61 +++ > include/linux/pgtable.h | 12 > 3 files changed, 84 insertions(+) > > diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h > index b46617207c93..6bade9e0e799 100644 > --- a/include/asm-generic/tlb.h > +++ b/include/asm-generic/tlb.h > @@ -481,6 +481,17 @@ static inline void tlb_remove_page(struct mmu_gather > *tlb, struct page *page) > return tlb_remove_page_size(tlb, page, PAGE_SIZE); > } > > +static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) > +{ > + tlb_remove_table(tlb, pt); > +} > + > +/* Like tlb_remove_ptdesc, but for page-like page directories. */ > +static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct > ptdesc *pt) > +{ > + tlb_remove_page(tlb, ptdesc_page(pt)); > +} > + > static inline void tlb_change_page_size(struct mmu_gather *tlb, >unsigned int page_size) > { > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 0db09639dd2d..f184f1eba85d 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2766,6 +2766,62 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, > pud_t *pud, unsigned long a > } > #endif /* CONFIG_MMU */ > > +static inline struct ptdesc *virt_to_ptdesc(const void *x) > +{ > + return page_ptdesc(virt_to_page(x)); > +} > + > +static inline void *ptdesc_to_virt(const struct ptdesc *pt) > +{ > + return page_to_virt(ptdesc_page(pt)); > +} > + > +static inline void *ptdesc_address(const struct ptdesc *pt) > +{ > + return folio_address(ptdesc_folio(pt)); > +} > + > +static inline bool pagetable_is_reserved(struct ptdesc *pt) > +{ > + return folio_test_reserved(ptdesc_folio(pt)); > +} > + > +/** > + * pagetable_alloc - Allocate pagetables > + * @gfp:GFP flags > + * @order: desired pagetable order > + * > + * pagetable_alloc allocates a page table descriptor as well as all pages > + * described by it. I think the order should be switched here to emphasize that primarily this method allocates memory for page tables. How about pagetable_alloc allocates memory for the page tables as well as a page table descriptor that describes the allocated memory > + * > + * Return: The ptdesc describing the allocated page tables. > + */ > +static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) > +{ > + struct page *page = alloc_pages(gfp | __GFP_COMP, order); > + > + return page_ptdesc(page); > +} > + > +/** > + * pagetable_free - Free pagetables > + * @pt: The page table descriptor > + * > + * pagetable_free frees a page table descriptor as well as all page > + * tables described by said ptdesc. Similarly here. 
> + */ > +static inline void pagetable_free(struct ptdesc *pt) > +{ > + struct page *page = ptdesc_page(pt); > + > + __free_pages(page, compound_order(page)); > +} > + > +static inline void pagetable_clear(void *x) > +{ > + clear_page(x); > +} > + > #if USE_SPLIT_PTE_PTLOCKS > #if ALLOC_SPLIT_PTLOCKS > void __init ptlock_cache_init(void); > @@ -2992,6 +3048,11 @@ static inline void mark_page_reserved(struct page > *page) > adjust_managed_page_count(page, -1); > } > > +static inline void free_reserved_ptdesc(struct ptdesc *pt) > +{ > + free_reserved_page(ptdesc_page(pt)); > +} > + > /* > * Default method to free all the __init memory into the buddy system. > * The freed pages will be poisoned with pattern "poison" if it's within > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > index 330de96ebfd6..c405f74d3875 100644 > --- a/include/linux/pgtable.h > +++ b/include/linux/pgtable.h > @@ -1026,6 +1026,18 @@ TABLE_MATCH(ptl, ptl); > #undef TABLE_MATCH > static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); > > +#define ptdesc_page(pt) (_Generic((pt), > \ > + const struct ptdesc *: (const struct page *)(pt), \ > + struct ptdesc *:(struct page *)(pt))) > + > +#define ptdesc_folio(pt) (_Generic((pt), \ > + const struct ptdesc *: (const struct folio *)(pt), \ > + struct ptdesc *:(struct folio *)(pt))) > + > +#define page_ptdesc(p)
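Applying the suggested wording, the two kernel-doc blocks could read roughly as follows (descriptions only; the code itself stays as posted):

/**
 * pagetable_alloc - Allocate pagetables
 * @gfp:	GFP flags
 * @order:	desired pagetable order
 *
 * pagetable_alloc allocates memory for the page tables as well as a page
 * table descriptor that describes the allocated memory.
 *
 * Return: The ptdesc describing the allocated page tables.
 */

/**
 * pagetable_free - Free pagetables
 * @pt:	The page table descriptor
 *
 * pagetable_free frees the memory of all page tables described by @pt,
 * along with the page table descriptor itself.
 */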
Re: [PATCH v4 04/34] pgtable: Create struct ptdesc
On Mon, Jun 12, 2023 at 02:03:53PM -0700, Vishal Moola (Oracle) wrote: > Currently, page table information is stored within struct page. As part > of simplifying struct page, create struct ptdesc for page table > information. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/pgtable.h | 51 + > 1 file changed, 51 insertions(+) > > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > index c5a51481bbb9..330de96ebfd6 100644 > --- a/include/linux/pgtable.h > +++ b/include/linux/pgtable.h > @@ -975,6 +975,57 @@ static inline void ptep_modify_prot_commit(struct > vm_area_struct *vma, > #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ > #endif /* CONFIG_MMU */ > > + > +/** > + * struct ptdesc - Memory descriptor for page tables. > + * @__page_flags: Same as page flags. Unused for page tables. > + * @pt_list: List of used page tables. Used for s390 and x86. > + * @_pt_pad_1: Padding that aliases with page's compound head. > + * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. > + * @_pt_s390_gaddr: Aliases with page's mapping. Used for s390 gmap only. > + * @pt_mm: Used for x86 pgds. > + * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 > only. > + * @ptl: Lock for the page table. Do you mind aligning the descriptions by @pt_frag_refcount? I think it'll be more readable. > + * > + * This struct overlays struct page for now. Do not modify without a good > + * understanding of the issues. > + */ > +struct ptdesc { > + unsigned long __page_flags; > + > + union { > + struct list_head pt_list; > + struct { > + unsigned long _pt_pad_1; > + pgtable_t pmd_huge_pte; > + }; > + }; > + unsigned long _pt_s390_gaddr; > + > + union { > + struct mm_struct *pt_mm; > + atomic_t pt_frag_refcount; > + }; > + > +#if ALLOC_SPLIT_PTLOCKS > + spinlock_t *ptl; > +#else > + spinlock_t ptl; > +#endif > +}; > + > +#define TABLE_MATCH(pg, pt) \ > + static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) > +TABLE_MATCH(flags, __page_flags); > +TABLE_MATCH(compound_head, pt_list); > +TABLE_MATCH(compound_head, _pt_pad_1); > +TABLE_MATCH(pmd_huge_pte, pmd_huge_pte); > +TABLE_MATCH(mapping, _pt_s390_gaddr); > +TABLE_MATCH(pt_mm, pt_mm); > +TABLE_MATCH(ptl, ptl); > +#undef TABLE_MATCH > +static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); > + > /* > * No-op macros that just return the current protection value. Defined here > * because these macros can be used even if CONFIG_MMU is not defined. > -- > 2.40.1 > > -- Sincerely yours, Mike.
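For reference, aligning the descriptions as requested would make the kernel-doc block look like this (text unchanged, only the whitespace differs):

/**
 * struct ptdesc -    Memory descriptor for page tables.
 * @__page_flags:     Same as page flags. Unused for page tables.
 * @pt_list:          List of used page tables. Used for s390 and x86.
 * @_pt_pad_1:        Padding that aliases with page's compound head.
 * @pmd_huge_pte:     Protected by ptdesc->ptl, used for THPs.
 * @_pt_s390_gaddr:   Aliases with page's mapping. Used for s390 gmap only.
 * @pt_mm:            Used for x86 pgds.
 * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390
 *                    only.
 * @ptl:              Lock for the page table.
 */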
Re: [PATCH v4 03/34] s390: Use pt_frag_refcount for pagetables
On Mon, Jun 12, 2023 at 02:03:52PM -0700, Vishal Moola (Oracle) wrote: > s390 currently uses _refcount to identify fragmented page tables. > The page table struct already has a member pt_frag_refcount used by > powerpc, so have s390 use that instead of the _refcount field as well. > This improves the safety for _refcount and the page table tracking. > > This also allows us to simplify the tracking since we can once again use > the lower byte of pt_frag_refcount instead of the upper byte of _refcount. > > Signed-off-by: Vishal Moola (Oracle) One nit below, otherwise Acked-by: Mike Rapoport (IBM) > --- > arch/s390/mm/pgalloc.c | 38 +++--- > 1 file changed, 15 insertions(+), 23 deletions(-) > > diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c > index 66ab68db9842..6b99932abc66 100644 > --- a/arch/s390/mm/pgalloc.c > +++ b/arch/s390/mm/pgalloc.c > @@ -182,20 +182,17 @@ void page_table_free_pgste(struct page *page) > * As follows from the above, no unallocated or fully allocated parent > * pages are contained in mm_context_t::pgtable_list. > * > - * The upper byte (bits 24-31) of the parent page _refcount is used > + * The lower byte (bits 0-7) of the parent page pt_frag_refcount is used > * for tracking contained 2KB-pgtables and has the following format: > * > * PP AA > - * 01234567upper byte (bits 24-31) of struct page::_refcount > + * 01234567upper byte (bits 0-7) of struct page::pt_frag_refcount Nit: lower > * || || > * || |+--- upper 2KB-pgtable is allocated > * || + lower 2KB-pgtable is allocated > * |+--- upper 2KB-pgtable is pending for removal > * + lower 2KB-pgtable is pending for removal > * > - * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why > - * using _refcount is possible). > - * > * When 2KB-pgtable is allocated the corresponding AA bit is set to 1. > * The parent page is either: > * - added to mm_context_t::pgtable_list in case the second half of the > @@ -243,11 +240,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm) > if (!list_empty(&mm->context.pgtable_list)) { > page = list_first_entry(&mm->context.pgtable_list, > struct page, lru); > - mask = atomic_read(&page->_refcount) >> 24; > + mask = atomic_read(&page->pt_frag_refcount); > /* >* The pending removal bits must also be checked. >* Failure to do so might lead to an impossible > - * value of (i.e 0x13 or 0x23) written to _refcount. > + * value of (i.e 0x13 or 0x23) written to > + * pt_frag_refcount. >* Such values violate the assumption that pending and >* allocation bits are mutually exclusive, and the rest >* of the code unrails as result. 
That could lead to > @@ -259,8 +257,8 @@ unsigned long *page_table_alloc(struct mm_struct *mm) > bit = mask & 1; /* =1 -> second 2K */ > if (bit) > table += PTRS_PER_PTE; > - atomic_xor_bits(&page->_refcount, > - 0x01U << (bit + 24)); > + atomic_xor_bits(&page->pt_frag_refcount, > + 0x01U << bit); > list_del(&page->lru); > } > } > @@ -281,12 +279,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm) > table = (unsigned long *) page_to_virt(page); > if (mm_alloc_pgste(mm)) { > /* Return 4K page table with PGSTEs */ > - atomic_xor_bits(&page->_refcount, 0x03U << 24); > + atomic_xor_bits(&page->pt_frag_refcount, 0x03U); > memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); > memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); > } else { > /* Return the first 2K fragment of the page */ > - atomic_xor_bits(&page->_refcount, 0x01U << 24); > + atomic_xor_bits(&page->pt_frag_refcount, 0x01U); > memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); > spin_lock_bh(&mm->context.lock); > list_add(&page->lru, &mm->context.pgtable_list);
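That is, with the nit applied the affected line of the diagram simply becomes:

 * 01234567	lower byte (bits 0-7) of struct page::pt_frag_refcount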
Re: [PATCH v4 02/34] s390: Use _pt_s390_gaddr for gmap address tracking
On Mon, Jun 12, 2023 at 02:03:51PM -0700, Vishal Moola (Oracle) wrote: > s390 uses page->index to keep track of page tables for the guest address > space. In an attempt to consolidate the usage of page fields in s390, > replace _pt_pad_2 with _pt_s390_gaddr to replace page->index in gmap. > > This will help with the splitting of struct ptdesc from struct page, as > well as allow s390 to use _pt_frag_refcount for fragmented page table > tracking. > > Since page->_pt_s390_gaddr aliases with mapping, ensure its set to NULL > before freeing the pages as well. I'm looking at the final result and unless I've missed something, setting of _pt_s390_gaddr to 0 is always followed by pagetable_free(). Can't we have pagetable_free() take care of zeroing _pt_s390_gaddr? I think patch 16 ("s390: Convert various gmap functions to use ptdescs") would be the right place for that. Otherwise: Acked-by: Mike Rapoport (IBM) > This also reverts commit 7e25de77bc5ea ("s390/mm: use pmd_pgtable_page() > helper in __gmap_segment_gaddr()") which had s390 use > pmd_pgtable_page() to get a gmap page table, as pmd_pgtable_page() > should be used for more generic process page tables. > > Signed-off-by: Vishal Moola (Oracle) > --- > arch/s390/mm/gmap.c | 56 +++- > include/linux/mm_types.h | 2 +- > 2 files changed, 39 insertions(+), 19 deletions(-) > > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c > index dc90d1eb0d55..81c683426b49 100644 > --- a/arch/s390/mm/gmap.c > +++ b/arch/s390/mm/gmap.c > @@ -70,7 +70,7 @@ static struct gmap *gmap_alloc(unsigned long limit) > page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); > if (!page) > goto out_free; > - page->index = 0; > + page->_pt_s390_gaddr = 0; > list_add(&page->lru, &gmap->crst_list); > table = page_to_virt(page); > crst_table_init(table, etype); > @@ -187,16 +187,20 @@ static void gmap_free(struct gmap *gmap) > if (!(gmap_is_shadow(gmap) && gmap->removed)) > gmap_flush_tlb(gmap); > /* Free all segment & region tables. */ > - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) > + list_for_each_entry_safe(page, next, &gmap->crst_list, lru) { > + page->_pt_s390_gaddr = 0; > __free_pages(page, CRST_ALLOC_ORDER); > + } > gmap_radix_tree_free(&gmap->guest_to_host); > gmap_radix_tree_free(&gmap->host_to_guest); > > /* Free additional data for a shadow gmap */ > if (gmap_is_shadow(gmap)) { > /* Free all page tables. 
*/ > - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) > + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) { > + page->_pt_s390_gaddr = 0; > page_table_free_pgste(page); > + } > gmap_rmap_radix_tree_free(&gmap->host_to_rmap); > /* Release reference to the parent */ > gmap_put(gmap->parent); > @@ -318,12 +322,14 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned > long *table, > list_add(&page->lru, &gmap->crst_list); > *table = __pa(new) | _REGION_ENTRY_LENGTH | > (*table & _REGION_ENTRY_TYPE_MASK); > - page->index = gaddr; > + page->_pt_s390_gaddr = gaddr; > page = NULL; > } > spin_unlock(&gmap->guest_table_lock); > - if (page) > + if (page) { > + page->_pt_s390_gaddr = 0; > __free_pages(page, CRST_ALLOC_ORDER); > + } > return 0; > } > > @@ -336,12 +342,14 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned > long *table, > static unsigned long __gmap_segment_gaddr(unsigned long *entry) > { > struct page *page; > - unsigned long offset; > + unsigned long offset, mask; > > offset = (unsigned long) entry / sizeof(unsigned long); > offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; > - page = pmd_pgtable_page((pmd_t *) entry); > - return page->index + offset; > + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); > + page = virt_to_page((void *)((unsigned long) entry & mask)); > + > + return page->_pt_s390_gaddr + offset; > } > > /** > @@ -1351,6 +1359,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned > long raddr) > /* Free page table */ > page = phys_to_page(pgt); > list_del(&page->lru); &
Re: [PATCH v4 01/34] mm: Add PAGE_TYPE_OP folio functions
On Mon, Jun 12, 2023 at 02:03:50PM -0700, Vishal Moola (Oracle) wrote: > No folio equivalents for page type operations have been defined, so > define them for later folio conversions. > > Also changes the Page##uname macros to take in const struct page* since > we only read the memory here. > > Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) > --- > include/linux/page-flags.h | 20 ++-- > 1 file changed, 18 insertions(+), 2 deletions(-) > > diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h > index 92a2063a0a23..e99a616b9bcd 100644 > --- a/include/linux/page-flags.h > +++ b/include/linux/page-flags.h > @@ -908,6 +908,8 @@ static inline bool is_page_hwpoison(struct page *page) > > #define PageType(page, flag) \ > ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) > +#define folio_test_type(folio, flag) \ > + ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) > > static inline int page_type_has_type(unsigned int page_type) > { > @@ -920,20 +922,34 @@ static inline int page_has_type(struct page *page) > } > > #define PAGE_TYPE_OPS(uname, lname) \ > -static __always_inline int Page##uname(struct page *page)\ > +static __always_inline int Page##uname(const struct page *page) > \ > {\ > return PageType(page, PG_##lname); \ > }\ > +static __always_inline int folio_test_##lname(const struct folio *folio)\ > +{\ > + return folio_test_type(folio, PG_##lname); \ > +}\ > static __always_inline void __SetPage##uname(struct page *page) > \ > {\ > VM_BUG_ON_PAGE(!PageType(page, 0), page); \ > page->page_type &= ~PG_##lname; \ > }\ > +static __always_inline void __folio_set_##lname(struct folio *folio) \ > +{\ > + VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ > + folio->page.page_type &= ~PG_##lname; \ > +}\ > static __always_inline void __ClearPage##uname(struct page *page)\ > {\ > VM_BUG_ON_PAGE(!Page##uname(page), page); \ > page->page_type |= PG_##lname; \ > -} > +}\ > +static __always_inline void __folio_clear_##lname(struct folio *folio) > \ > +{\ > + VM_BUG_ON_FOLIO(!folio_test_##lname(folio), folio); \ > + folio->page.page_type |= PG_##lname;\ > +}\ > > /* > * PageBuddy() indicates that the page is free and in the buddy system > -- > 2.40.1 > > -- Sincerely yours, Mike.
Re: [PATCH v9 02/42] mm: Move pte/pmd_mkwrite() callers with no VMA to _novma()
On Mon, Jun 12, 2023 at 05:10:28PM -0700, Rick Edgecombe wrote: > The x86 Shadow stack feature includes a new type of memory called shadow > stack. This shadow stack memory has some unusual properties, which requires > some core mm changes to function properly. > > One of these unusual properties is that shadow stack memory is writable, > but only in limited ways. These limits are applied via a specific PTE > bit combination. Nevertheless, the memory is writable, and core mm code > will need to apply the writable permissions in the typical paths that > call pte_mkwrite(). Future patches will make pte_mkwrite() take a VMA, so > that the x86 implementation of it can know whether to create regular > writable memory or shadow stack memory. Nit:^ mappings? > But there are a couple of challenges to this. Modifying the signatures of > each arch pte_mkwrite() implementation would be error prone because some > are generated with macros and would need to be re-implemented. Also, some > pte_mkwrite() callers operate on kernel memory without a VMA. > > So this can be done in a three step process. First pte_mkwrite() can be > renamed to pte_mkwrite_novma() in each arch, with a generic pte_mkwrite() > added that just calls pte_mkwrite_novma(). Next callers without a VMA can > be moved to pte_mkwrite_novma(). And lastly, pte_mkwrite() and all callers > can be changed to take/pass a VMA. > > Previous patches have done the first step, so next move the callers that > don't have a VMA to pte_mkwrite_novma(). Also do the same for I hear x86 maintainers asking to drop "previous patches" ;-) Maybe This is the second step of the conversion that moves the callers ... > pmd_mkwrite(). This will be ok for the shadow stack feature, as these > callers are on kernel memory which will not need to be made shadow stack, > and the other architectures only currently support one type of memory > in pte_mkwrite() > > Cc: linux-...@vger.kernel.org > Cc: linux-arm-ker...@lists.infradead.org > Cc: linux-s...@vger.kernel.org > Cc: xen-devel@lists.xenproject.org > Cc: linux-a...@vger.kernel.org > Cc: linux...@kvack.org > Signed-off-by: Rick Edgecombe Reviewed-by: Mike Rapoport (IBM) -- Sincerely yours, Mike.
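The first step described here (each architecture keeps its implementation under the _novma name while a generic wrapper preserves the old calling convention for callers that have not been converted yet) could look roughly like this; the #ifndef guards are an assumption about how the fallback is wired up, not a quote from the series:

#ifndef pte_mkwrite
static inline pte_t pte_mkwrite(pte_t pte)
{
	return pte_mkwrite_novma(pte);
}
#endif

#ifndef pmd_mkwrite
static inline pmd_t pmd_mkwrite(pmd_t pmd)
{
	return pmd_mkwrite_novma(pmd);
}
#endif

Callers that operate on kernel memory without a VMA, the subject of this patch, then switch to calling pte_mkwrite_novma()/pmd_mkwrite_novma() directly.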
Re: [PATCH v2 05/34] mm: add utility functions for ptdesc
On Sat, May 27, 2023 at 04:09:31PM +0100, Matthew Wilcox wrote: > On Sat, May 27, 2023 at 01:41:44PM +0300, Mike Rapoport wrote: > > Sorry if I wasn't clear, by "page table page" I meant the page (or memory > > for that matter) for actual page table rather than struct page describing > > that memory. > > > > So what we allocate here is the actual memory for the page tables and not > > the memory for the metadata. That's why I think the name ptdesc_alloc is > > confusing. > > But that's going to be the common pattern in the Glorious Future. > You allocate a folio and that includes both the folio memory descriptor > and the 2^n pages of memory described by that folio. Similarly for all > the other memory descriptors. I'm not arguing with that, but I'm not happy about the naming. IMO, the name should reflect that we allocate memory for page tables rather than for the descriptor of that memory, say pgtable_alloc() or page_table_alloc(). -- Sincerely yours, Mike.
Re: [PATCH v2 05/34] mm: add utility functions for ptdesc
On Thu, May 25, 2023 at 01:53:24PM -0700, Vishal Moola wrote: > On Thu, May 25, 2023 at 1:26 PM Mike Rapoport wrote: > > > > On Thu, May 25, 2023 at 11:04:28AM -0700, Vishal Moola wrote: > > > On Thu, May 25, 2023 at 2:10 AM Mike Rapoport wrote: > > > > > + > > > > > +static inline struct ptdesc *ptdesc_alloc(gfp_t gfp, unsigned int > > > > > order) > > > > > +{ > > > > > + struct page *page = alloc_pages(gfp | __GFP_COMP, order); > > > > > + > > > > > + return page_ptdesc(page); > > > > > +} > > > > > + > > > > > +static inline void ptdesc_free(struct ptdesc *pt) > > > > > +{ > > > > > + struct page *page = ptdesc_page(pt); > > > > > + > > > > > + __free_pages(page, compound_order(page)); > > > > > +} > > > > > > > > The ptdesc_{alloc,free} API does not sound right to me. The name > > > > ptdesc_alloc() implies the allocation of the ptdesc itself, rather than > > > > allocation of page table page. The same goes for free. > > > > > > I'm not sure I see the difference. Could you elaborate? > > > > I read ptdesc_alloc() as "allocate a ptdesc" rather than as "allocate a > > page for page table and return ptdesc pointing to that page". Seems very > > confusing to me already and it will be even more confusion when we'll start > > allocating actual ptdescs. > > Hmm, I see what you're saying. I'm envisioning this function evolving into > one that allocates a ptdesc later. I don't see why we would need to have both > a > page table page AND ptdesc at any point, but that may be a lack of knowledge > from my part. Sorry if I wasn't clear, by "page table page" I meant the page (or memory for that matter) for actual page table rather than struct page describing that memory. So what we allocate here is the actual memory for the page tables and not the memory for the metadata. That's why I think the name ptdesc_alloc is confusing. > I was thinking later, if necessary, we could make another function > (only to be used internally) to allocate page table pages. -- Sincerely yours, Mike.
Re: [PATCH v2 05/34] mm: add utility functions for ptdesc
On Thu, May 25, 2023 at 11:04:28AM -0700, Vishal Moola wrote: > On Thu, May 25, 2023 at 2:10 AM Mike Rapoport wrote: > > > + > > > +static inline struct ptdesc *ptdesc_alloc(gfp_t gfp, unsigned int order) > > > +{ > > > + struct page *page = alloc_pages(gfp | __GFP_COMP, order); > > > + > > > + return page_ptdesc(page); > > > +} > > > + > > > +static inline void ptdesc_free(struct ptdesc *pt) > > > +{ > > > + struct page *page = ptdesc_page(pt); > > > + > > > + __free_pages(page, compound_order(page)); > > > +} > > > > The ptdesc_{alloc,free} API does not sound right to me. The name > > ptdesc_alloc() implies the allocation of the ptdesc itself, rather than > > allocation of page table page. The same goes for free. > > I'm not sure I see the difference. Could you elaborate? I read ptdesc_alloc() as "allocate a ptdesc" rather than as "allocate a page for page table and return ptdesc pointing to that page". Seems very confusing to me already and it will be even more confusion when we'll start allocating actual ptdescs. -- Sincerely yours, Mike.
Re: [PATCH v2 01/34] mm: Add PAGE_TYPE_OP folio functions
On Thu, May 25, 2023 at 10:00:23AM -0700, Vishal Moola wrote: > On Thu, May 25, 2023 at 1:56 AM Mike Rapoport wrote: > > > > Hi, > > > > On Mon, May 01, 2023 at 12:27:56PM -0700, Vishal Moola (Oracle) wrote: > > > No folio equivalents for page type operations have been defined, so > > > define them for later folio conversions. > > > > Can you please elaborate why would we need folios for page table > > descriptors? > > Thanks for the review! > > These macros are for callers that care about the page type, i.e. Table and > Buddy. Aside from accounting for those cases, the page tables don't use > folios. > These are more for the cleanliness of those callers. But why would using folio APIs for PageType be cleaner than using page APIs? Do you have an example? -- Sincerely yours, Mike.
Re: [PATCH v2 13/34] mm: Create ptdesc equivalents for pgtable_{pte,pmd}_page_{ctor,dtor}
On Mon, May 01, 2023 at 12:28:08PM -0700, Vishal Moola (Oracle) wrote: > Creates ptdesc_pte_ctor(), ptdesc_pmd_ctor(), ptdesc_pte_dtor(), and > ptdesc_pmd_dtor() and make the original pgtable constructor/destructors > wrappers. I think pgtable_pXY_ctor/dtor names would be better. > Signed-off-by: Vishal Moola (Oracle) > --- > include/linux/mm.h | 56 ++ > 1 file changed, 42 insertions(+), 14 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 58c911341a33..dc61aeca9077 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2847,20 +2847,34 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) > { return true; } > static inline void ptlock_free(struct ptdesc *ptdesc) {} > #endif /* USE_SPLIT_PTE_PTLOCKS */ > > -static inline bool pgtable_pte_page_ctor(struct page *page) > +static inline bool ptdesc_pte_ctor(struct ptdesc *ptdesc) > { > - if (!ptlock_init(page_ptdesc(page))) > + struct folio *folio = ptdesc_folio(ptdesc); > + > + if (!ptlock_init(ptdesc)) > return false; > - __SetPageTable(page); > - inc_lruvec_page_state(page, NR_PAGETABLE); > + __folio_set_table(folio); > + lruvec_stat_add_folio(folio, NR_PAGETABLE); > return true; > } > > +static inline bool pgtable_pte_page_ctor(struct page *page) > +{ > + return ptdesc_pte_ctor(page_ptdesc(page)); > +} > + > +static inline void ptdesc_pte_dtor(struct ptdesc *ptdesc) > +{ > + struct folio *folio = ptdesc_folio(ptdesc); > + > + ptlock_free(ptdesc); > + __folio_clear_table(folio); > + lruvec_stat_sub_folio(folio, NR_PAGETABLE); > +} > + > static inline void pgtable_pte_page_dtor(struct page *page) > { > - ptlock_free(page_ptdesc(page)); > - __ClearPageTable(page); > - dec_lruvec_page_state(page, NR_PAGETABLE); > + ptdesc_pte_dtor(page_ptdesc(page)); > } > > #define pte_offset_map_lock(mm, pmd, address, ptlp) \ > @@ -2942,20 +2956,34 @@ static inline spinlock_t *pmd_lock(struct mm_struct > *mm, pmd_t *pmd) > return ptl; > } > > -static inline bool pgtable_pmd_page_ctor(struct page *page) > +static inline bool ptdesc_pmd_ctor(struct ptdesc *ptdesc) > { > - if (!pmd_ptlock_init(page_ptdesc(page))) > + struct folio *folio = ptdesc_folio(ptdesc); > + > + if (!pmd_ptlock_init(ptdesc)) > return false; > - __SetPageTable(page); > - inc_lruvec_page_state(page, NR_PAGETABLE); > + __folio_set_table(folio); > + lruvec_stat_add_folio(folio, NR_PAGETABLE); > return true; > } > > +static inline bool pgtable_pmd_page_ctor(struct page *page) > +{ > + return ptdesc_pmd_ctor(page_ptdesc(page)); > +} > + > +static inline void ptdesc_pmd_dtor(struct ptdesc *ptdesc) > +{ > + struct folio *folio = ptdesc_folio(ptdesc); > + > + pmd_ptlock_free(ptdesc); > + __folio_clear_table(folio); > + lruvec_stat_sub_folio(folio, NR_PAGETABLE); > +} > + > static inline void pgtable_pmd_page_dtor(struct page *page) > { > - pmd_ptlock_free(page_ptdesc(page)); > - __ClearPageTable(page); > - dec_lruvec_page_state(page, NR_PAGETABLE); > + ptdesc_pmd_dtor(page_ptdesc(page)); > } > > /* > -- > 2.39.2 > > -- Sincerely yours, Mike.
Re: [PATCH v2 05/34] mm: add utility functions for ptdesc
On Mon, May 01, 2023 at 12:28:00PM -0700, Vishal Moola (Oracle) wrote: > Introduce utility functions setting the foundation for ptdescs. These > will also assist in the splitting out of ptdesc from struct page. > > ptdesc_alloc() is defined to allocate new ptdesc pages as compound > pages. This is to standardize ptdescs by allowing for one allocation > and one free function, in contrast to 2 allocation and 2 free functions. > > Signed-off-by: Vishal Moola (Oracle) > --- > include/asm-generic/tlb.h | 11 ++ > include/linux/mm.h| 44 +++ > include/linux/pgtable.h | 12 +++ > 3 files changed, 67 insertions(+) > > diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h > index b46617207c93..6bade9e0e799 100644 > --- a/include/asm-generic/tlb.h > +++ b/include/asm-generic/tlb.h > @@ -481,6 +481,17 @@ static inline void tlb_remove_page(struct mmu_gather > *tlb, struct page *page) > return tlb_remove_page_size(tlb, page, PAGE_SIZE); > } > > +static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) > +{ > + tlb_remove_table(tlb, pt); > +} > + > +/* Like tlb_remove_ptdesc, but for page-like page directories. */ > +static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct > ptdesc *pt) > +{ > + tlb_remove_page(tlb, ptdesc_page(pt)); > +} > + > static inline void tlb_change_page_size(struct mmu_gather *tlb, >unsigned int page_size) > { > diff --git a/include/linux/mm.h b/include/linux/mm.h > index b18848ae7e22..258f3b730359 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2744,6 +2744,45 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, > pud_t *pud, unsigned long a > } > #endif /* CONFIG_MMU */ > > +static inline struct ptdesc *virt_to_ptdesc(const void *x) > +{ > + return page_ptdesc(virt_to_head_page(x)); Do we ever use compound pages for page tables? > +} > + > +static inline void *ptdesc_to_virt(const struct ptdesc *pt) > +{ > + return page_to_virt(ptdesc_page(pt)); > +} > + > +static inline void *ptdesc_address(const struct ptdesc *pt) > +{ > + return folio_address(ptdesc_folio(pt)); > +} > + > +static inline bool ptdesc_is_reserved(struct ptdesc *pt) > +{ > + return folio_test_reserved(ptdesc_folio(pt)); > +} > + > +static inline struct ptdesc *ptdesc_alloc(gfp_t gfp, unsigned int order) > +{ > + struct page *page = alloc_pages(gfp | __GFP_COMP, order); > + > + return page_ptdesc(page); > +} > + > +static inline void ptdesc_free(struct ptdesc *pt) > +{ > + struct page *page = ptdesc_page(pt); > + > + __free_pages(page, compound_order(page)); > +} The ptdesc_{alloc,free} API does not sound right to me. The name ptdesc_alloc() implies the allocation of the ptdesc itself, rather than allocation of page table page. The same goes for free. > + > +static inline void ptdesc_clear(void *x) > +{ > + clear_page(x); > +} > + > #if USE_SPLIT_PTE_PTLOCKS > #if ALLOC_SPLIT_PTLOCKS > void __init ptlock_cache_init(void); > @@ -2970,6 +3009,11 @@ static inline void mark_page_reserved(struct page > *page) > adjust_managed_page_count(page, -1); > } > > +static inline void free_reserved_ptdesc(struct ptdesc *pt) > +{ > + free_reserved_page(ptdesc_page(pt)); > +} > + > /* > * Default method to free all the __init memory into the buddy system. 
> * The freed pages will be poisoned with pattern "poison" if it's within > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > index 5e0f51308724..b067ac10f3dd 100644 > --- a/include/linux/pgtable.h > +++ b/include/linux/pgtable.h > @@ -1041,6 +1041,18 @@ TABLE_MATCH(ptl, ptl); > #undef TABLE_MATCH > static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); > > +#define ptdesc_page(pt) (_Generic((pt), > \ > + const struct ptdesc *: (const struct page *)(pt), \ > + struct ptdesc *:(struct page *)(pt))) > + > +#define ptdesc_folio(pt) (_Generic((pt), \ > + const struct ptdesc *: (const struct folio *)(pt), \ > + struct ptdesc *:(struct folio *)(pt))) > + > +#define page_ptdesc(p) (_Generic((p), > \ > + const struct page *:(const struct ptdesc *)(p), \ > + struct page *: (struct ptdesc *)(p))) > + > /* > * No-op macros that just return the current protection value. Defined here > * because these macros can be used even if CONFIG_MMU is not defined. > -- > 2.39.2 > > -- Sincerely yours, Mike.
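The naming concern is easier to see at a call site. A hypothetical user of the proposed helpers (the my_ names are illustrative; despite its name, ptdesc_alloc() allocates the page-table page itself, not just the descriptor):

	static pmd_t *my_alloc_pmd_table(struct mm_struct *mm)
	{
		struct ptdesc *ptdesc = ptdesc_alloc(GFP_KERNEL | __GFP_ZERO, 0);

		if (!ptdesc)
			return NULL;
		return ptdesc_address(ptdesc);	/* kernel address of the table */
	}

	static void my_free_pmd_table(pmd_t *pmd)
	{
		ptdesc_free(virt_to_ptdesc(pmd));
	}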
Re: [PATCH v2 02/34] s390: Use _pt_s390_gaddr for gmap address tracking
On Mon, May 01, 2023 at 12:27:57PM -0700, Vishal Moola (Oracle) wrote: > s390 uses page->index to keep track of page tables for the guest address > space. In an attempt to consolidate the usage of page fields in s390, > replace _pt_pad_2 with _pt_s390_gaddr to replace page->index in gmap. > > This will help with the splitting of struct ptdesc from struct page, as > well as allow s390 to use _pt_frag_refcount for fragmented page table > tracking. > > Since page->_pt_s390_gaddr aliases with mapping, ensure its set to NULL > before freeing the pages as well. Wouldn't it be easier to use _pt_pad_1 which is aliased with lru and that does not seem to be used by page tables at all? > This also reverts commit 7e25de77bc5ea ("s390/mm: use pmd_pgtable_page() > helper in __gmap_segment_gaddr()") which had s390 use > pmd_pgtable_page() to get a gmap page table, as pmd_pgtable_page() > should be used for more generic process page tables. > > Signed-off-by: Vishal Moola (Oracle) > --- > arch/s390/mm/gmap.c | 56 +++- > include/linux/mm_types.h | 2 +- > 2 files changed, 39 insertions(+), 19 deletions(-) > > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c > index dfe905c7bd8e..a9e8b1805894 100644 > --- a/arch/s390/mm/gmap.c > +++ b/arch/s390/mm/gmap.c > @@ -70,7 +70,7 @@ static struct gmap *gmap_alloc(unsigned long limit) > page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); > if (!page) > goto out_free; > - page->index = 0; > + page->_pt_s390_gaddr = 0; > list_add(&page->lru, &gmap->crst_list); > table = page_to_virt(page); > crst_table_init(table, etype); > @@ -187,16 +187,20 @@ static void gmap_free(struct gmap *gmap) > if (!(gmap_is_shadow(gmap) && gmap->removed)) > gmap_flush_tlb(gmap); > /* Free all segment & region tables. */ > - list_for_each_entry_safe(page, next, &gmap->crst_list, lru) > + list_for_each_entry_safe(page, next, &gmap->crst_list, lru) { > + page->_pt_s390_gaddr = 0; > __free_pages(page, CRST_ALLOC_ORDER); > + } > gmap_radix_tree_free(&gmap->guest_to_host); > gmap_radix_tree_free(&gmap->host_to_guest); > > /* Free additional data for a shadow gmap */ > if (gmap_is_shadow(gmap)) { > /* Free all page tables. 
*/ > - list_for_each_entry_safe(page, next, &gmap->pt_list, lru) > + list_for_each_entry_safe(page, next, &gmap->pt_list, lru) { > + page->_pt_s390_gaddr = 0; > page_table_free_pgste(page); > + } > gmap_rmap_radix_tree_free(&gmap->host_to_rmap); > /* Release reference to the parent */ > gmap_put(gmap->parent); > @@ -318,12 +322,14 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned > long *table, > list_add(&page->lru, &gmap->crst_list); > *table = __pa(new) | _REGION_ENTRY_LENGTH | > (*table & _REGION_ENTRY_TYPE_MASK); > - page->index = gaddr; > + page->_pt_s390_gaddr = gaddr; > page = NULL; > } > spin_unlock(&gmap->guest_table_lock); > - if (page) > + if (page) { > + page->_pt_s390_gaddr = 0; > __free_pages(page, CRST_ALLOC_ORDER); > + } > return 0; > } > > @@ -336,12 +342,14 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned > long *table, > static unsigned long __gmap_segment_gaddr(unsigned long *entry) > { > struct page *page; > - unsigned long offset; > + unsigned long offset, mask; > > offset = (unsigned long) entry / sizeof(unsigned long); > offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; > - page = pmd_pgtable_page((pmd_t *) entry); > - return page->index + offset; > + mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); > + page = virt_to_page((void *)((unsigned long) entry & mask)); > + > + return page->_pt_s390_gaddr + offset; > } > > /** > @@ -1351,6 +1359,7 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned > long raddr) > /* Free page table */ > page = phys_to_page(pgt); > list_del(&page->lru); > + page->_pt_s390_gaddr = 0; > page_table_free_pgste(page); > } > > @@ -1379,6 +1388,7 @@ static void __gmap_unshadow_sgt(struct gmap *sg, > unsigned long raddr, > /* Free page table */ > page = phys_to_page(pgt); > list_del(&page->lru); > + page->_pt_s390_gaddr = 0; > page_table_free_pgste(page); > } > } > @@ -1409,6 +1419,7 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned > long raddr) > /* Free segment table */ > page = phys_to_page(sgt); > list_del(&page->lru); > + page->_pt_s390_gaddr = 0; > __free_pages(page, CRST_ALLOC_ORDER); > } > > @@ -1437,6 +1448,7 @@ static void __gmap_unshadow_r3t
Re: [PATCH v2 01/34] mm: Add PAGE_TYPE_OP folio functions
Hi, On Mon, May 01, 2023 at 12:27:56PM -0700, Vishal Moola (Oracle) wrote: > No folio equivalents for page type operations have been defined, so > define them for later folio conversions. Can you please elaborate why would we need folios for page table descriptors? > Also changes the Page##uname macros to take in const struct page* since > we only read the memory here. > > Signed-off-by: Vishal Moola (Oracle) > --- > include/linux/page-flags.h | 20 ++-- > 1 file changed, 18 insertions(+), 2 deletions(-) > > diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h > index 1c68d67b832f..607b495d1b57 100644 > --- a/include/linux/page-flags.h > +++ b/include/linux/page-flags.h > @@ -902,6 +902,8 @@ static inline bool is_page_hwpoison(struct page *page) > > #define PageType(page, flag) \ > ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) > +#define folio_test_type(folio, flag) \ > + ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) > > static inline int page_type_has_type(unsigned int page_type) > { > @@ -914,20 +916,34 @@ static inline int page_has_type(struct page *page) > } > > #define PAGE_TYPE_OPS(uname, lname) \ > -static __always_inline int Page##uname(struct page *page)\ > +static __always_inline int Page##uname(const struct page *page) > \ > {\ > return PageType(page, PG_##lname); \ > }\ > +static __always_inline int folio_test_##lname(const struct folio *folio)\ > +{\ > + return folio_test_type(folio, PG_##lname); \ > +}\ > static __always_inline void __SetPage##uname(struct page *page) > \ > {\ > VM_BUG_ON_PAGE(!PageType(page, 0), page); \ > page->page_type &= ~PG_##lname; \ > }\ > +static __always_inline void __folio_set_##lname(struct folio *folio) \ > +{\ > + VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ > + folio->page.page_type &= ~PG_##lname; \ > +}\ > static __always_inline void __ClearPage##uname(struct page *page)\ > {\ > VM_BUG_ON_PAGE(!Page##uname(page), page); \ > page->page_type |= PG_##lname; \ > -} > +}\ > +static __always_inline void __folio_clear_##lname(struct folio *folio) > \ > +{\ > + VM_BUG_ON_FOLIO(!folio_test_##lname(folio), folio); \ > + folio->page.page_type |= PG_##lname;\ > +}\ > > /* > * PageBuddy() indicates that the page is free and in the buddy system > -- > 2.39.2 > > -- Sincerely yours, Mike.
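For reference, an abridged hand-expansion of PAGE_TYPE_OPS(Table, table) with this patch applied, showing the folio helpers it generates (expanded by hand from the macro quoted above, so treat it as an approximation):

	static __always_inline int PageTable(const struct page *page)
	{
		return PageType(page, PG_table);
	}
	static __always_inline int folio_test_table(const struct folio *folio)
	{
		return folio_test_type(folio, PG_table);
	}
	static __always_inline void __folio_set_table(struct folio *folio)
	{
		VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio);
		folio->page.page_type &= ~PG_table;
	}
	static __always_inline void __folio_clear_table(struct folio *folio)
	{
		VM_BUG_ON_FOLIO(!folio_test_table(folio), folio);
		folio->page.page_type |= PG_table;
	}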
Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions
On Thu, Jan 26, 2023 at 11:17:09AM +0200, Mike Rapoport wrote: > On Wed, Jan 25, 2023 at 12:38:46AM -0800, Suren Baghdasaryan wrote: > > vm_flags are among VMA attributes which affect decisions like VMA merging > > and splitting. Therefore all vm_flags modifications are performed after > > taking exclusive mmap_lock to prevent vm_flags updates racing with such > > operations. Introduce modifier functions for vm_flags to be used whenever > > flags are updated. This way we can better check and control correct > > locking behavior during these updates. > > > > Signed-off-by: Suren Baghdasaryan > > --- > > include/linux/mm.h | 37 + > > include/linux/mm_types.h | 8 +++- > > 2 files changed, 44 insertions(+), 1 deletion(-) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index c2f62bdce134..b71f2809caac 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -627,6 +627,43 @@ static inline void vma_init(struct vm_area_struct > > *vma, struct mm_struct *mm) > > INIT_LIST_HEAD(&vma->anon_vma_chain); > > } > > > > +/* Use when VMA is not part of the VMA tree and needs no locking */ > > +static inline void init_vm_flags(struct vm_area_struct *vma, > > +unsigned long flags) > > I'd suggest to make it vm_flags_init() etc. Thinking more about it, it will be even clearer to name these vma_flags_xyz() > Except that > > Acked-by: Mike Rapoport (IBM) > -- Sincerely yours, Mike.
Re: [PATCH v2 6/6] mm: export dump_mm()
On Wed, Jan 25, 2023 at 12:38:51AM -0800, Suren Baghdasaryan wrote: > mmap_assert_write_locked() is used in vm_flags modifiers. Because > mmap_assert_write_locked() uses dump_mm() and vm_flags are sometimes > modified from from inside a module, it's necessary to export > dump_mm() function. > > Signed-off-by: Suren Baghdasaryan Acked-by: Mike Rapoport (IBM) > --- > mm/debug.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/mm/debug.c b/mm/debug.c > index 9d3d893dc7f4..96d594e16292 100644 > --- a/mm/debug.c > +++ b/mm/debug.c > @@ -215,6 +215,7 @@ void dump_mm(const struct mm_struct *mm) > mm->def_flags, &mm->def_flags > ); > } > +EXPORT_SYMBOL(dump_mm); > > static bool page_init_poisoning __read_mostly = true; > > -- > 2.39.1 >
Re: [PATCH v2 5/6] mm: introduce mod_vm_flags_nolock and use it in untrack_pfn
On Wed, Jan 25, 2023 at 12:38:50AM -0800, Suren Baghdasaryan wrote: > In cases when VMA flags are modified after VMA was isolated and mmap_lock > was downgraded, flags modifications would result in an assertion because > mmap write lock is not held. > Introduce mod_vm_flags_nolock to be used in such situation. vm_flags_mod_nolock? > Pass a hint to untrack_pfn to conditionally use mod_vm_flags_nolock for > flags modification and to avoid assertion. > > Signed-off-by: Suren Baghdasaryan > --- > arch/x86/mm/pat/memtype.c | 10 +++--- > include/linux/mm.h| 12 +--- > include/linux/pgtable.h | 5 +++-- > mm/memory.c | 13 +++-- > mm/memremap.c | 4 ++-- > mm/mmap.c | 16 ++-- > 6 files changed, 38 insertions(+), 22 deletions(-) > > diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c > index ae9645c900fa..d8adc0b42cf2 100644 > --- a/arch/x86/mm/pat/memtype.c > +++ b/arch/x86/mm/pat/memtype.c > @@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, > pgprot_t *prot, pfn_t pfn) > * can be for the entire vma (in which case pfn, size are zero). > */ > void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, > - unsigned long size) > + unsigned long size, bool mm_wr_locked) > { > resource_size_t paddr; > unsigned long prot; > @@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned > long pfn, > size = vma->vm_end - vma->vm_start; > } > free_pfn_range(paddr, size); > - if (vma) > - clear_vm_flags(vma, VM_PAT); > + if (vma) { > + if (mm_wr_locked) > + clear_vm_flags(vma, VM_PAT); > + else > + mod_vm_flags_nolock(vma, 0, VM_PAT); > + } > } > > /* > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 55335edd1373..48d49930c411 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -656,12 +656,18 @@ static inline void clear_vm_flags(struct vm_area_struct > *vma, > vma->vm_flags &= ~flags; > } > > +static inline void mod_vm_flags_nolock(struct vm_area_struct *vma, > +unsigned long set, unsigned long clear) > +{ > + vma->vm_flags |= set; > + vma->vm_flags &= ~clear; > +} > + > static inline void mod_vm_flags(struct vm_area_struct *vma, > unsigned long set, unsigned long clear) > { > mmap_assert_write_locked(vma->vm_mm); > - vma->vm_flags |= set; > - vma->vm_flags &= ~clear; > + mod_vm_flags_nolock(vma, set, clear); > } > > static inline void vma_set_anonymous(struct vm_area_struct *vma) > @@ -2087,7 +2093,7 @@ static inline void zap_vma_pages(struct vm_area_struct > *vma) > } > void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, > struct vm_area_struct *start_vma, unsigned long start, > - unsigned long end); > + unsigned long end, bool mm_wr_locked); > > struct mmu_notifier_range; > > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h > index 5fd45454c073..c63cd44777ec 100644 > --- a/include/linux/pgtable.h > +++ b/include/linux/pgtable.h > @@ -1185,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct > *vma) > * can be for the entire vma (in which case pfn, size are zero). 
> */ > static inline void untrack_pfn(struct vm_area_struct *vma, > -unsigned long pfn, unsigned long size) > +unsigned long pfn, unsigned long size, > +bool mm_wr_locked) > { > } > > @@ -1203,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct > *vma, pgprot_t *prot, >pfn_t pfn); > extern int track_pfn_copy(struct vm_area_struct *vma); > extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, > - unsigned long size); > + unsigned long size, bool mm_wr_locked); > extern void untrack_pfn_moved(struct vm_area_struct *vma); > #endif > > diff --git a/mm/memory.c b/mm/memory.c > index d6902065e558..5b11b50e2c4a 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -1613,7 +1613,7 @@ void unmap_page_range(struct mmu_gather *tlb, > static void unmap_single_vma(struct mmu_gather *tlb, > struct vm_area_struct *vma, unsigned long start_addr, > unsigned long end_addr, > - struct zap_details *details) > + struct zap_details *details, bool mm_wr_locked) > { > unsigned long start = max(vma->vm_start, start_addr); > unsigned long end; > @@ -1628,7 +1628,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, > uprobe_munmap(vma, start, end); > > if (unlikely(vma->vm_flags & VM_PFNMAP)) > - untrack_pfn(vma, 0, 0); > + untrack_pfn(vma, 0, 0, mm_wr_lo
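The intended split between the locked and the _nolock variant, sketched with the flag from the x86 PAT hunk above (hypothetical helper, names as in this patch):

	static void my_clear_pat(struct vm_area_struct *vma, bool mm_wr_locked)
	{
		if (mm_wr_locked)
			clear_vm_flags(vma, VM_PAT);		/* asserts mmap write lock */
		else
			mod_vm_flags_nolock(vma, 0, VM_PAT);	/* lock was downgraded */
	}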
Re: [PATCH v2 4/6] mm: replace vma->vm_flags indirect modification in ksm_madvise
On Wed, Jan 25, 2023 at 12:38:49AM -0800, Suren Baghdasaryan wrote: > Replace indirect modifications to vma->vm_flags with calls to modifier > functions to be able to track flag changes and to keep vma locking > correctness. Add a BUG_ON check in ksm_madvise() to catch indirect > vm_flags modification attempts. > > Signed-off-by: Suren Baghdasaryan Acked-by: Mike Rapoport (IBM) > --- > arch/powerpc/kvm/book3s_hv_uvmem.c | 5 - > arch/s390/mm/gmap.c| 5 - > mm/khugepaged.c| 2 ++ > mm/ksm.c | 2 ++ > 4 files changed, 12 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c > b/arch/powerpc/kvm/book3s_hv_uvmem.c > index 1d67baa5557a..325a7a47d348 100644 > --- a/arch/powerpc/kvm/book3s_hv_uvmem.c > +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c > @@ -393,6 +393,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, > { > unsigned long gfn = memslot->base_gfn; > unsigned long end, start = gfn_to_hva(kvm, gfn); > + unsigned long vm_flags; > int ret = 0; > struct vm_area_struct *vma; > int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE; > @@ -409,12 +410,14 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, > ret = H_STATE; > break; > } > + vm_flags = vma->vm_flags; > ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, > - merge_flag, &vma->vm_flags); > + merge_flag, &vm_flags); > if (ret) { > ret = H_STATE; > break; > } > + reset_vm_flags(vma, vm_flags); > start = vma->vm_end; > } while (end > vma->vm_end); > > diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c > index 3a695b8a1e3c..d5eb47dcdacb 100644 > --- a/arch/s390/mm/gmap.c > +++ b/arch/s390/mm/gmap.c > @@ -2587,14 +2587,17 @@ int gmap_mark_unmergeable(void) > { > struct mm_struct *mm = current->mm; > struct vm_area_struct *vma; > + unsigned long vm_flags; > int ret; > VMA_ITERATOR(vmi, mm, 0); > > for_each_vma(vmi, vma) { > + vm_flags = vma->vm_flags; > ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, > - MADV_UNMERGEABLE, &vma->vm_flags); > + MADV_UNMERGEABLE, &vm_flags); > if (ret) > return ret; > + reset_vm_flags(vma, vm_flags); > } > mm->def_flags &= ~VM_MERGEABLE; > return 0; > diff --git a/mm/khugepaged.c b/mm/khugepaged.c > index 8abc59345bf2..76b24cd0c179 100644 > --- a/mm/khugepaged.c > +++ b/mm/khugepaged.c > @@ -354,6 +354,8 @@ struct attribute_group khugepaged_attr_group = { > int hugepage_madvise(struct vm_area_struct *vma, >unsigned long *vm_flags, int advice) > { > + /* vma->vm_flags can be changed only using modifier functions */ > + BUG_ON(vm_flags == &vma->vm_flags); > switch (advice) { > case MADV_HUGEPAGE: > #ifdef CONFIG_S390 > diff --git a/mm/ksm.c b/mm/ksm.c > index 04f1c8c2df11..992b2be9f5e6 100644 > --- a/mm/ksm.c > +++ b/mm/ksm.c > @@ -2573,6 +2573,8 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned > long start, > struct mm_struct *mm = vma->vm_mm; > int err; > > + /* vma->vm_flags can be changed only using modifier functions */ > + BUG_ON(vm_flags == &vma->vm_flags); > switch (advice) { > case MADV_MERGEABLE: > /* > -- > 2.39.1 > >
Re: [PATCH v2 3/6] mm: replace vma->vm_flags direct modifications with modifier calls
On Wed, Jan 25, 2023 at 12:38:48AM -0800, Suren Baghdasaryan wrote: > Replace direct modifications to vma->vm_flags with calls to modifier > functions to be able to track flag changes and to keep vma locking > correctness. > > Signed-off-by: Suren Baghdasaryan Acked-by: Mike Rapoport (IBM) > --- > arch/arm/kernel/process.c | 2 +- > arch/ia64/mm/init.c| 8 > arch/loongarch/include/asm/tlb.h | 2 +- > arch/powerpc/kvm/book3s_xive_native.c | 2 +- > arch/powerpc/mm/book3s64/subpage_prot.c| 2 +- > arch/powerpc/platforms/book3s/vas-api.c| 2 +- > arch/powerpc/platforms/cell/spufs/file.c | 14 +++--- > arch/s390/mm/gmap.c| 3 +-- > arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- > arch/x86/kernel/cpu/sgx/driver.c | 2 +- > arch/x86/kernel/cpu/sgx/virt.c | 2 +- > arch/x86/mm/pat/memtype.c | 6 +++--- > arch/x86/um/mem_32.c | 2 +- > drivers/acpi/pfr_telemetry.c | 2 +- > drivers/android/binder.c | 3 +-- > drivers/char/mspec.c | 2 +- > drivers/crypto/hisilicon/qm.c | 2 +- > drivers/dax/device.c | 2 +- > drivers/dma/idxd/cdev.c| 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c| 2 +- > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 4 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 4 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_events.c| 4 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++-- > drivers/gpu/drm/drm_gem.c | 2 +- > drivers/gpu/drm/drm_gem_dma_helper.c | 3 +-- > drivers/gpu/drm/drm_gem_shmem_helper.c | 2 +- > drivers/gpu/drm/drm_vm.c | 8 > drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 +- > drivers/gpu/drm/exynos/exynos_drm_gem.c| 4 ++-- > drivers/gpu/drm/gma500/framebuffer.c | 2 +- > drivers/gpu/drm/i810/i810_dma.c| 2 +- > drivers/gpu/drm/i915/gem/i915_gem_mman.c | 4 ++-- > drivers/gpu/drm/mediatek/mtk_drm_gem.c | 2 +- > drivers/gpu/drm/msm/msm_gem.c | 2 +- > drivers/gpu/drm/omapdrm/omap_gem.c | 3 +-- > drivers/gpu/drm/rockchip/rockchip_drm_gem.c| 3 +-- > drivers/gpu/drm/tegra/gem.c| 5 ++--- > drivers/gpu/drm/ttm/ttm_bo_vm.c| 3 +-- > drivers/gpu/drm/virtio/virtgpu_vram.c | 2 +- > drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 2 +- > drivers/gpu/drm/xen/xen_drm_front_gem.c| 3 +-- > drivers/hsi/clients/cmt_speech.c | 2 +- > drivers/hwtracing/intel_th/msu.c | 2 +- > drivers/hwtracing/stm/core.c | 2 +- > drivers/infiniband/hw/hfi1/file_ops.c | 4 ++-- > drivers/infiniband/hw/mlx5/main.c | 4 ++-- > drivers/infiniband/hw/qib/qib_file_ops.c | 13 ++--- > drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 2 +- > drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c| 2 +- > .../media/common/videobuf2/videobuf2-dma-contig.c | 2 +- > drivers/media/common/videobuf2/videobuf2-vmalloc.c | 2 +- > drivers/media/v4l2-core/videobuf-dma-contig.c | 2 +- > drivers/media/v4l2-core/videobuf-dma-sg.c | 4 ++-- > drivers/media/v4l2-core/videobuf-vmalloc.c | 2 +- > drivers/misc/cxl/context.c | 2 +- > drivers/misc/habanalabs/common/memory.c| 2 +- > drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++-- > drivers/misc/habanalabs/gaudi2/gaudi2.c| 8 > drivers/misc/habanalabs/goya/goya.c| 4 ++-- > drivers/misc/ocxl/context.c| 4 ++-- > drivers/misc/ocxl/sysfs.c | 2 +- > drivers/misc/open-dice.c | 4 ++-- > drivers/misc/sgi-gru/grufile.c | 4 ++-- > drivers/misc/uacce/uacce.c | 2 +- > drivers/sbus/char/oradax.c | 2 +- > drivers/scsi/cxlflash/ocxl_hw.c| 2 +- > drivers/scsi/sg.c | 2 +- > drivers/staging/media/atomisp/pci/hmm/hmm_bo.c | 2 +- > drivers/staging/media/deprecated/meye/meye.c | 4 ++-- > .../media/deprecated/stkwebcam
Re: [PATCH v2 2/6] mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK
On Wed, Jan 25, 2023 at 12:38:47AM -0800, Suren Baghdasaryan wrote: > To simplify the usage of VM_LOCKED_CLEAR_MASK in clear_vm_flags(), > replace it with VM_LOCKED_MASK bitmask and convert all users. > > Signed-off-by: Suren Baghdasaryan Acked-by: Mike Rapoport (IBM) > --- > include/linux/mm.h | 4 ++-- > kernel/fork.c | 2 +- > mm/hugetlb.c | 4 ++-- > mm/mlock.c | 6 +++--- > mm/mmap.c | 6 +++--- > mm/mremap.c| 2 +- > 6 files changed, 12 insertions(+), 12 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index b71f2809caac..da62bdd627bf 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -421,8 +421,8 @@ extern unsigned int kobjsize(const void *objp); > /* This mask defines which mm->def_flags a process can inherit its parent */ > #define VM_INIT_DEF_MASK VM_NOHUGEPAGE > > -/* This mask is used to clear all the VMA flags used by mlock */ > -#define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) > +/* This mask represents all the VMA flag bits used by mlock */ > +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) > > /* Arch-specific flags to clear when updating VM flags on protection change > */ > #ifndef VM_ARCH_CLEAR > diff --git a/kernel/fork.c b/kernel/fork.c > index 6683c1b0f460..03d472051236 100644 > --- a/kernel/fork.c > +++ b/kernel/fork.c > @@ -669,7 +669,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, > tmp->anon_vma = NULL; > } else if (anon_vma_fork(tmp, mpnt)) > goto fail_nomem_anon_vma_fork; > - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); > + clear_vm_flags(tmp, VM_LOCKED_MASK); > file = tmp->vm_file; > if (file) { > struct address_space *mapping = file->f_mapping; > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index d20c8b09890e..4ecdbad9a451 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -6973,8 +6973,8 @@ static unsigned long page_table_shareable(struct > vm_area_struct *svma, > unsigned long s_end = sbase + PUD_SIZE; > > /* Allow segments to share if only one is marked locked */ > - unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; > - unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; > + unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; > + unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; > > /* >* match the virtual addresses, permission and the alignment of the > diff --git a/mm/mlock.c b/mm/mlock.c > index 0336f52e03d7..5c4fff93cd6b 100644 > --- a/mm/mlock.c > +++ b/mm/mlock.c > @@ -497,7 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, > size_t len, > if (vma->vm_start != tmp) > return -ENOMEM; > > - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; > + newflags = vma->vm_flags & ~VM_LOCKED_MASK; > newflags |= flags; > /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ > tmp = vma->vm_end; > @@ -661,7 +661,7 @@ static int apply_mlockall_flags(int flags) > struct vm_area_struct *vma, *prev = NULL; > vm_flags_t to_add = 0; > > - current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; > + current->mm->def_flags &= ~VM_LOCKED_MASK; > if (flags & MCL_FUTURE) { > current->mm->def_flags |= VM_LOCKED; > > @@ -681,7 +681,7 @@ static int apply_mlockall_flags(int flags) > for_each_vma(vmi, vma) { > vm_flags_t newflags; > > - newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; > + newflags = vma->vm_flags & ~VM_LOCKED_MASK; > newflags |= to_add; > > /* Ignore errors */ > diff --git a/mm/mmap.c b/mm/mmap.c > index d4abc6feced1..323bd253b25a 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -2671,7 +2671,7 @@ unsigned long mmap_region(struct file *file, unsigned > long addr, > if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || > is_vm_hugetlb_page(vma) || > vma == get_gate_vma(current->mm)) > - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; > + clear_vm_flags(vma, VM_LOCKED_MASK); > else > mm->locked_vm += (len >> PAGE_SHIFT); > } > @@ -3340,8 +3340,8 @@ static struct vm_area_struct *__install_sp
Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions
On Wed, Jan 25, 2023 at 12:38:46AM -0800, Suren Baghdasaryan wrote: > vm_flags are among VMA attributes which affect decisions like VMA merging > and splitting. Therefore all vm_flags modifications are performed after > taking exclusive mmap_lock to prevent vm_flags updates racing with such > operations. Introduce modifier functions for vm_flags to be used whenever > flags are updated. This way we can better check and control correct > locking behavior during these updates. > > Signed-off-by: Suren Baghdasaryan > --- > include/linux/mm.h | 37 + > include/linux/mm_types.h | 8 +++- > 2 files changed, 44 insertions(+), 1 deletion(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index c2f62bdce134..b71f2809caac 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -627,6 +627,43 @@ static inline void vma_init(struct vm_area_struct *vma, > struct mm_struct *mm) > INIT_LIST_HEAD(&vma->anon_vma_chain); > } > > +/* Use when VMA is not part of the VMA tree and needs no locking */ > +static inline void init_vm_flags(struct vm_area_struct *vma, > + unsigned long flags) I'd suggest to make it vm_flags_init() etc. Except that Acked-by: Mike Rapoport (IBM) > +{ > + vma->vm_flags = flags; > +} > + > +/* Use when VMA is part of the VMA tree and modifications need coordination > */ > +static inline void reset_vm_flags(struct vm_area_struct *vma, > + unsigned long flags) > +{ > + mmap_assert_write_locked(vma->vm_mm); > + init_vm_flags(vma, flags); > +} > + > +static inline void set_vm_flags(struct vm_area_struct *vma, > + unsigned long flags) > +{ > + mmap_assert_write_locked(vma->vm_mm); > + vma->vm_flags |= flags; > +} > + > +static inline void clear_vm_flags(struct vm_area_struct *vma, > + unsigned long flags) > +{ > + mmap_assert_write_locked(vma->vm_mm); > + vma->vm_flags &= ~flags; > +} > + > +static inline void mod_vm_flags(struct vm_area_struct *vma, > + unsigned long set, unsigned long clear) > +{ > + mmap_assert_write_locked(vma->vm_mm); > + vma->vm_flags |= set; > + vma->vm_flags &= ~clear; > +} > + > static inline void vma_set_anonymous(struct vm_area_struct *vma) > { > vma->vm_ops = NULL; > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 2d6d790d9bed..6c7c70bf50dd 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -491,7 +491,13 @@ struct vm_area_struct { >* See vmf_insert_mixed_prot() for discussion. >*/ > pgprot_t vm_page_prot; > - unsigned long vm_flags; /* Flags, see mm.h. */ > + > + /* > + * Flags, see mm.h. > + * WARNING! Do not modify directly. > + * Use {init|reset|set|clear|mod}_vm_flags() functions instead. > + */ > + unsigned long vm_flags; > > /* >* For areas with an address space and backing store, > -- > 2.39.1 > >
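As a usage sketch of the helpers introduced above (hypothetical call sites, flags chosen only for illustration):

	/* VMA not yet in the tree, e.g. right after vm_area_alloc() */
	init_vm_flags(vma, VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE);

	/* VMA already in the tree: these assert mmap_lock is held for write */
	set_vm_flags(vma, VM_LOCKED);
	clear_vm_flags(vma, VM_LOCKED | VM_LOCKONFAULT);
	mod_vm_flags(vma, VM_DONTEXPAND, VM_MAYWRITE);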
Re: [PATCH v2 0/6] memblock: cleanup memblock_free interface
On Thu, Sep 30, 2021 at 02:20:33PM -0700, Linus Torvalds wrote: > On Thu, Sep 30, 2021 at 11:50 AM Mike Rapoport wrote: > > > > The first patch is a cleanup of numa_distance allocation in arch_numa I've > > spotted during the conversion. > > The second patch is a fix for Xen memory freeing on some of the error > > paths. > > Well, at least patch 2 looks like something that should go into 5.15 > and be marked for stable. > > Patch 1 looks like a trivial local cleanup, and could go in > immediately. Patch 4 might be in that same category. > > The rest look like "next merge window" to me, since they are spread > out and neither bugfixes nor tiny localized cleanups (iow renaming > functions, global resulting search-and-replace things). > > So my gut feel is that two (maybe three) of these patches should go in > asap, with three (maybe four) be left for 5.16. > > IOW, not trat this as a single series. > > Hmm? Yes, why not :) I'd keep patch 4 for the next merge window, does not look urgent to me. Andrew, can you please take care of this or you'd prefer me resending everything separately? > Linus -- Sincerely yours, Mike.
[PATCH v2 5/6] memblock: rename memblock_free to memblock_phys_free
From: Mike Rapoport Since memblock_free() operates on a physical range, make its name reflect it and rename it to memblock_phys_free(), so it will be a logical counterpart to memblock_phys_alloc(). The callers are updated with the below semantic patch: @@ expression addr; expression size; @@ - memblock_free(addr, size); + memblock_phys_free(addr, size); Signed-off-by: Mike Rapoport --- arch/alpha/kernel/core_irongate.c | 3 ++- arch/arc/mm/init.c| 2 +- arch/arm/mach-hisi/platmcpm.c | 2 +- arch/arm/mm/init.c| 2 +- arch/arm64/mm/mmu.c | 4 ++-- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip30/ip30-setup.c | 6 +++--- arch/powerpc/kernel/dt_cpu_ftrs.c | 4 ++-- arch/powerpc/kernel/paca.c| 8 arch/powerpc/kernel/setup-common.c| 2 +- arch/powerpc/kernel/setup_64.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 3 ++- arch/riscv/kernel/setup.c | 5 +++-- arch/s390/kernel/setup.c | 8 arch/s390/kernel/smp.c| 4 ++-- arch/s390/kernel/uv.c | 2 +- arch/s390/mm/kasan_init.c | 2 +- arch/sh/boards/mach-ap325rxa/setup.c | 2 +- arch/sh/boards/mach-ecovec24/setup.c | 4 ++-- arch/sh/boards/mach-kfr2r09/setup.c | 2 +- arch/sh/boards/mach-migor/setup.c | 2 +- arch/sh/boards/mach-se/7724/setup.c | 4 ++-- arch/sparc/kernel/smp_64.c| 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- arch/x86/mm/init.c| 2 +- arch/x86/xen/mmu_pv.c | 6 +++--- arch/x86/xen/setup.c | 6 +++--- drivers/base/arch_numa.c | 2 +- drivers/firmware/efi/memmap.c | 2 +- drivers/of/kexec.c| 3 +-- drivers/of/of_reserved_mem.c | 5 +++-- drivers/s390/char/sclp_early.c| 2 +- drivers/usb/early/xhci-dbc.c | 10 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/memblock.h | 2 +- init/initramfs.c | 2 +- kernel/dma/swiotlb.c | 2 +- lib/cpumask.c | 2 +- mm/cma.c | 2 +- mm/memblock.c | 8 mm/memory_hotplug.c | 2 +- mm/percpu.c | 8 mm/sparse.c | 2 +- 45 files changed, 79 insertions(+), 76 deletions(-) diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c index 72af1e72d833..ee26dcc49418 100644 --- a/arch/alpha/kernel/core_irongate.c +++ b/arch/alpha/kernel/core_irongate.c @@ -233,7 +233,8 @@ albacore_init_arch(void) unsigned long size; size = initrd_end - initrd_start; - memblock_free(__pa(initrd_start), PAGE_ALIGN(size)); + memblock_phys_free(__pa(initrd_start), + PAGE_ALIGN(size)); if (!move_initrd(pci_mem)) printk("irongate_init_arch: initrd too big " "(%ldK)\ndisabling initrd\n", diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 699ecf119641..59408f6a02d4 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -173,7 +173,7 @@ static void __init highmem_init(void) #ifdef CONFIG_HIGHMEM unsigned long tmp; - memblock_free(high_mem_start, high_mem_sz); + memblock_phys_free(high_mem_start, high_mem_sz); for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++) free_highmem_page(pfn_to_page(tmp)); #endif diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c index 96a484095194..258586e31333 100644 --- a/arch/arm/mach-hisi/platmcpm.c +++ b/arch/arm/mach-hisi/platmcpm.c @@ -339,7 +339,7 @@ static int __init hip04_smp_init(void) err_sysctrl: iounmap(relocation); err_reloc: - memblock_free(hip04_boot_method[0], hip04_boot_method[1]); + memblock_phys_free(hip04_boot_method[0], hip04_boot_method[1]); err: return ret; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 6162a070a410..6d0cb0f7bc54 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -158,7 +158,7 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align) 
panic("Failed to steal %pa bytes at %pS\n", &size, (void *)_RET_IP_); - memblock_free(phys, size); + memblock_phys_free(phys, size); memblock_remo
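In terms of a single call site, the conversion performed by the semantic patch above looks like this (hypothetical caller, physical-address flavour):

	phys_addr_t pa = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);

	/* before this patch */
	memblock_free(pa, PAGE_SIZE);

	/* after this patch */
	memblock_phys_free(pa, PAGE_SIZE);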
[PATCH v2 4/6] memblock: stop aliasing __memblock_free_late with memblock_free_late
From: Mike Rapoport memblock_free_late() is a NOP wrapper for __memblock_free_late(), there is no point to keep this indirection. Drop the wrapper and rename __memblock_free_late() to memblock_free_late(). Signed-off-by: Mike Rapoport --- include/linux/memblock.h | 7 +-- mm/memblock.c| 8 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index fc8183be340c..e25f964fdd60 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -133,7 +133,7 @@ void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_b, phys_addr_t *out_start, phys_addr_t *out_end, int *out_nid); -void __memblock_free_late(phys_addr_t base, phys_addr_t size); +void memblock_free_late(phys_addr_t base, phys_addr_t size); #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, @@ -441,11 +441,6 @@ static inline void *memblock_alloc_node(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE, nid); } -static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) -{ - __memblock_free_late(base, size); -} - /* * Set the allocation direction to bottom-up or top-down. */ diff --git a/mm/memblock.c b/mm/memblock.c index 184dcd2e5d99..603f4a02be9b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -366,14 +366,14 @@ void __init memblock_discard(void) addr = __pa(memblock.reserved.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.reserved.max); - __memblock_free_late(addr, size); + memblock_free_late(addr, size); } if (memblock.memory.regions != memblock_memory_init_regions) { addr = __pa(memblock.memory.regions); size = PAGE_ALIGN(sizeof(struct memblock_region) * memblock.memory.max); - __memblock_free_late(addr, size); + memblock_free_late(addr, size); } memblock_memory = NULL; @@ -1586,7 +1586,7 @@ void * __init memblock_alloc_try_nid( } /** - * __memblock_free_late - free pages directly to buddy allocator + * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * @@ -1594,7 +1594,7 @@ void * __init memblock_alloc_try_nid( * down, but we are still initializing the system. Pages are released directly * to the buddy allocator. */ -void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) +void __init memblock_free_late(phys_addr_t base, phys_addr_t size) { phys_addr_t cursor, end; -- 2.28.0
[PATCH v2 6/6] memblock: use memblock_free for freeing virtual pointers
From: Mike Rapoport Rename memblock_free_ptr() to memblock_free() and use memblock_free() when freeing a virtual pointer so that memblock_free() will be a counterpart of memblock_alloc() The callers are updated with the below semantic patch and manual addition of (void *) casting to pointers that are represented by unsigned long variables. @@ identifier vaddr; expression size; @@ ( - memblock_phys_free(__pa(vaddr), size); + memblock_free(vaddr, size); | - memblock_free_ptr(vaddr, size); + memblock_free(vaddr, size); ) Signed-off-by: Mike Rapoport --- arch/alpha/kernel/core_irongate.c | 3 +-- arch/mips/mm/init.c | 2 +- arch/powerpc/kernel/dt_cpu_ftrs.c | 4 ++-- arch/powerpc/kernel/setup-common.c| 2 +- arch/powerpc/kernel/setup_64.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 3 +-- arch/riscv/kernel/setup.c | 5 ++--- arch/sparc/kernel/smp_64.c| 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/setup_percpu.c| 2 +- arch/x86/mm/kasan_init_64.c | 4 ++-- arch/x86/mm/numa.c| 2 +- arch/x86/mm/numa_emulation.c | 2 +- arch/x86/xen/mmu_pv.c | 2 +- arch/x86/xen/p2m.c| 2 +- drivers/base/arch_numa.c | 4 ++-- drivers/macintosh/smu.c | 2 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/memblock.h | 2 +- init/initramfs.c | 2 +- init/main.c | 2 +- kernel/dma/swiotlb.c | 2 +- kernel/printk/printk.c| 4 ++-- lib/bootconfig.c | 2 +- lib/cpumask.c | 2 +- mm/memblock.c | 6 +++--- mm/percpu.c | 8 mm/sparse.c | 2 +- 29 files changed, 39 insertions(+), 42 deletions(-) diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c index ee26dcc49418..6b8ed12936b6 100644 --- a/arch/alpha/kernel/core_irongate.c +++ b/arch/alpha/kernel/core_irongate.c @@ -233,8 +233,7 @@ albacore_init_arch(void) unsigned long size; size = initrd_end - initrd_start; - memblock_phys_free(__pa(initrd_start), - PAGE_ALIGN(size)); + memblock_free((void *)initrd_start, PAGE_ALIGN(size)); if (!move_initrd(pci_mem)) printk("irongate_init_arch: initrd too big " "(%ldK)\ndisabling initrd\n", diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 3be1c29084fa..325e1552cbea 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_phys_free(__pa(ptr), size); + memblock_free(ptr, size); } void __init setup_per_cpu_areas(void) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 42839d6bd486..ba527fb52993 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -1095,8 +1095,8 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char cpufeatures_setup_finished(); - memblock_phys_free(__pa(dt_cpu_features), - sizeof(struct dt_cpu_feature) * nr_dt_cpu_features); + memblock_free(dt_cpu_features, + sizeof(struct dt_cpu_feature) * nr_dt_cpu_features); return 0; } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 5af8993a8e6d..6b1338db8779 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -825,7 +825,7 @@ static void __init smp_setup_pacas(void) set_hard_smp_processor_id(cpu, cpu_to_phys_id[cpu]); } - memblock_phys_free(__pa(cpu_to_phys_id), nr_cpu_ids * sizeof(u32)); + memblock_free(cpu_to_phys_id, nr_cpu_ids * sizeof(u32)); cpu_to_phys_id = NULL; } #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 
75bc294ac40d..1777e992b20b 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -812,7 +812,7 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, static void __init pcpu_free_bootmem(void *ptr, size_t size) { - memblock_phys_free(__pa(ptr), size); + memblock_free(ptr, size); } static int pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/power
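Taken together with the previous patch, a call site that holds a virtual pointer evolves roughly like this (hypothetical caller):

	void *ptr = memblock_alloc(size, SMP_CACHE_BYTES);

	/* before the series */
	memblock_free(__pa(ptr), size);

	/* after "memblock: rename memblock_free to memblock_phys_free" */
	memblock_phys_free(__pa(ptr), size);

	/* after this patch: free the pointer directly */
	memblock_free(ptr, size);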
[PATCH v2 3/6] memblock: drop memblock_free_early_nid() and memblock_free_early()
From: Mike Rapoport memblock_free_early_nid() is unused and memblock_free_early() is an alias for memblock_free(). Replace calls to memblock_free_early() with calls to memblock_free() and remove memblock_free_early() and memblock_free_early_nid(). Signed-off-by: Mike Rapoport --- arch/mips/mm/init.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 3 +-- arch/s390/kernel/smp.c | 2 +- drivers/base/arch_numa.c | 2 +- drivers/s390/char/sclp_early.c | 2 +- include/linux/memblock.h | 12 kernel/dma/swiotlb.c | 2 +- lib/cpumask.c| 2 +- mm/percpu.c | 8 mm/sparse.c | 2 +- 10 files changed, 12 insertions(+), 25 deletions(-) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 19347dc6bbf8..21a5a7ac0037 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -529,7 +529,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_early(__pa(ptr), size); + memblock_free(__pa(ptr), size); } void __init setup_per_cpu_areas(void) diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index 87f001b4c4e4..f12229ce7301 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -56,8 +56,7 @@ void __init svm_swiotlb_init(void) return; - memblock_free_early(__pa(vstart), - PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + memblock_free(__pa(vstart), PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); panic("SVM: Cannot allocate SWIOTLB buffer"); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 1a04e5bdf655..066efd6d9345 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -880,7 +880,7 @@ void __init smp_detect_cpus(void) /* Add CPUs present at boot */ __smp_rescan_cpus(info, true); - memblock_free_early((unsigned long)info, sizeof(*info)); + memblock_free((unsigned long)info, sizeof(*info)); } /* diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index f6d0efd01188..e28d9dfe3c20 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -165,7 +165,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, static void __init pcpu_fc_free(void *ptr, size_t size) { - memblock_free_early(__pa(ptr), size); + memblock_free(__pa(ptr), size); } void __init setup_per_cpu_areas(void) diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index f3d5c7f4c13d..f01d942e1c1d 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -139,7 +139,7 @@ int __init sclp_early_get_core_info(struct sclp_core_info *info) } sclp_fill_core_info(info, sccb); out: - memblock_free_early((unsigned long)sccb, length); + memblock_free((unsigned long)sccb, length); return rc; } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 34de69b3b8ba..fc8183be340c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -441,18 +441,6 @@ static inline void *memblock_alloc_node(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE, nid); } -static inline void memblock_free_early(phys_addr_t base, - phys_addr_t size) -{ - memblock_free(base, size); -} - -static inline void memblock_free_early_nid(phys_addr_t base, - phys_addr_t size, int nid) -{ - memblock_free(base, size); -} - static inline void memblock_free_late(phys_addr_t base, phys_addr_t size) { __memblock_free_late(base, size); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 87c40517e822..430d2f78d540 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -247,7 +247,7 @@ 
swiotlb_init(int verbose) return; fail_free_mem: - memblock_free_early(__pa(tlb), bytes); + memblock_free(__pa(tlb), bytes); fail: pr_warn("Cannot allocate buffer"); } diff --git a/lib/cpumask.c b/lib/cpumask.c index c3c76b833384..045779446a18 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL(free_cpumask_var); */ void __init free_bootmem_cpumask_var(cpumask_var_t mask) { - memblock_free_early(__pa(mask), cpumask_size()); + memblock_free(__pa(mask), cpumask_size()); } #endif diff --git a/mm/percpu.c b/mm/percpu.c index e0a986818903..f58318cb04c0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2472,7 +2472,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { - memblock_fr
[PATCH v2 2/6] xen/x86: free_p2m_page: use memblock_free_ptr() to free a virtual pointer
From: Mike Rapoport free_p2m_page() wrongly passes a virtual pointer to memblock_free() that treats it as a physical address. Call memblock_free_ptr() instead that gets a virtual address to free the memory. Signed-off-by: Mike Rapoport Reviewed-by: Juergen Gross --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 5e6e236977c7..141bb9dbd2fb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void) static void __ref free_p2m_page(void *p) { if (unlikely(!slab_is_available())) { - memblock_free((unsigned long)p, PAGE_SIZE); + memblock_free_ptr(p, PAGE_SIZE); return; } -- 2.28.0
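The bug, spelled out: at this point in the series memblock_free() still takes a physical address, so handing it a virtual pointer records the wrong range as freed. A sketch, assuming memblock_free_ptr() does the __pa() conversion internally, as it did at the time:

	void *p = alloc_p2m_page();			/* kernel virtual address */

	memblock_free((unsigned long)p, PAGE_SIZE);	/* wrong: p treated as a physical address */
	memblock_free_ptr(p, PAGE_SIZE);		/* right: frees the range at __pa(p) */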
[PATCH v2 1/6] arch_numa: simplify numa_distance allocation
From: Mike Rapoport Memory allocation of numa_distance uses memblock_phys_alloc_range() without actual range limits, converts the returned physical address to virtual and then only uses the virtual address for further initialization. Simplify this by replacing memblock_phys_alloc_range() with memblock_alloc(). Signed-off-by: Mike Rapoport --- drivers/base/arch_numa.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 00fb4120a5b3..f6d0efd01188 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -275,15 +275,13 @@ void __init numa_free_distance(void) static int __init numa_alloc_distance(void) { size_t size; - u64 phys; int i, j; size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); - phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn)); - if (WARN_ON(!phys)) + numa_distance = memblock_alloc(size, PAGE_SIZE); + if (WARN_ON(!numa_distance)) return -ENOMEM; - numa_distance = __va(phys); numa_distance_cnt = nr_node_ids; /* fill with the default distances */ -- 2.28.0
[PATCH v2 0/6] memblock: cleanup memblock_free interface
From: Mike Rapoport Hi, Following the discussion on [1] this is the fix for memblock freeing APIs mismatch. The first patch is a cleanup of numa_distance allocation in arch_numa I've spotted during the conversion. The second patch is a fix for Xen memory freeing on some of the error paths. I agree with Christophe that doing step by step makes the thing easier to review, so the patches 3-6 do the actual cleanup step by step. This time I used stricter coccinelle scripts so that only straightforward uses would get converted. There still a couple of (void *) castings for the cases when a virtual address has unsigned long type rather than a pointer type, like e.g initrd_start. Since scripts/get_maintainer.pl returned more than 100 addresses I've trimmed the distribution list only to the relevant lists. Juergen and Shahab, I didn't keep your Reviewed-by because the patches are a bit different this time. v2: * split changes into several patches * use stricter coccinelle scripts [1] https://lore.kernel.org/all/CAHk-=wj9k4LZTz+svCxLYs5Y1=+ykrbauarh1+ghyg3old8...@mail.gmail.com Mike Rapoport (6): arch_numa: simplify numa_distance allocation xen/x86: free_p2m_page: use memblock_free_ptr() to free a virtual pointer memblock: drop memblock_free_early_nid() and memblock_free_early() memblock: stop aliasing __memblock_free_late with memblock_free_late memblock: rename memblock_free to memblock_phys_free memblock: use memblock_free for freeing virtual pointers arch/alpha/kernel/core_irongate.c | 2 +- arch/arc/mm/init.c| 2 +- arch/arm/mach-hisi/platmcpm.c | 2 +- arch/arm/mm/init.c| 2 +- arch/arm64/mm/mmu.c | 4 ++-- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip30/ip30-setup.c | 6 +++--- arch/powerpc/kernel/dt_cpu_ftrs.c | 4 ++-- arch/powerpc/kernel/paca.c| 8 arch/powerpc/kernel/setup-common.c| 2 +- arch/powerpc/kernel/setup_64.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 3 +-- arch/riscv/kernel/setup.c | 4 ++-- arch/s390/kernel/setup.c | 8 arch/s390/kernel/smp.c| 4 ++-- arch/s390/kernel/uv.c | 2 +- arch/s390/mm/kasan_init.c | 2 +- arch/sh/boards/mach-ap325rxa/setup.c | 2 +- arch/sh/boards/mach-ecovec24/setup.c | 4 ++-- arch/sh/boards/mach-kfr2r09/setup.c | 2 +- arch/sh/boards/mach-migor/setup.c | 2 +- arch/sh/boards/mach-se/7724/setup.c | 4 ++-- arch/sparc/kernel/smp_64.c| 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/setup_percpu.c| 2 +- arch/x86/mm/init.c| 2 +- arch/x86/mm/kasan_init_64.c | 4 ++-- arch/x86/mm/numa.c| 2 +- arch/x86/mm/numa_emulation.c | 2 +- arch/x86/xen/mmu_pv.c | 6 +++--- arch/x86/xen/p2m.c| 2 +- arch/x86/xen/setup.c | 6 +++--- drivers/base/arch_numa.c | 10 -- drivers/firmware/efi/memmap.c | 2 +- drivers/macintosh/smu.c | 2 +- drivers/of/kexec.c| 3 +-- drivers/of/of_reserved_mem.c | 5 +++-- drivers/s390/char/sclp_early.c| 2 +- drivers/usb/early/xhci-dbc.c | 10 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/memblock.h | 23 +++ init/initramfs.c | 2 +- init/main.c | 2 +- kernel/dma/swiotlb.c | 2 +- kernel/printk/printk.c| 4 ++-- lib/bootconfig.c | 2 +- lib/cpumask.c | 2 +- mm/cma.c | 2 +- mm/memblock.c | 22 +++--- mm/memory_hotplug.c | 2 +- mm/percpu.c | 8 mm/sparse.c | 2 +- 54 files changed, 99 insertions(+), 119 deletions(-) base-commit: 5816b3e6577eaa676ceb00a848f0fd65fe2adc29 -- 2.28.0
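The resulting pairing, as it looks from a caller's point of view once the whole series is applied (sketch):

	/* physical ranges */
	phys_addr_t pa = memblock_phys_alloc(size, align);
	if (pa)
		memblock_phys_free(pa, size);

	/* virtual pointers */
	void *ptr = memblock_alloc(size, align);
	if (ptr)
		memblock_free(ptr, size);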
Re: [PATCH 3/3] memblock: cleanup memblock_free interface
On Thu, Sep 23, 2021 at 03:54:46PM +0200, Christophe Leroy wrote: > > Le 23/09/2021 à 14:01, Mike Rapoport a écrit : > > On Thu, Sep 23, 2021 at 11:47:48AM +0200, Christophe Leroy wrote: > > > > > > > > > Le 23/09/2021 à 09:43, Mike Rapoport a écrit : > > > > From: Mike Rapoport > > > > > > > > For ages memblock_free() interface dealt with physical addresses even > > > > despite the existence of memblock_alloc_xx() functions that return a > > > > virtual pointer. > > > > > > > > Introduce memblock_phys_free() for freeing physical ranges and repurpose > > > > memblock_free() to free virtual pointers to make the following pairing > > > > abundantly clear: > > > > > > > > int memblock_phys_free(phys_addr_t base, phys_addr_t size); > > > > phys_addr_t memblock_phys_alloc(phys_addr_t base, phys_addr_t > > > > size); > > > > > > > > void *memblock_alloc(phys_addr_t size, phys_addr_t align); > > > > void memblock_free(void *ptr, size_t size); > > > > > > > > Replace intermediate memblock_free_ptr() with memblock_free() and drop > > > > unnecessary aliases memblock_free_early() and memblock_free_early_nid(). > > > > > > > > Suggested-by: Linus Torvalds > > > > Signed-off-by: Mike Rapoport > > > > --- > > > > > > > diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c > > > > index 1a04e5bdf655..37826d8c4f74 100644 > > > > --- a/arch/s390/kernel/smp.c > > > > +++ b/arch/s390/kernel/smp.c > > > > @@ -723,7 +723,7 @@ void __init smp_save_dump_cpus(void) > > > > /* Get the CPU registers */ > > > > smp_save_cpu_regs(sa, addr, is_boot_cpu, page); > > > > } > > > > - memblock_free(page, PAGE_SIZE); > > > > + memblock_phys_free(page, PAGE_SIZE); > > > > diag_amode31_ops.diag308_reset(); > > > > pcpu_set_smt(0); > > > >} > > > > @@ -880,7 +880,7 @@ void __init smp_detect_cpus(void) > > > > /* Add CPUs present at boot */ > > > > __smp_rescan_cpus(info, true); > > > > - memblock_free_early((unsigned long)info, sizeof(*info)); > > > > + memblock_free(info, sizeof(*info)); > > > >} > > > >/* > > > > > > I'm a bit lost. IIUC memblock_free_early() and memblock_free() where > > > identical. > > > > Yes, they were, but all calls to memblock_free_early() were using > > __pa(vaddr) because they had a virtual address at hand. > > I'm still not following. In the above memblock_free_early() was taking > (unsigned long)info . Was it a bug ? Not really because s390 has pa == va: https://elixir.bootlin.com/linux/latest/source/arch/s390/include/asm/page.h#L169 -- Sincerely yours, Mike.
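For reference, the identity mapping Mike points to boils down to definitions along these lines in arch/s390/include/asm/page.h of that era (quoted approximately, not verbatim):

	#define __pa(x)		((unsigned long)(x))
	#define __va(x)		((void *)(unsigned long)(x))

	/* so passing "(unsigned long)info" where a phys_addr_t is expected
	 * happens to name the right physical range on this architecture */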
Re: [PATCH 0/3] memblock: cleanup memblock_free interface
Hi Linus, On Thu, Sep 23, 2021 at 09:01:46AM -0700, Linus Torvalds wrote: > On Thu, Sep 23, 2021 at 12:43 AM Mike Rapoport wrote: > > > You need to be a LOT more careful. > > From a trivial check - exactly because I looked at doing it with a > script, and decided it's not so easy - I found cases like this: > > - memblock_free(__pa(paca_ptrs) + new_ptrs_size, > + memblock_free(paca_ptrs + new_ptrs_size, > > which is COMPLETELY wrong. I did use a coccinelle script that's slightly more robust that a sed you've sent, but then I did a manual review, hence the two small patches with fixes. Indeed I missed this one, so to be on the safe side I'll rename only the obvious cases where coccinelle can be used reliably and leave all the rest as it's now. If somebody cares enough they can update it later. > And no, making the scripting just replace '__pa(x)' with '(void *)(x)' These were actually manual and they are required for variables that used as virtual addresses but have unsigned long type, like e.g. initrd_start. So it's either __pa(x) or (void *). -- Sincerely yours, Mike.
Re: [PATCH 3/3] memblock: cleanup memblock_free interface
On Thu, Sep 23, 2021 at 11:47:48AM +0200, Christophe Leroy wrote: > > > Le 23/09/2021 à 09:43, Mike Rapoport a écrit : > > From: Mike Rapoport > > > > For ages memblock_free() interface dealt with physical addresses even > > despite the existence of memblock_alloc_xx() functions that return a > > virtual pointer. > > > > Introduce memblock_phys_free() for freeing physical ranges and repurpose > > memblock_free() to free virtual pointers to make the following pairing > > abundantly clear: > > > > int memblock_phys_free(phys_addr_t base, phys_addr_t size); > > phys_addr_t memblock_phys_alloc(phys_addr_t base, phys_addr_t size); > > > > void *memblock_alloc(phys_addr_t size, phys_addr_t align); > > void memblock_free(void *ptr, size_t size); > > > > Replace intermediate memblock_free_ptr() with memblock_free() and drop > > unnecessary aliases memblock_free_early() and memblock_free_early_nid(). > > > > Suggested-by: Linus Torvalds > > Signed-off-by: Mike Rapoport > > --- > > > diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c > > index 1a04e5bdf655..37826d8c4f74 100644 > > --- a/arch/s390/kernel/smp.c > > +++ b/arch/s390/kernel/smp.c > > @@ -723,7 +723,7 @@ void __init smp_save_dump_cpus(void) > > /* Get the CPU registers */ > > smp_save_cpu_regs(sa, addr, is_boot_cpu, page); > > } > > - memblock_free(page, PAGE_SIZE); > > + memblock_phys_free(page, PAGE_SIZE); > > diag_amode31_ops.diag308_reset(); > > pcpu_set_smt(0); > > } > > @@ -880,7 +880,7 @@ void __init smp_detect_cpus(void) > > /* Add CPUs present at boot */ > > __smp_rescan_cpus(info, true); > > - memblock_free_early((unsigned long)info, sizeof(*info)); > > + memblock_free(info, sizeof(*info)); > > } > > /* > > I'm a bit lost. IIUC memblock_free_early() and memblock_free() where > identical. Yes, they were, but all calls to memblock_free_early() were using __pa(vaddr) because they had a virtual address at hand. > In the first hunk memblock_free() gets replaced by memblock_phys_free() > In the second hunk memblock_free_early() gets replaced by memblock_free() In the first hunk the memory is allocated with memblock_phys_alloc() and we have a physical range to free. In the second hunk the memory is allocated with memblock_alloc() and we are freeing a virtual pointer. > I think it would be easier to follow if you could split it in several > patches: It was an explicit request from Linus to make it a single commit: but the actual commit can and should be just a single commit that just fixes 'memblock_free()' to have sane interfaces. I don't feel strongly about splitting it (except my laziness really objects), but I don't think doing the conversion in several steps worth the churn. > - First patch: Create memblock_phys_free() and change all relevant > memblock_free() to memblock_phys_free() - Or change memblock_free() to > memblock_phys_free() and make memblock_free() an alias of it. > - Second patch: Make memblock_free_ptr() become memblock_free() and change > all remaining callers to the new semantics (IIUC memblock_free(__pa(ptr)) > becomes memblock_free(ptr) and make memblock_free_ptr() an alias of > memblock_free() > - Fourth patch: Replace and drop memblock_free_ptr() > - Fifth patch: Drop memblock_free_early() and memblock_free_early_nid() (All > users should have been upgraded to memblock_free_phys() in patch 1 or > memblock_free() in patch 2) > > Christophe -- Sincerely yours, Mike.
[PATCH 3/3] memblock: cleanup memblock_free interface
From: Mike Rapoport For ages memblock_free() interface dealt with physical addresses even despite the existence of memblock_alloc_xx() functions that return a virtual pointer. Introduce memblock_phys_free() for freeing physical ranges and repurpose memblock_free() to free virtual pointers to make the following pairing abundantly clear: int memblock_phys_free(phys_addr_t base, phys_addr_t size); phys_addr_t memblock_phys_alloc(phys_addr_t base, phys_addr_t size); void *memblock_alloc(phys_addr_t size, phys_addr_t align); void memblock_free(void *ptr, size_t size); Replace intermediate memblock_free_ptr() with memblock_free() and drop unnecessary aliases memblock_free_early() and memblock_free_early_nid(). Suggested-by: Linus Torvalds Signed-off-by: Mike Rapoport --- arch/alpha/kernel/core_irongate.c | 2 +- arch/arc/mm/init.c| 2 +- arch/arm/mach-hisi/platmcpm.c | 2 +- arch/arm/mm/init.c| 2 +- arch/arm64/mm/mmu.c | 4 ++-- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip30/ip30-setup.c | 6 +++--- arch/powerpc/kernel/dt_cpu_ftrs.c | 2 +- arch/powerpc/kernel/paca.c| 4 ++-- arch/powerpc/kernel/setup-common.c| 2 +- arch/powerpc/kernel/setup_64.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 4 +--- arch/riscv/kernel/setup.c | 4 ++-- arch/s390/kernel/setup.c | 8 arch/s390/kernel/smp.c| 4 ++-- arch/s390/kernel/uv.c | 2 +- arch/s390/mm/kasan_init.c | 2 +- arch/sh/boards/mach-ap325rxa/setup.c | 2 +- arch/sh/boards/mach-ecovec24/setup.c | 4 ++-- arch/sh/boards/mach-kfr2r09/setup.c | 2 +- arch/sh/boards/mach-migor/setup.c | 2 +- arch/sh/boards/mach-se/7724/setup.c | 4 ++-- arch/sparc/kernel/smp_64.c| 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/setup_percpu.c| 2 +- arch/x86/mm/init.c| 2 +- arch/x86/mm/kasan_init_64.c | 4 ++-- arch/x86/mm/numa.c| 2 +- arch/x86/mm/numa_emulation.c | 2 +- arch/x86/xen/mmu_pv.c | 6 +++--- arch/x86/xen/p2m.c| 2 +- arch/x86/xen/setup.c | 6 +++--- drivers/base/arch_numa.c | 4 ++-- drivers/firmware/efi/memmap.c | 2 +- drivers/macintosh/smu.c | 2 +- drivers/of/kexec.c| 2 +- drivers/of/of_reserved_mem.c | 4 ++-- drivers/s390/char/sclp_early.c| 2 +- drivers/usb/early/xhci-dbc.c | 10 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/memblock.h | 16 ++-- init/initramfs.c | 2 +- init/main.c | 2 +- kernel/dma/swiotlb.c | 2 +- kernel/printk/printk.c| 4 ++-- lib/bootconfig.c | 2 +- lib/cpumask.c | 2 +- mm/cma.c | 2 +- mm/memblock.c | 20 ++-- mm/memory_hotplug.c | 2 +- mm/percpu.c | 8 mm/sparse.c | 2 +- tools/bootconfig/include/linux/memblock.h | 2 +- 55 files changed, 92 insertions(+), 106 deletions(-) diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c index 72af1e72d833..6b8ed12936b6 100644 --- a/arch/alpha/kernel/core_irongate.c +++ b/arch/alpha/kernel/core_irongate.c @@ -233,7 +233,7 @@ albacore_init_arch(void) unsigned long size; size = initrd_end - initrd_start; - memblock_free(__pa(initrd_start), PAGE_ALIGN(size)); + memblock_free((void *)initrd_start, PAGE_ALIGN(size)); if (!move_initrd(pci_mem)) printk("irongate_init_arch: initrd too big " "(%ldK)\ndisabling initrd\n", diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 699ecf119641..59408f6a02d4 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -173,7 +173,7 @@ static void __init highmem_init(void) #ifdef CONFIG_HIGHMEM unsigned long tmp; - memblock_free(high_mem_start, high_mem_sz); + memblock_phys_free(high_mem_start, high_mem_sz); for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++) 
free_highmem_page(pfn_to_page(tmp)); #endif diff --git a/arch/arm
[PATCH 2/3] xen/x86: free_p2m_page: use memblock_free_ptr() to free a virtual pointer
From: Mike Rapoport free_p2m_page() wrongly passes a virtual pointer to memblock_free() that treats it as a physical address. Call memblock_free_ptr() instead that gets a virtual address to free the memory. Signed-off-by: Mike Rapoport --- arch/x86/xen/p2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 5e6e236977c7..141bb9dbd2fb 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -197,7 +197,7 @@ static void * __ref alloc_p2m_page(void) static void __ref free_p2m_page(void *p) { if (unlikely(!slab_is_available())) { - memblock_free((unsigned long)p, PAGE_SIZE); + memblock_free_ptr(p, PAGE_SIZE); return; } -- 2.28.0
[PATCH 0/3] memblock: cleanup memblock_free interface
From: Mike Rapoport Hi, Following the discussion on [1] this is the fix for memblock freeing APIs mismatch. The first patch is a cleanup of numa_distance allocation in arch_numa I've spotted during the conversion. The second patch is a fix for Xen memory freeing on some of the error paths. The core change is in the third patch that makes memblock_free() a counterpart of memblock_alloc() and adds memblock_phys_alloc() to be a counterpart of memblock_phys_alloc(). Since scripts/get_maintainer.pl returned more than 100 addresses I've trimmed the distribution list only to the relevant lists. [1] https://lore.kernel.org/all/CAHk-=wj9k4LZTz+svCxLYs5Y1=+ykrbauarh1+ghyg3old8...@mail.gmail.com Mike Rapoport (3): arch_numa: simplify numa_distance allocation xen/x86: free_p2m_page: use memblock_free_ptr() to free a virtual pointer memblock: cleanup memblock_free interface arch/alpha/kernel/core_irongate.c | 2 +- arch/arc/mm/init.c| 2 +- arch/arm/mach-hisi/platmcpm.c | 2 +- arch/arm/mm/init.c| 2 +- arch/arm64/mm/mmu.c | 4 ++-- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip30/ip30-setup.c | 6 +++--- arch/powerpc/kernel/dt_cpu_ftrs.c | 2 +- arch/powerpc/kernel/paca.c| 4 ++-- arch/powerpc/kernel/setup-common.c| 2 +- arch/powerpc/kernel/setup_64.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- arch/powerpc/platforms/pseries/svm.c | 4 +--- arch/riscv/kernel/setup.c | 4 ++-- arch/s390/kernel/setup.c | 8 arch/s390/kernel/smp.c| 4 ++-- arch/s390/kernel/uv.c | 2 +- arch/s390/mm/kasan_init.c | 2 +- arch/sh/boards/mach-ap325rxa/setup.c | 2 +- arch/sh/boards/mach-ecovec24/setup.c | 4 ++-- arch/sh/boards/mach-kfr2r09/setup.c | 2 +- arch/sh/boards/mach-migor/setup.c | 2 +- arch/sh/boards/mach-se/7724/setup.c | 4 ++-- arch/sparc/kernel/smp_64.c| 2 +- arch/um/kernel/mem.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/setup_percpu.c| 2 +- arch/x86/mm/init.c| 2 +- arch/x86/mm/kasan_init_64.c | 4 ++-- arch/x86/mm/numa.c| 2 +- arch/x86/mm/numa_emulation.c | 2 +- arch/x86/xen/mmu_pv.c | 6 +++--- arch/x86/xen/p2m.c| 2 +- arch/x86/xen/setup.c | 6 +++--- drivers/base/arch_numa.c | 10 -- drivers/firmware/efi/memmap.c | 2 +- drivers/macintosh/smu.c | 2 +- drivers/of/kexec.c| 2 +- drivers/of/of_reserved_mem.c | 4 ++-- drivers/s390/char/sclp_early.c| 2 +- drivers/usb/early/xhci-dbc.c | 10 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/memblock.h | 16 ++-- init/initramfs.c | 2 +- init/main.c | 2 +- kernel/dma/swiotlb.c | 2 +- kernel/printk/printk.c| 4 ++-- lib/bootconfig.c | 2 +- lib/cpumask.c | 2 +- mm/cma.c | 2 +- mm/memblock.c | 20 ++-- mm/memory_hotplug.c | 2 +- mm/percpu.c | 8 mm/sparse.c | 2 +- tools/bootconfig/include/linux/memblock.h | 2 +- 55 files changed, 94 insertions(+), 110 deletions(-) base-commit: e4e737bb5c170df6135a127739a9e6148ee3da82 -- 2.28.0
[PATCH 1/3] arch_numa: simplify numa_distance allocation
From: Mike Rapoport Memory allocation of numa_distance uses memblock_phys_alloc_range() without actual range limits, converts the returned physical address to virtual and then only uses the virtual address for further initialization. Simplify this by replacing memblock_phys_alloc_range() with memblock_alloc(). Signed-off-by: Mike Rapoport --- drivers/base/arch_numa.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 00fb4120a5b3..f6d0efd01188 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -275,15 +275,13 @@ void __init numa_free_distance(void) static int __init numa_alloc_distance(void) { size_t size; - u64 phys; int i, j; size = nr_node_ids * nr_node_ids * sizeof(numa_distance[0]); - phys = memblock_phys_alloc_range(size, PAGE_SIZE, 0, PFN_PHYS(max_pfn)); - if (WARN_ON(!phys)) + numa_distance = memblock_alloc(size, PAGE_SIZE); + if (WARN_ON(!numa_distance)) return -ENOMEM; - numa_distance = __va(phys); numa_distance_cnt = nr_node_ids; /* fill with the default distances */ -- 2.28.0
Re: [PATCH] x86/setup: call early_reserve_memory() earlier
On Wed, Sep 15, 2021 at 01:00:20PM +0200, Borislav Petkov wrote: > You forgot to Cc Mike, lemme add him. > > And drop stable@ too. > > On Tue, Sep 14, 2021 at 01:06:22PM +0200, Juergen Gross wrote: > > On 14.09.21 12:03, Jan Beulich wrote: > > > On 14.09.2021 11:41, Juergen Gross wrote: > > > > Commit a799c2bd29d19c565 ("x86/setup: Consolidate early memory > > > > reservations") introduced early_reserve_memory() to do all needed > > > > initial memblock_reserve() calls in one function. Unfortunately the > > > > call of early_reserve_memory() is done too late for Xen dom0, as in > > > > some cases a Xen hook called by e820__memory_setup() will need those > > > > memory reservations to have happened already. > > > > > > > > Move the call of early_reserve_memory() to the beginning of > > > > setup_arch() in order to avoid such problems. > > > > > > > > Cc: sta...@vger.kernel.org > > > > Fixes: a799c2bd29d19c565 ("x86/setup: Consolidate early memory > > > > reservations") > > > > Reported-by: Marek Marczykowski-Górecki > > > > > > > > Signed-off-by: Juergen Gross > > > > --- > > > > arch/x86/kernel/setup.c | 24 > > > > 1 file changed, 12 insertions(+), 12 deletions(-) > > > > > > > > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c > > > > index 79f164141116..f369c51ec580 100644 > > > > --- a/arch/x86/kernel/setup.c > > > > +++ b/arch/x86/kernel/setup.c > > > > @@ -757,6 +757,18 @@ dump_kernel_offset(struct notifier_block *self, > > > > unsigned long v, void *p) > > > > void __init setup_arch(char **cmdline_p) > > > > { > > > > + /* > > > > +* Do some memory reservations *before* memory is added to > > > > +* memblock, so memblock allocations won't overwrite it. > > > > +* Do it after early param, so we could get (unlikely) panic > > > > from > > > > +* serial. > > > > > > Hmm, this part of the comment is not only stale now, but gets actively > > > undermined. No idea how likely such a panic() would be, and hence how > > > relevant it is to retain this particular property. > > > > Ah, right. > > > > The alternative would be to split it up again. Let's let the x86 > > maintainers decide which way is the better one. I think the first sentence about reserving memory before memblock allocations are possible is important and I think we should keep it. With that Acked-by: Mike Rapoport > > > > > > Juergen > > > > > > > > Jan > > > > > > > +* After this point everything still needed from the boot > > > > loader or > > > > +* firmware or kernel text should be early reserved or marked > > > > not > > > > +* RAM in e820. All other memory is free game. > > > > +*/ > > > > + early_reserve_memory(); > > > > + > > > > #ifdef CONFIG_X86_32 > > > > memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); > > > > @@ -876,18 +888,6 @@ void __init setup_arch(char **cmdline_p) > > > > parse_early_param(); > > > > - /* > > > > -* Do some memory reservations *before* memory is added to > > > > -* memblock, so memblock allocations won't overwrite it. > > > > -* Do it after early param, so we could get (unlikely) panic > > > > from > > > > -* serial. > > > > -* > > > > -* After this point everything still needed from the boot > > > > loader or > > > > -* firmware or kernel text should be early reserved or marked > > > > not > > > > -* RAM in e820. All other memory is free game. 
> > > > -*/ > > > > - early_reserve_memory(); > > > > - > > > > #ifdef CONFIG_MEMORY_HOTPLUG > > > > /* > > > > * Memory used by the kernel cannot be hot-removed because Linux > > > > > > > > > > > > > > > > -- > Regards/Gruss, > Boris. > > https://people.kernel.org/tglx/notes-about-netiquette -- Sincerely yours, Mike.
Re: Linux 5.13+ as Xen dom0 crashes on Ryzen CPU (ucode loading related?)
On Tue, Sep 14, 2021 at 09:14:38AM +0200, Juergen Gross wrote: > On 13.09.21 14:50, Marek Marczykowski-Górecki wrote: > > Hi, > > > > Since 5.13, the Xen (PV) dom0 crashes on boot, before even printing the > > kernel version. > > Test environment: > > - Xen 4.14.2 > > - AMD Ryzen 5 4500U (reported also on AMD Ryzen 7 4750U) > > - Linux 5.13.13, confirmed also on 5.14 > > > > The crash happens only if the initramfs has earlycpio with microcode. > > I don't have a serial console, but I've got a photo with crash message > > (from Xen, Linux doesn't managed to print anything): > > https://user-images.githubusercontent.com/726704/133084966-5038f37e-001b-4688-9f90-83d09be3dc2d.jpg > > > > Transcription of some of it: > > > > mapping kernel into physical memory > > about to get started > > (XEN) Pagetable walk from 82810888: > > (XEN) L4[0x1ff] = 000332815067 2815 > > (XEN) L3[0x1fe] = 000332816067 2816 > > (XEN) L2[0x014] = 000334018067 4018 > > (XEN) L1[0x010] = 000332810067 2810 > > (XEN) domain_crash_sync called from entry.S: fault at 82d04033e790 > > x86_64/entry.S#domain_crash_page_fault > > (XEN) Domain 0 (vcpu#0) crashed on cpu#0: > > (XEN) [ Xen-4.14.2 x86_64 debug=n Not tainted ] > > (XEN) CPU:0 > > (XEN) RIP:e033:[<>] > > The domain's run state seems to be completely clobbered. > > Did you try to boot the kernel with "earlyprintk=xen" to get some idea > how far it progressed? > > I could imagine that doing the early reservations after the call of > e820__memory_setup() is problematic, as Xen PV guests have a hook in > this function performing some rather extended actions. Right, among them it may relocate initrd: https://elixir.bootlin.com/linux/latest/source/arch/x86/xen/setup.c#L872 and this may cause the reported crash. > I'm not sure the call of early_reserve_memory() can be moved just before > the e820__memory_setup() call. If this is possibel it should be done > IMO, if not then the reservations which have been at the start of > setup_arch() might need to go there again. early_reserve_memory() can be moved to the beginning of setup_arch(). Anther possibility is to move initrd relocation out of xen_setup_memory() and maybe even integrate it somehow in reserve_initrd(). -- Sincerely yours, Mike.
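A rough sketch of the first option, i.e. the ordering that moving the call to the beginning of setup_arch() would give (an illustration of the idea only, not a tested patch):

	void __init setup_arch(char **cmdline_p)
	{
		/* kernel text/data, initrd, setup_data, BIOS regions, ... */
		early_reserve_memory();

		/* ... */

		/*
		 * The Xen PV hook behind e820__memory_setup() may already
		 * allocate from memblock (e.g. to relocate the initrd), so
		 * the reservations above must be in place before this call.
		 */
		e820__memory_setup();

		/* ... */
	}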
Re: Linux 5.13+ as Xen dom0 crashes on Ryzen CPU (ucode loading related?)
Hi Marek, On Mon, Sep 13, 2021 at 02:50:00PM +0200, Marek Marczykowski-Górecki wrote: > Hi, > > Since 5.13, the Xen (PV) dom0 crashes on boot, before even printing the > kernel version. > Test environment: > - Xen 4.14.2 > - AMD Ryzen 5 4500U (reported also on AMD Ryzen 7 4750U) > - Linux 5.13.13, confirmed also on 5.14 > > The crash happens only if the initramfs has earlycpio with microcode. Does the crash happen if you boot the same kernel and initrd directly without Xen? > I don't have a serial console, but I've got a photo with crash message > (from Xen, Linux doesn't managed to print anything): > https://user-images.githubusercontent.com/726704/133084966-5038f37e-001b-4688-9f90-83d09be3dc2d.jpg > > Transcription of some of it: > > mapping kernel into physical memory > about to get started > (XEN) Pagetable walk from 82810888: > (XEN) L4[0x1ff] = 000332815067 2815 > (XEN) L3[0x1fe] = 000332816067 2816 > (XEN) L2[0x014] = 000334018067 4018 > (XEN) L1[0x010] = 000332810067 2810 > (XEN) domain_crash_sync called from entry.S: fault at 82d04033e790 > x86_64/entry.S#domain_crash_page_fault > (XEN) Domain 0 (vcpu#0) crashed on cpu#0: > (XEN) [ Xen-4.14.2 x86_64 debug=n Not tainted ] > (XEN) CPU:0 > (XEN) RIP:e033:[<>] Is it possible to get the actual RIP of the instruction that faulted? Feeding that to scripts/faddr2line would be just lovely. > I've bisected it down to the commit a799c2bd29d19c565f37fa038b31a0a1d44d0e4d > > x86/setup: Consolidate early memory reservations > > Since this seems to affect Xen boot only, I'm copying xen-devel too. > > Any ideas? The only thing I can suggest for now is to move the reservations from early_reserve_memory() back to where they were before this commit one by one to see which move caused the crash. -- Sincerely yours, Mike.
Re: [Xen-devel] [PATCH v2 00/21] Refine memblock API
Hi Adam, On Tue, Oct 01, 2019 at 07:14:13PM -0500, Adam Ford wrote: > On Sun, Sep 29, 2019 at 8:33 AM Adam Ford wrote: > > > > I am attaching two logs. I now the mailing lists will be unhappy, but > > don't want to try and spam a bunch of log through the mailing liast. > > The two logs show the differences between the working and non-working > > imx6q 3D accelerator when trying to run a simple glmark2-es2-drm demo. > > > > The only change between them is the 2 line code change you suggested. > > > > In both cases, I have cma=128M set in my bootargs. Historically this > > has been sufficient, but cma=256M has not made a difference. > > > > Mike any suggestions on how to move forward? > I was hoping to get the fixes tested and pushed before 5.4 is released > if at all possible I have a fix (below) that kinda restores the original behaviour, but I still would like to double check to make sure it's not a band aid and I haven't missed the actual root cause. Can you please send me your device tree definition and the output of cat /sys/kernel/debug/memblock/memory and cat /sys/kernel/debug/memblock/reserved Thanks! From 06529f861772b7dea2912fc2245debe4690139b8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 2 Oct 2019 10:14:17 +0300 Subject: [PATCH] mm: memblock: do not enforce current limit for memblock_phys* family Until commit 92d12f9544b7 ("memblock: refactor internal allocation functions") the maximal address for memblock allocations was forced to memblock.current_limit only for the allocation functions returning virtual address. The changes introduced by that commit moved the limit enforcement into the allocation core and as a result the allocation functions returning physical address also started to limit allocations to memblock.current_limit. This caused breakage of etnaviv GPU driver: [3.682347] etnaviv etnaviv: bound 13.gpu (ops gpu_ops) [3.688669] etnaviv etnaviv: bound 134000.gpu (ops gpu_ops) [3.695099] etnaviv etnaviv: bound 2204000.gpu (ops gpu_ops) [3.700800] etnaviv-gpu 13.gpu: model: GC2000, revision: 5108 [3.723013] etnaviv-gpu 13.gpu: command buffer outside valid memory window [3.731308] etnaviv-gpu 134000.gpu: model: GC320, revision: 5007 [3.752437] etnaviv-gpu 134000.gpu: command buffer outside valid memory window [3.760583] etnaviv-gpu 2204000.gpu: model: GC355, revision: 1215 [3.766766] etnaviv-gpu 2204000.gpu: Ignoring GPU with VG and FE2.0 Restore the behaviour of memblock_phys* family so that these functions will not enforce memblock.current_limit. 
Fixes: 92d12f9544b7 ("memblock: refactor internal allocation functions") Reported-by: Adam Ford Signed-off-by: Mike Rapoport --- mm/memblock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 7d4f61a..c4b16ca 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1356,9 +1356,6 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, align = SMP_CACHE_BYTES; } - if (end > memblock.current_limit) - end = memblock.current_limit; - again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); @@ -1469,6 +1466,9 @@ static void * __init memblock_alloc_internal( if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid); /* retry allocation without lower limit */ -- 2.7.4 > > adam > > > > On Sat, Sep 28, 2019 at 2:33 AM Mike Rapoport wrote: > > > > > > On Thu, Sep 26, 2019 at 02:35:53PM -0500, Adam Ford wrote: > > > > On Thu, Sep 26, 2019 at 11:04 AM Mike Rapoport > > > > wrote: > > > > > > > > > > Hi, > > > > > > > > > > On Thu, Sep 26, 2019 at 08:09:52AM -0500, Adam Ford wrote: > > > > > > On Wed, Sep 25, 2019 at 10:17 AM Fabio Estevam > > > > > > wrote: > > > > > > > > > > > > > > On Wed, Sep 25, 2019 at 9:17 AM Adam Ford > > > > > > > wrote: > > > > > > > > > > > > > > > I tried cma=256M and noticed the cma dump at the beginning > > > > > > > > didn't > > > > > > > > change. Do we need to setup a reserved-memory node like > > > > > > > > imx6ul-ccimx6ulsom.dtsi did? > > > > > > > > > > > > > > I don't think so. &
Re: [Xen-devel] [PATCH v2 00/21] Refine memblock API
On Thu, Sep 26, 2019 at 02:35:53PM -0500, Adam Ford wrote: > On Thu, Sep 26, 2019 at 11:04 AM Mike Rapoport wrote: > > > > Hi, > > > > On Thu, Sep 26, 2019 at 08:09:52AM -0500, Adam Ford wrote: > > > On Wed, Sep 25, 2019 at 10:17 AM Fabio Estevam wrote: > > > > > > > > On Wed, Sep 25, 2019 at 9:17 AM Adam Ford wrote: > > > > > > > > > I tried cma=256M and noticed the cma dump at the beginning didn't > > > > > change. Do we need to setup a reserved-memory node like > > > > > imx6ul-ccimx6ulsom.dtsi did? > > > > > > > > I don't think so. > > > > > > > > Were you able to identify what was the exact commit that caused such > > > > regression? > > > > > > I was able to narrow it down the 92d12f9544b7 ("memblock: refactor > > > internal allocation functions") that caused the regression with > > > Etnaviv. > > > > > > Can you please test with this change: > > > > That appears to have fixed my issue. I am not sure what the impact > is, but is this a safe option? It's not really a fix, I just wanted to see how exactly 92d12f9544b7 ("memblock: refactor internal allocation functions") broke your setup. Can you share the dts you are using and the full kernel log? > adam > > > diff --git a/mm/memblock.c b/mm/memblock.c > > index 7d4f61a..1f5a0eb 100644 > > --- a/mm/memblock.c > > +++ b/mm/memblock.c > > @@ -1356,9 +1356,6 @@ static phys_addr_t __init > > memblock_alloc_range_nid(phys_addr_t size, > > align = SMP_CACHE_BYTES; > > } > > > > - if (end > memblock.current_limit) > > - end = memblock.current_limit; > > - > > again: > > found = memblock_find_in_range_node(size, align, start, end, nid, > > flags); > > > > > I also noticed that if I create a reserved memory node as was done one > > > imx6ul-ccimx6ulsom.dtsi the 3D seems to work again, but without it, I > > > was getting errors regardless of the 'cma=256M' or not. > > > I don't have a problem using the reserved memory, but I guess I am not > > > sure what the amount should be. I know for the video decoding 1080p, > > > I have historically used cma=128M, but with the 3D also needing some > > > memory allocation, is that enough or should I use 256M? > > > > > > adam > > > > -- > > Sincerely yours, > > Mike. > > -- Sincerely yours, Mike. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 00/21] Refine memblock API
Hi, On Thu, Sep 26, 2019 at 08:09:52AM -0500, Adam Ford wrote: > On Wed, Sep 25, 2019 at 10:17 AM Fabio Estevam wrote: > > > > On Wed, Sep 25, 2019 at 9:17 AM Adam Ford wrote: > > > > > I tried cma=256M and noticed the cma dump at the beginning didn't > > > change. Do we need to setup a reserved-memory node like > > > imx6ul-ccimx6ulsom.dtsi did? > > > > I don't think so. > > > > Were you able to identify what was the exact commit that caused such > > regression? > > I was able to narrow it down the 92d12f9544b7 ("memblock: refactor > internal allocation functions") that caused the regression with > Etnaviv. Can you please test with this change: diff --git a/mm/memblock.c b/mm/memblock.c index 7d4f61a..1f5a0eb 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1356,9 +1356,6 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, align = SMP_CACHE_BYTES; } - if (end > memblock.current_limit) - end = memblock.current_limit; - again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); > I also noticed that if I create a reserved memory node as was done one > imx6ul-ccimx6ulsom.dtsi the 3D seems to work again, but without it, I > was getting errors regardless of the 'cma=256M' or not. > I don't have a problem using the reserved memory, but I guess I am not > sure what the amount should be. I know for the video decoding 1080p, > I have historically used cma=128M, but with the 3D also needing some > memory allocation, is that enough or should I use 256M? > > adam -- Sincerely yours, Mike. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
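The behaviour this change is probing for, and which the follow-up fix in this thread restores, is that only the pointer-returning allocators are bounded by memblock.current_limit. A sketch of the intended semantics, with made-up size/limit values:

	memblock_set_current_limit(limit);

	/* pointer-returning API: still bounded by memblock.current_limit */
	void *ptr = memblock_alloc(size, SMP_CACHE_BYTES);

	/* memblock_phys* family: again allowed to allocate above
	 * current_limit, as it did before 92d12f9544b7 */
	phys_addr_t pa = memblock_phys_alloc(size, SMP_CACHE_BYTES);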
Re: [Xen-devel] [PATCH 1/2] treewide: Switch printk users from %pf and %pF to %ps and %pS, respectively
On Fri, Mar 22, 2019 at 03:21:07PM +0200, Sakari Ailus wrote: > %pF and %pf are functionally equivalent to %pS and %ps conversion > specifiers. The former are deprecated, therefore switch the current users > to use the preferred variant. > > The changes have been produced by the following command: > > git grep -l '%p[fF]' | grep -v '^\(tools\|Documentation\)/' | \ > while read i; do perl -i -pe 's/%pf/%ps/g; s/%pF/%pS/g;' $i; done > > And verifying the result. > > Signed-off-by: Sakari Ailus > --- For > mm/memblock.c | 12 ++-- Acked-by: Mike Rapoport ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
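For reference, what the two surviving specifiers print for a function pointer, assuming fn points at memblock_alloc_try_nid() (illustrative output, the offset/size values are made up):

	printk("%ps\n", fn);	/* "memblock_alloc_try_nid" */
	printk("%pS\n", fn);	/* "memblock_alloc_try_nid+0x0/0x1a0" */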
Re: [Xen-devel] [PATCHv2 1/9] mm: Introduce new vm_insert_range and vm_insert_range_buggy API
On Thu, Feb 07, 2019 at 09:37:08PM +0530, Souptick Joarder wrote: > On Thu, Feb 7, 2019 at 9:27 PM Mike Rapoport wrote: > > > > Hi Souptick, > > > > On Thu, Feb 07, 2019 at 09:19:47PM +0530, Souptick Joarder wrote: > > > Hi Mike, > > > > > > Just thought to take opinion for documentation before placing it in v3. > > > Does it looks fine ? > > > > Overall looks good to me. Several minor points below. > > Thanks Mike. Noted. > Shall I consider it as *Reviewed-by:* with below changes ? Yeah, sure. > > > > > +/** > > > + * __vm_insert_range - insert range of kernel pages into user vma > > > + * @vma: user vma to map to > > > + * @pages: pointer to array of source kernel pages > > > + * @num: number of pages in page array > > > + * @offset: user's requested vm_pgoff > > > + * > > > + * This allow drivers to insert range of kernel pages into a user vma. > > > > allows > > > + * > > > + * Return: 0 on success and error code otherwise. > > > + */ > > > +static int __vm_insert_range(struct vm_area_struct *vma, struct page > > > **pages, > > > + unsigned long num, unsigned long offset) > > > > > > > > > +/** > > > + * vm_insert_range - insert range of kernel pages starts with non zero > > > offset > > > + * @vma: user vma to map to > > > + * @pages: pointer to array of source kernel pages > > > + * @num: number of pages in page array > > > + * > > > + * Maps an object consisting of `num' `pages', catering for the user's > >@num pages > > > + * requested vm_pgoff > > > + * > > > + * If we fail to insert any page into the vma, the function will return > > > + * immediately leaving any previously inserted pages present. Callers > > > + * from the mmap handler may immediately return the error as their caller > > > + * will destroy the vma, removing any successfully inserted pages. Other > > > + * callers should make their own arrangements for calling unmap_region(). > > > + * > > > + * Context: Process context. Called by mmap handlers. > > > + * Return: 0 on success and error code otherwise. > > > + */ > > > +int vm_insert_range(struct vm_area_struct *vma, struct page **pages, > > > + unsigned long num) > > > > > > > > > +/** > > > + * vm_insert_range_buggy - insert range of kernel pages starts with zero > > > offset > > > + * @vma: user vma to map to > > > + * @pages: pointer to array of source kernel pages > > > + * @num: number of pages in page array > > > + * > > > + * Similar to vm_insert_range(), except that it explicitly sets > > > @vm_pgoff to > > > > the offset > > > > > + * 0. This function is intended for the drivers that did not consider > > > + * @vm_pgoff. > > > + * > > > + * Context: Process context. Called by mmap handlers. > > > + * Return: 0 on success and error code otherwise. > > > + */ > > > +int vm_insert_range_buggy(struct vm_area_struct *vma, struct page > > > **pages, > > > + unsigned long num) > > > > > > > -- > > Sincerely yours, > > Mike. > > > -- Sincerely yours, Mike. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCHv2 1/9] mm: Introduce new vm_insert_range and vm_insert_range_buggy API
Hi Souptick, On Thu, Feb 07, 2019 at 09:19:47PM +0530, Souptick Joarder wrote: > Hi Mike, > > Just thought to take opinion for documentation before placing it in v3. > Does it looks fine ? Overall looks good to me. Several minor points below. > +/** > + * __vm_insert_range - insert range of kernel pages into user vma > + * @vma: user vma to map to > + * @pages: pointer to array of source kernel pages > + * @num: number of pages in page array > + * @offset: user's requested vm_pgoff > + * > + * This allow drivers to insert range of kernel pages into a user vma. allows > + * > + * Return: 0 on success and error code otherwise. > + */ > +static int __vm_insert_range(struct vm_area_struct *vma, struct page **pages, > + unsigned long num, unsigned long offset) > > > +/** > + * vm_insert_range - insert range of kernel pages starts with non zero offset > + * @vma: user vma to map to > + * @pages: pointer to array of source kernel pages > + * @num: number of pages in page array > + * > + * Maps an object consisting of `num' `pages', catering for the user's @num pages > + * requested vm_pgoff > + * > + * If we fail to insert any page into the vma, the function will return > + * immediately leaving any previously inserted pages present. Callers > + * from the mmap handler may immediately return the error as their caller > + * will destroy the vma, removing any successfully inserted pages. Other > + * callers should make their own arrangements for calling unmap_region(). > + * > + * Context: Process context. Called by mmap handlers. > + * Return: 0 on success and error code otherwise. > + */ > +int vm_insert_range(struct vm_area_struct *vma, struct page **pages, > + unsigned long num) > > > +/** > + * vm_insert_range_buggy - insert range of kernel pages starts with zero > offset > + * @vma: user vma to map to > + * @pages: pointer to array of source kernel pages > + * @num: number of pages in page array > + * > + * Similar to vm_insert_range(), except that it explicitly sets @vm_pgoff to the offset > + * 0. This function is intended for the drivers that did not consider > + * @vm_pgoff. > + * > + * Context: Process context. Called by mmap handlers. > + * Return: 0 on success and error code otherwise. > + */ > +int vm_insert_range_buggy(struct vm_area_struct *vma, struct page **pages, > + unsigned long num) > -- Sincerely yours, Mike. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
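For context, the shape of driver code this API is meant to replace the open-coded vm_insert_page() loops with; a made-up driver, only to show the calling convention:

	static int foo_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct foo_buffer *buf = file->private_data;

		/* map the kernel page array into the user vma, honouring the
		 * vm_pgoff requested by userspace; an error can be returned
		 * directly from the mmap handler */
		return vm_insert_range(vma, buf->pages, buf->n_pages);
	}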
Re: [Xen-devel] [PATCH v2 10/21] memblock: refactor internal allocation functions
On Sun, Feb 03, 2019 at 08:39:20PM +1100, Michael Ellerman wrote: > Mike Rapoport writes: > > > Currently, memblock has several internal functions with overlapping > > functionality. They all call memblock_find_in_range_node() to find free > > memory and then reserve the allocated range and mark it with kmemleak. > > However, there is difference in the allocation constraints and in fallback > > strategies. > > > > The allocations returning physical address first attempt to find free > > memory on the specified node within mirrored memory regions, then retry on > > the same node without the requirement for memory mirroring and finally fall > > back to all available memory. > > > > The allocations returning virtual address start with clamping the allowed > > range to memblock.current_limit, attempt to allocate from the specified > > node from regions with mirroring and with user defined minimal address. If > > such allocation fails, next attempt is done with node restriction lifted. > > Next, the allocation is retried with minimal address reset to zero and at > > last without the requirement for mirrored regions. > > > > Let's consolidate various fallbacks handling and make them more consistent > > for physical and virtual variants. Most of the fallback handling is moved > > to memblock_alloc_range_nid() and it now handles node and mirror fallbacks. > > > > The memblock_alloc_internal() uses memblock_alloc_range_nid() to get a > > physical address of the allocated range and converts it to virtual address. > > > > The fallback for allocation below the specified minimal address remains in > > memblock_alloc_internal() because memblock_alloc_range_nid() is used by CMA > > with exact requirement for lower bounds. > > This is causing problems on some of my machines. > > I see NODE_DATA allocations falling back to node 0 when they shouldn't, > or didn't previously. > > eg, before: > > 57990190: (116011251): numa: NODE_DATA [mem 0xfffe4980-0xfffebfff] > 58152042: (116373087): numa: NODE_DATA [mem 0x8fff90980-0x8fff97fff] > > after: > > 16356872061562: (6296877055): numa: NODE_DATA [mem 0xfffe4980-0xfffebfff] > 16356872079279: (6296894772): numa: NODE_DATA [mem 0xfffcd300-0xfffd497f] > 16356872096376: (6296911869): numa: NODE_DATA(1) on node 0 > > > On some of my other systems it does that, and then panics because it > can't allocate anything at all: > > [0.00] numa: NODE_DATA [mem 0x7ffcaee80-0x7ffcb3fff] > [0.00] numa: NODE_DATA [mem 0x7ffc99d00-0x7ffc9ee7f] > [0.00] numa: NODE_DATA(1) on node 0 > [0.00] Kernel panic - not syncing: Cannot allocate 20864 bytes for > node 16 data > [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted > 5.0.0-rc4-gccN-next-20190201-gdc4c899 #1 > [0.00] Call Trace: > [0.00] [c11cfca0] [c0c11044] dump_stack+0xe8/0x164 > (unreliable) > [0.00] [c11cfcf0] [c00fdd6c] panic+0x17c/0x3e0 > [0.00] [c11cfd90] [c0f61bc8] initmem_init+0x128/0x260 > [0.00] [c11cfe60] [c0f57940] setup_arch+0x398/0x418 > [0.00] [c11cfee0] [c0f50a94] start_kernel+0xa0/0x684 > [0.00] [c11cff90] [c000af70] > start_here_common+0x1c/0x52c > [0.00] Rebooting in 180 seconds.. > > > So there's something going wrong there, I haven't had time to dig into > it though (Sunday night here). I'll try to see if I can reproduce it with qemu. > cheers > -- Sincerely yours, Mike. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCHv2 1/9] mm: Introduce new vm_insert_range and vm_insert_range_buggy API
On Thu, Jan 31, 2019 at 03:43:39PM +0530, Souptick Joarder wrote: > On Thu, Jan 31, 2019 at 2:09 PM Mike Rapoport wrote: > > > > On Thu, Jan 31, 2019 at 08:38:12AM +0530, Souptick Joarder wrote: > > > Previouly drivers have their own way of mapping range of > > > kernel pages/memory into user vma and this was done by > > > invoking vm_insert_page() within a loop. > > > > > > As this pattern is common across different drivers, it can > > > be generalized by creating new functions and use it across > > > the drivers. > > > > > > vm_insert_range() is the API which could be used to mapped > > > kernel memory/pages in drivers which has considered vm_pgoff > > > > > > vm_insert_range_buggy() is the API which could be used to map > > > range of kernel memory/pages in drivers which has not considered > > > vm_pgoff. vm_pgoff is passed default as 0 for those drivers. > > > > > > We _could_ then at a later "fix" these drivers which are using > > > vm_insert_range_buggy() to behave according to the normal vm_pgoff > > > offsetting simply by removing the _buggy suffix on the function > > > name and if that causes regressions, it gives us an easy way to revert. > > > > > > Signed-off-by: Souptick Joarder > > > Suggested-by: Russell King > > > Suggested-by: Matthew Wilcox > > > --- > > > include/linux/mm.h | 4 +++ > > > mm/memory.c| 81 > > > ++ > > > mm/nommu.c | 14 ++ > > > 3 files changed, 99 insertions(+) > > > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > > index 80bb640..25752b0 100644 > > > --- a/include/linux/mm.h > > > +++ b/include/linux/mm.h > > > @@ -2565,6 +2565,10 @@ unsigned long change_prot_numa(struct > > > vm_area_struct *vma, > > > int remap_pfn_range(struct vm_area_struct *, unsigned long addr, > > > unsigned long pfn, unsigned long size, pgprot_t); > > > int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct > > > page *); > > > +int vm_insert_range(struct vm_area_struct *vma, struct page **pages, > > > + unsigned long num); > > > +int vm_insert_range_buggy(struct vm_area_struct *vma, struct page > > > **pages, > > > + unsigned long num); > > > vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, > > > unsigned long pfn); > > > vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long > > > addr, > > > diff --git a/mm/memory.c b/mm/memory.c > > > index e11ca9d..0a4bf57 100644 > > > --- a/mm/memory.c > > > +++ b/mm/memory.c > > > @@ -1520,6 +1520,87 @@ int vm_insert_page(struct vm_area_struct *vma, > > > unsigned long addr, > > > } > > > EXPORT_SYMBOL(vm_insert_page); > > > > > > +/** > > > + * __vm_insert_range - insert range of kernel pages into user vma > > > + * @vma: user vma to map to > > > + * @pages: pointer to array of source kernel pages > > > + * @num: number of pages in page array > > > + * @offset: user's requested vm_pgoff > > > + * > > > + * This allows drivers to insert range of kernel pages they've allocated > > > + * into a user vma. > > > + * > > > + * If we fail to insert any page into the vma, the function will return > > > + * immediately leaving any previously inserted pages present. Callers > > > + * from the mmap handler may immediately return the error as their caller > > > + * will destroy the vma, removing any successfully inserted pages. Other > > > + * callers should make their own arrangements for calling unmap_region(). > > > + * > > > + * Context: Process context. > > > + * Return: 0 on success and error code otherwise. 
> > > + */ > > > +static int __vm_insert_range(struct vm_area_struct *vma, struct page > > > **pages, > > > + unsigned long num, unsigned long offset) > > > +{ > > > + unsigned long count = vma_pages(vma); > > > + unsigned long uaddr = vma->vm_start; > > > + int ret, i; > > > + > > > + /* Fail if the user requested offset is beyond the end of the > > > object */ > > > + if (offset > num) > > > +
Re: [Xen-devel] [PATCHv2 1/9] mm: Introduce new vm_insert_range and vm_insert_range_buggy API
On Thu, Jan 31, 2019 at 08:38:12AM +0530, Souptick Joarder wrote: > Previouly drivers have their own way of mapping range of > kernel pages/memory into user vma and this was done by > invoking vm_insert_page() within a loop. > > As this pattern is common across different drivers, it can > be generalized by creating new functions and use it across > the drivers. > > vm_insert_range() is the API which could be used to mapped > kernel memory/pages in drivers which has considered vm_pgoff > > vm_insert_range_buggy() is the API which could be used to map > range of kernel memory/pages in drivers which has not considered > vm_pgoff. vm_pgoff is passed default as 0 for those drivers. > > We _could_ then at a later "fix" these drivers which are using > vm_insert_range_buggy() to behave according to the normal vm_pgoff > offsetting simply by removing the _buggy suffix on the function > name and if that causes regressions, it gives us an easy way to revert. > > Signed-off-by: Souptick Joarder > Suggested-by: Russell King > Suggested-by: Matthew Wilcox > --- > include/linux/mm.h | 4 +++ > mm/memory.c| 81 > ++ > mm/nommu.c | 14 ++ > 3 files changed, 99 insertions(+) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 80bb640..25752b0 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2565,6 +2565,10 @@ unsigned long change_prot_numa(struct vm_area_struct > *vma, > int remap_pfn_range(struct vm_area_struct *, unsigned long addr, > unsigned long pfn, unsigned long size, pgprot_t); > int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page > *); > +int vm_insert_range(struct vm_area_struct *vma, struct page **pages, > + unsigned long num); > +int vm_insert_range_buggy(struct vm_area_struct *vma, struct page **pages, > + unsigned long num); > vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, > unsigned long pfn); > vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long > addr, > diff --git a/mm/memory.c b/mm/memory.c > index e11ca9d..0a4bf57 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -1520,6 +1520,87 @@ int vm_insert_page(struct vm_area_struct *vma, > unsigned long addr, > } > EXPORT_SYMBOL(vm_insert_page); > > +/** > + * __vm_insert_range - insert range of kernel pages into user vma > + * @vma: user vma to map to > + * @pages: pointer to array of source kernel pages > + * @num: number of pages in page array > + * @offset: user's requested vm_pgoff > + * > + * This allows drivers to insert range of kernel pages they've allocated > + * into a user vma. > + * > + * If we fail to insert any page into the vma, the function will return > + * immediately leaving any previously inserted pages present. Callers > + * from the mmap handler may immediately return the error as their caller > + * will destroy the vma, removing any successfully inserted pages. Other > + * callers should make their own arrangements for calling unmap_region(). > + * > + * Context: Process context. > + * Return: 0 on success and error code otherwise. 
> + */ > +static int __vm_insert_range(struct vm_area_struct *vma, struct page **pages, > + unsigned long num, unsigned long offset) > +{ > + unsigned long count = vma_pages(vma); > + unsigned long uaddr = vma->vm_start; > + int ret, i; > + > + /* Fail if the user requested offset is beyond the end of the object */ > + if (offset > num) > + return -ENXIO; > + > + /* Fail if the user requested size exceeds available object size */ > + if (count > num - offset) > + return -ENXIO; > + > + for (i = 0; i < count; i++) { > + ret = vm_insert_page(vma, uaddr, pages[offset + i]); > + if (ret < 0) > + return ret; > + uaddr += PAGE_SIZE; > + } > + > + return 0; > +} > + > +/** > + * vm_insert_range - insert range of kernel pages starts with non zero offset > + * @vma: user vma to map to > + * @pages: pointer to array of source kernel pages > + * @num: number of pages in page array > + * > + * Maps an object consisting of `num' `pages', catering for the user's > + * requested vm_pgoff > + * The elaborate description you've added to __vm_insert_range() is better put here, as this is the "public" function. > + * Context: Process context. Called by mmap handlers. > + * Return: 0 on success and error code otherwise. > + */ > +int vm_insert_range(struct vm_area_struct *vma, struct page **pages, > + unsigned long num) > +{ > + return __vm_insert_range(vma, pages, num, vma->vm_pgoff); > +} > +EXPORT_SYMBOL(vm_insert_range); > + > +/** > + * vm_insert_range_buggy - insert range of kernel pages starts with zero > offset > + * @vma: user vma to map to > +
Re: [Xen-devel] [PATCH v2 19/21] treewide: add checks for the return value of memblock_alloc*()
On Thu, Jan 31, 2019 at 08:07:29AM +0100, Christophe Leroy wrote: > > > Le 31/01/2019 à 07:44, Christophe Leroy a écrit : > > > > > >Le 31/01/2019 à 07:41, Mike Rapoport a écrit : > >>On Thu, Jan 31, 2019 at 07:07:46AM +0100, Christophe Leroy wrote: > >>> > >>> > >>>Le 21/01/2019 à 09:04, Mike Rapoport a écrit : > >>>>Add check for the return value of memblock_alloc*() functions and call > >>>>panic() in case of error. > >>>>The panic message repeats the one used by panicing memblock > >>>>allocators with > >>>>adjustment of parameters to include only relevant ones. > >>>> > >>>>The replacement was mostly automated with semantic patches like the one > >>>>below with manual massaging of format strings. > >>>> > >>>>@@ > >>>>expression ptr, size, align; > >>>>@@ > >>>>ptr = memblock_alloc(size, align); > >>>>+ if (!ptr) > >>>>+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, > >>>>size, align); > >>>> > >>>>Signed-off-by: Mike Rapoport > >>>>Reviewed-by: Guo Ren # c-sky > >>>>Acked-by: Paul Burton # MIPS > >>>>Acked-by: Heiko Carstens # s390 > >>>>Reviewed-by: Juergen Gross # Xen > >>>>--- > >>> > >>>[...] > >>> > >>>>diff --git a/mm/sparse.c b/mm/sparse.c > >>>>index 7ea5dc6..ad94242 100644 > >>>>--- a/mm/sparse.c > >>>>+++ b/mm/sparse.c > >>> > >>>[...] > >>> > >>>>@@ -425,6 +436,10 @@ static void __init sparse_buffer_init(unsigned > >>>>long size, int nid) > >>>> memblock_alloc_try_nid_raw(size, PAGE_SIZE, > >>>> __pa(MAX_DMA_ADDRESS), > >>>> MEMBLOCK_ALLOC_ACCESSIBLE, nid); > >>>>+ if (!sparsemap_buf) > >>>>+ panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d > >>>>from=%lx\n", > >>>>+ __func__, size, PAGE_SIZE, nid, __pa(MAX_DMA_ADDRESS)); > >>>>+ > >>> > >>>memblock_alloc_try_nid_raw() does not panic (help explicitly says: > >>>Does not > >>>zero allocated memory, does not panic if request cannot be satisfied.). > >> > >>"Does not panic" does not mean it always succeeds. > > > >I agree, but at least here you are changing the behaviour by making it > >panic explicitly. Are we sure there are not cases where the system could > >just continue functionning ? Maybe a WARN_ON() would be enough there ? > > Looking more in details, it looks like everything is done to live with > sparsemap_buf NULL, all functions using it check it so having it NULL > shouldn't imply a panic I believe, see code below. You are right, I'm preparing the fix right now. > static void *sparsemap_buf __meminitdata; > static void *sparsemap_buf_end __meminitdata; > > static void __init sparse_buffer_init(unsigned long size, int nid) > { > WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */ > sparsemap_buf = > memblock_alloc_try_nid_raw(size, PAGE_SIZE, > __pa(MAX_DMA_ADDRESS), > MEMBLOCK_ALLOC_ACCESSIBLE, nid); > sparsemap_buf_end = sparsemap_buf + size; > } > > static void __init sparse_buffer_fini(void) > { > unsigned long size = sparsemap_buf_end - sparsemap_buf; > > if (sparsemap_buf && size > 0) > memblock_free_early(__pa(sparsemap_buf), size); > sparsemap_buf = NULL; > } > > void * __meminit sparse_buffer_alloc(unsigned long size) > { > void *ptr = NULL; > > if (sparsemap_buf) { > ptr = PTR_ALIGN(sparsemap_buf, size); > if (ptr + size > sparsemap_buf_end) > ptr = NULL; > else > sparsemap_buf = ptr + size; > } > return ptr; > } > > > Christophe > -- Sincerely yours, Mike. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 19/21] treewide: add checks for the return value of memblock_alloc*()
On Thu, Jan 31, 2019 at 07:07:46AM +0100, Christophe Leroy wrote: > > > Le 21/01/2019 à 09:04, Mike Rapoport a écrit : > >Add check for the return value of memblock_alloc*() functions and call > >panic() in case of error. > >The panic message repeats the one used by panicing memblock allocators with > >adjustment of parameters to include only relevant ones. > > > >The replacement was mostly automated with semantic patches like the one > >below with manual massaging of format strings. > > > >@@ > >expression ptr, size, align; > >@@ > >ptr = memblock_alloc(size, align); > >+ if (!ptr) > >+panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, > >size, align); > > > >Signed-off-by: Mike Rapoport > >Reviewed-by: Guo Ren # c-sky > >Acked-by: Paul Burton # MIPS > >Acked-by: Heiko Carstens # s390 > >Reviewed-by: Juergen Gross # Xen > >--- > > [...] > > >diff --git a/mm/sparse.c b/mm/sparse.c > >index 7ea5dc6..ad94242 100644 > >--- a/mm/sparse.c > >+++ b/mm/sparse.c > > [...] > > >@@ -425,6 +436,10 @@ static void __init sparse_buffer_init(unsigned long > >size, int nid) > > memblock_alloc_try_nid_raw(size, PAGE_SIZE, > > __pa(MAX_DMA_ADDRESS), > > MEMBLOCK_ALLOC_ACCESSIBLE, nid); > >+if (!sparsemap_buf) > >+panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d > >from=%lx\n", > >+ __func__, size, PAGE_SIZE, nid, __pa(MAX_DMA_ADDRESS)); > >+ > > memblock_alloc_try_nid_raw() does not panic (help explicitly says: Does not > zero allocated memory, does not panic if request cannot be satisfied.). "Does not panic" does not mean it always succeeds. > Stephen Rothwell reports a boot failure due to this change. Please see my reply on that thread. > Christophe > > > sparsemap_buf_end = sparsemap_buf + size; > > } > > > -- Sincerely yours, Mike. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 06/21] memblock: memblock_phys_alloc_try_nid(): don't panic
On Fri, Jan 25, 2019 at 05:45:02PM +, Catalin Marinas wrote: > On Mon, Jan 21, 2019 at 10:03:53AM +0200, Mike Rapoport wrote: > > diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c > > index ae34e3a..2c61ea4 100644 > > --- a/arch/arm64/mm/numa.c > > +++ b/arch/arm64/mm/numa.c > > @@ -237,6 +237,10 @@ static void __init setup_node_data(int nid, u64 > > start_pfn, u64 end_pfn) > > pr_info("Initmem setup node %d []\n", nid); > > > > nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); > > + if (!nd_pa) > > + panic("Cannot allocate %zu bytes for node %d data\n", > > + nd_size, nid); > > + > > nd = __va(nd_pa); > > > > /* report and initialize */ > > Does it mean that memblock_phys_alloc_try_nid() never returns valid > physical memory starting at 0? Yes, it does. memblock_find_in_range_node() that is used by all allocation methods skips the first page [1]. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/memblock.c#n257 > -- > Catalin > -- Sincerely yours, Mike. ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 19/21] treewide: add checks for the return value of memblock_alloc*()
Add check for the return value of memblock_alloc*() functions and call panic() in case of error. The panic message repeats the one used by panicing memblock allocators with adjustment of parameters to include only relevant ones. The replacement was mostly automated with semantic patches like the one below with manual massaging of format strings. @@ expression ptr, size, align; @@ ptr = memblock_alloc(size, align); + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", __func__, size, align); Signed-off-by: Mike Rapoport Reviewed-by: Guo Ren # c-sky Acked-by: Paul Burton # MIPS Acked-by: Heiko Carstens # s390 Reviewed-by: Juergen Gross # Xen --- arch/alpha/kernel/core_cia.c | 3 +++ arch/alpha/kernel/core_marvel.c | 6 ++ arch/alpha/kernel/pci-noop.c | 13 +++-- arch/alpha/kernel/pci.c | 11 ++- arch/alpha/kernel/pci_iommu.c | 12 arch/arc/mm/highmem.c | 4 arch/arm/kernel/setup.c | 6 ++ arch/arm/mm/mmu.c | 14 +- arch/arm64/kernel/setup.c | 8 +--- arch/arm64/mm/kasan_init.c| 10 ++ arch/c6x/mm/dma-coherent.c| 4 arch/c6x/mm/init.c| 3 +++ arch/csky/mm/highmem.c| 5 + arch/h8300/mm/init.c | 3 +++ arch/m68k/atari/stram.c | 4 arch/m68k/mm/init.c | 3 +++ arch/m68k/mm/mcfmmu.c | 6 ++ arch/m68k/mm/motorola.c | 9 + arch/m68k/mm/sun3mmu.c| 6 ++ arch/m68k/sun3/sun3dvma.c | 3 +++ arch/microblaze/mm/init.c | 8 ++-- arch/mips/cavium-octeon/dma-octeon.c | 3 +++ arch/mips/kernel/setup.c | 3 +++ arch/mips/kernel/traps.c | 3 +++ arch/mips/mm/init.c | 5 + arch/nds32/mm/init.c | 12 arch/openrisc/mm/ioremap.c| 8 ++-- arch/powerpc/kernel/dt_cpu_ftrs.c | 5 + arch/powerpc/kernel/pci_32.c | 3 +++ arch/powerpc/kernel/setup-common.c| 3 +++ arch/powerpc/kernel/setup_64.c| 4 arch/powerpc/lib/alloc.c | 3 +++ arch/powerpc/mm/hash_utils_64.c | 3 +++ arch/powerpc/mm/mmu_context_nohash.c | 9 + arch/powerpc/mm/pgtable-book3e.c | 12 ++-- arch/powerpc/mm/pgtable-book3s64.c| 3 +++ arch/powerpc/mm/pgtable-radix.c | 9 - arch/powerpc/mm/ppc_mmu_32.c | 3 +++ arch/powerpc/platforms/pasemi/iommu.c | 3 +++ arch/powerpc/platforms/powermac/nvram.c | 3 +++ arch/powerpc/platforms/powernv/opal.c | 3 +++ arch/powerpc/platforms/powernv/pci-ioda.c | 8 arch/powerpc/platforms/ps3/setup.c| 3 +++ arch/powerpc/sysdev/msi_bitmap.c | 3 +++ arch/s390/kernel/setup.c | 13 + arch/s390/kernel/smp.c| 5 - arch/s390/kernel/topology.c | 6 ++ arch/s390/numa/mode_emu.c | 3 +++ arch/s390/numa/numa.c | 6 +- arch/sh/mm/init.c | 6 ++ arch/sh/mm/numa.c | 4 arch/um/drivers/net_kern.c| 3 +++ arch/um/drivers/vector_kern.c | 3 +++ arch/um/kernel/initrd.c | 2 ++ arch/um/kernel/mem.c | 16 arch/unicore32/kernel/setup.c | 4 arch/unicore32/mm/mmu.c | 15 +-- arch/x86/kernel/acpi/boot.c | 3 +++ arch/x86/kernel/apic/io_apic.c| 5 + arch/x86/kernel/e820.c| 3 +++ arch/x86/platform/olpc/olpc_dt.c | 3 +++ arch/x86/xen/p2m.c| 11 +-- arch/xtensa/mm/kasan_init.c | 4 arch/xtensa/mm/mmu.c | 3 +++ drivers/clk/ti/clk.c | 3 +++ drivers/macintosh/smu.c | 3 +++ drivers/of/fdt.c | 8 +++- drivers/of/unittest.c | 8 +++- drivers/xen/swiotlb-xen.c | 7 +-- kernel/power/snapshot.c | 3 +++ lib/cpumask.c | 3 +++ mm/kasan/init.c | 10 -- mm/sparse.c | 19 +-- 73 files changed, 409 insertions(+), 28 deletions(-) diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c index 466cd44..f489170 100644 --- a/arch/alpha/kernel/core_cia.c +++ b/arch/alpha/kernel/core_cia.c @@ -33
[Xen-devel] [PATCH v2 20/21] memblock: memblock_alloc_try_nid: don't panic
As all the memblock_alloc*() users are now checking the return value and panic() in case of error, the panic() call can be removed from the core memblock allocator, namely memblock_alloc_try_nid(). Signed-off-by: Mike Rapoport --- mm/memblock.c | 15 +-- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 03b3929..7164275 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1526,7 +1526,7 @@ void * __init memblock_alloc_try_nid_nopanic( } /** - * memblock_alloc_try_nid - allocate boot memory block with panicking + * memblock_alloc_try_nid - allocate boot memory block * @size: size of memory block to be allocated in bytes * @align: alignment of the region and block's size * @min_addr: the lower bound of the memory region from where the allocation @@ -1536,9 +1536,8 @@ void * __init memblock_alloc_try_nid_nopanic( * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of memblock_alloc_try_nid_nopanic() - * which provides debug information (including caller info), if enabled, - * and panics if the request can not be satisfied. + * Public function, provides additional debug information (including caller + * info), if enabled. This function zeroes the allocated memory. * * Return: * Virtual address of allocated memory block on success, NULL on failure. @@ -1555,14 +1554,10 @@ void * __init memblock_alloc_try_nid( &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid); - if (ptr) { + if (ptr) memset(ptr, 0, size); - return ptr; - } - panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n", - __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr); - return NULL; + return ptr; } /** -- 2.7.4 ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 17/21] init/main: add checks for the return value of memblock_alloc*()
Add panic() calls if memblock_alloc() returns NULL. The panic() format duplicates the one used by memblock itself and in order to avoid explosion with long parameters list replace open coded allocation size calculations with a local variable. Signed-off-by: Mike Rapoport --- init/main.c | 26 -- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/init/main.c b/init/main.c index a56f65a..d58a365 100644 --- a/init/main.c +++ b/init/main.c @@ -373,12 +373,20 @@ static inline void smp_prepare_cpus(unsigned int maxcpus) { } */ static void __init setup_command_line(char *command_line) { - saved_command_line = - memblock_alloc(strlen(boot_command_line) + 1, SMP_CACHE_BYTES); - initcall_command_line = - memblock_alloc(strlen(boot_command_line) + 1, SMP_CACHE_BYTES); - static_command_line = memblock_alloc(strlen(command_line) + 1, -SMP_CACHE_BYTES); + size_t len = strlen(boot_command_line) + 1; + + saved_command_line = memblock_alloc(len, SMP_CACHE_BYTES); + if (!saved_command_line) + panic("%s: Failed to allocate %zu bytes\n", __func__, len); + + initcall_command_line = memblock_alloc(len, SMP_CACHE_BYTES); + if (!initcall_command_line) + panic("%s: Failed to allocate %zu bytes\n", __func__, len); + + static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); + if (!static_command_line) + panic("%s: Failed to allocate %zu bytes\n", __func__, len); + strcpy(saved_command_line, boot_command_line); strcpy(static_command_line, command_line); } @@ -773,8 +781,14 @@ static int __init initcall_blacklist(char *str) pr_debug("blacklisting initcall %s\n", str_entry); entry = memblock_alloc(sizeof(*entry), SMP_CACHE_BYTES); + if (!entry) + panic("%s: Failed to allocate %zu bytes\n", + __func__, sizeof(*entry)); entry->buf = memblock_alloc(strlen(str_entry) + 1, SMP_CACHE_BYTES); + if (!entry->buf) + panic("%s: Failed to allocate %zu bytes\n", + __func__, strlen(str_entry) + 1); strcpy(entry->buf, str_entry); list_add(&entry->next, &blacklisted_initcalls); } -- 2.7.4 ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 03/21] memblock: replace memblock_alloc_base(ANYWHERE) with memblock_phys_alloc
The calls to memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE) and memblock_phys_alloc(size, align) are equivalent as both try to allocate 'size' bytes with 'align' alignment anywhere in memory and panic if the allocation fails. The conversion is done using the following semantic patch: @@ expression size, align; @@ - memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE) + memblock_phys_alloc(size, align) Signed-off-by: Mike Rapoport --- arch/arm/mm/init.c | 2 +- arch/sh/boards/mach-ap325rxa/setup.c | 2 +- arch/sh/boards/mach-ecovec24/setup.c | 4 ++-- arch/sh/boards/mach-kfr2r09/setup.c | 2 +- arch/sh/boards/mach-migor/setup.c| 2 +- arch/sh/boards/mach-se/7724/setup.c | 4 ++-- arch/xtensa/mm/kasan_init.c | 3 +-- 7 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 478ea8b..b76b90e 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -205,7 +205,7 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align) BUG_ON(!arm_memblock_steal_permitted); - phys = memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); + phys = memblock_phys_alloc(size, align); memblock_free(phys, size); memblock_remove(phys, size); diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c index 8f234d04..d7ceab6 100644 --- a/arch/sh/boards/mach-ap325rxa/setup.c +++ b/arch/sh/boards/mach-ap325rxa/setup.c @@ -557,7 +557,7 @@ static void __init ap325rxa_mv_mem_reserve(void) phys_addr_t phys; phys_addr_t size = CEU_BUFFER_MEMORY_SIZE; - phys = memblock_alloc_base(size, PAGE_SIZE, MEMBLOCK_ALLOC_ANYWHERE); + phys = memblock_phys_alloc(size, PAGE_SIZE); memblock_free(phys, size); memblock_remove(phys, size); diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 22b4106..a3901806 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -1480,12 +1480,12 @@ static void __init ecovec_mv_mem_reserve(void) phys_addr_t phys; phys_addr_t size = CEU_BUFFER_MEMORY_SIZE; - phys = memblock_alloc_base(size, PAGE_SIZE, MEMBLOCK_ALLOC_ANYWHERE); + phys = memblock_phys_alloc(size, PAGE_SIZE); memblock_free(phys, size); memblock_remove(phys, size); ceu0_dma_membase = phys; - phys = memblock_alloc_base(size, PAGE_SIZE, MEMBLOCK_ALLOC_ANYWHERE); + phys = memblock_phys_alloc(size, PAGE_SIZE); memblock_free(phys, size); memblock_remove(phys, size); ceu1_dma_membase = phys; diff --git a/arch/sh/boards/mach-kfr2r09/setup.c b/arch/sh/boards/mach-kfr2r09/setup.c index 203d249..55bdf4a 100644 --- a/arch/sh/boards/mach-kfr2r09/setup.c +++ b/arch/sh/boards/mach-kfr2r09/setup.c @@ -631,7 +631,7 @@ static void __init kfr2r09_mv_mem_reserve(void) phys_addr_t phys; phys_addr_t size = CEU_BUFFER_MEMORY_SIZE; - phys = memblock_alloc_base(size, PAGE_SIZE, MEMBLOCK_ALLOC_ANYWHERE); + phys = memblock_phys_alloc(size, PAGE_SIZE); memblock_free(phys, size); memblock_remove(phys, size); diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c index f4ad33c..ba7eee6 100644 --- a/arch/sh/boards/mach-migor/setup.c +++ b/arch/sh/boards/mach-migor/setup.c @@ -630,7 +630,7 @@ static void __init migor_mv_mem_reserve(void) phys_addr_t phys; phys_addr_t size = CEU_BUFFER_MEMORY_SIZE; - phys = memblock_alloc_base(size, PAGE_SIZE, MEMBLOCK_ALLOC_ANYWHERE); + phys = memblock_phys_alloc(size, PAGE_SIZE); memblock_free(phys, size); memblock_remove(phys, size); diff --git a/arch/sh/boards/mach-se/7724/setup.c
b/arch/sh/boards/mach-se/7724/setup.c index fdbec22a..4696e10 100644 --- a/arch/sh/boards/mach-se/7724/setup.c +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -965,12 +965,12 @@ static void __init ms7724se_mv_mem_reserve(void) phys_addr_t phys; phys_addr_t size = CEU_BUFFER_MEMORY_SIZE; - phys = memblock_alloc_base(size, PAGE_SIZE, MEMBLOCK_ALLOC_ANYWHERE); + phys = memblock_phys_alloc(size, PAGE_SIZE); memblock_free(phys, size); memblock_remove(phys, size); ceu0_dma_membase = phys; - phys = memblock_alloc_base(size, PAGE_SIZE, MEMBLOCK_ALLOC_ANYWHERE); + phys = memblock_phys_alloc(size, PAGE_SIZE); memblock_free(phys, size); memblock_remove(phys, size); ceu1_dma_membase = phys; diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c index 1734cda..48dbb03 100644 --- a/arch/xtensa/mm/kasan_init.c +++ b/arch/xtensa/mm/kasan_init.c @@ -52,8 +52,7 @@ static void __init populate(void *start, void *end) for (k = 0; k < PTRS_PER_PTE; ++k, ++j) {
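Most of the converted call sites above follow the "steal" idiom: let memblock pick a free physical range, then immediately free and remove it so the kernel never maps or manages that memory (it is handed to a device such as the CEU instead). A condensed sketch of the idiom, modeled on the code above; the failure check anticipates the later patch in this series that makes memblock_phys_alloc() return 0 instead of panicking:

        static phys_addr_t __init steal_dma_area(phys_addr_t size)
        {
                phys_addr_t phys = memblock_phys_alloc(size, PAGE_SIZE);

                if (!phys)
                        panic("%s: Failed to allocate %pa bytes\n",
                              __func__, &size);

                /* give the range back to memblock's free list ... */
                memblock_free(phys, size);
                /* ... and then drop it from the memory map entirely */
                memblock_remove(phys, size);

                return phys;
        }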
[Xen-devel] [PATCH v2 18/21] swiotlb: add checks for the return value of memblock_alloc*()
Add panic() calls if memblock_alloc() returns NULL. The panic() format duplicates the one used by memblock itself and in order to avoid explosion with long parameters list replace open coded allocation size calculations with a local variable. Signed-off-by: Mike Rapoport --- kernel/dma/swiotlb.c | 19 +-- 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index d636177..e78835c8 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -191,6 +191,7 @@ void __init swiotlb_update_mem_attributes(void) int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) { unsigned long i, bytes; + size_t alloc_size; bytes = nslabs << IO_TLB_SHIFT; @@ -203,12 +204,18 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE * between io_tlb_start and io_tlb_end. */ - io_tlb_list = memblock_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(int)), - PAGE_SIZE); - io_tlb_orig_addr = memblock_alloc( - PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)), - PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(int)); + io_tlb_list = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_list) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)); + io_tlb_orig_addr = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_orig_addr) + panic("%s: Failed to allocate %lu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; -- 2.7.4 ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 14/21] ia64: add checks for the return value of memblock_alloc*()
Add panic() calls if memblock_alloc*() returns NULL. Most of the changes are simply addition of if(!ptr) panic(); statements after the calls to memblock_alloc*() variants. Exceptions are create_mem_map_page_table() and ia64_log_init() that were slightly refactored to accommodate the change. Signed-off-by: Mike Rapoport --- arch/ia64/kernel/mca.c | 20 ++-- arch/ia64/mm/contig.c | 8 ++-- arch/ia64/mm/discontig.c| 4 arch/ia64/mm/init.c | 38 ++ arch/ia64/mm/tlb.c | 6 ++ arch/ia64/sn/kernel/io_common.c | 3 +++ arch/ia64/sn/kernel/setup.c | 12 +++- 7 files changed, 74 insertions(+), 17 deletions(-) diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 370bc34..5cabb3f 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -359,11 +359,6 @@ typedef struct ia64_state_log_s static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES]; -#define IA64_LOG_ALLOCATE(it, size) \ - {ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] = \ - (ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES); \ - ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] = \ - (ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES);} #define IA64_LOG_LOCK_INIT(it) spin_lock_init(&ia64_state_log[it].isl_lock) #define IA64_LOG_LOCK(it) spin_lock_irqsave(&ia64_state_log[it].isl_lock, s) #define IA64_LOG_UNLOCK(it) spin_unlock_irqrestore(&ia64_state_log[it].isl_lock,s) @@ -378,6 +373,19 @@ static ia64_state_log_t ia64_state_log[IA64_MAX_LOG_TYPES]; #define IA64_LOG_CURR_BUFFER(it) (void *)((ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)])) #define IA64_LOG_COUNT(it) ia64_state_log[it].isl_count +static inline void ia64_log_allocate(int it, u64 size) +{ + ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)] = + (ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES); + if (!ia64_state_log[it].isl_log[IA64_LOG_CURR_INDEX(it)]) + panic("%s: Failed to allocate %llu bytes\n", __func__, size); + + ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)] = + (ia64_err_rec_t *)memblock_alloc(size, SMP_CACHE_BYTES); + if (!ia64_state_log[it].isl_log[IA64_LOG_NEXT_INDEX(it)]) + panic("%s: Failed to allocate %llu bytes\n", __func__, size); +} + /* * ia64_log_init * Reset the OS ia64 log buffer @@ -399,7 +407,7 @@ ia64_log_init(int sal_info_type) return; // set up OS data structures to hold error info - IA64_LOG_ALLOCATE(sal_info_type, max_size); + ia64_log_allocate(sal_info_type, max_size); } /* diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index 6e44723..d29fb6b 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -84,9 +84,13 @@ void *per_cpu_init(void) static inline void alloc_per_cpu_data(void) { - cpu_data = memblock_alloc_from(PERCPU_PAGE_SIZE * num_possible_cpus(), - PERCPU_PAGE_SIZE, + size_t size = PERCPU_PAGE_SIZE * num_possible_cpus(); + + cpu_data = memblock_alloc_from(size, PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!cpu_data) + panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", + __func__, size, PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); } /** diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index f9c3675..05490dd 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -454,6 +454,10 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize) __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, bestnode); + if (!ptr) + panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%lx\n", + __func__, pernodesize, PERCPU_PAGE_SIZE, bestnode, + __pa(MAX_DMA_ADDRESS)); return ptr; } diff --git a/arch/ia64/mm/init.c 
b/arch/ia64/mm/init.c index 29d8415..e49200e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -444,23 +444,45 @@ int __init create_mem_map_page_table(u64 start, u64 end, void *arg) for (address = start_page; address < end_page; address += PAGE_SIZE) { pgd = pgd_offset_k(address); - if (pgd_none(*pgd)) - pgd_populate(&init_mm, pgd, memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node)); + if (pgd_none(*pgd)) { + pud = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node); + if (!pud) + goto err_all
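The refactoring in create_mem_map_page_table() (and, later in this thread, in sparc's pcpu_populate_pte()) funnels every allocation failure into a single label rather than repeating the panic() at each page-table level. A rough sketch of that shape, with illustrative names:

        static int __init populate_one_address(unsigned long addr, int node)
        {
                void *pud, *pmd, *pte;

                pud = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node);
                if (!pud)
                        goto err_alloc;

                pmd = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node);
                if (!pmd)
                        goto err_alloc;

                pte = memblock_alloc_node(PAGE_SIZE, PAGE_SIZE, node);
                if (!pte)
                        goto err_alloc;

                /* ... hook pud/pmd/pte into the page tables for addr ... */
                return 0;

        err_alloc:
                panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d\n",
                      __func__, PAGE_SIZE, PAGE_SIZE, node);
        }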
[Xen-devel] [PATCH v2 21/21] memblock: drop memblock_alloc_*_nopanic() variants
As all the memblock allocation functions return NULL in case of error rather than panic(), the duplicates with _nopanic suffix can be removed. Signed-off-by: Mike Rapoport Acked-by: Greg Kroah-Hartman --- arch/arc/kernel/unwind.c | 3 +-- arch/sh/mm/init.c | 2 +- arch/x86/kernel/setup_percpu.c | 10 +- arch/x86/mm/kasan_init_64.c| 14 -- drivers/firmware/memmap.c | 2 +- drivers/usb/early/xhci-dbc.c | 2 +- include/linux/memblock.h | 35 --- kernel/dma/swiotlb.c | 2 +- kernel/printk/printk.c | 9 + mm/memblock.c | 35 --- mm/page_alloc.c| 10 +- mm/page_ext.c | 2 +- mm/percpu.c| 11 --- mm/sparse.c| 6 ++ 14 files changed, 31 insertions(+), 112 deletions(-) diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index d34f69e..271e9fa 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -181,8 +181,7 @@ static void init_unwind_hdr(struct unwind_table *table, */ static void *__init unw_hdr_alloc_early(unsigned long sz) { - return memblock_alloc_from_nopanic(sz, sizeof(unsigned int), - MAX_DMA_ADDRESS); + return memblock_alloc_from(sz, sizeof(unsigned int), MAX_DMA_ADDRESS); } static void *unw_hdr_alloc(unsigned long sz) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index fceefd9..7062132 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -202,7 +202,7 @@ void __init allocate_pgdat(unsigned int nid) get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #ifdef CONFIG_NEED_MULTIPLE_NODES - NODE_DATA(nid) = memblock_alloc_try_nid_nopanic( + NODE_DATA(nid) = memblock_alloc_try_nid( sizeof(struct pglist_data), SMP_CACHE_BYTES, MEMBLOCK_LOW_LIMIT, MEMBLOCK_ALLOC_ACCESSIBLE, nid); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e8796fc..0c5e9bf 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -106,22 +106,22 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, void *ptr; if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from_nopanic(size, align, goal); + ptr = memblock_alloc_from(size, align, goal); pr_info("cpu %d has no node %d or node-local memory\n", cpu, node); pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", cpu, size, __pa(ptr)); } else { - ptr = memblock_alloc_try_nid_nopanic(size, align, goal, -MEMBLOCK_ALLOC_ACCESSIBLE, -node); + ptr = memblock_alloc_try_nid(size, align, goal, +MEMBLOCK_ALLOC_ACCESSIBLE, +node); pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", cpu, size, node, __pa(ptr)); } return ptr; #else - return memblock_alloc_from_nopanic(size, align, goal); + return memblock_alloc_from(size, align, goal); #endif } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 462fde8..8dc0fc0 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -24,14 +24,16 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static __init void *early_alloc(size_t size, int nid, bool panic) +static __init void *early_alloc(size_t size, int nid, bool should_panic) { - if (panic) - return memblock_alloc_try_nid(size, size, - __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); - else - return memblock_alloc_try_nid_nopanic(size, size, + void *ptr = memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); + + if (!ptr && should_panic) + panic("%pS: Failed to allocate page, nid=%d from=%lx\n", + (void *)_RET_IP_, nid, __pa(MAX_DMA_ADDRESS)); + + return ptr; } static void __init 
kasan_populate_pmd(pmd_t *pmd, unsigned long addr, diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index ec4fd25..d168c87 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -333,7 +333,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry
[Xen-devel] [PATCH v2 15/21] sparc: add checks for the return value of memblock_alloc*()
Add panic() calls if memblock_alloc*() returns NULL. Most of the changes are simply addition of if(!ptr) panic(); statements after the calls to memblock_alloc*() variants. Exceptions are pcpu_populate_pte() and kernel_map_range() that were slightly refactored to accommodate the change. Signed-off-by: Mike Rapoport Acked-by: David S. Miller --- arch/sparc/kernel/prom_32.c | 2 ++ arch/sparc/kernel/setup_64.c | 6 ++ arch/sparc/kernel/smp_64.c | 12 arch/sparc/mm/init_64.c | 11 +++ arch/sparc/mm/srmmu.c| 8 5 files changed, 39 insertions(+) diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c index e7126ca..869b16c 100644 --- a/arch/sparc/kernel/prom_32.c +++ b/arch/sparc/kernel/prom_32.c @@ -33,6 +33,8 @@ void * __init prom_early_alloc(unsigned long size) void *ret; ret = memblock_alloc(size, SMP_CACHE_BYTES); + if (!ret) + panic("%s: Failed to allocate %lu bytes\n", __func__, size); prom_early_allocated += size; diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index 51c4d12..fd2182a 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -624,8 +624,14 @@ void __init alloc_irqstack_bootmem(void) softirq_stack[i] = memblock_alloc_node(THREAD_SIZE, THREAD_SIZE, node); + if (!softirq_stack[i]) + panic("%s: Failed to allocate %lu bytes align=%lx nid=%d\n", + __func__, THREAD_SIZE, THREAD_SIZE, node); hardirq_stack[i] = memblock_alloc_node(THREAD_SIZE, THREAD_SIZE, node); + if (!hardirq_stack[i]) + panic("%s: Failed to allocate %lu bytes align=%lx nid=%d\n", + __func__, THREAD_SIZE, THREAD_SIZE, node); } } diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index f45d876..a8275fe 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1628,6 +1628,8 @@ static void __init pcpu_populate_pte(unsigned long addr) pud_t *new; new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; pgd_populate(&init_mm, pgd, new); } @@ -1636,6 +1638,8 @@ static void __init pcpu_populate_pte(unsigned long addr) pmd_t *new; new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; pud_populate(&init_mm, pud, new); } @@ -1644,8 +1648,16 @@ static void __init pcpu_populate_pte(unsigned long addr) pte_t *new; new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; pmd_populate_kernel(&init_mm, pmd, new); } + + return; + +err_alloc: + panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", + __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); } void __init setup_per_cpu_areas(void) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index ef340e8..f2d70ff 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1809,6 +1809,8 @@ static unsigned long __ref kernel_map_range(unsigned long pstart, new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; alloc_bytes += PAGE_SIZE; pgd_populate(&init_mm, pgd, new); } @@ -1822,6 +1824,8 @@ static unsigned long __ref kernel_map_range(unsigned long pstart, } new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; alloc_bytes += PAGE_SIZE; pud_populate(&init_mm, pud, new); } @@ -1836,6 +1840,8 @@ static unsigned long __ref kernel_map_range(unsigned long pstart, } new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); + if (!new) + goto err_alloc; alloc_bytes += PAGE_SIZE; pmd_populate_kernel(&init_mm, pmd, new); } @@ -1855,6 +1861,11 @@ static unsigned long __ref kernel_map_range(unsigned long 
pstart,
[Xen-devel] [PATCH v2 16/21] mm/percpu: add checks for the return value of memblock_alloc*()
Add panic() calls if memblock_alloc() returns NULL. The panic() format duplicates the one used by memblock itself and in order to avoid explosion with long parameters list replace open coded allocation size calculations with a local variable. Signed-off-by: Mike Rapoport --- mm/percpu.c | 73 +++-- 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index db86282..5998b03 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1086,6 +1086,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, struct pcpu_chunk *chunk; unsigned long aligned_addr, lcm_align; int start_offset, offset_bits, region_size, region_bits; + size_t alloc_size; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; @@ -1101,9 +1102,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ - chunk = memblock_alloc(sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT), - SMP_CACHE_BYTES); + alloc_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(region_size >> PAGE_SHIFT); + chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); INIT_LIST_HEAD(&chunk->list); @@ -1114,12 +1118,25 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); - chunk->alloc_map = memblock_alloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), - SMP_CACHE_BYTES); - chunk->bound_map = memblock_alloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), - SMP_CACHE_BYTES); - chunk->md_blocks = memblock_alloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), - SMP_CACHE_BYTES); + alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); + chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->alloc_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = + BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); + chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->bound_map) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); + chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!chunk->md_blocks) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ @@ -2044,6 +2061,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int group, unit, i; int map_size; unsigned long tmp_addr; + size_t alloc_size; #define PCPU_SETUP_BUG_ON(cond)do { \ if (unlikely(cond)) { \ @@ -2075,14 +2093,29 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ - group_offsets = memblock_alloc(ai->nr_groups * sizeof(group_offsets[0]), - SMP_CACHE_BYTES); - group_sizes = memblock_alloc(ai->nr_groups * sizeof(group_sizes[0]), -SMP_CACHE_BYTES); - unit_map = memblock_alloc(nr_cpu_ids * sizeof(unit_map[0]), - SMP_CACHE_BYTES); - unit_off = memblock_alloc(nr_cpu_ids * sizeof(unit_off[0]), - SMP_CACHE_BYTES); + alloc_size = ai->nr_groups * sizeof(group_offsets[0]); + group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if 
(!group_offsets) + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + + alloc_size = ai->nr_groups * sizeof(group_sizes[0]); + group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); + if (!group_sizes) + panic("%s: Failed to allocate %zu bytes\n&q
[Xen-devel] [PATCH v2 02/21] powerpc: use memblock functions returning virtual address
From: Christophe Leroy Since only the virtual address of allocated blocks is used, let's use functions that return the virtual address directly. Those functions have the advantage of also zeroing the block. [ MR: - updated error message in alloc_stack() to be more verbose - converted several additional call sites ] Signed-off-by: Christophe Leroy Signed-off-by: Mike Rapoport --- arch/powerpc/kernel/dt_cpu_ftrs.c | 3 +-- arch/powerpc/kernel/irq.c | 5 - arch/powerpc/kernel/paca.c| 6 +- arch/powerpc/kernel/prom.c| 5 - arch/powerpc/kernel/setup_32.c| 26 -- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 8be3721..2554824 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -813,7 +813,6 @@ static int __init process_cpufeatures_node(unsigned long node, int len; f = &dt_cpu_features[i]; - memset(f, 0, sizeof(struct dt_cpu_feature)); f->node = node; @@ -1008,7 +1007,7 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char /* Count and allocate space for cpu features */ of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes, &nr_dt_cpu_features); - dt_cpu_features = __va(memblock_phys_alloc(sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, PAGE_SIZE)); + dt_cpu_features = memblock_alloc(sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, PAGE_SIZE); cpufeatures_setup_start(isa); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 916ddc4..4a44bc3 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -725,18 +725,15 @@ void exc_lvl_ctx_init(void) #endif #endif - memset((void *)critirq_ctx[cpu_nr], 0, THREAD_SIZE); tp = critirq_ctx[cpu_nr]; tp->cpu = cpu_nr; tp->preempt_count = 0; #ifdef CONFIG_BOOKE - memset((void *)dbgirq_ctx[cpu_nr], 0, THREAD_SIZE); tp = dbgirq_ctx[cpu_nr]; tp->cpu = cpu_nr; tp->preempt_count = 0; - memset((void *)mcheckirq_ctx[cpu_nr], 0, THREAD_SIZE); tp = mcheckirq_ctx[cpu_nr]; tp->cpu = cpu_nr; tp->preempt_count = HARDIRQ_OFFSET; @@ -754,12 +751,10 @@ void irq_ctx_init(void) int i; for_each_possible_cpu(i) { - memset((void *)softirq_ctx[i], 0, THREAD_SIZE); tp = softirq_ctx[i]; tp->cpu = i; klp_init_thread_info(tp); - memset((void *)hardirq_ctx[i], 0, THREAD_SIZE); tp = hardirq_ctx[i]; tp->cpu = i; klp_init_thread_info(tp); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 8c890c6..e7382ab 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -196,7 +196,11 @@ void __init allocate_paca_ptrs(void) paca_nr_cpu_ids = nr_cpu_ids; paca_ptrs_size = sizeof(struct paca_struct *) * nr_cpu_ids; - paca_ptrs = __va(memblock_phys_alloc(paca_ptrs_size, SMP_CACHE_BYTES)); + paca_ptrs = memblock_alloc_raw(paca_ptrs_size, SMP_CACHE_BYTES); + if (!paca_ptrs) + panic("Failed to allocate %d bytes for paca pointers\n", + paca_ptrs_size); + memset(paca_ptrs, 0x88, paca_ptrs_size); } diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index e97aaf2..c0ed4fa 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -127,7 +127,10 @@ static void __init move_device_tree(void) if ((memory_limit && (start + size) > PHYSICAL_START + memory_limit) || !memblock_is_memory(start + size - 1) || overlaps_crashkernel(start, size) || overlaps_initrd(start, size)) { - p = __va(memblock_phys_alloc(size, PAGE_SIZE)); + p = memblock_alloc_raw(size, PAGE_SIZE); + if (!p) + panic("Failed to allocate %lu bytes to move device tree\n", + size);
memcpy(p, initial_boot_params, size); initial_boot_params = p; DBG("Moved device tree to 0x%px\n", p); diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 947f904..1f0b762 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -196,6 +196,17 @@ static int __init ppc_init(void) } arch_initcall(ppc_init); +static void *__init alloc_stack(void) +{ + void *ptr = memblock_alloc(THREAD_SIZE, THREAD_SIZE); + + if (!ptr) + panic("cannot allocate %d bytes for stack at %
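The powerpc conversion leans on the fact that the virtual-address allocators zero the block themselves, while memblock_alloc_raw() is available when the caller overwrites the memory anyway (as with paca_ptrs, which is memset to 0x88 right after). In condensed form (the variable p is illustrative):

        /* open-coded: physical allocation, manual __va() and manual clearing */
        p = __va(memblock_phys_alloc(size, PAGE_SIZE));
        memset(p, 0, size);

        /* equivalent, and already zeroed */
        p = memblock_alloc(size, PAGE_SIZE);

        /* when the contents are immediately overwritten, skip the zeroing */
        p = memblock_alloc_raw(size, PAGE_SIZE);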
[Xen-devel] [PATCH v2 10/21] memblock: refactor internal allocation functions
Currently, memblock has several internal functions with overlapping functionality. They all call memblock_find_in_range_node() to find free memory and then reserve the allocated range and mark it with kmemleak. However, there is difference in the allocation constraints and in fallback strategies. The allocations returning physical address first attempt to find free memory on the specified node within mirrored memory regions, then retry on the same node without the requirement for memory mirroring and finally fall back to all available memory. The allocations returning virtual address start with clamping the allowed range to memblock.current_limit, attempt to allocate from the specified node from regions with mirroring and with user defined minimal address. If such allocation fails, next attempt is done with node restriction lifted. Next, the allocation is retried with minimal address reset to zero and at last without the requirement for mirrored regions. Let's consolidate various fallbacks handling and make them more consistent for physical and virtual variants. Most of the fallback handling is moved to memblock_alloc_range_nid() and it now handles node and mirror fallbacks. The memblock_alloc_internal() uses memblock_alloc_range_nid() to get a physical address of the allocated range and converts it to virtual address. The fallback for allocation below the specified minimal address remains in memblock_alloc_internal() because memblock_alloc_range_nid() is used by CMA with exact requirement for lower bounds. The memblock_phys_alloc_nid() function is completely dropped as it is not used anywhere outside memblock and its only usage can be replaced by a call to memblock_alloc_range_nid(). Signed-off-by: Mike Rapoport --- include/linux/memblock.h | 1 - mm/memblock.c| 173 +-- 2 files changed, 78 insertions(+), 96 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6874fdc..cf4cd9c 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -371,7 +371,6 @@ static inline int memblock_get_region_node(const struct memblock_region *r) phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end); -phys_addr_t memblock_phys_alloc_nid(phys_addr_t size, phys_addr_t align, int nid); phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); static inline phys_addr_t memblock_phys_alloc(phys_addr_t size, diff --git a/mm/memblock.c b/mm/memblock.c index 531fa77..739f769 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1312,30 +1312,84 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ +/** + * memblock_alloc_range_nid - allocate boot memory block + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @start: the lower bound of the memory region to allocate (phys address) + * @end: the upper bound of the memory region to allocate (phys address) + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * The allocation is performed from memory region limited by + * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. + * + * If the specified node can not hold the requested memory the + * allocation falls back to any node in the system + * + * For systems with memory mirroring, the allocation is attempted first + * from the regions with mirroring enabled and then retried from any + * memory region. 
+ * + * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for + * allocated boot memory block, so that it is never reported as leaks. + * + * Return: + * Physical address of allocated memory block on success, %0 on failure. + */ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid, - enum memblock_flags flags) + phys_addr_t end, int nid) { + enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; + if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) + nid = NUMA_NO_NODE; + if (!align) { /* Can't use WARNs this early in boot on powerpc */ dump_stack(); align = SMP_CACHE_BYTES; } + if (end > memblock.current_limit) + end = memblock.current_limit; + +again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); - if (found && !memblock_reserve(found, size)) { + if (found && !membloc
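Schematically, the fallback ordering described above ends up as a retry loop in memblock_alloc_range_nid(); a simplified sketch of the control flow, not the literal function body:

        enum memblock_flags flags = choose_memblock_flags();
        phys_addr_t found;

again:
        found = memblock_find_in_range_node(size, align, start, end, nid, flags);
        if (found && !memblock_reserve(found, size))
                goto done;

        if (nid != NUMA_NO_NODE) {
                /* first fallback: drop the node restriction */
                nid = NUMA_NO_NODE;
                goto again;
        }

        if (flags & MEMBLOCK_MIRROR) {
                /* last fallback: allow non-mirrored memory */
                flags &= ~MEMBLOCK_MIRROR;
                goto again;
        }

        return 0;
done:
        /* mark the range for kmemleak and return its physical address */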
[Xen-devel] [PATCH v2 11/21] memblock: make memblock_find_in_range_node() and choose_memblock_flags() static
These functions are not used outside memblock. Make them static. Signed-off-by: Mike Rapoport --- include/linux/memblock.h | 4 mm/memblock.c| 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index cf4cd9c..f5a83a1 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -111,9 +111,6 @@ void memblock_discard(void); #define memblock_dbg(fmt, ...) \ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end, - int nid, enum memblock_flags flags); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); void memblock_allow_resize(void); @@ -130,7 +127,6 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); -enum memblock_flags choose_memblock_flags(void); unsigned long memblock_free_all(void); void reset_node_managed_pages(pg_data_t *pgdat); diff --git a/mm/memblock.c b/mm/memblock.c index 739f769..03b3929 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -125,7 +125,7 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; -enum memblock_flags __init_memblock choose_memblock_flags(void) +static enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; } @@ -254,7 +254,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * Return: * Found address on success, 0 on failure. */ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, +static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid, enum memblock_flags flags) -- 2.7.4 ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 07/21] memblock: memblock_phys_alloc(): don't panic
Make the memblock_phys_alloc() function an inline wrapper for memblock_phys_alloc_range() and update the memblock_phys_alloc() callers to check the returned value and panic in case of error. Signed-off-by: Mike Rapoport --- arch/arm/mm/init.c | 4 arch/arm64/mm/mmu.c | 2 ++ arch/powerpc/sysdev/dart_iommu.c | 3 +++ arch/s390/kernel/crash_dump.c| 3 +++ arch/s390/kernel/setup.c | 3 +++ arch/sh/boards/mach-ap325rxa/setup.c | 3 +++ arch/sh/boards/mach-ecovec24/setup.c | 6 ++ arch/sh/boards/mach-kfr2r09/setup.c | 3 +++ arch/sh/boards/mach-migor/setup.c| 3 +++ arch/sh/boards/mach-se/7724/setup.c | 6 ++ arch/xtensa/mm/kasan_init.c | 3 +++ include/linux/memblock.h | 7 ++- mm/memblock.c| 5 - 13 files changed, 45 insertions(+), 6 deletions(-) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index b76b90e..15dddfe 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -206,6 +206,10 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align) BUG_ON(!arm_memblock_steal_permitted); phys = memblock_phys_alloc(size, align); + if (!phys) + panic("Failed to steal %pa bytes at %pS\n", + &size, (void *)_RET_IP_); + memblock_free(phys, size); memblock_remove(phys, size); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index b6f5aa5..a74e4be 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -104,6 +104,8 @@ static phys_addr_t __init early_pgtable_alloc(void) void *ptr; phys = memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); + if (!phys) + panic("Failed to allocate page table page\n"); /* * The FIX_{PGD,PUD,PMD} slots may be in active use, but the FIX_PTE diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 25bc25f..b82c9ff 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -265,6 +265,9 @@ static void allocate_dart(void) * prefetching into invalid pages and corrupting data */ tmp = memblock_phys_alloc(DART_PAGE_SIZE, DART_PAGE_SIZE); + if (!tmp) + panic("DART: table allocation failed\n"); + dart_emptyval = DARTMAP_VALID | ((tmp >> DART_PAGE_SHIFT) & DARTMAP_RPNMASK); diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 97eae38..f96a585 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -61,6 +61,9 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) struct save_area *sa; sa = (void *) memblock_phys_alloc(sizeof(*sa), 8); + if (!sa) + panic("Failed to allocate save area\n"); + if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); else diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 72dd23e..da48397 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -968,6 +968,9 @@ static void __init setup_randomness(void) vmms = (struct sysinfo_3_2_2 *) memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE); + if (!vmms) + panic("Failed to allocate memory for sysinfo structure\n"); + if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free((unsigned long) vmms, PAGE_SIZE); diff --git a/arch/sh/boards/mach-ap325rxa/setup.c b/arch/sh/boards/mach-ap325rxa/setup.c index d7ceab6..08a0cc9 100644 --- a/arch/sh/boards/mach-ap325rxa/setup.c +++ b/arch/sh/boards/mach-ap325rxa/setup.c @@ -558,6 +558,9 @@ static void __init ap325rxa_mv_mem_reserve(void) phys_addr_t size = CEU_BUFFER_MEMORY_SIZE; phys = memblock_phys_alloc(size, PAGE_SIZE); + if (!phys) + panic("Failed to allocate CEU memory\n"); + memblock_free(phys, size); memblock_remove(phys, size); diff 
--git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index a3901806..fd264a6 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -1481,11 +1481,17 @@ static void __init ecovec_mv_mem_reserve(void) phys_addr_t size = CEU_BUFFER_MEMORY_SIZE; phys = memblock_phys_alloc(size, PAGE_SIZE); + if (!phys) + panic("Failed to allocate CEU0 memory\n"); + memblock_free(phys, size); memblock_remove(phys, size); ceu0_dma_membase = phys; phys = memblock_phys_alloc(size, PAGE_SIZE); + if (!phys) + panic("Failed to allocate CEU1 memory\
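The memblock.h part of this change is not quoted in full above, but it amounts to turning memblock_phys_alloc() into a thin wrapper. Its likely shape, given the memblock_phys_alloc_range() declaration added elsewhere in the series:

        static inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
                                                      phys_addr_t align)
        {
                /* returns 0 on failure instead of panicking; callers decide */
                return memblock_phys_alloc_range(size, align, 0,
                                                 MEMBLOCK_ALLOC_ACCESSIBLE);
        }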
[Xen-devel] [PATCH v2 12/21] arch: use memblock_alloc() instead of memblock_alloc_from(size, align, 0)
The last parameter of memblock_alloc_from() is the lower limit for the memory allocation. When it is 0, the call is equivalent to memblock_alloc(). Signed-off-by: Mike Rapoport Acked-by: Paul Burton # MIPS part --- arch/alpha/kernel/core_cia.c | 2 +- arch/alpha/kernel/pci_iommu.c | 4 ++-- arch/alpha/kernel/setup.c | 2 +- arch/ia64/kernel/mca.c| 3 +-- arch/mips/kernel/traps.c | 2 +- arch/sparc/kernel/prom_32.c | 2 +- arch/sparc/mm/init_32.c | 2 +- arch/sparc/mm/srmmu.c | 10 +- 8 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c index 867e873..466cd44 100644 --- a/arch/alpha/kernel/core_cia.c +++ b/arch/alpha/kernel/core_cia.c @@ -331,7 +331,7 @@ cia_prepare_tbia_workaround(int window) long i; /* Use minimal 1K map. */ - ppte = memblock_alloc_from(CIA_BROKEN_TBIA_SIZE, 32768, 0); + ppte = memblock_alloc(CIA_BROKEN_TBIA_SIZE, 32768); pte = (virt_to_phys(ppte) >> (PAGE_SHIFT - 1)) | 1; for (i = 0; i < CIA_BROKEN_TBIA_SIZE / sizeof(unsigned long); ++i) diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index aa0f50d..e4cf77b 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -87,13 +87,13 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base, printk("%s: couldn't allocate arena ptes from node %d\n" "falling back to system-wide allocation\n", __func__, nid); - arena->ptes = memblock_alloc_from(mem_size, align, 0); + arena->ptes = memblock_alloc(mem_size, align); } #else /* CONFIG_DISCONTIGMEM */ arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES); - arena->ptes = memblock_alloc_from(mem_size, align, 0); + arena->ptes = memblock_alloc(mem_size, align); #endif /* CONFIG_DISCONTIGMEM */ diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 4b5b1b2..5d4c76a 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -293,7 +293,7 @@ move_initrd(unsigned long mem_limit) unsigned long size; size = initrd_end - initrd_start; - start = memblock_alloc_from(PAGE_ALIGN(size), PAGE_SIZE, 0); + start = memblock_alloc(PAGE_ALIGN(size), PAGE_SIZE); if (!start || __pa(start) + size > mem_limit) { initrd_start = initrd_end = 0; return NULL; diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 91bd1e1..74d148b 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1835,8 +1835,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset, /* Caller prevents this from being called after init */ static void * __ref mca_bootmem(void) { - return memblock_alloc_from(sizeof(struct ia64_mca_cpu), - KERNEL_STACK_SIZE, 0); + return memblock_alloc(sizeof(struct ia64_mca_cpu), KERNEL_STACK_SIZE); } /* Do per-CPU MCA-related initialization. */ diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index c91097f..2bbdee5 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -2291,7 +2291,7 @@ void __init trap_init(void) phys_addr_t ebase_pa; ebase = (unsigned long) - memblock_alloc_from(size, 1 << fls(size), 0); + memblock_alloc(size, 1 << fls(size)); /* * Try to ensure ebase resides in KSeg0 if possible. 
diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c index 42d7f2a..38940af 100644 --- a/arch/sparc/kernel/prom_32.c +++ b/arch/sparc/kernel/prom_32.c @@ -32,7 +32,7 @@ void * __init prom_early_alloc(unsigned long size) { void *ret; - ret = memblock_alloc_from(size, SMP_CACHE_BYTES, 0UL); + ret = memblock_alloc(size, SMP_CACHE_BYTES); if (ret != NULL) memset(ret, 0, size); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index d900952..a8ff298 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -264,7 +264,7 @@ void __init mem_init(void) i = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5); i += 1; sparc_valid_addr_bitmap = (unsigned long *) - memblock_alloc_from(i << 2, SMP_CACHE_BYTES, 0UL); + memblock_alloc(i << 2, SMP_CACHE_BYTES); if (sparc_valid_addr_bitmap == NULL) { prom_printf("mem_init: Cannot alloc valid_addr_bitmap.\n"); diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index b609362..a400ec3 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -303,13 +303,13 @@ static void __init srmmu_nocache_init(void) bitmap_bits = srmmu_nocac
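The equivalence follows from the wrappers in memblock.h, which (roughly, as of this series) both funnel into memblock_alloc_try_nid() and differ only in the lower bound, with MEMBLOCK_LOW_LIMIT defined as 0:

        static inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
        {
                return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
                                              MEMBLOCK_ALLOC_ACCESSIBLE,
                                              NUMA_NO_NODE);
        }

        static inline void *memblock_alloc_from(phys_addr_t size,
                                                phys_addr_t align,
                                                phys_addr_t min_addr)
        {
                return memblock_alloc_try_nid(size, align, min_addr,
                                              MEMBLOCK_ALLOC_ACCESSIBLE,
                                              NUMA_NO_NODE);
        }

        /* hence memblock_alloc_from(size, align, 0) == memblock_alloc(size, align) */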
[Xen-devel] [PATCH v2 13/21] arch: don't memset(0) memory returned by memblock_alloc()
memblock_alloc() already clears the allocated memory, no point in doing it twice. Signed-off-by: Mike Rapoport Acked-by: Geert Uytterhoeven # m68k --- arch/c6x/mm/init.c | 1 - arch/h8300/mm/init.c| 1 - arch/ia64/kernel/mca.c | 2 -- arch/m68k/mm/mcfmmu.c | 1 - arch/microblaze/mm/init.c | 6 ++ arch/sparc/kernel/prom_32.c | 2 -- 6 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index af5ada0..e83c046 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -40,7 +40,6 @@ void __init paging_init(void) empty_zero_page = (unsigned long) memblock_alloc(PAGE_SIZE, PAGE_SIZE); - memset((void *)empty_zero_page, 0, PAGE_SIZE); /* * Set up user data space diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 6519252..a157890 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -68,7 +68,6 @@ void __init paging_init(void) * to a couple of allocated pages. */ empty_zero_page = (unsigned long)memblock_alloc(PAGE_SIZE, PAGE_SIZE); - memset((void *)empty_zero_page, 0, PAGE_SIZE); /* * Set up SFC/DFC registers (user data space). diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 74d148b..370bc34 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -400,8 +400,6 @@ ia64_log_init(int sal_info_type) // set up OS data structures to hold error info IA64_LOG_ALLOCATE(sal_info_type, max_size); - memset(IA64_LOG_CURR_BUFFER(sal_info_type), 0, max_size); - memset(IA64_LOG_NEXT_BUFFER(sal_info_type), 0, max_size); } /* diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 0de4999..492f953 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -44,7 +44,6 @@ void __init paging_init(void) int i; empty_zero_page = (void *) memblock_alloc(PAGE_SIZE, PAGE_SIZE); - memset((void *) empty_zero_page, 0, PAGE_SIZE); pg_dir = swapper_pg_dir; memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 44f4b89..bd1cd4b 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -376,10 +376,8 @@ void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) if (mem_init_done) p = kzalloc(size, mask); - else { + else p = memblock_alloc(size, SMP_CACHE_BYTES); - if (p) - memset(p, 0, size); - } + return p; } diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c index 38940af..e7126ca 100644 --- a/arch/sparc/kernel/prom_32.c +++ b/arch/sparc/kernel/prom_32.c @@ -33,8 +33,6 @@ void * __init prom_early_alloc(unsigned long size) void *ret; ret = memblock_alloc(size, SMP_CACHE_BYTES); - if (ret != NULL) - memset(ret, 0, size); prom_early_allocated += size; -- 2.7.4 ___ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
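The zeroing that makes these memset()s redundant happens inside memblock_alloc_try_nid(), which every memblock_alloc*() virtual-address variant ends up calling; the relevant hunk is quoted earlier in this series:

        ptr = memblock_alloc_internal(size, align, min_addr, max_addr, nid);
        if (ptr)
                memset(ptr, 0, size);   /* callers receive zeroed memory */

        return ptr;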