Re: [PATCH RFC v2 16/27] arm64: mte: Manage tag storage on page allocation

2023-12-07 Thread Hyesoo Yu
Hi.

On Wed, Nov 29, 2023 at 01:33:37PM +, Alexandru Elisei wrote:
> Hi,
> 
> On Wed, Nov 29, 2023 at 06:10:40PM +0900, Hyesoo Yu wrote:
> > On Sun, Nov 19, 2023 at 04:57:10PM +, Alexandru Elisei wrote:
> > > [..]
> > > +static int order_to_num_blocks(int order)
> > > +{
> > > + return max((1 << order) / 32, 1);
> > > +}
> > > [..]
> > > +int reserve_tag_storage(struct page *page, int order, gfp_t gfp)
> > > +{
> > > + unsigned long start_block, end_block;
> > > + struct tag_region *region;
> > > + unsigned long block;
> > > + unsigned long flags;
> > > + unsigned int tries;
> > > + int ret = 0;
> > > +
> > > + VM_WARN_ON_ONCE(!preemptible());
> > > +
> > > + if (page_tag_storage_reserved(page))
> > > + return 0;
> > > +
> > > + /*
> > > +  * __alloc_contig_migrate_range() ignores gfp when allocating the
> > > +  * destination page for migration. Regardless, massage gfp flags and
> > > +  * remove __GFP_TAGGED to avoid recursion in case gfp stops being
> > > +  * ignored.
> > > +  */
> > > + gfp &= ~__GFP_TAGGED;
> > > + if (!(gfp & __GFP_NORETRY))
> > > + gfp |= __GFP_RETRY_MAYFAIL;
> > > +
> > > + ret = tag_storage_find_block(page, &start_block, ®ion);
> > > + ret = tag_storage_find_block(page, &start_block, ®ion);
> > > + if (WARN_ONCE(ret, "Missing tag storage block for pfn 0x%lx", page_to_pfn(page)))
> > > + return 0;
> > > + end_block = start_block + order_to_num_blocks(order) * region->block_size;
> > > +
> > 
> > Hello.
> > 
> > If the page size is 4K, the block size is 2 (8K of tag storage per block), and the order
> > is 6, then we need 2 pages for the tags. However, according to the equation,
> > order_to_num_blocks is 2 and block_size is also 2, so end_block will be incremented by 4.
> > 
> > But we actually only need 8K of tag storage for 256K of data, right?
> > Could you explain order_to_num_blocks * region->block_size in more detail?
> 
> I think you are correct, thank you for pointing it out. The formula should
> probably be something like:
> 
> static int order_to_num_blocks(int order, u32 block_size)
> {
>   int num_tag_pages = max((1 << order) / 32, 1);
> 
>   return DIV_ROUND_UP(num_tag_pages, block_size);
> }
> 
> and that will make end_block = start_block + 2 in your scenario.
> 
> Does that look correct to you?
> 
> Thanks,
> Alex
> 

That looks great!

Thanks,
Regards.
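
As a quick numeric check of the corrected formula, here is a standalone sketch (not part
of the patch) that assumes 4K pages, a block_size counted in 4K pages, and the 32:1
data-to-tag ratio used in the quoted code:

#include <stdio.h>

/* Corrected helper from the discussion: 32 data pages need 1 tag page. */
static int order_to_num_blocks(int order, unsigned int block_size)
{
	int num_tag_pages = (1 << order) / 32;

	if (num_tag_pages < 1)
		num_tag_pages = 1;
	/* DIV_ROUND_UP(num_tag_pages, block_size) */
	return (num_tag_pages + block_size - 1) / block_size;
}

int main(void)
{
	/* Scenario from the thread: order-6 allocation (256K), block_size = 2 pages (8K). */
	int blocks = order_to_num_blocks(6, 2);

	/*
	 * 64 data pages -> 2 tag pages -> 1 block of 2 pages, so
	 * end_block = start_block + blocks * block_size = start_block + 2,
	 * i.e. exactly the 8K of tag storage that is needed.
	 */
	printf("blocks = %d, tag pages reserved = %d\n", blocks, blocks * 2);
	return 0;
}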

> > 
> > Thanks,
> > Regards.
> > 
> > > + mutex_lock(&tag_blocks_lock);
> > > +
> > > + /* Check again, this time with the lock held. */
> > > + if (page_tag_storage_reserved(page))
> > > + goto out_unlock;
> > > +
> > > + /* Make sure existing entries are not freed from out under out feet. */
> > > + xa_lock_irqsave(&tag_blocks_reserved, flags);
> > > + for (block = start_block; block < end_block; block += region->block_size) {
> > > + if (tag_storage_block_is_reserved(block))
> > > + block_ref_add(block, region, order);
> > > + }
> > > + xa_unlock_irqrestore(&tag_blocks_reserved, flags);
> > > +
> > > + for (block = start_block; block < end_block; block += region->block_size) {
> > > + /* Refcount incremented above. */
> > > + if (tag_storage_block_is_reserved(block))
> > > + continue;
> > > +
> > > + tries = 3;
> > > + while (tries--) {
> > > + ret = alloc_contig_range(block, block + region->block_size, MIGRATE_CMA, gfp);
> > > + if (ret == 0 || ret != -EBUSY)
> > > + break;
> > > + }
> > > +
> > > + if (ret)
> > > + goto out_error;
> > > +
> > > + ret = tag_storage_reserve_block(block, region, order);
> > > + if (ret) {
> > > + free_contig_range(block, region->block_size);
> > > + goto out_error;
> > > + }
> > > +
> > > + count_vm_events(CMA_ALLOC_SUCCESS, region->block_size);
> > > + }
> > > +
> > > + page_set_tag_storage_reserved(page, order);
> > > +out_unlock:
> > > + mutex_unlock(&tag_blocks_lock);
> > > +
> > > + return 0

Re: [PATCH RFC v2 15/27] arm64: mte: Check that tag storage blocks are in the same zone

2023-12-07 Thread Hyesoo Yu
Hi~

On Thu, Nov 30, 2023 at 12:00:11PM +, Alexandru Elisei wrote:
> Hi,
> 
> On Wed, Nov 29, 2023 at 05:57:44PM +0900, Hyesoo Yu wrote:
> > On Sun, Nov 19, 2023 at 04:57:09PM +, Alexandru Elisei wrote:
> > > alloc_contig_range() requires that the requested pages are in the same
> > > zone. Check that this is indeed the case before initializing the tag
> > > storage blocks.
> > > 
> > > Signed-off-by: Alexandru Elisei 
> > > ---
> > >  arch/arm64/kernel/mte_tag_storage.c | 33 +
> > >  1 file changed, 33 insertions(+)
> > > 
> > > diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> > > b/arch/arm64/kernel/mte_tag_storage.c
> > > index 8b9bedf7575d..fd63430d4dc0 100644
> > > --- a/arch/arm64/kernel/mte_tag_storage.c
> > > +++ b/arch/arm64/kernel/mte_tag_storage.c
> > > @@ -265,6 +265,35 @@ void __init mte_tag_storage_init(void)
> > >   }
> > >  }
> > >  
> > > +/* alloc_contig_range() requires all pages to be in the same zone. */
> > > +static int __init mte_tag_storage_check_zone(void)
> > > +{
> > > + struct range *tag_range;
> > > + struct zone *zone;
> > > + unsigned long pfn;
> > > + u32 block_size;
> > > + int i, j;
> > > +
> > > + for (i = 0; i < num_tag_regions; i++) {
> > > + block_size = tag_regions[i].block_size;
> > > + if (block_size == 1)
> > > + continue;
> > > +
> > > + tag_range = &tag_regions[i].tag_range;
> > > + for (pfn = tag_range->start; pfn <= tag_range->end; pfn += block_size) {
> > > + zone = page_zone(pfn_to_page(pfn));
> > 
> > Hello.
> > 
> > Since the blocks within the tag_range must all be in the same zone, can we
> > move the "page_zone" out of the loop?
>
> Hmm.. why do you say that the pages in a tag_range must be in the same
> zone? I am not very familiar with how the memory management code puts pages
> into zones, but I would imagine that pages in a tag range straddling the
> 4GB limit (so, let's say, from 3GB to 5GB) will end up in both ZONE_DMA and
> ZONE_NORMAL.
> 
> Thanks,
> Alex
> 

Oh, I see that reserve_tag_storage only calls alloc_contig_range in units of
block_size; I thought it could be called once for the entire range that the page needs.
(Maybe that would be a bit faster, since the unnecessary drain and other
operations would not be repeated.)

If we use the CMA code when activating the tag storage, it will be an error if the
entire tag region is not in the same zone, so there should be a constraint
that it must be in the same zone when defining the tag region in the device tree.

Thanks,
Regards.
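
If that device-tree constraint (one zone per tag region) were adopted, the per-block
check could collapse into a single pass per region. A rough sketch reusing the
tag_regions/num_tag_regions structures from the quoted patch (an assumption, not the
posted code):

static int __init mte_tag_storage_check_zone_single(void)
{
	struct range *tag_range;
	struct zone *zone;
	unsigned long pfn;
	int i;

	for (i = 0; i < num_tag_regions; i++) {
		tag_range = &tag_regions[i].tag_range;
		/* With a one-zone-per-region guarantee, one reference zone suffices. */
		zone = page_zone(pfn_to_page(tag_range->start));
		for (pfn = tag_range->start + 1; pfn <= tag_range->end; pfn++) {
			if (page_zone(pfn_to_page(pfn)) != zone) {
				pr_err("Tag storage region %d spans multiple zones", i);
				return -EINVAL;
			}
		}
	}

	return 0;
}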

> > 
> > Thanks,
> > Regards.
> > 
> > > + for (j = 1; j < block_size; j++) {
> > > + if (page_zone(pfn_to_page(pfn + j)) != zone) {
> > > + pr_err("Tag storage block pages in different zones");
> > > + return -EINVAL;
> > > + }
> > > + }
> > > + }
> > > + }
> > > +
> > > +  return 0;
> > > +}
> > > +
> > >  static int __init mte_tag_storage_activate_regions(void)
> > >  {
> > >   phys_addr_t dram_start, dram_end;
> > > @@ -321,6 +350,10 @@ static int __init 
> > > mte_tag_storage_activate_regions(void)
> > >   goto out_disabled;
> > >   }
> > >  
> > > + ret = mte_tag_storage_check_zone();
> > > + if (ret)
> > > + goto out_disabled;
> > > +
> > >   for (i = 0; i < num_tag_regions; i++) {
> > >   tag_range = &tag_regions[i].tag_range;
> > >   for (pfn = tag_range->start; pfn <= tag_range->end; pfn += 
> > > pageblock_nr_pages)
> > > -- 
> > > 2.42.1
> > > 
> > > 
> 
> 
> 


Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-12-07 Thread Hyesoo Yu
Hi, 

I'm sorry for the late response, I was on vacation.

On Sun, Dec 03, 2023 at 12:14:30PM +, Alexandru Elisei wrote:
> Hi,
> 
> On Wed, Nov 29, 2023 at 05:44:24PM +0900, Hyesoo Yu wrote:
> > Hello.
> > 
> > On Sun, Nov 19, 2023 at 04:57:05PM +, Alexandru Elisei wrote:
> > > Allow the kernel to get the size and location of the MTE tag storage
> > > regions from the DTB. This memory is marked as reserved for now.
> > > 
> > > The DTB node for the tag storage region is defined as:
> > > 
> > > tags0: tag-storage@8f800 {
> > > compatible = "arm,mte-tag-storage";
> > > reg = <0x08 0xf800 0x00 0x400>;
> > > block-size = <0x1000>;
> > > memory = <&memory0>;  // Associated tagged memory node
> > > };
> > >
> > 
> > How about using compatible = "shared-dma-pool" like below ?
> > 
> > &reserved_memory {
> > tags0: tag0@8f800 {
> > compatible = "arm,mte-tag-storage";
> > reg = <0x08 0xf800 0x00 0x400>;
> > };
> > }
> > 
> > tag-storage {
> > compatible = "arm,mte-tag-storage";
> > memory-region = <&tag>;
> > memory = <&memory0>;
> > block-size = <0x1000>;
> > }
> > 
> > And then the activation of CMA would be performed in the CMA code.
> > We can just get the region information from memory-region and allocate from it directly
> > with alloc_contig_range or take_page_off_buddy. It seems like we could remove a
> > lot of code.
>

Sorry, that example was my mistake. Actually, I wanted to write it like this:

&reserved_memory {
tags0: tag0@8f800 {
compatible = "shared-dma-pool";
reg = <0x08 0xf800 0x00 0x400>;
reusable;
};
}

tag-storage {
compatible = "arm,mte-tag-storage";
memory-region = <&tag>;
memory = <&memory0>;
block-size = <0x1000>;
}


> Played with reserved_mem a bit. I don't think that's the correct path
> forward.
> 
> The location of the tag storage is a hardware property, independent of how
> Linux is configured.
> 
> early_init_fdt_scan_reserved_mem() is called from arm64_memblock_init(),
> **after** the kernel enforces an upper address for various reasons. One of
> the reasons can be that it's been compiled with 39 bits VA.
> 

I'm not sure about this part. What is the upper address enforced by the kernel?
Where can I check the code? Do you mean memblock_end_of_DRAM()?

> After early_init_fdt_scan_reserved_mem() returns, the kernel sets the
> maximum address, stored in the variable "high_memory".
>
> What can happen is that tag storage is present at an address above the
> maximum addressable by the kernel, and the CMA code will trigger an
> unrecoverable page fault.
> 
> I was able to trigger this with the dts change:
> 
> diff --git a/arch/arm64/boot/dts/arm/fvp-base-revc.dts 
> b/arch/arm64/boot/dts/arm/fvp-base-revc.dts
> index 60472d65a355..201359d014e4 100644
> --- a/arch/arm64/boot/dts/arm/fvp-base-revc.dts
> +++ b/arch/arm64/boot/dts/arm/fvp-base-revc.dts
> @@ -183,6 +183,13 @@ vram: vram@1800 {
> reg = <0x 0x1800 0 0x0080>;
> no-map;
> };
> +
> +
> +   linux,cma {
> +   compatible = "shared-dma-pool";
> +   reg = <0x100 0x0 0x00 0x400>;
> +   reusable;
> +   };
> };
> 
> gic: interrupt-controller@2f00 {
> 
> And the error I got:
> 
> [0.00] Reserved memory: created CMA memory pool at 
> 0x0100, size 64 MiB
> [0.00] OF: reserved mem: initialized node linux,cma, compatible id 
> shared-dma-pool
> [0.00] OF: reserved mem: 0x0100..0x010003ff 
> (65536 KiB) map reusable linux,cma
> [..]
> [0.793193] WARNING: CPU: 0 PID: 1 at mm/cma.c:111 
> cma_init_reserved_areas+0xa8/0x378
> [..]
> [0.806945] Unable to handle kernel paging request at virtual address 
> 0001fe00
> [0.807277] Mem abort info:
> [0.807277]   ESR = 0x9605
> [0.807693]   EC = 0x25: DABT (current EL), IL = 32 bits
> [0.808110]   SET = 0, FnV = 0
> [0.808443]   EA = 0, S1PTW = 0
> [0.808526]   FSC = 0x0

Re: [PATCH RFC v2 19/27] mm: mprotect: Introduce PAGE_FAULT_ON_ACCESS for mprotect(PROT_MTE)

2023-11-29 Thread Hyesoo Yu
On Sun, Nov 19, 2023 at 04:57:13PM +, Alexandru Elisei wrote:
> To enable tagging on a memory range, userspace can use mprotect() with the
> PROT_MTE access flag. Pages already mapped in the VMA don't have the
> associated tag storage block reserved, so mark the PTEs as
> PAGE_FAULT_ON_ACCESS to trigger a fault next time they are accessed, and
> reserve the tag storage on the fault path.
> 
> This has several benefits over reserving the tag storage as part of the
> mprotect() call handling:
> 
> - Tag storage is reserved only for those pages in the VMA that are
>   accessed, instead of for all the pages already mapped in the VMA.
> - Reduces the latency of the mprotect() call.
> - Eliminates races with page migration.
> 
> But all of this is at the expense of an extra page fault per page until the
> pages being accessed all have their corresponding tag storage reserved.
> 
> For arm64, the PAGE_FAULT_ON_ACCESS protection is created by defining a new
> page table entry software bit, PTE_TAG_STORAGE_NONE. Linux doesn't set any
> of the PBHA bits in entries from the last level of the translation table
> and it doesn't use the TCR_ELx.HWUxx bits; also, the first PBHA bit, bit
> 59, is already being used as a software bit for PMD_PRESENT_INVALID.
> 
> This is only implemented for PTE mappings; PMD mappings will follow.
> 
> Signed-off-by: Alexandru Elisei 
> ---
>  arch/arm64/Kconfig   |   1 +
>  arch/arm64/include/asm/mte.h |   4 +-
>  arch/arm64/include/asm/mte_tag_storage.h |   2 +
>  arch/arm64/include/asm/pgtable-prot.h|   2 +
>  arch/arm64/include/asm/pgtable.h |  40 ++---
>  arch/arm64/kernel/mte.c  |  12 ++-
>  arch/arm64/mm/fault.c| 101 +++
>  include/linux/pgtable.h  |  17 
>  mm/Kconfig   |   3 +
>  mm/memory.c  |   3 +
>  10 files changed, 170 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index efa5b7958169..3b9c435eaafb 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -2066,6 +2066,7 @@ if ARM64_MTE
>  config ARM64_MTE_TAG_STORAGE
>   bool "Dynamic MTE tag storage management"
>   depends on ARCH_KEEP_MEMBLOCK
> + select ARCH_HAS_FAULT_ON_ACCESS
>   select CONFIG_CMA
>   help
> Adds support for dynamic management of the memory used by the hardware
> diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
> index 6457b7899207..70dc2e409070 100644
> --- a/arch/arm64/include/asm/mte.h
> +++ b/arch/arm64/include/asm/mte.h
> @@ -107,7 +107,7 @@ static inline bool try_page_mte_tagging(struct page *page)
>  }
>  
>  void mte_zero_clear_page_tags(void *addr);
> -void mte_sync_tags(pte_t pte, unsigned int nr_pages);
> +void mte_sync_tags(pte_t *pteval, unsigned int nr_pages);
>  void mte_copy_page_tags(void *kto, const void *kfrom);
>  void mte_thread_init_user(void);
>  void mte_thread_switch(struct task_struct *next);
> @@ -139,7 +139,7 @@ static inline bool try_page_mte_tagging(struct page *page)
>  static inline void mte_zero_clear_page_tags(void *addr)
>  {
>  }
> -static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages)
> +static inline void mte_sync_tags(pte_t *pteval, unsigned int nr_pages)
>  {
>  }
>  static inline void mte_copy_page_tags(void *kto, const void *kfrom)
> diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> b/arch/arm64/include/asm/mte_tag_storage.h
> index 6e5d28e607bb..c70ced60a0cd 100644
> --- a/arch/arm64/include/asm/mte_tag_storage.h
> +++ b/arch/arm64/include/asm/mte_tag_storage.h
> @@ -33,6 +33,8 @@ int reserve_tag_storage(struct page *page, int order, gfp_t 
> gfp);
>  void free_tag_storage(struct page *page, int order);
>  
>  bool page_tag_storage_reserved(struct page *page);
> +
> +vm_fault_t handle_page_missing_tag_storage(struct vm_fault *vmf);
>  #else
>  static inline bool tag_storage_enabled(void)
>  {
> diff --git a/arch/arm64/include/asm/pgtable-prot.h 
> b/arch/arm64/include/asm/pgtable-prot.h
> index e9624f6326dd..85ebb3e352ad 100644
> --- a/arch/arm64/include/asm/pgtable-prot.h
> +++ b/arch/arm64/include/asm/pgtable-prot.h
> @@ -19,6 +19,7 @@
>  #define PTE_SPECIAL  (_AT(pteval_t, 1) << 56)
>  #define PTE_DEVMAP   (_AT(pteval_t, 1) << 57)
>  #define PTE_PROT_NONE(_AT(pteval_t, 1) << 58) /* only when 
> !PTE_VALID */
> +#define PTE_TAG_STORAGE_NONE (_AT(pteval_t, 1) << 60) /* only when 
> PTE_PROT_NONE */
>  
>  /*
>   * This bit indicates that the entry is present i.e. pmd_page()
> @@ -94,6 +95,7 @@ extern bool arm64_use_ng_mappings;
>})
>  
>  #define PAGE_NONE__pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | 
> PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
> +#define PAGE_FAULT_ON_ACCESS __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | 
> PTE_PROT_NONE | PTE_TAG_STORAGE_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | 
> PTE
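
For readers following along, here is a greatly simplified illustration of the
fault-on-access flow described in the commit message above. This is not the patch code:
PTE locking, TLB maintenance, huge pages and most error handling are omitted, and the
helper name is made up for the sketch.

static vm_fault_t fault_on_access_sketch(struct vm_fault *vmf)
{
	/* The faulting PTE carries PTE_PROT_NONE | PTE_TAG_STORAGE_NONE. */
	struct page *page = vm_normal_page(vmf->vma, vmf->address, vmf->orig_pte);
	pte_t new_pte;

	/* Reserve the tag storage backing this page; may migrate data out of it. */
	if (!page || reserve_tag_storage(page, 0, GFP_KERNEL))
		return VM_FAULT_OOM;

	/* Rebuild a valid PTE so the retried access proceeds with tags enabled. */
	new_pte = pte_modify(vmf->orig_pte, vmf->vma->vm_page_prot);
	set_pte_at(vmf->vma->vm_mm, vmf->address, vmf->pte, new_pte);
	update_mmu_cache(vmf->vma, vmf->address, vmf->pte);

	return 0;
}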

Re: [PATCH RFC v2 16/27] arm64: mte: Manage tag storage on page allocation

2023-11-29 Thread Hyesoo Yu
On Sun, Nov 19, 2023 at 04:57:10PM +, Alexandru Elisei wrote:
> Reserve tag storage for a tagged page by migrating the contents of the tag
> storage (if in use for data) and removing the tag storage pages from the
> page allocator by calling alloc_contig_range().
> 
> When all the associated tagged pages have been freed, return the tag
> storage pages back to the page allocator, where they can be used again for
> data allocations.
> 
> Tag storage pages cannot be tagged, so disallow allocations from
> MIGRATE_CMA when the allocation is tagged.
> 
> Signed-off-by: Alexandru Elisei 
> ---
>  arch/arm64/include/asm/mte.h |  16 +-
>  arch/arm64/include/asm/mte_tag_storage.h |  45 +
>  arch/arm64/include/asm/pgtable.h |  27 +++
>  arch/arm64/kernel/mte_tag_storage.c  | 241 +++
>  fs/proc/page.c   |   1 +
>  include/linux/kernel-page-flags.h|   1 +
>  include/linux/page-flags.h   |   1 +
>  include/trace/events/mmflags.h   |   3 +-
>  mm/huge_memory.c |   1 +
>  9 files changed, 333 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h
> index 8034695b3dd7..6457b7899207 100644
> --- a/arch/arm64/include/asm/mte.h
> +++ b/arch/arm64/include/asm/mte.h
> @@ -40,12 +40,24 @@ void mte_free_tag_buf(void *buf);
>  #ifdef CONFIG_ARM64_MTE
>  
>  /* track which pages have valid allocation tags */
> -#define PG_mte_taggedPG_arch_2
> +#define PG_mte_taggedPG_arch_2
>  /* simple lock to avoid multiple threads tagging the same page */
> -#define PG_mte_lock  PG_arch_3
> +#define PG_mte_lock  PG_arch_3
> +/* Track if a tagged page has tag storage reserved */
> +#define PG_tag_storage_reserved  PG_arch_4
> +
> +#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> +DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
> +extern bool page_tag_storage_reserved(struct page *page);
> +#endif
>  
>  static inline void set_page_mte_tagged(struct page *page)
>  {
> +#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> + /* Open code mte_tag_storage_enabled() */
> + WARN_ON_ONCE(static_branch_likely(&tag_storage_enabled_key) &&
> +  !page_tag_storage_reserved(page));
> +#endif
>   /*
>* Ensure that the tags written prior to this function are visible
>* before the page flags update.
> diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> b/arch/arm64/include/asm/mte_tag_storage.h
> index 8f86c4f9a7c3..cab033b184ab 100644
> --- a/arch/arm64/include/asm/mte_tag_storage.h
> +++ b/arch/arm64/include/asm/mte_tag_storage.h
> @@ -5,11 +5,56 @@
>  #ifndef __ASM_MTE_TAG_STORAGE_H
>  #define __ASM_MTE_TAG_STORAGE_H
>  
> +#ifndef __ASSEMBLY__
> +
> +#include 
> +
> +#include 
> +
>  #ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> +
> +DECLARE_STATIC_KEY_FALSE(tag_storage_enabled_key);
> +
> +static inline bool tag_storage_enabled(void)
> +{
> + return static_branch_likely(&tag_storage_enabled_key);
> +}
> +
> +static inline bool alloc_requires_tag_storage(gfp_t gfp)
> +{
> + return gfp & __GFP_TAGGED;
> +}
> +
>  void mte_tag_storage_init(void);
> +
> +int reserve_tag_storage(struct page *page, int order, gfp_t gfp);
> +void free_tag_storage(struct page *page, int order);
> +
> +bool page_tag_storage_reserved(struct page *page);
>  #else
> +static inline bool tag_storage_enabled(void)
> +{
> + return false;
> +}
> +static inline bool alloc_requires_tag_storage(struct page *page)
> +{
> + return false;
> +}
>  static inline void mte_tag_storage_init(void)
>  {
>  }
> +static inline int reserve_tag_storage(struct page *page, int order, gfp_t 
> gfp)
> +{
> + return 0;
> +}
> +static inline void free_tag_storage(struct page *page, int order)
> +{
> +}
> +static inline bool page_tag_storage_reserved(struct page *page)
> +{
> + return true;
> +}
>  #endif /* CONFIG_ARM64_MTE_TAG_STORAGE */
> +
> +#endif /* !__ASSEMBLY__ */
>  #endif /* __ASM_MTE_TAG_STORAGE_H  */
> diff --git a/arch/arm64/include/asm/pgtable.h 
> b/arch/arm64/include/asm/pgtable.h
> index cd5dacd1be3a..20e8de853f5d 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -10,6 +10,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1063,6 +1064,32 @@ static inline void arch_swap_restore(swp_entry_t 
> entry, struct folio *folio)
>   mte_restore_page_tags_by_swp_entry(entry, &folio->page);
>  }
>  
> +#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> +
> +#define __HAVE_ARCH_PREP_NEW_PAGE
> +static inline int arch_prep_new_page(struct page *page, int order, gfp_t gfp)
> +{
> + if (tag_storage_enabled() && alloc_requires_tag_storage(gfp))
> + return reserve_tag_storage(page, order, gfp);
> + return 0;
> +}
> +
> +#define __HAVE_ARCH_FREE_PAGES_PREPARE
> +static inline void arch_free_pages_prepare(struct page *page, int order)
> +{
> + if (tag_st

Re: [PATCH RFC v2 15/27] arm64: mte: Check that tag storage blocks are in the same zone

2023-11-29 Thread Hyesoo Yu
On Sun, Nov 19, 2023 at 04:57:09PM +, Alexandru Elisei wrote:
> alloc_contig_range() requires that the requested pages are in the same
> zone. Check that this is indeed the case before initializing the tag
> storage blocks.
> 
> Signed-off-by: Alexandru Elisei 
> ---
>  arch/arm64/kernel/mte_tag_storage.c | 33 +
>  1 file changed, 33 insertions(+)
> 
> diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> b/arch/arm64/kernel/mte_tag_storage.c
> index 8b9bedf7575d..fd63430d4dc0 100644
> --- a/arch/arm64/kernel/mte_tag_storage.c
> +++ b/arch/arm64/kernel/mte_tag_storage.c
> @@ -265,6 +265,35 @@ void __init mte_tag_storage_init(void)
>   }
>  }
>  
> +/* alloc_contig_range() requires all pages to be in the same zone. */
> +static int __init mte_tag_storage_check_zone(void)
> +{
> + struct range *tag_range;
> + struct zone *zone;
> + unsigned long pfn;
> + u32 block_size;
> + int i, j;
> +
> + for (i = 0; i < num_tag_regions; i++) {
> + block_size = tag_regions[i].block_size;
> + if (block_size == 1)
> + continue;
> +
> + tag_range = &tag_regions[i].tag_range;
> + for (pfn = tag_range->start; pfn <= tag_range->end; pfn += block_size) {
> + zone = page_zone(pfn_to_page(pfn));

Hello.

Since the blocks within the tag_range must all be in the same zone, can we move
the "page_zone" out of the loop?

Thanks,
Regards.

> + for (j = 1; j < block_size; j++) {
> + if (page_zone(pfn_to_page(pfn + j)) != zone) {
> + pr_err("Tag storage block pages in different zones");
> + return -EINVAL;
> + }
> + }
> + }
> + }
> +
> +  return 0;
> +}
> +
>  static int __init mte_tag_storage_activate_regions(void)
>  {
>   phys_addr_t dram_start, dram_end;
> @@ -321,6 +350,10 @@ static int __init mte_tag_storage_activate_regions(void)
>   goto out_disabled;
>   }
>  
> + ret = mte_tag_storage_check_zone();
> + if (ret)
> + goto out_disabled;
> +
>   for (i = 0; i < num_tag_regions; i++) {
>   tag_range = &tag_regions[i].tag_range;
>   for (pfn = tag_range->start; pfn <= tag_range->end; pfn += 
> pageblock_nr_pages)
> -- 
> 2.42.1
> 
> 


Re: [PATCH RFC v2 11/27] arm64: mte: Reserve tag storage memory

2023-11-29 Thread Hyesoo Yu
Hello.

On Sun, Nov 19, 2023 at 04:57:05PM +, Alexandru Elisei wrote:
> Allow the kernel to get the size and location of the MTE tag storage
> regions from the DTB. This memory is marked as reserved for now.
> 
> The DTB node for the tag storage region is defined as:
> 
> tags0: tag-storage@8f800 {
> compatible = "arm,mte-tag-storage";
> reg = <0x08 0xf800 0x00 0x400>;
> block-size = <0x1000>;
> memory = <&memory0>;  // Associated tagged memory node
> };
>

How about using compatible = "shared-dma-pool" like below ?

&reserved_memory {
tags0: tag0@8f800 {
compatible = "arm,mte-tag-storage";
reg = <0x08 0xf800 0x00 0x400>;
};
}

tag-storage {
compatible = "arm,mte-tag-storage";
memory-region = <&tag>;
memory = <&memory0>;
block-size = <0x1000>;
}

And then the activation of CMA would be performed in the CMA code.
We can just get the region information from memory-region and allocate from it directly
with alloc_contig_range or take_page_off_buddy. It seems like we could remove a
lot of code.

> The tag storage region represents the largest contiguous memory region that
> holds all the tags for the associated contiguous memory region which can be
> tagged. For example, for a 32GB contiguous tagged memory the corresponding
> tag storage region is 1GB of contiguous memory, not two adjacent 512M of
> tag storage memory.
> 
> "block-size" represents the minimum multiple of 4K of tag storage where all
> the tags stored in the block correspond to a contiguous memory region. This
> is needed for platforms where the memory controller interleaves tag writes
> to memory. For example, if the memory controller interleaves tag writes for
> 256KB of contiguous memory across 8K of tag storage (2-way interleave),
> then the correct value for "block-size" is 0x2000. This value is a hardware
> property, independent of the selected kernel page size.
>

Has this been considered for kernel page sizes like 16K or 64K pages? The description says
it should be a multiple of 4K, but more accurately it should be a multiple of the "page size".
Please let me know if there's anything I misunderstood. :-)
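
One way to make the property page-size agnostic, sketched here as an assumption rather
than the posted code, would be to keep "block-size" in the DT as a byte value (it is a
hardware property) and convert it to kernel pages at parse time:

/*
 * Sketch only: round the byte-based DT "block-size" up to whole kernel pages,
 * so 4K, 16K and 64K page kernels all work with the same DT.
 */
static unsigned long block_size_bytes_to_pages(u32 block_size_bytes)
{
	return DIV_ROUND_UP(block_size_bytes, PAGE_SIZE);
}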


> Signed-off-by: Alexandru Elisei 
> ---
>  arch/arm64/Kconfig   |  12 ++
>  arch/arm64/include/asm/mte_tag_storage.h |  15 ++
>  arch/arm64/kernel/Makefile   |   1 +
>  arch/arm64/kernel/mte_tag_storage.c  | 256 +++
>  arch/arm64/kernel/setup.c|   7 +
>  5 files changed, 291 insertions(+)
>  create mode 100644 arch/arm64/include/asm/mte_tag_storage.h
>  create mode 100644 arch/arm64/kernel/mte_tag_storage.c
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 7b071a00425d..fe8276fdc7a8 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -2062,6 +2062,18 @@ config ARM64_MTE
>  
> Documentation/arch/arm64/memory-tagging-extension.rst.
>  
> +if ARM64_MTE
> +config ARM64_MTE_TAG_STORAGE
> + bool "Dynamic MTE tag storage management"
> + help
> +   Adds support for dynamic management of the memory used by the hardware
> +   for storing MTE tags. This memory, unlike normal memory, cannot be
> +   tagged. When it is used to store tags for another memory location it
> +   cannot be used for any type of allocation.
> +
> +   If unsure, say N
> +endif # ARM64_MTE
> +
>  endmenu # "ARMv8.5 architectural features"
>  
>  menu "ARMv8.7 architectural features"
> diff --git a/arch/arm64/include/asm/mte_tag_storage.h 
> b/arch/arm64/include/asm/mte_tag_storage.h
> new file mode 100644
> index ..8f86c4f9a7c3
> --- /dev/null
> +++ b/arch/arm64/include/asm/mte_tag_storage.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 ARM Ltd.
> + */
> +#ifndef __ASM_MTE_TAG_STORAGE_H
> +#define __ASM_MTE_TAG_STORAGE_H
> +
> +#ifdef CONFIG_ARM64_MTE_TAG_STORAGE
> +void mte_tag_storage_init(void);
> +#else
> +static inline void mte_tag_storage_init(void)
> +{
> +}
> +#endif /* CONFIG_ARM64_MTE_TAG_STORAGE */
> +#endif /* __ASM_MTE_TAG_STORAGE_H  */
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index d95b3d6b471a..5f031bf9f8f1 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -70,6 +70,7 @@ obj-$(CONFIG_CRASH_CORE)+= crash_core.o
>  obj-$(CONFIG_ARM_SDE_INTERFACE)  += sdei.o
>  obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o
>  obj-$(CONFIG_ARM64_MTE)  += mte.o
> +obj-$(CONFIG_ARM64_MTE_TAG_STORAGE)  += mte_tag_storage.o
>  obj-y+= vdso-wrap.o
>  obj-$(CONFIG_COMPAT_VDSO)+= vdso32-wrap.o
>  obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS)  += patch-scs.o
> diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> b/arch/arm64/kernel/mte_tag_storage.c
> new file mo

Re: [PATCH RFC 00/37] Add support for arm64 MTE dynamic tag storage reuse

2023-10-25 Thread Hyesoo Yu
On Wed, Oct 25, 2023 at 09:47:36AM +0100, Alexandru Elisei wrote:
> Hi,
> 
> On Wed, Oct 25, 2023 at 11:59:32AM +0900, Hyesoo Yu wrote:
> > On Wed, Sep 13, 2023 at 04:29:25PM +0100, Catalin Marinas wrote:
> > > On Mon, Sep 11, 2023 at 02:29:03PM +0200, David Hildenbrand wrote:
> > > > On 11.09.23 13:52, Catalin Marinas wrote:
> > > > > On Wed, Sep 06, 2023 at 12:23:21PM +0100, Alexandru Elisei wrote:
> > > > > > On Thu, Aug 24, 2023 at 04:24:30PM +0100, Catalin Marinas wrote:
> > > > > > > On Thu, Aug 24, 2023 at 01:25:41PM +0200, David Hildenbrand wrote:
> > > > > > > > On 24.08.23 13:06, David Hildenbrand wrote:
> > > > > > > > > Regarding one complication: "The kernel needs to know where 
> > > > > > > > > to allocate
> > > > > > > > > a PROT_MTE page from or migrate a current page if it becomes 
> > > > > > > > > PROT_MTE
> > > > > > > > > (mprotect()) and the range it is in does not support 
> > > > > > > > > tagging.",
> > > > > > > > > simplified handling would be if it's in a MIGRATE_CMA 
> > > > > > > > > pageblock, it
> > > > > > > > > doesn't support tagging. You have to migrate to a !CMA page 
> > > > > > > > > (for
> > > > > > > > > example, not specifying GFP_MOVABLE as a quick way to achieve 
> > > > > > > > > that).
> > > > > > > > 
> > > > > > > > Okay, I now realize that this patch set effectively duplicates 
> > > > > > > > some CMA
> > > > > > > > behavior using a new migrate-type.
> > > > > [...]
> > > > > > I considered mixing the tag storage memory with normal
> > > > > > memory and
> > > > > > adding it to MIGRATE_CMA. But since tag storage memory cannot be 
> > > > > > tagged,
> > > > > > this means that it's not enough anymore to have a __GFP_MOVABLE 
> > > > > > allocation
> > > > > > request to use MIGRATE_CMA.
> > > > > > 
> > > > > > I considered two solutions to this problem:
> > > > > > 
> > > > > > 1. Only allocate from MIGRATE_CMA if the requested memory is not
> > > > > > tagged =>
> > > > > > this effectively means transforming all memory from MIGRATE_CMA 
> > > > > > into the
> > > > > > MIGRATE_METADATA migratetype that the series introduces. Not very
> > > > > > appealing, because that means treating normal memory that is also 
> > > > > > on the
> > > > > > MIGRATE_CMA lists as tagged memory.
> > > > > 
> > > > > That's indeed not ideal. We could try this if it makes the patches
> > > > > significantly simpler, though I'm not so sure.
> > > > > 
> > > > > Allocating metadata is the easier part as we know the correspondence
> > > > > from the tagged pages (32 PROT_MTE page) to the metadata page (1 tag
> > > > > storage page), so alloc_contig_range() does this for us. Just adding 
> > > > > it
> > > > > to the CMA range is sufficient.
> > > > > 
> > > > > However, making sure that we don't allocate PROT_MTE pages from the
> > > > > metadata range is what led us to another migrate type. I guess we 
> > > > > could
> > > > > achieve something similar with a new zone or a CPU-less NUMA node,
> > > > 
> > > > Ideally, no significant core-mm changes to optimize for an architecture
> > > > oddity. That implies, no new zones and no new migratetypes -- unless it 
> > > > is
> > > > unavoidable and you are confident that you can convince core-MM people 
> > > > that
> > > > the use case (giving back 3% of system RAM at max in some setups) is 
> > > > worth
> > > > the trouble.
> > > 
> > > If I was an mm maintainer, I'd also question this ;). But vendors seem
> > > pretty picky about the amount of RAM reserved for MTE (e.g. 0.5G for a
> > > 16G platform does look somewhat big). As more and more apps adopt MTE,
> > > the wastage would be smaller but the first step is getting vendors to
> > > enable i

Re: [PATCH RFC 00/37] Add support for arm64 MTE dynamic tag storage reuse

2023-10-24 Thread Hyesoo Yu
On Wed, Sep 13, 2023 at 04:29:25PM +0100, Catalin Marinas wrote:
> On Mon, Sep 11, 2023 at 02:29:03PM +0200, David Hildenbrand wrote:
> > On 11.09.23 13:52, Catalin Marinas wrote:
> > > On Wed, Sep 06, 2023 at 12:23:21PM +0100, Alexandru Elisei wrote:
> > > > On Thu, Aug 24, 2023 at 04:24:30PM +0100, Catalin Marinas wrote:
> > > > > On Thu, Aug 24, 2023 at 01:25:41PM +0200, David Hildenbrand wrote:
> > > > > > On 24.08.23 13:06, David Hildenbrand wrote:
> > > > > > > Regarding one complication: "The kernel needs to know where to 
> > > > > > > allocate
> > > > > > > a PROT_MTE page from or migrate a current page if it becomes 
> > > > > > > PROT_MTE
> > > > > > > (mprotect()) and the range it is in does not support tagging.",
> > > > > > > simplified handling would be if it's in a MIGRATE_CMA pageblock, 
> > > > > > > it
> > > > > > > doesn't support tagging. You have to migrate to a !CMA page (for
> > > > > > > example, not specifying GFP_MOVABLE as a quick way to achieve 
> > > > > > > that).
> > > > > > 
> > > > > > Okay, I now realize that this patch set effectively duplicates some 
> > > > > > CMA
> > > > > > behavior using a new migrate-type.
> > > [...]
> > > > I considered mixing the tag storage memory with normal memory and
> > > > adding it to MIGRATE_CMA. But since tag storage memory cannot be tagged,
> > > > this means that it's not enough anymore to have a __GFP_MOVABLE 
> > > > allocation
> > > > request to use MIGRATE_CMA.
> > > > 
> > > > I considered two solutions to this problem:
> > > > 
> > > > 1. Only allocate from MIGRATE_CMA if the requested memory is not tagged
> > > > =>
> > > > this effectively means transforming all memory from MIGRATE_CMA into the
> > > > MIGRATE_METADATA migratetype that the series introduces. Not very
> > > > appealing, because that means treating normal memory that is also on the
> > > > MIGRATE_CMA lists as tagged memory.
> > > 
> > > That's indeed not ideal. We could try this if it makes the patches
> > > significantly simpler, though I'm not so sure.
> > > 
> > > Allocating metadata is the easier part as we know the correspondence
> > > from the tagged pages (32 PROT_MTE page) to the metadata page (1 tag
> > > storage page), so alloc_contig_range() does this for us. Just adding it
> > > to the CMA range is sufficient.
> > > 
> > > However, making sure that we don't allocate PROT_MTE pages from the
> > > metadata range is what led us to another migrate type. I guess we could
> > > achieve something similar with a new zone or a CPU-less NUMA node,
> > 
> > Ideally, no significant core-mm changes to optimize for an architecture
> > oddity. That implies, no new zones and no new migratetypes -- unless it is
> > unavoidable and you are confident that you can convince core-MM people that
> > the use case (giving back 3% of system RAM at max in some setups) is worth
> > the trouble.
> 
> If I was an mm maintainer, I'd also question this ;). But vendors seem
> pretty picky about the amount of RAM reserved for MTE (e.g. 0.5G for a
> 16G platform does look somewhat big). As more and more apps adopt MTE,
> the wastage would be smaller but the first step is getting vendors to
> enable it.
> 
> > I also had CPU-less NUMA nodes in mind when thinking about that, but not
> > sure how easy it would be to integrate it. If the tag memory has actually
> > different performance characteristics as well, a NUMA node would be the
> > right choice.
> 
> In general I'd expect the same characteristics. However, changing the
> memory designation from tag to data (and vice-versa) requires some cache
> maintenance. The allocation cost is slightly higher (not the runtime
> one), so it would help if the page allocator does not favour this range.
> Anyway, that's an optimisation to worry about later.
> 
> > If we could find some way to easily support this either via CMA or CPU-less
> > NUMA nodes, that would be much preferable; even if we cannot cover each and
> > every future use case right now. I expect some issues with CXL+MTE either
> > way , but are happy to be taught otherwise :)
> 
> I think CXL+MTE is rather theoretical at the moment. Given that PCIe
> doesn't have any notion of MTE, more likely there would be some piece of
> interconnect that generates two memory accesses: one for data and the
> other for tags at a configurable offset (which may or may not be in the
> same CXL range).
> 
> > Another thought I had was adding something like CMA memory characteristics.
> > Like, asking if a given CMA area/page supports tagging (i.e., flag for the
> > CMA area set?)?
> 
> I don't think adding CMA memory characteristics helps much. The metadata
> allocation wouldn't go through cma_alloc() but rather
> alloc_contig_range() directly for a specific pfn corresponding to the
> data pages with PROT_MTE. The core mm code doesn't need to know about
> the tag storage layout.
> 
> It's also unlikely for cma_alloc() memory to be mapped as PROT_MTE.
> That's typically coming from device dr

Re: [PATCH RFC 04/37] mm: Add MIGRATE_METADATA allocation policy

2023-10-23 Thread Hyesoo Yu
On Mon, Oct 16, 2023 at 01:40:39PM +0100, Alexandru Elisei wrote:
> Hello,
> 
> On Thu, Oct 12, 2023 at 10:28:24AM +0900, Hyesoo Yu wrote:
> > On Wed, Aug 23, 2023 at 02:13:17PM +0100, Alexandru Elisei wrote:
> > > Some architectures implement hardware memory coloring to catch incorrect
> > > usage of memory allocation. One such architecture is arm64, which calls 
> > > its
> > > hardware implementation Memory Tagging Extension.
> > > 
> > > So far, the memory which stores the metadata has been configured by
> > > firmware and hidden from Linux. For arm64, it is impossible to to have the
> > > entire system RAM allocated with metadata because executable memory cannot
> > > be tagged. Furthermore, in practice, only a chunk of all the memory that
> > > can have tags is actually used as tagged. which leaves a portion of
> > > metadata memory unused. As such, it would be beneficial to use this 
> > > memory,
> > > which so far has been unaccessible to Linux, to service allocation
> > > requests. To prepare for exposing this metadata memory a new migratetype 
> > > is
> > > being added to the page allocator, called MIGRATE_METADATA.
> > > 
> > > One important aspect is that for arm64 the memory that stores metadata
> > > cannot have metadata associated with it, it can only be used to store
> > > metadata for other pages. This means that the page allocator will *not*
> > > allocate from this migratetype if at least one of the following is true:
> > > 
> > > - The allocation also needs metadata to be allocated.
> > > - The allocation isn't movable. A metadata page storing data must be
> > >   able to be migrated at any given time so it can be repurposed to store
> > >   metadata.
> > > 
> > > Both cases are specific to arm64's implementation of memory metadata.
> > > 
> > > For now, metadata storage pages management is disabled, and it will be
> > > enabled once the architecture-specific handling is added.
> > > 
> > > Signed-off-by: Alexandru Elisei 
> > > ---
> > > [..]
> > > @@ -2144,6 +2156,15 @@ __rmqueue(struct zone *zone, unsigned int order, 
> > > int migratetype,
> > >   if (alloc_flags & ALLOC_CMA)
> > >   page = __rmqueue_cma_fallback(zone, order);
> > >  
> > > + /*
> > > +  * Allocate data pages from MIGRATE_METADATA only if the regular
> > > +  * allocation path fails to increase the chance that the
> > > +  * metadata page is available when the associated data page
> > > +  * needs it.
> > > +  */
> > > + if (!page && (alloc_flags & ALLOC_FROM_METADATA))
> > > + page = __rmqueue_metadata_fallback(zone, order);
> > > +
> > 
> > Hi!
> > 
> > I guess it would cause a non-movable page starvation issue, as with CMA.
> 
> I don't understand what you mean by "non-movable page starving issue as
> CMA". Would you care to elaborate?
> 

Before the patch below, I frequently encountered situations where free CMA memory was
available but the allocation of an unmovable page failed. That patch has improved this
issue. ("mm,page_alloc,cma: conditionally prefer cma pageblocks for movable allocations")
https://lore.kernel.org/linux-mm/20200306150102.3e773...@imladris.surriel.com/

I guess it would be beneficial to add a policy for effectively utilizing the metadata
area as well. I think migration is cheaper than app killing or swap in terms
of performance.

But if the next iteration tries to use only CMA, as discussed on the recent
mailing lists, I think this concern would be fine.

Thanks,
Regards.
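
Paraphrasing the referenced CMA patch from memory (a sketch, not the actual upstream
code), the policy is roughly the following; a similar heuristic could in principle be
applied to MIGRATE_METADATA. The helper name is invented for the sketch, the other
symbols appear in the quoted patch or in mm/page_alloc.c:

/*
 * Sketch: for movable allocations, fall back to the CMA free lists first when
 * they hold more than half of the zone's free pages, so regular pageblocks are
 * preserved for unmovable allocations.
 */
static struct page *rmqueue_prefer_cma(struct zone *zone, unsigned int order,
				       int migratetype, unsigned int alloc_flags)
{
	struct page *page = NULL;

	if (migratetype == MIGRATE_MOVABLE && (alloc_flags & ALLOC_CMA) &&
	    zone_page_state(zone, NR_FREE_CMA_PAGES) >
	    zone_page_state(zone, NR_FREE_PAGES) / 2)
		page = __rmqueue_cma_fallback(zone, order);

	if (!page)
		page = __rmqueue_smallest(zone, order, migratetype);

	return page;
}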

> > The metadata pages cannot be used for non-movable allocations.
> > Metadata pages are utilized poorly, and non-movable allocations may end up
> > getting starved if all regular movable pages are allocated and the only
> > pages left are metadata. If the system has a lot of CMA pages, then
> > this problem becomes even worse. I think it would be better to make
> > use of them in places where performance is not critical, perhaps via some
> > GFP_METADATA flag?
> 
> GFP_METADATA pages must be used only for movable allocations. The kernel
> must be able to migrate GFP_METADATA pages (if they have been allocated)
> when they are reserved to serve as tag storage for a newly allocated tagged
> page.
> 
> If you are referring to the fact that GFP_METADATA pages are allocated only
> when there are no more free pages in the zone, then yes, I can understand

Re: [PATCH RFC 17/37] arm64: mte: Disable dynamic tag storage management if HW KASAN is enabled

2023-10-11 Thread Hyesoo Yu
On Wed, Aug 23, 2023 at 02:13:30PM +0100, Alexandru Elisei wrote:
> Reserving the tag storage associated with a tagged page requires the
> ability to migrate existing data if the tag storage is in use for data.
> 
> The kernel allocates pages, which are now tagged because of HW KASAN, in
> non-preemptible contexts, which can make reserving the associate tag
> storage impossible.
> 
> Don't expose the tag storage pages to the memory allocator if HW KASAN is
> enabled.
> 
> Signed-off-by: Alexandru Elisei 
> ---
>  arch/arm64/kernel/mte_tag_storage.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/arch/arm64/kernel/mte_tag_storage.c 
> b/arch/arm64/kernel/mte_tag_storage.c
> index 4a6bfdf88458..f45128d0244e 100644
> --- a/arch/arm64/kernel/mte_tag_storage.c
> +++ b/arch/arm64/kernel/mte_tag_storage.c
> @@ -314,6 +314,18 @@ static int __init mte_tag_storage_activate_regions(void)
>   return 0;
>   }
>  
> + /*
> +  * The kernel allocates memory in non-preemptible contexts, which makes
> +  * migration impossible when reserving the associated tag storage.
> +  *
> +  * The check is safe to make because KASAN HW tags are enabled before
> +  * the rest of the init functions are called, in smp_prepare_boot_cpu().
> +  */
> + if (kasan_hw_tags_enabled()) {
> + pr_info("KASAN HW tags enabled, disabling tag storage");
> + return 0;
> + }
> +

Hi.

Is there no plan to enable HW KASAN in the current design?
I wonder if dynamic MTE tag storage is only meant to be used for userspace?

Thanks,
Hyesoo Yu.


>   for (i = 0; i < num_tag_regions; i++) {
>   tag_range = &tag_regions[i].tag_range;
>   for (pfn = tag_range->start; pfn <= tag_range->end; pfn += 
> pageblock_nr_pages) {
> -- 
> 2.41.0
> 
> 


Re: [PATCH RFC 04/37] mm: Add MIGRATE_METADATA allocation policy

2023-10-11 Thread Hyesoo Yu
atic inline bool metadata_storage_enabled(void)
> +{
> + return false;
> +}
> +static inline bool alloc_can_use_metadata_pages(gfp_t gfp_mask)
> +{
> + return false;
> +}
> +#endif /* !CONFIG_MEMORY_METADATA */
> +
> +#endif /* __ASM_GENERIC_MEMORY_METADATA_H */
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 5e50b78d58ea..74925806687e 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -61,6 +61,9 @@ enum migratetype {
>*/
>   MIGRATE_CMA,
>  #endif
> +#ifdef CONFIG_MEMORY_METADATA
> + MIGRATE_METADATA,
> +#endif
>  #ifdef CONFIG_MEMORY_ISOLATION
>   MIGRATE_ISOLATE,/* can't allocate from here */
>  #endif
> @@ -78,6 +81,14 @@ extern const char * const migratetype_names[MIGRATE_TYPES];
>  #  define is_migrate_cma_page(_page) false
>  #endif
>  
> +#ifdef CONFIG_MEMORY_METADATA
> +#  define is_migrate_metadata(migratetype) unlikely((migratetype) == 
> MIGRATE_METADATA)
> +#  define is_migrate_metadata_page(_page) (get_pageblock_migratetype(_page) 
> == MIGRATE_METADATA)
> +#else
> +#  define is_migrate_metadata(migratetype) false
> +#  define is_migrate_metadata_page(_page) false
> +#endif
> +
>  static inline bool is_migrate_movable(int mt)
>  {
>   return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 09130434e30d..838193522e20 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1236,6 +1236,9 @@ config LOCK_MM_AND_FIND_VMA
>   bool
>   depends on !STACK_GROWSUP
>  
> +config MEMORY_METADATA
> + bool
> +
>  source "mm/damon/Kconfig"
>  
>  endmenu
> diff --git a/mm/internal.h b/mm/internal.h
> index a7d9e980429a..efd52c9f1578 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -824,6 +824,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone 
> *zone,
>  #define ALLOC_NOFRAGMENT   0x0
>  #endif
>  #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
> +#ifdef CONFIG_MEMORY_METADATA
> +#define ALLOC_FROM_METADATA  0x400 /* allow allocations from 
> MIGRATE_METADATA list */
> +#else
> +#define ALLOC_FROM_METADATA  0x0
> +#endif
>  #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, 
> __GFP_KSWAPD_RECLAIM set */
>  
>  /* Flags that allow allocations below the min watermark. */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fdc230440a44..7baa78abf351 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -53,6 +53,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "internal.h"
>  #include "shuffle.h"
>  #include "page_reporting.h"
> @@ -1645,6 +1646,17 @@ static inline struct page 
> *__rmqueue_cma_fallback(struct zone *zone,
>   unsigned int order) { return NULL; }
>  #endif
>  
> +#ifdef CONFIG_MEMORY_METADATA
> +static __always_inline struct page *__rmqueue_metadata_fallback(struct zone 
> *zone,
> + unsigned int order)
> +{
> + return __rmqueue_smallest(zone, order, MIGRATE_METADATA);
> +}
> +#else
> +static inline struct page *__rmqueue_metadata_fallback(struct zone *zone,
> + unsigned int order) { return NULL; }
> +#endif
> +
>  /*
>   * Move the free pages in a range to the freelist tail of the requested type.
>   * Note that start_page and end_pages are not aligned on a pageblock
> @@ -2144,6 +2156,15 @@ __rmqueue(struct zone *zone, unsigned int order, int 
> migratetype,
>   if (alloc_flags & ALLOC_CMA)
>   page = __rmqueue_cma_fallback(zone, order);
>  
> + /*
> +  * Allocate data pages from MIGRATE_METADATA only if the regular
> +  * allocation path fails to increase the chance that the
> +      * metadata page is available when the associated data page
> +  * needs it.
> +  */
> + if (!page && (alloc_flags & ALLOC_FROM_METADATA))
> + page = __rmqueue_metadata_fallback(zone, order);
> +

Hi!

I guess it would cause a non-movable page starvation issue, as with CMA.
The metadata pages cannot be used for non-movable allocations.
Metadata pages are utilized poorly, and non-movable allocations may end up
getting starved if all regular movable pages are allocated and the only
pages left are metadata. If the system has a lot of CMA pages, then
this problem becomes even worse. I think it would be better to make
use of them in places where performance is not critical, perhaps via some
GFP_METADATA flag?

Thanks,
Hyesoo Yu.

>   if (!page && __rmqueue_fallback(zone, order, migratetype,
>   alloc_flags))
>   goto retry;
> @@ -3088,6 +3109,13 @@ static inline unsigned int 
> gfp_to_alloc_flags_fast(gfp_t gfp_mask,
>   if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
>   alloc_flags |= ALLOC_CMA;
>  #endif
> +#ifdef CONFIG_MEMORY_METADATA
> + if (metadata_storage_enabled() &&
> + gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE &&
> + alloc_can_use_metadata_pages(gfp_mask))
> + alloc_flags |= ALLOC_FROM_METADATA;
> +#endif
> +
>   return alloc_flags;
>  }
>  
> -- 
> 2.41.0
> 
> 


Re: [PATCH RFC 06/37] mm: page_alloc: Allocate from movable pcp lists only if ALLOC_FROM_METADATA

2023-10-11 Thread Hyesoo Yu
On Wed, Aug 23, 2023 at 02:13:19PM +0100, Alexandru Elisei wrote:
> pcp lists keep MIGRATE_METADATA pages on the MIGRATE_MOVABLE list. Make
> sure pages from the movable list are allocated only when the
> ALLOC_FROM_METADATA alloc flag is set, as otherwise the page allocator
> could end up allocating a metadata page when that page cannot be used.
> 
> __alloc_pages_bulk() sidesteps rmqueue() and calls __rmqueue_pcplist()
> directly. Add a check for the flag before calling __rmqueue_pcplist(), and
> fallback to __alloc_pages() if the check is false.
> 
> Note that CMA isn't a problem for __alloc_pages_bulk(): an allocation can
> always use CMA pages if the requested migratetype is MIGRATE_MOVABLE, which
> is not the case with MIGRATE_METADATA pages.
> 
> Signed-off-by: Alexandru Elisei 
> ---
>  mm/page_alloc.c | 21 +
>  1 file changed, 17 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 829134a4dfa8..a693e23c4733 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2845,11 +2845,16 @@ struct page *rmqueue(struct zone *preferred_zone,
>  
>   if (likely(pcp_allowed_order(order))) {
>   /*
> -  * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
> -  * we need to skip it when CMA area isn't allowed.
> +  * PCP lists keep MIGRATE_CMA/MIGRATE_METADATA pages on the same
> +  * movable list. Make sure it's allowed to allocate both type of
> +  * pages before allocating from the movable list.
>*/
> - if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
> - migratetype != MIGRATE_MOVABLE) {
> + bool movable_allowed = (!IS_ENABLED(CONFIG_CMA) ||
> + (alloc_flags & ALLOC_CMA)) &&
> +(!IS_ENABLED(CONFIG_MEMORY_METADATA) ||
> + (alloc_flags & ALLOC_FROM_METADATA));
> +
> + if (migratetype != MIGRATE_MOVABLE || movable_allowed) {

Hi!

I don't think it would be efficient when the majority of movable pages
do not use GFP_TAGGED.

Metadata pages have a low probability of being on the pcp list,
because metadata pages are bypassed when freeing pages.

The allocation performance for most movable pages is likely to decrease
if only requests with ALLOC_FROM_METADATA can be allocated from the pcp list.

How about not putting metadata pages on the pcp lists at all?

Thanks,
Hyesoo Yu.
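
A minimal sketch of the suggestion above (an assumption, not existing code): handle
MIGRATE_METADATA like MIGRATE_ISOLATE on the free path, so such pages never land on the
per-cpu lists and rmqueue_pcplist() does not need to filter them on allocation. The
function name is invented; the other helpers come from the quoted series or
mm/page_alloc.c:

static void free_unref_page_sketch(struct page *page, unsigned int order)
{
	int migratetype = get_pageblock_migratetype(page);

	if (is_migrate_metadata(migratetype) || is_migrate_isolate(migratetype)) {
		/* Bypass the pcp lists and free straight to the buddy allocator. */
		__free_pages_ok(page, order, FPI_NONE);
		return;
	}

	/* ... normal pcp free path ... */
}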

>   page = rmqueue_pcplist(preferred_zone, zone, order,
>   migratetype, alloc_flags);
>   if (likely(page))
> @@ -4388,6 +4393,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int 
> preferred_nid,
>   goto out;
>   gfp = alloc_gfp;
>  
> + /*
> +  * pcp lists puts MIGRATE_METADATA on the MIGRATE_MOVABLE list, don't
> +  * use pcp if allocating metadata pages is not allowed.
> +  */
> + if (metadata_storage_enabled() && ac.migratetype == MIGRATE_MOVABLE &&
> + !(alloc_flags & ALLOC_FROM_METADATA))
> + goto failed;
> +
>   /* Find an allowed local zone that meets the low watermark. */
>   for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, 
> ac.highest_zoneidx, ac.nodemask) {
>   unsigned long mark;
> -- 
> 2.41.0
> 
> 


Re: [PATCH v3 4/4] dma-buf: heaps: add chunk heap to dmabuf heaps

2021-01-19 Thread Hyesoo Yu
On Tue, Jan 19, 2021 at 12:36:40PM -0800, Minchan Kim wrote:
> On Tue, Jan 19, 2021 at 10:29:29AM -0800, John Stultz wrote:
> > On Tue, Jan 12, 2021 at 5:22 PM Minchan Kim  wrote:
> > >
> > > From: Hyesoo Yu 
> > >
> > > This patch supports a chunk heap that allocates buffers that are
> > > arranged into a list of fixed-size chunks taken from CMA.
> > >
> > > The chunk heap driver is bound directly to a reserved_memory
> > > node by following Rob Herring's suggestion in [1].
> > >
> > > [1] 
> > > https://lore.kernel.org/lkml/20191025225009.50305-2-john.stu...@linaro.org/T/#m3dc63acd33fea269a584f43bb799a876f0b2b45d
> > >
> > > Signed-off-by: Hyesoo Yu 
> > > Signed-off-by: Hridya Valsaraju 
> > > Signed-off-by: Minchan Kim 
> > > ---
> > ...
> > > +static int register_chunk_heap(struct chunk_heap *chunk_heap_info)
> > > +{
> > > +   struct dma_heap_export_info exp_info;
> > > +
> > > +   exp_info.name = cma_get_name(chunk_heap_info->cma);
> > 
> > One potential issue here, you're setting the name to the same as the
> > CMA name. Since the CMA heap uses the CMA name, if one chunk was
> > registered as a chunk heap but also was the default CMA area, it might
> > be registered twice. But since both would have the same name it would
> > be an initialization race as to which one "wins".
> 
> Good point. Maybe someone might want to use default CMA area for
> both cma_heap and chunk_heap. I cannot come up with ideas why we
> should prohibit it atm.
> 
> > 
> > So maybe could you postfix the CMA name with "-chunk" or something?
> 
> Hyesoo, Any opinion?
> Unless you have something other idea, let's fix it in next version.
>

I agree with that. It is not good to use the CMA name directly as the heap name.
Let's postfix the name with '-chunk'.

Thanks,
Regards.
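
A minimal sketch of the agreed change (assumed code, not the actual next version of the
patch); chunk_heap_ops and the struct chunk_heap fields are taken from the patch context:

static int register_chunk_heap(struct chunk_heap *chunk_heap_info)
{
	struct dma_heap_export_info exp_info;
	char *name;

	/* Postfix the CMA name so it cannot collide with the cma heap's name. */
	name = kasprintf(GFP_KERNEL, "%s-chunk", cma_get_name(chunk_heap_info->cma));
	if (!name)
		return -ENOMEM;

	exp_info.name = name;
	exp_info.ops = &chunk_heap_ops;
	exp_info.priv = chunk_heap_info;

	chunk_heap_info->heap = dma_heap_add(&exp_info);
	if (IS_ERR(chunk_heap_info->heap)) {
		kfree(name);
		return PTR_ERR(chunk_heap_info->heap);
	}

	return 0;
}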


Re: [PATCH 4/4] dma-heap: Devicetree binding for chunk heap

2020-11-18 Thread Hyesoo Yu
On Wed, Nov 18, 2020 at 07:19:07PM -0800, John Stultz wrote:
> On Wed, Nov 18, 2020 at 5:22 PM Hyesoo Yu  wrote:
> >
> > On Tue, Nov 17, 2020 at 07:00:54PM -0800, John Stultz wrote:
> > > So I suspect Rob will push back on this as he has for other dt
> > > bindings related to ion/dmabuf heaps (I tried to push a similar
> > > solution to exporting multiple CMA areas via dmabuf heaps).
> > >
> > > The proposal he seemed to like best was having an in-kernel function
> > > that a driver would call to initialize the heap (associated with the
> > > CMA region the driver is interested in). Similar to Kunihiko Hayashi's
> > > patch here:
> > >   - 
> > > https://lore.kernel.org/lkml/1594948208-4739-1-git-send-email-hayashi.kunih...@socionext.com/
> > >
> > > The one sticking point for that patch (which I think is a good one),
> > > is that we don't have any in-tree users, so it couldn't be merged yet.
> > >
> > > A similar approach might be good here, but again we probably need to
> > > have at least one in-tree user which could call such a registration
> > > function.
> >
> > Thanks for your review.
> >
> > The chunk heap is not intended for device-specific reserved memory or a
> > specific driver.
> > It is similar to the system heap, but it only collects high-order pages by
> > using a specific cma-area, for performance.
> 
> So, yes I agree, the chunk heap isn't device specific. It's just that
> the CMA regions usually are tied to devices.
> 
> The main objection to this style of solution has been due to the fact
> that the DTS is supposed to describe the physical hardware (in an OS
> agnostic way), rather than define configuration info for Linux
> software drivers.
> 
> Obviously this can be quibbled about, as the normal way of tying
> devices to CMA has some assumptions of what the driver will use that
> region for, rather than somehow representing a physical tie between a
> memory reservation and a device. Nonetheless, Rob has been hesitant to
> take any sort of ION/DmaBuf Heap DT devices, and has been more
> interested in some device having the memory reservation reference and
> the driver for that doing the registration.
> 
> > It seems strange that there would be an in-tree user who registers the chunk heap.
> > (Wouldn't it be strange for some user to register the system heap?)
> 
> Well, as there's no reservation/configuration needed, the system heap
> can register itself.
> 
> The CMA heap currently only registers the default CMA heap, as we
> didn't want to expose all CMA regions and there's otherwise no way to
> pick and choose.
> 
> > Is there a reason to use the dma-heap framework to add a cma-area for a specific
> > device?
> >
> > Even if some in-tree user registers a dma-heap with a cma-area, the buffers
> > could be allocated in user-land and they could be shared among other
> > devices.
> > For exclusive access, I guess, the device doesn't need to register a dma-heap
> > for the cma area.
> >
> 
> It's not really about exclusive access. More just that if you want to
> bind a memory reservation/region (cma or otherwise), at least for DTS,
> it needs to bind with some device in DT.
> 
> Then the device driver can register that region with a heap driver.
> This avoids adding new Linux-specific software bindings to DT. It
> becomes a driver implementation detail instead. The primary user of
> the heap type would probably be a practical pick (ie the display or
> isp driver).
> 
> The other potential solution Rob has suggested is that we create some
> tag for the memory reservation (ie: like we do with cma: "reusable"),
> which can be used to register the region to a heap. But this has the
> problem that each tag has to be well defined and map to a known heap.
> 
> thanks
> -john
>

Thanks for the detailed reply.

I understand exactly what you mean.
I agree with your opinion that we should avoid software bindings in DT.

Registering the heap from a specific device driver creates a dependency
between the heap and whichever device driver we happen to pick.
If that device driver is changed or removed whenever the H/W changes,
the chunk heap is affected regardless of our intentions.

As you said, the other solution of adding tags requires the tags to be well defined.
I guess that will be a long-term solution.

First of all, we just want to register a chunk heap to allocate high-order pages.
I'm going to change to a simple solution that uses the default CMA area, like the cma
heap, without using DT.

Thanks.
Regards.
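
For reference, a minimal sketch of that simpler direction, assuming a
module_init path and a hypothetical chunk_heap_create() helper that is not
part of the posted patches; dev_get_cma_area() is an existing kernel helper:

#include <linux/cma.h>
#include <linux/dma-contiguous.h>
#include <linux/module.h>

/* Hypothetical registration hook; stands in for whatever the chunk heap
 * ends up exporting once the DT binding is dropped.
 */
int chunk_heap_create(struct cma *cma);

static int __init chunk_heap_init(void)
{
	struct cma *default_cma = dev_get_cma_area(NULL);

	if (!default_cma)
		return 0;

	/* Back the chunk heap with the default CMA area, no DT binding needed. */
	return chunk_heap_create(default_cma);
}
module_init(chunk_heap_init);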


Re: [PATCH 3/4] dma-buf: heaps: add chunk heap to dmabuf heaps

2020-11-18 Thread Hyesoo Yu
Hello, Hillf Danton.

On Wed, Nov 18, 2020 at 05:00:13PM +0800, Hillf Danton wrote:
> On Tue, 17 Nov 2020 10:19:34 -0800 Minchan Kim wrote:
> +
> +static int chunk_heap_mmap(struct dma_buf *dmabuf, struct vm_area_struct 
> *vma)
> +{
> + struct chunk_heap_buffer *buffer = dmabuf->priv;
> + struct sg_table *table = &buffer->sg_table;
> + unsigned long addr = vma->vm_start;
> + struct sg_page_iter piter;
> + int ret;
> +
> + for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
> + struct page *page = sg_page_iter_page(&piter);
> +
> + ret = remap_pfn_range(vma, addr, page_to_pfn(page), PAGE_SIZE,
> +   vma->vm_page_prot);
> + if (ret)
> + return ret;
> + addr = PAGE_SIZE;
> 
> Typo?
>   addr += PAGE_SIZE;
> 

Yes, it is a typo. I will change it.

Thanks for your review.
Regards.

> + if (addr >= vma->vm_end)
> + return 0;
> + }
> + return 0;
> +}
> 
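
For reference, the loop with that one-character fix applied would read:

	for_each_sgtable_page(table, &piter, vma->vm_pgoff) {
		struct page *page = sg_page_iter_page(&piter);

		ret = remap_pfn_range(vma, addr, page_to_pfn(page), PAGE_SIZE,
				      vma->vm_page_prot);
		if (ret)
			return ret;
		addr += PAGE_SIZE;	/* was: addr = PAGE_SIZE; */
		if (addr >= vma->vm_end)
			return 0;
	}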


Re: [PATCH 4/4] dma-heap: Devicetree binding for chunk heap

2020-11-18 Thread Hyesoo Yu
On Tue, Nov 17, 2020 at 07:00:54PM -0800, John Stultz wrote:
> On Tue, Nov 17, 2020 at 10:19 AM Minchan Kim  wrote:
> >
> > From: Hyesoo Yu 
> >
> > Document devicetree binding for chunk heap on dma heap framework
> >
> > Signed-off-by: Hyesoo Yu 
> > Signed-off-by: Minchan Kim 
> > ---
> >  .../bindings/dma-buf/chunk_heap.yaml  | 52 +++
> >  1 file changed, 52 insertions(+)
> >  create mode 100644 
> > Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml
> >
> > diff --git a/Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml 
> > b/Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml
> > new file mode 100644
> > index ..f382bee02778
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml
> > @@ -0,0 +1,52 @@
> > +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> > +%YAML 1.2
> > +---
> > +$id: http://devicetree.org/schemas/dma-buf/chunk_heap.yaml#
> > +$schema: http://devicetree.org/meta-schemas/core.yaml#
> > +
> > +title: Device tree binding for chunk heap on DMA HEAP FRAMEWORK
> > +
> > +maintainers:
> > +  - Sumit Semwal 
> > +
> > +description: |
> > +  The chunk heap is backed by the Contiguous Memory Allocator (CMA) and
> > +  allocates the buffers that are made up of a list of fixed size chunks
> > +  taken from CMA. Chunk sizes are configured when the heaps are created.
> > +
> > +properties:
> > +  compatible:
> > +enum:
> > +  - dma_heap,chunk
> > +
> > +  memory-region:
> > +maxItems: 1
> > +
> > +  alignment:
> > +maxItems: 1
> > +
> > +required:
> > +  - compatible
> > +  - memory-region
> > +  - alignment
> > +
> > +additionalProperties: false
> > +
> > +examples:
> > +  - |
> > +reserved-memory {
> > +#address-cells = <2>;
> > +#size-cells = <1>;
> > +
> > +chunk_memory: chunk_memory {
> > +compatible = "shared-dma-pool";
> > +reusable;
> > +size = <0x1000>;
> > +};
> > +};
> > +
> > +chunk_default_heap: chunk_default_heap {
> > +compatible = "dma_heap,chunk";
> > +memory-region = <&chunk_memory>;
> > +alignment = <0x1>;
> > +};
> 
> 
> So I suspect Rob will push back on this as he has for other dt
> bindings related to ion/dmabuf heaps (I tried to push a similar
> solution to exporting multiple CMA areas via dmabuf heaps).
> 
> The proposal he seemed to like best was having an in-kernel function
> that a driver would call to initialize the heap (associated with the
> CMA region the driver is interested in). Similar to Kunihiko Hayashi's
> patch here:
>   - 
> https://lore.kernel.org/lkml/1594948208-4739-1-git-send-email-hayashi.kunih...@socionext.com/
> 
> The one sticking point for that patch (which I think is a good one),
> is that we don't have any in-tree users, so it couldn't be merged yet.
> 
> A similar approach might be good here, but again we probably need to
> have at least one in-tree user which could call such a registration
> function.
> 
> thanks
> -john
>

Thanks for your review.

The chunk heap is not intended for device-specific reserved memory or for a
specific driver. It is similar to the system heap, except that it collects
high-order pages from a specific CMA area for performance.

It seems strange to have an in-tree user register the chunk heap.
(Wouldn't it be just as strange for some user to register the system heap?)

Is there a reason to go through the dma-heap framework to add a CMA area for
a specific device?

Even if some in-tree user registers a dma-heap backed by a CMA area, the
buffers can still be allocated from user-land and shared with other devices.
For exclusive access, I guess, a device doesn't need to register a dma-heap
for its CMA area at all.

Please let me know if I misunderstood what you said. (A rough sketch of the
driver-side registration being discussed follows below, for reference.)

Thanks,
Regards.
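
For reference, a hypothetical sketch of the driver-side registration being
discussed above, where an in-tree user (e.g. a display driver) owns the
memory-region and hands its CMA area to the heap. chunk_heap_create() is made
up for the example; of_reserved_mem_device_init() and dev_get_cma_area() are
existing kernel helpers:

#include <linux/of_reserved_mem.h>
#include <linux/dma-contiguous.h>
#include <linux/platform_device.h>

/* Hypothetical heap-side registration hook. */
int chunk_heap_create(struct cma *cma);

static int example_display_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;
	int ret;

	/* Bind the reserved-memory region described in DT to this device. */
	ret = of_reserved_mem_device_init(dev);
	if (ret)
		return ret;

	/* Hand the device's CMA area over to the chunk heap. */
	return chunk_heap_create(dev_get_cma_area(dev));
}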


Re: [PATCH 3/3] dma-heap: Devicetree binding for chunk heap

2020-08-21 Thread Hyesoo Yu
Hi,

On Tue, Aug 18, 2020 at 10:48:12AM -0600, Rob Herring wrote:
> On Tue, 18 Aug 2020 17:04:15 +0900, Hyesoo Yu wrote:
> > Document devicetree binding for chunk heap on dma heap framework
> > 
> > Signed-off-by: Hyesoo Yu 
> > ---
> >  .../devicetree/bindings/dma-buf/chunk_heap.yaml| 46 
> > ++
> >  1 file changed, 46 insertions(+)
> >  create mode 100644 
> > Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml
> > 
> 
> 
> My bot found errors running 'make dt_binding_check' on your patch:
> 
> /builds/robherring/linux-dt-review/Documentation/devicetree/bindings/dma-buf/chunk_heap.example.dt.yaml:
>  chunk_default_heap: 'alignment', 'memory-region' do not match any of the 
> regexes: 'pinctrl-[0-9]+'
> 
> 
> See https://patchwork.ozlabs.org/patch/1346687
> 
> If you already ran 'make dt_binding_check' and didn't see the above
> error(s), then make sure dt-schema is up to date:
> 
> pip3 install git+https://github.com/devicetree-org/dt-schema.git@master --upgrade
> 
> Please check and re-submit.
> 
> 

Thanks for the reply. I had missed declaring the 'alignment' and
'memory-region' properties in the schema. I added them, re-ran
dt_binding_check, and everything passes now.

I will re-submit the patch as v2 (the added property entries are sketched
below).
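
For reference, and matching what the v2 binding earlier in this thread
declares, the schema gains property entries along these lines:

  memory-region:
    maxItems: 1

  alignment:
    maxItems: 1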

Regards.
Hyesoo Yu.


[PATCH 2/3] dma-buf: heaps: add chunk heap to dmabuf heaps

2020-08-18 Thread Hyesoo Yu
This patch adds support for a chunk heap that allows for buffers
that are made up of a list of fixed size chunks taken from a CMA.
Chunk sizes are configured when the heaps are created.

Signed-off-by: Hyesoo Yu 
---
 drivers/dma-buf/heaps/Kconfig  |   9 ++
 drivers/dma-buf/heaps/Makefile |   1 +
 drivers/dma-buf/heaps/chunk_heap.c | 222 +
 3 files changed, 232 insertions(+)
 create mode 100644 drivers/dma-buf/heaps/chunk_heap.c

diff --git a/drivers/dma-buf/heaps/Kconfig b/drivers/dma-buf/heaps/Kconfig
index a5eef06..98552fa 100644
--- a/drivers/dma-buf/heaps/Kconfig
+++ b/drivers/dma-buf/heaps/Kconfig
@@ -12,3 +12,12 @@ config DMABUF_HEAPS_CMA
  Choose this option to enable dma-buf CMA heap. This heap is backed
  by the Contiguous Memory Allocator (CMA). If your system has these
  regions, you should say Y here.
+
+config DMABUF_HEAPS_CHUNK
+   tristate "DMA-BUF CHUNK Heap"
+   depends on DMABUF_HEAPS && DMA_CMA
+   help
+ Choose this option to enable dma-buf CHUNK heap. This heap is backed
+ by the Contiguous Memory Allocator (CMA) and allocates buffers that
+ are made up of a list of fixed size chunks taken from CMA. Chunk sizes
+ are configured when the heaps are created.
diff --git a/drivers/dma-buf/heaps/Makefile b/drivers/dma-buf/heaps/Makefile
index 6e54cde..3b2a0986 100644
--- a/drivers/dma-buf/heaps/Makefile
+++ b/drivers/dma-buf/heaps/Makefile
@@ -2,3 +2,4 @@
 obj-y  += heap-helpers.o
 obj-$(CONFIG_DMABUF_HEAPS_SYSTEM)  += system_heap.o
 obj-$(CONFIG_DMABUF_HEAPS_CMA) += cma_heap.o
+obj-$(CONFIG_DMABUF_HEAPS_CHUNK)   += chunk_heap.o
diff --git a/drivers/dma-buf/heaps/chunk_heap.c 
b/drivers/dma-buf/heaps/chunk_heap.c
new file mode 100644
index 000..1eefaec
--- /dev/null
+++ b/drivers/dma-buf/heaps/chunk_heap.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ION Memory Allocator chunk heap exporter
+ *
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd.
+ * Author:  for Samsung Electronics.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "heap-helpers.h"
+
+struct chunk_heap {
+   struct dma_heap *heap;
+   phys_addr_t base;
+   phys_addr_t size;
+   atomic_t cur_pageblock_idx;
+   unsigned int max_num_pageblocks;
+   unsigned int order;
+};
+
+static void chunk_heap_free(struct heap_helper_buffer *buffer)
+{
+   struct chunk_heap *chunk_heap = dma_heap_get_drvdata(buffer->heap);
+   pgoff_t pg;
+
+   for (pg = 0; pg < buffer->pagecount; pg++)
+   __free_pages(buffer->pages[pg], chunk_heap->order);
+   kvfree(buffer->pages);
+   kfree(buffer);
+}
+
+static inline unsigned long chunk_get_next_pfn(struct chunk_heap *chunk_heap)
+{
+   unsigned long i = atomic_inc_return(&chunk_heap->cur_pageblock_idx) %
+   chunk_heap->max_num_pageblocks;
+
+   return PHYS_PFN(chunk_heap->base) + i * pageblock_nr_pages;
+}
+
+static int chunk_alloc_pages(struct chunk_heap *chunk_heap, struct page 
**pages,
+unsigned int order, unsigned int count)
+{
+   unsigned long base;
+   unsigned int i = 0, nr_block = 0, nr_elem;
+   int ret;
+
+   while (count) {
+   /*
+* If we have already scanned as many page blocks as the heap
+* contains, the allocation attempt fails.
+*/
+   if (nr_block++ == chunk_heap->max_num_pageblocks) {
+   ret = -ENOMEM;
+   goto err_bulk;
+   }
+   base = chunk_get_next_pfn(chunk_heap);
+   nr_elem = min_t(unsigned int, count, pageblock_nr_pages >> 
order);
+   ret = alloc_pages_bulk(base, base + pageblock_nr_pages, 
MIGRATE_CMA,
+  GFP_KERNEL, order, nr_elem, pages + i);
+   if (ret < 0)
+   goto err_bulk;
+
+   i += ret;
+   count -= ret;
+   }
+
+   return 0;
+
+err_bulk:
+   while (i-- > 0)
+   __free_pages(pages[i], order);
+
+   return ret;
+}
+
+static int chunk_heap_allocate(struct dma_heap *heap, unsigned long len,
+unsigned long fd_flags, unsigned long heap_flags)
+{
+
+   struct chunk_heap *chunk_heap = dma_heap_get_drvdata(heap);
+   struct heap_helper_buffer *helper_buffer;
+   struct dma_buf *dmabuf;
+   unsigned int count = DIV_ROUND_UP(len, PAGE_SIZE << chunk_heap->order);
+   int ret = -ENOMEM;
+
+   helper_buffer = kzalloc(sizeof(*helper_buffer), GFP_KERNEL);
+   if (!helper_buffer)
+   return ret;
+
+   init_heap_

[PATCH 3/3] dma-heap: Devicetree binding for chunk heap

2020-08-18 Thread Hyesoo Yu
Document devicetree binding for chunk heap on dma heap framework

Signed-off-by: Hyesoo Yu 
---
 .../devicetree/bindings/dma-buf/chunk_heap.yaml| 46 ++
 1 file changed, 46 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml

diff --git a/Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml 
b/Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml
new file mode 100644
index 000..1ee8fad
--- /dev/null
+++ b/Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/dma-buf/chunk_heap.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Device tree binding for chunk heap on DMA HEAP FRAMEWORK
+
+maintainers:
+  - Sumit Semwal 
+
+description: |
+  The chunk heap is backed by the Contiguous Memory Allocator (CMA) and
+  allocates the buffers that are made up of a list of fixed size chunks
+  taken from CMA. Chunk sizes are configured when the heaps are created.
+
+properties:
+  compatible:
+enum:
+  - dma_heap,chunk
+
+required:
+  - compatible
+  - memory-region
+  - alignment
+
+additionalProperties: false
+
+examples:
+  - |
+reserved-memory {
+#address-cells = <2>;
+#size-cells = <1>;
+
+chunk_memory: chunk_memory {
+compatible = "shared-dma-pool";
+reusable;
+size = <0x1000>;
+};
+};
+
+chunk_default_heap: chunk_default_heap {
+compatible = "dma_heap,chunk";
+memory-region = <&chunk_memory>;
+alignment = <0x1>;
+};
-- 
2.7.4



[PATCH 0/3] Chunk Heap Support on DMA-HEAP

2020-08-18 Thread Hyesoo Yu
This patch series introduces a new dma heap, the chunk heap.
The heap is needed for special HW that requires bulk allocation of
fixed high-order pages. For example, a 64MB dma-buf is made up of
1024 fixed order-4 pages.

The chunk heap uses alloc_pages_bulk to allocate the high-order pages:
https://lore.kernel.org/linux-mm/20200814173131.2803002-1-minc...@kernel.org

The chunk heap is registered via the device tree with an alignment value and
the memory node of a contiguous memory allocator (CMA) region. The alignment
defines the chunk page size; for example, an alignment of 0x10000 means a
chunk page size of 64KB. The phandle in memory-region points to the CMA node.
If the device node doesn't have a CMA region, registration of the chunk heap fails.
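
As a sketch of the arithmetic implied above (the helper below is
illustrative; the registration path that consumes the DT properties is not
shown in this excerpt, and get_order() is the only real kernel helper used):

#include <linux/mm.h>	/* get_order() */

/*
 * With 4K pages, alignment = 0x10000 gives order 4 (64KB chunks), so a
 * 64MB buffer is split into 64MB / 64KB = 1024 chunks, matching the
 * "order-4 pages * 1024" example above.
 */
static unsigned int chunk_order_from_alignment(unsigned long alignment)
{
	return get_order(alignment);	/* 0x10000 -> 4 when PAGE_SIZE == 4K */
}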

The patchset includes the following:
 - export dma-heap API to register kernel module dma heap.
 - add chunk heap implementation.
 - document of device tree to register chunk heap

Hyesoo Yu (3):
  dma-buf: add missing EXPORT_SYMBOL_GPL() for dma heaps
  dma-buf: heaps: add chunk heap to dmabuf heaps
  dma-heap: Devicetree binding for chunk heap

 .../devicetree/bindings/dma-buf/chunk_heap.yaml|  46 +
 drivers/dma-buf/dma-heap.c |   2 +
 drivers/dma-buf/heaps/Kconfig  |   9 +
 drivers/dma-buf/heaps/Makefile |   1 +
 drivers/dma-buf/heaps/chunk_heap.c | 222 +
 drivers/dma-buf/heaps/heap-helpers.c   |   2 +
 6 files changed, 282 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/dma-buf/chunk_heap.yaml
 create mode 100644 drivers/dma-buf/heaps/chunk_heap.c

-- 
2.7.4



[PATCH 1/3] dma-buf: add missing EXPORT_SYMBOL_GPL() for dma heaps

2020-08-18 Thread Hyesoo Yu
The dma-heap interfaces are called from kernel modules to register dma
heaps; without these exports, building such a heap as a module fails.

Signed-off-by: Hyesoo Yu 
---
 drivers/dma-buf/dma-heap.c   | 2 ++
 drivers/dma-buf/heaps/heap-helpers.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
index afd22c9..cc6339c 100644
--- a/drivers/dma-buf/dma-heap.c
+++ b/drivers/dma-buf/dma-heap.c
@@ -189,6 +189,7 @@ void *dma_heap_get_drvdata(struct dma_heap *heap)
 {
return heap->priv;
 }
+EXPORT_SYMBOL_GPL(dma_heap_get_drvdata);
 
 struct dma_heap *dma_heap_add(const struct dma_heap_export_info *exp_info)
 {
@@ -272,6 +273,7 @@ struct dma_heap *dma_heap_add(const struct 
dma_heap_export_info *exp_info)
kfree(heap);
return err_ret;
 }
+EXPORT_SYMBOL_GPL(dma_heap_add);
 
 static char *dma_heap_devnode(struct device *dev, umode_t *mode)
 {
diff --git a/drivers/dma-buf/heaps/heap-helpers.c 
b/drivers/dma-buf/heaps/heap-helpers.c
index 9f964ca..741bae0 100644
--- a/drivers/dma-buf/heaps/heap-helpers.c
+++ b/drivers/dma-buf/heaps/heap-helpers.c
@@ -24,6 +24,7 @@ void init_heap_helper_buffer(struct heap_helper_buffer 
*buffer,
INIT_LIST_HEAD(&buffer->attachments);
buffer->free = free;
 }
+EXPORT_SYMBOL_GPL(init_heap_helper_buffer);
 
 struct dma_buf *heap_helper_export_dmabuf(struct heap_helper_buffer *buffer,
  int fd_flags)
@@ -37,6 +38,7 @@ struct dma_buf *heap_helper_export_dmabuf(struct 
heap_helper_buffer *buffer,
 
return dma_buf_export(&exp_info);
 }
+EXPORT_SYMBOL_GPL(heap_helper_export_dmabuf);
 
 static void *dma_heap_map_kernel(struct heap_helper_buffer *buffer)
 {
-- 
2.7.4
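
For context, this is roughly what a modular heap does with those exported
symbols. The ops wiring below assumes an allocate callback like
chunk_heap_allocate() from patch 2/3; the heap name and module shape are
made up for the example:

#include <linux/dma-heap.h>
#include <linux/err.h>
#include <linux/module.h>

static const struct dma_heap_ops chunk_heap_ops = {
	.allocate = chunk_heap_allocate,	/* from patch 2/3 */
};

static int __init chunk_heap_module_init(void)
{
	struct dma_heap_export_info exp_info = {
		.name = "chunk",		/* example name */
		.ops = &chunk_heap_ops,
		.priv = NULL,			/* driver-private state would go here */
	};
	struct dma_heap *heap;

	heap = dma_heap_add(&exp_info);		/* needs EXPORT_SYMBOL_GPL() */
	return PTR_ERR_OR_ZERO(heap);
}
module_init(chunk_heap_module_init);
MODULE_LICENSE("GPL v2");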



[PATCH] dma-buf: support to walk the list of dmabuf for debug

2020-06-12 Thread Hyesoo Yu
Let's add a debugging helper that shows detailed exporter
information. Exporters don't need to maintain their own
lists for debugging, because the dma-buf framework already
tracks every dmabuf.

The helper walks the dmabuf list and prints the per-buffer
details through a callback supplied by the exporter.

This makes it easy to expose exporter-specific details.
For example, ION may show the buffer flags, the heap name,
or the name of the process that requested the allocation.

Change-Id: I670f04dda4a0870081e1b0fd96b9185b48b9dd15
Signed-off-by: Hyesoo Yu 
---
 drivers/dma-buf/dma-buf.c | 30 ++
 include/linux/dma-buf.h   |  2 ++
 2 files changed, 32 insertions(+)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 01ce125f8e8d..002bd3ac636e 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -1254,6 +1254,36 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr)
 }
 EXPORT_SYMBOL_GPL(dma_buf_vunmap);
 
+int dma_buf_exp_show(struct seq_file *s,
+int (*it)(struct seq_file *s, struct dma_buf *dmabuf))
+{
+   int ret;
+   struct dma_buf *buf_obj;
+
+   ret = mutex_lock_interruptible(&db_list.lock);
+   if (ret)
+   return ret;
+
+   list_for_each_entry(buf_obj, &db_list.head, list_node) {
+   ret = mutex_lock_interruptible(&buf_obj->lock);
+   if (ret) {
+   seq_puts(s,
+"\tERROR locking buffer object: skipping\n");
+   continue;
+   }
+
+   ret = it(s, buf_obj);
+   mutex_unlock(&buf_obj->lock);
+   if (ret)
+   break;
+   }
+   mutex_unlock(&db_list.lock);
+
+   return 0;
+
+}
+EXPORT_SYMBOL_GPL(dma_buf_exp_show);
+
 #ifdef CONFIG_DEBUG_FS
 static int dma_buf_debug_show(struct seq_file *s, void *unused)
 {
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index ab0c156abee6..b5c0a10b4eb3 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -502,4 +502,6 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *,
 unsigned long);
 void *dma_buf_vmap(struct dma_buf *);
 void dma_buf_vunmap(struct dma_buf *, void *vaddr);
+int dma_buf_exp_show(struct seq_file *s,
+int (*it)(struct seq_file *s, struct dma_buf *dmabuf));
 #endif /* __DMA_BUF_H__ */
-- 
2.27.0
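
For illustration, an exporter could hook this into its own debugfs file
roughly as follows; the callback body and the names are made up, only the
dma_buf_exp_show() signature comes from this patch:

#include <linux/dma-buf.h>
#include <linux/seq_file.h>

/* Per-buffer printer supplied by the exporter (example only). */
static int my_exp_show_one(struct seq_file *s, struct dma_buf *dmabuf)
{
	seq_printf(s, "%-16s %zu\n", dmabuf->exp_name, dmabuf->size);
	return 0;
}

/* debugfs "show" callback: walk every dmabuf via the new helper. */
static int my_exp_debug_show(struct seq_file *s, void *unused)
{
	return dma_buf_exp_show(s, my_exp_show_one);
}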



[PATCH] dma-buf: support to walk the list of dmabuf for debug

2020-06-11 Thread Hyesoo Yu
Let's add a debugging helper that shows detailed exporter
information. Exporters don't need to maintain their own
lists for debugging, because the dma-buf framework already
tracks every dmabuf.

The helper walks the dmabuf list and prints the per-buffer
details through a callback supplied by the exporter.

This makes it easy to expose exporter-specific details.
For example, ION may show the buffer flags, the heap name,
or the name of the process that requested the allocation.

Change-Id: I670f04dda4a0870081e1b0fd96b9185b48b9dd15
Signed-off-by: Hyesoo Yu 
---
 drivers/dma-buf/dma-buf.c | 30 ++
 include/linux/dma-buf.h   |  2 ++
 2 files changed, 32 insertions(+)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 01ce125f8e8d..002bd3ac636e 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -1254,6 +1254,36 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, void *vaddr)
 }
 EXPORT_SYMBOL_GPL(dma_buf_vunmap);
 
+int dma_buf_exp_show(struct seq_file *s,
+int (*it)(struct seq_file *s, struct dma_buf *dmabuf))
+{
+   int ret;
+   struct dma_buf *buf_obj;
+
+   ret = mutex_lock_interruptible(&db_list.lock);
+   if (ret)
+   return ret;
+
+   list_for_each_entry(buf_obj, &db_list.head, list_node) {
+   ret = mutex_lock_interruptible(&buf_obj->lock);
+   if (ret) {
+   seq_puts(s,
+"\tERROR locking buffer object: skipping\n");
+   continue;
+   }
+
+   ret = it(s, buf_obj);
+   mutex_unlock(&buf_obj->lock);
+   if (ret)
+   break;
+   }
+   mutex_unlock(&db_list.lock);
+
+   return 0;
+
+}
+EXPORT_SYMBOL_GPL(dma_buf_exp_show);
+
 #ifdef CONFIG_DEBUG_FS
 static int dma_buf_debug_show(struct seq_file *s, void *unused)
 {
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index ab0c156abee6..b5c0a10b4eb3 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -502,4 +502,6 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *,
 unsigned long);
 void *dma_buf_vmap(struct dma_buf *);
 void dma_buf_vunmap(struct dma_buf *, void *vaddr);
+int dma_buf_exp_show(struct seq_file *s,
+int (*it)(struct seq_file *s, struct dma_buf *dmabuf));
 #endif /* __DMA_BUF_H__ */
-- 
2.27.0