Re: [PATCH v5 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.

2021-03-22 Thread Yang Shi
On Sun, Mar 21, 2021 at 7:11 PM Zi Yan  wrote:
>
> On 19 Mar 2021, at 19:37, Yang Shi wrote:
>
> > On Thu, Mar 18, 2021 at 5:52 PM Zi Yan  wrote:
> >>
> >> From: Zi Yan 
> >>
> >> We did not have a direct user interface of splitting the compound page
> >> backing a THP and there is no need unless we want to expose the THP
> >> implementation details to users. Make /split_huge_pages accept
> >> a new command to do that.
> >>
> >> By writing ",," to
> >> /split_huge_pages, THPs within the given virtual address range
> >> from the process with the given pid are split. It is used to test
> >> split_huge_page function. In addition, a selftest program is added to
> >> tools/testing/selftests/vm to utilize the interface by splitting
> >> PMD THPs and PTE-mapped THPs.
> >>
> >> This does not change the old behavior, i.e., writing 1 to the interface
> >> to split all THPs in the system.
> >>
> >> Changelog:
> >>
> >> From v5:
> >> 1. Skipped special VMAs and other fixes. (suggested by Yang Shi)
> >
> > Looks good to me. Reviewed-by: Yang Shi 
> >
> > Some nits below:
> >
> >>
> >> From v4:
> >> 1. Fixed the error code return issue, spotted by kernel test robot
> >>.
> >>
> >> From v3:
> >> 1. Factored out split huge pages in the given pid code to a separate
> >>function.
> >> 2. Added the missing put_page for not split pages.
> >> 3. pr_debug -> pr_info, make reading results simpler.
> >>
> >> From v2:
> >> 1. Reused existing /split_huge_pages interface. (suggested by
> >>Yang Shi)
> >>
> >> From v1:
> >> 1. Removed unnecessary calling to vma_migratable, spotted by kernel test
> >>robot .
> >> 2. Dropped the use of find_mm_struct and code it directly, since there
> >>is no need for the permission check in that function and the function
> >>is only available when migration is on.
> >> 3. Added some comments in the selftest program to clarify how PTE-mapped
> >>THPs are formed.
> >>
> >> Signed-off-by: Zi Yan 
> >> ---
> >>  mm/huge_memory.c  | 143 +++-
> >>  tools/testing/selftests/vm/.gitignore |   1 +
> >>  tools/testing/selftests/vm/Makefile   |   1 +
> >>  .../selftests/vm/split_huge_page_test.c   | 318 ++
> >>  4 files changed, 456 insertions(+), 7 deletions(-)
> >>  create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
> >>
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index bff92dea5ab3..9bf9bc489228 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -7,6 +7,7 @@
> >>
> >>  #include 
> >>  #include 
> >> +#include 
> >>  #include 
> >>  #include 
> >>  #include 
> >> @@ -2922,16 +2923,14 @@ static struct shrinker deferred_split_shrinker = {
> >>  };
> >>
> >>  #ifdef CONFIG_DEBUG_FS
> >> -static int split_huge_pages_set(void *data, u64 val)
> >> +static void split_huge_pages_all(void)
> >>  {
> >> struct zone *zone;
> >> struct page *page;
> >> unsigned long pfn, max_zone_pfn;
> >> unsigned long total = 0, split = 0;
> >>
> >> -   if (val != 1)
> >> -   return -EINVAL;
> >> -
> >> +   pr_info("Split all THPs\n");
> >> for_each_populated_zone(zone) {
> >> max_zone_pfn = zone_end_pfn(zone);
> >> for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; 
> >> pfn++) {
> >> @@ -2959,11 +2958,141 @@ static int split_huge_pages_set(void *data, u64 
> >> val)
> >> }
> >>
> >> pr_info("%lu of %lu THP split\n", split, total);
> >> +}
> >>
> >> -   return 0;
> >> +static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> >> +   unsigned long vaddr_end)
> >> +{
> >> +   int ret = 0;
> >> +   struct task_struct *task;
> >> +   struct mm_struct *mm;
> >> +   unsigned long total = 0, split = 0;
> >> +   unsigned long addr;
> >> +
> >> +   vaddr_start &= PAGE_MASK;
> >> +   vaddr_end &= PAGE_MASK;
> >> +
> >> +   /* Find the task_struct from pid */
> >> +   rcu_read_lock();
> >> +   task = find_task_by_vpid(pid);
> >> +   if (!task) {
> >> +   rcu_read_unlock();
> >> +   ret = -ESRCH;
> >> +   goto out;
> >> +   }
> >> +   get_task_struct(task);
> >> +   rcu_read_unlock();
> >> +
> >> +   /* Find the mm_struct */
> >> +   mm = get_task_mm(task);
> >> +   put_task_struct(task);
> >> +
> >> +   if (!mm) {
> >> +   ret = -EINVAL;
> >> +   goto out;
> >> +   }
> >> +
> >> +   pr_info("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
> >> +pid, vaddr_start, vaddr_end);
> >> +
> >> +   mmap_read_lock(mm);
> >> +   /*
> >> +* always increase addr by PAGE_SIZE, since we could have a PTE 
> >> page
> >> +* table filled with PTE-mapped THPs, each of which is distinct.
> >> +*/
> >> +   for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
> 

Re: [PATCH v5 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.

2021-03-21 Thread Zi Yan
On 19 Mar 2021, at 19:37, Yang Shi wrote:

> On Thu, Mar 18, 2021 at 5:52 PM Zi Yan  wrote:
>>
>> From: Zi Yan 
>>
>> We did not have a direct user interface of splitting the compound page
>> backing a THP and there is no need unless we want to expose the THP
>> implementation details to users. Make /split_huge_pages accept
>> a new command to do that.
>>
>> By writing ",," to
>> /split_huge_pages, THPs within the given virtual address range
>> from the process with the given pid are split. It is used to test
>> split_huge_page function. In addition, a selftest program is added to
>> tools/testing/selftests/vm to utilize the interface by splitting
>> PMD THPs and PTE-mapped THPs.
>>
>> This does not change the old behavior, i.e., writing 1 to the interface
>> to split all THPs in the system.
>>
>> Changelog:
>>
>> From v5:
>> 1. Skipped special VMAs and other fixes. (suggested by Yang Shi)
>
> Looks good to me. Reviewed-by: Yang Shi 
>
> Some nits below:
>
>>
>> From v4:
>> 1. Fixed the error code return issue, spotted by kernel test robot
>>.
>>
>> From v3:
>> 1. Factored out split huge pages in the given pid code to a separate
>>function.
>> 2. Added the missing put_page for not split pages.
>> 3. pr_debug -> pr_info, make reading results simpler.
>>
>> From v2:
>> 1. Reused existing /split_huge_pages interface. (suggested by
>>Yang Shi)
>>
>> From v1:
>> 1. Removed unnecessary calling to vma_migratable, spotted by kernel test
>>robot .
>> 2. Dropped the use of find_mm_struct and code it directly, since there
>>is no need for the permission check in that function and the function
>>is only available when migration is on.
>> 3. Added some comments in the selftest program to clarify how PTE-mapped
>>THPs are formed.
>>
>> Signed-off-by: Zi Yan 
>> ---
>>  mm/huge_memory.c  | 143 +++-
>>  tools/testing/selftests/vm/.gitignore |   1 +
>>  tools/testing/selftests/vm/Makefile   |   1 +
>>  .../selftests/vm/split_huge_page_test.c   | 318 ++
>>  4 files changed, 456 insertions(+), 7 deletions(-)
>>  create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index bff92dea5ab3..9bf9bc489228 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -7,6 +7,7 @@
>>
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -2922,16 +2923,14 @@ static struct shrinker deferred_split_shrinker = {
>>  };
>>
>>  #ifdef CONFIG_DEBUG_FS
>> -static int split_huge_pages_set(void *data, u64 val)
>> +static void split_huge_pages_all(void)
>>  {
>> struct zone *zone;
>> struct page *page;
>> unsigned long pfn, max_zone_pfn;
>> unsigned long total = 0, split = 0;
>>
>> -   if (val != 1)
>> -   return -EINVAL;
>> -
>> +   pr_info("Split all THPs\n");
>> for_each_populated_zone(zone) {
>> max_zone_pfn = zone_end_pfn(zone);
>> for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
>> @@ -2959,11 +2958,141 @@ static int split_huge_pages_set(void *data, u64 val)
>> }
>>
>> pr_info("%lu of %lu THP split\n", split, total);
>> +}
>>
>> -   return 0;
>> +static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
>> +   unsigned long vaddr_end)
>> +{
>> +   int ret = 0;
>> +   struct task_struct *task;
>> +   struct mm_struct *mm;
>> +   unsigned long total = 0, split = 0;
>> +   unsigned long addr;
>> +
>> +   vaddr_start &= PAGE_MASK;
>> +   vaddr_end &= PAGE_MASK;
>> +
>> +   /* Find the task_struct from pid */
>> +   rcu_read_lock();
>> +   task = find_task_by_vpid(pid);
>> +   if (!task) {
>> +   rcu_read_unlock();
>> +   ret = -ESRCH;
>> +   goto out;
>> +   }
>> +   get_task_struct(task);
>> +   rcu_read_unlock();
>> +
>> +   /* Find the mm_struct */
>> +   mm = get_task_mm(task);
>> +   put_task_struct(task);
>> +
>> +   if (!mm) {
>> +   ret = -EINVAL;
>> +   goto out;
>> +   }
>> +
>> +   pr_info("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
>> +pid, vaddr_start, vaddr_end);
>> +
>> +   mmap_read_lock(mm);
>> +   /*
>> +* always increase addr by PAGE_SIZE, since we could have a PTE page
>> +* table filled with PTE-mapped THPs, each of which is distinct.
>> +*/
>> +   for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
>> +   struct vm_area_struct *vma = find_vma(mm, addr);
>> +   unsigned int follflags;
>> +   struct page *page;
>> +
>> +   if (!vma || addr < vma->vm_start)
>> +   break;
>> +
>> +   /* skip special VMA and hugetlb VMA */
>> +   if (vma_is_special_huge(vma) 

Re: [PATCH v5 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.

2021-03-19 Thread Yang Shi
On Thu, Mar 18, 2021 at 5:52 PM Zi Yan  wrote:
>
> From: Zi Yan 
>
> We did not have a direct user interface of splitting the compound page
> backing a THP and there is no need unless we want to expose the THP
> implementation details to users. Make /split_huge_pages accept
> a new command to do that.
>
> By writing ",," to
> /split_huge_pages, THPs within the given virtual address range
> from the process with the given pid are split. It is used to test
> split_huge_page function. In addition, a selftest program is added to
> tools/testing/selftests/vm to utilize the interface by splitting
> PMD THPs and PTE-mapped THPs.
>
> This does not change the old behavior, i.e., writing 1 to the interface
> to split all THPs in the system.
>
> Changelog:
>
> From v5:
> 1. Skipped special VMAs and other fixes. (suggested by Yang Shi)

Looks good to me. Reviewed-by: Yang Shi 

Some nits below:

>
> From v4:
> 1. Fixed the error code return issue, spotted by kernel test robot
>.
>
> From v3:
> 1. Factored out split huge pages in the given pid code to a separate
>function.
> 2. Added the missing put_page for not split pages.
> 3. pr_debug -> pr_info, make reading results simpler.
>
> From v2:
> 1. Reused existing /split_huge_pages interface. (suggested by
>Yang Shi)
>
> From v1:
> 1. Removed unnecessary calling to vma_migratable, spotted by kernel test
>robot .
> 2. Dropped the use of find_mm_struct and code it directly, since there
>is no need for the permission check in that function and the function
>is only available when migration is on.
> 3. Added some comments in the selftest program to clarify how PTE-mapped
>THPs are formed.
>
> Signed-off-by: Zi Yan 
> ---
>  mm/huge_memory.c  | 143 +++-
>  tools/testing/selftests/vm/.gitignore |   1 +
>  tools/testing/selftests/vm/Makefile   |   1 +
>  .../selftests/vm/split_huge_page_test.c   | 318 ++
>  4 files changed, 456 insertions(+), 7 deletions(-)
>  create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index bff92dea5ab3..9bf9bc489228 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -7,6 +7,7 @@
>
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -2922,16 +2923,14 @@ static struct shrinker deferred_split_shrinker = {
>  };
>
>  #ifdef CONFIG_DEBUG_FS
> -static int split_huge_pages_set(void *data, u64 val)
> +static void split_huge_pages_all(void)
>  {
> struct zone *zone;
> struct page *page;
> unsigned long pfn, max_zone_pfn;
> unsigned long total = 0, split = 0;
>
> -   if (val != 1)
> -   return -EINVAL;
> -
> +   pr_info("Split all THPs\n");
> for_each_populated_zone(zone) {
> max_zone_pfn = zone_end_pfn(zone);
> for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
> @@ -2959,11 +2958,141 @@ static int split_huge_pages_set(void *data, u64 val)
> }
>
> pr_info("%lu of %lu THP split\n", split, total);
> +}
>
> -   return 0;
> +static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> +   unsigned long vaddr_end)
> +{
> +   int ret = 0;
> +   struct task_struct *task;
> +   struct mm_struct *mm;
> +   unsigned long total = 0, split = 0;
> +   unsigned long addr;
> +
> +   vaddr_start &= PAGE_MASK;
> +   vaddr_end &= PAGE_MASK;
> +
> +   /* Find the task_struct from pid */
> +   rcu_read_lock();
> +   task = find_task_by_vpid(pid);
> +   if (!task) {
> +   rcu_read_unlock();
> +   ret = -ESRCH;
> +   goto out;
> +   }
> +   get_task_struct(task);
> +   rcu_read_unlock();
> +
> +   /* Find the mm_struct */
> +   mm = get_task_mm(task);
> +   put_task_struct(task);
> +
> +   if (!mm) {
> +   ret = -EINVAL;
> +   goto out;
> +   }
> +
> +   pr_info("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
> +pid, vaddr_start, vaddr_end);
> +
> +   mmap_read_lock(mm);
> +   /*
> +* always increase addr by PAGE_SIZE, since we could have a PTE page
> +* table filled with PTE-mapped THPs, each of which is distinct.
> +*/
> +   for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
> +   struct vm_area_struct *vma = find_vma(mm, addr);
> +   unsigned int follflags;
> +   struct page *page;
> +
> +   if (!vma || addr < vma->vm_start)
> +   break;
> +
> +   /* skip special VMA and hugetlb VMA */
> +   if (vma_is_special_huge(vma) || is_vm_hugetlb_page(vma)) {

VM_IO vma should be skipped as well. And you may extract this into a helper?

> +   addr = vma->vm_end;
> +   continue;
> +   

[PATCH v5 1/2] mm: huge_memory: a new debugfs interface for splitting THP tests.

2021-03-18 Thread Zi Yan
From: Zi Yan 

We did not have a direct user interface of splitting the compound page
backing a THP and there is no need unless we want to expose the THP
implementation details to users. Make /split_huge_pages accept
a new command to do that.

By writing ",," to
/split_huge_pages, THPs within the given virtual address range
from the process with the given pid are split. It is used to test
split_huge_page function. In addition, a selftest program is added to
tools/testing/selftests/vm to utilize the interface by splitting
PMD THPs and PTE-mapped THPs.

This does not change the old behavior, i.e., writing 1 to the interface
to split all THPs in the system.

Changelog:

>From v5:
1. Skipped special VMAs and other fixes. (suggested by Yang Shi)

>From v4:
1. Fixed the error code return issue, spotted by kernel test robot
   .

>From v3:
1. Factored out split huge pages in the given pid code to a separate
   function.
2. Added the missing put_page for not split pages.
3. pr_debug -> pr_info, make reading results simpler.

>From v2:
1. Reused existing /split_huge_pages interface. (suggested by
   Yang Shi)

>From v1:
1. Removed unnecessary calling to vma_migratable, spotted by kernel test
   robot .
2. Dropped the use of find_mm_struct and code it directly, since there
   is no need for the permission check in that function and the function
   is only available when migration is on.
3. Added some comments in the selftest program to clarify how PTE-mapped
   THPs are formed.

Signed-off-by: Zi Yan 
---
 mm/huge_memory.c  | 143 +++-
 tools/testing/selftests/vm/.gitignore |   1 +
 tools/testing/selftests/vm/Makefile   |   1 +
 .../selftests/vm/split_huge_page_test.c   | 318 ++
 4 files changed, 456 insertions(+), 7 deletions(-)
 create mode 100644 tools/testing/selftests/vm/split_huge_page_test.c

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bff92dea5ab3..9bf9bc489228 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -7,6 +7,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2922,16 +2923,14 @@ static struct shrinker deferred_split_shrinker = {
 };
 
 #ifdef CONFIG_DEBUG_FS
-static int split_huge_pages_set(void *data, u64 val)
+static void split_huge_pages_all(void)
 {
struct zone *zone;
struct page *page;
unsigned long pfn, max_zone_pfn;
unsigned long total = 0, split = 0;
 
-   if (val != 1)
-   return -EINVAL;
-
+   pr_info("Split all THPs\n");
for_each_populated_zone(zone) {
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
@@ -2959,11 +2958,141 @@ static int split_huge_pages_set(void *data, u64 val)
}
 
pr_info("%lu of %lu THP split\n", split, total);
+}
 
-   return 0;
+static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
+   unsigned long vaddr_end)
+{
+   int ret = 0;
+   struct task_struct *task;
+   struct mm_struct *mm;
+   unsigned long total = 0, split = 0;
+   unsigned long addr;
+
+   vaddr_start &= PAGE_MASK;
+   vaddr_end &= PAGE_MASK;
+
+   /* Find the task_struct from pid */
+   rcu_read_lock();
+   task = find_task_by_vpid(pid);
+   if (!task) {
+   rcu_read_unlock();
+   ret = -ESRCH;
+   goto out;
+   }
+   get_task_struct(task);
+   rcu_read_unlock();
+
+   /* Find the mm_struct */
+   mm = get_task_mm(task);
+   put_task_struct(task);
+
+   if (!mm) {
+   ret = -EINVAL;
+   goto out;
+   }
+
+   pr_info("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
+pid, vaddr_start, vaddr_end);
+
+   mmap_read_lock(mm);
+   /*
+* always increase addr by PAGE_SIZE, since we could have a PTE page
+* table filled with PTE-mapped THPs, each of which is distinct.
+*/
+   for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
+   struct vm_area_struct *vma = find_vma(mm, addr);
+   unsigned int follflags;
+   struct page *page;
+
+   if (!vma || addr < vma->vm_start)
+   break;
+
+   /* skip special VMA and hugetlb VMA */
+   if (vma_is_special_huge(vma) || is_vm_hugetlb_page(vma)) {
+   addr = vma->vm_end;
+   continue;
+   }
+
+   /* FOLL_DUMP to ignore special (like zero) pages */
+   follflags = FOLL_GET | FOLL_DUMP;
+   page = follow_page(vma, addr, follflags);
+
+   if (IS_ERR(page))
+   continue;
+   if (!page)
+   continue;
+
+   if (!is_transparent_hugepage(page))
+   goto next;
+
+   total++;
+