Re: [PATCH v10 23/25] mm: add speculative page fault vmstats

2018-05-15 Thread Ganesh Mahendran
2018-04-17 22:33 GMT+08:00 Laurent Dufour :
> Add speculative_pgfault vmstat counter to count successful speculative page
> fault handling.
>
> Also fixing a minor typo in include/linux/vm_event_item.h.
>
> Signed-off-by: Laurent Dufour 
> ---
>  include/linux/vm_event_item.h | 3 +++
>  mm/memory.c   | 1 +
>  mm/vmstat.c   | 5 ++++-
>  3 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 5c7f010676a7..a240acc09684 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -111,6 +111,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
> SWAP_RA,
> SWAP_RA_HIT,
>  #endif
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +   SPECULATIVE_PGFAULT,
> +#endif
> NR_VM_EVENT_ITEMS
>  };
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 425f07e0bf38..1cd5bc000643 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4508,6 +4508,7 @@ int __handle_speculative_fault(struct mm_struct *mm, 
> unsigned long address,
>  * If there is no need to retry, don't return the vma to the caller.
>  */
> if (ret != VM_FAULT_RETRY) {
> +   count_vm_event(SPECULATIVE_PGFAULT);
> put_vma(vmf.vma);
> *vma = NULL;
> }
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 536332e988b8..c6b49bfa8139 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1289,7 +1289,10 @@ const char * const vmstat_text[] = {
> "swap_ra",
> "swap_ra_hit",
>  #endif
> -#endif /* CONFIG_VM_EVENTS_COUNTERS */
> +#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
> +   "speculative_pgfault"

"speculative_pgfault",
would be better. :)

> +#endif
> +#endif /* CONFIG_VM_EVENT_COUNTERS */
>  };
>  #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
>
> --
> 2.7.4
>

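Once applied, the counter is exported as a name/value pair in
/proc/vmstat. A minimal userspace sketch for reading it, using the
"speculative_pgfault" field name from the vmstat_text entry above:

#include <stdio.h>
#include <string.h>

int main(void)
{
        char name[64];
        unsigned long long val;
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f)
                return 1;
        /* /proc/vmstat is a sequence of "name value" lines. */
        while (fscanf(f, "%63s %llu", name, &val) == 2) {
                if (!strcmp(name, "speculative_pgfault"))
                        printf("speculative page faults: %llu\n", val);
        }
        fclose(f);
        return 0;
}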

[PATCH v2 1/2] arm64/mm: define ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT

2018-05-04 Thread Ganesh Mahendran
Set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT for arm64. This
enables the Speculative Page Fault handler.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
v2: remove "if SMP"
---
 arch/arm64/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf49..b3ca29d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -144,6 +144,7 @@ config ARM64
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+   select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT
help
  ARM 64-bit (AArch64) Linux support.
 
-- 
1.9.1



[PATCH v2 2/2] arm64/mm: add speculative page fault

2018-05-04 Thread Ganesh Mahendran
This patch enables the speculative page fault on the arm64
architecture.

I completed the SPF port on 4.9. From the test results,
we can see app launch time improved by about 10% on average.
For apps with more than 50 threads, an improvement of 15% or
more can be achieved.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
v2:
  move find_vma() to do_page_fault()
  remove IS_ENABLED()
  remove fault != VM_FAULT_SIGSEGV check
  initialize vma = NULL
---
 arch/arm64/mm/fault.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 4165485..efd5956 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -320,14 +320,12 @@ static void do_bad_area(unsigned long addr, unsigned int 
esr, struct pt_regs *re
 #define VM_FAULT_BADMAP0x01
 #define VM_FAULT_BADACCESS 0x02
 
-static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
+static int __do_page_fault(struct vm_area_struct *vma, unsigned long addr,
   unsigned int mm_flags, unsigned long vm_flags,
   struct task_struct *tsk)
 {
-   struct vm_area_struct *vma;
int fault;
 
-   vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
if (unlikely(!vma))
goto out;
@@ -371,6 +369,7 @@ static int __kprobes do_page_fault(unsigned long addr, 
unsigned int esr,
int fault, major = 0;
unsigned long vm_flags = VM_READ | VM_WRITE;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+   struct vm_area_struct *vma = NULL;
 
if (notify_page_fault(regs, esr))
return 0;
@@ -410,6 +409,16 @@ static int __kprobes do_page_fault(unsigned long addr, 
unsigned int esr,
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 
/*
+* let's try a speculative page fault without grabbing the
+* mmap_sem.
+*/
+   fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
+   if (fault != VM_FAULT_RETRY) {
+   perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, addr);
+   goto done;
+   }
+
+   /*
 * As per x86, we may deadlock here. However, since the kernel only
 * validly references user space from well defined areas of the code,
 * we can bug out early if this is from code which shouldn't.
@@ -431,7 +440,10 @@ static int __kprobes do_page_fault(unsigned long addr, 
unsigned int esr,
 #endif
}
 
-   fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
+   if (!vma || !can_reuse_spf_vma(vma, addr))
+   vma = find_vma(mm, addr);
+
+   fault = __do_page_fault(vma, addr, mm_flags, vm_flags, tsk);
major |= fault & VM_FAULT_MAJOR;
 
if (fault & VM_FAULT_RETRY) {
@@ -454,11 +466,20 @@ static int __kprobes do_page_fault(unsigned long addr, 
unsigned int esr,
if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
mm_flags |= FAULT_FLAG_TRIED;
+
+   /*
+* Do not try to reuse this vma and fetch it
+* again since we will release the mmap_sem.
+*/
+   vma = NULL;
+
goto retry;
}
}
up_read(&mm->mmap_sem);
 
+done:
+
/*
 * Handle the "normal" (no error) case first.
 */
-- 
1.9.1

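The fault path added above follows a fixed contract between the
speculative and the classic handlers. A condensed sketch of that
contract, assuming the v10 SPF API in which handle_speculative_fault()
hands back a referenced VMA through its out-parameter and
can_reuse_spf_vma() both revalidates it and drops that reference (an
illustration only, not the literal arm64 code):

static void fault_path_sketch(struct mm_struct *mm, unsigned long addr,
                              unsigned int mm_flags)
{
        struct vm_area_struct *vma = NULL;
        int fault;

        /* Try first without mmap_sem; done unless asked to retry. */
        fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
        if (fault != VM_FAULT_RETRY)
                return;

        down_read(&mm->mmap_sem);
        /*
         * The speculative path may have handed back a referenced vma.
         * Reuse it only while it still covers this address; either way
         * can_reuse_spf_vma() releases the reference taken for us.
         */
        if (!vma || !can_reuse_spf_vma(vma, addr))
                vma = find_vma(mm, addr);
        /* ... classic fault handling using vma ... */
        up_read(&mm->mmap_sem);
}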


Re: [PATCH 2/2] arm64/mm: add speculative page fault

2018-05-04 Thread Ganesh Mahendran
2018-05-02 22:46 GMT+08:00 Punit Agrawal <punit.agra...@arm.com>:
> Hi Ganesh,
>
> I was looking at evaluating speculative page fault handling on arm64 and
> noticed your patch.
>
> Some comments below -

Thanks for your review.

>
> Ganesh Mahendran <opensource.gan...@gmail.com> writes:
>
>> This patch enables the speculative page fault on the arm64
>> architecture.
>>
>> I completed spf porting in 4.9. From the test result,
>> we can see app launching time improved by about 10% in average.
>> For the apps which have more than 50 threads, 15% or even more
>> improvement can be got.
>>
>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>> ---
>> This patch is on top of Laurent's v10 spf
>> ---
>>  arch/arm64/mm/fault.c | 38 +++++++++++++++++++++++++++++++++++---
>>  1 file changed, 35 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>> index 4165485..e7992a3 100644
>> --- a/arch/arm64/mm/fault.c
>> +++ b/arch/arm64/mm/fault.c
>> @@ -322,11 +322,13 @@ static void do_bad_area(unsigned long addr, unsigned 
>> int esr, struct pt_regs *re
>>
>>  static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
>>  unsigned int mm_flags, unsigned long vm_flags,
>> -struct task_struct *tsk)
>> +struct task_struct *tsk, struct vm_area_struct *vma)
>>  {
>> - struct vm_area_struct *vma;
>>   int fault;
>>
>> + if (!vma || !can_reuse_spf_vma(vma, addr))
>> + vma = find_vma(mm, addr);
>> +
>
> It would be better to move this hunk to do_page_fault().
>
> It'll help localise the fact that handle_speculative_fault() is a
> stateful call which needs a corresponding can_reuse_spf_vma() to
> properly update the vma reference counting.

Yes, your suggestion is better.

>
>
>>   vma = find_vma(mm, addr);
>
> Remember to drop this call in the next version. As it stands the call
> the find_vma() needlessly gets duplicated.

Will fix

>
>>   fault = VM_FAULT_BADMAP;
>>   if (unlikely(!vma))
>> @@ -371,6 +373,7 @@ static int __kprobes do_page_fault(unsigned long addr, 
>> unsigned int esr,
>>   int fault, major = 0;
>>   unsigned long vm_flags = VM_READ | VM_WRITE;
>>   unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>> + struct vm_area_struct *vma;
>>
>>   if (notify_page_fault(regs, esr))
>>   return 0;
>> @@ -409,6 +412,25 @@ static int __kprobes do_page_fault(unsigned long addr, 
>> unsigned int esr,
>>
>>   perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
>>
>> + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
>
> You don't need the IS_ENABLED() check. The alternate implementation of
> handle_speculative_fault() when CONFIG_SPECULATIVE_PAGE_FAULT is not
> enabled takes care of this.

Will fix

>
>> + fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
>> + /*
>> +  * Page fault is done if VM_FAULT_RETRY is not returned.
>> +  * But if the memory protection keys are active, we don't know
>> +  * if the fault is due to key mismatch or due to a
>> +  * classic protection check.
>> +  * To differentiate that, we will need the VMA we no
>> +  * more have, so let's retry with the mmap_sem held.
>> +  */
>
> As there is no support for memory protection keys on arm64 most of this
> comment can be dropped.

will fix

>
>> + if (fault != VM_FAULT_RETRY &&
>> +  fault != VM_FAULT_SIGSEGV) {
>
> Not sure if you need the VM_FAULT_SIGSEGV here.
>
>> + perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, addr);
>> + goto done;
>> + }
>> + } else {
>> + vma = NULL;
>> + }
>> +
>
> If vma is initiliased to NULL during declaration, the else part can be
> dropped.

will fix

>
>>   /*
>>* As per x86, we may deadlock here. However, since the kernel only
>>* validly references user space from well defined areas of the code,
>> @@ -431,7 +453,7 @@ static int __kprobes do_page_fault(unsigned long addr, 
>> unsigned int esr,
>>  #endif
>>   }
>>
>> - fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
>> + fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk, vma);
>>   major |= fault & VM_FAULT_MAJOR;
>>
>>   if (fault & VM_FA

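For context on the IS_ENABLED() point above: when
CONFIG_SPECULATIVE_PAGE_FAULT is not set, the SPF series provides a
stub that always requests the classic path, along these lines (a
sketch modelled on the v10 series, not a verbatim copy):

#ifndef CONFIG_SPECULATIVE_PAGE_FAULT
static inline int handle_speculative_fault(struct mm_struct *mm,
                                           unsigned long address,
                                           unsigned int flags,
                                           struct vm_area_struct **vma)
{
        /* Always fall back to the classic, mmap_sem-protected path. */
        return VM_FAULT_RETRY;
}
#endif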

Re: [PATCH 2/2] arm64/mm: add speculative page fault

2018-05-04 Thread Ganesh Mahendran
2018-05-02 17:07 GMT+08:00 Laurent Dufour <lduf...@linux.vnet.ibm.com>:
> On 02/05/2018 09:54, Ganesh Mahendran wrote:
>> This patch enables the speculative page fault on the arm64
>> architecture.
>>
>> I completed spf porting in 4.9. From the test result,
>> we can see app launching time improved by about 10% in average.
>> For the apps which have more than 50 threads, 15% or even more
>> improvement can be got.
>
> Thanks Ganesh,
>
> That's a great improvement, could you please provide details about the apps 
> and
> the hardware you used ?

We ran SPF on a Qcom SDM845 (kernel 4.9). Below is the list of apps
(popular in China) we tested:
--
com.tencent.mobileqq
com.tencent.qqmusic
com.tencent.mtt
com.UCMobile
com.qiyi.video
com.baidu.searchbox
com.baidu.BaiduMap
tv.danmaku.bili
com.sdu.didi.psnger
com.ss.android.ugc.aweme
air.tv.douyu.android
me.ele
com.autonavi.minimap
com.duowan.kiwi
com.v.study
com.qqgame.hlddz
com.ss.android.article.lite
com.jingdong.app.mall
com.tencent.tmgp.pubgmhd
com.kugou.android
com.kuaikan.comic
com.hunantv.imgo.activity
com.mt.mtxx.mtxx
com.sankuai.meituan
com.sankuai.meituan.takeoutnew
com.tencent.karaoke
com.taobao.taobao
com.tencent.qqlive
com.tmall.wireless
com.tencent.tmgp.sgame
com.netease.cloudmusic
com.sina.weibo
com.tencent.mm
com.immomo.momo
com.xiaomi.hm.health
com.youku.phone
com.eg.android.AlipayGphone
com.meituan.qcs.c.android
--

We will do more testing of the v10 SPF.

>
>>
>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>> ---
>> This patch is on top of Laurent's v10 spf
>> ---
>>  arch/arm64/mm/fault.c | 38 +++++++++++++++++++++++++++++++++++---
>>  1 file changed, 35 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>> index 4165485..e7992a3 100644
>> --- a/arch/arm64/mm/fault.c
>> +++ b/arch/arm64/mm/fault.c
>> @@ -322,11 +322,13 @@ static void do_bad_area(unsigned long addr, unsigned 
>> int esr, struct pt_regs *re
>>
>>  static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
>>  unsigned int mm_flags, unsigned long vm_flags,
>> -struct task_struct *tsk)
>> +struct task_struct *tsk, struct vm_area_struct *vma)
>>  {
>> - struct vm_area_struct *vma;
>>   int fault;
>>
>> + if (!vma || !can_reuse_spf_vma(vma, addr))
>> + vma = find_vma(mm, addr);
>> +
>>   vma = find_vma(mm, addr);
>>   fault = VM_FAULT_BADMAP;
>>   if (unlikely(!vma))
>> @@ -371,6 +373,7 @@ static int __kprobes do_page_fault(unsigned long addr, 
>> unsigned int esr,
>>   int fault, major = 0;
>>   unsigned long vm_flags = VM_READ | VM_WRITE;
>>   unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>> + struct vm_area_struct *vma;
>>
>>   if (notify_page_fault(regs, esr))
>>   return 0;
>> @@ -409,6 +412,25 @@ static int __kprobes do_page_fault(unsigned long addr, 
>> unsigned int esr,
>>
>>   perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
>>
>> + if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
>
> As suggested by Punit in his v10's review, the test on
> CONFIG_SPECULATIVE_PAGE_FAULT is not needed as handle_speculative_fault() is
> defined to return VM_FAULT_RETRY is the config is not set.

Thanks, will fix.

>
>> + fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
>> + /*
>> +  * Page fault is done if VM_FAULT_RETRY is not returned.
>> +  * But if the memory protection keys are active, we don't know
>> +  * if the fault is due to key mismatch or due to a
>> +  * classic protection check.
>> +  * To differentiate that, we will need the VMA we no
>> +  * more have, so let's retry with the mmap_sem held.
>> +  */
>
> The check of VM_FAULT_SIGSEGV was needed on ppc64 because of the memory
> protection key support, but as far as I know, this is not the case on arm64.
> Isn't it ?

Yes, will fix.

>
>> + if (fault != VM_FAULT_RETRY &&
>> +  fault != VM_FAULT_SIGSEGV) {
>> + perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, addr);
>> + goto done;
>> + }
>> + } else {
>> + vma = NULL;
>> + }
>> +
>>   /*
>>* As per x86, we may deadlock here. However, since the kernel only
>>* validly references user space from well defined areas of the code,
>> @@ -431,7 +453,7 @@ static int __kprobes d


Re: [PATCH 1/2] arm64/mm: define ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT

2018-05-04 Thread Ganesh Mahendran
2018-05-02 20:23 GMT+08:00 Will Deacon <will.dea...@arm.com>:
> On Wed, May 02, 2018 at 03:53:21PM +0800, Ganesh Mahendran wrote:
>> Set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT for arm64. This
>> enables Speculative Page Fault handler.
>
> Are there any tests for this? I'm really nervous about enabling it...

Hi, Will

I tested the arm64 SPF on a Qcom SDM845 CPU with kernel 4.9.
It looks good for performance, and I have not found any stability
issues yet.

Thanks.

>
> Will
>
>>
>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>> ---
>> This patch is on top of Laurent's v10 spf
>> ---
>>  arch/arm64/Kconfig | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>> index eb2cf49..cd583a9 100644
>> --- a/arch/arm64/Kconfig
>> +++ b/arch/arm64/Kconfig
>> @@ -144,6 +144,7 @@ config ARM64
>>   select SPARSE_IRQ
>>   select SYSCTL_EXCEPTION_TRACE
>>   select THREAD_INFO_IN_TASK
>> + select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT if SMP
>>   help
>> ARM 64-bit (AArch64) Linux support.
>>
>> --
>> 1.9.1
>>


Re: [PATCH v8 22/24] mm: Speculative page fault handler return VMA

2018-05-02 Thread Ganesh Mahendran
2018-03-29 15:50 GMT+08:00 Laurent Dufour <lduf...@linux.vnet.ibm.com>:
> On 29/03/2018 05:06, Ganesh Mahendran wrote:
>> 2018-03-29 10:26 GMT+08:00 Ganesh Mahendran <opensource.gan...@gmail.com>:
>>> Hi, Laurent
>>>
>>> 2018-02-16 23:25 GMT+08:00 Laurent Dufour <lduf...@linux.vnet.ibm.com>:
>>>> When the speculative page fault handler is returning VM_FAULT_RETRY, there is
>>>> a chance that the VMA fetched without grabbing the mmap_sem can be reused by the
>>>> legacy page fault handler.  By reusing it, we avoid calling find_vma()
>>>> again. To achieve that, we must ensure that the VMA structure will not be
>>>> freed behind our back. This is done by taking a reference on it (get_vma())
>>>> and by assuming that the caller will call the new service
>>>> can_reuse_spf_vma() once it has grabbed the mmap_sem.
>>>>
>>>> can_reuse_spf_vma() first checks that the VMA is still in the RB tree,
>>>> and then that the VMA's boundaries match the passed address, and releases
>>>> the reference on the VMA so that it can be freed if needed.
>>>>
>>>> In the case the VMA is freed, can_reuse_spf_vma() will have returned false
>>>> as the VMA is no longer in the RB tree.
>>>
>>> when I applied this patch to arm64, I got a crash:
>
> Hi Ganesh,
>
> Glad to see that you're enabling it on arm64.
>
> I didn't give this arch a try, so feel free to propose patches on top of the
> SPF series for this, I'll do my best to give them updated.
>
>>>
>>> [6.088296] Unable to handle kernel NULL pointer dereference at
>>> virtual address 
>>> [6.088307] pgd = ff9d67735000
>>> [6.088313] [] *pgd=0001795e3003,
>>> *pud=0001795e3003, *pmd=
>>> [6.088372] [ cut here ]
>>> [6.088377] Kernel BUG at ff9d64f65960 [verbose debug info 
>>> unavailable]
>>> [6.088384] Internal error: Oops - BUG: 9645 [#1] PREEMPT SMP
>>> [6.088389] BUG: Bad rss-counter state mm:ffe8f3861040 idx:0 val:90
>>> [6.088393] BUG: Bad rss-counter state mm:ffe8f3861040 idx:1 val:58
>>> [6.088398] Modules linked in:
>>> [6.088408] CPU: 1 PID: 621 Comm: qseecomd Not tainted 4.4.78-perf+ #88
>>> [6.088413] Hardware name: Qualcomm Technologies, Inc. SDM 636
>>> PM660 + PM660L MTP E7S (DT)
>>> [6.088419] task: ffe8f6208000 ti: ffe872a8c000 task.ti:
>>> ffe872a8c000
>>> [6.088432] PC is at __rb_erase_color+0x108/0x240
>>> [6.088441] LR is at vma_interval_tree_remove+0x244/0x24c
>>> [6.088447] pc : [] lr : []
>>> pstate: 604001c5
>>> [6.088451] sp : ffe872a8fa50
>>> [6.088455] x29: ffe872a8fa50 x28: 0008
>>> [6.088462] x27: 0009 x26: 
>>> [6.088470] x25: ffe8f458fb80 x24: 00768ff87000
>>> [6.088477] x23:  x22: 
>>> [6.088484] x21: ff9d64d9be7c x20: ffe8f3ff0680
>>> [6.088492] x19: ffe8f212e9b0 x18: 0074
>>> [6.088499] x17: 0007 x16: 000e
>>> [6.088507] x15: ff9d65c88000 x14: 0001
>>> [6.088514] x13: 00192d76 x12: 00989680
>>> [6.088521] x11: 001f x10: ff9d661ded1b
>>> [6.088528] x9 : 007691759000 x8 : 07691759
>>> [6.088535] x7 :  x6 : ffe871ebada8
>>> [6.088541] x5 : 00e1 x4 : ffe8f212e958
>>> [6.088548] x3 : 00e9 x2 : 
>>> [6.088555] x1 : ffe8f212f110 x0 : ffe8f212e9b1
>>> [6.088564]
>>> [6.088564] PC: 0xff9d64f65920:
>>> [6.088568] 5920  f902 aa0103e0 aa1603e1 d63f02a0 aa1603e1
>>> f9400822 f9000662 f9000833
>>> [6.088590] 5940  143b f9400a61 f9400020 370002c0 f9400436
>>> b2400260 f9000a76 f9000433
>>> [6.088610] 5960  f90002c0 f9400260 f920 f9000261 f27ef400
>>> 54000100 f9400802 eb13005f
>>> [6.088630] 5980  5461 f9000801 1404 f9000401 1402
>>> f9000281 aa1303e0 d63f02a0
>>> [6.088652]
>>> [6.088652] LR: 0xff9d64d9c298:
>>> [6.088656] c298  f9403083 b483 f9400c63 eb03005f 9a832042
>>> f9403883 eb02007f 54a0
>>> [6.088676] c2b8  f9003882 f9402c82 927ef442 b5fffd22 b480
>>> f0e2 9139f042 94072561
>>> [6.088695] c2d8  a8c17bfd d65f03c0 a9bf7bfd 910003fd f943

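The crash above is the VMA-lifetime problem the quoted commit message
addresses: the VMA returned by the speculative path must stay pinned
until the caller decides whether to reuse it. A simplified sketch of
the reference counting involved, assuming the series' get_vma() and
put_vma() helpers built on an atomic vm_ref_count field (names taken
from the SPF series, logic simplified):

static inline void get_vma(struct vm_area_struct *vma)
{
        atomic_inc(&vma->vm_ref_count);
}

static inline void put_vma(struct vm_area_struct *vma)
{
        /* Free only when the last user (RB tree or borrower) drops it. */
        if (atomic_dec_and_test(&vma->vm_ref_count))
                __free_vma(vma);
}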

[PATCH 2/2] arm64/mm: add speculative page fault

2018-05-02 Thread Ganesh Mahendran
This patch enables the speculative page fault on the arm64
architecture.

I completed the SPF port on 4.9. From the test results,
we can see app launch time improved by about 10% on average.
For apps with more than 50 threads, an improvement of 15% or
more can be achieved.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
This patch is on top of Laurent's v10 spf
---
 arch/arm64/mm/fault.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 4165485..e7992a3 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -322,11 +322,13 @@ static void do_bad_area(unsigned long addr, unsigned int 
esr, struct pt_regs *re
 
 static int __do_page_fault(struct mm_struct *mm, unsigned long addr,
   unsigned int mm_flags, unsigned long vm_flags,
-  struct task_struct *tsk)
+  struct task_struct *tsk, struct vm_area_struct *vma)
 {
-   struct vm_area_struct *vma;
int fault;
 
+   if (!vma || !can_reuse_spf_vma(vma, addr))
+   vma = find_vma(mm, addr);
+
vma = find_vma(mm, addr);
fault = VM_FAULT_BADMAP;
if (unlikely(!vma))
@@ -371,6 +373,7 @@ static int __kprobes do_page_fault(unsigned long addr, 
unsigned int esr,
int fault, major = 0;
unsigned long vm_flags = VM_READ | VM_WRITE;
unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+   struct vm_area_struct *vma;
 
if (notify_page_fault(regs, esr))
return 0;
@@ -409,6 +412,25 @@ static int __kprobes do_page_fault(unsigned long addr, 
unsigned int esr,
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 
+   if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
+   fault = handle_speculative_fault(mm, addr, mm_flags, &vma);
+   /*
+* Page fault is done if VM_FAULT_RETRY is not returned.
+* But if the memory protection keys are active, we don't know
+* if the fault is due to key mismatch or due to a
+* classic protection check.
+* To differentiate that, we will need the VMA we no
+* more have, so let's retry with the mmap_sem held.
+*/
+   if (fault != VM_FAULT_RETRY &&
+fault != VM_FAULT_SIGSEGV) {
+   perf_sw_event(PERF_COUNT_SW_SPF, 1, regs, addr);
+   goto done;
+   }
+   } else {
+   vma = NULL;
+   }
+
/*
 * As per x86, we may deadlock here. However, since the kernel only
 * validly references user space from well defined areas of the code,
@@ -431,7 +453,7 @@ static int __kprobes do_page_fault(unsigned long addr, 
unsigned int esr,
 #endif
}
 
-   fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
+   fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk, vma);
major |= fault & VM_FAULT_MAJOR;
 
if (fault & VM_FAULT_RETRY) {
@@ -454,11 +476,21 @@ static int __kprobes do_page_fault(unsigned long addr, 
unsigned int esr,
if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
mm_flags |= FAULT_FLAG_TRIED;
+
+   /*
+* Do not try to reuse this vma and fetch it
+* again since we will release the mmap_sem.
+*/
+   if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT))
+   vma = NULL;
+
goto retry;
}
}
up_read(&mm->mmap_sem);
 
+done:
+
/*
 * Handle the "normal" (no error) case first.
 */
-- 
1.9.1



[PATCH 1/2] arm64/mm: define ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT

2018-05-02 Thread Ganesh Mahendran
Set ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT for arm64. This
enables the Speculative Page Fault handler.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
This patch is on top of Laurent's v10 spf
---
 arch/arm64/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf49..cd583a9 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -144,6 +144,7 @@ config ARM64
select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+   select ARCH_SUPPORTS_SPECULATIVE_PAGE_FAULT if SMP
help
  ARM 64-bit (AArch64) Linux support.
 
-- 
1.9.1



Re: [PATCH v3] PM / wakeup: use seq_open() to show wakeup stats

2018-05-01 Thread Ganesh Mahendran
Hi, Pavel

Thanks for your review.

2018-04-29 22:30 GMT+08:00 Pavel Machek <pa...@ucw.cz>:
> On Wed 2018-04-25 18:59:31, Ganesh Mahendran wrote:
>> The single_open() interface requires that the whole output
>> fit into a single buffer. This can lead to a timeout when
>> system memory is in a bad state (e.g. heavily fragmented).
>>
>> This patch uses seq_open() to show the wakeup stats. This
>> method needs only one page, so the timeout is not observed.
>
> Sounds like magic.

I did not explain this clearly.

If we use single_open() to open the file, a single (physically
contiguous) buffer is allocated to hold the whole content of
/sys/kernel/debug/wakeup_sources. When memory is fragmented,
allocating such a buffer can take a long time, which may trigger
an Android watchdog timeout.

>
>> -static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
>> +static void *wakeup_sources_stats_seq_start(struct seq_file *m,
>> + loff_t *pos)
>>  {
> ...
>> - srcuidx = srcu_read_lock(&wakeup_srcu);
>> - list_for_each_entry_rcu(ws, &wakeup_sources, entry)
>> - print_wakeup_source_stats(m, ws);
>> - srcu_read_unlock(&wakeup_srcu, srcuidx);
>> + *srcuidx = srcu_read_lock(&wakeup_srcu);
>> + list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
>> + if (n-- <= 0)
>> + return ws;
>> + }
>> +
>> + return NULL;
>> +}
> ...
>> +static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
>> +{
>> + int *srcuidx = m->private;
>> +
>> + srcu_read_unlock(&wakeup_srcu, *srcuidx);
>> +}
>
> But you are holding srcu_lock over return to userspace, and somehow I
> don't think that's permitted?

In seq_read(), m->op->start and m->op->stop are invoked as a pair,
so the srcu lock is not held across the return to userspace.

Thanks.

> Pavel
> --
> (english) http://www.livejournal.com/~pavelmachek
> (cesky, pictures) 
> http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

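The pairing Ganesh describes is visible in the seq_file core: one
read(2) call runs ->start, iterates via ->show/->next, and always runs
->stop before returning to userspace. A heavily reduced sketch of that
loop, based on fs/seq_file.c with buffering and error handling omitted:

static void seq_read_sketch(struct seq_file *m, loff_t *ppos)
{
        void *p = m->op->start(m, ppos);  /* srcu_read_lock() happens here */

        while (p && !m->op->show(m, p))   /* emit one record at a time */
                p = m->op->next(m, p, ppos);

        m->op->stop(m, p);                /* srcu_read_unlock() before return */
}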

Re: [PATCH v2] PM / wakeup: use seq_open() to show wakeup stats

2018-04-25 Thread Ganesh Mahendran
2018-04-02 14:46 GMT+08:00 Geert Uytterhoeven <ge...@linux-m68k.org>:
> Hi Ganesh,
>
> On Mon, Apr 2, 2018 at 3:33 AM, Ganesh Mahendran
> <opensource.gan...@gmail.com> wrote:
>> 2018-03-30 19:00 GMT+08:00 Geert Uytterhoeven <ge...@linux-m68k.org>:
>>> On Fri, Mar 30, 2018 at 12:25 PM, Rafael J. Wysocki <r...@rjwysocki.net> 
>>> wrote:
>>>> On Monday, March 5, 2018 9:47:46 AM CEST Ganesh Mahendran wrote:
>>>>> single_open() interface requires that the whole output must
>>>>> fit into a single buffer. This will lead to timeout when
>>>>> system memory is not in a good situation.
>>>>>
>>>>> This patch use seq_open() to show wakeup stats. This method
>>>>> need only one page, so timeout will not be observed.
>
>>>>> --- a/drivers/base/power/wakeup.c
>>>>> +++ b/drivers/base/power/wakeup.c
>>>>> @@ -1029,32 +1029,77 @@ static int print_wakeup_source_stats(struct 
>>>>> seq_file *m,
>>>>>   return 0;
>>>>>  }
>>>>>
>>>>> -/**
>>>>> - * wakeup_sources_stats_show - Print wakeup sources statistics 
>>>>> information.
>>>>> - * @m: seq_file to print the statistics into.
>>>>> - */
>>>>> -static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
>>>>> +static void *wakeup_sources_stats_seq_start(struct seq_file *m,
>>>>> + loff_t *pos)
>>>>>  {
>
>>>>> + list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
>>>>> + if (n-- > 0)
>>>>> + continue;
>>>>> + goto out;
>>>>> + }
>>>>> + ws = NULL;
>>>>> +out:
>>>>> + return ws;
>>>>> +}
>>>>
>>>> Please clean up the above at least.
>>>>
>>>> If I'm not mistaken, you don't need the label and the goto here.
>>>
>>> The continue is also not needed, if the test condition is inverted.
>>
>> Hi, Geert
>>
>> We need to locate to the last read item. What is your suggestion here?
>
> I didn't mean to get rid of that logic, but to reorganize the code to make it
> simpler:
>
> list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
> if (n-- <= 0)
> return ws;
> }

I sent a v3 patch.
Thanks for your review.

>
> Gr{oetje,eeting}s,
>
> Geert
>
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- 
> ge...@linux-m68k.org
>
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like 
> that.
> -- Linus Torvalds


[PATCH v3] PM / wakeup: use seq_open() to show wakeup stats

2018-04-25 Thread Ganesh Mahendran
The single_open() interface requires that the whole output
fit into a single buffer. This can lead to a timeout when
system memory is in a bad state (e.g. heavily fragmented).

This patch uses seq_open() to show the wakeup stats. This
method needs only one page, so the timeout is not observed.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>

v3: simplify wakeup_sources_stats_seq_start
v2: use srcu_read_lock instead of rcu_read_lock
---
 drivers/base/power/wakeup.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 59 insertions(+), 16 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index ea01621..5872705 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -1029,32 +1029,75 @@ static int print_wakeup_source_stats(struct seq_file *m,
return 0;
 }
 
-/**
- * wakeup_sources_stats_show - Print wakeup sources statistics information.
- * @m: seq_file to print the statistics into.
- */
-static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
+static void *wakeup_sources_stats_seq_start(struct seq_file *m,
+   loff_t *pos)
 {
struct wakeup_source *ws;
-   int srcuidx;
+   loff_t n = *pos;
+   int *srcuidx = m->private;
 
-   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
-   "expire_count\tactive_since\ttotal_time\tmax_time\t"
-   "last_change\tprevent_suspend_time\n");
+   if (n == 0) {
+   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+   "expire_count\tactive_since\ttotal_time\tmax_time\t"
+   "last_change\tprevent_suspend_time\n");
+   }
 
-   srcuidx = srcu_read_lock(&wakeup_srcu);
-   list_for_each_entry_rcu(ws, &wakeup_sources, entry)
-   print_wakeup_source_stats(m, ws);
-   srcu_read_unlock(&wakeup_srcu, srcuidx);
+   *srcuidx = srcu_read_lock(&wakeup_srcu);
+   list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+   if (n-- <= 0)
+   return ws;
+   }
+
+   return NULL;
+}
+
+static void *wakeup_sources_stats_seq_next(struct seq_file *m,
+   void *v, loff_t *pos)
+{
+   struct wakeup_source *ws = v;
+   struct wakeup_source *next_ws = NULL;
+
+   ++(*pos);
 
-   print_wakeup_source_stats(m, &deleted_ws);
+   list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) {
+   next_ws = ws;
+   break;
+   }
+
+   return next_ws;
+}
+
+static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
+{
+   int *srcuidx = m->private;
+
+   srcu_read_unlock(&wakeup_srcu, *srcuidx);
+}
+
+/**
+ * wakeup_sources_stats_seq_show - Print wakeup sources statistics information.
+ * @m: seq_file to print the statistics into.
+ * @v: wakeup_source of each iteration
+ */
+static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
+{
+   struct wakeup_source *ws = v;
+
+   print_wakeup_source_stats(m, ws);
 
return 0;
 }
 
+static const struct seq_operations wakeup_sources_stats_seq_ops = {
+   .start = wakeup_sources_stats_seq_start,
+   .next  = wakeup_sources_stats_seq_next,
+   .stop  = wakeup_sources_stats_seq_stop,
+   .show  = wakeup_sources_stats_seq_show,
+};
+
 static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
 {
-   return single_open(file, wakeup_sources_stats_show, NULL);
+   return seq_open_private(file, &wakeup_sources_stats_seq_ops,
+   sizeof(int));
 }
 
 static const struct file_operations wakeup_sources_stats_fops = {
@@ -1062,7 +1105,7 @@ static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
.open = wakeup_sources_stats_open,
.read = seq_read,
.llseek = seq_lseek,
-   .release = single_release,
+   .release = seq_release_private,
 };
 
 static int __init wakeup_sources_debugfs_init(void)
-- 
1.9.1


Re: [PATCH v5] ANDROID: binder: change down_write to down_read

2018-04-10 Thread Ganesh Mahendran
2018-04-02 18:32 GMT+08:00 Minchan Kim :
> Hi Ganesh,
>
> On Mon, Apr 02, 2018 at 06:01:59PM +0800, Ganesh Mahendran wrote:
>> 2018-04-02 15:11 GMT+08:00 Minchan Kim :
>> > On Mon, Apr 02, 2018 at 02:46:14PM +0800, Ganesh Mahendran wrote:
>> >> 2018-04-02 14:34 GMT+08:00 Minchan Kim :
>> >> > On Fri, Mar 30, 2018 at 12:04:07PM +0200, Greg Kroah-Hartman wrote:
>> >> >> On Fri, Mar 30, 2018 at 10:29:21AM +0900, Minchan Kim wrote:
>> >> >> > Hi Ganesh,
>> >> >> >
>> >> >> > On Fri, Mar 30, 2018 at 09:21:55AM +0800, Ganesh Mahendran wrote:
>> >> >> > > 2018-03-29 14:54 GMT+08:00 Minchan Kim :
>> >> >> > > > binder_update_page_range needs down_write of mmap_sem because
>> >> >> > > > vm_insert_page need to change vma->vm_flags to VM_MIXEDMAP unless
>> >> >> > > > it is set. However, when I profile binder working, it seems
>> >> >> > > > every binder buffers should be mapped in advance by binder_mmap.
>> >> >> > > > It means we could set VM_MIXEDMAP in binder_mmap time which is
>> >> >> > > > already hold a mmap_sem as down_write so binder_update_page_range
>> >> >> > > > doesn't need to hold a mmap_sem as down_write.
>> >> >> > > >
>> >> >> > > > Android suffers from mmap_sem contention so let's reduce mmap_sem
>> >> >> > > > down_write.
>> >> >> > >
>> >> >> > > Hi, Minchan:
>> >> >> > >
>> >> >> > > It seems there is performance regression of this patch.
>> >> >> >
>> >> >> > You mean "This patch aims for solving performance regression" not 
>> >> >> > "This patch
>> >> >> > makes performance regression"?
>> >> >> >
>> >> >> > >
>> >> >> > > Do you have some test result of android app launch time or 
>> >> >> > > binderThroughput?
>> >> >> >
>> >> >> > Unfortunately, I don't have any number. The goal is to reduce the 
>> >> >> > number of
>> >> >> > call mmap_sem as write-side lock because it makes priority inversion 
>> >> >> > of threads
>> >> >> > easily and that's one of clear part I spot that we don't need 
>> >> >> > write-side lock.
>> >> >>
>> >> >> Please always run the binderThroughput tests when making binder changes
>> >> >> (there is a binder test suite in the CTS Android tests), as that 
>> >> >> ensures
>> >> >> that you are not causing performance regressions as well as just normal
>> >> >> bug regressions :)
>> >> >
>> >> > Thanks for the information. I didn't notice that such kinds of tests for
>> >> > binder. I will keep it in mind.
>> >> >
>> >> > Today, I have setup the testing for my phone and found testing was very
>> >> > fluctuating even without my patch. It might be not good with my test
>> >> > skill. I emulated user's behavior with various touch event. With it, I 
>> >> > open
>> >> > various apps and play with them several times. Before starting the test,
>> >> > I did "adb shell stop && adb shell start && echo 3 > 
>> >> > /proc/sys/vm/drop_caches"
>> >> >
>> >> > Such 15% noise was very easy to make it.
>> >> >
>> >> > Ganesh, How did you measure? What's the stddev?
>> >>
>> >> Hi, Minchan:
>> >>
>> >> Sorry for the late response, a little busy these days. :)
>> >>
>> >> We have our own test tools to measure app launch time, or you can use
>> >> android systrace to get the app launch time. We tested your V1 patch:
>> >> https://patchwork.kernel.org/patch/10312057/
>> >> and found app lunch time regression.
>> >
>> > V1 had a bug with VM_MAYWRITE. Could you confirm it with v5?
>>
>> I have finished binder Throughput test. The test result is stable,
>> there is no performance
>> regression found both in v1 and v5.
>
> Thanks for the test! Now I'm struggling with setting up the BinderThroughput test.
> Binder maintainers:

Re: [PATCH v5] ANDROID: binder: change down_write to down_read

2018-04-09 Thread Ganesh Mahendran
2018-04-09 14:40 GMT+08:00 Minchan Kim :
> Hi Ganesh,
>
> Isn't there any update?

We were on vacation a few days ago. After the test complete, I will
update the result immediately.

Thanks.

>
> On Mon, Apr 2, 2018 at 7:32 PM, Minchan Kim  wrote:
>> Hi Ganesh,
>>
>> On Mon, Apr 02, 2018 at 06:01:59PM +0800, Ganesh Mahendran wrote:
>>> 2018-04-02 15:11 GMT+08:00 Minchan Kim :
>>> > On Mon, Apr 02, 2018 at 02:46:14PM +0800, Ganesh Mahendran wrote:
>>> >> 2018-04-02 14:34 GMT+08:00 Minchan Kim :
>>> >> > On Fri, Mar 30, 2018 at 12:04:07PM +0200, Greg Kroah-Hartman wrote:
>>> >> >> On Fri, Mar 30, 2018 at 10:29:21AM +0900, Minchan Kim wrote:
>>> >> >> > Hi Ganesh,
>>> >> >> >
>>> >> >> > On Fri, Mar 30, 2018 at 09:21:55AM +0800, Ganesh Mahendran wrote:
>>> >> >> > > 2018-03-29 14:54 GMT+08:00 Minchan Kim :
>>> >> >> > > > binder_update_page_range needs down_write of mmap_sem because
>>> >> >> > > > vm_insert_page need to change vma->vm_flags to VM_MIXEDMAP 
>>> >> >> > > > unless
>>> >> >> > > > it is set. However, when I profile binder working, it seems
>>> >> >> > > > every binder buffers should be mapped in advance by binder_mmap.
>>> >> >> > > > It means we could set VM_MIXEDMAP in binder_mmap time which is
>>> >> >> > > > already hold a mmap_sem as down_write so 
>>> >> >> > > > binder_update_page_range
>>> >> >> > > > doesn't need to hold a mmap_sem as down_write.
>>> >> >> > > >
>>> >> >> > > > Android suffers from mmap_sem contention so let's reduce 
>>> >> >> > > > mmap_sem
>>> >> >> > > > down_write.
>>> >> >> > >
>>> >> >> > > Hi, Minchan:
>>> >> >> > >
>>> >> >> > > It seems there is performance regression of this patch.
>>> >> >> >
>>> >> >> > You mean "This patch aims for solving performance regression" not 
>>> >> >> > "This patch
>>> >> >> > makes performance regression"?
>>> >> >> >
>>> >> >> > >
>>> >> >> > > Do you have some test result of android app launch time or 
>>> >> >> > > binderThroughput?
>>> >> >> >
>>> >> >> > Unfortunately, I don't have any number. The goal is to reduce the 
>>> >> >> > number of
>>> >> >> > call mmap_sem as write-side lock because it makes priority 
>>> >> >> > inversion of threads
>>> >> >> > easily and that's one of clear part I spot that we don't need 
>>> >> >> > write-side lock.
>>> >> >>
>>> >> >> Please always run the binderThroughput tests when making binder 
>>> >> >> changes
>>> >> >> (there is a binder test suite in the CTS Android tests), as that 
>>> >> >> ensures
>>> >> >> that you are not causing performance regressions as well as just 
>>> >> >> normal
>>> >> >> bug regressions :)
>>> >> >
>>> >> > Thanks for the information. I didn't notice that such kinds of tests 
>>> >> > for
>>> >> > binder. I will keep it in mind.
>>> >> >
>>> >> > Today, I have setup the testing for my phone and found testing was very
>>> >> > fluctuating even without my patch. It might be not good with my test
>>> >> > skill. I emulated user's behavior with various touch event. With it, I 
>>> >> > open
>>> >> > various apps and play with them several times. Before starting the 
>>> >> > test,
>>> >> > I did "adb shell stop && adb shell start && echo 3 > 
>>> >> > /proc/sys/vm/drop_caches"
>>> >> >
>>> >> > Such 15% noise was very easy to make it.
>>> >> >
>>> >> > Ganesh, How did you measure? What's the stddev?
>>> >>
>>> >> Hi, Minchan:
>>> >>

Re: [PATCH v5] ANDROID: binder: change down_write to down_read

2018-04-02 Thread Ganesh Mahendran
2018-04-02 15:11 GMT+08:00 Minchan Kim :
> On Mon, Apr 02, 2018 at 02:46:14PM +0800, Ganesh Mahendran wrote:
>> 2018-04-02 14:34 GMT+08:00 Minchan Kim :
>> > On Fri, Mar 30, 2018 at 12:04:07PM +0200, Greg Kroah-Hartman wrote:
>> >> On Fri, Mar 30, 2018 at 10:29:21AM +0900, Minchan Kim wrote:
>> >> > Hi Ganesh,
>> >> >
>> >> > On Fri, Mar 30, 2018 at 09:21:55AM +0800, Ganesh Mahendran wrote:
>> >> > > 2018-03-29 14:54 GMT+08:00 Minchan Kim :
>> >> > > > binder_update_page_range needs down_write of mmap_sem because
>> >> > > > vm_insert_page need to change vma->vm_flags to VM_MIXEDMAP unless
>> >> > > > it is set. However, when I profile binder working, it seems
>> >> > > > every binder buffers should be mapped in advance by binder_mmap.
>> >> > > > It means we could set VM_MIXEDMAP in binder_mmap time which is
>> >> > > > already hold a mmap_sem as down_write so binder_update_page_range
>> >> > > > doesn't need to hold a mmap_sem as down_write.
>> >> > > >
>> >> > > > Android suffers from mmap_sem contention so let's reduce mmap_sem
>> >> > > > down_write.
>> >> > >
>> >> > > Hi, Minchan:
>> >> > >
>> >> > > It seems there is performance regression of this patch.
>> >> >
>> >> > You mean "This patch aims for solving performance regression" not "This 
>> >> > patch
>> >> > makes performance regression"?
>> >> >
>> >> > >
>> >> > > Do you have some test result of android app launch time or 
>> >> > > binderThroughput?
>> >> >
>> >> > Unfortunately, I don't have any number. The goal is to reduce the 
>> >> > number of
>> >> > call mmap_sem as write-side lock because it makes priority inversion of 
>> >> > threads
>> >> > easily and that's one of clear part I spot that we don't need 
>> >> > write-side lock.
>> >>
>> >> Please always run the binderThroughput tests when making binder changes
>> >> (there is a binder test suite in the CTS Android tests), as that ensures
>> >> that you are not causing performance regressions as well as just normal
>> >> bug regressions :)
>> >
>> > Thanks for the information. I didn't notice that such kinds of tests for
>> > binder. I will keep it in mind.
>> >
>> > Today, I have setup the testing for my phone and found testing was very
>> > fluctuating even without my patch. It might be not good with my test
>> > skill. I emulated user's behavior with various touch event. With it, I open
>> > various apps and play with them several times. Before starting the test,
>> > I did "adb shell stop && adb shell start && echo 3 > 
>> > /proc/sys/vm/drop_caches"
>> >
>> > Such 15% noise was very easy to make it.
>> >
>> > Ganesh, How did you measure? What's the stddev?
>>
>> Hi, Minchan:
>>
>> Sorry for the late response, a little busy these days. :)
>>
>> We have our own test tools to measure app launch time, or you can use
>> android systrace to get the app launch time. We tested your V1 patch:
>> https://patchwork.kernel.org/patch/10312057/
>> and found app lunch time regression.
>
> V1 had a bug with VM_MAYWRITE. Could you confirm it with v5?

I have finished the BinderThroughput test. The test results are stable;
no performance regression was found with either v1 or v5.

        base     patch_v1  patch_v5
        ---------------------------
        91223.4  90560.2   89644.5
        90520.3  89583.1   89048.2
        89833.2  90247.6   90091.3
        90740.2  90276.7   90994.2
        89703.5  90112.4   89994.6
        89945.1  89122.8   88937.7
        89872.8  90357.3   89307.4
        89913.2  90355.4   89563.8
        88979    90393.4   90182.8
        89577.3  90946.8   90441.4
AVG     90030.8  90195.57  89820.59

Before the test, I stopped the Android framework with:
adb shell stop
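
Minchan asked for the stddev above; one quick way to get it from the
table is a small standalone program like the sketch below (sample
standard deviation assumed, values copied from the table):

#include <math.h>
#include <stdio.h>

static void stats(const char *name, const double *v, int n)
{
	double mean = 0.0, var = 0.0;
	int i;

	for (i = 0; i < n; i++)
		mean += v[i];
	mean /= n;
	for (i = 0; i < n; i++)
		var += (v[i] - mean) * (v[i] - mean);
	var /= n - 1;		/* sample variance */
	printf("%-9s mean %9.2f  stddev %7.2f (%.2f%% of mean)\n",
	       name, mean, sqrt(var), 100.0 * sqrt(var) / mean);
}

int main(void)
{
	const double base[] = { 91223.4, 90520.3, 89833.2, 90740.2, 89703.5,
				89945.1, 89872.8, 89913.2, 88979.0, 89577.3 };
	const double v1[]   = { 90560.2, 89583.1, 90247.6, 90276.7, 90112.4,
				89122.8, 90357.3, 90355.4, 90393.4, 90946.8 };
	const double v5[]   = { 89644.5, 89048.2, 90091.3, 90994.2, 89994.6,
				88937.7, 89307.4, 89563.8, 90182.8, 90441.4 };

	stats("base", base, 10);
	stats("patch_v1", v1, 10);
	stats("patch_v5", v5, 10);
	return 0;
}

With these numbers the stddev of each column comes out around 0.5-0.7%
of the mean, so the differences between the three columns are well
within the noise.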

>
> Please tell me more detail. What apps are slower compared to old?
> Every apps are slowed with avg 15%? Then, what's the stddev?

Not all of the apps slowed 15%; the app *avg* launch

Re: [PATCH v5] ANDROID: binder: change down_write to down_read

2018-04-02 Thread Ganesh Mahendran
2018-04-02 14:34 GMT+08:00 Minchan Kim <minc...@kernel.org>:
> On Fri, Mar 30, 2018 at 12:04:07PM +0200, Greg Kroah-Hartman wrote:
>> On Fri, Mar 30, 2018 at 10:29:21AM +0900, Minchan Kim wrote:
>> > Hi Ganesh,
>> >
>> > On Fri, Mar 30, 2018 at 09:21:55AM +0800, Ganesh Mahendran wrote:
>> > > 2018-03-29 14:54 GMT+08:00 Minchan Kim <minc...@kernel.org>:
>> > > > binder_update_page_range needs down_write of mmap_sem because
>> > > > vm_insert_page need to change vma->vm_flags to VM_MIXEDMAP unless
>> > > > it is set. However, when I profile binder working, it seems
>> > > > every binder buffers should be mapped in advance by binder_mmap.
>> > > > It means we could set VM_MIXEDMAP in binder_mmap time which is
>> > > > already hold a mmap_sem as down_write so binder_update_page_range
>> > > > doesn't need to hold a mmap_sem as down_write.
>> > > >
>> > > > Android suffers from mmap_sem contention so let's reduce mmap_sem
>> > > > down_write.
>> > >
>> > > Hi, Minchan:
>> > >
>> > > It seems there is performance regression of this patch.
>> >
>> > You mean "This patch aims for solving performance regression" not "This 
>> > patch
>> > makes performance regression"?
>> >
>> > >
>> > > Do you have some test result of android app launch time or 
>> > > binderThroughput?
>> >
>> > Unfortunately, I don't have any number. The goal is to reduce the number of
>> > call mmap_sem as write-side lock because it makes priority inversion of 
>> > threads
>> > easily and that's one of clear part I spot that we don't need write-side 
>> > lock.
>>
>> Please always run the binderThroughput tests when making binder changes
>> (there is a binder test suite in the CTS Android tests), as that ensures
>> that you are not causing performance regressions as well as just normal
>> bug regressions :)
>
> Thanks for the information. I didn't notice that such kinds of tests for
> binder. I will keep it in mind.
>
> Today, I have setup the testing for my phone and found testing was very
> fluctuating even without my patch. It might be not good with my test
> skill. I emulated user's behavior with various touch event. With it, I open
> various apps and play with them several times. Before starting the test,
> I did "adb shell stop && adb shell start && echo 3 > /proc/sys/vm/drop_caches"
>
> Such 15% noise was very easy to make it.
>
> Ganesh, How did you measure? What's the stddev?

Hi, Minchan:

Sorry for the late response, a little busy these days. :)

We have our own test tools to measure app launch time, or you can use
Android systrace to get the app launch time. We tested your V1 patch:
https://patchwork.kernel.org/patch/10312057/
and found an app launch time regression.

I will use the BinderThroughput tool to test the patch today or tomorrow.

Thanks.

> Please let me know how you measure without noise so I'd like to reproduce
> the result in my phone.
>
> I will do binderThroghput test, too.
>
>



Re: [PATCH v2] PM / wakeup: use seq_open() to show wakeup stats

2018-04-01 Thread Ganesh Mahendran
2018-03-30 19:00 GMT+08:00 Geert Uytterhoeven <ge...@linux-m68k.org>:
> On Fri, Mar 30, 2018 at 12:25 PM, Rafael J. Wysocki <r...@rjwysocki.net> 
> wrote:
>> On Monday, March 5, 2018 9:47:46 AM CEST Ganesh Mahendran wrote:
>>> single_open() interface requires that the whole output must
>>> fit into a single buffer. This will lead to timeout when
>>> system memory is not in a good situation.
>>>
>>> This patch use seq_open() to show wakeup stats. This method
>>> need only one page, so timeout will not be observed.
>>>
>>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>>> 
>>> v2: use srcu_read_lock instead of rcu_read_lock
>>> ---
>>>  drivers/base/power/wakeup.c | 77 
>>> +++--
>>>  1 file changed, 61 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
>>> index ea01621..3bcab7d 100644
>>> --- a/drivers/base/power/wakeup.c
>>> +++ b/drivers/base/power/wakeup.c
>>> @@ -1029,32 +1029,77 @@ static int print_wakeup_source_stats(struct 
>>> seq_file *m,
>>>   return 0;
>>>  }
>>>
>>> -/**
>>> - * wakeup_sources_stats_show - Print wakeup sources statistics information.
>>> - * @m: seq_file to print the statistics into.
>>> - */
>>> -static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
>>> +static void *wakeup_sources_stats_seq_start(struct seq_file *m,
>>> + loff_t *pos)
>>>  {
>>>   struct wakeup_source *ws;
>>> - int srcuidx;
>>> + loff_t n = *pos;
>>> + int *srcuidx = m->private;
>>>
>>> - seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
>>> - "expire_count\tactive_since\ttotal_time\tmax_time\t"
>>> - "last_change\tprevent_suspend_time\n");
>>> + if (n == 0) {
>>> + seq_puts(m, 
>>> "name\t\tactive_count\tevent_count\twakeup_count\t"
>>> + "expire_count\tactive_since\ttotal_time\tmax_time\t"
>>> + "last_change\tprevent_suspend_time\n");
>>> + }
>>>
>>> - srcuidx = srcu_read_lock(&wakeup_srcu);
>>> - list_for_each_entry_rcu(ws, &wakeup_sources, entry)
>>> - print_wakeup_source_stats(m, ws);
>>> - srcu_read_unlock(&wakeup_srcu, srcuidx);
>>> + *srcuidx = srcu_read_lock(&wakeup_srcu);
>>> + list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
>>> + if (n-- > 0)
>>> + continue;
>>> + goto out;
>>> + }
>>> + ws = NULL;
>>> +out:
>>> + return ws;
>>> +}
>>
>> Please clean up the above at least.
>>
>> If I'm not mistaken, you don't need the label and the goto here.
>
> The continue is also not needed, if the test condition is inverted.

Hi, Geert

We need to locate to the last read item. What is your suggestion here?

Thanks.

>
> Gr{oetje,eeting}s,
>
> Geert
>
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- 
> ge...@linux-m68k.org
>
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like 
> that.
> -- Linus Torvalds



Re: [PATCH v2] PM / wakeup: use seq_open() to show wakeup stats

2018-04-01 Thread Ganesh Mahendran
2018-03-30 18:25 GMT+08:00 Rafael J. Wysocki <r...@rjwysocki.net>:
> On Monday, March 5, 2018 9:47:46 AM CEST Ganesh Mahendran wrote:
>> single_open() interface requires that the whole output must
>> fit into a single buffer. This will lead to timeout when
>> system memory is not in a good situation.
>>
>> This patch use seq_open() to show wakeup stats. This method
>> need only one page, so timeout will not be observed.
>>
>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>> 
>> v2: use srcu_read_lock instead of rcu_read_lock
>> ---
>>  drivers/base/power/wakeup.c | 77 
>> +++--
>>  1 file changed, 61 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
>> index ea01621..3bcab7d 100644
>> --- a/drivers/base/power/wakeup.c
>> +++ b/drivers/base/power/wakeup.c
>> @@ -1029,32 +1029,77 @@ static int print_wakeup_source_stats(struct seq_file 
>> *m,
>>   return 0;
>>  }
>>
>> -/**
>> - * wakeup_sources_stats_show - Print wakeup sources statistics information.
>> - * @m: seq_file to print the statistics into.
>> - */
>> -static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
>> +static void *wakeup_sources_stats_seq_start(struct seq_file *m,
>> + loff_t *pos)
>>  {
>>   struct wakeup_source *ws;
>> - int srcuidx;
>> + loff_t n = *pos;
>> + int *srcuidx = m->private;
>>
>> - seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
>> - "expire_count\tactive_since\ttotal_time\tmax_time\t"
>> - "last_change\tprevent_suspend_time\n");
>> + if (n == 0) {
>> + seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
>> + "expire_count\tactive_since\ttotal_time\tmax_time\t"
>> + "last_change\tprevent_suspend_time\n");
>> + }
>>
>> - srcuidx = srcu_read_lock(&wakeup_srcu);
>> - list_for_each_entry_rcu(ws, &wakeup_sources, entry)
>> - print_wakeup_source_stats(m, ws);
>> - srcu_read_unlock(&wakeup_srcu, srcuidx);
>> + *srcuidx = srcu_read_lock(&wakeup_srcu);
>> + list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
>> + if (n-- > 0)
>> + continue;
>> + goto out;
>> + }
>> + ws = NULL;
>> +out:
>> + return ws;
>> +}
>
> Please clean up the above at least.

Hi, Rafael

When the length of the "wakeup_sources" file is larger than one page,
wakeup_sources_stats_seq_start() may be called more than once if user
space wants to read the whole file. So we need to locate the last read
item when it is not the first call.

We can see the same logic in kmemleak_seq_start().
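
For reference, a condensed sketch of how seq_read() drives these
operations (simplified from fs/seq_file.c, not the actual kernel code):

	/*
	 * Each read() syscall fills at most one buffer, then calls
	 * ->stop().  The next read() calls ->start() again with the
	 * saved position, which is why ->start() must be able to
	 * re-locate the last item shown.
	 */
	loff_t pos = m->index;
	void *v = m->op->start(m, &pos);

	while (v) {
		if (m->op->show(m, v))
			break;
		if (m->count >= m->size)	/* one-page buffer is full */
			break;
		v = m->op->next(m, v, &pos);
	}
	m->op->stop(m, v);
	m->index = pos;				/* resume point for next read() */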

Thanks.

>
> If I'm not mistaken, you don't need the label and the goto here.
>



Re: [PATCH v5] ANDROID: binder: change down_write to down_read

2018-03-29 Thread Ganesh Mahendran
2018-03-30 9:29 GMT+08:00 Minchan Kim <minc...@kernel.org>:
> Hi Ganesh,
>
> On Fri, Mar 30, 2018 at 09:21:55AM +0800, Ganesh Mahendran wrote:
>> 2018-03-29 14:54 GMT+08:00 Minchan Kim <minc...@kernel.org>:
>> > binder_update_page_range needs down_write of mmap_sem because
>> > vm_insert_page need to change vma->vm_flags to VM_MIXEDMAP unless
>> > it is set. However, when I profile binder working, it seems
>> > every binder buffers should be mapped in advance by binder_mmap.
>> > It means we could set VM_MIXEDMAP in binder_mmap time which is
>> > already hold a mmap_sem as down_write so binder_update_page_range
>> > doesn't need to hold a mmap_sem as down_write.
>> >
>> > Android suffers from mmap_sem contention so let's reduce mmap_sem
>> > down_write.
>>
>> Hi, Minchan:
>>
>> It seems there is performance regression of this patch.
>
> You mean "This patch aims for solving performance regression" not "This patch
> makes performance regression"?

After applying this patch on our devices, app launch time increases
by about 15% on average.
"This patch makes performance regression", yes; from the results, that
is what we see.

I will do more testing of this patch.

>
>>
>> Do you have some test result of android app launch time or binderThroughput?
>
> Unfortunately, I don't have any number. The goal is to reduce the number of
> call mmap_sem as write-side lock because it makes priority inversion of 
> threads
> easily and that's one of clear part I spot that we don't need write-side lock.
>
> Thanks.



Re: [PATCH v5] ANDROID: binder: change down_write to down_read

2018-03-29 Thread Ganesh Mahendran
2018-03-29 14:54 GMT+08:00 Minchan Kim :
> binder_update_page_range needs down_write of mmap_sem because
> vm_insert_page need to change vma->vm_flags to VM_MIXEDMAP unless
> it is set. However, when I profile binder working, it seems
> every binder buffers should be mapped in advance by binder_mmap.
> It means we could set VM_MIXEDMAP in binder_mmap time which is
> already hold a mmap_sem as down_write so binder_update_page_range
> doesn't need to hold a mmap_sem as down_write.
>
> Android suffers from mmap_sem contention so let's reduce mmap_sem
> down_write.
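
For context, the check in vm_insert_page() (mm/memory.c, circa v4.16)
that the commit message refers to looks roughly like this; treat it as
an approximate sketch rather than a verbatim quote:

	if (!(vma->vm_flags & VM_MIXEDMAP)) {
		/* first insert must modify vm_flags, so the caller
		 * has to hold mmap_sem for write */
		BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
		BUG_ON(vma->vm_flags & VM_PFNMAP);
		vma->vm_flags |= VM_MIXEDMAP;
	}

Setting VM_MIXEDMAP up front in binder_mmap(), where the write lock is
already held, means this branch is never taken afterwards, so
binder_update_page_range() can get away with down_read().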

Hi, Minchan:

It seems there is a performance regression with this patch.

Do you have any test results for Android app launch time or BinderThroughput?

Thanks.
>
> Cc: Joe Perches 
> Cc: Arve Hjønnevåg 
> Cc: Todd Kjos 
> Cc: Greg Kroah-Hartman 
> Reviewed-by: Martijn Coenen 
> Signed-off-by: Minchan Kim 
> ---
> From v4:
>   * Fix typo and VM flags clear handling - Joe
>
> From v3:
>   * Fix typo
>
> From v2:
>   * Fix vma->flag setting - Arve
>
> From v1:
>   * remove WARN_ON_ONCE - Greg
>   * add reviewed-by - Martijn
>
> Martijn, I took your LGTM of v1 as Reviewed-by. If you don't like it
> or want to change it to acked-by, please, tell me.
>
>  drivers/android/binder.c   | 4 +++-
>  drivers/android/binder_alloc.c | 6 +++---
>  2 files changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/android/binder.c b/drivers/android/binder.c
> index 764b63a5aade..bb63e3b54e0c 100644
> --- a/drivers/android/binder.c
> +++ b/drivers/android/binder.c
> @@ -4722,7 +4722,9 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
> failure_string = "bad vm_flags";
> goto err_bad_arg;
> }
> -   vma->vm_flags = (vma->vm_flags | VM_DONTCOPY) & ~VM_MAYWRITE;
> +   vma->vm_flags |= VM_DONTCOPY | VM_MIXEDMAP;
> +   vma->vm_flags &= ~VM_MAYWRITE;
> +
> vma->vm_ops = &binder_vm_ops;
> vma->vm_private_data = proc;
>
> diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
> index 5a426c877dfb..4f382d51def1 100644
> --- a/drivers/android/binder_alloc.c
> +++ b/drivers/android/binder_alloc.c
> @@ -219,7 +219,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
> mm = alloc->vma_vm_mm;
>
> if (mm) {
> -   down_write(&mm->mmap_sem);
> +   down_read(&mm->mmap_sem);
> vma = alloc->vma;
> }
>
> @@ -288,7 +288,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
> /* vm_insert_page does not seem to increment the refcount */
> }
> if (mm) {
> -   up_write(&mm->mmap_sem);
> +   up_read(&mm->mmap_sem);
> mmput(mm);
> }
> return 0;
> @@ -321,7 +321,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
> }
>  err_no_vma:
> if (mm) {
> -   up_write(&mm->mmap_sem);
> +   up_read(&mm->mmap_sem);
> mmput(mm);
> }
> return vma ? -ENOMEM : -ESRCH;
> --
> 2.17.0.rc1.321.gba9d0f2565-goog
>



Re: [PATCH v2] PM / wakeup: use seq_open() to show wakeup stats

2018-03-29 Thread Ganesh Mahendran
ping.

2018-03-05 16:47 GMT+08:00 Ganesh Mahendran <opensource.gan...@gmail.com>:
> single_open() interface requires that the whole output must
> fit into a single buffer. This will lead to timeout when
> system memory is not in a good situation.
>
> This patch use seq_open() to show wakeup stats. This method
> need only one page, so timeout will not be observed.

We have resolved the watchdog timeout issue with this patch.
Please help review it.

Thanks.
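
For context, the failure mode with single_open() is roughly the
following (a simplified sketch of the buffer-growth loop in
fs/seq_file.c, not the actual code; show_overflows() is a stand-in for
re-running ->show() and checking whether the output fit):

	m->size = PAGE_SIZE;
	m->buf = kvmalloc(m->size, GFP_KERNEL);
	while (show_overflows(m)) {
		kvfree(m->buf);
		m->size <<= 1;		/* 1 page, 2 pages, 4 pages, ... */
		m->buf = kvmalloc(m->size, GFP_KERNEL);
		if (!m->buf)
			return -ENOMEM;
	}

Since single_open() produces the whole output from one ->show() call,
the buffer keeps doubling until everything fits, and under memory
pressure those ever larger allocations can stall long enough to trip
the watchdog. The seq_operations version quoted below streams one page
at a time instead.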

>
> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
> 
> v2: use srcu_read_lock instead of rcu_read_lock
> ---
>  drivers/base/power/wakeup.c | 77 
> +++--
>  1 file changed, 61 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
> index ea01621..3bcab7d 100644
> --- a/drivers/base/power/wakeup.c
> +++ b/drivers/base/power/wakeup.c
> @@ -1029,32 +1029,77 @@ static int print_wakeup_source_stats(struct seq_file *m,
> return 0;
>  }
>
> -/**
> - * wakeup_sources_stats_show - Print wakeup sources statistics information.
> - * @m: seq_file to print the statistics into.
> - */
> -static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
> +static void *wakeup_sources_stats_seq_start(struct seq_file *m,
> +   loff_t *pos)
>  {
> struct wakeup_source *ws;
> -   int srcuidx;
> +   loff_t n = *pos;
> +   int *srcuidx = m->private;
>
> -   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
> -   "expire_count\tactive_since\ttotal_time\tmax_time\t"
> -   "last_change\tprevent_suspend_time\n");
> +   if (n == 0) {
> +   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
> +   "expire_count\tactive_since\ttotal_time\tmax_time\t"
> +   "last_change\tprevent_suspend_time\n");
> +   }
>
> -   srcuidx = srcu_read_lock(&wakeup_srcu);
> -   list_for_each_entry_rcu(ws, &wakeup_sources, entry)
> -   print_wakeup_source_stats(m, ws);
> -   srcu_read_unlock(&wakeup_srcu, srcuidx);
> +   *srcuidx = srcu_read_lock(&wakeup_srcu);
> +   list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
> +   if (n-- > 0)
> +   continue;
> +   goto out;
> +   }
> +   ws = NULL;
> +out:
> +   return ws;
> +}
> +
> +static void *wakeup_sources_stats_seq_next(struct seq_file *m,
> +   void *v, loff_t *pos)
> +{
> +   struct wakeup_source *ws = v;
> +   struct wakeup_source *next_ws = NULL;
> +
> +   ++(*pos);
>
> -   print_wakeup_source_stats(m, &deleted_ws);
> +   list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) {
> +   next_ws = ws;
> +   break;
> +   }
> +
> +   return next_ws;
> +}
> +
> +static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
> +{
> +   int *srcuidx = m->private;
> +
> +   srcu_read_unlock(&wakeup_srcu, *srcuidx);
> +}
> +
> +/**
> + * wakeup_sources_stats_seq_show - Print wakeup sources statistics information.
> + * @m: seq_file to print the statistics into.
> + * @v: wakeup_source of each iteration
> + */
> +static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
> +{
> +   struct wakeup_source *ws = v;
> +
> +   print_wakeup_source_stats(m, ws);
>
> return 0;
>  }
>
> +static const struct seq_operations wakeup_sources_stats_seq_ops = {
> +   .start = wakeup_sources_stats_seq_start,
> +   .next  = wakeup_sources_stats_seq_next,
> +   .stop  = wakeup_sources_stats_seq_stop,
> +   .show  = wakeup_sources_stats_seq_show,
> +};
> +
>  static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
>  {
> -   return single_open(file, wakeup_sources_stats_show, NULL);
> +   return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int));
>  }
>
>  static const struct file_operations wakeup_sources_stats_fops = {
> @@ -1062,7 +1107,7 @@ static int wakeup_sources_stats_open(struct inode 
> *inode, struct file *file)
> .open = wakeup_sources_stats_open,
> .read = seq_read,
> .llseek = seq_lseek,
> -   .release = single_release,
> +   .release = seq_release_private,
>  };
>
>  static int __init wakeup_sources_debugfs_init(void)
> --
> 1.9.1
>


Re: [PATCH v8 22/24] mm: Speculative page fault handler return VMA

2018-03-28 Thread Ganesh Mahendran
Hi, Laurent

2018-02-16 23:25 GMT+08:00 Laurent Dufour :
> When the speculative page fault handler returns VM_FAULT_RETRY, there is a
> chance that the VMA fetched without grabbing the mmap_sem can be reused by
> the legacy page fault handler.  By reusing it, we avoid calling find_vma()
> again. To achieve that, we must ensure that the VMA structure will not be
> freed behind our back. This is done by taking a reference on it (get_vma())
> and by assuming that the caller will call the new service
> can_reuse_spf_vma() once it has grabbed the mmap_sem.
>
> can_reuse_spf_vma() first checks that the VMA is still in the RB tree,
> then that the VMA's boundaries match the passed address, and releases the
> reference on the VMA so that it can be freed if needed.
>
> In the case the VMA is freed, can_reuse_spf_vma() will have returned false
> as the VMA is no longer in the RB tree.
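
(A simplified sketch of the fault-handler pattern described above; the
exact names and signatures in the series may differ slightly:)

	/* Try the speculative path first; on VM_FAULT_RETRY, fall back
	 * to the classic path under mmap_sem, reusing the speculatively
	 * fetched VMA only if can_reuse_spf_vma() says it is still
	 * valid. */
	fault = handle_speculative_fault(mm, address, flags, &vma);
	if (fault != VM_FAULT_RETRY)
		goto done;

	down_read(&mm->mmap_sem);
	if (!vma || !can_reuse_spf_vma(vma, address))
		vma = find_vma(mm, address);
	fault = handle_mm_fault(vma, address, flags);
	/* ... classic fault handling continues ... */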

when I applied this patch to arm64, I got a crash:

[6.088296] Unable to handle kernel NULL pointer dereference at
virtual address 
[6.088307] pgd = ff9d67735000
[6.088313] [] *pgd=0001795e3003,
*pud=0001795e3003, *pmd=
[6.088372] [ cut here ]
[6.088377] Kernel BUG at ff9d64f65960 [verbose debug info unavailable]
[6.088384] Internal error: Oops - BUG: 9645 [#1] PREEMPT SMP
[6.088389] BUG: Bad rss-counter state mm:ffe8f3861040 idx:0 val:90
[6.088393] BUG: Bad rss-counter state mm:ffe8f3861040 idx:1 val:58
[6.088398] Modules linked in:
[6.088408] CPU: 1 PID: 621 Comm: qseecomd Not tainted 4.4.78-perf+ #88
[6.088413] Hardware name: Qualcomm Technologies, Inc. SDM 636
PM660 + PM660L MTP E7S (DT)
[6.088419] task: ffe8f6208000 ti: ffe872a8c000 task.ti:
ffe872a8c000
[6.088432] PC is at __rb_erase_color+0x108/0x240
[6.088441] LR is at vma_interval_tree_remove+0x244/0x24c
[6.088447] pc : [] lr : []
pstate: 604001c5
[6.088451] sp : ffe872a8fa50
[6.088455] x29: ffe872a8fa50 x28: 0008
[6.088462] x27: 0009 x26: 
[6.088470] x25: ffe8f458fb80 x24: 00768ff87000
[6.088477] x23:  x22: 
[6.088484] x21: ff9d64d9be7c x20: ffe8f3ff0680
[6.088492] x19: ffe8f212e9b0 x18: 0074
[6.088499] x17: 0007 x16: 000e
[6.088507] x15: ff9d65c88000 x14: 0001
[6.088514] x13: 00192d76 x12: 00989680
[6.088521] x11: 001f x10: ff9d661ded1b
[6.088528] x9 : 007691759000 x8 : 07691759
[6.088535] x7 :  x6 : ffe871ebada8
[6.088541] x5 : 00e1 x4 : ffe8f212e958
[6.088548] x3 : 00e9 x2 : 
[6.088555] x1 : ffe8f212f110 x0 : ffe8f212e9b1
[6.088564]
[6.088564] PC: 0xff9d64f65920:
[6.088568] 5920  f902 aa0103e0 aa1603e1 d63f02a0 aa1603e1
f9400822 f9000662 f9000833
[6.088590] 5940  143b f9400a61 f9400020 370002c0 f9400436
b2400260 f9000a76 f9000433
[6.088610] 5960  f90002c0 f9400260 f920 f9000261 f27ef400
54000100 f9400802 eb13005f
[6.088630] 5980  5461 f9000801 1404 f9000401 1402
f9000281 aa1303e0 d63f02a0
[6.088652]
[6.088652] LR: 0xff9d64d9c298:
[6.088656] c298  f9403083 b483 f9400c63 eb03005f 9a832042
f9403883 eb02007f 54a0
[6.088676] c2b8  f9003882 f9402c82 927ef442 b5fffd22 b480
f0e2 9139f042 94072561
[6.088695] c2d8  a8c17bfd d65f03c0 a9bf7bfd 910003fd f943
d280 b4e3 f9400c65
[6.088715] c2f8  d1016063 eb0100bf 5463 aa0303e0 97fffef2
a8c17bfd d65f03c0 a9bf7bfd
[6.088735]
[6.088735] SP: 0xffe872a8fa10:
[6.088740] fa10  64d9c2d8 ff9d 72a8fa50 ffe8 64f65960
ff9d 604001c5 
[6.088759] fa30  71d67d70 ffe8 71c281e8 ffe8 
0080 64daa90c ff9d
[6.088779] fa50  72a8fa90 ffe8 64d9c2d8 ff9d 71ebada8
ffe8 f3ff0678 ffe8
[6.088799] fa70  72a8fb80 ffe8   
 0001 
[6.088818]
[6.088823] Process qseecomd (pid: 621, stack limit = 0xffe872a8c028)
[6.088828] Call trace:
[6.088834] Exception stack(0xffe872a8f860 to 0xffe872a8f990)
[6.088841] f860: ffe8f212e9b0 0080
82b37000 ff9d64f65960
[6.088848] f880: 604001c5 ff9d672c8680
ff9d672c9c00 ff9d672d3ab7
[6.088855] f8a0: ffe872a8f8f0 ff9d64db9bfc
 ffe8f9402c00
[6.088861] f8c0: ffe872a8c000 
ffe872a8f920 ff9d64db9bfc
[6.088867] f8e0:  ffe8f9402b00
ffe872a8fa10 ff9d64dba568
[6.088874] f900: ffbe61c759c0 ffe871d67d70
ffe8f9402c00 1de56fb006cba396
[6.01] f920: ffe8f212e9b1 ffe8f212f110
 

Re: [PATCH v9 00/24] Speculative page faults

2018-03-21 Thread Ganesh Mahendran
Hi, Laurent

2018-03-14 1:59 GMT+08:00 Laurent Dufour :
> This is a port on kernel 4.16 of the work done by Peter Zijlstra to
> handle page fault without holding the mm semaphore [1].
>
> The idea is to try to handle user space page faults without holding the
> mmap_sem. This should allow better concurrency for massively threaded
> processes, since the page fault handler will not wait for other threads'
> memory layout changes to be done, assuming that the change is done in
> another part of the process's memory space. This type of page fault is
> named a speculative page fault. If the speculative page fault fails
> because concurrency is detected or because the underlying PMD or PTE
> tables are not yet allocated, it abandons its processing and a classic
> page fault is then tried.
>
> The speculative page fault (SPF) has to look for the VMA matching the
> fault address without holding the mmap_sem; this is done by introducing
> a rwlock which protects access to the mm_rb tree. Previously this was
> done using SRCU, but it was introducing a lot of scheduling to process
> the VMAs' freeing operation, which was hitting the performance by 20% as
> reported by Kemi Wang [2]. Using a rwlock to protect access to the mm_rb
> tree limits the locking contention to these operations, which are
> expected to be of O(log n) order. In addition, to ensure that the VMA is
> not freed behind our back, a reference count is added and 2 services
> (get_vma() and put_vma()) are introduced to handle the reference count.
> When a VMA is fetched from the RB tree using get_vma(), it must later be
> freed using put_vma(). Furthermore, to allow the VMA to be used again by
> the classic page fault handler, a service is introduced:
> can_reuse_spf_vma(). This service is expected to be called with the
> mmap_sem held. It checks that the VMA still matches the specified
> address and releases its reference count; as the mmap_sem is held, it is
> ensured that the VMA will not be freed behind our back. In general, the
> VMA's reference count can be decremented while holding the mmap_sem, but
> it should not be increased, as holding the mmap_sem ensures that the VMA
> is stable. I can't see the overhead I got with the will-it-scale
> benchmark anymore.
>
> The VMA's attributes checked during the speculative page fault processing
> have to be protected against parallel changes. This is done by using a
> per-VMA sequence lock. This sequence lock allows the speculative page
> fault handler to quickly check for parallel changes in progress and to
> abort the speculative page fault in that case.
>
> Once the VMA is found, the speculative page fault handler checks the
> VMA's attributes to verify whether the page fault can be handled
> correctly or not. Thus the VMA is protected through a sequence lock
> which allows fast detection of concurrent VMA changes. If such a change
> is detected, the speculative page fault is aborted and a *classic* page
> fault is tried.  VMA sequence locking is taken when the VMA attributes
> which are checked during the page fault are modified.
>
> When the PTE is fetched, the VMA is checked to see if it has been
> changed, so once the page table is locked, the VMA is valid; any other
> change touching this PTE will need to lock the page table, so no
> parallel change is possible at this time.
>
> The locking of the PTE is done with interrupts disabled; this allows
> checking the PMD to ensure that there is not an ongoing collapsing
> operation. Since khugepaged first sets the PMD to pmd_none and then
> waits for the other CPUs to have caught the IPI interrupt, if the PMD is
> valid at the time the PTE is locked, we have the guarantee that the
> collapsing operation will have to wait on the PTE lock to move forward.
> This allows the SPF handler to map the PTE safely. If the PMD value is
> different from the one recorded at the beginning of the SPF operation,
> the classic page fault handler will be called to handle the operation
> while holding the mmap_sem. As the PTE lock is taken with interrupts
> disabled, the lock is taken using spin_trylock() to avoid deadlock when
> handling a page fault while a TLB invalidate is requested by another CPU
> holding the PTE.
>
> Support for THP is not done, because when checking for the PMD, we can
> be confused by an in-progress collapsing operation done by khugepaged.
> The issue is that pmd_none() could be true either if the PMD is not
> already populated or if the underlying PTEs are about to be collapsed.
> So we cannot safely allocate a PMD if pmd_none() is true.
>
> This series adds a new software performance event named
> 'speculative-faults' or 'spf'. It counts the number of page fault events
> successfully handled in a speculative way. When recording 'faults,spf'
> events, the 'faults' one counts the total number of page fault events
> while 'spf' only counts the part of the faults processed in a
> speculative way.
>
> There are some 
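
(The per-VMA sequence count mentioned above follows the usual seqcount
pattern; a rough sketch, assuming a seqcount_t vm_sequence field in
struct vm_area_struct rather than the series' exact helper names:)

	/* Writer side: wrap VMA attribute updates in the sequence count
	 * so speculative readers can detect them. */
	write_seqcount_begin(&vma->vm_sequence);
	vma->vm_flags = new_flags;            /* attribute change */
	write_seqcount_end(&vma->vm_sequence);

	/* Speculative reader side: sample, read, then re-check. */
	seq = read_seqcount_begin(&vma->vm_sequence);
	/* ... read vm_flags, vm_start, vm_end, vm_page_prot ... */
	if (read_seqcount_retry(&vma->vm_sequence, seq))
		goto fallback;        /* concurrent change: classic fault */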

Re: [PATCH v2] PM / wakeup: use seq_open() to show wakeup stats

2018-03-13 Thread Ganesh Mahendran
Hi, Andy

2018-03-14 0:39 GMT+08:00 Andy Shevchenko <andy.shevche...@gmail.com>:
> On Mon, Mar 5, 2018 at 10:47 AM, Ganesh Mahendran
> <opensource.gan...@gmail.com> wrote:
>> The single_open() interface requires that the whole output fit into a
>> single buffer. This can lead to a timeout when system memory is under
>> pressure.
>>
>> This patch uses seq_open() to show the wakeup stats instead. This method
>> needs only one page, so the timeout is not observed.
>
>> +   if (n == 0) {
>> +   seq_puts(m, 
>> "name\t\tactive_count\tevent_count\twakeup_count\t"
>> +   "expire_count\tactive_since\ttotal_time\tmax_time\t"
>> +   "last_change\tprevent_suspend_time\n");
>> +   }
>
> Can't you do this once at ->open() stage, for example?

We can not put this at ->open(), because at seq_open() time the buffer is
not ready; the seq buffer is allocated in seq_read().
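
(A trimmed sketch of the 4.x fs/seq_file.c read path, for reference; not
the exact code:)

	ssize_t seq_read(struct file *file, char __user *buf,
			 size_t size, loff_t *ppos)
	{
		struct seq_file *m = file->private_data;

		/* The one-page output buffer is allocated lazily on the
		 * first read(), so it does not exist yet at ->open()
		 * time. */
		if (!m->buf) {
			m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
			if (!m->buf)
				return -ENOMEM;
		}
		/* ... ->start()/->show()/->next()/->stop() iteration ... */
	}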

Thanks.

>
>>  static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
>>  {
>> -   return single_open(file, wakeup_sources_stats_show, NULL);
>> +   return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int));
>>  }
>
> --
> With Best Regards,
> Andy Shevchenko


Re: [PATCH v2] PM / wakeup: use seq_open() to show wakeup stats

2018-03-11 Thread Ganesh Mahendran
Hello, Rafael:

2018-03-05 16:47 GMT+08:00 Ganesh Mahendran <opensource.gan...@gmail.com>:
> The single_open() interface requires that the whole output fit into a
> single buffer. This can lead to a timeout when system memory is under
> pressure.
>
> This patch uses seq_open() to show the wakeup stats instead. This method
> needs only one page, so the timeout is not observed.
>
> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
> 
> v2: use srcu_read_lock instead of rcu_read_lock

How about the v2 patch?
If you have any other concerns, please let me know.

Thanks.

> ---
>  drivers/base/power/wakeup.c | 77 
> +++--
>  1 file changed, 61 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
> index ea01621..3bcab7d 100644
> --- a/drivers/base/power/wakeup.c
> +++ b/drivers/base/power/wakeup.c
> @@ -1029,32 +1029,77 @@ static int print_wakeup_source_stats(struct seq_file 
> *m,
> return 0;
>  }
>
> -/**
> - * wakeup_sources_stats_show - Print wakeup sources statistics information.
> - * @m: seq_file to print the statistics into.
> - */
> -static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
> +static void *wakeup_sources_stats_seq_start(struct seq_file *m,
> +   loff_t *pos)
>  {
> struct wakeup_source *ws;
> -   int srcuidx;
> +   loff_t n = *pos;
> +   int *srcuidx = m->private;
>
> -   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
> -   "expire_count\tactive_since\ttotal_time\tmax_time\t"
> -   "last_change\tprevent_suspend_time\n");
> +   if (n == 0) {
> +   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
> +   "expire_count\tactive_since\ttotal_time\tmax_time\t"
> +   "last_change\tprevent_suspend_time\n");
> +   }
>
> -   srcuidx = srcu_read_lock(&wakeup_srcu);
> -   list_for_each_entry_rcu(ws, &wakeup_sources, entry)
> -   print_wakeup_source_stats(m, ws);
> -   srcu_read_unlock(&wakeup_srcu, srcuidx);
> +   *srcuidx = srcu_read_lock(&wakeup_srcu);
> +   list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
> +   if (n-- > 0)
> +   continue;
> +   goto out;
> +   }
> +   ws = NULL;
> +out:
> +   return ws;
> +}
> +
> +static void *wakeup_sources_stats_seq_next(struct seq_file *m,
> +   void *v, loff_t *pos)
> +{
> +   struct wakeup_source *ws = v;
> +   struct wakeup_source *next_ws = NULL;
> +
> +   ++(*pos);
>
> -   print_wakeup_source_stats(m, &deleted_ws);
> +   list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) {
> +   next_ws = ws;
> +   break;
> +   }
> +
> +   return next_ws;
> +}
> +
> +static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
> +{
> +   int *srcuidx = m->private;
> +
> +   srcu_read_unlock(&wakeup_srcu, *srcuidx);
> +}
> +
> +/**
> + * wakeup_sources_stats_seq_show - Print wakeup sources statistics 
> information.
> + * @m: seq_file to print the statistics into.
> + * @v: wakeup_source of each iteration
> + */
> +static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
> +{
> +   struct wakeup_source *ws = v;
> +
> +   print_wakeup_source_stats(m, ws);
>
> return 0;
>  }
>
> +static const struct seq_operations wakeup_sources_stats_seq_ops = {
> +   .start = wakeup_sources_stats_seq_start,
> +   .next  = wakeup_sources_stats_seq_next,
> +   .stop  = wakeup_sources_stats_seq_stop,
> +   .show  = wakeup_sources_stats_seq_show,
> +};
> +
>  static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
>  {
> -   return single_open(file, wakeup_sources_stats_show, NULL);
> +   return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int));
>  }
>
>  static const struct file_operations wakeup_sources_stats_fops = {
> @@ -1062,7 +1107,7 @@ static int wakeup_sources_stats_open(struct inode 
> *inode, struct file *file)
> .open = wakeup_sources_stats_open,
> .read = seq_read,
> .llseek = seq_lseek,
> -   .release = single_release,
> +   .release = seq_release_private,
>  };
>
>  static int __init wakeup_sources_debugfs_init(void)
> --
> 1.9.1
>


[PATCH v2] PM / wakeup: use seq_open() to show wakeup stats

2018-03-05 Thread Ganesh Mahendran
The single_open() interface requires that the whole output fit into a
single buffer. This can lead to a timeout when system memory is under
pressure.

This patch uses seq_open() to show the wakeup stats instead. This method
needs only one page, so the timeout is not observed.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>

v2: use srcu_read_lock instead of rcu_read_lock
---
 drivers/base/power/wakeup.c | 77 +++--
 1 file changed, 61 insertions(+), 16 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index ea01621..3bcab7d 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -1029,32 +1029,77 @@ static int print_wakeup_source_stats(struct seq_file *m,
return 0;
 }
 
-/**
- * wakeup_sources_stats_show - Print wakeup sources statistics information.
- * @m: seq_file to print the statistics into.
- */
-static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
+static void *wakeup_sources_stats_seq_start(struct seq_file *m,
+   loff_t *pos)
 {
struct wakeup_source *ws;
-   int srcuidx;
+   loff_t n = *pos;
+   int *srcuidx = m->private;
 
-   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
-   "expire_count\tactive_since\ttotal_time\tmax_time\t"
-   "last_change\tprevent_suspend_time\n");
+   if (n == 0) {
+   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+   "expire_count\tactive_since\ttotal_time\tmax_time\t"
+   "last_change\tprevent_suspend_time\n");
+   }
 
-   srcuidx = srcu_read_lock(&wakeup_srcu);
-   list_for_each_entry_rcu(ws, &wakeup_sources, entry)
-   print_wakeup_source_stats(m, ws);
-   srcu_read_unlock(&wakeup_srcu, srcuidx);
+   *srcuidx = srcu_read_lock(&wakeup_srcu);
+   list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+   if (n-- > 0)
+   continue;
+   goto out;
+   }
+   ws = NULL;
+out:
+   return ws;
+}
+
+static void *wakeup_sources_stats_seq_next(struct seq_file *m,
+   void *v, loff_t *pos)
+{
+   struct wakeup_source *ws = v;
+   struct wakeup_source *next_ws = NULL;
+
+   ++(*pos);
 
-   print_wakeup_source_stats(m, &deleted_ws);
+   list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) {
+   next_ws = ws;
+   break;
+   }
+
+   return next_ws;
+}
+
+static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
+{
+   int *srcuidx = m->private;
+
+   srcu_read_unlock(&wakeup_srcu, *srcuidx);
+}
+
+/**
+ * wakeup_sources_stats_seq_show - Print wakeup sources statistics information.
+ * @m: seq_file to print the statistics into.
+ * @v: wakeup_source of each iteration
+ */
+static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
+{
+   struct wakeup_source *ws = v;
+
+   print_wakeup_source_stats(m, ws);
 
return 0;
 }
 
+static const struct seq_operations wakeup_sources_stats_seq_ops = {
+   .start = wakeup_sources_stats_seq_start,
+   .next  = wakeup_sources_stats_seq_next,
+   .stop  = wakeup_sources_stats_seq_stop,
+   .show  = wakeup_sources_stats_seq_show,
+};
+
 static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
 {
-   return single_open(file, wakeup_sources_stats_show, NULL);
+   return seq_open_private(file, &wakeup_sources_stats_seq_ops, sizeof(int));
 }
 
 static const struct file_operations wakeup_sources_stats_fops = {
@@ -1062,7 +1107,7 @@ static int wakeup_sources_stats_open(struct inode *inode, 
struct file *file)
.open = wakeup_sources_stats_open,
.read = seq_read,
.llseek = seq_lseek,
-   .release = single_release,
+   .release = seq_release_private,
 };
 
 static int __init wakeup_sources_debugfs_init(void)
-- 
1.9.1



Re: [PATCH] PM / wakeup: use seq_open() to show wakeup stats

2018-03-02 Thread Ganesh Mahendran
Hi, Rafael:

2018-03-02 16:58 GMT+08:00 Rafael J. Wysocki <raf...@kernel.org>:
> On Fri, Mar 2, 2018 at 6:01 AM, Ganesh Mahendran
> <opensource.gan...@gmail.com> wrote:
>> The single_open() interface requires that the whole output fit into a
>> single buffer. This can lead to a timeout when system memory is under
>> pressure.
>
> Did you actually see this problem with this particular file or is it
> theoretical?

We got reports of Android watchdog timeouts when the memory situation
is bad.

>
>> This patch uses seq_open() to show the wakeup stats instead. This method
>> needs only one page, so the timeout is not observed.
>>
>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>> ---
>>  drivers/base/power/wakeup.c | 71 
>> +++--
>>  1 file changed, 56 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
>> index ea01621..c64609a 100644
>> --- a/drivers/base/power/wakeup.c
>> +++ b/drivers/base/power/wakeup.c
>> @@ -1029,32 +1029,73 @@ static int print_wakeup_source_stats(struct seq_file 
>> *m,
>> return 0;
>>  }
>>
>> +static void *wakeup_sources_stats_seq_start(struct seq_file *m,
>> +   loff_t *pos)
>> +{
>> +   struct wakeup_source *ws;
>> +   loff_t n = *pos;
>> +
>> +   if (n == 0) {
>> +   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
>> +   "expire_count\tactive_since\ttotal_time\tmax_time\t"
>> +   "last_change\tprevent_suspend_time\n");
>> +   }
>> +
>> +   rcu_read_lock();
>
> The code running after this cannot sleep.  Use
> srcu_read_lock(&wakeup_srcu) instead.

wakeup_sources_stats_seq_[start | stop] are called in seq_read(),
so rcu_read_unlock() will soon be called in seq_read().

I am not familiar with RCU. I referred to kmemleak.c, which uses
seq_open() to show the stats.

Thanks for your review.
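
(The underlying issue: the wakeup source list is synchronized with SRCU,
not plain RCU. A trimmed sketch of the removal path in
drivers/base/power/wakeup.c:)

	void wakeup_source_remove(struct wakeup_source *ws)
	{
		/* ... locking trimmed ... */
		list_del_rcu(&ws->entry);

		/* This waits only for srcu_read_lock(&wakeup_srcu)
		 * readers, so a plain rcu_read_lock() section gives no
		 * protection against a concurrent removal. */
		synchronize_srcu(&wakeup_srcu);
	}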

>
>> +   list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
>> +   if (n-- > 0)
>> +   continue;
>> +   goto out;
>> +   }
>> +   ws = NULL;
>> +out:
>> +   return ws;
>> +}
>> +
>> +static void *wakeup_sources_stats_seq_next(struct seq_file *m,
>> +   void *v, loff_t *pos)
>> +{
>> +   struct wakeup_source *ws = v;
>> +   struct wakeup_source *next_ws = NULL;
>> +
>> +   ++(*pos);
>> +
>> +   list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) {
>> +   next_ws = ws;
>> +   break;
>> +   }
>> +
>> +   return next_ws;
>> +}
>> +
>> +static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
>> +{
>> +   rcu_read_unlock();
>> +}
>> +
>>  /**
>>   * wakeup_sources_stats_show - Print wakeup sources statistics information.
>>   * @m: seq_file to print the statistics into.
>>   */
>> -static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
>> +static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
>>  {
>> -   struct wakeup_source *ws;
>> -   int srcuidx;
>> +   struct wakeup_source *ws = v;
>>
>> -   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
>> -   "expire_count\tactive_since\ttotal_time\tmax_time\t"
>> -   "last_change\tprevent_suspend_time\n");
>> -
>> -   srcuidx = srcu_read_lock(&wakeup_srcu);
>> -   list_for_each_entry_rcu(ws, &wakeup_sources, entry)
>> -   print_wakeup_source_stats(m, ws);
>> -   srcu_read_unlock(&wakeup_srcu, srcuidx);
>> -
>> -   print_wakeup_source_stats(m, &deleted_ws);
>> +   print_wakeup_source_stats(m, ws);
>>
>> return 0;
>>  }
>>
>> +static const struct seq_operations wakeup_sources_stats_seq_ops = {
>> +   .start = wakeup_sources_stats_seq_start,
>> +   .next  = wakeup_sources_stats_seq_next,
>> +   .stop  = wakeup_sources_stats_seq_stop,
>> +   .show  = wakeup_sources_stats_seq_show,
>> +};
>> +
>>  static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
>>  {
>> -   return single_open(file, wakeup_sources_stats_show, NULL);
>> +   return seq_open(file, &wakeup_sources_stats_seq_ops);
>>  }
>>
>>  static const struct file_operations wakeup_sources_stats_fops = {
>> @@ -1062,7 +1103,7 @@ static int wakeup_sources_stats_open(struct inode 
>> *inode, struct file *file)
>> .open = wakeup_sources_stats_open,
>> .read = seq_read,
>> .llseek = seq_lseek,
>> -   .release = single_release,
>> +   .release = seq_release,
>>  };
>>
>>  static int __init wakeup_sources_debugfs_init(void)
>> --
>> 1.9.1
>>


Re: [PATCH] PM / wakeup: use seq_open() to show wakeup stats

2018-03-02 Thread Ganesh Mahendran
Hi, Rafael:

2018-03-02 16:58 GMT+08:00 Rafael J. Wysocki :
> On Fri, Mar 2, 2018 at 6:01 AM, Ganesh Mahendran
>  wrote:
>> single_open() interface requires that the whole output must
>> fit into a single buffer. This will lead to timeout when
>> system memory is not in a good situation.
>
> Did you actually see this problem with this particular file or is it
> theoretical?

We got report of android watchdog timeout when memory situation
is bad.

>
>> This patch use seq_open() to show wakeup stats. This method
>> need only one page, so timeout will not be observed.
>>
>> Signed-off-by: Ganesh Mahendran 
>> ---
>>  drivers/base/power/wakeup.c | 71 
>> +++--
>>  1 file changed, 56 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
>> index ea01621..c64609a 100644
>> --- a/drivers/base/power/wakeup.c
>> +++ b/drivers/base/power/wakeup.c
>> @@ -1029,32 +1029,73 @@ static int print_wakeup_source_stats(struct seq_file 
>> *m,
>> return 0;
>>  }
>>
>> +static void *wakeup_sources_stats_seq_start(struct seq_file *m,
>> +   loff_t *pos)
>> +{
>> +   struct wakeup_source *ws;
>> +   loff_t n = *pos;
>> +
>> +   if (n == 0) {
>> +   seq_puts(m, 
>> "name\t\tactive_count\tevent_count\twakeup_count\t"
>> +   "expire_count\tactive_since\ttotal_time\tmax_time\t"
>> +   "last_change\tprevent_suspend_time\n");
>> +   }
>> +
>> +   rcu_read_lock();
>
> The code running after this cannot sleep.  Use
> srcu_read_lock(_srcu) instead.

wakeup_sources_stats_seq_[start | end] are called in seq_read().
So rcu_read_unlock() will soon be called  in seq_read().

I am not familar with rcu. I refered to kmemleak.c which use seq_open()
to show the stats.

Thanks for your review.

>
>> +   list_for_each_entry_rcu(ws, _sources, entry) {
>> +   if (n-- > 0)
>> +   continue;
>> +   goto out;
>> +   }
>> +   ws = NULL;
>> +out:
>> +   return ws;
>> +}
>> +
>> +static void *wakeup_sources_stats_seq_next(struct seq_file *m,
>> +   void *v, loff_t *pos)
>> +{
>> +   struct wakeup_source *ws = v;
>> +   struct wakeup_source *next_ws = NULL;
>> +
>> +   ++(*pos);
>> +
>> +   list_for_each_entry_continue_rcu(ws, _sources, entry) {
>> +   next_ws = ws;
>> +   break;
>> +   }
>> +
>> +   return next_ws;
>> +}
>> +
>> +static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
>> +{
>> +   rcu_read_unlock();
>> +}
>> +
>>  /**
>>   * wakeup_sources_stats_show - Print wakeup sources statistics information.
>>   * @m: seq_file to print the statistics into.
>>   */
>> -static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
>> +static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
>>  {
>> -   struct wakeup_source *ws;
>> -   int srcuidx;
>> +   struct wakeup_source *ws = v;
>>
>> -   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
>> -   "expire_count\tactive_since\ttotal_time\tmax_time\t"
>> -   "last_change\tprevent_suspend_time\n");
>> -
>> -   srcuidx = srcu_read_lock(_srcu);
>> -   list_for_each_entry_rcu(ws, _sources, entry)
>> -   print_wakeup_source_stats(m, ws);
>> -   srcu_read_unlock(_srcu, srcuidx);
>> -
>> -   print_wakeup_source_stats(m, _ws);
>> +   print_wakeup_source_stats(m, ws);
>>
>> return 0;
>>  }
>>
>> +static const struct seq_operations wakeup_sources_stats_seq_ops = {
>> +   .start = wakeup_sources_stats_seq_start,
>> +   .next  = wakeup_sources_stats_seq_next,
>> +   .stop  = wakeup_sources_stats_seq_stop,
>> +   .show  = wakeup_sources_stats_seq_show,
>> +};
>> +
>>  static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
>>  {
>> -   return single_open(file, wakeup_sources_stats_show, NULL);
>> +   return seq_open(file, &wakeup_sources_stats_seq_ops);
>>  }
>>
>>  static const struct file_operations wakeup_sources_stats_fops = {
>> @@ -1062,7 +1103,7 @@ static int wakeup_sources_stats_open(struct inode 
>> *inode, struct file *file)
>> .open = wakeup_sources_stats_open,
>> .read = seq_read,
>> .llseek = seq_lseek,
>> -   .release = single_release,
>> +   .release = seq_release,
>>  };
>>
>>  static int __init wakeup_sources_debugfs_init(void)
>> --
>> 1.9.1
>>


[PATCH] PM / wakeup: use seq_open() to show wakeup stats

2018-03-01 Thread Ganesh Mahendran
The single_open() interface requires that the whole output must
fit into a single buffer. This will lead to a timeout when
system memory is not in a good state.

This patch uses seq_open() to show the wakeup stats. This method
needs only one page, so the timeout will not be observed.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
 drivers/base/power/wakeup.c | 71 +++--
 1 file changed, 56 insertions(+), 15 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index ea01621..c64609a 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -1029,32 +1029,73 @@ static int print_wakeup_source_stats(struct seq_file *m,
return 0;
 }
 
+static void *wakeup_sources_stats_seq_start(struct seq_file *m,
+   loff_t *pos)
+{
+   struct wakeup_source *ws;
+   loff_t n = *pos;
+
+   if (n == 0) {
+   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+   "expire_count\tactive_since\ttotal_time\tmax_time\t"
+   "last_change\tprevent_suspend_time\n");
+   }
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+   if (n-- > 0)
+   continue;
+   goto out;
+   }
+   ws = NULL;
+out:
+   return ws;
+}
+
+static void *wakeup_sources_stats_seq_next(struct seq_file *m,
+   void *v, loff_t *pos)
+{
+   struct wakeup_source *ws = v;
+   struct wakeup_source *next_ws = NULL;
+
+   ++(*pos);
+
+   list_for_each_entry_continue_rcu(ws, &wakeup_sources, entry) {
+   next_ws = ws;
+   break;
+   }
+
+   return next_ws;
+}
+
+static void wakeup_sources_stats_seq_stop(struct seq_file *m, void *v)
+{
+   rcu_read_unlock();
+}
+
 /**
  * wakeup_sources_stats_show - Print wakeup sources statistics information.
  * @m: seq_file to print the statistics into.
  */
-static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
+static int wakeup_sources_stats_seq_show(struct seq_file *m, void *v)
 {
-   struct wakeup_source *ws;
-   int srcuidx;
+   struct wakeup_source *ws = v;
 
-   seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
-   "expire_count\tactive_since\ttotal_time\tmax_time\t"
-   "last_change\tprevent_suspend_time\n");
-
-   srcuidx = srcu_read_lock(&wakeup_srcu);
-   list_for_each_entry_rcu(ws, &wakeup_sources, entry)
-   print_wakeup_source_stats(m, ws);
-   srcu_read_unlock(&wakeup_srcu, srcuidx);
-
-   print_wakeup_source_stats(m, &deleted_ws);
+   print_wakeup_source_stats(m, ws);
 
return 0;
 }
 
+static const struct seq_operations wakeup_sources_stats_seq_ops = {
+   .start = wakeup_sources_stats_seq_start,
+   .next  = wakeup_sources_stats_seq_next,
+   .stop  = wakeup_sources_stats_seq_stop,
+   .show  = wakeup_sources_stats_seq_show,
+};
+
 static int wakeup_sources_stats_open(struct inode *inode, struct file *file)
 {
-   return single_open(file, wakeup_sources_stats_show, NULL);
+   return seq_open(file, &wakeup_sources_stats_seq_ops);
 }
 
 static const struct file_operations wakeup_sources_stats_fops = {
@@ -1062,7 +1103,7 @@ static int wakeup_sources_stats_open(struct inode *inode, 
struct file *file)
.open = wakeup_sources_stats_open,
.read = seq_read,
.llseek = seq_lseek,
-   .release = single_release,
+   .release = seq_release,
 };
 
 static int __init wakeup_sources_debugfs_init(void)
-- 
1.9.1



Re: [PATCH] scsi_lib: increase {host|target|device}_busy count after dispatch cmd

2018-03-01 Thread Ganesh Mahendran
Hi, Bart:

2018-03-02 7:11 GMT+08:00 Bart Van Assche <bart.vanass...@wdc.com>:
> On Mon, 2017-06-05 at 17:37 +0800, Ganesh Mahendran wrote:
>> In an Android system, there are lots of threads running. Thread A,
>> holding the *host_busy* count, is easily preempted, and if at the
>> same time thread B sets *host_blocked*, then all other threads will
>> be I/O blocked.
>
> Hello Ganesh,
>
> Have you considered to insert preempt_disable() and preempt_enable() calls
> where necessary to achieve the same effect? I think that would result in a
> much less intrusive patch.

Yes, preempt_disable()/preempt_enable() will also achieve the same effect.
But I just think preempt_disable()/preempt_enable() may be a little heavy for
this problem, which can be fixed by increasing the {host|target|device}_busy
count after dispatching the cmd.

Thanks.

>
> Thanks,
>
> Bart.
>
>


Re: [PATCH v3] android: binder: use VM_ALLOC to get vm area

2018-01-24 Thread Ganesh Mahendran
Hi, Martijn

2018-01-24 22:33 GMT+08:00 Martijn Coenen :
> On Mon, Jan 22, 2018 at 4:54 PM, Greg KH  wrote:
>> Martijn and Todd, any objections to this patch?
>
> Looks good to me.

Thanks for your review.

Should I cherry-pick this change to aosp kernel 3.10/3.18/4.4/4.9 now?

Thanks.

>
>>
>> thanks,
>>
>> greg k-h


Re: [PATCH v3] android: binder: use VM_ALLOC to get vm area

2018-01-24 Thread Ganesh Mahendran
Hi, Todd:

2018-01-23 1:02 GMT+08:00 Todd Kjos <tk...@google.com>:
> On Mon, Jan 22, 2018 at 7:54 AM, Greg KH <gre...@linuxfoundation.org> wrote:
>> On Wed, Jan 10, 2018 at 10:49:05AM +0800, Ganesh Mahendran wrote:
>>> VM_IOREMAP is used to access hardware through a mechanism called
>>> I/O mapped memory. Android binder is an IPC mechanism which will
>>> not access I/O memory.
>>>
>>> And VM_IOREMAP has an alignment requirement which may not be needed
>>> in binder.
>>> __get_vm_area_node()
>>> {
>>> ...
>>> if (flags & VM_IOREMAP)
>>> align = 1ul << clamp_t(int, fls_long(size),
>>>PAGE_SHIFT, IOREMAP_MAX_ORDER);
>>> ...
>>> }
>>>
>>> This patch will save some kernel vm area, especially for 32bit os.
>>>
>>> In a 32bit OS, the kernel vm area is only 240MB. We may get the below
>>> error when launching an app:
>>>
>>> <3>[ 4482.440053] binder_alloc: binder_alloc_mmap_handler: 15728 
>>> 8ce67000-8cf65000 get_vm_area failed -12
>>> <3>[ 4483.218817] binder_alloc: binder_alloc_mmap_handler: 15745 
>>> 8ce67000-8cf65000 get_vm_area failed -12
>>>
>>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>>> 
>>> V3: update comments
>>> V2: update comments
>>> ---
>>>  drivers/android/binder_alloc.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> Martijn and Todd, any objections to this patch?
>
> Looks fine to me. Arve, do you remember the rationale for using VM_IOREMAP?

Thanks for your review.

>
>>
>> thanks,
>>
>> greg k-h


Re: [PATCH v3] android: binder: use VM_ALLOC to get vm area

2018-01-23 Thread Ganesh Mahendran
Hi, Arve:

2018-01-23 2:55 GMT+08:00 Arve Hjønnevåg <a...@android.com>:
> On Mon, Jan 22, 2018 at 9:02 AM, Todd Kjos <tk...@google.com> wrote:
>> On Mon, Jan 22, 2018 at 7:54 AM, Greg KH <gre...@linuxfoundation.org> wrote:
>>> On Wed, Jan 10, 2018 at 10:49:05AM +0800, Ganesh Mahendran wrote:
>>>> VM_IOREMAP is used to access hardware through a mechanism called
>>>> I/O mapped memory. Android binder is an IPC mechanism which will
>>>> not access I/O memory.
>>>>
>>>> And VM_IOREMAP has an alignment requirement which may not be needed
>>>> in binder.
>>>> __get_vm_area_node()
>>>> {
>>>> ...
>>>> if (flags & VM_IOREMAP)
>>>> align = 1ul << clamp_t(int, fls_long(size),
>>>>PAGE_SHIFT, IOREMAP_MAX_ORDER);
>>>> ...
>>>> }
>>>>
>>>> This patch will save some kernel vm area, especially for 32bit os.
>>>>
>>>> In a 32bit OS, the kernel vm area is only 240MB. We may get the below
>>>> error when launching an app:
>>>>
>>>> <3>[ 4482.440053] binder_alloc: binder_alloc_mmap_handler: 15728 
>>>> 8ce67000-8cf65000 get_vm_area failed -12
>>>> <3>[ 4483.218817] binder_alloc: binder_alloc_mmap_handler: 15745 
>>>> 8ce67000-8cf65000 get_vm_area failed -12
>>>>
>>>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>>>> 
>>>> V3: update comments
>>>> V2: update comments
>>>> ---
>>>>  drivers/android/binder_alloc.c | 2 +-
>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> Martijn and Todd, any objections to this patch?
>>
>> Looks fine to me. Arve, do you remember the rationale for using VM_IOREMAP?
>>
>
> I don't remember for sure, but I think it used alloc_vm_area at some
> point, and that uses VM_IOREMAP.

Yes, alloc_vm_area() uses the VM_IOREMAP flag.

In binder, the ~1MB vm area is cut into binder_buffers which are not
even page aligned.
So for binder IPC, there is no need to align the vm area (512KB now).
For a 64-bit OS, the vm area is very big, but for a 32-bit OS, the default
vm area is only 240MB.
The vm area will soon be exhausted, and then no app can be launched.
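
As a back-of-the-envelope check of the alignment cost (my own sketch;
the ~1MB mapping size and IOREMAP_MAX_ORDER = 24 are assumptions, not
numbers from this thread):

#include <stdio.h>

int main(void)
{
	unsigned long size = 1040384;	/* example binder mmap size, ~1MB */
	int fls = 0;

	while (size >> fls)
		fls++;			/* fls_long(1040384) == 20 */

	/* VM_IOREMAP: align = 1 << clamp(fls_long(size), PAGE_SHIFT=12,
	 * IOREMAP_MAX_ORDER=24), so each mapping eats ~1MB of alignment. */
	printf("VM_IOREMAP align: %lu KB\n", (1UL << fls) >> 10);	/* 1024 KB */
	/* VM_ALLOC: plain page alignment */
	printf("VM_ALLOC align:   %lu KB\n", (1UL << 12) >> 10);	/* 4 KB */
	return 0;
}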

Thanks.

>
> --
> Arve Hjønnevåg


[PATCH v3] android: binder: use VM_ALLOC to get vm area

2018-01-09 Thread Ganesh Mahendran
VM_IOREMAP is used to access hardware through a mechanism called
I/O mapped memory. Android binder is an IPC mechanism which will
not access I/O memory.

And VM_IOREMAP has an alignment requirement which may not be needed
in binder.
__get_vm_area_node()
{
...
if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, fls_long(size),
   PAGE_SHIFT, IOREMAP_MAX_ORDER);
...
}

This patch will save some kernel vm area, especially for 32bit os.

In a 32bit OS, the kernel vm area is only 240MB. We may get the below
error when launching an app:

<3>[ 4482.440053] binder_alloc: binder_alloc_mmap_handler: 15728 
8ce67000-8cf65000 get_vm_area failed -12
<3>[ 4483.218817] binder_alloc: binder_alloc_mmap_handler: 15745 
8ce67000-8cf65000 get_vm_area failed -12

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>

V3: update comments
V2: update comments
---
 drivers/android/binder_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 07b866a..5a426c8 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -670,7 +670,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc,
goto err_already_mapped;
}
 
-   area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
+   area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC);
if (area == NULL) {
ret = -ENOMEM;
failure_string = "get_vm_area";
-- 
1.9.1



[PATCH] use macro SHIFT instead of hard code 32

2017-07-06 Thread Ganesh Mahendran
32 is already defined as the macro SHIFT, so it's better
to use the macro SHIFT.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
 Documentation/scheduler/sched-pelt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/scheduler/sched-pelt.c 
b/Documentation/scheduler/sched-pelt.c
index e421913..726cb31 100644
--- a/Documentation/scheduler/sched-pelt.c
+++ b/Documentation/scheduler/sched-pelt.c
@@ -22,7 +22,7 @@ void calc_runnable_avg_yN_inv(void)
 
printf("static const u32 runnable_avg_yN_inv[] = {");
for (i = 0; i < HALFLIFE; i++) {
-   x = ((1UL<<32)-1)*pow(y, i);
+   x = ((1UL<<SHIFT)-1)*pow(y, i);
 
if (i % 6 == 0) printf("\n\t");
printf("0x%8x, ", x);
@@ -57,7 +57,7 @@ void calc_runnable_avg_yN_sum(void)
 
 void calc_converged_max(void)
 {
-   long last = 0, y_inv = ((1UL<<32)-1)*y;
+   long last = 0, y_inv = ((1UL<<SHIFT)-1)*y;
+   long last = 0, y_inv = ((1UL<<SHIFT)-1)*y;
 
for (; ; n++) {
if (n > -1)
-- 
1.9.1



Re: [PATCH] sched/fair: fix contribution calculation

2017-07-05 Thread Ganesh Mahendran
Hello, Peter:

2017-07-05 19:59 GMT+08:00 Peter Zijlstra <pet...@infradead.org>:
> On Wed, Jul 05, 2017 at 04:46:30PM +0800, Ganesh Mahendran wrote:
>> Function __compute_runnable_contrib() is to calculate:
>>\Sum 1024*y^n {for (1..n_period)}
>> But LOAD_AVG_MAX returns sum of 1024*y^n (0..n_period).
>> So we need to subtract 1024*y^0.
>>
>> Cc: sta...@vger.kernel.org
>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>> ---
>>  kernel/sched/fair.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 90e26b1..777ad49 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -2508,7 +2508,7 @@ static u32 __compute_runnable_contrib(u64 n)
>>   if (likely(n <= LOAD_AVG_PERIOD))
>>   return runnable_avg_yN_sum[n];
>>   else if (unlikely(n >= LOAD_AVG_MAX_N))
>> - return LOAD_AVG_MAX;
>> + return LOAD_AVG_MAX - 1024;
>>
>>   /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
>>   do {
>
>
> This code no longer exists...

Yes, you are right. The latest kernel has fixed this.
Do we need to fix this in LTS 4.1, 4.4?
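
For reference, a standalone numeric check of the off-by-1024 claim (my
own sketch; y and LOAD_AVG_MAX_N follow that era's fair.c, where
y^32 = 0.5 and LOAD_AVG_MAX_N = 345):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* decay factor: y^32 == 0.5 */
	double sum_from_0 = 0.0, sum_from_1 = 0.0;
	int n;

	for (n = 0; n <= 345; n++)
		sum_from_0 += 1024 * pow(y, n);	/* what LOAD_AVG_MAX approximates */
	for (n = 1; n <= 345; n++)
		sum_from_1 += 1024 * pow(y, n);	/* what the function should return */

	/* the difference is exactly the n = 0 term: 1024 * y^0 = 1024 */
	printf("diff = %.0f\n", sum_from_0 - sum_from_1);
	return 0;
}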

Thanks.


[PATCH] sched/fair: fix contribution calculation

2017-07-05 Thread Ganesh Mahendran
Function __compute_runnable_contrib() is to calculate:
   \Sum 1024*y^n {for (1..n_period)}
But LOAD_AVG_MAX returns sum of 1024*y^n (0..n_period).
So we need to subtract 1024*y^0.

Cc: sta...@vger.kernel.org
Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 90e26b1..777ad49 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2508,7 +2508,7 @@ static u32 __compute_runnable_contrib(u64 n)
if (likely(n <= LOAD_AVG_PERIOD))
return runnable_avg_yN_sum[n];
else if (unlikely(n >= LOAD_AVG_MAX_N))
-   return LOAD_AVG_MAX;
+   return LOAD_AVG_MAX - 1024;
 
/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
do {
-- 
1.9.1



Re: [PATCH] scsi_lib: increase {host|target|device}_busy count after dispatch cmd

2017-06-07 Thread Ganesh Mahendran
Ping~ Willing to hear some feedback :-)

Thanks

2017-06-05 17:37 GMT+08:00 Ganesh Mahendran <opensource.gan...@gmail.com>:
> In an Android system, there are lots of threads running. Thread A,
> holding the *host_busy* count, is easily preempted, and if at the
> same time thread B sets *host_blocked*, then all other threads will
> be I/O blocked.
>
> Below are the details:
> 1). Thread A calls scsi_request_fn() and it increases *host_busy*.
> But soon it is preempted.
> 2). Thread B calls scsi_request_fn(), and it gets a failure from
> scsi_dispatch_cmd(). So it sets *host_blocked*.
> 3). All the I/O is blocked...
> 4). Thread A is scheduled again, and it decreases *host_busy*
> in scsi_device_unbusy().
>
> After step 2), all the I/O will be blocked, since scsi_host_queue_ready()
> will always return 0.
> 
> scsi_host_queue_ready
> {
> if (atomic_read(&shost->host_blocked) > 0) {
> if (busy)  ==> true after step 2
> goto starved;
> }
> 
>
> The system will be unblocked after step 4).
>
> This patch increases {host|target|device}_busy count after dispatch cmd.
>
> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
> ---
>  drivers/scsi/scsi_lib.c | 66 
> -
>  1 file changed, 32 insertions(+), 34 deletions(-)
>
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 884aaa8..9cac272 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -311,6 +311,16 @@ static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
> cmd->cmd_len = scsi_command_size(cmd->cmnd);
>  }
>
> +static void scsi_device_busy(struct scsi_device *sdev)
> +{
> +   struct Scsi_Host *shost = sdev->host;
> +   struct scsi_target *starget = scsi_target(sdev);
> +
> +   atomic_inc(&sdev->device_busy);
> +   atomic_inc(&shost->host_busy);
> +   atomic_inc(&starget->target_busy);
> +}
> +
>  void scsi_device_unbusy(struct scsi_device *sdev)
>  {
> struct Scsi_Host *shost = sdev->host;
> @@ -1352,12 +1362,13 @@ static void scsi_unprep_fn(struct request_queue *q, 
> struct request *req)
>  static inline int scsi_dev_queue_ready(struct request_queue *q,
>   struct scsi_device *sdev)
>  {
> +   int ret = 0;
> unsigned int busy;
>
> -   busy = atomic_inc_return(&sdev->device_busy) - 1;
> +   busy = atomic_read(&sdev->device_busy);
> if (atomic_read(&sdev->device_blocked)) {
> if (busy)
> -   goto out_dec;
> +   goto out;
>
> /*
>  * unblock after device_blocked iterates to zero
> @@ -1368,19 +1379,18 @@ static inline int scsi_dev_queue_ready(struct 
> request_queue *q,
>  */
> if (!q->mq_ops)
> blk_delay_queue(q, SCSI_QUEUE_DELAY);
> -   goto out_dec;
> +   goto out;
> }
> SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
>"unblocking device at zero depth\n"));
> }
>
> if (busy >= sdev->queue_depth)
> -   goto out_dec;
> +   goto out;
>
> -   return 1;
> -out_dec:
> -   atomic_dec(&sdev->device_busy);
> -   return 0;
> +   ret = 1;
> +out:
> +   return ret;
>  }
>
>  /*
> @@ -1407,7 +1417,7 @@ static inline int scsi_target_queue_ready(struct 
> Scsi_Host *shost,
> if (starget->can_queue <= 0)
> return 1;
>
> -   busy = atomic_inc_return(&starget->target_busy) - 1;
> +   busy = atomic_read(&starget->target_busy);
> if (atomic_read(&starget->target_blocked) > 0) {
> if (busy)
> goto starved;
> @@ -1416,7 +1426,7 @@ static inline int scsi_target_queue_ready(struct 
> Scsi_Host *shost,
>  * unblock after target_blocked iterates to zero
>  */
> if (atomic_dec_return(&starget->target_blocked) > 0)
> -   goto out_dec;
> +   goto out;
>
> SCSI_LOG_MLQUEUE(3, starget_printk(KERN_INFO, starget,
>  "unblocking target at zero depth\n"));
> @@ -1431,9 +1441,7 @@ static inline int scsi_target_queue_ready(struct 
> Scsi_Host *shost,
> spin_lock_irq(shost->host_lock);
> list_move_tail(&sdev->starved_entry, &shost->starved_list);
> spin_unlock_irq(shost->host_lock);
> -out_dec:
> -   if (starget->can_queue > 0)


[PATCH] scsi_lib: increase {host|target|device}_busy count after dispatch cmd

2017-06-05 Thread Ganesh Mahendran
In an Android system, there are lots of threads running. Thread A,
holding the *host_busy* count, is easily preempted, and if at the
same time thread B sets *host_blocked*, then all other threads will
be I/O blocked.

Below are the details:
1). Thread A calls scsi_request_fn() and it increases *host_busy*.
But soon it is preempted.
2). Thread B calls scsi_request_fn(), and it gets a failure from
scsi_dispatch_cmd(). So it sets *host_blocked*.
3). All the I/O is blocked...
4). Thread A is scheduled again, and it decreases *host_busy*
in scsi_device_unbusy().

After step 2), all the I/O will be blocked, since scsi_host_queue_ready()
will always return 0.

scsi_host_queue_ready
{
if (atomic_read(&shost->host_blocked) > 0) {
if (busy)  ==> true after step 2
goto starved;
}


The system will be unblocked after step 4).

This patch increases {host|target|device}_busy count after dispatch cmd.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
 drivers/scsi/scsi_lib.c | 66 -
 1 file changed, 32 insertions(+), 34 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 884aaa8..9cac272 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -311,6 +311,16 @@ static void scsi_init_cmd_errh(struct scsi_cmnd *cmd)
cmd->cmd_len = scsi_command_size(cmd->cmnd);
 }
 
+static void scsi_device_busy(struct scsi_device *sdev)
+{
+   struct Scsi_Host *shost = sdev->host;
+   struct scsi_target *starget = scsi_target(sdev);
+
+   atomic_inc(&sdev->device_busy);
+   atomic_inc(&shost->host_busy);
+   atomic_inc(&starget->target_busy);
+}
+
 void scsi_device_unbusy(struct scsi_device *sdev)
 {
struct Scsi_Host *shost = sdev->host;
@@ -1352,12 +1362,13 @@ static void scsi_unprep_fn(struct request_queue *q, 
struct request *req)
 static inline int scsi_dev_queue_ready(struct request_queue *q,
  struct scsi_device *sdev)
 {
+   int ret = 0;
unsigned int busy;
 
-   busy = atomic_inc_return(&sdev->device_busy) - 1;
+   busy = atomic_read(&sdev->device_busy);
if (atomic_read(&sdev->device_blocked)) {
if (busy)
-   goto out_dec;
+   goto out;
 
/*
 * unblock after device_blocked iterates to zero
@@ -1368,19 +1379,18 @@ static inline int scsi_dev_queue_ready(struct 
request_queue *q,
 */
if (!q->mq_ops)
blk_delay_queue(q, SCSI_QUEUE_DELAY);
-   goto out_dec;
+   goto out;
}
SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
   "unblocking device at zero depth\n"));
}
 
if (busy >= sdev->queue_depth)
-   goto out_dec;
+   goto out;
 
-   return 1;
-out_dec:
-   atomic_dec(&sdev->device_busy);
-   return 0;
+   ret = 1;
+out:
+   return ret;
 }
 
 /*
@@ -1407,7 +1417,7 @@ static inline int scsi_target_queue_ready(struct 
Scsi_Host *shost,
if (starget->can_queue <= 0)
return 1;
 
-   busy = atomic_inc_return(&starget->target_busy) - 1;
+   busy = atomic_read(&starget->target_busy);
if (atomic_read(&starget->target_blocked) > 0) {
if (busy)
goto starved;
@@ -1416,7 +1426,7 @@ static inline int scsi_target_queue_ready(struct 
Scsi_Host *shost,
 * unblock after target_blocked iterates to zero
 */
if (atomic_dec_return(&starget->target_blocked) > 0)
-   goto out_dec;
+   goto out;
 
SCSI_LOG_MLQUEUE(3, starget_printk(KERN_INFO, starget,
 "unblocking target at zero depth\n"));
@@ -1431,9 +1441,7 @@ static inline int scsi_target_queue_ready(struct 
Scsi_Host *shost,
spin_lock_irq(shost->host_lock);
list_move_tail(&sdev->starved_entry, &shost->starved_list);
spin_unlock_irq(shost->host_lock);
-out_dec:
-   if (starget->can_queue > 0)
-   atomic_dec(>target_busy);
+out:
return 0;
 }
 
@@ -1451,7 +1459,7 @@ static inline int scsi_host_queue_ready(struct 
request_queue *q,
if (scsi_host_in_recovery(shost))
return 0;
 
-   busy = atomic_inc_return(&shost->host_busy) - 1;
+   busy = atomic_read(&shost->host_busy);
if (atomic_read(&shost->host_blocked) > 0) {
if (busy)
goto starved;
@@ -1460,7 +1468,7 @@ static inline int scsi_host_queue_ready(struct 
request_queue *q,
 * unblock after host_blocked iterates to zero
 */
if (atomic_dec_return(&shost->host_blocked) > 0)
-   goto out_dec;


Re: [PATCH] Revert "arm64: Increase the max granular size"

2017-04-06 Thread Ganesh Mahendran
2017-04-06 23:58 GMT+08:00 Catalin Marinas :
> On Thu, Apr 06, 2017 at 12:52:13PM +0530, Imran Khan wrote:
>> On 4/5/2017 10:13 AM, Imran Khan wrote:
>> >> We may have to revisit this logic and consider L1_CACHE_BYTES the
>> >> _minimum_ of cache line sizes in arm64 systems supported by the kernel.
>> >> Do you have any benchmarks on Cavium boards that would show significant
>> >> degradation with 64-byte L1_CACHE_BYTES vs 128?
>> >>
>> >> For non-coherent DMA, the simplest is to make ARCH_DMA_MINALIGN the
>> >> _maximum_ of the supported systems:
>> >>
>> >> diff --git a/arch/arm64/include/asm/cache.h 
>> >> b/arch/arm64/include/asm/cache.h
>> >> index 5082b30bc2c0..4b5d7b27edaf 100644
>> >> --- a/arch/arm64/include/asm/cache.h
>> >> +++ b/arch/arm64/include/asm/cache.h
>> >> @@ -18,17 +18,17 @@
>> >>
>> >>  #include 
>> >>
>> >> -#define L1_CACHE_SHIFT 7
>> >> +#define L1_CACHE_SHIFT 6
>> >>  #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
>> >>
>> >>  /*
>> >>   * Memory returned by kmalloc() may be used for DMA, so we must make
>> >> - * sure that all such allocations are cache aligned. Otherwise,
>> >> - * unrelated code may cause parts of the buffer to be read into the
>> >> - * cache before the transfer is done, causing old data to be seen by
>> >> - * the CPU.
>> >> + * sure that all such allocations are aligned to the maximum *known*
>> >> + * cache line size on ARMv8 systems. Otherwise, unrelated code may cause
>> >> + * parts of the buffer to be read into the cache before the transfer is
>> >> + * done, causing old data to be seen by the CPU.
>> >>   */
>> >> -#define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
>> >> +#define ARCH_DMA_MINALIGN  (128)
>> >>
>> >>  #ifndef __ASSEMBLY__
>> >>
>> >> diff --git a/arch/arm64/kernel/cpufeature.c 
>> >> b/arch/arm64/kernel/cpufeature.c
>> >> index 392c67eb9fa6..30bafca1aebf 100644
>> >> --- a/arch/arm64/kernel/cpufeature.c
>> >> +++ b/arch/arm64/kernel/cpufeature.c
>> >> @@ -976,9 +976,9 @@ void __init setup_cpu_features(void)
>> >> if (!cwg)
>> >> pr_warn("No Cache Writeback Granule information, assuming
>> >> cache line size %d\n",
>> >> cls);
>> >> -   if (L1_CACHE_BYTES < cls)
>> >> -   pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback 
>> >> Granule (%d < %d)\n",
>> >> -   L1_CACHE_BYTES, cls);
>> >> +   if (ARCH_DMA_MINALIGN < cls)
>> >> +   pr_warn("ARCH_DMA_MINALIGN smaller than the Cache 
>> >> Writeback Granule (%d < %d)\n",
>> >> +   ARCH_DMA_MINALIGN, cls);
>> >>  }
>> >>
>> >>  static bool __maybe_unused
>> >
>> > This change was discussed at: [1] but was not concluded as apparently no 
>> > one
>> > came back with test report and numbers. After including this change in our
>> > local kernel we are seeing significant throughput improvement. For example 
>> > with:
>> >
>> > iperf -c 192.168.1.181 -i 1 -w 128K -t 60
>> >
>> > The average throughput is improving by about 30% (230Mbps from 180Mbps).
>> > Could you please let us know if this change can be included in upstream 
>> > kernel.
>> >
>> > [1]: https://groups.google.com/forum/#!topic/linux.kernel/P40yDB90ePs
>>
>> Could you please provide some feedback about the above mentioned query ?
>
> Do you have an explanation on the performance variation when
> L1_CACHE_BYTES is changed? We'd need to understand how the network stack
> is affected by L1_CACHE_BYTES, in which context it uses it (is it for
> non-coherent DMA?).

The network stack uses SKB_DATA_ALIGN to align:
---
#define SKB_DATA_ALIGN(X) (((X) + (SMP_CACHE_BYTES - 1)) & \
                           ~(SMP_CACHE_BYTES - 1))

#define SMP_CACHE_BYTES L1_CACHE_BYTES
---
I think this is the reason for the performance regression.
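
A quick standalone illustration of the effect (my own sketch; the
1500-byte payload plus 320 bytes of overhead is just an example size,
not a measurement from this thread):

#include <stdio.h>

/* same rounding as SKB_DATA_ALIGN, with the cacheline size made a
 * parameter so both configurations can be compared side by side */
#define ALIGN_UP(x, cacheline) \
	(((x) + ((cacheline) - 1)) & ~((unsigned long)(cacheline) - 1))

int main(void)
{
	unsigned long x = 1500 + 320;	/* example: MTU payload + overhead */

	printf("SMP_CACHE_BYTES=64 : %lu\n", ALIGN_UP(x, 64));	/* 1856 */
	printf("SMP_CACHE_BYTES=128: %lu\n", ALIGN_UP(x, 128));	/* 1920 */
	return 0;
}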

>
> The Cavium guys haven't shown any numbers (IIUC) to back the
> L1_CACHE_BYTES performance improvement but I would not revert the
> original commit since ARCH_DMA_MINALIGN definitely needs to cover the
> maximum available cache line size, which is 128 for them.

How about defining L1_CACHE_SHIFT like below:
---
#ifdef CONFIG_ARM64_L1_CACHE_SHIFT
#define L1_CACHE_SHIFT CONFIG_ARM64_L1_CACHE_SHIFT
#else
#define L1_CACHE_SHIFT 7
#endif
---

Thanks

>
> --
> Catalin


Re: [V2] android: binder: use VM_ALLOC to get vm area

2017-02-09 Thread Ganesh Mahendran
Hi, Greg:

2017-02-09 18:17 GMT+08:00 Greg KH <gre...@linuxfoundation.org>:
> On Thu, Feb 09, 2017 at 05:54:03PM +0800, Ganesh Mahendran wrote:
>> A gentle ping.
>
> I don't see a patch here that can be accepted, what are you asking for
> a response from?

I sent a patch before:
https://patchwork.kernel.org/patch/9429257/

Please help to review.

Thanks.

>
> confused,
>
> greg k-h


Re: [V2] android: binder: use VM_ALLOC to get vm area

2017-02-09 Thread Ganesh Mahendran
A gentle ping.

Thanks.

2016-11-15 21:18 GMT+08:00 Ganesh Mahendran <opensource.gan...@gmail.com>:
> Hi, Greg
>
> 2016-11-15 18:18 GMT+08:00 Greg KH <gre...@linuxfoundation.org>:
>> On Tue, Nov 15, 2016 at 05:55:39PM +0800, Ganesh Mahendran wrote:
>>> VM_IOREMAP is used to access hardware through a mechanism called
>>> I/O mapped memory. Android binder is an IPC mechanism which will
>>> not access I/O memory.
>>>
>>> Also, VM_IOREMAP has an alignment requirement which may not be
>>> needed in binder.
>>> __get_vm_area_node()
>>> {
>>> ...
>>> if (flags & VM_IOREMAP)
>>> align = 1ul << clamp_t(int, fls_long(size),
>>>PAGE_SHIFT, IOREMAP_MAX_ORDER);
>>> ...
>>> }
>>>
>>> This patch uses VM_ALLOC to get the vm area.
>>>
>>> Below is the throughput test result:
>>>
>>>   # ./binderThroughputTest -w 100
>>>   I ran this command 10 times:
>>>    before    after
>>>   average iterations per sec:  11199.9   11886.9
>>>
>>> No performance regression was found through the binder test.
>>>
>>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>>> ---
>>>  drivers/android/binder.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> What changed from v1?
>
> Sorry for missing the change information.
>
> In V2, I ran the binder test. And there is no side effect with this
> patch.
>
>>
>> Always list that below the --- line.
>
> Thanks for reminder.
>
>>
>> thanks,
>>
>> greg k-h


Re: [PATCH] binder: replace kzalloc with kmem_cache

2016-12-13 Thread Ganesh Mahendran
Hi, Greg:

Sorry for the late response.

On Tue, Nov 22, 2016 at 02:53:02PM +0100, Greg KH wrote:
> On Tue, Nov 22, 2016 at 07:17:30PM +0800, Ganesh Mahendran wrote:
> > This patch uses kmem_cache to allocate/free binder objects.
> 
> Why do this?

I am not very familiar with kmem_cache. I think if we have thousands of
active binder objects in system, kmem_cache would be better.

Below is binder object number in my android system:
-
$ cat /d/binder/stats
...
proc: active 100 total 6735
thread: active 1456 total 180807
node: active 5668 total 1027387
ref: active 7141 total 1214877
death: active 844 total 468056
transaction: active 0 total 54736890
transaction_complete: active 0 total 54736890
-

binder objects are allocated/freed frequently.

> 
> > It will have better memory efficiency.
> 
> Really?  How?  It should be the same, if not a bit worse.  Have you
> tested this?  What is the results?

kzalloc() will use an object of size 2^n to store user data.
Take "struct binder_thread" as an example; its size is 296 bytes.
If we use kzalloc(), the slab system will use a 512-byte object to store
the 296 bytes. But if we use kmem_cache to create a separate allocator
(it may be merged with other slab allocators), it will use a 304-byte
object to store the 296 bytes. Below is the information from /proc/slabinfo :
--
name   <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>
binder_thread  858  858   304 26   2

memory efficiency is: (296 * 26) / (2 * 4096) = 93.9%
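
(For reference, the cache setup in the patch boils down to something
like the below; a minimal sketch rather than the full patch, and the
init-helper name is mine, not the patch's:)

static struct kmem_cache *binder_thread_cachep;

static int __init binder_caches_init(void)
{
	binder_thread_cachep = kmem_cache_create("binder_thread",
						 sizeof(struct binder_thread),
						 0, 0, NULL);
	if (!binder_thread_cachep)
		return -ENOMEM;
	return 0;
}

/* allocation/free sites then become: */
thread = kmem_cache_zalloc(binder_thread_cachep, GFP_KERNEL);
kmem_cache_free(binder_thread_cachep, thread);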

> 
> > And we can also get object usage details in /sys/kernel/slab/* for
> > futher analysis.
> 
> Why do we need this?  Who needs this information and what are you going
> to do with it?

This is only for debug purposes, to see how much memory is used by binder.

> 
> > Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
> > ---
> >  drivers/android/binder.c | 127 
> > ++-
> >  1 file changed, 104 insertions(+), 23 deletions(-)
> > 
> > diff --git a/drivers/android/binder.c b/drivers/android/binder.c
> > index 3c71b98..f1f8362 100644
> > --- a/drivers/android/binder.c
> > +++ b/drivers/android/binder.c
> > @@ -54,6 +54,14 @@
> >  static HLIST_HEAD(binder_deferred_list);
> >  static HLIST_HEAD(binder_dead_nodes);
> >  
> > +static struct kmem_cache *binder_proc_cachep;
> > +static struct kmem_cache *binder_thread_cachep;
> > +static struct kmem_cache *binder_node_cachep;
> > +static struct kmem_cache *binder_ref_cachep;
> > +static struct kmem_cache *binder_transaction_cachep;
> > +static struct kmem_cache *binder_work_cachep;
> > +static struct kmem_cache *binder_ref_death_cachep;
> 
> That's a lot of different caches, are you sure they don't just all get
> merged together anyway for most allocators?

If a binder kmem_cache has the same flags as another allocator, it may be
merged with that allocator. But I think it would still be better than using
kzalloc().

> 
> Don't create lots of little caches for no good reason, and without any
> benchmark numbers, I'd prefer to leave this alone.  You are going to
> have to prove this is a win to allow this type of churn.

I tested binder with this patch. There is no performance regression.
---
I ran the command below 10 times:
 $binderThroughputTest -w 100

           Before      After (with patch)
avg:       9848.4      9878.8

Thanks.

> 
> thanks,
> 
> greg k-h


[PATCH] binder: replace kzalloc with kmem_cache

2016-11-22 Thread Ganesh Mahendran
This patch uses kmem_cache to allocate/free binder objects.

It will have better memory efficiency. And we can also get
object usage details in /sys/kernel/slab/* for further analysis.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
 drivers/android/binder.c | 127 ++-
 1 file changed, 104 insertions(+), 23 deletions(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 3c71b98..f1f8362 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -54,6 +54,14 @@
 static HLIST_HEAD(binder_deferred_list);
 static HLIST_HEAD(binder_dead_nodes);
 
+static struct kmem_cache *binder_proc_cachep;
+static struct kmem_cache *binder_thread_cachep;
+static struct kmem_cache *binder_node_cachep;
+static struct kmem_cache *binder_ref_cachep;
+static struct kmem_cache *binder_transaction_cachep;
+static struct kmem_cache *binder_work_cachep;
+static struct kmem_cache *binder_ref_death_cachep;
+
 static struct dentry *binder_debugfs_dir_entry_root;
 static struct dentry *binder_debugfs_dir_entry_proc;
 static struct binder_node *binder_context_mgr_node;
@@ -902,7 +910,7 @@ static struct binder_node *binder_new_node(struct binder_proc *proc,
return NULL;
}
 
-   node = kzalloc(sizeof(*node), GFP_KERNEL);
+   node = kmem_cache_zalloc(binder_node_cachep, GFP_KERNEL);
if (node == NULL)
return NULL;
binder_stats_created(BINDER_STAT_NODE);
@@ -992,7 +1000,7 @@ static int binder_dec_node(struct binder_node *node, int strong, int internal)
 "dead node %d deleted\n",
 node->debug_id);
}
-   kfree(node);
+   kmem_cache_free(binder_node_cachep, node);
binder_stats_deleted(BINDER_STAT_NODE);
}
}
@@ -1043,7 +1051,7 @@ static struct binder_ref *binder_get_ref_for_node(struct binder_proc *proc,
else
return ref;
}
-   new_ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+   new_ref = kmem_cache_zalloc(binder_ref_cachep, GFP_KERNEL);
if (new_ref == NULL)
return NULL;
binder_stats_created(BINDER_STAT_REF);
@@ -1108,10 +1116,10 @@ static void binder_delete_ref(struct binder_ref *ref)
 "%d delete ref %d desc %d has death 
notification\n",
  ref->proc->pid, ref->debug_id, ref->desc);
list_del(&ref->death->work.entry);
-   kfree(ref->death);
+   kmem_cache_free(binder_ref_death_cachep, ref->death);
binder_stats_deleted(BINDER_STAT_DEATH);
}
-   kfree(ref);
+   kmem_cache_free(binder_ref_cachep, ref);
binder_stats_deleted(BINDER_STAT_REF);
 }
 
@@ -1183,7 +1191,7 @@ static void binder_pop_transaction(struct binder_thread *target_thread,
t->need_reply = 0;
if (t->buffer)
t->buffer->transaction = NULL;
-   kfree(t);
+   kmem_cache_free(binder_transaction_cachep, t);
binder_stats_deleted(BINDER_STAT_TRANSACTION);
 }
 
@@ -1444,14 +1452,14 @@ static void binder_transaction(struct binder_proc *proc,
e->to_proc = target_proc->pid;
 
/* TODO: reuse incoming transaction for reply */
-   t = kzalloc(sizeof(*t), GFP_KERNEL);
+   t = kmem_cache_zalloc(binder_transaction_cachep, GFP_KERNEL);
if (t == NULL) {
return_error = BR_FAILED_REPLY;
goto err_alloc_t_failed;
}
binder_stats_created(BINDER_STAT_TRANSACTION);
 
-   tcomplete = kzalloc(sizeof(*tcomplete), GFP_KERNEL);
+   tcomplete = kmem_cache_zalloc(binder_work_cachep, GFP_KERNEL);
if (tcomplete == NULL) {
return_error = BR_FAILED_REPLY;
goto err_alloc_tcomplete_failed;
@@ -1742,10 +1750,10 @@ static void binder_transaction(struct binder_proc *proc,
t->buffer->transaction = NULL;
binder_free_buf(target_proc, t->buffer);
 err_binder_alloc_buf_failed:
-   kfree(tcomplete);
+   kmem_cache_free(binder_work_cachep, tcomplete);
binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE);
 err_alloc_tcomplete_failed:
-   kfree(t);
+   kmem_cache_free(binder_transaction_cachep, t);
binder_stats_deleted(BINDER_STAT_TRANSACTION);
 err_alloc_t_failed:
 err_bad_call_stack:
@@ -2039,7 +2047,7 @@ static int binder_thread_write(struct binder_proc *proc,
proc->pid, thread->pid);
break;
}
-   death = kzalloc(sizeof(*death), GFP_KERNEL);
+   death = kmem_cache_zalloc(binder_ref_death_cachep, GFP_KERNEL);
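
(The archive cuts the message off here; the remainder presumably converts
the rest of the kzalloc()/kfree() call sites and creates the caches during
driver init. A sketch of what that creation could look like, with a
hypothetical helper name and assumed flags, based on the declarations
earlier in the patch:)

static int binder_create_caches(void)	/* hypothetical helper */
{
	binder_proc_cachep = KMEM_CACHE(binder_proc, SLAB_HWCACHE_ALIGN);
	binder_thread_cachep = KMEM_CACHE(binder_thread, SLAB_HWCACHE_ALIGN);
	binder_node_cachep = KMEM_CACHE(binder_node, SLAB_HWCACHE_ALIGN);
	binder_ref_cachep = KMEM_CACHE(binder_ref, SLAB_HWCACHE_ALIGN);
	binder_transaction_cachep =
		KMEM_CACHE(binder_transaction, SLAB_HWCACHE_ALIGN);
	binder_work_cachep = KMEM_CACHE(binder_work, SLAB_HWCACHE_ALIGN);
	binder_ref_death_cachep =
		KMEM_CACHE(binder_ref_death, SLAB_HWCACHE_ALIGN);

	if (!binder_proc_cachep || !binder_thread_cachep ||
	    !binder_node_cachep || !binder_ref_cachep ||
	    !binder_transaction_cachep || !binder_work_cachep ||
	    !binder_ref_death_cachep)
		return -ENOMEM;	/* caller destroys whatever was created */
	return 0;
}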

Re: [V2] android: binder: use VM_ALLOC to get vm area

2016-11-15 Thread Ganesh Mahendran
Hi, Greg

2016-11-15 18:18 GMT+08:00 Greg KH <gre...@linuxfoundation.org>:
> On Tue, Nov 15, 2016 at 05:55:39PM +0800, Ganesh Mahendran wrote:
>> VM_IOREMAP is used to access hardware through a mechanism called
>> I/O mapped memory. Android binder is a IPC machanism which will
>> not access I/O memory.
>>
>> Also, VM_IOREMAP has an alignment requirement which may not be needed in
>> binder.
>> __get_vm_area_node()
>> {
>> ...
>> if (flags & VM_IOREMAP)
>> align = 1ul << clamp_t(int, fls_long(size),
>>PAGE_SHIFT, IOREMAP_MAX_ORDER);
>> ...
>> }
>>
>> This patch uses VM_ALLOC to get the vm area.
>>
>> Below is the throughput test result:
>>
>>   # ./binderThroughputTest -w 100
>>   I ran this command 10 times:
>>                                before    after
>>   average iterations per sec:  11199.9   11886.9
>>
>> No performance regression found through binder test.
>>
>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>> ---
>>  drivers/android/binder.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> What changed from v1?

Sorry for missing the change information.

In V2, I ran the binder test, and there is no side effect with this
patch.

>
> Always list that below the --- line.

Thanks for the reminder.
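
(For example, the v2 posting could carry that note below the --- line like
this; the format is the usual kernel convention, the wording is
illustrative:)

Signed-off-by: Ganesh Mahendran 
---
v1 -> v2: ran binderThroughputTest 10 times; no regression observed

 drivers/android/binder.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)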

>
> thanks,
>
> greg k-h


[V2] android: binder: use VM_ALLOC to get vm area

2016-11-15 Thread Ganesh Mahendran
VM_IOREMAP is used to access hardware through a mechanism called
I/O mapped memory. Android binder is an IPC mechanism which will
not access I/O memory.

Also, VM_IOREMAP has an alignment requirement which may not be needed in
binder.
__get_vm_area_node()
{
...
if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, fls_long(size),
   PAGE_SHIFT, IOREMAP_MAX_ORDER);
...
}
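
(As a concrete illustration of that clamp: for a 1 MiB binder mapping,
VM_IOREMAP forces 2 MiB alignment, while VM_ALLOC only needs page
alignment. A userspace sketch, with PAGE_SHIFT = 12 and
IOREMAP_MAX_ORDER = 24 assumed:)

#include <stdio.h>

static int fls_long(unsigned long x)	/* highest set bit, 1-based */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long size = 1UL << 20;	/* 1 MiB mapping */
	int shift = fls_long(size);	/* 21 */

	if (shift < 12)			/* clamp_t(..., PAGE_SHIFT, */
		shift = 12;
	if (shift > 24)			/*         IOREMAP_MAX_ORDER) */
		shift = 24;

	printf("VM_IOREMAP align: %lu KiB\n", (1UL << shift) >> 10); /* 2048 */
	printf("VM_ALLOC   align: %lu KiB\n", (1UL << 12) >> 10);    /* 4 */
	return 0;
}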

This patch uses VM_ALLOC to get the vm area.

Below is the throughput test result:

  # ./binderThroughputTest -w 100
  I ran this command 10 times:
                              before    after
  average iterations per sec:  11199.9   11886.9

No performance regression found through binder test.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
 drivers/android/binder.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 3c71b98..b5908ec 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2901,7 +2901,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
goto err_already_mapped;
}
 
-   area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
+   area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC);
if (area == NULL) {
ret = -ENOMEM;
failure_string = "get_vm_area";
-- 
1.9.1



Re: [PATCH] android: binder: use VM_ALLOC to get vm area.

2016-09-05 Thread Ganesh Mahendran
2016-09-02 3:59 GMT+08:00 Arve Hjønnevåg <a...@android.com>:
> On Thu, Sep 1, 2016 at 12:02 PM, Greg KH <gre...@linuxfoundation.org> wrote:
>> On Thu, Sep 01, 2016 at 02:41:04PM +0800, Ganesh Mahendran wrote:
>>> VM_IOREMAP is used to access hardware through a mechanism called
>>> I/O mapped memory. Android binder is an IPC mechanism which will
>>> not access I/O memory.
>>>
>>> Also, VM_IOREMAP has an alignment requirement which may not be needed in
>>> binder.
>>> 
>>> __get_vm_area_node()
>>> {
>>> ...
>>> if (flags & VM_IOREMAP)
>>> align = 1ul << clamp_t(int, fls_long(size),
>>>    PAGE_SHIFT, IOREMAP_MAX_ORDER);
>>> ...
>>> }
>>> 
>>>
>>> This patch uses VM_ALLOC to get the vm area.
>>>
>>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>>> ---
>>>  drivers/android/binder.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/android/binder.c b/drivers/android/binder.c
>>> index 16288e7..3511d5c 100644
>>> --- a/drivers/android/binder.c
>>> +++ b/drivers/android/binder.c
>>> @@ -2885,7 +2885,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
>>>   goto err_already_mapped;
>>>   }
>>>
>>> - area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
>>> + area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC);
>>>   if (area == NULL) {
>>>   ret = -ENOMEM;
>>>   failure_string = "get_vm_area";
>>
>> What change have you noticed with this patch?  Have you tested it?
>> Found that previously reserved iomemory is now free for binder to use
>> where it wasn't?  What kind of change does the system now run as because
>> of this?
>>
>> And are you _sure_ the alignment requirement isn't needed for binder?
>> Have you verified this with the userspace binder library?
>>
>> This is messy, tricky, stuff, I'm loath to change it without loads of
>> testing having happened...
>>
>> thanks,
>>
>> greg k-h
>
> There is no alignment requirement on this area unless
> cache_is_vipt_aliasing returns true. In that case the area needs to be
> aligned with vma->vm_start which is done manually in the code right
> after this allocation. If there are no other side effects of changing
> this flag this change should be safe, but please run all the tests at
> https://android.googlesource.com/platform/frameworks/native/+/master/libs/binder/tests/
> to test it.
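
(For context, the manual alignment Arve mentions sits in binder_mmap()
right after get_vm_area(); roughly the following, reproduced from memory
of the driver at that time, so treat it as a sketch rather than the exact
code:)

#ifdef CONFIG_CPU_CACHE_VIPT
	if (cache_is_vipt_aliasing()) {
		/* step vm_start forward page by page until the user mapping
		 * and the kernel buffer end up in the same cache colour */
		while (CACHE_COLOUR((vma->vm_start ^ (uint32_t)proc->buffer))) {
			pr_info("binder_mmap: %d %lx-%lx maps %p bad alignment\n",
				proc->pid, vma->vm_start, vma->vm_end,
				proc->buffer);
			vma->vm_start += PAGE_SIZE;
		}
	}
#endif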

Thanks for your suggestion.
I only did some Android app performance tests. I will do more binder testing.

Thanks.

>
> --
> Arve Hjønnevåg


Re: [PATCH] android: binder: use VM_ALLOC to get vm area.

2016-09-05 Thread Ganesh Mahendran
Hi, Greg:

2016-09-02 3:02 GMT+08:00 Greg KH <gre...@linuxfoundation.org>:
> On Thu, Sep 01, 2016 at 02:41:04PM +0800, Ganesh Mahendran wrote:
>> VM_IOREMAP is used to access hardware through a mechanism called
>> I/O mapped memory. Android binder is an IPC mechanism which will
>> not access I/O memory.
>>
>> Also, VM_IOREMAP has an alignment requirement which may not be needed in
>> binder.
>> 
>> __get_vm_area_node()
>> {
>> ...
>> if (flags & VM_IOREMAP)
>> align = 1ul << clamp_t(int, fls_long(size),
>>PAGE_SHIFT, IOREMAP_MAX_ORDER);
>> ...
>> }
>> 
>>
>> This patch uses VM_ALLOC to get the vm area.
>>
>> Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
>> ---
>>  drivers/android/binder.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/android/binder.c b/drivers/android/binder.c
>> index 16288e7..3511d5c 100644
>> --- a/drivers/android/binder.c
>> +++ b/drivers/android/binder.c
>> @@ -2885,7 +2885,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
>>   goto err_already_mapped;
>>   }
>>
>> - area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
>> + area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC);
>>   if (area == NULL) {
>>   ret = -ENOMEM;
>>   failure_string = "get_vm_area";
>
> What change have you noticed with this patch?  Have you tested it?
> Found that previously reserved iomemory is now free for binder to use
> where it wasn't?  What kind of change does the system now run as because
> of this?

I did some sanity tests on an Android system. I will do more testing using:
https://android.googlesource.com/platform/frameworks/native/+/master/libs/binder/tests/

>
> And are you _sure_ the alignment requirement isn't needed for binder?
> Have you verified this with the userspace binder library?

I will run the binder tests.

Thanks.

>
> This is messy, tricky, stuff, I'm loath to change it without loads of
> testing having happened...
>
> thanks,
>
> greg k-h


[PATCH] android: binder: use VM_ALLOC to get vm area.

2016-09-01 Thread Ganesh Mahendran
VM_IOREMAP is used to access hardware through a mechanism called
I/O mapped memory. Android binder is an IPC mechanism which will
not access I/O memory.

Also, VM_IOREMAP has an alignment requirement which may not be needed in
binder.

__get_vm_area_node()
{
...
if (flags & VM_IOREMAP)
align = 1ul << clamp_t(int, fls_long(size),
   PAGE_SHIFT, IOREMAP_MAX_ORDER);
...
}


This patch uses VM_ALLOC to get the vm area.

Signed-off-by: Ganesh Mahendran <opensource.gan...@gmail.com>
---
 drivers/android/binder.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/android/binder.c b/drivers/android/binder.c
index 16288e7..3511d5c 100644
--- a/drivers/android/binder.c
+++ b/drivers/android/binder.c
@@ -2885,7 +2885,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma)
goto err_already_mapped;
}
 
-   area = get_vm_area(vma->vm_end - vma->vm_start, VM_IOREMAP);
+   area = get_vm_area(vma->vm_end - vma->vm_start, VM_ALLOC);
if (area == NULL) {
ret = -ENOMEM;
failure_string = "get_vm_area";
-- 
1.9.1


