[PATCH] mm/hugetlb: Implement ASLR and topdown for hugetlb mappings

2017-11-03 Thread Shile Zhang
merge from arch/x86

Signed-off-by: Shile Zhang 
---
 arch/arm/include/asm/page.h |  1 +
 arch/arm/mm/hugetlbpage.c   | 85 +
 2 files changed, 86 insertions(+)

diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 4355f0e..994630f 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -144,6 +144,7 @@ extern void copy_page(void *to, const void *from);
 
 #ifdef CONFIG_KUSER_HELPERS
 #define __HAVE_ARCH_GATE_AREA 1
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #endif
 
 #ifdef CONFIG_ARM_LPAE
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index fcafb52..46ed0c8 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -45,3 +45,88 @@ int pmd_huge(pmd_t pmd)
 {
return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
 }
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
+   unsigned long addr, unsigned long len,
+   unsigned long pgoff, unsigned long flags)
+{
+   struct hstate *h = hstate_file(file);
+   struct vm_unmapped_area_info info;
+
+   info.flags = 0;
+   info.length = len;
+   info.low_limit = current->mm->mmap_legacy_base;
+   info.high_limit = TASK_SIZE;
+   info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+   info.align_offset = 0;
+   return vm_unmapped_area(&info);
+}
+
+static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
+   unsigned long addr0, unsigned long len,
+   unsigned long pgoff, unsigned long flags)
+{
+   struct hstate *h = hstate_file(file);
+   struct vm_unmapped_area_info info;
+   unsigned long addr;
+
+   info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+   info.length = len;
+   info.low_limit = PAGE_SIZE;
+   info.high_limit = current->mm->mmap_base;
+   info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+   info.align_offset = 0;
+   addr = vm_unmapped_area(&info);
+
+   /*
+* A failed mmap() very likely causes application failure,
+* so fall back to the bottom-up function here. This scenario
+* can happen with large stack limits and large mmap()
+* allocations.
+*/
+   if (addr & ~PAGE_MASK) {
+   VM_BUG_ON(addr != -ENOMEM);
+   info.flags = 0;
+   info.low_limit = TASK_UNMAPPED_BASE;
+   info.high_limit = TASK_SIZE;
+   addr = vm_unmapped_area(&info);
+   }
+
+   return addr;
+}
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+   unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+   struct hstate *h = hstate_file(file);
+   struct mm_struct *mm = current->mm;
+   struct vm_area_struct *vma;
+
+   if (len & ~huge_page_mask(h))
+   return -EINVAL;
+   if (len > TASK_SIZE)
+   return -ENOMEM;
+
+   if (flags & MAP_FIXED) {
+   if (prepare_hugepage_range(file, addr, len))
+   return -EINVAL;
+   return addr;
+   }
+
+   if (addr) {
+   addr = ALIGN(addr, huge_page_size(h));
+   vma = find_vma(mm, addr);
+   if (TASK_SIZE - len >= addr &&
+   (!vma || addr + len <= vma->vm_start))
+   return addr;
+   }
+   if (mm->get_unmapped_area == arch_get_unmapped_area)
+   return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+   pgoff, flags);
+   else
+   return hugetlb_get_unmapped_area_topdown(file, addr, len,
+   pgoff, flags);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
-- 
2.6.2



[PATCH] mm/page_alloc.c: fix typos in comments

2018-01-09 Thread Shile Zhang
Signed-off-by: Shile Zhang 
---
 mm/page_alloc.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 76c9688..bfd5f99 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -293,7 +293,7 @@ int page_group_by_mobility_disabled __read_mostly;
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 
 /*
- * Determine how many pages need to be initialized durig early boot
+ * Determine how many pages need to be initialized during early boot
  * (non-deferred initialization).
  * The value of first_deferred_pfn will be set later, once non-deferred pages
  * are initialized, but for now set it ULONG_MAX.
@@ -344,7 +344,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
 {
-   /* Always populate low zones for address-contrained allocations */
+   /* Always populate low zones for address-constrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
(*nr_initialised)++;
@@ -1502,7 +1502,7 @@ static unsigned long __init deferred_init_range(int nid, 
int zid,
 * performing it only once every pageblock_nr_pages.
 *
 * We do it in two loops: first we initialize struct page, than free to
-* buddy allocator, becuse while we are freeing pages we can access
+* buddy allocator, because while we are freeing pages we can access
 * pages that are ahead (computing buddy page in __free_one_page()).
 */
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -3391,7 +3391,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_THISNODE)
goto out;
 
-   /* Exhausted what can be done so it's blamo time */
+   /* Exhausted what can be done so it's blame time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
 
-- 
2.6.2



[Regression][stable] Major hackbench pipes regression due to commit 'sched/fair: Search a task from the tail of the queue'

2019-04-26 Thread Shile Zhang

Hi, Uladzislau

We found a major regression (more than -40%) of hackbench on a 2-VCPU 
(Skylake) virtual machine, with LTS kernel v4.19.36. The following commit was 
bisected with the test 'hackbench --thread --pipe --group 1 --loop 6'


---

commit 93824900a2e242766f5fe6ae7697e3d7171aa234
Author: Uladzislau Rezki 
Date:   Wed Sep 13 12:24:30 2017 +0200

    sched/fair: Search a task from the tail of the queue

---

-default: 21.048

-revert: 14.385


The commit log said that hackbench has slightly better performance with 40 
groups, but it seems performance drops more with fewer groups, such as fewer 
than 32 groups. I also tried this benchmark with the newest v5.1-rc6 kernel; 
the same regression was found.


Could you please also have a check?

Thanks!

BRs/Shile





[PATCH] misc/pvpanic: Export module FDT device table

2021-02-18 Thread Shile Zhang
Export the module FDT device table to ensure the FDT compatible strings
are listed in the module alias. This helps the pvpanic driver to be
loaded automatically on boot, not only for the ACPI device but also for
the FDT device.

Signed-off-by: Shile Zhang 
---
 drivers/misc/pvpanic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/pvpanic.c b/drivers/misc/pvpanic.c
index 41cab297d66e..2356d621967e 100644
--- a/drivers/misc/pvpanic.c
+++ b/drivers/misc/pvpanic.c
@@ -92,6 +92,7 @@ static const struct of_device_id pvpanic_mmio_match[] = {
{ .compatible = "qemu,pvpanic-mmio", },
{}
 };
+MODULE_DEVICE_TABLE(of, pvpanic_mmio_match);
 
 static const struct acpi_device_id pvpanic_device_ids[] = {
{ "QEMU0001", 0 },
-- 
2.24.0.rc2



Re: [PATCH] misc/pvpanic: Export module FDT device table

2021-02-18 Thread Shile Zhang




On 2021/2/18 18:00, Greg Kroah-Hartman wrote:

On Thu, Feb 18, 2021 at 05:40:24PM +0800, Shile Zhang wrote:

Export the module FDT device table to ensure the FDT compatible strings
are listed in the module alias. This help the pvpanic driver can be
loaded on boot automatically not only the ACPI device, but also the FDT
device.

Signed-off-by: Shile Zhang 
---
  drivers/misc/pvpanic.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/misc/pvpanic.c b/drivers/misc/pvpanic.c
index 41cab297d66e..2356d621967e 100644
--- a/drivers/misc/pvpanic.c
+++ b/drivers/misc/pvpanic.c
@@ -92,6 +92,7 @@ static const struct of_device_id pvpanic_mmio_match[] = {
{ .compatible = "qemu,pvpanic-mmio", },
{}
  };
+MODULE_DEVICE_TABLE(of, pvpanic_mmio_match);


What caused this to not work properly?  I.e. should there be a "Fixes:"
tag in the commit changelog as well?


Sorry, I think it should be:

Fixes: 46f934c9a12fc ("misc/pvpanic: add support to get pvpanic device 
info FDT")


Shall I sent v2 with this update?

Thanks!



thanks,

greg k-h



[PATCH v2] misc/pvpanic: Export module FDT device table

2021-02-18 Thread Shile Zhang
Export the module FDT device table to ensure the FDT compatible strings
are listed in the module alias. This helps the pvpanic driver to be
loaded automatically on boot, not only for the ACPI device but also for
the FDT device.

Fixes: 46f934c9a12fc ("misc/pvpanic: add support to get pvpanic device info 
FDT")
Signed-off-by: Shile Zhang 
---
-v2: add the original commit id in changelog
-v1: 
https://lore.kernel.org/lkml/20210218094024.69354-1-shile.zh...@linux.alibaba.com/

 drivers/misc/pvpanic.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/misc/pvpanic.c b/drivers/misc/pvpanic.c
index 41cab297d66e..2356d621967e 100644
--- a/drivers/misc/pvpanic.c
+++ b/drivers/misc/pvpanic.c
@@ -92,6 +92,7 @@ static const struct of_device_id pvpanic_mmio_match[] = {
{ .compatible = "qemu,pvpanic-mmio", },
{}
 };
+MODULE_DEVICE_TABLE(of, pvpanic_mmio_match);
 
 static const struct acpi_device_id pvpanic_device_ids[] = {
{ "QEMU0001", 0 },
-- 
2.24.0.rc2



[RESEND PATCH] mm/hugetlb: topdown mmap supports for hugepage

2019-09-16 Thread shile . zhang
From: Shile Zhang 

Similar to other arches, this adds topdown mmap support for hugepages
in user process address space allocation. It allows mmapping big-size
hugepages. This patch is copied from the implementation in arch/x86.

Signed-off-by: Shile Zhang 
---
 arch/arm/include/asm/page.h |  1 +
 arch/arm/mm/hugetlbpage.c   | 85 +
 2 files changed, 86 insertions(+)

diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index c2b75cb..dcb4df5 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -141,6 +141,7 @@ extern void __cpu_copy_user_highpage(struct page *to, 
struct page *from,
 
 #ifdef CONFIG_KUSER_HELPERS
 #define __HAVE_ARCH_GATE_AREA 1
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #endif
 
 #ifdef CONFIG_ARM_LPAE
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
index a1e5aac..ba9e151 100644
--- a/arch/arm/mm/hugetlbpage.c
+++ b/arch/arm/mm/hugetlbpage.c
@@ -33,3 +33,88 @@ int pmd_huge(pmd_t pmd)
 {
return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
 }
+
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
+   unsigned long addr, unsigned long len,
+   unsigned long pgoff, unsigned long flags)
+{
+   struct hstate *h = hstate_file(file);
+   struct vm_unmapped_area_info info;
+
+   info.flags = 0;
+   info.length = len;
+   info.low_limit = current->mm->mmap_legacy_base;
+   info.high_limit = TASK_SIZE;
+   info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+   info.align_offset = 0;
+   return vm_unmapped_area(&info);
+}
+
+static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
+   unsigned long addr0, unsigned long len,
+   unsigned long pgoff, unsigned long flags)
+{
+   struct hstate *h = hstate_file(file);
+   struct vm_unmapped_area_info info;
+   unsigned long addr;
+
+   info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+   info.length = len;
+   info.low_limit = PAGE_SIZE;
+   info.high_limit = current->mm->mmap_base;
+   info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+   info.align_offset = 0;
+   addr = vm_unmapped_area(&info);
+
+   /*
+* A failed mmap() very likely causes application failure,
+* so fall back to the bottom-up function here. This scenario
+* can happen with large stack limits and large mmap()
+* allocations.
+*/
+   if (addr & ~PAGE_MASK) {
+   VM_BUG_ON(addr != -ENOMEM);
+   info.flags = 0;
+   info.low_limit = TASK_UNMAPPED_BASE;
+   info.high_limit = TASK_SIZE;
+   addr = vm_unmapped_area(&info);
+   }
+
+   return addr;
+}
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+   unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+   struct hstate *h = hstate_file(file);
+   struct mm_struct *mm = current->mm;
+   struct vm_area_struct *vma;
+
+   if (len & ~huge_page_mask(h))
+   return -EINVAL;
+   if (len > TASK_SIZE)
+   return -ENOMEM;
+
+   if (flags & MAP_FIXED) {
+   if (prepare_hugepage_range(file, addr, len))
+   return -EINVAL;
+   return addr;
+   }
+
+   if (addr) {
+   addr = ALIGN(addr, huge_page_size(h));
+   vma = find_vma(mm, addr);
+   if (TASK_SIZE - len >= addr &&
+   (!vma || addr + len <= vma->vm_start))
+   return addr;
+   }
+   if (mm->get_unmapped_area == arch_get_unmapped_area)
+   return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+   pgoff, flags);
+   else
+   return hugetlb_get_unmapped_area_topdown(file, addr, len,
+   pgoff, flags);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
-- 
1.8.3.1



[PATCH] bcache: add cond_resched() in __bch_cache_cmp()

2019-03-06 Thread shile . zhang
From: Shile Zhang 

Read /sys/fs/bcache//cacheN/priority_stats can take very long
time with huge cache after long run.

Signed-off-by: Shile Zhang 
---
 drivers/md/bcache/sysfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 557a8a3..028fea1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -897,6 +897,7 @@ static void bch_cache_set_internal_release(struct kobject 
*k)
 
 static int __bch_cache_cmp(const void *l, const void *r)
 {
+   cond_resched();
return *((uint16_t *)r) - *((uint16_t *)l);
 }
 
-- 
1.8.3.1



Re: [PATCH] bcache: add cond_resched() in __bch_cache_cmp()

2019-03-07 Thread Shile Zhang



On 2019/3/7 18:34, Coly Li wrote:

On 2019/3/7 1:15 下午, shile.zh...@linux.alibaba.com wrote:

From: Shile Zhang 

Read /sys/fs/bcache//cacheN/priority_stats can take very long
time with huge cache after long run.

Signed-off-by: Shile Zhang 

Hi Shile,

Do you test your change ? It will be helpful with more performance data
(what problem that you improved).


In case of 960GB SSD cache device, once read of the 'priority_stats' 
costs about 600ms in our test environment.


The perf tool shown that near 50% CPU time consumed by 'sort()', this 
means once sort will hold the CPU near 300ms.


In our case, the statistics collector reads the 'priority_stats' 
periodically, it will trigger the schedule latency jitters of the


task which shared same CPU core.



Thanks.

Coly Li


---
  drivers/md/bcache/sysfs.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 557a8a3..028fea1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -897,6 +897,7 @@ static void bch_cache_set_internal_release(struct kobject 
*k)
  
  static int __bch_cache_cmp(const void *l, const void *r)

  {
+   cond_resched();
return *((uint16_t *)r) - *((uint16_t *)l);
  }
  





Re: [PATCH] bcache: add cond_resched() in __bch_cache_cmp()

2019-03-07 Thread Shile Zhang



On 2019/3/7 23:44, Vojtech Pavlik wrote:

On Thu, Mar 07, 2019 at 11:36:18PM +0800, Coly Li wrote:

On 2019/3/7 11:06 下午, Shile Zhang wrote:

On 2019/3/7 18:34, Coly Li wrote:

On 2019/3/7 1:15 下午, shile.zh...@linux.alibaba.com wrote:

From: Shile Zhang 

Read /sys/fs/bcache//cacheN/priority_stats can take very long
time with huge cache after long run.

Signed-off-by: Shile Zhang 

Hi Shile,

Do you test your change ? It will be helpful with more performance data
(what problem that you improved).

In case of 960GB SSD cache device, once read of the 'priority_stats'
costs about 600ms in our test environment.


After the fix, how much time it takes ?



The perf tool shown that near 50% CPU time consumed by 'sort()', this
means once sort will hold the CPU near 300ms.

In our case, the statistics collector reads the 'priority_stats'
periodically, it will trigger the schedule latency jitters of the

task which shared same CPU core.


Hmm, it seems you just make the sort slower, and nothing more changes.
Am I right ?

Well, it has to make the sort slower, but it'll also avoid hogging the
CPU (on a non-preemptible kernel), avoiding a potential soft lockup
warning and allowing other tasks to run.

Yes, there is a risk that other tasks have no chance to run due to sort 
hogging the CPU, it is harmful to some schedule-latency sensitive tasks.
This change just try to reduce the impact of sort, but not a performance 
improvement of it. I'm not sure if a better way can handle it more 
efficiency.


Thanks,

Shile




[PATCH] sched: fix the inconsistent of SCHED_RR timeslice tuning knob showing

2017-01-28 Thread Shile Zhang
Clark added a SCHED_RR tuning knob in commit ce0d30ae ("sched/rt: Add a
tuning knob to allow changing SCHED_RR timeslice"). The tuning knob
sched_rr_timeslice_ms takes a user-visible value in milliseconds, but it
is still shown in jiffies, as it is stored.
This inconsistency is confusing when HZ is not 1000; it looks like setting
the value failed, for example with HZ=100:

root# echo 100 > /proc/sys/kernel/sched_rr_timeslice_ms
root# cat /proc/sys/kernel/sched_rr_timeslice_ms
10

This fix makes sched_rr_timeslice show in milliseconds.

Signed-off-by: Shile Zhang 
---
 include/linux/sched/sysctl.h | 1 +
 kernel/sched/core.c  | 5 +++--
 kernel/sched/rt.c| 1 +
 kernel/sysctl.c  | 2 +-
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 4411453..49308e1 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -59,6 +59,7 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
 
+extern int sysctl_sched_rr_timeslice;
 extern int sched_rr_timeslice;
 
 extern int sched_rr_handler(struct ctl_table *table, int write,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c56fb57..9297563 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8369,8 +8369,9 @@ int sched_rr_handler(struct ctl_table *table, int write,
/* make sure that internally we keep jiffies */
/* also, writing zero resets timeslice to default */
if (!ret && write) {
-   sched_rr_timeslice = sched_rr_timeslice <= 0 ?
-   RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+   sched_rr_timeslice =
+   sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+   msecs_to_jiffies(sysctl_sched_rr_timeslice);
}
mutex_unlock(&mutex);
return ret;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2516b8d..8737a0e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,7 @@
 #include 
 
 int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8dbaec0..9a90097 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -416,7 +416,7 @@ static struct ctl_table kern_table[] = {
},
{
.procname   = "sched_rr_timeslice_ms",
-   .data   = &sched_rr_timeslice,
+   .data   = &sysctl_sched_rr_timeslice,
.maxlen = sizeof(int),
.mode   = 0644,
.proc_handler   = sched_rr_handler,
-- 
2.6.2



[PATCH] hangcheck-timer: Fix typo in comment

2017-03-22 Thread Shile Zhang
Signed-off-by: Shile Zhang 
---
 drivers/char/hangcheck-timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 4f33737..dcd37b1 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -32,7 +32,7 @@
  * timer and 180 seconds for the margin of error.  IOW, a timer is set
  * for 60 seconds.  When the timer fires, the callback checks the
  * actual duration that the timer waited.  If the duration exceeds the
- * alloted time and margin (here 60 + 180, or 240 seconds), the machine
+ * allowed time and margin (here 60 + 180, or 240 seconds), the machine
  * is restarted.  A healthy machine will have the duration match the
  * expected timeout very closely.
  */
-- 
2.6.2



[PATCH] hangcheck-timer: Fix typo in comment

2017-03-22 Thread Shile Zhang
Signed-off-by: Shile Zhang 
---
 drivers/char/hangcheck-timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 4f33737..dcd37b1 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -32,7 +32,7 @@
  * timer and 180 seconds for the margin of error.  IOW, a timer is set
  * for 60 seconds.  When the timer fires, the callback checks the
  * actual duration that the timer waited.  If the duration exceeds the
- * alloted time and margin (here 60 + 180, or 240 seconds), the machine
+ * allowed time and margin (here 60 + 180, or 240 seconds), the machine
  * is restarted.  A healthy machine will have the duration match the
  * expected timeout very closely.
  */
-- 
2.6.2



[PATCH] hangcheck-timer: Fix typo in comment

2017-03-23 Thread Shile Zhang
Fix the typo "alloted" -> "allowed" in comment.

Signed-off-by: Shile Zhang 
---
 drivers/char/hangcheck-timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 4f33737..dcd37b1 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -32,7 +32,7 @@
  * timer and 180 seconds for the margin of error.  IOW, a timer is set
  * for 60 seconds.  When the timer fires, the callback checks the
  * actual duration that the timer waited.  If the duration exceeds the
- * alloted time and margin (here 60 + 180, or 240 seconds), the machine
+ * allowed time and margin (here 60 + 180, or 240 seconds), the machine
  * is restarted.  A healthy machine will have the duration match the
  * expected timeout very closely.
  */
-- 
2.6.2



[PATCH] hangcheck-timer: Fix typo in comment

2017-03-23 Thread Shile Zhang
Fix the typo "alloted" -> "allotted" in comment.

Signed-off-by: Shile Zhang 
---
 drivers/char/hangcheck-timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 4f33737..5406b90 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -32,7 +32,7 @@
  * timer and 180 seconds for the margin of error.  IOW, a timer is set
  * for 60 seconds.  When the timer fires, the callback checks the
  * actual duration that the timer waited.  If the duration exceeds the
- * alloted time and margin (here 60 + 180, or 240 seconds), the machine
+ * allotted time and margin (here 60 + 180, or 240 seconds), the machine
  * is restarted.  A healthy machine will have the duration match the
  * expected timeout very closely.
  */
-- 
2.6.2



[PATCH v3] hangcheck-timer: Fix typo in comment

2017-03-23 Thread Shile Zhang
Fix the typo "alloted" -> "allotted" in comment.

Signed-off-by: Shile Zhang 
---
 drivers/char/hangcheck-timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 4f33737..5406b90 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -32,7 +32,7 @@
  * timer and 180 seconds for the margin of error.  IOW, a timer is set
  * for 60 seconds.  When the timer fires, the callback checks the
  * actual duration that the timer waited.  If the duration exceeds the
- * alloted time and margin (here 60 + 180, or 240 seconds), the machine
+ * allotted time and margin (here 60 + 180, or 240 seconds), the machine
  * is restarted.  A healthy machine will have the duration match the
  * expected timeout very closely.
  */
-- 
2.6.2



Re: [PATCH v2] virtio_ring: use alloc_pages_node for NUMA-aware allocation

2020-08-04 Thread Shile Zhang

Hi Michael & Bjorn,

Sorry for the ping,
but how about this patch/issue? any comments/suggestions?

Thanks!

On 2020/7/27 21:10, Shile Zhang wrote:



On 2020/7/21 19:28, Shile Zhang wrote:



On 2020/7/21 16:18, Michael S. Tsirkin wrote:

On Tue, Jul 21, 2020 at 03:00:13PM +0800, Shile Zhang wrote:

Use alloc_pages_node() allocate memory for vring queue with proper
NUMA affinity.

Reported-by: kernel test robot 
Suggested-by: Jiang Liu 
Signed-off-by: Shile Zhang 


Do you observe any performance gains from this patch?


Thanks for your comments!
Yes, the bandwidth can boost more than doubled (from 30Gbps to 80GBps) 
with this changes in my test env (8 numa nodes), with netperf test.




I also wonder why isn't the probe code run on the correct numa node?
That would fix a wide class of issues like this without need to tweak
drivers.


Good point, I'll check this, thanks!


Sorry, I have no idea about how the probe code to grab the appropriate 
NUMA node.






Bjorn, what do you think? Was this considered?


Hi Bjorn, Could you please give any comments about this issue?
Thanks!




---
Changelog
v1 -> v2:
- fixed compile warning reported by LKP.
---
  drivers/virtio/virtio_ring.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c 
b/drivers/virtio/virtio_ring.c

index 58b96baa8d48..d38fd6872c8c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -276,9 +276,11 @@ static void *vring_alloc_queue(struct 
virtio_device *vdev, size_t size,

  return dma_alloc_coherent(vdev->dev.parent, size,
    dma_handle, flag);
  } else {
-    void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
-
-    if (queue) {
+    void *queue = NULL;
+    struct page *page = 
alloc_pages_node(dev_to_node(vdev->dev.parent),

+ flag, get_order(size));
+    if (page) {
+    queue = page_address(page);
  phys_addr_t phys_addr = virt_to_phys(queue);
  *dma_handle = (dma_addr_t)phys_addr;
@@ -308,7 +310,7 @@ static void vring_free_queue(struct 
virtio_device *vdev, size_t size,

  if (vring_use_dma_api(vdev))
  dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
  else
-    free_pages_exact(queue, PAGE_ALIGN(size));
+    free_pages((unsigned long)queue, get_order(size));
  }
  /*
--
2.24.0.rc2


Re: [PATCH v2] virtio_ring: use alloc_pages_node for NUMA-aware allocation

2020-07-27 Thread Shile Zhang




On 2020/7/21 19:28, Shile Zhang wrote:



On 2020/7/21 16:18, Michael S. Tsirkin wrote:

On Tue, Jul 21, 2020 at 03:00:13PM +0800, Shile Zhang wrote:

Use alloc_pages_node() allocate memory for vring queue with proper
NUMA affinity.

Reported-by: kernel test robot 
Suggested-by: Jiang Liu 
Signed-off-by: Shile Zhang 


Do you observe any performance gains from this patch?


Thanks for your comments!
Yes, the bandwidth can boost more than doubled (from 30Gbps to 80GBps) 
with this changes in my test env (8 numa nodes), with netperf test.




I also wonder why isn't the probe code run on the correct numa node?
That would fix a wide class of issues like this without need to tweak
drivers.


Good point, I'll check this, thanks!


Sorry, I have no idea about how the probe code to grab the appropriate 
NUMA node.






Bjorn, what do you think? Was this considered?


Hi Bjorn, Could you please give any comments about this issue?
Thanks!




---
Changelog
v1 -> v2:
- fixed compile warning reported by LKP.
---
  drivers/virtio/virtio_ring.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 58b96baa8d48..d38fd6872c8c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -276,9 +276,11 @@ static void *vring_alloc_queue(struct 
virtio_device *vdev, size_t size,

  return dma_alloc_coherent(vdev->dev.parent, size,
    dma_handle, flag);
  } else {
-    void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
-
-    if (queue) {
+    void *queue = NULL;
+    struct page *page = 
alloc_pages_node(dev_to_node(vdev->dev.parent),

+ flag, get_order(size));
+    if (page) {
+    queue = page_address(page);
  phys_addr_t phys_addr = virt_to_phys(queue);
  *dma_handle = (dma_addr_t)phys_addr;
@@ -308,7 +310,7 @@ static void vring_free_queue(struct virtio_device 
*vdev, size_t size,

  if (vring_use_dma_api(vdev))
  dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
  else
-    free_pages_exact(queue, PAGE_ALIGN(size));
+    free_pages((unsigned long)queue, get_order(size));
  }
  /*
--
2.24.0.rc2


[PATCH v2] virtio_ring: use alloc_pages_node for NUMA-aware allocation

2020-07-21 Thread Shile Zhang
Use alloc_pages_node() to allocate memory for the vring queue with
proper NUMA affinity.

Reported-by: kernel test robot 
Suggested-by: Jiang Liu 
Signed-off-by: Shile Zhang 
---
Changelog
v1 -> v2:
- fixed compile warning reported by LKP.
---
 drivers/virtio/virtio_ring.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 58b96baa8d48..d38fd6872c8c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -276,9 +276,11 @@ static void *vring_alloc_queue(struct virtio_device *vdev, 
size_t size,
return dma_alloc_coherent(vdev->dev.parent, size,
  dma_handle, flag);
} else {
-   void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
-
-   if (queue) {
+   void *queue = NULL;
+   struct page *page = 
alloc_pages_node(dev_to_node(vdev->dev.parent),
+flag, get_order(size));
+   if (page) {
+   queue = page_address(page);
phys_addr_t phys_addr = virt_to_phys(queue);
*dma_handle = (dma_addr_t)phys_addr;
 
@@ -308,7 +310,7 @@ static void vring_free_queue(struct virtio_device *vdev, 
size_t size,
if (vring_use_dma_api(vdev))
dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
else
-   free_pages_exact(queue, PAGE_ALIGN(size));
+   free_pages((unsigned long)queue, get_order(size));
 }
 
 /*
-- 
2.24.0.rc2



Re: [PATCH v2] virtio_ring: use alloc_pages_node for NUMA-aware allocation

2020-07-21 Thread Shile Zhang




On 2020/7/21 16:18, Michael S. Tsirkin wrote:

On Tue, Jul 21, 2020 at 03:00:13PM +0800, Shile Zhang wrote:

Use alloc_pages_node() allocate memory for vring queue with proper
NUMA affinity.

Reported-by: kernel test robot 
Suggested-by: Jiang Liu 
Signed-off-by: Shile Zhang 


Do you observe any performance gains from this patch?


Thanks for your comments!
Yes, the bandwidth can boost more than doubled (from 30Gbps to 80GBps) 
with this changes in my test env (8 numa nodes), with netperf test.




I also wonder why isn't the probe code run on the correct numa node?
That would fix a wide class of issues like this without need to tweak
drivers.


Good point, I'll check this, thanks!



Bjorn, what do you think? Was this considered?


---
Changelog
v1 -> v2:
- fixed compile warning reported by LKP.
---
  drivers/virtio/virtio_ring.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 58b96baa8d48..d38fd6872c8c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -276,9 +276,11 @@ static void *vring_alloc_queue(struct virtio_device *vdev, 
size_t size,
return dma_alloc_coherent(vdev->dev.parent, size,
  dma_handle, flag);
} else {
-   void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
-
-   if (queue) {
+   void *queue = NULL;
+   struct page *page = 
alloc_pages_node(dev_to_node(vdev->dev.parent),
+flag, get_order(size));
+   if (page) {
+   queue = page_address(page);
phys_addr_t phys_addr = virt_to_phys(queue);
*dma_handle = (dma_addr_t)phys_addr;
  
@@ -308,7 +310,7 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,

if (vring_use_dma_api(vdev))
dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
else
-   free_pages_exact(queue, PAGE_ALIGN(size));
+   free_pages((unsigned long)queue, get_order(size));
  }
  
  /*

--
2.24.0.rc2


[PATCH] virtio_ring: use alloc_pages_node for NUMA-aware allocation

2020-07-17 Thread Shile Zhang
Use alloc_pages_node() to allocate memory for the vring queue with
proper NUMA affinity.

Suggested-by: Jiang Liu 
Signed-off-by: Shile Zhang 
---
 drivers/virtio/virtio_ring.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 58b96baa8d48..ded82880281a 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -276,9 +276,11 @@ static void *vring_alloc_queue(struct virtio_device *vdev, 
size_t size,
return dma_alloc_coherent(vdev->dev.parent, size,
  dma_handle, flag);
} else {
-   void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
-
-   if (queue) {
+   void *queue = NULL;
+   struct page *page = 
alloc_pages_node(dev_to_node(&vdev->dev.parent),
+flag, get_order(size));
+   if (page) {
+   queue = page_address(page);
phys_addr_t phys_addr = virt_to_phys(queue);
*dma_handle = (dma_addr_t)phys_addr;
 
@@ -308,7 +310,7 @@ static void vring_free_queue(struct virtio_device *vdev, 
size_t size,
if (vring_use_dma_api(vdev))
dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
else
-   free_pages_exact(queue, PAGE_ALIGN(size));
+   free_pages((unsigned long)queue, get_order(size));
 }
 
 /*
-- 
2.24.0.rc2



[PATCH] watchdog: fix build error if define SOFTWARE_REBOOT

2017-04-10 Thread Shile Zhang
To fix following build error when SOFTWARE_REBOOT is defined:

  CC [M]  driver/watchdog/wdt_pci.o
driver/watchdog/wdt_pci.c: In function 'wdtpci_interrupt':
driver/watchdog/wdt_pci.c:335:3: error: too many arguments to function 
'emergency_restart'
   emergency_restart(NULL);
   ^
In file included from driver/watchdog/wdt_pci.c:51:0:
include/linux/reboot.h:80:13: note: declared here
 extern void emergency_restart(void);
     ^

Signed-off-by: Shile Zhang 
---
 drivers/watchdog/wdt_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/watchdog/wdt_pci.c b/drivers/watchdog/wdt_pci.c
index 48b2c05..bc7addc 100644
--- a/drivers/watchdog/wdt_pci.c
+++ b/drivers/watchdog/wdt_pci.c
@@ -332,7 +332,7 @@ static irqreturn_t wdtpci_interrupt(int irq, void *dev_id)
pr_crit("Would Reboot\n");
 #else
pr_crit("Initiating system reboot\n");
-   emergency_restart(NULL);
+   emergency_restart();
 #endif
 #else
pr_crit("Reset in 5ms\n");
-- 
2.6.2



[PATCH] arm: fix the spacing/tabbing issue

2017-11-17 Thread Shile Zhang
To fix the style issue where spaces were used instead of tabs.

Signed-off-by: Shile Zhang 
---
 arch/arm/Kconfig | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 7888c98..d5ee446 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1696,12 +1696,12 @@ config HW_PERF_EVENTS
depends on ARM_PMU
 
 config SYS_SUPPORTS_HUGETLBFS
-   def_bool y
-   depends on ARM_LPAE
+   def_bool y
+   depends on ARM_LPAE
 
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
-   def_bool y
-   depends on ARM_LPAE
+   def_bool y
+   depends on ARM_LPAE
 
 config ARCH_WANT_GENERAL_HUGETLB
def_bool y
-- 
2.6.2



[PATCH v1] arm: fix the spacing/tabbing issue

2017-11-19 Thread Shile Zhang
To fix the style issue where spaces were used instead of tabs.

Signed-off-by: Shile Zhang 
---
 v1: added fixes for more lines that were missed previously.
 Thanks!

 arch/arm/Kconfig | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d1346a1..3728e7e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -220,7 +220,7 @@ config ZONE_DMA
bool
 
 config NEED_DMA_MAP_STATE
-   def_bool y
+   def_bool y
 
 config ARCH_SUPPORTS_UPROBES
def_bool y
@@ -1134,9 +1134,9 @@ config ARM_ERRATA_764369
  in the diagnostic control register of the SCU.
 
 config ARM_ERRATA_775420
-   bool "ARM errata: A data cache maintenance operation which aborts, 
might lead to deadlock"
-   depends on CPU_V7
-   help
+   bool "ARM errata: A data cache maintenance operation which aborts, 
might lead to deadlock"
+   depends on CPU_V7
+   help
 This option enables the workaround for the 775420 Cortex-A9 (r2p2,
 r2p6,r2p8,r2p10,r3p0) erratum. In case a date cache maintenance
 operation aborts with MMU exception, it might cause the processor
@@ -1697,12 +1697,12 @@ config HW_PERF_EVENTS
depends on ARM_PMU
 
 config SYS_SUPPORTS_HUGETLBFS
-   def_bool y
-   depends on ARM_LPAE
+   def_bool y
+   depends on ARM_LPAE
 
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
-   def_bool y
-   depends on ARM_LPAE
+   def_bool y
+   depends on ARM_LPAE
 
 config ARCH_WANT_GENERAL_HUGETLB
def_bool y
-- 
2.6.2



[tip:sched/core] sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning knob in milliseconds

2017-02-01 Thread tip-bot for Shile Zhang
Commit-ID:  975e155ed8732cb81f55c021c441ae662dd040b5
Gitweb: http://git.kernel.org/tip/975e155ed8732cb81f55c021c441ae662dd040b5
Author: Shile Zhang 
AuthorDate: Sat, 28 Jan 2017 22:00:49 +0800
Committer:  Ingo Molnar 
CommitDate: Wed, 1 Feb 2017 11:01:30 +0100

sched/rt: Show the 'sched_rr_timeslice' SCHED_RR timeslice tuning knob in 
milliseconds

We added the 'sched_rr_timeslice_ms' SCHED_RR tuning knob in this commit:

  ce0d30ae ("sched/rt: Add a tuning knob to allow changing SCHED_RR 
timeslice")

... which name suggests to users that it's in milliseconds, while in reality
it's being set in milliseconds but the result is shown in jiffies.

This is obviously confusing when HZ is not 1000, it makes it appear like the
value set failed, such as HZ=100:

  root# echo 100 > /proc/sys/kernel/sched_rr_timeslice_ms
  root# cat /proc/sys/kernel/sched_rr_timeslice_ms
  10

Fix this to be milliseconds all around.

Signed-off-by: Shile Zhang 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Link: 
http://lkml.kernel.org/r/1485612049-20923-1-git-send-email-shile.zh...@nokia.com
Signed-off-by: Ingo Molnar 
---
 include/linux/sched/sysctl.h | 1 +
 kernel/sched/core.c  | 5 +++--
 kernel/sched/rt.c| 1 +
 kernel/sysctl.c  | 2 +-
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 4411453..49308e1 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -59,6 +59,7 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
 
+extern int sysctl_sched_rr_timeslice;
 extern int sched_rr_timeslice;
 
 extern int sched_rr_handler(struct ctl_table *table, int write,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d01f9d0..10e18fa 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8471,8 +8471,9 @@ int sched_rr_handler(struct ctl_table *table, int write,
/* make sure that internally we keep jiffies */
/* also, writing zero resets timeslice to default */
if (!ret && write) {
-   sched_rr_timeslice = sched_rr_timeslice <= 0 ?
-   RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+   sched_rr_timeslice =
+   sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+   msecs_to_jiffies(sysctl_sched_rr_timeslice);
}
mutex_unlock(&mutex);
return ret;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 704f2b8..4101f9d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,7 @@
 #include 
 
 int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1aea594..bb260ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -416,7 +416,7 @@ static struct ctl_table kern_table[] = {
},
{
.procname   = "sched_rr_timeslice_ms",
-   .data   = &sched_rr_timeslice,
+   .data   = &sysctl_sched_rr_timeslice,
.maxlen = sizeof(int),
.mode   = 0644,
.proc_handler   = sched_rr_handler,