[PATCH] lib/sg_pool: remove unnecessary null check when free the object

2018-07-31 Thread zhong jiang
kmem_cache_destroy() and mempool_destroy() already handle a NULL argument,
so remove the redundant checks.

Signed-off-by: zhong jiang 
---
 lib/sg_pool.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lib/sg_pool.c b/lib/sg_pool.c
index 6dd3061..d1c1e63 100644
--- a/lib/sg_pool.c
+++ b/lib/sg_pool.c
@@ -148,10 +148,9 @@ static __init int sg_pool_init(void)
 cleanup_sdb:
for (i = 0; i < SG_MEMPOOL_NR; i++) {
struct sg_pool *sgp = sg_pools + i;
-   if (sgp->pool)
-   mempool_destroy(sgp->pool);
-   if (sgp->slab)
-   kmem_cache_destroy(sgp->slab);
+
+   mempool_destroy(sgp->pool);
+   kmem_cache_destroy(sgp->slab);
}
 
return -ENOMEM;
-- 
1.7.12.4



Re: [PATCH] lib/sg_pool,debugobjects: remove unnecessary null check when free the object

2018-07-31 Thread zhong jiang
On 2018/8/1 0:21, Thomas Gleixner wrote:
> On Tue, 31 Jul 2018, zhong jiang wrote:
>
>> kmem_cache_destroy/mempool_destroy has taken null check into account.
>> so remove the redundant check.
> Please split the patch so they can be applied independently by the relevant
> maintainers.
>
> Thanks,
>
>   tglx
>
> .
>
Ok, will do .

Thanks,
zhong jiang



[PATCH] x86/platform/olpc: Use the PTR_ERR_OR_ZERO to simplify the code

2018-07-31 Thread zhong jiang
Using PTR_ERR_OR_ZERO is better than the open-coded equivalent.

Signed-off-by: zhong jiang 
---
 arch/x86/platform/olpc/olpc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index 7c3077e..f0e920f 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -311,10 +311,8 @@ static int __init add_xo1_platform_devices(void)
return PTR_ERR(pdev);
 
pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
-   if (IS_ERR(pdev))
-   return PTR_ERR(pdev);
 
-   return 0;
+   return PTR_ERR_OR_ZERO(pdev);
 }
 
 static int olpc_xo1_ec_probe(struct platform_device *pdev)
-- 
1.7.12.4



[PATCH] lib/sg_pool,debugobjects: remove unnecessary null check when free the object

2018-07-31 Thread zhong jiang
kmem_cache_destroy() and mempool_destroy() already handle a NULL argument,
so remove the redundant checks.

Signed-off-by: zhong jiang 
---
 lib/debugobjects.c | 3 +--
 lib/sg_pool.c  | 7 +++
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 994be48..7a6d80b 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -1185,8 +1185,7 @@ void __init debug_objects_mem_init(void)
 
if (!obj_cache || debug_objects_replace_static_objects()) {
debug_objects_enabled = 0;
-   if (obj_cache)
-   kmem_cache_destroy(obj_cache);
+   kmem_cache_destroy(obj_cache);
pr_warn("out of memory.\n");
} else
debug_objects_selftest();
diff --git a/lib/sg_pool.c b/lib/sg_pool.c
index 6dd3061..d1c1e63 100644
--- a/lib/sg_pool.c
+++ b/lib/sg_pool.c
@@ -148,10 +148,9 @@ static __init int sg_pool_init(void)
 cleanup_sdb:
for (i = 0; i < SG_MEMPOOL_NR; i++) {
struct sg_pool *sgp = sg_pools + i;
-   if (sgp->pool)
-   mempool_destroy(sgp->pool);
-   if (sgp->slab)
-   kmem_cache_destroy(sgp->slab);
+
+   mempool_destroy(sgp->pool);
+   kmem_cache_destroy(sgp->slab);
}
 
return -ENOMEM;
-- 
1.7.12.4



[PATCH] driver/hwtracing: use ERR_CAST instead of ERR_PTR

2018-07-31 Thread zhong jiang
Use the ERR_CAST() inline function to replace ERR_PTR(PTR_ERR()). It
makes the code more concise.

Signed-off-by: zhong jiang 
---
 drivers/hwtracing/coresight/coresight-tmc-etr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c 
b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 2eda5de..1196364 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -536,7 +536,7 @@ static void tmc_etr_sg_table_populate(struct etr_sg_table 
*etr_table)
sg_table = tmc_alloc_sg_table(dev, node, nr_tpages, nr_dpages, pages);
if (IS_ERR(sg_table)) {
kfree(etr_table);
-   return ERR_PTR(PTR_ERR(sg_table));
+   return ERR_CAST(sg_table);
}
 
etr_table->sg_table = sg_table;
-- 
1.7.12.4



[tip:x86/boot] x86/boot/KASLR: Make local variable mem_limit static

2018-07-30 Thread tip-bot for zhong jiang
Commit-ID:  5db1b1e1ee34871b1965b3f890e3ccbdb185fa52
Gitweb: https://git.kernel.org/tip/5db1b1e1ee34871b1965b3f890e3ccbdb185fa52
Author: zhong jiang 
AuthorDate: Mon, 30 Jul 2018 21:44:33 +0800
Committer:  Thomas Gleixner 
CommitDate: Mon, 30 Jul 2018 19:46:03 +0200

x86/boot/KASLR: Make local variable mem_limit static

Fix the following sparse warning:

arch/x86/boot/compressed/kaslr.c:102:20: warning: symbol 'mem_limit' was not 
declared. Should it be static?

Signed-off-by: zhong jiang 
Signed-off-by: Thomas Gleixner 
Cc: 
Link: 
https://lkml.kernel.org/r/1532958273-47725-1-git-send-email-zhongji...@huawei.com

---
 arch/x86/boot/compressed/kaslr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 531c9876f573..302517929932 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -102,7 +102,7 @@ static bool memmap_too_large;
 
 
 /* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
-unsigned long long mem_limit = ULLONG_MAX;
+static unsigned long long mem_limit = ULLONG_MAX;
 
 
 enum mem_avoid_index {


[PATCH] x86/boot/KASLR: Make local variable mem_limit static

2018-07-30 Thread zhong jiang
Fix the following sparse warning:

arch/x86/boot/compressed/kaslr.c:102:20: warning: symbol 'mem_limit' was not 
declared. Should it be static?

Signed-off-by: zhong jiang 
---
 arch/x86/boot/compressed/kaslr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 5aae509..d1e19f3 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -99,7 +99,7 @@ struct mem_vector {
 
 
 /* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
-unsigned long long mem_limit = ULLONG_MAX;
+static unsigned long long mem_limit = ULLONG_MAX;
 
 
 enum mem_avoid_index {
-- 
1.7.12.4



Re: [PATCH v11 19/26] mm: provide speculative fault infrastructure

2018-07-25 Thread zhong jiang
On 2018/7/25 18:44, Laurent Dufour wrote:
>
> On 25/07/2018 11:04, zhong jiang wrote:
>> On 2018/7/25 0:10, Laurent Dufour wrote:
>>> On 24/07/2018 16:26, zhong jiang wrote:
>>>> On 2018/5/17 19:06, Laurent Dufour wrote:
>>>>> From: Peter Zijlstra 
>>>>>
>>>>> Provide infrastructure to do a speculative fault (not holding
>>>>> mmap_sem).
>>>>>
>>>>> The not holding of mmap_sem means we can race against VMA
>>>>> change/removal and page-table destruction. We use the SRCU VMA freeing
>>>>> to keep the VMA around. We use the VMA seqcount to detect change
>>>>> (including umapping / page-table deletion) and we use gup_fast() style
>>>>> page-table walking to deal with page-table races.
>>>>>
>>>>> Once we've obtained the page and are ready to update the PTE, we
>>>>> validate if the state we started the fault with is still valid, if
>>>>> not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
>>>>> PTE and we're done.
>>>>>
>>>>> Signed-off-by: Peter Zijlstra (Intel) 
>>>>>
>>>>> [Manage the newly introduced pte_spinlock() for speculative page
>>>>>  fault to fail if the VMA is touched in our back]
>>>>> [Rename vma_is_dead() to vma_has_changed() and declare it here]
>>>>> [Fetch p4d and pud]
>>>>> [Set vmd.sequence in __handle_mm_fault()]
>>>>> [Abort speculative path when handle_userfault() has to be called]
>>>>> [Add additional VMA's flags checks in handle_speculative_fault()]
>>>>> [Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
>>>>> [Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
>>>>> [Remove warning comment about waiting for !seq&1 since we don't want
>>>>>  to wait]
>>>>> [Remove warning about no huge page support, mention it explictly]
>>>>> [Don't call do_fault() in the speculative path as __do_fault() calls
>>>>>  vma->vm_ops->fault() which may want to release mmap_sem]
>>>>> [Only vm_fault pointer argument for vma_has_changed()]
>>>>> [Fix check against huge page, calling pmd_trans_huge()]
>>>>> [Use READ_ONCE() when reading VMA's fields in the speculative path]
>>>>> [Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support for
>>>>>  processing done in vm_normal_page()]
>>>>> [Check that vma->anon_vma is already set when starting the speculative
>>>>>  path]
>>>>> [Check for memory policy as we can't support MPOL_INTERLEAVE case due to
>>>>>  the processing done in mpol_misplaced()]
>>>>> [Don't support VMA growing up or down]
>>>>> [Move check on vm_sequence just before calling handle_pte_fault()]
>>>>> [Don't build SPF services if !CONFIG_SPECULATIVE_PAGE_FAULT]
>>>>> [Add mem cgroup oom check]
>>>>> [Use READ_ONCE to access p*d entries]
>>>>> [Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()]
>>>>> [Don't fetch pte again in handle_pte_fault() when running the speculative
>>>>>  path]
>>>>> [Check PMD against concurrent collapsing operation]
>>>>> [Try spin lock the pte during the speculative path to avoid deadlock with
>>>>>  other CPU's invalidating the TLB and requiring this CPU to catch the
>>>>>  inter processor's interrupt]
>>>>> [Move define of FAULT_FLAG_SPECULATIVE here]
>>>>> [Introduce __handle_speculative_fault() and add a check against
>>>>>  mm->mm_users in handle_speculative_fault() defined in mm.h]
>>>>> Signed-off-by: Laurent Dufour 
>>>>> ---
>>>>>  include/linux/hugetlb_inline.h |   2 +-
>>>>>  include/linux/mm.h |  30 
>>>>>  include/linux/pagemap.h|   4 +-
>>>>>  mm/internal.h  |  16 +-
>>>>>  mm/memory.c| 340 
>>>>> -
>>>>>  5 files changed, 385 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/hugetlb_inline.h 
>>>>> b/include/linux/hugetlb_inline.h
>>>>> index 0660a03d37d9..9e25283d6fc9 100644
>>>>> --- a/include/linu

Re: [PATCH v11 19/26] mm: provide speculative fault infrastructure

2018-07-25 Thread zhong jiang
On 2018/7/25 0:10, Laurent Dufour wrote:
>
> On 24/07/2018 16:26, zhong jiang wrote:
>> On 2018/5/17 19:06, Laurent Dufour wrote:
>>> From: Peter Zijlstra 
>>>
>>> Provide infrastructure to do a speculative fault (not holding
>>> mmap_sem).
>>>
>>> The not holding of mmap_sem means we can race against VMA
>>> change/removal and page-table destruction. We use the SRCU VMA freeing
>>> to keep the VMA around. We use the VMA seqcount to detect change
>>> (including umapping / page-table deletion) and we use gup_fast() style
>>> page-table walking to deal with page-table races.
>>>
>>> Once we've obtained the page and are ready to update the PTE, we
>>> validate if the state we started the fault with is still valid, if
>>> not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
>>> PTE and we're done.
>>>
>>> Signed-off-by: Peter Zijlstra (Intel) 
>>>
>>> [Manage the newly introduced pte_spinlock() for speculative page
>>>  fault to fail if the VMA is touched in our back]
>>> [Rename vma_is_dead() to vma_has_changed() and declare it here]
>>> [Fetch p4d and pud]
>>> [Set vmd.sequence in __handle_mm_fault()]
>>> [Abort speculative path when handle_userfault() has to be called]
>>> [Add additional VMA's flags checks in handle_speculative_fault()]
>>> [Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
>>> [Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
>>> [Remove warning comment about waiting for !seq&1 since we don't want
>>>  to wait]
>>> [Remove warning about no huge page support, mention it explictly]
>>> [Don't call do_fault() in the speculative path as __do_fault() calls
>>>  vma->vm_ops->fault() which may want to release mmap_sem]
>>> [Only vm_fault pointer argument for vma_has_changed()]
>>> [Fix check against huge page, calling pmd_trans_huge()]
>>> [Use READ_ONCE() when reading VMA's fields in the speculative path]
>>> [Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support for
>>>  processing done in vm_normal_page()]
>>> [Check that vma->anon_vma is already set when starting the speculative
>>>  path]
>>> [Check for memory policy as we can't support MPOL_INTERLEAVE case due to
>>>  the processing done in mpol_misplaced()]
>>> [Don't support VMA growing up or down]
>>> [Move check on vm_sequence just before calling handle_pte_fault()]
>>> [Don't build SPF services if !CONFIG_SPECULATIVE_PAGE_FAULT]
>>> [Add mem cgroup oom check]
>>> [Use READ_ONCE to access p*d entries]
>>> [Replace deprecated ACCESS_ONCE() by READ_ONCE() in vma_has_changed()]
>>> [Don't fetch pte again in handle_pte_fault() when running the speculative
>>>  path]
>>> [Check PMD against concurrent collapsing operation]
>>> [Try spin lock the pte during the speculative path to avoid deadlock with
>>>  other CPU's invalidating the TLB and requiring this CPU to catch the
>>>  inter processor's interrupt]
>>> [Move define of FAULT_FLAG_SPECULATIVE here]
>>> [Introduce __handle_speculative_fault() and add a check against
>>>  mm->mm_users in handle_speculative_fault() defined in mm.h]
>>> Signed-off-by: Laurent Dufour 
>>> ---
>>>  include/linux/hugetlb_inline.h |   2 +-
>>>  include/linux/mm.h |  30 
>>>  include/linux/pagemap.h|   4 +-
>>>  mm/internal.h  |  16 +-
>>>  mm/memory.c| 340 
>>> -
>>>  5 files changed, 385 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
>>> index 0660a03d37d9..9e25283d6fc9 100644
>>> --- a/include/linux/hugetlb_inline.h
>>> +++ b/include/linux/hugetlb_inline.h
>>> @@ -8,7 +8,7 @@
>>>  
>>>  static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
>>>  {
>>> -   return !!(vma->vm_flags & VM_HUGETLB);
>>> +   return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
>>>  }
>>>  
>>>  #else
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index 05cbba70104b..31acf98a7d92 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -315,6 +315,7 @@ extern pgprot_t protection_map[16];
>>>  #define FAULT

[tip:x86/mm] x86/mm/tlb: Make clear_asid_other() static

2018-07-24 Thread tip-bot for zhong jiang
Commit-ID:  387048f51aecaa083e660fe0f15ad339354b116e
Gitweb: https://git.kernel.org/tip/387048f51aecaa083e660fe0f15ad339354b116e
Author: zhong jiang 
AuthorDate: Sat, 21 Jul 2018 15:55:32 +0800
Committer:  Ingo Molnar 
CommitDate: Tue, 24 Jul 2018 09:52:32 +0200

x86/mm/tlb: Make clear_asid_other() static

Fixes the following sparse warning:

  arch/x86/mm/tlb.c:38:6: warning: symbol 'clear_asid_other' was not declared. 
Should it be static?

Signed-off-by: zhong jiang 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: dave.han...@linux.intel.com
Cc: kirill.shute...@linux.intel.com
Cc: tim.c.c...@linux.intel.com
Link: 
http://lkml.kernel.org/r/1532159732-22939-1-git-send-email-zhongji...@huawei.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index f086195f644c..752dbf4e0e50 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -36,7 +36,7 @@
  * necessary invalidation by clearing out the 'ctx_id' which
  * forces a TLB flush when the context is loaded.
  */
-void clear_asid_other(void)
+static void clear_asid_other(void)
 {
u16 asid;
 


[PATCH linux-next] gpio: fix meaningless return expression

2018-07-24 Thread zhong jiang
Fix the following sparse error:

drivers/gpio/gpio-ath79.c:54:16: error: return expression in void function

Signed-off-by: zhong jiang 
---
 drivers/gpio/gpio-ath79.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpio/gpio-ath79.c b/drivers/gpio/gpio-ath79.c
index 684e9d6..0a553d6 100644
--- a/drivers/gpio/gpio-ath79.c
+++ b/drivers/gpio/gpio-ath79.c
@@ -51,7 +51,7 @@ static u32 ath79_gpio_read(struct ath79_gpio_ctrl *ctrl, 
unsigned reg)
 static void ath79_gpio_write(struct ath79_gpio_ctrl *ctrl,
unsigned reg, u32 val)
 {
-   return writel(val, ctrl->base + reg);
+   writel(val, ctrl->base + reg);
 }
 
 static bool ath79_gpio_update_bits(
-- 
1.7.12.4



[PATCH linux-next] driver/gpu: Fix mismatch in function argument type

2018-07-21 Thread zhong jiang
Fix the following warning:

drivers/gpu/drm/nouveau/dispnv50/wndw.c:570:1: error: symbol 'nv50_wndw_new_' 
redeclared with different type
(originally declared at drivers/gpu/drm/nouveau/dispnv50/wndw.h:39) - 
incompatible argument 7 (different signedness)

Signed-off-by: zhong jiang 
---
 drivers/gpu/drm/nouveau/dispnv50/wndw.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.h 
b/drivers/gpu/drm/nouveau/dispnv50/wndw.h
index b0b6428..0770683 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/wndw.h
+++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.h
@@ -38,8 +38,8 @@ struct nv50_wndw {
 
 int nv50_wndw_new_(const struct nv50_wndw_func *, struct drm_device *,
   enum drm_plane_type, const char *name, int index,
-  const u32 *format, enum nv50_disp_interlock_type,
-  u32 interlock_data, u32 heads, struct nv50_wndw **);
+  const u32 *format, u32 heads, enum nv50_disp_interlock_type,
+  u32 interlock_data, struct nv50_wndw **);
 void nv50_wndw_init(struct nv50_wndw *);
 void nv50_wndw_fini(struct nv50_wndw *);
 void nv50_wndw_flush_set(struct nv50_wndw *, u32 *interlock,
-- 
1.7.12.4



[PATCH linux-next] kernel/exit: fix mismatch in function argument types

2018-07-21 Thread zhong jiang
Fix the following warning:

kernel/exit.c:1634:6: error: symbol 'kernel_wait4' redeclared with different 
type (originally declared at ./include/linux/sched/task.h:78)
- incompatible argument 2 (different address spaces)

Signed-off-by: zhong jiang 
---
 include/linux/sched/task.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 5be31eb..108ede9 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -75,7 +75,7 @@ static inline void exit_thread(struct task_struct *tsk)
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, 
int __user *);
 struct task_struct *fork_idle(int);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
-extern long kernel_wait4(pid_t, int *, int, struct rusage *);
+extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
 
 extern void free_task(struct task_struct *tsk);
 
-- 
1.7.12.4



[PATCH linux-next] x86: should use NULL pointer to replace plain integer

2018-07-21 Thread zhong jiang
Fixes the following sparse warnings:

arch/x86/kernel/pci-iommu_table.c:63:37: warning: Using plain integer as NULL 
pointer

Signed-off-by: zhong jiang 
---
 arch/x86/kernel/pci-iommu_table.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/pci-iommu_table.c 
b/arch/x86/kernel/pci-iommu_table.c
index 4dfd90a..2e9006c 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -60,7 +60,7 @@ void __init check_iommu_entries(struct iommu_table_entry 
*start,
printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends 
on %pS and vice-versa. BREAKING IT.\n",
   p->detect, q->detect);
/* Heavy handed way..*/
-   x->depend = 0;
+   x->depend = NULL;
}
}
 
-- 
1.7.12.4



[PATCH linux-next] mm/tlb: make some function static to avoid warning

2018-07-21 Thread zhong jiang
Fixes the following sparse warnings:

arch/x86/mm/tlb.c:38:6: warning: symbol 'clear_asid_other' was not declared. 
Should it be static?

Signed-off-by: zhong jiang 
---
 arch/x86/mm/tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6eb1f34..5e8e478 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -35,7 +35,7 @@
  * necessary invalidation by clearing out the 'ctx_id' which
  * forces a TLB flush when the context is loaded.
  */
-void clear_asid_other(void)
+static void clear_asid_other(void)
 {
u16 asid;
 
-- 
1.7.12.4



Re: [RESEND] x86/numa: move setting parsed numa node to num_add_memblk

2017-12-11 Thread zhong jiang
On 2017/12/11 21:45, Michal Hocko wrote:
> On Mon 11-12-17 20:59:29, zhong jiang wrote:
>> On 2017/12/11 20:03, Michal Hocko wrote:
>>> On Fri 01-12-17 18:13:52, zhong jiang wrote:
>>>> The acpi table are very much like user input. it is likely to
>>>> introduce some unreasonable node in some architecture. but
>>>> they do not ingore the node and bail out in time. it will result
>>>> in unnecessary print.
>>>> e.g  x86:  start is equal to end is a unreasonable node.
>>>> numa_blk_memblk will fails but return 0.
>>>>
>>>> meanwhile, Arm64 node will double set it to "numa_node_parsed"
>>>> after NUMA adds a memblk successfully.  but X86 is not. because
>>>> numa_add_memblk is not set in X86.
>>> I am sorry but I still fail to understand wht the actual problem is.
>>> You said that x86 will print a message. Alright at least you know that
>>> the platform provides a nonsense ACPI/SRAT? tables and you can complain.
>>> But does the kernel misbehave? In what way?
>>   From the view of  the following code , we should expect that the node is 
>> reasonable.
>>   otherwise, if we only want to complain,  it should bail out in time after 
>> printing the
>>   unreasonable message.
>>
>>   node_set(node, numa_nodes_parsed);
>>
>> pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
>> node, pxm,
>> (unsigned long long) start, (unsigned long long) end - 1,
>> hotpluggable ? " hotplug" : "",
>> ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : 
>> "");
>>
>> /* Mark hotplug range in memblock. */
>> if (hotpluggable && memblock_mark_hotplug(start, ma->length))
>> pr_warn("SRAT: Failed to mark hotplug range [mem 
>> %#010Lx-%#010Lx] in memblock\n",
>> (unsigned long long)start, (unsigned long long)end - 
>> 1);
>>
>> max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
>>
>> return 0;
>> out_err_bad_srat:
>> bad_srat();
>>
>>  In addition.  Arm64  will double set node to numa_nodes_parsed after add a 
>> memblk
>> successfully.  Because numa_add_memblk will perform node_set(*, *).
>>
>>  if (numa_add_memblk(node, start, end) < 0) {
>> pr_err("SRAT: Failed to add memblk to node %u [mem 
>> %#010Lx-%#010Lx]\n",
>>node, (unsigned long long) start,
>>(unsigned long long) end - 1);
>> goto out_err_bad_srat;
>> }
>>
>> node_set(node, numa_nodes_parsed);
> I am sorry but I _do not_ understand how this answers my simple
> question. You are describing the code flow which doesn't really explain
> what is the _user_ or a _runtime_ visible effect. Anybody reading this
> changelog will have to scratch his head to understand what the heck does
> this fix and whether the patch needs to be considered for backporting.
> See my point?
 There  is not any visible effect to the user.  IMO,  it is  a better 
optimization.
 Maybe I put more words  to explain  how  the patch works.  :-[

 I found the code is messy when reading it without a real issue. 

 Thanks
 zhong jiang
 



Re: [RESEND] x86/numa: move setting parsed numa node to num_add_memblk

2017-12-11 Thread zhong jiang
On 2017/12/11 20:03, Michal Hocko wrote:
> On Fri 01-12-17 18:13:52, zhong jiang wrote:
>> The acpi table are very much like user input. it is likely to
>> introduce some unreasonable node in some architecture. but
>> they do not ingore the node and bail out in time. it will result
>> in unnecessary print.
>> e.g  x86:  start is equal to end is a unreasonable node.
>> numa_blk_memblk will fails but return 0.
>>
>> meanwhile, Arm64 node will double set it to "numa_node_parsed"
>> after NUMA adds a memblk successfully.  but X86 is not. because
>> numa_add_memblk is not set in X86.
> I am sorry but I still fail to understand wht the actual problem is.
> You said that x86 will print a message. Alright at least you know that
> the platform provides a nonsense ACPI/SRAT? tables and you can complain.
> But does the kernel misbehave? In what way?
  From the view of  the following code , we should expect that the node is 
reasonable.
  otherwise, if we only want to complain,  it should bail out in time after 
printing the
  unreasonable message.

  node_set(node, numa_nodes_parsed);

pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
node, pxm,
(unsigned long long) start, (unsigned long long) end - 1,
hotpluggable ? " hotplug" : "",
ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : "");

/* Mark hotplug range in memblock. */
if (hotpluggable && memblock_mark_hotplug(start, ma->length))
pr_warn("SRAT: Failed to mark hotplug range [mem 
%#010Lx-%#010Lx] in memblock\n",
(unsigned long long)start, (unsigned long long)end - 1);

max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));

return 0;
out_err_bad_srat:
bad_srat();

 In addition.  Arm64  will double set node to numa_nodes_parsed after add a 
memblk
successfully.  Because numa_add_memblk will perform node_set(*, *).

 if (numa_add_memblk(node, start, end) < 0) {
pr_err("SRAT: Failed to add memblk to node %u [mem 
%#010Lx-%#010Lx]\n",
   node, (unsigned long long) start,
   (unsigned long long) end - 1);
goto out_err_bad_srat;
}

node_set(node, numa_nodes_parsed);

Thanks
zhong jiang
>> In view of the above problems. I think it need a better improvement.
>> we add a check here for bypassing the invalid memblk node.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  arch/x86/mm/amdtopology.c | 1 -
>>  arch/x86/mm/numa.c| 3 ++-
>>  drivers/acpi/numa.c   | 5 -
>>  3 files changed, 6 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
>> index 91f501b..7657042 100644
>> --- a/arch/x86/mm/amdtopology.c
>> +++ b/arch/x86/mm/amdtopology.c
>> @@ -151,7 +151,6 @@ int __init amd_numa_init(void)
>>  
>>  prevbase = base;
>>  numa_add_memblk(nodeid, base, limit);
>> -node_set(nodeid, numa_nodes_parsed);
>>  }
>>  
>>  if (!nodes_weight(numa_nodes_parsed))
>> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>> index 25504d5..8f87f26 100644
>> --- a/arch/x86/mm/numa.c
>> +++ b/arch/x86/mm/numa.c
>> @@ -150,6 +150,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, 
>> u64 end,
>>  mi->blk[mi->nr_blks].end = end;
>>  mi->blk[mi->nr_blks].nid = nid;
>>  mi->nr_blks++;
>> +
>> +node_set(nid, numa_nodes_parsed);
>>  return 0;
>>  }
>>  
>> @@ -693,7 +695,6 @@ static int __init dummy_numa_init(void)
>>  printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
>> 0LLU, PFN_PHYS(max_pfn) - 1);
>>  
>> -node_set(0, numa_nodes_parsed);
>>  numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
>>  
>>  return 0;
>> diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
>> index 917f1cc..f2e33cb 100644
>> --- a/drivers/acpi/numa.c
>> +++ b/drivers/acpi/numa.c
>> @@ -294,7 +294,9 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
>> *slit)
>>  goto out_err_bad_srat;
>>  }
>>  
>> -node_set(node, numa_nodes_parsed);
>> +/* some architecture is likely to ignore a unreasonable node */
>> +if (!node_isset(node, numa_nodes_parsed))
>> +goto out;
>>  
>>  pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
>>  node, pxm,
>> @@ -309,6 +311,7 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
>> *slit)
>>  
>>  max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
>>  
>> +out:
>>  return 0;
>>  out_err_bad_srat:
>>  bad_srat();
>> -- 
>> 1.8.3.1




[RESEND] x86/numa: move setting parsed numa node to num_add_memblk

2017-12-01 Thread zhong jiang
The ACPI tables are very much like user input: they are likely to
introduce an unreasonable node on some architectures, but the code
does not ignore such a node and bail out in time, which results in
unnecessary printing.
e.g. on x86, a node whose start is equal to its end is unreasonable;
numa_add_memblk fails but returns 0.

Meanwhile, arm64 sets the node in "numa_nodes_parsed" a second time
after NUMA adds a memblk successfully, but x86 does not, because
numa_add_memblk does not set it on x86.

In view of the above problems, I think this needs improvement:
add a check here to bypass the invalid memblk node.

Signed-off-by: zhong jiang 
---
 arch/x86/mm/amdtopology.c | 1 -
 arch/x86/mm/numa.c| 3 ++-
 drivers/acpi/numa.c   | 5 -
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 91f501b..7657042 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -151,7 +151,6 @@ int __init amd_numa_init(void)
 
prevbase = base;
numa_add_memblk(nodeid, base, limit);
-   node_set(nodeid, numa_nodes_parsed);
}
 
if (!nodes_weight(numa_nodes_parsed))
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 25504d5..8f87f26 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -150,6 +150,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, 
u64 end,
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
mi->nr_blks++;
+
+   node_set(nid, numa_nodes_parsed);
return 0;
 }
 
@@ -693,7 +695,6 @@ static int __init dummy_numa_init(void)
printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
   0LLU, PFN_PHYS(max_pfn) - 1);
 
-   node_set(0, numa_nodes_parsed);
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
 
return 0;
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 917f1cc..f2e33cb 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -294,7 +294,9 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
*slit)
goto out_err_bad_srat;
}
 
-   node_set(node, numa_nodes_parsed);
+   /* some architecture is likely to ignore a unreasonable node */
+   if (!node_isset(node, numa_nodes_parsed))
+   goto out;
 
pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
node, pxm,
@@ -309,6 +311,7 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
*slit)
 
max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
 
+out:
return 0;
 out_err_bad_srat:
bad_srat();
-- 
1.8.3.1



Re: [PATCH] x86/numa: move setting parse numa node to num_add_memblk

2017-12-01 Thread zhong jiang
On 2017/12/1 16:58, Michal Hocko wrote:
> On Fri 01-12-17 16:48:25, zhong jiang wrote:
>> +cc more mm maintainer.
>>
>> Any one has any object.  please let me know.  
> Please repost with the changelog which actually tells 1) what is the
> problem 2) why do we need to address it and 3) how do we address it.
Fine,  I will repost later.

Thanks
zhong jiang



Re: [PATCH] x86/numa: move setting parse numa node to num_add_memblk

2017-12-01 Thread zhong jiang
+cc more mm maintainer.

Any one has any object.  please let me know.  

Thanks
zhongjiang
On 2017/11/29 17:13, zhong jiang wrote:
> Currently, Arm64 and x86 use the common code wehn parsing numa node
> in a acpi way. The arm64 will set the parsed node in numa_add_memblk,
> but the x86 is not set in that , then it will result in the repeatly
> setting. And the parsed node maybe is  unreasonable to the system.
>
> we would better not set it although it also still works. because the
> parsed node is unresonable. so we should skip related operate in this
> node. This patch just set node in various architecture individually.
> it is no functional change.
>
> Signed-off-by: zhong jiang 
> ---
>  arch/x86/mm/amdtopology.c | 1 -
>  arch/x86/mm/numa.c| 3 ++-
>  drivers/acpi/numa.c   | 5 -
>  3 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
> index 91f501b..7657042 100644
> --- a/arch/x86/mm/amdtopology.c
> +++ b/arch/x86/mm/amdtopology.c
> @@ -151,7 +151,6 @@ int __init amd_numa_init(void)
>  
>   prevbase = base;
>   numa_add_memblk(nodeid, base, limit);
> - node_set(nodeid, numa_nodes_parsed);
>   }
>  
>   if (!nodes_weight(numa_nodes_parsed))
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 25504d5..8f87f26 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -150,6 +150,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, 
> u64 end,
>   mi->blk[mi->nr_blks].end = end;
>   mi->blk[mi->nr_blks].nid = nid;
>   mi->nr_blks++;
> +
> + node_set(nid, numa_nodes_parsed);
>   return 0;
>  }
>  
> @@ -693,7 +695,6 @@ static int __init dummy_numa_init(void)
>   printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
>  0LLU, PFN_PHYS(max_pfn) - 1);
>  
> - node_set(0, numa_nodes_parsed);
>   numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
>  
>   return 0;
> diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
> index 917f1cc..f2e33cb 100644
> --- a/drivers/acpi/numa.c
> +++ b/drivers/acpi/numa.c
> @@ -294,7 +294,9 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
> *slit)
>   goto out_err_bad_srat;
>   }
>  
> - node_set(node, numa_nodes_parsed);
> + /* some architecture is likely to ignore a unreasonable node */
> + if (!node_isset(node, numa_nodes_parsed))
> + goto out;
>  
>   pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
>   node, pxm,
> @@ -309,6 +311,7 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
> *slit)
>  
>   max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
>  
> +out:
>   return 0;
>  out_err_bad_srat:
>   bad_srat();




Re: [PATCH] x86/numa: move setting parse numa node to num_add_memblk

2017-11-29 Thread zhong jiang
On 2017/11/29 22:14, Dou Liyang wrote:
> Hi Jiang,
>
> At 11/29/2017 09:44 PM, zhong jiang wrote:
>> On 2017/11/29 21:33, Michal Hocko wrote:
>>> On Wed 29-11-17 21:26:19, zhong jiang wrote:
>>>> On 2017/11/29 21:01, Michal Hocko wrote:
>>>>> On Wed 29-11-17 20:41:25, zhong jiang wrote:
>>>>>> On 2017/11/29 20:03, Michal Hocko wrote:
>>>>>>> On Wed 29-11-17 17:13:27, zhong jiang wrote:
>>>>>>>> Currently, Arm64 and x86 use the common code wehn parsing numa node
>>>>>>>> in a acpi way. The arm64 will set the parsed node in numa_add_memblk,
>>>>>>>> but the x86 is not set in that , then it will result in the repeatly
>>>>>>>> setting. And the parsed node maybe is  unreasonable to the system.
>>>>>>>>
>>>>>>>> we would better not set it although it also still works. because the
>>>>>>>> parsed node is unresonable. so we should skip related operate in this
>>>>>>>> node. This patch just set node in various architecture individually.
>>>>>>>> it is no functional change.
>>>>>>> I really have hard time to understand what you try to say above. Could
>>>>>>> you start by the problem description and then how you are addressing it?
>>>>>>   I am so sorry for that.  I will make the issue clear.
>>>>>>
>>>>>>   Arm64  get numa information through acpi.  The code flow is as follows.
>>>>>>
>>>>>>   arm64_acpi_numa_init
>>>>>>acpi_parse_memory_affinity
>>>>>>   acpi_numa_memory_affinity_init
>>>>>>   numa_add_memblk(nid, start, end);  //it will set node 
>>>>>> to numa_nodes_parsed successfully.
>>>>>>   node_set(node, numa_nodes_parsed); // numa_add_memblk 
>>>>>> had set that.  it will repeat.
>>>>>>
>>>>>>  the root cause is that X86 parse numa also  go through above code.  and 
>>>>>>  arch-related
>>>>>>  numa_add_memblk  is not set the parsed node to numa_nodes_parsed.  it 
>>>>>> need
>>>>>>  additional node_set(node, numa_parsed) to handle.  therefore,  the 
>>>>>> issue will be introduced.
>>>>>>
>>>>> No it is not much more clear. I would have to go and re-study the whole
>>>>> code flow to see what you mean here. So you could simply state what _the
>>>>> issue_ is? How can user observe it and what are the consequences?
>>>>   The patch do not fix a real issue.  it is a cleanup.
>
> > @@ -294,7 +294,9 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
> > *slit)
> >  goto out_err_bad_srat;
> >  }
> >
> > -node_set(node, numa_nodes_parsed);
> > +/* some architecture is likely to ignore a unreasonable node */
> > +if (!node_isset(node, numa_nodes_parsed))
> > +goto out;
> >
>
> It is not just a cleanup patch,Here you change the original logic.
>
  you are right.  cleanup and slightly change.
> With this patch, we just set the *numa_nodes_parsed* after NUMA adds a
> memblk successfully and also add a check here for bypassing the invalid
> memblk node.
>
> I am not sure which arch may meet this situation? did you test this
> patch?
>
  At least x86 may meet this condition, as we can see in the following code.

static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
 struct numa_meminfo *mi)
{
/* ignore zero length blks */
if (start == end)
return 0;

/* whine about and ignore invalid blks */
    if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
pr_warning("NUMA: Warning: invalid memblk node %d [mem 
%#010Lx-%#010Lx]\n",
   nid, start, end - 1);
return 0;
}

if (mi->nr_blks >= NR_NODE_MEMBLKS) {
pr_err("NUMA: too many memblk ranges\n");
return -EINVAL;
}

mi->blk[mi->nr_blks].start = start;
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
mi->nr_blks++;
return 0;
}

It is likely to fail and return 0, e.g. when start == end, etc.
In this case, we expect it to bail out in time.
> Anyway, AFAIK, The ACPI tables are very much like user input in that
> respect and they are unreasonable. So the patch is better.
>
  yes,  Totally agree. 

 Thanks
 zhong jiang
> Thanks,
> dou.
>
>>>>   because the acpi code  is public,  I find they are messy between
>>>>   Arch64 and X86 when parsing numa message .  therefore,  I try to
>>>>   make the code more clear between them.
>>> So make this explicit in the changelog. Your previous wording sounded
>>> like there is a _problem_ in the code.
>>>
>> :-[   please take some time to check.  if it works.  I will resend v2 
>> with detailed changelog.
>>
>> Thanks
>> zhongjiang
>>
>>
>>
>>
>
>
>
> .
>




Re: [PATCH] x86/numa: move setting parse numa node to num_add_memblk

2017-11-29 Thread zhong jiang
On 2017/11/29 21:33, Michal Hocko wrote:
> On Wed 29-11-17 21:26:19, zhong jiang wrote:
>> On 2017/11/29 21:01, Michal Hocko wrote:
>>> On Wed 29-11-17 20:41:25, zhong jiang wrote:
>>>> On 2017/11/29 20:03, Michal Hocko wrote:
>>>>> On Wed 29-11-17 17:13:27, zhong jiang wrote:
>>>>>> Currently, Arm64 and x86 use the common code wehn parsing numa node
>>>>>> in a acpi way. The arm64 will set the parsed node in numa_add_memblk,
>>>>>> but the x86 is not set in that , then it will result in the repeatly
>>>>>> setting. And the parsed node maybe is  unreasonable to the system.
>>>>>>
>>>>>> we would better not set it although it also still works. because the
>>>>>> parsed node is unresonable. so we should skip related operate in this
>>>>>> node. This patch just set node in various architecture individually.
>>>>>> it is no functional change.
>>>>> I really have hard time to understand what you try to say above. Could
>>>>> you start by the problem description and then how you are addressing it?
>>>>   I am so sorry for that.  I will make the issue clear.
>>>>  
>>>>   Arm64  get numa information through acpi.  The code flow is as follows.
>>>>
>>>>   arm64_acpi_numa_init
>>>>acpi_parse_memory_affinity
>>>>   acpi_numa_memory_affinity_init
>>>>   numa_add_memblk(nid, start, end);  //it will set node to 
>>>> numa_nodes_parsed successfully.
>>>>   node_set(node, numa_nodes_parsed); // numa_add_memblk 
>>>> had set that.  it will repeat.
>>>>
>>>>  the root cause is that X86 parse numa also  go through above code.  and  
>>>> arch-related
>>>>  numa_add_memblk  is not set the parsed node to numa_nodes_parsed.  it need
>>>>  additional node_set(node, numa_parsed) to handle.  therefore,  the issue 
>>>> will be introduced.
>>>>
>>> No it is not much more clear. I would have to go and re-study the whole
>>> code flow to see what you mean here. So you could simply state what _the
>>> issue_ is? How can user observe it and what are the consequences?
>>   The patch do not fix a real issue.  it is a cleanup.
>>   because the acpi code  is public,  I find they are messy between
>>   Arch64 and X86 when parsing numa message .  therefore,  I try to
>>   make the code more clear between them.
> So make this explicit in the changelog. Your previous wording sounded
> like there is a _problem_ in the code.
>
:-[   please take some time to check.  if it works.  I will resend v2 with 
detailed changelog.

Thanks
zhongjiang



Re: [PATCH] x86/numa: move setting parse numa node to num_add_memblk

2017-11-29 Thread zhong jiang
On 2017/11/29 21:01, Michal Hocko wrote:
> On Wed 29-11-17 20:41:25, zhong jiang wrote:
>> On 2017/11/29 20:03, Michal Hocko wrote:
>>> On Wed 29-11-17 17:13:27, zhong jiang wrote:
>>>> Currently, Arm64 and x86 use the common code wehn parsing numa node
>>>> in a acpi way. The arm64 will set the parsed node in numa_add_memblk,
>>>> but the x86 is not set in that , then it will result in the repeatly
>>>> setting. And the parsed node maybe is  unreasonable to the system.
>>>>
>>>> we would better not set it although it also still works. because the
>>>> parsed node is unresonable. so we should skip related operate in this
>>>> node. This patch just set node in various architecture individually.
>>>> it is no functional change.
>>> I really have hard time to understand what you try to say above. Could
>>> you start by the problem description and then how you are addressing it?
>>   I am so sorry for that.  I will make the issue clear.
>>  
>>   Arm64  get numa information through acpi.  The code flow is as follows.
>>
>>   arm64_acpi_numa_init
>>acpi_parse_memory_affinity
>>   acpi_numa_memory_affinity_init
>>   numa_add_memblk(nid, start, end);  //it will set node to 
>> numa_nodes_parsed successfully.
>>   node_set(node, numa_nodes_parsed); // numa_add_memblk had 
>> set that.  it will repeat.
>>
>>  the root cause is that X86 parse numa also  go through above code.  and  
>> arch-related
>>  numa_add_memblk  is not set the parsed node to numa_nodes_parsed.  it need
>>  additional node_set(node, numa_parsed) to handle.  therefore,  the issue 
>> will be introduced.
>>
> No it is not much more clear. I would have to go and re-study the whole
> code flow to see what you mean here. So you could simply state what _the
> issue_ is? How can user observe it and what are the consequences?
  The patch do not fix a real issue.  it is a cleanup.
  because the acpi code  is public,  I find they are messy between
  Arch64 and X86 when parsing numa message .  therefore,  I try to
  make the code more clear between them.

  Thanks
  zhongjiang
> Sorry for my laziness, I could go and read the code but the primary
> point of the changelog is to be _clear_ about the problem and the fix.
> Call paths can help reviewers but the scope should be clear even without
> them.




Re: [PATCH] x86/numa: move setting parse numa node to num_add_memblk

2017-11-29 Thread zhong jiang
On 2017/11/29 20:03, Michal Hocko wrote:
> On Wed 29-11-17 17:13:27, zhong jiang wrote:
>> Currently, Arm64 and x86 use the common code wehn parsing numa node
>> in a acpi way. The arm64 will set the parsed node in numa_add_memblk,
>> but the x86 is not set in that , then it will result in the repeatly
>> setting. And the parsed node maybe is  unreasonable to the system.
>>
>> we would better not set it although it also still works. because the
>> parsed node is unresonable. so we should skip related operate in this
>> node. This patch just set node in various architecture individually.
>> it is no functional change.
> I really have hard time to understand what you try to say above. Could
> you start by the problem description and then how you are addressing it?
  I am so sorry for that.  I will make the issue clear.
 
  Arm64  get numa information through acpi.  The code flow is as follows.

  arm64_acpi_numa_init
   acpi_parse_memory_affinity
  acpi_numa_memory_affinity_init
  numa_add_memblk(nid, start, end);  //it will set node to 
numa_nodes_parsed successfully.
  node_set(node, numa_nodes_parsed); // numa_add_memblk had set 
that.  it will repeat.

  the root cause is that X86 parse numa also  go through above code.  and  
arch-related
  numa_add_memblk  is not set the parsed node to numa_nodes_parsed.  it need
  additional node_set(node, numa_parsed) to handle.  therefore,  the issue will 
be introduced.

  meanwhile, the parsed node is meaningless when numa_add_memblk fails and
returns 0,
 so we should bail out in time.
 
is it a little clearer ?

 Thanks
 zhongjiang
>> Signed-off-by: zhong jiang 
>> ---
>>  arch/x86/mm/amdtopology.c | 1 -
>>  arch/x86/mm/numa.c| 3 ++-
>>  drivers/acpi/numa.c   | 5 -
>>  3 files changed, 6 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
>> index 91f501b..7657042 100644
>> --- a/arch/x86/mm/amdtopology.c
>> +++ b/arch/x86/mm/amdtopology.c
>> @@ -151,7 +151,6 @@ int __init amd_numa_init(void)
>>  
>>  prevbase = base;
>>  numa_add_memblk(nodeid, base, limit);
>> -node_set(nodeid, numa_nodes_parsed);
>>  }
>>  
>>  if (!nodes_weight(numa_nodes_parsed))
>> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
>> index 25504d5..8f87f26 100644
>> --- a/arch/x86/mm/numa.c
>> +++ b/arch/x86/mm/numa.c
>> @@ -150,6 +150,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, 
>> u64 end,
>>  mi->blk[mi->nr_blks].end = end;
>>  mi->blk[mi->nr_blks].nid = nid;
>>  mi->nr_blks++;
>> +
>> +node_set(nid, numa_nodes_parsed);
>>  return 0;
>>  }
>>  
>> @@ -693,7 +695,6 @@ static int __init dummy_numa_init(void)
>>  printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
>> 0LLU, PFN_PHYS(max_pfn) - 1);
>>  
>> -node_set(0, numa_nodes_parsed);
>>  numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
>>  
>>  return 0;
>> diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
>> index 917f1cc..f2e33cb 100644
>> --- a/drivers/acpi/numa.c
>> +++ b/drivers/acpi/numa.c
>> @@ -294,7 +294,9 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
>> *slit)
>>  goto out_err_bad_srat;
>>  }
>>  
>> -node_set(node, numa_nodes_parsed);
>> +/* some architecture is likely to ignore a unreasonable node */
>> +if (!node_isset(node, numa_nodes_parsed))
>> +goto out;
>>  
>>  pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
>>  node, pxm,
>> @@ -309,6 +311,7 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
>> *slit)
>>  
>>  max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
>>  
>> +out:
>>  return 0;
>>  out_err_bad_srat:
>>  bad_srat();
>> -- 
>> 1.8.3.1




[PATCH] x86/numa: move setting parse numa node to num_add_memblk

2017-11-29 Thread zhong jiang
Currently, arm64 and x86 use the common code when parsing NUMA nodes
in an ACPI way. arm64 sets the parsed node in numa_add_memblk,
but x86 does not set it there, which results in the node being set
repeatedly. Moreover, the parsed node may be unreasonable to the system.

We had better not set it, even though it still works, because the
parsed node is unreasonable; so we should skip the related operations on
such a node. This patch just sets the node in each architecture individually.
There is no functional change.

Signed-off-by: zhong jiang 
---
 arch/x86/mm/amdtopology.c | 1 -
 arch/x86/mm/numa.c| 3 ++-
 drivers/acpi/numa.c   | 5 -
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c
index 91f501b..7657042 100644
--- a/arch/x86/mm/amdtopology.c
+++ b/arch/x86/mm/amdtopology.c
@@ -151,7 +151,6 @@ int __init amd_numa_init(void)
 
prevbase = base;
numa_add_memblk(nodeid, base, limit);
-   node_set(nodeid, numa_nodes_parsed);
}
 
if (!nodes_weight(numa_nodes_parsed))
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 25504d5..8f87f26 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -150,6 +150,8 @@ static int __init numa_add_memblk_to(int nid, u64 start, 
u64 end,
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
mi->nr_blks++;
+
+   node_set(nid, numa_nodes_parsed);
return 0;
 }
 
@@ -693,7 +695,6 @@ static int __init dummy_numa_init(void)
printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
   0LLU, PFN_PHYS(max_pfn) - 1);
 
-   node_set(0, numa_nodes_parsed);
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
 
return 0;
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 917f1cc..f2e33cb 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -294,7 +294,9 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
*slit)
goto out_err_bad_srat;
}
 
-   node_set(node, numa_nodes_parsed);
+   /* some architecture is likely to ignore a unreasonable node */
+   if (!node_isset(node, numa_nodes_parsed))
+   goto out;
 
pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n",
node, pxm,
@@ -309,6 +311,7 @@ void __init acpi_numa_slit_init(struct acpi_table_slit 
*slit)
 
max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1));
 
+out:
return 0;
 out_err_bad_srat:
bad_srat();
-- 
1.8.3.1



Re: [PATCH v2 4/5] mm: memory_hotplug: Add memory hotremove probe device

2017-11-24 Thread zhong jiang
Hi, Andrea

Most servers will benefit from NUMA, so it is best to solve the issue without
special restrictions.

At least we can obtain the NUMA information from the DTB; therefore, the memory
can be onlined correctly.

Thanks
zhongjiang

On 2017/11/24 18:44, Andrea Reale wrote:
> Hi zhongjiang,
>
> On Fri 24 Nov 2017, 18:35, zhong jiang wrote:
>> HI, Andrea
>>
>> I don't see "memory_add_physaddr_to_nid" in arch/arm64.
>> Am I miss something?
> When !CONFIG_NUMA it is defined in include/linux/memory_hotplug.h as 0.
> In patch 1/5 of this series we require !NUMA to enable
> ARCH_ENABLE_MEMORY_HOTPLUG.
>
> The reason for this simplification is simply that we would not know how
> to decide the correct node to which to add memory when NUMA is on.
> Any suggestion on that matter is welcome. 
>
> Thanks,
> Andrea
>
>> Thnaks
>> zhongjiang
>>
>> On 2017/11/23 19:14, Andrea Reale wrote:
>>> Adding a "remove" sysfs handle that can be used to trigger
>>> memory hotremove manually, exactly simmetrically with
>>> what happens with the "probe" device for hot-add.
>>>
>>> This is usueful for architecture that do not rely on
>>> ACPI for memory hot-remove.
>>>
>>> Signed-off-by: Andrea Reale 
>>> Signed-off-by: Maciej Bielski 
>>> ---
>>>  drivers/base/memory.c | 34 +-
>>>  1 file changed, 33 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>>> index 1d60b58..8ccb67c 100644
>>> --- a/drivers/base/memory.c
>>> +++ b/drivers/base/memory.c
>>> @@ -530,7 +530,36 @@ memory_probe_store(struct device *dev, struct 
>>> device_attribute *attr,
>>>  }
>>>  
>>>  static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
>>> -#endif
>>> +
>>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>>> +static ssize_t
>>> +memory_remove_store(struct device *dev,
>>> +   struct device_attribute *attr, const char *buf, size_t count)
>>> +{
>>> +   u64 phys_addr;
>>> +   int nid, ret;
>>> +   unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
>>> +
>>> +   ret = kstrtoull(buf, 0, &phys_addr);
>>> +   if (ret)
>>> +   return ret;
>>> +
>>> +   if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
>>> +   return -EINVAL;
>>> +
>>> +   nid = memory_add_physaddr_to_nid(phys_addr);
>>> +   ret = lock_device_hotplug_sysfs();
>>> +   if (ret)
>>> +   return ret;
>>> +
>>> +   remove_memory(nid, phys_addr,
>>> +MIN_MEMORY_BLOCK_SIZE * sections_per_block);
>>> +   unlock_device_hotplug();
>>> +   return count;
>>> +}
>>> +static DEVICE_ATTR(remove, S_IWUSR, NULL, memory_remove_store);
>>> +#endif /* CONFIG_MEMORY_HOTREMOVE */
>>> +#endif /* CONFIG_ARCH_MEMORY_PROBE */
>>>  
>>>  #ifdef CONFIG_MEMORY_FAILURE
>>>  /*
>>> @@ -790,6 +819,9 @@ bool is_memblock_offlined(struct memory_block *mem)
>>>  static struct attribute *memory_root_attrs[] = {
>>>  #ifdef CONFIG_ARCH_MEMORY_PROBE
>>> &dev_attr_probe.attr,
>>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>>> +   &dev_attr_remove.attr,
>>> +#endif
>>>  #endif
>>>  
>>>  #ifdef CONFIG_MEMORY_FAILURE
>>
>
> .
>




Re: [PATCH v2 4/5] mm: memory_hotplug: Add memory hotremove probe device

2017-11-24 Thread zhong jiang
HI, Andrea

I don't see "memory_add_physaddr_to_nid" in arch/arm64.
Am I miss something?

Thnaks
zhongjiang

On 2017/11/23 19:14, Andrea Reale wrote:
> Adding a "remove" sysfs handle that can be used to trigger
> memory hotremove manually, exactly simmetrically with
> what happens with the "probe" device for hot-add.
>
> This is usueful for architecture that do not rely on
> ACPI for memory hot-remove.
>
> Signed-off-by: Andrea Reale 
> Signed-off-by: Maciej Bielski 
> ---
>  drivers/base/memory.c | 34 +-
>  1 file changed, 33 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 1d60b58..8ccb67c 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -530,7 +530,36 @@ memory_probe_store(struct device *dev, struct 
> device_attribute *attr,
>  }
>  
>  static DEVICE_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
> -#endif
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +static ssize_t
> +memory_remove_store(struct device *dev,
> + struct device_attribute *attr, const char *buf, size_t count)
> +{
> + u64 phys_addr;
> + int nid, ret;
> + unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
> +
> + ret = kstrtoull(buf, 0, &phys_addr);
> + if (ret)
> + return ret;
> +
> + if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
> + return -EINVAL;
> +
> + nid = memory_add_physaddr_to_nid(phys_addr);
> + ret = lock_device_hotplug_sysfs();
> + if (ret)
> + return ret;
> +
> + remove_memory(nid, phys_addr,
> +  MIN_MEMORY_BLOCK_SIZE * sections_per_block);
> + unlock_device_hotplug();
> + return count;
> +}
> +static DEVICE_ATTR(remove, S_IWUSR, NULL, memory_remove_store);
> +#endif /* CONFIG_MEMORY_HOTREMOVE */
> +#endif /* CONFIG_ARCH_MEMORY_PROBE */
>  
>  #ifdef CONFIG_MEMORY_FAILURE
>  /*
> @@ -790,6 +819,9 @@ bool is_memblock_offlined(struct memory_block *mem)
>  static struct attribute *memory_root_attrs[] = {
>  #ifdef CONFIG_ARCH_MEMORY_PROBE
>   &dev_attr_probe.attr,
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> + &dev_attr_remove.attr,
> +#endif
>  #endif
>  
>  #ifdef CONFIG_MEMORY_FAILURE




Re: [PATCH] futex: avoid undefined behaviour when shift exponent is negative

2017-08-25 Thread zhong jiang
On 2017/8/26 5:13, Thomas Gleixner wrote:
> On Fri, 25 Aug 2017, zhong jiang wrote:
>> From: zhong jiang 
>> Date: Fri, 25 Aug 2017 12:05:56 +0800
>> Subject: [PATCH v2] futex: avoid undefined behaviour when shift exponent is
>>  negative
> Please do not send patches without changing the subject line so it's clear
> that there is a new patch.
  ok
>> using a shift value < 0 or > 31 will get crap as a result. because
>> it's just undefined. The issue still disturb me, so I try to fix
>> it again by excluding the especially condition.
> Which is obsolete now as this code is unified accross all architectures and
> the shift issue is addressed in the generic version of it. So all
> architectures get the same fix. See:
>
>  http://git.kernel.org/tip/30d6e0a4190d37740e9447e4e4815f06992dd8c3
  ok , I  miss the above patch.
> And no, we won't add that x86 fix before that unification hits mainline
> because that undefined behaviour is harmless as it only affects the user
> space value of the futex. IOW, the caller gets what it asked for: crap.
  Thank you for clarification.

  Regards
 zhongjiang
> Thanks,
>
>   tglx
>
>
> .
>
 



Re: [PATCH] futex: avoid undefined behaviour when shift exponent is negative

2017-08-24 Thread zhong jiang
On 2017/6/29 6:13, Thomas Gleixner wrote:
> On Wed, 28 Jun 2017, zhong jiang wrote:
>> On 2017/6/22 0:40, Ingo Molnar wrote:
>>> * zhong jiang  wrote:
>>>
>>>> when shift expoment is negative, left shift alway zero. therefore, we
>>>> modify the logic to avoid the warining.
>>>>
>>>> Signed-off-by: zhong jiang 
>>>> ---
>>>>  arch/x86/include/asm/futex.h | 8 ++--
>>>>  1 file changed, 6 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
>>>> index b4c1f54..2425fca 100644
>>>> --- a/arch/x86/include/asm/futex.h
>>>> +++ b/arch/x86/include/asm/futex.h
>>>> @@ -49,8 +49,12 @@ static inline int futex_atomic_op_inuser(int 
>>>> encoded_op, u32 __user *uaddr)
>>>>int cmparg = (encoded_op << 20) >> 20;
>>>>int oldval = 0, ret, tem;
>>>>  
>>>> -  if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
>>>> -  oparg = 1 << oparg;
>>>> +  if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
>>>> +  if (oparg >= 0)
>>>> +  oparg = 1 << oparg;
>>>> +  else
>>>> +  oparg = 0;
>>>> +  }
>>> Could we avoid all these complications by using an unsigned type?
>>   I think it is not feasible.  a negative shift exponent is likely
>>   existence and reasonable.
> What is reasonable about a negative shift value?
>
>> as the above case, oparg is a negative is common.
> That's simply wrong. If oparg is negative and the SHIFT bit is set then the
> result is undefined today and there is no way that this can be used at
> all.
>
> On x86:
>
>1 << -1= 0x8000
>1 << -2048 = 0x0001
>1 << -2047 = 0x0002
>
> Anything using a shift value < 0 or > 31 will get crap as a
> result. Rightfully so because it's just undefined.
>
> Yes I know that the insanity of user space is unlimited, but anything
> attempting this is so broken that we cannot break it further by making that
> shift arg unsigned and actually limit it to 0-31
>
> Thanks,
>
>   tglx
>
>
>
> .
>
 >From df9e2a5a3f1f401943aeb2718d5876b854dea3a3 Mon Sep 17 00:00:00 2001
From: zhong jiang 
Date: Fri, 25 Aug 2017 12:05:56 +0800
Subject: [PATCH v2] futex: avoid undefined behaviour when shift exponent is
 negative

When the futex syscall is called from userspace, we find the following
warning via UBSAN detection.

[   63.237803] UBSAN: Undefined behaviour in 
/root/rpmbuild/BUILDROOT/kernel-3.10.0-327.49.58.52.x86_64/usr/src/linux-3.10.0-327.49.58.52.x86_64/arch/x86/include/asm/futex.h:53:13
[   63.237803] shift exponent -16 is negative
[   63.237803] CPU: 0 PID: 67 Comm: driver Not tainted 3.10.0 #1
[   63.237803] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
[   63.237803]  fff0 9ad70fde 8802fa08 
81ef0d6f
[   63.237803]  8802fa20 81ef0e2c 828f2540 
8802fb90
[   63.237803]  81ef1ad0 8141cc88 11005f48 
41b58ab3
[   63.237803] Call Trace:
[   63.237803]  [] dump_stack+0x1e/0x20
[   63.237803]  [] ubsan_epilogue+0x12/0x55
[   63.237803]  [] 
__ubsan_handle_shift_out_of_bounds+0x237/0x29c
[   63.237803]  [] ? kasan_alloc_pages+0x38/0x40
[   63.237803]  [] ? 
__ubsan_handle_load_invalid_value+0x162/0x162
[   63.237803]  [] ? get_futex_key+0x361/0x6c0
[   63.237803]  [] ? get_futex_key_refs+0xb0/0xb0
[   63.237803]  [] futex_wake_op+0xb48/0xc70
[   63.237803]  [] ? futex_wake_op+0xb48/0xc70
[   63.237803]  [] ? futex_wake+0x380/0x380
[   63.237803]  [] do_futex+0x2cc/0xb60
[   63.237803]  [] ? exit_robust_list+0x350/0x350
[   63.237803]  [] ? __fsnotify_inode_delete+0x20/0x20
[   63.237803]  [] ? n_tty_flush_buffer+0x80/0x80
[   63.237803]  [] ? __fsnotify_parent+0x53/0x210
[   63.237803]  [] SyS_futex+0x147/0x300
[   63.237803]  [] ? do_futex+0xb60/0xb60
[   63.237803]  [] ? do_page_fault+0x44/0xa0
[   63.237803]  [] system_call_fastpath+0x16/0x1b

Using a shift value < 0 or > 31 will get crap as a result, because
it's just undefined. The issue still disturbs me, so I try to fix
it again by excluding the special condition.

Signed-off-by: zhong jiang 
---
 arch/x86/include/asm/futex.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index b4c1f54..c414d76 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -49,9 +49,11 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 
__user *uaddr)
int cmparg = (encoded_op << 20) >> 20;
int oldval = 0, ret, tem;

-   if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+   if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
+   if (oparg < 0 || oparg > 31)
+   return -EINVAL;
oparg = 1 << oparg;
-
+   }
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
return -EFAULT;




Re: [PATCH] futex: avoid undefined behaviour when shift exponent is negative

2017-06-29 Thread zhong jiang
On 2017/6/29 14:33, Thomas Gleixner wrote:
> On Thu, 29 Jun 2017, zhong jiang wrote:
>> On 2017/6/29 6:13, Thomas Gleixner wrote:
>>> That's simply wrong. If oparg is negative and the SHIFT bit is set then the
>>> result is undefined today and there is no way that this can be used at
>>> all.
>>>
>>> On x86:
>>>
>>>1 << -1  = 0x8000
>>>1 << -2048   = 0x0001
>>>1 << -2047   = 0x0002
>>   but I test the cases in x86_64 all is zero.   I wonder whether it is 
>> related to gcc or not
>>
>>   zj.c:15:8: warning: left shift count is negative [-Wshift-count-negative]
>>   j = 1 << -2048;
>> ^
>> [root@localhost zhongjiang]# ./zj
>> j = 0
> Which is not a surprise because the compiler can detect it as the shift is
> a constant. oparg is not so constant ...
  I get it. Thanks
 
  Thanks
  zhongjiang
> Thanks,
>
>   tglx
>
> .
>




Re: [PATCH] futex: avoid undefined behaviour when shift exponent is negative

2017-06-28 Thread zhong jiang
On 2017/6/29 12:29, h...@zytor.com wrote:
> On June 28, 2017 7:12:04 PM PDT, zhong jiang  wrote:
>> On 2017/6/29 5:43, h...@zytor.com wrote:
>>> On June 27, 2017 9:35:10 PM PDT, zhong jiang 
>> wrote:
>>>> Hi,  Ingo
>>>>
>>>> Thank you for the comment.
>>>> On 2017/6/22 0:40, Ingo Molnar wrote:
>>>>> * zhong jiang  wrote:
>>>>>
>>>>>> when shift expoment is negative, left shift alway zero. therefore,
>>>> we
>>>>>> modify the logic to avoid the warining.
>>>>>>
>>>>>> Signed-off-by: zhong jiang 
>>>>>> ---
>>>>>>  arch/x86/include/asm/futex.h | 8 ++--
>>>>>>  1 file changed, 6 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/x86/include/asm/futex.h
>>>> b/arch/x86/include/asm/futex.h
>>>>>> index b4c1f54..2425fca 100644
>>>>>> --- a/arch/x86/include/asm/futex.h
>>>>>> +++ b/arch/x86/include/asm/futex.h
>>>>>> @@ -49,8 +49,12 @@ static inline int futex_atomic_op_inuser(int
>>>> encoded_op, u32 __user *uaddr)
>>>>>>  int cmparg = (encoded_op << 20) >> 20;
>>>>>>  int oldval = 0, ret, tem;
>>>>>>  
>>>>>> -if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
>>>>>> -oparg = 1 << oparg;
>>>>>> +if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
>>>>>> +if (oparg >= 0)
>>>>>> +oparg = 1 << oparg;
>>>>>> +else
>>>>>> +oparg = 0;
>>>>>> +}
>>>>> Could we avoid all these complications by using an unsigned type?
>>>> I think it is not feasible.  a negative shift exponent is likely
>>>> existence and reasonable.
>>>>  as the above case,  oparg is a negative is common. 
>>>>
>>>> I think it can be avoided by following change. 
>>>>
>>>> diff --git a/arch/x86/include/asm/futex.h
>>>> b/arch/x86/include/asm/futex.h
>>>> index b4c1f54..3205e86 100644
>>>> --- a/arch/x86/include/asm/futex.h
>>>> +++ b/arch/x86/include/asm/futex.h
>>>> @@ -50,7 +50,7 @@ static inline int futex_atomic_op_inuser(int
>>>> encoded_op, u32 __user *uaddr)
>>>>int oldval = 0, ret, tem;
>>>>
>>>>if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
>>>> -   oparg = 1 << oparg;
>>>> +   oparg = safe_shift(1, oparg);
>>>>
>>>>if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
>>>>return -EFAULT;
>>>> diff --git a/drivers/video/fbdev/core/fbmem.c
>>>> b/drivers/video/fbdev/core/fbmem.c
>>>> index 069fe79..b4edda3 100644
>>>> --- a/drivers/video/fbdev/core/fbmem.c
>>>> +++ b/drivers/video/fbdev/core/fbmem.c
>>>> @@ -190,11 +190,6 @@ char* fb_get_buffer_offset(struct fb_info
>> *info,
>>>> struct fb_pixmap *buf, u32 size
>>>>
>>>> #ifdef CONFIG_LOGO
>>>>
>>>> -static inline unsigned safe_shift(unsigned d, int n)
>>>> -{
>>>> -   return n < 0 ? d >> -n : d << n;
>>>> -}
>>>> -
>>>> static void fb_set_logocmap(struct fb_info *info,
>>>>   const struct linux_logo *logo)
>>>> {
>>>> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
>>>> index d043ada..f3b8856 100644
>>>> --- a/include/linux/kernel.h
>>>> +++ b/include/linux/kernel.h
>>>> @@ -841,6 +841,10 @@ static inline void ftrace_dump(enum
>>>> ftrace_dump_mode oops_dump_mode) { }
>>>>  */
>>>> #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
>>>>
>>>> +static inline unsigned safe_shift(unsigned d, int n)
>>>> +{
>>>> +   return n < 0 ? d >> -n : d << n;
>>>> +}
>>>>
>>>> Thansk
>>>> zhongjiang
>>>>
>>>>> Thanks,
>>>>>
>>>>>   Ingo
>>>>>
>>>>> .
>>>>>
>>> What makes it reasonable?  It is totally ill-defined and doesn't do
>> anything useful now?
>> Thanks you for comments.
>>
>> Maybe I mismake the meaning. I test the negative cases in x86 , all
>> case is zero. so I come to a conclusion.
>>
>> zj.c:15:8: warning: left shift count is negative
>> [-Wshift-count-negative]
>>  j = 1 << -2048;
>>^
>> [root@localhost zhongjiang]# ./zj
>> j = 0
>> j.c:15:8: warning: left shift count is negative
>> [-Wshift-count-negative]
>>  j = 1 << -2047;
>>^
>> [root@localhost zhongjiang]# ./zj
>> j = 0
>>
>> I insmod a module into kernel to test the testcasts, all of the result
>> is zero.
>>
>> I wonder whether I miss some point or not. Do you point out to me?
>> please
>>
>> Thanks
>> zhongjiang
>>
>>
> When you use compile-time constants, the compiler generates the value at 
> compile time, which can be totally different.
 yes, I test that. Thanks.

 Thanks
 zhongjiang



Re: [PATCH] futex: avoid undefined behaviour when shift exponent is negative

2017-06-28 Thread zhong jiang
On 2017/6/29 5:43, h...@zytor.com wrote:
> On June 27, 2017 9:35:10 PM PDT, zhong jiang  wrote:
>> Hi,  Ingo
>>
>> Thank you for the comment.
>> On 2017/6/22 0:40, Ingo Molnar wrote:
>>> * zhong jiang  wrote:
>>>
>>>> when shift expoment is negative, left shift alway zero. therefore,
>> we
>>>> modify the logic to avoid the warining.
>>>>
>>>> Signed-off-by: zhong jiang 
>>>> ---
>>>>  arch/x86/include/asm/futex.h | 8 ++--
>>>>  1 file changed, 6 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/arch/x86/include/asm/futex.h
>> b/arch/x86/include/asm/futex.h
>>>> index b4c1f54..2425fca 100644
>>>> --- a/arch/x86/include/asm/futex.h
>>>> +++ b/arch/x86/include/asm/futex.h
>>>> @@ -49,8 +49,12 @@ static inline int futex_atomic_op_inuser(int
>> encoded_op, u32 __user *uaddr)
>>>>int cmparg = (encoded_op << 20) >> 20;
>>>>int oldval = 0, ret, tem;
>>>>  
>>>> -  if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
>>>> -  oparg = 1 << oparg;
>>>> +  if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
>>>> +  if (oparg >= 0)
>>>> +  oparg = 1 << oparg;
>>>> +  else
>>>> +  oparg = 0;
>>>> +  }
>>> Could we avoid all these complications by using an unsigned type?
>> I think it is not feasible.  a negative shift exponent is likely
>> existence and reasonable.
>>  as the above case,  oparg is a negative is common. 
>>
>> I think it can be avoided by following change. 
>>
>> diff --git a/arch/x86/include/asm/futex.h
>> b/arch/x86/include/asm/futex.h
>> index b4c1f54..3205e86 100644
>> --- a/arch/x86/include/asm/futex.h
>> +++ b/arch/x86/include/asm/futex.h
>> @@ -50,7 +50,7 @@ static inline int futex_atomic_op_inuser(int
>> encoded_op, u32 __user *uaddr)
>>int oldval = 0, ret, tem;
>>
>>if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
>> -   oparg = 1 << oparg;
>> +   oparg = safe_shift(1, oparg);
>>
>>if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
>>return -EFAULT;
>> diff --git a/drivers/video/fbdev/core/fbmem.c
>> b/drivers/video/fbdev/core/fbmem.c
>> index 069fe79..b4edda3 100644
>> --- a/drivers/video/fbdev/core/fbmem.c
>> +++ b/drivers/video/fbdev/core/fbmem.c
>> @@ -190,11 +190,6 @@ char* fb_get_buffer_offset(struct fb_info *info,
>> struct fb_pixmap *buf, u32 size
>>
>> #ifdef CONFIG_LOGO
>>
>> -static inline unsigned safe_shift(unsigned d, int n)
>> -{
>> -   return n < 0 ? d >> -n : d << n;
>> -}
>> -
>> static void fb_set_logocmap(struct fb_info *info,
>>   const struct linux_logo *logo)
>> {
>> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
>> index d043ada..f3b8856 100644
>> --- a/include/linux/kernel.h
>> +++ b/include/linux/kernel.h
>> @@ -841,6 +841,10 @@ static inline void ftrace_dump(enum
>> ftrace_dump_mode oops_dump_mode) { }
>>  */
>> #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)
>>
>> +static inline unsigned safe_shift(unsigned d, int n)
>> +{
>> +   return n < 0 ? d >> -n : d << n;
>> +}
>>
>> Thansk
>> zhongjiang
>>
>>> Thanks,
>>>
>>> Ingo
>>>
>>> .
>>>
> What makes it reasonable?  It is totally ill-defined and doesn't do anything 
> useful now?
 Thank you for your comments.
 
 Maybe I mistook the meaning. I tested the negative cases on x86; all cases are 
zero, so I came to a conclusion.
 
zj.c:15:8: warning: left shift count is negative [-Wshift-count-negative]
  j = 1 << -2048;
^
[root@localhost zhongjiang]# ./zj
j = 0
j.c:15:8: warning: left shift count is negative [-Wshift-count-negative]
  j = 1 << -2047;
^
[root@localhost zhongjiang]# ./zj
j = 0

I insmod a module into the kernel to test the test cases; all of the results are zero.

I wonder whether I am missing some point. Could you point it out to me, please?

Thanks
zhongjiang
 
 



Re: [PATCH] futex: avoid undefined behaviour when shift exponent is negative

2017-06-28 Thread zhong jiang
Hi, Thomas

Thank you for clarification.
On 2017/6/29 6:13, Thomas Gleixner wrote:
> On Wed, 28 Jun 2017, zhong jiang wrote:
>> On 2017/6/22 0:40, Ingo Molnar wrote:
>>> * zhong jiang  wrote:
>>>
>>>> when shift expoment is negative, left shift alway zero. therefore, we
>>>> modify the logic to avoid the warining.
>>>>
>>>> Signed-off-by: zhong jiang 
>>>> ---
>>>>  arch/x86/include/asm/futex.h | 8 ++--
>>>>  1 file changed, 6 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
>>>> index b4c1f54..2425fca 100644
>>>> --- a/arch/x86/include/asm/futex.h
>>>> +++ b/arch/x86/include/asm/futex.h
>>>> @@ -49,8 +49,12 @@ static inline int futex_atomic_op_inuser(int 
>>>> encoded_op, u32 __user *uaddr)
>>>>int cmparg = (encoded_op << 20) >> 20;
>>>>int oldval = 0, ret, tem;
>>>>  
>>>> -  if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
>>>> -  oparg = 1 << oparg;
>>>> +  if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
>>>> +  if (oparg >= 0)
>>>> +  oparg = 1 << oparg;
>>>> +  else
>>>> +  oparg = 0;
>>>> +  }
>>> Could we avoid all these complications by using an unsigned type?
>>   I think it is not feasible.  a negative shift exponent is likely
>>   existence and reasonable.
> What is reasonable about a negative shift value?
>
>> as the above case, oparg is a negative is common.
> That's simply wrong. If oparg is negative and the SHIFT bit is set then the
> result is undefined today and there is no way that this can be used at
> all.
>
> On x86:
>
>1 << -1= 0x8000
>1 << -2048 = 0x0001
>1 << -2047 = 0x0002
  but I tested the cases on x86_64 and all are zero.   I wonder whether it is related 
to gcc or not.

  zj.c:15:8: warning: left shift count is negative [-Wshift-count-negative]
  j = 1 << -2048;
^
[root@localhost zhongjiang]# ./zj
j = 0

 Thanks
 zhongjiang
> Anything using a shift value < 0 or > 31 will get crap as a
> result. Rightfully so because it's just undefined.
>
> Yes I know that the insanity of user space is unlimited, but anything
> attempting this is so broken that we cannot break it further by making that
> shift arg unsigned and actually limit it to 0-31
> Thanks,
>
>   tglx
>
>
>
> .
>




Re: [PATCH] futex: avoid undefined behaviour when shift exponent is negative

2017-06-27 Thread zhong jiang
Hi,  Ingo

Thank you for the comment.
On 2017/6/22 0:40, Ingo Molnar wrote:
> * zhong jiang  wrote:
>
>> when shift expoment is negative, left shift alway zero. therefore, we
>> modify the logic to avoid the warining.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  arch/x86/include/asm/futex.h | 8 ++--
>>  1 file changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
>> index b4c1f54..2425fca 100644
>> --- a/arch/x86/include/asm/futex.h
>> +++ b/arch/x86/include/asm/futex.h
>> @@ -49,8 +49,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, 
>> u32 __user *uaddr)
>>  int cmparg = (encoded_op << 20) >> 20;
>>  int oldval = 0, ret, tem;
>>  
>> -if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
>> -oparg = 1 << oparg;
>> +if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
>> +if (oparg >= 0)
>> +oparg = 1 << oparg;
>> +else
>> +oparg = 0;
>> +}
> Could we avoid all these complications by using an unsigned type?
  I think it is not feasible.  A negative shift exponent likely exists 
and is considered reasonable.
  As in the above case, a negative oparg is common. 

 I think it can be avoided by the following change. 

  diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index b4c1f54..3205e86 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -50,7 +50,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 
__user *uaddr)
int oldval = 0, ret, tem;

if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
-   oparg = 1 << oparg;
+   oparg = safe_shift(1, oparg);

if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
return -EFAULT;
diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c
index 069fe79..b4edda3 100644
--- a/drivers/video/fbdev/core/fbmem.c
+++ b/drivers/video/fbdev/core/fbmem.c
@@ -190,11 +190,6 @@ char* fb_get_buffer_offset(struct fb_info *info, struct 
fb_pixmap *buf, u32 size

 #ifdef CONFIG_LOGO

-static inline unsigned safe_shift(unsigned d, int n)
-{
-   return n < 0 ? d >> -n : d << n;
-}
-
 static void fb_set_logocmap(struct fb_info *info,
   const struct linux_logo *logo)
 {
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index d043ada..f3b8856 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -841,6 +841,10 @@ static inline void ftrace_dump(enum ftrace_dump_mode 
oops_dump_mode) { }
  */
 #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi)

+static inline unsigned safe_shift(unsigned d, int n)
+{
+   return n < 0 ? d >> -n : d << n;
+}

Thanks
zhongjiang

> Thanks,
>
>   Ingo
>
> .
>




Re: [PATCH] x86/mm: Don't reenter flush_tlb_func_common()

2017-06-19 Thread zhong jiang
On 2017/6/19 23:05, Andy Lutomirski wrote:
> On Mon, Jun 19, 2017 at 6:33 AM, zhong jiang  wrote:
>> On 2017/6/19 12:48, Andy Lutomirski wrote:
>>> It was historically possible to have two concurrent TLB flushes
>>> targeting the same CPU: one initiated locally and one initiated
>>> remotely.  This can now cause an OOPS in leave_mm() at
>>> arch/x86/mm/tlb.c:47:
>>>
>>> if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
>>> BUG();
>>>
>>> with this call trace:
>>>  flush_tlb_func_local arch/x86/mm/tlb.c:239 [inline]
>>>  flush_tlb_mm_range+0x26d/0x370 arch/x86/mm/tlb.c:317
>>>
>>> Without reentrancy, this OOPS is impossible: leave_mm() is only
>>> called if we're not in TLBSTATE_OK, but then we're unexpectedly
>>> in TLBSTATE_OK in leave_mm().
>>>
>>> This can be caused by flush_tlb_func_remote() happening between
>>> the two checks and calling leave_mm(), resulting in two consecutive
>>> leave_mm() calls on the same CPU with no intervening switch_mm()
>>> calls.
>>>
>>> We never saw this OOPS before because the old leave_mm()
>>> implementation didn't put us back in TLBSTATE_OK, so the assertion
>>> didn't fire.
>>   HI, Andy
>>
>>   Today, I see same OOPS in linux 3.4 stable. It prove that it indeed has 
>> fired.
>>but It is rarely to appear.  I review the code. I found the a  issue.
>>   when current->mm is NULL,  leave_mm will be called. but  it maybe in
>>   TLBSTATE_OK,  eg: unuse_mm call after task->mm = NULL , but before 
>> enter_lazy_tlb.
>>
>>therefore,  it will fire. is it right?
> Is there a code path that does this?
 eg:
 
 cpu1  cpu2 
 

flush_tlb_page  unuse_mm
current->mm 
= NULL
   
 current->mm == NULL
   
leave_mm (cpu_tlbstate.state is TLBSATATE_OK)

enter_lazy_tlb
 I am not sure whether the above race exists or not. Could you point out the 
problem if it does not exist, please?

  Thanks
  zhongjiang
>   
> Also, the IPI handler on 3.4 looks like this:
>
> if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
> if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
> if (f->flush_va == TLB_FLUSH_ALL)
> local_flush_tlb();
> else
> __flush_tlb_one(f->flush_va);
> } else
> leave_mm(cpu);
> }
>
> but leave_mm() checks the same condition (cpu_tlbstate.state, not
> current->mm).  How is the BUG triggering?
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>
>




Re: [PATCH] x86/mm: Don't reenter flush_tlb_func_common()

2017-06-19 Thread zhong jiang
On 2017/6/19 12:48, Andy Lutomirski wrote:
> It was historically possible to have two concurrent TLB flushes
> targeting the same CPU: one initiated locally and one initiated
> remotely.  This can now cause an OOPS in leave_mm() at
> arch/x86/mm/tlb.c:47:
>
> if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
> BUG();
>
> with this call trace:
>  flush_tlb_func_local arch/x86/mm/tlb.c:239 [inline]
>  flush_tlb_mm_range+0x26d/0x370 arch/x86/mm/tlb.c:317
>
> Without reentrancy, this OOPS is impossible: leave_mm() is only
> called if we're not in TLBSTATE_OK, but then we're unexpectedly
> in TLBSTATE_OK in leave_mm().
>
> This can be caused by flush_tlb_func_remote() happening between
> the two checks and calling leave_mm(), resulting in two consecutive
> leave_mm() calls on the same CPU with no intervening switch_mm()
> calls.
>
> We never saw this OOPS before because the old leave_mm()
> implementation didn't put us back in TLBSTATE_OK, so the assertion
> didn't fire.
  HI, Andy

  Today, I see the same OOPS in linux 3.4 stable. It proves that it has indeed fired,
   but it rarely appears.  I reviewed the code and found an issue:
  when current->mm is NULL,  leave_mm will be called, but it may be in
  TLBSTATE_OK,  e.g. unuse_mm is called after task->mm = NULL, but before 
enter_lazy_tlb.

   therefore, it will fire. Is that right?

  Thanks
  zhongjiang
> Nadav noticed the reentrancy issue in a different context, but
> neither of us realized that it caused a problem yet.
>
> Cc: Nadav Amit 
> Cc: Dave Hansen 
> Reported-by: "Levin, Alexander (Sasha Levin)" 
> Fixes: 3d28ebceaffa ("x86/mm: Rework lazy TLB to track the actual loaded mm")
> Signed-off-by: Andy Lutomirski 
> ---
>  arch/x86/mm/tlb.c | 15 +--
>  1 file changed, 13 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 2a5e851f2035..f06239c6919f 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -208,6 +208,9 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
> mm_struct *next,
>  static void flush_tlb_func_common(const struct flush_tlb_info *f,
> bool local, enum tlb_flush_reason reason)
>  {
> + /* This code cannot presently handle being reentered. */
> + VM_WARN_ON(!irqs_disabled());
> +
>   if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
>   leave_mm(smp_processor_id());
>   return;
> @@ -313,8 +316,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned 
> long start,
>   info.end = TLB_FLUSH_ALL;
>   }
>  
> - if (mm == this_cpu_read(cpu_tlbstate.loaded_mm))
> + if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
> + local_irq_disable();
>   flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
> + local_irq_enable();
> + }
> +
>   if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
>   flush_tlb_others(mm_cpumask(mm), &info);
>   put_cpu();
> @@ -370,8 +377,12 @@ void arch_tlbbatch_flush(struct 
> arch_tlbflush_unmap_batch *batch)
>  
>   int cpu = get_cpu();
>  
> - if (cpumask_test_cpu(cpu, &batch->cpumask))
> + if (cpumask_test_cpu(cpu, &batch->cpumask)) {
> + local_irq_disable();
>   flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
> + local_irq_enable();
> + }
> +
>   if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
>   flush_tlb_others(&batch->cpumask, &info);
>   cpumask_clear(&batch->cpumask);




Re: [HMM 07/15] mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory v3

2017-06-14 Thread zhong jiang
On 2017/5/25 1:20, Jérôme Glisse wrote:
> HMM (heterogeneous memory management) need struct page to support migration
> from system main memory to device memory.  Reasons for HMM and migration to
> device memory is explained with HMM core patch.
>
> This patch deals with device memory that is un-addressable memory (ie CPU
> can not access it). Hence we do not want those struct page to be manage
> like regular memory. That is why we extend ZONE_DEVICE to support different
> types of memory.
>
> A persistent memory type is define for existing user of ZONE_DEVICE and a
> new device un-addressable type is added for the un-addressable memory type.
> There is a clear separation between what is expected from each memory type
> and existing user of ZONE_DEVICE are un-affected by new requirement and new
> use of the un-addressable type. All specific code path are protect with
> test against the memory type.
>
> Because memory is un-addressable we use a new special swap type for when
> a page is migrated to device memory (this reduces the number of maximum
> swap file).
>
> The main two additions beside memory type to ZONE_DEVICE is two callbacks.
> First one, page_free() is call whenever page refcount reach 1 (which means
> the page is free as ZONE_DEVICE page never reach a refcount of 0). This
> allow device driver to manage its memory and associated struct page.
>
> The second callback page_fault() happens when there is a CPU access to
> an address that is back by a device page (which are un-addressable by the
> CPU). This callback is responsible to migrate the page back to system
> main memory. Device driver can not block migration back to system memory,
> HMM make sure that such page can not be pin into device memory.
>
> If device is in some error condition and can not migrate memory back then
> a CPU page fault to device memory should end with SIGBUS.
>
> Changed since v2:
>   - s/DEVICE_UNADDRESSABLE/DEVICE_PRIVATE
> Changed since v1:
>   - rename to device private memory (from device unaddressable)
>
> Signed-off-by: Jérôme Glisse 
> Acked-by: Dan Williams 
> Cc: Ross Zwisler 
> ---
>  fs/proc/task_mmu.c   |  7 +
>  include/linux/ioport.h   |  1 +
>  include/linux/memremap.h | 72 
> 
>  include/linux/mm.h   | 12 
>  include/linux/swap.h | 24 ++--
>  include/linux/swapops.h  | 68 +
>  kernel/memremap.c| 34 +++
>  mm/Kconfig   | 13 +
>  mm/memory.c  | 61 
>  mm/memory_hotplug.c  | 10 +--
>  mm/mprotect.c| 14 ++
>  11 files changed, 311 insertions(+), 5 deletions(-)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index f0c8b33..90b2fa4 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -542,6 +542,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long 
> addr,
>   }
>   } else if (is_migration_entry(swpent))
>   page = migration_entry_to_page(swpent);
> + else if (is_device_private_entry(swpent))
> + page = device_private_entry_to_page(swpent);
>   } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
>   && pte_none(*pte))) {
>   page = find_get_entry(vma->vm_file->f_mapping,
> @@ -704,6 +706,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long 
> hmask,
>  
>   if (is_migration_entry(swpent))
>   page = migration_entry_to_page(swpent);
> + else if (is_device_private_entry(swpent))
> + page = device_private_entry_to_page(swpent);
>   }
>   if (page) {
>   int mapcount = page_mapcount(page);
> @@ -1196,6 +1200,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct 
> pagemapread *pm,
>   flags |= PM_SWAP;
>   if (is_migration_entry(entry))
>   page = migration_entry_to_page(entry);
> +
> + if (is_device_private_entry(entry))
> + page = device_private_entry_to_page(entry);
>   }
>  
>   if (page && !PageAnon(page))
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index 6230064..3a4f691 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -130,6 +130,7 @@ enum {
>   IORES_DESC_ACPI_NV_STORAGE  = 3,
>   IORES_DESC_PERSISTENT_MEMORY= 4,
>   IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
> + IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
>  };
>  
>  /* helpers to define resources */
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 9341619..0fcf840 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -4,6 +4,8 @@
>  #include 
>  #include 
>  
> +#include 
> +
>  struct reso

Re: [PATCH] fs/fcntl: return -ESRCH in f_setown when pid/pgid can't be found

2017-06-14 Thread zhong jiang
yes,  looks good to me.

but I found another issue: if the passed argument is -1, then by the spec's 
description,
type should be assigned to PIDTYPE_MAX. Do you think that it deserves another 
patch?

Thanks
zhongjiang

On 2017/6/14 22:52, Jeff Layton wrote:
> The current implementation of F_SETOWN doesn't properly vet the argument
> passed in. It never returns an error. If the argument doesn't specify a
> valid pid/pgid, then we just end up cleaning out the file->f_owner
> structure.
>
> What we really want is to only clean that out only in the case where
> userland passed in an argument of 0. For anything else, we want to
> return ESRCH if it doesn't refer to a valid pid.
>
> The relevant POSIX spec page is here:
>
> http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html
>
> Cc: Jiri Slaby 
> Cc: zhong jiang 
> Signed-off-by: Jeff Layton 
> ---
>  fs/fcntl.c | 18 +-
>  1 file changed, 13 insertions(+), 5 deletions(-)
>
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index 693322e28751..afed3b364979 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -112,8 +112,9 @@ EXPORT_SYMBOL(__f_setown);
>  int f_setown(struct file *filp, unsigned long arg, int force)
>  {
>   enum pid_type type;
> - struct pid *pid;
> - int who = arg;
> + struct pid *pid = NULL;
> + int who = arg, ret = 0;
> +
>   type = PIDTYPE_PID;
>   if (who < 0) {
>   /* avoid overflow below */
> @@ -123,12 +124,19 @@ int f_setown(struct file *filp, unsigned long arg, int 
> force)
>   type = PIDTYPE_PGID;
>   who = -who;
>   }
> +
>   rcu_read_lock();
> - pid = find_vpid(who);
> - __f_setown(filp, pid, type, force);
> + if (who) {
> + pid = find_vpid(who);
> + if (!pid)
> + ret = -ESRCH;
> + }
> +
> + if (!ret)
> + __f_setown(filp, pid, type, force);
>   rcu_read_unlock();
>  
> - return 0;
> + return ret;
>  }
>  EXPORT_SYMBOL(f_setown);
>  




Re: [PATCH v3 2/2] fs/fcntl: f_setown, avoid undefined behaviour

2017-06-13 Thread zhong jiang
On 2017/6/13 20:13, Jeff Layton wrote:
> On Tue, 2017-06-13 at 13:35 +0200, Jiri Slaby wrote:
>> fcntl(0, F_SETOWN, 0x8000) triggers:
>> UBSAN: Undefined behaviour in fs/fcntl.c:118:7
>> negation of -2147483648 cannot be represented in type 'int':
>> CPU: 1 PID: 18261 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1
>> ...
>> Call Trace:
>> ...
>>  [] ? f_setown+0x1d8/0x200
>>  [] ? SyS_fcntl+0x999/0xf30
>>  [] ? entry_SYSCALL_64_fastpath+0x23/0xc1
>>
>> Fix that by checking the arg parameter properly (against INT_MAX) before
>> "who = -who". And return immediatelly with -EINVAL in case it is wrong.
>> Note that according to POSIX we can return EINVAL:
>> http://pubs.opengroup.org/onlinepubs/9699919799/functions/fcntl.html
>>
>> [EINVAL]
>> The cmd argument is F_SETOWN and the value of the argument
>> is not valid as a process or process group identifier.
>>
>> [v2] returns an error, v1 used to fail silently
>> [v3] implement proper check for the bad value INT_MIN
>>
>> Signed-off-by: Jiri Slaby 
>> Cc: Jeff Layton 
>> Cc: "J. Bruce Fields" 
>> Cc: Alexander Viro 
>> Cc: linux-fsde...@vger.kernel.org
>> ---
>>  fs/fcntl.c | 4 
>>  1 file changed, 4 insertions(+)
>>
>> diff --git a/fs/fcntl.c b/fs/fcntl.c
>> index 313eba860346..693322e28751 100644
>> --- a/fs/fcntl.c
>> +++ b/fs/fcntl.c
>> @@ -116,6 +116,10 @@ int f_setown(struct file *filp, unsigned long arg, int 
>> force)
>>  int who = arg;
>>  type = PIDTYPE_PID;
>>  if (who < 0) {
>> +/* avoid overflow below */
>> +if (who == INT_MIN)
>> +return -EINVAL;
>> +
>>  type = PIDTYPE_PGID;
>>  who = -who;
>>  }
> Seems reasonable.
>
> I do somewhat lean toward checking for all larger values, but there
> could be userland programs that leave the upper bits set when they cast
> this to unsigned long. This is probably the safer thing to do.
>
> I'll plan to pick these up for v4.12.
>
> On the other related note...I think we ought to return ESRCH when
> find_vpid returns NULL. I'll take a look at that sometime soon too.
>
> Thanks!
 hi, jeff

 I have sent the patch about find_vpid, and it exists in the linux-next branch:

 https://patchwork.kernel.org/patch/9766259/

 Thanks
zhongjiang




[PATCH v2] exit: avoid undefined behaviour when call wait4

2017-06-13 Thread zhong jiang
From: zhongjiang 

wait4(-2147483648, 0x20, 0, 0xdd) triggers:
UBSAN: Undefined behaviour in kernel/exit.c:1651:9

The related calltrace is as follows:

[518871.435738] negation of -2147483648 cannot be represented in type 'int':
[518871.442618] CPU: 9 PID: 16482 Comm: zj Tainted: GB   
---   3.10.0-327.53.58.71.x86_64+ #66
[518871.452874] Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285   
   /BC11BTSA  , BIOS CTSAV036 04/27/2011
[518871.464690]  82599190 8b740a25 880112447d90 
81d6eb16
[518871.472395]  880112447da8 81d6ebc9 82599180 
880112447e98
[518871.480101]  81d6fc99 41b58ab3 8228d698 
81d6fb90
[518871.487801] Call Trace:
[518871.490435]  [] dump_stack+0x19/0x1b
[518871.495751]  [] ubsan_epilogue+0xd/0x50
[518871.501328]  [] __ubsan_handle_negate_overflow+0x109/0x14e
[518871.508548]  [] ? 
__ubsan_handle_divrem_overflow+0x1df/0x1df
[518871.516041]  [] ? lg_local_lock+0x24/0xb0
[518871.521785]  [] ? lg_local_unlock+0x20/0xd0
[518871.527708]  [] ? __pmd_alloc+0x180/0x180
[518871.533458]  [] ? mntput+0x3b/0x70
[518871.538598]  [] SyS_wait4+0x1cb/0x1e0
[518871.543999]  [] ? SyS_waitid+0x220/0x220
[518871.549661]  [] ? __audit_syscall_entry+0x1f7/0x2a0
[518871.556278]  [] system_call_fastpath+0x16/0x1b

The patch by excluding the overflow to avoid the UBSAN warning.

Signed-off-by: zhong jiang 
---
 kernel/exit.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/exit.c b/kernel/exit.c
index 516acdb..cfe70cf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1701,6 +1701,10 @@ static long do_wait(struct wait_opts *wo)
if (upid == -1)
type = PIDTYPE_MAX;
else if (upid < 0) {
+   /* -INT_MIN is not defined */
+   if (upid == INT_MIN)
+   return -ESRCH;
+
type = PIDTYPE_PGID;
pid = find_get_pid(-upid);
} else if (upid == 0) {
-- 
1.7.12.4



Re: [PATCH v3 1/2] fs/fcntl: f_setown, allow returning error

2017-06-13 Thread zhong jiang
On 2017/6/13 19:35, Jiri Slaby wrote:
> Allow f_setown to return an error value. We will fail in the next patch
> with EINVAL for bad input to f_setown, so tile the path for the later
> patch.
>
> Signed-off-by: Jiri Slaby 
> Reviewed-by: Jeff Layton 
> Cc: Jeff Layton 
> Cc: "J. Bruce Fields" 
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> ---
>  fs/fcntl.c | 7 ---
>  include/linux/fs.h | 2 +-
>  net/socket.c   | 3 +--
>  3 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index bbf80344c125..313eba860346 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -109,7 +109,7 @@ void __f_setown(struct file *filp, struct pid *pid, enum 
> pid_type type,
>  }
>  EXPORT_SYMBOL(__f_setown);
>  
> -void f_setown(struct file *filp, unsigned long arg, int force)
> +int f_setown(struct file *filp, unsigned long arg, int force)
>  {
>   enum pid_type type;
>   struct pid *pid;
> @@ -123,6 +123,8 @@ void f_setown(struct file *filp, unsigned long arg, int 
> force)
>   pid = find_vpid(who);
>   __f_setown(filp, pid, type, force);
>   rcu_read_unlock();
> +
> + return 0;
>  }
>  EXPORT_SYMBOL(f_setown);
>  
> @@ -305,8 +307,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned 
> long arg,
>   force_successful_syscall_return();
>   break;
>   case F_SETOWN:
> - f_setown(filp, arg, 1);
> - err = 0;
> + err = f_setown(filp, arg, 1);
>   break;
>   case F_GETOWN_EX:
>   err = f_getown_ex(filp, arg);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index ecc301043abf..6dd215a339d4 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1250,7 +1250,7 @@ extern void fasync_free(struct fasync_struct *);
>  extern void kill_fasync(struct fasync_struct **, int, int);
>  
>  extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int 
> force);
> -extern void f_setown(struct file *filp, unsigned long arg, int force);
> +extern int f_setown(struct file *filp, unsigned long arg, int force);
>  extern void f_delown(struct file *filp);
>  extern pid_t f_getown(struct file *filp);
>  extern int send_sigurg(struct fown_struct *fown);
> diff --git a/net/socket.c b/net/socket.c
> index 8f9dab330d57..59e902b9df09 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -991,8 +991,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, 
> unsigned long arg)
>   err = -EFAULT;
>   if (get_user(pid, (int __user *)argp))
>   break;
> - f_setown(sock->file, pid, 1);
> - err = 0;
> + err = f_setown(sock->file, pid, 1);
>   break;
>   case FIOGETOWN:
>   case SIOCGPGRP:
  yes, looks good to me.

 Thanks
 zhongjiang



Re: [PATCH] fs: fcntl, avoid undefined behaviour

2017-06-13 Thread zhong jiang
On 2017/6/13 17:29, Jiri Slaby wrote:
> On 06/12/2017, 07:03 AM, zhong jiang wrote:
>> On 2016/10/14 17:23, Jiri Slaby wrote:
>>> fcntl(0, F_SETOWN, 0x8000) triggers:
>>> UBSAN: Undefined behaviour in fs/fcntl.c:118:7
>>> negation of -2147483648 cannot be represented in type 'int':
>>> CPU: 1 PID: 18261 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1
>>> ...
>>> Call Trace:
>>> ...
>>>  [] ? f_setown+0x1d8/0x200
>>>  [] ? SyS_fcntl+0x999/0xf30
>>>  [] ? entry_SYSCALL_64_fastpath+0x23/0xc1
>>>
>>> Fix that by checking the arg parameter properly (against INT_MAX) and
>>> return immediatelly in case it is wrong. No error is returned, the
>>> same as in other cases.
>>>
>>> Signed-off-by: Jiri Slaby 
>>> Cc: Jeff Layton 
>>> Cc: "J. Bruce Fields" 
>>> Cc: Alexander Viro 
>>> Cc: linux-fsde...@vger.kernel.org
>>> ---
>>>  fs/fcntl.c | 4 
>>>  1 file changed, 4 insertions(+)
>>>
>>> diff --git a/fs/fcntl.c b/fs/fcntl.c
>>> index 350a2c8cfd28..bfc3b040d956 100644
>>> --- a/fs/fcntl.c
>>> +++ b/fs/fcntl.c
>>> @@ -112,6 +112,10 @@ void f_setown(struct file *filp, unsigned long arg, 
>>> int force)
>>> enum pid_type type;
>>> struct pid *pid;
>>> int who = arg;
>>> +
>>> +   if (arg > INT_MAX)
>>> +   return;
>>> +
>>> type = PIDTYPE_PID;
>>> if (who < 0
>>> type = PIDTYPE_PGID;
>> Hi, Jiri
>>
>> I hit the same issue,  but I see the upstream is still not changed.  Had any 
>> problem?
> Hi, it needed an update which I have just sent. So let's see if that
> gets applied.
>
> thanks,
  I have updated to the newest kernel version, but it still fails to get the
change.  Can you look at that?

Thanks
zhongjiang



Re: [PATCH] fs: fcntl, avoid undefined behaviour

2017-06-11 Thread zhong jiang
On 2016/10/14 17:23, Jiri Slaby wrote:
> fcntl(0, F_SETOWN, 0x8000) triggers:
> UBSAN: Undefined behaviour in fs/fcntl.c:118:7
> negation of -2147483648 cannot be represented in type 'int':
> CPU: 1 PID: 18261 Comm: syz-executor Not tainted 4.8.1-0-syzkaller #1
> ...
> Call Trace:
> ...
>  [] ? f_setown+0x1d8/0x200
>  [] ? SyS_fcntl+0x999/0xf30
>  [] ? entry_SYSCALL_64_fastpath+0x23/0xc1
>
> Fix that by checking the arg parameter properly (against INT_MAX) and
> return immediatelly in case it is wrong. No error is returned, the
> same as in other cases.
>
> Signed-off-by: Jiri Slaby 
> Cc: Jeff Layton 
> Cc: "J. Bruce Fields" 
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> ---
>  fs/fcntl.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index 350a2c8cfd28..bfc3b040d956 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -112,6 +112,10 @@ void f_setown(struct file *filp, unsigned long arg, int 
> force)
>   enum pid_type type;
>   struct pid *pid;
>   int who = arg;
> +
> + if (arg > INT_MAX)
> + return;
> +
>   type = PIDTYPE_PID;
>   if (who < 0
>   type = PIDTYPE_PGID;
Hi, Jiri

I hit the same issue, but I see that upstream has still not been changed.  Was
there any problem?

Thanks
zhongjiang



Re: [PATCH 3/3] mm: migrate: Stabilise page count when migrating transparent hugepages

2017-06-09 Thread zhong jiang
On 2017/6/8 20:07, Will Deacon wrote:
> On Thu, Jun 08, 2017 at 12:52:07PM +0200, Vlastimil Babka wrote:
>> On 06/06/2017 07:58 PM, Will Deacon wrote:
>>> When migrating a transparent hugepage, migrate_misplaced_transhuge_page
>>> guards itself against a concurrent fastgup of the page by checking that
>>> the page count is equal to 2 before and after installing the new pmd.
>>>
>>> If the page count changes, then the pmd is reverted back to the original
>>> entry, however there is a small window where the new (possibly writable)
>>> pmd is installed and the underlying page could be written by userspace.
>>> Restoring the old pmd could therefore result in loss of data.
>>>
>>> This patch fixes the problem by freezing the page count whilst updating
>>> the page tables, which protects against a concurrent fastgup without the
>>> need to restore the old pmd in the failure case (since the page count can
>>> no longer change under our feet).
>>>
>>> Cc: Mel Gorman 
>>> Signed-off-by: Will Deacon 
>>> ---
>>>  mm/migrate.c | 15 ++-
>>>  1 file changed, 2 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index 89a0a1707f4c..8b21f1b1ec6e 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -1913,7 +1913,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct 
>>> *mm,
>>> int page_lru = page_is_file_cache(page);
>>> unsigned long mmun_start = address & HPAGE_PMD_MASK;
>>> unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
>>> -   pmd_t orig_entry;
>>>  
>>> /*
>>>  * Rate-limit the amount of data that is being migrated to a node.
>>> @@ -1956,8 +1955,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct 
>>> *mm,
>>> /* Recheck the target PMD */
>>> mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
>>> ptl = pmd_lock(mm, pmd);
>>> -   if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
>>> -fail_putback:
>>> +   if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
>>> spin_unlock(ptl);
>>> mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
>>>  
>>> @@ -1979,7 +1977,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct 
>>> *mm,
>>> goto out_unlock;
>>> }
>>>  
>>> -   orig_entry = *pmd;
>>> entry = mk_huge_pmd(new_page, vma->vm_page_prot);
>>> entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>  
>>> @@ -1996,15 +1993,7 @@ int migrate_misplaced_transhuge_page(struct 
>>> mm_struct *mm,
>> There's a comment above this:
>>
>>/*
>>  * Clear the old entry under pagetable lock and establish the new 
>> PTE.
>>  * Any parallel GUP will either observe the old page blocking on the
>>  * page lock, block on the page table lock or observe the new page.
>>  * The SetPageUptodate on the new page and page_add_new_anon_rmap
>>  * guarantee the copy is visible before the pagetable update.
>>  */
>>
>> Is it still correct? Didn't the freezing prevent some of the cases above?
> I don't think the comment needs to change, the freezing is just doing
> correctly what the code tried to do before. Granted, the blocking might come
> about because of the count momentarily being set to 0 (and
> page_cache_add_speculative bailing), but that's just fastGUP implementation
> details, I think.
 I think it should be hold off the speculative , but the fastgup by userspace.
>>> set_pmd_at(mm, mmun_start, pmd, entry);
>>> update_mmu_cache_pmd(vma, address, &entry);
>>>  
>>> -   if (page_count(page) != 2) {
>> BTW, how did the old code recognize that page count would increase and then
>> decrease back?
> I'm not sure that case matters because the inc/dec would happen before the
> new PMD is put in place (otherwise it wouldn't be reachable via the
> fastGUP).
>
> Will
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>
> .
>




Re: [PATCH 3/3] mm: migrate: Stabilise page count when migrating transparent hugepages

2017-06-09 Thread zhong jiang
On 2017/6/8 20:07, Will Deacon wrote:
> On Thu, Jun 08, 2017 at 12:52:07PM +0200, Vlastimil Babka wrote:
>> On 06/06/2017 07:58 PM, Will Deacon wrote:
>>> When migrating a transparent hugepage, migrate_misplaced_transhuge_page
>>> guards itself against a concurrent fastgup of the page by checking that
>>> the page count is equal to 2 before and after installing the new pmd.
>>>
>>> If the page count changes, then the pmd is reverted back to the original
>>> entry, however there is a small window where the new (possibly writable)
>>> pmd is installed and the underlying page could be written by userspace.
>>> Restoring the old pmd could therefore result in loss of data.
>>>
>>> This patch fixes the problem by freezing the page count whilst updating
>>> the page tables, which protects against a concurrent fastgup without the
>>> need to restore the old pmd in the failure case (since the page count can
>>> no longer change under our feet).
>>>
>>> Cc: Mel Gorman 
>>> Signed-off-by: Will Deacon 
>>> ---
>>>  mm/migrate.c | 15 ++-
>>>  1 file changed, 2 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index 89a0a1707f4c..8b21f1b1ec6e 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -1913,7 +1913,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct 
>>> *mm,
>>> int page_lru = page_is_file_cache(page);
>>> unsigned long mmun_start = address & HPAGE_PMD_MASK;
>>> unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
>>> -   pmd_t orig_entry;
>>>  
>>> /*
>>>  * Rate-limit the amount of data that is being migrated to a node.
>>> @@ -1956,8 +1955,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct 
>>> *mm,
>>> /* Recheck the target PMD */
>>> mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
>>> ptl = pmd_lock(mm, pmd);
>>> -   if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
>>> -fail_putback:
>>> +   if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
>>> spin_unlock(ptl);
>>> mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
>>>  
>>> @@ -1979,7 +1977,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct 
>>> *mm,
>>> goto out_unlock;
>>> }
>>>  
>>> -   orig_entry = *pmd;
>>> entry = mk_huge_pmd(new_page, vma->vm_page_prot);
>>> entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>  
>>> @@ -1996,15 +1993,7 @@ int migrate_misplaced_transhuge_page(struct 
>>> mm_struct *mm,
>> There's a comment above this:
>>
>>/*
>>  * Clear the old entry under pagetable lock and establish the new 
>> PTE.
>>  * Any parallel GUP will either observe the old page blocking on the
>>  * page lock, block on the page table lock or observe the new page.
>>  * The SetPageUptodate on the new page and page_add_new_anon_rmap
>>  * guarantee the copy is visible before the pagetable update.
>>  */
>>
>> Is it still correct? Didn't the freezing prevent some of the cases above?
> I don't think the comment needs to change, the freezing is just doing
> correctly what the code tried to do before. Granted, the blocking might come
> about because of the count momentarily being set to 0 (and
> page_cache_add_speculative bailing), but that's just fastGUP implementation
> details, I think.
  The page table lock will prevent the fast-GUP by userspace.  I do not
understand why the race can still occur;
  can you explain it, please?

 Thanks
 zhongjiang
>>> set_pmd_at(mm, mmun_start, pmd, entry);
>>> update_mmu_cache_pmd(vma, address, &entry);
>>>  
>>> -   if (page_count(page) != 2) {
>> BTW, how did the old code recognize that page count would increase and then
>> decrease back?
> I'm not sure that case matters because the inc/dec would happen before the
> new PMD is put in place (otherwise it wouldn't be reachable via the
> fastGUP).
>
> Will
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>
> .
>




Re: [PATCH] mm: correct the comment when reclaimed pages exceed the scanned pages

2017-06-08 Thread zhong jiang
On 2017/6/8 14:46, Minchan Kim wrote:
> On Wed, Jun 07, 2017 at 04:31:06PM +0800, zhongjiang wrote:
>> The commit e1587a494540 ("mm: vmpressure: fix sending wrong events on
>> underflow") declare that reclaimed pages exceed the scanned pages due
>> to the thp reclaim. it is incorrect because THP will be spilt to normal
>> page and loop again. which will result in the scanned pages increment.
>>
>> Signed-off-by: zhongjiang 
>> ---
>>  mm/vmpressure.c | 5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/vmpressure.c b/mm/vmpressure.c
>> index 6063581..0e91ba3 100644
>> --- a/mm/vmpressure.c
>> +++ b/mm/vmpressure.c
>> @@ -116,8 +116,9 @@ static enum vmpressure_levels 
>> vmpressure_calc_level(unsigned long scanned,
>>  
>>  /*
>>   * reclaimed can be greater than scanned in cases
>> - * like THP, where the scanned is 1 and reclaimed
>> - * could be 512
>> + * like reclaimed slab pages, shrink_node just add
>> + * reclaimed page without a related increment to
>> + * scanned pages.
>>   */
>>  if (reclaimed >= scanned)
>>  goto out;
> Thanks for the fixing my fault!
>
> Acked-by: Minchan Kim 
>
> Frankly speaking, I'm not sure we need such comment in there at the cost
> of maintainance because it would be fragile but easy to fix by above simple
> condition so I think it would be better to remove that comment but others
> might be different. So, don't have any objection.
>
>
> .
>
 Thanks

 Regards
 zhongjiang



Re: mm, something wrong in page_lock_anon_vma_read()?

2017-06-08 Thread zhong jiang
On 2017/6/8 21:59, Vlastimil Babka wrote:
> On 06/08/2017 03:44 PM, Xishi Qiu wrote:
>> On 2017/5/23 17:33, Vlastimil Babka wrote:
>>
>>> On 05/23/2017 11:21 AM, zhong jiang wrote:
>>>> On 2017/5/23 0:51, Vlastimil Babka wrote:
>>>>> On 05/20/2017 05:01 AM, zhong jiang wrote:
>>>>>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>>>>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>>>>>> Here is a bug report form redhat: 
>>>>>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>>>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>>>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is 
>>>>>>>> not help.
>>>>>>>>
>>>>>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 
>>>>>>>> and _count=2),
>>>>>>>> and the value of mapping is a valid address(mapping = 
>>>>>>>> 0x8801b3e2a101),
>>>>>>>> but anon_vma has been corrupted.
>>>>>>>>
>>>>>>>> Any ideas?
>>>>>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>>>>>> a pte mapped in on top of another pte; but cannot begin tell you where
>>>>>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>>>>>
>>>>>>> Hugh
>>>>>>>
>>>>>>> .
>>>>>>>
>>>>>> Hi, Hugh
>>>>>>
>>>>>> I find the following message from the dmesg.
>>>>>>
>>>>>> [26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1
>>>>>>
>>>>>> I can prove that the __mapcount is misaccount.  when task is exited. the 
>>>>>> rmap
>>>>>> still exist.
>>>>> Check if the kernel in question contains this commit: ad33bb04b2a6 ("mm:
>>>>> thp: fix SMP race condition between THP page fault and MADV_DONTNEED")
>>>>   HI, Vlastimil
>>>>  
>>>>   I miss the patch.
>>> Try applying it then, there's good chance the error and crash will go
>>> away. Even if your workload doesn't actually run any madvise(MADV_DONTNEED).
>>>
>> Hi Vlastimil,
>>
>> I find this error was reported by Kirill as following, right?
>> https://patchwork.kernel.org/patch/7550401/
> That was reported by Minchan.
>
>> The call trace is quite like the same as ours.
> In that thread, the error seems just disappeared in the end.
  without any patch, I wonder how it could simply disappear.
> So, did you apply the patch I suggested? Did it help?
 yes, I applied the patch and tested for two weeks with no panic occurring.
 but the last panic only occurred after one month, so we are still not sure
  that it really resolves the issue.

  Thanks
zhongjiang
>> Thanks,
>> Xishi Qiu
>>
>>>> when I read the patch. I find the following issue. but I am sure it is 
>>>> right.
>>>>
>>>>   if (unlikely(pmd_trans_unstable(pmd)))
>>>> return 0;
>>>> /*
>>>>  * A regular pmd is established and it can't morph into a huge pmd
>>>>  * from under us anymore at this point because we hold the mmap_sem
>>>>  * read mode and khugepaged takes it in write mode. So now it's
>>>>  * safe to run pte_offset_map().
>>>>  */
>>>> pte = pte_offset_map(pmd, address);
>>>>
>>>>   after pmd_trans_unstable call,  without any protect method.  by the 
>>>> comments,
>>>>   it think the pte_offset_map is safe.before pte_offset_map call, it 
>>>> still may be
>>>>   unstable. it is possible?
>>> IIRC it's "unstable" wrt possible none->huge->none transition. But once
>>> we've seen it's a regular pmd via pmd_trans_unstable(), we're safe as a
>>> transition from regular pmd can't happen.
>>>
>>>>   Thanks
>>>> zhongjiang
>>>>>> Thanks
>>>>>> zhongjiang
>>>>>>
>>>>>> --
>>>>>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>>>>>> the body to majord...@kvack.org.  For more info on Linux MM,
>>>>>> see: http://www.linux-mm.org/ .
>>>>>> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>>>>>>
>>>>> .
>>>>>
>>>>
>>>> --
>>>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>>>> the body to majord...@kvack.org.  For more info on Linux MM,
>>>> see: http://www.linux-mm.org/ .
>>>> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>>>>
>>>
>>> .
>>>
>>
>>
>
> .
>




Re: [PATCH] mm: correct the comment when reclaimed pages exceed the scanned pages

2017-06-07 Thread zhong jiang
On 2017/6/7 16:31, zhongjiang wrote:
> The commit e1587a494540 ("mm: vmpressure: fix sending wrong events on
> underflow") declare that reclaimed pages exceed the scanned pages due
> to the thp reclaim. it is incorrect because THP will be spilt to normal
> page and loop again. which will result in the scanned pages increment.
>
> Signed-off-by: zhongjiang 
> ---
>  mm/vmpressure.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/mm/vmpressure.c b/mm/vmpressure.c
> index 6063581..0e91ba3 100644
> --- a/mm/vmpressure.c
> +++ b/mm/vmpressure.c
> @@ -116,8 +116,9 @@ static enum vmpressure_levels 
> vmpressure_calc_level(unsigned long scanned,
>  
>   /*
>* reclaimed can be greater than scanned in cases
> -  * like THP, where the scanned is 1 and reclaimed
> -  * could be 512
> +  * like reclaimed slab pages, shrink_node just add
> +  * reclaimed page without a related increment to
> +  * scanned pages.
>*/
>   if (reclaimed >= scanned)
>   goto out;
Hi, Minchan

What is your suggestion about this patch?

Thanks
zhongjiang



Re: [PATCH] Revert "mm: vmpressure: fix sending wrong events on underflow"

2017-06-06 Thread zhong jiang
On 2017/6/7 14:17, vinayak menon wrote:
> On Wed, Jun 7, 2017 at 11:42 AM, Minchan Kim  wrote:
>> On Wed, Jun 07, 2017 at 12:56:57PM +0800, zhong jiang wrote:
>>> On 2017/6/7 11:55, Minchan Kim wrote:
>>>> On Wed, Jun 07, 2017 at 11:08:37AM +0800, zhongjiang wrote:
>>>>> This reverts commit e1587a4945408faa58d0485002c110eb2454740c.
>>>>>
>>>>> THP lru page is reclaimed , THP is split to normal page and loop again.
>>>>> reclaimed pages should not be bigger than nr_scan.  because of each
>>>>> loop will increase nr_scan counter.
>>>> Unfortunately, there is still underflow issue caused by slab pages as
>>>> Vinayak reported in description of e1587a4945408 so we cannot revert.
>>>> Please correct comment instead of removing the logic.
>>>>
>>>> Thanks.
>>>   we calculate the vmpressue based on the Lru page, exclude the slab pages 
>>> by previous
>>>   discussion.is it not this?
>>>
>> IIRC, It is not merged into mainline although mmotm has it.
> That's right Minchan. That patch was not mainlined.
>
>
 Hi  Minchan and vinayak

 we should either revert the patch ("mm: vmpressure: fix sending wrong events on
underflow") and then apply the related mmotm patch, or drop the related mmotm
patch and then correct the comment.

 Which one makes more sense?  Maybe the latter is more feasible.  Suggestions?

 Thanks
 zhongjiang



Re: [PATCH] Revert "mm: vmpressure: fix sending wrong events on underflow"

2017-06-06 Thread zhong jiang
On 2017/6/7 11:55, Minchan Kim wrote:
> On Wed, Jun 07, 2017 at 11:08:37AM +0800, zhongjiang wrote:
>> This reverts commit e1587a4945408faa58d0485002c110eb2454740c.
>>
>> THP lru page is reclaimed , THP is split to normal page and loop again.
>> reclaimed pages should not be bigger than nr_scan.  because of each
>> loop will increase nr_scan counter.
> Unfortunately, there is still underflow issue caused by slab pages as
> Vinayak reported in description of e1587a4945408 so we cannot revert.
> Please correct comment instead of removing the logic.
>
> Thanks.
  By the previous discussion, we calculate the vmpressure based on the LRU
pages and exclude the slab pages.
  Is that not the case?

  Thanks
 zhongjiang
>> Signed-off-by: zhongjiang 
>> ---
>>  mm/vmpressure.c | 10 +-
>>  1 file changed, 1 insertion(+), 9 deletions(-)
>>
>> diff --git a/mm/vmpressure.c b/mm/vmpressure.c
>> index 6063581..149fdf6 100644
>> --- a/mm/vmpressure.c
>> +++ b/mm/vmpressure.c
>> @@ -112,16 +112,9 @@ static enum vmpressure_levels 
>> vmpressure_calc_level(unsigned long scanned,
>>  unsigned long reclaimed)
>>  {
>>  unsigned long scale = scanned + reclaimed;
>> -unsigned long pressure = 0;
>> +unsigned long pressure;
>>  
>>  /*
>> - * reclaimed can be greater than scanned in cases
>> - * like THP, where the scanned is 1 and reclaimed
>> - * could be 512
>> - */
>> -if (reclaimed >= scanned)
>> -goto out;
>> -/*
>>   * We calculate the ratio (in percents) of how many pages were
>>   * scanned vs. reclaimed in a given time frame (window). Note that
>>   * time is in VM reclaimer's "ticks", i.e. number of pages
>> @@ -131,7 +124,6 @@ static enum vmpressure_levels 
>> vmpressure_calc_level(unsigned long scanned,
>>  pressure = scale - (reclaimed * scale / scanned);
>>  pressure = pressure * 100 / scale;
>>  
>> -out:
>>  pr_debug("%s: %3lu  (s: %lu  r: %lu)\n", __func__, pressure,
>>   scanned, reclaimed);
>>  
>> -- 
>> 1.7.12.4
>>
> .
>




Re: [PATCH] mm: vmscan: do not pass reclaimed slab to vmpressure

2017-06-06 Thread zhong jiang
On 2017/6/7 10:53, Minchan Kim wrote:
> Hi,
>
> On Tue, Jun 06, 2017 at 09:00:55PM +0800, zhong jiang wrote:
>> On 2017/1/31 7:40, Minchan Kim wrote:
>>> Hi Vinayak,
>>> Sorry for late response. It was Lunar New Year holidays.
>>>
>>> On Fri, Jan 27, 2017 at 01:43:23PM +0530, vinayak menon wrote:
>>>>> Thanks for the explain. However, such case can happen with THP page
>>>>> as well as slab. In case of THP page, nr_scanned is 1 but nr_reclaimed
>>>>> could be 512 so I think vmpressure should have a logic to prevent undeflow
>>>>> regardless of slab shrinking.
>>>>>
>>>> I see. Going to send a vmpressure fix. But, wouldn't the THP case
>>>> result in incorrect
>>>> vmpressure reporting even if we fix the vmpressure underflow problem ?
>>> If a THP page is reclaimed, it reports lower pressure due to bigger
>>> reclaim ratio(ie, reclaimed/scanned) compared to normal pages but
>>> it's not a problem, is it? Because VM reclaimed more memory than
>>> expected so memory pressure isn't severe now.
>>   Hi, Minchan
>>
>>   THP lru page is reclaimed, reclaim ratio bigger make sense. but I read the 
>> code, I found
>>   THP is split to normal pages and loop again.  reclaimed pages should not 
>> be bigger
>>than nr_scan.  because of each loop will increase nr_scan counter.
>>  
>>It is likely  I miss something.  you can point out the point please.
> You are absolutely right.
>
> I got confused by nr_scanned from isolate_lru_pages and sc->nr_scanned
> from shrink_page_list.
>
> Thanks.
>
>
> .
>
 Hi, Minchan

 I will send the revert patch shortly.  What do you think?

 Thanks
 zhongjiang



Re: double call identical release when there is a race hitting

2017-06-06 Thread zhong jiang
On 2017/6/6 23:56, Oleg Nesterov wrote:
> I can't answer authoritatively, but
>
> On 06/06, zhong jiang wrote:
>> Hi
>>
>> when I review the code, I find the following scenario will lead to a race ,
>> but I am not sure whether the real issue will hit or not.
>>
>> cpu1  
>> cpu2
>> exit_mmap   
>> mmu_notifier_unregister
>>__mmu_notifier_release srcu_read_lock
>> srcu_read_lock
>> mm->ops->release(mn, mm) mm->ops->release(mn,mm)
>>srcu_read_unlock 
>> srcu_read_unlock
>>
>>
>> obviously,  the specified mm will call identical release function when
>> the related condition satisfy.  is it right?
> I think you are right, this is possible, perhaps the comments should mention
> this explicitly.
>
> See the changelog in d34883d4e35c0a994e91dd847a82b4c9e0c31d83 "mm: 
> mmu_notifier:
> re-fix freed page still mapped in secondary MMU":
>
>   "multiple ->release() callouts", we needn't care it too much ...
>
> Oleg.
>
>
> .
>
Thank you for clarification.
 yes, I see that the author admits that this is an issue.  The patch describes
it as really rare.
 Anyway, this issue should be fixed in a separate patch.

but so far the issue unfortunately still exists.

Regards
zhongjiang






double call identical release when there is a race hitting

2017-06-06 Thread zhong jiang
Hi

when I review the code, I find the following scenario will lead to a race ,
but I am not sure whether the real issue will hit or not.

cpu1  cpu2
exit_mmap   mmu_notifier_unregister
   __mmu_notifier_release srcu_read_lock
srcu_read_lock
mm->ops->release(mn, mm) mm->ops->release(mn,mm)
   srcu_read_unlock 
srcu_read_unlock


obviously,  the specified mm will call identical release function when
the related condition satisfy.  is it right?

Thanks
zhongjiang



Re: [PATCH] mm: vmscan: do not pass reclaimed slab to vmpressure

2017-06-06 Thread zhong jiang
On 2017/1/31 7:40, Minchan Kim wrote:
> Hi Vinayak,
> Sorry for late response. It was Lunar New Year holidays.
>
> On Fri, Jan 27, 2017 at 01:43:23PM +0530, vinayak menon wrote:
>>> Thanks for the explain. However, such case can happen with THP page
>>> as well as slab. In case of THP page, nr_scanned is 1 but nr_reclaimed
>>> could be 512 so I think vmpressure should have a logic to prevent undeflow
>>> regardless of slab shrinking.
>>>
>> I see. Going to send a vmpressure fix. But, wouldn't the THP case
>> result in incorrect
>> vmpressure reporting even if we fix the vmpressure underflow problem ?
> If a THP page is reclaimed, it reports lower pressure due to bigger
> reclaim ratio(ie, reclaimed/scanned) compared to normal pages but
> it's not a problem, is it? Because VM reclaimed more memory than
> expected so memory pressure isn't severe now.
  Hi, Minchan

  When a THP LRU page is reclaimed, a bigger reclaim ratio makes sense. But when
I read the code, I found
  that the THP is split into normal pages which are looped over again, so the
reclaimed pages should not
   be bigger than nr_scan, because each loop iteration increases the nr_scan counter.
 
   It is likely I am missing something; please point it out.
 
  Thanks
  zhongjiang
>> unsigned arithmetic results in the pressure value to be
>> huge, thus resulting in a critical event being sent to
>> root cgroup. Fix this by not passing the reclaimed slab
>> count to vmpressure, with the assumption that vmpressure
>> should show the actual pressure on LRU which is now
>> diluted by adding reclaimed slab without a corresponding
>> scanned value.
> I can't guess justfication of your assumption from the description.
> Why do we consider only LRU pages for vmpressure? Could you elaborate
> a bit?
>
 When we encountered the false events from vmpressure, thought the problem
 could be that slab scanned is not included in sc->nr_scanned, like it is 
 done
 for reclaimed. But later thought vmpressure works only on the scanned and
 reclaimed from LRU. I can explain what I understand, let me know if this is
 incorrect.
 vmpressure is an index which tells the pressure on LRU, and thus an
 indicator of thrashing. In shrink_node when we come out of the inner 
 do-while
 loop after shrinking the lruvec, the scanned and reclaimed corresponds to 
 the
 pressure felt on the LRUs which in turn indicates the pressure on VM. The
 moment we add the slab reclaimed pages to the reclaimed, we dilute the
 actual pressure felt on LRUs. When slab scanned/reclaimed is not included
 in the vmpressure, the values will indicate the actual pressure and if 
 there
 were a lot of slab reclaimed pages it will result in lesser pressure
 on LRUs in the next run which will again be indicated by vmpressure. i.e. 
 the
>>> I think there is no intention to exclude slab by design of vmpressure.
>>> Beause slab is memory consumption so freeing of slab pages really helps
>>> the memory pressure. Also, there might be slab-intensive workload rather
>>> than LRU. It would be great if vmpressure works well with that case.
>>> But the problem with involving slab for vmpressure is it's not fair with
>>> LRU pages. LRU pages are 1:1 cost model for scan:free but slab shriking
>>> depends the each slab's object population. It means it's impossible to
>>> get stable cost model with current slab shrinkg model, unfortunately.
>>> So I don't obejct this patch although I want to see slab shrink model's
>>> change which is heavy-handed work.
>>>
>> Looking at the code, the slab reclaimed pages started getting passed to
>> vmpressure after the commit ("mm: vmscan: invoke slab shrinkers from
>> shrink_zone()").
>> But as you said, this may be helpful for slab intensive workloads. But in its
>> current form I think it results in incorrect vmpressure reporting because of 
>> not
>> accounting the slab scanned pages. Resending the patch with a modified
>> commit msg
>> since the underflow issue is fixed separately. Thanks Minchan.
> Make sense.
>
> Thanks, Vinayak!
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>
> .
>




Re: [PATCH v2] signal: Avoid undefined behaviour in kill_something_info

2017-06-05 Thread zhong jiang
On 2017/6/5 21:31, Oleg Nesterov wrote:
> On 06/05, zhongjiang wrote:
>> --- a/kernel/signal.c
>> +++ b/kernel/signal.c
>> @@ -1395,6 +1395,12 @@ static int kill_something_info(int sig, struct 
>> siginfo *info, pid_t pid)
>>
>>  read_lock(&tasklist_lock);
>>  if (pid != -1) {
>> +/*
>> + * -INT_MIN is undefined, it need to exclude following case to
>> + * avoid the UBSAN detection.
>> + */
>> +if (pid == INT_MIN)
>> +return -ESRCH;
> you need to do this before read_lock(tasklist)
>
> Oleg.
>
>
> .
>
  I am so sorry for disturbing.

 Thanks
zhongjiang



Re: [PATCH v2] signal: Avoid undefined behaviour in kill_something_info

2017-06-05 Thread zhong jiang
On 2017/6/5 21:09, Michal Hocko wrote:
> On Mon 05-06-17 20:53:27, zhongjiang wrote:
>> diff --git a/kernel/signal.c b/kernel/signal.c
>> index ca92bcf..63148f7 100644
>> --- a/kernel/signal.c
>> +++ b/kernel/signal.c
>> @@ -1395,6 +1395,12 @@ static int kill_something_info(int sig, struct 
>> siginfo *info, pid_t pid)
>>  
>>  read_lock(&tasklist_lock);
>>  if (pid != -1) {
>> +/*
>> + * -INT_MIN is undefined, it need to exclude following case to 
>> + * avoid the UBSAN detection.
>> + */
>> +if (pid == INT_MIN)
>> +return -ESRCH;
> this will obviously keep the tasklist_lock held...
 oh, it is my fault.   Thank you for clarify.

 Thanks
zhongjiang
>>  ret = __kill_pgrp_info(sig, info,
>>  pid ? find_vpid(-pid) : task_pgrp(current));
>>  } else {
>> -- 
>> 1.7.12.4




Re: [PATCH] signal: Avoid undefined behaviour in kill_something_info

2017-06-05 Thread zhong jiang
On 2017/6/5 20:37, Oleg Nesterov wrote:
> On 06/05, zhongjiang wrote:
>>  static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
>>  {
>> -int ret;
>> +int ret, vpid;
>>  
>>  if (pid > 0) {
>>  rcu_read_lock();
>> @@ -1395,8 +1395,12 @@ static int kill_something_info(int sig, struct 
>> siginfo *info, pid_t pid)
>>  
>>  read_lock(&tasklist_lock);
>>  if (pid != -1) {
>> +if (pid == INT_MIN)
>> +vpid = INT_MAX;
> Well, this probably needs a comment to explain that this is just "avoid ub".
>
> And if we really want the fix, to me
>
>   if (pid == INT_MIN)
>   return -ESRCH;
>
> at the start makes more sense...
>
> Oleg.
>
>
> .
>
 OK, I will modify it in v2 shortly.  Thanks

 Regards
zhongjiang



Re: [PATCH] mm/vmalloc: a slight change of compare target in __insert_vmap_area()

2017-06-01 Thread zhong jiang
On 2017/6/2 9:45, Wei Yang wrote:
> On Fri, May 26, 2017 at 09:55:31AM +0800, zhong jiang wrote:
>> On 2017/5/26 9:36, Wei Yang wrote:
>>> On Thu, May 25, 2017 at 11:04:44AM +0800, zhong jiang wrote:
>>>> I hit the overlap issue, but it  is hard to reproduced. if you think it is 
>>>> safe. and the situation
>>>> is not happen. AFAIC, it is no need to add the code.
>>>>
>>>> if you insist on the point. Maybe VM_WARN_ON is a choice.
>>>>
>>> Do you have some log to show the overlap happens?
>> Hi  wei
>>
>> cat /proc/vmallocinfo
>> 0xf158-0xf160  524288 raw_dump_mem_write+0x10c/0x188 phys=8b901000 
>> ioremap
>> 0xf1638000-0xf163a0008192 mcss_pou_queue_init+0xa0/0x13c [mcss] 
>> phys=fc614000 ioremap
>> 0xf528e000-0xf5292000   16384 n_tty_open+0x10/0xd0 pages=3 vmalloc
>> 0xf500-0xf9001000 67112960 devm_ioremap+0x38/0x70 phys=4000 ioremap
> These two ranges overlap.
>
> This is hard to say where is the problem. From the code point of view, I don't
> see there is possibility to allocate an overlapped range.
>
> Which version of your kernel?
> Hard to reproduce means just see once? 
  Yes, just once. I also do not see any problem from the code.  The kernel
version is Linux 4.1,
 but the overlap did indeed occur. 

 Thanks
zhongjiang
>> 0xfe001000-0xfe0020004096 iotable_init+0x0/0xc phys=20001000 ioremap
>> 0xfe20-0xfe2010004096 iotable_init+0x0/0xc phys=1a00 ioremap
>> 0xff10-0xff1010004096 iotable_init+0x0/0xc phys=2000a000 ioremap
>>
>> I hit the above issue, but the log no more useful info. it just is found by 
>> accident.
>> and it is hard to reprodeced. no more info can be supported for further 
>> investigation.
>> therefore, it is no idea for me. 
>>
>> Thanks
>> zhongjinag
>>




Re: [PATCH] mm/vmalloc: a slight change of compare target in __insert_vmap_area()

2017-05-25 Thread zhong jiang
On 2017/5/26 9:36, Wei Yang wrote:
> On Thu, May 25, 2017 at 11:04:44AM +0800, zhong jiang wrote:
>> I hit the overlap issue, but it  is hard to reproduced. if you think it is 
>> safe. and the situation
>> is not happen. AFAIC, it is no need to add the code.
>>
>> if you insist on the point. Maybe VM_WARN_ON is a choice.
>>
> Do you have some log to show the overlap happens?
 Hi  wei

cat /proc/vmallocinfo
0xf158-0xf160  524288 raw_dump_mem_write+0x10c/0x188 phys=8b901000 
ioremap
0xf1638000-0xf163a0008192 mcss_pou_queue_init+0xa0/0x13c [mcss] 
phys=fc614000 ioremap
0xf528e000-0xf5292000   16384 n_tty_open+0x10/0xd0 pages=3 vmalloc
0xf500-0xf9001000 67112960 devm_ioremap+0x38/0x70 phys=4000 ioremap
0xfe001000-0xfe0020004096 iotable_init+0x0/0xc phys=20001000 ioremap
0xfe20-0xfe2010004096 iotable_init+0x0/0xc phys=1a00 ioremap
0xff10-0xff1010004096 iotable_init+0x0/0xc phys=2000a000 ioremap

I hit the above issue, but the log has no more useful information; it was found by 
accident,
and it is hard to reproduce. No further information is available for 
investigation,
so I have no idea what caused it. 

Thanks
zhongjinag




Re: [PATCH] mm/vmalloc: a slight change of compare target in __insert_vmap_area()

2017-05-24 Thread zhong jiang
I hit the overlap issue, but it is hard to reproduce. If you think it is 
safe and the situation
cannot happen, then AFAIC there is no need to add the code.

If you insist on the point, maybe VM_WARN_ON is a choice.

Regards
zhongjiang
On 2017/5/24 18:03, Wei Yang wrote:
> The vmap RB tree store the elements in order and no overlap between any of
> them. The comparison in __insert_vmap_area() is to decide which direction
> the search should follow and make sure the new vmap_area is not overlap
> with any other.
>
> Current implementation fails to do the overlap check.
>
> When first "if" is not true, it means
>
> va->va_start >= tmp_va->va_end
>
> And with the truth
>
> xxx->va_end > xxx->va_start
>
> The deduction is
>
> va->va_end > tmp_va->va_start
>
> which is the condition in second "if".
>
> This patch changes a little of the comparison in __insert_vmap_area() to
> make sure it forbids the overlapped vmap_area.
>
> Signed-off-by: Wei Yang 
> ---
>  mm/vmalloc.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 0b057628a7ba..8087451cb332 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -360,9 +360,9 @@ static void __insert_vmap_area(struct vmap_area *va)
>  
>   parent = *p;
>   tmp_va = rb_entry(parent, struct vmap_area, rb_node);
> - if (va->va_start < tmp_va->va_end)
> + if (va->va_end <= tmp_va->va_start)
>   p = &(*p)->rb_left;
> - else if (va->va_end > tmp_va->va_start)
> + else if (va->va_start >= tmp_va->va_end)
>   p = &(*p)->rb_right;
>   else
>   BUG();




Re: mm, something wring in page_lock_anon_vma_read()?

2017-05-23 Thread zhong jiang
On 2017/5/23 17:33, Vlastimil Babka wrote:
> On 05/23/2017 11:21 AM, zhong jiang wrote:
>> On 2017/5/23 0:51, Vlastimil Babka wrote:
>>> On 05/20/2017 05:01 AM, zhong jiang wrote:
>>>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>>>> Here is a bug report form redhat: 
>>>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>>>>>> help.
>>>>>>
>>>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>>>>>> _count=2),
>>>>>> and the value of mapping is a valid address(mapping = 
>>>>>> 0x8801b3e2a101),
>>>>>> but anon_vma has been corrupted.
>>>>>>
>>>>>> Any ideas?
>>>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>>>> a pte mapped in on top of another pte; but cannot begin tell you where
>>>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>>>
>>>>> Hugh
>>>>>
>>>>> .
>>>>>
>>>> Hi, Hugh
>>>>
>>>> I find the following message from the dmesg.
>>>>
>>>> [26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1
>>>>
>>>> I can prove that the __mapcount is misaccount.  when task is exited. the 
>>>> rmap
>>>> still exist.
>>> Check if the kernel in question contains this commit: ad33bb04b2a6 ("mm:
>>> thp: fix SMP race condition between THP page fault and MADV_DONTNEED")
>>   HI, Vlastimil
>>  
>>   I miss the patch.
> Try applying it then, there's good chance the error and crash will go
> away. Even if your workload doesn't actually run any madvise(MADV_DONTNEED).
 ok , I will try.   Thanks
>> when I read the patch. I find the following issue. but I am sure it is right.
>>
>>   if (unlikely(pmd_trans_unstable(pmd)))
>> return 0;
>> /*
>>  * A regular pmd is established and it can't morph into a huge pmd
>>  * from under us anymore at this point because we hold the mmap_sem
>>  * read mode and khugepaged takes it in write mode. So now it's
>>  * safe to run pte_offset_map().
>>  */
>> pte = pte_offset_map(pmd, address);
>>
>>   after pmd_trans_unstable call,  without any protect method.  by the 
>> comments,
>>   it think the pte_offset_map is safe.before pte_offset_map call, it 
>> still may be
>>   unstable. it is possible?
> IIRC it's "unstable" wrt possible none->huge->none transition. But once
> we've seen it's a regular pmd via pmd_trans_unstable(), we're safe as a
> transition from regular pmd can't happen.
  Thank you for clarifying. 
 
  Regards
 zhongjiang
>>   Thanks
>> zhongjiang
>>>> Thanks
>>>> zhongjiang
>>>>
>>>> --
>>>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>>>> the body to majord...@kvack.org.  For more info on Linux MM,
>>>> see: http://www.linux-mm.org/ .
>>>> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>>>>
>>> .
>>>
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majord...@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>
> .
>




Re: mm, something wring in page_lock_anon_vma_read()?

2017-05-23 Thread zhong jiang
On 2017/5/23 0:51, Vlastimil Babka wrote:
> On 05/20/2017 05:01 AM, zhong jiang wrote:
>> On 2017/5/20 10:40, Hugh Dickins wrote:
>>> On Sat, 20 May 2017, Xishi Qiu wrote:
>>>> Here is a bug report form redhat: 
>>>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>>>> And I meet the bug too. However it is hard to reproduce, and 
>>>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>>>> help.
>>>>
>>>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>>>> _count=2),
>>>> and the value of mapping is a valid address(mapping = 0x8801b3e2a101),
>>>> but anon_vma has been corrupted.
>>>>
>>>> Any ideas?
>>> Sorry, no.  I assume that _mapcount has been misaccounted, for example
>>> a pte mapped in on top of another pte; but cannot begin tell you where
>>> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>>>
>>> Hugh
>>>
>>> .
>>>
>> Hi, Hugh
>>
>> I find the following message from the dmesg.
>>
>> [26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1
>>
>> I can prove that the __mapcount is misaccount.  when task is exited. the rmap
>> still exist.
> Check if the kernel in question contains this commit: ad33bb04b2a6 ("mm:
> thp: fix SMP race condition between THP page fault and MADV_DONTNEED")
  HI, Vlastimil
 
  I missed the patch.  When I read the patch, I found the following issue, but I 
am not sure whether it is right.

  if (unlikely(pmd_trans_unstable(pmd)))
return 0;
/*
 * A regular pmd is established and it can't morph into a huge pmd
 * from under us anymore at this point because we hold the mmap_sem
 * read mode and khugepaged takes it in write mode. So now it's
 * safe to run pte_offset_map().
 */
pte = pte_offset_map(pmd, address);

  after pmd_trans_unstable call,  without any protect method.  by the comments,
  it think the pte_offset_map is safe.before pte_offset_map call, it still 
may be
  unstable. it is possible?

  Thanks
zhongjiang
>> Thanks
>> zhongjiang
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majord...@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>>
>
> .
>




Re: mm, something wring in page_lock_anon_vma_read()?

2017-05-19 Thread zhong jiang
On 2017/5/20 10:40, Hugh Dickins wrote:
> On Sat, 20 May 2017, Xishi Qiu wrote:
>> Here is a bug report form redhat: 
>> https://bugzilla.redhat.com/show_bug.cgi?id=1305620
>> And I meet the bug too. However it is hard to reproduce, and 
>> 624483f3ea82598("mm: rmap: fix use-after-free in __put_anon_vma") is not 
>> help.
>>
>> From the vmcore, it seems that the page is still mapped(_mapcount=0 and 
>> _count=2),
>> and the value of mapping is a valid address(mapping = 0x8801b3e2a101),
>> but anon_vma has been corrupted.
>>
>> Any ideas?
> Sorry, no.  I assume that _mapcount has been misaccounted, for example
> a pte mapped in on top of another pte; but cannot begin tell you where
> in Red Hat's kernel-3.10.0-229.4.2.el7 that might happen.
>
> Hugh
>
> .
>
Hi, Hugh

I find the following message from the dmesg.

[26068.316592] BUG: Bad rss-counter state mm:8800a7de2d80 idx:1 val:1

I can prove that the __mapcount is misaccount.  when task is exited. the rmap
still exist.

Thanks
zhongjiang



Re: [Qustion] vmalloc area overlap with another allocated vmalloc area

2017-05-17 Thread zhong jiang
On 2017/5/17 21:44, Michal Hocko wrote:
> On Wed 17-05-17 20:53:57, zhong jiang wrote:
>> +to linux-mm maintainer for any suggestions
>>
>> Thanks
>> zhongjiang
>> On 2017/5/16 13:03, zhong jiang wrote:
>>> Hi
>>>
>>> I  hit the following issue by runing /proc/vmallocinfo.  The kernel is 4.1 
>>> stable and
>>> 32 bit to be used.  after I expand the vamlloc area,  the issue is not 
>>> occur again.
>>> it is related to the overflow. but I do not see any problem so far.
> Is this a clean 4.1 stable kernel without any additional patches on top?
> Are you able to reproduce this? How? Does the same problem happen with
> the current Linus tree?
  It is hard to reproduce — it happened only in a special case and only once; we cannot 
construct the case.
  I have not tested it on the Linus tree so far, because I know it is hard to 
reproduce.

  Just by reading the code, I do not find the same issue. so I have no idea.

 Thanks
 zhongjiang 
>>> cat /proc/vmallocinfo
>>> 0xf158-0xf160  524288 raw_dump_mem_write+0x10c/0x188 phys=8b901000 
>>> ioremap
>>> 0xf1638000-0xf163a0008192 mcss_pou_queue_init+0xa0/0x13c [mcss] 
>>> phys=fc614000 ioremap
>>> 0xf528e000-0xf5292000   16384 n_tty_open+0x10/0xd0 pages=3 vmalloc
>>> 0xf500-0xf9001000 67112960 devm_ioremap+0x38/0x70 phys=4000 ioremap
>>> 0xfe001000-0xfe0020004096 iotable_init+0x0/0xc phys=20001000 ioremap
>>> 0xfe20-0xfe2010004096 iotable_init+0x0/0xc phys=1a00 ioremap
>>> 0xff10-0xff1010004096 iotable_init+0x0/0xc phys=2000a000 ioremap
>>>
>>> n_tty_open allocate the vmap area is surrounded by the devm_ioremap ioremap 
>>> by above info.
>>> I do not see also the race in the condition.
>>>
>>> I have no idea to the issue.  Anyone has any suggestions will be 
>>> appreicated.
>>> The related config is attatched.
>>>
>>> Thanks
>>> zhongjiang




Re: [Qustion] vmalloc area overlap with another allocated vmalloc area

2017-05-17 Thread zhong jiang
+to linux-mm maintainer for any suggestions

Thanks
zhongjiang
On 2017/5/16 13:03, zhong jiang wrote:
> Hi
>
> I  hit the following issue by runing /proc/vmallocinfo.  The kernel is 4.1 
> stable and
> 32 bit to be used.  after I expand the vamlloc area,  the issue is not occur 
> again.
> it is related to the overflow. but I do not see any problem so far.
>
> cat /proc/vmallocinfo
> 0xf158-0xf160  524288 raw_dump_mem_write+0x10c/0x188 phys=8b901000 
> ioremap
> 0xf1638000-0xf163a0008192 mcss_pou_queue_init+0xa0/0x13c [mcss] 
> phys=fc614000 ioremap
> 0xf528e000-0xf5292000   16384 n_tty_open+0x10/0xd0 pages=3 vmalloc
> 0xf500-0xf9001000 67112960 devm_ioremap+0x38/0x70 phys=4000 ioremap
> 0xfe001000-0xfe0020004096 iotable_init+0x0/0xc phys=20001000 ioremap
> 0xfe20-0xfe2010004096 iotable_init+0x0/0xc phys=1a00 ioremap
> 0xff10-0xff1010004096 iotable_init+0x0/0xc phys=2000a000 ioremap
>
> n_tty_open allocate the vmap area is surrounded by the devm_ioremap ioremap 
> by above info.
> I do not see also the race in the condition.
>
> I have no idea to the issue.  Anyone has any suggestions will be appreicated.
> The related config is attatched.
>
> Thanks
> zhongjiang




[Qustion] vmalloc area overlap with another allocated vmalloc area

2017-05-15 Thread zhong jiang
Hi

I hit the following issue by running /proc/vmallocinfo.  The kernel is 4.1 
stable,
32-bit.  After I expanded the vmalloc area, the issue did not occur 
again;
it may be related to an overflow, but I do not see any problem in the code so far.

cat /proc/vmallocinfo
0xf158-0xf160  524288 raw_dump_mem_write+0x10c/0x188 phys=8b901000 
ioremap
0xf1638000-0xf163a0008192 mcss_pou_queue_init+0xa0/0x13c [mcss] 
phys=fc614000 ioremap
0xf528e000-0xf5292000   16384 n_tty_open+0x10/0xd0 pages=3 vmalloc
0xf500-0xf9001000 67112960 devm_ioremap+0x38/0x70 phys=4000 ioremap
0xfe001000-0xfe0020004096 iotable_init+0x0/0xc phys=20001000 ioremap
0xfe20-0xfe2010004096 iotable_init+0x0/0xc phys=1a00 ioremap
0xff10-0xff1010004096 iotable_init+0x0/0xc phys=2000a000 ioremap

Per the info above, the vmap area allocated by n_tty_open is surrounded by the 
devm_ioremap range.
I also do not see a race condition.

I have no idea about the issue.  Any suggestions will be appreciated.
The related config is attached.

Thanks
zhongjiang
#
# Automatically generated file; DO NOT EDIT.
# Linux/arm 4.1.12 Kernel Configuration
#
CONFIG_ARM=y
CONFIG_MIGHT_HAVE_PCI=y
CONFIG_SYS_SUPPORTS_APM_EMULATION=y
CONFIG_HAVE_PROC_CPU=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
# CONFIG_IRQ_BYPASS is not set
# CONFIG_LIB_INTERRUPT is not set
# CONFIG_SET_IRQPRIORITY is not set
# CONFIG_SRE_PREHANDLER is not set
# CONFIG_IPI_COMBINE is not set
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_NEED_DMA_MAP_STATE=y
CONFIG_ARCH_SUPPORTS_UPROBES=y
CONFIG_FIQ=y
CONFIG_VECTORS_BASE=0x
CONFIG_NEED_MACH_MEMORY_H=y
CONFIG_PHYS_OFFSET=0x800
CONFIG_GENERIC_BUG=y
CONFIG_PGTABLE_LEVELS=2
# CONFIG_ARCH_AARCH32_ES_SUPPORT is not set
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
CONFIG_IRQ_WORK=y
CONFIG_BUILDTIME_EXTABLE_SORT=y

#
# General setup
#
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_CROSS_COMPILE=""
# CONFIG_COMPILE_TEST is not set
CONFIG_LOCALVERSION=""
# CONFIG_LOCALVERSION_AUTO is not set
CONFIG_HAVE_KERNEL_GZIP=y
CONFIG_HAVE_KERNEL_LZMA=y
CONFIG_HAVE_KERNEL_XZ=y
CONFIG_HAVE_KERNEL_LZO=y
CONFIG_HAVE_KERNEL_LZ4=y
# CONFIG_KERNEL_GZIP is not set
CONFIG_KERNEL_LZMA=y
# CONFIG_KERNEL_XZ is not set
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
CONFIG_DEFAULT_HOSTNAME="(none)"
# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_CROSS_MEMORY_ATTACH=y
# CONFIG_FHANDLE is not set
# CONFIG_USELIB is not set
CONFIG_AUDIT=y
CONFIG_HAVE_ARCH_AUDITSYSCALL=y
CONFIG_AUDITSYSCALL=y
CONFIG_AUDIT_WATCH=y
CONFIG_AUDIT_TREE=y

#
# IRQ subsystem
#
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_IRQ_SHOW_LEVEL=y
CONFIG_HARDIRQS_SW_RESEND=y
CONFIG_IRQ_DOMAIN=y
CONFIG_IRQ_DOMAIN_HIERARCHY=y
CONFIG_HANDLE_DOMAIN_IRQ=y
# CONFIG_IRQ_DOMAIN_DEBUG is not set
CONFIG_IRQ_FORCED_THREADING=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_ARCH_HAS_TICK_BROADCAST=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y

#
# Timers subsystem
#
CONFIG_HZ_PERIODIC=y
# CONFIG_NO_HZ_IDLE is not set
# CONFIG_NO_HZ_FULL is not set
# CONFIG_NO_HZ is not set
# CONFIG_HIGH_RES_TIMERS is not set

#
# CPU/Task time and stats accounting
#
CONFIG_TICK_CPU_ACCOUNTING=y
# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set
# CONFIG_IRQ_TIME_ACCOUNTING is not set
# CONFIG_BSD_PROCESS_ACCT is not set
# CONFIG_TASKSTATS is not set

#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
CONFIG_SRCU=y
# CONFIG_TASKS_RCU is not set
CONFIG_RCU_STALL_COMMON=y
# CONFIG_RCU_USER_QS is not set
CONFIG_RCU_FANOUT=32
CONFIG_RCU_FANOUT_LEAF=16
# CONFIG_RCU_FANOUT_EXACT is not set
# CONFIG_TREE_RCU_TRACE is not set
CONFIG_RCU_KTHREAD_PRIO=0
# CONFIG_RCU_NOCB_CPU is not set
# CONFIG_RCU_EXPEDITE_BOOT is not set
# CONFIG_BUILD_BIN2C is not set
# CONFIG_IKCONFIG is not set
CONFIG_LOG_BUF_SHIFT=14
CONFIG_LOG_CPU_MAX_BUF_SHIFT=12
CONFIG_GENERIC_SCHED_CLOCK=y
CONFIG_CGROUPS=y
CONFIG_CGROUP_DEBUG=y
# CONFIG_CGROUP_FREEZER is not set
# CONFIG_CGROUP_PIDS is not set
# CONFIG_CGROUP_DEVICE is not set
CONFIG_CPUSETS=y
CONFIG_PROC_PID_CPUSET=y
CONFIG_CGROUP_CPUACCT=y
# CONFIG_MEMCG is not set
# CONFIG_CGROUP_HUGETLB is not set
# CONFIG_CGROUP_PERF is not set
CONFIG_CGROUP_SCHED=y
CONFIG_FAIR_GROUP_SCHED=y
# CONFIG_CFS_BANDWIDTH is not set
# CONFIG_RT_GROUP_SCHED is not set
# CONFIG_BLK_CGROUP is not set
# CONFIG_CHECKPOINT_RESTORE is not set
# CONFIG_NAMESPACES is not set
# CONFIG_SCHED_AUTOGROUP is not set
# CONFIG_SYSFS_DEPRECATED is not set
# CONFIG_RELAY is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_INITRAMFS_SOURCE=""
# CONFIG_RD_GZIP is not set
# CONFIG_RD_BZIP2 is not set
CONFIG_RD_LZMA=y
# CONFIG_RD_XZ is not set
# CONFIG_RD_LZO is not set
# CONFIG_RD_LZ4 is not set
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
CONFIG_SYSCTL=y
CONFIG_ANON_INODES=y
CONFIG_HAVE_UID16=y
CONFIG_BPF=y
CONFIG_EXPERT=y
CONFIG_UID16=

Re: [RESENT PATCH] x86/mem: fix the offset overflow when read/write mem

2017-05-09 Thread zhong jiang
On 2017/5/9 23:46, Rik van Riel wrote:
> On Thu, 2017-05-04 at 10:28 +0800, zhong jiang wrote:
>> On 2017/5/4 2:46, Rik van Riel wrote:
>>> However, it is not as easy as simply checking the
>>> end against __pa(high_memory). Some systems have
>>> non-contiguous physical memory ranges, with gaps
>>> of invalid addresses in-between.
>>  The invalid physical address means that it is used as
>>  io mapped. not in system ram region. /dev/mem is not
>>  access to them , is it right?
> Not necessarily. Some systems simply have large
> gaps in physical memory access. Their memory map
> may look like this:
>
> |MM|IO||..||
>
> Where M is memory, IO is IO space, and the
> dots are simply a gap in physical address
> space with no valid accesses at all.
>
>>> At that point, is the complexity so much that it no
>>> longer makes sense to try to protect against root
>>> crashing the system?
>>>
>>  your suggestion is to let the issue along without any protection.
>>  just root user know what they are doing.
> Well, root already has other ways to crash the system.
>
> Implementing validation on /dev/mem may make sense if
> it can be done in a simple way, but may not be worth
> it if it becomes too complex.
>
 I have no simple way to fix it. Do you have any suggestions? Or could you send
 a patch for me?

 Thanks
 zhongjiang



Re: [RESENT PATCH] x86/mem: fix the offset overflow when read/write mem

2017-05-03 Thread zhong jiang
On 2017/5/4 2:46, Rik van Riel wrote:
> On Tue, 2017-05-02 at 13:54 -0700, David Rientjes wrote:
>
>>> diff --git a/drivers/char/mem.c b/drivers/char/mem.c
>>> index 7e4a9d1..3a765e02 100644
>>> --- a/drivers/char/mem.c
>>> +++ b/drivers/char/mem.c
>>> @@ -55,7 +55,7 @@ static inline int
>> valid_phys_addr_range(phys_addr_t addr, size_t count)
>>>   
>>>   static inline int valid_mmap_phys_addr_range(unsigned long pfn,
>> size_t size)
>>>   {
>>> - return 1;
>>> + return (pfn << PAGE_SHIFT) + size <= __pa(high_memory);
>>>   }
>>>   #endif
>>>   
>> I suppose you are correct that there should be some sanity checking
>> on the 
>> size used for the mmap().
> My apologies for not responding earlier. It may
> indeed make sense to have a sanity check here.
>
> However, it is not as easy as simply checking the
> end against __pa(high_memory). Some systems have
> non-contiguous physical memory ranges, with gaps
> of invalid addresses in-between.
 The invalid physical address means that it is used as
 IO-mapped memory, not in a System RAM region. /dev/mem does not
 access them — is that right?
> You would have to make sure that both the beginning
> and the end are valid, and that there are no gaps of
> invalid pfns in the middle...
 If it is limited to System RAM, we can walk the resources
 to exclude them, or add a pfn_valid check to optimize further.
 Whether other situations should be considered, I am not sure.
> At that point, is the complexity so much that it no
> longer makes sense to try to protect against root
> crashing the system?
>
 Your suggestion is to leave the issue alone without any protection —
 just trust that the root user knows what they are doing.
 
 Thanks
 zhongjiang



Re: [RESENT PATCH] x86/mem: fix the offset overflow when read/write mem

2017-05-02 Thread zhong jiang
On 2017/5/3 4:54, David Rientjes wrote:
> On Thu, 27 Apr 2017, zhongjiang wrote:
>
>> From: zhong jiang 
>>
>> Recently, I found the following issue, it will result in the panic.
>>
>> [  168.739152] mmap1: Corrupted page table at address 7f3e6275a002
>> [  168.745039] PGD 61f4a1067
>> [  168.745040] PUD 61ab19067
>> [  168.747730] PMD 61fb8b067
>> [  168.750418] PTE 80001225
>> [  168.753109]
>> [  168.757795] Bad pagetable: 000d [#1] SMP
>> [  168.761696] Modules linked in: intel_powerclamp coretemp kvm_intel kvm 
>> irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel 
>> crypto_simd iTCO_wdt glue_helper cryptd sg iTCO_vendor_support i7core_edac 
>> edac_core shpchp lpc_ich i2c_i801 pcspkr mfd_core acpi_cpufreq ip_tables xfs 
>> libcrc32c sd_mod igb ata_generic ptp pata_acpi pps_core mptsas ata_piix 
>> scsi_transport_sas i2c_algo_bit libata mptscsih i2c_core serio_raw 
>> crc32c_intel bnx2 mptbase dca dm_mirror dm_region_hash dm_log dm_mod
>> [  168.805983] CPU: 15 PID: 10369 Comm: mmap1 Not tainted 
>> 4.11.0-rc2-327.28.3.53.x86_64+ #345
>> [  168.814202] Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285 
>>  /BC11BTSA  , BIOS CTSAV036 04/27/2011
>> [  168.825704] task: 8806207d5200 task.stack: c9000c34
>> [  168.831592] RIP: 0033:0x7f3e622c5360
>> [  168.835147] RSP: 002b:7ffe2bb7a098 EFLAGS: 00010203
>> [  168.840344] RAX: 7ffe2bb7a0c0 RBX:  RCX: 
>> 7f3e6275a000
>> [  168.847439] RDX: 7f3e622c5360 RSI: 7f3e6275a000 RDI: 
>> 7ffe2bb7a0c0
>> [  168.854535] RBP: 7ffe2bb7a4e0 R08: 7f3e621c3d58 R09: 
>> 002d
>> [  168.861632] R10: 7ffe2bb79e20 R11: 7f3e622fbcb0 R12: 
>> 004005d0
>> [  168.868728] R13: 7ffe2bb7a5c0 R14:  R15: 
>> 
>> [  168.875825] FS:  7f3e62752740() GS:880627bc() 
>> knlGS:
>> [  168.883870] CS:  0010 DS:  ES:  CR0: 80050033
>> [  168.889583] CR2: 7f3e6275a002 CR3: 000622845000 CR4: 
>> 06e0
>> [  168.896680] RIP: 0x7f3e622c5360 RSP: 7ffe2bb7a098
>> [  168.901713] ---[ end trace ef98fa9f2a01cbc6 ]---
>> [  168.90630 arch/x86/kernel/smp.c:127 native_smp_send_reschedule+0x3f/0x50
>> [  168.935410] Modules linked in: intel_powerclamp coretemp kvm_intel kvm 
>> irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel 
>> crypto_simd iTCO_wdt glue_helper cryptd sg iTCO_vendor_support i7core_edac 
>> edac_core shpchp lpc_ich i2c_i801 pcspkr mfd_core acpi_cpufreq ip_tables xfs 
>> libcrc32c sd_mod igb ata_generic ptp pata_acpi pps_core mptsas ata_piix 
>> scsi_transport_sas i2c_algo_bit libata mptscsih i2c_core serio_raw 
>> crc32c_intel bnx2 mptbase dca dm_mirror dm_region_hash dm_log dm_mod
>> [  168.979686] CPU: 15 PID: 10369 Comm: mmap1 Tainted: G  D 
>> 4.11.0-rc2-327.28.3.53.x86_64+ #345
>> [  168.989114] Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285 
>>  /BC11BTSA  , BIOS CTSAV036 04/27/2011
>> [  169.000616] Call Trace:
>> [  169.003049]  
>> [  169.005050]  dump_stack+0x63/0x84
>> [  169.008348]  __warn+0xd1/0xf0
>> [  169.011297]  warn_slowpath_null+0x1d/0x20
>> [  169.015282]  native_smp_send_reschedule+0x3f/0x50
>> [  169.019961]  resched_curr+0xa1/0xc0
>> [  169.023428]  check_preempt_curr+0x70/0x90
>> [  169.027415]  ttwu_do_wakeup+0x1e/0x160
>> [  169.031142]  ttwu_do_activate+0x77/0x80
>> [  169.034956]  try_to_wake_up+0x1c3/0x430
>> [  169.038771]  default_wake_function+0x12/0x20
>> [  169.043019]  __wake_up_common+0x55/0x90
>> [  169.046833]  __wake_up_locked+0x13/0x20
>> [  169.050649]  ep_poll_callback+0xbb/0x240
>> [  169.054550]  __wake_up_common+0x55/0x90
>> [  169.058363]  __wake_up+0x39/0x50
>> [  169.061574]  wake_up_klogd_work_func+0x40/0x60
>> [  169.065994]  irq_work_run_list+0x4d/0x70
>> [  169.069895]  irq_work_tick+0x40/0x50
>> [  169.073452]  update_process_times+0x42/0x60
>> [  169.077612]  tick_periodic+0x2b/0x80
>> [  169.081166]  tick_handle_periodic+0x25/0x70
>> [  169.085326]  local_apic_timer_interrupt+0x35/0x60
>> [  169.090004]  smp_apic_timer_interrupt+0x38/0x50
>> [  169.094507]  apic_timer_interrupt+0x93/0xa0
>> [  169.098667] RIP: 0010:panic+0x1f5/0x239
>> [  169.102480] RSP: :c9000c343dd8 EFLAGS: 0246 ORIG_RAX: 
>> ff10
>> [  169.110010] RAX: 0034 RBX:  RCX: 
>> 000

Re: [RESENT PATCH] x86/mem: fix the offset overflow when read/write mem

2017-05-01 Thread zhong jiang
ping 

anyone has any objections.
On 2017/4/27 19:49, zhongjiang wrote:
> From: zhong jiang 
>
> Recently, I found the following issue, it will result in the panic.
>
> [  168.739152] mmap1: Corrupted page table at address 7f3e6275a002
> [  168.745039] PGD 61f4a1067
> [  168.745040] PUD 61ab19067
> [  168.747730] PMD 61fb8b067
> [  168.750418] PTE 80001225
> [  168.753109]
> [  168.757795] Bad pagetable: 000d [#1] SMP
> [  168.761696] Modules linked in: intel_powerclamp coretemp kvm_intel kvm 
> irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel 
> crypto_simd iTCO_wdt glue_helper cryptd sg iTCO_vendor_support i7core_edac 
> edac_core shpchp lpc_ich i2c_i801 pcspkr mfd_core acpi_cpufreq ip_tables xfs 
> libcrc32c sd_mod igb ata_generic ptp pata_acpi pps_core mptsas ata_piix 
> scsi_transport_sas i2c_algo_bit libata mptscsih i2c_core serio_raw 
> crc32c_intel bnx2 mptbase dca dm_mirror dm_region_hash dm_log dm_mod
> [  168.805983] CPU: 15 PID: 10369 Comm: mmap1 Not tainted 
> 4.11.0-rc2-327.28.3.53.x86_64+ #345
> [  168.814202] Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285  
> /BC11BTSA  , BIOS CTSAV036 04/27/2011
> [  168.825704] task: 8806207d5200 task.stack: c9000c34
> [  168.831592] RIP: 0033:0x7f3e622c5360
> [  168.835147] RSP: 002b:7ffe2bb7a098 EFLAGS: 00010203
> [  168.840344] RAX: 7ffe2bb7a0c0 RBX:  RCX: 
> 7f3e6275a000
> [  168.847439] RDX: 7f3e622c5360 RSI: 7f3e6275a000 RDI: 
> 7ffe2bb7a0c0
> [  168.854535] RBP: 7ffe2bb7a4e0 R08: 7f3e621c3d58 R09: 
> 002d
> [  168.861632] R10: 7ffe2bb79e20 R11: 7f3e622fbcb0 R12: 
> 004005d0
> [  168.868728] R13: 7ffe2bb7a5c0 R14:  R15: 
> 
> [  168.875825] FS:  7f3e62752740() GS:880627bc() 
> knlGS:
> [  168.883870] CS:  0010 DS:  ES:  CR0: 80050033
> [  168.889583] CR2: 7f3e6275a002 CR3: 000622845000 CR4: 
> 06e0
> [  168.896680] RIP: 0x7f3e622c5360 RSP: 7ffe2bb7a098
> [  168.901713] ---[ end trace ef98fa9f2a01cbc6 ]---
> [  168.90630 arch/x86/kernel/smp.c:127 native_smp_send_reschedule+0x3f/0x50
> [  168.935410] Modules linked in: intel_powerclamp coretemp kvm_intel kvm 
> irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel 
> crypto_simd iTCO_wdt glue_helper cryptd sg iTCO_vendor_support i7core_edac 
> edac_core shpchp lpc_ich i2c_i801 pcspkr mfd_core acpi_cpufreq ip_tables xfs 
> libcrc32c sd_mod igb ata_generic ptp pata_acpi pps_core mptsas ata_piix 
> scsi_transport_sas i2c_algo_bit libata mptscsih i2c_core serio_raw 
> crc32c_intel bnx2 mptbase dca dm_mirror dm_region_hash dm_log dm_mod
> [  168.979686] CPU: 15 PID: 10369 Comm: mmap1 Tainted: G  D 
> 4.11.0-rc2-327.28.3.53.x86_64+ #345
> [  168.989114] Hardware name: Huawei Technologies Co., Ltd. Tecal RH2285  
> /BC11BTSA  , BIOS CTSAV036 04/27/2011
> [  169.000616] Call Trace:
> [  169.003049]  
> [  169.005050]  dump_stack+0x63/0x84
> [  169.008348]  __warn+0xd1/0xf0
> [  169.011297]  warn_slowpath_null+0x1d/0x20
> [  169.015282]  native_smp_send_reschedule+0x3f/0x50
> [  169.019961]  resched_curr+0xa1/0xc0
> [  169.023428]  check_preempt_curr+0x70/0x90
> [  169.027415]  ttwu_do_wakeup+0x1e/0x160
> [  169.031142]  ttwu_do_activate+0x77/0x80
> [  169.034956]  try_to_wake_up+0x1c3/0x430
> [  169.038771]  default_wake_function+0x12/0x20
> [  169.043019]  __wake_up_common+0x55/0x90
> [  169.046833]  __wake_up_locked+0x13/0x20
> [  169.050649]  ep_poll_callback+0xbb/0x240
> [  169.054550]  __wake_up_common+0x55/0x90
> [  169.058363]  __wake_up+0x39/0x50
> [  169.061574]  wake_up_klogd_work_func+0x40/0x60
> [  169.065994]  irq_work_run_list+0x4d/0x70
> [  169.069895]  irq_work_tick+0x40/0x50
> [  169.073452]  update_process_times+0x42/0x60
> [  169.077612]  tick_periodic+0x2b/0x80
> [  169.081166]  tick_handle_periodic+0x25/0x70
> [  169.085326]  local_apic_timer_interrupt+0x35/0x60
> [  169.090004]  smp_apic_timer_interrupt+0x38/0x50
> [  169.094507]  apic_timer_interrupt+0x93/0xa0
> [  169.098667] RIP: 0010:panic+0x1f5/0x239
> [  169.102480] RSP: :c9000c343dd8 EFLAGS: 0246 ORIG_RAX: 
> ff10
> [  169.110010] RAX: 0034 RBX:  RCX: 
> 0006
> [  169.117106] RDX:  RSI: 0086 RDI: 
> 880627bcdfe0
> [  169.124201] RBP: c9000c343e48 R08: fffe R09: 
> 0395
> [  169.131298] R10: 0005 R11: 0394 R12: 
> 81a0c475
> [  169.138395] R13:  R14:  R15:

Re: A small window for a race condition in mm/rmap.c:page_lock_anon_vma_read

2017-04-22 Thread zhong jiang
Hi,  Dashi
The same issue has occurred for me every other week. Have you solved it?
 I want to know how it was fixed. Does the patch exist in the mainline?

Thanks
zhongjiang
On 2016/12/23 10:38, Dashi DS1 Cao wrote:
> I'd expected that one or more tasks doing the free were the current task of 
> other cpu cores, but only one of the four dumps has two swapd task that are 
> concurrently at execution on different cpu.
> This is the task leading to the crash:
> PID: 247TASK: 881fcfad8000  CPU: 14  COMMAND: "kswapd1"
>  #0 [881fcfad7978] machine_kexec at 81051e9b
>  #1 [881fcfad79d8] crash_kexec at 810f27e2
>  #2 [881fcfad7aa8] oops_end at 8163f448
>  #3 [881fcfad7ad0] die at 8101859b
>  #4 [881fcfad7b00] do_general_protection at 8163ed3e
>  #5 [881fcfad7b30] general_protection at 8163e5e8
> [exception RIP: down_read_trylock+9]
> RIP: 810aa9f9  RSP: 881fcfad7be0  RFLAGS: 00010286
> RAX:   RBX: 882b47ddadc0  RCX: 
> RDX:   RSI:   RDI: 91550b2b32f5a3e8
> RBP: 881fcfad7be0   R8: ea00ecc28860   R9: 883fcffeae28
> R10: 81a691a0  R11: 0001  R12: 882b47ddadc1
> R13: ea00ecc28840  R14: 91550b2b32f5a3e8  R15: ea00ecc28840
> ORIG_RAX:   CS: 0010  SS: 
>  #6 [881fcfad7be8] page_lock_anon_vma_read at 811a3365
>  #7 [881fcfad7c18] page_referenced at 811a35e7
>  #8 [881fcfad7c90] shrink_active_list at 8117e8cc
>  #9 [881fcfad7d48] balance_pgdat at 81180288
> #10 [881fcfad7e20] kswapd at 81180813
> #11 [881fcfad7ec8] kthread at 810a5b8f
> #12 [881fcfad7f50] ret_from_fork at 81646a98
>
> And this is the one at the same time:
> PID: 246TASK: 881fd27af300  CPU: 20  COMMAND: "kswapd0"
>  #0 [881fffd05e70] crash_nmi_callback at 81045982
>  #1 [881fffd05e80] nmi_handle at 8163f5d9
>  #2 [881fffd05ec8] do_nmi at 8163f6f0
>  #3 [881fffd05ef0] end_repeat_nmi at 8163ea13
> [exception RIP: free_pcppages_bulk+529]
> RIP: 81171ae1  RSP: 881fcfad38f0  RFLAGS: 0087
> RAX: 002f002c  RBX: ea007606ae40  RCX: 
> RDX: ea007606ae00  RSI: 02b9  RDI: 
> RBP: 881fcfad3978   R8:    R9: 0001
> R10: 88207ffda000  R11: 0002  R12: ea007606ae40
> R13: 0002  R14: 88207ffda000  R15: 02b8
> ORIG_RAX:   CS: 0010  SS: 0018
> ---  ---
>  #4 [881fcfad38f0] free_pcppages_bulk at 81171ae1
>  #5 [881fcfad3980] free_hot_cold_page at 81171f08
>  #6 [881fcfad39b8] free_hot_cold_page_list at 81171f76
>  #7 [881fcfad39f0] shrink_page_list at 8117d71e
>  #8 [881fcfad3b28] shrink_inactive_list at 8117e37a
>  #9 [881fcfad3bf0] shrink_lruvec at 8117ee45
> #10 [881fcfad3cf0] shrink_zone at 8117f2a6
> #11 [881fcfad3d48] balance_pgdat at 8118054c
> #12 [881fcfad3e20] kswapd at 81180813
> #13 [881fcfad3ec8] kthread at 810a5b8f
> #14 [881fcfad3f50] ret_from_fork at 81646a98
>
> I hope the information would be useful.
> Dashi Cao
>
> -Original Message-
> From: Hugh Dickins [mailto:hu...@google.com] 
> Sent: Friday, December 23, 2016 6:27 AM
> To: Peter Zijlstra 
> Cc: Michal Hocko ; Dashi DS1 Cao ; 
> linux...@kvack.org; linux-kernel@vger.kernel.org; Hugh Dickins 
> 
> Subject: Re: A small window for a race condition in 
> mm/rmap.c:page_lock_anon_vma_read
>
> On Thu, 22 Dec 2016, Peter Zijlstra wrote:
>> On Wed, Dec 21, 2016 at 03:43:43PM +0100, Michal Hocko wrote:
>>> anon_vma locking is clever^Wsubtle as hell. CC Peter...
>>>
>>> On Tue 20-12-16 09:32:27, Dashi DS1 Cao wrote:
 I've collected four crash dumps with similar backtrace. 

 PID: 247TASK: 881fcfad8000  CPU: 14  COMMAND: "kswapd1"
  #0 [881fcfad7978] machine_kexec at 81051e9b
  #1 [881fcfad79d8] crash_kexec at 810f27e2
  #2 [881fcfad7aa8] oops_end at 8163f448
  #3 [881fcfad7ad0] die at 8101859b
  #4 [881fcfad7b00] do_general_protection at 8163ed3e
  #5 [881fcfad7b30] general_protection at 8163e5e8
 [exception RIP: down_read_trylock+9]
 RIP: 810aa9f9  RSP: 881fcfad7be0  RFLAGS: 00010286
 RAX:   RBX: 882b47ddadc0  RCX: 
 RDX:   RSI:   RDI: 
 91550b2b32f5a3e8
>>> rdi is obviously a mess - smells like a string. So either sombody 
>>> has overwritten root_anon_vma or this is really a use after free...
>> e8 - ???
>> a3 - ???
>> f5 - ???
>> 32 - 2
>> 2b - +
>>  b -
>>
>> 55 -

Re: [PATCH 2/5] mm: convert anon_vma.refcount from atomic_t to refcount_t

2017-04-22 Thread zhong jiang
Hi, Elean

Did the issue — a use-after-free — really occur? If so, why was the patch
 not accepted? Or is that situation actually possible?

Thanks
zhongjiang
On 2017/2/20 18:49, Elena Reshetova wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
>
> Signed-off-by: Elena Reshetova 
> Signed-off-by: Hans Liljestrand 
> Signed-off-by: Kees Cook 
> Signed-off-by: David Windsor 
> ---
>  include/linux/rmap.h |  7 ---
>  mm/rmap.c| 14 +++---
>  2 files changed, 11 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 8c89e90..a8f4a97 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -10,6 +10,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  /*
>   * The anon_vma heads a list of private "related" vmas, to scan if
> @@ -35,7 +36,7 @@ struct anon_vma {
>* the reference is responsible for clearing up the
>* anon_vma if they are the last user on release
>*/
> - atomic_t refcount;
> + refcount_t refcount;
>  
>   /*
>* Count of child anon_vmas and VMAs which points to this anon_vma.
> @@ -102,14 +103,14 @@ enum ttu_flags {
>  #ifdef CONFIG_MMU
>  static inline void get_anon_vma(struct anon_vma *anon_vma)
>  {
> - atomic_inc(&anon_vma->refcount);
> + refcount_inc(&anon_vma->refcount);
>  }
>  
>  void __put_anon_vma(struct anon_vma *anon_vma);
>  
>  static inline void put_anon_vma(struct anon_vma *anon_vma)
>  {
> - if (atomic_dec_and_test(&anon_vma->refcount))
> + if (refcount_dec_and_test(&anon_vma->refcount))
>   __put_anon_vma(anon_vma);
>  }
>  
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 8774791..3321c86 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -77,7 +77,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
>  
>   anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
>   if (anon_vma) {
> - atomic_set(&anon_vma->refcount, 1);
> + refcount_set(&anon_vma->refcount, 1);
>   anon_vma->degree = 1;   /* Reference for first vma */
>   anon_vma->parent = anon_vma;
>   /*
> @@ -92,7 +92,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
>  
>  static inline void anon_vma_free(struct anon_vma *anon_vma)
>  {
> - VM_BUG_ON(atomic_read(&anon_vma->refcount));
> + VM_BUG_ON(refcount_read(&anon_vma->refcount));
>  
>   /*
>* Synchronize against page_lock_anon_vma_read() such that
> @@ -421,7 +421,7 @@ static void anon_vma_ctor(void *data)
>   struct anon_vma *anon_vma = data;
>  
>   init_rwsem(&anon_vma->rwsem);
> - atomic_set(&anon_vma->refcount, 0);
> + refcount_set(&anon_vma->refcount, 0);
>   anon_vma->rb_root = RB_ROOT;
>  }
>  
> @@ -470,7 +470,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
>   goto out;
>  
>   anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
> - if (!atomic_inc_not_zero(&anon_vma->refcount)) {
> + if (!refcount_inc_not_zero(&anon_vma->refcount)) {
>   anon_vma = NULL;
>   goto out;
>   }
> @@ -529,7 +529,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page 
> *page)
>   }
>  
>   /* trylock failed, we got to sleep */
> - if (!atomic_inc_not_zero(&anon_vma->refcount)) {
> + if (!refcount_inc_not_zero(&anon_vma->refcount)) {
>   anon_vma = NULL;
>   goto out;
>   }
> @@ -544,7 +544,7 @@ struct anon_vma *page_lock_anon_vma_read(struct page 
> *page)
>   rcu_read_unlock();
>   anon_vma_lock_read(anon_vma);
>  
> - if (atomic_dec_and_test(&anon_vma->refcount)) {
> + if (refcount_dec_and_test(&anon_vma->refcount)) {
>   /*
>* Oops, we held the last refcount, release the lock
>* and bail -- can't simply use put_anon_vma() because
> @@ -1577,7 +1577,7 @@ void __put_anon_vma(struct anon_vma *anon_vma)
>   struct anon_vma *root = anon_vma->root;
>  
>   anon_vma_free(anon_vma);
> - if (root != anon_vma && atomic_dec_and_test(&root->refcount))
> + if (root != anon_vma && refcount_dec_and_test(&root->refcount))
>   anon_vma_free(root);
>  }
>  



Re: NULL pointer dereference in the kernel 3.10

2017-04-10 Thread zhong jiang
On 2017/4/10 22:13, Willy Tarreau wrote:
> On Mon, Apr 10, 2017 at 10:06:59PM +0800, zhong jiang wrote:
>> On 2017/4/10 20:48, Michal Hocko wrote:
>>> On Mon 10-04-17 20:10:06, zhong jiang wrote:
>>>> On 2017/4/10 16:56, Mel Gorman wrote:
>>>>> On Sat, Apr 08, 2017 at 09:39:42PM +0800, zhong jiang wrote:
>>>>>> when runing the stabile docker cases in the vm.   The following issue 
>>>>>> will come up.
>>>>>>
>>>>>> #40 [8801b57ffb30] async_page_fault at 8165c9f8
>>>>>> [exception RIP: down_read_trylock+5]
>>>>>> RIP: 810aca65  RSP: 8801b57ffbe8  RFLAGS: 00010202
>>>>>> RAX:   RBX: 88018ae858c1  RCX: 
>>>>>> RDX:   RSI:   RDI: 0008
>>>>>> RBP: 8801b57ffc10   R8: ea0006903de0   R9: 8800b3c61810
>>>>>> R10: 22cb  R11:   R12: 88018ae858c0
>>>>>> R13: ea0006903dc0  R14: 0008  R15: ea0006903dc0
>>>>>> ORIG_RAX:   CS: 0010  SS: 
>>>>> Post the full report including the kernel version and state whether any
>>>>> additional patches to 3.10 are applied.
>>>>>
>>>>  Hi, Mel
>>>>
>>>> Our kernel from RHEL 7.2, Addtional patches all from upstream -- 
>>>> include Bugfix and CVE.
>>> I believe you should contact Redhat for the support. This is a) old
>>> kernel and b) with other patches which might or might not be relevant.
>>   Ok, regardless of the kernel version, we just discuss the situation in 
>> theory.  if commit
>>   624483f3ea8  ("mm: rmap: fix use-after-free in __put_anon_vma")  is not 
>> exist. the issue
>>  will trigger . Any thought.
> But this commit was backported into 3.10.43, so stable kernel users are safe.
>
> Regards,
> Willy
>
> .
  Yes — so you are sure that the commit can fix the issue?





Re: NULL pointer dereference in the kernel 3.10

2017-04-10 Thread zhong jiang
On 2017/4/10 22:06, Mel Gorman wrote:
> On Mon, Apr 10, 2017 at 08:10:06PM +0800, zhong jiang wrote:
>> On 2017/4/10 16:56, Mel Gorman wrote:
>>> On Sat, Apr 08, 2017 at 09:39:42PM +0800, zhong jiang wrote:
>>>> when runing the stabile docker cases in the vm.   The following issue will 
>>>> come up.
>>>>
>>>> #40 [8801b57ffb30] async_page_fault at 8165c9f8
>>>> [exception RIP: down_read_trylock+5]
>>>> RIP: 810aca65  RSP: 8801b57ffbe8  RFLAGS: 00010202
>>>> RAX:   RBX: 88018ae858c1  RCX: 
>>>> RDX:   RSI:   RDI: 0008
>>>> RBP: 8801b57ffc10   R8: ea0006903de0   R9: 8800b3c61810
>>>> R10: 22cb  R11:   R12: 88018ae858c0
>>>> R13: ea0006903dc0  R14: 0008  R15: ea0006903dc0
>>>> ORIG_RAX:   CS: 0010  SS: 
>>> Post the full report including the kernel version and state whether any
>>> additional patches to 3.10 are applied.
>>>
>>  Hi, Mel
>>
>> Our kernel from RHEL 7.2, Addtional patches all from upstream -- 
>> include Bugfix and CVE.
>>
>> Commit 624483f3ea8 ("mm: rmap: fix use-after-free in __put_anon_vma") 
>> exclude in
>> the RHEL 7.2. it looks seems to the issue. but I don't know how it triggered.
>> or it is not the correct fix.  Any suggestion? Thanks
>>
> I'm afraid you'll need to bring it up with RHEL support as it contains
> a number of backported patches from them that cannot be meaningfully
> evaluated outside of RedHat and they may have additional questions on the
> patches applied on top.
>
 Thanks



Re: NULL pointer dereference in the kernel 3.10

2017-04-10 Thread zhong jiang
On 2017/4/10 20:48, Michal Hocko wrote:
> On Mon 10-04-17 20:10:06, zhong jiang wrote:
>> On 2017/4/10 16:56, Mel Gorman wrote:
>>> On Sat, Apr 08, 2017 at 09:39:42PM +0800, zhong jiang wrote:
>>>> when runing the stabile docker cases in the vm.   The following issue will 
>>>> come up.
>>>>
>>>> #40 [8801b57ffb30] async_page_fault at 8165c9f8
>>>> [exception RIP: down_read_trylock+5]
>>>> RIP: 810aca65  RSP: 8801b57ffbe8  RFLAGS: 00010202
>>>> RAX:   RBX: 88018ae858c1  RCX: 
>>>> RDX:   RSI:   RDI: 0008
>>>> RBP: 8801b57ffc10   R8: ea0006903de0   R9: 8800b3c61810
>>>> R10: 22cb  R11:   R12: 88018ae858c0
>>>> R13: ea0006903dc0  R14: 0008  R15: ea0006903dc0
>>>> ORIG_RAX:   CS: 0010  SS: 
>>> Post the full report including the kernel version and state whether any
>>> additional patches to 3.10 are applied.
>>>
>>  Hi, Mel
>>
>> Our kernel from RHEL 7.2, Addtional patches all from upstream -- 
>> include Bugfix and CVE.
> I believe you should contact Redhat for the support. This is a) old
> kernel and b) with other patches which might or might not be relevant.
  Ok, regardless of the kernel version, let us just discuss the situation in
theory: if commit
  624483f3ea8  ("mm: rmap: fix use-after-free in __put_anon_vma")  did not
exist, the issue
 would trigger. Any thoughts?

Thanks
zhongjiang 



Re: NULL pointer dereference in the kernel 3.10

2017-04-10 Thread zhong jiang
On 2017/4/10 16:56, Mel Gorman wrote:
> On Sat, Apr 08, 2017 at 09:39:42PM +0800, zhong jiang wrote:
>> when runing the stabile docker cases in the vm.   The following issue will 
>> come up.
>>
>> #40 [8801b57ffb30] async_page_fault at 8165c9f8
>> [exception RIP: down_read_trylock+5]
>> RIP: 810aca65  RSP: 8801b57ffbe8  RFLAGS: 00010202
>> RAX:   RBX: 88018ae858c1  RCX: 
>> RDX:   RSI:   RDI: 0008
>> RBP: 8801b57ffc10   R8: ea0006903de0   R9: 8800b3c61810
>> R10: 22cb  R11:   R12: 88018ae858c0
>> R13: ea0006903dc0  R14: 0008  R15: ea0006903dc0
>> ORIG_RAX:   CS: 0010  SS: 
> Post the full report including the kernel version and state whether any
> additional patches to 3.10 are applied.
>
 Hi, Mel
   
Our kernel is from RHEL 7.2; additional patches are all from upstream --
including bug fixes and CVE fixes.

Commit 624483f3ea8 ("mm: rmap: fix use-after-free in __put_anon_vma") is excluded from
RHEL 7.2. It looks related to the issue, but I don't know how it is triggered,
or whether it is the correct fix.  Any suggestions? Thanks


partly dmesg will print in the following.

[59982.162223] EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: 
(null)
[59985.261635] device-mapper: ioctl: remove_all left 8 open device(s)
[59986.492174] EXT4-fs (dm-5): mounted filesystem with ordered data mode. Opts: 
(null)
[59987.445606] device-mapper: ioctl: remove_all left 8 open device(s)
[59987.625887] EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: 
(null)
[59988.174600] device-mapper: ioctl: remove_all left 8 open device(s)
[59988.345667] EXT4-fs (dm-5): mounted filesystem with ordered data mode. Opts: 
(null)
[59990.951713] EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: 
(null)
[59991.025185] device vethd295793 entered promiscuous mode
[59991.025253] IPv6: ADDRCONF(NETDEV_UP): vethd295793: link is not ready
[59991.860817] IPv6: ADDRCONF(NETDEV_CHANGE): vethd295793: link becomes ready
[59991.860836] docker0: port 4(vethd295793) entered forwarding state
[59991.860840] docker0: port 4(vethd295793) entered forwarding state
[59992.704027] docker0: port 4(vethd295793) entered disabled state
[59992.724049] EXT4-fs (dm-9): mounted filesystem with ordered data mode. Opts: 
(null)
[59993.098341] docker0: port 4(vethd295793) entered disabled state
[59993.102583] device vethd295793 left promiscuous mode
[59993.102605] docker0: port 4(vethd295793) entered disabled state
[59995.109048] EXT4-fs (dm-5): mounted filesystem with ordered data mode. Opts: 
(null)
[59995.229390] docker0: port 2(veth2ad76e2) entered disabled state
[59995.523997] docker0: port 2(veth2ad76e2) entered disabled state
[59995.528183] device veth2ad76e2 left promiscuous mode
[59995.528202] docker0: port 2(veth2ad76e2) entered disabled state
[59995.975559] device-mapper: ioctl: remove_all left 8 open device(s)
[59996.084575] EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: 
(null)
[59996.660641] device-mapper: ioctl: remove_all left 7 open device(s)
[59997.109018] EXT4-fs (dm-4): mounted filesystem with ordered data mode. Opts: 
(null)
[59998.360101] EXT4-fs (dm-5): mounted filesystem with ordered data mode. Opts: 
(null)
[60001.721429] EXT4-fs (dm-6): mounted filesystem with ordered data mode. Opts: 
(null)
[60001.771433] device vethcca3b6a entered promiscuous mode
[60001.771643] IPv6: ADDRCONF(NETDEV_UP): vethcca3b6a: link is not ready
[60002.872102] IPv6: ADDRCONF(NETDEV_CHANGE): vethcca3b6a: link becomes ready
[60002.872124] docker0: port 2(vethcca3b6a) entered forwarding state
[60002.872130] docker0: port 2(vethcca3b6a) entered forwarding state
[60005.041654] EXT4-fs (dm-5): mounted filesystem with ordered data mode. Opts: 
(null)
[60005.597179] EXT4-fs (dm-5): mounted filesystem with ordered data mode. Opts: 
(null)
[60013.731728] [/usr/bin/os_rotate_and_save_log.sh]space of output directory is 
larger than 500M bytes,delete the oldest tar file 
messages-20170321181104-129.tar.bz2
[60016.243601] EXT4-fs (dm-5): mounted filesystem with ordered data mode. Opts: 
(null)
[60016.669594] device-mapper: ioctl: remove_all left 9 open device(s)
[60016.930232] EXT4-fs (dm-9): mounted filesystem with ordered data mode. Opts: 
(null)
[60017.918511] docker0: port 2(vethcca3b6a) entered forwarding state
[60022.197574] device-mapper: ioctl: remove_all left 8 open device(s)
[60022.575774] EXT4-fs (dm-4): mounted filesystem with ordered data mode. Opts: 
(null)
[60023.288744] EXT4-fs (dm-5): mounted filesystem with ordered data mode. Opts: 
(null)
[60024.282579] device-mapper: ioctl: remove_all left 8 open device(s)
[60024.505905] EXT4-fs (dm-4): mounted filesystem with ordered data mode. Opts: 
(null)
[60024.934311] EX

NULL pointer dereference in the kernel 3.10

2017-04-08 Thread zhong jiang
when runing the stabile docker cases in the vm.   The following issue will come 
up.

#40 [8801b57ffb30] async_page_fault at 8165c9f8
[exception RIP: down_read_trylock+5]
RIP: 810aca65  RSP: 8801b57ffbe8  RFLAGS: 00010202
RAX:   RBX: 88018ae858c1  RCX: 
RDX:   RSI:   RDI: 0008
RBP: 8801b57ffc10   R8: ea0006903de0   R9: 8800b3c61810
R10: 22cb  R11:   R12: 88018ae858c0
R13: ea0006903dc0  R14: 0008  R15: ea0006903dc0
ORIG_RAX:   CS: 0010  SS: 
#41 [8801b57ffbe8] page_lock_anon_vma_read at 811b241c
#42 [8801b57ffc18] page_referenced at 811b26a7
#43 [8801b57ffc90] shrink_active_list at 8118d634
#44 [8801b57ffd48] balance_pgdat at 8118f088
#45 [8801b57ffe20] kswapd at 8118f633
#46 [8801b57ffec8] kthread at 810a795f
#47 [8801b57fff50] ret_from_fork at 81665398
crash> struct page.mapping ea0006903dc0
  mapping = 0x88018ae858c1
crash> struct anon_vma 0x88018ae858c0
struct anon_vma {
  root = 0x0,
  rwsem = {
count = 0,
wait_lock = {
  raw_lock = {
{
  head_tail = 1,
  tickets = {
head = 1,
tail = 0
  }
}
  }
},
wait_list = {
  next = 0x0,
  prev = 0x0
}
  },
  refcount = {
counter = 0
  },
  rb_root = {
rb_node = 0x0
  }
}

This makes me wonder: the anon_vma does not appear to come from the slab structure,
and its content is abnormal. IMO, at least anon_vma->root should not be NULL.
The issue can be reproduced every other week.

Any comments will be appreciated.

Thanks
zhongjiang





Re: [PATCH] mm: do not export ioremap_page_range symbol for external module

2017-01-23 Thread zhong jiang
On 2017/1/23 9:30, John Hubbard wrote:
>
>
> On 01/22/2017 05:14 PM, zhong jiang wrote:
>> On 2017/1/22 20:58, zhongjiang wrote:
>>> From: zhong jiang 
>>>
>>> Recently, I find the ioremap_page_range had been abusing. The improper
>>> address mapping is a issue. it will result in the crash. so, remove
>>> the symbol. It can be replaced by the ioremap_cache or others symbol.
>>>
>>> Signed-off-by: zhong jiang 
>>> ---
>>>  lib/ioremap.c | 1 -
>>>  1 file changed, 1 deletion(-)
>>>
>>> diff --git a/lib/ioremap.c b/lib/ioremap.c
>>> index 86c8911..a3e14ce 100644
>>> --- a/lib/ioremap.c
>>> +++ b/lib/ioremap.c
>>> @@ -144,4 +144,3 @@ int ioremap_page_range(unsigned long addr,
>>>
>>>  return err;
>>>  }
>>> -EXPORT_SYMBOL_GPL(ioremap_page_range);
>> self nack
>>
>
> heh. What changed your mind?
>
  Very sorry — my mistake: our own kernel modules call it directly.  Thank you for reviewing
  the patch. I will take your changelog and send it in v2.

  Thanks
  zhongjiang
> .
>




Re: [PATCH] mm: do not export ioremap_page_range symbol for external module

2017-01-22 Thread zhong jiang
On 2017/1/22 20:58, zhongjiang wrote:
> From: zhong jiang 
>
> Recently, I find the ioremap_page_range had been abusing. The improper
> address mapping is a issue. it will result in the crash. so, remove
> the symbol. It can be replaced by the ioremap_cache or others symbol.
>
> Signed-off-by: zhong jiang 
> ---
>  lib/ioremap.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/lib/ioremap.c b/lib/ioremap.c
> index 86c8911..a3e14ce 100644
> --- a/lib/ioremap.c
> +++ b/lib/ioremap.c
> @@ -144,4 +144,3 @@ int ioremap_page_range(unsigned long addr,
>  
>   return err;
>  }
> -EXPORT_SYMBOL_GPL(ioremap_page_range);
self nack



Re: [RESEND PATCH 2/2] arm64: make WANT_HUGE_PMD_SHARE depends on HUGETLB_PAGE

2016-12-16 Thread zhong jiang
On 2016/12/16 20:35, Will Deacon wrote:
> On Fri, Dec 16, 2016 at 05:10:05PM +0800, zhong jiang wrote:
>> On 2016/12/14 22:19, zhongjiang wrote:
>>> From: zhong jiang 
>>>
>>> when HUGETLB_PAGE is disable, WANT_HUGE_PMD_SHARE contains the
>>> fuctions should not be use. therefore, we add the dependency.
>>>
>>> Signed-off-by: zhong jiang 
>>> ---
>>>  arch/arm64/Kconfig | 1 +
>>>  1 file changed, 1 insertion(+)
>>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index 969ef88..694ca73 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -640,6 +640,7 @@ config SYS_SUPPORTS_HUGETLBFS
>>>  
>>>  config ARCH_WANT_HUGE_PMD_SHARE
>>> def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
>>> +   depends on HUGETLB_PAGE
>>>  
>>>  config ARCH_HAS_CACHE_LINE_SIZE
>>> def_bool y
>> Hi,
>>   I still think it is a issue. Perhaps above changelog is unclear.  
>> Further explain is as follows.
>>  when hugetlb_pages is disable and arch_want_huge_pmd_share is enable,   we 
>> maybe call
>>  huge_pmd_sahre in hugetlbpage.c, but the function actually is not 
>> definition as it is not
>>  exported.  is it right ??
> The only users of ARCH_WANT_HUGE_PMD_SHARE on arm64 are:
>
> arch/arm64/mm/hugetlbpage.c:if 
> (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> mm/hugetlb.c:#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
> mm/hugetlb.c:#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
> mm/hugetlb.c:#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
>
> and neither of those files are compiled if !HUGETLB_PAGE.
>
> Are you actually seeing a problem here?
>
> Will
>
> .
>
  I got it.  Please forget that misguided patch and my comments.
 
 But the first patch looks reasonable.  Has it been accepted?

 Thanks
 zhongjiang



Re: [RESEND PATCH 2/2] arm64: make WANT_HUGE_PMD_SHARE depends on HUGETLB_PAGE

2016-12-16 Thread zhong jiang
On 2016/12/16 20:35, Will Deacon wrote:
> On Fri, Dec 16, 2016 at 05:10:05PM +0800, zhong jiang wrote:
>> On 2016/12/14 22:19, zhongjiang wrote:
>>> From: zhong jiang 
>>>
>>> when HUGETLB_PAGE is disable, WANT_HUGE_PMD_SHARE contains the
>>> fuctions should not be use. therefore, we add the dependency.
>>>
>>> Signed-off-by: zhong jiang 
>>> ---
>>>  arch/arm64/Kconfig | 1 +
>>>  1 file changed, 1 insertion(+)
>>>
>>> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
>>> index 969ef88..694ca73 100644
>>> --- a/arch/arm64/Kconfig
>>> +++ b/arch/arm64/Kconfig
>>> @@ -640,6 +640,7 @@ config SYS_SUPPORTS_HUGETLBFS
>>>  
>>>  config ARCH_WANT_HUGE_PMD_SHARE
>>> def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
>>> +   depends on HUGETLB_PAGE
>>>  
>>>  config ARCH_HAS_CACHE_LINE_SIZE
>>> def_bool y
>> Hi,
>>   I still think it is a issue. Perhaps above changelog is unclear.  
>> Further explain is as follows.
>>  when hugetlb_pages is disable and arch_want_huge_pmd_share is enable,   we 
>> maybe call
>>  huge_pmd_sahre in hugetlbpage.c, but the function actually is not 
>> definition as it is not
>>  exported.  is it right ??
> The only users of ARCH_WANT_HUGE_PMD_SHARE on arm64 are:
>
> arch/arm64/mm/hugetlbpage.c:if 
> (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
  yes
> mm/hugetlb.c:#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
> mm/hugetlb.c:#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
> mm/hugetlb.c:#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
>
> and neither of those files are compiled if !HUGETLB_PAGE.
  Indeed, but the only users may still use it if !HUGETLB_PAGE,
  because include/linux/hugetlb.h cannot export the reference if
!HUGETLB_PAGE.
  IOW, the function has no definition.

  Is that right, or am I missing the key point?
> Are you actually seeing a problem here?
  I found the issue while reviewing the code.

 Thanks
 zhongjiang
> Will
>
> .
>




Re: [RESEND PATCH 2/2] arm64: make WANT_HUGE_PMD_SHARE depends on HUGETLB_PAGE

2016-12-16 Thread zhong jiang
On 2016/12/14 22:19, zhongjiang wrote:
> From: zhong jiang 
>
> when HUGETLB_PAGE is disable, WANT_HUGE_PMD_SHARE contains the
> fuctions should not be use. therefore, we add the dependency.
>
> Signed-off-by: zhong jiang 
> ---
>  arch/arm64/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 969ef88..694ca73 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -640,6 +640,7 @@ config SYS_SUPPORTS_HUGETLBFS
>  
>  config ARCH_WANT_HUGE_PMD_SHARE
>   def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
> + depends on HUGETLB_PAGE
>  
>  config ARCH_HAS_CACHE_LINE_SIZE
>   def_bool y
Hi,
  I still think this is an issue. Perhaps the above changelog is unclear; a
further explanation follows.
 When HUGETLB_PAGE is disabled and ARCH_WANT_HUGE_PMD_SHARE is enabled, we
may call
 huge_pmd_share in hugetlbpage.c, but the function is actually not defined,
as it is not
 exported. Is that right?

Thanks
zhongjiang



Re: [RESEND PATCH 1/2] arm64: change from CONT_PMD_SHIFT to CONT_PTE_SHIFT

2016-12-14 Thread zhong jiang
On 2016/12/14 22:45, Ard Biesheuvel wrote:
> On 14 December 2016 at 14:19, zhongjiang  wrote:
>> From: zhong jiang 
>>
>> I think that CONT_PTE_SHIFT is more reasonable even if they are some
>> value. and the patch is not any functional change.
>>
> This may be the case for 64k pages, but not for 16k pages, and I
> actually think add_default_hugepagesz() could be made unconditional,
> given that both 64k on 4k kernels and 2 MB on 16k kernels are useful
> hugepage sizes that are not otherwise available by default.
 I agree that we can make add_default_hugepagesz() unconditional,
 but I do not know the historical reason why it was done that way. The patch
 is just based on the current kernel.

 by the way, please review the second patch if you have time. Any comment
 will be welcomed.

 Thanks
 zhongjiang
>> Signed-off-by: zhong jiang 
> Reviewed-by: Ard Biesheuvel 
>
>> ---
>>  arch/arm64/mm/hugetlbpage.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
>> index 2e49bd2..0a4c97b 100644
>> --- a/arch/arm64/mm/hugetlbpage.c
>> +++ b/arch/arm64/mm/hugetlbpage.c
>> @@ -323,7 +323,7 @@ static __init int setup_hugepagesz(char *opt)
>>  static __init int add_default_hugepagesz(void)
>>  {
>> if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL)
>> -   hugetlb_add_hstate(CONT_PMD_SHIFT);
>> +   hugetlb_add_hstate(CONT_PTE_SHIFT);
>> return 0;
>>  }
>>  arch_initcall(add_default_hugepagesz);
>> --
>> 1.8.3.1
>>
> .
>




Re: [PATCH v2] kexec: add cond_resched into kimage_alloc_crash_control_pages

2016-12-08 Thread zhong jiang
On 2016/12/9 13:19, Eric W. Biederman wrote:
> zhong jiang  writes:
>
>> On 2016/12/8 17:41, Xunlei Pang wrote:
>>> On 12/08/2016 at 10:37 AM, zhongjiang wrote:
>>>> From: zhong jiang 
>>>>
> [snip]
>>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>>> index 5616755..bfc9621 100644
>>>> --- a/kernel/kexec_core.c
>>>> +++ b/kernel/kexec_core.c
>>>> @@ -441,6 +441,8 @@ static struct page 
>>>> *kimage_alloc_crash_control_pages(struct kimage *image,
>>>>while (hole_end <= crashk_res.end) {
>>>>unsigned long i;
>>>>  
>>>> +  cond_resched();
>>>> +
>>> I can't see why it would take a long time to loop inside, the job it does 
>>> is simply to find a control area
>>> not overlapped with image->segment[], you can see the loop "for (i = 0; i < 
>>> image->nr_segments; i++)",
>>> @hole_end will be advanced to the end of its next nearby segment once 
>>> overlap was detected each loop,
>>> also there are limited (<=16) segments, so it won't take long to locate the 
>>> right area.
>>>
>>> Am I missing something?
>>>
>>> Regards,
>>> Xunlei
>>   if the crashkernel = auto is set in cmdline.  it represent crashk_res.end 
>> will exceed to 4G, the first allocate control pages will
>>   loop  million times. if we set crashk_res.end to the higher value
>>   manually,  you can image
> Or in short the cond_resched is about keeping things reasonable when the
> loop has worst case behavior.
>
> Eric
>
>
  Yes,   Thank you reply and comment.

  Regards,
  zhongjiang



Re: [PATCH v2] kexec: add cond_resched into kimage_alloc_crash_control_pages

2016-12-08 Thread zhong jiang
On 2016/12/8 17:41, Xunlei Pang wrote:
> On 12/08/2016 at 10:37 AM, zhongjiang wrote:
>> From: zhong jiang 
>>
>> A soft lookup will occur when I run trinity in syscall kexec_load.
>> the corresponding stack information is as follows.
>>
>> [  237.235937] BUG: soft lockup - CPU#6 stuck for 22s! [trinity-c6:13859]
>> [  237.242699] Kernel panic - not syncing: softlockup: hung tasks
>> [  237.248573] CPU: 6 PID: 13859 Comm: trinity-c6 Tainted: G   O L 
>> V---   3.10.0-327.28.3.35.zhongjiang.x86_64 #1
>> [  237.259984] Hardware name: Huawei Technologies Co., Ltd. Tecal BH622 
>> V2/BC01SRSA0, BIOS RMIBV386 06/30/2014
>> [  237.269752]  8187626b 18cfde31 88184c803e18 
>> 81638f16
>> [  237.277471]  88184c803e98 8163278f 0008 
>> 88184c803ea8
>> [  237.285190]  88184c803e48 18cfde31 88184c803e67 
>> 
>> [  237.292909] Call Trace:
>> [  237.295404][] dump_stack+0x19/0x1b
>> [  237.301352]  [] panic+0xd8/0x214
>> [  237.306196]  [] watchdog_timer_fn+0x1cc/0x1e0
>> [  237.312157]  [] ? watchdog_enable+0xc0/0xc0
>> [  237.317955]  [] __hrtimer_run_queues+0xd2/0x260
>> [  237.324087]  [] hrtimer_interrupt+0xb0/0x1e0
>> [  237.329963]  [] ? call_softirq+0x1c/0x30
>> [  237.335500]  [] local_apic_timer_interrupt+0x37/0x60
>> [  237.342228]  [] smp_apic_timer_interrupt+0x3f/0x60
>> [  237.348771]  [] apic_timer_interrupt+0x6d/0x80
>> [  237.354967][] ? 
>> kimage_alloc_control_pages+0x80/0x270
>> [  237.362875]  [] ? kmem_cache_alloc_trace+0x1ce/0x1f0
>> [  237.369592]  [] ? do_kimage_alloc_init+0x1f/0x90
>> [  237.375992]  [] kimage_alloc_init+0x12a/0x180
>> [  237.382103]  [] SyS_kexec_load+0x20a/0x260
>> [  237.387957]  [] system_call_fastpath+0x16/0x1b
>>
>> the first time allocate control pages may take too much time because
>> crash_res.end can be set to a higher value. we need to add cond_resched
>> to avoid the issue.
>>
>> The patch have been tested and above issue is not appear.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  kernel/kexec_core.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index 5616755..bfc9621 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -441,6 +441,8 @@ static struct page 
>> *kimage_alloc_crash_control_pages(struct kimage *image,
>>  while (hole_end <= crashk_res.end) {
>>  unsigned long i;
>>  
>> +cond_resched();
>> +
> I can't see why it would take a long time to loop inside, the job it does is 
> simply to find a control area
> not overlapped with image->segment[], you can see the loop "for (i = 0; i < 
> image->nr_segments; i++)",
> @hole_end will be advanced to the end of its next nearby segment once overlap 
> was detected each loop,
> also there are limited (<=16) segments, so it won't take long to locate the 
> right area.
>
> Am I missing something?
>
> Regards,
> Xunlei
  If crashkernel=auto is set on the cmdline, crashk_res.end
can exceed 4G, and the first control-page allocation will
  loop millions of times. If we set crashk_res.end to a higher value manually,
you can imagine the effect.

  Thanks
  zhongjiang
>>  if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
>>  break;
>>  /* See if I overlap any of the segments */
>
> .
>




Re: [PATCH] kexec: add cond_resched into kimage_alloc_crash_control_pages

2016-12-07 Thread zhong jiang
On 2016/12/8 9:50, Eric W. Biederman wrote:
> zhongjiang  writes:
>
>> From: zhong jiang 
>>
>> A soft lookup will occur when I run trinity in syscall kexec_load.
>> the corresponding stack information is as follows.
> Overall that looks reasonable.  Why only every 256 page and not call
> cond_resched unconditionally?
>
> The function cond_resched won't reschedule unless the process has spent
> it's cpu quota anyway.
  The value was just from testing. I mistakenly thought it would reschedule
  immediately. Calling cond_resched unconditionally is a good choice; if you
  accept the change, I will resend the patch.
> Eric
>
>> [  237.235937] BUG: soft lockup - CPU#6 stuck for 22s! [trinity-c6:13859]
>> [  237.242699] Kernel panic - not syncing: softlockup: hung tasks
>> [  237.248573] CPU: 6 PID: 13859 Comm: trinity-c6 Tainted: G   O L 
>> V---   3.10.0-327.28.3.35.zhongjiang.x86_64 #1
>> [  237.259984] Hardware name: Huawei Technologies Co., Ltd. Tecal BH622 
>> V2/BC01SRSA0, BIOS RMIBV386 06/30/2014
>> [  237.269752]  8187626b 18cfde31 88184c803e18 
>> 81638f16
>> [  237.277471]  88184c803e98 8163278f 0008 
>> 88184c803ea8
>> [  237.285190]  88184c803e48 18cfde31 88184c803e67 
>> 
>> [  237.292909] Call Trace:
>> [  237.295404][] dump_stack+0x19/0x1b
>> [  237.301352]  [] panic+0xd8/0x214
>> [  237.306196]  [] watchdog_timer_fn+0x1cc/0x1e0
>> [  237.312157]  [] ? watchdog_enable+0xc0/0xc0
>> [  237.317955]  [] __hrtimer_run_queues+0xd2/0x260
>> [  237.324087]  [] hrtimer_interrupt+0xb0/0x1e0
>> [  237.329963]  [] ? call_softirq+0x1c/0x30
>> [  237.335500]  [] local_apic_timer_interrupt+0x37/0x60
>> [  237.342228]  [] smp_apic_timer_interrupt+0x3f/0x60
>> [  237.348771]  [] apic_timer_interrupt+0x6d/0x80
>> [  237.354967][] ? 
>> kimage_alloc_control_pages+0x80/0x270
>> [  237.362875]  [] ? kmem_cache_alloc_trace+0x1ce/0x1f0
>> [  237.369592]  [] ? do_kimage_alloc_init+0x1f/0x90
>> [  237.375992]  [] kimage_alloc_init+0x12a/0x180
>> [  237.382103]  [] SyS_kexec_load+0x20a/0x260
>> [  237.387957]  [] system_call_fastpath+0x16/0x1b
>>
>> the first time allocate control pages may take too much time because
>> crash_res.end can be set to a higher value. we need to add cond_resched
>> to avoid the issue.
>>
>> The patch have been tested and above issue is not appear.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  kernel/kexec_core.c | 4 
>>  1 file changed, 4 insertions(+)
>>
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index 5616755..2b43cc5 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -433,6 +433,7 @@ static struct page 
>> *kimage_alloc_crash_control_pages(struct kimage *image,
>>   */
>>  unsigned long hole_start, hole_end, size;
>>  struct page *pages;
>> +unsigned long count = 0;
>>  
>>  pages = NULL;
>>  size = (1 << order) << PAGE_SHIFT;
>> @@ -441,6 +442,9 @@ static struct page 
>> *kimage_alloc_crash_control_pages(struct kimage *image,
>>  while (hole_end <= crashk_res.end) {
>>  unsigned long i;
>>  
>> +if (++count % 256 == 0)
>> +cond_resched();
>> +
>>  if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
>>  break;
>>  /* See if I overlap any of the segments */
> .
>




Re: [RFC PATCH] hugetlbfs: fix the hugetlbfs can not be mounted

2016-11-03 Thread zhong jiang
On 2016/11/4 3:17, Andrew Morton wrote:
> On Sat, 29 Oct 2016 14:08:31 +0800 zhongjiang  wrote:
>
>> From: zhong jiang 
>>
>> Since 'commit 3e89e1c5ea84 ("hugetlb: make mm and fs code explicitly 
>> non-modular")'
>> bring in the mainline. mount hugetlbfs will result in the following issue.
>>
>> mount: unknown filesystme type 'hugetlbfs'
>>
>> because previous patch remove the module_alias_fs, when we mount the fs type,
>> the caller get_fs_type can not find the filesystem.
>>
>> The patch just recover the module_alias_fs to identify the hugetlbfs.
> hm, 3e89e1c5ea84 ("hugetlb: make mm and fs code explicitly
> non-modular") was merged almost a year ago.  And you are apparently the
> first person to discover this regression.  Can you think why that is?
  When I pulled the upstream patches into 4.9-rc2, I found that I could not
mount hugetlbfs.
  But when I pulled the remaining upstream patches the next day and tested
again, it worked well.
  So I am replying right now: please ignore the patch. The detailed reason has
not been dug into.

  I am sorry for wasting your time.

  Thanks you
  zhongjiang
>> index 4fb7b10..b63e7de 100644
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>> @@ -35,6 +35,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  
>>  #include 
>> @@ -1209,6 +1210,7 @@ static struct dentry *hugetlbfs_mount(struct 
>> file_system_type *fs_type,
>>  .mount  = hugetlbfs_mount,
>>  .kill_sb= kill_litter_super,
>>  };
>> +MODULE_ALIAS_FS("hugetlbfs");
>>  
>>  static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
>>  
>
> .
>




Re: [RFC PATCH] hugetlbfs: fix the hugetlbfs can not be mounted

2016-10-29 Thread zhong jiang
On 2016/10/29 14:08, zhongjiang wrote:
> From: zhong jiang 
>
> Since 'commit 3e89e1c5ea84 ("hugetlb: make mm and fs code explicitly 
> non-modular")'
> bring in the mainline. mount hugetlbfs will result in the following issue.
>
> mount: unknown filesystme type 'hugetlbfs'
>
> because previous patch remove the module_alias_fs, when we mount the fs type,
> the caller get_fs_type can not find the filesystem.
>
> The patch just recover the module_alias_fs to identify the hugetlbfs.
>
> Signed-off-by: zhong jiang 
> ---
>  fs/hugetlbfs/inode.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 4fb7b10..b63e7de 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -35,6 +35,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include 
> @@ -1209,6 +1210,7 @@ static struct dentry *hugetlbfs_mount(struct 
> file_system_type *fs_type,
>   .mount  = hugetlbfs_mount,
>   .kill_sb= kill_litter_super,
>  };
> +MODULE_ALIAS_FS("hugetlbfs");
>  
>  static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
>  
 Please ignore the patch; it has already been fixed.



Re: [PATCH] net: avoid uninitialized variable

2016-10-26 Thread zhong jiang
On 2016/10/27 12:02, Gao Feng wrote:
> On Thu, Oct 27, 2016 at 11:56 AM, zhongjiang  wrote:
>> From: zhong jiang 
>>
>> when I compiler the newest kernel, I hit the following error with
>> Werror=may-uninitalized.
>>
>> net/core/flow_dissector.c: In function ?._skb_flow_dissect?
>> include/uapi/linux/swab.h:100:46: error: ?.lan?.may be used uninitialized in 
>> this function [-Werror=maybe-uninitialized]
>> net/core/flow_dissector.c:248:26: note: ?.lan?.was declared here
>>
>> This adds an additional check for proto to explicitly tell the compiler
>> that vlan pointer have the correct value before it is used.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  net/core/flow_dissector.c | 5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
>> index 1a7b80f..a04d9cf 100644
>> --- a/net/core/flow_dissector.c
>> +++ b/net/core/flow_dissector.c
>> @@ -245,7 +245,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>> }
>> case htons(ETH_P_8021AD):
>> case htons(ETH_P_8021Q): {
>> -   const struct vlan_hdr *vlan;
>> +   const struct vlan_hdr *vlan = NULL;
>>
>> if (skb_vlan_tag_present(skb))
>> proto = skb->protocol;
>> @@ -276,7 +276,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>> key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
>> key_vlan->vlan_priority =
>> (skb_vlan_tag_get_prio(skb) >> 
>> VLAN_PRIO_SHIFT);
>> -   } else {
>> +   } else if (proto == cpu_to_be16(ETH_P_8021Q) ||
>> +   proto == 
>> cpu_to_be16(ETH_P_8021AD)) {
>> key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
>> VLAN_VID_MASK;
>> key_vlan->vlan_priority =
>> --
>> 1.8.3.1
>>
> It seems there is one similar patch already.
> You could refer to https://patchwork.kernel.org/patch/9389565/
>
> Regards
> Feng
>
> .
>
 Sorry, I did not notice that patch.

 Thanks
 zhongjiang 



Re: [PATCH v2] z3fold: fix the potential encode bug in encod_handle

2016-10-17 Thread zhong jiang
On 2016/10/17 23:30, Dan Streetman wrote:
> On Mon, Oct 17, 2016 at 8:48 AM, zhong jiang  wrote:
>> On 2016/10/17 20:03, Vitaly Wool wrote:
>>> Hi Zhong Jiang,
>>>
>>> On Mon, Oct 17, 2016 at 3:58 AM, zhong jiang  wrote:
>>>> Hi,  Vitaly
>>>>
>>>> About the following patch,  is it right?
>>>>
>>>> Thanks
>>>> zhongjiang
>>>> On 2016/10/13 12:02, zhongjiang wrote:
>>>>> From: zhong jiang 
>>>>>
>>>>> At present, zhdr->first_num plus bud can exceed the BUDDY_MASK
>>>>> in encode_handle, it will lead to the the caller handle_to_buddy
>>>>> return the error value.
>>>>>
>>>>> The patch fix the issue by changing the BUDDY_MASK to PAGE_MASK,
>>>>> it will be consistent with handle_to_z3fold_header. At the same time,
>>>>> change the BUDDY_MASK to PAGE_MASK in handle_to_buddy is better.
>>> are you seeing problems with the existing code? first_num should wrap around
>>> BUDDY_MASK and this should be ok because it is way bigger than the number
>>> of buddies.
>>>
>>> ~vitaly
>>>
>>> .
>>>
>>  first_num plus buddies can exceed the BUDDY_MASK. is it right?
> yes.
>
>>  (first_num + buddies) & BUDDY_MASK may be a smaller value than first_num.
> yes, but that doesn't matter; the value stored in the handle is never
> accessed directly.
>
>>   but (handle - zhdr->first_num) & BUDDY_MASK will return incorrect value
>>   in handle_to_buddy.
> the subtraction and masking will result in the correct buddy number,
> even if (handle & BUDDY_MASK) < zhdr->first_num.
 yes, I see. it is hard to read.
> However, I agree it's nonobvious, and tying the first_num size to
> NCHUNKS_ORDER is confusing - the number of chunks is completely
> unrelated to the number of buddies.
 yes. indeed.
> Possibly a better way to handle first_num is to limit it to the order
> of enum buddy to the actual range of possible buddy indexes, which is
> 0x3, i.e.:
>
> #define BUDDY_MASK  (0x3)
>
> and
>
>unsigned short first_num:2;
>
> with that and a small bit of explanation in the encode_handle or
> handle_to_buddy comments, it should be clear how the first_num and
> buddy numbering work, including that overflow/underflow are ok (due to
> the masking)...
 Yes, it is better and clearer. Thanks for your reply and advice. I will
 resend the patch.
>>   Thanks
>>   zhongjiang
>>
>>
> .
>




Re: [PATCH v2] z3fold: fix the potential encode bug in encod_handle

2016-10-17 Thread zhong jiang
On 2016/10/17 20:03, Vitaly Wool wrote:
> Hi Zhong Jiang,
>
> On Mon, Oct 17, 2016 at 3:58 AM, zhong jiang  wrote:
>> Hi,  Vitaly
>>
>> About the following patch,  is it right?
>>
>> Thanks
>> zhongjiang
>> On 2016/10/13 12:02, zhongjiang wrote:
>>> From: zhong jiang 
>>>
>>> At present, zhdr->first_num plus bud can exceed the BUDDY_MASK
>>> in encode_handle, it will lead to the the caller handle_to_buddy
>>> return the error value.
>>>
>>> The patch fix the issue by changing the BUDDY_MASK to PAGE_MASK,
>>> it will be consistent with handle_to_z3fold_header. At the same time,
>>> change the BUDDY_MASK to PAGE_MASK in handle_to_buddy is better.
> are you seeing problems with the existing code? first_num should wrap around
> BUDDY_MASK and this should be ok because it is way bigger than the number
> of buddies.
>
> ~vitaly
>
> .
>
 I am curious about the z3fold implementation; thus, I am reviewing the code.

 Thanks
 zhongjiang



Re: [PATCH v2] z3fold: fix the potential encode bug in encod_handle

2016-10-17 Thread zhong jiang
On 2016/10/17 20:03, Vitaly Wool wrote:
> Hi Zhong Jiang,
>
> On Mon, Oct 17, 2016 at 3:58 AM, zhong jiang  wrote:
>> Hi,  Vitaly
>>
>> About the following patch,  is it right?
>>
>> Thanks
>> zhongjiang
>> On 2016/10/13 12:02, zhongjiang wrote:
>>> From: zhong jiang 
>>>
>>> At present, zhdr->first_num plus bud can exceed the BUDDY_MASK
>>> in encode_handle, it will lead to the the caller handle_to_buddy
>>> return the error value.
>>>
>>> The patch fix the issue by changing the BUDDY_MASK to PAGE_MASK,
>>> it will be consistent with handle_to_z3fold_header. At the same time,
>>> change the BUDDY_MASK to PAGE_MASK in handle_to_buddy is better.
> are you seeing problems with the existing code? first_num should wrap around
> BUDDY_MASK and this should be ok because it is way bigger than the number
> of buddies.
>
> ~vitaly
>
> .
>
 first_num plus buddies can exceed the BUDDY_MASK. is it right?
 (first_num + buddies) & BUDDY_MASK may be a smaller value than first_num.

  but (handle - zhdr->first_num) & BUDDY_MASK will return incorrect value
  in handle_to_buddy.

  Thanks
  zhongjiang
 



Re: [PATCH v2] z3fold: fix the potential encode bug in encod_handle

2016-10-16 Thread zhong jiang
Hi,  Vitaly

About the following patch,  is it right?

Thanks
zhongjiang
On 2016/10/13 12:02, zhongjiang wrote:
> From: zhong jiang 
>
> At present, zhdr->first_num plus bud can exceed the BUDDY_MASK
> in encode_handle, it will lead to the the caller handle_to_buddy
> return the error value.
>
> The patch fix the issue by changing the BUDDY_MASK to PAGE_MASK,
> it will be consistent with handle_to_z3fold_header. At the same time,
> change the BUDDY_MASK to PAGE_MASK in handle_to_buddy is better.
>
> Signed-off-by: zhong jiang 
> ---
>  mm/z3fold.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 8f9e89c..e8fc216 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -169,7 +169,7 @@ static unsigned long encode_handle(struct z3fold_header 
> *zhdr, enum buddy bud)
>  
>   handle = (unsigned long)zhdr;
>   if (bud != HEADLESS)
> - handle += (bud + zhdr->first_num) & BUDDY_MASK;
> + handle += (bud + zhdr->first_num) & PAGE_MASK;
>   return handle;
>  }
>  
> @@ -183,7 +183,7 @@ static struct z3fold_header 
> *handle_to_z3fold_header(unsigned long handle)
>  static enum buddy handle_to_buddy(unsigned long handle)
>  {
>   struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
> - return (handle - zhdr->first_num) & BUDDY_MASK;
> + return (handle - zhdr->first_num) & PAGE_MASK;
>  }
>  
>  /*




Re: [PATCH] z3fold: remove the unnecessary limit in z3fold_compact_page

2016-10-16 Thread zhong jiang
On 2016/10/15 3:25, Vitaly Wool wrote:
> On Fri, Oct 14, 2016 at 3:35 PM, zhongjiang  wrote:
>> From: zhong jiang 
>>
>> z3fold compact page has nothing with the last_chunks. even if
>> last_chunks is not free, compact page will proceed.
>>
>> The patch just remove the limit without functional change.
>>
>> Signed-off-by: zhong jiang 
>> ---
>>  mm/z3fold.c | 3 +--
>>  1 file changed, 1 insertion(+), 2 deletions(-)
>>
>> diff --git a/mm/z3fold.c b/mm/z3fold.c
>> index e8fc216..4668e1c 100644
>> --- a/mm/z3fold.c
>> +++ b/mm/z3fold.c
>> @@ -258,8 +258,7 @@ static int z3fold_compact_page(struct z3fold_header 
>> *zhdr)
>>
>>
>> if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
>> -   zhdr->middle_chunks != 0 &&
>> -   zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
>> +   zhdr->middle_chunks != 0 && zhdr->first_chunks == 0) {
>> memmove(beg + ZHDR_SIZE_ALIGNED,
>> beg + (zhdr->start_middle << CHUNK_SHIFT),
>> zhdr->middle_chunks << CHUNK_SHIFT);
> This check is actually important because if we move the middle chunk
> to the first and leave the last chunk, handles will become invalid and
> there won't be any easy way to fix that.
>
> Best regards,
>Vitaly
>
> .
>
 Thanks for your reply. You are right: leaving the last chunk while compacting
 would cause first_num to increase, so handle_to_buddy would become invalid.

 Thanks
 zhongjiang
 



Re: [PATCH] z3fold: fix the potential encode bug in encod_handle

2016-10-12 Thread zhong jiang
On 2016/10/13 11:33, zhongjiang wrote:
> From: zhong jiang 
>
> At present, zhdr->first_num plus bud can exceed the BUDDY_MASK
> in encode_handle, it will lead to the the caller handle_to_buddy
> return the error value.
>
> The patch fix the issue by changing the BUDDY_MASK to PAGE_MASK,
> it will be consistent with handle_to_z3fold_header. At the same time,
> The code will much comprehensible to change the BUDDY_MASK to
> BUDDIES_MAX in handle_to_buddy.
>
> Signed-off-by: zhong jiang 
> ---
>  mm/z3fold.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 8f9e89c..5884b9e 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -169,7 +169,7 @@ static unsigned long encode_handle(struct z3fold_header 
> *zhdr, enum buddy bud)
>  
>   handle = (unsigned long)zhdr;
>   if (bud != HEADLESS)
> - handle += (bud + zhdr->first_num) & BUDDY_MASK;
> + handle += (bud + zhdr->first_num) & PAGE_MASK;
>   return handle;
>  }
>  
> @@ -183,7 +183,7 @@ static struct z3fold_header 
> *handle_to_z3fold_header(unsigned long handle)
>  static enum buddy handle_to_buddy(unsigned long handle)
>  {
>   struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
> - return (handle - zhdr->first_num) & BUDDY_MASK;
> + return (handle - zhdr->first_num) & BUDDIES_MAX;
>  }
>  
>  /*
  Oh, an obvious problem -- please ignore it. I will resend the patch in v2.
Thanks



Re: [RFC] remove unnecessary condition in remove_inode_hugepages

2016-09-24 Thread zhong jiang
On 2016/9/25 8:06, Mike Kravetz wrote:
> On 09/23/2016 07:56 PM, zhong jiang wrote:
>> On 2016/9/24 1:19, Mike Kravetz wrote:
>>> On 09/22/2016 06:53 PM, zhong jiang wrote:
>>>> At present, we need to call hugetlb_fix_reserve_count when 
>>>> hugetlb_unrserve_pages fails,
>>>> and PagePrivate will decide hugetlb reserves counts.
>>>>
>>>> we obtain the page from page cache. and use page both lock_page and 
>>>> mutex_lock.
>>>> alloc_huge_page add page to page chace always hold lock page, then bail 
>>>> out clearpageprivate
>>>> before unlock page. 
>>>>
>>>> but I' m not sure  it is right  or I miss the points.
>>> Let me try to explain the code you suggest is unnecessary.
>>>
>>> The PagePrivate flag is used in huge page allocation/deallocation to
>>> indicate that the page was globally reserved.  For example, in
>>> dequeue_huge_page_vma() there is this code:
>>>
>>> if (page) {
>>> if (avoid_reserve)
>>> break;
>>> if (!vma_has_reserves(vma, chg))
>>> break;
>>>
>>> SetPagePrivate(page);
>>> h->resv_huge_pages--;
>>> break;
>>> }
>>>
>>> and in free_huge_page():
>>>
>>> restore_reserve = PagePrivate(page);
>>> ClearPagePrivate(page);
>>> .
>>> 
>>> .
>>> if (restore_reserve)
>>> h->resv_huge_pages++;
>>>
>>> This helps maintains the global huge page reserve count.
>>>
>>> In addition to the global reserve count, there are per VMA reservation
>>> structures.  Unfortunately, these structures have different meanings
>>> depending on the context in which they are used.
>>>
>>> If there is a VMA reservation entry for a page, and the page has not
>>> been instantiated in the VMA this indicates there is a huge page reserved
>>> and the global resv_huge_pages count reflects that reservation.  Even
>>> if a page was not reserved, a VMA reservation entry is added when a page
>>> is instantiated in the VMA.
>>>
>>> With that background, let's look at the existing code/proposed changes.
>>  Clearly. 
>>>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>>>> index 4ea71eb..010723b 100644
>>>> --- a/fs/hugetlbfs/inode.c
>>>> +++ b/fs/hugetlbfs/inode.c
>>>> @@ -462,14 +462,12 @@ static void remove_inode_hugepages(struct inode 
>>>> *inode, loff_t lstart,
>>>>  * the page, note PagePrivate which is used in case
>>>>  * of error.
>>>>  */
>>>> -   rsv_on_error = !PagePrivate(page);
>>> This rsv_on_error flag indicates that when the huge page was allocated,
>>yes
>>> it was NOT counted against the global reserve count.  So, when
>>> remove_huge_page eventually calls free_huge_page(), the global count
>>> resv_huge_pages is not incremented.  So far, no problem.
>>  but the page comes from the page cache.  if it is.  it should implement
>>  ClearPageprivate(page) when lock page.   This condition always true.
>>
>>   The key point is why it need still check the PagePrivate(page) when page 
>> from
>>   page cache and hold lock.
> You are correct.  My apologies for not seeing your point in the original
> post.
>
> When the huge page is added to the page cache (huge_add_to_page_cache),
> the Page Private flag will be cleared.  Since this code
> (remove_inode_hugepages) will only be called for pages in the page cache,
> PagePrivate(page) will always be false.
>
> The comments in this area should be changed along with the code.
>
 Thanks, I will resend the patch.



Re: [RFC] remove unnecessary condition in remove_inode_hugepages

2016-09-23 Thread zhong jiang
On 2016/9/24 1:19, Mike Kravetz wrote:
> On 09/22/2016 06:53 PM, zhong jiang wrote:
>> At present, we need to call hugetlb_fix_reserve_count when 
>> hugetlb_unrserve_pages fails,
>> and PagePrivate will decide hugetlb reserves counts.
>>
>> we obtain the page from page cache. and use page both lock_page and 
>> mutex_lock.
>> alloc_huge_page add page to page chace always hold lock page, then bail out 
>> clearpageprivate
>> before unlock page. 
>>
>> but I' m not sure  it is right  or I miss the points.
> Let me try to explain the code you suggest is unnecessary.
>
> The PagePrivate flag is used in huge page allocation/deallocation to
> indicate that the page was globally reserved.  For example, in
> dequeue_huge_page_vma() there is this code:
>
> if (page) {
> if (avoid_reserve)
> break;
> if (!vma_has_reserves(vma, chg))
> break;
>
> SetPagePrivate(page);
> h->resv_huge_pages--;
> break;
> }
>
> and in free_huge_page():
>
> restore_reserve = PagePrivate(page);
> ClearPagePrivate(page);
>   .
>   
>   .
> if (restore_reserve)
> h->resv_huge_pages++;
>
> This helps maintains the global huge page reserve count.
>
> In addition to the global reserve count, there are per VMA reservation
> structures.  Unfortunately, these structures have different meanings
> depending on the context in which they are used.
>
> If there is a VMA reservation entry for a page, and the page has not
> been instantiated in the VMA this indicates there is a huge page reserved
> and the global resv_huge_pages count reflects that reservation.  Even
> if a page was not reserved, a VMA reservation entry is added when a page
> is instantiated in the VMA.
>
> With that background, let's look at the existing code/proposed changes.
 Clearly. 
>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>> index 4ea71eb..010723b 100644
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>> @@ -462,14 +462,12 @@ static void remove_inode_hugepages(struct inode 
>> *inode, loff_t lstart,
>>  * the page, note PagePrivate which is used in case
>>  * of error.
>>  */
>> -   rsv_on_error = !PagePrivate(page);
> This rsv_on_error flag indicates that when the huge page was allocated,
   yes
> it was NOT counted against the global reserve count.  So, when
> remove_huge_page eventually calls free_huge_page(), the global count
> resv_huge_pages is not incremented.  So far, no problem.
 But the page comes from the page cache; if so, ClearPagePrivate(page) has
 already been done while the page was locked, so this condition is always true.

  The key point is why we still need to check PagePrivate(page) when the page
  comes from the page cache and is held locked.

  Thanks you
 zhongjiang
>> remove_huge_page(page);
>> freed++;
>> if (!truncate_op) {
>> if (unlikely(hugetlb_unreserve_pages(inode,
>> next, next + 1, 1)))
> We now have this VERY unlikely situation that hugetlb_unreserve_pages fails.
> This means that the VMA reservation entry for the page was not removed.
> So, we are in a bit of a mess.  The page has already been removed, but the
> VMA reservation entry can not.  This LOOKS like there is a reservation for
> the page in the VMA reservation structure.  But, the global count
> resv_huge_pages does not reflect this reservation.
>
> If we do nothing, when the VMA is eventually removed the VMA reservation
> structure will be completely removed and the global count resv_huge_pages
> will be decremented for each entry in the structure.  Since, there is a
> VMA reservation entry without a corresponding global count, the global
> count will be one less than it should (will eventually go to -1).
>
> To 'fix' this, hugetlb_fix_reserve_counts is called.  In this case, it will
> increment the global count so that it is consistent with the entries in
> the VMA reservation structure.
>
> This is all quite confusing and really unlikely to happen.  I tried to
> explain in code comments:
>
> Before removing the page:
> /*
>  * We must free the huge page and remove from p

[RFC] remove unnecessary condition in remove_inode_hugepages

2016-09-22 Thread zhong jiang

At present, we need to call hugetlb_fix_reserve_counts when
hugetlb_unreserve_pages fails,
and PagePrivate decides the hugetlb reserve counts.

We obtain the page from the page cache, and use the page under both lock_page
and mutex_lock. alloc_huge_page adds the page to the page cache while holding
the page lock, then does ClearPagePrivate
before unlocking the page.

But I am not sure whether this is right, or whether I am missing something.


diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 4ea71eb..010723b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -462,14 +462,12 @@ static void remove_inode_hugepages(struct inode *inode, 
loff_t lstart,
 * the page, note PagePrivate which is used in case
 * of error.
 */
-   rsv_on_error = !PagePrivate(page);
remove_huge_page(page);
freed++;
if (!truncate_op) {
if (unlikely(hugetlb_unreserve_pages(inode,
next, next + 1, 1)))
-   hugetlb_fix_reserve_counts(inode,
-   rsv_on_error);
+   hugetlb_fix_reserve_counts(inode)
}

unlock_page(page);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index c26d463..d2e0fc5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -90,7 +90,7 @@ int dequeue_hwpoisoned_huge_page(struct page *page);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
 void free_huge_page(struct page *page);
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve);
+void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
 u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 87e11d8..28a079a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -567,13 +567,13 @@ retry:
  * appear as a "reserved" entry instead of simply dangling with incorrect
  * counts.
  */
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+void hugetlb_fix_reserve_counts(struct inode *inode)
 {
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;

rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-   if (restore_reserve && rsv_adjust) {
+   if (rsv_adjust) {
struct hstate *h = hstate_inode(inode);

hugetlb_acct_memory(h, 1);




Re: [PATCH] mm, numa: boot cpu should bound to the node0 when node_off enable

2016-08-23 Thread zhong jiang
On 2016/8/22 22:28, Catalin Marinas wrote:
> On Sat, Aug 20, 2016 at 05:38:59PM +0800, zhong jiang wrote:
>> On 2016/8/19 12:11, Ganapatrao Kulkarni wrote:
>>> On Fri, Aug 19, 2016 at 9:30 AM, Ganapatrao Kulkarni
>>>  wrote:
>>>> On Fri, Aug 19, 2016 at 7:28 AM, zhong jiang  wrote:
>>>>> On 2016/8/19 1:45, Ganapatrao Kulkarni wrote:
>>>>>> On Thu, Aug 18, 2016 at 9:34 PM, Catalin Marinas
>>>>>>  wrote:
>>>>>>> On Thu, Aug 18, 2016 at 09:09:26PM +0800, zhongjiang wrote:
>>>>>>>> At present, boot cpu will bound to a node from device tree when 
>>>>>>>> node_off enable.
>>>>>>>> if the node is not initialization, it will lead to a following problem.
> [...]
>>>>>>>> --- a/arch/arm64/mm/numa.c
>>>>>>>> +++ b/arch/arm64/mm/numa.c
>>>>>>>> @@ -119,7 +119,7 @@ void numa_store_cpu_info(unsigned int cpu)
>>>>>>>>  void __init early_map_cpu_to_node(unsigned int cpu, int nid)
>>>>>>>>  {
>>>>>>>>   /* fallback to node 0 */
>>>>>>>> - if (nid < 0 || nid >= MAX_NUMNODES)
>>>>>>>> + if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
>>>>>> i  did not understood how this line change fixes the issue that you
>>>>>> have mentioned (i too not understood fully the issue description)
>>>>>> this array used while mapping node id when secondary cores comes up
>>>>>> when numa_off is set the cpu_to_node_map[cpu] is not used and set to
>>>>>> node0 always( refer function numa_store_cpu_info)..
>>>>>> please provide more details to understand the issue you are facing.
>>>>>> /*
>>>>>>  *  Set the cpu to node and mem mapping
>>>>>>  */
>>>>>> void numa_store_cpu_info(unsigned int cpu)
>>>>>> {
>>>>>> map_cpu_to_node(cpu, numa_off ? 0 : cpu_to_node_map[cpu]);
>>>>>> }
>>>>> The issue comes up when we test the kdump. it will leads to kernel crash.
>>>>> when I debug the issue, I find boot cpu actually bound to the node1. while
>>>>> node1 is not real existence when numa_off enable.
>>>> boot cpu is default mapped to node0
>>>> are you running with any other patches?
>>> if you added any patch to change this code
>>>   /* init boot processor */
>>> cpu_to_node_map[0] = 0;
>>> map_cpu_to_node(0, 0);
>>>
>>> then adding code to take-care numa_off here might solve your issue.
>>  but in of_smp_init_cpus, boot cpu will call early_map_cpu_to_node[] to get
>>  the relation node. and the node is from devicetree.
>>
>>  you points to the code will be covered with another node. therefore, it is
>>  possible that cpu_to_node[cpu] will leads to the incorrect results. 
>> therefore,
>>  The crash will come up.
> I think I get Ganapat's point. The cpu_to_node_map[0] may be incorrectly
> set by early_map_cpu_to_node() when called from smp_init_cpus() ->
> of_parse_and_init_cpus(). However, the cpu_to_node_map[] array is *only*
> read by numa_store_cpu_info(). This latter function calls
> map_cpu_to_node() and, if numa_off, will only ever pass 0 as the nid.
>
> Given that the cpu_to_node_map[] array is static, I don't see how any
> non-zero value could leak outside the arch/arm64/mm/numa.c file.
>
> So please give more details of any additional patches you have on top of
> mainline or whether you reproduced this issue with the vanilla kernel
> (since you mentioned kdump, that's not in mainline yet).
>
Thanks to Catalin and Ganapatrao.
I am sorry about that. Mainline has already solved this; the mainline changes
were too extensive, and I did not notice.




Re: [PATCH] mm, numa: boot cpu should bound to the node0 when node_off enable

2016-08-20 Thread zhong jiang
On 2016/8/19 12:11, Ganapatrao Kulkarni wrote:
> On Fri, Aug 19, 2016 at 9:30 AM, Ganapatrao Kulkarni
>  wrote:
>> On Fri, Aug 19, 2016 at 7:28 AM, zhong jiang  wrote:
>>> On 2016/8/19 1:45, Ganapatrao Kulkarni wrote:
>>>> On Thu, Aug 18, 2016 at 9:34 PM, Catalin Marinas
>>>>  wrote:
>>>>> On Thu, Aug 18, 2016 at 09:09:26PM +0800, zhongjiang wrote:
>>>>>> At present, boot cpu will bound to a node from device tree when node_off 
>>>>>> enable.
>>>>>> if the node is not initialization, it will lead to a following problem.
>>>>>>
>>>>>>  next_zones_zonelist+0x18/0x80
>>>>>>  __build_all_zonelists+0x1e0/0x288
>>>>>>  build_all_zonelists_init+0x10/0x1c
>>>>>>  build_all_zonelists+0x114/0x128
>>>>>>  start_kernel+0x1a0/0x414
>>>>> I think this "problem" is missing a lot of information. Is this supposed
>>>>> to be a kernel panic?
>>>>>
>>>>>> The patch fix it by fallback to node 0. therefore, the cpu will bound to 
>>>>>> the node
>>>>>> correctly.
>>>>>>
>>>>>> Signed-off-by: zhongjiang 
>>>>>> ---
>>>>>>  arch/arm64/mm/numa.c | 2 +-
>>>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
>>>>>> index 4dcd7d6..1f8f5da 100644
>>>>>> --- a/arch/arm64/mm/numa.c
>>>>>> +++ b/arch/arm64/mm/numa.c
>>>>>> @@ -119,7 +119,7 @@ void numa_store_cpu_info(unsigned int cpu)
>>>>>>  void __init early_map_cpu_to_node(unsigned int cpu, int nid)
>>>>>>  {
>>>>>>   /* fallback to node 0 */
>>>>>> - if (nid < 0 || nid >= MAX_NUMNODES)
>>>>>> + if (nid < 0 || nid >= MAX_NUMNODES || numa_off)
>>>> i  did not understood how this line change fixes the issue that you
>>>> have mentioned (i too not understood fully the issue description)
>>>> this array used while mapping node id when secondary cores comes up
>>>> when numa_off is set the cpu_to_node_map[cpu] is not used and set to
>>>> node0 always( refer function numa_store_cpu_info)..
>>>> please provide more details to understand the issue you are facing.
>>>> /*
>>>>  *  Set the cpu to node and mem mapping
>>>>  */
>>>> void numa_store_cpu_info(unsigned int cpu)
>>>> {
>>>> map_cpu_to_node(cpu, numa_off ? 0 : cpu_to_node_map[cpu]);
>>>> }
>>>>
>>>> thanks
>>>> Ganapat
>>> The issue comes up when we test the kdump. it will leads to kernel crash.
>>> when I debug the issue, I find boot cpu actually bound to the node1. while
>>> node1 is not real existence when numa_off enable.
>> boot cpu is default mapped to node0
>> are you running with any other patches?
> if you added any patch to change this code
>   /* init boot processor */
> cpu_to_node_map[0] = 0;
> map_cpu_to_node(0, 0);
>
> then adding code to take-care numa_off here might solve your issue.
 But in of_smp_init_cpus, the boot cpu will call early_map_cpu_to_node() to get
 its corresponding node, and that node comes from the devicetree.

 The code you point to will then be overwritten with another node. Therefore,
 it is possible that cpu_to_node[cpu] will lead to incorrect results, and so
 the crash will come up.
>>> __build_all_zonelists will call the cpu_to_node[cpu], but orresponding 
>>> relation
>>> will be obtained from the devicetree. therefore, the issue will come up.
>> when numa_off, all cpus are mapped to node0( refer
>> numa_store_cpu_info) and device tree mapping is ignored.
>>> The corresponding message is as follows when kdump start. it is obvious 
>>> that mem
>>> range points to the node1 in the devicetree.
>>>
>>> Early memory node ranges
>>> node   0: [mem 0x005fe000-0x005f]
>>> Initmem setup node 0 [mem 0x005fe000-0x005f]
>>>
>>> Unable to handle kernel paging request at virtual address 1690
>>> pgd = 81226000
>>> [1690] *pgd=
>>> Internal error: Oops: 9604 [#1] SMP
>>>  Modules linked in:
>>> CPU: 0 PID: 0 Comm: swapper Not tainted 4.1.27-vhulk3.6.5.aarch64 #1
>>> Hardware name: Hisilicon Hi1612 Development Board (DT)
>>>  task: 8102b730 ti: 81018000 task.ti: 81018000
>>> PC is at next_zones_zonelist+0x18/0x80
>>>  LR is at __build_all_zonelists+0x1e0/0x288
>>> next_zones_zonelist+0x18/0x80
>>>  __build_all_zonelists+0x1e0/0x288
>>> build_all_zonelists_init+0x10/0x1c
>>>  build_all_zonelists+0x114/0x128
>>>  start_kernel+0x1a0/0x414
>>>>>>   nid = 0;
>>>>>>
>>>>>>   cpu_to_node_map[cpu] = nid;
>>>>> The patch looks fine (slight inconsistence from the map_cpu_to_node()
>>>>> callers but I guess we don't want to expose numa_off outside this file).
>>>>> I would however like to see an Ack from Ganapat (cc'ed).
>>>>>
>>>>> --
>>>>> Catalin
>>>>>
>>>>> ___
>>>>> linux-arm-kernel mailing list
>>>>> linux-arm-ker...@lists.infradead.org
>>>>> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>>>> .
>>>>
>>>
> .
>




<    1   2   3   4   5   >