Re: [PATCH 0/2] memcg: add pagetable consumption to memory.stat

2020-11-30 Thread Shakeel Butt
On Mon, Nov 30, 2020 at 1:10 PM Roman Gushchin  wrote:
>
> On Mon, Nov 30, 2020 at 01:01:18PM -0800, Shakeel Butt wrote:
> > On Mon, Nov 30, 2020 at 12:34 PM Roman Gushchin  wrote:
> > >
> > > On Wed, Nov 25, 2020 at 04:56:01PM -0800, Shakeel Butt wrote:
> > > > Many workloads consume a significant amount of memory in pagetables. This
> > > > patch series exposes the pagetable consumption for each memory cgroup.
> > >
> > > Hi Shakeel!
> > >
> > > The code looks good to me. However, I'm not sure I understand the use case
> > > for the new statistics. Can you please elaborate a bit more here?
> > >
> > > At first glance, the size of the pagetables should be _roughly_ equal to
> > > sizeof(pte)/PAGE_SIZE * (size of the cgroup) and should not exceed 1% of
> > > the cgroup size. So for all but very large cgroups the value will be in
> > > the noise of the per-cpu counters. Perhaps I'm missing some important
> > > cases?
> > >
> >
> > I think this is in general a good metric to have, but one specific
> > use-case we have is a user-space network driver which mmaps the
> > memory of applications for zero-copy data transfers. This driver
> > can consume a large amount of memory in page tables, so this metric
> > becomes really useful here.
>
> Got it, thank you for the explanation!
> Would you mind adding this text to the cover letter as an example?

Sure.

>
> Acked-by: Roman Gushchin 
> for the series.

Thanks for the review.
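
A back-of-the-envelope check of the estimate and the use-case discussed above, as a small standalone C snippet (assuming x86-64's 8-byte PTEs and 4 KiB pages): a single mapping costs roughly 0.2% of the mapped size in page tables, but each additional mmap of the same memory, for example by a zero-copy driver, adds its own page tables on top of that without growing the cgroup's own usage.

#include <stdio.h>

int main(void)
{
	const double pte_size = 8.0;     /* bytes per PTE on x86-64 */
	const double page_size = 4096.0; /* 4 KiB base pages */
	const double gib = 1024.0 * 1024.0 * 1024.0;
	/* One 4 KiB page of PTEs holds 512 entries and maps 512 * 4 KiB = 2 MiB. */
	double overhead = pte_size / page_size;

	printf("page-table overhead per mapping: %.2f%% (~%.0f MiB per GiB mapped)\n",
	       overhead * 100.0, overhead * gib / (1024.0 * 1024.0));
	/* Each additional mmap of the same memory (e.g. by a user-space
	 * zero-copy driver) needs its own PTEs, adding another ~2 MiB/GiB. */
	return 0;
}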


[PATCH v2 0/2] memcg: add pagetable consumption to memory.stat

2020-11-30 Thread Shakeel Butt
Many workloads consume a significant amount of memory in pagetables. One
specific use-case is a user-space network driver which mmaps the
application memory to provide zero-copy transfers. Such a driver can
consume a large amount of memory in page tables. This patch series exposes
the pagetable consumption of each memory cgroup.

Shakeel Butt (2):
  mm: move lruvec stats update functions to vmstat.h
  mm: memcontrol: account pagetables per node

 Documentation/admin-guide/cgroup-v2.rst |   3 +
 arch/nds32/mm/mm-nds32.c|   6 +-
 drivers/base/node.c |   2 +-
 fs/proc/meminfo.c   |   2 +-
 include/linux/memcontrol.h  | 112 
 include/linux/mm.h  |  11 ++-
 include/linux/mmzone.h  |   2 +-
 include/linux/vmstat.h  | 104 ++
 mm/memcontrol.c |  19 
 mm/page_alloc.c |   6 +-
 10 files changed, 142 insertions(+), 125 deletions(-)

-- 
2.29.2.454.gaff20da3a2-goog
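
For context, a minimal sketch of how a monitoring tool could consume the new statistic once this series lands, assuming a cgroup v2 hierarchy mounted at /sys/fs/cgroup (the cgroup name below is just an example); memory.stat reports the value in bytes under the "pagetables" key added by patch 2.

#include <stdio.h>

int main(void)
{
	/* Example path; substitute the cgroup of interest. */
	FILE *f = fopen("/sys/fs/cgroup/my-job/memory.stat", "r");
	char line[256];
	unsigned long long bytes;

	if (!f) {
		perror("memory.stat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Lines look like "pagetables 12345678". */
		if (sscanf(line, "pagetables %llu", &bytes) == 1)
			printf("page tables: %llu bytes\n", bytes);
	}
	fclose(f);
	return 0;
}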



[PATCH v2 1/2] mm: move lruvec stats update functions to vmstat.h

2020-11-30 Thread Shakeel Butt
This patch does not change any functionality; it only moves the functions
which update the lruvec stats from memcontrol.h to vmstat.h. The main
reason for this patch is to be able to use these functions in the page
table constructor function, which is defined in mm.h, where memcontrol.h
cannot be included. This is also a better place for the interface in
general: the lruvec abstraction, while invented for memcg, isn't specific
to memcg at all.

Signed-off-by: Shakeel Butt 
Acked-by: Johannes Weiner 
Acked-by: Roman Gushchin 

---
Changes since v1:
- Updated the commit message based on Johannes's comment.

 include/linux/memcontrol.h | 112 -
 include/linux/vmstat.h | 104 ++
 mm/memcontrol.c|  18 ++
 3 files changed, 122 insertions(+), 112 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 87ed56dc75f9..cd7b9136fb39 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -947,8 +947,6 @@ static inline unsigned long lruvec_page_state_local(struct 
lruvec *lruvec,
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
  int val);
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
-   int val);
 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
 
 static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
@@ -971,44 +969,6 @@ static inline void mod_memcg_lruvec_state(struct lruvec 
*lruvec,
local_irq_restore(flags);
 }
 
-static inline void mod_lruvec_state(struct lruvec *lruvec,
-   enum node_stat_item idx, int val)
-{
-   unsigned long flags;
-
-   local_irq_save(flags);
-   __mod_lruvec_state(lruvec, idx, val);
-   local_irq_restore(flags);
-}
-
-static inline void __mod_lruvec_page_state(struct page *page,
-  enum node_stat_item idx, int val)
-{
-   struct page *head = compound_head(page); /* rmap on tail pages */
-   struct mem_cgroup *memcg = page_memcg(head);
-   pg_data_t *pgdat = page_pgdat(page);
-   struct lruvec *lruvec;
-
-   /* Untracked pages have no memcg, no lruvec. Update only the node */
-   if (!memcg) {
-   __mod_node_page_state(pgdat, idx, val);
-   return;
-   }
-
-   lruvec = mem_cgroup_lruvec(memcg, pgdat);
-   __mod_lruvec_state(lruvec, idx, val);
-}
-
-static inline void mod_lruvec_page_state(struct page *page,
-enum node_stat_item idx, int val)
-{
-   unsigned long flags;
-
-   local_irq_save(flags);
-   __mod_lruvec_page_state(page, idx, val);
-   local_irq_restore(flags);
-}
-
 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
@@ -1411,30 +1371,6 @@ static inline void __mod_memcg_lruvec_state(struct 
lruvec *lruvec,
 {
 }
 
-static inline void __mod_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx, int val)
-{
-   __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
-}
-
-static inline void mod_lruvec_state(struct lruvec *lruvec,
-   enum node_stat_item idx, int val)
-{
-   mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
-}
-
-static inline void __mod_lruvec_page_state(struct page *page,
-  enum node_stat_item idx, int val)
-{
-   __mod_node_page_state(page_pgdat(page), idx, val);
-}
-
-static inline void mod_lruvec_page_state(struct page *page,
-enum node_stat_item idx, int val)
-{
-   mod_node_page_state(page_pgdat(page), idx, val);
-}
-
 static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
   int val)
 {
@@ -1490,30 +1426,6 @@ static inline void lruvec_memcg_debug(struct lruvec 
*lruvec, struct page *page)
 }
 #endif /* CONFIG_MEMCG */
 
-static inline void __inc_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx)
-{
-   __mod_lruvec_state(lruvec, idx, 1);
-}
-
-static inline void __dec_lruvec_state(struct lruvec *lruvec,
- enum node_stat_item idx)
-{
-   __mod_lruvec_state(lruvec, idx, -1);
-}
-
-static inline void __inc_lruvec_page_state(struct page *page,
-  enum node_stat_item idx)
-{
-   __mod_lruvec_page_state(page, idx, 1);
-}
-
-static inline void __dec_lruvec_page_state(struct page *page,
-  enum node_stat_item idx)
-{
-   __mod_lruvec_page_state(page, idx, -1);
-}
-
 static i

[PATCH v2 2/2] mm: memcontrol: account pagetables per node

2020-11-30 Thread Shakeel Butt
For many workloads, pagetable consumption is significant, and it makes
sense to expose it in memory.stat for the memory cgroups. However, at the
moment the pagetables are accounted per-zone. Converting them to per-node
and using the right interface will correctly account for the memory
cgroups as well.

Signed-off-by: Shakeel Butt 
Acked-by: Johannes Weiner 
Acked-by: Roman Gushchin 

---
Changes since v1:
- Fixed vmstat_text in vmstat.c based on Johannes's comment.

 Documentation/admin-guide/cgroup-v2.rst | 3 +++
 arch/nds32/mm/mm-nds32.c| 6 +++---
 drivers/base/node.c | 2 +-
 fs/proc/meminfo.c   | 2 +-
 include/linux/mm.h  | 8 
 include/linux/mmzone.h  | 2 +-
 mm/memcontrol.c | 1 +
 mm/page_alloc.c | 6 +++---
 mm/vmstat.c | 2 +-
 9 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 515bb13084a0..63521cd36ce5 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1274,6 +1274,9 @@ PAGE_SIZE multiple when read back.
  kernel_stack
Amount of memory allocated to kernel stacks.
 
+ pagetables
+Amount of memory allocated for page tables.
+
  percpu(npn)
Amount of memory used for storing per-cpu kernel
data structures.
diff --git a/arch/nds32/mm/mm-nds32.c b/arch/nds32/mm/mm-nds32.c
index 55bec50ccc03..f2778f2b39f6 100644
--- a/arch/nds32/mm/mm-nds32.c
+++ b/arch/nds32/mm/mm-nds32.c
@@ -34,8 +34,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
cpu_dcache_wb_range((unsigned long)new_pgd,
(unsigned long)new_pgd +
PTRS_PER_PGD * sizeof(pgd_t));
-   inc_zone_page_state(virt_to_page((unsigned long *)new_pgd),
-   NR_PAGETABLE);
+   inc_lruvec_page_state(virt_to_page((unsigned long *)new_pgd),
+ NR_PAGETABLE);
 
return new_pgd;
 }
@@ -59,7 +59,7 @@ void pgd_free(struct mm_struct *mm, pgd_t * pgd)
 
pte = pmd_page(*pmd);
pmd_clear(pmd);
-   dec_zone_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE);
+   dec_lruvec_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE);
pte_free(mm, pte);
mm_dec_nr_ptes(mm);
pmd_free(mm, pmd);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6ffa470e2984..04f71c7bc3f8 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -450,7 +450,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 #ifdef CONFIG_SHADOW_CALL_STACK
 nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
-nid, K(sum_zone_node_page_state(nid, 
NR_PAGETABLE)),
+nid, K(node_page_state(pgdat, NR_PAGETABLE)),
 nid, 0UL,
 nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
 nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 887a5532e449..d6fc74619625 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -107,7 +107,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
   global_node_page_state(NR_KERNEL_SCS_KB));
 #endif
show_val_kb(m, "PageTables: ",
-   global_zone_page_state(NR_PAGETABLE));
+   global_node_page_state(NR_PAGETABLE));
 
show_val_kb(m, "NFS_Unstable:   ", 0);
show_val_kb(m, "Bounce: ",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index eabe7d9f80d8..d1f64744ace2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2199,7 +2199,7 @@ static inline bool pgtable_pte_page_ctor(struct page 
*page)
if (!ptlock_init(page))
return false;
__SetPageTable(page);
-   inc_zone_page_state(page, NR_PAGETABLE);
+   inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
 }
 
@@ -2207,7 +2207,7 @@ static inline void pgtable_pte_page_dtor(struct page 
*page)
 {
ptlock_free(page);
__ClearPageTable(page);
-   dec_zone_page_state(page, NR_PAGETABLE);
+   dec_lruvec_page_state(page, NR_PAGETABLE);
 }
 
 #define pte_offset_map_lock(mm, pmd, address, ptlp)\
@@ -2294,7 +2294,7 @@ static inline bool pgtable_pmd_page_ctor(struct page 
*page)
if (!pmd_ptlock_init(page))
return false;
__SetPageTable(page);
-   inc_zone_page_state(page, NR_PAGETABLE);
+   inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
 }
 
@@ -2302,7 +2302,7 @@ static inline void pgtable_pmd_page_dtor(struct page 
*page)
 {
pmd_ptlock_free(p

Re: [PATCH] mm: mmap_lock: fix use-after-free race and css ref leak in tracepoints

2020-11-30 Thread Shakeel Butt
On Mon, Nov 30, 2020 at 3:43 PM Axel Rasmussen  wrote:
>
> syzbot reported[1] a use-after-free introduced in 0f818c4bc1f3. The bug
> is that an ongoing trace event might race with the tracepoint being
> disabled (and therefore the _unreg() callback being called). Consider
> this ordering:
>
> T1: trace event fires, get_mm_memcg_path() is called
> T1: get_memcg_path_buf() returns a buffer pointer
> T2: trace_mmap_lock_unreg() is called, buffers are freed
> T1: cgroup_path() is called with the now-freed buffer

Any reason to use cgroup_path instead of cgroup_ino? There are other
examples of tracepoints using cgroup_ino, and those have no need to
allocate buffers. Also, cgroup namespaces might complicate the path
usage.

>
> The solution in this commit is to modify trace_mmap_lock_unreg() to
> first stop new buffers from being handed out, and then to wait (spin)
> until any existing buffer references are dropped (i.e., those trace
> events complete).
>
> I have a simple reproducer program which spins up two pools of threads,
> doing the following in a tight loop:
>
>   Pool 1:
>   mmap(NULL, 4096, PROT_READ | PROT_WRITE,
>MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)
>   munmap()
>
>   Pool 2:
>   echo 1 > /sys/kernel/debug/tracing/events/mmap_lock/enable
>   echo 0 > /sys/kernel/debug/tracing/events/mmap_lock/enable
>
> This triggers the use-after-free very quickly. With this patch, I let it
> run for an hour without any BUGs.
>
> While fixing this, I also noticed and fixed a css ref leak. Previously
> we called get_mem_cgroup_from_mm(), but we never called css_put() to
> release that reference. get_mm_memcg_path() now does this properly.
>
> [1]: https://syzkaller.appspot.com/bug?extid=19e6dd9943972fa1c58a
>
> Fixes: 0f818c4bc1f3 ("mm: mmap_lock: add tracepoints around lock acquisition")

The original patch is in the mm tree, so the SHA1 is not yet stable.
Usually Andrew squashes such fixes into the original patches.

> Signed-off-by: Axel Rasmussen 
> ---
>  mm/mmap_lock.c | 100 +
>  1 file changed, 85 insertions(+), 15 deletions(-)
>
> diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
> index 12af8f1b8a14..be38dc58278b 100644
> --- a/mm/mmap_lock.c
> +++ b/mm/mmap_lock.c
> @@ -3,6 +3,7 @@
>  #include 
>
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -18,13 +19,28 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
>  #ifdef CONFIG_MEMCG
>
>  /*
> - * Our various events all share the same buffer (because we don't want or 
> need
> - * to allocate a set of buffers *per event type*), so we need to protect 
> against
> - * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
> - * been made.
> + * This is unfortunately complicated... _reg() and _unreg() may be called
> + * in parallel, separately for each of our three event types. To save memory,
> + * all of the event types share the same buffers. Furthermore, trace events
> + * might happen in parallel with _unreg(); we need to ensure we don't free 
> the
> + * buffers before all inflights have finished. Because these events happen
> + * "frequently", we also want to prevent new inflights from starting once the
> + * _unreg() process begins. And, for performance reasons, we want to avoid 
> any
> + * locking in the trace event path.
> + *
> + * So:
> + *
> + * - Use a spinlock to serialize _reg() and _unreg() calls.
> + * - Keep track of nested _reg() calls with a lock-protected counter.
> + * - Define a flag indicating whether or not unregistration has begun (and
> + *   therefore that there should be no new buffer uses going forward).
> + * - Keep track of inflight buffer users with a reference count.
>   */
>  static DEFINE_SPINLOCK(reg_lock);
> -static int reg_refcount;
> +static int reg_types_rc; /* Protected by reg_lock. */
> +static bool unreg_started; /* Doesn't need synchronization. */
> +/* atomic_t instead of refcount_t, as we want ordered inc without locks. */
> +static atomic_t inflight_rc = ATOMIC_INIT(0);
>
>  /*
>   * Size of the buffer for memcg path names. Ignoring stack trace support,
> @@ -46,9 +62,14 @@ int trace_mmap_lock_reg(void)
> unsigned long flags;
> int cpu;
>
> +   /*
> +* Serialize _reg() and _unreg(). Without this, e.g. _unreg() might
> +* start cleaning up while _reg() is only partially completed.
> +*/
> spin_lock_irqsave(®_lock, flags);
>
> -   if (reg_refcount++)
> +   /* If the refcount is going 0->1, proceed with allocating buffers. */
> +   if (reg_types_rc++)
> goto out;
>
> for_each_possible_cpu(cpu) {
> @@ -62,6 +83,11 @@ int trace_mmap_lock_reg(void)
> per_cpu(memcg_path_buf_idx, cpu) = 0;
> }
>
> +   /* Reset unreg_started flag, allowing new trace events. */
> +   WRITE_ONCE(unreg_started, false);
> +   /* Add the registration +1 to the inflight refcount. */
> +   atomic_inc(&inflight_rc);
> +
>  out:
> spin_un
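
The quoted patch is truncated by the archive before the _unreg() side, so here is a rough sketch of the teardown the changelog describes (the buffer names and exact structure are assumptions based on the _reg() hunk above, not the literal patch): stop handing out buffers, drop the registration reference, spin until in-flight users finish, then free.

static void trace_mmap_lock_unreg(void)
{
	unsigned long flags;
	int cpu;

	spin_lock_irqsave(&reg_lock, flags);

	/* Only tear down when the last event type unregisters. */
	if (--reg_types_rc)
		goto out;

	/* Block new trace events from taking buffer references. */
	WRITE_ONCE(unreg_started, true);

	/* Drop the registration +1 and wait for in-flight users to drain. */
	atomic_dec(&inflight_rc);
	while (atomic_read(&inflight_rc))
		cpu_relax();

	for_each_possible_cpu(cpu) {
		kfree(per_cpu(memcg_path_buf, cpu)); /* assumed buffer name */
		per_cpu(memcg_path_buf, cpu) = NULL;
	}

out:
	spin_unlock_irqrestore(&reg_lock, flags);
}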

[PATCH v2] mm: memcontrol: account pagetables per node

2020-11-23 Thread Shakeel Butt
For many workloads, pagetable consumption is significant, and it makes
sense to expose it in memory.stat for the memory cgroups. However, at the
moment the pagetables are accounted per-zone. Converting them to per-node
and using the right interface will correctly account for the memory
cgroups as well.

Signed-off-by: Shakeel Butt 
---
Changes since v1:
- Tried to fix linking errors for m68k and nds32


 Documentation/admin-guide/cgroup-v2.rst |  3 +++
 arch/m68k/include/asm/mcf_pgalloc.h |  1 +
 arch/nds32/mm/mm-nds32.c|  7 ---
 drivers/base/node.c |  2 +-
 fs/proc/meminfo.c   |  2 +-
 include/linux/mm.h  | 11 +++
 include/linux/mmzone.h  |  2 +-
 mm/memcontrol.c |  1 +
 mm/page_alloc.c |  6 +++---
 9 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 515bb13084a0..63521cd36ce5 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1274,6 +1274,9 @@ PAGE_SIZE multiple when read back.
  kernel_stack
Amount of memory allocated to kernel stacks.
 
+ pagetables
+Amount of memory allocated for page tables.
+
  percpu(npn)
Amount of memory used for storing per-cpu kernel
data structures.
diff --git a/arch/m68k/include/asm/mcf_pgalloc.h 
b/arch/m68k/include/asm/mcf_pgalloc.h
index bc1228e00518..9088812c2174 100644
--- a/arch/m68k/include/asm/mcf_pgalloc.h
+++ b/arch/m68k/include/asm/mcf_pgalloc.h
@@ -4,6 +4,7 @@
 
 #include 
 #include 
+#include 
 
 extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
diff --git a/arch/nds32/mm/mm-nds32.c b/arch/nds32/mm/mm-nds32.c
index 55bec50ccc03..d18e59877503 100644
--- a/arch/nds32/mm/mm-nds32.c
+++ b/arch/nds32/mm/mm-nds32.c
@@ -5,6 +5,7 @@
 
 #define __HAVE_ARCH_PGD_FREE
 #include 
+#include 
 
 #define FIRST_KERNEL_PGD_NR(USER_PTRS_PER_PGD)
 
@@ -34,8 +35,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
cpu_dcache_wb_range((unsigned long)new_pgd,
(unsigned long)new_pgd +
PTRS_PER_PGD * sizeof(pgd_t));
-   inc_zone_page_state(virt_to_page((unsigned long *)new_pgd),
-   NR_PAGETABLE);
+   inc_lruvec_page_state(virt_to_page((unsigned long *)new_pgd),
+ NR_PAGETABLE);
 
return new_pgd;
 }
@@ -59,7 +60,7 @@ void pgd_free(struct mm_struct *mm, pgd_t * pgd)
 
pte = pmd_page(*pmd);
pmd_clear(pmd);
-   dec_zone_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE);
+   dec_lruvec_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE);
pte_free(mm, pte);
mm_dec_nr_ptes(mm);
pmd_free(mm, pmd);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 6ffa470e2984..04f71c7bc3f8 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -450,7 +450,7 @@ static ssize_t node_read_meminfo(struct device *dev,
 #ifdef CONFIG_SHADOW_CALL_STACK
 nid, node_page_state(pgdat, NR_KERNEL_SCS_KB),
 #endif
-nid, K(sum_zone_node_page_state(nid, 
NR_PAGETABLE)),
+nid, K(node_page_state(pgdat, NR_PAGETABLE)),
 nid, 0UL,
 nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
 nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 887a5532e449..d6fc74619625 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -107,7 +107,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
   global_node_page_state(NR_KERNEL_SCS_KB));
 #endif
show_val_kb(m, "PageTables: ",
-   global_zone_page_state(NR_PAGETABLE));
+   global_node_page_state(NR_PAGETABLE));
 
show_val_kb(m, "NFS_Unstable:   ", 0);
show_val_kb(m, "Bounce: ",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7cfc4653dddf..15c8d082e32f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2177,12 +2177,15 @@ static inline void pgtable_init(void)
pgtable_cache_init();
 }
 
+static void inc_lruvec_page_state(struct page *page, enum node_stat_item idx);
+static void dec_lruvec_page_state(struct page *page, enum node_stat_item idx);
+
 static inline bool pgtable_pte_page_ctor(struct page *page)
 {
if (!ptlock_init(page))
return false;
__SetPageTable(page);
-   inc_zone_page_state(page, NR_PAGETABLE);
+   inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
 }
 
@@ -2190,7 +2193,7 @@ static inline void pgtable_pte_page_dtor(struct page 
*pag

Re: [PATCH v2] mm: memcg/slab: Fix return child memcg objcg for root memcg

2020-10-29 Thread Shakeel Butt
On Tue, Oct 27, 2020 at 8:50 PM Muchun Song  wrote:
>
> Consider the following memcg hierarchy.
>
> root
>/\
>   A  B
>
> If we get the objcg of memcg A failed,

Please fix the above statement.

> the get_obj_cgroup_from_current
> can return the wrong objcg for the root memcg.
>
> Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")
> Signed-off-by: Muchun Song 
> ---
>  changelog in v2:
>  1. Do not use a comparison with the root_mem_cgroup
>
>  mm/memcontrol.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 1337775b04f3..8c8b4c3ed5a0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2961,6 +2961,7 @@ __always_inline struct obj_cgroup 
> *get_obj_cgroup_from_current(void)
> objcg = rcu_dereference(memcg->objcg);
> if (objcg && obj_cgroup_tryget(objcg))
> break;
> +   objcg = NULL;

Roman, in your cleanup, are you planning to have objcg for root memcg as well?

> }
> rcu_read_unlock();
>
> --
> 2.20.1
>
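
For readers without the source at hand, the one-line fix sits in the memcg walk of get_obj_cgroup_from_current(); roughly (reconstructed from the context lines in the diff above, so treat the loop header as an approximation), the walk looks like the sketch below, and without the reset a failed tryget on a child leaves the stale child objcg to be returned once the loop terminates at the root.

	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
		objcg = rcu_dereference(memcg->objcg);
		if (objcg && obj_cgroup_tryget(objcg))
			break;
		/* Reset, so a failed tryget on e.g. memcg A cannot leak A's
		 * objcg out of the loop when the walk ends at the root. */
		objcg = NULL;
	}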


Re: [PATCH v2] mm: memcg/slab: Fix use after free in obj_cgroup_charge

2020-10-29 Thread Shakeel Butt
On Tue, Oct 27, 2020 at 8:51 PM Muchun Song  wrote:
>
> The rcu_read_lock/unlock can only guarantee that the memcg will
> not be freed, but it cannot guarantee the success of css_get on
> the memcg.
>
> If the whole process of a cgroup offlining completes between
> reading an objcg->memcg pointer and bumping the css reference on
> another CPU, and there are exactly 0 external references to this
> memory cgroup (how did we get to obj_cgroup_charge() then?),
> css_get() can change the ref counter from 0 back to 1.
>
> Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")
> Signed-off-by: Muchun Song 
> Acked-by: Roman Gushchin 

Reviewed-by: Shakeel Butt 


Re: [PATCH v2] mm: memcg/slab: Rename *_lruvec_slab_state to *_lruvec_kmem_state

2020-10-29 Thread Shakeel Butt
On Tue, Oct 27, 2020 at 8:51 PM Muchun Song  wrote:
>
> The *_lruvec_slab_state functions are also suitable for pages allocated from
> the buddy allocator, not just for slab objects. But the function name seems
> to tell us that only slab objects are applicable. So we can rename the
> keyword slab to kmem.
>
> Signed-off-by: Muchun Song 
> Acked-by: Roman Gushchin 

Reviewed-by: Shakeel Butt 


Re: [PATCH v2] mm: memcontrol: Simplify the mem_cgroup_page_lruvec

2020-10-29 Thread Shakeel Butt
On Thu, Oct 29, 2020 at 2:08 AM Michal Hocko  wrote:
>
> On Wed 28-10-20 11:50:13, Muchun Song wrote:
> [...]
> > -struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct 
> > pglist_data *pgdat)
> > +static struct lruvec *
> > +__mem_cgroup_node_lruvec(struct mem_cgroup *memcg, struct pglist_data 
> > *pgdat,
> > +  int nid)
>
> I thought I have made it clear that this is not a good approach. Please
> do not repost new version without that being addressed. If there are any
> questions then feel free to ask for details.

You can get nid from pgdat (pgdat->node_id) and also pgdat from nid
(NODE_DATA(nid)), so __mem_cgroup_node_lruvec() only needs one of them
as a parameter.
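
To spell out the point, a hypothetical shape of the helper with the redundant parameter dropped (illustrative only, not the posted patch); both conversions mentioned above are existing kernel helpers.

static struct lruvec *__mem_cgroup_node_lruvec(struct mem_cgroup *memcg,
					       struct pglist_data *pgdat)
{
	/* If a caller only has the nid, NODE_DATA(nid) recovers pgdat;
	 * conversely pgdat->node_id recovers the nid, so passing both
	 * would be redundant. */
	return mem_cgroup_lruvec(memcg, pgdat);
}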


Re: [External] Re: [PATCH v2] mm: memcg/slab: Fix return child memcg objcg for root memcg

2020-10-29 Thread Shakeel Butt
On Thu, Oct 29, 2020 at 9:09 AM Muchun Song  wrote:
>
> On Thu, Oct 29, 2020 at 11:48 PM Shakeel Butt  wrote:
> >
> > On Tue, Oct 27, 2020 at 8:50 PM Muchun Song  
> > wrote:
> > >
> > > Consider the following memcg hierarchy.
> > >
> > > root
> > >/\
> > >   A  B
> > >
> > > If we get the objcg of memcg A failed,
> >
> > Please fix the above statement.
>
> Sorry, could you be more specific, I don't quite understand.

Fix the grammar. Something like "If we failed to get the reference on
objcg of memcg A..."


Re: [PATCH v2] mm: memcg/slab: Fix return child memcg objcg for root memcg

2020-10-29 Thread Shakeel Butt
On Thu, Oct 29, 2020 at 10:10 AM Roman Gushchin  wrote:
>
> On Thu, Oct 29, 2020 at 08:48:45AM -0700, Shakeel Butt wrote:
> > On Tue, Oct 27, 2020 at 8:50 PM Muchun Song  
> > wrote:
> > >
> > > Consider the following memcg hierarchy.
> > >
> > > root
> > >/\
> > >   A  B
> > >
> > > If we get the objcg of memcg A failed,
> >
> > Please fix the above statement.
> >
> > > the get_obj_cgroup_from_current
> > > can return the wrong objcg for the root memcg.
> > >
> > > Fixes: bf4f059954dc ("mm: memcg/slab: obj_cgroup API")
> > > Signed-off-by: Muchun Song 
> > > ---
> > >  changelog in v2:
> > >  1. Do not use a comparison with the root_mem_cgroup
> > >
> > >  mm/memcontrol.c | 1 +
> > >  1 file changed, 1 insertion(+)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 1337775b04f3..8c8b4c3ed5a0 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -2961,6 +2961,7 @@ __always_inline struct obj_cgroup 
> > > *get_obj_cgroup_from_current(void)
> > > objcg = rcu_dereference(memcg->objcg);
> > > if (objcg && obj_cgroup_tryget(objcg))
> > > break;
> > > +   objcg = NULL;
> >
> > Roman, in your cleanup, are you planning to have objcg for root memcg as 
> > well?
>
> Yes. I'll just change the for loop to include the root_mem_cgroup.
>

Then do we really need this patch since it's not tagged for stable?


Re: [PATCH rfc 1/3] mm: memcg: deprecate the non-hierarchical mode

2020-11-04 Thread Shakeel Butt
On Tue, Nov 3, 2020 at 1:27 PM Roman Gushchin  wrote:
>
> The non-hierarchical cgroup v1 mode is a legacy of early days
> of the memory controller and doesn't bring any value today.
> However, it complicates the code and creates many edge cases
> all over the memory controller code.
>
> It's a good time to deprecate it completely.
>
> Functionally, this patch enables the hierarchical mode by default for all
> cgroups and forbids switching it off. Nothing changes if cgroup v2 is used:
> hierarchical mode has been enforced there from the start.
>
> To protect the ABI memory.use_hierarchy interface is preserved
> with a limited functionality: reading always returns "1", writing
> of "1" passes silently, writing of any other value fails with
> -EINVAL and a warning to dmesg (on the first occasion).
>
> Signed-off-by: Roman Gushchin 

Reviewed-by: Shakeel Butt 


Re: [PATCH rfc 2/3] docs: cgroup-v1: reflect the deprecation of the non-hierarchical mode

2020-11-04 Thread Shakeel Butt
On Tue, Nov 3, 2020 at 1:27 PM Roman Gushchin  wrote:
>
> Update cgroup v1 docs after the deprecation of the non-hierarchical
> mode of the memory controller.
>
> Signed-off-by: Roman Gushchin 

Reviewed-by: Shakeel Butt 


Re: [PATCH rfc 3/3] cgroup: remove obsoleted broken_hierarchy and warned_broken_hierarchy

2020-11-04 Thread Shakeel Butt
On Tue, Nov 3, 2020 at 1:27 PM Roman Gushchin  wrote:
>
> With the deprecation of the non-hierarchical mode of the memory
> controller there are no more examples of broken hierarchies left.
>
> Let's remove the cgroup core code which was supposed to print
> warnings about the creation of broken hierarchies.
>
> Signed-off-by: Roman Gushchin 

Reviewed-by: Shakeel Butt 


Re: Machine lockups on extreme memory pressure

2020-10-30 Thread Shakeel Butt
On Tue, Sep 22, 2020 at 10:01 AM Michal Hocko  wrote:
>
> On Tue 22-09-20 09:51:30, Shakeel Butt wrote:
> > On Tue, Sep 22, 2020 at 9:34 AM Michal Hocko  wrote:
> > >
> > > On Tue 22-09-20 09:29:48, Shakeel Butt wrote:
> [...]
> > > > Anyways, what do you think of the in-kernel PSI based
> > > > oom-kill trigger. I think Johannes had a prototype as well.
> > >
> > > We have talked about something like that in the past and established
> > > that auto tuning for oom killer based on PSI is almost impossible to get
> > > right for all potential workloads, and so this belongs to userspace.
> > > The kernel's oom killer is there as a last resort when the system gets
> > > close to meltdown.
> >
> > The system is already in a meltdown state from the user's perspective. I
> > still think allowing the users to optionally set the oom-kill trigger
> > based on PSI makes sense. Something like 'if all processes on the
> > system are stuck for 60 sec, trigger oom-killer'.
>
> We already do have watchdogs for that, no? If you cannot really schedule
> anything, then the soft lockup detector should fire. In a meltdown state
> like that, a reboot is likely the best way forward anyway.

Yes, the soft lockup detector can catch this situation, but I still think
we can do better than panic/reboot.
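
As a concrete illustration of the userspace direction Michal mentions, here is a minimal sketch of a PSI-based memory watcher using the documented trigger interface of /proc/pressure/memory; the 500 ms / 1 s threshold is an arbitrary example, and a real policy daemon would pick and kill a victim where the printf is.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Notify when tasks are fully stalled for 500 ms within any 1 s window. */
	const char trig[] = "full 500000 1000000";
	struct pollfd fds;

	fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fds.fd < 0 || write(fds.fd, trig, strlen(trig) + 1) < 0) {
		perror("psi trigger");
		return 1;
	}
	fds.events = POLLPRI;

	while (poll(&fds, 1, -1) > 0) {
		if (fds.revents & POLLERR)
			break;		/* trigger file went away */
		if (fds.revents & POLLPRI)
			printf("memory pressure threshold crossed\n");
	}
	close(fds.fd);
	return 0;
}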

Anyways, I think we now know the reason for this extreme pressure, and
I just wanted to share it in case someone else is facing a similar
situation.

There were several thousand TCP delayed ACKs queued on the system. The
system was under memory pressure, and the alloc_skb(GFP_ATOMIC) calls for
the delayed ACKs were either stealing from reclaimers or failing. For the
delayed ACKs whose allocation failed, the kernel rescheduled them
indefinitely, so these failing allocations kept the system in this lockup
state for hours. The commit a37c2134bed6 ("tcp: add exponential backoff
in __tcp_send_ack()") recently added the fix for this situation.
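
The shape of that fix, paraphrased as a generic sketch rather than the literal commit (the counter and helper names below are made up for illustration): when the GFP_ATOMIC allocation for a delayed ACK fails, back off exponentially instead of re-arming the timer at the minimum interval forever.

	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC | __GFP_NOWARN);
	if (unlikely(!skb)) {
		/* Hypothetical per-socket state: the delay doubles on each
		 * consecutive failure, capped at some maximum. */
		unsigned long delay = base_delay << ack_retry_count;

		if (delay < max_ack_delay)
			ack_retry_count++;
		reschedule_delayed_ack(sk, delay);	/* made-up helper */
		return;
	}
	ack_retry_count = 0;	/* a successful allocation resets the backoff */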


Re: 回复:general protection fault in refill_obj_stock

2024-04-13 Thread Shakeel Butt
On Tue, Apr 02, 2024 at 09:50:54AM +0800, Ubisectech Sirius wrote:
> > On Mon, Apr 01, 2024 at 03:04:46PM +0800, Ubisectech Sirius wrote:
> > Hello.
> > We are the Ubisectech Sirius Team, the vulnerability lab of China ValiantSec.
> > Recently, our team discovered an issue in Linux kernel 6.7. Attached to
> > the email is a PoC file for the issue.
> 
> > Thank you for the report!
> 
> > I tried to compile and run your test program for about half an hour
> > on a virtual machine running 6.7 with enabled KASAN, but wasn't able
> > to reproduce the problem.
> 
> > Can you please share a bit more information? How long does it take
> > to reproduce? Do you mind sharing your kernel config? Is there anything
> > special about your setup? What are the exact steps to reproduce the problem?
> > Is this problem reproducible on 6.6?
> 
> Hi.
>The .config of Linux kernel 6.7 has been sent to you as an attachment. And the
> problem is reproducible on 6.6.

Can you please share the reproducer of this issue?

> 
> > It's interesting that the problem looks like a use-after-free on the objcg
> > pointer but happens in the context of udev-systemd, which I believe should
> > be fairly stable, and its cgroup is not going anywhere.
> 
> > Thanks!
> 





Re: [PATCH net-next v3 2/5] mm: add a signature in struct page

2021-04-10 Thread Shakeel Butt
On Sat, Apr 10, 2021 at 9:16 AM Ilias Apalodimas
 wrote:
>
> Hi Matthew
>
> On Sat, Apr 10, 2021 at 04:48:24PM +0100, Matthew Wilcox wrote:
> > On Sat, Apr 10, 2021 at 12:37:58AM +0200, Matteo Croce wrote:
> > > This is needed by the page_pool to avoid recycling a page not allocated
> > > via page_pool.
> >
> > Is the PageType mechanism more appropriate to your needs?  It wouldn't
> > be if you use page->_mapcount (ie mapping it to userspace).
>
> Interesting!
> Please keep in mind this was written ~2018 and was stale on my branches for
> quite some time.  So back then I did try to use PageType, but had no free
> bits.  Looking at it again though, it's cleaned up.  So yes, I think this can
> be much, much cleaner.  Should we go and define a new PG_pagepool?
>
>

Can this page_pool be used for TCP RX zerocopy? If yes, then PageType
cannot be used.

There is a recent discussion [1] on memcg accounting of TCP RX
zerocopy and I am wondering if this work can somehow help in that
regard. I will take a look at the series.

[1] 
https://lore.kernel.org/linux-mm/20210316013003.25271-1-arjunroy.k...@gmail.com/


Re: [RFC PATCH v2 01/18] mm: memcontrol: fix page charging in page replacement

2021-04-10 Thread Shakeel Butt
On Fri, Apr 9, 2021 at 5:32 AM Muchun Song  wrote:
>
> The pages aren't accounted at the root level, so do not charge the page
> to the root memcg in page replacement. Since we do not display the
> value (mem_cgroup_usage), there shouldn't be any actual problem, but
> there is a WARN_ON_ONCE in page_counter_cancel(). Who knows if it
> will trigger? So it is better to fix it.
>
> Signed-off-by: Muchun Song 
> Acked-by: Johannes Weiner 

Reviewed-by: Shakeel Butt 


Re: [RFC PATCH v2 02/18] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

2021-04-10 Thread Shakeel Butt
On Fri, Apr 9, 2021 at 5:32 AM Muchun Song  wrote:
>
> When mm is NULL, we do not need to hold the rcu lock and call css_tryget for
> the root memcg. We also do not need to check !mm in every iteration of the
> while loop. So bail out early when !mm.
>
> Signed-off-by: Muchun Song 
> Acked-by: Johannes Weiner 

Reviewed-by: Shakeel Butt 


Re: [External] Re: [RFC PATCH v2 09/18] mm: vmscan: remove noinline_for_stack

2021-04-10 Thread Shakeel Butt
On Fri, Apr 9, 2021 at 9:34 PM Muchun Song  wrote:
>
> On Sat, Apr 10, 2021 at 2:31 AM Johannes Weiner  wrote:
> >
> > On Fri, Apr 09, 2021 at 08:29:50PM +0800, Muchun Song wrote:
> > > The noinline_for_stack was introduced by commit 666356297ec4 ("vmscan:
> > > set up pagevec as late as possible in shrink_inactive_list()"); its
> > > purpose was to delay the allocation of the pagevec as late as possible
> > > to save stack memory. But commit 2bcf88796381 ("mm: take pagevecs off
> > > reclaim stack") replaced pagevecs with lists of pages_to_free. So we no
> > > longer need noinline_for_stack; just remove it (let the compiler decide
> > > whether to inline).
> > >
> > > Signed-off-by: Muchun Song 
> >
> > Good catch.
> >
> > Acked-by: Johannes Weiner 
> >
> > Since this patch is somewhat independent of the rest of the series,
> > you may want to put it in the very beginning, or even submit it
> > separately, to keep the main series as compact as possible. Reviewers
> > can be more hesitant to get involved with larger series ;)
>
> OK. I will gather all the cleanup patches into a separate series.
> Thanks for your suggestion.

That would be best.

For this patch:

Reviewed-by: Shakeel Butt 


Re: [RFC PATCH v1 00/11] Manage the top tier memory in a tiered memory

2021-04-12 Thread Shakeel Butt
On Thu, Apr 8, 2021 at 4:52 AM Michal Hocko  wrote:
>
[...]
>
> What I am trying to say (and I have brought that up when demotion has been
> discussed at LSFMM) is that the implementation shouldn't be PMEM aware.
> The specific technology shouldn't be imprinted into the interface.
> Fundamentally you are trying to balance memory among NUMA nodes as we do
> not have other abstraction to use. So rather than talking about top,
> secondary, nth tier we have different NUMA nodes with different
> characteristics and you want to express your "priorities" for them.
>

I am also inclined towards the NUMA-based approach. It makes the solution
more general, and even existing systems with multiple NUMA nodes and
DRAM can take advantage of this approach (if it makes sense).


Re: [RFC PATCH v1 00/11] Manage the top tier memory in a tiered memory

2021-04-12 Thread Shakeel Butt
On Thu, Apr 8, 2021 at 1:50 PM Yang Shi  wrote:
>
[...]

> >
> > The low and min limits have semantics similar to the v1's soft limit
> > for this situation i.e. letting the low priority job occupy top tier
> > memory and depending on reclaim to take back the excess top tier
> > memory use of such jobs.
>
> I don't get why low priority jobs can *not* use top tier memory?

I am saying low priority jobs can use top tier memory. The only
difference is to limit them upfront (using limits) or reclaim from
them later (using min/low/soft-limit).

> I can
> see that it may incur latency overhead for high priority jobs. If it
> is not allowed, it could be restricted by cpuset without introducing
> any new interfaces.
>
> I suppose the memory utilization could be maximized by allowing all
> jobs to allocate memory from all applicable nodes, and then letting the
> reclaimer (or something new if needed)

Most probably something new as we do want to consider unevictable
memory as well.

> do the job of migrating the memory to the proper
> nodes over time. We could achieve some kind of balance between memory
> utilization and resource isolation.
>

The tradeoff between utilization and isolation should be decided by the user/admin.


Re: [PATCH 2/5] mm/memcg: Introduce obj_cgroup_uncharge_mod_state()

2021-04-12 Thread Shakeel Butt
On Fri, Apr 9, 2021 at 4:19 PM Waiman Long  wrote:
>
> In memcg_slab_free_hook()/pcpu_memcg_free_hook(), obj_cgroup_uncharge()
> is followed by mod_objcg_state()/mod_memcg_state(). Each of these
> function calls goes through a separate irq_save/irq_restore cycle. That
> is inefficient.  Introduce a new function obj_cgroup_uncharge_mod_state()
> that combines them with a single irq_save/irq_restore cycle.
>
> Signed-off-by: Waiman Long 

Reviewed-by: Shakeel Butt 
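
A sketch of the pattern the changelog describes, with placeholder __-prefixed inner helpers standing in for the non-irq-saving variants of the two operations (not necessarily the names used in the actual patch): both steps now run under a single irq_save/irq_restore instead of two.

static inline void obj_cgroup_uncharge_mod_state(struct obj_cgroup *objcg,
						 size_t size,
						 struct pglist_data *pgdat,
						 enum node_stat_item idx)
{
	unsigned long flags;

	local_irq_save(flags);
	/* Placeholder irq-unsafe variants of obj_cgroup_uncharge() and
	 * mod_objcg_state(); one save/restore cycle instead of two. */
	__obj_cgroup_uncharge(objcg, size);
	__mod_objcg_state(objcg, pgdat, idx, -(int)size);
	local_irq_restore(flags);
}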


[tip: sched/core] psi: Reduce calls to sched_clock() in psi

2021-03-23 Thread tip-bot2 for Shakeel Butt
The following commit has been merged into the sched/core branch of tip:

Commit-ID: df77430639c9cf73559bac0f25084518bf9a812d
Gitweb:
https://git.kernel.org/tip/df77430639c9cf73559bac0f25084518bf9a812d
Author: Shakeel Butt 
AuthorDate: Sun, 21 Mar 2021 13:51:56 -07:00
Committer: Peter Zijlstra 
CommitterDate: Tue, 23 Mar 2021 16:01:58 +01:00

psi: Reduce calls to sched_clock() in psi

We noticed that the cost of psi increases with the number of cgroup
levels. In particular, the cost of cpu_clock() sticks out, as the kernel
calls it multiple times while traversing up the cgroup tree. This patch
reduces the number of calls to cpu_clock().
Ran perf bench on Intel Broadwell with 3 levels of cgroups.

Before the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

 Total time: 0.747 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

 Total time: 3.516 [sec]

   3.516689 usecs/op
 284358 ops/sec

After the patch:

$ perf bench sched all
 # Running sched/messaging benchmark...
 # 20 sender and receiver processes per group
 # 10 groups == 400 processes run

 Total time: 0.640 [sec]

 # Running sched/pipe benchmark...
 # Executed 1000000 pipe operations between two processes

 Total time: 3.329 [sec]

   3.329820 usecs/op
 300316 ops/sec

Signed-off-by: Shakeel Butt 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Johannes Weiner 
Link: https://lkml.kernel.org/r/20210321205156.4186483-1-shake...@google.com
---
 kernel/sched/psi.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index c8480d7..b1b00e9 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,12 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
 {
u32 delta;
-   u64 now;
 
-   now = cpu_clock(cpu);
delta = now - groupc->state_start;
groupc->state_start = now;
 
@@ -676,7 +674,7 @@ static void record_times(struct psi_group_cpu *groupc, int 
cpu)
 }
 
 static void psi_group_change(struct psi_group *group, int cpu,
-unsigned int clear, unsigned int set,
+unsigned int clear, unsigned int set, u64 now,
 bool wake_clock)
 {
struct psi_group_cpu *groupc;
@@ -696,7 +694,7 @@ static void psi_group_change(struct psi_group *group, int 
cpu,
 */
write_seqcount_begin(&groupc->seq);
 
-   record_times(groupc, cpu);
+   record_times(groupc, now);
 
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
@@ -788,12 +786,14 @@ void psi_task_change(struct task_struct *task, int clear, 
int set)
struct psi_group *group;
bool wake_clock = true;
void *iter = NULL;
+   u64 now;
 
if (!task->pid)
return;
 
psi_flags_change(task, clear, set);
 
+   now = cpu_clock(cpu);
/*
 * Periodic aggregation shuts off if there is a period of no
 * task changes, so we wake it back up if necessary. However,
@@ -806,7 +806,7 @@ void psi_task_change(struct task_struct *task, int clear, 
int set)
wake_clock = false;
 
while ((group = iterate_groups(task, &iter)))
-   psi_group_change(group, cpu, clear, set, wake_clock);
+   psi_group_change(group, cpu, clear, set, now, wake_clock);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -815,6 +815,7 @@ void psi_task_switch(struct task_struct *prev, struct 
task_struct *next,
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
void *iter;
+   u64 now = cpu_clock(cpu);
 
if (next->pid) {
bool identical_state;
@@ -836,7 +837,7 @@ void psi_task_switch(struct task_struct *prev, struct 
task_struct *next,
break;
}
 
-   psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+   psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
}
}
 
@@ -858,7 +859,7 @@ void psi_task_switch(struct task_struct *prev, struct 
task_struct *next,
 
iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
-   psi_group_change(group, cpu, clear, set, true);
+   psi_group_change(group, cpu, clear, set, now, true);
 
/*
 * TSK_ONCPU is handled up to the common ances
