[PATCH v2] mm: fix a race on nr_swap_pages

2020-12-03 Thread Zhaoyang Huang
In my system, "Free swap = -4kB" is reported when several get_swap_pages()
calls race with each other while show_swap_cache_info() runs simultaneously.
No lock is needed in get_swap_page_of_type() since the "pre-subtract/post-add"
pattern is removed there.

Process A                     Process B                     Process C
ngoals = 1                    ngoals = 1
avail = nr_swap_pages(1)      avail = nr_swap_pages(1)
nr_swap_pages(1) -= ngoals
                              nr_swap_pages(0) -= ngoals
                                                            nr_swap_pages = -1
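
For illustration only, the same check-then-subtract race can be reproduced in
plain user-space C (a hedged sketch; the counter and thread function below are
illustrative stand-ins, not the kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_long nr_swap_pages = 1;	/* models the global counter */

/*
 * Models get_swap_pages() before the fix: the check and the subtraction are
 * two separate steps, so two racing callers can both pass the check and drive
 * the counter negative.  The patch closes the window by holding
 * swap_avail_lock across both steps.
 */
static void *racy_alloc(void *unused)
{
	long avail = atomic_load(&nr_swap_pages);	/* both callers may see 1 */

	if (avail > 0)
		atomic_fetch_sub(&nr_swap_pages, 1);	/* 1 -> 0 -> -1 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, racy_alloc, NULL);
	pthread_create(&b, NULL, racy_alloc, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("nr_swap_pages may end up at %ld\n",
	       (long)atomic_load(&nr_swap_pages));
	return 0;
}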

Signed-off-by: Zhaoyang Huang 
---
change of v2: fix unpaired spin_lock/spin_unlock
---
---
 mm/swapfile.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index cf63b5f..1212f17 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -974,9 +974,13 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 	/* Only single cluster request supported */
 	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
 
+	spin_lock(&swap_avail_lock);
+
 	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
-	if (avail_pgs <= 0)
+	if (avail_pgs <= 0) {
+		spin_unlock(&swap_avail_lock);
 		goto noswap;
+	}
 
 	if (n_goal > SWAP_BATCH)
 		n_goal = SWAP_BATCH;
@@ -986,8 +990,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 
 	atomic_long_sub(n_goal * size, &nr_swap_pages);
 
-	spin_lock(&swap_avail_lock);
-
 start_over:
 	node = numa_node_id();
 	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
@@ -1061,14 +1063,13 @@ swp_entry_t get_swap_page_of_type(int type)
 
 	spin_lock(&si->lock);
 	if (si->flags & SWP_WRITEOK) {
-		atomic_long_dec(&nr_swap_pages);
 		/* This is called for allocating swap entry, not cache */
 		offset = scan_swap_map(si, 1);
 		if (offset) {
+			atomic_long_dec(&nr_swap_pages);
 			spin_unlock(&si->lock);
 			return swp_entry(type, offset);
 		}
-		atomic_long_inc(&nr_swap_pages);
 	}
 	spin_unlock(&si->lock);
 fail:
-- 
1.9.1



Re: [PATCH] mm: fix a race on nr_swap_pages

2020-12-03 Thread Zhaoyang Huang
It is show_swap_cache_info() which races with get_swap_xxx

On Thu, Dec 3, 2020 at 7:36 PM Zhaoyang Huang  wrote:
>
> The scenario on which "Free swap -4kB" happens in my system, which is caused 
> by
>  get_swap_page_of_type or get_swap_pages racing with show_mem. Remove the race
>  here.
>
> Signed-off-by: Zhaoyang Huang 
> ---
>  mm/swapfile.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index cf63b5f..13201b6 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -974,6 +974,8 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
>  	/* Only single cluster request supported */
>  	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
>
> +	spin_lock(&swap_avail_lock);
> +
>  	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
>  	if (avail_pgs <= 0)
>  		goto noswap;
> @@ -986,8 +988,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
>
>  	atomic_long_sub(n_goal * size, &nr_swap_pages);
>
> -	spin_lock(&swap_avail_lock);
> -
>  start_over:
>  	node = numa_node_id();
>  	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
> @@ -1061,14 +1061,13 @@ swp_entry_t get_swap_page_of_type(int type)
>
>  	spin_lock(&si->lock);
>  	if (si->flags & SWP_WRITEOK) {
> -		atomic_long_dec(&nr_swap_pages);
>  		/* This is called for allocating swap entry, not cache */
>  		offset = scan_swap_map(si, 1);
>  		if (offset) {
> +			atomic_long_dec(&nr_swap_pages);
>  			spin_unlock(&si->lock);
>  			return swp_entry(type, offset);
>  		}
> -		atomic_long_inc(&nr_swap_pages);
>  	}
>  	spin_unlock(&si->lock);
>  fail:
> --
> 1.9.1
>


[PATCH] mm: fix a race on nr_swap_pages

2020-12-03 Thread Zhaoyang Huang
In my system, "Free swap -4kB" is reported when get_swap_page_of_type() or
get_swap_pages() races with show_mem(). Remove the race here.

Signed-off-by: Zhaoyang Huang 
---
 mm/swapfile.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index cf63b5f..13201b6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -974,6 +974,8 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 	/* Only single cluster request supported */
 	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
 
+	spin_lock(&swap_avail_lock);
+
 	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
 	if (avail_pgs <= 0)
 		goto noswap;
@@ -986,8 +988,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 
 	atomic_long_sub(n_goal * size, &nr_swap_pages);
 
-	spin_lock(&swap_avail_lock);
-
 start_over:
 	node = numa_node_id();
 	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
@@ -1061,14 +1061,13 @@ swp_entry_t get_swap_page_of_type(int type)
 
 	spin_lock(&si->lock);
 	if (si->flags & SWP_WRITEOK) {
-		atomic_long_dec(&nr_swap_pages);
 		/* This is called for allocating swap entry, not cache */
 		offset = scan_swap_map(si, 1);
 		if (offset) {
+			atomic_long_dec(&nr_swap_pages);
 			spin_unlock(&si->lock);
 			return swp_entry(type, offset);
 		}
-		atomic_long_inc(&nr_swap_pages);
 	}
 	spin_unlock(&si->lock);
 fail:
-- 
1.9.1



[RFC PATCH] mm: bail out from psi memstall when cond_resched

2020-11-17 Thread Zhaoyang Huang
Memory reclaim can run for several seconds on a memory-constrained system,
which is accounted as a heavy memstall. Make the memstall accounting more
precise by bailing out of the stall section around cond_resched().
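
A rough conceptual sketch of the intent (memstall_pause()/memstall_resume()
here are hypothetical placeholders, not the PSI API used by the patch below):

/*
 * Conceptual sketch only: reclaim runs inside a memstall section, but any
 * time given away by cond_resched() should not be charged as a stall.
 * memstall_pause()/memstall_resume() are hypothetical stand-ins for
 * "stop/restart memstall accounting for the current task".
 */
static inline void psi_cond_resched(void)
{
	bool was_memstall = current->flags & PF_MEMSTALL;

	if (was_memstall)
		memstall_pause(current);	/* stop the stall clock */
	cond_resched();				/* may sleep for a while */
	if (was_memstall)
		memstall_resume(current);	/* back to reclaim, resume accounting */
}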

Signed-off-by: Zhaoyang Huang 
---
 mm/vmscan.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index a815f73..a083c85 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -316,6 +316,15 @@ static inline bool memcg_congested(struct pglist_data 
*pgdat,
 }
 #endif
 
+static inline void psi_cond_resched(void)
+{
+   unsigned long *flags;
+
+   if (current->flags & PF_MEMSTALL)
+   psi_memstall_leave();
+   cond_resched();
+   psi_memstall_enter();
+}
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -557,7 +566,7 @@ static unsigned long do_shrink_slab(struct shrink_control 
*shrinkctl,
total_scan -= shrinkctl->nr_scanned;
scanned += shrinkctl->nr_scanned;
 
-   cond_resched();
+   psi_cond_resched();
}
 
if (next_deferred >= scanned)
@@ -714,7 +723,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 
 	up_read(&shrinker_rwsem);
 out:
-   cond_resched();
+   psi_cond_resched();
return freed;
 }
 
@@ -1109,7 +1118,7 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
unsigned nr_reclaimed = 0;
 
memset(stat, 0, sizeof(*stat));
-   cond_resched();
+   psi_cond_resched();
 
while (!list_empty(page_list)) {
struct address_space *mapping;
@@ -1118,7 +1127,7 @@ static unsigned long shrink_page_list(struct list_head 
*page_list,
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
 
-   cond_resched();
+   psi_cond_resched();
 
page = lru_to_page(page_list);
 		list_del(&page->lru);
@@ -2084,7 +2093,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	while (!list_empty(&l_hold)) {
-		cond_resched();
+		psi_cond_resched();
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 
@@ -2500,7 +2509,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, 
struct mem_cgroup *memc
}
}
 
-   cond_resched();
+   psi_cond_resched();
 
if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
continue;
@@ -4149,7 +4158,7 @@ static int __node_reclaim(struct pglist_data *pgdat, 
gfp_t gfp_mask, unsigned in
.reclaim_idx = gfp_zone(gfp_mask),
};
 
-   cond_resched();
+   psi_cond_resched();
fs_reclaim_acquire(sc.gfp_mask);
/*
 * We need to be able to allocate from the reserves for RECLAIM_UNMAP
-- 
1.9.1



Re: [PATCH v2] mm : sync ra->ra_pages with bdi->ra_pages

2020-08-24 Thread Zhaoyang Huang
On Fri, Aug 21, 2020 at 7:57 PM Matthew Wilcox  wrote:
>
> On Fri, Aug 21, 2020 at 05:31:52PM +0800, Zhaoyang Huang wrote:
> > This patch has been verified on an android system and reduces 15% of
> > UNITERRUPTIBLE_SLEEP_BLOCKIO which was used to be caused by wrong
> > ra->ra_pages.
>
> Wait, what?  Readahead doesn't sleep on the pages it's requesting.
> Unless ... your file access pattern is random, so you end up submitting
> a readahead I/O that's bigger than needed, so takes longer for the page
> you actually wanted to be returned.  I know we have the LOTSAMISS
> logic, but that's not really enough.
>
> OK, assuming this problem is really about sync mmap (ie executables),
> this makes a bit more sense.  I think the real problem is here:
>
> ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
> ra->size = ra->ra_pages;
> ra->async_size = ra->ra_pages / 4;
> ra_submit(ra, mapping, file);
>
> which actually skips all the logic we have in ondemand_readahead()
> for adjusting the readahead size.  Ugh, this is a mess.
>
> I think a quick fix to your problem will be just replacing ra->ra_pages
> with bdi->ra_pages in do_sync_mmap_readahead() and leaving ra->ra_pages
> alone everywhere else.
>
We can't simply sync ra->ra_pages with bdi->ra_pages, because EIO handling
and fadvise() shrink or boost it; that is why this commit introduces
seq_read_fact.

> We need a smarter readahead algorithm for mmap'ed files, and I don't have
> time to work on it right now.  So let's stick to the same dumb algorithm,
> but make it responsive to bdi ra_pages being reset.


Re: [PATCH v2] mm : sync ra->ra_pages with bdi->ra_pages

2020-08-21 Thread Zhaoyang Huang
On Fri, Aug 21, 2020 at 7:57 PM Matthew Wilcox  wrote:
>
> On Fri, Aug 21, 2020 at 05:31:52PM +0800, Zhaoyang Huang wrote:
> > This patch has been verified on an android system and reduces 15% of
> > UNITERRUPTIBLE_SLEEP_BLOCKIO which was used to be caused by wrong
> > ra->ra_pages.
>
> Wait, what?  Readahead doesn't sleep on the pages it's requesting.
> Unless ... your file access pattern is random, so you end up submitting
> a readahead I/O that's bigger than needed, so takes longer for the page
> you actually wanted to be returned.  I know we have the LOTSAMISS
> logic, but that's not really enough.
Actually, an async read that hits the readahead marker will also issue a
huge amount of read I/O because of the wrong ra->ra_pages, which is used as
req_size by ondemand_readahead().
>
> OK, assuming this problem is really about sync mmap (ie executables),
> this makes a bit more sense.  I think the real problem is here:
>
> ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
> ra->size = ra->ra_pages;
> ra->async_size = ra->ra_pages / 4;
> ra_submit(ra, mapping, file);
>
> which actually skips all the logic we have in ondemand_readahead()
> for adjusting the readahead size.  Ugh, this is a mess.
>
> I think a quick fix to your problem will be just replacing ra->ra_pages
> with bdi->ra_pages in do_sync_mmap_readahead() and leaving ra->ra_pages
> alone everywhere else.
>
> We need a smarter readahead algorithm for mmap'ed files, and I don't have
> time to work on it right now.  So let's stick to the same dumb algorithm,
> but make it responsive to bdi ra_pages being reset.


Re: [PATCH v2] mm : sync ra->ra_pages with bdi->ra_pages

2020-08-21 Thread Zhaoyang Huang
On Fri, Aug 21, 2020 at 5:24 PM Zhaoyang Huang  wrote:
>
> Some system(like android) will turbo read during startup via expanding the
> readahead window and then set it back to normal(128kb as usual). However, some
> files in the system process context will keep to be opened since it is opened
> up and has no chance to sync with the updated value as it is almost impossible
> to change the files attached to the inode(processes are unaware of these 
> things)
>
> We sync ra->ra_pages with bdi->ra_pages when read. Furthermore, in 
> consideration
> of the scenario of eio and fadvise(...,POSIX_FADV_SEQUENTIAL).We introduce a
> seq_read_fact to record the factors of above two cases.
>
> Signed-off-by: Zhaoyang Huang 
> ---
> change from v2:
> fix checkpatch error
> ---
> ---
>  include/linux/fs.h | 17 +
>  mm/fadvise.c   |  4 +++-
>  mm/filemap.c   | 19 +--
>  mm/readahead.c | 37 +
>  4 files changed, 66 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index dd28e76..e3cdc5a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -66,6 +66,7 @@
>  struct fscrypt_operations;
>  struct fs_context;
>  struct fs_parameter_description;
> +struct file_ra_state;
>
>  extern void __init inode_init(void);
>  extern void __init inode_init_early(void);
> @@ -81,6 +82,7 @@
>  extern int sysctl_protected_hardlinks;
>  extern int sysctl_protected_fifos;
>  extern int sysctl_protected_regular;
> +extern void ra_pages_sync(struct file_ra_state *ra, struct address_space 
> *mapping);
>
>  typedef __kernel_rwf_t rwf_t;
>
> @@ -900,11 +902,26 @@ struct file_ra_state {
>there are only # of pages ahead */
>
> unsigned int ra_pages;  /* Maximum readahead window */
> +   int seq_read_fact;  /* turbo factor of sequential read */
> unsigned int mmap_miss; /* Cache miss stat for mmap accesses 
> */
> loff_t prev_pos;/* Cache last read() position */
>  };
>
>  /*
> + * ra->seq_read_fact == -1 indicates eio happens
> + */
> +#define RA_PAGES(ra)   \
> +({ \
> +   unsigned int ra_pages;  \
> +   if (ra->seq_read_fact != -1)\
> +   ra_pages = ra->ra_pages * ra->seq_read_fact;\
> +   else\
> +   ra_pages = ra->ra_pages;\
> +   ra_pages;   \
> +})
> +
> +
> +/*
>   * Check if @index falls in the readahead windows.
>   */
>  static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
> diff --git a/mm/fadvise.c b/mm/fadvise.c
> index 467bcd0..b06e3ca 100644
> --- a/mm/fadvise.c
> +++ b/mm/fadvise.c
> @@ -78,6 +78,7 @@ static int generic_fadvise(struct file *file, loff_t 
> offset, loff_t len,
> switch (advice) {
> case POSIX_FADV_NORMAL:
> file->f_ra.ra_pages = bdi->ra_pages;
> +   file->f_ra.seq_read_fact = 1;
> 		spin_lock(&file->f_lock);
> 		file->f_mode &= ~FMODE_RANDOM;
> 		spin_unlock(&file->f_lock);
> @@ -88,7 +89,8 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
> 		spin_unlock(&file->f_lock);
> 		break;
> 	case POSIX_FADV_SEQUENTIAL:
> -		file->f_ra.ra_pages = bdi->ra_pages * 2;
> +		file->f_ra.ra_pages = bdi->ra_pages;
> +		file->f_ra.seq_read_fact = 2;
> 		spin_lock(&file->f_lock);
> 		file->f_mode &= ~FMODE_RANDOM;
> 		spin_unlock(&file->f_lock);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index d78f577..425d2a2 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2048,6 +2048,7 @@ unsigned find_get_entries_tag(struct address_space 
> *mapping, pgoff_t start,
>  static void shrink_readahead_size_eio(struct file *filp,
> struct file_ra_state *ra)
>  {
> +   ra->seq_read_fact = -1;
> ra->ra_pages /= 4;
>  }
>
> @@ -2473,13 +2474,16 @@ static struct file *do_sync_mmap_readahead(struct 
> vm_fault *vmf)
> /* If we don't want any read-ahead, don't bother */
> if (vmf->vma->vm_flags & VM_RAND_READ)
> return fpin;
> -   if (!ra->ra_pages)
> +   if (!RA_PAGES(ra))
> return fpin;
>
> +   /* sync ra->r

[PATCH v2] mm : sync ra->ra_pages with bdi->ra_pages

2020-08-21 Thread Zhaoyang Huang
Some systems (like Android) boost reads during startup by expanding the
readahead window and then set it back to normal (128KB as usual). However,
files opened by long-lived system processes stay open from then on and never
pick up the updated value, since it is almost impossible to reopen the files
attached to those inodes (the processes are unaware of the change).

Sync ra->ra_pages with bdi->ra_pages at read time. Furthermore, to keep
handling the EIO and fadvise(..., POSIX_FADV_SEQUENTIAL) cases, introduce
seq_read_fact to record the factors of those two cases.
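
A standalone model of the intended arithmetic (illustration only; the struct
and helper names here are hypothetical — the real patch keeps the state in
struct file_ra_state and the RA_PAGES() macro below):

#include <stdio.h>

struct ra_model {		/* hypothetical mirror of the fields used here */
	unsigned int ra_pages;	/* synced from bdi->ra_pages at read time */
	int seq_read_fact;	/* 1 normal, 2 FADV_SEQUENTIAL, -1 after EIO */
};

static unsigned int effective_window(const struct ra_model *ra,
				     unsigned int bdi_ra_pages)
{
	if (ra->seq_read_fact == -1)	/* EIO: keep the shrunken window */
		return ra->ra_pages;
	return bdi_ra_pages * ra->seq_read_fact;
}

int main(void)
{
	struct ra_model ra = { .ra_pages = 32, .seq_read_fact = 2 };

	/* boot-time boost: read_ahead_kb raised, later reset to 128KB (32 pages) */
	printf("boosted:   %u pages\n", effective_window(&ra, 512));	/* 1024 */
	printf("reset:     %u pages\n", effective_window(&ra, 32));	/* 64 */

	ra.seq_read_fact = -1;		/* an I/O error shrank the window */
	ra.ra_pages = 8;
	printf("after EIO: %u pages\n", effective_window(&ra, 32));	/* 8 */
	return 0;
}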

Signed-off-by: Zhaoyang Huang 
---
change from v2:
fix checkpatch error
---
---
 include/linux/fs.h | 17 +
 mm/fadvise.c   |  4 +++-
 mm/filemap.c   | 19 +--
 mm/readahead.c | 37 +
 4 files changed, 66 insertions(+), 11 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd28e76..e3cdc5a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -66,6 +66,7 @@
 struct fscrypt_operations;
 struct fs_context;
 struct fs_parameter_description;
+struct file_ra_state;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -81,6 +82,7 @@
 extern int sysctl_protected_hardlinks;
 extern int sysctl_protected_fifos;
 extern int sysctl_protected_regular;
+extern void ra_pages_sync(struct file_ra_state *ra, struct address_space 
*mapping);
 
 typedef __kernel_rwf_t rwf_t;
 
@@ -900,11 +902,26 @@ struct file_ra_state {
   there are only # of pages ahead */
 
unsigned int ra_pages;  /* Maximum readahead window */
+   int seq_read_fact;  /* turbo factor of sequential read */
unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
loff_t prev_pos;/* Cache last read() position */
 };
 
 /*
+ * ra->seq_read_fact == -1 indicates eio happens
+ */
+#define RA_PAGES(ra)   \
+({ \
+   unsigned int ra_pages;  \
+   if (ra->seq_read_fact != -1)\
+   ra_pages = ra->ra_pages * ra->seq_read_fact;\
+   else\
+   ra_pages = ra->ra_pages;\
+   ra_pages;   \
+})
+
+
+/*
  * Check if @index falls in the readahead windows.
  */
 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd0..b06e3ca 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -78,6 +78,7 @@ static int generic_fadvise(struct file *file, loff_t offset, 
loff_t len,
switch (advice) {
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
+   file->f_ra.seq_read_fact = 1;
 		spin_lock(&file->f_lock);
 		file->f_mode &= ~FMODE_RANDOM;
 		spin_unlock(&file->f_lock);
@@ -88,7 +89,8 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_SEQUENTIAL:
-		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		file->f_ra.ra_pages = bdi->ra_pages;
+		file->f_ra.seq_read_fact = 2;
 		spin_lock(&file->f_lock);
 		file->f_mode &= ~FMODE_RANDOM;
 		spin_unlock(&file->f_lock);
diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577..425d2a2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2048,6 +2048,7 @@ unsigned find_get_entries_tag(struct address_space 
*mapping, pgoff_t start,
 static void shrink_readahead_size_eio(struct file *filp,
struct file_ra_state *ra)
 {
+   ra->seq_read_fact = -1;
ra->ra_pages /= 4;
 }
 
@@ -2473,13 +2474,16 @@ static struct file *do_sync_mmap_readahead(struct 
vm_fault *vmf)
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
-   if (!ra->ra_pages)
+   if (!RA_PAGES(ra))
return fpin;
 
+   /* sync ra->ra_pages with bdi->ra_pages*/
+   ra_pages_sync(ra, mapping);
+
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ RA_PAGES(ra));
return fpin;
}
 
@@ -2498,9 +2502,9 @@ static struct file *do_sync_mmap_readahead(struct 
vm_fault *vmf)
 * mmap read-around
 */
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-   ra->start = max_t(long, 0, offset - ra->ra_

Re: [PATCH] mm : sync ra->ra_pages with bdi->ra_pages

2020-08-19 Thread Zhaoyang Huang
On Fri, Aug 14, 2020 at 5:03 PM Zhaoyang Huang  wrote:
>
> Some system(like android) will turbo read during startup via expanding the
> readahead window and then set it back to normal(128kb as usual). However, some
> files in the system process context will keep to be opened since it is opened
> up and has no chance to sync with the updated value as it is almost impossible
> to change the files attached to the inode(processes are unaware of these 
> things)
>
> We sync ra->ra_pages with bdi->ra_pages when read. Furthermore, in 
> consideration
> of the scenario of eio and fadvise(...,POSIX_FADV_SEQUENTIAL).We introduce a
> seq_read_fact to record the factors of above two cases.
>
> Signed-off-by: Zhaoyang Huang 
> ---
>  include/linux/fs.h | 17 +
>  mm/fadvise.c   |  4 +++-
>  mm/filemap.c   | 19 +--
>  mm/readahead.c | 38 ++
>  4 files changed, 67 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index dd28e76..e3cdc5a 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -66,6 +66,7 @@
>  struct fscrypt_operations;
>  struct fs_context;
>  struct fs_parameter_description;
> +struct file_ra_state;
>
>  extern void __init inode_init(void);
>  extern void __init inode_init_early(void);
> @@ -81,6 +82,7 @@
>  extern int sysctl_protected_hardlinks;
>  extern int sysctl_protected_fifos;
>  extern int sysctl_protected_regular;
> +extern void ra_pages_sync(struct file_ra_state *ra, struct address_space 
> *mapping);
>
>  typedef __kernel_rwf_t rwf_t;
>
> @@ -900,11 +902,26 @@ struct file_ra_state {
>there are only # of pages ahead */
>
> unsigned int ra_pages;  /* Maximum readahead window */
> +   int seq_read_fact;  /* turbo factor of sequential read */
> unsigned int mmap_miss; /* Cache miss stat for mmap accesses 
> */
> loff_t prev_pos;/* Cache last read() position */
>  };
>
>  /*
> + * ra->seq_read_fact == -1 indicates eio happens
> + */
> +#define RA_PAGES(ra)   \
> +({ \
> +   unsigned int ra_pages;  \
> +   if (ra->seq_read_fact != -1)\
> +   ra_pages = ra->ra_pages * ra->seq_read_fact;\
> +   else\
> +   ra_pages = ra->ra_pages;\
> +   ra_pages;   \
> +})
> +
> +
> +/*
>   * Check if @index falls in the readahead windows.
>   */
>  static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
> diff --git a/mm/fadvise.c b/mm/fadvise.c
> index 467bcd0..b06e3ca 100644
> --- a/mm/fadvise.c
> +++ b/mm/fadvise.c
> @@ -78,6 +78,7 @@ static int generic_fadvise(struct file *file, loff_t 
> offset, loff_t len,
> switch (advice) {
> case POSIX_FADV_NORMAL:
> file->f_ra.ra_pages = bdi->ra_pages;
> +   file->f_ra.seq_read_fact = 1;
> 		spin_lock(&file->f_lock);
> 		file->f_mode &= ~FMODE_RANDOM;
> 		spin_unlock(&file->f_lock);
> @@ -88,7 +89,8 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
> 		spin_unlock(&file->f_lock);
> 		break;
> 	case POSIX_FADV_SEQUENTIAL:
> -		file->f_ra.ra_pages = bdi->ra_pages * 2;
> +		file->f_ra.ra_pages = bdi->ra_pages;
> +		file->f_ra.seq_read_fact = 2;
> 		spin_lock(&file->f_lock);
> 		file->f_mode &= ~FMODE_RANDOM;
> 		spin_unlock(&file->f_lock);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index d78f577..425d2a2 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2048,6 +2048,7 @@ unsigned find_get_entries_tag(struct address_space 
> *mapping, pgoff_t start,
>  static void shrink_readahead_size_eio(struct file *filp,
> struct file_ra_state *ra)
>  {
> +   ra->seq_read_fact = -1;
> ra->ra_pages /= 4;
>  }
>
> @@ -2473,13 +2474,16 @@ static struct file *do_sync_mmap_readahead(struct 
> vm_fault *vmf)
> /* If we don't want any read-ahead, don't bother */
> if (vmf->vma->vm_flags & VM_RAND_READ)
> return fpin;
> -   if (!ra->ra_pages)
> +   if (!RA_PAGES(ra))
> return fpin;
>
> +   /* sync ra->ra_pages with bdi->ra_pages*/
> +   ra_pages_sync(r

Re: [PATCH] mm : sync ra->ra_pages with bdi->ra_pages

2020-08-17 Thread Zhaoyang Huang
On Sat, Aug 15, 2020 at 12:15 PM Andrew Morton
 wrote:
>
> On Fri, 14 Aug 2020 13:10:34 -0700 Andrew Morton  
> wrote:
>
> > On Fri, 14 Aug 2020 17:03:44 +0800 Zhaoyang Huang  
> > wrote:
> >
> > > Some system(like android) will turbo read during startup via expanding the
> > > readahead window and then set it back to normal(128kb as usual). However, 
> > > some
> > > files in the system process context will keep to be opened since it is 
> > > opened
> > > up and has no chance to sync with the updated value as it is almost 
> > > impossible
> > > to change the files attached to the inode(processes are unaware of these 
> > > things)
> >
> > How about making VM_READAHEAD_PAGES a variable?
>
> Or make it settable in Kconfig?
I don't think so. The scenario I described is a dynamic, runtime process; it
can't be solved with a Kconfig option.


[PATCH] mm : sync ra->ra_pages with bdi->ra_pages

2020-08-14 Thread Zhaoyang Huang
Some systems (like Android) boost reads during startup by expanding the
readahead window and then set it back to normal (128KB as usual). However,
files opened by long-lived system processes stay open from then on and never
pick up the updated value, since it is almost impossible to reopen the files
attached to those inodes (the processes are unaware of the change).

Sync ra->ra_pages with bdi->ra_pages at read time. Furthermore, to keep
handling the EIO and fadvise(..., POSIX_FADV_SEQUENTIAL) cases, introduce
seq_read_fact to record the factors of those two cases.

Signed-off-by: Zhaoyang Huang 
---
 include/linux/fs.h | 17 +
 mm/fadvise.c   |  4 +++-
 mm/filemap.c   | 19 +--
 mm/readahead.c | 38 ++
 4 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd28e76..e3cdc5a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -66,6 +66,7 @@
 struct fscrypt_operations;
 struct fs_context;
 struct fs_parameter_description;
+struct file_ra_state;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -81,6 +82,7 @@
 extern int sysctl_protected_hardlinks;
 extern int sysctl_protected_fifos;
 extern int sysctl_protected_regular;
+extern void ra_pages_sync(struct file_ra_state *ra, struct address_space 
*mapping);
 
 typedef __kernel_rwf_t rwf_t;
 
@@ -900,11 +902,26 @@ struct file_ra_state {
   there are only # of pages ahead */
 
unsigned int ra_pages;  /* Maximum readahead window */
+   int seq_read_fact;  /* turbo factor of sequential read */
unsigned int mmap_miss; /* Cache miss stat for mmap accesses */
loff_t prev_pos;/* Cache last read() position */
 };
 
 /*
+ * ra->seq_read_fact == -1 indicates eio happens
+ */
+#define RA_PAGES(ra)   \
+({ \
+   unsigned int ra_pages;  \
+   if (ra->seq_read_fact != -1)\
+   ra_pages = ra->ra_pages * ra->seq_read_fact;\
+   else\
+   ra_pages = ra->ra_pages;\
+   ra_pages;   \
+})
+
+
+/*
  * Check if @index falls in the readahead windows.
  */
 static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd0..b06e3ca 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -78,6 +78,7 @@ static int generic_fadvise(struct file *file, loff_t offset, 
loff_t len,
switch (advice) {
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
+   file->f_ra.seq_read_fact = 1;
 		spin_lock(&file->f_lock);
 		file->f_mode &= ~FMODE_RANDOM;
 		spin_unlock(&file->f_lock);
@@ -88,7 +89,8 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
 		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_SEQUENTIAL:
-		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		file->f_ra.ra_pages = bdi->ra_pages;
+		file->f_ra.seq_read_fact = 2;
 		spin_lock(&file->f_lock);
 		file->f_mode &= ~FMODE_RANDOM;
 		spin_unlock(&file->f_lock);
diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577..425d2a2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2048,6 +2048,7 @@ unsigned find_get_entries_tag(struct address_space 
*mapping, pgoff_t start,
 static void shrink_readahead_size_eio(struct file *filp,
struct file_ra_state *ra)
 {
+   ra->seq_read_fact = -1;
ra->ra_pages /= 4;
 }
 
@@ -2473,13 +2474,16 @@ static struct file *do_sync_mmap_readahead(struct 
vm_fault *vmf)
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
-   if (!ra->ra_pages)
+   if (!RA_PAGES(ra))
return fpin;
 
+   /* sync ra->ra_pages with bdi->ra_pages*/
+   ra_pages_sync(ra, mapping);
+
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ RA_PAGES(ra));
return fpin;
}
 
@@ -2498,9 +2502,9 @@ static struct file *do_sync_mmap_readahead(struct 
vm_fault *vmf)
 * mmap read-around
 */
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-   ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
-   ra->size = ra->ra

Re: [PATCH] mm : update ra->ra_pages if it's NOT equal to bdi->ra_pages

2020-08-13 Thread Zhaoyang Huang
On Fri, Aug 14, 2020 at 10:33 AM Andrew Morton
 wrote:
>
> On Fri, 14 Aug 2020 10:20:11 +0800 Zhaoyang Huang  
> wrote:
>
> > On Fri, Aug 14, 2020 at 10:07 AM Matthew Wilcox  wrote:
> > >
> > > On Fri, Aug 14, 2020 at 02:43:55AM +0100, Matthew Wilcox wrote:
> > > > On Fri, Aug 14, 2020 at 09:30:11AM +0800, Zhaoyang Huang wrote:
> > > > > file->f_ra->ra_pages will remain the initialized value since it 
> > > > > opend, which may
> > > > > be NOT equal to bdi->ra_pages as the latter one is updated 
> > > > > somehow(etc,
> > > > > echo xxx > /sys/block/dm/queue/read_ahead_kb).So sync ra->ra_pages to 
> > > > > the
> > > > > updated value when sync read.
> > > >
> > > > It still ignores the work done by shrink_readahead_size_eio()
> > > > and fadvise(POSIX_FADV_SEQUENTIAL).
> > >
> > > ... by the way, if you're trying to update one particular file's readahead
> > > state, you can just call fadvise(POSIX_FADV_NORMAL) on it.
> > >
> > > If you want to update every open file's ra_pages by writing to sysfs,
> > > then just no.  We don't do that.
> > No, What I want to fix is the file within one process's context  keeps
> > using the initialized value when it is opened and not sync with new
> > value when bdi->ra_pages changes.
>
> So you're saying that
>
> echo xxx > /sys/block/dm/queue/read_ahead_kb
>
> does not affect presently-open files, and you believe that it should do
> so?
>
> I guess that could be a reasonable thing to want - it's reasonable for
> a user to expect that writing to a global tunable will take immediate
> global effect.  I guess.
>
> But as Matthew says, it would help if you were to explain why this is
> needed.  In full detail.  What operational problems is the present
> implementation causing?
The real scenario is that some systems (like Android) boost reads during
startup by expanding the readahead window and then set it back to normal
(128KB as usual). However, some files opened by long-lived system processes
stay open from then on and never pick up the updated value, since it is
almost impossible to reopen the files attached to those inodes (the processes
are unaware of the change). We have to fix it from the kernel side.


Re: [PATCH] mm : update ra->ra_pages if it's NOT equal to bdi->ra_pages

2020-08-13 Thread Zhaoyang Huang
On Fri, Aug 14, 2020 at 10:20 AM Zhaoyang Huang  wrote:
>
> On Fri, Aug 14, 2020 at 10:07 AM Matthew Wilcox  wrote:
> >
> > On Fri, Aug 14, 2020 at 02:43:55AM +0100, Matthew Wilcox wrote:
> > > On Fri, Aug 14, 2020 at 09:30:11AM +0800, Zhaoyang Huang wrote:
> > > > file->f_ra->ra_pages will remain the initialized value since it opend, 
> > > > which may
> > > > be NOT equal to bdi->ra_pages as the latter one is updated somehow(etc,
> > > > echo xxx > /sys/block/dm/queue/read_ahead_kb).So sync ra->ra_pages to 
> > > > the
> > > > updated value when sync read.
> > >
> > > It still ignores the work done by shrink_readahead_size_eio()
> > > and fadvise(POSIX_FADV_SEQUENTIAL).
> >
> > ... by the way, if you're trying to update one particular file's readahead
> > state, you can just call fadvise(POSIX_FADV_NORMAL) on it.
> >
> > If you want to update every open file's ra_pages by writing to sysfs,
> > then just no.  We don't do that.
> No, What I want to fix is the file within one process's context  keeps
> using the initialized value when it is opened and not sync with new
> value when bdi->ra_pages changes.
So you mean it is the desired behavior that an already-open file keeps using
the value captured at open time even if bdi->ra_pages is changed via sysfs?
> >
> > You haven't said what problem you're facing, so I really can't be more
> > helpful.


Re: [PATCH] mm : update ra->ra_pages if it's NOT equal to bdi->ra_pages

2020-08-13 Thread Zhaoyang Huang
On Fri, Aug 14, 2020 at 10:07 AM Matthew Wilcox  wrote:
>
> On Fri, Aug 14, 2020 at 02:43:55AM +0100, Matthew Wilcox wrote:
> > On Fri, Aug 14, 2020 at 09:30:11AM +0800, Zhaoyang Huang wrote:
> > > file->f_ra->ra_pages will remain the initialized value since it opend, 
> > > which may
> > > be NOT equal to bdi->ra_pages as the latter one is updated somehow(etc,
> > > echo xxx > /sys/block/dm/queue/read_ahead_kb).So sync ra->ra_pages to the
> > > updated value when sync read.
> >
> > It still ignores the work done by shrink_readahead_size_eio()
> > and fadvise(POSIX_FADV_SEQUENTIAL).
>
> ... by the way, if you're trying to update one particular file's readahead
> state, you can just call fadvise(POSIX_FADV_NORMAL) on it.
>
> If you want to update every open file's ra_pages by writing to sysfs,
> then just no.  We don't do that.
No. What I want to fix is that a file opened in a process's context keeps
using the value captured at open time and never syncs with the new value when
bdi->ra_pages changes.
>
> You haven't said what problem you're facing, so I really can't be more
> helpful.


[PATCH] mm : update ra->ra_pages if it's NOT equal to bdi->ra_pages

2020-08-13 Thread Zhaoyang Huang
file->f_ra.ra_pages keeps the value it was initialized with when the file was
opened, which may NOT equal bdi->ra_pages if the latter has since been updated
(e.g. echo xxx > /sys/block/dm/queue/read_ahead_kb). So sync ra->ra_pages to
the updated value on a sync read.
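
For context (paraphrased from mm/readahead.c of the kernels this was written
against; details may differ by version), the per-file window is snapshotted
exactly once, at open time, which is why a later write to read_ahead_kb is
never seen by files that are already open:

/* mm/readahead.c (paraphrased): the only place ra->ra_pages is set up */
void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;	/* one-time copy */
	ra->prev_pos = -1;
}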

Signed-off-by: Zhaoyang Huang 
---
 mm/filemap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577..5c2d7cc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2470,6 +2470,8 @@ static struct file *do_sync_mmap_readahead(struct 
vm_fault *vmf)
struct file *fpin = NULL;
pgoff_t offset = vmf->pgoff;
 
+   if (ra->ra_pages != inode_to_bdi(mapping->host)->ra_pages)
+   ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
-- 
1.9.1



[RFC PATCH] mm : using bdi->ra_pages instead of ra->ra_pages within readahead

2020-08-13 Thread Zhaoyang Huang
file->f_ra.ra_pages keeps the value it was initialized with when the file was
opened, which may NOT equal bdi->ra_pages if the latter has since been updated
(e.g. echo xxx > /sys/block/dm/queue/read_ahead_kb). So have readahead use
bdi->ra_pages directly.

Signed-off-by: Zhaoyang Huang 
---
 mm/filemap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577..259dcfd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2499,8 +2499,8 @@ static struct file *do_sync_mmap_readahead(struct 
vm_fault *vmf)
 */
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
-   ra->size = ra->ra_pages;
-   ra->async_size = ra->ra_pages / 4;
+   ra->size = inode_to_bdi(mapping->host)->ra_pages;
+   ra->async_size = ra->size / 4;
ra_submit(ra, mapping, file);
return fpin;
 }
-- 
1.9.1



[PATCH v2] trace : use kvzalloc instead of kzalloc

2020-07-30 Thread Zhaoyang Huang
A high-order allocation in the trace code can trigger the OOM killer; use
kvzalloc() instead.

Below is the call stack we ran into on an Android system. It happens when
traced_probes wakes up to fetch a large amount of trace data, even though free
memory is still above watermark_low.

traced_probes invoked oom-killer: 
gfp_mask=0x140c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),  
order=2, oom_score_adj=-1

traced_probes cpuset=system-background mems_allowed=0
CPU: 3 PID: 588 Comm: traced_probes Tainted: GW  O4.14.181 #1
Hardware name: Generic DT based system
(unwind_backtrace) from [] (show_stack+0x20/0x24)
(show_stack) from [] (dump_stack+0xa8/0xec)
(dump_stack) from [] (dump_header+0x9c/0x220)
(dump_header) from [] (oom_kill_process+0xc0/0x5c4)
(oom_kill_process) from [] (out_of_memory+0x220/0x310)
(out_of_memory) from [] (__alloc_pages_nodemask+0xff8/0x13a4)
(__alloc_pages_nodemask) from [] (kmalloc_order+0x30/0x48)
(kmalloc_order) from [] (kmalloc_order_trace+0x30/0x118)
(kmalloc_order_trace) from [] (tracing_buffers_open+0x50/0xfc)
(tracing_buffers_open) from [] (do_dentry_open+0x278/0x34c)
(do_dentry_open) from [] (vfs_open+0x50/0x70)
(vfs_open) from [] (path_openat+0x5fc/0x169c)
(path_openat) from [] (do_filp_open+0x94/0xf8)
(do_filp_open) from [] (do_sys_open+0x168/0x26c)
(do_sys_open) from [] (SyS_openat+0x34/0x38)
(SyS_openat) from [] (ret_fast_syscall+0x0/0x28)
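
For reference, a simplified model of why the switch helps (a sketch of the
helper's documented behaviour, not the mm/util.c source): kvzalloc() first
tries a physically contiguous allocation without OOM-killing for it, and
otherwise falls back to vmalloc(), which only needs order-0 pages.

/* simplified model, kernel-style pseudocode */
void *kvzalloc_model(size_t size, gfp_t flags)
{
	void *p;

	/* try contiguous pages, but give up quietly instead of OOM-killing */
	p = kzalloc(size, flags | __GFP_NOWARN | __GFP_NORETRY);
	if (p)
		return p;
	/* fall back to virtually contiguous memory (no high-order requirement) */
	return vzalloc(size);
}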

Signed-off-by: Zhaoyang Huang 
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ca1ee65..1a038a2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6891,7 +6891,7 @@ static int tracing_buffers_open(struct inode *inode, 
struct file *filp)
if (trace_array_get(tr) < 0)
return -ENODEV;
 
-   info = kzalloc(sizeof(*info), GFP_KERNEL);
+   info = kvzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
@@ -7017,7 +7017,7 @@ static int tracing_buffers_release(struct inode *inode, 
struct file *file)
if (info->spare)
ring_buffer_free_read_page(iter->trace_buffer->buffer,
   info->spare_cpu, info->spare);
-   kfree(info);
+   kvfree(info);
 
 	mutex_unlock(&trace_types_lock);
 
-- 
1.9.1



Re: [PATCH] trace : use kvmalloc instead of kmalloc

2020-07-30 Thread Zhaoyang Huang
On Thu, Jul 30, 2020 at 9:58 PM Steven Rostedt  wrote:
>
> On Thu, 30 Jul 2020 19:04:12 +0800
> Zhaoyang Huang  wrote:
>
> > High order memory stuff within trace could introduce OOM, use kvmalloc 
> > instead.
> >
> > Please find the bellowing for the call stack we run across in an android 
> > system. The scenario happens when traced_probes is woken up to get a large 
> > quantity of trace even if free memory is even higher than watermark_low.
>
> Please limit your column width in the description of patches to 76
> characters.
>
> >
> > traced_probes invoked oom-killer: 
> > gfp_mask=0x140c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),  
> > order=2, oom_score_adj=-1
> >
>
> What does this traced_probes thing do?
traced_probes is an Android executable that reads ftrace data and writes it
to files. I think kzalloc() works fine in most contexts, but it can cause OOM
under heavy ftrace access on memory-constrained systems such as mobile
phones.
>
> > traced_probes cpuset=system-background mems_allowed=0
> > CPU: 3 PID: 588 Comm: traced_probes Tainted: GW  O4.14.181 #1
> > Hardware name: Generic DT based system
> > (unwind_backtrace) from [] (show_stack+0x20/0x24)
> > (show_stack) from [] (dump_stack+0xa8/0xec)
> > (dump_stack) from [] (dump_header+0x9c/0x220)
> > (dump_header) from [] (oom_kill_process+0xc0/0x5c4)
> > (oom_kill_process) from [] (out_of_memory+0x220/0x310)
> > (out_of_memory) from [] (__alloc_pages_nodemask+0xff8/0x13a4)
> > (__alloc_pages_nodemask) from [] (kmalloc_order+0x30/0x48)
> > (kmalloc_order) from [] (kmalloc_order_trace+0x30/0x118)
> > (kmalloc_order_trace) from [] (tracing_buffers_open+0x50/0xfc)
> > (tracing_buffers_open) from [] (do_dentry_open+0x278/0x34c)
> > (do_dentry_open) from [] (vfs_open+0x50/0x70)
> > (vfs_open) from [] (path_openat+0x5fc/0x169c)
> > (path_openat) from [] (do_filp_open+0x94/0xf8)
> > (do_filp_open) from [] (do_sys_open+0x168/0x26c)
> > (do_sys_open) from [] (SyS_openat+0x34/0x38)
> > (SyS_openat) from [] (ret_fast_syscall+0x0/0x28)
> >
> > Signed-off-by: Zhaoyang Huang 
> > ---
> > changes since v1: change kfree to kvfree
> > ---
> >  kernel/trace/trace.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
> > index ca1ee65..8d70c79 100644
> > --- a/kernel/trace/trace.c
> > +++ b/kernel/trace/trace.c
> > @@ -6891,7 +6891,7 @@ static int tracing_buffers_open(struct inode *inode, 
> > struct file *filp)
> >   if (trace_array_get(tr) < 0)
> >   return -ENODEV;
> >
> > - info = kzalloc(sizeof(*info), GFP_KERNEL);
> > + info = kvmalloc(sizeof(*info), GFP_KERNEL);
>
> The above is a bug. It converts kzalloc() to kvmalloc() instead of
> kvzalloc().
Fixed and resent as patch v2.
>
> -- Steve
>
>
>
> >   if (!info) {
> >   trace_array_put(tr);
> >   return -ENOMEM;
> > @@ -7017,7 +7017,7 @@ static int tracing_buffers_release(struct inode 
> > *inode, struct file *file)
> >   if (info->spare)
> >   ring_buffer_free_read_page(iter->trace_buffer->buffer,
> >  info->spare_cpu, info->spare);
> > - kfree(info);
> > + kvfree(info);
> >
> >   mutex_unlock(&trace_types_lock);
> >
>


[PATCH] trace : use kvmalloc instead of kmalloc

2020-07-30 Thread Zhaoyang Huang
A high-order allocation in the trace code can trigger the OOM killer; use
kvmalloc() instead.

Below is the call stack we ran into on an Android system. It happens when
traced_probes wakes up to fetch a large amount of trace data, even though free
memory is still above watermark_low.

traced_probes invoked oom-killer: 
gfp_mask=0x140c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),  
order=2, oom_score_adj=-1

traced_probes cpuset=system-background mems_allowed=0
CPU: 3 PID: 588 Comm: traced_probes Tainted: GW  O4.14.181 #1
Hardware name: Generic DT based system
(unwind_backtrace) from [] (show_stack+0x20/0x24)
(show_stack) from [] (dump_stack+0xa8/0xec)
(dump_stack) from [] (dump_header+0x9c/0x220)
(dump_header) from [] (oom_kill_process+0xc0/0x5c4)
(oom_kill_process) from [] (out_of_memory+0x220/0x310)
(out_of_memory) from [] (__alloc_pages_nodemask+0xff8/0x13a4)
(__alloc_pages_nodemask) from [] (kmalloc_order+0x30/0x48)
(kmalloc_order) from [] (kmalloc_order_trace+0x30/0x118)
(kmalloc_order_trace) from [] (tracing_buffers_open+0x50/0xfc)
(tracing_buffers_open) from [] (do_dentry_open+0x278/0x34c)
(do_dentry_open) from [] (vfs_open+0x50/0x70)
(vfs_open) from [] (path_openat+0x5fc/0x169c)
(path_openat) from [] (do_filp_open+0x94/0xf8)
(do_filp_open) from [] (do_sys_open+0x168/0x26c)
(do_sys_open) from [] (SyS_openat+0x34/0x38)
(SyS_openat) from [] (ret_fast_syscall+0x0/0x28)

Signed-off-by: Zhaoyang Huang 
---
changes since v1: change kfree to kvfree
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ca1ee65..8d70c79 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6891,7 +6891,7 @@ static int tracing_buffers_open(struct inode *inode, 
struct file *filp)
if (trace_array_get(tr) < 0)
return -ENODEV;
 
-   info = kzalloc(sizeof(*info), GFP_KERNEL);
+   info = kvmalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
@@ -7017,7 +7017,7 @@ static int tracing_buffers_release(struct inode *inode, 
struct file *file)
if (info->spare)
ring_buffer_free_read_page(iter->trace_buffer->buffer,
   info->spare_cpu, info->spare);
-   kfree(info);
+   kvfree(info);
 
 	mutex_unlock(&trace_types_lock);
 
-- 
1.9.1



[PATCH] trace : use kvmalloc instead of kmalloc

2020-07-29 Thread Zhaoyang Huang
A high-order allocation in the trace code can trigger the OOM killer; use
kvmalloc() instead.

Below is the call stack we ran into on an Android system. It happens when
traced_probes wakes up to fetch a large amount of trace data, even though free
memory is still above watermark_low.

traced_probes invoked oom-killer: 
gfp_mask=0x140c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),  
order=2, oom_score_adj=-1

traced_probes cpuset=system-background mems_allowed=0
CPU: 3 PID: 588 Comm: traced_probes Tainted: GW  O4.14.181 #1
Hardware name: Generic DT based system
(unwind_backtrace) from [] (show_stack+0x20/0x24)
(show_stack) from [] (dump_stack+0xa8/0xec)
(dump_stack) from [] (dump_header+0x9c/0x220)
(dump_header) from [] (oom_kill_process+0xc0/0x5c4)
(oom_kill_process) from [] (out_of_memory+0x220/0x310)
(out_of_memory) from [] (__alloc_pages_nodemask+0xff8/0x13a4)
(__alloc_pages_nodemask) from [] (kmalloc_order+0x30/0x48)
(kmalloc_order) from [] (kmalloc_order_trace+0x30/0x118)
(kmalloc_order_trace) from [] (tracing_buffers_open+0x50/0xfc)
(tracing_buffers_open) from [] (do_dentry_open+0x278/0x34c)
(do_dentry_open) from [] (vfs_open+0x50/0x70)
(vfs_open) from [] (path_openat+0x5fc/0x169c)
(path_openat) from [] (do_filp_open+0x94/0xf8)
(do_filp_open) from [] (do_sys_open+0x168/0x26c)
(do_sys_open) from [] (SyS_openat+0x34/0x38)
(SyS_openat) from [] (ret_fast_syscall+0x0/0x28)

Signed-off-by: Zhaoyang Huang 
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ca1ee65..d4eb7ea 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6891,7 +6891,7 @@ static int tracing_buffers_open(struct inode *inode, 
struct file *filp)
if (trace_array_get(tr) < 0)
return -ENODEV;
 
-   info = kzalloc(sizeof(*info), GFP_KERNEL);
+   info = kvmalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
-- 
1.9.1



[PATCH] trace : use kvmalloc instead of kmalloc

2020-07-29 Thread Zhaoyang Huang
A high-order allocation in the trace code can trigger the OOM killer; use kvmalloc() instead.

traced_probes invoked oom-killer: 
gfp_mask=0x140c0c0(GFP_KERNEL|__GFP_COMP|__GFP_ZERO), nodemask=(null),  
order=2, oom_score_adj=-1

traced_probes cpuset=system-background mems_allowed=0
CPU: 3 PID: 588 Comm: traced_probes Tainted: GW  O4.14.181 #1
Hardware name: Generic DT based system
(unwind_backtrace) from [] (show_stack+0x20/0x24)
(show_stack) from [] (dump_stack+0xa8/0xec)
(dump_stack) from [] (dump_header+0x9c/0x220)
(dump_header) from [] (oom_kill_process+0xc0/0x5c4)
(oom_kill_process) from [] (out_of_memory+0x220/0x310)
(out_of_memory) from [] (__alloc_pages_nodemask+0xff8/0x13a4)
(__alloc_pages_nodemask) from [] (kmalloc_order+0x30/0x48)
(kmalloc_order) from [] (kmalloc_order_trace+0x30/0x118)
(kmalloc_order_trace) from [] (tracing_buffers_open+0x50/0xfc)
(tracing_buffers_open) from [] (do_dentry_open+0x278/0x34c)
(do_dentry_open) from [] (vfs_open+0x50/0x70)
(vfs_open) from [] (path_openat+0x5fc/0x169c)
(path_openat) from [] (do_filp_open+0x94/0xf8)
(do_filp_open) from [] (do_sys_open+0x168/0x26c)
(do_sys_open) from [] (SyS_openat+0x34/0x38)
(SyS_openat) from [] (ret_fast_syscall+0x0/0x28)

Signed-off-by: Zhaoyang Huang 
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ca1ee65..d4eb7ea 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6891,7 +6891,7 @@ static int tracing_buffers_open(struct inode *inode, 
struct file *filp)
if (trace_array_get(tr) < 0)
return -ENODEV;
 
-   info = kzalloc(sizeof(*info), GFP_KERNEL);
+   info = kvmalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
trace_array_put(tr);
return -ENOMEM;
-- 
1.9.1



[Resend PATCH v3] arch : arm : add a criteria for pfn_valid

2019-08-20 Thread Zhaoyang Huang
From: Zhaoyang Huang 

pfn_valid() can be wrong when it is passed an invalid pfn whose physical
address does not fit in BITS_PER_LONG bits, because the most significant bits
are trimmed by the shift.

The issue originally arose from the call stack below, which corresponds to a
read of /proc/kpageflags from userspace with an invalid pfn parameter and
leads to a kernel panic.

[46886.723249] c7 [] (stable_page_flags) from []
[46886.723264] c7 [] (kpageflags_read) from []
[46886.723280] c7 [] (proc_reg_read) from []
[46886.723290] c7 [] (__vfs_read) from []
[46886.723301] c7 [] (vfs_read) from []
[46886.723315] c7 [] (SyS_pread64) from []
(ret_fast_syscall+0x0/0x28)
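
A worked example (numbers borrowed from the review discussion below, assuming
a 32-bit phys_addr_t and PAGE_SHIFT == 12): for the bogus pfn 0x1bffc0,
__pfn_to_phys() computes 0x1bffc0 << 12 = 0x1bffc0000, which no longer fits in
32 bits and is truncated to 0xbffc0000. memblock_is_map_memory() then finds a
valid region at that address and pfn_valid() wrongly returns 1. With the added
check, __phys_to_pfn(0xbffc0000) == 0xbffc0 != 0x1bffc0, so the bogus pfn is
rejected before the memblock lookup.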

Signed-off-by: Zhaoyang Huang 
Reviewed-by: Mike Rapoport 
---
v2: use __pfn_to_phys/__phys_to_pfn instead of max_pfn as the criteria
v3: update commit message to describe the defection's context
  add Mike Rapoport as reviewer
---
 arch/arm/mm/init.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index c2daabb..cc769fa 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -177,6 +177,11 @@ static void __init zone_sizes_init(unsigned long
min, unsigned long max_low,
 #ifdef CONFIG_HAVE_ARCH_PFN_VALID
 int pfn_valid(unsigned long pfn)
 {
+   phys_addr_t addr = __pfn_to_phys(pfn);
+
+   if (__phys_to_pfn(addr) != pfn)
+   return 0;
+
return memblock_is_map_memory(__pfn_to_phys(pfn));
 }
 EXPORT_SYMBOL(pfn_valid);
-- 
1.9.1


[PATCH v3] arch : arm : add a criteria for pfn_valid

2019-08-18 Thread Zhaoyang Huang
From: Zhaoyang Huang 

pfn_valid() can be wrong when it is passed an invalid pfn whose physical
address does not fit in BITS_PER_LONG bits, because the most significant bits
are trimmed by the shift.

The issue originally arose from the call stack below, which corresponds to a
read of /proc/kpageflags from userspace with an invalid pfn parameter and
leads to a kernel panic.

[46886.723249] c7 [] (stable_page_flags) from []
[46886.723264] c7 [] (kpageflags_read) from []
[46886.723280] c7 [] (proc_reg_read) from []
[46886.723290] c7 [] (__vfs_read) from []
[46886.723301] c7 [] (vfs_read) from []
[46886.723315] c7 [] (SyS_pread64) from []
(ret_fast_syscall+0x0/0x28)

Signed-off-by: Zhaoyang Huang 
---
v2: use __pfn_to_phys/__phys_to_pfn instead of max_pfn as the criteria
v3: update commit message to describe the defection's context
---
 arch/arm/mm/init.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index c2daabb..cc769fa 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -177,6 +177,11 @@ static void __init zone_sizes_init(unsigned long min, 
unsigned long max_low,
 #ifdef CONFIG_HAVE_ARCH_PFN_VALID
 int pfn_valid(unsigned long pfn)
 {
+   phys_addr_t addr = __pfn_to_phys(pfn);
+
+   if (__phys_to_pfn(addr) != pfn)
+   return 0;
+
return memblock_is_map_memory(__pfn_to_phys(pfn));
 }
 EXPORT_SYMBOL(pfn_valid);
-- 
1.9.1



[PATCH v2] arch : arm : add a criteria for pfn_valid

2019-08-18 Thread Zhaoyang Huang
From: Zhaoyang Huang 

pfn_valid() can be wrong when it is passed an invalid pfn whose physical
address does not fit in BITS_PER_LONG bits, because the most significant bits
are trimmed by the shift.

Signed-off-by: Zhaoyang Huang 
---
v2: use __pfn_to_phys/__phys_to_pfn instead of max_pfn as the criteria
---
 arch/arm/mm/init.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index c2daabb..cc769fa 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -177,6 +177,11 @@ static void __init zone_sizes_init(unsigned long min, 
unsigned long max_low,
 #ifdef CONFIG_HAVE_ARCH_PFN_VALID
 int pfn_valid(unsigned long pfn)
 {
+   phys_addr_t addr = __pfn_to_phys(pfn);
+
+   if (__phys_to_pfn(addr) != pfn)
+   return 0;
+
return memblock_is_map_memory(__pfn_to_phys(pfn));
 }
 EXPORT_SYMBOL(pfn_valid);
-- 
1.9.1



[PATCH v2] arch : arm : add a criteria for pfn_valid

2019-08-18 Thread Zhaoyang Huang
From: Zhaoyang Huang 

pfn_valid() can be wrong when it is passed an invalid pfn whose physical
address does not fit in BITS_PER_LONG bits, because the most significant bits
are trimmed by the shift.

Signed-off-by: Zhaoyang Huang 
---
 arch/arm/mm/init.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index c2daabb..cc769fa 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -177,6 +177,11 @@ static void __init zone_sizes_init(unsigned long min, 
unsigned long max_low,
 #ifdef CONFIG_HAVE_ARCH_PFN_VALID
 int pfn_valid(unsigned long pfn)
 {
+   phys_addr_t addr = __pfn_to_phys(pfn);
+
+   if (__phys_to_pfn(addr) != pfn)
+   return 0;
+
return memblock_is_map_memory(__pfn_to_phys(pfn));
 }
 EXPORT_SYMBOL(pfn_valid);
-- 
1.9.1



Re: [PATCH] arch : arm : add a criteria for pfn_valid

2019-08-18 Thread Zhaoyang Huang
On Sun, Aug 18, 2019 at 2:32 AM Russell King - ARM Linux admin
 wrote:
>
> On Sat, Aug 17, 2019 at 11:00:13AM +0800, Zhaoyang Huang wrote:
> > From: Zhaoyang Huang 
> >
> > pfn_valid can be wrong while the MSB of physical address be trimed as pfn
> > larger than the max_pfn.
>
> What scenario are you addressing here?  At a guess, you're addressing
> the non-LPAE case with PFNs that correspond with >= 4GiB of memory?
Please find below the call stack caused by this defect. The root cause is an
invalid PFN passed from userspace, which produces an invalid page pointer in
stable_page_flags() and then a kernel panic.

[46886.723249] c7 [] (stable_page_flags) from []
(kpageflags_read+0x90/0x11c)
[46886.723256] c7  r9:c101ce04 r8:c2d0bf70 r7:c2d0bf70 r6:1fbb10fb
r5:a8686f08 r4:a8686f08
[46886.723264] c7 [] (kpageflags_read) from []
(proc_reg_read+0x80/0x94)
[46886.723270] c7  r10:00b4 r9:0008 r8:c2d0bf70 r7:
r6:0001 r5:ed8e7240
[46886.723272] c7  r4:
[46886.723280] c7 [] (proc_reg_read) from []
(__vfs_read+0x48/0x150)
[46886.723284] c7  r7:c2d0bf70 r6:c0f09208 r5:c0a4f940 r4:c40326c0
[46886.723290] c7 [] (__vfs_read) from []
(vfs_read+0xa4/0x158)
[46886.723296] c7  r9:a8686f08 r8:0008 r7:c2d0bf70 r6:a8686f08
r5:c40326c0 r4:0008
[46886.723301] c7 [] (vfs_read) from []
(SyS_pread64+0x80/0xb8)
[46886.723306] c7  r8:0008 r7:c0f09208 r6:c40326c0 r5:c40326c0 r4:fdd887d8
[46886.723315] c7 [] (SyS_pread64) from []
(ret_fast_syscall+0x0/0x28)

>
> >
> > Signed-off-by: Zhaoyang Huang 
> > ---
> >  arch/arm/mm/init.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
> > index c2daabb..9c4d938 100644
> > --- a/arch/arm/mm/init.c
> > +++ b/arch/arm/mm/init.c
> > @@ -177,7 +177,8 @@ static void __init zone_sizes_init(unsigned long min, 
> > unsigned long max_low,
> >  #ifdef CONFIG_HAVE_ARCH_PFN_VALID
> >  int pfn_valid(unsigned long pfn)
> >  {
> > - return memblock_is_map_memory(__pfn_to_phys(pfn));
> > + return (pfn > max_pfn) ?
> > + false : memblock_is_map_memory(__pfn_to_phys(pfn));
> >  }
> >  EXPORT_SYMBOL(pfn_valid);
> >  #endif
> > --
> > 1.9.1
> >
> >
>
> --
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line in suburbia: sync at 12.1Mbps down 622kbps up
> According to speedtest.net: 11.9Mbps down 500kbps up


Re: [PATCH] arch : arm : add a criteria for pfn_valid

2019-08-17 Thread Zhaoyang Huang
On Sat, Aug 17, 2019 at 5:00 PM Mike Rapoport  wrote:
>
> On Sat, Aug 17, 2019 at 11:00:13AM +0800, Zhaoyang Huang wrote:
> > From: Zhaoyang Huang 
> >
> > pfn_valid can be wrong while the MSB of physical address be trimed as pfn
> > larger than the max_pfn.
>
> How the overflow of __pfn_to_phys() is related to max_pfn?
> Where is the guarantee that __pfn_to_phys(max_pfn) won't overflow?
E.g., the invalid pfn value 0x1bffc0 passes pfn_valid() if there is a mapped
memory block at the truncated address, even though max_pfn is 0xbffc0.
In ARM64, the following condition check helps to
>
> > Signed-off-by: Zhaoyang Huang 
> > ---
> >  arch/arm/mm/init.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
> > index c2daabb..9c4d938 100644
> > --- a/arch/arm/mm/init.c
> > +++ b/arch/arm/mm/init.c
> > @@ -177,7 +177,8 @@ static void __init zone_sizes_init(unsigned long min, 
> > unsigned long max_low,
> >  #ifdef CONFIG_HAVE_ARCH_PFN_VALID
> >  int pfn_valid(unsigned long pfn)
> >  {
> > - return memblock_is_map_memory(__pfn_to_phys(pfn));
> > + return (pfn > max_pfn) ?
> > + false : memblock_is_map_memory(__pfn_to_phys(pfn));
> >  }
> >  EXPORT_SYMBOL(pfn_valid);
> >  #endif
> > --
> > 1.9.1
> >
>
> --
> Sincerely yours,
> Mike.
>


[PATCH] arch : arm : add a criteria for pfn_valid

2019-08-16 Thread Zhaoyang Huang
From: Zhaoyang Huang 

pfn_valid() can be wrong when the MSB of the physical address is trimmed, for
a pfn larger than max_pfn.

Signed-off-by: Zhaoyang Huang 
---
 arch/arm/mm/init.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index c2daabb..9c4d938 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -177,7 +177,8 @@ static void __init zone_sizes_init(unsigned long min, 
unsigned long max_low,
 #ifdef CONFIG_HAVE_ARCH_PFN_VALID
 int pfn_valid(unsigned long pfn)
 {
-   return memblock_is_map_memory(__pfn_to_phys(pfn));
+   return (pfn > max_pfn) ?
+   false : memblock_is_map_memory(__pfn_to_phys(pfn));
 }
 EXPORT_SYMBOL(pfn_valid);
 #endif
-- 
1.9.1



Re: [[repost]RFC PATCH] mm/workingset : judge file page activity via timestamp

2019-05-07 Thread Zhaoyang Huang
On Mon, May 6, 2019 at 10:57 PM Johannes Weiner  wrote:
>
> On Sun, Apr 28, 2019 at 03:44:34PM +0800, Zhaoyang Huang wrote:
> > From: Zhaoyang Huang 
> >
> > this patch introduce timestamp into workingset's entry and judge if the 
> > page is
> > active or inactive via active_file/refault_ratio instead of refault 
> > distance.
> >
> > The original thought is coming from the logs we got from trace_printk in 
> > this
> > patch, we can find about 1/5 of the file pages' refault are under the
> > scenario[1],which will be counted as inactive as they have a long refault 
> > distance
> > in between access. However, we can also know from the time information that 
> > the
> > page refault quickly as comparing to the average refault time which is 
> > calculated
> > by the number of active file and refault ratio. We want to save these kinds 
> > of
> > pages from evicted earlier as it used to be via setting it to ACTIVE 
> > instead.
> > The refault ratio is the value which can reflect lru's average file access
> > frequency in the past and provide the judge criteria for page's activation.
> >
> > The patch is tested on an android system and reduce 30% of page faults, 
> > while
> > 60% of the pages remain the original status as (refault_distance < 
> > active_file)
> > indicates. Pages status got from ftrace during the test can refer to [2].
> >
Hi Johannes,
Thank you for your feedback. I have answered the previous comments many
times in different contexts. I don't expect you to accept this patch, but
I would like you to pay attention to the phenomenon reported in [1]:
pages with a big refault distance that nevertheless refault very quickly
after being evicted. Do you think this kind of page should be set to INACTIVE?
> > [1]
> > system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 
> > 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 
> > 29616 secs 97 pre_secs 97
> > HwBinder:922  workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 
> > 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 
> > 45600 secs 101 pre_secs 99
> >
> > [2]
> > WKST_ACT[0]:   original--INACTIVE  commit--ACTIVE
> > WKST_ACT[1]:   original--ACTIVEcommit--ACTIVE
> > WKST_INACT[0]: original--INACTIVE  commit--INACTIVE
> > WKST_INACT[1]: original--ACTIVEcommit--INACTIVE
> >
> > Signed-off-by: Zhaoyang Huang 
>
> Nacked-by: Johannes Weiner 
>
> You haven't addressed any of the questions raised during previous
> submissions.


[[repost]RFC PATCH] mm/workingset : judge file page activity via timestamp

2019-04-28 Thread Zhaoyang Huang
From: Zhaoyang Huang 

This patch introduces a timestamp into the workingset's shadow entry and
judges whether a page is active or inactive via active_file/refault_ratio
instead of the refault distance.

The original idea comes from the trace_printk logs collected with this patch:
about 1/5 of the file pages' refaults fall under scenario [1], where they are
counted as inactive because of a long refault distance between accesses.
However, the time information shows that these pages refault quickly compared
with the average refault time, which is calculated from the number of active
file pages and the refault ratio. We want to save such pages from being
evicted as early as they used to be, by setting them ACTIVE instead. The
refault ratio reflects the LRU's average file access frequency in the past
and provides the criterion for a page's activation.

The patch was tested on an Android system and reduced page faults by 30%,
while 60% of the pages keep their original status, as
(refault_distance < active_file) indicates. Page status captured via ftrace
during the test can be found in [2].

[1]
system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 
rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 
pre_secs 97
HwBinder:922  workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 
rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 
101 pre_secs 99

[2]
WKST_ACT[0]:   original--INACTIVE  commit--ACTIVE
WKST_ACT[1]:   original--ACTIVEcommit--ACTIVE
WKST_INACT[0]: original--INACTIVE  commit--INACTIVE
WKST_INACT[1]: original--ACTIVEcommit--INACTIVE

Signed-off-by: Zhaoyang Huang 
---
 include/linux/mmzone.h |   1 +
 mm/workingset.c| 129 ++---
 2 files changed, 113 insertions(+), 17 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fba7741..ca4ced6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -242,6 +242,7 @@ struct lruvec {
atomic_long_t   inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long   refaults;
+   atomic_long_t   refaults_ratio;
 #ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
 #endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 0bedf67..fd2e5af 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -167,10 +167,19 @@
  * refault distance will immediately activate the refaulting page.
  */
 
+#ifdef CONFIG_64BIT
+#define EVICTION_SECS_POS_SHIFT 18
+#define EVICTION_SECS_SHRINK_SHIFT 4
+#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
+#else
+#define EVICTION_SECS_POS_SHIFT 0
+#define EVICTION_SECS_SHRINK_SHIFT 0
+#define NO_SECS_IN_WORKINGSET
+#endif
 #define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) +  \
-1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
+1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT + \
+EVICTION_SECS_POS_SHIFT + EVICTION_SECS_SHRINK_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
-
 /*
  * Eviction timestamps need to be able to cover the full range of
  * actionable refaults. However, bits are tight in the xarray
@@ -180,12 +189,48 @@
  * evictions into coarser buckets by shaving off lower timestamp bits.
  */
 static unsigned int bucket_order __read_mostly;
-
+#ifdef NO_SECS_IN_WORKINGSET
+static void pack_secs(unsigned long *peviction) { }
+static unsigned int unpack_secs(unsigned long entry) {return 0; }
+#else
+static void pack_secs(unsigned long *peviction)
+{
+   unsigned int secs;
+   unsigned long eviction;
+   int order;
+   int secs_shrink_size;
+   struct timespec64 ts;
+
+   ktime_get_boottime_ts64(&ts);
+   secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
+   order = get_count_order(secs);
+   secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
+   ? 0 : (order - EVICTION_SECS_POS_SHIFT);
+
+   eviction = *peviction;
+   eviction = (eviction << EVICTION_SECS_POS_SHIFT)
+   | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
+   eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf);
+   *peviction = eviction;
+}
+static unsigned int unpack_secs(unsigned long entry)
+{
+   unsigned int secs;
+   int secs_shrink_size;
+
+   secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
+   entry >>= EVICTION_SECS_SHRINK_SHIFT;
+   secs = entry & EVICTION_SECS_POS_MASK;
+   secs = secs << secs_shrink_size;
+   return secs;
+}
+#endif
 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
 bool workingset)
 {
eviction >>= bucket_order;
eviction &= EVICTION_MASK;
+  

Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

2019-04-23 Thread Zhaoyang Huang
KINGSET
+ refaults_ratio = (atomic_long_read(&lruvec->inactive_age) + 1) / secs;
+ atomic_long_set(&lruvec->refaults_ratio, refaults_ratio);
+ refault_time = secs - prev_secs;
+ avg_refault_time = active_file / refaults_ratio;
+ tradition = !!(refault_distance < active_file);
  /*
- * Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * What we are trying to solve here is
+ * 1. extremely fast refault as refault_time == 0.
+ * 2. quick file drop scenario, which has a big refault_distance but
+ *small refault_time comparing with the past refault ratio, which
+ *will be deemed as inactive in previous implementation.
  */
- if (refault_distance > active_file)
+ if (refault_time && (((refault_time < avg_refault_time)
+ && (avg_refault_time < 2 * refault_time))
+ || (refault_time >= avg_refault_time))) {
+ trace_printk("WKST_INACT[%d]:rft_dis %ld, act %ld\
+ rft_ratio %ld rft_time %ld avg_rft_time %ld\
+ refault %ld eviction %ld secs %d pre_secs %d page %p\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs, page);
  goto out;
+ }
+ else {
+#else
+ if (refault_distance < active_file) {
+#endif

- SetPageActive(page);
- atomic_long_inc(&lruvec->inactive_age);
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */

- /* Page was active prior to eviction */
- if (workingset) {
- SetPageWorkingset(page);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ SetPageActive(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ }
+#ifndef NO_SECS_IN_WORKINGSET
+ trace_printk("WKST_ACT[%d]:rft_dis %ld, act %ld\
+ rft_ratio %ld rft_time %ld avg_rft_time %ld\
+ refault %ld eviction %ld secs %d pre_secs %d page %p\n",
+ tradition, refault_distance, active_file,
+ refaults_ratio, refault_time, avg_refault_time,
+ refault, eviction, secs, prev_secs, page);
+#endif
  }
 out:
  rcu_read_unlock();
@@ -539,7 +637,9 @@ static int __init workingset_init(void)
  unsigned int max_order;
  int ret;

- BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ BUILD_BUG_ON(BITS_PER_LONG < (EVICTION_SHIFT
+ + EVICTION_SECS_POS_SHIFT
+ + EVICTION_SECS_SHRINK_SHIFT));
  /*
  * Calculate the eviction bucket size to cover the longest
  * actionable refault distance, which is currently half of
@@ -547,7 +647,9 @@ static int __init workingset_init(void)
  * some more pages at runtime, so keep working with up to
  * double the initial memory by using totalram_pages as-is.
  */
- timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT
+ - EVICTION_SECS_POS_SHIFT - EVICTION_SECS_SHRINK_SHIFT;
+
  max_order = fls_long(totalram_pages() - 1);
  if (max_order > timestamp_bits)
  bucket_order = max_order - timestamp_bits;

On Wed, Apr 17, 2019 at 9:37 PM Matthew Wilcox  wrote:
>
> On Wed, Apr 17, 2019 at 08:26:22PM +0800, Zhaoyang Huang wrote:
> [quoting Johannes here]
> > As Matthew says, you are fairly randomly making refault activations
> > more aggressive (especially with that timestamp unpacking bug), and
> > while that expectedly boosts workload transition / startup, it comes
> > at the cost of disrupting stable states because you can flood a very
> > active in-ram workingset with completely cold cache pages simply
> > because they refault uniformly wrt each other.
> > [HZY]: I analysis the log got from trace_printk, what we activate have
> > proven record of long refault distance but very short refault time.
>
> You haven't addressed my point, which is that you were only testing
> workloads for which your changed algorithm would improve the results.
> What you haven't done is shown how other workloads would be negatively
> affected.
>
> Once you do that, we can make a decision about whether to improve your
> workload by X% and penalise that other workload by Y%.


Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

2019-04-17 Thread Zhaoyang Huang
Reposting my feedback given under Johannes's comments.
When something like a higher-order allocation drops a large number of
file pages, it's *intentional* that the pages that were evicted before
them become less valuable and less likely to be activated on refault.
There is a finite amount of in-memory LRU space and the pages that
have been evicted the most recently have precedence because they have
the highest proven access frequency.
[HZY]: Yes, I agree with the original idea of sacrificing pages with long
access distances when huge memory demands arise. The problem is the criterion
for selecting the page: as I note in the patch, some pages have a long
refault_distance while having a very short access time in between.

Of course, when a large amount of the cache that was pushed out in
between is not re-used again, and don't claim their space in memory,
it would be great if we could then activate the older pages that *are*
re-used again in their stead.But that would require us being able to
look into the future. When an old page refaults, we don't know if a
younger page is still going to refault with a shorter refault distance
or not. If it won't, then we were right to activate it. If it will
refault, then we put something on the active list whose reuse
frequency is too low to be able to fit into memory, and we thrash the
hottest pages in the system.
[HZY]: We do NOT use the absolute timestamp at refault time to indicate
whether the page is young or old and thus decide its position in the LRU.
The criterion I use is to compare the "time the page spent out of the cache"
with "the time needed to shrink the active files, obtained by dividing by the
average refault ratio". I keep the concept of treating the ACTIVE file list
as a deficit of the INACTIVE list, but use time to avoid the scenario
described in the patch's [1].
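
To make this concrete, here is a small user-space sketch using the numbers
from the trace in [1] (act_file 34268, rft_ratio 3047, rft_time 0); it only
illustrates the comparison, not the exact kernel code:

#include <stdio.h>

int main(void)
{
	unsigned long act_file = 34268;   /* active file pages at refault time */
	unsigned long rft_ratio = 3047;   /* ~inactive_age / uptime in seconds */
	unsigned long rft_time = 0;       /* seconds the page stayed evicted */

	/* time the LRU needs to turn over the active list at this speed */
	unsigned long avg_rft_time = act_file / rft_ratio;   /* ~11 s */

	printf("avg_rft_time=%lu rft_time=%lu -> %s\n",
	       avg_rft_time, rft_time,
	       rft_time < avg_rft_time ? "activate" : "keep inactive");
	return 0;
}
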

As Matthew says, you are fairly randomly making refault activations
more aggressive (especially with that timestamp unpacking bug), and
while that expectedly boosts workload transition / startup, it comes
at the cost of disrupting stable states because you can flood a very
active in-ram workingset with completely cold cache pages simply
because they refault uniformly wrt each other.
[HZY]: I analyzed the logs from trace_printk; the pages we activate have a
proven record of a long refault distance but a very short refault time.

On Wed, Apr 17, 2019 at 7:46 PM Michal Hocko  wrote:
>
> On Wed 17-04-19 19:36:21, Zhaoyang Huang wrote:
> > sorry for the confusion. What I mean is the basic idea doesn't change
> > as replacing the refault criteria from refault_distance to timestamp.
> > But the detailed implementation changed a lot, including fix bugs,
> > update the way of packing the timestamp, 32bit/64bit differentiation
> > etc. So it makes sense for starting a new context.
>
> Not really. My take away from the previous discussion is that Johannes
> has questioned the timestamping approach itself. I wasn't following very
> closely so I might be wrong here but if that is really the case then it
> doesn't make much sense to improve the implementation if there is no
> consensus on the approach itself.
>
> --
> Michal Hocko
> SUSE Labs


Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

2019-04-17 Thread Zhaoyang Huang
Sorry for the confusion. What I mean is that the basic idea does not change:
the refault criterion is still replaced, from refault_distance to a timestamp.
But the detailed implementation changed a lot, including bug fixes, an updated
way of packing the timestamp, 32-bit/64-bit differentiation, etc. So it makes
sense to start a new thread.

On Wed, Apr 17, 2019 at 7:06 PM Michal Hocko  wrote:
>
> On Wed 17-04-19 18:55:15, Zhaoyang Huang wrote:
> > fix one mailbox and update for some information
> >
> > Comparing to 
> > http://lkml.kernel.org/r/1554348617-12897-1-git-send-email-huangzhaoy...@gmail.com,
> > this commit fix the packing order error and add trace_printk for
> > reference debug information.
> >
> > For johannes's comments, please find bellowing for my feedback.
>
> OK, this suggests there is no strong reason to poset a new version of
> the patch then. Please do not fragment discussion and continue
> discussing in the original email thread until there is some conclusion
> reached.
>
> Thanks!
> --
> Michal Hocko
> SUSE Labs


Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

2019-04-17 Thread Zhaoyang Huang
Fixed one mailbox and updated some information.

Compared to
http://lkml.kernel.org/r/1554348617-12897-1-git-send-email-huangzhaoy...@gmail.com,
this commit fixes the packing order error and adds trace_printk for
reference debug information.

For Johannes's comments, please find my feedback below.
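
As a side note on the packing change, here is a stand-alone user-space sketch
of the pack/unpack round trip; it mirrors the 64-bit layout used in the patch
(18 position bits plus 4 shrink bits), but the helper names and the main()
driver are mine, for illustration only:

/* The boot time in seconds is squeezed into 18 bits; the number of bits
 * shaved off is kept in the low 4 bits so the value can be coarsely
 * restored later. */
#include <stdio.h>

#define SECS_POS_SHIFT    18
#define SECS_SHRINK_SHIFT  4
#define SECS_POS_MASK     ((1UL << SECS_POS_SHIFT) - 1)

static int count_order(unsigned int n)       /* ~kernel get_count_order() */
{
	int order = 31 - __builtin_clz(n);

	if (n & (n - 1))
		order++;
	return order;
}

static unsigned long pack_secs(unsigned long eviction, unsigned int secs)
{
	int order = count_order(secs);
	int shrink = order <= SECS_POS_SHIFT ? 0 : order - SECS_POS_SHIFT;

	eviction = (eviction << SECS_POS_SHIFT) |
		   ((secs >> shrink) & SECS_POS_MASK);
	return (eviction << SECS_SHRINK_SHIFT) | (shrink & 0xf);
}

static unsigned int unpack_secs(unsigned long entry)
{
	int shrink = entry & ((1 << SECS_SHRINK_SHIFT) - 1);

	entry >>= SECS_SHRINK_SHIFT;
	return (entry & SECS_POS_MASK) << shrink;
}

int main(void)
{
	unsigned int secs = 300000;              /* ~3.5 days of uptime */
	unsigned long entry = pack_secs(0, secs);

	printf("packed 0x%lx -> restored %u secs\n", entry, unpack_secs(entry));
	return 0;
}
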



On Wed, Apr 17, 2019 at 3:59 PM Zhaoyang Huang  wrote:
>
> add Johannes and answer his previous question.
>
> @Johannes Weiner
> Yes. I do agree with you about the original thought of sacrificing
> long distance access pages when huge memory demands arise. The problem
> is what is the criteria of the distance, which you can find from what
> I comment in the patch, that is, some pages have long refault_distance
> while having a very short access time in between. I think the latter
> one should be take into consideration or as part of the finnal
> decision of if the page should be active/inactive.
>
> On Wed, Apr 17, 2019 at 3:48 PM Zhaoyang Huang  
> wrote:
> >
> > From: Zhaoyang Huang 
> >
> > This patch introduce timestamp into workingset's entry and judge if the page
> > is active or inactive via active_file/refault_ratio instead of refault 
> > distance.
> >
> > The original thought is coming from the logs we got from trace_printk in 
> > this
> > patch, we can find about 1/5 of the file pages' refault are under the
> > scenario[1],which will be counted as inactive as they have a long refault 
> > distance
> > in between access. However, we can also know from the time information that 
> > the
> > page refault quickly as comparing to the average refault time which is 
> > calculated
> > by the number of active file and refault ratio. We want to save these kinds 
> > of
> > pages from evicted earlier as it used to be. The refault ratio is the value
> > which can reflect lru's average file access frequency and also can be 
> > deemed as a
> > prediction of future.
> >
> > The patch is tested on an android system and reduce 30% of page faults, 
> > while
> > 60% of the pages remain the original status as (refault_distance < 
> > active_file)
> > indicates. Pages status got from ftrace during the test can refer to [2].
> >
> > [1]
> > system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 
> > 34268 rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 
> > 29616 secs 97 pre_secs 97
> > HwBinder:922  workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 
> > 35037 rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 
> > 45600 secs 101 pre_secs 99
> >
> > [2]
> > WKST_ACT[0]:   original--INACTIVE  commit--ACTIVE
> > WKST_ACT[1]:   original--ACTIVEcommit--ACTIVE
> > WKST_INACT[0]: original--INACTIVE  commit--INACTIVE
> > WKST_INACT[1]: original--ACTIVEcommit--INACTIVE
> >
> > Signed-off-by: Zhaoyang Huang 
> > ---
> >  include/linux/mmzone.h |   1 +
> >  mm/workingset.c| 120 
> > +
> >  2 files changed, 112 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 32699b2..6f30673 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -240,6 +240,7 @@ struct lruvec {
> > atomic_long_t   inactive_age;
> > /* Refaults at the time of last reclaim cycle */
> > unsigned long   refaults;
> > +   atomic_long_t   refaults_ratio;
> >  #ifdef CONFIG_MEMCG
> > struct pglist_data *pgdat;
> >  #endif
> > diff --git a/mm/workingset.c b/mm/workingset.c
> > index 40ee02c..66c177b 100644
> > --- a/mm/workingset.c
> > +++ b/mm/workingset.c
> > @@ -160,6 +160,21 @@
> >  MEM_CGROUP_ID_SHIFT)
> >  #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
> >
> > +#ifdef CONFIG_64BIT
> > +#define EVICTION_SECS_POS_SHIFT 20
> > +#define EVICTION_SECS_SHRINK_SHIFT 4
> > +#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> > +#else
> > +#ifndef CONFIG_MEMCG
> > +#define EVICTION_SECS_POS_SHIFT 12
> > +#define EVICTION_SECS_SHRINK_SHIFT 4
> > +#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> > +#else
> > +#define EVICTION_SECS_POS_SHIFT 0
> > +#define EVICTION_SECS_SHRINK_SHIFT 0
> > +#define NO_SECS_IN_WORKINGSET
> > +#endif
> > +#endif
> >  /*
> >   * Eviction timestamps need to be able to cover the full range of
> >   * 

Re: [RFC PATCH] mm/workingset : judge file page activity via timestamp

2019-04-17 Thread Zhaoyang Huang
add Johannes and answer his previous question.

@Johannes Weiner
Yes, I agree with the original idea of sacrificing pages with long access
distances when huge memory demands arise. The problem is the criterion for
the distance: as I note in the patch, some pages have a long refault_distance
while having a very short access time in between. I think the latter should
be taken into consideration as part of the final decision on whether the page
should be active or inactive.

On Wed, Apr 17, 2019 at 3:48 PM Zhaoyang Huang  wrote:
>
> From: Zhaoyang Huang 
>
> This patch introduce timestamp into workingset's entry and judge if the page
> is active or inactive via active_file/refault_ratio instead of refault 
> distance.
>
> The original thought is coming from the logs we got from trace_printk in this
> patch, we can find about 1/5 of the file pages' refault are under the
> scenario[1],which will be counted as inactive as they have a long refault 
> distance
> in between access. However, we can also know from the time information that 
> the
> page refault quickly as comparing to the average refault time which is 
> calculated
> by the number of active file and refault ratio. We want to save these kinds of
> pages from evicted earlier as it used to be. The refault ratio is the value
> which can reflect lru's average file access frequency and also can be deemed 
> as a
> prediction of future.
>
> The patch is tested on an android system and reduce 30% of page faults, while
> 60% of the pages remain the original status as (refault_distance < 
> active_file)
> indicates. Pages status got from ftrace during the test can refer to [2].
>
> [1]
> system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 
> rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 
> 97 pre_secs 97
> HwBinder:922  workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 
> rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 
> 101 pre_secs 99
>
> [2]
> WKST_ACT[0]:   original--INACTIVE  commit--ACTIVE
> WKST_ACT[1]:   original--ACTIVEcommit--ACTIVE
> WKST_INACT[0]: original--INACTIVE  commit--INACTIVE
> WKST_INACT[1]: original--ACTIVEcommit--INACTIVE
>
> Signed-off-by: Zhaoyang Huang 
> ---
>  include/linux/mmzone.h |   1 +
>  mm/workingset.c| 120 
> +
>  2 files changed, 112 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 32699b2..6f30673 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -240,6 +240,7 @@ struct lruvec {
> atomic_long_t   inactive_age;
> /* Refaults at the time of last reclaim cycle */
> unsigned long   refaults;
> +   atomic_long_t   refaults_ratio;
>  #ifdef CONFIG_MEMCG
> struct pglist_data *pgdat;
>  #endif
> diff --git a/mm/workingset.c b/mm/workingset.c
> index 40ee02c..66c177b 100644
> --- a/mm/workingset.c
> +++ b/mm/workingset.c
> @@ -160,6 +160,21 @@
>  MEM_CGROUP_ID_SHIFT)
>  #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
>
> +#ifdef CONFIG_64BIT
> +#define EVICTION_SECS_POS_SHIFT 20
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#ifndef CONFIG_MEMCG
> +#define EVICTION_SECS_POS_SHIFT 12
> +#define EVICTION_SECS_SHRINK_SHIFT 4
> +#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
> +#else
> +#define EVICTION_SECS_POS_SHIFT 0
> +#define EVICTION_SECS_SHRINK_SHIFT 0
> +#define NO_SECS_IN_WORKINGSET
> +#endif
> +#endif
>  /*
>   * Eviction timestamps need to be able to cover the full range of
>   * actionable refaults. However, bits are tight in the radix tree
> @@ -169,10 +184,54 @@
>   * evictions into coarser buckets by shaving off lower timestamp bits.
>   */
>  static unsigned int bucket_order __read_mostly;
> -
> +#ifdef NO_SECS_IN_WORKINGSET
> +static void pack_secs(unsigned long *peviction) { }
> +static unsigned int unpack_secs(unsigned long entry) {return 0; }
> +#else
> +/*
> + * Shrink the timestamp according to its value and store it together
> + * with the shrink size in the entry.
> + */
> +static void pack_secs(unsigned long *peviction)
> +{
> +   unsigned int secs;
> +   unsigned long eviction;
> +   int order;
> +   int secs_shrink_size;
> +   struct timespec ts;
> +
> +   get_monotonic_boottime();
> +   secs = (unsigned int)ts.tv_sec ? (unsig

[RFC PATCH] mm/workingset : judge file page activity via timestamp

2019-04-17 Thread Zhaoyang Huang
From: Zhaoyang Huang 

This patch introduces a timestamp into the workingset's shadow entry and
judges whether a page is active or inactive via active_file/refault_ratio
instead of the refault distance.

The original idea comes from the trace_printk logs collected with this patch:
about 1/5 of the file pages' refaults fall under scenario [1], where they are
counted as inactive because of a long refault distance between accesses.
However, the time information shows that these pages refault quickly compared
with the average refault time, which is calculated from the number of active
file pages and the refault ratio. We want to save such pages from being
evicted as early as they used to be. The refault ratio reflects the LRU's
average file access frequency and can also be deemed a prediction of the
future.

The patch was tested on an Android system and reduced page faults by 30%,
while 60% of the pages keep their original status, as
(refault_distance < active_file) indicates. Page status captured via ftrace
during the test can be found in [2].

[1]
system_server workingset_refault: WKST_ACT[0]:rft_dis 265976, act_file 34268 
rft_ratio 3047 rft_time 0 avg_rft_time 11 refault 295592 eviction 29616 secs 97 
pre_secs 97
HwBinder:922  workingset_refault: WKST_ACT[0]:rft_dis 264478, act_file 35037 
rft_ratio 3070 rft_time 2 avg_rft_time 11 refault 310078 eviction 45600 secs 
101 pre_secs 99

[2]
WKST_ACT[0]:   original--INACTIVE  commit--ACTIVE
WKST_ACT[1]:   original--ACTIVEcommit--ACTIVE
WKST_INACT[0]: original--INACTIVE  commit--INACTIVE
WKST_INACT[1]: original--ACTIVEcommit--INACTIVE

Signed-off-by: Zhaoyang Huang 
---
 include/linux/mmzone.h |   1 +
 mm/workingset.c| 120 +
 2 files changed, 112 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2..6f30673 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -240,6 +240,7 @@ struct lruvec {
atomic_long_t   inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long   refaults;
+   atomic_long_t   refaults_ratio;
 #ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
 #endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 40ee02c..66c177b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -160,6 +160,21 @@
 MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
 
+#ifdef CONFIG_64BIT
+#define EVICTION_SECS_POS_SHIFT 20
+#define EVICTION_SECS_SHRINK_SHIFT 4
+#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
+#else
+#ifndef CONFIG_MEMCG
+#define EVICTION_SECS_POS_SHIFT 12
+#define EVICTION_SECS_SHRINK_SHIFT 4
+#define EVICTION_SECS_POS_MASK  ((1UL << EVICTION_SECS_POS_SHIFT) - 1)
+#else
+#define EVICTION_SECS_POS_SHIFT 0
+#define EVICTION_SECS_SHRINK_SHIFT 0
+#define NO_SECS_IN_WORKINGSET
+#endif
+#endif
 /*
  * Eviction timestamps need to be able to cover the full range of
  * actionable refaults. However, bits are tight in the radix tree
@@ -169,10 +184,54 @@
  * evictions into coarser buckets by shaving off lower timestamp bits.
  */
 static unsigned int bucket_order __read_mostly;
-
+#ifdef NO_SECS_IN_WORKINGSET
+static void pack_secs(unsigned long *peviction) { }
+static unsigned int unpack_secs(unsigned long entry) {return 0; }
+#else
+/*
+ * Shrink the timestamp according to its value and store it together
+ * with the shrink size in the entry.
+ */
+static void pack_secs(unsigned long *peviction)
+{
+   unsigned int secs;
+   unsigned long eviction;
+   int order;
+   int secs_shrink_size;
+   struct timespec ts;
+
+   get_monotonic_boottime(&ts);
+   secs = (unsigned int)ts.tv_sec ? (unsigned int)ts.tv_sec : 1;
+   order = get_count_order(secs);
+   secs_shrink_size = (order <= EVICTION_SECS_POS_SHIFT)
+   ? 0 : (order - EVICTION_SECS_POS_SHIFT);
+
+   eviction = *peviction;
+   eviction = (eviction << EVICTION_SECS_POS_SHIFT)
+   | ((secs >> secs_shrink_size) & EVICTION_SECS_POS_MASK);
+   eviction = (eviction << EVICTION_SECS_SHRINK_SHIFT) | (secs_shrink_size & 0xf);
+   *peviction = eviction;
+}
+/*
+ * Unpack the second from the entry and restore the value according to the
+ * shrink size.
+ */
+static unsigned int unpack_secs(unsigned long entry)
+{
+   unsigned int secs;
+   int secs_shrink_size;
+
+   secs_shrink_size = entry & ((1 << EVICTION_SECS_SHRINK_SHIFT) - 1);
+   entry >>= EVICTION_SECS_SHRINK_SHIFT;
+   secs = entry & EVICTION_SECS_POS_MASK;
+   secs = secs << secs_shrink_size;
+   return secs;
+}
+#endif
 static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
 {
eviction >>= bucket_order;
+

Re: [PATCH] mm:workingset use real time to judge activity of the file page

2019-04-04 Thread Zhaoyang Huang
Resending via the right mailing list, with the comments rewritten by ZY.

On Thu, Apr 4, 2019 at 3:15 PM Michal Hocko  wrote:
>
> [Fixup email for Pavel and add Johannes]
>
> On Thu 04-04-19 11:30:17, Zhaoyang Huang wrote:
> > From: Zhaoyang Huang 
> >
> > In previous implementation, the number of refault pages is used
> > for judging the refault period of each page, which is not precised as
> > eviction of other files will be affect a lot on current cache.
> > We introduce the timestamp into the workingset's entry and refault ratio
> > to measure the file page's activity. It helps to decrease the affection
> > of other files(average refault ratio can reflect the view of whole system
> > 's memory).
> > The patch is tested on an Android system, which can be described as
> > comparing the launch time of an application between a huge memory
> > consumption. The result is launch time decrease 50% and the page fault
> > during the test decrease 80%.
> >
I don't understand what exactly you're saying here, can you please elaborate?

The reason it's using distances instead of absolute time is because
the ordering of the LRU is relative and not based on absolute time.

E.g. if a page is accessed every 500ms, it depends on all other pages
to determine whether this page is at the head or the tail of the LRU.

So when you refault, in order to determine the relative position of
the refaulted page in the LRU, you have to compare it to how fast that
LRU is moving. The absolute refault time, or the average time between
refaults, is not comparable to what's already in memory.

Comment by ZY:
With the current implementation, it is hard to evaluate the refault period
when a huge number of file pages are dropped within a short time, which may
be caused by a high-order allocation or continuous single-page allocations in
kswapd. In that case, a page with a big refault_distance will wrongly be
deemed INACTIVE, be reclaimed earlier than it should be, and lead to page
thrashing. So we introduce 'avg_refault_time' and 'refault_ratio' to judge
whether the refault distance accumulated slowly or was caused by a tight
round of reclaim. That is to say, a big refault_distance built up over a long
time would still count as inactive, as the result of comparing it with the
ideal time: avg_refault_time = delta_lru_reclaimed_pages / refault_ratio,
where refault_ratio = lru->inactive_ages / time.
> > Signed-off-by: Zhaoyang Huang 
> > ---
> >  include/linux/mmzone.h |  2 ++
> >  mm/workingset.c| 24 +---
> >  2 files changed, 19 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 32699b2..c38ba0a 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -240,6 +240,8 @@ struct lruvec {
> >   atomic_long_t   inactive_age;
> >   /* Refaults at the time of last reclaim cycle */
> >   unsigned long   refaults;
> > + atomic_long_t   refaults_ratio;
> > + atomic_long_t   prev_fault;
> >  #ifdef CONFIG_MEMCG
> >   struct pglist_data *pgdat;
> >  #endif
> > diff --git a/mm/workingset.c b/mm/workingset.c
> > index 40ee02c..6361853 100644
> > --- a/mm/workingset.c
> > +++ b/mm/workingset.c
> > @@ -159,7 +159,7 @@
> >NODES_SHIFT +  \
> >MEM_CGROUP_ID_SHIFT)
> >  #define EVICTION_MASK(~0UL >> EVICTION_SHIFT)
> > -
> > +#define EVICTION_JIFFIES (BITS_PER_LONG >> 3)
> >  /*
> >   * Eviction timestamps need to be able to cover the full range of
> >   * actionable refaults. However, bits are tight in the radix tree
> > @@ -175,18 +175,22 @@ static void *pack_shadow(int memcgid, pg_data_t 
> > *pgdat, unsigned long eviction)
> >   eviction >>= bucket_order;
> >   eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
> >   eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
> > + eviction = (eviction << EVICTION_JIFFIES) | (jiffies >> 
> > EVICTION_JIFFIES);
> >   eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
> >
> >   return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
> >  }
> >
> >  static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
> > -   unsigned long *evictionp)
> > +   unsigned long *evictionp, unsigned long *prev_jiffp)
> >  {
> >   unsigned long entry = (unsigned long)shadow;
> >   int memcgid, nid;
> > + unsigned long prev_jiff;
>

Re: [PATCH] mm:workingset use real time to judge activity of the file page

2019-04-04 Thread Zhaoyang Huang
On Fri, Apr 5, 2019 at 12:39 AM Johannes Weiner  wrote:
>
> On Thu, Apr 04, 2019 at 11:30:17AM +0800, Zhaoyang Huang wrote:
> > From: Zhaoyang Huang 
> >
> > In previous implementation, the number of refault pages is used
> > for judging the refault period of each page, which is not precised as
> > eviction of other files will be affect a lot on current cache.
> > We introduce the timestamp into the workingset's entry and refault ratio
> > to measure the file page's activity. It helps to decrease the affection
> > of other files(average refault ratio can reflect the view of whole system
> > 's memory).
>
> I don't understand what exactly you're saying here, can you please
> elaborate?
>
> The reason it's using distances instead of absolute time is because
> the ordering of the LRU is relative and not based on absolute time.
>
> E.g. if a page is accessed every 500ms, it depends on all other pages
> to determine whether this page is at the head or the tail of the LRU.
>
> So when you refault, in order to determine the relative position of
> the refaulted page in the LRU, you have to compare it to how fast that
> LRU is moving. The absolute refault time, or the average time between
> refaults, is not comparable to what's already in memory.
How do you know how long the dropping of these pages took? Actually, a quick
drop of a large amount of pages will wrongly be deemed a slow drop rather
than the tight situation it really is. That is to say, 100 pages evicted per
millisecond or per second have the same impact on the refault distance, which
may give this page cache less protection in the former scenario and introduce
page thrashing. This is especially true for global reclaim: a round of kswapd
reclaim woken by a high-order allocation or a large number of single-page
allocations can cause exactly that, since all pages within the node are
counted in the same LRU. This commit mitigates the above by comparing the
refault time of a single page with avg_refault_time =
delta_lru_reclaimed_pages / refault_ratio, where refault_ratio =
lru->inactive_ages / time.
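
A tiny illustration of that point (the numbers are made up and only meant to
show why the eviction rate matters):

#include <stdio.h>

int main(void)
{
	unsigned long distance = 100;     /* pages evicted in between */
	double burst_secs = 0.001;        /* dropped within ~1 ms */
	double steady_secs = 100.0;       /* dropped over ~100 s */

	/* a distance-only check cannot tell these two cases apart */
	printf("refault_distance: burst=%lu steady=%lu\n", distance, distance);

	/* but the eviction rate, and hence the refault time, differs wildly */
	printf("eviction rate: burst=%.0f pages/s steady=%.0f pages/s\n",
	       distance / burst_secs, distance / steady_secs);
	return 0;
}
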


[PATCH] mm:workingset use real time to judge activity of the file page

2019-04-03 Thread Zhaoyang Huang
From: Zhaoyang Huang 

In the previous implementation, the number of refaulted pages is used to
judge the refault period of each page, which is not precise because evictions
of other files heavily affect the current cache. We introduce a timestamp
into the workingset's entry, together with a refault ratio, to measure the
file page's activity. This helps reduce the influence of other files (the
average refault ratio reflects the view of the whole system's memory).
The patch was tested on an Android system by comparing the launch time of an
application under heavy memory consumption. The result is that launch time
decreased by 50% and page faults during the test decreased by 80%.

Signed-off-by: Zhaoyang Huang 
---
 include/linux/mmzone.h |  2 ++
 mm/workingset.c| 24 +---
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2..c38ba0a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -240,6 +240,8 @@ struct lruvec {
atomic_long_t   inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long   refaults;
+   atomic_long_t   refaults_ratio;
+   atomic_long_t   prev_fault;
 #ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
 #endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 40ee02c..6361853 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -159,7 +159,7 @@
 NODES_SHIFT +  \
 MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
-
+#define EVICTION_JIFFIES (BITS_PER_LONG >> 3)
 /*
  * Eviction timestamps need to be able to cover the full range of
  * actionable refaults. However, bits are tight in the radix tree
@@ -175,18 +175,22 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, 
unsigned long eviction)
eviction >>= bucket_order;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+   eviction = (eviction << EVICTION_JIFFIES) | (jiffies >> EVICTION_JIFFIES);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
 
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, unsigned long *prev_jiffp)
 {
unsigned long entry = (unsigned long)shadow;
int memcgid, nid;
+   unsigned long prev_jiff;
 
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+   entry >>= EVICTION_JIFFIES;
+   prev_jiff = (entry & ((1UL << EVICTION_JIFFIES) - 1)) << EVICTION_JIFFIES;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,6 +199,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, 
pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+   *prev_jiffp = prev_jiff;
 }
 
 /**
@@ -242,8 +247,12 @@ bool workingset_refault(void *shadow)
unsigned long refault;
struct pglist_data *pgdat;
int memcgid;
+   unsigned long refault_ratio;
+   unsigned long prev_jiff;
+   unsigned long avg_refault_time;
+   unsigned long refault_time;
 
-   unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+   unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_jiff);
 
rcu_read_lock();
/*
@@ -288,10 +297,11 @@ bool workingset_refault(void *shadow)
 * list is not a problem.
 */
refault_distance = (refault - eviction) & EVICTION_MASK;
-
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
-
-   if (refault_distance <= active_file) {
+   lruvec->refaults_ratio = atomic_long_read(&lruvec->inactive_age) / jiffies;
+   refault_time = jiffies - prev_jiff;
+   avg_refault_time = refault_distance / lruvec->refaults_ratio;
+   if (refault_time <= avg_refault_time) {
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
rcu_read_unlock();
return true;
@@ -521,7 +531,7 @@ static int __init workingset_init(void)
 * some more pages at runtime, so keep working with up to
 * double the initial memory by using totalram_pages as-is.
 */
-   timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+   timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT - EVICTION_JIFFIES;
max_order = fls_long(totalram_pages - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
-- 
1.9.1



[PATCH] mm:workingset use real time to judge activity of the file page

2019-04-03 Thread Zhaoyang Huang
From: Zhaoyang Huang 

In the previous implementation, the number of refaulted pages is used to
judge the refault period of each page, which is not precise.
We introduce a timestamp into the workingset's entry to measure
the file page's activity.

The patch was tested on an Android system by comparing the launch time of an
application under heavy memory consumption. The result is that launch time
decreased by 50% and page faults during the test decreased by 80%.

Signed-off-by: Zhaoyang Huang 
---
 include/linux/mmzone.h |  2 ++
 mm/workingset.c| 24 +---
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2..c38ba0a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -240,6 +240,8 @@ struct lruvec {
atomic_long_t   inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long   refaults;
+   atomic_long_t   refaults_ratio;
+   atomic_long_t   prev_fault;
 #ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
 #endif
diff --git a/mm/workingset.c b/mm/workingset.c
index 40ee02c..6361853 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -159,7 +159,7 @@
 NODES_SHIFT +  \
 MEM_CGROUP_ID_SHIFT)
 #define EVICTION_MASK  (~0UL >> EVICTION_SHIFT)
-
+#define EVICTION_JIFFIES (BITS_PER_LONG >> 3)
 /*
  * Eviction timestamps need to be able to cover the full range of
  * actionable refaults. However, bits are tight in the radix tree
@@ -175,18 +175,22 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, 
unsigned long eviction)
eviction >>= bucket_order;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
+   eviction = (eviction << EVICTION_JIFFIES) | (jiffies >> EVICTION_JIFFIES);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
 
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
 }
 
 static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, unsigned long *prev_jiffp)
 {
unsigned long entry = (unsigned long)shadow;
int memcgid, nid;
+   unsigned long prev_jiff;
 
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+   entry >>= EVICTION_JIFFIES;
+   prev_jiff = (entry & ((1UL << EVICTION_JIFFIES) - 1)) << EVICTION_JIFFIES;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,6 +199,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, 
pg_data_t **pgdat,
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+   *prev_jiffp = prev_jiff;
 }
 
 /**
@@ -242,8 +247,12 @@ bool workingset_refault(void *shadow)
unsigned long refault;
struct pglist_data *pgdat;
int memcgid;
+   unsigned long refault_ratio;
+   unsigned long prev_jiff;
+   unsigned long avg_refault_time;
+   unsigned long refault_time;
 
-   unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+   unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &prev_jiff);
 
rcu_read_lock();
/*
@@ -288,10 +297,11 @@ bool workingset_refault(void *shadow)
 * list is not a problem.
 */
refault_distance = (refault - eviction) & EVICTION_MASK;
-
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
-
-   if (refault_distance <= active_file) {
+   lruvec->refaults_ratio = atomic_long_read(&lruvec->inactive_age) / jiffies;
+   refault_time = jiffies - prev_jiff;
+   avg_refault_time = refault_distance / lruvec->refaults_ratio;
+   if (refault_time <= avg_refault_time) {
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
rcu_read_unlock();
return true;
@@ -521,7 +531,7 @@ static int __init workingset_init(void)
 * some more pages at runtime, so keep working with up to
 * double the initial memory by using totalram_pages as-is.
 */
-   timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+   timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT - EVICTION_JIFFIES;
max_order = fls_long(totalram_pages - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
-- 
1.9.1



Re: [PATCH] driver : staging : ion: optimization for decreasing memory fragmentaion

2019-03-20 Thread Zhaoyang Huang
On Wed, Mar 20, 2019 at 9:10 AM David Rientjes  wrote:
>
> On Thu, 14 Mar 2019, Zhaoyang Huang wrote:
>
> > From: Zhaoyang Huang 
> >
> > Two action for this patch:
> > 1. set a batch size for system heap's shrinker, which can have it buffer
> > reasonable page blocks in pool for future allocation.
> > 2. reverse the order sequence when free page blocks, the purpose is also
> > to have system heap keep as more big blocks as it can.
> >
> > By testing on an android system with 2G RAM, the changes with setting
> > batch = 48MB can help reduce the fragmentation obviously and improve
> > big block allocation speed for 15%.
> >
> > Signed-off-by: Zhaoyang Huang 
> > ---
> >  drivers/staging/android/ion/ion_heap.c| 12 +++-
> >  drivers/staging/android/ion/ion_system_heap.c |  2 +-
> >  2 files changed, 12 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/staging/android/ion/ion_heap.c 
> > b/drivers/staging/android/ion/ion_heap.c
> > index 31db510..9e9caf2 100644
> > --- a/drivers/staging/android/ion/ion_heap.c
> > +++ b/drivers/staging/android/ion/ion_heap.c
> > @@ -16,6 +16,8 @@
> >  #include 
> >  #include "ion.h"
> >
> > +unsigned long ion_heap_batch = 0;
>
> static?
ok
>
> > +
> >  void *ion_heap_map_kernel(struct ion_heap *heap,
> > struct ion_buffer *buffer)
> >  {
> > @@ -303,7 +305,15 @@ int ion_heap_init_shrinker(struct ion_heap *heap)
> >   heap->shrinker.count_objects = ion_heap_shrink_count;
> >   heap->shrinker.scan_objects = ion_heap_shrink_scan;
> >   heap->shrinker.seeks = DEFAULT_SEEKS;
> > - heap->shrinker.batch = 0;
> > + heap->shrinker.batch = ion_heap_batch;
> >
> >   return register_shrinker(>shrinker);
> >  }
> > +
> > +static int __init ion_system_heap_batch_init(char *arg)
> > +{
> > +  ion_heap_batch = memparse(arg, NULL);
> > +
>
> No bounds checking?  What are the legitimate upper and lower bounds here?
Actually, ion_heap_batch takes effect during shrink_slab, as shown below.
It is hard to set batch_size to a constant value, as the total RAM size
differs from system to system. Furthermore, there is no need for a
percentage-based setting either; "total_scan >= freeable" already works as
another threshold on the slab size.
...
while (total_scan >= batch_size ||
   total_scan >= freeable) {
unsigned long nr_to_scan = min(batch_size, total_scan);
ret = shrinker->scan_objects(shrinker, shrinkctl);
...
shrinkctl->nr_to_scan = nr_to_scan;
shrinkctl->nr_scanned = nr_to_scan;
ret = shrinker->scan_objects(shrinker, shrinkctl);
>
> > + return 0;
> > +}
> > +early_param("ion_batch", ion_system_heap_batch_init);
> > diff --git a/drivers/staging/android/ion/ion_system_heap.c 
> > b/drivers/staging/android/ion/ion_system_heap.c
> > index 701eb9f..d249f8d 100644
> > --- a/drivers/staging/android/ion/ion_system_heap.c
> > +++ b/drivers/staging/android/ion/ion_system_heap.c
> > @@ -182,7 +182,7 @@ static int ion_system_heap_shrink(struct ion_heap 
> > *heap, gfp_t gfp_mask,
> >   if (!nr_to_scan)
> >   only_scan = 1;
> >
> > - for (i = 0; i < NUM_ORDERS; i++) {
> > + for (i = NUM_ORDERS - 1; i >= 0; i--) {
> >   pool = sys_heap->pools[i];
> >
> >   if (only_scan) {
>
> Can we get a Documentation update on how we can use ion_batch and what the
> appropriate settings are (and in what circumstances)?
OK, I will explain it here first.
ion_heap_batch works as the batch_size during shrink_slab, which lets the
heap keep some page blocks buffered in its pool for future allocations.
My test is based on an Android system with 2GB of RAM. We find that
multimedia-related use cases are the chief consumers of the ion system heap
and cause memory fragmentation after a period of running. By configuring
ion_heap_batch as 48M (3x the camera's peak consumption) and reversing the
shrink order, we can almost eliminate that scenario during the test and
improve allocation speed by up to 15%.
As a general policy, the batch size should depend on the practical scenario;
the peak value can be obtained via sysfs or the kernel log.
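
For reference, with this patch applied the value used in the test above would
be passed on the kernel command line roughly like this (illustrative only;
the value has to be tuned per device):

	ion_batch=48M

memparse() accepts the usual K/M/G suffixes, so a plain byte count works as
well.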


[PATCH] driver : staging : ion: optimization for decreasing memory fragmentaion

2019-03-14 Thread Zhaoyang Huang
From: Zhaoyang Huang 

Two actions in this patch:
1. Set a batch size for the system heap's shrinker, so that it keeps a
reasonable number of page blocks buffered in the pool for future allocations.
2. Reverse the order in which page blocks are freed, so that the system heap
keeps as many big blocks as it can.

In testing on an Android system with 2GB of RAM, setting batch = 48MB
noticeably reduces fragmentation and improves big-block allocation speed
by 15%.

Signed-off-by: Zhaoyang Huang 
---
 drivers/staging/android/ion/ion_heap.c| 12 +++-
 drivers/staging/android/ion/ion_system_heap.c |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/android/ion/ion_heap.c b/drivers/staging/android/ion/ion_heap.c
index 31db510..9e9caf2 100644
--- a/drivers/staging/android/ion/ion_heap.c
+++ b/drivers/staging/android/ion/ion_heap.c
@@ -16,6 +16,8 @@
 #include 
 #include "ion.h"
 
+unsigned long ion_heap_batch = 0;
+
 void *ion_heap_map_kernel(struct ion_heap *heap,
  struct ion_buffer *buffer)
 {
@@ -303,7 +305,15 @@ int ion_heap_init_shrinker(struct ion_heap *heap)
heap->shrinker.count_objects = ion_heap_shrink_count;
heap->shrinker.scan_objects = ion_heap_shrink_scan;
heap->shrinker.seeks = DEFAULT_SEEKS;
-   heap->shrinker.batch = 0;
+   heap->shrinker.batch = ion_heap_batch;
 
return register_shrinker(&heap->shrinker);
 }
+
+static int __init ion_system_heap_batch_init(char *arg)
+{
+ion_heap_batch = memparse(arg, NULL);
+
+   return 0;
+}
+early_param("ion_batch", ion_system_heap_batch_init);
diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c
index 701eb9f..d249f8d 100644
--- a/drivers/staging/android/ion/ion_system_heap.c
+++ b/drivers/staging/android/ion/ion_system_heap.c
@@ -182,7 +182,7 @@ static int ion_system_heap_shrink(struct ion_heap *heap, 
gfp_t gfp_mask,
if (!nr_to_scan)
only_scan = 1;
 
-   for (i = 0; i < NUM_ORDERS; i++) {
+   for (i = NUM_ORDERS - 1; i >= 0; i--) {
pool = sys_heap->pools[i];
 
if (only_scan) {
-- 
1.9.1



[PATCH] mm:vmalloc add vm_struct for vm_map_ram

2018-11-08 Thread Zhaoyang Huang
From: Zhaoyang Huang 

There is no caller or pages information etc. for an area created by
vm_map_ram when the page count is larger than VMAP_MAX_ALLOC.
Add that information in this commit.

Signed-off-by: Zhaoyang Huang 
---
 mm/vmalloc.c | 30 --
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cfea25b..819b690 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -45,7 +45,8 @@ struct vfree_deferred {
 static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
 
 static void __vunmap(const void *, int);
-
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, const void *caller);
 static void free_work(struct work_struct *w)
 {
struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
@@ -1138,6 +1139,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(!va);
debug_check_no_locks_freed((void *)va->va_start,
(va->va_end - va->va_start));
+   kfree(va->vm);
free_unmap_vmap_area(va);
 }
 EXPORT_SYMBOL(vm_unmap_ram);
@@ -1170,6 +1172,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, 
int node, pgprot_t pro
addr = (unsigned long)mem;
} else {
struct vmap_area *va;
+   struct vm_struct *area;
+
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
if (IS_ERR(va))
@@ -1177,11 +1181,17 @@ void *vm_map_ram(struct page **pages, unsigned int 
count, int node, pgprot_t pro
 
addr = va->va_start;
mem = (void *)addr;
+   area = kzalloc_node(sizeof(*area), GFP_KERNEL, node);
+   if (likely(area)) {
+   setup_vmalloc_vm(area, va, 0, __builtin_return_address(0));
+   va->flags &= ~VM_VM_AREA;
+   }
}
if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
+
return mem;
 }
 EXPORT_SYMBOL(vm_map_ram);
@@ -2688,19 +2698,19 @@ static int s_show(struct seq_file *m, void *p)
 * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
 * behalf of vmap area is being tear down or vm_map_ram allocation.
 */
-   if (!(va->flags & VM_VM_AREA)) {
-   seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
-   (void *)va->va_start, (void *)va->va_end,
-   va->va_end - va->va_start,
-   va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
-
+   if (!(va->flags & VM_VM_AREA) && !va->vm)
return 0;
-   }
 
v = va->vm;
 
-   seq_printf(m, "0x%pK-0x%pK %7ld",
-   v->addr, v->addr + v->size, v->size);
+   if (!(va->flags & VM_VM_AREA))
+   seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
+   (void *)va->va_start, (void *)va->va_end,
+   va->va_end - va->va_start,
+   va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
+   else
+   seq_printf(m, "0x%pK-0x%pK %7ld",
+   v->addr, v->addr + v->size, v->size);
 
if (v->caller)
seq_printf(m, " %pS", v->caller);
-- 
1.9.1



[PATCH] arch/arm64 : fix error in dump_backtrace

2018-11-05 Thread Zhaoyang Huang
From: Zhaoyang Huang 

In some cases, the "bl foo1" instruction is the last one of foo2[1], which
makes the saved lr point at the first instruction of the adjacent foo3[2].
Hence, the backtrace shows the weird result in [3]. This patch fixes it by
subtracting 4 from the lr in dump_backtrace.

[1]
0xff80081e6b04 :  adrpx0, 0xff8008ca8000
0xff80081e6b08 :  add x0, x0, #0x5a8
0xff80081e6b0c :  bl  0xff80081b0ca0 

0xff80081e6b10 :  stp x29, x30, [sp,#-64]!
0xff80081e6b14 :mov x29, sp

[2]
crash_arm64> rd ffc02eec3bd0 2
ffc02eec3bd0:  ffc02eec3cb0 ff80081e6b10

[3]
wrong:
[] panic+0xf0/0x24c
[] access_remote_vm+0x0/0x5c
[] do_page_fault+0x290/0x3b8
[] do_mem_abort+0x64/0xdc

correct:
[ffc02eec3bd0] panic at ff80081b0da4
[ffc02eec3cb0] handle_mm_fault at ff80081e6b0c
[ffc02eec3d80] do_page_fault at ff800809d7ac
[ffc02eec3df0] do_mem_abort at ff800808156c

Signed-off-by: Zhaoyang Huang 
---
 arch/arm64/kernel/traps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index d399d45..7a097cc 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -113,7 +113,7 @@ void dump_backtrace(struct pt_regs *regs, struct 
task_struct *tsk)
 
if (tsk == current) {
frame.fp = (unsigned long)__builtin_frame_address(0);
-   frame.pc = (unsigned long)dump_backtrace;
+   frame.pc = (unsigned long)dump_backtrace + 4;
} else {
/*
 * task blocked in __switch_to
@@ -130,7 +130,7 @@ void dump_backtrace(struct pt_regs *regs, struct 
task_struct *tsk)
do {
/* skip until specified stack frame */
if (!skip) {
-   dump_backtrace_entry(frame.pc);
+   dump_backtrace_entry(frame.pc - 4);
} else if (frame.fp == regs->regs[29]) {
skip = 0;
/*
-- 
1.9.1



Re: [PATCH v1] mm:memcg: skip memcg of current in mem_cgroup_soft_limit_reclaim

2018-08-03 Thread Zhaoyang Huang
On Fri, Aug 3, 2018 at 2:18 PM Michal Hocko  wrote:
>
> On Fri 03-08-18 14:11:26, Zhaoyang Huang wrote:
> > On Fri, Aug 3, 2018 at 1:48 PM Zhaoyang Huang  
> > wrote:
> > >
> > > for the soft_limit reclaim has more directivity than global reclaim, 
> > > we
> > > have current memcg be skipped to avoid potential page thrashing.
> > >
> > The patch is tested in our android system with 2GB ram.  The case
> > mainly focus on the smooth slide of pictures on a gallery, which used
> > to stall on the direct reclaim for over several hundred
> > millionseconds. By further debugging, we find that the direct reclaim
> > spend most of time to reclaim pages on its own with softlimit set to
> > 40960KB. I add a ftrace event to verify that the patch can help
> > escaping such scenario. Furthermore, we also measured the major fault
> > of this process(by dumpsys of android). The result is the patch can
> > help to reduce 20% of the major fault during the test.
>
> I have asked already asked. Why do you use the soft limit in the first
> place? It is known to cause excessive reclaim and long stalls.

It is required by Google for the new version of the Android system.
There was a mechanism called LMK (low memory killer) in previous
Android versions, which kills processes under memory contention much
like the OOM killer does. I think Google wants to drop that rough way
of reclaiming pages and turn to memcg instead. They set up different
memcg groups for the different processes of the system and set their
soft limits according to oom_adj. Their original purpose is to reclaim
pages gently in direct reclaim and kswapd. During the debugging
process, it seems to me that memcg may be tunable somehow. At least,
the patch works on our system.
> --
> Michal Hocko
> SUSE Labs


Re: [PATCH v1] mm:memcg: skip memcg of current in mem_cgroup_soft_limit_reclaim

2018-08-03 Thread Zhaoyang Huang
On Fri, Aug 3, 2018 at 1:48 PM Zhaoyang Huang  wrote:
>
> for the soft_limit reclaim has more directivity than global reclaim, we
> have current memcg be skipped to avoid potential page thrashing.
>
The patch is tested on our Android system with 2GB of RAM. The case
mainly focuses on smoothly sliding pictures in a gallery app, which
used to stall in direct reclaim for several hundred milliseconds. By
further debugging, we find that direct reclaim spends most of its time
reclaiming pages from its own memcg, with the softlimit set to
40960KB. I added an ftrace event to verify that the patch helps escape
this scenario. Furthermore, we also measured the major faults of this
process (via Android's dumpsys). The result is that the patch reduces
major faults by about 20% during the test.

> Signed-off-by: Zhaoyang Huang 
> ---
>  mm/memcontrol.c | 11 ++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 8c0280b..9d09e95 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2537,12 +2537,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t 
> *pgdat, int order,
> mz = mem_cgroup_largest_soft_limit_node(mctz);
> if (!mz)
> break;
> -
> +   /*
> +* skip current memcg to avoid page thrashing, for the
> +* mem_cgroup_soft_reclaim has more directivity than
> +* global reclaim.
> +*/
> +   if (get_mem_cgroup_from_mm(current->mm) == mz->memcg) {
> +   reclaimed = 0;
> +   goto next;
> +   }
> nr_scanned = 0;
> reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
> gfp_mask, _scanned);
> nr_reclaimed += reclaimed;
> *total_scanned += nr_scanned;
> +next:
> spin_lock_irq(>lock);
> __mem_cgroup_remove_exceeded(mz, mctz);
>
> --
> 1.9.1
>


[PATCH v1] mm:memcg: skip memcg of current in mem_cgroup_soft_limit_reclaim

2018-08-02 Thread Zhaoyang Huang
Since soft_limit reclaim has more directivity than global reclaim, have
the current memcg be skipped to avoid potential page thrashing.
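
For reference, a minimal sketch of the intended check with balanced
reference handling (an assumption on my side: get_mem_cgroup_from_mm()
returns the memcg with a css reference held, so a complete version of
the skip would also drop that reference again; this is not part of the
diff below):

	struct mem_cgroup *cur_memcg;
	bool skip;

	cur_memcg = get_mem_cgroup_from_mm(current->mm);
	skip = (cur_memcg == mz->memcg);	/* reclaiming our own memcg? */
	css_put(&cur_memcg->css);		/* drop the reference taken above */
	if (skip)
		goto next;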

Signed-off-by: Zhaoyang Huang 
---
 mm/memcontrol.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b..9d09e95 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2537,12 +2537,21 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t 
*pgdat, int order,
mz = mem_cgroup_largest_soft_limit_node(mctz);
if (!mz)
break;
-
+   /*
+* skip current memcg to avoid page thrashing, for the
+* mem_cgroup_soft_reclaim has more directivity than
+* global reclaim.
+*/
+   if (get_mem_cgroup_from_mm(current->mm) == mz->memcg) {
+   reclaimed = 0;
+   goto next;
+   }
nr_scanned = 0;
reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
gfp_mask, _scanned);
nr_reclaimed += reclaimed;
*total_scanned += nr_scanned;
+next:
spin_lock_irq(>lock);
__mem_cgroup_remove_exceeded(mz, mctz);
 
-- 
1.9.1



Re: [PATCH v2] mm: terminate the reclaim early when direct reclaiming

2018-07-31 Thread Zhaoyang Huang
On Tue, Jul 31, 2018 at 7:19 PM Michal Hocko  wrote:
>
> On Tue 31-07-18 19:09:28, Zhaoyang Huang wrote:
> > This patch try to let the direct reclaim finish earlier than it used
> > to be. The problem comes from We observing that the direct reclaim
> > took a long time to finish when memcg is enabled. By debugging, we
> > find that the reason is the softlimit is too low to meet the loop
> > end criteria. So we add two barriers to judge if it has reclaimed
> > enough memory as same criteria as it is in shrink_lruvec:
> > 1. for each memcg softlimit reclaim.
> > 2. before starting the global reclaim in shrink_zone.
>
> Then I would really recommend to not use soft limit at all. It has
> always been aggressive. I have propose to make it less so in the past we
> have decided to go that way because we simply do not know whether
> somebody depends on that behavior. Your changelog doesn't really tell
> the whole story. Why is this a problem all of the sudden? Nothing has
> really changed recently AFAICT. Cgroup v1 interface is mostly for
> backward compatibility, we have much better ways to accomplish
> workloads isolation in cgroup v2.
>
> So why does it matter all of the sudden?
>
> Besides that EXPORT_SYMBOL for such a low level functionality as the
> memory reclaim is a big no-no.
>
> So without a much better explanation and with a low level symbol
> exported NAK from me.
>
My test workload is from an Android system, where the multimedia apps
require many pages. We observed that one thread of the process got
trapped in mem_cgroup_soft_limit_reclaim within direct reclaim and also
blocked other threads in mmap or do_page_fault (via a semaphore?).
Furthermore, we also observed other long direct reclaims related to the
soft limit, which presumably cause page thrashing since the allocator
itself is the rightmost node of the rb_tree. Besides, even without the
soft limit, shouldn't direct reclaim check the watermark first, before
shrink_node, since the concurrent kswapd may have already reclaimed
enough pages for the allocation?
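
A rough sketch of the early watermark check meant here (an illustration
only, not the posted patch; the helper name and the exact parameters of
zone_watermark_ok()/for_each_zone_zonelist_nodemask() are assumptions
and may differ between kernel versions):

	/*
	 * Bail out of direct reclaim early if some eligible zone already
	 * meets its high watermark, e.g. because a concurrent kswapd has
	 * reclaimed enough pages.
	 */
	static bool zones_already_ok(struct zonelist *zonelist, int order,
				     int highest_zoneidx, nodemask_t *nodemask)
	{
		struct zoneref *z;
		struct zone *zone;

		for_each_zone_zonelist_nodemask(zone, z, zonelist,
						highest_zoneidx, nodemask) {
			if (zone_watermark_ok(zone, order,
					      high_wmark_pages(zone),
					      highest_zoneidx, 0))
				return true;
		}
		return false;
	}
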
> >
> > Signed-off-by: Zhaoyang Huang 
> > ---
> >  include/linux/memcontrol.h |  3 ++-
> >  mm/memcontrol.c|  3 +++
> >  mm/vmscan.c| 38 +-
> >  3 files changed, 42 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 6c6fb11..a7e82c7 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -325,7 +325,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct 
> > mem_cgroup *memcg,
> >  void mem_cgroup_uncharge_list(struct list_head *page_list);
> >
> >  void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
> > -
> > +bool direct_reclaim_reach_watermark(pg_data_t *pgdat, unsigned long 
> > nr_reclaimed,
> > + unsigned long nr_scanned, gfp_t gfp_mask, int order);
> >  static struct mem_cgroup_per_node *
> >  mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
> >  {
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 8c0280b..e4efd46 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -2577,6 +2577,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t 
> > *pgdat, int order,
> >   (next_mz == NULL ||
> >   loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
> >   break;
> > + if (direct_reclaim_reach_watermark(pgdat, nr_reclaimed,
> > + *total_scanned, gfp_mask, order))
> > + break;
> >   } while (!nr_reclaimed);
> >   if (next_mz)
> >   css_put(_mz->memcg->css);
> > diff --git a/mm/vmscan.c b/mm/vmscan.c
> > index 03822f8..19503f3 100644
> > --- a/mm/vmscan.c
> > +++ b/mm/vmscan.c
> > @@ -2518,6 +2518,34 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, 
> > struct mem_cgroup *memcg)
> >   (memcg && memcg_congested(pgdat, memcg));
> >  }
> >
> > +bool direct_reclaim_reach_watermark(pg_data_t *pgdat, unsigned long 
> > nr_reclaimed,
> > + unsigned long nr_scanned, gfp_t gfp_mask,
> > + int order)
> > +{
> > + struct scan_control sc = {
> > + .gfp_mask = gfp_mask,
> > + .order = order,
> > + .priority = DEF_PRIORITY,
> > + .nr_reclaimed = nr_reclaimed,
> > + .nr_scanned = nr_scanned,
> > + };
> > + if (!cur

[PATCH v2] mm: terminate the reclaim early when direct reclaiming

2018-07-31 Thread Zhaoyang Huang
This patch tries to let direct reclaim finish earlier than it used to.
The problem comes from our observation that direct reclaim takes a
long time to finish when memcg is enabled. By debugging, we find that
the reason is that the softlimit is too low to meet the loop end
criteria. So we add two checks that judge whether enough memory has
been reclaimed, using the same criteria as shrink_lruvec:
1. for each memcg softlimit reclaim.
2. before starting the global reclaim in shrink_zone.

Signed-off-by: Zhaoyang Huang 
---
 include/linux/memcontrol.h |  3 ++-
 mm/memcontrol.c|  3 +++
 mm/vmscan.c| 38 +-
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb11..a7e82c7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -325,7 +325,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct 
mem_cgroup *memcg,
 void mem_cgroup_uncharge_list(struct list_head *page_list);
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
-
+bool direct_reclaim_reach_watermark(pg_data_t *pgdat, unsigned long 
nr_reclaimed,
+   unsigned long nr_scanned, gfp_t gfp_mask, int order);
 static struct mem_cgroup_per_node *
 mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b..e4efd46 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2577,6 +2577,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t 
*pgdat, int order,
(next_mz == NULL ||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
break;
+   if (direct_reclaim_reach_watermark(pgdat, nr_reclaimed,
+   *total_scanned, gfp_mask, order))
+   break;
} while (!nr_reclaimed);
if (next_mz)
css_put(_mz->memcg->css);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..19503f3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2518,6 +2518,34 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, 
struct mem_cgroup *memcg)
(memcg && memcg_congested(pgdat, memcg));
 }
 
+bool direct_reclaim_reach_watermark(pg_data_t *pgdat, unsigned long 
nr_reclaimed,
+   unsigned long nr_scanned, gfp_t gfp_mask,
+   int order)
+{
+   struct scan_control sc = {
+   .gfp_mask = gfp_mask,
+   .order = order,
+   .priority = DEF_PRIORITY,
+   .nr_reclaimed = nr_reclaimed,
+   .nr_scanned = nr_scanned,
+   };
+   if (!current_is_kswapd())
+   return false;
+   if (!IS_ENABLED(CONFIG_COMPACTION))
+   return false;
+   /*
+* In fact, we add 1 to nr_reclaimed and nr_scanned to let 
should_continue_reclaim
+* NOT return by finding they are zero, which means 
compaction_suitable()
+* takes effect here to judge if we have reclaimed enough pages for 
passing
+* the watermark and no necessary to check other memcg anymore.
+*/
+   if (!should_continue_reclaim(pgdat,
+   sc.nr_reclaimed + 1, sc.nr_scanned + 1, ))
+   return true;
+   return false;
+}
+EXPORT_SYMBOL(direct_reclaim_reach_watermark);
+
 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2802,7 +2830,15 @@ static void shrink_zones(struct zonelist *zonelist, 
struct scan_control *sc)
sc->nr_scanned += nr_soft_scanned;
/* need some check for avoid more shrink_zone() */
}
-
+   /*
+* we maybe have stolen enough pages from soft limit reclaim, 
so we return
+* back if we are direct reclaim
+*/
+   if (direct_reclaim_reach_watermark(zone->zone_pgdat, 
sc->nr_reclaimed,
+   sc->nr_scanned, sc->gfp_mask, 
sc->order)) {
+   sc->gfp_mask = orig_mask;
+   return;
+   }
/* See comment about same check for global reclaim above */
if (zone->zone_pgdat == last_pgdat)
continue;
-- 
1.9.1



[PATCH] mm: terminate the reclaim early when direct reclaiming

2018-07-27 Thread Zhaoyang Huang
This patch tries to let direct reclaim finish earlier than it used to.
The problem comes from our observation that direct reclaim takes a
long time to finish when memcg is enabled. By debugging, we find that
the reason is that the softlimit is too low to meet the loop end
criteria. So we add two checks that judge whether enough memory has
been reclaimed, using the same criteria as shrink_lruvec:
1. for each memcg softlimit reclaim.
2. before starting the global reclaim in shrink_zone.

Signed-off-by: Zhaoyang Huang 
---
 include/linux/memcontrol.h |  3 ++-
 mm/memcontrol.c|  3 +++
 mm/vmscan.c| 24 
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb11..cdf5de6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -325,7 +325,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct 
mem_cgroup *memcg,
 void mem_cgroup_uncharge_list(struct list_head *page_list);
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage);
-
+bool direct_reclaim_reach_sflimit(pg_data_t *pgdat, unsigned long nr_reclaimed,
+   unsigned long nr_scanned, gfp_t gfp_mask, int order);
 static struct mem_cgroup_per_node *
 mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b..4e38223 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2577,6 +2577,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t 
*pgdat, int order,
(next_mz == NULL ||
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
break;
+   if (direct_reclaim_reach_sflimit(pgdat, nr_reclaimed,
+   *total_scanned, gfp_mask, order))
+   break;
} while (!nr_reclaimed);
if (next_mz)
css_put(_mz->memcg->css);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..77fcda4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2518,12 +2518,36 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, 
struct mem_cgroup *memcg)
(memcg && memcg_congested(pgdat, memcg));
 }
 
+bool direct_reclaim_reach_sflimit(pg_data_t *pgdat, unsigned long nr_reclaimed,
+   unsigned long nr_scanned, gfp_t gfp_mask,
+   int order)
+{
+   struct scan_control sc = {
+   .gfp_mask = gfp_mask,
+   .order = order,
+   .priority = DEF_PRIORITY,
+   .nr_reclaimed = nr_reclaimed,
+   .nr_scanned = nr_scanned,
+   };
+   if (!current_is_kswapd() && !should_continue_reclaim(pgdat,
+   sc.nr_reclaimed, sc.nr_scanned, ))
+   return true;
+   return false;
+}
+EXPORT_SYMBOL(direct_reclaim_reach_sflimit);
+
 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
 
+   if (!current_is_kswapd() && !should_continue_reclaim(pgdat,
+   sc->nr_reclaimed, sc->nr_scanned, sc)) {
+
+   return !!sc->nr_reclaimed;
+   }
+
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
struct mem_cgroup_reclaim_cookie reclaim = {
-- 
1.9.1



Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-11 Thread Zhaoyang Huang
On Wed, Apr 11, 2018 at 2:39 AM, Joel Fernandes  wrote:
> Hi Steve,
>
> On Tue, Apr 10, 2018 at 11:00 AM, Steven Rostedt  wrote:
>> On Tue, 10 Apr 2018 09:45:54 -0700
>> Joel Fernandes  wrote:
>>
>>> > diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
>>> > index a0233edc0718..807e2bcb21b3 100644
>>> > --- a/include/linux/ring_buffer.h
>>> > +++ b/include/linux/ring_buffer.h
>>> > @@ -106,7 +106,8 @@ __poll_t ring_buffer_poll_wait(struct ring_buffer 
>>> > *buffer, int cpu,
>>> >
>>> >  void ring_buffer_free(struct ring_buffer *buffer);
>>> >
>>> > -int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, 
>>> > int cpu);
>>> > +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
>>> > +   int cpu, int rbflags);
>>> >
>>> >  void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val);
>>> >
>>> > @@ -201,6 +202,7 @@ int ring_buffer_print_page_header(struct trace_seq 
>>> > *s);
>>> >
>>> >  enum ring_buffer_flags {
>>> > RB_FL_OVERWRITE = 1 << 0,
>>> > +   RB_FL_NO_RECLAIM= 1 << 1,
>>>
>>> But the thing is, set_oom_origin doesn't seem to be doing the
>>> desirable thing every time anyway as per my tests last week [1] and
>>> the si_mem_available check alone seems to be working fine for me (and
>>> also Zhaoyang as he mentioned).
>>
>> But did you try it with just plain GFP_KERNEL, and not RETRY_MAYFAIL.
>
> Yes I tried it with just GFP_KERNEL as well. What I did based on your
> suggestion for testing the OOM hint is:
> 1. Comment the si_mem_available check
> 2. Do only GFP_KERNEL
>
> The system gets destabilized with this combination even with the OOM
> hint. These threads are here:
> https://lkml.org/lkml/2018/4/5/720
>
>> My tests would always trigger the allocating task without the
>> RETRY_MAYFAIL, but with RETRY_MAYFAIL it would sometimes take out other
>> tasks.
>>
>>>
>>> Since the problem Zhaoyang is now referring to is caused because of
>>> calling set_oom_origin in the first place, can we not just drop that
>>> patch and avoid adding more complexity?
>>
>> Actually, I'm thinking of dropping the MAYFAIL part. It really should
>> be the one targeted if you are extending the ring buffer.
>
> This then sounds like it should be fixed in -mm code? If we're giving
> the hint and its not getting killed there then that's an -mm issue.
>
>> I could add two loops. One that does NORETRY without the oom origin,
>> and if it succeeds, its fine. But if it requires reclaim, it will then
>> set oom_origin and go harder (where it should be the one targeted).
>>
>> But that may be pointless, because if NORETRY succeeds, there's not
>> really any likelihood of oom triggering in the first place.
>
> Yes.
>
>>
>>>
>>> IMHO I feel like for things like RB memory allocation, we shouldn't
>>> add a knob if we don't need to.
>>
>> It was just a suggestion.
>
> Cool, I understand.
>
>>>
>>> Also I think Zhaoyang is developing for Android too since he mentioned
>>> he ran CTS tests so we both have the same "usecase" but he can feel
>>> free to correct me if that's not the case ;)
>>
>> I think if you are really worried with the task being killed by oom,
>> then I agree with Michal and just fork a process to do the allocation
>> for you.
>
> Yes I agree. So lets just do that and no other patches additional
> patches are needed then. Let me know if there's anything else I
> missed?
>
> Also I got a bit confused, I reread all the threads. Zhaoyang's
> current issue is that the OOM hint *IS* working which is what
> triggered your patch to toggle the behavior through an option. Where
> was in this message we are discussing that the OOM hint doesn't always
> work which is not Zhaoyang's current issue. Let me know if I missed
> something? Sorry if I did.
>
> thanks,
>
> - Joel
Hi Joel, you are right. My issue is to make Steven's patch safer by
keeping -1000 processes out of OOM. I think it is OK either to rely on
the si_mem_available check alone or to apply
set/clear_current_oom_origin while exempting -1000 processes. The CTS
case failed because system_server was killed as an innocent victim. If
Steven thinks it is a rare corner case, I am OK with that.


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-10 Thread Zhaoyang Huang
On Tue, Apr 10, 2018 at 5:32 PM, Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
> On Tue, Apr 10, 2018 at 5:01 PM, Michal Hocko <mho...@kernel.org> wrote:
>> On Tue 10-04-18 16:38:32, Zhaoyang Huang wrote:
>>> On Tue, Apr 10, 2018 at 4:12 PM, Michal Hocko <mho...@kernel.org> wrote:
>>> > On Tue 10-04-18 16:04:40, Zhaoyang Huang wrote:
>>> >> On Tue, Apr 10, 2018 at 3:49 PM, Michal Hocko <mho...@kernel.org> wrote:
>>> >> > On Tue 10-04-18 14:39:35, Zhaoyang Huang wrote:
>>> >> >> On Tue, Apr 10, 2018 at 2:14 PM, Michal Hocko <mho...@kernel.org> 
>>> >> >> wrote:
>>> > [...]
>>> >> >> > OOM_SCORE_ADJ_MIN means "hide the process from the OOM killer 
>>> >> >> > completely".
>>> >> >> > So what exactly do you want to achieve here? Because from the above 
>>> >> >> > it
>>> >> >> > sounds like opposite things. /me confused...
>>> >> >> >
>>> >> >> Steve's patch intend to have the process be OOM's victim when it
>>> >> >> over-allocating pages for ring buffer. I amend a patch over to protect
>>> >> >> process with OOM_SCORE_ADJ_MIN from doing so. Because it will make
>>> >> >> such process to be selected by current OOM's way of
>>> >> >> selecting.(consider OOM_FLAG_ORIGIN first before the adj)
>>> >> >
>>> >> > I just wouldn't really care unless there is an existing and reasonable
>>> >> > usecase for an application which updates the ring buffer size _and_ it
>>> >> > is OOM disabled at the same time.
>>> >> There is indeed such kind of test case on my android system, which is
>>> >> known as CTS and Monkey etc.
>>> >
>>> > Does the test simulate a real workload? I mean we have two things here
>>> >
>>> > oom disabled task and an updater of the ftrace ring buffer to a
>>> > potentially large size. The second can be completely isolated to a
>>> > different context, no? So why do they run in the single user process
>>> > context?
>>> ok. I think there are some misunderstandings here. Let me try to
>>> explain more by my poor English. There is just one thing here. The
>>> updater is originally a oom disabled task with adj=OOM_SCORE_ADJ_MIN.
>>> With Steven's patch, it will periodically become a oom killable task
>>> by calling set_current_oom_origin() for user process which is
>>> enlarging the ring buffer. What I am doing here is limit the user
>>> process to the ones that adj > -1000.
>>
>> I've understood that part. And I am arguing whether this is really such
>> an important case to play further tricks. Wouldn't it be much simpler to
>> put the updater out to a separate process? OOM disabled processes
>> shouldn't really do unexpectedly large allocations. Full stop. Otherwise
>> you risk a large system disruptions.
>> --
> It is a real problem(my android system just hung there while running
> the test case for the innocent key process killed by OOM), however,
> the problem is we can not define the userspace's behavior as you
> suggested. What Steven's patch doing here is to keep the system to be
> stable by having the updater to take the responsbility itself. My
> patch is to let the OOM disabled processes remain the unkillable
> status.
>
>> Michal Hocko
>> SUSE Labs
To summarize, the patch set amounts to 'let the updater take the
responsibility itself, do no harm to the innocent, and absolve the
critical process'.
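
In code terms, the guard under discussion is roughly the following (a
sketch that combines Steven's patch with the proposed -1000 check; not
a literal diff, and the allocation loop itself is elided):

	bool user_thread = current->mm &&
			   current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN;

	/*
	 * Only a user thread that is not already OOM-disabled volunteers
	 * itself as the OOM victim while it enlarges the ring buffer.
	 */
	if (user_thread)
		set_current_oom_origin();

	/* ... allocate buffer pages with GFP_KERNEL | __GFP_RETRY_MAYFAIL ... */

	if (user_thread)
		clear_current_oom_origin();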


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-10 Thread Zhaoyang Huang
On Tue, Apr 10, 2018 at 5:01 PM, Michal Hocko <mho...@kernel.org> wrote:
> On Tue 10-04-18 16:38:32, Zhaoyang Huang wrote:
>> On Tue, Apr 10, 2018 at 4:12 PM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Tue 10-04-18 16:04:40, Zhaoyang Huang wrote:
>> >> On Tue, Apr 10, 2018 at 3:49 PM, Michal Hocko <mho...@kernel.org> wrote:
>> >> > On Tue 10-04-18 14:39:35, Zhaoyang Huang wrote:
>> >> >> On Tue, Apr 10, 2018 at 2:14 PM, Michal Hocko <mho...@kernel.org> 
>> >> >> wrote:
>> > [...]
>> >> >> > OOM_SCORE_ADJ_MIN means "hide the process from the OOM killer 
>> >> >> > completely".
>> >> >> > So what exactly do you want to achieve here? Because from the above 
>> >> >> > it
>> >> >> > sounds like opposite things. /me confused...
>> >> >> >
>> >> >> Steve's patch intend to have the process be OOM's victim when it
>> >> >> over-allocating pages for ring buffer. I amend a patch over to protect
>> >> >> process with OOM_SCORE_ADJ_MIN from doing so. Because it will make
>> >> >> such process to be selected by current OOM's way of
>> >> >> selecting.(consider OOM_FLAG_ORIGIN first before the adj)
>> >> >
>> >> > I just wouldn't really care unless there is an existing and reasonable
>> >> > usecase for an application which updates the ring buffer size _and_ it
>> >> > is OOM disabled at the same time.
>> >> There is indeed such kind of test case on my android system, which is
>> >> known as CTS and Monkey etc.
>> >
>> > Does the test simulate a real workload? I mean we have two things here
>> >
>> > oom disabled task and an updater of the ftrace ring buffer to a
>> > potentially large size. The second can be completely isolated to a
>> > different context, no? So why do they run in the single user process
>> > context?
>> ok. I think there are some misunderstandings here. Let me try to
>> explain more by my poor English. There is just one thing here. The
>> updater is originally a oom disabled task with adj=OOM_SCORE_ADJ_MIN.
>> With Steven's patch, it will periodically become a oom killable task
>> by calling set_current_oom_origin() for user process which is
>> enlarging the ring buffer. What I am doing here is limit the user
>> process to the ones that adj > -1000.
>
> I've understood that part. And I am arguing whether this is really such
> an important case to play further tricks. Wouldn't it be much simpler to
> put the updater out to a separate process? OOM disabled processes
> shouldn't really do unexpectedly large allocations. Full stop. Otherwise
> you risk a large system disruptions.
> --
It is a real problem (my Android system just hung while running the
test case because an innocent key process was killed by OOM). However,
the problem is that we cannot constrain userspace's behavior as you
suggested. What Steven's patch does here is keep the system stable by
having the updater take the responsibility itself. My patch lets the
OOM-disabled processes remain unkillable.

> Michal Hocko
> SUSE Labs


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-10 Thread Zhaoyang Huang
On Tue, Apr 10, 2018 at 4:12 PM, Michal Hocko <mho...@kernel.org> wrote:
> On Tue 10-04-18 16:04:40, Zhaoyang Huang wrote:
>> On Tue, Apr 10, 2018 at 3:49 PM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Tue 10-04-18 14:39:35, Zhaoyang Huang wrote:
>> >> On Tue, Apr 10, 2018 at 2:14 PM, Michal Hocko <mho...@kernel.org> wrote:
> [...]
>> >> > OOM_SCORE_ADJ_MIN means "hide the process from the OOM killer 
>> >> > completely".
>> >> > So what exactly do you want to achieve here? Because from the above it
>> >> > sounds like opposite things. /me confused...
>> >> >
>> >> Steve's patch intend to have the process be OOM's victim when it
>> >> over-allocating pages for ring buffer. I amend a patch over to protect
>> >> process with OOM_SCORE_ADJ_MIN from doing so. Because it will make
>> >> such process to be selected by current OOM's way of
>> >> selecting.(consider OOM_FLAG_ORIGIN first before the adj)
>> >
>> > I just wouldn't really care unless there is an existing and reasonable
>> > usecase for an application which updates the ring buffer size _and_ it
>> > is OOM disabled at the same time.
>> There is indeed such kind of test case on my android system, which is
>> known as CTS and Monkey etc.
>
> Does the test simulate a real workload? I mean we have two things here
>
> oom disabled task and an updater of the ftrace ring buffer to a
> potentially large size. The second can be completely isolated to a
> different context, no? So why do they run in the single user process
> context?
OK, I think there are some misunderstandings here; let me try to explain a bit
more. There is just one thing here. The updater is originally an OOM-disabled
task with adj = OOM_SCORE_ADJ_MIN. With Steven's patch, it temporarily becomes
an OOM-killable task via set_current_oom_origin() while the user process is
enlarging the ring buffer. What I am doing here is limiting that to user
processes whose adj > -1000.

>
>> Furthermore, I think we should make the
>> patch to be as safest as possible. Why do we leave a potential risk
>> here? There is no side effect for my patch.
>
> I do not have the full context. Could you point me to your patch?

Here are Steven's and my patches:
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5f38398..1005d73 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1135,7 +1135,7 @@ static int rb_check_pages(struct
ring_buffer_per_cpu *cpu_buffer)
 static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
 {
struct buffer_page *bpage, *tmp;
-   bool user_thread = current->mm != NULL;
+   bool user_thread = (current->mm != NULL &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN);//by zhaoyang
gfp_t mflags;
long i;
-
  {

  struct buffer_page *bpage, *tmp;
+ bool user_thread = current->mm != NULL;
+ gfp_t mflags;
  long i;

- /* Check if the available memory is there first */
+ /*
+ * Check if the available memory is there first.
+ * Note, si_mem_available() only gives us a rough estimate of available
+ * memory. It may not be accurate. But we don't care, we just want
+ * to prevent doing any allocation when it is obvious that it is
+ * not going to succeed.
+ */
  i = si_mem_available();
  if (i < nr_pages)
  return -ENOMEM;

+ /*
+ * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is not
+ * destabilized.
+ */
+ mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
+
+ /*
+ * If a user thread allocates too much, and si_mem_available()
+ * reports there's enough memory, even though there is not.
+ * Make sure the OOM killer kills this thread. This can happen
+ * even with RETRY_MAYFAIL because another task may be doing
+ * an allocation after this task has taken all memory.
+ * This is the task the OOM killer needs to take out during this
+ * loop, even if it was triggered by an allocation somewhere else.
+ */
+ if (user_thread)
+ set_current_oom_origin();
  for (i = 0; i < nr_pages; i++) {
  struct page *page;
- /*
- * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
- * gracefully without invoking oom-killer and the system is not
- * destabilized.
- */
+
  bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
-GFP_KERNEL | __GFP_RETRY_MAYFAIL,
-cpu_to_node(cpu));
+mflags, cpu_to_node(cpu));
  if (!bpage)
  goto free_pages;

  list_add(&bpage->list, pages);

- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
+ page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
  if (!page)
  goto free_pages;


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-10 Thread Zhaoyang Huang
On Tue, Apr 10, 2018 at 3:49 PM, Michal Hocko <mho...@kernel.org> wrote:
> On Tue 10-04-18 14:39:35, Zhaoyang Huang wrote:
>> On Tue, Apr 10, 2018 at 2:14 PM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Tue 10-04-18 11:41:44, Zhaoyang Huang wrote:
>> >> On Tue, Apr 10, 2018 at 11:12 AM, Steven Rostedt <rost...@goodmis.org> 
>> >> wrote:
>> >> > On Tue, 10 Apr 2018 10:32:36 +0800
>> >> > Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>> >> >
>> >> >> For bellowing scenario, process A have no intension to exhaust the
>> >> >> memory, but will be likely to be selected by OOM for we set
>> >> >> OOM_CORE_ADJ_MIN for it.
>> >> >> process A(-1000)  process B
>> >> >>
>> >> >>   i = si_mem_available();
>> >> >>if (i < nr_pages)
>> >> >>return -ENOMEM;
>> >> >>schedule
>> >> >> --->
>> >> >> allocate huge memory
>> >> >> <-
>> >> >> if (user_thread)
>> >> >>   set_current_oom_origin();
>> >> >>
>> >> >>   for (i = 0; i < nr_pages; i++) {
>> >> >>  bpage = kzalloc_node
>> >> >
>> >> > Is this really an issue though?
>> >> >
>> >> > Seriously, do you think you will ever hit this?
>> >> >
>> >> > How often do you increase the size of the ftrace ring buffer? For this
>> >> > to be an issue, the system has to trigger an OOM at the exact moment
>> >> > you decide to increase the size of the ring buffer. That would be an
>> >> > impressive attack, with little to gain.
>> >> >
>> >> > Ask the memory management people. If they think this could be a
>> >> > problem, then I'll be happy to take your patch.
>> >> >
>> >> > -- Steve
>> >> add Michael for review.
>> >> Hi Michael,
>> >> I would like suggest Steve NOT to set OOM_CORE_ADJ_MIN for the process
>> >> with adj = -1000 when setting the user space process as potential
>> >> victim of OOM.
>> >
>> > OOM_SCORE_ADJ_MIN means "hide the process from the OOM killer completely".
>> > So what exactly do you want to achieve here? Because from the above it
>> > sounds like opposite things. /me confused...
>> >
>> Steve's patch intend to have the process be OOM's victim when it
>> over-allocating pages for ring buffer. I amend a patch over to protect
>> process with OOM_SCORE_ADJ_MIN from doing so. Because it will make
>> such process to be selected by current OOM's way of
>> selecting.(consider OOM_FLAG_ORIGIN first before the adj)
>
> I just wouldn't really care unless there is an existing and reasonable
> usecase for an application which updates the ring buffer size _and_ it
> is OOM disabled at the same time.
> --
> Michal Hocko
> SUSE Labs
There are indeed such test cases on my Android system, known as CTS, Monkey,
etc. Furthermore, I think we should make the patch as safe as possible. Why
leave a potential risk here? There is no side effect from my patch.


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-10 Thread Zhaoyang Huang
On Tue, Apr 10, 2018 at 2:14 PM, Michal Hocko <mho...@kernel.org> wrote:
> On Tue 10-04-18 11:41:44, Zhaoyang Huang wrote:
>> On Tue, Apr 10, 2018 at 11:12 AM, Steven Rostedt <rost...@goodmis.org> wrote:
>> > On Tue, 10 Apr 2018 10:32:36 +0800
>> > Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>> >
>> >> For bellowing scenario, process A have no intension to exhaust the
>> >> memory, but will be likely to be selected by OOM for we set
>> >> OOM_CORE_ADJ_MIN for it.
>> >> process A(-1000)  process B
>> >>
>> >>   i = si_mem_available();
>> >>if (i < nr_pages)
>> >>return -ENOMEM;
>> >>schedule
>> >> --->
>> >> allocate huge memory
>> >> <-
>> >> if (user_thread)
>> >>   set_current_oom_origin();
>> >>
>> >>   for (i = 0; i < nr_pages; i++) {
>> >>  bpage = kzalloc_node
>> >
>> > Is this really an issue though?
>> >
>> > Seriously, do you think you will ever hit this?
>> >
>> > How often do you increase the size of the ftrace ring buffer? For this
>> > to be an issue, the system has to trigger an OOM at the exact moment
>> > you decide to increase the size of the ring buffer. That would be an
>> > impressive attack, with little to gain.
>> >
>> > Ask the memory management people. If they think this could be a
>> > problem, then I'll be happy to take your patch.
>> >
>> > -- Steve
>> add Michael for review.
>> Hi Michael,
>> I would like suggest Steve NOT to set OOM_CORE_ADJ_MIN for the process
>> with adj = -1000 when setting the user space process as potential
>> victim of OOM.
>
> OOM_SCORE_ADJ_MIN means "hide the process from the OOM killer completely".
> So what exactly do you want to achieve here? Because from the above it
> sounds like opposite things. /me confused...
>
> --
> Michal Hocko
> SUSE Labs
Steve's patch intends to make the process the OOM victim when it
over-allocates pages for the ring buffer. I added a patch on top of it to
prevent a process with OOM_SCORE_ADJ_MIN from doing so, because otherwise such
a process would be selected by the OOM killer's current selection order
(OOM_FLAG_ORIGIN is considered before the adj).


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-09 Thread Zhaoyang Huang
On Tue, Apr 10, 2018 at 11:12 AM, Steven Rostedt <rost...@goodmis.org> wrote:
> On Tue, 10 Apr 2018 10:32:36 +0800
> Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>
>> For bellowing scenario, process A have no intension to exhaust the
>> memory, but will be likely to be selected by OOM for we set
>> OOM_CORE_ADJ_MIN for it.
>> process A(-1000)  process B
>>
>>   i = si_mem_available();
>>if (i < nr_pages)
>>return -ENOMEM;
>>schedule
>> --->
>> allocate huge memory
>> <-
>> if (user_thread)
>>   set_current_oom_origin();
>>
>>   for (i = 0; i < nr_pages; i++) {
>>  bpage = kzalloc_node
>
> Is this really an issue though?
>
> Seriously, do you think you will ever hit this?
>
> How often do you increase the size of the ftrace ring buffer? For this
> to be an issue, the system has to trigger an OOM at the exact moment
> you decide to increase the size of the ring buffer. That would be an
> impressive attack, with little to gain.
>
> Ask the memory management people. If they think this could be a
> problem, then I'll be happy to take your patch.
>
> -- Steve
Adding Michal for review.
Hi Michal,
I would like to suggest that Steve NOT set the OOM origin flag for a process
with adj = -1000 when marking the user-space process as a potential victim of
OOM. Steve doubts the possibility of the scenario. In my opinion, we should
NOT break the original concept of the OOM killer, that is, OOM would not
select a -1000 process unless the process configured that itself. With regard
to the possibility, on memory-starved systems such as Android on mobile
phones, there are many kinds of user behaviors and test scripts that attack or
verify the stability of the system. So I suggest we keep every corner case
safe. Would you please give a comment on that? Thanks.


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-09 Thread Zhaoyang Huang
On Tue, Apr 10, 2018 at 8:32 AM, Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
> On Mon, Apr 9, 2018 at 9:49 PM, Steven Rostedt <rost...@goodmis.org> wrote:
>> On Mon, 9 Apr 2018 08:56:01 +0800
>> Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>>
>>> >>
>>> >> if (oom_task_origin(task)) {
>>> >> points = ULONG_MAX;
>>> >> goto select;
>>> >> }
>>> >>
>>> >> points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
>>> >> if (!points || points < oc->chosen_points)
>>> >> goto next;
>>> >
>>> > And what's wrong with that?
>>> >
>>> > -- Steve
>>> I think the original thought of OOM is the flag 'OOM_SCORE_ADJ_MIN' is
>>> most likely to be set by process himself via accessing the proc file,
>>> if it does so, OOM can select it as the victim. except, it is
>>> reluctant to choose the critical process to be killed, so I suggest
>>> not to set such heavy flag as OOM_SCORE_ADJ_MIN on behalf of -1000
>>> process.
>>
>> Really, I don't think tasks that are setting OOM_CORE_ADJ_MIN should be
>> allocating a lot of memory in the kernel (via ring buffer). It sounds
>> like a good way to wreck havoc on the system.
>>
>> It's basically saying, "I'm going to take up all memory, but don't kill
>> me, just kill some random user on the system".
>>
>> -- Steve
> Sure, but the memory status is dynamic, the process could also exceed the 
> limit
> at the moment even it check the available memory before. We have to
> add protection
> for such kind of risk. It could also happen that the critical process
> be preempted by
> another huge memory allocating process, which may cause insufficient memory 
> when
> it schedule back.

For the scenario below, process A has no intention of exhausting memory, but
it is likely to be selected by the OOM killer because the OOM origin flag is
set for it (see set_current_oom_origin() below).
process A(-1000)  process B

  i = si_mem_available();
   if (i < nr_pages)
   return -ENOMEM;
   schedule
--->
allocate huge memory
<-
if (user_thread)
  set_current_oom_origin();

  for (i = 0; i < nr_pages; i++) {
 bpage = kzalloc_node


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-09 Thread Zhaoyang Huang
On Mon, Apr 9, 2018 at 9:49 PM, Steven Rostedt <rost...@goodmis.org> wrote:
> On Mon, 9 Apr 2018 08:56:01 +0800
> Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>
>> >>
>> >> if (oom_task_origin(task)) {
>> >> points = ULONG_MAX;
>> >> goto select;
>> >> }
>> >>
>> >> points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
>> >> if (!points || points < oc->chosen_points)
>> >> goto next;
>> >
>> > And what's wrong with that?
>> >
>> > -- Steve
>> I think the original thought of OOM is the flag 'OOM_SCORE_ADJ_MIN' is
>> most likely to be set by process himself via accessing the proc file,
>> if it does so, OOM can select it as the victim. except, it is
>> reluctant to choose the critical process to be killed, so I suggest
>> not to set such heavy flag as OOM_SCORE_ADJ_MIN on behalf of -1000
>> process.
>
> Really, I don't think tasks that are setting OOM_CORE_ADJ_MIN should be
> allocating a lot of memory in the kernel (via ring buffer). It sounds
> like a good way to wreck havoc on the system.
>
> It's basically saying, "I'm going to take up all memory, but don't kill
> me, just kill some random user on the system".
>
> -- Steve
Sure, but the memory status is dynamic; the process could still exceed the
limit at that moment even if it checked the available memory beforehand. We
have to add protection against that kind of risk. It could also happen that
the critical process is preempted by another process allocating a huge amount
of memory, which may leave insufficient memory when it is scheduled back.


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-08 Thread Zhaoyang Huang
On Sun, Apr 8, 2018 at 8:47 PM, Steven Rostedt <rost...@goodmis.org> wrote:
> [ Removing kernel-patch-test, because of annoying "moderator" messages ]
>
> On Sun, 8 Apr 2018 13:54:59 +0800
> Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>
>> On Sun, Apr 8, 2018 at 11:48 AM, Steven Rostedt <rost...@goodmis.org> wrote:
>> > On Sun,  8 Apr 2018 10:16:23 +0800
>> > Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>> >
>> >> Don't choose the process with adj == OOM_SCORE_ADJ_MIN which
>> >> over-allocating pages for ring buffers.
>> >
>> > Why?
>> >
>> > -- Steve
>> because in oom_evaluate_task, the process with adj == OOM_SCORE_ADJ_MIN will
>> be suppressed by oom_badness, but with applying your latest patch,
>> such process will
>> be selected by oom_task_origin
>>
>> if (oom_task_origin(task)) {
>> points = ULONG_MAX;
>> goto select;
>> }
>>
>> points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
>> if (!points || points < oc->chosen_points)
>> goto next;
>
> And what's wrong with that?
>
> -- Steve
I think the original idea of OOM is that the flag 'OOM_SCORE_ADJ_MIN' is most
likely set by the process itself via its proc file; only if the process opts
in itself should OOM be able to select it as the victim. The OOM killer is
otherwise reluctant to choose a critical process to be killed, so I suggest
not setting such a heavy flag (making it the OOM origin) on behalf of a -1000
process.
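
As an aside, the "set by the process itself via its proc file" path above is
just a write to /proc/self/oom_score_adj; a hypothetical user-space sketch of
it (not part of any patch in this thread):

#include <stdio.h>

/* Hide the calling process from the OOM killer. Lowering the value
 * below its current setting requires CAP_SYS_RESOURCE.
 */
static int hide_from_oom_killer(void)
{
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", -1000);	/* OOM_SCORE_ADJ_MIN */
	return fclose(f);
}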


Re: [PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-07 Thread Zhaoyang Huang
On Sun, Apr 8, 2018 at 11:48 AM, Steven Rostedt <rost...@goodmis.org> wrote:
> On Sun,  8 Apr 2018 10:16:23 +0800
> Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>
>> Don't choose the process with adj == OOM_SCORE_ADJ_MIN which
>> over-allocating pages for ring buffers.
>
> Why?
>
> -- Steve
because in oom_evaluate_task(), a process with adj == OOM_SCORE_ADJ_MIN is
suppressed by oom_badness(), but with your latest patch applied, such a
process will be selected via oom_task_origin():

if (oom_task_origin(task)) {
points = ULONG_MAX;
goto select;
}

points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
if (!points || points < oc->chosen_points)
goto next;
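
The suppression referred to above happens because oom_badness() returns 0 for
an OOM_SCORE_ADJ_MIN task, and the !points test then skips it. A simplified
sketch of that check, paraphrasing mm/oom_kill.c of that era rather than
quoting it exactly:

unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages)
{
	long adj;

	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	adj = (long)p->signal->oom_score_adj;
	if (adj == OOM_SCORE_ADJ_MIN)	/* -1000: never pick this task */
		return 0;

	/* ... rss/pagetable/swap accounting scaled by adj follows ... */
	return 1;	/* placeholder for the real score */
}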


[PATCH v1] ringbuffer: Don't choose the process with adj equal OOM_SCORE_ADJ_MIN

2018-04-07 Thread Zhaoyang Huang
Don't choose the process with adj == OOM_SCORE_ADJ_MIN which is
over-allocating pages for ring buffers.

Signed-off-by: Zhaoyang Huang <zhaoyang.hu...@spreadtrum.com>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 5f38398..1005d73 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1135,7 +1135,7 @@ static int rb_check_pages(struct ring_buffer_per_cpu 
*cpu_buffer)
 static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
 {
struct buffer_page *bpage, *tmp;
-   bool user_thread = current->mm != NULL;
+   bool user_thread = (current->mm != NULL && 
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN);
gfp_t mflags;
long i;
 
-- 
1.9.1



Re: [PATCH] ring-buffer: Add set/clear_current_oom_origin() during allocations

2018-04-06 Thread Zhaoyang Huang
On Fri, Apr 6, 2018 at 7:36 AM, Joel Fernandes  wrote:
> Hi Steve,
>
> On Thu, Apr 5, 2018 at 12:57 PM, Joel Fernandes  wrote:
>> On Thu, Apr 5, 2018 at 6:43 AM, Steven Rostedt  wrote:
>>> On Wed, 4 Apr 2018 16:59:18 -0700
>>> Joel Fernandes  wrote:
>>>
 Happy to try anything else, BTW when the si_mem_available check
 enabled, this doesn't happen and the buffer_size_kb write fails
 normally without hurting anything else.
>>>
>>> Can you remove the RETRY_MAYFAIL and see if you can try again? It may
>>> be that we just remove that, and if si_mem_available() is wrong, it
>>> will kill the process :-/ My original code would only add MAYFAIL if it
>>> was a kernel thread (which is why I created the mflags variable).
>>
>> Tried this. Dropping RETRY_MAYFAIL and the si_mem_available check
>> destabilized the system and brought it down (along with OOM killing
>> the victim).
>>
>> System hung for several seconds and then both the memory hog and bash
>> got killed.
>
> I think its still Ok to keep the OOM patch as a safe guard even though
> its hard to test, and the si_mem_available on its own seem sufficient.
> What do you think?
>
> thanks,
>
>
> - Joel
I also tested the patch on my system; it works fine with the previous script.

PS: the script I mentioned is the CTS test case POC 16_12 on Android 8.1.


Re: [PATCH v1] kernel/trace:check the val against the available mem

2018-04-04 Thread Zhaoyang Huang
On Wed, Apr 4, 2018 at 2:23 PM, Michal Hocko <mho...@kernel.org> wrote:
> On Wed 04-04-18 10:58:39, Zhaoyang Huang wrote:
>> On Tue, Apr 3, 2018 at 9:56 PM, Michal Hocko <mho...@kernel.org> wrote:
>> > On Tue 03-04-18 09:32:45, Steven Rostedt wrote:
>> >> On Tue, 3 Apr 2018 14:35:14 +0200
>> >> Michal Hocko <mho...@kernel.org> wrote:
>> > [...]
>> >> > Being clever is OK if it doesn't add a tricky code. And relying on
>> >> > si_mem_available is definitely tricky and obscure.
>> >>
>> >> Can we get the mm subsystem to provide a better method to know if an
>> >> allocation will possibly succeed or not before trying it? It doesn't
>> >> have to be free of races. Just "if I allocate this many pages right
>> >> now, will it work?" If that changes from the time it asks to the time
>> >> it allocates, that's fine. I'm not trying to prevent OOM to never
>> >> trigger. I just don't want to to trigger consistently.
>> >
>> > How do you do that without an actuall allocation request? And more
>> > fundamentally, what if your _particular_ request is just fine but it
>> > will get us so close to the OOM edge that the next legit allocation
>> > request simply goes OOM? There is simply no sane interface I can think
>> > of that would satisfy a safe/sensible "will it cause OOM" semantic.
>> >
>> The point is the app which try to allocate the size over the line will escape
>> the OOM and let other innocent to be sacrificed. However, the one which you
>> mentioned above will be possibly selected by OOM that triggered by consequnce
>> failed allocation.
>
> If you are afraid of that then you can have a look at 
> {set,clear}_current_oom_origin()
> which will automatically select the current process as an oom victim and
> kill it.
But we cannot call that function on behalf of the current process, which may
not want to be killed for memory reasons. It is more proper to return ENOMEM
to it and let it make a further decision.
> --
> Michal Hocko
> SUSE Labs
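
For reference, the helpers Michal points to live in include/linux/oom.h, and
the usage pattern under discussion is roughly the following sketch (an
illustration of the idea, not a patch from this thread):

#include <linux/oom.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/list.h>
#include <linux/topology.h>

/* Opt the current task in as the preferred OOM victim only for the
 * duration of a risky, potentially huge allocation, then opt out again.
 */
static int alloc_many_pages(long nr_pages, struct list_head *pages, int cpu)
{
	long i;
	int ret = 0;

	set_current_oom_origin();
	for (i = 0; i < nr_pages; i++) {
		struct page *page = alloc_pages_node(cpu_to_node(cpu),
				GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
		if (!page) {
			ret = -ENOMEM;
			break;
		}
		list_add(&page->lru, pages);
	}
	clear_current_oom_origin();
	return ret;
}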


Re: [PATCH v1] kernel/trace:check the val against the available mem

2018-04-03 Thread Zhaoyang Huang
On Tue, Apr 3, 2018 at 9:56 PM, Michal Hocko  wrote:
> On Tue 03-04-18 09:32:45, Steven Rostedt wrote:
>> On Tue, 3 Apr 2018 14:35:14 +0200
>> Michal Hocko  wrote:
> [...]
>> > Being clever is OK if it doesn't add a tricky code. And relying on
>> > si_mem_available is definitely tricky and obscure.
>>
>> Can we get the mm subsystem to provide a better method to know if an
>> allocation will possibly succeed or not before trying it? It doesn't
>> have to be free of races. Just "if I allocate this many pages right
>> now, will it work?" If that changes from the time it asks to the time
>> it allocates, that's fine. I'm not trying to prevent OOM to never
>> trigger. I just don't want to to trigger consistently.
>
> How do you do that without an actuall allocation request? And more
> fundamentally, what if your _particular_ request is just fine but it
> will get us so close to the OOM edge that the next legit allocation
> request simply goes OOM? There is simply no sane interface I can think
> of that would satisfy a safe/sensible "will it cause OOM" semantic.
>
The point is that an app which tries to allocate more than the line allows
will escape the OOM killer and let other, innocent processes be sacrificed.
However, the process you mentioned above may well be selected by an OOM that
is triggered by the consequent failed allocation.

>> > > Perhaps I should try to allocate a large group of pages with
>> > > RETRY_MAYFAIL, and if that fails go back to NORETRY, with the thinking
>> > > that the large allocation may reclaim some memory that would allow the
>> > > NORETRY to succeed with smaller allocations (one page at a time)?
>> >
>> > That again relies on a subtle dependencies of the current
>> > implementation. So I would rather ask whether this is something that
>> > really deserves special treatment. If admin asks for a buffer of a
>> > certain size then try to do so. If we get OOM then bad luck you cannot
>> > get large memory buffers for free...
>>
>> That is not acceptable to me nor to the people asking for this.
>>
>> The problem is known. The ring buffer allocates memory page by page,
>> and this can allow it to easily take all memory in the system before it
>> fails to allocate and free everything it had done.
>
> Then do not allow buffers that are too large. How often do you need
> buffers that are larger than few megs or small % of the available
> memory? Consuming excessive amount of memory just to trace workload
> which will need some memory on its own sounds just dubious to me.
>
>> If you don't like the use of si_mem_available() I'll do the larger
>> pages method. Yes it depends on the current implementation of memory
>> allocation. It will depend on RETRY_MAYFAIL trying to allocate a large
>> number of pages, and fail if it can't (leaving memory for other
>> allocations to succeed).
>>
>> The allocation of the ring buffer isn't critical. It can fail to
>> expand, and we can tell the user -ENOMEM. I original had NORETRY
>> because I rather have it fail than cause an OOM. But there's folks
>> (like Joel) that want it to succeed when there's available memory in
>> page caches.
>
> Then implement a retry logic on top of NORETRY. You can control how hard
> to retry to satisfy the request yourself. You still risk that your
> allocation will get us close to OOM for _somebody_ else though.
>
>> I'm fine if the admin shoots herself in the foot if the ring buffer
>> gets big enough to start causing OOMs, but I don't want it to cause
>> OOMs if there's not even enough memory to fulfill the ring buffer size
>> itself.
>
> I simply do not see the difference between the two. Both have the same
> deadly effect in the end. The direct OOM has an arguable advantage that
> the effect is immediate rather than subtle with potential performance
> side effects until the machine OOMs after crawling for quite some time.
>
> --
> Michal Hocko
> SUSE Labs
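
The "retry logic on top of NORETRY" suggested above could look something like
this sketch (an illustration of the idea with an arbitrary retry bound, not
code from this thread):

#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/topology.h>

/* Try a bounded number of times with __GFP_NORETRY so a failure never
 * escalates into the OOM killer picking somebody else; the caller just
 * gets NULL and unwinds with -ENOMEM.
 */
static struct page *rb_alloc_page_bounded(int cpu, int max_tries)
{
	int attempt;

	for (attempt = 0; attempt < max_tries; attempt++) {
		struct page *page = alloc_pages_node(cpu_to_node(cpu),
				GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, 0);
		if (page)
			return page;
		cond_resched();	/* let reclaim and other tasks make progress */
	}
	return NULL;
}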


Re: [PATCH v1] kernel/trace:check the val against the available mem

2018-04-01 Thread Zhaoyang Huang
On Sat, Mar 31, 2018 at 5:42 AM, Steven Rostedt  wrote:
> On Fri, 30 Mar 2018 17:30:31 -0400
> Steven Rostedt  wrote:
>
>> I'll take a look at si_mem_available() that Joel suggested and see if
>> we can make that work.
>
> Wow, this appears to work great! Joel and Zhaoyang, can you test this?
>
> -- Steve
>
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index a2fd3893cc02..32a803626ee2 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -1164,6 +1164,11 @@ static int __rb_allocate_pages(long nr_pages, struct 
> list_head *pages, int cpu)
> struct buffer_page *bpage, *tmp;
> long i;
>
> +   /* Check if the available memory is there first */
> +   i = si_mem_available();
> +   if (i < nr_pages)
> +   return -ENOMEM;
> +
> for (i = 0; i < nr_pages; i++) {
> struct page *page;
> /*
Hi Steve, It works as my previous patch does.


Re: [PATCH v1] kernel/trace:check the val against the available mem

2018-03-29 Thread Zhaoyang Huang
On Fri, Mar 30, 2018 at 12:05 AM, Steven Rostedt <rost...@goodmis.org> wrote:
> On Thu, 29 Mar 2018 18:41:44 +0800
> Zhaoyang Huang <huangzhaoy...@gmail.com> wrote:
>
>> It is reported that some user app would like to echo a huge
>> number to "/sys/kernel/debug/tracing/buffer_size_kb" regardless
>>  of the available memory, which will cause the coinstantaneous
>> page allocation failed and introduce OOM. The commit checking the
>> val against the available mem first to avoid the consequence allocation.
>>
>
> One of my tests is to stress buffer_size_kb, and it fails nicely if you
> try to get too much. Although, it may cause an OOM, but that's expected.
>
> The application should do the test (try "free" on the command line).
> This isn't something that the kernel should be responsible for. If
> someone wants to allocate all memory for tracing, that's their
> prerogative.
>
> -- Steve
Steve, thanks for your quick feedback. The original purpose is to avoid this
kind of OOM, since the kernel cannot define the application's behavior. I
think there is no need to go through the alloc->free process if we already
know that the number of pages will definitely lead to failure. Furthermore,
the app which screws things up usually escapes the OOM killer and causes
innocent processes to be sacrificed.


[PATCH v1] kernel/trace:check the val against the available mem

2018-03-29 Thread Zhaoyang Huang
It is reported that some user apps echo a huge number to
"/sys/kernel/debug/tracing/buffer_size_kb" regardless of the available memory,
which makes the subsequent page allocations fail and introduces OOM. This
commit checks the value against the available memory first to avoid the
consequent allocations.

Signed-off-by: Zhaoyang Huang <zhaoyang.hu...@spreadtrum.com>
---
 kernel/trace/trace.c | 39 ++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2d0ffcc..a4a4237 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -43,6 +43,8 @@
 #include 
 #include 
 
+#include 
+#include 
 #include "trace.h"
 #include "trace_output.h"
 
@@ -5967,6 +5969,39 @@ static ssize_t tracing_splice_read_pipe(struct file 
*filp,
return ret;
 }
 
+static long get_available_mem(void)
+{
+   struct sysinfo i;
+   long available;
+   unsigned long pagecache;
+   unsigned long wmark_low = 0;
+   unsigned long pages[NR_LRU_LISTS];
+   struct zone *zone;
+   int lru;
+
+   si_meminfo(&i);
+   si_swapinfo(&i);
+
+   for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+   pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+   for_each_zone(zone)
+   wmark_low += zone->watermark[WMARK_LOW];
+
+   available = i.freeram - wmark_low;
+
+   pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+   pagecache -= min(pagecache / 2, wmark_low);
+   available += pagecache;
+
+   available += global_page_state(NR_SLAB_RECLAIMABLE) -
+   min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+   if (available < 0)
+   available = 0;
+   return available;
+}
+
 static ssize_t
 tracing_entries_write(struct file *filp, const char __user *ubuf,
  size_t cnt, loff_t *ppos)
@@ -5975,13 +6010,15 @@ static ssize_t tracing_splice_read_pipe(struct file 
*filp,
struct trace_array *tr = inode->i_private;
unsigned long val;
int ret;
+   long available;
 
+   available = get_available_mem();
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
 
/* must have at least 1 entry */
-   if (!val)
+   if (!val || (val > available))
return -EINVAL;
 
/* value is in KB */
-- 
1.9.1
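
The estimate built by get_available_mem() mirrors what /proc/meminfo exposes
as MemAvailable: free pages above the low watermark, plus the part of the file
page cache and of reclaimable slab that could be dropped without heavy
reclaim. A standalone sketch of that arithmetic, with made-up page counts
(4 kB pages) purely for illustration:

/* Illustration of the get_available_mem() arithmetic; all numbers invented. */
#include <stdio.h>

static long min_l(long a, long b) { return a < b ? a : b; }

int main(void)
{
    long freeram       = 60000;  /* free pages                       */
    long wmark_low     = 20000;  /* sum of the zones' low watermarks */
    long active_file   = 50000;  /* LRU_ACTIVE_FILE pages            */
    long inactive_file = 70000;  /* LRU_INACTIVE_FILE pages          */
    long slab_reclaim  = 30000;  /* NR_SLAB_RECLAIMABLE pages        */

    long available = freeram - wmark_low;
    long pagecache = active_file + inactive_file;

    pagecache -= min_l(pagecache / 2, wmark_low);
    available += pagecache;
    available += slab_reclaim - min_l(slab_reclaim / 2, wmark_low);
    if (available < 0)
        available = 0;

    /* 40000 + 100000 + 15000 = 155000 pages, i.e. about 605 MB */
    printf("available: %ld pages (%ld kB)\n", available, available * 4);
    return 0;
}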



Re: [PATCH v1] mm: help the ALLOC_HARDER allocation pass the watermark when CMA on

2018-03-23 Thread Zhaoyang Huang
On Fri, Mar 23, 2018 at 4:38 PM, Michal Hocko <mho...@kernel.org> wrote:
> On Fri 23-03-18 15:57:32, Zhaoyang Huang wrote:
>> For 'ALLOC_HARDER' page allocations there is a fast path through the
>> allocator that reaches __rmqueue_xxx more easily than other allocation
>> types. However, when CMA is enabled, zone_watermark_ok() deducts the
>> number of free CMA pages from free_pages, which can make the watermark
>> check fail even though the zone may still hold enough HighAtomic,
>> Unmovable and Reclaimable pages. So count the free CMA pages in for
>> high-order alloc_harder allocations to clear the obstacle on the way
>> to the freelists.
>
> This is no longer the case in the current mmotm tree. Have a look at
> Joonsoo's zone movable based CMA patchset 
> http://lkml.kernel.org/r/1512114786-5085-1-git-send-email-iamjoonsoo@lge.com
>
Thanks for the information. However, I can't find the commit in the
latest mainline. Has it been merged?
>> Signed-off-by: Zhaoyang Huang <zhaoyang.hu...@spreadtrum.com>
>> ---
>>  mm/page_alloc.c | 7 +--
>>  1 file changed, 5 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 635d7dd..cc18620 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -3045,8 +3045,11 @@ bool __zone_watermark_ok(struct zone *z, unsigned int 
>> order, unsigned long mark,
>>
>>
>>  #ifdef CONFIG_CMA
>> - /* If allocation can't use CMA areas don't use free CMA pages */
>> - if (!(alloc_flags & ALLOC_CMA))
>> + /*
>> +  * If the allocation can't use CMA areas and this is not a high-order
>> +  * alloc_harder request, don't count free CMA pages.
>> +  */
>> + if (!(alloc_flags & ALLOC_CMA) && (!alloc_harder || !order))
>>   free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
>>  #endif
>>
>> --
>> 1.9.1
>>
>
> --
> Michal Hocko
> SUSE Labs


[PATCH v1] mm: help the ALLOC_HARDER allocation pass the watermark when CMA on

2018-03-23 Thread Zhaoyang Huang
For 'ALLOC_HARDER' page allocations there is a fast path through the
allocator that reaches __rmqueue_xxx more easily than other allocation
types. However, when CMA is enabled, zone_watermark_ok() deducts the
number of free CMA pages from free_pages, which can make the watermark
check fail even though the zone may still hold enough HighAtomic,
Unmovable and Reclaimable pages. So count the free CMA pages in for
high-order alloc_harder allocations to clear the obstacle on the way
to the freelists.

Signed-off-by: Zhaoyang Huang <zhaoyang.hu...@spreadtrum.com>
---
 mm/page_alloc.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 635d7dd..cc18620 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3045,8 +3045,11 @@ bool __zone_watermark_ok(struct zone *z, unsigned int 
order, unsigned long mark,
 
 
 #ifdef CONFIG_CMA
-   /* If allocation can't use CMA areas don't use free CMA pages */
-   if (!(alloc_flags & ALLOC_CMA))
+   /*
+    * If the allocation can't use CMA areas and this is not a high-order
+    * alloc_harder request, don't count free CMA pages.
+    */
+   if (!(alloc_flags & ALLOC_CMA) && (!alloc_harder || !order))
free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
 
-- 
1.9.1
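
To see why the deduction matters, here is a toy rendition of the watermark
test; the struct, the helper names and all numbers are illustrative, not taken
from the kernel. With 10000 free pages of which 7000 sit in CMA pageblocks and
a low watermark of 4000, the stock check compares 3000 against 4000 and fails,
while the patched condition keeps the CMA pages counted for a high-order
alloc_harder request:

/* Toy model of the zone_watermark_ok() CMA deduction; all numbers made up. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
    long free_pages;
    long free_cma_pages;
    long wmark_low;
};

static bool toy_watermark_ok(const struct toy_zone *z, bool alloc_cma,
                             bool alloc_harder, unsigned int order)
{
    long free_pages = z->free_pages;

    /* stock behaviour subtracts CMA pages whenever !ALLOC_CMA;
     * the patch keeps them for high-order alloc_harder requests */
    if (!alloc_cma && (!alloc_harder || !order))
        free_pages -= z->free_cma_pages;

    return free_pages > z->wmark_low;
}

int main(void)
{
    struct toy_zone z = { .free_pages = 10000, .free_cma_pages = 7000,
                          .wmark_low = 4000 };

    printf("normal order-1: %d\n", toy_watermark_ok(&z, false, false, 1)); /* 0: fails  */
    printf("harder order-1: %d\n", toy_watermark_ok(&z, false, true, 1));  /* 1: passes */
    return 0;
}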



[PATCH v1] mm/vmalloc: add a node corresponding to cached_hole_size

2017-07-21 Thread Zhaoyang Huang
Currently we only record cached_hole_size, which is consulted when both
'free_vmap_cache == NULL' and 'size < cached_hole_size' hold. Even in
that case, however, the search still starts from the rb_root and walks
down to the node just in front of the cached hole.

free_vmap_cache miss:
  vmap_area_root
  /  \
   _next U
/  (T1)
 cached_hole_node
   /
 ...   (T2)
  /
first

vmap_area_list->first->..->cached_hole_node->cached_hole_node.list.next
  |---(T3)---| | <<< cached_hole_size >>> |

vmap_area_list->..->cached_hole_node->cached_hole_node.list.next
   | <<< cached_hole_size >>> |

The time cost of finding that node today is T = T1 + T2 + T3.
This commit adds cached_hole_node to record the vmap_area just in front
of the cached hole, which avoids walking the rb tree and the list
altogether and brings T down to 0.

Signed-off-by: Zhaoyang Huang <zhaoyang.hu...@spreadtrum.com>
---
 mm/vmalloc.c | 23 +--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8698c1c..4e76e7f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -336,6 +336,7 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
 
 /* The vmap cache globals are protected by vmap_area_lock */
 static struct rb_node *free_vmap_cache;
+static struct vmap_area *cached_hole_node;
 static unsigned long cached_hole_size;
 static unsigned long cached_vstart;
 static unsigned long cached_align;
@@ -444,6 +445,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long 
size,
size < cached_hole_size ||
vstart < cached_vstart ||
align < cached_align) {
+   /*if we have a cached node, just use it*/
+   if ((size < cached_hole_size) && cached_hole_node != NULL) {
+   addr = ALIGN(cached_hole_node->va_end, align);
+   cached_hole_node = NULL;
+   goto found;
+   }
 nocache:
cached_hole_size = 0;
free_vmap_cache = NULL;
@@ -487,8 +494,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long 
size,
 
/* from the starting point, walk areas until a suitable hole is found */
while (addr + size > first->va_start && addr + size <= vend) {
-   if (addr + cached_hole_size < first->va_start)
+   if (addr + cached_hole_size < first->va_start) {
cached_hole_size = first->va_start - addr;
+   /*record the node corresponding to the hole*/
+   cached_hole_node = (first->list.prev ==
+   &vmap_area_list) ?
+   NULL : list_prev_entry(first, list);
+   }
addr = ALIGN(first->va_end, align);
if (addr + size < addr)
goto overflow;
@@ -571,10 +583,17 @@ static void __free_vmap_area(struct vmap_area *va)
}
}
}
+   if (va == cached_hole_node) {
+   /* cached node is freed, the hole gets bigger */
+   if (cached_hole_node->list.prev != &vmap_area_list)
+   cached_hole_node = list_prev_entry(cached_hole_node,
+  list);
+   else
+   cached_hole_node = NULL;
+   }
rb_erase(&va->rb_node, &vmap_area_root);
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
-
/*
 * Track the highest possible candidate for pcpu area
 * allocation.  Areas outside of vmalloc area can be returned
-- 
1.9.1
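
The same trick applies to any allocator that keeps its busy areas on a sorted
structure: remember the area sitting just in front of the largest hole seen
during the last walk, and serve the next request that fits that hole from
right behind it. Below is a self-contained toy over a sorted array, purely
illustrative -- the names, the array representation and the simplified
bookkeeping are mine, not the mm/vmalloc.c code:

/* Toy version of the cached_hole_node idea over a sorted array of busy
 * [start, end) areas.  Illustrative only -- not the mm/vmalloc.c code. */
#include <stdio.h>

#define VSTART 0x1000UL
#define VEND   0x10000UL
#define MAX_AREAS 16

struct area { unsigned long start, end; };

static struct area areas[MAX_AREAS];    /* kept sorted by start address */
static int nareas;
static int cached_hole_idx = -1;        /* area just in front of the cached hole */
static unsigned long cached_hole_size;  /* size of that hole */

static unsigned long alloc_area(unsigned long size)
{
    unsigned long addr = VSTART;
    int i = 0, j;

    if (cached_hole_idx >= 0 && size <= cached_hole_size) {
        addr = areas[cached_hole_idx].end;  /* fast path: no walk at all */
        cached_hole_idx = -1;               /* the hole is being consumed */
    } else {
        cached_hole_size = 0;               /* restart the search */
    }

    while (i < nareas && areas[i].end <= addr)
        i++;                                /* skip areas below addr */
    while (i < nareas && addr + size > areas[i].start) {
        if (i > 0 && areas[i].start - addr > cached_hole_size) {
            cached_hole_size = areas[i].start - addr;
            cached_hole_idx = i - 1;        /* remember the hole's predecessor */
        }
        addr = areas[i].end;                /* jump past this busy area */
        i++;
    }
    if (addr + size > VEND || nareas == MAX_AREAS)
        return 0;

    for (j = nareas; j > i; j--)            /* insert, keeping the array sorted */
        areas[j] = areas[j - 1];
    areas[i] = (struct area){ addr, addr + size };
    nareas++;
    return addr;
}

static void free_area(unsigned long start)
{
    int i, j;

    for (i = 0; i < nareas && areas[i].start != start; i++)
        ;
    if (i == nareas)
        return;
    /* like __free_vmap_area(): freeing the cached area moves the cache to its
     * predecessor (-1 drops it); freeing an earlier area just shifts the index */
    if (i <= cached_hole_idx)
        cached_hole_idx--;
    for (j = i; j < nareas - 1; j++)
        areas[j] = areas[j + 1];
    nareas--;
}

int main(void)
{
    unsigned long a = alloc_area(0x1000);   /* 0x1000 */
    unsigned long b = alloc_area(0x1000);   /* 0x2000 */
    unsigned long c = alloc_area(0x1000);   /* 0x3000 */

    free_area(b);                           /* opens a 0x1000 hole at 0x2000 */
    alloc_area(0x2000);                     /* walks past the hole and caches it */
    printf("a=%#lx b=%#lx c=%#lx reuse=%#lx\n",
           a, b, c, alloc_area(0x800));     /* fast path lands at 0x2000 */
    return 0;
}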


