[PATCH for vm-scalability] usemem: Output a message after punch holes done

2021-04-13 Thread Hui Zhu
From: Hui Zhu 

When I use punch-holes to set up a page-fragmentation test environment,
there is no way to tell when the hole punching has finished; the only way
to get that information is through top or similar tools.

This commit adds code that prints a message once the hole punching is
done.
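
For illustration, here is a minimal sketch of a wrapper that waits for this
message before starting its own measurement.  It is not part of the patch;
the usemem path and arguments are only an example, and the fflush() added
above is what lets the line arrive promptly through the pipe.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Example invocation; adjust the size and options to the real test. */
	FILE *fp = popen("usemem --punch-holes -s -1 800m", "r");
	char line[256];

	if (!fp) {
		perror("popen");
		return 1;
	}

	/* Block until usemem reports that the hole punching has finished. */
	while (fgets(line, sizeof(line), fp)) {
		if (strstr(line, "punch holes done")) {
			printf("fragmentation environment is ready\n");
			break;
		}
	}

	/* Start the real measurement here; usemem keeps sleeping because
	 * of "-s -1", so terminate it separately when the test is over. */
	return 0;
}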

Signed-off-by: Hui Zhu 
---
 usemem.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/usemem.c b/usemem.c
index 5b90aae..0c76d17 100644
--- a/usemem.c
+++ b/usemem.c
@@ -791,6 +791,8 @@ long do_units(void)
for (i = 0; i < nptr; i++)
do_punch_holes(ptrs[i], lens[i]);
}
+   printf("punch holes done\n");
+   fflush(stdout);
}
 
while (sleep_secs)
-- 
1.8.3.1



[PATCH for vm-scalability] usemem: Add code for touch-alloc

2021-04-08 Thread Hui Zhu
Add code for touch-alloc.
Also change the touch from reading memory to writing memory, to avoid
mapping the zero page for reads in do_anonymous_page.
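
As a minimal userspace sketch of why the write matters (illustrative only;
it assumes Linux, 4K pages and /proc/self/statm): reading untouched
anonymous memory maps the shared zero page and barely grows RSS, while
writing forces real pages to be allocated.

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

static long rss_kb(void)
{
	long size = 0, resident = -1;
	FILE *f = fopen("/proc/self/statm", "r");

	if (!f)
		return -1;
	if (fscanf(f, "%ld %ld", &size, &resident) != 2)
		resident = -1;
	fclose(f);
	return resident < 0 ? -1 : resident * (sysconf(_SC_PAGESIZE) / 1024);
}

int main(void)
{
	size_t bytes = 256UL << 20;	/* 256M of anonymous memory */
	size_t i, words = bytes / sizeof(unsigned long);
	volatile unsigned long *p = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
					 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long sum = 0;

	if (p == MAP_FAILED)
		return 1;

	for (i = 0; i < words; i += 512)	/* one read per 4K page */
		sum += p[i];
	printf("RSS after read touch:  %ld kB (zero page, sum=%lu)\n", rss_kb(), sum);

	for (i = 0; i < words; i += 512)	/* one write per 4K page */
		p[i] = i;
	printf("RSS after write touch: %ld kB (real pages allocated)\n", rss_kb());
	return 0;
}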

Signed-off-by: Hui Zhu 
---
 usemem.c | 34 ++
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/usemem.c b/usemem.c
index e2c46ec..5b90aae 100644
--- a/usemem.c
+++ b/usemem.c
@@ -329,6 +329,18 @@ void detach(void)
}
 }
 
+unsigned long do_access(unsigned long *p, unsigned long idx, int read)
+{
+   volatile unsigned long *vp = p;
+
+   if (read)
+   return vp[idx]; /* read data */
+   else {
+   vp[idx] = idx;  /* write data */
+   return 0;
+   }
+}
+
 unsigned long * allocate(unsigned long bytes)
 {
unsigned long *p;
@@ -355,6 +367,14 @@ unsigned long * allocate(unsigned long bytes)
p = (unsigned long *)ALIGN((unsigned long)p, pagesize - 1);
}
 
+   if (opt_touch_alloc) {
+   unsigned long i;
+   unsigned long m = bytes / sizeof(*p);
+
+   for (i = 0; i < m; i += 1)
+   do_access(p, i, 0);
+   }
+
return p;
 }
 
@@ -436,18 +456,6 @@ void shm_unlock(int seg_id)
shmctl(seg_id, SHM_UNLOCK, NULL);
 }
 
-unsigned long do_access(unsigned long *p, unsigned long idx, int read)
-{
-   volatile unsigned long *vp = p;
-
-   if (read)
-   return vp[idx]; /* read data */
-   else {
-   vp[idx] = idx;  /* write data */
-   return 0;
-   }
-}
-
 #define NSEC_PER_SEC  (1UL * 1000 * 1000 * 1000)
 
 long nsec_sub(long nsec1, long nsec2)
@@ -953,6 +961,8 @@ int main(int argc, char *argv[])
opt_punch_holes = 1;
} else if (strcmp(opts[opt_index].name, "init-time") == 0) {
opt_init_time = 1;
+   } else if (strcmp(opts[opt_index].name, "touch-alloc") == 0) {
+   opt_touch_alloc = 1;
} else
usage(1);
break;
-- 
2.17.1



[PATCH] usemem: Remove the duplicate do_access

2021-01-18 Thread Hui Zhu
From: Hui Zhu 

Got the following error when building usemem:
gcc -O -c -Wall -g  usemem.c -o usemem.o
usemem.c:451:15: error: redefinition of ‘do_access’
 unsigned long do_access(unsigned long *p, unsigned long idx, int read)
   ^
usemem.c:332:15: note: previous definition of ‘do_access’ was here
 unsigned long do_access(unsigned long *p, unsigned long idx, int read)
   ^
make: *** [usemem.o] Error 1

Remove the duplicate do_access to fix this error.

Signed-off-by: Hui Zhu 
---
 usemem.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/usemem.c b/usemem.c
index 48c3d65..e2c46ec 100644
--- a/usemem.c
+++ b/usemem.c
@@ -329,18 +329,6 @@ void detach(void)
}
 }
 
-unsigned long do_access(unsigned long *p, unsigned long idx, int read)
-{
-   volatile unsigned long *vp = p;
-
-   if (read)
-   return vp[idx]; /* read data */
-   else {
-   vp[idx] = idx;  /* write data */
-   return 0;
-   }
-}
-
 unsigned long * allocate(unsigned long bytes)
 {
unsigned long *p;
-- 
1.8.3.1



[PATCH] usemem: Add option touch-alloc

2020-12-16 Thread Hui Zhu
Some environments will not fault in memory even if MAP_POPULATE is set.
This commit adds the option touch-alloc, which reads the memory right
after it is allocated to make sure the pages are actually faulted in.
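
A minimal sketch of the idea behind the option (illustrative only; the
claim that MAP_POPULATE may not populate everywhere follows the commit
text above): allocate with MAP_POPULATE, then do an explicit touch pass so
every page is certainly faulted in.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t bytes = 64UL << 20;
	size_t i, words = bytes / sizeof(unsigned long);
	unsigned long *p = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	volatile unsigned long *vp = p;
	unsigned long sum = 0;

	if (p == MAP_FAILED)
		return 1;

	/* Explicit touch pass, like --touch-alloc: one access per word
	 * makes sure every page has really been faulted in, even when
	 * MAP_POPULATE did not do it. */
	for (i = 0; i < words; i++)
		sum += vp[i];

	printf("touched %zu words, checksum %lu\n", words, sum);
	return 0;
}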

Signed-off-by: Hui Zhu 
---
 usemem.c | 37 +
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/usemem.c b/usemem.c
index 6d1d575..d93691b 100644
--- a/usemem.c
+++ b/usemem.c
@@ -97,6 +97,7 @@ unsigned long opt_delay = 0;
 int opt_read_again = 0;
 int opt_punch_holes = 0;
 int opt_init_time = 0;
+int opt_touch_alloc = 0;
 int nr_task;
 int nr_thread;
 int nr_cpu;
@@ -157,6 +158,7 @@ void usage(int ok)
"-Z|--read-again read memory again after access the memory\n"
"--punch-holes   free every other page after allocation\n"
"--init-time remove the initialization time from the run 
time and show the initialization time\n"
+   "--touch-alloc   read memory after allocate it\n"
"-h|--help   show this message\n"
,   ourname);
 
@@ -197,6 +199,7 @@ static const struct option opts[] = {
{ "read-again"  , 0, NULL, 'Z' },
{ "punch-holes" , 0, NULL,   0 },
{ "init-time"   , 0, NULL,   0 },
+   { "touch-alloc" , 0, NULL,   0 },
{ "help", 0, NULL, 'h' },
{ NULL  , 0, NULL, 0 }
 };
@@ -326,6 +329,18 @@ void detach(void)
}
 }
 
+unsigned long do_access(unsigned long *p, unsigned long idx, int read)
+{
+   volatile unsigned long *vp = p;
+
+   if (read)
+   return vp[idx]; /* read data */
+   else {
+   vp[idx] = idx;  /* write data */
+   return 0;
+   }
+}
+
 unsigned long * allocate(unsigned long bytes)
 {
unsigned long *p;
@@ -352,6 +367,14 @@ unsigned long * allocate(unsigned long bytes)
p = (unsigned long *)ALIGN((unsigned long)p, pagesize - 1);
}
 
+   if (opt_touch_alloc) {
+   unsigned long i;
+   unsigned long m = bytes / sizeof(*p);
+
+   for (i = 0; i < m; i += 1)
+   do_access(p, i, 1);
+   }
+
return p;
 }
 
@@ -433,18 +456,6 @@ void shm_unlock(int seg_id)
shmctl(seg_id, SHM_UNLOCK, NULL);
 }
 
-unsigned long do_access(unsigned long *p, unsigned long idx, int read)
-{
-   volatile unsigned long *vp = p;
-
-   if (read)
-   return vp[idx]; /* read data */
-   else {
-   vp[idx] = idx;  /* write data */
-   return 0;
-   }
-}
-
 #define NSEC_PER_SEC  (1UL * 1000 * 1000 * 1000)
 
 long nsec_sub(long nsec1, long nsec2)
@@ -950,6 +961,8 @@ int main(int argc, char *argv[])
opt_punch_holes = 1;
} else if (strcmp(opts[opt_index].name, "init-time") == 0) {
opt_init_time = 1;
+   } else if (strcmp(opts[opt_index].name, "touch-alloc") == 0) {
+   opt_touch_alloc = 1;
} else
usage(1);
break;
-- 
2.17.1



[PATCH] usemem: Add option init-time

2020-12-16 Thread Hui Zhu
From: Hui Zhu 

This commit adds a new option, init-time, which subtracts the
initialization time from the run time and reports the initialization time
separately.
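
A minimal sketch of the time bookkeeping this option relies on
(illustrative, not the patch code; usleep stands in for the real
initialization work):

#include <stdio.h>
#include <unistd.h>
#include <sys/time.h>

int main(void)
{
	struct timeval start, stop;
	unsigned long delta_us;

	gettimeofday(&start, NULL);
	usleep(250 * 1000);		/* stand-in for the initialization */
	gettimeofday(&stop, NULL);

	/* The delta is kept in microseconds, so the factor between
	 * tv_sec and tv_usec is 1000000. */
	delta_us = (stop.tv_sec - start.tv_sec) * 1000000 +
		   (stop.tv_usec - start.tv_usec);
	printf("the initialization time is %lu secs %lu usecs\n",
	       delta_us / 1000000, delta_us % 1000000);
	return 0;
}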

Signed-off-by: Hui Zhu 
---
 usemem.c | 29 +++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/usemem.c b/usemem.c
index 823647e..6d1d575 100644
--- a/usemem.c
+++ b/usemem.c
@@ -96,6 +96,7 @@ int opt_bind_interval = 0;
 unsigned long opt_delay = 0;
 int opt_read_again = 0;
 int opt_punch_holes = 0;
+int opt_init_time = 0;
 int nr_task;
 int nr_thread;
 int nr_cpu;
@@ -155,6 +156,7 @@ void usage(int ok)
"-U|--hugetlballocate hugetlbfs page\n"
"-Z|--read-again read memory again after access the memory\n"
"--punch-holes   free every other page after allocation\n"
+   "--init-time remove the initialization time from the run 
time and show the initialization time\n"
"-h|--help   show this message\n"
,   ourname);
 
@@ -193,7 +195,8 @@ static const struct option opts[] = {
{ "delay"   , 1, NULL, 'e' },
{ "hugetlb" , 0, NULL, 'U' },
{ "read-again"  , 0, NULL, 'Z' },
-   { "punch-holes" , 0, NULL,   0 },
+   { "punch-holes" , 0, NULL,   0 },
+   { "init-time"   , 0, NULL,   0 },
{ "help", 0, NULL, 'h' },
{ NULL  , 0, NULL, 0 }
 };
@@ -945,6 +948,8 @@ int main(int argc, char *argv[])
case 0:
if (strcmp(opts[opt_index].name, "punch-holes") == 0) {
opt_punch_holes = 1;
+   } else if (strcmp(opts[opt_index].name, "init-time") == 0) {
+   opt_init_time = 1;
} else
usage(1);
break;
@@ -1128,7 +1133,7 @@ int main(int argc, char *argv[])
if (optind != argc - 1)
usage(0);
 
-   if (!opt_write_signal_read)
+   if (!opt_write_signal_read || opt_init_time)
gettimeofday(&start_time, NULL);
 
opt_bytes = memparse(argv[optind], NULL);
@@ -1263,5 +1268,25 @@ int main(int argc, char *argv[])
if (!nr_task)
nr_task = 1;
 
+   if (opt_init_time) {
+   struct timeval stop;
+   char buf[1024];
+   size_t len;
+   unsigned long delta_us;
+
+   gettimeofday(&stop, NULL);
+   delta_us = (stop.tv_sec - start_time.tv_sec) * 1000000 +
+   (stop.tv_usec - start_time.tv_usec);
+   len = snprintf(buf, sizeof(buf),
+   "the initialization time is %lu secs %lu usecs\n",
+   delta_us / 1000000, delta_us % 1000000);
+   fflush(stdout);
+   if (write(1, buf, len) != len)
+   fprintf(stderr, "WARNING: statistics output may be 
incomplete.\n");
+
+   if (!opt_write_signal_read)
+   gettimeofday(&start_time, NULL);
+   }
+
return do_tasks();
 }
-- 
2.17.1



[PATCH] samples/bpf/Makefile: Create tools/testing/selftests/bpf dir

2020-12-14 Thread Hui Zhu
From: Hui Zhu 

Got an error when I built samples/bpf in a separate directory:
make O=../bk/ defconfig
make -j64 bzImage
make headers_install
make V=1 M=samples/bpf
...
...
make -C /home/teawater/kernel/linux/samples/bpf/../..//tools/build
CFLAGS= LDFLAGS= fixdep
make -f
/home/teawater/kernel/linux/samples/bpf/../..//tools/build/Makefile.build
dir=. obj=fixdep
make all_cmd
Warning: Kernel ABI header at 'tools/include/uapi/linux/netlink.h'
differs from latest version at 'include/uapi/linux/netlink.h'
Warning: Kernel ABI header at 'tools/include/uapi/linux/if_link.h'
differs from latest version at 'include/uapi/linux/if_link.h'
  gcc
-Wp,-MD,samples/bpf/../../tools/testing/selftests/bpf/.cgroup_helpers.o.d
-Wall -O2 -Wmissing-prototypes -Wstrict-prototypes -I./usr/include
-I/home/teawater/kernel/linux/tools/testing/selftests/bpf/
-I/home/teawater/kernel/linux/tools/lib/
-I/home/teawater/kernel/linux/tools/include
-I/home/teawater/kernel/linux/tools/perf -DHAVE_ATTR_TEST=0  -c -o
samples/bpf/../../tools/testing/selftests/bpf/cgroup_helpers.o
/home/teawater/kernel/linux/samples/bpf/../../tools/testing/selftests/bpf/cgroup_helpers.c
/home/teawater/kernel/linux/samples/bpf/../../tools/testing/selftests/bpf/cgroup_helpers.c:315:1:
fatal error: opening dependency file
samples/bpf/../../tools/testing/selftests/bpf/.cgroup_helpers.o.d: No
such file or directory

ls -al samples/bpf/../../tools/testing/selftests/bpf/
ls: cannot access 'samples/bpf/../../tools/testing/selftests/bpf/': No
such file or directory

The directory samples/bpf/../../tools/testing/selftests/bpf/ does not
exist, which causes the compilation error.

This commit adds a "mkdir -p" before building files in
samples/bpf/../../tools/testing/selftests/bpf/ to handle the issue.

Signed-off-by: Hui Zhu 
---
 samples/bpf/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index aeebf5d12f32..5b940eedf2e8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -262,6 +262,7 @@ clean:
 
 $(LIBBPF): FORCE
 # Fix up variables inherited from Kbuild that tools/ build system won't like
+   mkdir -p $(obj)/../../tools/testing/selftests/bpf/
$(MAKE) -C $(dir $@) RM='rm -rf' EXTRA_CFLAGS="$(TPROGS_CFLAGS)" \
LDFLAGS=$(TPROGS_LDFLAGS) srctree=$(BPF_SAMPLES_PATH)/../../ O=
 
-- 
2.17.1



[RFC for qemu v4 2/2] virtio_balloon: Add dcvq to deflate continuous pages

2020-07-15 Thread Hui Zhu
This commit adds a vq, dcvq, to deflate continuous pages.
When VIRTIO_BALLOON_F_CONT_PAGES is set, QEMU gets continuous pages from
dcvq and calls madvise MADV_WILLNEED on them.

Signed-off-by: Hui Zhu 
---
 hw/virtio/virtio-balloon.c | 14 +-
 include/hw/virtio/virtio-balloon.h |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index d36a5c8..165adf7 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -138,7 +138,8 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
 }
 
 static void balloon_deflate_page(VirtIOBalloon *balloon,
- MemoryRegion *mr, hwaddr mr_offset)
+ MemoryRegion *mr, hwaddr mr_offset,
+ size_t size)
 {
 void *addr = memory_region_get_ram_ptr(mr) + mr_offset;
 ram_addr_t rb_offset;
@@ -153,10 +154,11 @@ static void balloon_deflate_page(VirtIOBalloon *balloon,
 rb_page_size = qemu_ram_pagesize(rb);
 
 host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1));
+size &= ~(rb_page_size - 1);
 
 /* When a page is deflated, we hint the whole host page it lives
  * on, since we can't do anything smaller */
-ret = qemu_madvise(host_addr, rb_page_size, QEMU_MADV_WILLNEED);
+ret = qemu_madvise(host_addr, size, QEMU_MADV_WILLNEED);
 if (ret != 0) {
 warn_report("Couldn't MADV_WILLNEED on balloon deflate: %s",
 strerror(errno));
@@ -354,7 +356,7 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
 offset += 4;
 
-if (vq == s->icvq) {
+if (vq == s->icvq || vq == s->dcvq) {
 uint32_t psize_ptr;
if (iov_to_buf(elem->out_sg, elem->out_num, offset, &psize_ptr, 4) != 4) {
 break;
@@ -383,8 +385,9 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 balloon_inflate_page(s, section.mr,
  section.offset_within_region,
  psize, &pbp);
-} else if (vq == s->dvq) {
-balloon_deflate_page(s, section.mr, section.offset_within_region);
+} else if (vq == s->dvq || vq == s->dcvq) {
+balloon_deflate_page(s, section.mr, section.offset_within_region,
+ psize);
 } else {
 g_assert_not_reached();
 }
@@ -838,6 +841,7 @@ static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
 
 if (virtio_has_feature(s->host_features, VIRTIO_BALLOON_F_CONT_PAGES)) {
 s->icvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
+s->dcvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
 }
 
 reset_stats(s);
diff --git a/include/hw/virtio/virtio-balloon.h b/include/hw/virtio/virtio-balloon.h
index 6a2514d..848a7fb 100644
--- a/include/hw/virtio/virtio-balloon.h
+++ b/include/hw/virtio/virtio-balloon.h
@@ -42,7 +42,7 @@ enum virtio_balloon_free_page_report_status {
 
 typedef struct VirtIOBalloon {
 VirtIODevice parent_obj;
-VirtQueue *ivq, *dvq, *svq, *free_page_vq, *icvq;
+VirtQueue *ivq, *dvq, *svq, *free_page_vq, *icvq, *dcvq;
 uint32_t free_page_report_status;
 uint32_t num_pages;
 uint32_t actual;
-- 
2.7.4



[RFC for Linux v4 1/2] virtio_balloon: Add VIRTIO_BALLOON_F_CONT_PAGES and inflate_cont_vq

2020-07-15 Thread Hui Zhu
This commit adds a new flag, VIRTIO_BALLOON_F_CONT_PAGES, to virtio_balloon.
It also adds a vq, inflate_cont_vq, to inflate continuous pages.
When VIRTIO_BALLOON_F_CONT_PAGES is set, the driver tries to allocate
continuous pages and reports them through inflate_cont_vq.
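
A userspace sketch of the order-fallback idea only (not the kernel code;
try_alloc_order and its simulated failures merely mimic
balloon_pages_alloc failing at high orders because of fragmentation):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL
#define MAX_TRY_ORDER	9	/* 2M with 4K pages, i.e. the THP order */

/* Stand-in for the kernel allocation: pretend high orders can fail. */
static void *try_alloc_order(unsigned int order)
{
	if (order > 6 && rand() % 2)	/* simulate fragmentation failures */
		return NULL;
	return malloc(PAGE_SIZE << order);
}

int main(void)
{
	unsigned int order = MAX_TRY_ORDER;
	void *chunk;

	/* Same shape as the driver loop: start at the largest order and
	 * step down by one on each failure until order 0. */
	while (!(chunk = try_alloc_order(order)) && order > 0)
		order--;

	if (!chunk)
		return 1;
	printf("allocated a chunk of order %u (%lu bytes)\n",
	       order, PAGE_SIZE << order);
	free(chunk);
	return 0;
}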

Signed-off-by: Hui Zhu 
---
 drivers/virtio/virtio_balloon.c | 119 ++--
 include/linux/balloon_compaction.h  |   9 ++-
 include/uapi/linux/virtio_balloon.h |   1 +
 mm/balloon_compaction.c |  41 ++---
 4 files changed, 142 insertions(+), 28 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 1f157d2..b89f566 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -42,6 +42,9 @@
(1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT))
 #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER)
 
+#define VIRTIO_BALLOON_INFLATE_MAX_ORDER min((int) (sizeof(__virtio32) * BITS_PER_BYTE - \
+   1 - PAGE_SHIFT), (MAX_ORDER-1))
+
 #ifdef CONFIG_BALLOON_COMPACTION
 static struct vfsmount *balloon_mnt;
 #endif
@@ -52,6 +55,7 @@ enum virtio_balloon_vq {
VIRTIO_BALLOON_VQ_STATS,
VIRTIO_BALLOON_VQ_FREE_PAGE,
VIRTIO_BALLOON_VQ_REPORTING,
+   VIRTIO_BALLOON_VQ_INFLATE_CONT,
VIRTIO_BALLOON_VQ_MAX
 };
 
@@ -61,7 +65,7 @@ enum virtio_balloon_config_read {
 
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq, *inflate_cont_vq;
 
/* Balloon's own wq for cpu-intensive work items */
struct workqueue_struct *balloon_wq;
@@ -126,6 +130,9 @@ struct virtio_balloon {
/* Free page reporting device */
struct virtqueue *reporting_vq;
struct page_reporting_dev_info pr_dev_info;
+
+   /* Current order of inflate continuous pages - VIRTIO_BALLOON_F_CONT_PAGES */
+   __u32 current_pages_order;
 };
 
 static struct virtio_device_id id_table[] = {
@@ -208,19 +215,59 @@ static void set_page_pfns(struct virtio_balloon *vb,
  page_to_balloon_pfn(page) + i);
 }
 
+static void set_page_pfns_order(struct virtio_balloon *vb,
+   __virtio32 pfns[], struct page *page,
+   unsigned int order)
+{
+   if (order == 0)
+   return set_page_pfns(vb, pfns, page);
+
+   /* Set the first pfn of the continuous pages.  */
+   pfns[0] = cpu_to_virtio32(vb->vdev, page_to_balloon_pfn(page));
+   /* Set the size of the continuous pages.  */
+   pfns[1] = PAGE_SIZE << order;
+}
+
 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
 {
unsigned num_allocated_pages;
-   unsigned num_pfns;
+   unsigned int num_pfns, pfn_per_alloc;
struct page *page;
LIST_HEAD(pages);
+   bool is_cont = vb->current_pages_order != 0;
 
-   /* We can only do one array worth at a time. */
-   num = min(num, ARRAY_SIZE(vb->pfns));
-
-   for (num_pfns = 0; num_pfns < num;
-num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
-   struct page *page = balloon_page_alloc();
+   if (is_cont)
+   pfn_per_alloc = 2;
+   else
+   pfn_per_alloc = VIRTIO_BALLOON_PAGES_PER_PAGE;
+
+   for (num_pfns = 0, num_allocated_pages = 0;
+num_pfns < ARRAY_SIZE(vb->pfns) && num_allocated_pages < num;
+num_pfns += pfn_per_alloc,
+num_allocated_pages += VIRTIO_BALLOON_PAGES_PER_PAGE << 
vb->current_pages_order) {
+   struct page *page;
+
+   for (; vb->current_pages_order >= 0; vb->current_pages_order--) 
{
+   if (vb->current_pages_order &&
+   num - num_allocated_pages <
+   VIRTIO_BALLOON_PAGES_PER_PAGE << 
vb->current_pages_order)
+   continue;
+   page = balloon_pages_alloc(vb->current_pages_order);
+   if (page) {
+   /* If the first allocated page is not 
continuous pages,
+* go back to transport page as signle page.
+*/
+   if (is_cont && num_pfns == 0 && 
!vb->current_pages_order) {
+   is_cont = false;
+   pfn_per_alloc = 
VIRTIO_BALLOON_PAGES_PER_PAGE;
+   }
+   set_page_private(page, vb->current_pages_order);
+   balloon_page_push(&pages, page);
+   br

[RFC for qemu v4 0/2] virtio-balloon: Add option cont-pages to set VIRTIO_BALLOON_F_CONT_PAGES

2020-07-15 Thread Hui Zhu
Code of the current version for Linux and QEMU is available in [1] and [2].
Updates in this version:
1. Reporting continuous pages increases the speed, so deflating
   continuous pages was added as well.
2. According to the comments from David in [3], added 2 new vqs, icvq and
   dcvq, to transfer continuous pages in the format of a 32-bit pfn and a
   32-bit size.

Following is an introduction to the function.
Setting the option cont-pages to on enables the flag VIRTIO_BALLOON_F_CONT_PAGES.
QEMU will then get continuous pages from icvq and dcvq and call madvise
MADV_DONTNEED (inflate) or MADV_WILLNEED (deflate) on them.
Enabling this flag brings two benefits:
1. It increases the speed of balloon inflation and deflation.
2. It decreases the number of split THPs in the host.

[1] https://github.com/teawater/linux/tree/balloon_conts
[2] https://github.com/teawater/qemu/tree/balloon_conts
[3] https://lkml.org/lkml/2020/5/13/1211
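
As a rough sketch of how one element in this pfn+size format could be
consumed (illustrative userspace code, not QEMU internals; the flat
guest_base mapping and the handle_cont_entry helper are assumptions made
only for this example):

#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

#define BALLOON_PFN_SHIFT 12	/* balloon pages are 4K */

/* Decode one icvq/dcvq element (32-bit pfn, 32-bit size in bytes) and
 * apply the madvise that matches the queue it came from. */
static int handle_cont_entry(void *guest_base, uint32_t pfn, uint32_t size,
			     int advice /* MADV_DONTNEED or MADV_WILLNEED */)
{
	uint64_t gpa = (uint64_t)pfn << BALLOON_PFN_SHIFT;
	void *host = (char *)guest_base + gpa;	/* toy flat mapping */

	printf("gpa 0x%llx, %u bytes (%u balloon pages)\n",
	       (unsigned long long)gpa, size, size >> BALLOON_PFN_SHIFT);
	return madvise(host, size, advice);
}

int main(void)
{
	size_t len = 4UL << 20;		/* 4M of "guest memory" for the demo */
	void *guest = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (guest == MAP_FAILED)
		return 1;
	/* An inflate request for one 2M continuous chunk at pfn 0. */
	return handle_cont_entry(guest, 0, 2UL << 20, MADV_DONTNEED) ? 1 : 0;
}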

Hui Zhu (2):
  virtio_balloon: Add cont-pages and icvq
  virtio_balloon: Add dcvq to deflate continuous pages

 hw/virtio/virtio-balloon.c  |   92 +++-
 include/hw/virtio/virtio-balloon.h  |2
 include/standard-headers/linux/virtio_balloon.h |1
 3 files changed, 63 insertions(+), 32 deletions(-)


[RFC for Linux v4 0/2] virtio_balloon: Add VIRTIO_BALLOON_F_CONT_PAGES to report continuous pages

2020-07-15 Thread Hui Zhu
The first, second and third versions are in [1], [2] and [3].
Code of the current version for Linux and QEMU is available in [4] and [5].
Updates in this version:
1. Reporting continuous pages increases the speed, so deflating
   continuous pages was added as well.
2. According to the comments from David in [6], added 2 new vqs, inflate_cont_vq
   and deflate_cont_vq, to report continuous pages in the format of a 32-bit pfn
   and a 32-bit size.
Following is an introduction to the function.
These patches add VIRTIO_BALLOON_F_CONT_PAGES to virtio_balloon. With this
flag, the balloon tries to use continuous pages to inflate and deflate.
Enabling this flag brings two benefits:
1. Reporting continuous pages increases the amount of memory reported per
   tell_host call, which increases the speed of balloon inflation and
   deflation.
2. Host THPs are split when QEMU releases the pages of an inflated balloon.
   Inflating the balloon with continuous pages lets QEMU release pages that
   belong to the same THPs, which helps decrease the number of split THPs
   in the host.
   Following is an example in a VM with 1G memory and 1 CPU.  This test sets
   up an environment with a lot of fragmented pages; inflating the balloon
   will then split the THPs.
// This is the THP number before VM execution in the host.
// None use THP.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages: 0 kB
// After VM start, use usemem
// (https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git)
// punch-holes function generates 400m fragmentation pages in the guest
// kernel.
usemem --punch-holes -s -1 800m &
// This is the THP number after this command in the host.
// Some THP is used by VM because usemem will access 800M memory
// in the guest.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:911360 kB
// Connect to the QEMU monitor, setup balloon, and set it size to 600M.
(qemu) device_add virtio-balloon-pci,id=balloon1
(qemu) info balloon
balloon: actual=1024
(qemu) balloon 600
(qemu) info balloon
balloon: actual=600
// This is the THP number after inflate the balloon in the host.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages: 88064 kB
// Set the size back to 1024M in the QEMU monitor.
(qemu) balloon 1024
(qemu) info balloon
balloon: actual=1024
// Use usemem to increase the memory usage of QEMU.
killall usemem
usemem 800m
// This is the THP number after this operation.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages: 65536 kB

The following example switches to the continuous-pages balloon.  The number of
split THPs is decreased.
// This is the THP number before VM execution in the host.
// None use THP.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages: 0 kB
// After VM start, use usemem punch-holes function generates 400M
// fragmentation pages in the guest kernel.
usemem --punch-holes -s -1 800m &
// This is the THP number after this command in the host.
// Some THP is used by VM because usemem will access 800M memory
// in the guest.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:911360 kB
// Connect to the QEMU monitor, setup balloon, and set it size to 600M.
(qemu) device_add virtio-balloon-pci,id=balloon1,cont-pages=on
(qemu) info balloon
balloon: actual=1024
(qemu) balloon 600
(qemu) info balloon
balloon: actual=600
// This is the THP number after inflate the balloon in the host.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:616448 kB
// Set the size back to 1024M in the QEMU monitor.
(qemu) balloon 1024
(qemu) info balloon
balloon: actual=1024
// Use usemem to increase the memory usage of QEMU.
killall usemem
usemem 800m
// This is the THP number after this operation.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:907264 kB

[1] https://lkml.org/lkml/2020/3/12/144
[2] 
https://lore.kernel.org/linux-mm/1584893097-12317-1-git-send-email-teawa...@gmail.com/
[3] https://lkml.org/lkml/2020/5/12/324
[4] https://github.com/teawater/linux/tree/balloon_conts
[5] https://github.com/teawater/qemu/tree/balloon_conts
[6] https://lkml.org/lkml/2020/5/13/1211

Hui Zhu (2):
  virtio_balloon: Add VIRTIO_BALLOON_F_CONT_PAGES and inflate_cont_vq
  virtio_balloon: Add deflate_cont_vq to deflate continuous pages

 drivers/virtio/virtio_balloon.c |  180 +++-
 include/linux/balloon_compaction.h  |   12 ++
 include/uapi/linux/virtio_balloon.h |1
 mm/balloon_compaction.c |  117 +--
 4 files changed, 280 insertions(+), 30 deletions(-)


[RFC for qemu v4 1/2] virtio_balloon: Add cont-pages and icvq

2020-07-15 Thread Hui Zhu
This commit adds the cont-pages option to virtio_balloon.  With this option,
virtio_balloon enables the flag VIRTIO_BALLOON_F_CONT_PAGES.
It also adds a vq, icvq, to inflate continuous pages.
When VIRTIO_BALLOON_F_CONT_PAGES is set, QEMU gets continuous pages
from icvq and uses madvise MADV_DONTNEED to release the pages.
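
A userspace sketch of why the call granularity matters for THPs
(illustrative only; THP backing and 2M alignment are not guaranteed here,
the point is the difference between many 4K MADV_DONTNEED calls and one
call that covers a whole 2M chunk):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define PAGE_SZ	4096UL
#define HUGE_SZ	(2UL << 20)

int main(int argc, char **argv)
{
	int per_page = argc > 1 && !strcmp(argv[1], "--per-page");
	char *buf = mmap(NULL, HUGE_SZ, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	size_t off;

	if (buf == MAP_FAILED)
		return 1;
	madvise(buf, HUGE_SZ, MADV_HUGEPAGE);	/* ask for THP backing */
	memset(buf, 1, HUGE_SZ);		/* populate the region */

	if (per_page) {
		/* 512 separate 4K calls, as with single balloon pages:
		 * releasing sub-ranges forces the backing THP to be split. */
		for (off = 0; off < HUGE_SZ; off += PAGE_SZ)
			madvise(buf + off, PAGE_SZ, MADV_DONTNEED);
	} else {
		/* One call covering the whole 2M chunk, as with cont-pages:
		 * the THP can be dropped without being split first. */
		madvise(buf, HUGE_SZ, MADV_DONTNEED);
	}
	printf("released 2M via %s\n", per_page ? "512 x 4K" : "1 x 2M");
	return 0;
}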

Signed-off-by: Hui Zhu 
---
 hw/virtio/virtio-balloon.c  | 80 -
 include/hw/virtio/virtio-balloon.h  |  2 +-
 include/standard-headers/linux/virtio_balloon.h |  1 +
 3 files changed, 55 insertions(+), 28 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index a4729f7..d36a5c8 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -65,23 +65,26 @@ static bool 
virtio_balloon_pbp_matches(PartiallyBalloonedPage *pbp,
 
 static void balloon_inflate_page(VirtIOBalloon *balloon,
  MemoryRegion *mr, hwaddr mr_offset,
+ size_t size,
  PartiallyBalloonedPage *pbp)
 {
 void *addr = memory_region_get_ram_ptr(mr) + mr_offset;
 ram_addr_t rb_offset, rb_aligned_offset, base_gpa;
 RAMBlock *rb;
 size_t rb_page_size;
-int subpages;
+int subpages, pages_num;
 
 /* XXX is there a better way to get to the RAMBlock than via a
  * host address? */
 rb = qemu_ram_block_from_host(addr, false, &rb_offset);
 rb_page_size = qemu_ram_pagesize(rb);
 
+size &= ~(rb_page_size - 1);
+
 if (rb_page_size == BALLOON_PAGE_SIZE) {
 /* Easy case */
 
-ram_block_discard_range(rb, rb_offset, rb_page_size);
+ram_block_discard_range(rb, rb_offset, size);
 /* We ignore errors from ram_block_discard_range(), because it
  * has already reported them, and failing to discard a balloon
  * page is not fatal */
@@ -99,32 +102,38 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
 
 rb_aligned_offset = QEMU_ALIGN_DOWN(rb_offset, rb_page_size);
 subpages = rb_page_size / BALLOON_PAGE_SIZE;
-base_gpa = memory_region_get_ram_addr(mr) + mr_offset -
-   (rb_offset - rb_aligned_offset);
 
-if (pbp->bitmap && !virtio_balloon_pbp_matches(pbp, base_gpa)) {
-/* We've partially ballooned part of a host page, but now
- * we're trying to balloon part of a different one.  Too hard,
- * give up on the old partial page */
-virtio_balloon_pbp_free(pbp);
-}
+for (pages_num = size / BALLOON_PAGE_SIZE;
+ pages_num > 0; pages_num--) {
+base_gpa = memory_region_get_ram_addr(mr) + mr_offset -
+   (rb_offset - rb_aligned_offset);
 
-if (!pbp->bitmap) {
-virtio_balloon_pbp_alloc(pbp, base_gpa, subpages);
-}
+if (pbp->bitmap && !virtio_balloon_pbp_matches(pbp, base_gpa)) {
+/* We've partially ballooned part of a host page, but now
+* we're trying to balloon part of a different one.  Too hard,
+* give up on the old partial page */
+virtio_balloon_pbp_free(pbp);
+}
 
-set_bit((rb_offset - rb_aligned_offset) / BALLOON_PAGE_SIZE,
-pbp->bitmap);
+if (!pbp->bitmap) {
+virtio_balloon_pbp_alloc(pbp, base_gpa, subpages);
+}
 
-if (bitmap_full(pbp->bitmap, subpages)) {
-/* We've accumulated a full host page, we can actually discard
- * it now */
+set_bit((rb_offset - rb_aligned_offset) / BALLOON_PAGE_SIZE,
+pbp->bitmap);
 
-ram_block_discard_range(rb, rb_aligned_offset, rb_page_size);
-/* We ignore errors from ram_block_discard_range(), because it
- * has already reported them, and failing to discard a balloon
- * page is not fatal */
-virtio_balloon_pbp_free(pbp);
+if (bitmap_full(pbp->bitmap, subpages)) {
+/* We've accumulated a full host page, we can actually discard
+* it now */
+
+ram_block_discard_range(rb, rb_aligned_offset, rb_page_size);
+/* We ignore errors from ram_block_discard_range(), because it
+* has already reported them, and failing to discard a balloon
+* page is not fatal */
+virtio_balloon_pbp_free(pbp);
+}
+
+mr_offset += BALLOON_PAGE_SIZE;
 }
 }
 
@@ -340,12 +349,21 @@ static void virtio_balloon_handle_output(VirtIODevice 
*vdev, VirtQueue *vq)
 while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
 unsigned int p = virtio_ldl_p(vdev, &pfn);
 hwaddr pa;
+unsigned int psize = BALLOON_PAGE_SIZE;
 
 pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
 offset += 4;
 
-section = memory_region_find(get_system_memory(), pa,
- 

[RFC for Linux v4 2/2] virtio_balloon: Add deflate_cont_vq to deflate continuous pages

2020-07-15 Thread Hui Zhu
This commit adds a vq, deflate_cont_vq, to deflate continuous pages.
When VIRTIO_BALLOON_F_CONT_PAGES is set, leak_balloon_cont is called to
leak the balloon.
leak_balloon_cont calls balloon_page_list_dequeue_cont to get continuous
pages from the balloon and reports them through deflate_cont_vq.

Signed-off-by: Hui Zhu 
---
 drivers/virtio/virtio_balloon.c| 73 
 include/linux/balloon_compaction.h |  3 ++
 mm/balloon_compaction.c| 76 ++
 3 files changed, 144 insertions(+), 8 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index b89f566..258b3d9 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -44,6 +44,7 @@
 
 #define VIRTIO_BALLOON_INFLATE_MAX_ORDER min((int) (sizeof(__virtio32) * BITS_PER_BYTE - \
1 - PAGE_SHIFT), (MAX_ORDER-1))
+#define VIRTIO_BALLOON_DEFLATE_MAX_PAGES_NUM (((__virtio32)~0U) >> PAGE_SHIFT)
 
 #ifdef CONFIG_BALLOON_COMPACTION
 static struct vfsmount *balloon_mnt;
@@ -56,6 +57,7 @@ enum virtio_balloon_vq {
VIRTIO_BALLOON_VQ_FREE_PAGE,
VIRTIO_BALLOON_VQ_REPORTING,
VIRTIO_BALLOON_VQ_INFLATE_CONT,
+   VIRTIO_BALLOON_VQ_DEFLATE_CONT,
VIRTIO_BALLOON_VQ_MAX
 };
 
@@ -65,7 +67,8 @@ enum virtio_balloon_config_read {
 
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq, *inflate_cont_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq,
+*inflate_cont_vq, *deflate_cont_vq;
 
/* Balloon's own wq for cpu-intensive work items */
struct workqueue_struct *balloon_wq;
@@ -215,6 +218,16 @@ static void set_page_pfns(struct virtio_balloon *vb,
  page_to_balloon_pfn(page) + i);
 }
 
+static void set_page_pfns_size(struct virtio_balloon *vb,
+  __virtio32 pfns[], struct page *page,
+  size_t size)
+{
+   /* Set the first pfn of the continuous pages.  */
+   pfns[0] = cpu_to_virtio32(vb->vdev, page_to_balloon_pfn(page));
+   /* Set the size of the continuous pages.  */
+   pfns[1] = (__virtio32) size;
+}
+
 static void set_page_pfns_order(struct virtio_balloon *vb,
__virtio32 pfns[], struct page *page,
unsigned int order)
@@ -222,10 +235,7 @@ static void set_page_pfns_order(struct virtio_balloon *vb,
if (order == 0)
return set_page_pfns(vb, pfns, page);
 
-   /* Set the first pfn of the continuous pages.  */
-   pfns[0] = cpu_to_virtio32(vb->vdev, page_to_balloon_pfn(page));
-   /* Set the size of the continuous pages.  */
-   pfns[1] = PAGE_SIZE << order;
+   set_page_pfns_size(vb, pfns, page, PAGE_SIZE << order);
 }
 
 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
@@ -367,6 +377,42 @@ static unsigned leak_balloon(struct virtio_balloon *vb, size_t num)
return num_freed_pages;
 }
 
+static unsigned int leak_balloon_cont(struct virtio_balloon *vb, size_t num)
+{
+   unsigned int num_freed_pages;
+   struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
+   LIST_HEAD(pages);
+   size_t num_pages;
+
+   mutex_lock(&vb->balloon_lock);
+   for (vb->num_pfns = 0, num_freed_pages = 0;
+vb->num_pfns < ARRAY_SIZE(vb->pfns) && num_freed_pages < num;
+vb->num_pfns += 2,
+num_freed_pages += num_pages << (PAGE_SHIFT - VIRTIO_BALLOON_PFN_SHIFT)) {
+   struct page *page;
+
+   num_pages = balloon_page_list_dequeue_cont(vb_dev_info, &pages, &page,
+   min_t(size_t,
+ VIRTIO_BALLOON_DEFLATE_MAX_PAGES_NUM,
+ num - num_freed_pages));
+   if (!num_pages)
+   break;
+   set_page_pfns_size(vb, vb->pfns + vb->num_pfns, page, num_pages << PAGE_SHIFT);
+   }
+   vb->num_pages -= num_freed_pages;
+
+   /*
+* Note that if
+* virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST);
+* is true, we *have* to do it in this order
+*/
+   if (vb->num_pfns != 0)
+   tell_host(vb, vb->deflate_cont_vq);
+   release_pages_balloon(vb, &pages);
+   mutex_unlock(&vb->balloon_lock);
+   return num_freed_pages;
+}
+
 static inline void update_stat(struct virtio_balloon *vb, int idx,
   u16 tag, u64 val)
 {
@@ -551,8 +597,12 @@ static void update_balloon_size_func(struct work_struct *work)
 
if (diff > 0)

[RFC v3 for QEMU] virtio-balloon: Add option cont-pages to set VIRTIO_BALLOON_VQ_INFLATE_CONT

2020-05-12 Thread Hui Zhu
If the guest kernel has many fragmented pages, using virtio_balloon will
split THPs of QEMU when it calls the MADV_DONTNEED madvise to release
the balloon pages.
Setting the option cont-pages to on enables the flag
VIRTIO_BALLOON_VQ_INFLATE_CONT and sets the default continuous-pages order
to the THP order.
QEMU then gets the PFNs of continuous pages, whose order is
current_pages_order, from the VQ ivq and uses madvise MADV_DONTNEED to
release them.
This handles the THP split issue.

Signed-off-by: Hui Zhu 
---
 hw/virtio/virtio-balloon.c  | 77 +
 include/hw/virtio/virtio-balloon.h  |  2 +
 include/standard-headers/linux/virtio_balloon.h |  5 ++
 3 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index a4729f7..84d47d3 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -34,6 +34,7 @@
 #include "hw/virtio/virtio-access.h"
 
 #define BALLOON_PAGE_SIZE  (1 << VIRTIO_BALLOON_PFN_SHIFT)
+#define CONT_PAGES_ORDER   9
 
 typedef struct PartiallyBalloonedPage {
 ram_addr_t base_gpa;
@@ -72,6 +73,8 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
 RAMBlock *rb;
 size_t rb_page_size;
 int subpages;
+size_t inflate_size = BALLOON_PAGE_SIZE << balloon->current_pages_order;
+int pages_num;
 
 /* XXX is there a better way to get to the RAMBlock than via a
  * host address? */
@@ -81,7 +84,7 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
 if (rb_page_size == BALLOON_PAGE_SIZE) {
 /* Easy case */
 
-ram_block_discard_range(rb, rb_offset, rb_page_size);
+ram_block_discard_range(rb, rb_offset, inflate_size);
 /* We ignore errors from ram_block_discard_range(), because it
  * has already reported them, and failing to discard a balloon
  * page is not fatal */
@@ -99,32 +102,38 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
 
 rb_aligned_offset = QEMU_ALIGN_DOWN(rb_offset, rb_page_size);
 subpages = rb_page_size / BALLOON_PAGE_SIZE;
-base_gpa = memory_region_get_ram_addr(mr) + mr_offset -
-   (rb_offset - rb_aligned_offset);
 
-if (pbp->bitmap && !virtio_balloon_pbp_matches(pbp, base_gpa)) {
-/* We've partially ballooned part of a host page, but now
- * we're trying to balloon part of a different one.  Too hard,
- * give up on the old partial page */
-virtio_balloon_pbp_free(pbp);
-}
+for (pages_num = inflate_size / BALLOON_PAGE_SIZE;
+ pages_num > 0; pages_num--) {
+base_gpa = memory_region_get_ram_addr(mr) + mr_offset -
+   (rb_offset - rb_aligned_offset);
 
-if (!pbp->bitmap) {
-virtio_balloon_pbp_alloc(pbp, base_gpa, subpages);
-}
+if (pbp->bitmap && !virtio_balloon_pbp_matches(pbp, base_gpa)) {
+/* We've partially ballooned part of a host page, but now
+* we're trying to balloon part of a different one.  Too hard,
+* give up on the old partial page */
+virtio_balloon_pbp_free(pbp);
+}
 
-set_bit((rb_offset - rb_aligned_offset) / BALLOON_PAGE_SIZE,
-pbp->bitmap);
+if (!pbp->bitmap) {
+virtio_balloon_pbp_alloc(pbp, base_gpa, subpages);
+}
 
-if (bitmap_full(pbp->bitmap, subpages)) {
-/* We've accumulated a full host page, we can actually discard
- * it now */
+set_bit((rb_offset - rb_aligned_offset) / BALLOON_PAGE_SIZE,
+pbp->bitmap);
 
-ram_block_discard_range(rb, rb_aligned_offset, rb_page_size);
-/* We ignore errors from ram_block_discard_range(), because it
- * has already reported them, and failing to discard a balloon
- * page is not fatal */
-virtio_balloon_pbp_free(pbp);
+if (bitmap_full(pbp->bitmap, subpages)) {
+/* We've accumulated a full host page, we can actually discard
+* it now */
+
+ram_block_discard_range(rb, rb_aligned_offset, rb_page_size);
+/* We ignore errors from ram_block_discard_range(), because it
+* has already reported them, and failing to discard a balloon
+* page is not fatal */
+virtio_balloon_pbp_free(pbp);
+}
+
+mr_offset += BALLOON_PAGE_SIZE;
 }
 }
 
@@ -345,7 +354,7 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 offset += 4;
 
 section = memory_region_find(get_system_memory(), pa,
- BALLOON_PAGE_SIZE);
+BALLOON_PAGE_SIZE << s->current_pages_order);
 if (!section.mr) {
 trace_virtio_balloon_bad_addr(pa);
 continue;
@@ -618,9 +627,12 @@ static size_t vir

[RFC v3 for Linux] virtio_balloon: Add VIRTIO_BALLOON_VQ_INFLATE_CONT to handle THP split issue

2020-05-12 Thread Hui Zhu
The first and second versions are in [1] and [2].
According to the comments from Michael, I updated the patch:
1. Removed the separate vq inflate_cont_vq and just use inflate_vq to
   transport inflate pages.
2. Added two fields, max_pages_order and current_pages_order, to
   virtio_balloon_config instead of pages_order.
   max_pages_order is set by QEMU.  It is the maximum order of the inflate
   pages.
   current_pages_order is set by the kernel.  It is the current order of
   the inflate pages.
   When balloon inflation begins, current_pages_order is set to
   max_pages_order.
   The kernel tries to allocate a page of order current_pages_order.  If the
   allocation fails, current_pages_order is reduced by 1, down to 0.
   When QEMU gets a pfn from inflate_vq, it releases memory of the size
   implied by current_pages_order.

Following is an introduction to the function.
If the guest kernel has many fragmented pages, using virtio_balloon will
split THPs of QEMU when it calls the MADV_DONTNEED madvise to release
the balloon pages.
This is an example in a VM with 1G memory and 1 CPU:
// This is the THP number before VM execution in the host.
// None use THP.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages: 0 kB

// After VM start, use usemem
// (https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git)
// punch-holes function generates 400m fragmentation pages in the guest
// kernel.
usemem --punch-holes -s -1 800m &

// This is the THP number after this command in the host.
// Some THP is used by VM because usemem will access 800M memory
// in the guest.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:978944 kB

// Connect to the QEMU monitor, setup balloon, and set it size to 600M.
(qemu) device_add virtio-balloon-pci,id=balloon1
(qemu) info balloon
balloon: actual=1024
(qemu) balloon 600
(qemu) info balloon
balloon: actual=600

// This is the THP number after inflate the balloon in the host.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:153600 kB

// Set the size back to 1024M in the QEMU monitor.
(qemu) balloon 1024
(qemu) info balloon
balloon: actual=1024

// Use usemem to increase the memory usage of QEMU.
killall usemem
usemem 800m

// This is the THP number after this operation.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:153600 kB

The THP number in the host decreased by more than 800M after inflating the
balloon.
The reason is that usemem with the punch-holes option frees every other
page after allocation, so 400M of the free memory inside the guest kernel
consists of fragmented pages.
The guest kernel uses them to inflate the balloon.  When these
fragmented pages are freed, THPs are split.
The THP number does not increase after deflating the balloon and
increasing memory usage, because the fragmented addresses affect THP
allocation in the host.

This commit tries to handle this by adding a new flag,
VIRTIO_BALLOON_VQ_INFLATE_CONT.
When this flag is set, the balloon tries to inflate with continuous pages,
and the default page order is set to the THP order.
THP pages are then freed together in the host.
This is an example in a VM with 1G memory 1CPU:
// This is the THP number before VM execution in the host.
// None use THP.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages: 0 kB

// After VM start, use usemem punch-holes function generates 400M
// fragmentation pages in the guest kernel.
usemem --punch-holes -s -1 800m &

// This is the THP number after this command in the host.
// Some THP is used by VM because usemem will access 800M memory
// in the guest.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:978944 kB

// Connect to the QEMU monitor, setup balloon, and set it size to 600M.
(qemu) device_add virtio-balloon-pci,id=balloon1,cont-pages=on
(qemu) info balloon
balloon: actual=1024
(qemu) balloon 600
(qemu) info balloon
balloon: actual=600

// This is the THP number after inflate the balloon in the host.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:612352 kB

// Set the size back to 1024M in the QEMU monitor.
(qemu) balloon 1024
(qemu) info balloon
balloon: actual=1024

// Use usemem to increase the memory usage of QEMU.
killall usemem
usemem 800m

// This is the THP number after this operation.
cat /proc/meminfo | grep AnonHugePages:
AnonHugePages:944128 kB

The THP number decreases by only 358M.  This shows that
VIRTIO_BALLOON_VQ_INFLATE_CONT can help handle the THP split issue.
The THP number increases again after deflating the balloon and increasing
memory usage.

[1] https://lkml.org/lkml/2020/3/12/144
[2] 
https://lore.kernel.org/linux-mm/1584893097-12317-1-git-send-email-teawa...@gmail.com/

Signed-off-by: Hui Zhu 
---
 drivers/virtio/virtio_balloon.c | 98 +++--
 include/linux/balloon_compaction.h  |  9 +++-
 include/uapi/linux/virtio_balloon.h |  5 ++
 mm/balloon_compaction.c | 40 ---
 4 files changed, 129 insertions(+), 23 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/driver

[PATCH 1/2] mm, frontswap: Fix frontswap_map issue with THP

2019-10-14 Thread Hui Zhu
When shrinking, the frontswap interface is used to store a THP as if it
were a normal page in __frontswap_store:
if (ret == 0) {
__frontswap_set(sis, offset);
inc_frontswap_succ_stores();
} else {
For a THP, all of the corresponding bits should be set, not just the first
one.

This commit sets all of the bits for a THP.
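
A userspace sketch of the accounting rule the fix enforces (illustrative;
frontswap_set below is only a stand-in for __frontswap_set): a stored THP
occupies nr consecutive swap offsets, so all nr bits must be set, not just
the first one.

#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_SHIFT	21	/* 2M THP */
#define MAP_WORDS	1024

/* Stand-in for __frontswap_set(): mark one swap offset in a bitmap. */
static void frontswap_set(unsigned long *map, unsigned long offset)
{
	map[offset / (8 * sizeof(*map))] |= 1UL << (offset % (8 * sizeof(*map)));
}

int main(void)
{
	unsigned long map[MAP_WORDS] = { 0 };
	unsigned long offset = 1024;	/* first swap slot of the THP */
	int i, nr = 1 << (HPAGE_SHIFT - PAGE_SHIFT);	/* 512 subpages */

	for (i = 0; i < nr; i++)
		frontswap_set(map, offset + i);

	printf("marked offsets %lu..%lu\n", offset, offset + nr - 1);
	return 0;
}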

Signed-off-by: Hui Zhu 
---
 mm/frontswap.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/frontswap.c b/mm/frontswap.c
index 60bb20e..f07ea63 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -274,8 +274,12 @@ int __frontswap_store(struct page *page)
break;
}
if (ret == 0) {
-   __frontswap_set(sis, offset);
-   inc_frontswap_succ_stores();
+   int i, nr = hpage_nr_pages(page);
+
+   for (i = 0; i < nr; i++) {
+   __frontswap_set(sis, offset + i);
+   inc_frontswap_succ_stores();
+   }
} else {
inc_frontswap_failed_stores();
}
-- 
2.7.4



[PATCH 2/2] mm, zswap: Support THP

2019-10-14 Thread Hui Zhu
This commit lets zswap treat a THP as a series of continuous normal pages
in zswap_frontswap_store.
They are stored in multiple "zswap_entry" structures, which are then
inserted into the "zswap_tree" together.
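
A minimal sketch of the control flow described above (illustrative
userspace code; store_subpage and store_thp are stand-ins, not the patch's
functions): the THP is handled as a run of continuous normal pages, one
entry per 4K subpage, and the entries are then inserted together.

#include <stdio.h>

#define HPAGE_NR_PAGES	512	/* 2M THP with 4K pages */

/* Stand-in for compressing one 4K subpage and building one entry. */
static int store_subpage(unsigned type, unsigned long offset)
{
	(void)type;
	(void)offset;
	return 0;
}

static int store_thp(unsigned type, unsigned long offset)
{
	int i, ret;

	for (i = 0; i < HPAGE_NR_PAGES; i++) {
		ret = store_subpage(type, offset + i);
		if (ret)
			return ret;	/* a real implementation must undo earlier entries */
	}
	/* ... insert all HPAGE_NR_PAGES entries into the tree together ... */
	return 0;
}

int main(void)
{
	unsigned long offset = 4096;

	if (store_thp(0, offset))
		return 1;
	printf("stored THP as %d entries at offsets %lu..%lu\n",
	       HPAGE_NR_PAGES, offset, offset + HPAGE_NR_PAGES - 1);
	return 0;
}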

Signed-off-by: Hui Zhu 
---
 mm/zswap.c | 170 +++--
 1 file changed, 109 insertions(+), 61 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 46a3223..36aa10d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -316,11 +316,7 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
}
 }
 
-/*
- * Carries out the common pattern of freeing and entry's zpool allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
-static void zswap_free_entry(struct zswap_entry *entry)
+static void zswap_free_entry_1(struct zswap_entry *entry)
 {
if (!entry->length)
atomic_dec(&zswap_same_filled_pages);
@@ -329,6 +325,15 @@ static void zswap_free_entry(struct zswap_entry *entry)
zswap_pool_put(entry->pool);
}
zswap_entry_cache_free(entry);
+}
+
+/*
+ * Carries out the common pattern of freeing and entry's zpool allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_free_entry(struct zswap_entry *entry)
+{
+   zswap_free_entry_1(entry);
atomic_dec(&zswap_stored_pages);
zswap_update_total_size();
 }
@@ -980,15 +985,11 @@ static void zswap_fill_page(void *ptr, unsigned long value)
memset_l(page, value, PAGE_SIZE / sizeof(unsigned long));
 }
 
-/*
-* frontswap hooks
-**/
-/* attempts to compress and store an single page */
-static int zswap_frontswap_store(unsigned type, pgoff_t offset,
-   struct page *page)
+static int zswap_frontswap_store_1(unsigned type, pgoff_t offset,
+   struct page *page,
+   struct zswap_entry **entry_pointer)
 {
-   struct zswap_tree *tree = zswap_trees[type];
-   struct zswap_entry *entry, *dupentry;
+   struct zswap_entry *entry;
struct crypto_comp *tfm;
int ret;
unsigned int hlen, dlen = PAGE_SIZE;
@@ -998,36 +999,6 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
gfp_t gfp;
 
-   /* THP isn't supported */
-   if (PageTransHuge(page)) {
-   ret = -EINVAL;
-   goto reject;
-   }
-
-   if (!zswap_enabled || !tree) {
-   ret = -ENODEV;
-   goto reject;
-   }
-
-   /* reclaim space if needed */
-   if (zswap_is_full()) {
-   zswap_pool_limit_hit++;
-   if (zswap_shrink()) {
-   zswap_reject_reclaim_fail++;
-   ret = -ENOMEM;
-   goto reject;
-   }
-
-   /* A second zswap_is_full() check after
-* zswap_shrink() to make sure it's now
-* under the max_pool_percent
-*/
-   if (zswap_is_full()) {
-   ret = -ENOMEM;
-   goto reject;
-   }
-   }
-
/* allocate entry */
entry = zswap_entry_cache_alloc(GFP_KERNEL);
if (!entry) {
@@ -1035,6 +1006,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
ret = -ENOMEM;
goto reject;
}
+   *entry_pointer = entry;
 
if (zswap_same_filled_pages_enabled) {
src = kmap_atomic(page);
@@ -1044,7 +1016,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
entry->length = 0;
entry->value = value;
atomic_inc(&zswap_same_filled_pages);
-   goto insert_entry;
+   goto out;
}
kunmap_atomic(src);
}
@@ -1093,31 +1065,105 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
entry->handle = handle;
entry->length = dlen;
 
-insert_entry:
+out:
+   return 0;
+
+put_dstmem:
+   put_cpu_var(zswap_dstmem);
+   zswap_pool_put(entry->pool);
+freepage:
+   zswap_entry_cache_free(entry);
+reject:
+   return ret;
+}
+
+/*
+* frontswap hooks
+**/
+/* attempts to compress and store an single page */
+static int zswap_frontswap_store(unsigned type, pgoff_t offset,
+   struct page *page)
+{
+   struct zswap_tree *tree = zswap_trees[type];
+   struct zswap_entry **entries = NULL, *dupentry;
+   struct zswap_entry *single_entry[1];
+   int ret;
+ 

[RFC v4] zswap: Add CONFIG_ZSWAP_IO_SWITCH to handle swap IO issue

2019-10-08 Thread Hui Zhu
This is the fourth version of this patch.  The previous versions
are in [1], [2] and [3].

The parameters read_in_flight_limit and write_in_flight_limit were
replaced by io_switch_enabled_enabled in this version to make the
function clearer.

Currently, I use a VM that has 1 CPU, 4G memory and a 4G swap file.
I found that swap affects IO performance while it is running, so I
enabled zswap to handle it, because zswap only uses CPU cycles and not
disk IO.

It works, but I found that zswap is slower than normal swap in this
VM: zswap is about 300M/s and normal swap is about 500M/s.  (The reason
is that the swap disk device config is "cache=none,aio=native".)
So enabling zswap makes the memory shrinker slower but is good for IO
performance in this VM.  Therefore I only want zswap to kick in when
the disk backing the swap file is under high IO load.

This commit is designed around this idea.
When this function is enabled via the zswap parameter
io_switch_enabled_enabled, zswap only stores pages while the swap disk
has outstanding I/O requests.

[1] https://lkml.org/lkml/2019/9/11/935
[2] https://lkml.org/lkml/2019/9/20/90
[3] https://lkml.org/lkml/2019/9/22/927

Signed-off-by: Hui Zhu 
---
 include/linux/swap.h |  3 +++
 mm/Kconfig   | 14 ++
 mm/page_io.c | 16 
 mm/zswap.c   | 25 +
 4 files changed, 58 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a..82b621f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -389,6 +389,9 @@ extern void end_swap_bio_write(struct bio *bio);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func);
 extern int swap_set_page_dirty(struct page *page);
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+extern void swap_io_in_flight(struct page *page, unsigned int inflight[2]);
+#endif
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block);
diff --git a/mm/Kconfig b/mm/Kconfig
index 56cec63..f5740e3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -546,6 +546,20 @@ config ZSWAP
  they have not be fully explored on the large set of potential
  configurations and workloads that exist.
 
+config ZSWAP_IO_SWITCH
+   bool "Compressed cache for swap pages according to the IO status"
+   depends on ZSWAP
+   help
+ This function helps the system that normal swap speed is higher
+ than zswap speed to handle the swap IO issue.
+ For example, a VM where the swap disk device with config
+ "cache=none,aio=native".
+
+ When this function is enabled by the swap parameter
+ io_switch_enabled_enabled, zswap will just work when the swap disk
+ has outstanding I/O requests.
+ If unsure, say "n".
+
 config ZPOOL
tristate "Common API for compressed memory storage"
help
diff --git a/mm/page_io.c b/mm/page_io.c
index 24ee600..e66b050 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -434,3 +434,19 @@ int swap_set_page_dirty(struct page *page)
return __set_page_dirty_no_writeback(page);
}
 }
+
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+void swap_io_in_flight(struct page *page, unsigned int inflight[2])
+{
+   struct swap_info_struct *sis = page_swap_info(page);
+
+   if (!sis->bdev) {
+   inflight[0] = 0;
+   inflight[1] = 0;
+   return;
+   }
+
+   part_in_flight_rw(bdev_get_queue(sis->bdev), sis->bdev->bd_part,
+ inflight);
+}
+#endif
diff --git a/mm/zswap.c b/mm/zswap.c
index 0e22744..b50d8fb 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -114,6 +114,18 @@ static bool zswap_same_filled_pages_enabled = true;
 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
   bool, 0644);
 
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+/*
+ * Enable/disable the io switch functon (disabled by default)
+ * When the io switch functon is enabled, zswap will only try to
+ * store pages when IO of the swap device is low (read and write io in
+ * flight number is 0).
+ */
+static bool zswap_io_switch_enabled;
+module_param_named(io_switch_enabled_enabled, zswap_io_switch_enabled,
+  bool, 0644);
+#endif
+
 /*
 * data structures
 **/
@@ -1009,6 +1021,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
goto reject;
}
 
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+   if (zswap_io_switch_enabled) {
+   unsigned int inflight[2];
+
+   swap_io_in_flight(page, inflight);
+
+   if (inflight[0] == 0 || inflight[1] == 0) {
+   ret = -EIO;
+   goto reject;
+   }
+   }
+#endif
+
/* reclaim space if needed */
if (zswap_is_full()) {
zswap_pool_limit_hit++;
-- 
2.7.4



[RFC v3] zswap: Add CONFIG_ZSWAP_IO_SWITCH to handle swap IO issue

2019-09-22 Thread Hui Zhu
This is the third version of this patch.  The first and second versions
are in [1] and [2].
This version is updated according to the comments from Randy Dunlap
in [3].

Currently, I use a VM that has 2 CPUs, 4G memory and a 4G swap file.
I found that swap affects IO performance while it is running, so I
enabled zswap to handle it, because zswap only uses CPU cycles and not
disk IO.

It works, but I found that zswap is slower than normal swap in this
VM: zswap is about 300M/s and normal swap is about 500M/s.  (The reason
is that the disk inside the VM is backed by fscache on the host machine.)
So enabling zswap makes the memory shrinker slower but is good for IO
performance in this VM.  Therefore I only want zswap to kick in when
the disk backing the swap file is under high IO load.

This commit is designed around this idea.
It adds two parameters, read_in_flight_limit and write_in_flight_limit,
to zswap.
In zswap_frontswap_store, pages are stored to zswap only when the
in-flight IO count of the swap device is bigger than
zswap_read_in_flight_limit or zswap_write_in_flight_limit while zswap
is enabled.
In other words, zswap only takes over while the swap device is busy
with IO.

[1] https://lkml.org/lkml/2019/9/11/935
[2] https://lkml.org/lkml/2019/9/20/90
[3] https://lkml.org/lkml/2019/9/20/1076

Signed-off-by: Hui Zhu 
---
 include/linux/swap.h |  3 +++
 mm/Kconfig   | 18 
 mm/page_io.c | 16 +++
 mm/zswap.c   | 58 
 4 files changed, 95 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a..82b621f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -389,6 +389,9 @@ extern void end_swap_bio_write(struct bio *bio);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func);
 extern int swap_set_page_dirty(struct page *page);
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+extern void swap_io_in_flight(struct page *page, unsigned int inflight[2]);
+#endif
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block);
diff --git a/mm/Kconfig b/mm/Kconfig
index 56cec63..387c3b5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -546,6 +546,24 @@ config ZSWAP
  they have not be fully explored on the large set of potential
  configurations and workloads that exist.
 
+config ZSWAP_IO_SWITCH
+   bool "Compressed cache for swap pages according to the IO status"
+   depends on ZSWAP
+   help
+ This function helps the system that normal swap speed is higher
+ than zswap speed to handle the swap IO issue.
+ For example, a VM where the disk device is not set cache config or
+ set cache=writeback.
+
+ This function makes zswap just work when the disk of the swap file
+ is under high IO load.
+ It add two parameters (read_in_flight_limit and
+ write_in_flight_limit) to zswap.  When zswap is enabled, pages will
+ be stored to zswap only when the IO in flight number of swap device
+ is bigger than zswap_read_in_flight_limit or
+ zswap_write_in_flight_limit.
+ If unsure, say "n".
+
 config ZPOOL
tristate "Common API for compressed memory storage"
help
diff --git a/mm/page_io.c b/mm/page_io.c
index 24ee600..e66b050 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -434,3 +434,19 @@ int swap_set_page_dirty(struct page *page)
return __set_page_dirty_no_writeback(page);
}
 }
+
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+void swap_io_in_flight(struct page *page, unsigned int inflight[2])
+{
+   struct swap_info_struct *sis = page_swap_info(page);
+
+   if (!sis->bdev) {
+   inflight[0] = 0;
+   inflight[1] = 0;
+   return;
+   }
+
+   part_in_flight_rw(bdev_get_queue(sis->bdev), sis->bdev->bd_part,
+ inflight);
+}
+#endif
diff --git a/mm/zswap.c b/mm/zswap.c
index 0e22744..0190b2d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -62,6 +62,14 @@ static u64 zswap_reject_compress_poor;
 static u64 zswap_reject_alloc_fail;
 /* Store failed because the entry metadata could not be allocated (rare) */
 static u64 zswap_reject_kmemcache_fail;
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+/*
+ * Store failed because zswap_read_in_flight_limit or
+ * zswap_write_in_flight_limit is bigger than IO in flight number of
+ * swap device
+ */
+static u64 zswap_reject_io;
+#endif
 /* Duplicate store was encountered (rare) */
 static u64 zswap_duplicate_entry;
 
@@ -114,6 +122,24 @@ static bool zswap_same_filled_pages_enabled = true;
 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
   bool, 0644);
 
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+/*
+ * zswap will not try to store the page if zswap_read_in_flight_limit is
+ * bigger than IO read in flight numbe

[RFC v2] zswap: Add CONFIG_ZSWAP_IO_SWITCH to handle swap IO issue

2019-09-19 Thread Hui Zhu
This is the second version of this patch.  The previous version is in
https://lkml.org/lkml/2019/9/11/935
I updated the commit introduction and the Kconfig text because they were
not clear.

Currently, I use a VM that has 2 CPUs, 4G memory and a 4G swap file.
I found that swap affects IO performance while it is running, so I
enabled zswap to handle it, because zswap only uses CPU cycles and not
disk IO.

It works, but I found that zswap is slower than normal swap in this
VM: zswap is about 300M/s and normal swap is about 500M/s.  (The reason
is that the disk inside the VM is backed by fscache on the host machine.)
So enabling zswap makes the memory shrinker slower but is good for IO
performance in this VM.  Therefore I only want zswap to kick in when
the disk backing the swap file is under high IO load.

This commit is designed around this idea.
It adds two parameters, read_in_flight_limit and write_in_flight_limit,
to zswap.
In zswap_frontswap_store, pages are stored to zswap only when the
in-flight IO count of the swap device is bigger than
zswap_read_in_flight_limit or zswap_write_in_flight_limit while zswap
is enabled.
In other words, zswap only takes over while the swap device is busy
with IO.

Signed-off-by: Hui Zhu 
---
 include/linux/swap.h |  3 +++
 mm/Kconfig   | 18 +
 mm/page_io.c | 16 +++
 mm/zswap.c   | 55 
 4 files changed, 92 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a..82b621f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -389,6 +389,9 @@ extern void end_swap_bio_write(struct bio *bio);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func);
 extern int swap_set_page_dirty(struct page *page);
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+extern void swap_io_in_flight(struct page *page, unsigned int inflight[2]);
+#endif
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block);
diff --git a/mm/Kconfig b/mm/Kconfig
index 56cec63..5408d65 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -546,6 +546,24 @@ config ZSWAP
  they have not be fully explored on the large set of potential
  configurations and workloads that exist.
 
+config ZSWAP_IO_SWITCH
+   bool "Compressed cache for swap pages according to the IO status"
+   depends on ZSWAP
+   def_bool n
+   help
+ This function helps systems where normal swap speed is higher
+ than zswap speed to handle the swap IO issue.
+ For example, a VM whose disk device has no cache config set, or
+ has cache=writeback.
+
+ This function makes zswap just work when the disk of the swap file
+ is under high IO load.
+ It adds two parameters read_in_flight_limit and write_in_flight_limit to
+ zswap.  When zswap is enabled, pages will be stored to zswap only
+ when the IO in flight number of swap device is bigger than
+ zswap_read_in_flight_limit or zswap_write_in_flight_limit.
+ If unsure, say "n".
+
 config ZPOOL
tristate "Common API for compressed memory storage"
help
diff --git a/mm/page_io.c b/mm/page_io.c
index 24ee600..e66b050 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -434,3 +434,19 @@ int swap_set_page_dirty(struct page *page)
return __set_page_dirty_no_writeback(page);
}
 }
+
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+void swap_io_in_flight(struct page *page, unsigned int inflight[2])
+{
+   struct swap_info_struct *sis = page_swap_info(page);
+
+   if (!sis->bdev) {
+   inflight[0] = 0;
+   inflight[1] = 0;
+   return;
+   }
+
+   part_in_flight_rw(bdev_get_queue(sis->bdev), sis->bdev->bd_part,
+ inflight);
+}
+#endif
diff --git a/mm/zswap.c b/mm/zswap.c
index 0e22744..1255645 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -62,6 +62,13 @@ static u64 zswap_reject_compress_poor;
 static u64 zswap_reject_alloc_fail;
 /* Store failed because the entry metadata could not be allocated (rare) */
 static u64 zswap_reject_kmemcache_fail;
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+/* Store failed because zswap_read_in_flight_limit or
+ * zswap_write_in_flight_limit is bigger than IO in flight number of
+ * swap device
+ */
+static u64 zswap_reject_io;
+#endif
 /* Duplicate store was encountered (rare) */
 static u64 zswap_duplicate_entry;
 
@@ -114,6 +121,22 @@ static bool zswap_same_filled_pages_enabled = true;
 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
   bool, 0644);
 
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+/* zswap will not try to store the page if zswap_read_in_flight_limit is
+ * bigger than IO read in flight number of swap device
+ */
+static unsigned int zswap_read_in_flight_limit;
+module_param_named(read_i

[PATCH for vm-scalability] usemem: Add new option -Z|--read-again

2019-09-13 Thread Hui Zhu
With this option, usemem will read the memory again after accessing it.
It can help test the speed of loading pages from swap back into memory.
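
For reference, a hypothetical run (sizes are only an example): allocate
more anonymous memory than the machine has free so that part of it is
pushed out to swap before the -Z pass reads it back in; the extra
"read again ..." line printed by this patch then reflects the swap-in
speed.

swapon /swapfile
./usemem -Z -a -n 1 $((3 * 1024 * 1024 * 1024))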

Signed-off-by: Hui Zhu 
---
 usemem.c | 46 --
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/usemem.c b/usemem.c
index 264d52a..2d31946 100644
--- a/usemem.c
+++ b/usemem.c
@@ -94,6 +94,7 @@ int opt_sync_rw = 0;
 int opt_sync_free = 0;
 int opt_bind_interval = 0;
 unsigned long opt_delay = 0;
+int opt_read_again = 0;
 int nr_task;
 int nr_thread;
 int nr_cpu;
@@ -151,6 +152,7 @@ void usage(int ok)
"-e|--delay  delay for each page in ns\n"
"-O|--anonymous  mmap with MAP_ANONYMOUS\n"
"-U|--hugetlballocate hugetlbfs page\n"
+   "-Z|--read-again read memory again after access the memory\n"
"-h|--help   show this message\n"
,   ourname);
 
@@ -188,6 +190,7 @@ static const struct option opts[] = {
{ "sync-rw" , 0, NULL, 'y' },
{ "delay"   , 1, NULL, 'e' },
{ "hugetlb" , 0, NULL, 'U' },
+   { "read-again"  , 0, NULL, 'Z' },
{ "help", 0, NULL, 'h' },
{ NULL  , 0, NULL, 0 }
 };
@@ -616,7 +619,7 @@ unsigned long do_unit(unsigned long bytes, struct 
drand48_data *rand_data,
return rw_bytes;
 }
 
-static void output_statistics(unsigned long unit_bytes)
+static void output_statistics(unsigned long unit_bytes, const char *intro)
 {
struct timeval stop;
char buf[1024];
@@ -629,8 +632,8 @@ static void output_statistics(unsigned long unit_bytes)
(stop.tv_usec - start_time.tv_usec);
throughput = ((unit_bytes * 100ULL) >> 10) / delta_us;
len = snprintf(buf, sizeof(buf),
-   "%lu bytes / %lu usecs = %lu KB/s\n",
-   unit_bytes, delta_us, throughput);
+   "%s%lu bytes / %lu usecs = %lu KB/s\n",
+   intro, unit_bytes, delta_us, throughput);
fflush(stdout);
write(1, buf, len);
 }
@@ -690,7 +693,34 @@ long do_units(void)
} while (bytes);
 
if (!opt_write_signal_read && unit_bytes)
-   output_statistics(unit_bytes);
+   output_statistics(unit_bytes, "");
+
+   if (opt_read_again && unit_bytes) {
+   unsigned long rw_bytes = 0;
+
+   gettimeofday(&start_time, NULL);
+   for (i = 0; i < nptr; i++) {
+   int rep;
+
+   for (rep = 0; rep < reps; rep++) {
+   if (rep > 0 && !quiet) {
+   printf(".");
+   fflush(stdout);
+   }
+
+   rw_bytes += do_rw_once(ptrs[i], lens[i], &rand_data, 1, &rep, reps);
+
+   if (msync_mode) {
+   if ((msync(ptrs[i], lens[i], msync_mode)) == -1) {
+   fprintf(stderr, "msync failed with error %s \n", strerror(errno));
+   exit(1);
+   }
+   }
+   }
+   }
+
+   output_statistics(rw_bytes, "read again ");
+   }
 
if (opt_write_signal_read) {
struct sigaction act;
@@ -731,7 +761,7 @@ long do_units(void)
sigsuspend(&set);
gettimeofday(&start_time, NULL);
unit_bytes = do_rw_once(buffer, opt_bytes, &rand_data, 1, NULL, 0);
-   output_statistics(unit_bytes);
+   output_statistics(unit_bytes, "");
}
 
if (opt_sync_free)
@@ -879,7 +909,7 @@ int main(int argc, char *argv[])
pagesize = getpagesize();
 
while ((c = getopt_long(argc, argv,
-   "aAB:f:FPp:gqowRMm:n:t:b:ds:T:Sr:u:j:e:EHDNLWyxOUh", opts, NULL)) != -1)
+   "aAB:f:FPp:gqowRMm:n:t:b:ds:T:Sr:u:j:e:EHDNLWyxOUZh", opts, NULL)) != -1)
{
switch (c) {
case 'a':
@@ -1005,6 +1035,10 @@ int main(int argc, char *argv[])
map_hugetlb = MAP_HUGETLB | MAP_HUGE_2MB;
break;
 
+   case 'Z':
+   opt_read_again = 1;
+   break;
+
default:
usage(1);
}
-- 
2.7.4



[PATCH] zswap: Add CONFIG_ZSWAP_IO_SWITCH

2019-09-11 Thread Hui Zhu
I use zswap to handle the swap IO issue in a VM that uses a swap file.
This VM has 4G memory and 2 CPUs.  And I set up 4G swap in /swapfile.
This is the test script:
cat 1.sh
./usemem --sleep 3600 -M -a -n 1 $((3 * 1024 * 1024 * 1024)) &
sleep 10
echo 1 > /proc/sys/vm/drop_caches
./usemem -S -f /test2 $((2 * 1024 * 1024 * 1024)) &
while [ True ]; do ./usemem -a -n 1 $((1 * 1024 * 1024 * 1024)); done

Without ZSWAP:
echo 100 > /proc/sys/vm/swappiness
swapon /swapfile
sh 1.sh
...
...
1207959552 bytes / 2076479 usecs = 568100 KB/s
61088 usecs to free memory
1207959552 bytes / 2035439 usecs = 579554 KB/s
55073 usecs to free memory
2415919104 bytes / 24054408 usecs = 98081 KB/s
3741 usecs to free memory
1207959552 bytes / 1954371 usecs = 603594 KB/s
53161 usecs to free memory
...
...

With ZSWAP:
echo 100 > /proc/sys/vm/swappiness
swapon /swapfile
echo lz4 > /sys/module/zswap/parameters/compressor
echo zsmalloc > /sys/module/zswap/parameters/zpool
echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled
echo 20 > /sys/module/zswap/parameters/max_pool_percent
echo 1 > /sys/module/zswap/parameters/enabled
sh 1.sh
1207959552 bytes / 3619283 usecs = 325934 KB/s
194825 usecs to free memory
1207959552 bytes / 3439563 usecs = 342964 KB/s
218419 usecs to free memory
2415919104 bytes / 19508762 usecs = 120935 KB/s
5632 usecs to free memory
1207959552 bytes / 3329369 usecs = 354315 KB/s
179764 usecs to free memory

The normal IO speed is increased from 98081 KB/s to 120935 KB/s.
But I found 2 issues with zswap on this machine:
1. Because the VM's disk has a file cache in the host layer,
   normal swap speed is higher than with zswap.
2. Because zswap needs to allocate memory to store the compressed pages,
   it makes the effective memory capacity worse.
For example:
The command "./usemem -a -n 1 $((7 * 1024 * 1024 * 1024))" requests 7G of
memory from this machine.
It works OK without zswap but gets OOM when zswap is enabled.

This commit adds CONFIG_ZSWAP_IO_SWITCH, which tries to handle these
issues while still letting zswap save IO.
It adds two parameters, read_in_flight_limit and write_in_flight_limit,
to zswap.
When zswap is enabled, zswap_frontswap_store stores pages to zswap only
when the IO in flight number of the swap device is bigger than
zswap_read_in_flight_limit or zswap_write_in_flight_limit.
So normal swap is used while the IO in flight number of the swap device
is low, and zswap only kicks in when the disk is under high IO load.

This is the test result:
echo 100 > /proc/sys/vm/swappiness
swapon /swapfile
echo lz4 > /sys/module/zswap/parameters/compressor
echo zsmalloc > /sys/module/zswap/parameters/zpool
echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled
echo 20 > /sys/module/zswap/parameters/max_pool_percent
echo 1 > /sys/module/zswap/parameters/enabled
echo 3 > /sys/module/zswap/parameters/read_in_flight_limit
echo 50 > /sys/module/zswap/parameters/write_in_flight_limit
sh 1.sh
...
1207959552 bytes / 2320861 usecs = 508280 KB/s
106164 usecs to free memory
1207959552 bytes / 2343916 usecs = 503280 KB/s
79386 usecs to free memory
2415919104 bytes / 20136015 usecs = 117167 KB/s
4411 usecs to free memory
1207959552 bytes / 1833403 usecs = 643419 KB/s
70452 usecs to free memory
...
killall usemem
./usemem -a -n 1 $((7 * 1024 * 1024 * 1024))
8455716864 bytes / 14457505 usecs = 571159 KB/s
365961 usecs to free memory

Signed-off-by: Hui Zhu 
---
 include/linux/swap.h |  3 +++
 mm/Kconfig   | 11 +++
 mm/page_io.c | 16 +++
 mm/zswap.c   | 55 
 4 files changed, 85 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index de2c67a..82b621f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -389,6 +389,9 @@ extern void end_swap_bio_write(struct bio *bio);
 extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func);
 extern int swap_set_page_dirty(struct page *page);
+#ifdef CONFIG_ZSWAP_IO_SWITCH
+extern void swap_io_in_flight(struct page *page, unsigned int inflight[2]);
+#endif
 
 int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block);
diff --git a/mm/Kconfig b/mm/Kconfig
index 56cec63..d077e51 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -546,6 +546,17 @@ config ZSWAP
  they have not be fully explored on the large set of potential
  configurations and workloads that exist.
 
+config ZSWAP_IO_SWITCH
+   bool "Compressed cache for swap pages according to the IO status"
+   depends on ZSWAP
+   def_bool n
+   help
+ Add two parameters read_in_flight_limit and write_in_flight_limit to
+ ZSWAP.  When ZSWAP is enabled, pages will be stored to zswap only
+ when the IO in flight number of swap device is bigger than
+ zswap_read_in_flight_limit or zswap_write_in_flight_limit.
+ If 

[PATCH V3 2/2] zswap: Use movable memory if zpool support allocate movable memory

2019-06-05 Thread Hui Zhu
 usecs = 557535 KB/s
2717908992 bytes / 4803621 usecs = 552543 KB/s
2717908992 bytes / 5069828 usecs = 523530 KB/s
431546 usecs to free memory
383397 usecs to free memory
456454 usecs to free memory
224487 usecs to free memory
/home/teawater/kernel/vm-scalability# cat /proc/pagetypeinfo
Page block order: 9
Pages per block:  512

Free pages count per migrate type at order   0  1  2  3  4  
5  6  7  8  9 10
Node0, zone  DMA, typeUnmovable  1  1  1  0  2  
1  1  0  1  0  0
Node0, zone  DMA, type  Movable  0  0  0  0  0  
0  0  0  0  1  3
Node0, zone  DMA, type  Reclaimable  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, typeUnmovable 10  8 10  9 10  
4  3  2  3  0  0
Node0, zoneDMA32, type  Movable 18 12 14 16 16  
   11  9  5  5  6775
Node0, zoneDMA32, type  Reclaimable  0  0  0  0  0  
0  0  0  0  0  1
Node0, zoneDMA32, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, typeUnmovable   2669   1236452118 37  
   14  4  1  2  3  0
Node0, zone   Normal, type  Movable   3850   6086   5274   4327   3510  
 2494   1520934438220470
Node0, zone   Normal, type  Reclaimable 56 93155124 47  
   31 17  7  3  0  0
Node0, zone   Normal, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0

Number of blocks type Unmovable  Movable  Reclaimable   HighAtomic  
CMA  Isolate
Node 0, zone  DMA1700   
 00
Node 0, zoneDMA324 165020   
 00
Node 0, zone   Normal   79 2326   260   
 00

You can see that the number of unmovable page blocks is decreased
when the kernel has this commit.

Signed-off-by: Hui Zhu 
---
 mm/zswap.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index a4e4d36ec085..c6bf92bf5890 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1006,6 +1006,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t 
offset,
char *buf;
u8 *src, *dst;
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+   gfp_t gfp;
 
/* THP isn't supported */
if (PageTransHuge(page)) {
@@ -1079,9 +1080,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t 
offset,
 
/* store */
hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
-   ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
-  __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
-  &handle);
+   gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+   if (zpool_malloc_support_movable(entry->pool->zpool))
+   gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+   ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
-- 
2.21.0 (Apple Git-120)



[PATCH V3 1/2] zpool: Add malloc_support_movable to zpool_driver

2019-06-05 Thread Hui Zhu
As a zpool_driver, zsmalloc can allocate movable memory because it
supports migrating pages.
But zbud and z3fold cannot allocate movable memory.

This commit adds malloc_support_movable to zpool_driver.
A zpool_driver that supports allocating movable memory sets it to true.
It also adds zpool_malloc_support_movable(), which checks
malloc_support_movable to tell whether a zpool supports allocating
movable memory.
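
A minimal caller sketch of the new helper (example_store is a
hypothetical function; the gfp combination mirrors the zswap user in the
companion patch of this series):

/* Sketch: a zpool user picking the gfp mask based on driver capability. */
static int example_store(struct zpool *pool, size_t size, unsigned long *handle)
{
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;

	/* Only ask for movable backing pages if the driver can migrate them. */
	if (zpool_malloc_support_movable(pool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;

	return zpool_malloc(pool, size, gfp, handle);
}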

Signed-off-by: Hui Zhu 
---
 include/linux/zpool.h |  3 +++
 mm/zpool.c| 16 
 mm/zsmalloc.c | 19 ++-
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 7238865e75b0..51bf43076165 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool);
 
 void zpool_destroy_pool(struct zpool *pool);
 
+bool zpool_malloc_support_movable(struct zpool *pool);
+
 int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
unsigned long *handle);
 
@@ -90,6 +92,7 @@ struct zpool_driver {
struct zpool *zpool);
void (*destroy)(void *pool);
 
+   bool malloc_support_movable;
int (*malloc)(void *pool, size_t size, gfp_t gfp,
unsigned long *handle);
void (*free)(void *pool, unsigned long handle);
diff --git a/mm/zpool.c b/mm/zpool.c
index a2dd9107857d..863669212070 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -238,6 +238,22 @@ const char *zpool_get_type(struct zpool *zpool)
return zpool->driver->type;
 }
 
+/**
+ * zpool_malloc_support_movable() - Check if the zpool support
+ * allocate movable memory
+ * @zpool: The zpool to check
+ *
+ * This returns whether the zpool supports allocating movable memory.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: true if the zpool supports allocating movable memory, false if not
+ */
+bool zpool_malloc_support_movable(struct zpool *zpool)
+{
+   return zpool->driver->malloc_support_movable;
+}
+
 /**
  * zpool_malloc() - Allocate memory
  * @zpool: The zpool to allocate from.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0787d33b80d8..8f3d9a4d46f4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -437,15 +437,16 @@ static u64 zs_zpool_total_size(void *pool)
 }
 
 static struct zpool_driver zs_zpool_driver = {
-   .type = "zsmalloc",
-   .owner =THIS_MODULE,
-   .create =   zs_zpool_create,
-   .destroy =  zs_zpool_destroy,
-   .malloc =   zs_zpool_malloc,
-   .free = zs_zpool_free,
-   .map =  zs_zpool_map,
-   .unmap =zs_zpool_unmap,
-   .total_size =   zs_zpool_total_size,
+   .type =   "zsmalloc",
+   .owner =  THIS_MODULE,
+   .create = zs_zpool_create,
+   .destroy =zs_zpool_destroy,
+   .malloc_support_movable = true,
+   .malloc = zs_zpool_malloc,
+   .free =   zs_zpool_free,
+   .map =zs_zpool_map,
+   .unmap =  zs_zpool_unmap,
+   .total_size = zs_zpool_total_size,
 };
 
 MODULE_ALIAS("zpool-zsmalloc");
-- 
2.21.0 (Apple Git-120)



Re: [PATCH V2 2/2] zswap: Add module parameter malloc_movable_if_support

2019-06-04 Thread Hui Zhu
Shakeel Butt  wrote on Wed, Jun 5, 2019 at 1:12 AM:
>
> On Sun, Jun 2, 2019 at 2:47 AM Hui Zhu  wrote:
> >
> > This is the second version that was updated according to the comments
> > from Sergey Senozhatsky in https://lkml.org/lkml/2019/5/29/73
> >
> > zswap compresses swap pages into a dynamically allocated RAM-based
> > memory pool.  The memory pool should be zbud, z3fold or zsmalloc.
> > All of them will allocate unmovable pages.  It will increase the
> > number of unmovable page blocks that will bad for anti-fragment.
> >
> > zsmalloc support page migration if request movable page:
> > handle = zs_malloc(zram->mem_pool, comp_len,
> > GFP_NOIO | __GFP_HIGHMEM |
> > __GFP_MOVABLE);
> >
> > And commit "zpool: Add malloc_support_movable to zpool_driver" add
> > zpool_malloc_support_movable check malloc_support_movable to make
> > sure if a zpool support allocate movable memory.
> >
> > This commit adds module parameter malloc_movable_if_support to enable
> > or disable zpool allocate block with gfp __GFP_HIGHMEM | __GFP_MOVABLE
> > if it support allocate movable memory (disabled by default).
> >
> > Following part is test log in a pc that has 8G memory and 2G swap.
> >
> > When it disabled:
> >  echo lz4 > /sys/module/zswap/parameters/compressor
> >  echo zsmalloc > /sys/module/zswap/parameters/zpool
> >  echo 1 > /sys/module/zswap/parameters/enabled
> >  swapon /swapfile
> >  cd /home/teawater/kernel/vm-scalability/
> > /home/teawater/kernel/vm-scalability# export unit_size=$((9 * 1024 * 1024 * 
> > 1024))
> > /home/teawater/kernel/vm-scalability# ./case-anon-w-seq
> > 2717908992 bytes / 3977932 usecs = 667233 KB/s
> > 2717908992 bytes / 4160702 usecs = 637923 KB/s
> > 2717908992 bytes / 4354611 usecs = 609516 KB/s
> > 293359 usecs to free memory
> > 340304 usecs to free memory
> > 205781 usecs to free memory
> > 2717908992 bytes / 5588016 usecs = 474982 KB/s
> > 166124 usecs to free memory
> > /home/teawater/kernel/vm-scalability# cat /proc/pagetypeinfo
> > Page block order: 9
> > Pages per block:  512
> >
> > Free pages count per migrate type at order   0  1  2  3 
> >  4  5  6  7  8  9 10
> > Node0, zone  DMA, typeUnmovable  1  1  1  0 
> >  2  1  1  0  1  0  0
> > Node0, zone  DMA, type  Movable  0  0  0  0 
> >  0  0  0  0  0  1  3
> > Node0, zone  DMA, type  Reclaimable  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zone  DMA, type   HighAtomic  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zone  DMA, type  CMA  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zone  DMA, type  Isolate  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zoneDMA32, typeUnmovable  5 10  9  8 
> >  8  5  1  2  3  0  0
> > Node0, zoneDMA32, type  Movable 15 16 14 12 
> > 14 10  9  6  6  5776
> > Node0, zoneDMA32, type  Reclaimable  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zoneDMA32, type   HighAtomic  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zoneDMA32, type  CMA  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zoneDMA32, type  Isolate  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zone   Normal, typeUnmovable   7097   6914   6473   5642   
> > 4373   2664   1220319 78  4  0
> > Node0, zone   Normal, type  Movable   2092   3216   2820   2266   
> > 1585946559359237258378
> > Node0, zone   Normal, type  Reclaimable 47 88122 80 
> > 34  9  5  4  2  1  2
> > Node0, zone   Normal, type   HighAtomic  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zone   Normal, type  CMA  0  0  0  0 
> >  0  0  0  0  0  0  0
> > Node0, zone   Normal, type  Isolate  0  0 

[PATCH V2 2/2] zswap: Add module parameter malloc_movable_if_support

2019-06-02 Thread Hui Zhu
n-w-seq
2717908992 bytes / 4721401 usecs = 562165 KB/s
2717908992 bytes / 4783167 usecs = 554905 KB/s
2717908992 bytes / 4802125 usecs = 552715 KB/s
2717908992 bytes / 4866579 usecs = 545395 KB/s
323605 usecs to free memory
414817 usecs to free memory
458576 usecs to free memory
355827 usecs to free memory
/home/teawater/kernel/vm-scalability# cat /proc/pagetypeinfo
Page block order: 9
Pages per block:  512

Free pages count per migrate type at order   0  1  2  3  4  
5  6  7  8  9 10
Node0, zone  DMA, typeUnmovable  1  1  1  0  2  
1  1  0  1  0  0
Node0, zone  DMA, type  Movable  0  0  0  0  0  
0  0  0  0  1  3
Node0, zone  DMA, type  Reclaimable  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, typeUnmovable  8 10  8  7  7  
6  5  3  2  0  0
Node0, zoneDMA32, type  Movable 23 21 18 15 13  
   14 14 10 11  6766
Node0, zoneDMA32, type  Reclaimable  0  0  0  0  0  
0  0  0  0  0  1
Node0, zoneDMA32, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, typeUnmovable   2660   1295460102 11  
5  3 11  2  4  0
Node0, zone   Normal, type  Movable   4178   5760   5045   4137   3324  
 2306   1482930497254460
Node0, zone   Normal, type  Reclaimable 50 83114 93 28  
   12 10  6  3  3  0
Node0, zone   Normal, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0

Number of blocks type Unmovable  Movable  Reclaimable   HighAtomic  
CMA  Isolate
Node 0, zone  DMA1700   
 00
Node 0, zoneDMA324 165020   
 00
Node 0, zone   Normal   81 2325   250   
 00

You can see that the number of unmovable page blocks is decreased
when malloc_movable_if_support is enabled.
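
Since the knob is a plain module parameter (see the module_param_cb()
call in the diff below), it can be flipped at runtime before a test run,
for example:

echo 1 > /sys/module/zswap/parameters/malloc_movable_if_support
cat /sys/module/zswap/parameters/malloc_movable_if_support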

Signed-off-by: Hui Zhu 
---
 mm/zswap.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index a4e4d36ec085..2fc45de92383 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -123,6 +123,13 @@ static bool zswap_same_filled_pages_enabled = true;
 module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
   bool, 0644);
 
+/* Enable/disable zpool allocate block with gfp __GFP_HIGHMEM | __GFP_MOVABLE
+ * if it support allocate movable memory (disabled by default).
+ */
+static bool __read_mostly zswap_malloc_movable_if_support;
+module_param_cb(malloc_movable_if_support, ¶m_ops_bool,
+   &zswap_malloc_movable_if_support, 0644);
+
 /*
 * data structures
 **/
@@ -1006,6 +1013,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t 
offset,
char *buf;
u8 *src, *dst;
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+   gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
 
/* THP isn't supported */
if (PageTransHuge(page)) {
@@ -1079,9 +1087,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t 
offset,
 
/* store */
hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
-   ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
-  __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
-  &handle);
+   if (zswap_malloc_movable_if_support &&
+   zpool_malloc_support_movable(entry->pool->zpool)) {
+   gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+   }

[PATCH V2 1/2] zpool: Add malloc_support_movable to zpool_driver

2019-06-02 Thread Hui Zhu
As a zpool_driver, zsmalloc can allocate movable memory because it
supports migrating pages.
But zbud and z3fold cannot allocate movable memory.

This commit adds malloc_support_movable to zpool_driver.
A zpool_driver that supports allocating movable memory sets it to true.
It also adds zpool_malloc_support_movable(), which checks
malloc_support_movable to tell whether a zpool supports allocating
movable memory.

Signed-off-by: Hui Zhu 
---
 include/linux/zpool.h |  3 +++
 mm/zpool.c| 16 
 mm/zsmalloc.c | 19 ++-
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 7238865e75b0..51bf43076165 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool);
 
 void zpool_destroy_pool(struct zpool *pool);
 
+bool zpool_malloc_support_movable(struct zpool *pool);
+
 int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
unsigned long *handle);
 
@@ -90,6 +92,7 @@ struct zpool_driver {
struct zpool *zpool);
void (*destroy)(void *pool);
 
+   bool malloc_support_movable;
int (*malloc)(void *pool, size_t size, gfp_t gfp,
unsigned long *handle);
void (*free)(void *pool, unsigned long handle);
diff --git a/mm/zpool.c b/mm/zpool.c
index a2dd9107857d..863669212070 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -238,6 +238,22 @@ const char *zpool_get_type(struct zpool *zpool)
return zpool->driver->type;
 }
 
+/**
+ * zpool_malloc_support_movable() - Check if the zpool support
+ * allocate movable memory
+ * @zpool: The zpool to check
+ *
+ * This returns whether the zpool supports allocating movable memory.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: true if the zpool supports allocating movable memory, false if not
+ */
+bool zpool_malloc_support_movable(struct zpool *zpool)
+{
+   return zpool->driver->malloc_support_movable;
+}
+
 /**
  * zpool_malloc() - Allocate memory
  * @zpool: The zpool to allocate from.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0787d33b80d8..8f3d9a4d46f4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -437,15 +437,16 @@ static u64 zs_zpool_total_size(void *pool)
 }
 
 static struct zpool_driver zs_zpool_driver = {
-   .type = "zsmalloc",
-   .owner =THIS_MODULE,
-   .create =   zs_zpool_create,
-   .destroy =  zs_zpool_destroy,
-   .malloc =   zs_zpool_malloc,
-   .free = zs_zpool_free,
-   .map =  zs_zpool_map,
-   .unmap =zs_zpool_unmap,
-   .total_size =   zs_zpool_total_size,
+   .type =   "zsmalloc",
+   .owner =  THIS_MODULE,
+   .create = zs_zpool_create,
+   .destroy =zs_zpool_destroy,
+   .malloc_support_movable = true,
+   .malloc = zs_zpool_malloc,
+   .free =   zs_zpool_free,
+   .map =zs_zpool_map,
+   .unmap =  zs_zpool_unmap,
+   .total_size = zs_zpool_total_size,
 };
 
 MODULE_ALIAS("zpool-zsmalloc");
-- 
2.20.1 (Apple Git-117)



[UPSTREAM KERNEL] mm/zsmalloc.c: Add module parameter malloc_force_movable

2019-05-28 Thread Hui Zhu
m-scalability# cat /proc/pagetypeinfo
Page block order: 9
Pages per block:  512

Free pages count per migrate type at order   0  1  2  3  4  
5  6  7  8  9 10
Node0, zone  DMA, typeUnmovable  1  1  1  0  2  
1  1  0  1  0  0
Node0, zone  DMA, type  Movable  0  0  0  0  0  
0  0  0  0  1  3
Node0, zone  DMA, type  Reclaimable  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone  DMA, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, typeUnmovable  9 15 13 10 13  
9  3  2  2  0  0
Node0, zoneDMA32, type  Movable 16 19 10 14 17  
   17 16  8  5  6775
Node0, zoneDMA32, type  Reclaimable  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zoneDMA32, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, typeUnmovable   2525   1347603181 55  
   14  4  1  6  0  0
Node0, zone   Normal, type  Movable   5255   6069   5007   3978   2885  
 1940   1164732485276511
Node0, zone   Normal, type  Reclaimable103104140 87 31  
   21  7  3  2  1  1
Node0, zone   Normal, type   HighAtomic  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, type  CMA  0  0  0  0  0  
0  0  0  0  0  0
Node0, zone   Normal, type  Isolate  0  0  0  0  0  
0  0  0  0  0  0

Number of blocks type Unmovable  Movable  Reclaimable   HighAtomic  
CMA  Isolate
Node 0, zone  DMA1700   
 00
Node 0, zoneDMA324 165200   
 00
Node 0, zone   Normal   78 2330   230   
 00

You can see that the number of unmovable page blocks is decreased
when malloc_force_movable is enabled.
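
The switch is a zsmalloc module parameter, so (assuming the usual sysfs
module parameter path) it can be enabled at runtime with:

echo 1 > /sys/module/zsmalloc/parameters/malloc_force_movable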

Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0787d33b80d8..7d44c7ccd882 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -178,6 +178,13 @@ static struct dentry *zs_stat_root;
 static struct vfsmount *zsmalloc_mnt;
 #endif
 
+/* Enable/disable zs_malloc force allocate block with
+ *  gfp __GFP_HIGHMEM | __GFP_MOVABLE (disabled by default).
+ */
+static bool __read_mostly zs_malloc_force_movable;
+module_param_cb(malloc_force_movable, ¶m_ops_bool,
+   &zs_malloc_force_movable, 0644);
+
 /*
  * We assign a page to ZS_ALMOST_EMPTY fullness group when:
  * n <= N / f, where
@@ -1479,6 +1486,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t 
size, gfp_t gfp)
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
 
+   if (zs_malloc_force_movable)
+   gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+
handle = cache_alloc_handle(pool, gfp);
if (!handle)
return 0;
-- 
2.20.1 (Apple Git-117)



[RFC 2/4] BloodTest: perf

2017-10-13 Thread Hui Zhu
This patch adds the function that calls into perf and adds bt_pages,
which records the data obtained from perf.

The interface is in "/sys/kernel/debug/bloodtest/perf".
"on" is the switch.  When it is set to 1, accessing "test" will call perf.
"perf_config", "perf_freq", "perf_period" and "perf_type" set the perf
options.
After recording, accessing "str" returns the recorded data as a string.
Accessing "cpu0/page" returns the recorded data in binary; its format is
described in "bin_format".
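
A possible walk-through of that interface, assuming debugfs is mounted
at /sys/kernel/debug and using the file names described above (the exact
layout of the per-cpu files is taken from the description, not verified
against the code):

cd /sys/kernel/debug/bloodtest
echo 1 > perf/on     # arm perf sampling for the next test
cat test             # run the one-second record
cat perf/str         # recorded perf data as text
cat perf/cpu0/page > /tmp/bt-perf-cpu0.bin   # raw per-cpu records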

Signed-off-by: Hui Zhu 
---
 kernel/bloodtest/Makefile   |   4 +-
 kernel/bloodtest/core.c |  76 +++---
 kernel/bloodtest/internal.h |  43 +++-
 kernel/bloodtest/pages.c| 266 
 kernel/bloodtest/perf.c | 591 
 5 files changed, 931 insertions(+), 49 deletions(-)
 create mode 100644 kernel/bloodtest/pages.c
 create mode 100644 kernel/bloodtest/perf.c

diff --git a/kernel/bloodtest/Makefile b/kernel/bloodtest/Makefile
index 7f289af..79b7ea0 100644
--- a/kernel/bloodtest/Makefile
+++ b/kernel/bloodtest/Makefile
@@ -1 +1,3 @@
-obj-y  = core.o kernel_stat.o
+obj-y  = core.o pages.o kernel_stat.o
+
+obj-$(CONFIG_PERF_EVENTS) += perf.o
diff --git a/kernel/bloodtest/core.c b/kernel/bloodtest/core.c
index 7b39cbb..5ba800c 100644
--- a/kernel/bloodtest/core.c
+++ b/kernel/bloodtest/core.c
@@ -6,31 +6,17 @@
 
 #include "internal.h"
 
-enum bt_stat_enum bt_stat;
-DEFINE_SPINLOCK(bt_lock);
+DECLARE_RWSEM(bt_lock);
 
 static DECLARE_WAIT_QUEUE_HEAD(bt_wq);
 static struct hrtimer bt_timer;
 static ktime_t bt_ktime;
-
-static bool is_bt_stat(enum bt_stat_enum stat)
-{
-   unsigned long flags;
-   bool ret = false;
-
-   spin_lock_irqsave(&bt_lock, flags);
-   if (bt_stat == stat)
-   ret = true;
-   spin_unlock_irqrestore(&bt_lock, flags);
-
-   return ret;
-}
+static bool bt_timer_stop;
 
 /* This function must be called under the protection of bt_lock.  */
 static void bt_insert(void)
 {
-   bt_stat = bt_running;
-
+   bt_insert_perf();
bt_insert_kernel_stat();
 }
 
@@ -38,8 +24,13 @@ static void bt_insert(void)
 static void bt_pullout(void)
 {
bt_pullout_kernel_stat();
+   bt_pullout_perf();
+}
 
-   bt_stat = bt_done;
+/* This function must be called under the protection of bt_lock.  */
+static void bt_task_pullout(void)
+{
+   bt_task_pullout_perf();
 }
 
 /* This function must be called under the protection of bt_lock.  */
@@ -50,38 +41,33 @@ static void bt_report(struct seq_file *p)
 
 static enum hrtimer_restart bt_timer_fn(struct hrtimer *data)
 {
-   spin_lock(&bt_lock);
bt_pullout();
-   spin_unlock(&bt_lock);
 
-   wake_up_interruptible_all(&bt_wq);
+   bt_timer_stop = true;
+   wake_up_all(&bt_wq);
 
return HRTIMER_NORESTART;
 }
 
-static int test_show(struct seq_file *p, void *v)
+static int test_show(struct seq_file *p, void *unused)
 {
-   int ret = 0;
+   down_write(&bt_lock);
 
-   spin_lock(&bt_lock);
-   if (bt_stat == bt_running)
-   goto wait;
+   bt_timer_stop = false;
 
-   hrtimer_start(&bt_timer, bt_ktime, HRTIMER_MODE_REL);
bt_insert();
+   hrtimer_start(&bt_timer, bt_ktime, HRTIMER_MODE_REL);
 
-wait:
-   spin_unlock(&bt_lock);
-   ret = wait_event_interruptible(bt_wq, is_bt_stat(bt_done));
-   if (ret)
-   goto out;
+   wait_event(bt_wq, bt_timer_stop);
 
-   spin_lock(&bt_lock);
-   bt_report(p);
-   spin_unlock(&bt_lock);
+   bt_task_pullout();
+   up_write(&bt_lock);
 
-out:
-   return ret;
+   down_read(&bt_lock);
+   bt_report(p);
+   up_read(&bt_lock);
+   
+   return 0;
 }
 
 static int test_open(struct inode *inode, struct file *file)
@@ -98,20 +84,28 @@ static int test_open(struct inode *inode, struct file *file)
 
 static int __init bt_init(void)
 {
-   struct dentry *d, *t;
+   int ret = -ENOMEM;
+   struct dentry *d = NULL, *t = NULL;
 
d = debugfs_create_dir("bloodtest", NULL);
if (!d)
-   return -ENOMEM;
+   goto out;
t = debugfs_create_file("test", S_IRUSR, d, NULL, &test_fops);
if (!t)
-   return -ENOMEM;
+   goto out;
 
hrtimer_init(&bt_timer, CLOCK_REALTIME, HRTIMER_MODE_REL);
bt_timer.function = bt_timer_fn;
bt_ktime = ktime_set(1, 0);
 
-   return 0;
+   ret = bt_perf_init(d);
+
+out:
+   if (ret != 0) {
+   debugfs_remove(t);
+   debugfs_remove(d);
+   }
+   return ret;
 }
 
 core_initcall(bt_init);
diff --git a/kernel/bloodtest/internal.h b/kernel/bloodtest/internal.h
index 48faf4d..f6befc4 100644
--- a/kernel/bloodtest/internal.h
+++ 

[RFC 0/4] BloodTest: kernel status

2017-10-13 Thread Hui Zhu
BloodTest: an interface to call other analysing tools

The Linux kernel has a lot of analysing tools: perf, ftrace, systemtap,
KGTP and so on.
And the kernel also supplies a lot of internal values through procfs and
sysfs to analyse performance.

Sometimes users need to get performance information quickly, with low
overhead and full coverage.
BloodTest is for that.
It is an interface that can access the functions of other analysing tools
and record to an internal buffer that users or applications can access
very quickly (via mmap).

Now, BloodTest only supports recording CPU, perf and task information
over one second.

Hui Zhu (4):
BloodTest: kernel status
BloodTest: perf
Module: add /proc/modules_update_version
BloodTest: task

 fs/proc/stat.c |8 
 include/linux/bloodtest.h  |   10 
 include/linux/kernel_stat.h|3 
 init/Kconfig   |3 
 kernel/Makefile|2 
 kernel/bloodtest/Makefile  |3 
 kernel/bloodtest/core.c|  132 +
 kernel/bloodtest/internal.h|   61 
 kernel/bloodtest/kernel_stat.c |   62 
 kernel/bloodtest/pages.c   |  266 ++
 kernel/bloodtest/perf.c|  576 +
 kernel/bloodtest/task.c|  447 +++
 kernel/exit.c  |4 
 kernel/module.c|   19 +
 14 files changed, 1592 insertions(+), 4 deletions(-)


[RFC 1/4] BloodTest: kernel status

2017-10-13 Thread Hui Zhu
This patch includes the base framework of BloodTest and the function to
get the kernel status.

The interface is in "/sys/kernel/debug/bloodtest".
Accessing "test" calls bt_insert, which calls all of the start-record
functions, and registers an hrtimer that calls bt_pullout to stop
recording.

bt_insert and bt_pullout call the analysing tools.
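
Based on that description, a single sampling window can be triggered
like this (assuming debugfs is mounted in the usual place):

mount -t debugfs none /sys/kernel/debug 2>/dev/null
cat /sys/kernel/debug/bloodtest/test   # starts recording, returns after the 1s timer fires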

Signed-off-by: Hui Zhu 
---
 fs/proc/stat.c |   8 +--
 include/linux/kernel_stat.h|   3 ++
 init/Kconfig   |   3 ++
 kernel/Makefile|   2 +
 kernel/bloodtest/Makefile  |   1 +
 kernel/bloodtest/core.c| 117 +
 kernel/bloodtest/internal.h|  19 +++
 kernel/bloodtest/kernel_stat.c |  62 ++
 8 files changed, 211 insertions(+), 4 deletions(-)
 create mode 100644 kernel/bloodtest/Makefile
 create mode 100644 kernel/bloodtest/core.c
 create mode 100644 kernel/bloodtest/internal.h
 create mode 100644 kernel/bloodtest/kernel_stat.c

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index bd4e55f..c6f4fd4 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -22,7 +22,7 @@
 
 #ifdef arch_idle_time
 
-static u64 get_idle_time(int cpu)
+u64 get_idle_time(int cpu)
 {
u64 idle;
 
@@ -32,7 +32,7 @@ static u64 get_idle_time(int cpu)
return idle;
 }
 
-static u64 get_iowait_time(int cpu)
+u64 get_iowait_time(int cpu)
 {
u64 iowait;
 
@@ -44,7 +44,7 @@ static u64 get_iowait_time(int cpu)
 
 #else
 
-static u64 get_idle_time(int cpu)
+u64 get_idle_time(int cpu)
 {
u64 idle, idle_usecs = -1ULL;
 
@@ -60,7 +60,7 @@ static u64 get_idle_time(int cpu)
return idle;
 }
 
-static u64 get_iowait_time(int cpu)
+u64 get_iowait_time(int cpu)
 {
u64 iowait, iowait_usecs = -1ULL;
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 66be8b6..bf8d3f0 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -96,4 +96,7 @@ static inline void account_process_tick(struct task_struct 
*tsk, int user)
 
 extern void account_idle_ticks(unsigned long ticks);
 
+extern u64 get_idle_time(int cpu);
+extern u64 get_iowait_time(int cpu);
+
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/init/Kconfig b/init/Kconfig
index 78cb246..f63550c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1909,3 +1909,6 @@ config ASN1
  functions to call on what tags.
 
 source "kernel/Kconfig.locks"
+
+config BLOODTEST
+   bool "Blood test"
diff --git a/kernel/Makefile b/kernel/Makefile
index ed470aa..2a04e42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -103,6 +103,8 @@ obj-$(CONFIG_BPF) += bpf/
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
+obj-$(CONFIG_BLOODTEST) += bloodtest/
+
 obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
 obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
diff --git a/kernel/bloodtest/Makefile b/kernel/bloodtest/Makefile
new file mode 100644
index 000..7f289af
--- /dev/null
+++ b/kernel/bloodtest/Makefile
@@ -0,0 +1 @@
+obj-y  = core.o kernel_stat.o
diff --git a/kernel/bloodtest/core.c b/kernel/bloodtest/core.c
new file mode 100644
index 000..7b39cbb
--- /dev/null
+++ b/kernel/bloodtest/core.c
@@ -0,0 +1,117 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "internal.h"
+
+enum bt_stat_enum bt_stat;
+DEFINE_SPINLOCK(bt_lock);
+
+static DECLARE_WAIT_QUEUE_HEAD(bt_wq);
+static struct hrtimer bt_timer;
+static ktime_t bt_ktime;
+
+static bool is_bt_stat(enum bt_stat_enum stat)
+{
+   unsigned long flags;
+   bool ret = false;
+
+   spin_lock_irqsave(&bt_lock, flags);
+   if (bt_stat == stat)
+   ret = true;
+   spin_unlock_irqrestore(&bt_lock, flags);
+
+   return ret;
+}
+
+/* This function must be called under the protection of bt_lock.  */
+static void bt_insert(void)
+{
+   bt_stat = bt_running;
+
+   bt_insert_kernel_stat();
+}
+
+/* This function must be called under the protection of bt_lock.  */
+static void bt_pullout(void)
+{
+   bt_pullout_kernel_stat();
+
+   bt_stat = bt_done;
+}
+
+/* This function must be called under the protection of bt_lock.  */
+static void bt_report(struct seq_file *p)
+{
+   bt_report_kernel_stat(p);
+}
+
+static enum hrtimer_restart bt_timer_fn(struct hrtimer *data)
+{
+   spin_lock(&bt_lock);
+   bt_pullout();
+   spin_unlock(&bt_lock);
+
+   wake_up_interruptible_all(&bt_wq);
+
+   return HRTIMER_NORESTART;
+}
+
+static int test_show(struct seq_file *p, void *v)
+{
+   int ret = 0;
+
+   spin_lock(&bt_lock);
+   if (bt_stat == bt_running)
+   goto wait;
+
+   hrtimer_start(&bt_timer, bt_ktime, HRTIMER_MODE_REL);
+   bt_insert();
+
+wait:
+   spin_unlock(&bt_lock);
+   ret = wait_event_interruptible(bt_wq, is_bt_stat(bt_done));
+   if (ret)
+   goto out;
+
+   spin_lock(&bt_

[RFC 4/4] BloodTest: task

2017-10-13 Thread Hui Zhu
This patch adds the function that records how tasks use system
resources, for example CPU time, read_bytes and write_bytes.
The interface is in "/sys/kernel/debug/bloodtest/task".
"on" is the switch.  When it is set to 1, accessing "test" will record
task information.
After recording, accessing "str" returns the recorded data as a string.
Accessing "page" returns the recorded data in binary; its format is
described in "bin_format".
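
A possible session, mirroring the perf patch and assuming the file names
described above:

cd /sys/kernel/debug/bloodtest
echo 1 > task/on     # record per-task usage during the next test
cat test             # run the one-second record
cat task/str         # per-task CPU time and IO counters as text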

Signed-off-by: Hui Zhu 
---
 include/linux/bloodtest.h   |  10 +
 kernel/bloodtest/Makefile   |   2 +-
 kernel/bloodtest/core.c |  21 +++
 kernel/bloodtest/internal.h |  13 ++
 kernel/bloodtest/perf.c |  33 +---
 kernel/bloodtest/task.c | 447 
 kernel/exit.c   |   4 +
 7 files changed, 505 insertions(+), 25 deletions(-)
 create mode 100644 include/linux/bloodtest.h
 create mode 100644 kernel/bloodtest/task.c

diff --git a/include/linux/bloodtest.h b/include/linux/bloodtest.h
new file mode 100644
index 000..55f4ebc
--- /dev/null
+++ b/include/linux/bloodtest.h
@@ -0,0 +1,10 @@
+#ifndef __LINUX_BLOODTEST_H
+#define __LINUX_BLOODTEST_H
+
+#ifdef CONFIG_BLOODTEST
+extern void bt_task_exit_record(struct task_struct *p);
+#else
+static inline void bt_task_exit_record(struct task_struct *p)  { }
+#endif
+
+#endif /* __LINUX_BLOODTEST_H */
diff --git a/kernel/bloodtest/Makefile b/kernel/bloodtest/Makefile
index 79b7ea0..a6f1a7a 100644
--- a/kernel/bloodtest/Makefile
+++ b/kernel/bloodtest/Makefile
@@ -1,3 +1,3 @@
-obj-y  = core.o pages.o kernel_stat.o
+obj-y  = core.o pages.o kernel_stat.o task.o
 
 obj-$(CONFIG_PERF_EVENTS) += perf.o
diff --git a/kernel/bloodtest/core.c b/kernel/bloodtest/core.c
index 5ba800c..6cfcdf2 100644
--- a/kernel/bloodtest/core.c
+++ b/kernel/bloodtest/core.c
@@ -16,6 +16,7 @@
 /* This function must be called under the protection of bt_lock.  */
 static void bt_insert(void)
 {
+   bt_insert_task();
bt_insert_perf();
bt_insert_kernel_stat();
 }
@@ -25,6 +26,7 @@ static void bt_pullout(void)
 {
bt_pullout_kernel_stat();
bt_pullout_perf();
+   bt_pullout_task();
 }
 
 /* This function must be called under the protection of bt_lock.  */
@@ -99,13 +101,32 @@ static int __init bt_init(void)
bt_ktime = ktime_set(1, 0);
 
ret = bt_perf_init(d);
+   if (ret < 0)
+   goto out;
+
+   ret = bt_task_init(d);
 
 out:
if (ret != 0) {
debugfs_remove(t);
debugfs_remove(d);
+   pr_err("bloodtest: init get error %d\n", ret);
}
return ret;
 }
 
 core_initcall(bt_init);
+
+int bt_number_get(void *data, u64 *val)
+{
+   unsigned int *number_point = data;
+
+   down_read(&bt_lock);
+
+   *val = (u64)*number_point;
+
+   up_read(&bt_lock);
+
+   return 0;
+}
+
diff --git a/kernel/bloodtest/internal.h b/kernel/bloodtest/internal.h
index f6befc4..5aacf37 100644
--- a/kernel/bloodtest/internal.h
+++ b/kernel/bloodtest/internal.h
@@ -3,6 +3,13 @@
 
 #include 
 
+#define SHOW_FORMAT_1(p, s, entry, type, sign, size) \
+   seq_printf(p, "%s format:%s %s offset:%lu size:%lu\n", \
+  #entry, #type, sign, offsetof(s, entry), \
+  (unsigned long)size)
+#define SHOW_FORMAT(p, s, entry, type, sign) \
+   SHOW_FORMAT_1(p, s, entry, type, sign, sizeof(type))
+
 extern struct rw_semaphore bt_lock;
 
 struct bt_pages {
@@ -45,4 +52,10 @@ static inline void bt_task_pullout_perf(void)
{ }
 static inline int bt_perf_init(struct dentry *d)   { return 0; }
 #endif
 
+extern void bt_insert_task(void);
+extern void bt_pullout_task(void);
+extern int bt_task_init(struct dentry *d);
+
+extern int bt_number_get(void *data, u64 *val);
+
 #endif /* _KERNEL_BLOODTEST_INTERNAL_H */
diff --git a/kernel/bloodtest/perf.c b/kernel/bloodtest/perf.c
index cf23844..d495258 100644
--- a/kernel/bloodtest/perf.c
+++ b/kernel/bloodtest/perf.c
@@ -40,20 +40,7 @@ struct perf_rec {
 struct dentry *perf_dir;
 struct dentry *perf_str_dir;
 
-static int perf_number_get(void *data, u64 *val)
-{
-   unsigned int *number_point = data;
-
-   down_read(&bt_lock);
-
-   *val = (u64)*number_point;
-
-   up_read(&bt_lock);
-
-   return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(perf_number_fops, perf_number_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(perf_number_fops, bt_number_get, NULL, "%llu\n");
 
 static void perf_overflow_handler(struct perf_event *event,
struct perf_sample_data *data,
@@ -402,7 +389,7 @@ static int perf_event_set(void *data, u64 val)
 }
 
 DEFINE_SIMPLE_ATTRIBUTE(perf_event_fops,
-   perf_number_get,
+   bt_number_get,
perf_event_set, "%llu\n");
 
 static int perf_bin_format_show(struct seq_file *p, void *unused)
@@ 

[RFC 3/4] module: add /proc/modules_update_version

2017-10-13 Thread Hui Zhu
With "BloodTest: perf", we can get kernel addresses from "cpu0/page"
without symbols.
The application that calls BloodTest needs to translate the addresses to
symbols by itself.  For normal addresses, vmlinux is enough to get the
right symbol.  But for kernel module addresses, it also needs the module
addresses from /proc/modules.

Adding /proc/modules_update_version helps the application find out
whether the kernel module addresses have changed.
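
For example, a user-space consumer could cache /proc/modules and only
re-read it when the counter changes (a sketch, not part of the patch):

old=$(cat /proc/modules_update_version)
# ... sampling loop ...
new=$(cat /proc/modules_update_version)
if [ "$new" != "$old" ]; then
	cat /proc/modules > /tmp/modules.snapshot   # refresh module address map
	old=$new
fi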

Signed-off-by: Hui Zhu 
---
 kernel/module.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index de66ec8..ed6f370 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -317,6 +317,8 @@ struct load_info {
} index;
 };
 
+static atomic_t modules_update_version = ATOMIC_INIT(0);
+
 /*
  * We require a truly strong try_module_get(): 0 means success.
  * Otherwise an error is returned due to ongoing or failed
@@ -1020,6 +1022,9 @@ int module_refcount(struct module *mod)
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
 
free_module(mod);
+
+   atomic_inc(&modules_update_version);
+
return 0;
 out:
mutex_unlock(&module_mutex);
@@ -3183,6 +3188,8 @@ static int move_module(struct module *mod, struct 
load_info *info)
 (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
}
 
+   atomic_inc(&modules_update_version);
+
return 0;
 }
 
@@ -4196,9 +4203,21 @@ static int modules_open(struct inode *inode, struct file 
*file)
.release= seq_release,
 };
 
+static int modules_update_version_get(void *data, u64 *val)
+{
+   *val = (u64)atomic_read(&modules_update_version);
+
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(proc_modules_update_version_operations,
+   modules_update_version_get, NULL, "%llu\n");
+
 static int __init proc_modules_init(void)
 {
proc_create("modules", 0, NULL, &proc_modules_operations);
+   proc_create("modules_update_version", 0, NULL,
+   &proc_modules_update_version_operations);
return 0;
 }
 module_init(proc_modules_init);
-- 
1.9.1



Re: [RFC 0/2] Use HighAtomic against long-term fragmentation

2017-09-26 Thread Hui Zhu
2017-09-26 17:51 GMT+08:00 Mel Gorman :
> On Tue, Sep 26, 2017 at 04:46:42PM +0800, Hui Zhu wrote:
>> Current HighAtomic just to handle the high atomic page alloc.
>> But I found that use it handle the normal unmovable continuous page
>> alloc will help to against long-term fragmentation.
>>
>
> This is not wise. High-order atomic allocations do not always have a
> smooth recovery path such as network drivers with large MTUs that have no
> choice but to drop the traffic and hope for a retransmit. That's why they
> have the highatomic reserve. If the reserve is used for normal unmovable
> allocations then allocation requests that could have waited for reclaim
> may cause high-order atomic allocations to fail. Changing it may allow
> improve latencies in some limited cases while causing functional failures
> in others.  If there is a special case where there are a large number of
> other high-order allocations then I would suggest increasing min_free_kbytes
> instead as a workaround.

I think letting order-0 unmovable page allocations and higher-order
unmovable page allocations use different migrate types will help against
long-term fragmentation.

Do you think the kernel could add a special migrate type for unmovable
page allocations with order greater than 0?

Thanks,
Hui

>
> --
> Mel Gorman
> SUSE Labs


[RFC 1/2] Try to use HighAtomic if try to alloc umovable page that order is not 0

2017-09-26 Thread Hui Zhu
This patch adds a new condition to let gfp_to_alloc_flags return
alloc_flags with ALLOC_HARDER if the order is not 0 and the migratetype
is MIGRATE_UNMOVABLE.

Then unmovable page allocations with non-zero order will try to use the
HighAtomic reserve.

Signed-off-by: Hui Zhu 
---
 mm/page_alloc.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c841af8..b54e94a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3642,7 +3642,7 @@ static void wake_all_kswapds(unsigned int order, const 
struct alloc_context *ac)
 }
 
 static inline unsigned int
-gfp_to_alloc_flags(gfp_t gfp_mask)
+gfp_to_alloc_flags(gfp_t gfp_mask, int order, int migratetype)
 {
unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
 
@@ -3671,6 +3671,8 @@ static void wake_all_kswapds(unsigned int order, const 
struct alloc_context *ac)
alloc_flags &= ~ALLOC_CPUSET;
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
+   else if (order > 0 && migratetype == MIGRATE_UNMOVABLE)
+   alloc_flags |= ALLOC_HARDER;
 
 #ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
@@ -3903,7 +3905,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 * kswapd needs to be woken up, and to avoid the cost of setting up
 * alloc_flags precisely. So we do that now.
 */
-   alloc_flags = gfp_to_alloc_flags(gfp_mask);
+   alloc_flags = gfp_to_alloc_flags(gfp_mask, order, ac->migratetype);
 
/*
 * We need to recalculate the starting point for the zonelist iterator
-- 
1.9.1



[RFC 2/2] Change limit of HighAtomic from 1% to 10%

2017-09-26 Thread Hui Zhu
After "Try to use HighAtomic if try to alloc umovable page that order
is not 0", the result is still not very good because the limit of
HighAtomic prevents the kernel from reserving more pageblocks as
HighAtomic.

This patch changes max_managed from 1% to 10% so that HighAtomic can get
more pageblocks.

Signed-off-by: Hui Zhu 
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b54e94a..9322458 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2101,7 +2101,7 @@ static void reserve_highatomic_pageblock(struct page 
*page, struct zone *zone,
 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
 * Check is race-prone but harmless.
 */
-   max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+   max_managed = (zone->managed_pages / 10) + pageblock_nr_pages;
if (zone->nr_reserved_highatomic >= max_managed)
return;
 
-- 
1.9.1



[RFC 0/2] Use HighAtomic against long-term fragmentation

2017-09-26 Thread Hui Zhu
Currently HighAtomic only handles high-order atomic page allocations.
But I found that using it to handle normal unmovable contiguous page
allocations helps against long-term fragmentation.

Using HighAtomic for normal page allocation is odd.  But I really got
some good results with our internal tests and mmtests.

Do you think it is worth working on?

The patches were tested with mmtests stress-highalloc modified to do
GFP_KERNEL order-4 allocations, on 4.14.0-rc1+ in a 2-CPU VirtualBox VM
with 1G memory.
  orig  ch
Minor Faults  4565947743315623
Major Faults   319 371
Swap Ins 0   0
Swap Outs0   0
Allocation stalls0   0
DMA allocs   93518   18345
DMA32 allocs  4239569940406865
Normal allocs0   0
Movable allocs   0   0
Direct pages scanned  7056   16232
Kswapd pages scanned946174  961750
Kswapd pages reclaimed  945077  942821
Direct pages reclaimed7022   16170
Kswapd efficiency  99% 98%
Kswapd velocity   1576.3521567.977
Direct efficiency  99% 99%
Direct velocity 11.755  26.464
Percentage direct scans 0%  1%
Zone normal velocity  1588.1081594.441
Zone dma32 velocity  0.000   0.000
Zone dma velocity0.000   0.000
Page writes by reclaim   0.000   0.000
Page writes file 0   0
Page writes anon 0   0
Page reclaim immediate 405   16429
Sector Reads   2027848 2109324
Sector Writes  3386260 3299388
Page rescued immediate   0   0
Slabs scanned   867805  877005
Direct inode steals3372072
Kswapd inode steals  33911   41777
Kswapd skipped wait  0   0
THP fault alloc 30  84
THP collapse alloc 188 244
THP splits   0   0
THP fault fallback  67  51
THP collapse fail6   4
Compaction stalls  111  49
Compaction success  81  35
Compaction failures 30  14
Page migrate success 57962   43921
Page migrate failure67 183
Compaction pages isolated   117473   88823
Compaction migrate scanned   75548   50403
Compaction free scanned1454638  672310
Compaction cost 62  47
NUMA alloc hit4212949340018326
NUMA alloc miss  0   0
NUMA interleave hit  0   0
NUMA alloc local  4212949340018326
NUMA base PTE updates0   0
NUMA huge PMD updates0   0
NUMA page range updates  0   0
NUMA hint faults 0   0
NUMA hint local faults   0   0
NUMA hint local percent100 100
NUMA pages migrated  0   0
AutoNUMA cost   0%  0%

Hui Zhu (2):
Try to use HighAtomic if try to alloc umovable page that order is not 0
Change limit of HighAtomic from 1% to 10%

 page_alloc.c |8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)


Re: [PATCH v2] zsmalloc: zs_page_migrate: schedule free_work if zspage is ZS_EMPTY

2017-08-16 Thread Hui Zhu
2017-08-16 12:51 GMT+08:00 Minchan Kim :
> On Wed, Aug 16, 2017 at 10:49:14AM +0800, Hui Zhu wrote:
>> Hi Minchan,
>>
>> 2017-08-16 10:13 GMT+08:00 Minchan Kim :
>> > Hi Hui,
>> >
>> > On Mon, Aug 14, 2017 at 05:56:30PM +0800, Hui Zhu wrote:
>> >> After commit e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary
>> >
>> > This patch is not merged yet so the hash is invalid.
>> > That means we may fold this patch to [1] in current mmotm.
>> >
>> > [1] 
>> > zsmalloc-zs_page_migrate-skip-unnecessary-loops-but-not-return-ebusy-if-zspage-is-not-inuse-fix.patch
>> >
>> >> loops but not return -EBUSY if zspage is not inuse") zs_page_migrate
>> >> can handle the ZS_EMPTY zspage.
>> >>
>> >> But I got some false in zs_page_isolate:
>> >>   if (get_zspage_inuse(zspage) == 0) {
>> >>   spin_unlock(&class->lock);
>> >>   return false;
>> >>   }
>> >
>> > I also realized we should make zs_page_isolate succeed on empty zspage
>> > because we allow the empty zspage migration from now on.
>> > Could you send a patch for that as well?
>>
>> OK.  I will make a patch for that later.
>
> Please send the patch so I want to fold it to [1] before Andrew is going
> to send [1] to Linus.
>
> Thanks.

Done.

Thanks,
Hui


[PATCH] zsmalloc: zs_page_isolate: skip unnecessary loops but not return false if zspage is not inuse

2017-08-16 Thread Hui Zhu
Like [1], zs_page_isolate meets the same problem if the zspage is not in
use.

After [2], zs_page_migrate can now support empty zspages.

This patch lets zs_page_isolate skip unnecessary loops but not return
false if the zspage is not in use.

[1] 
zsmalloc-zs_page_migrate-skip-unnecessary-loops-but-not-return-ebusy-if-zspage-is-not-inuse-fix.patch
[2] zsmalloc-zs_page_migrate-schedule-free_work-if-zspage-is-ZS_EMPTY.patch

Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index fb99953..8560c93 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1927,11 +1927,6 @@ bool zs_page_isolate(struct page *page, isolate_mode_t 
mode)
class = pool->size_class[class_idx];
 
spin_lock(&class->lock);
-   if (get_zspage_inuse(zspage) == 0) {
-   spin_unlock(&class->lock);
-   return false;
-   }
-
/* zspage is isolated for object migration */
if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
spin_unlock(&class->lock);
-- 
1.9.1



[PATCH v3] zsmalloc: zs_page_migrate: schedule free_work if zspage is ZS_EMPTY

2017-08-15 Thread Hui Zhu
After commit [1] zs_page_migrate can handle the ZS_EMPTY zspage.

But I got some false returns from zs_page_isolate:
if (get_zspage_inuse(zspage) == 0) {
spin_unlock(&class->lock);
return false;
}
The page of this zspage was migrated in before.

The reason is that commit [1] only handles "page" but not "newpage", so
it keeps the "newpage" with an empty zspage inside the system.
The root cause is that zs_page_isolate removes it from the ZS_EMPTY list,
but zs_page_putback's "schedule_work(&pool->free_work);" is never called,
because zs_page_migrate finished the job without
"schedule_work(&pool->free_work);".

This patch lets zs_page_migrate wake up free_work if needed.

[1] 
zsmalloc-zs_page_migrate-skip-unnecessary-loops-but-not-return-ebusy-if-zspage-is-not-inuse-fix.patch

Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 62457eb..fb99953 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2035,8 +2035,15 @@ int zs_page_migrate(struct address_space *mapping, 
struct page *newpage,
 * Page migration is done so let's putback isolated zspage to
 * the list if @page is final isolated subpage in the zspage.
 */
-   if (!is_zspage_isolated(zspage))
-   putback_zspage(class, zspage);
+   if (!is_zspage_isolated(zspage)) {
+   /*
+* Since we allow empty zspage migration, putback of zspage
+* should free empty zspage. Otherwise, it could make a leak
+* until upcoming free_work is done, which isn't guaranteed.
+*/
+   if (putback_zspage(class, zspage) == ZS_EMPTY)
+   schedule_work(&pool->free_work);
+   }
 
reset_page(page);
put_page(page);
-- 
1.9.1



Re: [PATCH v2] zsmalloc: zs_page_migrate: schedule free_work if zspage is ZS_EMPTY

2017-08-15 Thread Hui Zhu
Hi Minchan,

2017-08-16 10:13 GMT+08:00 Minchan Kim :
> Hi Hui,
>
> On Mon, Aug 14, 2017 at 05:56:30PM +0800, Hui Zhu wrote:
>> After commit e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary
>
> This patch is not merged yet so the hash is invalid.
> That means we may fold this patch to [1] in current mmotm.
>
> [1] 
> zsmalloc-zs_page_migrate-skip-unnecessary-loops-but-not-return-ebusy-if-zspage-is-not-inuse-fix.patch
>
>> loops but not return -EBUSY if zspage is not inuse") zs_page_migrate
>> can handle the ZS_EMPTY zspage.
>>
>> But I got some false in zs_page_isolate:
>>   if (get_zspage_inuse(zspage) == 0) {
>>   spin_unlock(&class->lock);
>>   return false;
>>   }
>
> I also realized we should make zs_page_isolate succeed on empty zspage
> because we allow the empty zspage migration from now on.
> Could you send a patch for that as well?

OK.  I will make a patch for that later.

Thanks,
Hui

>
>> The page of this zspage was migrated in before.
>>
>> The reason is commit e2846124f9a2 ("zsmalloc: zs_page_migrate: skip
>> unnecessary loops but not return -EBUSY if zspage is not inuse") just
>> handle the "page" but not "newpage" then it keep the "newpage" with
>> a empty zspage inside system.
>> Root cause is zs_page_isolate remove it from ZS_EMPTY list but not
>> call zs_page_putback "schedule_work(&pool->free_work);".  Because
>> zs_page_migrate done the job without "schedule_work(&pool->free_work);"
>>
>> Make this patch let zs_page_migrate wake up free_work if need.
>>
>> Fixes: e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary loops but 
>> not return -EBUSY if zspage is not inuse")
>> Signed-off-by: Hui Zhu 
>> ---
>>  mm/zsmalloc.c | 13 +++--
>>  1 file changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
>> index 62457eb..c6cc77c 100644
>> --- a/mm/zsmalloc.c
>> +++ b/mm/zsmalloc.c
>> @@ -2035,8 +2035,17 @@ int zs_page_migrate(struct address_space *mapping, 
>> struct page *newpage,
>>* Page migration is done so let's putback isolated zspage to
>>* the list if @page is final isolated subpage in the zspage.
>>*/
>> - if (!is_zspage_isolated(zspage))
>> - putback_zspage(class, zspage);
>> + if (!is_zspage_isolated(zspage)) {
>> + /*
>> +  * Page will be freed in following part. But newpage and
>> +  * zspage will stay in system if zspage is in ZS_EMPTY
>> +  * list.  So call free_work to free it.
>> +  * The page and class is locked, we cannot free zspage
>> +  * immediately so let's defer.
>> +  */
>
> How about this?
>
> /*
>  * Since we allow empty zspage migration, putback of zspage
>  * should free empty zspage. Otherwise, it could make a leak
>  * until upcoming free_work is done, which isn't guaranteed.
>  */
>> + if (putback_zspage(class, zspage) == ZS_EMPTY)
>> + schedule_work(&pool->free_work);
>> + }
>>
>>   reset_page(page);
>>   put_page(page);
>> --
>> 1.9.1
>>
>> --


[PATCH v2] zsmalloc: zs_page_migrate: schedule free_work if zspage is ZS_EMPTY

2017-08-14 Thread Hui Zhu
After commit e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary
loops but not return -EBUSY if zspage is not inuse") zs_page_migrate
can handle a ZS_EMPTY zspage.

But I got some false returns from zs_page_isolate:
if (get_zspage_inuse(zspage) == 0) {
spin_unlock(&class->lock);
return false;
}
The page of this zspage had been migrated in before.

The reason is that commit e2846124f9a2 ("zsmalloc: zs_page_migrate: skip
unnecessary loops but not return -EBUSY if zspage is not inuse") only
handles the "page" but not the "newpage", so it keeps the "newpage" with
an empty zspage inside the system.
The root cause is that zs_page_isolate removes it from the ZS_EMPTY list,
but zs_page_putback's "schedule_work(&pool->free_work);" is never called,
because zs_page_migrate finishes the job without
"schedule_work(&pool->free_work);".

This patch makes zs_page_migrate wake up free_work if needed.

Fixes: e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary loops but not 
return -EBUSY if zspage is not inuse")
Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 62457eb..c6cc77c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2035,8 +2035,17 @@ int zs_page_migrate(struct address_space *mapping, 
struct page *newpage,
 * Page migration is done so let's putback isolated zspage to
 * the list if @page is final isolated subpage in the zspage.
 */
-   if (!is_zspage_isolated(zspage))
-   putback_zspage(class, zspage);
+   if (!is_zspage_isolated(zspage)) {
+   /*
+* Page will be freed in following part. But newpage and
+* zspage will stay in system if zspage is in ZS_EMPTY
+* list.  So call free_work to free it.
+* The page and class is locked, we cannot free zspage
+* immediately so let's defer.
+*/
+   if (putback_zspage(class, zspage) == ZS_EMPTY)
+   schedule_work(&pool->free_work);
+   }
 
reset_page(page);
put_page(page);
-- 
1.9.1



Re: [PATCH] zsmalloc: zs_page_migrate: schedule free_work if zspage is ZS_EMPTY

2017-08-14 Thread Hui Zhu
2017-08-14 16:31 GMT+08:00 Minchan Kim :
> Hi Hui,
>
> On Mon, Aug 14, 2017 at 02:34:46PM +0800, Hui Zhu wrote:
>> After commit e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary
>> loops but not return -EBUSY if zspage is not inuse") zs_page_migrate
>> can handle the ZS_EMPTY zspage.
>>
>> But it will affect the free_work free the zspage.  That will make this
>> ZS_EMPTY zspage stay in system until another zspage wake up free_work.
>>
>> Make this patch let zs_page_migrate wake up free_work if need.
>>
>> Fixes: e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary loops but 
>> not return -EBUSY if zspage is not inuse")
>> Signed-off-by: Hui Zhu 
>
> This patch makes me remind why I didn't try to migrate empty zspage
> as you did e2846124f9a2. I have forgotten it toally.
>
> We cannot guarantee when the freeing of the page happens if we use
> deferred freeing in zs_page_migrate. However, we returns
> MIGRATEPAGE_SUCCESS which is totally lie.
> Without instant freeing the page, it doesn't help the migration
> situation. No?
>

Sorry, I think the reason is that I didn't explain this clearly.
After my patch e2846124f9a2, I got some false returns from zs_page_isolate:
if (get_zspage_inuse(zspage) == 0) {
spin_unlock(&class->lock);
return false;
}
The page of this zspage had been migrated in before.

So I think e2846124f9a2 is OK in that it returns MIGRATEPAGE_SUCCESS for
the "page".  But it keeps the "newpage" with an empty zspage inside the
system.
The root cause is that zs_page_isolate removes it from the ZS_EMPTY list,
but zs_page_putback's "schedule_work(&pool->free_work);" is never called,
because zs_page_migrate finishes the job without
"schedule_work(&pool->free_work);".

That is why I made the new patch.

Thanks,
Hui

> I start to wonder why your patch e2846124f9a2 helped your test.
> I will think over the issue with fresh mind after the holiday.
>
>> ---
>>  mm/zsmalloc.c | 10 --
>>  1 file changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
>> index 62457eb..48ce043 100644
>> --- a/mm/zsmalloc.c
>> +++ b/mm/zsmalloc.c
>> @@ -2035,8 +2035,14 @@ int zs_page_migrate(struct address_space *mapping, 
>> struct page *newpage,
>>* Page migration is done so let's putback isolated zspage to
>>* the list if @page is final isolated subpage in the zspage.
>>*/
>> - if (!is_zspage_isolated(zspage))
>> - putback_zspage(class, zspage);
>> + if (!is_zspage_isolated(zspage)) {
>> + /*
>> +  * The page and class is locked, we cannot free zspage
>> +  * immediately so let's defer.
>> +  */
>> + if (putback_zspage(class, zspage) == ZS_EMPTY)
>> + schedule_work(&pool->free_work);
>> + }
>>
>>   reset_page(page);
>>   put_page(page);
>> --
>> 1.9.1
>>
>> --


[PATCH] zsmalloc: zs_page_migrate: schedule free_work if zspage is ZS_EMPTY

2017-08-13 Thread Hui Zhu
After commit e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary
loops but not return -EBUSY if zspage is not inuse") zs_page_migrate
can handle a ZS_EMPTY zspage.

But that keeps free_work from freeing the zspage.  It will make this
ZS_EMPTY zspage stay in the system until another zspage wakes up free_work.

This patch makes zs_page_migrate wake up free_work if needed.

Fixes: e2846124f9a2 ("zsmalloc: zs_page_migrate: skip unnecessary loops but not 
return -EBUSY if zspage is not inuse")
Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 62457eb..48ce043 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -2035,8 +2035,14 @@ int zs_page_migrate(struct address_space *mapping, 
struct page *newpage,
 * Page migration is done so let's putback isolated zspage to
 * the list if @page is final isolated subpage in the zspage.
 */
-   if (!is_zspage_isolated(zspage))
-   putback_zspage(class, zspage);
+   if (!is_zspage_isolated(zspage)) {
+   /*
+* The page and class is locked, we cannot free zspage
+* immediately so let's defer.
+*/
+   if (putback_zspage(class, zspage) == ZS_EMPTY)
+   schedule_work(&pool->free_work);
+   }
 
reset_page(page);
put_page(page);
-- 
1.9.1



[PATCH] zsmalloc: zs_page_migrate: skip unnecessary loops but not return -EBUSY if zspage is not inuse

2017-07-24 Thread Hui Zhu
The first version is in [1].

Getting -EBUSY from zs_page_migrate makes migration slow (retry) or
fail (zs_page_putback will schedule_work free_work, but that cannot
guarantee success).

I noticed this issue because of my kernel patch [2],
which removes the retry in __alloc_contig_migrate_range.
That retry handles the -EBUSY because it re-isolates the page
and calls migrate_pages again.
Without it, cma_alloc fails at once with -EBUSY.

According to the review from Minchan Kim in [3], I updated the patch
to skip the unnecessary loops but not return -EBUSY if the zspage is not
inuse.

Following is what I got with highalloc-performance in a vbox with 2
cpus, 1G memory and 512 zram as swap, with swappiness set to 100.
                                     orig         new
Minor Faults                     50805113    50830235
Major Faults                        43918       56530
Swap Ins                            42087       55680
Swap Outs                           89718      104700
Allocation stalls                       0           0
DMA allocs                          57787       52364
DMA32 allocs                     47964599    48043563
Normal allocs                           0           0
Movable allocs                          0           0
Direct pages scanned                45493       23167
Kswapd pages scanned              1565222     1725078
Kswapd pages reclaimed                134     1503037
Direct pages reclaimed              45615       25186
Kswapd efficiency                     85%         87%
Kswapd velocity                  1897.101    1949.042
Direct efficiency                    100%        108%
Direct velocity                    55.139      26.175
Percentage direct scans                2%          1%
Zone normal velocity             1952.240    1975.217
Zone dma32 velocity                 0.000       0.000
Zone dma velocity                   0.000       0.000
Page writes by reclaim          89764.000  105233.000
Page writes file                       46         533
Page writes anon                    89718      104700
Page reclaim immediate              21457        3699
Sector Reads                      3259688     3441368
Sector Writes                     3667252     3754836
Page rescued immediate                  0           0
Slabs scanned                     1042872     1160855
Direct inode steals                  8042       10089
Kswapd inode steals                 54295       29170
Kswapd skipped wait                     0           0
THP fault alloc                       175         154
THP collapse alloc                    226         289
THP splits                              0           0
THP fault fallback                     11          14
THP collapse fail                       3           2
Compaction stalls                     536         646
Compaction success                    322         358
Compaction failures                   214         288
Page migrate success               119608      111063
Page migrate failure                 2723        2593
Compaction pages isolated          250179      232652
Compaction migrate scanned        9131832     9942306
Compaction free scanned           2093272     2613998
Compaction cost                       192         189
NUMA alloc hit                   47124555    47193990
NUMA alloc miss                         0           0
NUMA interleave hit                     0           0
NUMA alloc local                 47124555    47193990
NUMA base PTE updates                   0           0
NUMA huge PMD updates                   0           0
NUMA page range updates                 0           0
NUMA hint faults                        0           0
NUMA hint local faults                  0           0
NUMA hint local percent               100         100
NUMA pages migrated  0   0
AutoNUMA cost   0%  0%

[1]: https://lkml.org/lkml/2017/7/14/93
[2]: https://lkml.org/lkml/2014/5/28/113
[3]: https://lkml.org/lkml/2017/7/21/10

Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d41edd2..c2c7ba9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1997,8 +1997,11 @@ int zs_page_migrate(struct address_space *mapping, 
struct page *newpage,
 
spin_lock(&class->lock);
if (!get_zspage_inuse(zspage)) {
-   ret = -EBUSY;
-   goto unlock_class;
+   /*
+* Set "offset" to end of the page so that every loops
+* skips unnecessary object scanning.
+*/
+   offset = PAGE_SIZE;
}
 
pos = offset;
@@ -2066,7 +2069,7 @@ int zs_page_migrate(struct address_space *mapping, struct 
page *newpage,
}
}
kunmap_atomic(s_addr);
-unlock_class:
+
spin_unlock(&class->lock);
migrate_write_unlock(zspage);
 
-- 
1.9.1



Re: [PATCH] zsmalloc: zs_page_migrate: not check inuse if migrate_mode is not MIGRATE_ASYNC

2017-07-20 Thread Hui Zhu
2017-07-20 16:47 GMT+08:00 Minchan Kim :
> Hi Hui,
>
> On Thu, Jul 20, 2017 at 02:39:17PM +0800, Hui Zhu wrote:
>> Hi Minchan,
>>
>> I am sorry for answer late.
>> I spent some time on ubuntu 16.04 with mmtests in an old laptop.
>>
>> 2017-07-17 13:39 GMT+08:00 Minchan Kim :
>> > Hello Hui,
>> >
>> > On Fri, Jul 14, 2017 at 03:51:07PM +0800, Hui Zhu wrote:
>> >> Got some -EBUSY from zs_page_migrate that will make migration
>> >> slow (retry) or fail (zs_page_putback will schedule_work free_work,
>> >> but it cannot ensure the success).
>> >
>> > I think EAGAIN(migration retrial) is better than EBUSY(bailout) because
>> > expectation is that zsmalloc will release the empty zs_page soon so
>> > at next retrial, it will be succeeded.
>>
>>
>> I am not sure.
>>
>> This is the call trace of zs_page_migrate:
>> zs_page_migrate
>> mapping->a_ops->migratepage
>> move_to_new_page
>> __unmap_and_move
>> unmap_and_move
>> migrate_pages
>>
>> In unmap_and_move will remove page from migration page list
>> and call putback_movable_page(will call mapping->a_ops->putback_page) if
>> return value of zs_page_migrate is not -EAGAIN.
>> The comments of this part:
>> After called mapping->a_ops->putback_page, zsmalloc can free the page
>> from ZS_EMPTY list.
>>
>> If retrun -EAGAIN, the page will be not be put back.  EAGAIN page will
>> be try again in migrate_pages without re-isolate.
>
> You're right. With -EGAIN, it burns out CPU pointlessly.
>
>>
>> > About schedule_work, as you said, we don't make sure when it happens but
>> > I believe it will happen in a migration iteration most of case.
>> > How often do you see that case?
>>
>> I noticed this issue because my Kernel patch 
>> https://lkml.org/lkml/2014/5/28/113
>> that will remove retry in __alloc_contig_migrate_range.
>> This retry willhandle the -EBUSY because it will re-isolate the page
>> and re-call migrate_pages.
>> Without it will make cma_alloc fail at once with -EBUSY.
>
> LKML.org server is not responding so hard to see patch you mentioned
> but I just got your point now so I don't care any more. Your patch is
> enough simple as considering the benefit.
> Just look at below comment.
>
>>
>> >
>> >>
>> >> And I didn't find anything that make zs_page_migrate cannot work with
>> >> a ZS_EMPTY zspage.
>> >> So make the patch to not check inuse if migrate_mode is not
>> >> MIGRATE_ASYNC.
>> >
>> > At a first glance, I think it work but the question is that it a same 
>> > problem
>> > ith schedule_work of zs_page_putback. IOW, Until the work is done, 
>> > compaction
>> > cannot succeed. Do you have any number before and after?
>> >
>>
>>
>> Following is what I got with highalloc-performance in a vbox with 2
>> cpu 1G memory 512 zram as swap:
>>                                      orig       after
>> Minor Faults                     50805113    50801261
>> Major Faults                        43918       46692
>> Swap Ins                            42087       46299
>> Swap Outs                           89718      105495
>> Allocation stalls                       0           0
>> DMA allocs                          57787       69787
>> DMA32 allocs                     47964599    47983772
>> Normal allocs                           0           0
>> Movable allocs                          0           0
>> Direct pages scanned                45493       28837
>> Kswapd pages scanned              1565222     1512947
>> Kswapd pages reclaimed                134     1334030
>> Direct pages reclaimed              45615       30174
>> Kswapd efficiency                     85%         88%
>> Kswapd velocity                  1897.101    1708.309
>> Direct efficiency                    100%        104%
>> Direct velocity                    55.139      32.561
>> Percentage direct scans                2%          1%
>> Zone normal velocity             1952.240    1740.870
>> Zone dma32 velocity                 0.000       0.000
>> Zone dma velocity                   0.000       0.000
>> Page writes by reclaim          89764.000  106043.000
>> Page writes file                       46         548
>> Page writes anon                    89718      105495
>> Page reclaim immediate              21457        7269
>> Sector Reads                      3259688     3144160
>

Re: [PATCH] zsmalloc: zs_page_migrate: not check inuse if migrate_mode is not MIGRATE_ASYNC

2017-07-19 Thread Hui Zhu
Hi Minchan,

I am sorry for the late answer.
I spent some time on Ubuntu 16.04 with mmtests on an old laptop.

2017-07-17 13:39 GMT+08:00 Minchan Kim :
> Hello Hui,
>
> On Fri, Jul 14, 2017 at 03:51:07PM +0800, Hui Zhu wrote:
>> Got some -EBUSY from zs_page_migrate that will make migration
>> slow (retry) or fail (zs_page_putback will schedule_work free_work,
>> but it cannot ensure the success).
>
> I think EAGAIN(migration retrial) is better than EBUSY(bailout) because
> expectation is that zsmalloc will release the empty zs_page soon so
> at next retrial, it will be succeeded.


I am not sure.

This is the call trace of zs_page_migrate:
zs_page_migrate
mapping->a_ops->migratepage
move_to_new_page
__unmap_and_move
unmap_and_move
migrate_pages

unmap_and_move will remove the page from the migration page list
and call putback_movable_page (which calls mapping->a_ops->putback_page) if
the return value of zs_page_migrate is not -EAGAIN.
The comment on this part says:
after mapping->a_ops->putback_page is called, zsmalloc can free the page
from the ZS_EMPTY list.

If -EAGAIN is returned, the page will not be put back.  An EAGAIN page will
be tried again in migrate_pages without being re-isolated.
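
To make that difference concrete, here is a minimal userspace model of the
result handling described above (the names and structure are simplified
stand-ins for unmap_and_move, not the real kernel code):

#include <errno.h>
#include <stdio.h>

#define MIGRATEPAGE_SUCCESS 0

/*
 * Toy model of how migrate_pages() reacts to the value returned by
 * ->migratepage (zs_page_migrate in this discussion).
 */
static const char *handle_result(int rc)
{
	if (rc == -EAGAIN)
		/* Page stays on the migration list and is retried in the
		 * next pass, without being re-isolated. */
		return "kept on the list, retried later";
	if (rc != MIGRATEPAGE_SUCCESS)
		/* Page is removed from the list and put back; for movable
		 * pages this reaches ->putback_page, where zsmalloc can
		 * schedule free_work for an empty zspage. */
		return "removed from the list, putback_movable_page()";
	return "migrated, newpage takes over";
}

int main(void)
{
	printf("-EAGAIN -> %s\n", handle_result(-EAGAIN));
	printf("-EBUSY  -> %s\n", handle_result(-EBUSY));
	printf("success -> %s\n", handle_result(MIGRATEPAGE_SUCCESS));
	return 0;
}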

> About schedule_work, as you said, we don't make sure when it happens but
> I believe it will happen in a migration iteration most of case.
> How often do you see that case?

I noticed this issue because of my kernel patch https://lkml.org/lkml/2014/5/28/113
which removes the retry in __alloc_contig_migrate_range.
That retry handles the -EBUSY because it re-isolates the page
and calls migrate_pages again.
Without it, cma_alloc fails at once with -EBUSY.

>
>>
>> And I didn't find anything that make zs_page_migrate cannot work with
>> a ZS_EMPTY zspage.
>> So make the patch to not check inuse if migrate_mode is not
>> MIGRATE_ASYNC.
>
> At a first glance, I think it work but the question is that it a same problem
> ith schedule_work of zs_page_putback. IOW, Until the work is done, compaction
> cannot succeed. Do you have any number before and after?
>


Following is what I got with highalloc-performance in a vbox with 2
cpus, 1G memory and 512 zram as swap:
                                     orig       after
Minor Faults                     50805113    50801261
Major Faults                        43918       46692
Swap Ins                            42087       46299
Swap Outs                           89718      105495
Allocation stalls                       0           0
DMA allocs                          57787       69787
DMA32 allocs                     47964599    47983772
Normal allocs                           0           0
Movable allocs                          0           0
Direct pages scanned                45493       28837
Kswapd pages scanned              1565222     1512947
Kswapd pages reclaimed                134     1334030
Direct pages reclaimed              45615       30174
Kswapd efficiency                     85%         88%
Kswapd velocity                  1897.101    1708.309
Direct efficiency                    100%        104%
Direct velocity                    55.139      32.561
Percentage direct scans                2%          1%
Zone normal velocity             1952.240    1740.870
Zone dma32 velocity                 0.000       0.000
Zone dma velocity                   0.000       0.000
Page writes by reclaim          89764.000  106043.000
Page writes file                       46         548
Page writes anon                    89718      105495
Page reclaim immediate              21457        7269
Sector Reads                      3259688     3144160
Sector Writes                     3667252     3675528
Page rescued immediate                  0           0
Slabs scanned                     1042872     1035438
Direct inode steals                  8042        7772
Kswapd inode steals                 54295       55075
Kswapd skipped wait                     0           0
THP fault alloc                       175         200
THP collapse alloc                    226         363
THP splits                              0           0
THP fault fallback                     11           1
THP collapse fail                       3           1
Compaction stalls                     536         647
Compaction success                    322         384
Compaction failures                   214         263
Page migrate success               119608      127002
Page migrate failure                 2723        2309
Compaction pages isolated          250179      265318
Compaction migrate scanned        9131832     9351314
Compaction free scanned           2093272     3059014
Compaction cost                       192         202
NUMA alloc hit                   47124555    47086375
NUMA alloc miss                         0           0
NUMA interleave hit                     0           0
NUMA alloc local                 47124555    47086375
NUMA base PTE updates                   0           0
NUMA huge PMD up

[PATCH] zsmalloc: zs_page_migrate: not check inuse if migrate_mode is not MIGRATE_ASYNC

2017-07-14 Thread Hui Zhu
I got some -EBUSY from zs_page_migrate, which makes migration
slow (retry) or fail (zs_page_putback will schedule_work free_work,
but that cannot guarantee success).

And I didn't find anything that keeps zs_page_migrate from working with
a ZS_EMPTY zspage.
So this patch does not check inuse if migrate_mode is not
MIGRATE_ASYNC.

Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 66 +--
 1 file changed, 37 insertions(+), 29 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d41edd2..c298e5c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1982,6 +1982,7 @@ int zs_page_migrate(struct address_space *mapping, struct 
page *newpage,
unsigned long old_obj, new_obj;
unsigned int obj_idx;
int ret = -EAGAIN;
+   int inuse;
 
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
@@ -1996,21 +1997,24 @@ int zs_page_migrate(struct address_space *mapping, 
struct page *newpage,
offset = get_first_obj_offset(page);
 
spin_lock(&class->lock);
-   if (!get_zspage_inuse(zspage)) {
+   inuse = get_zspage_inuse(zspage);
+   if (mode == MIGRATE_ASYNC && !inuse) {
ret = -EBUSY;
goto unlock_class;
}
 
pos = offset;
s_addr = kmap_atomic(page);
-   while (pos < PAGE_SIZE) {
-   head = obj_to_head(page, s_addr + pos);
-   if (head & OBJ_ALLOCATED_TAG) {
-   handle = head & ~OBJ_ALLOCATED_TAG;
-   if (!trypin_tag(handle))
-   goto unpin_objects;
+   if (inuse) {
+   while (pos < PAGE_SIZE) {
+   head = obj_to_head(page, s_addr + pos);
+   if (head & OBJ_ALLOCATED_TAG) {
+   handle = head & ~OBJ_ALLOCATED_TAG;
+   if (!trypin_tag(handle))
+   goto unpin_objects;
+   }
+   pos += class->size;
}
-   pos += class->size;
}
 
/*
@@ -2020,20 +2024,22 @@ int zs_page_migrate(struct address_space *mapping, 
struct page *newpage,
memcpy(d_addr, s_addr, PAGE_SIZE);
kunmap_atomic(d_addr);
 
-   for (addr = s_addr + offset; addr < s_addr + pos;
-   addr += class->size) {
-   head = obj_to_head(page, addr);
-   if (head & OBJ_ALLOCATED_TAG) {
-   handle = head & ~OBJ_ALLOCATED_TAG;
-   if (!testpin_tag(handle))
-   BUG();
-
-   old_obj = handle_to_obj(handle);
-   obj_to_location(old_obj, &dummy, &obj_idx);
-   new_obj = (unsigned long)location_to_obj(newpage,
-   obj_idx);
-   new_obj |= BIT(HANDLE_PIN_BIT);
-   record_obj(handle, new_obj);
+   if (inuse) {
+   for (addr = s_addr + offset; addr < s_addr + pos;
+   addr += class->size) {
+   head = obj_to_head(page, addr);
+   if (head & OBJ_ALLOCATED_TAG) {
+   handle = head & ~OBJ_ALLOCATED_TAG;
+   if (!testpin_tag(handle))
+   BUG();
+
+   old_obj = handle_to_obj(handle);
+   obj_to_location(old_obj, &dummy, &obj_idx);
+   new_obj = (unsigned long)
+   location_to_obj(newpage, obj_idx);
+   new_obj |= BIT(HANDLE_PIN_BIT);
+   record_obj(handle, new_obj);
+   }
}
}
 
@@ -2055,14 +2061,16 @@ int zs_page_migrate(struct address_space *mapping, 
struct page *newpage,
 
ret = MIGRATEPAGE_SUCCESS;
 unpin_objects:
-   for (addr = s_addr + offset; addr < s_addr + pos;
+   if (inuse) {
+   for (addr = s_addr + offset; addr < s_addr + pos;
addr += class->size) {
-   head = obj_to_head(page, addr);
-   if (head & OBJ_ALLOCATED_TAG) {
-   handle = head & ~OBJ_ALLOCATED_TAG;
-   if (!testpin_tag(handle))
-   BUG();
-   unpin_tag(handle);
+   head = obj_to_head(page, addr);
+   if (head & OBJ_ALLOCATED_TAG) {
+   handle = head & ~OBJ_ALLOCATED_TAG;
+   if (

[RFC 0/2] Add interface let ZRAM close swap cache

2016-11-25 Thread Hui Zhu
SWAP keeps the swap cache until the swap space gets full.  That keeps
the swap space from being freed.  It is harmful to a system that uses
ZRAM, because zram's space uses memory too.
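
A rough userspace model of that behaviour (the 50% "swap is full"
threshold, the slot counts, and the function names are illustrative
assumptions, not code from these patches):

#include <stdio.h>
#include <stdbool.h>

/*
 * Model: the swap cache is dropped on swap-in only when the device is
 * "full enough", so on a half-empty zram device the compressed copies
 * stay allocated even after the pages are swapped back in.
 */
#define SWAP_SLOTS 1000

static bool swap_full(int used)
{
	return used * 2 > SWAP_SLOTS;	/* assumed >50% threshold */
}

static int simulate(bool always_drop, int swapped_out, int swapped_in)
{
	int used = swapped_out;

	for (int i = 0; i < swapped_in; i++) {
		if (always_drop || swap_full(used))
			used--;		/* slot (and zram memory) freed */
		/* otherwise the page stays in swap cache, slot kept */
	}
	return used;
}

int main(void)
{
	/* 400 pages swapped out, all of them swapped back in. */
	printf("keep-cache policy : %d slots still held\n",
	       simulate(false, 400, 400));
	printf("always-drop policy: %d slots still held\n",
	       simulate(true, 400, 400));
	return 0;
}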

These two patches add a sysfs switch to ZRAM that opens or closes the swap
cache without checking the swap space.
I got good results with them in a real environment.  The following part is
the record with vm-scalability case-swap-w-rand and case-swap-w-seq on
an Intel(R) Core(TM)2 Duo CPU, 2G memory and 1G ZRAM swap machine:
4.9.0-rc5 without the patches:
case-swap-w-rand
1129809600 bytes / 2149155959 usecs = 513 KB/s
1129809600 bytes / 2150796138 usecs = 512 KB/s
case-swap-w-rand
1124808768 bytes / 1973130450 usecs = 556 KB/s
1124808768 bytes / 1975142661 usecs = 556 KB/s
case-swap-w-rand
1130677056 bytes / 2154714972 usecs = 512 KB/s
1130677056 bytes / 2157542507 usecs = 511 KB/s
case-swap-w-seq
1117922688 bytes / 6596049 usecs = 165511 KB/s
1117922688 bytes / 6715711 usecs = 162562 KB/s
case-swap-w-seq
1115869824 bytes / 6909262 usecs = 157718 KB/s
1115869824 bytes / 7099283 usecs = 153496 KB/s
case-swap-w-seq
1116472896 bytes / 6451638 usecs = 168996 KB/s
1116472896 bytes / 6647963 usecs = 164005 KB/s
4.9.0-rc5 with the patches:
case-swap-w-rand
1127272896 bytes / 2060906184 usecs = 534 KB/s
1127272896 bytes / 2063671365 usecs = 533 KB/s
case-swap-w-rand
1131846912 bytes / 2097038264 usecs = 527 KB/s
1131846912 bytes / 2100148465 usecs = 526 KB/s
case-swap-w-rand
1129139136 bytes / 2038769367 usecs = 540 KB/s
1129139136 bytes / 2041411431 usecs = 540 KB/s
case-swap-w-seq
1129622976 bytes / 5910625 usecs = 186638 KB/s
1129622976 bytes / 6313311 usecs = 174733 KB/s
case-swap-w-seq
1130053248 bytes / 6771182 usecs = 162980 KB/s
1130053248 bytes / 061 usecs = 165550 KB/s
case-swap-w-seq
1126484928 bytes / 6555923 usecs = 167799 KB/s
1126484928 bytes / 6642291 usecs = 165617 KB/s

Hui Zhu (2):
SWAP: add interface to let disk close swap cache
ZRAM: add sysfs switch swap_cache_not_keep


[RFC 1/2] SWAP: add interface to let disk close swap cache

2016-11-25 Thread Hui Zhu
This patch adds an interface to gendisk that a SWAP device can use to
control the swap cache rule.

Signed-off-by: Hui Zhu 
---
 include/linux/genhd.h |  3 +++
 include/linux/swap.h  |  8 ++
 mm/Kconfig| 10 +++
 mm/memory.c   |  2 +-
 mm/swapfile.c | 74 ++-
 mm/vmscan.c   |  2 +-
 6 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e0341af..6baec46 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -215,6 +215,9 @@ struct gendisk {
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
int node_id;
struct badblocks *bb;
+#ifdef CONFIG_SWAP_CACHE_RULE
+   bool swap_cache_not_keep;
+#endif
 };
 
 static inline struct gendisk *part_to_disk(struct hd_struct *part)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a56523c..6fa11ca 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -582,5 +582,13 @@ static inline bool mem_cgroup_swap_full(struct page *page)
 }
 #endif
 
+#ifdef CONFIG_SWAP_CACHE_RULE
+extern bool swap_not_keep_cache(struct page *page);
+extern void swap_cache_rule_update(void);
+#else
+#define swap_not_keep_cache(p) mem_cgroup_swap_full(p)
+#define swap_cache_rule_update()
+#endif
+
 #endif /* __KERNEL__*/
 #endif /* _LINUX_SWAP_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 86e3e0e..6623e87 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -711,3 +711,13 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
 config ARCH_HAS_PKEYS
bool
+
+config SWAP_CACHE_RULE
+   bool "Swap cache rule support"
+   depends on SWAP
+   default n
+   help
+ add a interface to gendisk that SWAP device can use it to
+ control the swap cache rule.
+
+ If unsure, say "n".
diff --git a/mm/memory.c b/mm/memory.c
index e18c57b..099cb5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2654,7 +2654,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
}
 
swap_free(entry);
-   if (mem_cgroup_swap_full(page) ||
+   if (swap_not_keep_cache(page) ||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f304389..9837261 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1019,7 +1019,7 @@ int free_swap_and_cache(swp_entry_t entry)
 * Also recheck PageSwapCache now page is locked (above).
 */
if (PageSwapCache(page) && !PageWriteback(page) &&
-   (!page_mapped(page) || mem_cgroup_swap_full(page))) {
+   (!page_mapped(page) || swap_not_keep_cache(page))) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
@@ -1992,6 +1992,8 @@ static void reinsert_swap_info(struct swap_info_struct *p)
filp_close(victim, NULL);
 out:
putname(pathname);
+   if (!err)
+   swap_cache_rule_update();
return err;
 }
 
@@ -2576,6 +2578,8 @@ static bool swap_discardable(struct swap_info_struct *si)
putname(name);
if (inode && S_ISREG(inode->i_mode))
inode_unlock(inode);
+   if (!error)
+   swap_cache_rule_update();
return error;
 }
 
@@ -2954,3 +2958,71 @@ static void free_swap_count_continuations(struct 
swap_info_struct *si)
}
}
 }
+
+#ifdef CONFIG_SWAP_CACHE_RULE
+enum swap_cache_rule_type {
+   SWAP_CACHE_UNKNOWN = 0,
+   SWAP_CACHE_SPECIAL_RULE,
+   SWAP_CACHE_NOT_KEEP,
+   SWAP_CACHE_NEED_CHECK,
+};
+
+static enum swap_cache_rule_type swap_cache_rule __read_mostly;
+
+bool swap_not_keep_cache(struct page *page)
+{
+   enum swap_cache_rule_type rule = READ_ONCE(swap_cache_rule);
+
+   if (rule == SWAP_CACHE_NOT_KEEP)
+   return true;
+
+   if (unlikely(rule == SWAP_CACHE_SPECIAL_RULE)) {
+   struct swap_info_struct *sis;
+
+   BUG_ON(!PageSwapCache(page));
+
+   sis = page_swap_info(page);
+   if (sis->flags & SWP_BLKDEV) {
+   struct gendisk *disk = sis->bdev->bd_disk;
+
+   if (READ_ONCE(disk->swap_cache_not_keep))
+   return true;
+   }
+   }
+
+   return mem_cgroup_swap_full(page);
+}
+
+void swap_cache_rule_update(void)
+{
+   enum swap_cache_rule_type rule = SWAP_CACHE_UNKNOWN;
+   int type;
+
+   spin_lock(&swap_lock);
+   for (type = 0; type < nr_swapfiles; type++) {
+   struct swap_info_struct *sis = swap_info[type];
+   enum swap_cache_rule_type current_rule = SWAP_CACHE_NEED_CHECK;
+
+   if (!(sis->flags & SWP_USED))
+   continue;
+
+   if 

[RFC 2/2] ZRAM: add sysfs switch swap_cache_not_keep

2016-11-25 Thread Hui Zhu
This patch adds a sysfs interface, swap_cache_not_keep, to control the swap
cache rule for a ZRAM disk.
Swap will never keep the swap cache if it is set to 1.
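
A minimal userspace usage sketch for the proposed knob.  The sysfs path is
an assumption based on where the other zram_disk_attrs appear
(/sys/block/zram0/); adjust it for your device:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/zram0/swap_cache_not_keep";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);	/* never keep swap cache for this disk */
	fclose(f);
	return 0;
}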

Signed-off-by: Hui Zhu 
---
 drivers/block/zram/zram_drv.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 04365b1..bda9bbf 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -30,6 +30,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "zram_drv.h"
 
@@ -1158,6 +1160,32 @@ static ssize_t reset_store(struct device *dev,
return len;
 }
 
+#ifdef CONFIG_SWAP_CACHE_RULE
+static ssize_t swap_cache_not_keep_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct zram *zram = dev_to_zram(dev);
+
+   return scnprintf(buf, PAGE_SIZE, "%d\n",
+zram->disk->swap_cache_not_keep);
+}
+
+static ssize_t swap_cache_not_keep_store(struct device *dev,
+   struct device_attribute *attr, const char *buf, size_t len)
+{
+   struct zram *zram = dev_to_zram(dev);
+   bool rule;
+
+   if (strtobool(buf, &rule) < 0)
+   return -EINVAL;
+   WRITE_ONCE(zram->disk->swap_cache_not_keep, rule);
+
+   swap_cache_rule_update();
+
+   return len;
+}
+#endif
+
 static int zram_open(struct block_device *bdev, fmode_t mode)
 {
int ret = 0;
@@ -1190,6 +1218,9 @@ static int zram_open(struct block_device *bdev, fmode_t 
mode)
 static DEVICE_ATTR_RW(mem_used_max);
 static DEVICE_ATTR_RW(max_comp_streams);
 static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_SWAP_CACHE_RULE
+static DEVICE_ATTR_RW(swap_cache_not_keep);
+#endif
 
 static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1213,6 +1244,9 @@ static int zram_open(struct block_device *bdev, fmode_t 
mode)
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
+#ifdef CONFIG_SWAP_CACHE_RULE
+   &dev_attr_swap_cache_not_keep.attr,
+#endif
NULL,
 };
 
-- 
1.9.1



Re: [RFC 0/4] ZRAM: make it just store the high compression rate page

2016-09-04 Thread Hui Zhu
On Mon, Sep 5, 2016 at 1:51 PM, Minchan Kim  wrote:
> On Mon, Sep 05, 2016 at 01:12:05PM +0800, Hui Zhu wrote:
>> On Mon, Sep 5, 2016 at 10:18 AM, Minchan Kim  wrote:
>> > On Thu, Aug 25, 2016 at 04:25:30PM +0800, Hui Zhu wrote:
>> >> On Thu, Aug 25, 2016 at 2:09 PM, Sergey Senozhatsky
>> >>  wrote:
>> >> > Hello,
>> >> >
>> >> > On (08/22/16 16:25), Hui Zhu wrote:
>> >> >>
>> >> >> Current ZRAM just can store all pages even if the compression rate
>> >> >> of a page is really low.  So the compression rate of ZRAM is out of
>> >> >> control when it is running.
>> >> >> In my part, I did some test and record with ZRAM.  The compression rate
>> >> >> is about 40%.
>> >> >>
>> >> >> This series of patches make ZRAM can just store the page that the
>> >> >> compressed size is smaller than a value.
>> >> >> With these patches, I set the value to 2048 and did the same test with
>> >> >> before.  The compression rate is about 20%.  The times of 
>> >> >> lowmemorykiller
>> >> >> also decreased.
>> >> >
>> >> > I haven't looked at the patches in details yet. can you educate me a 
>> >> > bit?
>> >> > is your test stable? why the number of lowmemorykill-s has decreased?
>> >> > ... or am reading "The times of lowmemorykiller also decreased" wrong?
>> >> >
>> >> > suppose you have X pages that result in bad compression size (from zram
>> >> > point of view). zram stores such pages uncompressed, IOW we have no 
>> >> > memory
>> >> > savings - swapped out page lands in zsmalloc PAGE_SIZE class. now you
>> >> > don't try to store those pages in zsmalloc, but keep them as 
>> >> > unevictable.
>> >> > so the page still occupies PAGE_SIZE; no memory saving again. why did it
>> >> > improve LMK?
>> >>
>> >> No, zram will not save this page uncompressed with these patches.  It
>> >> will set it as non-swap and kick back to shrink_page_list.
>> >> Shrink_page_list will remove this page from swapcache and kick it to
>> >> unevictable list.
>> >> Then this page will not be swaped before it get write.
>> >> That is why most of code are around vmscan.c.
>> >
>> > If I understand Sergey's point right, he means there is no gain
>> > to save memory between before and after.
>> >
>> > With your approach, you can prevent unnecessary pageout(i.e.,
>> > uncompressible page swap out) but it doesn't mean you save the
>> > memory compared to old so why does your patch decrease the number of
>> > lowmemory killing?
>> >
>> > A thing I can imagine is without this feature, zram could be full of
>> > uncompressible pages so good-compressible page cannot be swapped out.
>> > Hui, is this scenario right for your case?
>> >
>>
>> That is one reason.  But it is not the principal one.
>>
>> Another reason is when swap is running to put page to zram, what the
>> system wants is to get memory.
>> Then the deal is system spends cpu time and memory to get memory. If
>> the zram just access the high compression rate pages, system can get
>> more memory with the same amount of memory. It will pull system from
>> low memory status earlier. (Maybe more cpu time, because the
>> compression rate checks. But maybe less, because fewer pages need to
>> digress. That is the interesting part. :)
>> I think that is why lmk times decrease.
>>
>> And yes, all of this depends on the number of high compression rate
>> pages. So you cannot just set a non_swap limit to the system and get
>> everything. You need to do a lot of test around it to make sure the
>> non_swap limit is good for your system.
>>
>> And I think use AOP_WRITEPAGE_ACTIVATE without kicking page to a
>> special list will make cpu too busy sometimes.
>
> Yes, and it would same with your patch if new arraival write on CoWed
> page is uncompressible data.
>
>> I did some tests before I kick page to a special list. The shrink task
>
> What kinds of test? Could you elaborate a bit more?
> shrink task. What does it mean?
>



Sorry for this part.  It should be the function shrink_page_list.

I will do more tests for that and post the patch later.

Thanks,
Hui


>> will be moved around, around and around because low compression rate
>> pages just moved from one list to another a lot of times, again, again
>> and again.
>> And all this low compression rate pages always stay together.
>
> I cannot understand with detail description. :(
> Could you explain more?


Re: [RFC 0/4] ZRAM: make it just store the high compression rate page

2016-09-04 Thread Hui Zhu
On Mon, Sep 5, 2016 at 10:18 AM, Minchan Kim  wrote:
> On Thu, Aug 25, 2016 at 04:25:30PM +0800, Hui Zhu wrote:
>> On Thu, Aug 25, 2016 at 2:09 PM, Sergey Senozhatsky
>>  wrote:
>> > Hello,
>> >
>> > On (08/22/16 16:25), Hui Zhu wrote:
>> >>
>> >> Current ZRAM just can store all pages even if the compression rate
>> >> of a page is really low.  So the compression rate of ZRAM is out of
>> >> control when it is running.
>> >> In my part, I did some test and record with ZRAM.  The compression rate
>> >> is about 40%.
>> >>
>> >> This series of patches make ZRAM can just store the page that the
>> >> compressed size is smaller than a value.
>> >> With these patches, I set the value to 2048 and did the same test with
>> >> before.  The compression rate is about 20%.  The times of lowmemorykiller
>> >> also decreased.
>> >
>> > I haven't looked at the patches in details yet. can you educate me a bit?
>> > is your test stable? why the number of lowmemorykill-s has decreased?
>> > ... or am reading "The times of lowmemorykiller also decreased" wrong?
>> >
>> > suppose you have X pages that result in bad compression size (from zram
>> > point of view). zram stores such pages uncompressed, IOW we have no memory
>> > savings - swapped out page lands in zsmalloc PAGE_SIZE class. now you
>> > don't try to store those pages in zsmalloc, but keep them as unevictable.
>> > so the page still occupies PAGE_SIZE; no memory saving again. why did it
>> > improve LMK?
>>
>> No, zram will not save this page uncompressed with these patches.  It
>> will set it as non-swap and kick back to shrink_page_list.
>> Shrink_page_list will remove this page from swapcache and kick it to
>> unevictable list.
>> Then this page will not be swaped before it get write.
>> That is why most of code are around vmscan.c.
>
> If I understand Sergey's point right, he means there is no gain
> to save memory between before and after.
>
> With your approach, you can prevent unnecessary pageout(i.e.,
> uncompressible page swap out) but it doesn't mean you save the
> memory compared to old so why does your patch decrease the number of
> lowmemory killing?
>
> A thing I can imagine is without this feature, zram could be full of
> uncompressible pages so good-compressible page cannot be swapped out.
> Hui, is this scenario right for your case?
>

That is one reason.  But it is not the principal one.

Another reason is that when swap is running to put pages into zram, what the
system wants is to get memory.
The deal is that the system spends CPU time and memory to get memory.  If
zram only accepts the high compression rate pages, the system can get
more memory back for the same amount of memory spent.  That pulls the
system out of low memory status earlier.  (Maybe more CPU time, because of
the compression rate checks.  But maybe less, because fewer pages need to
be processed.  That is the interesting part. :)
I think that is why the lmk times decrease.
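
A back-of-the-envelope illustration of that deal, using the 40% and 20%
compression rates quoted in this thread (everything else is simplified and
zsmalloc metadata is ignored):

#include <stdio.h>

#define PAGE_SIZE 4096

/*
 * Net memory freed by swapping one anonymous page into zram is roughly
 * PAGE_SIZE minus the compressed copy that zram keeps.
 */
static double pages_to_free(double target_bytes, double comp_ratio)
{
	double freed_per_page = PAGE_SIZE * (1.0 - comp_ratio);

	return target_bytes / freed_per_page;
}

int main(void)
{
	double target = 100.0 * 1024 * 1024;	/* want to free 100 MB */

	printf("at 40%% ratio: %.0f page-outs needed\n",
	       pages_to_free(target, 0.40));
	printf("at 20%% ratio: %.0f page-outs needed\n",
	       pages_to_free(target, 0.20));
	return 0;
}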

And yes, all of this depends on the number of high compression rate
pages.  So you cannot just set a non_swap limit on the system and get
everything.  You need to do a lot of testing around it to make sure the
non_swap limit is good for your system.

And I think using AOP_WRITEPAGE_ACTIVATE without kicking the page to a
special list will make the CPU too busy sometimes.
I did some tests before I kicked pages to a special list.  The shrink task
will be moved around, around and around, because the low compression rate
pages just move from one list to another a lot of times, again, again
and again.
And all these low compression rate pages always stay together.

Thanks,
Hui


> Thanks.


Re: [RFC 0/4] ZRAM: make it just store the high compression rate page

2016-08-25 Thread Hui Zhu
On Thu, Aug 25, 2016 at 2:09 PM, Sergey Senozhatsky
 wrote:
> Hello,
>
> On (08/22/16 16:25), Hui Zhu wrote:
>>
>> Current ZRAM just can store all pages even if the compression rate
>> of a page is really low.  So the compression rate of ZRAM is out of
>> control when it is running.
>> In my part, I did some test and record with ZRAM.  The compression rate
>> is about 40%.
>>
>> This series of patches make ZRAM can just store the page that the
>> compressed size is smaller than a value.
>> With these patches, I set the value to 2048 and did the same test with
>> before.  The compression rate is about 20%.  The times of lowmemorykiller
>> also decreased.
>
> I haven't looked at the patches in details yet. can you educate me a bit?
> is your test stable? why the number of lowmemorykill-s has decreased?
> ... or am reading "The times of lowmemorykiller also decreased" wrong?
>
> suppose you have X pages that result in bad compression size (from zram
> point of view). zram stores such pages uncompressed, IOW we have no memory
> savings - swapped out page lands in zsmalloc PAGE_SIZE class. now you
> don't try to store those pages in zsmalloc, but keep them as unevictable.
> so the page still occupies PAGE_SIZE; no memory saving again. why did it
> improve LMK?

No, zram will not save this page uncompressed with these patches.  It
will mark it as non-swap and kick it back to shrink_page_list.
shrink_page_list will remove this page from the swap cache and kick it to
the unevictable list.
Then this page will not be swapped again before it gets written.
That is why most of the code is around vmscan.c.
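
In toy form, the flow I mean looks roughly like this (a simplified
userspace sketch of the idea; the names and the 2048-byte limit are
illustrative, not the RFC code itself):

#include <stdio.h>
#include <stdbool.h>

enum lru { LRU_ANON, LRU_UNEVICTABLE };

struct page {
	bool non_swap;
	bool in_swapcache;
	enum lru lru;
	unsigned int compressed_size;
};

#define NON_SWAP_LIMIT 2048	/* example value from the cover letter */

/* zram's store path: refuse pages that compress poorly. */
static bool zram_store(struct page *p)
{
	if (p->compressed_size > NON_SWAP_LIMIT) {
		p->non_swap = true;	/* SetPageNonSwap() in the RFC */
		return false;
	}
	return true;
}

/* shrink_page_list()-like handling of the rejected page. */
static void shrink_one_page(struct page *p)
{
	p->in_swapcache = true;			/* added for pageout */
	if (!zram_store(p)) {
		p->in_swapcache = false;	/* drop from swap cache */
		p->lru = LRU_UNEVICTABLE;	/* parked until next write */
	}
}

static void page_written(struct page *p)
{
	p->non_swap = false;			/* becomes swappable again */
	p->lru = LRU_ANON;
}

int main(void)
{
	struct page p = { .lru = LRU_ANON, .compressed_size = 3000 };

	shrink_one_page(&p);
	printf("after reclaim: non_swap=%d lru=%s\n", p.non_swap,
	       p.lru == LRU_UNEVICTABLE ? "unevictable" : "anon");
	page_written(&p);
	printf("after write:   non_swap=%d lru=%s\n", p.non_swap,
	       p.lru == LRU_ANON ? "anon" : "unevictable");
	return 0;
}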

Thanks,
Hui

>
> -ss


Re: [RFC 0/4] ZRAM: make it just store the high compression rate page

2016-08-23 Thread Hui Zhu
Hi Minchan,

On Wed, Aug 24, 2016 at 9:04 AM, Minchan Kim  wrote:
> Hi Hui,
>
> On Mon, Aug 22, 2016 at 04:25:05PM +0800, Hui Zhu wrote:
>> Current ZRAM just can store all pages even if the compression rate
>> of a page is really low.  So the compression rate of ZRAM is out of
>> control when it is running.
>> In my part, I did some test and record with ZRAM.  The compression rate
>> is about 40%.
>>
>> This series of patches make ZRAM can just store the page that the
>> compressed size is smaller than a value.
>> With these patches, I set the value to 2048 and did the same test with
>> before.  The compression rate is about 20%.  The times of lowmemorykiller
>> also decreased.
>
> I have an interest about the feature for a long time but didn't work on it
> because I didn't have a good idea to implment it with generic approach
> without layer violation. I will look into this after handling urgent works.
>
> Thanks.

That will be great.  Thanks.

Best,
Hui


[RFC 0/4] ZRAM: make it just store the high compression rate page

2016-08-22 Thread Hui Zhu
Currently ZRAM stores all pages, even if the compression rate
of a page is really low.  So the compression rate of ZRAM is out of
control while it is running.
On my side, I did some tests and recorded them with ZRAM.  The compression
rate is about 40%.

This series of patches makes ZRAM store only the pages whose
compressed size is smaller than a configured value.
With these patches, I set the value to 2048 and ran the same test as
before.  The compression rate is about 20%.  The number of lowmemorykiller
invocations also decreased.
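
One way to see why the limit bounds the rate (illustrative arithmetic only,
assuming 4096-byte pages):

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	unsigned int limit = 2048;	/* the value used in the test above */

	/*
	 * Every page ZRAM still accepts compresses to at most `limit` bytes,
	 * so the pool-wide compressed/original rate can never exceed
	 * limit / PAGE_SIZE, no matter how incompressible the rejected
	 * pages were.
	 */
	printf("worst-case stored rate: %.0f%%\n",
	       100.0 * limit / PAGE_SIZE);
	return 0;
}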

Hui Zhu (4):
vmscan.c: shrink_page_list: unmap anon pages after pageout
Add non-swap page flag to mark a page will not swap
ZRAM: do not swap the pages that compressed size bigger than non_swap
vmscan.c: zram: add non swap support for shmem file pages

 drivers/block/zram/Kconfig |   11 +++
 drivers/block/zram/zram_drv.c  |   38 +++
 drivers/block/zram/zram_drv.h  |4 +
 fs/proc/meminfo.c  |6 +
 include/linux/mm_inline.h  |   20 +
 include/linux/mmzone.h |3 
 include/linux/page-flags.h |8 ++
 include/linux/rmap.h   |5 +
 include/linux/shmem_fs.h   |6 +
 include/trace/events/mmflags.h |9 ++
 kernel/events/uprobes.c|   16 
 mm/Kconfig |9 ++
 mm/memory.c|   34 ++
 mm/migrate.c   |4 +
 mm/mprotect.c  |8 ++
 mm/page_io.c   |   11 ++-
 mm/rmap.c  |   23 ++
 mm/shmem.c |   77 +-
 mm/vmscan.c|  139 +++--
 19 files changed, 387 insertions(+), 44 deletions(-)


[RFC 3/4] ZRAM: do not swap the page that compressed size bigger than non_swap

2016-08-22 Thread Hui Zhu
The new option ZRAM_NON_SWAP adds an interface "non_swap" to zram.
The user can set an unsigned int value in zram.
If a page's compressed size is bigger than the limit, it is marked as
non-swap.  Then this page will be added to the unevictable lru list.

This patch doesn't handle shmem file pages.

Signed-off-by: Hui Zhu 
---
 drivers/block/zram/Kconfig| 11 +++
 drivers/block/zram/zram_drv.c | 39 +++
 drivers/block/zram/zram_drv.h |  4 
 3 files changed, 54 insertions(+)

diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index b8ecba6..525caaa 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -13,3 +13,14 @@ config ZRAM
  disks and maybe many more.
 
  See zram.txt for more information.
+
+config ZRAM_NON_SWAP
+   bool "Enable zram non-swap support"
+   depends on ZRAM
+   select NON_SWAP
+   default n
+   help
+ This option add a interface "non_swap" to zram.  User can set
+ a unsigned int value to zram.
+ If a page that compressed size is bigger than limit, mark it as
+ non-swap.  Then this page will add to unevictable lru list.
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 04365b1..8f7f1ec 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -714,6 +714,14 @@ compress_again:
goto out;
}
 
+#ifdef CONFIG_ZRAM_NON_SWAP
+   if (!is_partial_io(bvec) && PageAnon(page) &&
+   zram->non_swap && clen > zram->non_swap) {
+   ret = 0;
+   SetPageNonSwap(page);
+   goto out;
+   }
+#endif
src = zstrm->buffer;
if (unlikely(clen > max_zpage_size)) {
clen = PAGE_SIZE;
@@ -1180,6 +1188,31 @@ static const struct block_device_operations zram_devops 
= {
.owner = THIS_MODULE
 };
 
+#ifdef CONFIG_ZRAM_NON_SWAP
+static ssize_t non_swap_show(struct device *dev,
+struct device_attribute *attr, char *buf)
+{
+   struct zram *zram = dev_to_zram(dev);
+
+   return scnprintf(buf, PAGE_SIZE, "%u\n", zram->non_swap);
+}
+
+static ssize_t non_swap_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t len)
+{
+   struct zram *zram = dev_to_zram(dev);
+
+   zram->non_swap = (unsigned int)memparse(buf, NULL);
+
+   if (zram->non_swap > max_zpage_size)
+   pr_warn("Nonswap should small than max_zpage_size %zu\n",
+   max_zpage_size);
+
+   return len;
+}
+#endif
+
 static DEVICE_ATTR_WO(compact);
 static DEVICE_ATTR_RW(disksize);
 static DEVICE_ATTR_RO(initstate);
@@ -1190,6 +1223,9 @@ static DEVICE_ATTR_RW(mem_limit);
 static DEVICE_ATTR_RW(mem_used_max);
 static DEVICE_ATTR_RW(max_comp_streams);
 static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_ZRAM_NON_SWAP
+static DEVICE_ATTR_RW(non_swap);
+#endif
 
 static struct attribute *zram_disk_attrs[] = {
&dev_attr_disksize.attr,
@@ -1210,6 +1246,9 @@ static struct attribute *zram_disk_attrs[] = {
&dev_attr_mem_used_max.attr,
&dev_attr_max_comp_streams.attr,
&dev_attr_comp_algorithm.attr,
+#ifdef CONFIG_ZRAM_NON_SWAP
+   &dev_attr_non_swap.attr,
+#endif
&dev_attr_io_stat.attr,
&dev_attr_mm_stat.attr,
&dev_attr_debug_stat.attr,
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 74fcf10..bd5f38a 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -119,5 +119,9 @@ struct zram {
 * zram is claimed so open request will be failed
 */
bool claim; /* Protected by bdev->bd_mutex */
+
+#ifdef CONFIG_ZRAM_NON_SWAP
+   unsigned int non_swap;
+#endif
 };
 #endif
-- 
1.9.1



[RFC 2/4] Add non-swap page flag to mark a page will not swap

2016-08-22 Thread Hui Zhu
After a page is marked with the non-swap flag in the swap driver, it will be
added to the unevictable lru list.
The page will be kept in this state until its data is changed.

Signed-off-by: Hui Zhu 
---
 fs/proc/meminfo.c  |  6 ++
 include/linux/mm_inline.h  | 20 ++--
 include/linux/mmzone.h |  3 +++
 include/linux/page-flags.h |  8 
 include/trace/events/mmflags.h |  9 -
 kernel/events/uprobes.c| 16 +++-
 mm/Kconfig |  5 +
 mm/memory.c| 34 ++
 mm/migrate.c   |  4 
 mm/mprotect.c  |  8 
 mm/vmscan.c| 41 -
 11 files changed, 149 insertions(+), 5 deletions(-)

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b9a8c81..5c79b2e 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -79,6 +79,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
"SwapTotal:  %8lu kB\n"
"SwapFree:   %8lu kB\n"
+#ifdef CONFIG_NON_SWAP
+   "NonSwap:%8lu kB\n"
+#endif
"Dirty:  %8lu kB\n"
"Writeback:  %8lu kB\n"
"AnonPages:  %8lu kB\n"
@@ -138,6 +141,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #endif
K(i.totalswap),
K(i.freeswap),
+#ifdef CONFIG_NON_SWAP
+   K(global_page_state(NR_NON_SWAP)),
+#endif
K(global_node_page_state(NR_FILE_DIRTY)),
K(global_node_page_state(NR_WRITEBACK)),
K(global_node_page_state(NR_ANON_MAPPED)),
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 71613e8..92298ce 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -46,15 +46,31 @@ static __always_inline void update_lru_size(struct lruvec 
*lruvec,
 static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
 {
-   update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
+   int nr_pages = hpage_nr_pages(page);
+   enum zone_type zid = page_zonenum(page);
+#ifdef CONFIG_NON_SWAP
+   if (PageNonSwap(page)) {
+   lru = LRU_UNEVICTABLE;
+   update_lru_size(lruvec, NR_NON_SWAP, zid, nr_pages);
+   }
+#endif
+   update_lru_size(lruvec, lru, zid, nr_pages);
list_add(&page->lru, &lruvec->lists[lru]);
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
 {
+   int nr_pages = hpage_nr_pages(page);
+   enum zone_type zid = page_zonenum(page);
+#ifdef CONFIG_NON_SWAP
+   if (PageNonSwap(page)) {
+   lru = LRU_UNEVICTABLE;
+   update_lru_size(lruvec, NR_NON_SWAP, zid, -nr_pages);
+   }
+#endif
list_del(&page->lru);
-   update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
+   update_lru_size(lruvec, lru, zid, -nr_pages);
 }
 
 /**
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d572b78..da08d20 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -138,6 +138,9 @@ enum zone_stat_item {
NUMA_OTHER, /* allocation from other node */
 #endif
NR_FREE_CMA_PAGES,
+#ifdef CONFIG_NON_SWAP
+   NR_NON_SWAP,
+#endif
NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74e4dda..0cd80db9 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -105,6 +105,9 @@ enum pageflags {
PG_young,
PG_idle,
 #endif
+#ifdef CONFIG_NON_SWAP
+   PG_non_swap,
+#endif
__NR_PAGEFLAGS,
 
/* Filesystems */
@@ -303,6 +306,11 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
 PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
 
+#ifdef CONFIG_NON_SWAP
+PAGEFLAG(NonSwap, non_swap, PF_NO_TAIL)
+   TESTSCFLAG(NonSwap, non_swap, PF_NO_TAIL)
+#endif
+
 #ifdef CONFIG_HIGHMEM
 /*
  * Must use a macro here due to header dependency issues. page_zone() is not
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5a81ab4..1c0ccc9 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -79,6 +79,12 @@
 #define IF_HAVE_PG_IDLE(flag,string)
 #endif
 
+#ifdef CONFIG_NON_SWAP
+#define IF_HAVE_PG_NON_SWAP(flag,string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_NON_SWAP(flag,string)
+#endif
+
 #define __def_pageflag_names   \
{1UL << PG_locked,  "locked"},  \
{1UL <&

[RFC 1/4] vmscan.c: shrink_page_list: unmap anon pages after pageout

2016-08-22 Thread Hui Zhu
The page is already unmapped when ZRAM gets the compressed size, as it has
been added to the swap cache.
Removing it from the swap cache would require setting each pte back to point
to the pfn, but there is no way to do that.

This patch sets each pte read-only before pageout.  Then, if the page is
written while its data is being saved to ZRAM, its pte will be set dirty.
After pageout, shrink_page_list will check the pte and re-dirty the page.
If pageout succeeded and the page is not dirty, the page is unmapped.

This patch doesn't handle shmem file pages, which use swap too.
The reason is that I only found a hacky way to make sure a page is a shmem
file page, so I separated the shmem file page code into the last patch of
this series.
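
A toy model of that read-only/dirty dance (pure userspace illustration with
software "pte" bits; the real patch drives this through the new
try_to_unmap() flags):

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

struct pte {
	bool write;
	bool dirty;
};

struct page {
	char data[8];
	bool mapped;
};

static void store_to_zram(const struct page *p, char *zram_copy)
{
	memcpy(zram_copy, p->data, sizeof(p->data));	/* pageout */
}

static void task_writes(struct page *p, struct pte *pte, const char *s)
{
	/* A write to a write-protected pte faults; the fault handler makes
	 * it writable again and sets the dirty bit. */
	pte->write = true;
	pte->dirty = true;
	strcpy(p->data, s);
}

int main(void)
{
	struct page page = { .data = "old", .mapped = true };
	struct pte pte = { .write = true, .dirty = true };
	char zram_copy[8];

	/* TTU_READONLY-like step: clear write permission before pageout. */
	pte.write = false;
	pte.dirty = false;

	store_to_zram(&page, zram_copy);

	/* Concurrent write while the pageout was in flight. */
	task_writes(&page, &pte, "new");

	/* TTU_CHECK_DIRTY-like step after pageout. */
	if (pte.dirty) {
		printf("pte dirty: keep page mapped, stale copy '%s'\n",
		       zram_copy);
	} else {
		page.mapped = false;
		printf("pte clean: safe to unmap, copy '%s' is current\n",
		       zram_copy);
	}
	return 0;
}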

Signed-off-by: Hui Zhu 
---
 include/linux/rmap.h |  5 
 mm/Kconfig   |  4 +++
 mm/page_io.c | 11 ---
 mm/rmap.c| 28 ++
 mm/vmscan.c  | 81 +---
 5 files changed, 108 insertions(+), 21 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b46bb56..4259c46 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -88,6 +88,11 @@ enum ttu_flags {
TTU_LZFREE = 8, /* lazy free mode */
TTU_SPLIT_HUGE_PMD = 16,/* split huge PMD if any */
 
+#ifdef CONFIG_LATE_UNMAP
+   TTU_CHECK_DIRTY = (1 << 5), /* Check dirty mode */
+   TTU_READONLY = (1 << 6),/* Change readonly mode */
+#endif
+
TTU_IGNORE_MLOCK = (1 << 8),/* ignore mlock */
TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
diff --git a/mm/Kconfig b/mm/Kconfig
index 78a23c5..57ecdb3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -704,3 +704,7 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
 config ARCH_HAS_PKEYS
bool
+
+config LATE_UNMAP
+   bool
+   depends on SWAP
diff --git a/mm/page_io.c b/mm/page_io.c
index 16bd82fa..adaf801 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -237,10 +237,13 @@ int swap_writepage(struct page *page, struct 
writeback_control *wbc)
 {
int ret = 0;
 
-   if (try_to_free_swap(page)) {
-   unlock_page(page);
-   goto out;
-   }
+#ifdef CONFIG_LATE_UNMAP
+   if (!(PageAnon(page) && page_mapped(page)))
+#endif
+   if (try_to_free_swap(page)) {
+   unlock_page(page);
+   goto out;
+   }
if (frontswap_store(page) == 0) {
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 1ef3640..d484f95 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1488,6 +1488,29 @@ static int try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
}
}
 
+#ifdef CONFIG_LATE_UNMAP
+   if ((flags & TTU_CHECK_DIRTY) || (flags & TTU_READONLY)) {
+   BUG_ON(!PageAnon(page));
+
+   pteval = *pte;
+
+   BUG_ON(pte_write(pteval) &&
+  page_mapcount(page) + page_swapcount(page) > 1);
+
+   if ((flags & TTU_CHECK_DIRTY) && pte_dirty(pteval)) {
+   set_page_dirty(page);
+   pteval = pte_mkclean(pteval);
+   }
+
+   if (flags & TTU_READONLY)
+   pteval = pte_wrprotect(pteval);
+
+   if (!pte_same(*pte, pteval))
+   set_pte_at(mm, address, pte, pteval);
+   goto out_unmap;
+   }
+#endif
+
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
if (should_defer_flush(mm, flags)) {
@@ -1657,6 +1680,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
else
ret = rmap_walk(page, &rwc);
 
+#ifdef CONFIG_LATE_UNMAP
+   if ((flags & (TTU_READONLY | TTU_CHECK_DIRTY)) &&
+   ret == SWAP_AGAIN)
+   ret = SWAP_SUCCESS;
+#endif
if (ret != SWAP_MLOCK && !page_mapcount(page)) {
ret = SWAP_SUCCESS;
if (rp.lazyfreed && !PageDirty(page))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 374d95d..32fef7d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -494,12 +494,19 @@ void drop_slab(void)
 
 static inline int is_page_cache_freeable(struct page *page)
 {
+   int count = page_count(page) - page_has_private(page);
+
+#ifdef CONFIG_LATE_UNMAP
+   if (PageAnon(page))
+   count -= page_mapcount(page);
+#endif
+
/*
 * A freeable page cache page is referenced only by the caller
 * that isolated the page, the page cache radix tree and
 * optional buffer heads at page->private.
 */
-   return page_count(page) - page_has_private(page) == 2;
+   return count == 2;
 }
 
 static int may_write_to_inode

[RFC 4/4] vmscan.c: zram: add non swap support for shmem file pages

2016-08-22 Thread Hui Zhu
This patch adds the full non-swap support for shmem file pages.
To make sure a page is a shmem file page, it checks mapping->a_ops == &shmem_aops.
I think this is really a hack.

There are not a lot of shmem file pages that will be swapped out.
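
As an illustration of the check, here is a small userspace analogue with
invented toy types (not the kernel structures): a page is recognized as shmem
only because its mapping's a_ops pointer compares equal to a known ops table,
i.e. a type test by pointer identity, which is why it feels like a hack.

#include <stdbool.h>
#include <stdio.h>

struct toy_aops { const char *name; };

static const struct toy_aops shmem_aops_toy = { "shmem" };
static const struct toy_aops file_aops_toy  = { "regular file" };

struct toy_mapping { const struct toy_aops *a_ops; };
struct toy_page { struct toy_mapping *mapping; };

/* Identify a page by comparing its a_ops pointer against the known table. */
static bool page_is_shmem(const struct toy_page *page)
{
        return page->mapping && page->mapping->a_ops == &shmem_aops_toy;
}

int main(void)
{
        struct toy_mapping shmem_map = { &shmem_aops_toy };
        struct toy_mapping file_map  = { &file_aops_toy };
        struct toy_page a = { &shmem_map };
        struct toy_page b = { &file_map };

        printf("a (%s) is shmem: %d\n", a.mapping->a_ops->name, page_is_shmem(&a));
        printf("b (%s) is shmem: %d\n", b.mapping->a_ops->name, page_is_shmem(&b));
        return 0;
}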

Signed-off-by: Hui Zhu 
---
 drivers/block/zram/zram_drv.c |  3 +-
 include/linux/shmem_fs.h  |  6 
 mm/page_io.c  |  2 +-
 mm/rmap.c |  5 ---
 mm/shmem.c| 77 ++-
 mm/vmscan.c   | 27 +++
 6 files changed, 89 insertions(+), 31 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 8f7f1ec..914c096 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -715,8 +715,7 @@ compress_again:
}
 
 #ifdef CONFIG_ZRAM_NON_SWAP
-   if (!is_partial_io(bvec) && PageAnon(page) &&
-   zram->non_swap && clen > zram->non_swap) {
+   if (!is_partial_io(bvec) && zram->non_swap && clen > zram->non_swap) {
ret = 0;
SetPageNonSwap(page);
goto out;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index ff078e7..fd44473 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -124,4 +124,10 @@ static inline bool shmem_huge_enabled(struct 
vm_area_struct *vma)
 }
 #endif
 
+extern const struct address_space_operations shmem_aops;
+
+#ifdef CONFIG_LATE_UNMAP
+extern void shmem_page_unmap(struct page *page);
+#endif
+
 #endif
diff --git a/mm/page_io.c b/mm/page_io.c
index adaf801..5fd3069 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -238,7 +238,7 @@ int swap_writepage(struct page *page, struct 
writeback_control *wbc)
int ret = 0;
 
 #ifdef CONFIG_LATE_UNMAP
-   if (!(PageAnon(page) && page_mapped(page)))
+   if (!page_mapped(page))
 #endif
if (try_to_free_swap(page)) {
unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index d484f95..418f731 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1490,13 +1490,8 @@ static int try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
 
 #ifdef CONFIG_LATE_UNMAP
if ((flags & TTU_CHECK_DIRTY) || (flags & TTU_READONLY)) {
-   BUG_ON(!PageAnon(page));
-
pteval = *pte;
 
-   BUG_ON(pte_write(pteval) &&
-  page_mapcount(page) + page_swapcount(page) > 1);
-
if ((flags & TTU_CHECK_DIRTY) && pte_dirty(pteval)) {
set_page_dirty(page);
pteval = pte_mkclean(pteval);
diff --git a/mm/shmem.c b/mm/shmem.c
index fd8b2b5..556d853 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -182,7 +182,6 @@ static inline void shmem_unacct_blocks(unsigned long flags, 
long pages)
 }
 
 static const struct super_operations shmem_ops;
-static const struct address_space_operations shmem_aops;
 static const struct file_operations shmem_file_operations;
 static const struct inode_operations shmem_inode_operations;
 static const struct inode_operations shmem_dir_inode_operations;
@@ -1178,6 +1177,55 @@ out:
return error;
 }
 
+#define SHMEM_WRITEPAGE_LOCK   \
+   do {\
+   mutex_lock(&shmem_swaplist_mutex);  \
+   if (list_empty(&info->swaplist))\
+   list_add_tail(&info->swaplist,  \
+ &shmem_swaplist); \
+   } while (0)
+
+#define SHMEM_WRITEPAGE_SWAP   \
+   do {\
+   spin_lock(&info->lock); \
+   shmem_recalc_inode(inode);  \
+   info->swapped++;\
+   spin_unlock(&info->lock);   \
+   swap_shmem_alloc(swap); \
+   shmem_delete_from_page_cache(page,  \
+swp_to_radix_entry(swap)); \
+   } while (0)
+
+#define SHMEM_WRITEPAGE_UNLOCK \
+   do {\
+   mutex_unlock(&shmem_swaplist_mutex);\
+   } while (0)
+
+#define SHMEM_WRITEPAGE_BUG_ON \
+   do {\
+   BUG_ON(page_mapped(page));  \
+   } while (0)
+
+#ifdef CONFIG_LATE_UNMAP
+void
+shmem_page_unmap(struct page *page)
+{
+   struct shmem_inode_info *info;
+   struct address_space *mapping;
+   struct inode *inode;
+

[PATCH 2/3] zsmalloc: make its page "PageMobile"

2015-11-27 Thread Hui Zhu
The idea of this patch is the same as in the previous version [1], but it now
uses the migration framework from [2].

[1] http://comments.gmane.org/gmane.linux.kernel.mm/140014
[2] https://lkml.org/lkml/2015/7/7/21

Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 214 --
 1 file changed, 209 insertions(+), 5 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 57c91a5..5034aac 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -53,10 +53,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
 
 /*
  * This must be power of 2 and greater than of equal to sizeof(link_free).
@@ -217,6 +220,8 @@ struct size_class {
 
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
bool huge;
+
+   atomic_t count;
 };
 
 /*
@@ -281,6 +286,10 @@ struct zs_migration {
 #define ZS_MIGRATION(p) ((struct zs_migration *)((p)->freelist))
 #define ZS_META(p) ((struct zs_meta *)&(ZS_MIGRATION(p)->index))
 
+static struct inode *zs_inode;
+static DEFINE_SPINLOCK(zs_migration_lock);
+static DEFINE_RWLOCK(zs_tag_rwlock);
+
 struct mapping_area {
 #ifdef CONFIG_PGTABLE_MAPPING
struct vm_struct *vm; /* vm area for mapping object that span pages */
@@ -307,7 +316,7 @@ static void destroy_handle_cache(struct zs_pool *pool)
 static unsigned long alloc_handle(struct zs_pool *pool)
 {
return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-   pool->flags & ~__GFP_HIGHMEM);
+   pool->flags & ~(__GFP_HIGHMEM | __GFP_MOVABLE));
 }
 
 static void free_handle(struct zs_pool *pool, unsigned long handle)
@@ -914,9 +923,12 @@ static void reset_page(struct page *page)
clear_bit(PG_private, &page->flags);
clear_bit(PG_private_2, &page->flags);
set_page_private(page, 0);
-   free_migration(page->freelist);
-   page->freelist = NULL;
+   if (page->freelist) {
+   free_migration(page->freelist);
+   page->freelist = NULL;
+   }
page_mapcount_reset(page);
+   page->mapping = NULL;
 }
 
 static void free_zspage(struct page *first_page)
@@ -927,6 +939,8 @@ static void free_zspage(struct page *first_page)
BUG_ON(!is_first_page(first_page));
BUG_ON(get_inuse_obj(first_page));
 
+   spin_lock(&zs_migration_lock);
+
head_extra = (struct page *)page_private(first_page);
 
reset_page(first_page);
@@ -934,7 +948,7 @@ static void free_zspage(struct page *first_page)
 
/* zspage with only 1 system page */
if (!head_extra)
-   return;
+   goto out;
 
list_for_each_entry_safe(nextm, tmp, &ZS_MIGRATION(head_extra)->lru,
 lru) {
@@ -945,6 +959,9 @@ static void free_zspage(struct page *first_page)
}
reset_page(head_extra);
__free_page(head_extra);
+
+out:
+   spin_unlock(&zs_migration_lock);
 }
 
 /* Initialize a newly allocated zspage */
@@ -1018,6 +1035,7 @@ static struct page *alloc_zspage(struct size_class 
*class, gfp_t flags)
page = alloc_page(flags);
if (!page)
goto cleanup;
+   page->mapping = zs_inode->i_mapping;
page->freelist = alloc_migration(flags);
if (!page->freelist) {
__free_page(page);
@@ -1327,6 +1345,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long 
handle,
BUG_ON(in_interrupt());
 
/* From now on, migration cannot move the object */
+   read_lock(&zs_tag_rwlock);
pin_tag(handle);
 
obj = handle_to_obj(handle);
@@ -1395,6 +1414,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long 
handle)
}
put_cpu_var(zs_map_area);
unpin_tag(handle);
+   read_unlock(&zs_tag_rwlock);
 }
 EXPORT_SYMBOL_GPL(zs_unmap_object);
 
@@ -1431,6 +1451,16 @@ static unsigned long obj_malloc(struct page *first_page,
 }
 
 
+static void set_zspage_mobile(struct size_class *class, struct page *page)
+{
+   BUG_ON(!is_first_page(page));
+
+   while (page) {
+   __SetPageMobile(page);
+   page = get_next_page(page);
+   }
+}
+
 /**
  * zs_malloc - Allocate block of given size from pool.
  * @pool: pool to allocate from
@@ -1474,6 +1504,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
&pool->pages_allocated);
 
spin_lock(&class->lock);
+   set_zspage_mobile(class, first_page);
zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
class->size, class->pages_per_zspage));
}
@@ -1526,6 +1557,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
if (unlikely(!handle))
 

[PATCH v3 0/3] zsmalloc: make its pages can be migrated

2015-11-27 Thread Hui Zhu
These patches are updated according to the review of the previous version [1].
They are based on "[RFCv3 0/5] enable migration of driver pages" [2]
and "[RFC zsmalloc 0/4] meta diet" [3].

Hui Zhu (3):
zsmalloc: make struct can move
zsmalloc: mark its page "PageMobile"
zram: make create "__GFP_MOVABLE" pool

[1] http://comments.gmane.org/gmane.linux.kernel.mm/140014
[2] https://lkml.org/lkml/2015/7/7/21
[3] https://lkml.org/lkml/2015/8/10/90

 drivers/block/zram/zram_drv.c |4 
 mm/zsmalloc.c |  392 +-
 2 files changed, 316 insertions(+), 80 deletions(-)


[PATCH 1/3] zsmalloc: make struct can be migrated

2015-11-27 Thread Hui Zhu
After "[RFC zsmalloc 0/4] meta diet" [1], the struct it close to
be migrated.
But the LRU is still used.  And to use the migration frame in [2], need
a way to get class through page struct.
So this patch add a new struct zs_migration and store it in struct page.

[1] https://lkml.org/lkml/2015/8/10/90
[2] https://lkml.org/lkml/2015/7/7/21

Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 178 ++
 1 file changed, 104 insertions(+), 74 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 1b18144..57c91a5 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -17,10 +17,10 @@
  *
  * Usage of struct page fields:
  * page->first_page: points to the first component (0-order) page
- * page->index (union with page->freelist): offset of the first object
- * starting in this page.
- * page->lru: links together all component pages (except the first page)
- * of a zspage
+ * ZS_MIGRATION(page)->index: offset of the first object starting in
+ * this page
+ * ZS_MIGRATION(page)->lru: links together all component pages (except
+ * the first page) of a zspage
  *
  * For _first_ page only:
  *
@@ -28,9 +28,9 @@
  * component page after the first page
  * If the page is first_page for huge object, it stores handle.
  * Look at size_class->huge.
- * page->lru: links together first pages of various zspages.
+ * ZS_MIGRATION(page)->lru: links together first pages of various zspages.
  * Basically forming list of zspages in a fullness group.
- * page->freelist: override by struct zs_meta
+ * ZS_MIGRATION(page)->index: override by struct zs_meta
  *
  * Usage of struct page flags:
  * PG_private: identifies the first component page
@@ -136,7 +136,7 @@
 #define INUSE_BITS 11
 #define INUSE_MASK ((1 << INUSE_BITS) - 1)
 #define ETC_BITS   ((sizeof(unsigned long) * 8) - FREE_OBJ_IDX_BITS - \
-   CLASS_IDX_BITS - FULLNESS_BITS - INUSE_BITS)
+   FULLNESS_BITS - INUSE_BITS)
 /*
  * On systems with 4K page size, this gives 255 size classes! There is a
  * trader-off here:
@@ -266,12 +266,21 @@ struct zs_pool {
  */
 struct zs_meta {
unsigned long free_idx:FREE_OBJ_IDX_BITS;
-   unsigned long class_idx:CLASS_IDX_BITS;
unsigned long fullness:FULLNESS_BITS;
unsigned long inuse:INUSE_BITS;
unsigned long etc:ETC_BITS;
 };
 
+struct zs_migration {
+   unsigned long index;
+   struct size_class *class;
+   struct list_head lru;
+   struct page *page;
+};
+
+#define ZS_MIGRATION(p) ((struct zs_migration *)((p)->freelist))
+#define ZS_META(p) ((struct zs_meta *)&(ZS_MIGRATION(p)->index))
+
 struct mapping_area {
 #ifdef CONFIG_PGTABLE_MAPPING
struct vm_struct *vm; /* vm area for mapping object that span pages */
@@ -311,6 +320,19 @@ static void record_obj(unsigned long handle, unsigned long 
obj)
*(unsigned long *)handle = obj;
 }
 
+struct kmem_cache *zs_migration_cachep;
+
+static struct migration *alloc_migration(gfp_t flags)
+{
+   return (struct migration *)kmem_cache_alloc(zs_migration_cachep,
+   flags & ~__GFP_HIGHMEM);
+}
+
+static void free_migration(struct migration *migration)
+{
+   kmem_cache_free(zs_migration_cachep, (void *)migration);
+}
+
 /* zpool driver */
 
 #ifdef CONFIG_ZPOOL
@@ -414,7 +436,7 @@ static int get_inuse_obj(struct page *page)
 
BUG_ON(!is_first_page(page));
 
-   m = (struct zs_meta *)&page->freelist;
+   m = ZS_META(page);
 
return m->inuse;
 }
@@ -425,48 +447,22 @@ static void set_inuse_obj(struct page *page, int inc)
 
BUG_ON(!is_first_page(page));
 
-   m = (struct zs_meta *)&page->freelist;
+   m = ZS_META(page);
m->inuse += inc;
 }
 
 static void set_free_obj_idx(struct page *first_page, int idx)
 {
-   struct zs_meta *m = (struct zs_meta *)&first_page->freelist;
+   struct zs_meta *m = ZS_META(first_page);
m->free_idx = idx;
 }
 
 static unsigned long get_free_obj_idx(struct page *first_page)
 {
-   struct zs_meta *m = (struct zs_meta *)&first_page->freelist;
+   struct zs_meta *m = ZS_META(first_page);
return m->free_idx;
 }
 
-static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
-   enum fullness_group *fullness)
-{
-   struct zs_meta *m;
-   BUG_ON(!is_first_page(page));
-
-   m = (struct zs_meta *)&page->freelist;
-   *fullness = m->fullness;
-   *class_idx = m->class_idx;
-}
-
-static void set_zspage_mapping(struct page *page, unsigned int class_idx,
-   enum fullness_group fullness)
-{
-   struct zs_meta *m;
-
-   BUG_ON(!is_first_page(page));
-
-   BUG_ON(

[PATCH 3/3] zram: make create "__GFP_MOVABLE" pool

2015-11-27 Thread Hui Zhu
Change the flags passed to zs_create_pool so that zram allocates movable
zsmalloc pages.

Signed-off-by: Hui Zhu 
---
 drivers/block/zram/zram_drv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9fa15bb..8f3f524 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -514,7 +514,9 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, 
u64 disksize)
goto out_error;
}
 
-   meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
+   meta->mem_pool
+   = zs_create_pool(pool_name,
+GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE);
if (!meta->mem_pool) {
pr_err("Error creating memory pool\n");
goto out_error;
-- 
1.9.1



Re: [RFC v2 1/3] migrate: new struct migration and add it to struct page

2015-10-19 Thread Hui Zhu
On Thu, Oct 15, 2015 at 5:53 PM, Minchan Kim  wrote:
> On Thu, Oct 15, 2015 at 11:27:15AM +0200, Vlastimil Babka wrote:
>> On 10/15/2015 11:09 AM, Hui Zhu wrote:
>> >I got that add function interfaces is really not a good idea.
>> >So I add a new struct migration to put all migration interfaces and add
>> >this struct to struct page as union of "mapping".
>>
>> That's better, but not as flexible as the previously proposed
>> approaches that Sergey pointed you at:
>>
>>  http://lkml.iu.edu/hypermail/linux/kernel/1507.0/03233.html
>>  http://lkml.iu.edu/hypermail/linux/kernel/1508.1/00696.html
>>
>> There the operations are reachable via mapping, so we can support
>> the special operations migration also when mapping is otherwise
>> needed; your patch excludes mapping.
>>
>
> Hello Hui,
>
> FYI, I take over the work from Gioh and have a plan to improve the work.
> So, Could you wait a bit? Of course, if you have better idea, feel free
> to post it.
>
> Thanks.

Hi Minchan and Vlastimil,

If you don't mind, I want to wait for those patches and focus on the
page-movable part of zsmalloc.
What do you think about it?

Best,
Hui


Re: [PATCH] zsmalloc: remove unless line in obj_free

2015-10-13 Thread Hui Zhu
Thanks.  I will post a new version later.

Best,
Hui

On Tue, Oct 13, 2015 at 4:00 PM, Sergey Senozhatsky
 wrote:
> On (10/13/15 14:31), Hui Zhu wrote:
>> Signed-off-by: Hui Zhu 
>
> s/unless/useless/
>
> other than that
>
> Reviewed-by: Sergey Senozhatsky 
>
> -ss
>
>> ---
>>  mm/zsmalloc.c | 3 ---
>>  1 file changed, 3 deletions(-)
>>
>> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
>> index f135b1b..c7338f0 100644
>> --- a/mm/zsmalloc.c
>> +++ b/mm/zsmalloc.c
>> @@ -1428,8 +1428,6 @@ static void obj_free(struct zs_pool *pool, struct 
>> size_class *class,
>>   struct page *first_page, *f_page;
>>   unsigned long f_objidx, f_offset;
>>   void *vaddr;
>> - int class_idx;
>> - enum fullness_group fullness;
>>
>>   BUG_ON(!obj);
>>
>> @@ -1437,7 +1435,6 @@ static void obj_free(struct zs_pool *pool, struct 
>> size_class *class,
>>   obj_to_location(obj, &f_page, &f_objidx);
>>   first_page = get_first_page(f_page);
>>
>> - get_zspage_mapping(first_page, &class_idx, &fullness);
>>   f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
>>
>>   vaddr = kmap_atomic(f_page);
>> --
>> 1.9.1
>>


[PATCH v2] zsmalloc: remove useless line in obj_free

2015-10-13 Thread Hui Zhu
Signed-off-by: Hui Zhu 
Reviewed-by: Sergey Senozhatsky 
---
 mm/zsmalloc.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index f135b1b..c7338f0 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1428,8 +1428,6 @@ static void obj_free(struct zs_pool *pool, struct 
size_class *class,
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
-   int class_idx;
-   enum fullness_group fullness;
 
BUG_ON(!obj);
 
@@ -1437,7 +1435,6 @@ static void obj_free(struct zs_pool *pool, struct 
size_class *class,
obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
 
-   get_zspage_mapping(first_page, &class_idx, &fullness);
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
 
vaddr = kmap_atomic(f_page);
-- 
1.9.1



[PATCH] zsmalloc: remove unless line in obj_free

2015-10-12 Thread Hui Zhu
Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index f135b1b..c7338f0 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1428,8 +1428,6 @@ static void obj_free(struct zs_pool *pool, struct 
size_class *class,
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
-   int class_idx;
-   enum fullness_group fullness;
 
BUG_ON(!obj);
 
@@ -1437,7 +1435,6 @@ static void obj_free(struct zs_pool *pool, struct 
size_class *class,
obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
 
-   get_zspage_mapping(first_page, &class_idx, &fullness);
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
 
vaddr = kmap_atomic(f_page);
-- 
1.9.1



[PATCH v2] zsmalloc: fix obj_to_head use page_private(page) as value but not pointer

2015-10-06 Thread Hui Zhu
In function obj_malloc:
if (!class->huge)
/* record handle in the header of allocated chunk */
link->handle = handle;
else
/* record handle in first_page->private */
set_page_private(first_page, handle);
For the huge class, the page saves the handle to page->private directly.

But in obj_to_head:
if (class->huge) {
VM_BUG_ON(!is_first_page(page));
return *(unsigned long *)page_private(page);
} else
return *(unsigned long *)obj;
Here page_private(page) is used as a pointer.

The reason why there is no problem until now is that a huge-class page is
born with ZS_FULL, so it cannot be migrated.
Therefore, it shouldn't be a real bug in practice.
However, we need this patch for the future work "VM-aware zsmalloced
page migration" to reduce external fragmentation.
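
A minimal standalone sketch of the mix-up, with toy names rather than the real
zsmalloc types: when the handle itself is stored in the private field, reading
it back must treat the field as a value; dereferencing it as a pointer would
read from whatever address the handle happens to encode.

#include <stdio.h>

/* Toy stand-ins: not the real struct page / zsmalloc types. */
struct fake_page {
        unsigned long private;          /* stands in for page->private */
};

unsigned long obj_to_head_buggy(struct fake_page *page)
{
        return *(unsigned long *)page->private; /* wrong: dereferences the handle */
}

unsigned long obj_to_head_fixed(struct fake_page *page)
{
        return page->private;                   /* right: the handle is the value */
}

int main(void)
{
        struct fake_page page;
        unsigned long handle = 0x12345678;

        /* What obj_malloc does for a huge class: store the handle itself. */
        page.private = handle;

        printf("fixed: %#lx\n", obj_to_head_fixed(&page));
        /* obj_to_head_buggy(&page) would load from address 0x12345678 and crash. */
        return 0;
}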

Signed-off-by: Hui Zhu 
Acked-by: Minchan Kim 
---
 mm/zsmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index f135b1b..e881d4f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -824,7 +824,7 @@ static unsigned long obj_to_head(struct size_class *class, 
struct page *page,
 {
if (class->huge) {
VM_BUG_ON(!is_first_page(page));
-   return *(unsigned long *)page_private(page);
+   return page_private(page);
} else
return *(unsigned long *)obj;
 }
-- 
1.9.1



Re: [PATCH] zsmalloc: fix obj_to_head use page_private(page) as value but not pointer

2015-10-06 Thread Hui Zhu
On Tue, Oct 6, 2015 at 9:54 PM, Minchan Kim  wrote:
> Hello,
>
> On Mon, Oct 05, 2015 at 04:23:01PM +0800, Hui Zhu wrote:
>> In function obj_malloc:
>>   if (!class->huge)
>>   /* record handle in the header of allocated chunk */
>>   link->handle = handle;
>>   else
>>   /* record handle in first_page->private */
>>   set_page_private(first_page, handle);
>> The huge's page save handle to private directly.
>>
>> But in obj_to_head:
>>   if (class->huge) {
>>   VM_BUG_ON(!is_first_page(page));
>>   return page_private(page);
>
> Typo.
> return *(unsigned long*)page_private(page);
>
> Please fix the description.
>
>>   } else
>>   return *(unsigned long *)obj;
>> It is used as a pointer.
>>
>> So change obj_to_head use page_private(page) as value but not pointer
>> in obj_to_head.
>
> The reason why there is no problem until now is huge-class page is
> born with ZS_FULL so it couldn't be migrated.
> Therefore, it shouldn't be real bug in practice.
> However, we need this patch for future-work "VM-aware zsmalloced
> page migration" to reduce external fragmentation.
>
>>
>> Signed-off-by: Hui Zhu 
>
> With fixing the comment,
>
> Acked-by: Minchan Kim 
>
> Thanks for the fix, Hui.
>

Thanks!  I will post a new version.

Best,
Hui

> --
> Kind regards,
> Minchan Kim


[PATCH] zsmalloc: fix obj_to_head use page_private(page) as value but not pointer

2015-10-05 Thread Hui Zhu
In function obj_malloc:
if (!class->huge)
/* record handle in the header of allocated chunk */
link->handle = handle;
else
/* record handle in first_page->private */
set_page_private(first_page, handle);
For the huge class, the page saves the handle to page->private directly.

But in obj_to_head:
if (class->huge) {
VM_BUG_ON(!is_first_page(page));
return page_private(page);
} else
return *(unsigned long *)obj;
Here page_private(page) is used as a pointer.

So change obj_to_head to use page_private(page) as a value rather than as a
pointer.

Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index f135b1b..e881d4f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -824,7 +824,7 @@ static unsigned long obj_to_head(struct size_class *class, 
struct page *page,
 {
if (class->huge) {
VM_BUG_ON(!is_first_page(page));
-   return *(unsigned long *)page_private(page);
+   return page_private(page);
} else
return *(unsigned long *)obj;
 }
-- 
1.9.1



[PATCH v2] zsmalloc: add comments for ->inuse to zspage

2015-09-23 Thread Hui Zhu
Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index f135b1b..f62f2fb 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -38,6 +38,7 @@
  * page->lru: links together first pages of various zspages.
  * Basically forming list of zspages in a fullness group.
  * page->mapping: class index and fullness group of the zspage
+ * page->inuse: the objects number that is used in this zspage
  *
  * Usage of struct page flags:
  * PG_private: identifies the first component page
-- 
1.9.1



[PATCH] zsmalloc: add comments for ->inuse to zspage

2015-09-21 Thread Hui Zhu
Signed-off-by: Hui Zhu 
---
 mm/zsmalloc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index f135b1b..1f66d5b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -38,6 +38,7 @@
  * page->lru: links together first pages of various zspages.
  * Basically forming list of zspages in a fullness group.
  * page->mapping: class index and fullness group of the zspage
+ * page->inuse: the pages number that is used in this zspage
  *
  * Usage of struct page flags:
  * PG_private: identifies the first component page
-- 
1.9.1



Call for Topics and Sponsors

2015-06-25 Thread Hui Zhu
*
Call for Topics and Sponsors

Workshop on Open Source Development Tools 2015
Beijing, China
Sep. 12, 2015 (TBD)
HelloGCC Work Group (www.hellogcc.org)
*
The Open Source Development Tools Workshop is a meeting for open
source software developers. You can share your work, studies, and
learning experience of open source software development here.
Our main topic is open source development tools.

The content of topics can be:
* GNU toolchain (gcc, binutils, gdb, etc)
* Clang/LLVM toolchain
* Other tools of open source development, debug and simulation

The form of topics can be:
* the introduction of your own work
* the introduction of your work did in the past
* tutorial, experience and etc
* other forms of presentation, such as lightning talk

If you have some topics, please contact us:
* send email to hello...@freelists.org (need to subscribe
http://www.freelists.org/list/hellogcc first)
* login into freenode IRC #hellogcc room

Important Date:
* the deadline of topics and sponsors solicitation: Aug 1st, 2015

Previous Meetings:
* OSDT 2014: http://www.hellogcc.org/?p=33910
* HelloGCC 2013: http://www.hellogcc.org/?p=33518
* HelloGCC 2012: http://linux.chinaunix.net/hellogcc2012
* HelloGCC 2011: http://linux.chinaunix.net/hellogcc2011
* HelloGCC 2010: http://linux.chinaunix.net/hellogcc2010
* HelloGCC 2009: http://www.aka-kernel.org/news/hellogcc/index.html

If you want to sponsor us, we would very much appreciate it; please contact us via
hellogcc.workgr...@gmail.com


Re: [PATCH v2] CMA: page_isolation: check buddy before access it

2015-05-06 Thread Hui Zhu
On Wed, May 6, 2015 at 2:28 PM, Joonsoo Kim  wrote:
> On Tue, May 05, 2015 at 11:22:59AM +0800, Hui Zhu wrote:
>> Change pfn_present to pfn_valid_within according to the review of Laura.
>>
>> I got a issue:
>> [  214.294917] Unable to handle kernel NULL pointer dereference at virtual 
>> address 082a
>> [  214.303013] pgd = cc97
>> [  214.305721] [082a] *pgd=
>> [  214.309316] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
>> [  214.335704] PC is at get_pageblock_flags_group+0x5c/0xb0
>> [  214.341030] LR is at unset_migratetype_isolate+0x148/0x1b0
>> [  214.346523] pc : []lr : []psr: 8093
>> [  214.346523] sp : c7029d00  ip : 0105  fp : c7029d1c
>> [  214.358005] r10: 0001  r9 : 000a  r8 : 0004
>> [  214.363231] r7 : 6013  r6 : 00a4  r5 : c0a357e4  r4 : 
>> [  214.369761] r3 : 0826  r2 : 0002  r1 :   r0 : 003f
>> [  214.376291] Flags: Nzcv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment 
>> user
>> [  214.383516] Control: 10c5387d  Table: 2cb7006a  DAC: 0015
>> [  214.949720] Backtrace:
>> [  214.952192] [] (get_pageblock_flags_group+0x0/0xb0) from 
>> [] (unset_migratetype_isolate+0x148/0x1b0)
>> [  214.962978]  r7:6013 r6:c0a357c0 r5:c0a357e4 r4:c1555000
>> [  214.968693] [] (unset_migratetype_isolate+0x0/0x1b0) from 
>> [] (undo_isolate_page_range+0xd0/0xdc)
>> [  214.979222] [] (undo_isolate_page_range+0x0/0xdc) from 
>> [] (__alloc_contig_range+0x254/0x34c)
>> [  214.989398]  r9:000abc00 r8:c7028000 r7:000b1f53 r6:000b3e00 r5:0005
>> r4:c7029db4
>> [  214.997308] [] (__alloc_contig_range+0x0/0x34c) from 
>> [] (alloc_contig_range+0x14/0x18)
>> [  215.006973] [] (alloc_contig_range+0x0/0x18) from [] 
>> (dma_alloc_from_contiguous_addr+0x1ac/0x304)
>>
>> This issue is because when call unset_migratetype_isolate to unset a part
>> of CMA memory, it try to access the buddy page to get its status:
>>   if (order >= pageblock_order) {
>>   page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
>>   buddy_idx = __find_buddy_index(page_idx, order);
>>   buddy = page + (buddy_idx - page_idx);
>>
>>       if (!is_migrate_isolate_page(buddy)) {
>> But the begin addr of this part of CMA memory is very close to a part of
>> memory that is reserved in the boot time (not in buddy system).
>> So add a check before access it.
>>
>> Signed-off-by: Hui Zhu 
>> ---
>>  mm/page_isolation.c | 3 ++-
>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>> index 755a42c..eb22d1f 100644
>> --- a/mm/page_isolation.c
>> +++ b/mm/page_isolation.c
>> @@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, 
>> unsigned migratetype)
>>   buddy_idx = __find_buddy_index(page_idx, order);
>>   buddy = page + (buddy_idx - page_idx);
>>
>> - if (!is_migrate_isolate_page(buddy)) {
>> + if (!pfn_valid_within(page_to_pfn(buddy))
>> + || !is_migrate_isolate_page(buddy)) {
>>   __isolate_free_page(page, order);
>>   kernel_map_pages(page, (1 << order), 1);
>>   set_page_refcounted(page);
>
> Hello,
>
> This isolation is for merging buddy pages. If buddy is not valid, we
> don't need to isolate page, because we can't merge them.
> I think that correct code would be:
>
> pfn_valid_within(page_to_pfn(buddy)) &&
> !is_migrate_isolate_page(buddy)
>
> But, isolation and free here is safe operation so your code will work
> fine.
>

Oops!  I posted a new version for the patch.

Thanks,
Hui

> Thanks.


[PATCH v3] CMA: page_isolation: check buddy before access it

2015-05-06 Thread Hui Zhu
Changelog:
v3, Change the behavior according to the review of Joonsoo.
v2, Change pfn_present to pfn_valid_within according to the review of Laura.

I hit the following issue:
[  214.294917] Unable to handle kernel NULL pointer dereference at virtual 
address 082a
[  214.303013] pgd = cc97
[  214.305721] [082a] *pgd=
[  214.309316] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
[  214.335704] PC is at get_pageblock_flags_group+0x5c/0xb0
[  214.341030] LR is at unset_migratetype_isolate+0x148/0x1b0
[  214.346523] pc : []lr : []psr: 8093
[  214.346523] sp : c7029d00  ip : 0105  fp : c7029d1c
[  214.358005] r10: 0001  r9 : 000a  r8 : 0004
[  214.363231] r7 : 6013  r6 : 00a4  r5 : c0a357e4  r4 : 
[  214.369761] r3 : 0826  r2 : 0002  r1 :   r0 : 003f
[  214.376291] Flags: Nzcv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment 
user
[  214.383516] Control: 10c5387d  Table: 2cb7006a  DAC: 0015
[  214.949720] Backtrace:
[  214.952192] [] (get_pageblock_flags_group+0x0/0xb0) from 
[] (unset_migratetype_isolate+0x148/0x1b0)
[  214.962978]  r7:6013 r6:c0a357c0 r5:c0a357e4 r4:c1555000
[  214.968693] [] (unset_migratetype_isolate+0x0/0x1b0) from 
[] (undo_isolate_page_range+0xd0/0xdc)
[  214.979222] [] (undo_isolate_page_range+0x0/0xdc) from 
[] (__alloc_contig_range+0x254/0x34c)
[  214.989398]  r9:000abc00 r8:c7028000 r7:000b1f53 r6:000b3e00 r5:0005
r4:c7029db4
[  214.997308] [] (__alloc_contig_range+0x0/0x34c) from [] 
(alloc_contig_range+0x14/0x18)
[  215.006973] [] (alloc_contig_range+0x0/0x18) from [] 
(dma_alloc_from_contiguous_addr+0x1ac/0x304)

This issue happens because when unset_migratetype_isolate is called to unset a part
of CMA memory, it tries to access the buddy page to get its status:
if (order >= pageblock_order) {
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);

if (!is_migrate_isolate_page(buddy)) {
But the start address of this part of CMA memory is very close to a part of
memory that is reserved at boot time (not in the buddy system).
So add a check before accessing it.
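
For illustration, a small userspace sketch of the guard (TOY_MAX_ORDER,
FIRST_VALID_PFN and the helper names are invented stand-ins, not the kernel
implementation): the buddy pfn is found by flipping one order bit, and it can
land in a region reserved at boot with no usable struct page, so it has to be
validated before it is inspected.

#include <stdbool.h>
#include <stdio.h>

#define TOY_MAX_ORDER   11

/* Pretend pfns below this were reserved at boot and have no usable struct page. */
#define FIRST_VALID_PFN 0xa00UL

static bool toy_pfn_valid_within(unsigned long pfn)
{
        return pfn >= FIRST_VALID_PFN;
}

/* Same math as __find_buddy_index: the buddy differs in exactly one order bit. */
static unsigned long toy_buddy_pfn(unsigned long pfn, unsigned int order)
{
        unsigned long page_idx = pfn & ((1UL << TOY_MAX_ORDER) - 1);
        unsigned long buddy_idx = page_idx ^ (1UL << order);

        return pfn - page_idx + buddy_idx;
}

int main(void)
{
        unsigned long pfn = 0xc00;      /* a valid CMA pfn just above the reserved hole */
        unsigned int order = 10;
        unsigned long buddy = toy_buddy_pfn(pfn, order);

        if (!toy_pfn_valid_within(buddy))
                printf("buddy pfn %#lx is invalid: skip is_migrate_isolate_page()\n", buddy);
        else
                printf("buddy pfn %#lx is valid: safe to inspect\n", buddy);
        return 0;
}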

Suggested-by: Laura Abbott 
Suggested-by: Joonsoo Kim 
Signed-off-by: Hui Zhu 
---
 mm/page_isolation.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 755a42c..4a5624c 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, unsigned 
migratetype)
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
 
-   if (!is_migrate_isolate_page(buddy)) {
+   if (pfn_valid_within(page_to_pfn(buddy))
+   && !is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
kernel_map_pages(page, (1 << order), 1);
set_page_refcounted(page);
-- 
1.9.1



Re: [PATCH v2] CMA: page_isolation: check buddy before access it

2015-05-05 Thread Hui Zhu
On Wed, May 6, 2015 at 5:29 AM, Andrew Morton  wrote:
> On Tue, 5 May 2015 11:22:59 +0800 Hui Zhu  wrote:
>
>> Change pfn_present to pfn_valid_within according to the review of Laura.
>>
>> I got a issue:
>> [  214.294917] Unable to handle kernel NULL pointer dereference at virtual 
>> address 082a
>> [  214.303013] pgd = cc97
>> [  214.305721] [082a] *pgd=
>> [  214.309316] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
>> [  214.335704] PC is at get_pageblock_flags_group+0x5c/0xb0
>> [  214.341030] LR is at unset_migratetype_isolate+0x148/0x1b0
>> [  214.346523] pc : []lr : []psr: 8093
>> [  214.346523] sp : c7029d00  ip : 0105  fp : c7029d1c
>> [  214.358005] r10: 0001  r9 : 000a  r8 : 0004
>> [  214.363231] r7 : 6013  r6 : 00a4  r5 : c0a357e4  r4 : 
>> [  214.369761] r3 : 0826  r2 : 0002  r1 :   r0 : 003f
>> [  214.376291] Flags: Nzcv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment 
>> user
>> [  214.383516] Control: 10c5387d  Table: 2cb7006a  DAC: 0015
>> [  214.949720] Backtrace:
>> [  214.952192] [] (get_pageblock_flags_group+0x0/0xb0) from 
>> [] (unset_migratetype_isolate+0x148/0x1b0)
>> [  214.962978]  r7:6013 r6:c0a357c0 r5:c0a357e4 r4:c1555000
>> [  214.968693] [] (unset_migratetype_isolate+0x0/0x1b0) from 
>> [] (undo_isolate_page_range+0xd0/0xdc)
>> [  214.979222] [] (undo_isolate_page_range+0x0/0xdc) from 
>> [] (__alloc_contig_range+0x254/0x34c)
>> [  214.989398]  r9:000abc00 r8:c7028000 r7:000b1f53 r6:000b3e00 r5:0005
>> r4:c7029db4
>> [  214.997308] [] (__alloc_contig_range+0x0/0x34c) from 
>> [] (alloc_contig_range+0x14/0x18)
>> [  215.006973] [] (alloc_contig_range+0x0/0x18) from [] 
>> (dma_alloc_from_contiguous_addr+0x1ac/0x304)
>>
>> This issue is because when call unset_migratetype_isolate to unset a part
>> of CMA memory, it try to access the buddy page to get its status:
>>   if (order >= pageblock_order) {
>>   page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
>>   buddy_idx = __find_buddy_index(page_idx, order);
>>   buddy = page + (buddy_idx - page_idx);
>>
>>   if (!is_migrate_isolate_page(buddy)) {
>> But the begin addr of this part of CMA memory is very close to a part of
>> memory that is reserved in the boot time (not in buddy system).
>> So add a check before access it.
>>
>> ...
>>
>> --- a/mm/page_isolation.c
>> +++ b/mm/page_isolation.c
>> @@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, 
>> unsigned migratetype)
>>   buddy_idx = __find_buddy_index(page_idx, order);
>>   buddy = page + (buddy_idx - page_idx);
>>
>> - if (!is_migrate_isolate_page(buddy)) {
>> + if (!pfn_valid_within(page_to_pfn(buddy))
>> + || !is_migrate_isolate_page(buddy)) {
>>   __isolate_free_page(page, order);
>>   kernel_map_pages(page, (1 << order), 1);
>>   set_page_refcounted(page);
>
> This fix is needed in kernel versions 4.0.x isn't it?

I think it is needed there.

Thanks,
Hui


[PATCH v2] CMA: page_isolation: check buddy before access it

2015-05-04 Thread Hui Zhu
Change pfn_present to pfn_valid_within according to the review of Laura.

I hit the following issue:
[  214.294917] Unable to handle kernel NULL pointer dereference at virtual 
address 082a
[  214.303013] pgd = cc97
[  214.305721] [082a] *pgd=
[  214.309316] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
[  214.335704] PC is at get_pageblock_flags_group+0x5c/0xb0
[  214.341030] LR is at unset_migratetype_isolate+0x148/0x1b0
[  214.346523] pc : []lr : []psr: 8093
[  214.346523] sp : c7029d00  ip : 0105  fp : c7029d1c
[  214.358005] r10: 0001  r9 : 000a  r8 : 0004
[  214.363231] r7 : 6013  r6 : 00a4  r5 : c0a357e4  r4 : 
[  214.369761] r3 : 0826  r2 : 0002  r1 :   r0 : 003f
[  214.376291] Flags: Nzcv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment 
user
[  214.383516] Control: 10c5387d  Table: 2cb7006a  DAC: 0015
[  214.949720] Backtrace:
[  214.952192] [] (get_pageblock_flags_group+0x0/0xb0) from 
[] (unset_migratetype_isolate+0x148/0x1b0)
[  214.962978]  r7:6013 r6:c0a357c0 r5:c0a357e4 r4:c1555000
[  214.968693] [] (unset_migratetype_isolate+0x0/0x1b0) from 
[] (undo_isolate_page_range+0xd0/0xdc)
[  214.979222] [] (undo_isolate_page_range+0x0/0xdc) from 
[] (__alloc_contig_range+0x254/0x34c)
[  214.989398]  r9:000abc00 r8:c7028000 r7:000b1f53 r6:000b3e00 r5:0005
r4:c7029db4
[  214.997308] [] (__alloc_contig_range+0x0/0x34c) from [] 
(alloc_contig_range+0x14/0x18)
[  215.006973] [] (alloc_contig_range+0x0/0x18) from [] 
(dma_alloc_from_contiguous_addr+0x1ac/0x304)

This issue happens because when unset_migratetype_isolate is called to unset a part
of CMA memory, it tries to access the buddy page to get its status:
if (order >= pageblock_order) {
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);

if (!is_migrate_isolate_page(buddy)) {
But the start address of this part of CMA memory is very close to a part of
memory that is reserved at boot time (not in the buddy system).
So add a check before accessing it.

Signed-off-by: Hui Zhu 
---
 mm/page_isolation.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 755a42c..eb22d1f 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, unsigned 
migratetype)
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
 
-   if (!is_migrate_isolate_page(buddy)) {
+   if (!pfn_valid_within(page_to_pfn(buddy))
+   || !is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
kernel_map_pages(page, (1 << order), 1);
set_page_refcounted(page);
-- 
1.9.1



Re: [PATCH] CMA: page_isolation: check buddy before access it

2015-05-04 Thread Hui Zhu
On Tue, May 5, 2015 at 2:34 AM, Laura Abbott  wrote:
> On 05/04/2015 02:41 AM, Hui Zhu wrote:
>>
>> I got a issue:
>> [  214.294917] Unable to handle kernel NULL pointer dereference at virtual
>> address 082a
>> [  214.303013] pgd = cc97
>> [  214.305721] [082a] *pgd=
>> [  214.309316] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
>> [  214.335704] PC is at get_pageblock_flags_group+0x5c/0xb0
>> [  214.341030] LR is at unset_migratetype_isolate+0x148/0x1b0
>> [  214.346523] pc : []lr : []psr: 8093
>> [  214.346523] sp : c7029d00  ip : 0105  fp : c7029d1c
>> [  214.358005] r10: 0001  r9 : 000a  r8 : 0004
>> [  214.363231] r7 : 6013  r6 : 00a4  r5 : c0a357e4  r4 : 
>> [  214.369761] r3 : 0826  r2 : 0002  r1 :   r0 : 003f
>> [  214.376291] Flags: Nzcv  IRQs off  FIQs on  Mode SVC_32  ISA ARM
>> Segment user
>> [  214.383516] Control: 10c5387d  Table: 2cb7006a  DAC: 0015
>> [  214.949720] Backtrace:
>> [  214.952192] [] (get_pageblock_flags_group+0x0/0xb0) from
>> [] (unset_migratetype_isolate+0x148/0x1b0)
>> [  214.962978]  r7:6013 r6:c0a357c0 r5:c0a357e4 r4:c1555000
>> [  214.968693] [] (unset_migratetype_isolate+0x0/0x1b0) from
>> [] (undo_isolate_page_range+0xd0/0xdc)
>> [  214.979222] [] (undo_isolate_page_range+0x0/0xdc) from
>> [] (__alloc_contig_range+0x254/0x34c)
>> [  214.989398]  r9:000abc00 r8:c7028000 r7:000b1f53 r6:000b3e00
>> r5:0005
>> r4:c7029db4
>> [  214.997308] [] (__alloc_contig_range+0x0/0x34c) from
>> [] (alloc_contig_range+0x14/0x18)
>> [  215.006973] [] (alloc_contig_range+0x0/0x18) from
>> [] (dma_alloc_from_contiguous_addr+0x1ac/0x304)
>>
>> This issue is because when call unset_migratetype_isolate to unset a part
>> of CMA memory, it try to access the buddy page to get its status:
>> if (order >= pageblock_order) {
>> page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) -
>> 1);
>> buddy_idx = __find_buddy_index(page_idx, order);
>> buddy = page + (buddy_idx - page_idx);
>>
>>     if (!is_migrate_isolate_page(buddy)) {
>> But the begin addr of this part of CMA memory is very close to a part of
>> memory that is reserved in the boot time (not in buddy system).
>> So add a check before access it.
>>
>> Signed-off-by: Hui Zhu 
>> ---
>>   mm/page_isolation.c | 3 ++-
>>   1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
>> index 755a42c..434730b 100644
>> --- a/mm/page_isolation.c
>> +++ b/mm/page_isolation.c
>> @@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page,
>> unsigned migratetype)
>> buddy_idx = __find_buddy_index(page_idx, order);
>> buddy = page + (buddy_idx - page_idx);
>>
>> -   if (!is_migrate_isolate_page(buddy)) {
>> +   if (!pfn_present(page_to_pfn(buddy))
>> +   || !is_migrate_isolate_page(buddy)) {
>> __isolate_free_page(page, order);
>> kernel_map_pages(page, (1 << order), 1);
>> set_page_refcounted(page);
>>
>
> I think you want to use pfn_valid_within instead of pfn_present.

Thanks.  I will post a new version for it.

Best,
Hui

>
> Thanks,
> Laura
>


[PATCH] CMA: page_isolation: check buddy before access it

2015-05-04 Thread Hui Zhu
I hit the following issue:
[  214.294917] Unable to handle kernel NULL pointer dereference at virtual 
address 082a
[  214.303013] pgd = cc97
[  214.305721] [082a] *pgd=
[  214.309316] Internal error: Oops: 5 [#1] PREEMPT SMP ARM
[  214.335704] PC is at get_pageblock_flags_group+0x5c/0xb0
[  214.341030] LR is at unset_migratetype_isolate+0x148/0x1b0
[  214.346523] pc : []lr : []psr: 8093
[  214.346523] sp : c7029d00  ip : 0105  fp : c7029d1c
[  214.358005] r10: 0001  r9 : 000a  r8 : 0004
[  214.363231] r7 : 6013  r6 : 00a4  r5 : c0a357e4  r4 : 
[  214.369761] r3 : 0826  r2 : 0002  r1 :   r0 : 003f
[  214.376291] Flags: Nzcv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment 
user
[  214.383516] Control: 10c5387d  Table: 2cb7006a  DAC: 0015
[  214.949720] Backtrace:
[  214.952192] [] (get_pageblock_flags_group+0x0/0xb0) from 
[] (unset_migratetype_isolate+0x148/0x1b0)
[  214.962978]  r7:6013 r6:c0a357c0 r5:c0a357e4 r4:c1555000
[  214.968693] [] (unset_migratetype_isolate+0x0/0x1b0) from 
[] (undo_isolate_page_range+0xd0/0xdc)
[  214.979222] [] (undo_isolate_page_range+0x0/0xdc) from 
[] (__alloc_contig_range+0x254/0x34c)
[  214.989398]  r9:000abc00 r8:c7028000 r7:000b1f53 r6:000b3e00 r5:0005
r4:c7029db4
[  214.997308] [] (__alloc_contig_range+0x0/0x34c) from [] 
(alloc_contig_range+0x14/0x18)
[  215.006973] [] (alloc_contig_range+0x0/0x18) from [] 
(dma_alloc_from_contiguous_addr+0x1ac/0x304)

This issue happens because when unset_migratetype_isolate is called to unset a part
of CMA memory, it tries to access the buddy page to get its status:
if (order >= pageblock_order) {
page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);

if (!is_migrate_isolate_page(buddy)) {
But the start address of this part of CMA memory is very close to a part of
memory that is reserved at boot time (not in the buddy system).
So add a check before accessing it.

Signed-off-by: Hui Zhu 
---
 mm/page_isolation.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 755a42c..434730b 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, unsigned 
migratetype)
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
 
-   if (!is_migrate_isolate_page(buddy)) {
+   if (!pfn_present(page_to_pfn(buddy))
+   || !is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
kernel_map_pages(page, (1 << order), 1);
set_page_refcounted(page);
-- 
1.9.1



Re: [PATCH] CMA: treat free cma pages as non-free if not ALLOC_CMA on watermark checking

2015-01-18 Thread Hui Zhu
On Mon, Jan 19, 2015 at 2:55 PM, Minchan Kim  wrote:
> Hello,
>
> On Sun, Jan 18, 2015 at 04:32:59PM +0800, Hui Zhu wrote:
>> From: Hui Zhu 
>>
>> The original of this patch [1] is part of Joonsoo's CMA patch series.
>> I made a patch [2] to fix the issue of this patch.  Joonsoo reminded me
>> that this issue affect current kernel too.  So made a new one for upstream.
>
> Recently, we found many problems of CMA and Joonsoo tried to add more
> hooks into MM like agressive allocation but I suggested adding new zone
> would be more desirable than more hooks in mm fast path in various aspect.
> (ie, remove lots of hooks in hot path of MM, don't need reclaim hooks
>  for special CMA pages, don't need custom fair allocation for CMA).
>
> Joonsoo is investigating the direction so please wait.
> If it turns out we have lots of hurdle to go that way,
> this direction(ie, putting more hooks) should be second plan.

OK.  Thanks.

Best,
Hui

>
> Thanks.
>
>>
>> Current code treat free cma pages as non-free if not ALLOC_CMA in the first
>> check:
>> if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
>>   return false;
>> But in the loop after that, it treat free cma pages as free memory even
>> if not ALLOC_CMA.
>> So this one substruct free_cma from free_pages before the loop if not
>> ALLOC_CMA to treat free cma pages as non-free in the loop.
>>
>> But there still have a issue is that CMA memory in each order is part
>> of z->free_area[o].nr_free, then the CMA page number of this order is
>> substructed twice.  This bug will make __zone_watermark_ok return more false.
>> This patch add cma_nr_free to struct free_area that just record the number
>> of CMA pages.  And add it back in the order loop to handle the substruct
>> twice issue.
>>
>> The last issue of this patch should handle is pointed by Joonsoo in [3].
>> If pageblock for CMA is isolated, cma_nr_free would be miscalculated.
>> This patch add two functions nr_free_inc and nr_free_dec to change the
>> values of nr_free and cma_nr_free.  If the migratetype is MIGRATE_ISOLATE,
>> they will not change the value of nr_free.
>> Change __mod_zone_freepage_state to doesn't record isolated page to
>> NR_FREE_PAGES.
>> And add code to move_freepages to record the page number that isolated:
>>   if (is_migrate_isolate(migratetype))
>>   nr_free_dec(&zone->free_area[order],
>>   get_freepage_migratetype(page));
>>   else
>>   nr_free_inc(&zone->free_area[order], migratetype);
>> Then the isolate issue is handled.
>>
>> This patchset is based on fc7f0dd381720ea5ee5818645f7d0e9dece41cb0.
>>
>> [1] https://lkml.org/lkml/2014/5/28/110
>> [2] https://lkml.org/lkml/2014/12/25/43
>> [3] https://lkml.org/lkml/2015/1/4/220
>>
>> Signed-off-by: Joonsoo Kim 
>> Signed-off-by: Hui Zhu 
>> Signed-off-by: Weixing Liu 
>> ---
>>  include/linux/mmzone.h |  3 +++
>>  include/linux/vmstat.h |  4 +++-
>>  mm/page_alloc.c| 59 
>> +-
>>  3 files changed, 55 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 2f0856d..094476b 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -92,6 +92,9 @@ static inline int get_pfnblock_migratetype(struct page 
>> *page, unsigned long pfn)
>>  struct free_area {
>>   struct list_headfree_list[MIGRATE_TYPES];
>>   unsigned long   nr_free;
>> +#ifdef CONFIG_CMA
>> + unsigned long   cma_nr_free;
>> +#endif
>>  };
>>
>>  struct pglist_data;
>> diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
>> index 82e7db7..f18ef00 100644
>> --- a/include/linux/vmstat.h
>> +++ b/include/linux/vmstat.h
>> @@ -6,6 +6,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>
>>  extern int sysctl_stat_interval;
>> @@ -280,7 +281,8 @@ static inline void drain_zonestat(struct zone *zone,
>>  static inline void __mod_zone_freepage_state(struct zone *zone, int 
>> nr_pages,
>>int migratetype)
>>  {
>> - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
>> + if (!is_migrate_isolate(migratetype))
>> + __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
>>   if (is_migrate_

Re: [PATCH] mm/page_alloc: Fix race conditions on getting migratetype in buffered_rmqueue

2015-01-18 Thread Hui Zhu
On Sun, Jan 18, 2015 at 6:19 PM, Vlastimil Babka  wrote:
> On 18.1.2015 10:17, Hui Zhu wrote:
>>
>> From: Hui Zhu 
>>
>> To test the patch [1], I use KGTP and a script [2] to show
>> NR_FREE_CMA_PAGES
>> and gross of cma_nr_free.  The values are always not same.
>> I check the code of pages alloc and free and found that race conditions
>> on getting migratetype in buffered_rmqueue.
>
>
> Can you elaborate? What does this races with, are you dynamically changing
> the size of CMA area, or what? The migratetype here is based on which free
> list the page was found on. Was it misplaced then? Wasn't Joonsoo's recent
> series supposed to eliminate this?

My bad.
I thought move_freepages had a race condition with this part, but I
missed that it checks PageBuddy before set_freepage_migratetype.
Sorry for that.

I will do more work on this one and on [1].

Thanks for your review.

Best,
Hui

>
>> Then I add move the code of getting migratetype inside the zone->lock
>> protection part.
>
>
> Not just that, you are also reading migratetype from pageblock bitmap
> instead of the one embedded in the free page. Which is more expensive
> and we already do that more often than we would like to because of CMA.
> And it appears to be a wrong fix for a possible misplacement bug. If there's
> such misplacement, the wrong stats are not the only problem.
>
>>
>> Because this issue will affect system even if the Linux kernel does't
>> have [1].  So I post this patch separately.
>
>
> But we can't test that without [1], right? Maybe the issue is introduced by
> [1]?
>
>
>>
>> This patchset is based on fc7f0dd381720ea5ee5818645f7d0e9dece41cb0.
>>
>> [1] https://lkml.org/lkml/2015/1/18/28
>> [2] https://github.com/teawater/kgtp/blob/dev/add-ons/cma_free.py
>>
>> Signed-off-by: Hui Zhu 
>> ---
>>   mm/page_alloc.c | 11 +++
>>   1 file changed, 7 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 7633c50..f3d6922 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1694,11 +1694,12 @@ again:
>> }
>> spin_lock_irqsave(&zone->lock, flags);
>> page = __rmqueue(zone, order, migratetype);
>> +   if (page)
>> +   migratetype = get_pageblock_migratetype(page);
>> +   else
>> +   goto failed_unlock;
>> spin_unlock(&zone->lock);
>> -   if (!page)
>> -   goto failed;
>> -   __mod_zone_freepage_state(zone, -(1 << order),
>> - get_freepage_migratetype(page));
>> +   __mod_zone_freepage_state(zone, -(1 << order),
>> migratetype);
>> }
>> __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
>> @@ -1715,6 +1716,8 @@ again:
>> goto again;
>> return page;
>>   +failed_unlock:
>> +   spin_unlock(&zone->lock);
>>   failed:
>> local_irq_restore(flags);
>> return NULL;
>
>


[PATCH] mm/page_alloc: Fix race conditions on getting migratetype in buffered_rmqueue

2015-01-18 Thread Hui Zhu
From: Hui Zhu 

To test the patch [1], I used KGTP and a script [2] to show NR_FREE_CMA_PAGES
and the total of cma_nr_free.  The values are never the same.
I checked the page allocation and free code and found race conditions
when getting the migratetype in buffered_rmqueue.
So I moved the code that gets the migratetype inside the zone->lock
protected section.

Because this issue affects the system even if the Linux kernel doesn't
have [1], I post this patch separately.

This patchset is based on fc7f0dd381720ea5ee5818645f7d0e9dece41cb0.

[1] https://lkml.org/lkml/2015/1/18/28
[2] https://github.com/teawater/kgtp/blob/dev/add-ons/cma_free.py

Signed-off-by: Hui Zhu 
---
 mm/page_alloc.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c50..f3d6922 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1694,11 +1694,12 @@ again:
}
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order, migratetype);
+   if (page)
+   migratetype = get_pageblock_migratetype(page);
+   else
+   goto failed_unlock;
spin_unlock(&zone->lock);
-   if (!page)
-   goto failed;
-   __mod_zone_freepage_state(zone, -(1 << order),
- get_freepage_migratetype(page));
+   __mod_zone_freepage_state(zone, -(1 << order), migratetype);
}
 
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
@@ -1715,6 +1716,8 @@ again:
goto again;
return page;
 
+failed_unlock:
+   spin_unlock(&zone->lock);
 failed:
local_irq_restore(flags);
return NULL;
-- 
1.9.3



[PATCH] CMA: treat free cma pages as non-free if not ALLOC_CMA on watermark checking

2015-01-18 Thread Hui Zhu
From: Hui Zhu 

The original of this patch [1] is part of Joonsoo's CMA patch series.
I made a patch [2] to fix an issue in that patch.  Joonsoo reminded me
that this issue affects the current kernel too, so I made a new one for
upstream.

The current code treats free CMA pages as non-free when ALLOC_CMA is not
set, but only in the first check:
if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
return false;
In the loop after that, it still treats free CMA pages as free memory
even without ALLOC_CMA.
So this patch subtracts free_cma from free_pages before the loop when
ALLOC_CMA is not set, so that free CMA pages are treated as non-free in
the loop as well.

There is still an issue: the CMA memory in each order is part of
z->free_area[o].nr_free, so the CMA page count of that order is
subtracted twice.  This bug makes __zone_watermark_ok return false more
often than it should.
This patch adds cma_nr_free to struct free_area to record just the
number of free CMA pages, and adds it back inside the order loop to
handle the double-subtraction issue.

The last issue this patch has to handle was pointed out by Joonsoo in
[3]: if a pageblock for CMA is isolated, cma_nr_free would be
miscalculated.
This patch adds two functions, nr_free_inc and nr_free_dec, to update
nr_free and cma_nr_free.  If the migratetype is MIGRATE_ISOLATE, they do
not change the value of nr_free.
It also changes __mod_zone_freepage_state so that isolated pages are not
recorded in NR_FREE_PAGES, and adds code to move_freepages to record the
number of isolated pages:
if (is_migrate_isolate(migratetype))
nr_free_dec(&zone->free_area[order],
get_freepage_migratetype(page));
else
nr_free_inc(&zone->free_area[order], migratetype);
With that, the isolation issue is handled.
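
For illustration only, here is a tiny userspace program with made-up numbers
that walks through the double subtraction described above and the effect of
adding cma_nr_free back inside the order loop (a toy model, not part of the
patch):

#include <stdio.h>

/* Toy model: the zone has 200 free pages, 80 of them CMA; the order-0
 * free_area holds 100 pages, 40 of them CMA; the per-order watermark
 * is 40 and ALLOC_CMA is not set. */
int main(void)
{
        long free_pages = 200, free_cma = 80;
        long nr_free = 100, cma_nr_free = 40;   /* order-0 free_area */
        long min = 40;

        /* free CMA pages already treated as non-free before the loop */
        long fp = free_pages - free_cma;                /* 120 */

        /* inside the order loop: this order's pages become unavailable */
        long buggy = fp - nr_free;                      /* 20: CMA removed twice */
        long fixed = fp - nr_free + cma_nr_free;        /* 60: CMA added back */

        printf("buggy=%ld (<= min %ld, watermark fails)\n", buggy, min);
        printf("fixed=%ld (>  min %ld, watermark ok)\n", fixed, min);
        return 0;
}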

This patchset is based on fc7f0dd381720ea5ee5818645f7d0e9dece41cb0.

[1] https://lkml.org/lkml/2014/5/28/110
[2] https://lkml.org/lkml/2014/12/25/43
[3] https://lkml.org/lkml/2015/1/4/220

Signed-off-by: Joonsoo Kim 
Signed-off-by: Hui Zhu 
Signed-off-by: Weixing Liu 
---
 include/linux/mmzone.h |  3 +++
 include/linux/vmstat.h |  4 +++-
 mm/page_alloc.c| 59 +-
 3 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f0856d..094476b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -92,6 +92,9 @@ static inline int get_pfnblock_migratetype(struct page *page, 
unsigned long pfn)
 struct free_area {
struct list_headfree_list[MIGRATE_TYPES];
unsigned long   nr_free;
+#ifdef CONFIG_CMA
+   unsigned long   cma_nr_free;
+#endif
 };
 
 struct pglist_data;
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 82e7db7..f18ef00 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 extern int sysctl_stat_interval;
@@ -280,7 +281,8 @@ static inline void drain_zonestat(struct zone *zone,
 static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
 int migratetype)
 {
-   __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
+   if (!is_migrate_isolate(migratetype))
+   __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c50..9a2b6da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -576,6 +576,28 @@ static inline int page_is_buddy(struct page *page, struct 
page *buddy,
return 0;
 }
 
+static inline void nr_free_inc(struct free_area *area, int migratetype)
+{
+   if (!is_migrate_isolate(migratetype))
+   area->nr_free++;
+
+#ifdef CONFIG_CMA
+   if (is_migrate_cma(migratetype))
+   area->cma_nr_free++;
+#endif
+}
+
+static inline void nr_free_dec(struct free_area *area, int migratetype)
+{
+   if (!is_migrate_isolate(migratetype))
+   area->nr_free--;
+
+#ifdef CONFIG_CMA
+   if (is_migrate_cma(migratetype))
+   area->cma_nr_free--;
+#endif
+}
+
 /*
  * Freeing function for a buddy system allocator.
  *
@@ -649,7 +671,7 @@ static inline void __free_one_page(struct page *page,
clear_page_guard(zone, buddy, order, migratetype);
} else {
list_del(&buddy->lru);
-   zone->free_area[order].nr_free--;
+   nr_free_dec(&zone->free_area[order], migratetype);
rmv_page_order(buddy);
}
combined_idx = buddy_idx & page_idx;
@@ -682,7 +704,7 @@ static inline void __free_one_page(struct page *page,
 
list_add(&page->

Re: [PATCH] CMA: Fix CMA's page number is substructed twice in __zone_watermark_ok

2015-01-07 Thread Hui Zhu
On Wed, Jan 7, 2015 at 4:45 PM, Vlastimil Babka  wrote:
> On 12/30/2014 11:17 AM, Hui Zhu wrote:
>> The original of this patch [1] is used to fix the issue in Joonsoo's CMA 
>> patch
>> "CMA: always treat free cma pages as non-free on watermark checking" [2].
>>
>> Joonsoo reminded me that this issue affect current kernel too.  So made a new
>> one for upstream.
>>
>> Function __zone_watermark_ok substruct CMA pages number from free_pages
>> if system allocation can't use CMA areas:
>>   /* If allocation can't use CMA areas don't use free CMA pages */
>>   if (!(alloc_flags & ALLOC_CMA))
>>   free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
>>
>> But after this part of code
>>   for (o = 0; o < order; o++) {
>>   /* At the next order, this order's pages become unavailable */
>>   free_pages -= z->free_area[o].nr_free << o;
>> CMA memory in each order is part of z->free_area[o].nr_free, then the CMA
>> page number of this order is substructed twice.  This bug will make
>> __zone_watermark_ok return more false.
>>
>> This patch add cma_free_area to struct free_area that just record the number
>> of CMA pages.  And add it back in the order loop to handle the substruct
>> twice issue.
>
> Le sigh.
>
> I now dub CMA "Contagious Memory Allocator".
> One can't even take a Christmas vacation without this blight to spread :(
>
> Seriously, with so much special casing everywhere in fast paths, Minchan's
> (IIRC) proposal of a special CMA zone has some appeal.
>
> But it seems to me that the bug you are fixing doesn't exist as you describe 
> it?
> free_cma is only used here:
>
> if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
> return false;
>
> So it's subtracted from free_pages just temporarily for the basic order-0 
> check.
> In the higher-order magic loop, it's not used at all?
>

I am so sorry that I made a mistake when I split this patch out of the
patch series.

The original of this patch was meant to fix the issue around Joonsoo's
update of __zone_watermark_ok:
if (IS_ENABLED(CONFIG_CMA) && z->managed_cma_pages)
free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);

if (free_pages <= min + z->lowmem_reserve[classzone_idx])
return false;

Joonsoo, what about submitting this change to upstream first?

Thanks,
Hui


> Vlastimil
>
>
>> [1] https://lkml.org/lkml/2014/12/25/43
>> [2] https://lkml.org/lkml/2014/5/28/110
>>
>> Signed-off-by: Hui Zhu 
>> Signed-off-by: Weixing Liu 
>> ---
>>  include/linux/mmzone.h |  3 +++
>>  mm/page_alloc.c| 22 ++
>>  2 files changed, 25 insertions(+)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 2f0856d..094476b 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -92,6 +92,9 @@ static inline int get_pfnblock_migratetype(struct page 
>> *page, unsigned long pfn)
>>  struct free_area {
>>   struct list_headfree_list[MIGRATE_TYPES];
>>   unsigned long   nr_free;
>> +#ifdef CONFIG_CMA
>> + unsigned long   cma_nr_free;
>> +#endif
>>  };
>>
>>  struct pglist_data;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 7633c50..026cf27 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -650,6 +650,8 @@ static inline void __free_one_page(struct page *page,
>>   } else {
>>   list_del(&buddy->lru);
>>   zone->free_area[order].nr_free--;
>> + if (is_migrate_cma(migratetype))
>> + zone->free_area[order].cma_nr_free--;
>>   rmv_page_order(buddy);
>>   }
>>   combined_idx = buddy_idx & page_idx;
>> @@ -683,6 +685,8 @@ static inline void __free_one_page(struct page *page,
>>   list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
>>  out:
>>   zone->free_area[order].nr_free++;
>> + if (is_migrate_cma(migratetype))
>> + zone->free_area[order].cma_nr_free++;
>>  }
>>
>>  static inline int free_pages_check(struct page *page)
>> @@ -937,6 +941,8 @@ static inline void expand(struct zone *zone, struct page 
>> *page,
>>   }
>>   list_add(&page[size].lru, &area->free_list[migratetype]);
>>   

[PATCH] samples: hw_breakpoint: check the return value of kallsyms_lookup_name

2015-01-03 Thread Hui Zhu
data_breakpoint.ko can be inserted successfully, but on my side it cannot
catch any change of the data because kallsyms_lookup_name returns 0 each
time.
So add code to check the return value of kallsyms_lookup_name.

Signed-off-by: Hui Zhu 
---
 samples/hw_breakpoint/data_breakpoint.c | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/samples/hw_breakpoint/data_breakpoint.c 
b/samples/hw_breakpoint/data_breakpoint.c
index ef7f322..4fbf93b 100644
--- a/samples/hw_breakpoint/data_breakpoint.c
+++ b/samples/hw_breakpoint/data_breakpoint.c
@@ -52,27 +52,30 @@ static void sample_hbp_handler(struct perf_event *bp,
 
 static int __init hw_break_module_init(void)
 {
-   int ret;
+   int ret = 0;
struct perf_event_attr attr;
 
hw_breakpoint_init(&attr);
attr.bp_addr = kallsyms_lookup_name(ksym_name);
+   if (!attr.bp_addr) {
+   ret = -ENXIO;
+   printk(KERN_INFO "Get address for %s failed\n", ksym_name);
+   goto out;
+   }
+
attr.bp_len = HW_BREAKPOINT_LEN_4;
attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
 
sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler, 
NULL);
if (IS_ERR((void __force *)sample_hbp)) {
ret = PTR_ERR((void __force *)sample_hbp);
-   goto fail;
+   printk(KERN_INFO "Breakpoint registration failed\n");
+   goto out;
}
 
printk(KERN_INFO "HW Breakpoint for %s write installed\n", ksym_name);
 
-   return 0;
-
-fail:
-   printk(KERN_INFO "Breakpoint registration failed\n");
-
+out:
return ret;
 }
 
-- 
1.9.1



[PATCH] CMA: Fix CMA's page number is substructed twice in __zone_watermark_ok

2014-12-30 Thread Hui Zhu
The original of this patch [1] was written to fix an issue in Joonsoo's CMA patch
"CMA: always treat free cma pages as non-free on watermark checking" [2].

Joonsoo reminded me that this issue affects the current kernel too, so I made
a new one for upstream.

Function __zone_watermark_ok subtracts the number of CMA pages from free_pages
if the allocation can't use CMA areas:
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);

But after this part of code
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
the CMA memory in each order is part of z->free_area[o].nr_free, so the CMA
page count of that order is subtracted twice.  This bug makes
__zone_watermark_ok return false more often than it should.

This patch adds cma_nr_free to struct free_area to record just the number of
free CMA pages, and adds it back inside the order loop to handle the
double-subtraction issue.

[1] https://lkml.org/lkml/2014/12/25/43
[2] https://lkml.org/lkml/2014/5/28/110

Signed-off-by: Hui Zhu 
Signed-off-by: Weixing Liu 
---
 include/linux/mmzone.h |  3 +++
 mm/page_alloc.c| 22 ++
 2 files changed, 25 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f0856d..094476b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -92,6 +92,9 @@ static inline int get_pfnblock_migratetype(struct page *page, 
unsigned long pfn)
 struct free_area {
struct list_headfree_list[MIGRATE_TYPES];
unsigned long   nr_free;
+#ifdef CONFIG_CMA
+   unsigned long   cma_nr_free;
+#endif
 };
 
 struct pglist_data;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c50..026cf27 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -650,6 +650,8 @@ static inline void __free_one_page(struct page *page,
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
+   if (is_migrate_cma(migratetype))
+   zone->free_area[order].cma_nr_free--;
rmv_page_order(buddy);
}
combined_idx = buddy_idx & page_idx;
@@ -683,6 +685,8 @@ static inline void __free_one_page(struct page *page,
list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
 out:
zone->free_area[order].nr_free++;
+   if (is_migrate_cma(migratetype))
+   zone->free_area[order].cma_nr_free++;
 }
 
 static inline int free_pages_check(struct page *page)
@@ -937,6 +941,8 @@ static inline void expand(struct zone *zone, struct page 
*page,
}
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
+   if (is_migrate_cma(migratetype))
+   area->cma_nr_free++;
set_page_order(&page[size], high);
}
 }
@@ -1020,6 +1026,8 @@ struct page *__rmqueue_smallest(struct zone *zone, 
unsigned int order,
list_del(&page->lru);
rmv_page_order(page);
area->nr_free--;
+   if (is_migrate_cma(migratetype))
+   area->cma_nr_free--;
expand(zone, page, order, current_order, area, migratetype);
set_freepage_migratetype(page, migratetype);
return page;
@@ -1208,6 +1216,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, 
int start_migratetype)
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
area->nr_free--;
+   if (is_migrate_cma(migratetype))
+   area->cma_nr_free--;
 
new_type = try_to_steal_freepages(zone, page,
  start_migratetype,
@@ -1597,6 +1607,8 @@ int __isolate_free_page(struct page *page, unsigned int 
order)
/* Remove page from free list */
list_del(&page->lru);
zone->free_area[order].nr_free--;
+   if (is_migrate_cma(mt))
+   zone->free_area[order].cma_nr_free--;
rmv_page_order(page);
 
/* Set the pageblock if the isolated page is at least a pageblock */
@@ -1827,6 +1839,13 @@ static bool __zone_watermark_ok(struct zone *z, unsigned 
int order,
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
 
+   /* If CMA's page number of this order was substructed as part
+  

Re: [PATCH 1/3] CMA: Fix the bug that CMA's page number is substructed twice

2014-12-30 Thread Hui Zhu
On Tue, Dec 30, 2014 at 12:48 PM, Joonsoo Kim  wrote:
> On Thu, Dec 25, 2014 at 05:43:26PM +0800, Hui Zhu wrote:
>> In Joonsoo's CMA patch "CMA: always treat free cma pages as non-free on
>> watermark checking" [1], it changes __zone_watermark_ok to substruct CMA
>> pages number from free_pages if system use CMA:
>>   if (IS_ENABLED(CONFIG_CMA) && z->managed_cma_pages)
>>   free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
>
> Hello,
>
> In fact, without that patch, watermark checking has a problem in current 
> kernel.
> If there is reserved CMA region, watermark check for high order
> allocation is done loosly. See following thread.
>
> https://lkml.org/lkml/2014/5/30/320
>
> Your patch can fix this situation, so, how about submitting this patch
> separately?
>
> Thanks.
>

Hi Joonsoo,

Thanks for the reminder.  I will post a separate patch for the current kernel.

Thanks,
Hui


[PATCH 0/3] CMA: Handle the issues of aggressively allocate the

2014-12-25 Thread Hui Zhu
I tried Joonsoo's CMA patches [1] on my side and found that they work
better than mine [2] at handling LRU and other issues, even though they
don't shrink memory before cma_alloc.  So I began to test them on my
side.
But my colleague Weixing found some issues with them, so we made two
patches to handle those issues.
I also merged cma_alloc_counter from [2] to make cma_alloc work better.

This patchset is based on aa39477b5692611b91ac9455ae588738852b3f60 and [1].

[1] https://lkml.org/lkml/2014/5/28/64
[2] https://lkml.org/lkml/2014/10/15/623

Hui Zhu (3):
CMA: Fix the bug that CMA's page number is substructed twice
CMA: Fix the issue that nr_try_movable just count MIGRATE_MOVABLE memory
CMA: Add cma_alloc_counter to make cma_alloc work better if it meet busy range

 include/linux/cma.h|2 +
 include/linux/mmzone.h |3 +
 mm/cma.c   |6 +++
 mm/page_alloc.c|   76 ++---
 4 files changed, 65 insertions(+), 22 deletions(-)



[PATCH 3/3] CMA: Add cma_alloc_counter to make cma_alloc work better if it meet busy range

2014-12-25 Thread Hui Zhu
In [1], Joonsoo said that cma_alloc_counter is useless because the
pageblock is isolated.
But if alloc_contig_range meets a busy range, it calls
undo_isolate_page_range before going on to try the next range.  At that
point, __rmqueue_cma can start allocating CMA memory from that range
again.

So I add cma_alloc_counter so that __rmqueue does not call __rmqueue_cma
while cma_alloc is running.

[1] https://lkml.org/lkml/2014/10/24/26
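
For illustration only, a userspace sketch of this gating idea (all names made
up, not the kernel patch): the contiguous allocator holds the counter for its
whole retry loop, so the fast-path allocator stays away from the CMA pool even
during the window where a busy range has been un-isolated.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int cma_alloc_counter;

static bool may_use_cma_pool(void)
{
        return atomic_load(&cma_alloc_counter) == 0;
}

static void contiguous_alloc(void)
{
        atomic_fetch_add(&cma_alloc_counter, 1);
        /* ... try a range; on "PFNs busy", un-isolate it and retry the next
         * range.  may_use_cma_pool() stays false for the whole loop. */
        printf("during cma_alloc: may_use_cma_pool=%d\n", may_use_cma_pool());
        atomic_fetch_sub(&cma_alloc_counter, 1);
}

int main(void)
{
        printf("before cma_alloc: may_use_cma_pool=%d\n", may_use_cma_pool());
        contiguous_alloc();
        printf("after cma_alloc:  may_use_cma_pool=%d\n", may_use_cma_pool());
        return 0;
}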

Signed-off-by: Hui Zhu 
---
 include/linux/cma.h | 2 ++
 mm/cma.c| 6 ++
 mm/page_alloc.c | 8 +++-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 9384ba6..155158f 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -26,6 +26,8 @@ extern int __init cma_declare_contiguous(phys_addr_t base,
 extern int cma_init_reserved_mem(phys_addr_t base,
phys_addr_t size, int order_per_bit,
struct cma **res_cma);
+
+extern atomic_t cma_alloc_counter;
 extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align);
 extern bool cma_release(struct cma *cma, struct page *pages, int count);
 #endif
diff --git a/mm/cma.c b/mm/cma.c
index 6707b5d..b63f6be 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -348,6 +348,8 @@ err:
return ret;
 }
 
+atomic_t cma_alloc_counter = ATOMIC_INIT(0);
+
 /**
  * cma_alloc() - allocate pages from contiguous area
  * @cma:   Contiguous memory region for which the allocation is performed.
@@ -378,6 +380,8 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned 
int align)
bitmap_maxno = cma_bitmap_maxno(cma);
bitmap_count = cma_bitmap_pages_to_bits(cma, count);
 
+   atomic_inc(&cma_alloc_counter);
+
for (;;) {
mutex_lock(&cma->lock);
bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
@@ -415,6 +419,8 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned 
int align)
start = bitmap_no + mask + 1;
}
 
+   atomic_dec(&cma_alloc_counter);
+
pr_debug("%s(): returned %p\n", __func__, page);
return page;
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a5bbc38..0622c4c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,10 @@
 #include 
 #include "internal.h"
 
+#ifdef CONFIG_CMA
+#include 
+#endif
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_FRACTION   (8)
@@ -1330,7 +1334,9 @@ static struct page *__rmqueue(struct zone *zone, unsigned 
int order,
 {
struct page *page = NULL;
 
-   if (IS_ENABLED(CONFIG_CMA) && zone->managed_cma_pages) {
+   if (IS_ENABLED(CONFIG_CMA)
+   && zone->managed_cma_pages
+   && atomic_read(&cma_alloc_counter) == 0) {
if (migratetype == MIGRATE_MOVABLE
&& zone->nr_try_movable <= 0)
page = __rmqueue_cma(zone, order);
-- 
1.9.1



[PATCH 1/3] CMA: Fix the bug that CMA's page number is substructed twice

2014-12-25 Thread Hui Zhu
In Joonsoo's CMA patch "CMA: always treat free cma pages as non-free on
watermark checking" [1], __zone_watermark_ok is changed to subtract the
number of CMA pages from free_pages if the system uses CMA:
if (IS_ENABLED(CONFIG_CMA) && z->managed_cma_pages)
free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);

But after this part of code
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
the CMA memory in each order is part of z->free_area[o].nr_free, so the CMA
page count of that order is subtracted twice.  This bug makes
__zone_watermark_ok return false more often than it should.

This patch adds cma_nr_free to struct free_area to record just the number of
free CMA pages, and adds it back inside the order loop to handle the
double-subtraction issue.

[1] https://lkml.org/lkml/2014/5/28/110

Signed-off-by: Hui Zhu 
Signed-off-by: Weixing Liu 
---
 include/linux/mmzone.h |  3 +++
 mm/page_alloc.c| 29 -
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ee1ce1f..7ccad93 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -92,6 +92,9 @@ static inline int get_pfnblock_migratetype(struct page *page, 
unsigned long pfn)
 struct free_area {
struct list_headfree_list[MIGRATE_TYPES];
unsigned long   nr_free;
+#ifdef CONFIG_CMA
+   unsigned long   cma_nr_free;
+#endif
 };
 
 struct pglist_data;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1b6c82c..a8d9f03 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -650,6 +650,8 @@ static inline void __free_one_page(struct page *page,
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
+   if (is_migrate_cma(migratetype))
+   zone->free_area[order].cma_nr_free--;
rmv_page_order(buddy);
}
combined_idx = buddy_idx & page_idx;
@@ -683,6 +685,8 @@ static inline void __free_one_page(struct page *page,
list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
 out:
zone->free_area[order].nr_free++;
+   if (is_migrate_cma(migratetype))
+   zone->free_area[order].cma_nr_free++;
 }
 
 static inline int free_pages_check(struct page *page)
@@ -987,6 +991,8 @@ static inline void expand(struct zone *zone, struct page 
*page,
}
list_add(&page[size].lru, &area->free_list[migratetype]);
area->nr_free++;
+   if (is_migrate_cma(migratetype))
+   area->cma_nr_free++;
set_page_order(&page[size], high);
}
 }
@@ -1070,6 +1076,8 @@ struct page *__rmqueue_smallest(struct zone *zone, 
unsigned int order,
list_del(&page->lru);
rmv_page_order(page);
area->nr_free--;
+   if (is_migrate_cma(migratetype))
+   area->cma_nr_free--;
expand(zone, page, order, current_order, area, migratetype);
set_freepage_migratetype(page, migratetype);
return page;
@@ -1258,6 +1266,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, 
int start_migratetype)
page = list_entry(area->free_list[migratetype].next,
struct page, lru);
area->nr_free--;
+   if (is_migrate_cma(migratetype))
+   area->cma_nr_free--;
 
new_type = try_to_steal_freepages(zone, page,
  start_migratetype,
@@ -1682,6 +1692,8 @@ int __isolate_free_page(struct page *page, unsigned int 
order)
/* Remove page from free list */
list_del(&page->lru);
zone->free_area[order].nr_free--;
+   if (is_migrate_cma(mt))
+   zone->free_area[order].cma_nr_free--;
rmv_page_order(page);
 
/* Set the pageblock if the isolated page is at least a pageblock */
@@ -1893,6 +1905,9 @@ static bool __zone_watermark_ok(struct zone *z, unsigned 
int order,
/* free_pages may go negative - that's OK */
long min = mark;
int o;
+#ifdef CONFIG_CMA
+   bool cma_is_subbed = false;
+#endif
 
free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
@@ -1905,8 +1920,10 @@ static bool __zone_watermark_ok(struct zone *z, unsigned 
int order,
 * unmovable/reclaimable allocation and they can suddenly
 * vanish through CMA allocation
 */
-   if (IS_ENABLED(CONFIG_CMA)

[PATCH 2/3] CMA: Fix the issue that nr_try_movable just count MIGRATE_MOVABLE memory

2014-12-25 Thread Hui Zhu
One of my platforms that uses Joonsoo's CMA patch [1] has a device that
allocates a lot of MIGRATE_UNMOVABLE memory while it works in a zone.
When this device works, the memory status of this zone is not OK: most of
the CMA memory is not allocated, but most of the normal memory is.
This happens because of this code in __rmqueue:
if (IS_ENABLED(CONFIG_CMA) &&
migratetype == MIGRATE_MOVABLE && zone->managed_cma_pages)
page = __rmqueue_cma(zone, order);
Only MIGRATE_MOVABLE allocations are recorded in nr_try_movable in
__rmqueue_cma, not the others, so the device's many MIGRATE_UNMOVABLE
allocations skew the memory allocation behavior of this zone.

This patch changes __rmqueue to let nr_try_movable account for all
allocations of normal memory.

[1] https://lkml.org/lkml/2014/5/28/64
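
For illustration only, a toy userspace simulation (made-up numbers and
simplified logic, not kernel code) of the two counting schemes under a
workload that is 90% unmovable; it is only meant to show the direction of
the change, with the patched scheme pushing noticeably more allocations
into the CMA pool:

#include <stdbool.h>
#include <stdio.h>

#define MAX_TRY_MOVABLE 60
#define MAX_TRY_CMA     40

static long nr_try_movable, nr_try_cma;
static long normal_used, cma_used;

/* Returns true if the request was served from the CMA pool. */
static bool rmqueue_cma(void)
{
        if (nr_try_cma <= 0) {
                nr_try_movable = MAX_TRY_MOVABLE;       /* reset the interleave */
                nr_try_cma = MAX_TRY_CMA;
                return false;
        }
        nr_try_cma--;
        cma_used++;
        return true;
}

static void rmqueue(bool movable, bool count_all_normal)
{
        if (count_all_normal) {
                /* patched: every normal allocation consumes movable budget */
                if (movable && nr_try_movable <= 0) {
                        if (rmqueue_cma())
                                return;
                } else {
                        nr_try_movable--;
                }
        } else if (movable) {
                /* original: only movable allocations touch the counters */
                if (nr_try_movable > 0)
                        nr_try_movable--;
                else if (rmqueue_cma())
                        return;
                else
                        nr_try_movable--;
        }
        normal_used++;
}

int main(void)
{
        for (int patched = 0; patched <= 1; patched++) {
                nr_try_movable = MAX_TRY_MOVABLE;
                nr_try_cma = MAX_TRY_CMA;
                normal_used = cma_used = 0;
                for (int i = 0; i < 10000; i++)
                        rmqueue(i % 10 == 0, patched);  /* 1 in 10 is movable */
                printf("%s: normal=%ld cma=%ld\n",
                       patched ? "count all normal" : "count movable only",
                       normal_used, cma_used);
        }
        return 0;
}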

Signed-off-by: Hui Zhu 
Signed-off-by: Weixing Liu 
---
 mm/page_alloc.c | 41 -
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a8d9f03..a5bbc38 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1301,28 +1301,23 @@ static struct page *__rmqueue_cma(struct zone *zone, 
unsigned int order)
 {
struct page *page;
 
-   if (zone->nr_try_movable > 0)
-   goto alloc_movable;
+   if (zone->nr_try_cma <= 0) {
+   /* Reset counter */
+   zone->nr_try_movable = zone->max_try_movable;
+   zone->nr_try_cma = zone->max_try_cma;
 
-   if (zone->nr_try_cma > 0) {
-   /* Okay. Now, we can try to allocate the page from cma region */
-   zone->nr_try_cma -= 1 << order;
-   page = __rmqueue_smallest(zone, order, MIGRATE_CMA);
-
-   /* CMA pages can vanish through CMA allocation */
-   if (unlikely(!page && order == 0))
-   zone->nr_try_cma = 0;
-
-   return page;
+   return NULL;
}
 
-   /* Reset counter */
-   zone->nr_try_movable = zone->max_try_movable;
-   zone->nr_try_cma = zone->max_try_cma;
+   /* Okay. Now, we can try to allocate the page from cma region */
+   zone->nr_try_cma -= 1 << order;
+   page = __rmqueue_smallest(zone, order, MIGRATE_CMA);
 
-alloc_movable:
-   zone->nr_try_movable -= 1 << order;
-   return NULL;
+   /* CMA pages can vanish through CMA allocation */
+   if (unlikely(!page && order == 0))
+   zone->nr_try_cma = 0;
+
+   return page;
 }
 #endif
 
@@ -1335,9 +1330,13 @@ static struct page *__rmqueue(struct zone *zone, 
unsigned int order,
 {
struct page *page = NULL;
 
-   if (IS_ENABLED(CONFIG_CMA) &&
-   migratetype == MIGRATE_MOVABLE && zone->managed_cma_pages)
-   page = __rmqueue_cma(zone, order);
+   if (IS_ENABLED(CONFIG_CMA) && zone->managed_cma_pages) {
+   if (migratetype == MIGRATE_MOVABLE
+   && zone->nr_try_movable <= 0)
+   page = __rmqueue_cma(zone, order);
+   else
+   zone->nr_try_movable -= 1 << order;
+   }
 
 retry_reserve:
if (!page)
-- 
1.9.1



Re: [PATCH 4/4] (CMA_AGGRESSIVE) Update page alloc function

2014-11-27 Thread Hui Zhu
On Fri, Oct 24, 2014 at 1:28 PM, Joonsoo Kim  wrote:
> On Thu, Oct 16, 2014 at 11:35:51AM +0800, Hui Zhu wrote:
>> If page alloc function __rmqueue try to get pages from MIGRATE_MOVABLE and
>> conditions (cma_alloc_counter, cma_aggressive_free_min, cma_alloc_counter)
>> allow, MIGRATE_CMA will be allocated as MIGRATE_MOVABLE first.
>>
>> Signed-off-by: Hui Zhu 
>> ---
>>  mm/page_alloc.c | 42 +++---
>>  1 file changed, 31 insertions(+), 11 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 736d8e1..87bc326 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -65,6 +65,10 @@
>>  #include 
>>  #include "internal.h"
>>
>> +#ifdef CONFIG_CMA_AGGRESSIVE
>> +#include 
>> +#endif
>> +
>>  /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
>>  static DEFINE_MUTEX(pcp_batch_high_lock);
>>  #define MIN_PERCPU_PAGELIST_FRACTION (8)
>> @@ -1189,20 +1193,36 @@ static struct page *__rmqueue(struct zone *zone, 
>> unsigned int order,
>>  {
>>   struct page *page;
>>
>> -retry_reserve:
>> +#ifdef CONFIG_CMA_AGGRESSIVE
>> + if (cma_aggressive_switch
>> + && migratetype == MIGRATE_MOVABLE
>> + && atomic_read(&cma_alloc_counter) == 0
>> + && global_page_state(NR_FREE_CMA_PAGES) > cma_aggressive_free_min
>> + + (1 << order))
>> + migratetype = MIGRATE_CMA;
>> +#endif
>> +retry:
>
> I don't get it why cma_alloc_counter should be tested.
> When cma alloc is progress, pageblock is isolated so that pages on that
> pageblock cannot be allocated. Why should we prevent aggressive
> allocation in this case?
>

Hi Joonsoo,

Even though the pageblock is isolated at the beginning of
alloc_contig_range, it gets un-isolated again if alloc_contig_range hits
an error such as "PFNs busy".  And cma_alloc will keep calling
alloc_contig_range with another address if needed.

So cma_alloc_counter reduces the conflict between the CMA allocation in
cma_alloc and __rmqueue.

Thanks,
Hui

> Thanks.
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 


Re: [PATCH 0/4] (CMA_AGGRESSIVE) Make CMA memory be more aggressive about allocation

2014-11-04 Thread Hui Zhu
On Tue, Nov 4, 2014 at 3:53 PM, Minchan Kim  wrote:
> Hello,
>
> On Wed, Oct 29, 2014 at 03:43:33PM +0100, Vlastimil Babka wrote:
>> On 10/16/2014 10:55 AM, Laura Abbott wrote:
>> >On 10/15/2014 8:35 PM, Hui Zhu wrote:
>> >
>> >It's good to see another proposal to fix CMA utilization. Do you have
>> >any data about the success rate of CMA contiguous allocation after
>> >this patch series? I played around with a similar approach of using
>> >CMA for MIGRATE_MOVABLE allocations and found that although utilization
>> >did increase, contiguous allocations failed at a higher rate and were
>> >much slower. I see what this series is trying to do with avoiding
>> >allocation from CMA pages when a contiguous allocation is progress.
>> >My concern is that there would still be problems with contiguous
>> >allocation after all the MIGRATE_MOVABLE fallback has happened.
>>
>> Hi,
>>
>> did anyone try/suggest the following idea?
>>
>> - keep CMA as fallback to MOVABLE as is is now, i.e. non-agressive
>> - when UNMOVABLE (RECLAIMABLE also?) allocation fails and CMA
>> pageblocks have space, don't OOM immediately, but first try to
>> migrate some MOVABLE pages to CMA pageblocks, to make space for the
>> UNMOVABLE allocation in non-CMA pageblocks
>> - this should keep CMA pageblocks free as long as possible and
>> useful for CMA allocations, but without restricting the non-MOVABLE
>> allocations even though there is free memory (but in CMA pageblocks)
>> - the fact that a MOVABLE page could be successfully migrated to CMA
>> pageblock, means it was not pinned or otherwise non-migratable, so
>> there's a good chance it can be migrated back again if CMA
>> pageblocks need to be used by CMA allocation
>
> I suggested exactly same idea long time ago.
>
>> - it's more complex, but I guess we have most of the necessary
>> infrastructure in compaction already :)
>
> I agree but still, it doesn't solve reclaim problem(ie, VM doesn't
> need to reclaim CMA pages when memory pressure of unmovable pages
> happens). Of course, we could make VM be aware of that via introducing
> new flag of __isolate_lru_page.
>
> However, I'd like to think CMA design from the beginning.
> It made page allocation logic complicated, even very fragile as we
> had recently and now we need to add new logics to migrate like you said.
> As well, we need to fix reclaim path, too.
>
> It makes mm complicated day by day even though it doesn't do the role
> enough well(ie, big latency and frequent allocation failure) so I really
> want to stop making the mess bloated.
>
> Long time ago, when I saw Joonsoo's CMA agressive allocation patchset
> (ie, roundrobin allocation between CMA and normal movable pages)
> it was good to me at a first glance but it needs tweak of allocation
> path and doesn't solve reclaim path, either. Yes, reclaim path could
> be solved by another patch but I want to solve it altogether.
>
> At that time, I suggested big surgery to Joonsoo in offline that
> let's move CMA allocation with movable zone allocation. With it,
> we could make allocation/reclaim path simple but thing is we should
> make VM be aware of overlapping MOVABLE zone which means some of pages
> in the zone could be part of another zones but I think we already have
> logics to handle it when I read comment in isolate_freepages so I think
> the design should work.

Thanks.

>
> A thing you guys might worry is bigger CMA latency because it makes
> CMA memory usage ratio higher than the approach you mentioned but
> anyone couldn't guarantee it once memory is fully utilized.
> In addition, we have used fair zone allocator policy so it makes
> round robin allocation automatically so I believe it should be way
> to go.

Even if the kernel uses it to allocate CMA memory, CMA allocation
latency will still show up when most memory is already allocated and a
driver tries to get CMA memory.
https://lkml.org/lkml/2014/10/17/129
https://lkml.org/lkml/2014/10/17/130
These patches let cma_alloc do a shrink with shrink_all_memory_for_cma
when needed.  That handled a lot of the latency issues on my side.
And I think it could be made more configurable, for example so that some
devices use it and others don't.

Thanks,
Hui



>
>>
>> Thoughts?
>> Vlastimil
>>
>> >Thanks,
>> >Laura
>> >
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majord...@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 
>
> --
> Kind regards,
> Minchan Kim
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 


Re: [PATCH 0/4] (CMA_AGGRESSIVE) Make CMA memory be more aggressive about allocation

2014-11-03 Thread Hui Zhu
On Wed, Oct 29, 2014 at 10:43 PM, Vlastimil Babka  wrote:
> On 10/16/2014 10:55 AM, Laura Abbott wrote:
>>
>> On 10/15/2014 8:35 PM, Hui Zhu wrote:
>>
>> It's good to see another proposal to fix CMA utilization. Do you have
>> any data about the success rate of CMA contiguous allocation after
>> this patch series? I played around with a similar approach of using
>> CMA for MIGRATE_MOVABLE allocations and found that although utilization
>> did increase, contiguous allocations failed at a higher rate and were
>> much slower. I see what this series is trying to do with avoiding
>> allocation from CMA pages when a contiguous allocation is progress.
>> My concern is that there would still be problems with contiguous
>> allocation after all the MIGRATE_MOVABLE fallback has happened.
>
>
> Hi,
>
> did anyone try/suggest the following idea?
>
> - keep CMA as fallback to MOVABLE as is is now, i.e. non-agressive
> - when UNMOVABLE (RECLAIMABLE also?) allocation fails and CMA pageblocks
> have space, don't OOM immediately, but first try to migrate some MOVABLE
> pages to CMA pageblocks, to make space for the UNMOVABLE allocation in
> non-CMA pageblocks
> - this should keep CMA pageblocks free as long as possible and useful for
> CMA allocations, but without restricting the non-MOVABLE allocations even
> though there is free memory (but in CMA pageblocks)
> - the fact that a MOVABLE page could be successfully migrated to CMA
> pageblock, means it was not pinned or otherwise non-migratable, so there's a
> good chance it can be migrated back again if CMA pageblocks need to be used
> by CMA allocation
> - it's more complex, but I guess we have most of the necessary
> infrastructure in compaction already :)

I think this idea makes the CMA allocation part more complex but makes
the balance and shrink code easier, because it makes CMA behave like real
memory.
I just worry about the speed of migrating memory with this idea.  :)

Thanks,
Hui


>
> Thoughts?
> Vlastimil
>
>> Thanks,
>> Laura
>>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 


Re: [PATCH v5 2/4] mm/page_alloc: add freepage on isolate pageblock to correct buddy list

2014-11-03 Thread Hui Zhu
On Mon, Nov 3, 2014 at 4:22 PM, Heesub Shin  wrote:
> Hello,
>
>
> On 10/31/2014 04:25 PM, Joonsoo Kim wrote:
>>
>> In free_pcppages_bulk(), we use cached migratetype of freepage
>> to determine type of buddy list where freepage will be added.
>> This information is stored when freepage is added to pcp list, so
>> if isolation of pageblock of this freepage begins after storing,
>> this cached information could be stale. In other words, it has
>> original migratetype rather than MIGRATE_ISOLATE.
>>
>> There are two problems caused by this stale information. One is that
>> we can't keep these freepages from being allocated. Although this
>> pageblock is isolated, freepage will be added to normal buddy list
>> so that it could be allocated without any restriction. And the other
>> problem is incorrect freepage accounting. Freepages on isolate pageblock
>> should not be counted for number of freepage.
>>
>> Following is the code snippet in free_pcppages_bulk().
>>
>> /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
>> __free_one_page(page, page_to_pfn(page), zone, 0, mt);
>> trace_mm_page_pcpu_drain(page, 0, mt);
>> if (likely(!is_migrate_isolate_page(page))) {
>> __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
>> if (is_migrate_cma(mt))
>> __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
>> }
>>
>> As you can see above snippet, current code already handle second problem,
>> incorrect freepage accounting, by re-fetching pageblock migratetype
>> through is_migrate_isolate_page(page). But, because this re-fetched
>> information isn't used for __free_one_page(), first problem would not be
>> solved. This patch try to solve this situation to re-fetch pageblock
>> migratetype before __free_one_page() and to use it for __free_one_page().
>>
>> In addition to move up position of this re-fetch, this patch use
>> optimization technique, re-fetching migratetype only if there is
>> isolate pageblock. Pageblock isolation is rare event, so we can
>> avoid re-fetching in common case with this optimization.
>>
>> This patch also correct migratetype of the tracepoint output.
>>
>> Cc: 
>> Acked-by: Minchan Kim 
>> Acked-by: Michal Nazarewicz 
>> Acked-by: Vlastimil Babka 
>> Signed-off-by: Joonsoo Kim 
>> ---
>>   mm/page_alloc.c |   13 -
>>   1 file changed, 8 insertions(+), 5 deletions(-)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index f7a867e..6df23fe 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -725,14 +725,17 @@ static void free_pcppages_bulk(struct zone *zone,
>> int count,
>> /* must delete as __free_one_page list manipulates
>> */
>> list_del(&page->lru);
>> mt = get_freepage_migratetype(page);
>> +   if (unlikely(has_isolate_pageblock(zone))) {
>
>
> How about adding an additional check for 'mt == MIGRATE_MOVABLE' here? Then,
> most of get_pageblock_migratetype() calls could be avoided while the
> isolation is in progress. I am not sure this is the case on memory
> offlining. How do you think?

I think the reason is that this "mt" may not be the right value for this
page, since it is set without zone->lock held.

Thanks,
Hui

>
>> +   mt = get_pageblock_migratetype(page);
>> +   if (is_migrate_isolate(mt))
>> +   goto skip_counting;
>> +   }
>> +   __mod_zone_freepage_state(zone, 1, mt);
>> +
>> +skip_counting:
>> /* MIGRATE_MOVABLE list may include
>> MIGRATE_RESERVEs */
>> __free_one_page(page, page_to_pfn(page), zone, 0,
>> mt);
>> trace_mm_page_pcpu_drain(page, 0, mt);
>> -   if (likely(!is_migrate_isolate_page(page))) {
>> -   __mod_zone_page_state(zone, NR_FREE_PAGES,
>> 1);
>> -   if (is_migrate_cma(mt))
>> -   __mod_zone_page_state(zone,
>> NR_FREE_CMA_PAGES, 1);
>> -   }
>> } while (--to_free && --batch_free && !list_empty(list));
>> }
>> spin_unlock(&zone->lock);
>>
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 


Re: [PATCH 0/4] (CMA_AGGRESSIVE) Make CMA memory be more aggressive about allocation

2014-11-02 Thread Hui Zhu
On Fri, Oct 24, 2014 at 1:25 PM, Joonsoo Kim  wrote:
> On Thu, Oct 16, 2014 at 11:35:47AM +0800, Hui Zhu wrote:
>> In fallbacks of page_alloc.c, MIGRATE_CMA is the fallback of
>> MIGRATE_MOVABLE.
>> MIGRATE_MOVABLE will use MIGRATE_CMA when it doesn't have a page in
>> order that Linux kernel want.
>>
>> If a system that has a lot of user space program is running, for
>> instance, an Android board, most of memory is in MIGRATE_MOVABLE and
>> allocated.  Before function __rmqueue_fallback get memory from
>> MIGRATE_CMA, the oom_killer will kill a task to release memory when
>> kernel want get MIGRATE_UNMOVABLE memory because fallbacks of
>> MIGRATE_UNMOVABLE are MIGRATE_RECLAIMABLE and MIGRATE_MOVABLE.
>> This status is odd.  The MIGRATE_CMA has a lot free memory but Linux
>> kernel kill some tasks to release memory.
>>
>> This patch series adds a new function CMA_AGGRESSIVE to make CMA memory
>> be more aggressive about allocation.
>> If function CMA_AGGRESSIVE is available, when Linux kernel call function
>> __rmqueue try to get pages from MIGRATE_MOVABLE and conditions allow,
>> MIGRATE_CMA will be allocated as MIGRATE_MOVABLE first.  If MIGRATE_CMA
>> doesn't have enough pages for allocation, go back to allocate memory from
>> MIGRATE_MOVABLE.
>> Then the memory of MIGRATE_MOVABLE can be kept for MIGRATE_UNMOVABLE and
>> MIGRATE_RECLAIMABLE which doesn't have fallback MIGRATE_CMA.
>
> Hello,
>
> I did some work similar to this.
> Please reference following links.
>
> https://lkml.org/lkml/2014/5/28/64
> https://lkml.org/lkml/2014/5/28/57

> I tested #1 approach and found the problem. Although free memory on
> meminfo can move around low watermark, there is large fluctuation on free
> memory, because too many pages are reclaimed when kswapd is invoked.
> Reason for this behaviour is that successive allocated CMA pages are
> on the LRU list in that order and kswapd reclaim them in same order.
> These memory doesn't help watermark checking from kwapd, so too many
> pages are reclaimed, I guess.

This issue can be handled with some changes around the shrink code.  I
am trying to put together a patch for that.
But I am not sure we hit the same issue.  Would you mind giving me more
info about this part?

>
> And, aggressive allocation should be postponed until freepage counting
> bug is fixed, because aggressive allocation enlarge the possiblity
> of problem occurence. I tried to fix that bug, too. See following link.
>
> https://lkml.org/lkml/2014/10/23/90

I am following these patches.  They are great!  Thanks for your work.

Best,
Hui

>
> Thanks.
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org";> em...@kvack.org 


Re: [PATCH v2 2/3] CMA: aggressively allocate the pages on cma reserved memory when not used

2014-10-30 Thread Hui Zhu
On Wed, May 28, 2014 at 3:04 PM, Joonsoo Kim  wrote:
> CMA is introduced to provide physically contiguous pages at runtime.
> For this purpose, it reserves memory at boot time. Although it reserve
> memory, this reserved memory can be used for movable memory allocation
> request. This usecase is beneficial to the system that needs this CMA
> reserved memory infrequently and it is one of main purpose of
> introducing CMA.
>
> But, there is a problem in current implementation. The problem is that
> it works like as just reserved memory approach. The pages on cma reserved
> memory are hardly used for movable memory allocation. This is caused by
> combination of allocation and reclaim policy.
>
> The pages on cma reserved memory are allocated if there is no movable
> memory, that is, as fallback allocation. So the time this fallback
> allocation is started is under heavy memory pressure. Although it is under
> memory pressure, movable allocation easily succeed, since there would be
> many pages on cma reserved memory. But this is not the case for unmovable
> and reclaimable allocation, because they can't use the pages on cma
> reserved memory. These allocations regard system's free memory as
> (free pages - free cma pages) on watermark checking, that is, free
> unmovable pages + free reclaimable pages + free movable pages. Because
> we already exhausted movable pages, only free pages we have are unmovable
> and reclaimable types and this would be really small amount. So watermark
> checking would be failed. It will wake up kswapd to make enough free
> memory for unmovable and reclaimable allocation and kswapd will do.
> So before we fully utilize pages on cma reserved memory, kswapd start to
> reclaim memory and try to make free memory over the high watermark. This
> watermark checking by kswapd doesn't take care free cma pages so many
> movable pages would be reclaimed. After then, we have a lot of movable
> pages again, so fallback allocation doesn't happen again. To conclude,
> amount of free memory on meminfo which includes free CMA pages is moving
> around 512 MB if I reserve 512 MB memory for CMA.
>
> I found this problem on following experiment.
>
> 4 CPUs, 1024 MB, VIRTUAL MACHINE
> make -j16
>
> CMA reserve:0 MB512 MB
> Elapsed-time:   225.2   472.5
> Average-MemFree:322490 KB   630839 KB
>
> To solve this problem, I can think following 2 possible solutions.
> 1. allocate the pages on cma reserved memory first, and if they are
>exhausted, allocate movable pages.
> 2. interleaved allocation: try to allocate specific amounts of memory
>from cma reserved memory and then allocate from free movable memory.
>
> I tested #1 approach and found the problem. Although free memory on
> meminfo can move around low watermark, there is large fluctuation on free
> memory, because too many pages are reclaimed when kswapd is invoked.
> Reason for this behaviour is that successive allocated CMA pages are
> on the LRU list in that order and kswapd reclaim them in same order.
> These memory doesn't help watermark checking from kwapd, so too many
> pages are reclaimed, I guess.

Could you send more information about this part?  I want to do some
testing around it.
I use this approach in my patch.

Thanks,
Hui

>
> So, I implement #2 approach.
> One thing I should note is that we should not change allocation target
> (movable list or cma) on each allocation attempt, since this prevent
> allocated pages to be in physically succession, so some I/O devices can
> be hurt their performance. To solve this, I keep allocation target
> in at least pageblock_nr_pages attempts and make this number reflect
> ratio, free pages without free cma pages to free cma pages. With this
> approach, system works very smoothly and fully utilize the pages on
> cma reserved memory.
>
> Following is the experimental result of this patch.
>
> 4 CPUs, 1024 MB, VIRTUAL MACHINE
> make -j16
>
> 
> CMA reserve:0 MB512 MB
> Elapsed-time:   225.2   472.5
> Average-MemFree:322490 KB   630839 KB
> nr_free_cma:0   131068
> pswpin: 0   261666
> pswpout:75  1241363
>
> 
> CMA reserve:0 MB512 MB
> Elapsed-time:   222.7   224
> Average-MemFree:325595 KB   393033 KB
> nr_free_cma:0   61001
> pswpin: 0   6
> pswpout:44  502
>
> There is no difference if we don't have cma reserved memory (0 MB case).
> But, with cma reserved memory (512 MB case), we fully utilize these
> reserved memory through this patch and the system behaves like as
> it doesn't reserve any memory.
>
> With this patch, we aggressively allocate the pages on cma reserved memory
> so latency of CMA can arise. Below is the experimental result about
> latency.
>
> 4 CPUs, 1024 MB, VIRTUAL MACH

[PATCH v2 3/4] (CMA_AGGRESSIVE) Update reserve custom contiguous area code

2014-10-17 Thread Hui Zhu
Updated this patch according to the comments from Rafael.

Add cma_alloc_counter, cma_aggressive_switch, cma_aggressive_free_min and
cma_aggressive_shrink_switch.

cma_aggressive_switch is the switch for all CMA_AGGRESSIVE functionality.
It can be controlled by sysctl vm.cma-aggressive-switch.

cma_aggressive_free_min can be controlled by sysctl
"vm.cma-aggressive-free-min".  If the number of free CMA pages is smaller
than this sysctl value, CMA_AGGRESSIVE will not work in the page alloc
code.

cma_aggressive_shrink_switch can be controlled by sysctl
"vm.cma-aggressive-shrink-switch".  If this sysctl is true and the amount
of free normal memory is smaller than the size to be allocated, do a
memory shrink with shrink_all_memory_for_cma before the driver allocates
pages from CMA.

When the Linux kernel tries to reserve a custom contiguous area, the
value of cma_alloc_counter is increased, and CMA_AGGRESSIVE will not work
in the page alloc code.
After the reserve function returns, the value of cma_alloc_counter is
decreased.
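
Hypothetical usage sketch (not part of the patch): with this patch applied,
the knobs should appear under /proc/sys/vm/ per the vm_table entries below,
so a root process could tune them like this; the exact path and value here
are only an assumption for illustration.

#include <stdio.h>

int main(void)
{
        /* assumed path; requires the patched kernel and root (mode 0600) */
        FILE *f = fopen("/proc/sys/vm/cma-aggressive-free-min", "w");

        if (!f) {
                perror("cma-aggressive-free-min");
                return 1;
        }
        fprintf(f, "%d\n", 1000);       /* require at least 1000 free CMA pages */
        fclose(f);
        return 0;
}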

Signed-off-by: Hui Zhu 
---
 include/linux/cma.h |  7 +++
 kernel/sysctl.c | 27 +++
 mm/cma.c| 54 +
 3 files changed, 88 insertions(+)

diff --git a/include/linux/cma.h b/include/linux/cma.h
index 0430ed0..df96abf 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -15,6 +15,13 @@
 
 struct cma;
 
+#ifdef CONFIG_CMA_AGGRESSIVE
+extern atomic_t cma_alloc_counter;
+extern int cma_aggressive_switch;
+extern unsigned long cma_aggressive_free_min;
+extern int cma_aggressive_shrink_switch;
+#endif
+
 extern phys_addr_t cma_get_base(struct cma *cma);
 extern unsigned long cma_get_size(struct cma *cma);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4aada6d..646929e2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -92,6 +92,10 @@
 #include 
 #endif
 
+#ifdef CONFIG_CMA_AGGRESSIVE
+#include 
+#endif
+
 
 #if defined(CONFIG_SYSCTL)
 
@@ -1485,6 +1489,29 @@ static struct ctl_table vm_table[] = {
.mode   = 0644,
.proc_handler   = proc_doulongvec_minmax,
},
+#ifdef CONFIG_CMA_AGGRESSIVE
+   {
+   .procname   = "cma-aggressive-switch",
+   .data   = &cma_aggressive_switch,
+   .maxlen = sizeof(int),
+   .mode   = 0600,
+   .proc_handler   = proc_dointvec,
+   },
+   {
+   .procname   = "cma-aggressive-free-min",
+   .data   = &cma_aggressive_free_min,
+   .maxlen = sizeof(unsigned long),
+   .mode   = 0600,
+   .proc_handler   = proc_doulongvec_minmax,
+   },
+   {
+   .procname   = "cma-aggressive-shrink-switch",
+   .data   = &cma_aggressive_shrink_switch,
+   .maxlen = sizeof(int),
+   .mode   = 0600,
+   .proc_handler   = proc_dointvec,
+   },
+#endif
{ }
 };
 
diff --git a/mm/cma.c b/mm/cma.c
index 963bc4a..1cf341c 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct cma {
unsigned long   base_pfn;
@@ -127,6 +128,27 @@ err:
return -EINVAL;
 }
 
+#ifdef CONFIG_CMA_AGGRESSIVE
+/* The counter for the dma_alloc_from_contiguous and
+   dma_release_from_contiguous.  */
+atomic_t cma_alloc_counter = ATOMIC_INIT(0);
+
+/* Swich of CMA_AGGRESSIVE.  */
+int cma_aggressive_switch __read_mostly;
+
+/* If the number of CMA free pages is small than this value, CMA_AGGRESSIVE 
will
+   not work. */
+#ifdef CONFIG_CMA_AGGRESSIVE_FREE_MIN
+unsigned long cma_aggressive_free_min __read_mostly =
+   CONFIG_CMA_AGGRESSIVE_FREE_MIN;
+#else
+unsigned long cma_aggressive_free_min __read_mostly = 500;
+#endif
+
+/* Swich of CMA_AGGRESSIVE shink.  */
+int cma_aggressive_shrink_switch __read_mostly;
+#endif
+
 static int __init cma_init_reserved_areas(void)
 {
int i;
@@ -138,6 +160,22 @@ static int __init cma_init_reserved_areas(void)
return ret;
}
 
+#ifdef CONFIG_CMA_AGGRESSIVE
+   cma_aggressive_switch = 0;
+#ifdef CONFIG_CMA_AGGRESSIVE_PHY_MAX
+   if (memblock_phys_mem_size() <= CONFIG_CMA_AGGRESSIVE_PHY_MAX)
+#else
+   if (memblock_phys_mem_size() <= 0x4000)
+#endif
+   cma_aggressive_switch = 1;
+
+   cma_aggressive_shrink_switch = 0;
+#ifdef CONFIG_CMA_AGGRESSIVE_SHRINK
+   if (cma_aggressive_switch)
+   cma_aggressive_shrink_switch = 1;
+#endif
+#endif
+
return 0;
 }
 core_initcall(cma_init_reserved_areas);
@@ -312,6 +350,11 @@ struct page *cma_alloc(struct cma *cma, int count, 
unsigned int align)
unsigned long bitmap_maxno, bitmap_no, bitmap_coun

[PATCH v2 2/4] (CMA_AGGRESSIVE) Add new function shrink_all_memory_for_cma

2014-10-17 Thread Hui Zhu
Updated this patch according to the comments from Rafael.

Function shrink_all_memory_for_cma tries to free `nr_to_reclaim' pages of
memory.  The CMA aggressive shrink code calls this function to free
`nr_to_reclaim' pages.

Signed-off-by: Hui Zhu 
---
 mm/vmscan.c | 58 +++---
 1 file changed, 43 insertions(+), 15 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcb4707..658dc8d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3404,6 +3404,28 @@ void wakeup_kswapd(struct zone *zone, int order, enum 
zone_type classzone_idx)
wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
+#if defined CONFIG_HIBERNATION || defined CONFIG_CMA_AGGRESSIVE
+static unsigned long __shrink_all_memory(struct scan_control *sc)
+{
+   struct reclaim_state reclaim_state;
+   struct zonelist *zonelist = node_zonelist(numa_node_id(), sc->gfp_mask);
+   struct task_struct *p = current;
+   unsigned long nr_reclaimed;
+
+   p->flags |= PF_MEMALLOC;
+   lockdep_set_current_reclaim_state(sc->gfp_mask);
+   reclaim_state.reclaimed_slab = 0;
+   p->reclaim_state = &reclaim_state;
+
+   nr_reclaimed = do_try_to_free_pages(zonelist, sc);
+
+   p->reclaim_state = NULL;
+   lockdep_clear_current_reclaim_state();
+   p->flags &= ~PF_MEMALLOC;
+
+   return nr_reclaimed;
+}
+
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3415,7 +3437,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum 
zone_type classzone_idx)
  */
 unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 {
-   struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3425,24 +3446,31 @@ unsigned long shrink_all_memory(unsigned long 
nr_to_reclaim)
.may_swap = 1,
.hibernation_mode = 1,
};
-   struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
-   struct task_struct *p = current;
-   unsigned long nr_reclaimed;
-
-   p->flags |= PF_MEMALLOC;
-   lockdep_set_current_reclaim_state(sc.gfp_mask);
-   reclaim_state.reclaimed_slab = 0;
-   p->reclaim_state = &reclaim_state;
 
-   nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+   return __shrink_all_memory(&sc);
+}
+#endif /* CONFIG_HIBERNATION */
 
-   p->reclaim_state = NULL;
-   lockdep_clear_current_reclaim_state();
-   p->flags &= ~PF_MEMALLOC;
+#ifdef CONFIG_CMA_AGGRESSIVE
+/*
+ * Try to free `nr_to_reclaim' of memory, system-wide, for CMA aggressive
+ * shrink function.
+ */
+void shrink_all_memory_for_cma(unsigned long nr_to_reclaim)
+{
+   struct scan_control sc = {
+   .nr_to_reclaim = nr_to_reclaim,
+   .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_HIGHMEM,
+   .priority = DEF_PRIORITY,
+   .may_writepage = !laptop_mode,
+   .may_unmap = 1,
+   .may_swap = 1,
+   };
 
-   return nr_reclaimed;
+   __shrink_all_memory(&sc);
 }
-#endif /* CONFIG_HIBERNATION */
+#endif /* CONFIG_CMA_AGGRESSIVE */
+#endif /* CONFIG_HIBERNATION || CONFIG_CMA_AGGRESSIVE */
 
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness.  So if the last cpu in a node goes
-- 
1.9.1



[PATCH 0/4] (CMA_AGGRESSIVE) Make CMA memory be more aggressive about allocation

2014-10-15 Thread Hui Zhu
In the fallbacks table of page_alloc.c, MIGRATE_CMA is a fallback of
MIGRATE_MOVABLE.
MIGRATE_MOVABLE will use MIGRATE_CMA when it doesn't have a page of the
order that the Linux kernel wants.

On a system that runs a lot of user space programs, for instance an
Android board, most of the memory is in MIGRATE_MOVABLE and allocated.
Before __rmqueue_fallback gets memory from MIGRATE_CMA, the OOM killer
will kill a task to release memory whenever the kernel wants
MIGRATE_UNMOVABLE memory, because the fallbacks of MIGRATE_UNMOVABLE are
only MIGRATE_RECLAIMABLE and MIGRATE_MOVABLE.
This situation is odd: MIGRATE_CMA has a lot of free memory, but the
Linux kernel kills tasks to release memory.

This patch series adds a new feature, CMA_AGGRESSIVE, to make CMA memory
allocation more aggressive.
When CMA_AGGRESSIVE is enabled and the Linux kernel calls __rmqueue to
get pages from MIGRATE_MOVABLE, if conditions allow, MIGRATE_CMA is
allocated as MIGRATE_MOVABLE first.  If MIGRATE_CMA doesn't have enough
pages for the allocation, it goes back to allocating memory from
MIGRATE_MOVABLE.
The memory of MIGRATE_MOVABLE can then be kept for MIGRATE_UNMOVABLE and
MIGRATE_RECLAIMABLE, which don't have MIGRATE_CMA as a fallback.
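
For illustration only, a userspace sketch of the decision this series makes
in __rmqueue (a toy, not the kernel patch; the conditions are taken from
patch 4/4 of the series):

#include <stdbool.h>
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_CMA };

static bool cma_aggressive_switch = true;
static long cma_alloc_counter;          /* non-zero while cma_alloc() runs */
static long free_cma_pages = 4096;
static long cma_aggressive_free_min = 500;

static int pick_migratetype(int migratetype, int order)
{
        if (cma_aggressive_switch &&
            migratetype == MIGRATE_MOVABLE &&
            cma_alloc_counter == 0 &&
            free_cma_pages > cma_aggressive_free_min + (1L << order))
                return MIGRATE_CMA;     /* serve the movable request from CMA first */
        return migratetype;             /* otherwise use the normal path */
}

int main(void)
{
        printf("movable, idle:     %s\n",
               pick_migratetype(MIGRATE_MOVABLE, 0) == MIGRATE_CMA ?
               "use CMA" : "use movable");
        cma_alloc_counter = 1;          /* a contiguous allocation is in flight */
        printf("movable, cma busy: %s\n",
               pick_migratetype(MIGRATE_MOVABLE, 0) == MIGRATE_CMA ?
               "use CMA" : "use movable");
        return 0;
}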


