When file refaults are detected and there are many inactive file pages,
the system never reclaims anonymous pages.  The file pages are dropped
aggressively even though there are still a lot of cold anonymous pages,
and the system thrashes.  This issue impacts the performance of
applications with large executables, e.g. chrome.

With this patch, when a file refault is detected, inactive_list_is_low()
always returns true for file pages in get_scan_count() to enable
scanning anonymous pages.

The problem can be reproduced by the following test program.

---8<---
/*
 * Make sure @filename exists and is at least @size bytes long.
 *
 * If stat() succeeds and already reports st_size >= @size, nothing is done.
 * Otherwise the file is opened write-only (created with mode 0600 if it does
 * not exist) and posix_fallocate() reserves its first @size bytes.
 *
 * Any failure prints a diagnostic to stderr and ends the program with exit
 * status 1.
 */
void fallocate_file(const char *filename, off_t size)
{
        struct stat st;
        int fd, err;

        if (!stat(filename, &st) && st.st_size >= size)
                return;

        fd = open(filename, O_WRONLY | O_CREAT, 0600);
        if (fd < 0) {
                perror("create file");
                exit(1);
        }

        /*
         * posix_fallocate() reports failure through its return value and
         * leaves errno alone, so perror() would print an unrelated message.
         * Format the returned error number explicitly instead.
         */
        err = posix_fallocate(fd, 0, size);
        if (err) {
                fprintf(stderr, "fallocate: %s\n", strerror(err));
                close(fd);
                exit(1);
        }
        close(fd);
}

/*
 * Allocate @size bytes with malloc() and set every byte to 1.
 *
 * Writing to the whole buffer means the memory is really touched rather
 * than merely reserved.  Returns the start of the buffer; the caller owns
 * it.  If the allocation fails a diagnostic is printed and the program ends
 * with exit status 1 (the original wrote through the NULL pointer).
 */
long *alloc_anon(long size)
{
        long *start = malloc(size);

        if (!start) {
                perror("malloc");
                exit(1);
        }
        memset(start, 1, size);
        return start;
}

/*
 * Map @filename as two halves of @size / 2 bytes each and read one byte per
 * page from both halves, @rounds times over.
 *
 * The elapsed wall-clock time of every round is printed to stdout.  The
 * return value is the sum of all bytes read; the caller prints it so the
 * loads cannot be optimized away.
 *
 * The file descriptor and both mappings are released before returning.
 * Any open/mmap/sysconf failure prints a diagnostic and ends the program
 * with exit status 1.
 */
long access_file(const char *filename, long size, long rounds)
{
        const long page_size = sysconf(_SC_PAGESIZE);
        const long half = size / 2;
        volatile char *start1, *end1, *start2;
        long sum = 0;
        long i;
        int fd;

        if (page_size <= 0) {
                perror("sysconf");
                exit(1);
        }

        fd = open(filename, O_RDONLY);
        if (fd == -1) {
                perror("open");
                exit(1);
        }

        /*
         * Some applications, e.g. chrome, use a lot of executable file
         * pages, map some of the pages with PROT_EXEC flag to simulate
         * the behavior.
         */
        start1 = mmap(NULL, half, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
        if (start1 == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }
        end1 = start1 + half;

        start2 = mmap(NULL, half, PROT_READ, MAP_SHARED, fd, half);
        if (start2 == MAP_FAILED) {
                perror("mmap");
                exit(1);
        }

        /* The mappings stay valid after close(); don't leak the descriptor. */
        close(fd);

        for (i = 0; i < rounds; ++i) {
                struct timeval before, after;
                volatile char *ptr1 = start1, *ptr2 = start2;

                gettimeofday(&before, NULL);
                /* Both halves have the same length, so ptr1 bounds ptr2 too. */
                for (; ptr1 < end1; ptr1 += page_size, ptr2 += page_size)
                        sum += *ptr1 + *ptr2;
                gettimeofday(&after, NULL);
                printf("File access time, round %ld: %f (sec)\n", i,
                       (after.tv_sec - before.tv_sec) +
                       (after.tv_usec - before.tv_usec) / 1000000.0);
        }

        /* The casts only drop the volatile qualifier for munmap(). */
        munmap((void *)start1, half);
        munmap((void *)start2, half);
        return sum;
}

/*
 * Parse @text as a non-negative decimal number whose product with @scale
 * still fits in a long.  @what names the argument in the error message.
 * On any parse or range error a diagnostic is printed and the program ends
 * with exit status 1.
 */
static long parse_count(const char *text, const char *what, long scale)
{
        /* Largest long, spelled out so no extra header is required. */
        const long long_max = (long)(~0UL >> 1);
        char *end;
        long value = strtol(text, &end, 10);

        if (end == text || *end != '\0' || value < 0 ||
            value > long_max / scale) {
                fprintf(stderr, "invalid %s: %s\n", what, text);
                exit(1);
        }
        return value;
}

/*
 * usage: thrash ANON_MB FILE_MB FILE_ROUNDS
 *
 * Makes sure the file "large" is FILE_MB megabytes, allocates and touches
 * ANON_MB megabytes of anonymous memory, then reads the file through mmap
 * FILE_ROUNDS times while printing the time taken by each round.
 */
int main(int argc, char *argv[])
{
        const long MB = 1024 * 1024;
        long anon_mb, file_mb, file_rounds;
        const char filename[] = "large";
        long *ret1;
        long ret2;

        if (argc != 4) {
                fprintf(stderr, "usage: thrash ANON_MB FILE_MB FILE_ROUNDS\n");
                exit(1);
        }

        /*
         * atoi() cannot report errors; validate the arguments so garbage or
         * negative sizes never reach malloc()/mmap() and "x * MB" below
         * cannot overflow.
         */
        anon_mb = parse_count(argv[1], "ANON_MB", MB);
        file_mb = parse_count(argv[2], "FILE_MB", MB);
        file_rounds = parse_count(argv[3], "FILE_ROUNDS", 1);

        fallocate_file(filename, file_mb * MB);
        printf("Allocate %ld MB anonymous pages\n", anon_mb);
        ret1 = alloc_anon(anon_mb * MB);
        printf("Access %ld MB file pages\n", file_mb);
        ret2 = access_file(filename, file_mb * MB, file_rounds);
        printf("Print result to prevent optimization: %ld\n",
               *ret1 + ret2);
        free(ret1);
        return 0;
}
---8<---

Running the test program on a 2GB RAM VM with kernel 5.2.0-rc5, the
program fills RAM with 2048 MB of anonymous memory and then accesses a
200 MB file 10 times.  Without this patch, the file cache is dropped
aggressively and every access to the file is served from disk.

  $ ./thrash 2048 200 10
  Allocate 2048 MB anonymous pages
  Access 200 MB file pages
  File access time, round 0: 2.489316 (sec)
  File access time, round 1: 2.581277 (sec)
  File access time, round 2: 2.487624 (sec)
  File access time, round 3: 2.449100 (sec)
  File access time, round 4: 2.420423 (sec)
  File access time, round 5: 2.343411 (sec)
  File access time, round 6: 2.454833 (sec)
  File access time, round 7: 2.483398 (sec)
  File access time, round 8: 2.572701 (sec)
  File access time, round 9: 2.493014 (sec)

With this patch, these file pages can be cached.

  $ ./thrash 2048 200 10
  Allocate 2048 MB anonymous pages
  Access 200 MB file pages
  File access time, round 0: 2.475189 (sec)
  File access time, round 1: 2.440777 (sec)
  File access time, round 2: 2.411671 (sec)
  File access time, round 3: 1.955267 (sec)
  File access time, round 4: 0.029924 (sec)
  File access time, round 5: 0.000808 (sec)
  File access time, round 6: 0.000771 (sec)
  File access time, round 7: 0.000746 (sec)
  File access time, round 8: 0.000738 (sec)
  File access time, round 9: 0.000747 (sec)

Fixes: e9868505987a ("mm,vmscan: only evict file pages when we have plenty")
Fixes: 7c5bd705d8f9 ("mm: memcg: only evict file pages when we have plenty")
Signed-off-by: Kuo-Hsin Yang <vo...@chromium.org>
Acked-by: Johannes Weiner <han...@cmpxchg.org>
Cc: <sta...@vger.kernel.org> # 4.12+
---
 mm/vmscan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7889f583ced9f..da0b97204372e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2125,7 +2125,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-                                struct scan_control *sc, bool actual_reclaim)
+                                struct scan_control *sc, bool trace)
 {
        enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2151,7 +2151,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
         * rid of the stale workingset quickly.
         */
        refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
-       if (file && actual_reclaim && lruvec->refaults != refaults) {
+       if (file && lruvec->refaults != refaults) {
                inactive_ratio = 0;
        } else {
                gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2161,7 +2161,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
                        inactive_ratio = 1;
        }
 
-       if (actual_reclaim)
+       if (trace)
                trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
                        lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
                        lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
-- 
2.22.0.410.gd8fdbe21b5-goog

Reply via email to