This patch reworks the file cache scanning mode of the page-types tool:
when page-types is called with -f <filepath>, it scans the pages in the
page cache tree of the specified file via the /proc/kpagecache interface.

The original implementation did mmap/madvise/mincore/pagemap over the
page cache of the target file(s), which causes a lot of measurement
disturbance. This patch avoids this by using /proc/kpagecache.
Also, page-types does recursive walking when the -f option specifies a
directory, which is too much, so let's keep it compact for code
maintainability.
We can do a similar thing more flexibly, for example with the following:

  find /tmp | \
      while read f ; do tools/vm/page-types -f $f ; done | \
      grep 0x | tr -s '\t' ' ' | awk '
    {
      label = $4;
      arr[label] = arr[label] + $2;
    }
    END {
      for ( a in arr ) {
        printf("%s %ld\n", a, arr[a]);
      }
    }
  '

This script gets a page status summary of all files under /tmp; its output
looks like this:

  __RUDl________b_____________________ 2   # page count
  __RUDlA_______b_____________________ 4

ChangeLog:
- rebased onto v3.15-rc5 (resolved conflict with Konstantin's patch,
  commit 65a6a4105f "tools/vm/page-types.c: page-cache sniffing feature")

Signed-off-by: Naoya Horiguchi <[email protected]>
---
 tools/vm/page-types.c | 276 +++++++++++++++++++++-----------------------------
 1 file changed, 114 insertions(+), 162 deletions(-)

diff --git v3.15-rc5.orig/tools/vm/page-types.c v3.15-rc5/tools/vm/page-types.c
index 05654f5e48d5..a0fb55489ea7 100644
--- v3.15-rc5.orig/tools/vm/page-types.c
+++ v3.15-rc5/tools/vm/page-types.c
@@ -30,14 +30,12 @@
 #include <getopt.h>
 #include <limits.h>
 #include <assert.h>
-#include <ftw.h>
-#include <time.h>
 #include <sys/types.h>
 #include <sys/errno.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
+#include <sys/stat.h>
 #include <sys/statfs.h>
-#include <sys/mman.h>
 #include "../../include/uapi/linux/magic.h"
 #include "../../include/uapi/linux/kernel-page-flags.h"
 #include <api/fs/debugfs.h>
@@ -79,6 +77,7 @@
 
 #define KPF_BYTES              8
 #define PROC_KPAGEFLAGS                "/proc/kpageflags"
+#define PROC_KPAGECACHE                "/proc/kpagecache"
 
 /* [32-] kernel hacking assistances */
 #define KPF_RESERVED           32
@@ -162,7 +161,7 @@ static int          opt_raw;        /* for kernel developers */
 static int             opt_list;       /* list pages (in ranges) */
 static int             opt_no_summary; /* don't show summary */
 static pid_t           opt_pid;        /* process to walk */
-const char *           opt_file;
+static char            *opt_file;      /* walk over pagecache of file */
 
 #define MAX_ADDR_RANGES        1024
 static int             nr_addr_ranges;
@@ -183,6 +182,7 @@ static int          page_size;
 
 static int             pagemap_fd;
 static int             kpageflags_fd;
+static int             kpagecache_fd;
 
 static int             opt_hwpoison;
 static int             opt_unpoison;
@@ -276,6 +276,13 @@ static unsigned long kpageflags_read(uint64_t *buf,
        return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages);
 }
 
+static unsigned long kpagecache_read(uint64_t *buf,
+                                    unsigned long index,
+                                    unsigned long pages)
+{
+       return do_u64_read(kpagecache_fd, PROC_KPAGECACHE, buf, index, pages);
+}
+
 static unsigned long pagemap_read(uint64_t *buf,
                                  unsigned long index,
                                  unsigned long pages)
@@ -338,53 +345,62 @@ static char *page_flag_longname(uint64_t flags)
        return buf;
 }
 
+#define __NR_PAGECACHE_TAGS    3
+#define KPC_TAGS_BITS  __NR_PAGECACHE_TAGS
+#define KPC_TAGS_OFFSET        (64 - KPC_TAGS_BITS)
+#define KPC_TAGS_MASK  (((1ULL << KPC_TAGS_BITS) - 1) << KPC_TAGS_OFFSET)
+#define KPC_TAGS(entry)        ((entry & KPC_TAGS_MASK) >> KPC_TAGS_OFFSET)
 
 /*
  * page list and summary
  */
 
-static void show_page_range(unsigned long voffset, unsigned long offset,
-                           unsigned long size, uint64_t flags)
+static void show_page_range(unsigned long voffset,
+                       unsigned long offset, uint64_t flags, uint64_t entry)
 {
        static uint64_t      flags0;
        static unsigned long voff;
        static unsigned long index;
        static unsigned long count;
+       static uint64_t      entry0;
 
        if (flags == flags0 && offset == index + count &&
-           size && voffset == voff + count) {
-               count += size;
+           (!opt_pid || voffset == voff + count) &&
+           (!opt_file || (voffset == voff + count && entry == entry0))) {
+               count++;
                return;
        }
 
        if (count) {
                if (opt_pid)
-                       printf("%lx\t", voff);
-               if (opt_file)
-                       printf("%lu\t", voff);
-               printf("%lx\t%lx\t%s\n",
-                               index, count, page_flag_name(flags0));
+                       printf("%lx\t%lx\t%lx\t%s\n",
+                              voff, index, count, page_flag_name(flags0));
+               else if (opt_file)
+                       printf("%lx\t%lx\t%lx\t%llx\t%s\n",
+                              voff, index, count, KPC_TAGS(entry0), page_flag_name(flags0));
+               else
+                       printf("%lx\t%lx\t%s\n",
+                              index, count, page_flag_name(flags0));
        }
 
        flags0 = flags;
        index  = offset;
        voff   = voffset;
-       count  = size;
-}
-
-static void flush_page_range(void)
-{
-       show_page_range(0, 0, 0, 0);
+       count  = 1;
+       entry0 = entry;
 }
 
 static void show_page(unsigned long voffset,
-                     unsigned long offset, uint64_t flags)
+                     unsigned long offset, uint64_t flags, uint64_t entry)
 {
        if (opt_pid)
-               printf("%lx\t", voffset);
-       if (opt_file)
-               printf("%lu\t", voffset);
-       printf("%lx\t%s\n", offset, page_flag_name(flags));
+               printf("%lx\t%lx\t%s\n",
+                      voffset, offset, page_flag_name(flags));
+       else if (opt_file)
+               printf("%lx\t%lx\t%llx\t%s\n",
+                      voffset, offset, KPC_TAGS(entry), page_flag_name(flags));
+       else
+               printf("%lx\t%s\n", offset, page_flag_name(flags));
 }
 
 static void show_summary(void)
@@ -574,9 +590,9 @@ static void add_page(unsigned long voffset,
                unpoison_page(offset);
 
        if (opt_list == 1)
-               show_page_range(voffset, offset, 1, flags);
+               show_page_range(voffset, offset, flags, pme);
        else if (opt_list == 2)
-               show_page(voffset, offset, flags);
+               show_page(voffset, offset, flags, pme);
 
        nr_pages[hash_slot(flags)]++;
        total_pages++;
@@ -655,6 +671,40 @@ static void walk_task(unsigned long index, unsigned long count)
        }
 }
 
+struct stat kpagecache_stat;
+
+#define KPAGECACHE_BATCH       (64 << 10)      /* 64k pages */
+static void walk_file(unsigned long index, unsigned long count)
+{
+       uint64_t buf[KPAGECACHE_BATCH];
+       unsigned long batch;
+       unsigned long pages;
+       unsigned long pfn;
+       unsigned long i;
+       unsigned long end_index = count;
+       unsigned long size;
+
+       stat(opt_file, &kpagecache_stat);
+       size = kpagecache_stat.st_size;
+       if (size > 0)
+               size = (size - 1) / 4096;
+       end_index = min_t(unsigned long, index + count - 1, size);
+       while (index <= end_index) {
+               batch = min_t(unsigned long, count, PAGEMAP_BATCH);
+               pages = kpagecache_read(buf, index, batch);
+               if (pages == 0)
+                       break;
+               for (i = 0; i < pages; i++) {
+                       pfn = buf[i] & ((1UL << 52) - 1UL);
+                       if (pfn)
+                               walk_pfn(index + i, pfn, 1, buf[i]);
+               }
+
+               index += pages;
+               count -= pages;
+       }
+}
+
 static void add_addr_range(unsigned long offset, unsigned long size)
 {
        if (nr_addr_ranges >= MAX_ADDR_RANGES)
@@ -675,10 +725,12 @@ static void walk_addr_ranges(void)
                add_addr_range(0, ULONG_MAX);
 
        for (i = 0; i < nr_addr_ranges; i++)
-               if (!opt_pid)
-                       walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0);
-               else
+               if (opt_pid)
                        walk_task(opt_offset[i], opt_size[i]);
+               else if (opt_file)
+                       walk_file(opt_offset[i], opt_size[i]);
+               else
+                       walk_pfn(0, opt_offset[i], opt_size[i], 0);
 
        close(kpageflags_fd);
 }
@@ -806,130 +858,21 @@ static void parse_pid(const char *str)
        fclose(file);
 }
 
-static void show_file(const char *name, const struct stat *st)
-{
-       unsigned long long size = st->st_size;
-       char atime[64], mtime[64];
-       long now = time(NULL);
-
-       printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
-                       name, (unsigned)st->st_ino,
-                       size, (size + page_size - 1) / page_size);
-
-       strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
-       strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
-
-       printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
-                       mtime, now - st->st_mtime,
-                       atime, now - st->st_atime);
-}
-
-static void walk_file(const char *name, const struct stat *st)
-{
-       uint8_t vec[PAGEMAP_BATCH];
-       uint64_t buf[PAGEMAP_BATCH], flags;
-       unsigned long nr_pages, pfn, i;
-       int fd;
-       off_t off;
-       ssize_t len;
-       void *ptr;
-       int first = 1;
-
-       fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
-
-       for (off = 0; off < st->st_size; off += len) {
-               nr_pages = (st->st_size - off + page_size - 1) / page_size;
-               if (nr_pages > PAGEMAP_BATCH)
-                       nr_pages = PAGEMAP_BATCH;
-               len = nr_pages * page_size;
-
-               ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
-               if (ptr == MAP_FAILED)
-                       fatal("mmap failed: %s", name);
-
-               /* determine cached pages */
-               if (mincore(ptr, len, vec))
-                       fatal("mincore failed: %s", name);
-
-               /* turn off readahead */
-               if (madvise(ptr, len, MADV_RANDOM))
-                       fatal("madvice failed: %s", name);
-
-               /* populate ptes */
-               for (i = 0; i < nr_pages ; i++) {
-                       if (vec[i] & 1)
-                               (void)*(volatile int *)(ptr + i * page_size);
-               }
-
-               /* turn off harvesting reference bits */
-               if (madvise(ptr, len, MADV_SEQUENTIAL))
-                       fatal("madvice failed: %s", name);
-
-               if (pagemap_read(buf, (unsigned long)ptr / page_size,
-                                       nr_pages) != nr_pages)
-                       fatal("cannot read pagemap");
-
-               munmap(ptr, len);
-
-               for (i = 0; i < nr_pages; i++) {
-                       pfn = pagemap_pfn(buf[i]);
-                       if (!pfn)
-                               continue;
-                       if (!kpageflags_read(&flags, pfn, 1))
-                               continue;
-                       if (first && opt_list) {
-                               first = 0;
-                               flush_page_range();
-                               show_file(name, st);
-                       }
-                       add_page(off / page_size + i, pfn, flags, buf[i]);
-               }
-       }
-
-       close(fd);
-}
-
-int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
-{
-       (void)f;
-       switch (type) {
-       case FTW_F:
-               if (S_ISREG(st->st_mode))
-                       walk_file(name, st);
-               break;
-       case FTW_DNR:
-               fprintf(stderr, "cannot read dir: %s\n", name);
-               break;
-       }
-       return 0;
-}
-
-static void walk_page_cache(void)
+static void parse_file(const char *name)
 {
-       struct stat st;
-
-       kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY);
-       pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
-
-       if (stat(opt_file, &st))
-               fatal("stat failed: %s\n", opt_file);
-
-       if (S_ISREG(st.st_mode)) {
-               walk_file(opt_file, &st);
-       } else if (S_ISDIR(st.st_mode)) {
-               /* do not follow symlinks and mountpoints */
-               if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
-                       fatal("nftw failed: %s\n", opt_file);
-       } else
-               fatal("unhandled file type: %s\n", opt_file);
-
-       close(kpageflags_fd);
-       close(pagemap_fd);
+       int ret;
+       opt_file = (char *)name;
+       kpagecache_fd = checked_open(PROC_KPAGECACHE, O_RDWR);
+       ret = write(kpagecache_fd, name, strlen(name));
+       if (ret != (int)strlen(name))
+               fatal("Failed to set file on %s\n", PROC_KPAGECACHE);
 }
 
-static void parse_file(const char *name)
+static void close_kpagecache(void)
 {
-       opt_file = name;
+       /* Reset in-kernel configuration. */
+       write(kpagecache_fd, NULL, 1);
+       close(kpagecache_fd);
 }
 
 static void parse_addr_range(const char *optarg)
@@ -1118,22 +1061,31 @@ int main(int argc, char *argv[])
                }
        }
 
-       if (opt_list && opt_pid)
-               printf("voffset\t");
-       if (opt_list && opt_file)
-               printf("foffset\t");
-       if (opt_list == 1)
-               printf("offset\tlen\tflags\n");
-       if (opt_list == 2)
-               printf("offset\tflags\n");
+       if (opt_pid && opt_file) {
+               fprintf(stderr,
+               "Option -p and -f are mutually exclusive. Don't set both.\n");
+               exit(1);
+       }
 
-       if (opt_file)
-               walk_page_cache();
-       else
-               walk_addr_ranges();
+       if (opt_pid) {
+               if (opt_list == 1)
+                       printf("voffset\toffset\tlen\tflags\n");
+               if (opt_list == 2)
+                       printf("voffset\toffset\tflags\n");
+       } else if (opt_file) {
+               if (opt_list == 1)
+                       printf("voffset\toffset\tlen\ttag\tflags\n");
+               if (opt_list == 2)
+                       printf("voffset\toffset\ttag\tflags\n");
+       }
+
+       walk_addr_ranges();
 
        if (opt_list == 1)
-               flush_page_range();
+               show_page_range(0, 0, 0, 0);  /* drain the buffer */
+
+       if (opt_file)
+               close_kpagecache();
 
        if (opt_no_summary)
                return 0;
-- 
1.9.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to