Re: [PATCH] brd: expose number of allocated pages in debugfs

2021-04-20 Thread Saravanan D
On Fri, Apr 16, 2021 at 02:18:29PM -0700, Saravanan D wrote:
> From: Calvin Owens 
> 
> While the maximum size of each ramdisk is defined either
> as a module parameter, or compile time default, it's impossible
> to know how many pages have currently been allocated by each
> ram%d device, since they're allocated when used and never freed.
> 
> This patch creates a new directory at this location:
> 
> »   /sys/kernel/debug/ramdisk_pages/
> 
> ...which will contain a file named "ram%d" for each instantiated
> ramdisk on the system. The file is read-only, and read() will
> output the number of pages currently held by that ramdisk.
> 

Justification: we lose track of how much memory a ramdisk is using, since
pages once used are simply recycled but never freed.

For instance, when we exhaust the size of the ramdisk with a file that
exceeds it, hit ENOSPC, and delete the file as mitigation: df shows a
decrease in used blocks and an increase in available blocks, but since we
have touched all of the pages, the memory footprint of the ramdisk no
longer reflects the used/available block counts.

...
[root@localhost ~]# mkfs.ext2 /dev/ram15
mke2fs 1.45.6 (20-Mar-2020)
Creating filesystem with 4096 1k blocks and 1024 inodes
[root@localhost ~]# mount /dev/ram15 /mnt/ram15/

[root@localhost ~]# cat /sys/kernel/debug/ramdisk_pages/ram15
58
[root@kerneltest008.06.prn3 ~]# df /dev/ram15
Filesystem 1K-blocks  Used Available Use% Mounted on
/dev/ram15      3963    31      3728   1% /mnt/ram15
[root@kerneltest008.06.prn3 ~]# dd if=/dev/urandom of=/mnt/ram15/test2 bs=1M count=5
dd: error writing '/mnt/ram15/test2': No space left on device
4+0 records in
3+0 records out
4005888 bytes (4.0 MB, 3.8 MiB) copied, 0.0446614 s, 89.7 MB/s
[root@kerneltest008.06.prn3 ~]# df /mnt/ram15/
Filesystem 1K-blocks  Used Available Use% Mounted on
/dev/ram15  3963  3960 0 100% /mnt/ram15
[root@kerneltest008.06.prn3 ~]# cat /sys/kernel/debug/ramdisk_pages/ram15
1024
[root@kerneltest008.06.prn3 ~]# rm /mnt/ram15/test2
rm: remove regular file '/mnt/ram15/test2'? y
[root@kerneltest008.06.prn3 /var]# df /dev/ram15
Filesystem 1K-blocks  Used Available Use% Mounted on
/dev/ram15      3963    31      3728   1% /mnt/ram15

# Actual memory footprint
[root@kerneltest008.06.prn3 /var]# cat /sys/kernel/debug/ramdisk_pages/ram15
1024
...

This debugfs counter will always reveal the accurate number of pages
permanently allocated to the ramdisk.
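
As a usage note, the page count converts directly to a byte footprint:
1024 pages x 4 KiB = 4 MiB, i.e. the entire capacity of the 4096 x 1 KiB
ramdisk above stays resident even after the file is removed. A minimal
userspace sketch of that conversion (hypothetical helper; it only assumes
the debugfs path from the transcript and the runtime page size):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Path taken from the transcript above; adjust the ram%d index. */
	FILE *f = fopen("/sys/kernel/debug/ramdisk_pages/ram15", "r");
	unsigned long long pages;

	if (!f || fscanf(f, "%llu", &pages) != 1) {
		perror("ramdisk_pages");
		return 1;
	}
	fclose(f);
	/* Footprint = page count * runtime page size (4096 on x86). */
	printf("%llu pages, %llu bytes resident\n",
	       pages, pages * (unsigned long long)sysconf(_SC_PAGESIZE));
	return 0;
}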


[PATCH] brd: expose number of allocated pages in debugfs

2021-04-16 Thread Saravanan D
From: Calvin Owens 

While the maximum size of each ramdisk is defined either
as a module parameter, or compile time default, it's impossible
to know how many pages have currently been allocated by each
ram%d device, since they're allocated when used and never freed.

This patch creates a new directory at this location:

    /sys/kernel/debug/ramdisk_pages/

...which will contain a file named "ram%d" for each instantiated
ramdisk on the system. The file is read-only, and read() will
output the number of pages currently held by that ramdisk.

Signed-off-by: Calvin Owens 
[cleaned up the !CONFIG_DEBUG_FS case and API changes for HEAD]
Signed-off-by: Kyle McMartin 
[rebased]
Signed-off-by: Saravanan D 
---
 drivers/block/brd.c | 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 18bf99906662..6e622c1327ee 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <linux/debugfs.h>
 
 #include 
 
@@ -48,6 +49,7 @@ struct brd_device {
 */
spinlock_t  brd_lock;
struct radix_tree_root  brd_pages;
+   u64 brd_nr_pages;
 };
 
 /*
@@ -116,6 +118,8 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
page = radix_tree_lookup(&brd->brd_pages, idx);
BUG_ON(!page);
BUG_ON(page->index != idx);
+   } else {
+   brd->brd_nr_pages++;
}
spin_unlock(&brd->brd_lock);
 
@@ -365,11 +369,13 @@ __setup("ramdisk_size=", ramdisk_size);
  */
 static LIST_HEAD(brd_devices);
 static DEFINE_MUTEX(brd_devices_mutex);
+static struct dentry *brd_debugfs_dir;
 
 static struct brd_device *brd_alloc(int i)
 {
struct brd_device *brd;
struct gendisk *disk;
+   char buf[DISK_NAME_LEN];
 
brd = kzalloc(sizeof(*brd), GFP_KERNEL);
if (!brd)
@@ -382,6 +388,11 @@ static struct brd_device *brd_alloc(int i)
if (!brd->brd_queue)
goto out_free_dev;
 
+   snprintf(buf, DISK_NAME_LEN, "ram%d", i);
+   if (!IS_ERR_OR_NULL(brd_debugfs_dir))
+   debugfs_create_u64(buf, 0444, brd_debugfs_dir,
+   &brd->brd_nr_pages);
+
/* This is so fdisk will align partitions on 4k, because of
 * direct_access API needing 4k alignment, returning a PFN
 * (This is only a problem on very small devices <= 4M,
@@ -397,7 +408,7 @@ static struct brd_device *brd_alloc(int i)
disk->fops  = &brd_fops;
disk->private_data  = brd;
disk->flags = GENHD_FL_EXT_DEVT;
-   sprintf(disk->disk_name, "ram%d", i);
+   strlcpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2);
 
/* Tell the block layer that this is not a rotational device */
@@ -495,6 +506,8 @@ static int __init brd_init(void)
 
brd_check_and_reset_par();
 
+   brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
+
mutex_lock(&brd_devices_mutex);
for (i = 0; i < rd_nr; i++) {
brd = brd_alloc(i);
@@ -519,6 +532,8 @@ static int __init brd_init(void)
return 0;
 
 out_free:
+   debugfs_remove_recursive(brd_debugfs_dir);
+
list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
list_del(&brd->brd_list);
brd_free(brd);
@@ -534,6 +549,8 @@ static void __exit brd_exit(void)
 {
struct brd_device *brd, *next;
 
+   debugfs_remove_recursive(brd_debugfs_dir);
+
list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
brd_del_one(brd);
 
-- 
2.30.2



[PATCH] blk-mq: Fix spurious debugfs directory creation during initialization

2021-04-07 Thread Saravanan D
blk_mq_debugfs_register_sched_hctx(), called from the
device_add_disk()->elevator_init_mq()->blk_mq_init_sched()
initialization sequence, does not yet have the relevant parent directory
set up and thus spuriously attempts to create a "sched" directory at the
root mount of debugfs for every hw queue detected on the block
device.

dmesg
...
debugfs: Directory 'sched' with parent '/' already present!
debugfs: Directory 'sched' with parent '/' already present!
.
.
debugfs: Directory 'sched' with parent '/' already present!
...

The parent debugfs directory for hw queues gets properly set up via
device_add_disk()->blk_register_queue()->blk_mq_debugfs_register()
->blk_mq_debugfs_register_hctx() later in the block device
initialization sequence.

A simple check for debugfs_dir has been added to thwart premature
debugfs directory/file creation attempts.
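
For illustration, the root cause is that debugfs_create_dir() with a NULL
parent places the new directory at the debugfs root, so every hw queue
after the first collides on the same "/sched" name. A minimal sketch of
the broken and guarded variants (illustrative only; not the actual blk-mq
code):

#include <linux/debugfs.h>

/* Broken: with a NULL parent, "sched" lands at the debugfs root, and
 * each subsequent hw queue triggers the "already present" warning. */
static void sched_dir_broken(struct dentry *parent)
{
	debugfs_create_dir("sched", parent);	/* parent may still be NULL */
}

/* Guarded: bail out quietly; registration is retried later from
 * blk_register_queue() once the per-hctx parent directory exists. */
static void sched_dir_guarded(struct dentry *parent)
{
	if (!parent)
		return;
	debugfs_create_dir("sched", parent);
}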

Signed-off-by: Saravanan D 
---
 block/blk-mq-debugfs.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 271f6596435b..2a75bc7401df 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -972,6 +972,14 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 {
struct elevator_type *e = q->elevator->type;
 
+   /*
+* If the parent debugfs directory has not been created yet, return;
+* We will be called again later on with appropriate parent debugfs
+* directory from blk_register_queue()
+*/
+   if (!hctx->debugfs_dir)
+   return;
+
if (!e->hctx_debugfs_attrs)
return;
 
-- 
2.30.2



[PATCH V6] x86/mm: Tracking linear mapping split events

2021-02-18 Thread Saravanan D
To help debug the sluggishness caused by TLB misses/reloads, we
introduce monotonic hugepage [direct mapped] split event counts since
system state SYSTEM_RUNNING, displayed as part of /proc/vmstat on x86
servers.

The lifetime split event information will be displayed at the bottom of
/proc/vmstat

swap_ra 0
swap_ra_hit 0
direct_map_level2_splits 94
direct_map_level3_splits 4
nr_unstable 0


One of the many lasting sources of direct hugepage splits is kernel
tracing (kprobes, tracepoints).

Note that the kernel's code segment [512 MB] points to the same
physical addresses that have been already mapped in the kernel's
direct mapping range.

Source : Documentation/x86/x86_64/mm.rst

When we enable kernel tracing, the kernel has to modify the
attributes/permissions of the direct mapped text segment hugepages,
causing them to split.

Kernel's direct mapped hugepages do not coalesce back after a split and
remain in place for the remainder of the system's lifetime.

An instance of direct page splits when we turn on
dynamic kernel tracing

cat /proc/vmstat | grep -i direct_map_level
direct_map_level2_splits 784
direct_map_level3_splits 12
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @ [pid, comm] = count(); }'
cat /proc/vmstat | grep -i direct_map_level
direct_map_level2_splits 789
direct_map_level3_splits 12


Signed-off-by: Saravanan D 
Acked-by: Tejun Heo 
Acked-by: Johannes Weiner 
Acked-by: Dave Hansen 
---
This patch has been acked and can be routed through either x86 or -mm.
Please let me know if anything else is needed. Thanks.
---
 arch/x86/mm/pat/set_memory.c  | 8 
 include/linux/vm_event_item.h | 4 
 mm/vmstat.c   | 4 
 3 files changed, 16 insertions(+)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..a7b3c5f1d316 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -91,6 +93,12 @@ static void split_page_count(int level)
return;
 
direct_pages_count[level]--;
+   if (system_state == SYSTEM_RUNNING) {
+   if (level == PG_LEVEL_2M)
+   count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
+   else if (level == PG_LEVEL_1G)
+   count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
+   }
direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 18e75974d4e3..7c06c2bdc33b 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -120,6 +120,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
+#endif
+#ifdef CONFIG_X86
+   DIRECT_MAP_LEVEL2_SPLIT,
+   DIRECT_MAP_LEVEL3_SPLIT,
 #endif
NR_VM_EVENT_ITEMS
 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..a43ac4ac98a2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1350,6 +1350,10 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
 #endif
+#ifdef CONFIG_X86
+   "direct_map_level2_splits",
+   "direct_map_level3_splits",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
-- 
2.24.1



Re: [PATCH V6] x86/mm: Tracking linear mapping split events

2021-02-08 Thread Saravanan D
Hi all,

So far I have received two acks for the V6 version of my patch:

> Acked-by: Tejun Heo 
> Acked-by: Johannes Weiner 

Are there any more objections?

Thanks,
Saravanan D


Re: [PATCH V5] x86/mm: Tracking linear mapping split events

2021-01-28 Thread Saravanan D
Hi Dave,
> 
> Eek.  There really doesn't appear to be a place in Documentation/ that
> we've documented vmstat entries.
> 
> Maybe you can start:
> 
>   Documentation/admin-guide/mm/vmstat.rst
> 
I was also very surprised that no documentation exists for vmstat;
that led me to add a page to the admin-guide, which now requires a lot
of caveats.

Starting new documentation for vmstat goes beyond the scope of this patch.
I am inclined to drop the Documentation change from the next version [V6] of the patch.

I presume that a detailed commit log [V6] explaining why direct mapped kernel
page splits will never coalesce, how kernel tracing causes some of those
splits, and why it is worth tracking them can do the job.

Proposed [V6] Commit Log:
>>>
To help debug the sluggishness caused by TLB misses/reloads, we
introduce monotonic hugepage [direct mapped] split event counts since
system state SYSTEM_RUNNING, displayed as part of /proc/vmstat on x86
servers.

The lifetime split event information will be displayed at the bottom of
/proc/vmstat

swap_ra 0
swap_ra_hit 0
direct_map_level2_splits 94
direct_map_level3_splits 4
nr_unstable 0


One of the many lasting sources of direct hugepage splits is kernel
tracing (kprobes, tracepoints).

Note that the kernel's code segment [512 MB] points to the same 
physical addresses that have been already mapped in the kernel's 
direct mapping range.

Source : Documentation/x86/x86_64/mm.rst

When we enable kernel tracing, the kernel has to modify the attributes/permissions
of the direct mapped text segment hugepages, causing them to split.

Kernel's direct mapped hugepages do not coalesce back after a split and
remain in place for the remainder of the system's lifetime.

An instance of direct page splits when we turn on
dynamic kernel tracing

cat /proc/vmstat | grep -i direct_map_level
direct_map_level2_splits 784
direct_map_level3_splits 12
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @ [pid, comm] = count(); }'
cat /proc/vmstat | grep -i direct_map_level
direct_map_level2_splits 789
direct_map_level3_splits 12

<<<

Thanks,
Saravanan D


Re: [PATCH V5] x86/mm: Tracking linear mapping split events

2021-01-28 Thread Saravanan D
Hi Matthew,

> Is this tracing of userspace programs causing splits, or is it kernel
> tracing?  Also, we have lots of kinds of tracing these days; are you
> referring to kprobes?  tracepoints?  ftrace?  Something else?

It has to be kernel tracing (kprobes, tracepoints) as we are dealing with 
direct mapping splits.

Kernel's direct mapping
``ffff888000000000 | -119.5 TB | ffffc87fffffffff |   64 TB | direct
mapping of all physical memory (page_offset_base)``

The kernel text range
``ffffffff80000000 |    -2 GB | ffffffff9fffffff |  512 MB | kernel
text mapping, mapped to physical address 0``

Source: Documentation/x86/x86_64/mm.rst

The kernel code segment points to the same physical addresses already
mapped in the direct mapping range (0x20000000 = 512 MB).
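
To make the aliasing concrete, here is a small userspace sketch of the
address arithmetic (illustrative only; the constants are the x86_64
defaults quoted above, not values read from a running kernel):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* x86_64 layout from Documentation/x86/x86_64/mm.rst */
	const uint64_t text_base   = 0xffffffff80000000ULL; /* kernel text, maps phys 0 */
	const uint64_t direct_base = 0xffff888000000000ULL; /* default page_offset_base */
	uint64_t text_va = text_base + 0x100000;  /* some address in kernel text */

	/* Text mapping: phys = va - text_base (text maps physical address 0). */
	uint64_t phys = text_va - text_base;
	/* Direct mapping: the same physical page has a second virtual alias. */
	uint64_t alias = direct_base + phys;

	printf("phys %#llx is reachable at %#llx (text) and %#llx (direct map)\n",
	       (unsigned long long)phys,
	       (unsigned long long)text_va,
	       (unsigned long long)alias);
	return 0;
}

Changing permissions on a text page therefore forces a split of the
hugepage backing the direct-map side, which is what the
direct_map_level*_splits counters record.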

When we enable kernel tracing, we have to modify the attributes/permissions
of the direct mapped text segment pages, causing them to split.

When we track direct_pages_count[] in arch/x86/mm/pat/set_memory.c,
we see only splits from higher levels; they never coalesce back.

Splits when we turn on dynamic tracing

cat /proc/vmstat | grep -i direct_map_level
direct_map_level2_splits 784
direct_map_level3_splits 12
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @ [pid, comm] = count(); }'
cat /proc/vmstat | grep -i direct_map_level
direct_map_level2_splits 789
direct_map_level3_splits 12
....

Thanks,
Saravanan D


[PATCH V4] x86/mm: Tracking linear mapping split events

2021-01-27 Thread Saravanan D
To help debug the sluggishness caused by TLB misses/reloads, we
introduce monotonic lifetime hugepage split event counts since
system state SYSTEM_RUNNING, displayed as part of /proc/vmstat on x86
servers.

The lifetime split event information will be displayed at the bottom of
/proc/vmstat

swap_ra 0
swap_ra_hit 0
direct_map_level2_splits 94
direct_map_level3_splits 4
nr_unstable 0


One of the many lasting (as we don't coalesce back) sources of hugepage
splits is tracing, as the granular page attribute/permission changes
force the kernel to split code segments mapped to hugepages into smaller
ones, thereby increasing the probability of TLB misses/reloads even after
tracing has been stopped.

Documentation regarding linear mapping split events added to admin-guide
as requested in V3 of the patch.

Signed-off-by: Saravanan D 
---
 .../admin-guide/mm/direct_mapping_splits.rst  | 59 +++
 Documentation/admin-guide/mm/index.rst|  1 +
 arch/x86/mm/pat/set_memory.c  | 13 
 include/linux/vm_event_item.h |  4 ++
 mm/vmstat.c   |  4 ++
 5 files changed, 81 insertions(+)
 create mode 100644 Documentation/admin-guide/mm/direct_mapping_splits.rst

diff --git a/Documentation/admin-guide/mm/direct_mapping_splits.rst b/Documentation/admin-guide/mm/direct_mapping_splits.rst
new file mode 100644
index ..298751391deb
--- /dev/null
+++ b/Documentation/admin-guide/mm/direct_mapping_splits.rst
@@ -0,0 +1,59 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Direct Mapping Splits
+=====================
+
+The kernel maps all of physical memory in linear/direct mapped pages;
+translation of a virtual kernel address to its physical address is
+achieved through a simple offset subtraction. CPUs maintain a cache of
+these translations in fast caches called TLBs. CPU architectures like
+x86 allow direct mapping large portions of memory into hugepages
+(2M, 1G, etc) at various page table levels.
+
+Maintaining huge direct mapped pages greatly reduces TLB miss pressure.
+The splintering of huge direct pages into smaller ones does result in
+a measurable performance hit caused by frequent TLB misses and reloads.
+
+One of the many lasting (as we don't coalesce back) sources of hugepage
+splits is tracing, as the granular page attribute/permission changes
+force the kernel to split code segments mapped to hugepages into
+smaller ones, thus increasing the probability of TLB misses/reloads
+even after tracing has been stopped.
+
+On x86 systems, we can track the splitting of huge direct mapped pages
+through lifetime event counters in ``/proc/vmstat``
+
+   direct_map_level2_splits xxx
+   direct_map_level3_splits yyy
+
+where:
+
+direct_map_level2_splits
+   are 2M/4M hugepage split events
+direct_map_level3_splits
+   are 1G hugepage split events
+
+The distribution of direct mapped system memory in various page sizes
+post splits can be viewed through ``/proc/meminfo`` whose output
+will include the following lines depending upon supporting CPU
+architecture
+
+   DirectMap4k:x kB
+   DirectMap2M:y kB
+   DirectMap1G:z kB
+
+where:
+
+DirectMap4k
+   is the total amount of direct mapped memory (in kB)
+   accessed through 4k pages
+DirectMap2M
+   is the total amount of direct mapped memory (in kB)
+   accessed through 2M pages
+DirectMap1G
+   is the total amount of direct mapped memory (in kB)
+   accessed through 1G pages
+
+
+-- Saravanan D, Jan 27, 2021
diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
index 4b14d8b50e9e..9439780f3f07 100644
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -38,3 +38,4 @@ the Linux memory management.
soft-dirty
transhuge
userfaultfd
+   direct_mapping_splits
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..767cade53bdc 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -85,12 +87,23 @@ void update_page_count(int level, unsigned long pages)
spin_unlock(&pgd_lock);
 }
 
+void update_split_page_event_count(int level)
+{
+   if (system_state == SYSTEM_RUNNING) {
+   if (level == PG_LEVEL_2M)
+   count_vm_event(DIRECT_MAP_LEVEL2_SPLIT);
+   else if (level == PG_LEVEL_1G)
+   count_vm_event(DIRECT_MAP_LEVEL3_SPLIT);
+   }
+}
+
 static void split_page_count(int level)
 {
if (direct_pages_count[level] == 0)
return;
 
direct_pages_count[level]--;
+   update_split_page_event_count(level);
direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
diff --git a/include/linux/vm_event_item.h b

Re: [PATCH V3] x86/mm: Tracking linear mapping split events

2021-01-27 Thread Saravanan D
Hi Dave,

> We don't use __x86_64__ in the kernel.  This should be CONFIG_X86.
Noted. I will correct this in V4

> or the level from the bottom where the split occurred:
> 
>   direct_map_level2_splits
>   direct_map_level3_splits
> 
> That has the bonus of being usable on other architectures.
Naming them after page table levels makes a lot of sense: two new vmstat
event counters that are relevant for all architectures, without the need
for #ifdef page size craziness.

- Saravanan D


Re: [PATCH V3] x86/mm: Tracking linear mapping split events

2021-01-27 Thread Saravanan D
Hi Randy,
> Documentation/ update, please.
I will include it in the V4 patch.

- Saravanan D


[PATCH V3] x86/mm: Tracking linear mapping split events

2021-01-27 Thread Saravanan D
To help debug the sluggishness caused by TLB misses/reloads, we
introduce monotonic lifetime hugepage split event counts since
system state SYSTEM_RUNNING, displayed as part of /proc/vmstat on x86
servers.

The lifetime split event information will be displayed at the bottom of
/proc/vmstat

swap_ra 0
swap_ra_hit 0
direct_map_2M_splits 167
direct_map_1G_splits 6
nr_unstable 0


One of the many lasting (as we don't coalesce back) sources of hugepage
splits is tracing, as the granular page attribute/permission changes
force the kernel to split code segments mapped to hugepages into smaller
ones, thereby increasing the probability of TLB misses/reloads even after
tracing has been stopped.

Signed-off-by: Saravanan D 
---
 arch/x86/mm/pat/set_memory.c  | 18 ++
 include/linux/vm_event_item.h |  8 
 mm/vmstat.c   |  8 
 3 files changed, 34 insertions(+)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..3ea6316df089 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -85,12 +87,28 @@ void update_page_count(int level, unsigned long pages)
spin_unlock(&pgd_lock);
 }
 
+void update_split_page_event_count(int level)
+{
+   if (system_state == SYSTEM_RUNNING) {
+   if (level == PG_LEVEL_2M) {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+   count_vm_event(DIRECT_MAP_2M_SPLIT);
+#else
+   count_vm_event(DIRECT_MAP_4M_SPLIT);
+#endif
+   } else if (level == PG_LEVEL_1G) {
+   count_vm_event(DIRECT_MAP_1G_SPLIT);
+   }
+   }
+}
+
 static void split_page_count(int level)
 {
if (direct_pages_count[level] == 0)
return;
 
direct_pages_count[level]--;
+   update_split_page_event_count(level);
direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 18e75974d4e3..439742d2435e 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -120,6 +120,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_SWAP
SWAP_RA,
SWAP_RA_HIT,
+#endif
+#if defined(__x86_64__)
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+   DIRECT_MAP_2M_SPLIT,
+#else
+   DIRECT_MAP_4M_SPLIT,
+#endif
+   DIRECT_MAP_1G_SPLIT,
 #endif
NR_VM_EVENT_ITEMS
 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..beaa2bb4f9dc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1350,6 +1350,14 @@ const char * const vmstat_text[] = {
"swap_ra",
"swap_ra_hit",
 #endif
+#if defined(__x86_64__)
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+   "direct_map_2M_splits",
+#else
+   "direct_map_4M_splits",
+#endif
+   "direct_map_1G_splits",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
-- 
2.24.1



Re: [PATCH V2] x86/mm: Tracking linear mapping split events

2021-01-27 Thread Saravanan D
Hi Tejun,

> Saravanan, can you please drop the debugfs portion and repost?
Sure.

Saravanan D


[PATCH V2] x86/mm: Tracking linear mapping split events

2021-01-27 Thread Saravanan D
Numerous hugepage splits in the linear mapping give admins a signal to
help narrow down the sluggishness caused by TLB misses/reloads.

To help with debugging, we introduce monotonic lifetime hugepage
split event counts since SYSTEM_RUNNING, displayed as part of
/proc/vmstat on x86 servers.

The lifetime split event information will be displayed at the bottom of
/proc/vmstat

swap_ra 0
swap_ra_hit 0
direct_map_2M_splits 139
direct_map_4M_splits 0
direct_map_1G_splits 7
nr_unstable 0


Ancillary debugfs split event counts are exported to userspace via read-write
endpoints: /sys/kernel/debug/x86/direct_map_[2M|4M|1G]_split

dmesg log when the user resets a debugfs split event count for
debugging:

[  232.470531] debugfs 2M Pages split event count(128) reset to 0


One of the many lasting (as we don't coalesce back) sources of hugepage
splits is tracing, as the granular page attribute/permission changes
force the kernel to split code segments mapped to hugepages into smaller
ones, thereby increasing the probability of TLB misses/reloads even after
tracing has been stopped.

Signed-off-by: Saravanan D 
---
 arch/x86/mm/pat/set_memory.c  | 117 ++
 include/linux/vm_event_item.h |   8 +++
 mm/vmstat.c   |   8 +++
 3 files changed, 133 insertions(+)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..97b6ef8dbd12 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -76,6 +78,104 @@ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
+static unsigned long split_page_event_count[PG_LEVEL_NUM];
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int direct_map_2M_split_set(void *data, u64 val)
+{
+   switch (val) {
+   case 0:
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   pr_info("debugfs 2M Pages split event count(%lu) reset to 0",
+ split_page_event_count[PG_LEVEL_2M]);
+   split_page_event_count[PG_LEVEL_2M] = 0;
+
+   return 0;
+}
+
+static int direct_map_2M_split_get(void *data, u64 *val)
+{
+   *val = split_page_event_count[PG_LEVEL_2M];
+   return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_2M_split, direct_map_2M_split_get,
+direct_map_2M_split_set, "%llu\n");
+#else
+static int direct_map_4M_split_set(void *data, u64 val)
+{
+   switch (val) {
+   case 0:
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   pr_info("debugfs 4M Pages split event count(%lu) reset to 0",
+ split_page_event_count[PG_LEVEL_2M]);
+   split_page_event_count[PG_LEVEL_2M] = 0;
+
+   return 0;
+}
+
+static int direct_map_4M_split_get(void *data, u64 *val)
+{
+   *val = split_page_event_count[PG_LEVEL_2M];
+   return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_4M_split, direct_map_4M_split_get,
+direct_map_4M_split_set, "%llu\n");
+#endif
+
+static int direct_map_1G_split_set(void *data, u64 val)
+{
+   switch (val) {
+   case 0:
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   pr_info("debugfs 1G Pages split event count(%lu) reset to 0",
+ split_page_event_count[PG_LEVEL_1G]);
+   split_page_event_count[PG_LEVEL_1G] = 0;
+
+   return 0;
+}
+
+static int direct_map_1G_split_get(void *data, u64 *val)
+{
+   *val = split_page_event_count[PG_LEVEL_1G];
+   return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_1G_split, direct_map_1G_split_get,
+direct_map_1G_split_set, "%llu\n");
+
+static __init int direct_map_split_debugfs_init(void)
+{
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+   debugfs_create_file("direct_map_2M_split", 0600,
+   arch_debugfs_dir, NULL,
+   &fops_direct_map_2M_split);
+#else
+   debugfs_create_file("direct_map_4M_split", 0600,
+   arch_debugfs_dir, NULL,
+   &fops_direct_map_4M_split);
+#endif
+   if (direct_gbpages)
+   debugfs_create_file("direct_map_1G_split", 0600,
+   arch_debugfs_dir, NULL,
+   &fops_direct_map_1G_split);
+   return 0;
+}
+
+late_initcall(direct_map_split_debugfs_init);
 
 void update_page_count(int level, unsigned long pages)
 {
@@ -85,12 +185,29 @@ void update_page_count(int level, unsigned long pages)
spin_unlock(&pgd_lock);
 }
 
+void update_split_page_event_count(int level)
+{
+   if (system_state == SYST