Numerous hugepage splits in the linear mapping would give admins a signal to help narrow down sluggishness caused by TLB misses/reloads.
To help with debugging, we introduce monotonic lifetime hugepage split event counts since SYSTEM_RUNNING to be displayed as part of /proc/vmstat in x86 servers. The lifetime split event information will be displayed at the bottom of /proc/vmstat .... swap_ra 0 swap_ra_hit 0 direct_map_2M_splits 139 direct_map_4M_splits 0 direct_map_1G_splits 7 nr_unstable 0 .... Ancillary debugfs split event counts exported to userspace via read-write endpoints: /sys/kernel/debug/x86/direct_map_[2M|4M|1G]_split dmesg log when user resets the debugfs split event count for debugging .... [ 232.470531] debugfs 2M Pages split event count(128) reset to 0 .... One of the many lasting (as we don't coalesce back) sources for huge page splits is tracing as the granular page attribute/permission changes would force the kernel to split code segments mapped to huge pages to smaller ones thereby increasing the probability of TLB miss/reload even after tracing has been stopped. Signed-off-by: Saravanan D <saravan...@fb.com> --- arch/x86/mm/pat/set_memory.c | 117 ++++++++++++++++++++++++++++++++++ include/linux/vm_event_item.h | 8 +++ mm/vmstat.c | 8 +++ 3 files changed, 133 insertions(+) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 16f878c26667..97b6ef8dbd12 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -16,6 +16,8 @@ #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/libnvdimm.h> +#include <linux/vmstat.h> +#include <linux/kernel.h> #include <asm/e820/api.h> #include <asm/processor.h> @@ -76,6 +78,104 @@ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; +static unsigned long split_page_event_count[PG_LEVEL_NUM]; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static int direct_map_2M_split_set(void *data, u64 val) +{ + switch (val) { + case 0: + break; + default: + return -EINVAL; + } + + pr_info("debugfs 2M Pages split event count(%lu) reset to 0",
+ split_page_event_count[PG_LEVEL_2M]); + split_page_event_count[PG_LEVEL_2M] = 0; + + return 0; +} + +static int direct_map_2M_split_get(void *data, u64 *val) +{ + *val = split_page_event_count[PG_LEVEL_2M]; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_2M_split, direct_map_2M_split_get, + direct_map_2M_split_set, "%llu\n"); +#else +static int direct_map_4M_split_set(void *data, u64 val) +{ + switch (val) { + case 0: + break; + default: + return -EINVAL; + } + + pr_info("debugfs 4M Pages split event count(%lu) reset to 0", + split_page_event_count[PG_LEVEL_2M]); + split_page_event_count[PG_LEVEL_2M] = 0; + + return 0; +} + +static int direct_map_4M_split_get(void *data, u64 *val) +{ + *val = split_page_event_count[PG_LEVEL_2M]; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_4M_split, direct_map_4M_split_get, + direct_map_4M_split_set, "%llu\n"); +#endif + +static int direct_map_1G_split_set(void *data, u64 val) +{ + switch (val) { + case 0: + break; + default: + return -EINVAL; + } + + pr_info("debugfs 1G Pages split event count(%lu) reset to 0", + split_page_event_count[PG_LEVEL_1G]); + split_page_event_count[PG_LEVEL_1G] = 0; + + return 0; +} + +static int direct_map_1G_split_get(void *data, u64 *val) +{ + *val = split_page_event_count[PG_LEVEL_1G]; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_1G_split, direct_map_1G_split_get, + direct_map_1G_split_set, "%llu\n"); + +static __init int direct_map_split_debugfs_init(void) +{ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + debugfs_create_file("direct_map_2M_split", 0600, + arch_debugfs_dir, NULL, + &fops_direct_map_2M_split); +#else + debugfs_create_file("direct_map_4M_split", 0600, + arch_debugfs_dir, NULL, + &fops_direct_map_4M_split); +#endif + if (direct_gbpages) + debugfs_create_file("direct_map_1G_split", 0600, + arch_debugfs_dir, NULL, + &fops_direct_map_1G_split); + return 0; +} + 
+late_initcall(direct_map_split_debugfs_init); void update_page_count(int level, unsigned long pages) { @@ -85,12 +185,29 @@ void update_page_count(int level, unsigned long pages) spin_unlock(&pgd_lock); } +void update_split_page_event_count(int level) +{ + if (system_state == SYSTEM_RUNNING) { + split_page_event_count[level]++; + if (level == PG_LEVEL_2M) { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + count_vm_event(DIRECT_MAP_2M_SPLIT); +#else + count_vm_event(DIRECT_MAP_4M_SPLIT); +#endif + } else if (level == PG_LEVEL_1G) { + count_vm_event(DIRECT_MAP_1G_SPLIT); + } + } +} + static void split_page_count(int level) { if (direct_pages_count[level] == 0) return; direct_pages_count[level]--; + update_split_page_event_count(level); direct_pages_count[level - 1] += PTRS_PER_PTE; } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 18e75974d4e3..439742d2435e 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -120,6 +120,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, +#endif +#if defined(__x86_64__) +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + DIRECT_MAP_2M_SPLIT, +#else + DIRECT_MAP_4M_SPLIT, +#endif + DIRECT_MAP_1G_SPLIT, #endif NR_VM_EVENT_ITEMS }; diff --git a/mm/vmstat.c b/mm/vmstat.c index f8942160fc95..beaa2bb4f9dc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1350,6 +1350,14 @@ const char * const vmstat_text[] = { "swap_ra", "swap_ra_hit", #endif +#if defined(__x86_64__) +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + "direct_map_2M_splits", +#else + "direct_map_4M_splits", +#endif + "direct_map_1G_splits", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ -- 2.24.1