Hi, Linux kernel recently got a bugfix 1fde6f21d90f ("proc: fix /proc/net/* after setns(2)"), but unfortunately it only solves the issue for procfs net file inodes so they get correct content after a process change namespace.
Checking on a v5.2-rc6 kernel : sh-4.4# sh netns_procfs_test.sh [ 16.451640] ip (108) used greatest stack depth: 12264 bytes left Before net namespace change : ==== /proc/net/dev ==== Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packd eth0: 0 0 0 0 0 0 0 0 0 0 lo: 0 0 0 0 0 0 0 0 0 0 if_default: 0 0 0 0 0 0 0 0 0 0 sit0: 0 0 0 0 0 0 0 0 0 0 ==== files in /proc/net/dev_snmp6 ==== . .. lo eth0 sit0 if_default After net namespace change : ==== /proc/net/dev ==== Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packd sit0: 0 0 0 0 0 0 0 0 0 0 if_other: 0 0 0 0 0 0 0 0 0 0 lo: 0 0 0 0 0 0 0 0 0 0 ==== files in /proc/net/dev_snmp6 ==== . .. lo eth0 sit0 if_default This kernel is fixed for file inode bug but suffers dir inode bug sh-4.4# As can be seen above, after the namespace change we see new content in procfs net/dev but the listing of procfs net/dev_snmp6 still shows entries from previous namespace. We need to apply similar bugfix for directory creation in procfs net as the mentioned commit do for files. Checking on a v5.2-rc6 kernel with bugfixes : sh-4.4# sh netns_procfs_test.sh [ 745.993882] ip (108) used greatest stack depth: 12264 bytes left Before net namespace change : ==== /proc/net/dev ==== Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packd lo: 0 0 0 0 0 0 0 0 0 0 sit0: 0 0 0 0 0 0 0 0 0 0 eth0: 0 0 0 0 0 0 0 0 0 0 if_default: 0 0 0 0 0 0 0 0 0 0 ==== files in /proc/net/dev_snmp6 ==== . .. lo eth0 sit0 if_default After net namespace change : ==== /proc/net/dev ==== Inter-| Receive | Transmit face |bytes packets errs drop fifo frame compressed multicast|bytes packd if_other: 0 0 0 0 0 0 0 0 0 0 sit0: 0 0 0 0 0 0 0 0 0 0 lo: 0 0 0 0 0 0 0 0 0 0 ==== files in /proc/net/dev_snmp6 ==== . .. lo sit0 if_other This kernel is fixed for both file and dir inode bug sh-4.4# Here we see that the directory procfs net/dev_snmp6 is updated according to the namespace change. The fix is two commits, first updates proc_net_mkdir() entries similar to mentioned patch and second one is changing net/ipv6/proc.c to use proc_net_mkdir() instead. Speaking about proc_net_mkdir()... [phallsma@arn-phallsma-l3 linux]$ git grep proc_mkdir | grep proc_net drivers/isdn/divert/divert_procfs.c: isdn_proc_entry = proc_mkdir("isdn", init_net.proc_net); drivers/isdn/hysdn/hysdn_procconf.c: hysdn_proc_entry = proc_mkdir(PROC_SUBDIR_NAME, init_net.proc_net); drivers/net/bonding/bond_procfs.c: bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net); drivers/net/wireless/intel/ipw2x00/libipw_module.c: libipw_proc = proc_mkdir(DRV_PROCNAME, init_net.proc_net); drivers/net/wireless/intersil/hostap/hostap_main.c: hostap_proc = proc_mkdir("hostap", init_net.proc_net); drivers/staging/rtl8192u/ieee80211/ieee80211_module.c: ieee80211_proc = proc_mkdir(DRV_NAME, init_net.proc_net); drivers/staging/rtl8192u/r8192U_core.c: rtl8192_proc = proc_mkdir(RTL819XU_MODULE_NAME, init_net.proc_net); net/appletalk/atalk_proc.c: if (!proc_mkdir("atalk", init_net.proc_net)) net/core/pktgen.c: pn->proc_dir = proc_mkdir(PG_PROC_DIR, pn->net->proc_net); net/ipv4/netfilter/ipt_CLUSTERIP.c: cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); net/ipv6/proc.c: net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); net/llc/llc_proc.c: llc_proc_dir = proc_mkdir("llc", init_net.proc_net); net/netfilter/xt_hashlimit.c: hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net); net/netfilter/xt_hashlimit.c: hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net); net/netfilter/xt_recent.c: recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net); net/sunrpc/cache.c: cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); net/sunrpc/stats.c: sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net); net/x25/x25_proc.c: if (!proc_mkdir("x25", init_net.proc_net)) [phallsma@arn-phallsma-l3 linux]$ IMHO all code should use proc_net_mkdir() instead of proc_mkdir() for procfs net entries, or am I missing something here? If not possible to changeover to proc_net_mkdir() there is a need for duplicating my first commit at those places. I'm fixing the one for dev_snmp6() which is what I've tested as well. Also wonder if it all is optimal. Wouldn't it be better to re-enable dcache for these (files as well as directories) and in addition have kernel drop dcache in case of a namespace change? Attaching patches and app/script for verifying. I'm not on the mailing lists so please keep me on CC in case of responding. Best regards, Per -- Per Hallsmark per.hallsm...@windriver.com Senior Member Technical Staff Wind River AB Mobile: +46733249340 Office: +46859461127 Torshamnsgatan 27 164 40 Kista
From 66769af1f931124da1aa04b34350b3623a9003e3 Mon Sep 17 00:00:00 2001 From: Per Hallsmark <per.hallsm...@windriver.com> Date: Wed, 19 Jun 2019 15:46:39 +0200 Subject: [PATCH 1/2] Make directory inodes in /proc/net adhere to net namespace This patch fixes /proc/net directory inodes in similar way as commit 1fde6f21d90f ("proc: fix /proc/net/* after setns(2)") fixes file inodes. Signed-off-by: Per Hallsmark <per.hallsm...@windriver.com> --- fs/proc/proc_net.c | 12 ++++++++++++ include/linux/proc_fs.h | 7 ++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 76ae278df1c4..1c4231a60c9e 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -55,6 +55,18 @@ static void pde_force_lookup(struct proc_dir_entry *pde) pde->proc_dops = &proc_net_dentry_ops; } +struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *pde; + + pde = proc_mkdir_data(name, 0, parent, net); + pde->proc_dops = &proc_net_dentry_ops; + + return pde; +} +EXPORT_SYMBOL(proc_net_mkdir); + static int seq_open_net(struct inode *inode, struct file *file) { unsigned int state_size = PDE(inode)->state_size; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 52a283ba0465..608b8a10e338 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -124,11 +124,8 @@ static inline struct pid *tgid_pidfd_to_pid(const struct file *file) struct net; -static inline struct proc_dir_entry *proc_net_mkdir( - struct net *net, const char *name, struct proc_dir_entry *parent) -{ - return proc_mkdir_data(name, 0, parent, net); -} +extern struct proc_dir_entry *proc_net_mkdir( + struct net *net, const char *name, struct proc_dir_entry *parent); struct ns_common; int open_related_ns(struct ns_common *ns, -- 2.20.1
From 6925a5408ffce37dc71871c2ee05db96e60cae0d Mon Sep 17 00:00:00 2001 From: Per Hallsmark <per.hallsm...@windriver.com> Date: Thu, 20 Jun 2019 10:21:31 +0200 Subject: [PATCH 2/2] net: Directories created in /proc/net should be done via proc_net_mkdir() Directories created in /proc/net should be done via proc_net_mkdir() Signed-off-by: Per Hallsmark <per.hallsm...@windriver.com> --- net/ipv6/proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 4a8da679866e..3728c57e93dc 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -282,7 +282,8 @@ static int __net_init ipv6_proc_init_net(struct net *net) snmp6_seq_show, NULL)) goto proc_snmp6_fail; - net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net); + net->mib.proc_net_devsnmp6 = proc_net_mkdir(net, + "dev_snmp6", net->proc_net); if (!net->mib.proc_net_devsnmp6) goto proc_dev_snmp6_fail; return 0; -- 2.20.1
/* Verifier of /proc/net and net namespace consistency Author : Per Hallsmark <per.hallsm...@windriver.com> First setup a net namespace and add a veth so we get something to test with ip netns add netns_other ip link add if_default type veth peer name if_other ip link set if_other netns netns_other Run test app without first reading some info from /proc/net in default namespace ./netns_procfs_test Run test app again, this time reading some info from /proc/net in default namespace before changing net namespace ./netns_procfs_test cached On a bugfree kernel, the output after "After net namespace change" should be same as in first test run, meaning we should see if_other in the /proc/net/dev dump and we should see if_other in the directory. The issue is not visible if it is different processes doing the readings since then the inode's aren't cached. */ #define _GNU_SOURCE #include <sched.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <errno.h> #include <string.h> #include <assert.h> #include <fcntl.h> #include <errno.h> #include <limits.h> #include <sys/types.h> #include <sys/mount.h> #include <errno.h> #include <dirent.h> #define NETNS_RUN_DIR "/var/run/netns" static int get_nsfd(const char *ns) { char path[strlen(NETNS_RUN_DIR) + strlen(ns) + 2]; int fd; snprintf(path, sizeof(path), "%s/%s", NETNS_RUN_DIR, ns); fd = open(path, O_RDONLY, 0); return fd; } static int dump_file(const char *fn, const int checkforbug) { int fd; char buf[512]; ssize_t len; char *wholebuf = NULL; ssize_t wholelen = 0; int status = 0; printf("==== %s ====\n", fn); fd = open(fn, O_RDONLY); if (fd == -1) { printf("Couldn't open %s (%s)\n", fn, strerror(errno)); return -1; } do { len = read(fd, buf, sizeof(buf)-1); buf[len] = 0; wholebuf = realloc(wholebuf, wholelen + len + 1); sprintf(&wholebuf[wholelen], "%s", buf); wholelen += len; } while (len > 0); printf("%s\n", wholebuf); if (checkforbug && strstr(wholebuf, "if_other")) { status = 1; } free(wholebuf); close(fd); return status; } static int dump_dir(const char *dn, const int checkforbug) { DIR *dir; struct dirent *dir_entry; int status = 0; printf("==== files in %s ====\n", dn); dir = opendir(dn); if (!dir) { printf("Couldn't open %s (%s)\n", dn, strerror(errno)); return -1; } while ((dir_entry = readdir(dir)) != NULL) { printf(" %s", dir_entry->d_name); if (checkforbug && !strcmp(dir_entry->d_name, "if_other")) { status = 2; } printf("\n"); } closedir(dir); return status; } static int switch_ns(const char *ns) { int nsfd = get_nsfd(ns); if (setns(nsfd, CLONE_NEWNET) < 0) { fprintf(stderr, "Unable to switch to namespace(fd=%d): %s.\n", nsfd, strerror(errno)); exit(-1); } return 0; } int main (int argc, char **argv) { int bugcheck = 0; int checkforbug = 1; if (argc == 2 && !strcmp(argv[1], "cached")) { // access /proc/net a bit so we dcache file and // directory inodes printf("Before net namespace change :\n"); dump_file("/proc/net/dev", 0); dump_dir("/proc/net/dev_snmp6", 0); } else { // we cannot check for bug if not run with cached arg checkforbug = 0; } // change namespace switch_ns("netns_other"); // access /proc/net again, if all works we should see // new info because of net namespace change printf("\n\nAfter net namespace change :\n"); bugcheck += dump_file("/proc/net/dev", 1); bugcheck += dump_dir("/proc/net/dev_snmp6", 1); if (!checkforbug) return 0; switch (bugcheck) { case 1 : printf("This kernel is fixed for file inode bug but suffers dir inode bug\n"); break; case 3 : printf("This kernel is fixed for both file and dir inode bug\n"); break; default : printf("This kernel suffers both file and dir inode bug\n"); } return bugcheck != 3; }
netns_procfs_test.sh
Description: netns_procfs_test.sh