Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper
On 02/07/2017 01:02 AM, Alexei Starovoitov wrote: On 2/6/17 3:39 PM, Daniel Borkmann wrote: On 02/04/2017 04:34 AM, Alexei Starovoitov wrote: [...] +BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb) +{ +struct net_device *dev = skb->dev; + +if (!dev) +return 0; +return proc_get_ns_devid_inum(_net(dev)->ns); +} + +static const struct bpf_func_proto bpf_skb_netns_id_proto = { +.func= bpf_skb_netns_id, +.gpl_only= false, +.ret_type= RET_INTEGER, +.arg1_type= ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); +case BPF_FUNC_sk_netns_id: +return _skb_netns_id_proto; default: return NULL; } Btw, I think here's an oversight that would still need to be fixed. Above would mean that trace printk from unprivileged would fall through and use _skb_netns_id_proto as proto now instead of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the BPF_FUNC_trace_printk case, not in its fall-through path. Looks like Chenbo in his get_socket_cookie missed this, too. Other than that BPF bits seem good to me. Ahh, right. Good catch. I'll add 'else return NULL;' otherwise somebody might step on it again. Thanks Daniel! I guess an explicit comment "/* fall-through */" would also be fine and get noticed. Thanks! Eric, still waiting for your review of nsfs.c bits.
Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper
On 2/6/17 3:39 PM, Daniel Borkmann wrote: On 02/04/2017 04:34 AM, Alexei Starovoitov wrote: [...] +BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb) +{ +struct net_device *dev = skb->dev; + +if (!dev) +return 0; +return proc_get_ns_devid_inum(_net(dev)->ns); +} + +static const struct bpf_func_proto bpf_skb_netns_id_proto = { +.func= bpf_skb_netns_id, +.gpl_only= false, +.ret_type= RET_INTEGER, +.arg1_type= ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); +case BPF_FUNC_sk_netns_id: +return _skb_netns_id_proto; default: return NULL; } Btw, I think here's an oversight that would still need to be fixed. Above would mean that trace printk from unprivileged would fall through and use _skb_netns_id_proto as proto now instead of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the BPF_FUNC_trace_printk case, not in its fall-through path. Looks like Chenbo in his get_socket_cookie missed this, too. Other than that BPF bits seem good to me. Ahh, right. Good catch. I'll add 'else return NULL;' otherwise somebody might step on it again. Thanks Daniel! Eric, still waiting for your review of nsfs.c bits.
Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper
On 02/04/2017 04:34 AM, Alexei Starovoitov wrote: [...] +BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb) +{ + struct net_device *dev = skb->dev; + + if (!dev) + return 0; + return proc_get_ns_devid_inum(_net(dev)->ns); +} + +static const struct bpf_func_proto bpf_skb_netns_id_proto = { + .func = bpf_skb_netns_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) case BPF_FUNC_trace_printk: if (capable(CAP_SYS_ADMIN)) return bpf_get_trace_printk_proto(); + case BPF_FUNC_sk_netns_id: + return _skb_netns_id_proto; default: return NULL; } Btw, I think here's an oversight that would still need to be fixed. Above would mean that trace printk from unprivileged would fall through and use _skb_netns_id_proto as proto now instead of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the BPF_FUNC_trace_printk case, not in its fall-through path. Looks like Chenbo in his get_socket_cookie missed this, too. Other than that BPF bits seem good to me. @@ -2700,6 +2731,17 @@ xdp_func_proto(enum bpf_func_id func_id) }
Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper
On Sun, Feb 5, 2017 at 11:24 AM, David Ahernwrote: > On 2/3/17 8:34 PM, Alexei Starovoitov wrote: >> Therefore introduce 'u64 bpf_sk_netns_id(sk);' helper. It returns >> unique value that identifies netns of given socket or dev_net(skb->dev) >> The upper 32-bits of the return value contain device id where namespace >> filesystem resides and lower 32-bits contain inode number within that >> filesystem. >> It's the same as >> struct stat st; >> stat("/proc/pid/ns/net", ); >> return (st->st_dev << 32) | st->st_ino; >> > ... > >> can be considered a new feature, whereas for cgroup_sock >> programs that modify sk->bound_dev_if (like 'ip vrf' does) >> it's a bug fix, since 'ip vrf' needs to be netns aware. >> > > > LGTM. > > Reviewed-by: David Ahern > Tested-by: David Ahern > > Updated patches for vrf in network namespaces are here: > https://github.com/dsahern/iproute2 vrf/ip-vrf > > > root@kenny-jessie2:~# ip vrf exec red bash > > root@kenny-jessie2:red:~# ping -c1 -w1 10.100.1.254 > PING 10.100.1.254 (10.100.1.254) 56(84) bytes of data. > 64 bytes from 10.100.1.254: icmp_seq=1 ttl=64 time=0.230 ms > > --- 10.100.1.254 ping statistics --- > 1 packets transmitted, 1 received, 0% packet loss, time 0ms > rtt min/avg/max/mdev = 0.230/0.230/0.230/0.000 ms > > root@kenny-jessie2:red:~# unshare -n > > root@kenny-jessie2:red:~# ping -c1 -w1 10.100.1.254 > connect: Network is unreachable > > > Andy: thank you for the feedback on the 'ip vrf' use case. I believe this > kernel patch + my iproute2 patches address the issues mentioned so far. > Specifically, the transcript above shows your concern about 'unshare -n' case > is handled. In one of the many responses last night, you mentioned I have 'a > somewhat kludgey fix that gets the "ip netns" case'. If you can elaborate on > 'somewhat kludgey', I can fix those this week as well. What I meant was: fixing it in iproute2 without kernel changes would be kludgey and incomplete. You seem to have fixed it by depending on this kernel patch. --Andy
Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper
On 2/3/17 8:34 PM, Alexei Starovoitov wrote: > Therefore introduce 'u64 bpf_sk_netns_id(sk);' helper. It returns > unique value that identifies netns of given socket or dev_net(skb->dev) > The upper 32-bits of the return value contain device id where namespace > filesystem resides and lower 32-bits contain inode number within that > filesystem. > It's the same as > struct stat st; > stat("/proc/pid/ns/net", ); > return (st->st_dev << 32) | st->st_ino; > ... > can be considered a new feature, whereas for cgroup_sock > programs that modify sk->bound_dev_if (like 'ip vrf' does) > it's a bug fix, since 'ip vrf' needs to be netns aware. > LGTM. Reviewed-by: David AhernTested-by: David Ahern Updated patches for vrf in network namespaces are here: https://github.com/dsahern/iproute2 vrf/ip-vrf root@kenny-jessie2:~# ip vrf exec red bash root@kenny-jessie2:red:~# ping -c1 -w1 10.100.1.254 PING 10.100.1.254 (10.100.1.254) 56(84) bytes of data. 64 bytes from 10.100.1.254: icmp_seq=1 ttl=64 time=0.230 ms --- 10.100.1.254 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 0.230/0.230/0.230/0.000 ms root@kenny-jessie2:red:~# unshare -n root@kenny-jessie2:red:~# ping -c1 -w1 10.100.1.254 connect: Network is unreachable Andy: thank you for the feedback on the 'ip vrf' use case. I believe this kernel patch + my iproute2 patches address the issues mentioned so far. Specifically, the transcript above shows your concern about 'unshare -n' case is handled. In one of the many responses last night, you mentioned I have 'a somewhat kludgey fix that gets the "ip netns" case'. If you can elaborate on 'somewhat kludgey', I can fix those this week as well.
[PATCH v3 net] bpf: add bpf_sk_netns_id() helper
in cases where bpf programs are looking at sockets and packets that belong to different netns, it could be useful to get an id that uniquely identify a netns within the whole system. Therefore introduce 'u64 bpf_sk_netns_id(sk);' helper. It returns unique value that identifies netns of given socket or dev_net(skb->dev) The upper 32-bits of the return value contain device id where namespace filesystem resides and lower 32-bits contain inode number within that filesystem. It's the same as struct stat st; stat("/proc/pid/ns/net", ); return (st->st_dev << 32) | st->st_ino; For example to disallow raw sockets in all non-init netns the bpf_type_cgroup_sock program can do: if (sk->type == SOCK_RAW && bpf_sk_netns_id(sk) != 0x3f075) return 0; where 0x3f075 comes from combination of st_dev and st_ino of /proc/pid/ns/net Note that all bpf programs types are global. The same socket filter program can be attached to sockets in different netns, just like cls_bpf can see ingress/egress packets of multiple net_devices in different netns. The cgroup_bpf programs are the most exposed to sockets and devices across netns, but the need to identify netns applies to all. For example, if bpf_type_cgroup_skb didn't exist the system wide monitoring daemon could have used ld_preload mechanism and attached the same program to see traffic from applications across netns. Therefore make bpf_sk_netns_id() helper available to all network related bpf program types. For socket, cls_bpf and cgroup_skb programs this helper can be considered a new feature, whereas for cgroup_sock programs that modify sk->bound_dev_if (like 'ip vrf' does) it's a bug fix, since 'ip vrf' needs to be netns aware. Signed-off-by: Alexei Starovoitov--- Eric, I'v added proc_get_ns_devid_inum() to nsfs.c right next to __ns_get_path(), so when it is time in the future to make nsfs more namespace aware, it will be easy to adjust both new_inode_pseudo(mnt->mnt_sb) line and proc_get_ns_devid_inum() I thought about using ns->stashed, but it's obviously transient inode and not usable. If later we decide to store dev_t into ns_common it will be fine as well. We'll just change proc_get_ns_devid_inum() without affecting user space. v2->v3: build bot complained. s/static/static inline/. no other changes. --- fs/nsfs.c | 7 +++ include/linux/proc_ns.h | 3 ++- include/uapi/linux/bpf.h | 14 +- net/core/filter.c | 44 ++- samples/bpf/bpf_helpers.h | 2 ++ samples/bpf/sock_flags_kern.c | 2 ++ samples/bpf/sockex1_kern.c| 2 ++ 7 files changed, 71 insertions(+), 3 deletions(-) diff --git a/fs/nsfs.c b/fs/nsfs.c index 8c9fb29c6673..1a604bccef86 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -49,6 +49,13 @@ static void nsfs_evict(struct inode *inode) ns->ops->put(ns); } +u64 proc_get_ns_devid_inum(struct ns_common *ns) +{ + u64 dev = new_encode_dev(nsfs_mnt->mnt_sb->s_dev); + + return (dev << 32) | ns->inum; +} + static void *__ns_get_path(struct path *path, struct ns_common *ns) { struct vfsmount *mnt = nsfs_mnt; diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 12cb8bd81d2d..531c16105198 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -48,7 +48,7 @@ extern int pid_ns_prepare_proc(struct pid_namespace *ns); extern void pid_ns_release_proc(struct pid_namespace *ns); extern int proc_alloc_inum(unsigned int *pino); extern void proc_free_inum(unsigned int inum); - +extern u64 proc_get_ns_devid_inum(struct ns_common *ns); #else /* CONFIG_PROC_FS */ static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } @@ -61,6 +61,7 @@ static inline int proc_alloc_inum(unsigned int *inum) } static inline void proc_free_inum(unsigned int inum) {} +static inline u64 proc_get_ns_devid_inum(struct ns_common *ns) { return 0; } #endif /* CONFIG_PROC_FS */ static inline int ns_alloc_inum(struct ns_common *ns) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0eb0e87dbe9f..e5b8cf16cbaf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -430,6 +430,17 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: An positive/negative integer to be added to xdp_md.data * Return: 0 on success or negative on error + * + * u64 bpf_sk_netns_id(sk) + * Returns unique value that identifies netns of given socket or skb. + * The upper 32-bits of the return value contain device id where namespace + * filesystem resides and lower 32-bits contain inode number within + * that filesystem. It's the same value as: + * struct stat st; + * stat("/proc/pid/ns/net", ); + * return (st->st_dev << 32) | st->st_ino; + * @sk: pointer to struct sock or struct __sk_buff + * Return: filesystem's device id | netns inode */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec),