[PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread David Ahern
Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
Currently only sk_bound_dev_if is exported to userspace for modification
by a bpf program.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

Signed-off-by: David Ahern 
---
 include/linux/filter.h   |  2 +-
 include/uapi/linux/bpf.h | 15 
 kernel/bpf/cgroup.c  |  9 +
 kernel/bpf/syscall.c |  4 +++
 net/core/filter.c| 92 
 net/core/sock.c  |  7 
 6 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1f09c521adfe..808e158742a2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -408,7 +408,7 @@ struct bpf_prog {
enum bpf_prog_type  type;   /* Type of BPF program */
struct bpf_prog_aux *aux;   /* Auxiliary fields */
struct sock_fprog_kern  *orig_prog; /* Original BPF program */
-   unsigned int(*bpf_func)(const struct sk_buff *skb,
+   unsigned int(*bpf_func)(const void *ctx,
const struct bpf_insn *filter);
/* Instructions for interpreter */
union {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6b62ee9a2f78..ce5283f221e7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -99,11 +99,13 @@ enum bpf_prog_type {
BPF_PROG_TYPE_XDP,
BPF_PROG_TYPE_PERF_EVENT,
BPF_PROG_TYPE_CGROUP_SKB,
+   BPF_PROG_TYPE_CGROUP_SOCK,
 };
 
 enum bpf_attach_type {
BPF_CGROUP_INET_INGRESS,
BPF_CGROUP_INET_EGRESS,
+   BPF_CGROUP_INET_SOCK_CREATE,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -449,6 +451,15 @@ enum bpf_func_id {
 */
BPF_FUNC_get_numa_node_id,
 
+   /**
+* sock_store_u32(sk, offset, val) - store bytes into sock
+* @sk: pointer to sock
+* @offset: offset within sock
+* @val: value to write
+* Return: 0 on success
+*/
+   BPF_FUNC_sock_store_u32,
+
__BPF_FUNC_MAX_ID,
 };
 
@@ -524,6 +535,10 @@ struct bpf_tunnel_key {
__u32 tunnel_label;
 };
 
+struct bpf_sock {
+   __u32 bound_dev_if;
+};
+
 /* User return codes for XDP prog type.
  * A valid XDP program must return one of these defined values. All other
  * return codes are reserved for future use. Unknown return codes will result
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 918c01a6f129..4fcb58013a3a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -117,6 +117,12 @@ void __cgroup_bpf_update(struct cgroup *cgrp,
}
 }
 
+static int __cgroup_bpf_run_filter_sk_create(struct sock *sk,
+struct bpf_prog *prog)
+{
+   return prog->bpf_func(sk, prog->insnsi) == 1 ? 0 : -EPERM;
+}
+
 static int __cgroup_bpf_run_filter_skb(struct sk_buff *skb,
   struct bpf_prog *prog)
 {
@@ -171,6 +177,9 @@ int __cgroup_bpf_run_filter(struct sock *sk,
case BPF_CGROUP_INET_EGRESS:
ret = __cgroup_bpf_run_filter_skb(skb, prog);
break;
+   case BPF_CGROUP_INET_SOCK_CREATE:
+   ret = __cgroup_bpf_run_filter_sk_create(sk, prog);
+   break;
/* make gcc happy else complains about missing enum value */
default:
return 0;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9abc88deabbc..3b7e30e28cd3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -844,6 +844,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
ptype = BPF_PROG_TYPE_CGROUP_SKB;
break;
 
+   case BPF_CGROUP_INET_SOCK_CREATE:
+   ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+   break;
default:
return -EINVAL;
}
@@ -879,6 +882,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
+   case BPF_CGROUP_INET_SOCK_CREATE:
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
diff --git a/net/core/filter.c b/net/core/filter.c
index 4552b8c93b99..775802881b01 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2482,6 +2482,27 @@ static const struct bpf_func_proto 
bpf_xdp_event_output_proto = {
.arg5_type  = ARG_CONST_STACK_SIZE,
 };
 
+

Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread Daniel Borkmann

On 10/26/2016 12:30 AM, David Ahern wrote:

Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
Currently only sk_bound_dev_if is exported to userspace for modification
by a bpf program.

This allows a cgroup to be configured such that AF_INET{6} sockets opened
by processes are automatically bound to a specific device. In turn, this
enables the running of programs that do not support SO_BINDTODEVICE in a
specific VRF context / L3 domain.

Signed-off-by: David Ahern 

[...]

@@ -524,6 +535,10 @@ struct bpf_tunnel_key {
__u32 tunnel_label;
  };

+struct bpf_sock {
+   __u32 bound_dev_if;
+};
+
  /* User return codes for XDP prog type.
   * A valid XDP program must return one of these defined values. All other
   * return codes are reserved for future use. Unknown return codes will result

[...]

diff --git a/net/core/filter.c b/net/core/filter.c
index 4552b8c93b99..775802881b01 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2482,6 +2482,27 @@ static const struct bpf_func_proto 
bpf_xdp_event_output_proto = {
.arg5_type  = ARG_CONST_STACK_SIZE,
  };

+BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
+{
+   u8 *ptr = (u8 *)sk;
+
+   if (unlikely(offset > sizeof(*sk)))
+   return -EFAULT;
+
+   *((u32 *)ptr) = val;
+
+   return 0;
+}


Seems strange to me. So, this helper allows to overwrite arbitrary memory
of a struct sock instance. Potentially we could crash the kernel.

And in your sock_filter_convert_ctx_access(), you already implement inline
read/write for the context ...

Your demo code does in pseudocode:

  r1 = sk
  r2 = offsetof(struct bpf_sock, bound_dev_if)
  r3 = idx
  r1->sk_bound_dev_if = idx
  sock_store_u32(r1, r2, r3) // updates sk_bound_dev_if again to idx
  return 1

Dropping that helper from the patch, the only thing a program can do here
is to read/write the sk_bound_dev_if helper per cgroup. Hmm ... dunno. So
this really has to be for cgroups v2, right?


Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread Eric Dumazet
On Tue, 2016-10-25 at 15:30 -0700, David Ahern wrote:
> Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
> BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
> any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
> Currently only sk_bound_dev_if is exported to userspace for modification
> by a bpf program.
> 
> This allows a cgroup to be configured such that AF_INET{6} sockets opened
> by processes are automatically bound to a specific device. In turn, this
> enables the running of programs that do not support SO_BINDTODEVICE in a
> specific VRF context / L3 domain.

Does this mean that these programs no longer can use loopback ?




Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread Alexei Starovoitov
On Wed, Oct 26, 2016 at 01:28:24AM +0200, Daniel Borkmann wrote:
> On 10/26/2016 12:30 AM, David Ahern wrote:
> >Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
> >BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
> >any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
> >Currently only sk_bound_dev_if is exported to userspace for modification
> >by a bpf program.
> >
> >This allows a cgroup to be configured such that AF_INET{6} sockets opened
> >by processes are automatically bound to a specific device. In turn, this
> >enables the running of programs that do not support SO_BINDTODEVICE in a
> >specific VRF context / L3 domain.
> >
> >Signed-off-by: David Ahern 
> [...]
> >@@ -524,6 +535,10 @@ struct bpf_tunnel_key {
> > __u32 tunnel_label;
> >  };
> >
> >+struct bpf_sock {
> >+__u32 bound_dev_if;
> >+};
> >+
> >  /* User return codes for XDP prog type.
> >   * A valid XDP program must return one of these defined values. All other
> >   * return codes are reserved for future use. Unknown return codes will 
> > result
> [...]
> >diff --git a/net/core/filter.c b/net/core/filter.c
> >index 4552b8c93b99..775802881b01 100644
> >--- a/net/core/filter.c
> >+++ b/net/core/filter.c
> >@@ -2482,6 +2482,27 @@ static const struct bpf_func_proto 
> >bpf_xdp_event_output_proto = {
> > .arg5_type  = ARG_CONST_STACK_SIZE,
> >  };
> >
> >+BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
> >+{
> >+u8 *ptr = (u8 *)sk;
> >+
> >+if (unlikely(offset > sizeof(*sk)))
> >+return -EFAULT;
> >+
> >+*((u32 *)ptr) = val;
> >+
> >+return 0;
> >+}
> 
> Seems strange to me. So, this helper allows to overwrite arbitrary memory
> of a struct sock instance. Potentially we could crash the kernel.
> 
> And in your sock_filter_convert_ctx_access(), you already implement inline
> read/write for the context ...
> 
> Your demo code does in pseudocode:
> 
>   r1 = sk
>   r2 = offsetof(struct bpf_sock, bound_dev_if)
>   r3 = idx
>   r1->sk_bound_dev_if = idx
>   sock_store_u32(r1, r2, r3) // updates sk_bound_dev_if again to idx
>   return 1
> 
> Dropping that helper from the patch, the only thing a program can do here
> is to read/write the sk_bound_dev_if helper per cgroup. Hmm ... dunno. So
> this really has to be for cgroups v2, right?

Looks pretty cool.
Same question as Daniel... why extra helper?
If program overwrites bpf_sock->sk_bound_dev_if can we use that
after program returns?
Also do you think it's possible to extend this patch to prototype
the port bind restrictions that were proposed few month back using
the same bpf_sock input structure?
Probably the check would need to be moved into different
place instead of sk_alloc(), but then we'll have more
opportunities to overwrite bound_dev_if, look at ports and so on ?



Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread David Ahern
On 10/25/16 5:28 PM, Daniel Borkmann wrote:
>> +BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
>> +{
>> +u8 *ptr = (u8 *)sk;
>> +
>> +if (unlikely(offset > sizeof(*sk)))
>> +return -EFAULT;
>> +
>> +*((u32 *)ptr) = val;
>> +
>> +return 0;
>> +}
> 
> Seems strange to me. So, this helper allows to overwrite arbitrary memory
> of a struct sock instance. Potentially we could crash the kernel.
> 
> And in your sock_filter_convert_ctx_access(), you already implement inline
> read/write for the context ...
> 
> Your demo code does in pseudocode:
> 
>   r1 = sk
>   r2 = offsetof(struct bpf_sock, bound_dev_if)
>   r3 = idx
>   r1->sk_bound_dev_if = idx
>   sock_store_u32(r1, r2, r3) // updates sk_bound_dev_if again to idx
>   return 1
> 
> Dropping that helper from the patch, the only thing a program can do here
> is to read/write the sk_bound_dev_if helper per cgroup. Hmm ... dunno. So
> this really has to be for cgroups v2, right?

Showing my inexperience with the bpf code. The helper can be dropped. I'll do 
that for v2.

Yes, Daniel's patch set provides the infra for this one and it has a cgroups v2 
limitation.


Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread David Ahern
On 10/25/16 5:39 PM, Eric Dumazet wrote:
> On Tue, 2016-10-25 at 15:30 -0700, David Ahern wrote:
>> Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
>> BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
>> any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
>> Currently only sk_bound_dev_if is exported to userspace for modification
>> by a bpf program.
>>
>> This allows a cgroup to be configured such that AF_INET{6} sockets opened
>> by processes are automatically bound to a specific device. In turn, this
>> enables the running of programs that do not support SO_BINDTODEVICE in a
>> specific VRF context / L3 domain.
> 
> Does this mean that these programs no longer can use loopback ?

I am probably misunderstanding your question, so I'll ramble a bit and see if I 
cover it.

This patch set generically allows sk_bound_dev_if to be set to any value. It 
does not check that an index corresponds to a device at that moment (either bpf 
prog install or execution of the filter), and even if it did the device can be 
deleted at any moment. That seems to be standard operating procedure with bpf 
filters (user mistakes mean packets go no where and in this case a socket is 
bound to a non-existent device).

The index can be any interface (e.g., eth0) or an L3 device (e.g., a VRF). 
Loopback and index=1 is allowed.

The VRF device is the loopback device for the domain, so binding to it covers 
addresses on the VRF device as well as interfaces enslaved to it.

Did you mean something else?


Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread David Ahern
On 10/25/16 7:55 PM, Alexei Starovoitov wrote:
> Same question as Daniel... why extra helper?

It can be dropped. wrong path while learning this code.

> If program overwrites bpf_sock->sk_bound_dev_if can we use that
> after program returns?
> Also do you think it's possible to extend this patch to prototype
> the port bind restrictions that were proposed few month back using
> the same bpf_sock input structure?
> Probably the check would need to be moved into different
> place instead of sk_alloc(), but then we'll have more
> opportunities to overwrite bound_dev_if, look at ports and so on ?
> 

I think the sk_bound_dev_if should be set when the socket is created versus 
waiting until it is used (bind, connect, sendmsg, recvmsg). That said, the 
filter could (should?) be run in the protocol family create function 
(inet_create and inet6_create) versus sk_alloc. That would allow the filter to 
allocate a local port based on its logic. I'd prefer interested parties to look 
into the details of that use case.

I'll move the running of the filter to the end of the create functions for v2.


Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread Eric Dumazet
On Tue, 2016-10-25 at 20:21 -0600, David Ahern wrote:
> On 10/25/16 5:39 PM, Eric Dumazet wrote:
> > On Tue, 2016-10-25 at 15:30 -0700, David Ahern wrote:
> >> Add new cgroup based program type, BPF_PROG_TYPE_CGROUP_SOCK. Similar to
> >> BPF_PROG_TYPE_CGROUP_SKB programs can be attached to a cgroup and run
> >> any time a process in the cgroup opens an AF_INET or AF_INET6 socket.
> >> Currently only sk_bound_dev_if is exported to userspace for modification
> >> by a bpf program.
> >>
> >> This allows a cgroup to be configured such that AF_INET{6} sockets opened
> >> by processes are automatically bound to a specific device. In turn, this
> >> enables the running of programs that do not support SO_BINDTODEVICE in a
> >> specific VRF context / L3 domain.
> > 
> > Does this mean that these programs no longer can use loopback ?
> 
> I am probably misunderstanding your question, so I'll ramble a bit and
> see if I cover it.
> 
> This patch set generically allows sk_bound_dev_if to be set to any
> value. It does not check that an index corresponds to a device at that
> moment (either bpf prog install or execution of the filter), and even
> if it did the device can be deleted at any moment. That seems to be
> standard operating procedure with bpf filters (user mistakes mean
> packets go no where and in this case a socket is bound to a
> non-existent device).
> 
> The index can be any interface (e.g., eth0) or an L3 device (e.g., a
> VRF). Loopback and index=1 is allowed.
> 
> The VRF device is the loopback device for the domain, so binding to it
> covers addresses on the VRF device as well as interfaces enslaved to
> it.
> 
> Did you mean something else?

Maybe I do not understand how you plan to use this.

Let say you want a filter to force a BIND_TO_DEVICE xxx because a task
runs in a cgroup yyy

Then a program doing a socket() + connect (127.0.0.1)  will fail ?

I do not see how a BPF filter at socket() time can be selective.





Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-25 Thread David Ahern
On 10/25/16 8:48 PM, Eric Dumazet wrote:
> Maybe I do not understand how you plan to use this.
> 
> Let say you want a filter to force a BIND_TO_DEVICE xxx because a task
> runs in a cgroup yyy
> 
> Then a program doing a socket() + connect (127.0.0.1)  will fail ?

maybe. VRF devices can have 127.0.0.1 address in which case the connect would 
succeed. ntpq uses 127.0.0.1 to talk to ntpd for example. If ntpd is bound to a 
Management VRF, then you need this context for ntpq to talk to it.

> 
> I do not see how a BPF filter at socket() time can be selective.

Here's my use case - and this is what we are doing today with the l3mdev cgroup 
(a patch which has not been accepted upstream):

1. create VRF device

2. create cgroup and configure it

   in this case it means load the bpf program that sets the sk_bound_dev_if to 
the vrf device that was just created

3. Add shell to cgroup

   For Management VRF this can be done automatically at login so a user does 
not need to take any action.

At this point any command run in the shell runs in the VRF context (PS1 for 
bash can show the VRF to keep you from going crazy on why a connect fails :-)) 
so any ipv4/ipv6 sockets have that VRF scope.

For example, if the VRF is a management VRF, sockets opened by apt-get are 
automatically bound to the VRF at create time, so requests go out the eth0 
(management) interface.

In a similar fashion, using a cgroup and assigning tasks to it allows automated 
launch of systemd services with instances running in a VRF context - one 
dhcrelay in vrf red, one in vrf blue with both using a parameterized instance 
file.




Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-26 Thread Daniel Borkmann

On 10/26/2016 04:05 AM, David Ahern wrote:

On 10/25/16 5:28 PM, Daniel Borkmann wrote:

+BPF_CALL_3(bpf_sock_store_u32, struct sock *, sk, u32, offset, u32, val)
+{
+u8 *ptr = (u8 *)sk;
+
+if (unlikely(offset > sizeof(*sk)))
+return -EFAULT;
+
+*((u32 *)ptr) = val;
+
+return 0;
+}


Seems strange to me. So, this helper allows to overwrite arbitrary memory
of a struct sock instance. Potentially we could crash the kernel.

And in your sock_filter_convert_ctx_access(), you already implement inline
read/write for the context ...

Your demo code does in pseudocode:

   r1 = sk
   r2 = offsetof(struct bpf_sock, bound_dev_if)
   r3 = idx
   r1->sk_bound_dev_if = idx
   sock_store_u32(r1, r2, r3) // updates sk_bound_dev_if again to idx
   return 1

Dropping that helper from the patch, the only thing a program can do here
is to read/write the sk_bound_dev_if helper per cgroup. Hmm ... dunno. So
this really has to be for cgroups v2, right?


Showing my inexperience with the bpf code. The helper can be dropped. I'll do 
that for v2.

Yes, Daniel's patch set provides the infra for this one and it has a cgroups v2 
limitation.


Sure, I understand that, and I know it was brought up at netconf, I'm
just still wondering in general if BPF is a good fit here in the sense
that what the program can do is just really really limited (at least
right now). Hmm, just trying to understand where this would go long term.
Probably okay'ish, if it's guaranteed that it can also integrate various
other use cases as well for the new program type like the ones proposed
by Anoop from net cgroup.

If that would reuse BPF_PROG_TYPE_CGROUP_SOCK from not only sk_alloc()
hook, programs can thus change sk_bound_dev_if also from elsewhere since
it's a fixed part of the context, and attaching to the cgroup comes after
program was verified and returned a program fd back to the user. I guess
it might be expected, right?

I mean non-cooperative processes in that cgroup could already overwrite
the policy set in sk_alloc() anyway with SO_BINDTODEVICE, no? What is the
expectation if processes are moved from one cgroup to another one? Is it
expected that also sk_bound_dev_if updates (not yet seeing how that would
work from a BPF program)? If sk_bound_dev_if is enforced from cgroup side,
should that lock out processes from changing it (maybe similar to what we
do in SOCK_FILTER_LOCKED)?


Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-26 Thread Thomas Graf
On 10/25/16 at 03:30pm, David Ahern wrote:
> @@ -171,6 +177,9 @@ int __cgroup_bpf_run_filter(struct sock *sk,
>   case BPF_CGROUP_INET_EGRESS:
>   ret = __cgroup_bpf_run_filter_skb(skb, prog);
>   break;
> + case BPF_CGROUP_INET_SOCK_CREATE:
> + ret = __cgroup_bpf_run_filter_sk_create(sk, prog);
> + break;
>   /* make gcc happy else complains about missing enum value */
>   default:
>   return 0;

Thinking further ahead of your simple example. Instead of adding yet
another prog type for the same hook, we can make this compatible with
BPF_CGROUP_INET_EGRESS instead which would then provide a ctx which
contains both, the sk and skb.

The ctx concept is very flexible. We can keep the existing dummy skb
representation and add sk_ fields which are only valid for BPF at
socket layer, e.g skb->sk_bound_dev_if would translate to
sk->bound_dev_if.


Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-26 Thread David Ahern
On 10/26/16 2:33 AM, Daniel Borkmann wrote:
> Sure, I understand that, and I know it was brought up at netconf, I'm
> just still wondering in general if BPF is a good fit here in the sense
> that what the program can do is just really really limited (at least
> right now). Hmm, just trying to understand where this would go long term.
> Probably okay'ish, if it's guaranteed that it can also integrate various
> other use cases as well for the new program type like the ones proposed
> by Anoop from net cgroup.
> 
> If that would reuse BPF_PROG_TYPE_CGROUP_SOCK from not only sk_alloc()
> hook, programs can thus change sk_bound_dev_if also from elsewhere since
> it's a fixed part of the context, and attaching to the cgroup comes after
> program was verified and returned a program fd back to the user. I guess
> it might be expected, right?

sure.

> 
> I mean non-cooperative processes in that cgroup could already overwrite
> the policy set in sk_alloc() anyway with SO_BINDTODEVICE, no? What is the

yes. If a process running as root is invoked/wants to change the binding it 
can. For example a shell is set in Management VRF context and the user wants to 
ping out a data plane port the -I arg would do that.

> expectation if processes are moved from one cgroup to another one? Is it 
> expected that also sk_bound_dev_if updates (not yet seeing how that would
> work from a BPF program)? If sk_bound_dev_if is enforced from cgroup side,
> should that lock out processes from changing it (maybe similar to what we
> do in SOCK_FILTER_LOCKED)?

existing sockets would not be affected by the cgroup program.



Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-26 Thread David Ahern
On 10/26/16 2:41 AM, Thomas Graf wrote:
> On 10/25/16 at 03:30pm, David Ahern wrote:
>> @@ -171,6 +177,9 @@ int __cgroup_bpf_run_filter(struct sock *sk,
>>  case BPF_CGROUP_INET_EGRESS:
>>  ret = __cgroup_bpf_run_filter_skb(skb, prog);
>>  break;
>> +case BPF_CGROUP_INET_SOCK_CREATE:
>> +ret = __cgroup_bpf_run_filter_sk_create(sk, prog);
>> +break;
>>  /* make gcc happy else complains about missing enum value */
>>  default:
>>  return 0;
> 
> Thinking further ahead of your simple example. Instead of adding yet
> another prog type for the same hook, we can make this compatible with
> BPF_CGROUP_INET_EGRESS instead which would then provide a ctx which
> contains both, the sk and skb.
> 
> The ctx concept is very flexible. We can keep the existing dummy skb
> representation and add sk_ fields which are only valid for BPF at
> socket layer, e.g skb->sk_bound_dev_if would translate to
> sk->bound_dev_if.
> 

It's an odd user semantic to me to put sock elements into the shadow sk_buff 
and to reuse BPF_CGROUP_INET_EGRESS. 

I can drop the _CREATE and just make it BPF_CGROUP_INET_SOCK so it works for 
any sock modification someone wants to add -- e.g., the port binding use case.


Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-26 Thread Thomas Graf
On 10/26/16 at 10:08am, David Ahern wrote:
> On 10/26/16 2:41 AM, Thomas Graf wrote:
> > On 10/25/16 at 03:30pm, David Ahern wrote:
> >> @@ -171,6 +177,9 @@ int __cgroup_bpf_run_filter(struct sock *sk,
> >>case BPF_CGROUP_INET_EGRESS:
> >>ret = __cgroup_bpf_run_filter_skb(skb, prog);
> >>break;
> >> +  case BPF_CGROUP_INET_SOCK_CREATE:
> >> +  ret = __cgroup_bpf_run_filter_sk_create(sk, prog);
> >> +  break;
> >>/* make gcc happy else complains about missing enum value */
> >>default:
> >>return 0;
> > 
> > Thinking further ahead of your simple example. Instead of adding yet
> > another prog type for the same hook, we can make this compatible with
> > BPF_CGROUP_INET_EGRESS instead which would then provide a ctx which
> > contains both, the sk and skb.
> > 
> > The ctx concept is very flexible. We can keep the existing dummy skb
> > representation and add sk_ fields which are only valid for BPF at
> > socket layer, e.g skb->sk_bound_dev_if would translate to
> > sk->bound_dev_if.
> > 
> 
> It's an odd user semantic to me to put sock elements into the shadow sk_buff 
> and to reuse BPF_CGROUP_INET_EGRESS. 
> 
> I can drop the _CREATE and just make it BPF_CGROUP_INET_SOCK so it works for 
> any sock modification someone wants to add -- e.g., the port binding use case.

Your specific sk_alloc hook won't have an skb but the majority of
socket BPF programs will want to see both skb and sk. It is not
ideal to introduce a new bpf_prog_type for every minimal difference
in capability if the majority of capabilities and restrictions are
shared. Instead, the subtype approach was implemented by the Landlock
series looks much cleaner to me.

I think we should think about how to do this properly for all BPF
programs which operate in socket cgroup context.


Re: [PATCH net-next 2/3] bpf: Add new cgroups prog type to enable sock modifications

2016-10-26 Thread David Ahern
On 10/26/16 2:31 PM, Mahesh Bandewar (महेश बंडेवार) wrote:
> The hook insertion in sk_alloc() may not solve all control-path checks as not 
> much can be done (probably apart for changing sk_bound_dev_if) during 
> allocation but hooks in bind(), listen(), setsockopt() etc. (not a complete 
> list) will be needed. Also pushing this change (changing sk_bound_dev_if) 
> later (than sk_alloc()) you might avoid the case dumazet@ has mentioned.

For v2 I moved the running of the filter to the end of inet{6}_create.