Re: [PATCH bpf-next 2/2] xsk: fix bug when trying to use both copy and zero-copy on one queue id

2018-09-19 Thread Magnus Karlsson
On Tue, Sep 18, 2018 at 7:23 PM Y Song  wrote:
>
> On Tue, Sep 18, 2018 at 3:13 AM Magnus Karlsson
>  wrote:
> >
> > Previously, the xsk code did not record which umem was bound to a
> > specific queue id. This was not required if all drivers were zero-copy
> > enabled as this had to be recorded in the driver anyway. So if a user
> > tried to bind two umems to the same queue, the driver would say
> > no. But if copy-mode was first enabled and then zero-copy mode (or the
> > reverse order), we mistakenly enabled both of them on the same umem
> > leading to buggy behavior. The main culprit for this is that we did
> > not store the association of umem to queue id in the copy case and
> > only relied on the driver reporting this. As this relation was not
> > stored in the driver for copy mode (it does not rely on the AF_XDP
> > NDOs), this obviously could not work.
> >
> > This patch fixes the problem by always recording the umem to queue id
> > relationship in the netdev_queue and netdev_rx_queue structs. This way
> > we always know what kind of umem has been bound to a queue id and can
> > act appropriately at bind time.
> >
> > Signed-off-by: Magnus Karlsson 
> > ---
> >  net/xdp/xdp_umem.c | 87 
> > +++---
> >  net/xdp/xdp_umem.h |  2 +-
> >  2 files changed, 71 insertions(+), 18 deletions(-)
> >
> > diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> > index b3b632c..12300b5 100644
> > --- a/net/xdp/xdp_umem.c
> > +++ b/net/xdp/xdp_umem.c
> > @@ -42,6 +42,41 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct 
> > xdp_sock *xs)
> > }
> >  }
> >
> > +/* The umem is stored both in the _rx struct and the _tx struct as we do
> > + * not know if the device has more tx queues than rx, or the opposite.
> > + * This might also change during run time.
> > + */
> > +static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem 
> > *umem,
> > +   u16 queue_id)
> > +{
> > +   if (queue_id < dev->real_num_rx_queues)
> > +   dev->_rx[queue_id].umem = umem;
> > +   if (queue_id < dev->real_num_tx_queues)
> > +   dev->_tx[queue_id].umem = umem;
> > +}
> > +
> > +static struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
> > + u16 queue_id)
> > +{
> > +   if (queue_id < dev->real_num_rx_queues)
> > +   return dev->_rx[queue_id].umem;
> > +   if (queue_id < dev->real_num_tx_queues)
> > +   return dev->_tx[queue_id].umem;
> > +
> > +   return NULL;
> > +}
> > +
> > +static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
> > +{
> > +   /* Zero out the entry independent on how many queues are configured
> > +* at this point in time, as it might be used in the future.
> > +*/
> > +   if (queue_id < dev->num_rx_queues)
> > +   dev->_rx[queue_id].umem = NULL;
> > +   if (queue_id < dev->num_tx_queues)
> > +   dev->_tx[queue_id].umem = NULL;
> > +}
> > +
>
> I am not sure whether the following scenario can happen or not.
> Could you clarify?
>1. suppose initially we have num_rx_queues = num_tx_queues = 10
>xdp_reg_umem_at_qid() set umem1 to queue_id = 8
>2. num_tx_queues is changed to 5

You probably mean real_num_tx_queues here. This is the current number
of queues configured via e.g. ethtool. num_tx_queues will not change
unless you change device (or device driver).

>3. xdp_clear_umem_at_qid() is called for queue_id = 8,
>and dev->_rx[8].umem = 0.

At this point both _rx[8].umem and _tx[8].umem are set to NULL as the
test is against num_[rx|tx]_queues which is the max allowed for the
device, not the current allocated one which is real_num_tx_queues.
With this in mind, the scenario below will not happen. Do you agree?

>4. xdp_reg_umem_at_qid() is called again to set for queue_id = 8
>dev->_rx[8].umem = umem2
>5. num_tx_queues is changed to 10
>   Now dev->_rx[8].umem != dev->_tx[8].umem, is this possible and is it
> a problem?
>
> >  int xdp_umem_query(struct net_device *dev, u16 queue_id)
> >  {
> > struct netdev_bpf bpf;
> > @@ -58,11 +93,11 @@ int xdp_umem_query(struct net_device *dev, u16 queue_id)
> >  }
> >
> >  int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
> > -   u32 queue_id, u16 flags)
> > +   u16 queue_id, u16 flags)
> >  {
> > bool force_zc, force_copy;
> > struct netdev_bpf bpf;
> > -   int err;
> > +   int err = 0;
> >
> > force_zc = flags & XDP_ZEROCOPY;
> > force_copy = flags & XDP_COPY;
> > @@ -70,18 +105,28 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct 
> > net_device *dev,
> > if (force_zc && force_copy)
> > return -EINVAL;
> >
> > +   rtnl_lock();
> > +   if (xdp_get_umem_from_qid(dev, queue_id)) {
> > +   err = -EBUSY;

Re: [PATCH bpf-next 2/2] xsk: fix bug when trying to use both copy and zero-copy on one queue id

2018-09-19 Thread Magnus Karlsson
On Wed, Sep 19, 2018 at 3:58 AM Jakub Kicinski
 wrote:
>
> On Tue, 18 Sep 2018 10:22:11 -0700, Y Song wrote:
> > > +/* The umem is stored both in the _rx struct and the _tx struct as we do
> > > + * not know if the device has more tx queues than rx, or the opposite.
> > > + * This might also change during run time.
> > > + */
> > > +static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem 
> > > *umem,
> > > +   u16 queue_id)
> > > +{
> > > +   if (queue_id < dev->real_num_rx_queues)
> > > +   dev->_rx[queue_id].umem = umem;
> > > +   if (queue_id < dev->real_num_tx_queues)
> > > +   dev->_tx[queue_id].umem = umem;
> > > +}
> > > +
> > > +static struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
> > > + u16 queue_id)
> > > +{
> > > +   if (queue_id < dev->real_num_rx_queues)
> > > +   return dev->_rx[queue_id].umem;
> > > +   if (queue_id < dev->real_num_tx_queues)
> > > +   return dev->_tx[queue_id].umem;
> > > +
> > > +   return NULL;
> > > +}
> > > +
> > > +static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
> > > +{
> > > +   /* Zero out the entry independent on how many queues are 
> > > configured
> > > +* at this point in time, as it might be used in the future.
> > > +*/
> > > +   if (queue_id < dev->num_rx_queues)
> > > +   dev->_rx[queue_id].umem = NULL;
> > > +   if (queue_id < dev->num_tx_queues)
> > > +   dev->_tx[queue_id].umem = NULL;
> > > +}
> > > +
> >
> > I am not sure whether the following scenario can happen or not.
> > Could you clarify?
> >1. suppose initially we have num_rx_queues = num_tx_queues = 10
> >xdp_reg_umem_at_qid() set umem1 to queue_id = 8
> >2. num_tx_queues is changed to 5
> >3. xdp_clear_umem_at_qid() is called for queue_id = 8,
> >and dev->_rx[8].umem = 0.
> >4. xdp_reg_umem_at_qid() is called again to set for queue_id = 8
> >dev->_rx[8].umem = umem2
> >5. num_tx_queues is changed to 10
> >   Now dev->_rx[8].umem != dev->_tx[8].umem, is this possible and is it
> > a problem?
>
> Plus IIRC the check of qid vs real_num_[rt]x_queues in xsk_bind() is
> not under rtnl_lock so it doesn't count for much.  Why not do all the
> checks against num_[rt]x_queues here, instead of real_..?

You are correct, two separate rtnl_lock regions is broken. Will spin a
v2 tomorrow when I am back in the office.

Thanks Jakub for catching this. I really appreciate you reviewing my code.

/Magnus


[RFC bpf-next 2/4] bpf: return EOPNOTSUPP when map lookup isn't supported

2018-09-19 Thread Prashant Bhole
Return ERR_PTR(-EOPNOTSUPP) from map_lookup_elem() methods of below
map types:
- BPF_MAP_TYPE_PROG_ARRAY
- BPF_MAP_TYPE_STACK_TRACE
- BPF_MAP_TYPE_XSKMAP
- BPF_MAP_TYPE_SOCKMAP/BPF_MAP_TYPE_SOCKHASH

Signed-off-by: Prashant Bhole 
---
 kernel/bpf/arraymap.c | 2 +-
 kernel/bpf/sockmap.c  | 2 +-
 kernel/bpf/stackmap.c | 2 +-
 kernel/bpf/xskmap.c   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index dded84cbe814..24583da9ffd1 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -449,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map)
 
 static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 {
-   return NULL;
+   return ERR_PTR(-EOPNOTSUPP);
 }
 
 /* only called from syscall */
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 488ef9663c01..e50922a802f7 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -2076,7 +2076,7 @@ int sockmap_get_from_fd(const union bpf_attr *attr, int 
type,
 
 static void *sock_map_lookup(struct bpf_map *map, void *key)
 {
-   return NULL;
+   return ERR_PTR(-EOPNOTSUPP);
 }
 
 static int sock_map_update_elem(struct bpf_map *map,
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 8061a439ef18..b2ade10f7ec3 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -505,7 +505,7 @@ const struct bpf_func_proto bpf_get_stack_proto = {
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
-   return NULL;
+   return ERR_PTR(-EOPNOTSUPP);
 }
 
 /* Called from syscall */
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 9f8463afda9c..ef0b7b6ef8a5 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -154,7 +154,7 @@ void __xsk_map_flush(struct bpf_map *map)
 
 static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
 {
-   return NULL;
+   return ERR_PTR(-EOPNOTSUPP);
 }
 
 static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value,
-- 
2.17.1




[RFC bpf-next 0/4] Error handling when map lookup isn't supported

2018-09-19 Thread Prashant Bhole
Currently when a map lookup fails, the user space API can not make any
distinction whether the given key was not found or lookup is not supported
by the particular map.

In this series we modify return value of maps which do not support
lookup. Lookup on such map implementation will return -EOPNOTSUPP.
bpf() syscall with BPF_MAP_LOOKUP_ELEM command will set EOPNOTSUPP
errno. We also handle this error in bpftool to print appropriate
message.

Patch 1: adds handling of BPF_MAP_LOOKUP_ELEM command of bpf syscall
such that errno will be set to EOPNOTSUPP when map doesn't support lookup

Patch 2: Modifies the return value of map_lookup_elem() to EOPNOTSUPP
for maps which do not support lookup

Patch 3: Splits do_dump() in bpftool/map.c. Element printing code is
moved out into a new function dump_map_elem(). This was done in order to
reduce deep indentation and accommodate further changes.

Patch 4: Changes to bpftool to do additional checking for errno when
map lookup is failed. In case of EOPNOTSUPP errno, it prints message
"lookup not supported for this map"

Prashant Bhole (4):
  bpf: error handling when map_lookup_elem isn't supported
  bpf: return EOPNOTSUPP when map lookup isn't supported
  tools/bpf: bpftool, split the function do_dump()
  tools/bpf: handle EOPNOTSUPP when map lookup is failed

 kernel/bpf/arraymap.c|   2 +-
 kernel/bpf/sockmap.c |   2 +-
 kernel/bpf/stackmap.c|   2 +-
 kernel/bpf/syscall.c |   9 +++-
 kernel/bpf/xskmap.c  |   2 +-
 tools/bpf/bpftool/main.h |   5 ++
 tools/bpf/bpftool/map.c  | 108 +++
 7 files changed, 90 insertions(+), 40 deletions(-)

-- 
2.17.1




[RFC bpf-next 4/4] tools/bpf: handle EOPNOTSUPP when map lookup is failed

2018-09-19 Thread Prashant Bhole
Let's add a check for the EOPNOTSUPP error when map lookup fails.
Also in case map doesn't support lookup, the output of map dump is
changed from "can't lookup element" to "lookup not supported for
this map".

Patch adds a print_entry_error() function to print the error
value.

Following example dumps a map which does not support lookup.

Output before:
root# bpftool map -jp dump id 40
[
"key": ["0x0a","0x00","0x00","0x00"
],
"value": {
"error": "can\'t lookup element"
},
"key": ["0x0b","0x00","0x00","0x00"
],
"value": {
"error": "can\'t lookup element"
}
]

root# bpftool map dump id 40
can't lookup element with key:
0a 00 00 00
can't lookup element with key:
0b 00 00 00
Found 0 elements

Output after changes:
root# bpftool map dump -jp  id 45
[
"key": ["0x0a","0x00","0x00","0x00"
],
"value": {
"error": "lookup not supported for this map"
},
"key": ["0x0b","0x00","0x00","0x00"
],
"value": {
"error": "lookup not supported for this map"
}
]

root# bpftool map dump id 45
key:
0a 00 00 00
value:
lookup not supported for this map
key:
0b 00 00 00
value:
lookup not supported for this map
Found 0 elements

Signed-off-by: Prashant Bhole 
---
 tools/bpf/bpftool/main.h |  5 +
 tools/bpf/bpftool/map.c  | 35 ++-
 2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 40492cdc4e53..1a8c683f949b 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -46,6 +46,11 @@
 
 #include "json_writer.h"
 
+#define ERR_CANNOT_LOOKUP \
+   "can't lookup element"
+#define ERR_LOOKUP_NOT_SUPPORTED \
+   "lookup not supported for this map"
+
 #define ptr_to_u64(ptr)((__u64)(unsigned long)(ptr))
 
 #define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); })
diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 284e12a289c0..2faccd2098c9 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -333,6 +333,25 @@ static void print_entry_json(struct bpf_map_info *info, 
unsigned char *key,
jsonw_end_object(json_wtr);
 }
 
+static void print_entry_error(struct bpf_map_info *info, unsigned char *key,
+ char *value)
+{
+   bool single_line, break_names;
+   int value_size = strlen(value);
+
+   break_names = info->key_size > 16 || value_size > 16;
+   single_line = info->key_size + value_size <= 24 && !break_names;
+
+   printf("key:%c", break_names ? '\n' : ' ');
+   fprint_hex(stdout, key, info->key_size, " ");
+
+   printf(single_line ? "  " : "\n");
+
+   printf("value:%c%s", break_names ? '\n' : ' ', value);
+
+   printf("\n");
+}
+
 static void print_entry_plain(struct bpf_map_info *info, unsigned char *key,
  unsigned char *value)
 {
@@ -660,6 +679,8 @@ static int dump_map_elem(int fd, void *key, void *value,
 json_writer_t *btf_wtr)
 {
int num_elems = 0;
+   int lookup_errno;
+   char *errstr;
 
if (!bpf_map_lookup_elem(fd, key, value)) {
if (json_output) {
@@ -682,22 +703,26 @@ static int dump_map_elem(int fd, void *key, void *value,
}
 
/* lookup error handling */
+   lookup_errno = errno;
+
if (map_is_map_of_maps(map_info->type) ||
map_is_map_of_progs(map_info->type))
goto out;
 
+   if (lookup_errno == EOPNOTSUPP)
+   errstr = ERR_LOOKUP_NOT_SUPPORTED;
+   else
+   errstr = ERR_CANNOT_LOOKUP;
+
if (json_output) {
jsonw_name(json_wtr, "key");
print_hex_data_json(key, map_info->key_size);
jsonw_name(json_wtr, "value");
jsonw_start_object(json_wtr);
-   jsonw_string_field(json_wtr, "error",
-  "can't lookup element");
+   jsonw_string_field(json_wtr, "error", errstr);
jsonw_end_object(json_wtr);
} else {
-   p_info("can't lookup element with key: ");
-   fprint_hex(stderr, key, map_info->key_size, " ");
-   fprintf(stderr, "\n");
+   print_entry_error(map_info, key, errstr);
}
 out:
return num_elems;
-- 
2.17.1




[RFC bpf-next 3/4] tools/bpf: bpftool, split the function do_dump()

2018-09-19 Thread Prashant Bhole
do_dump() function in bpftool/map.c has deep indentations. In order
to reduce deep indent, let's move element printing code out of
do_dump() into dump_map_elem() function.

Signed-off-by: Prashant Bhole 
---
 tools/bpf/bpftool/map.c | 83 -
 1 file changed, 49 insertions(+), 34 deletions(-)

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index af8ad32fa6e9..284e12a289c0 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -655,6 +655,54 @@ static int do_show(int argc, char **argv)
return errno == ENOENT ? 0 : -1;
 }
 
+static int dump_map_elem(int fd, void *key, void *value,
+struct bpf_map_info *map_info, struct btf *btf,
+json_writer_t *btf_wtr)
+{
+   int num_elems = 0;
+
+   if (!bpf_map_lookup_elem(fd, key, value)) {
+   if (json_output) {
+   print_entry_json(map_info, key, value, btf);
+   } else {
+   if (btf) {
+   struct btf_dumper d = {
+   .btf = btf,
+   .jw = btf_wtr,
+   .is_plain_text = true,
+   };
+
+   do_dump_btf(&d, map_info, key, value);
+   } else {
+   print_entry_plain(map_info, key, value);
+   }
+   num_elems++;
+   }
+   goto out;
+   }
+
+   /* lookup error handling */
+   if (map_is_map_of_maps(map_info->type) ||
+   map_is_map_of_progs(map_info->type))
+   goto out;
+
+   if (json_output) {
+   jsonw_name(json_wtr, "key");
+   print_hex_data_json(key, map_info->key_size);
+   jsonw_name(json_wtr, "value");
+   jsonw_start_object(json_wtr);
+   jsonw_string_field(json_wtr, "error",
+  "can't lookup element");
+   jsonw_end_object(json_wtr);
+   } else {
+   p_info("can't lookup element with key: ");
+   fprint_hex(stderr, key, map_info->key_size, " ");
+   fprintf(stderr, "\n");
+   }
+out:
+   return num_elems;
+}
+
 static int do_dump(int argc, char **argv)
 {
struct bpf_map_info info = {};
@@ -710,40 +758,7 @@ static int do_dump(int argc, char **argv)
err = 0;
break;
}
-
-   if (!bpf_map_lookup_elem(fd, key, value)) {
-   if (json_output)
-   print_entry_json(&info, key, value, btf);
-   else
-   if (btf) {
-   struct btf_dumper d = {
-   .btf = btf,
-   .jw = btf_wtr,
-   .is_plain_text = true,
-   };
-
-   do_dump_btf(&d, &info, key, value);
-   } else {
-   print_entry_plain(&info, key, value);
-   }
-   num_elems++;
-   } else if (!map_is_map_of_maps(info.type) &&
-  !map_is_map_of_progs(info.type)) {
-   if (json_output) {
-   jsonw_name(json_wtr, "key");
-   print_hex_data_json(key, info.key_size);
-   jsonw_name(json_wtr, "value");
-   jsonw_start_object(json_wtr);
-   jsonw_string_field(json_wtr, "error",
-  "can't lookup element");
-   jsonw_end_object(json_wtr);
-   } else {
-   p_info("can't lookup element with key: ");
-   fprint_hex(stderr, key, info.key_size, " ");
-   fprintf(stderr, "\n");
-   }
-   }
-
+   num_elems += dump_map_elem(fd, key, value, &info, btf, btf_wtr);
prev_key = key;
}
 
-- 
2.17.1




[RFC bpf-next 1/4] bpf: error handling when map_lookup_elem isn't supported

2018-09-19 Thread Prashant Bhole
The error value returned by map_lookup_elem doesn't differentiate
whether lookup failed because of an invalid key or because lookup is not
supported.

Lets add handling for -EOPNOTSUPP return value of map_lookup_elem()
method of map, with expectation from map's implementation that it
should return -EOPNOTSUPP if lookup is not supported.

The errno for bpf syscall for BPF_MAP_LOOKUP_ELEM command will be set
to EOPNOTSUPP if map lookup is not supported.

Signed-off-by: Prashant Bhole 
---
 kernel/bpf/syscall.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b3c2d09bcf7a..ecb06352b5a0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -716,10 +716,15 @@ static int map_lookup_elem(union bpf_attr *attr)
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
-   if (ptr)
+   if (IS_ERR(ptr)) {
+   err = PTR_ERR(ptr);
+   } else if (!ptr) {
+   err = -ENOENT;
+   } else {
+   err = 0;
memcpy(value, ptr, value_size);
+   }
rcu_read_unlock();
-   err = ptr ? 0 : -ENOENT;
}
 
if (err)
-- 
2.17.1




Re: [PATCH iproute2-next] iplink: add ipvtap support

2018-09-19 Thread Phil Sutter
On Wed, Sep 19, 2018 at 11:03:29AM +0800, Hangbin Liu wrote:
> IPVLAN and IPVTAP are using the same functions and parameters. So we can
> just add a new link_util with id ipvtap. Others are the same.
> 
> Signed-off-by: Hangbin Liu 

Acked-by: Phil Sutter 


Re: [Patch net-next] ipv4: initialize ra_mutex in inet_init_net()

2018-09-19 Thread Kirill Tkhai
On 18.09.2018 23:17, Cong Wang wrote:
> On Mon, Sep 17, 2018 at 12:25 AM Kirill Tkhai  wrote:
>> In inet_init() the order of registration is:
>>
>> ip_mr_init();
>> init_inet_pernet_ops();
>>
>> This means, ipmr_net_ops pernet operations are before af_inet_ops
>> in pernet_list. So, there is a theoretical probability, sometimes
>> in the future, we will have a problem during a fail of net initialization.
>>
>> Say,
>>
>> setup_net():
>> ipmr_net_ops->init() returns 0
>> xxx->init()  returns error
>> and then we do:
>> ipmr_net_ops->exit(),
>>
>> which could touch ra_mutex (theoretically).
> 
> How could ra_mutex be touched in this scenario?
> 
> ra_mutex is only used in ip_ra_control() which is called
> only by {get,set}sockopt(). I don't see anything related
> to netns exit() path here.

Currently, it is not touched. But it's an ordinary practice,
someone closes sockets in pernet ->exit methods. For example,
we close percpu icmp sockets in icmp_sk_exit(), which are
also of RAW type, and there is also called ip_ra_control()
for them. Yes, they differ by their protocol; icmp sockets
are of IPPROTO_ICMP protocol, while ip_ra_control() acts
on IPPROTO_RAW sockets, but it's not good anyway. This does
not look reliable for the future. In case of someone changes
something here, we may do not notice this for the long time,
while some users will meet bugs on their places.

Problems on error paths are not easy to detect in testing,
while users may meet them. We had an issue of the same type with
uninitialized xfrm_policy_lock. It was introduced in 2013,
while the problem was found only in 2017:

introduced by 283bc9f35bbb
fixed  by c28a45cb

(Last week I met it on a RH7 kernel, which still has no fix.
 But this talk is not about distribution kernels, just about
 the way).

I just want to say that if someone builds on top of this code,
it will be more friendly of us not to force that person to think
about such non-obvious details, but instead to implement a clean
architecture right now.

Thanks,
Kirill


Re: [PATCH net-next 1/2] net: linkwatch: add check for netdevice being present to linkwatch_do_dev

2018-09-19 Thread Geert Uytterhoeven
On Tue, Sep 18, 2018 at 9:56 PM Heiner Kallweit  wrote:
> When bringing down the netdevice (incl. detaching it) and calling
> netif_carrier_off directly or indirectly the latter triggers an
> asynchronous linkwatch event.
> This linkwatch event eventually may fail to access chip registers in
> the ndo_get_stats/ndo_get_stats64 callback because the device isn't
> accessible any longer, see call trace in [0].
>
> To prevent this scenario don't check for IFF_UP only, but also make
> sure that the netdevice is present.
>
> [0] https://lists.openwall.net/netdev/2018/03/15/62
>
> Signed-off-by: Heiner Kallweit 

Survived 100 suspend/resume cycles on sh73a0/kzm9g and r8a73a4/ape6evm.

Tested-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [PATCH net-next 2/2] net: phy: call state machine synchronously in phy_stop

2018-09-19 Thread Geert Uytterhoeven
On Tue, Sep 18, 2018 at 9:56 PM Heiner Kallweit  wrote:
> phy_stop() may be called e.g. when suspending, therefore all needed
> actions should be performed synchronously. Therefore add a synchronous
> call to the state machine.
>
> Signed-off-by: Heiner Kallweit 

Survived 100 suspend/resume cycles on sh73a0/kzm9g and r8a73a4/ape6evm.

Tested-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Jiri Benc
On Tue, 18 Sep 2018 15:12:11 +0200, Johannes Berg wrote:
>  static int validate_nla(const struct nlattr *nla, int maxtype,
>   const struct nla_policy *policy,
> - const char **error_msg)
> + struct netlink_ext_ack *extack, bool *extack_set)

Can't the extack_set be included in the struct netlink_ext_ack? One less
parameter to pass around and the NL_SET_* macros could check this on
their own.

This way, it can be also easily turned to a more complex mechanism,
such as the one Marcelo proposed.

Thanks,

 Jiri


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Johannes Berg
On Wed, 2018-09-19 at 11:10 +0200, Jiri Benc wrote:
> On Tue, 18 Sep 2018 15:12:11 +0200, Johannes Berg wrote:
> >  static int validate_nla(const struct nlattr *nla, int maxtype,
> > const struct nla_policy *policy,
> > -   const char **error_msg)
> > +   struct netlink_ext_ack *extack, bool *extack_set)
> 
> Can't the extack_set be included in the struct netlink_ext_ack? One less
> parameter to pass around and the NL_SET_* macros could check this on
> their own.

No, I don't think it can or should.

For one, having the NL_SET_* macros check it on their own will already
not work - as we discussed over in the NLA_REJECT thread, we do need to
be able to override the data, e.g. if somebody does

NL_SET_ERR_MSG(extack, "warning: deprecated command");
err = nla_parse(..., extack);

and nla_parse() sets a new message. Thus, hiding all the logic in there
already will not work, and is also IMHO rather unexpected. Normally,
*later* messages should win, not *earlier* ones - and that doesn't
require any tracking, just setting them unconditionally.

It's just a side effect of how we do the recursive validation here that
we want *earlier* messages to win (but only within this code piece - if
a previous message was set, we want it to be overwritten by nla_parse!).

It might be possible to do this differently, in theory, but all the ways
I've tried to come up with so far made the code vastly more complex.

johannes


Bridge connectivity interruptions while devices join or leave the bridge

2018-09-19 Thread Johannes Wienke
I am sorry for probably misusing this list, but I couldn't find any
other mailing list suitable for asking in-detail Linux networking
questions. As I am not subscribed, please CC me in a potential reply.

I am currently tracking down a connectivity issues of docker containers
on a custom bridge network, which I could reduce to the Linux network
stack without docker being involved.

The situation that I am observing is the following: I have a bridge
device, which is connected to the outer world using forwarding and
masquerading (so the bridge does not contain the outgoing network
interface of the host). This bridge is used to perform network
operations by a long-running process, which is restricted to this bridge
using network namespaces and veth devices (exactly what docker does
internally). What I see is that every time a (virtual) network device is
added to or removed from the bridge, the communication of the
long-running process is interrupted.

I have created two scripts that can be used to replicate the situation.
They are available at:
https://gist.github.com/languitar/9ac8dc5c8db7cf4a89e1546f6e32ca7b

setup.bash sets up the bridge, veth devices, network namespace and the
iptables rules to replicate the network setup and simulates the
long-running process by periodically performing (volatile) UDP DNS
requests in a while loop.

When launching this script, all DNS requests should succeed and you
should see success messages at a regular pace.

To simulate devices joining and leaving the bridge, you can start
interruptor.bash.

As soon as this script is running, you can observe that DNS requests
will be delayed frequently and some of them even fail. In a parallel
pcap you would see that sometimes the UDP packages from the DNS lookup
are not routed to the outside world, but instead end up at the bridge
device without ever leaving the host system.

Can someone explain what is happening here and why adding and removing
devices to a bridge results in the connectivity issues? How to avoid
this behavior? I'd be glad for any hint on that.

Kind regards
Johannes
-- 
Johannes Wienke, Researcher at CoR-Lab / CITEC, Bielefeld University
Address: Inspiration 1, D-33619 Bielefeld, Germany (Room 1.307)
Phone: +49 521 106-67277


Re: [PATCH v2 2/4] dt-bindings: net: qcom: Add binding for shared mdio bus

2018-09-19 Thread Wang, Dongsheng
On 2018/9/18 20:35, Andrew Lunn wrote:
>>> If you want to describe the MDIO controller, then you embed a mdio
>>> subnode into your Ethernet MAC node:
>>>
>>>  emac0: ethernet@feb2 {
>>> mdio {
>>> #address-cells = <1>;
>>> #size-cells = <0>;
>>>
>>> phy0: ethernet-phy@0 {
>>> reg = <0>;
>>> };
>>> };
>>> };
>>>
>>> And then each Ethernet MAC controller refers to their appropriate PHY
>>> device tree node using a phy-handle property to point to either their
>>> own MDIO controller, or another MAC's MDIO controller.
>> Sorry, I do not understand how phy-handle point to MDIO controller,
>> because phy-handle is defined to point to a phy.
> The MAC driver does not care what MDIO controller a PHY is on. All you
> need to do to register the PHY is:

Yes, these are all things that must be done, and emac driver will
connect phy when mac up.
If we had a separate MDIO controller, the MAC would not care about MDIO
bus. But MDIO is integrated within the EMAC, and emac driver maintains
the mdio.

Each EMAC do their mdio register/unregister. But in the shared scenario,
the EMACs that use the shared bus do not need to create an MDIO and
cannot release the Shared bus.

In device tree environment as you and Florian said. Just use phy-handle
get the phy_node.
The EMAC would not care about the phy come from which MDIO bus because
the phy device gets from the device_node match(phy-handle). And if the
phy_dev cannot get through phy_node means, the mdio bus is not ready.

But ACPI environment my understand is this:
First method. EMAC driver gets the shared MDIO bus, and maintain it.
The second way, EMAC match the phy_dev from the name.
These patch series try to use the FIRST way. Now I prefer to use the
second way to do the shared function.

I will rework this patchset and maybe patches will be a delay for a few
days.

Cheers,
Dongsheng
>   phy_node = of_parse_phandle(np, "phy-handle", 0);
>   phy_interface = of_get_phy_mode(np);
>   phydev = of_phy_connect(dev, phy_node,
> &handle_link_change, 0,
> phy_interface);
>
>   Andrew
>



Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Johannes Berg
On Wed, 2018-09-19 at 00:37 -0300, Marcelo Ricardo Leitner wrote:

> Did you consider indicating the message level, and only overwrite the
> message that is already in there if the new message level is higher
> than the current one?

Hmm, no, I guess I didn't - I'm not even sure I understand what you're
saying.

This code in itself generates no "warning" messages; that was just a
construct we discussed in the NLA_REJECT thread, e.g. if you say (like I
just also wrote in my reply to Jiri):

NL_SET_ERR_MSG(extack, "warning: deprecated command");
err = nla_parse(..., extack);
if (err)
return err;
/* do something */
return 0;

Here you could consider the message there a warning that's transported
out even if we return 0, but if we return with a failure from
nla_parse() (or nla_validate instead if you wish), then that failure
message "wins".

> This way the first to set an Error message will have it, and Warning
> messages would be overwritten by such if needed. But it also would
> cause the first warning to be held, and not the last one, as it does
> today. We want the first error, but the last warning otherwise.
> 
> It would not be possible to overwrite if new_msglvl >= cur_msglvl
> because then it would trigger the initial issue again, so some extra
> logic would be needed to solve this.

That sounds way more complex than what I'm doing now?

Note, like I said above, this isn't *generic* in any way. This code here
will only ever set error messages that should "win".

I suppose we could - technically - make that generic, in that we could
have both

  NLA_SET_WARN_MSG(extack, "...");
  NLA_SET_ERR_MSG(extack, "...");

and keep track of warning vs. error; however, just like my first version
of the NLA_REJECT patch, that would break existing code.

I also don't think that we actually *need* this complexity in general.
It should almost always be possible (and realistically, pretty easy) to
structure your code in a way that warning messages only go out if no
error message overwrites them. The only reason we were ever even
discussing this was that in NLA_REJECT I missed the fact that somebody
could've set a message before and thus would keep it rather than
overwrite it, which was a change in behaviour.

Now, with this patch, all I'm doing is changing the internal behaviour
of nla_parse/nla_validate - externally, it still overwrites any existing
message if an error occurs, but internally it keeps the inner-most
error.

Why is this? Consider this:

static const struct nla_policy inner_policy[] = {
[INNER_FLAG] = { .type = NLA_REJECT,
 .validation_data = "must not set this flag" }
};

static const struct nla_policy outer_policy[] = {
[OUTER_NESTING] = { .type = NLA_NESTED, .len = INNER_MAX,
.validation_data = inner_policy,
};

Now if you invoke nla_parse/nla_validate with a message like this

[ OUTER_NESTING => [ INNER_FLAG, ... ], ... ]

you'd get "must not set this flag" with the error offset pointing to
that; if I didn't do this construction here with inner messages winning,
you'd get "Attribute failed policy validation" with the error offset
pointing to the "OUTER_NESTING" attribute, that's pretty useless.

From an external API POV though, nla_validate/nla_parse will continue to
unconditionally overwrite any existing "warning" messages with errors,
if such occurred. They just won't overwrite their own messages when
returning from a nested policy validation.

johannes


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Jiri Benc
On Wed, 19 Sep 2018 11:15:25 +0200, Johannes Berg wrote:
> For one, having the NL_SET_* macros check it on their own will already
> not work - as we discussed over in the NLA_REJECT thread, we do need to
> be able to override the data, e.g. if somebody does
> 
> NL_SET_ERR_MSG(extack, "warning: deprecated command");
> err = nla_parse(..., extack);
> 
> and nla_parse() sets a new message. Thus, hiding all the logic in there
> already will not work, and is also IMHO rather unexpected. Normally,
> *later* messages should win, not *earlier* ones - and that doesn't
> require any tracking, just setting them unconditionally.
> 
> It's just a side effect of how we do the recursive validation here that
> we want *earlier* messages to win (but only within this code piece - if
> a previous message was set, we want it to be overwritten by nla_parse!).

Fair enough.

> It might be possible to do this differently, in theory, but all the ways
> I've tried to come up with so far made the code vastly more complex.

Wouldn't still make sense to store the flag in the struct
netlink_ext_ack, though? The way the parameters are passed around in
this patch looks ugly. And adding more users of the flag later (there
may be other cases when we want the earlier messages to be preserved)
would mean adding parameters all around, while the flag in the struct
would be readily available.

I don't have a strong opinion on this, just the patch seems to be
inelegant.

 Jiri


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Jiri Benc
On Wed, 19 Sep 2018 11:25:17 +0200, Johannes Berg wrote:
> Now, with this patch, all I'm doing is changing the internal behaviour
> of nla_parse/nla_validate - externally, it still overwrites any existing
> message if an error occurs, but internally it keeps the inner-most
> error.

Ah, okay, that answers my question about putting the flag into the
ext_ack struct, too.

It may still be worthwhile to have a mechanism for prioritizing certain
extack messages over another ones but it's clearly out of scope of this
patchset.

The patchset looks good to me. Just include the description you just
wrote in the commit message :-)

Thanks!

 Jiri


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Johannes Berg
On Wed, 2018-09-19 at 11:28 +0200, Jiri Benc wrote:

> > It might be possible to do this differently, in theory, but all the ways
> > I've tried to come up with so far made the code vastly more complex.
> 
> Wouldn't still make sense to store the flag in the struct
> netlink_ext_ack, though?

Does it make sense to store a flag there that only has a single,
localized, user? I'd say no.

> The way the parameters are passed around in
> this patch looks ugly. 

Yeah, it's not the best in some ways.

I considered having a cleared new extack on the stack in the outer-most
call (nla_parse/nla_validate), and then we can "set if not already set",
and at the end unconditionally copy/set to the real one... But then I
either need to duplicate that code in both, or pass another argument to
nla_validate_parse() anyway to indicate whether to do this or not (since
the inner calls to it shouldn't, since that would defeat the purpose),
or add another level of function call indirection?

I'm not really sure any of these are better ... and more complex, in
some ways, since we have to copy all the data around.

> And adding more users of the flag later (there
> may be other cases when we want the earlier messages to be preserved)
> would mean adding parameters all around, while the flag in the struct
> would be readily available.

I can't really think of any other users of such a thing?

johannes



Re: Bridge connectivity interruptions while devices join or leave the bridge

2018-09-19 Thread Ido Schimmel
On Wed, Sep 19, 2018 at 11:10:22AM +0200, Johannes Wienke wrote:
> Can someone explain what is happening here and why adding and removing
> devices to a bridge results in the connectivity issues? How to avoid
> this behavior? I'd be glad for any hint on that.

The MAC address of the bridge ('brtest' in your example) is inherited
from the bridge port with the "smallest" MAC address. Thus, when you
generate veth devices with random MACs and enslave them to the bridge,
you sometimes change the bridge's MAC address as well. And since the
bridge is the default gateway sometimes packets are sent to the wrong
MAC address.

You can avoid randomly changing the bridge's MAC by setting it
explicitly:
# ip link set dev brtest address 


[PATCH] ixgbevf: off by one in ixgbevf_ipsec_tx()

2018-09-19 Thread Dan Carpenter
The ipsec->tx_tbl[] array has IXGBE_IPSEC_MAX_SA_COUNT elements so the >
should be a >=.

Fixes: 0062e7cc955e ("ixgbevf: add VF IPsec offload code")
Signed-off-by: Dan Carpenter 

diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c 
b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
index 997cea675a37..4fcbeffce52b 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
@@ -470,7 +470,7 @@ int ixgbevf_ipsec_tx(struct ixgbevf_ring *tx_ring,
}
 
sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_TX_INDEX;
-   if (unlikely(sa_idx > IXGBE_IPSEC_MAX_SA_COUNT)) {
+   if (unlikely(sa_idx >= IXGBE_IPSEC_MAX_SA_COUNT)) {
netdev_err(tx_ring->netdev, "%s: bad sa_idx=%d handle=%lu\n",
   __func__, sa_idx, xs->xso.offload_handle);
return 0;


Re: Bridge connectivity interruptions while devices join or leave the bridge

2018-09-19 Thread Johannes Wienke
Dear Ido,

On 9/19/18 12:07 PM, Ido Schimmel wrote:
> On Wed, Sep 19, 2018 at 11:10:22AM +0200, Johannes Wienke wrote:
>> Can someone explain what is happening here and why adding and removing
>> devices to a bridge results in the connectivity issues? How to avoid
>> this behavior? I'd be glad for any hint on that.
> 
> The MAC address of the bridge ('brtest' in your example) is inherited
> from the bridge port with the "smallest" MAC address. Thus, when you
> generate veth devices with random MACs and enslave them to the bridge,
> you sometimes change the bridge's MAC address as well. And since the
> bridge is the default gateway sometimes packets are sent to the wrong
> MAC address.

Thank you very much! This was the important hint and solves the issues.

This behavior of inheriting the mac address is really unexpected to us.
Is it documented somewhere?

Kind regards
Johannes
-- 
Johannes Wienke, Researcher at CoR-Lab / CITEC, Bielefeld University
Address: Inspiration 1, D-33619 Bielefeld, Germany (Room 1.307)
Phone: +49 521 106-67277



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v2 2/4] dt-bindings: net: qcom: Add binding for shared mdio bus

2018-09-19 Thread Andrew Lunn
On Wed, Sep 19, 2018 at 09:19:19AM +, Wang, Dongsheng wrote:
> On 2018/9/18 20:35, Andrew Lunn wrote:
> >>> If you want to describe the MDIO controller, then you embed a mdio
> >>> subnode into your Ethernet MAC node:
> >>>
> >>>  emac0: ethernet@feb2 {
> >>>   mdio {
> >>>   #address-cells = <1>;
> >>>   #size-cells = <0>;
> >>>
> >>>   phy0: ethernet-phy@0 {
> >>>   reg = <0>;
> >>>   };
> >>>   };
> >>> };
> >>>
> >>> And then each Ethernet MAC controller refers to their appropriate PHY
> >>> device tree node using a phy-handle property to point to either their
> >>> own MDIO controller, or another MAC's MDIO controller.
> >> Sorry, I do not understand how phy-handle point to MDIO controller,
> >> because phy-handle is defined to point to a phy.
> > The MAC driver does not care what MDIO controller a PHY is on. All you
> > need to do to register the PHY is:
> 
> Yes, these are all things that must be done, and emac driver will
> connect phy when mac up.
> If we had a separate MDIO controller, the MAC would not care about MDIO
> bus. But MDIO is integrated within the EMAC, and emac driver maintains
> the mdio.
> 
> Each EMAC do their mdio register/unregister. But in the shared scenario,
> the EMACs that use the shared bus do not need to create an MDIO and
> cannot release the Shared bus.

Hi Dongsheng

There is nothing new here. Many Ethernet drivers export an MDIO bus
which is then used by some other device, often an Ethernet
switch. Ordering should not be a problem, you just need to handle
EPROBE_DEFER, which will happen if the MDIO bus has not yet been
probed when you try to lookup the phy-handle. And once the phy has
been connected, the MDIO bus will be locked, preventing it from being
removed.

> But ACPI environment my understand is this:

ACPI is completely separate and should not affect the DT binding.
I've not yet looked at the ACPI changes you added.

> I will rework this patchset and maybe patches will be a delay for a few
> days.

Thanks

Andrew


[PATCH net-next] ipv6: Allow the l3mdev to be a loopback

2018-09-19 Thread Mike Manning
There is no way currently for an IPv6 client connect using a loopback
address in a VRF, whereas for IPv4 the loopback address can be added:

$ sudo ip addr add dev vrfred 127.0.0.1/8
$ sudo ip -6 addr add ::1/128 dev vrfred
RTNETLINK answers: Cannot assign requested address

So allow ::1 to be configured on an L3 master device. In order for
this to be usable ip_route_output_flags needs to not consider ::1 to
be a link scope address (since oif == l3mdev and so it would be
dropped), and ipv6_rcv needs to consider the l3mdev to be a loopback
device so that it doesn't drop the packets.

Signed-off-by: Robert Shearman 
---
 net/ipv6/addrconf.c  | 1 +
 net/ipv6/ip6_input.c | 3 ++-
 net/ipv6/route.c | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d4733160e6b7..bfe3ec7ecb14 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -997,6 +997,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config 
*cfg,
if (addr_type == IPV6_ADDR_ANY ||
addr_type & IPV6_ADDR_MULTICAST ||
(!(idev->dev->flags & IFF_LOOPBACK) &&
+!netif_is_l3_master(idev->dev) &&
 addr_type & IPV6_ADDR_LOOPBACK))
return ERR_PTR(-EADDRNOTAVAIL);
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6242682be876..96577e742afd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, 
struct net_device *dev,
 */
if ((ipv6_addr_loopback(&hdr->saddr) ||
 ipv6_addr_loopback(&hdr->daddr)) &&
-!(dev->flags & IFF_LOOPBACK))
+   !(dev->flags & IFF_LOOPBACK) &&
+   !netif_is_l3_master(dev))
goto err;
 
/* RFC4291 Errata ID: 3480
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0fa62acc923c..f36ee8a3314f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2098,7 +2098,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, 
const struct sock *sk,
 {
bool any_src;
 
-   if (rt6_need_strict(&fl6->daddr)) {
+   if (ipv6_addr_type(&fl6->daddr) &
+   (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
 
dst = l3mdev_link_scope_lookup(net, fl6);
-- 
2.11.0



[PATCH net] ip6_tunnel: be careful when accessing the inner header

2018-09-19 Thread Paolo Abeni
the ip6 tunnel xmit ndo assumes that the processed skb always
contains an ip[v6] header, but syzbot has found a way to send
frames that fall short of this assumption, leading to the following splat:

BUG: KMSAN: uninit-value in ip6ip6_tnl_xmit net/ipv6/ip6_tunnel.c:1307
[inline]
BUG: KMSAN: uninit-value in ip6_tnl_start_xmit+0x7d2/0x1ef0
net/ipv6/ip6_tunnel.c:1390
CPU: 0 PID: 4504 Comm: syz-executor558 Not tainted 4.16.0+ #87
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x185/0x1d0 lib/dump_stack.c:53
  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
  __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
  ip6ip6_tnl_xmit net/ipv6/ip6_tunnel.c:1307 [inline]
  ip6_tnl_start_xmit+0x7d2/0x1ef0 net/ipv6/ip6_tunnel.c:1390
  __netdev_start_xmit include/linux/netdevice.h:4066 [inline]
  netdev_start_xmit include/linux/netdevice.h:4075 [inline]
  xmit_one net/core/dev.c:3026 [inline]
  dev_hard_start_xmit+0x5f1/0xc70 net/core/dev.c:3042
  __dev_queue_xmit+0x27ee/0x3520 net/core/dev.c:3557
  dev_queue_xmit+0x4b/0x60 net/core/dev.c:3590
  packet_snd net/packet/af_packet.c:2944 [inline]
  packet_sendmsg+0x7c70/0x8a30 net/packet/af_packet.c:2969
  sock_sendmsg_nosec net/socket.c:630 [inline]
  sock_sendmsg net/socket.c:640 [inline]
  ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046
  __sys_sendmmsg+0x42d/0x800 net/socket.c:2136
  SYSC_sendmmsg+0xc4/0x110 net/socket.c:2167
  SyS_sendmmsg+0x63/0x90 net/socket.c:2162
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x441819
RSP: 002b:7ffe58ee8268 EFLAGS: 0213 ORIG_RAX: 0133
RAX: ffda RBX: 0003 RCX: 00441819
RDX: 0002 RSI: 2100 RDI: 0003
RBP: 006cd018 R08:  R09: 
R10:  R11: 0213 R12: 00402510
R13: 004025a0 R14:  R15: 

Uninit was created at:
  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
  kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
  kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
  kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
  slab_post_alloc_hook mm/slab.h:445 [inline]
  slab_alloc_node mm/slub.c:2737 [inline]
  __kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
  __kmalloc_reserve net/core/skbuff.c:138 [inline]
  __alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
  alloc_skb include/linux/skbuff.h:984 [inline]
  alloc_skb_with_frags+0x1d4/0xb20 net/core/skbuff.c:5234
  sock_alloc_send_pskb+0xb56/0x1190 net/core/sock.c:2085
  packet_alloc_skb net/packet/af_packet.c:2803 [inline]
  packet_snd net/packet/af_packet.c:2894 [inline]
  packet_sendmsg+0x6454/0x8a30 net/packet/af_packet.c:2969
  sock_sendmsg_nosec net/socket.c:630 [inline]
  sock_sendmsg net/socket.c:640 [inline]
  ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046
  __sys_sendmmsg+0x42d/0x800 net/socket.c:2136
  SYSC_sendmmsg+0xc4/0x110 net/socket.c:2167
  SyS_sendmmsg+0x63/0x90 net/socket.c:2162
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2

This change addresses the issue adding the needed check before
accessing the inner header.

The ipv4 side of the issue is apparently there since the ipv4 over ipv6
initial support, and the ipv6 side predates git history.

Fixes: c4d3efafcc93 ("[IPV6] IP6TUNNEL: Add support to IPv4 over IPv6 tunnel.")
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: syzbot+3fde91d4d394747d6...@syzkaller.appspotmail.com
Tested-by: Alexander Potapenko 
Signed-off-by: Paolo Abeni 
---
 net/ipv6/ip6_tunnel.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 419960b0ba16..a0b6932c3afd 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1234,7 +1234,7 @@ static inline int
 ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 {
struct ip6_tnl *t = netdev_priv(dev);
-   const struct iphdr  *iph = ip_hdr(skb);
+   const struct iphdr  *iph;
int encap_limit = -1;
struct flowi6 fl6;
__u8 dsfield;
@@ -1242,6 +1242,11 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev)
u8 tproto;
int err;
 
+   /* ensure we can access the full inner ip header */
+   if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+   return -1;
+
+   iph = ip_hdr(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 
tproto = READ_ONCE(t->parms.proto);
@@ -1306,7 +1311,7 @@ static inline int
 ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 {
struct ip6_tnl *t = netdev_priv(dev);
-   struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+   struct ipv6hdr *ipv6h;
int encap_limit = -1;
__u16 offset;
struct flowi6 fl6;
@@ -1315,

Is it possible to get Rx timestamps in skb->tstamp?

2018-09-19 Thread David Howells
Hi,

Is it possible to tell a UDP socket that you'd like it to put reception
timestamps in skb->tstamp?

For some reason, I seem to remember that the kernel used to put something in
there - and AF_RXRPC makes use of it.

David


Re: Is it possible to get Rx timestamps in skb->tstamp?

2018-09-19 Thread Toke Høiland-Jørgensen
David Howells  writes:

> Hi,
>
> Is it possible to tell a UDP socket that you'd like it to put
> reception timestamps in skb->tstamp?

I think you probably want SO_TIMESTAMP*?

See 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/timestamping.txt

-Toke


[PATCH net-next] ipv6: Allow the l3mdev to be a loopback

2018-09-19 Thread Mike Manning
From: Robert Shearman 

There is no way currently for an IPv6 client connect using a loopback
address in a VRF, whereas for IPv4 the loopback address can be added:

$ sudo ip addr add dev vrfred 127.0.0.1/8
$ sudo ip -6 addr add ::1/128 dev vrfred
RTNETLINK answers: Cannot assign requested address

So allow ::1 to be configured on an L3 master device. In order for
this to be usable ip_route_output_flags needs to not consider ::1 to
be a link scope address (since oif == l3mdev and so it would be
dropped), and ipv6_rcv needs to consider the l3mdev to be a loopback
device so that it doesn't drop the packets.

Signed-off-by: Robert Shearman 
Signed-off-by: Mike Manning 
---
 net/ipv6/addrconf.c  | 1 +
 net/ipv6/ip6_input.c | 3 ++-
 net/ipv6/route.c | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d4733160e6b7..bfe3ec7ecb14 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -997,6 +997,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config 
*cfg,
if (addr_type == IPV6_ADDR_ANY ||
addr_type & IPV6_ADDR_MULTICAST ||
(!(idev->dev->flags & IFF_LOOPBACK) &&
+!netif_is_l3_master(idev->dev) &&
 addr_type & IPV6_ADDR_LOOPBACK))
return ERR_PTR(-EADDRNOTAVAIL);
 
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6242682be876..96577e742afd 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -178,7 +178,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, 
struct net_device *dev,
 */
if ((ipv6_addr_loopback(&hdr->saddr) ||
 ipv6_addr_loopback(&hdr->daddr)) &&
-!(dev->flags & IFF_LOOPBACK))
+   !(dev->flags & IFF_LOOPBACK) &&
+   !netif_is_l3_master(dev))
goto err;
 
/* RFC4291 Errata ID: 3480
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 0fa62acc923c..f36ee8a3314f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2098,7 +2098,8 @@ struct dst_entry *ip6_route_output_flags(struct net *net, 
const struct sock *sk,
 {
bool any_src;
 
-   if (rt6_need_strict(&fl6->daddr)) {
+   if (ipv6_addr_type(&fl6->daddr) &
+   (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
 
dst = l3mdev_link_scope_lookup(net, fl6);
-- 
2.11.0



Re: Is it possible to get Rx timestamps in skb->tstamp?

2018-09-19 Thread David Howells
Toke Høiland-Jørgensen  wrote:

> > Is it possible to tell a UDP socket that you'd like it to put
> > reception timestamps in skb->tstamp?
> 
> I think you probably want SO_TIMESTAMP*?
> 
> See 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/networking/timestamping.txt

That seems to work.  I thought that only affected recvmsg() putting the
timestamp into the ancillary data buffer, but apparently not.

Thanks,
David


Re: [PATCH v2 2/4] dt-bindings: net: qcom: Add binding for shared mdio bus

2018-09-19 Thread Timur Tabi

On 9/19/18 7:25 AM, Andrew Lunn wrote:

ACPI is completely separate and should not affect the DT binding.
I've not yet looked at the ACPI changes you added.


Just FYI, there is no device tree platform on which the upstream EMAC 
driver works.  All of the DT code in the driver is theoretical.  It 
worked once on a prototype platform, when I originally wrote the code, 
but since then DT support is mostly a guess.


The focus of any patches for the EMAC should be ACPI, not DT.  If 
anything, ACPI support should come first.  No one should be writing or 
reviewing DT code before ACPI code.


The upstream EMAC driver is only known to work on the QDF2400, which is 
an ACPI-only chip.  I feel like I've been repeating this too often.


Re: [PATCH ethtool] ethtool: support combinations of FEC modes

2018-09-19 Thread Michal Kubecek
On Wed, Sep 05, 2018 at 06:54:57PM +0100, Edward Cree wrote:
> Of the three drivers that currently support FEC configuration, two (sfc
>  and cxgb4[vf]) accept configurations with more than one bit set in the
>  feccmd.fec bitmask.  (The precise semantics of these combinations vary.)
> Thus, this patch adds the ability to specify such combinations through a
>  comma-separated list of FEC modes in the 'encoding' argument on the
>  command line.
> 
> Also adds --set-fec tests to test-cmdline.c, and corrects the man page
>  (the encoding argument is not optional) while updating it.
> 
> Signed-off-by: Edward Cree 
...
> +/* Takes a comma-separated list of FEC modes, returns the bitwise OR of their
> + * corresponding ETHTOOL_FEC_* constants.
> + * Accepts repetitions (e.g. 'auto,auto') and trailing comma (e.g. 'off,').
> + */
> +static int parse_fecmode(const char *str)
> +{
>   int fecmode = 0;
> + char buf[6];
>  
>   if (!str)
> - return fecmode;
> -
> - if (!strcasecmp(str, "auto"))
> - fecmode |= ETHTOOL_FEC_AUTO;
> - else if (!strcasecmp(str, "off"))
> - fecmode |= ETHTOOL_FEC_OFF;
> - else if (!strcasecmp(str, "rs"))
> - fecmode |= ETHTOOL_FEC_RS;
> - else if (!strcasecmp(str, "baser"))
> - fecmode |= ETHTOOL_FEC_BASER;
> + return 0;
> + while (*str) {
> + size_t next;
> + int mode;
>  
> + next = strcspn(str, ",");
> + if (next >= 6) /* Bad mode, longest name is 5 chars */
> + return 0;
> + /* Copy into temp buffer and nul-terminate */
> + memcpy(buf, str, next);
> + buf[next] = 0;
> + mode = fecmode_str_to_type(buf);
> + if (!mode) /* Bad mode encountered */
> + return 0;
> + fecmode |= mode;
> + str += next;
> + /* Skip over ',' (but not nul) */
> + if (*str)
> + str++;
> + }
>   return fecmode;
>  }
>  

I'm sorry I didn't notice this before the patch was accepted but as it's
not in a release yet, maybe it's still not too late.

Could I suggest to make the syntax consistent with other options? I mean 
rather than a comma separated list to use either

  ethtool --set-fec  encoding enc1 enc2 ...

(as we have for --reset) or

  ethtool --set-fec  encoding enc1 on|off enc2 on|off ...

(as we have e.g. for msglvl, -K or --set-eee)?

Second option seems more appropriate to me but it would require special
handling of the case when there is only one argument after "encoding" to
maintain backward compatibility with already released versions.

One loosely related question: how sure can we be that we are never going
to need more than 32 bits for FEC encodings? Is it something completely
hypothetical or is it something that could happen in the future?

Michal Kubecek


Re: [RFC bpf-next 2/4] bpf: return EOPNOTSUPP when map lookup isn't supported

2018-09-19 Thread Alexei Starovoitov
On Wed, Sep 19, 2018 at 04:51:41PM +0900, Prashant Bhole wrote:
> Return ERR_PTR(-EOPNOTSUPP) from map_lookup_elem() methods of below
> map types:
> - BPF_MAP_TYPE_PROG_ARRAY
> - BPF_MAP_TYPE_STACK_TRACE
> - BPF_MAP_TYPE_XSKMAP
> - BPF_MAP_TYPE_SOCKMAP/BPF_MAP_TYPE_SOCKHASH
> 
> Signed-off-by: Prashant Bhole 
> ---
>  kernel/bpf/arraymap.c | 2 +-
>  kernel/bpf/sockmap.c  | 2 +-
>  kernel/bpf/stackmap.c | 2 +-
>  kernel/bpf/xskmap.c   | 2 +-
>  4 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index dded84cbe814..24583da9ffd1 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -449,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map)
>  
>  static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
>  {
> - return NULL;
> + return ERR_PTR(-EOPNOTSUPP);
>  }

conceptually the set looks good to me.
Please add a test to test_verifier.c to make sure
that these lookup helpers cannot be called from BPF program.
Otherwise this diff may cause crashes.



Re: [PATCH v2 2/4] dt-bindings: net: qcom: Add binding for shared mdio bus

2018-09-19 Thread Andrew Lunn
> The focus of any patches for the EMAC should be ACPI, not DT.  If anything,
> ACPI support should come first.  No one should be writing or reviewing DT
> code before ACPI code.

I suspect that is not going to be easy. Last time I looked, the ACPI
standard had nothing about MDIO busses or PHYs. Marcin Wojtas did some
work in this area a while back for the mvpp2, but if I remember
correctly, he worked around this by simply not having a PHY when using
ACPI, and making use of a MAC interrupt which indicated when there was
link.

Whoever implements this first needs to be an ACPI expert and probably
needs to write it up and submit it as an amendment to the ACPI
standard.

  Andrew



[PATCH net-next] net/tls: Add support for async encryption of records for performance

2018-09-19 Thread Vakul Garg
In current implementation, tls records are encrypted & transmitted
serially. Till the time the previously submitted user data is encrypted,
the implementation waits and on finish starts transmitting the record.
This approach of encrypt-one record at a time is inefficient when
asynchronous crypto accelerators are used. For each record, there are
overheads of interrupts, driver softIRQ scheduling etc. Also the crypto
accelerator sits idle most of time while an encrypted record's pages are
handed over to tcp stack for transmission.

This patch enables encryption of multiple records in parallel when an
async capable crypto accelerator is present in system. This is achieved
by allowing the user space application to send more data using sendmsg()
even while previously issued data is being processed by crypto
accelerator. This requires returning the control back to user space
application after submitting encryption request to accelerator. This
also means that zero-copy mode of encryption cannot be used with async
accelerator as we must be done with user space application buffer before
returning from sendmsg().

There can be multiple records in flight to/from the accelerator. Each of
the record is represented by 'struct tls_rec'. This is used to store the
memory pages for the record.

After the records are encrypted, they are added in a linked list called
tx_ready_list which contains encrypted tls records sorted as per tls
sequence number. The records from tx_ready_list are transmitted using a
newly introduced function called tls_tx_records(). The tx_ready_list is
polled for any record ready to be transmitted in sendmsg(), sendpage()
after initiating encryption of new tls records. This achieves parallel
encryption and transmission of records when async accelerator is
present.

There could be situation when crypto accelerator completes encryption
later than polling of tx_ready_list by sendmsg()/sendpage(). Therefore
we need a deferred work context to be able to transmit records from
tx_ready_list. The deferred work context gets scheduled if applications
are not sending much data through the socket. If the applications issue
sendmsg()/sendpage() in quick succession, then the scheduling of
tx_work_handler gets cancelled as the tx_ready_list would be polled from
application's context itself. This saves scheduling overhead of deferred
work.

The patch also brings some side benefit. We are able to get rid of the
concept of CLOSED record. This is because the records once closed are
either encrypted and then placed into tx_ready_list or if encryption
fails, the socket error is set. This simplifies the kernel tls
sendpath. However since tls_device.c is still using macros, accessory
functions for CLOSED records have been retained.

Signed-off-by: Vakul Garg 
---
 include/net/tls.h  |  70 +--
 net/tls/tls_main.c |  54 ++---
 net/tls/tls_sw.c   | 569 -
 3 files changed, 515 insertions(+), 178 deletions(-)

diff --git a/include/net/tls.h b/include/net/tls.h
index 9f3c4ea9ad6f..84756667fc2a 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -41,7 +41,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include 
 
 
@@ -93,24 +93,47 @@ enum {
TLS_NUM_CONFIG,
 };
 
-struct tls_sw_context_tx {
-   struct crypto_aead *aead_send;
-   struct crypto_wait async_wait;
-
-   char aad_space[TLS_AAD_SPACE_SIZE];
-
-   unsigned int sg_plaintext_size;
-   int sg_plaintext_num_elem;
+/* TLS records are maintained in 'struct tls_rec'. It stores the memory pages
+ * allocated or mapped for each TLS record. After encryption, the records are
+ * stores in a linked list.
+ */
+struct tls_rec {
+   struct list_head list;
+   int tx_flags;
struct scatterlist sg_plaintext_data[MAX_SKB_FRAGS];
-
-   unsigned int sg_encrypted_size;
-   int sg_encrypted_num_elem;
struct scatterlist sg_encrypted_data[MAX_SKB_FRAGS];
 
/* AAD | sg_plaintext_data | sg_tag */
struct scatterlist sg_aead_in[2];
/* AAD | sg_encrypted_data (data contain overhead for hdr&iv&tag) */
struct scatterlist sg_aead_out[2];
+
+   unsigned int sg_plaintext_size;
+   unsigned int sg_encrypted_size;
+   int sg_plaintext_num_elem;
+   int sg_encrypted_num_elem;
+
+   char aad_space[TLS_AAD_SPACE_SIZE];
+   struct aead_request aead_req;
+   u8 aead_req_ctx[];
+};
+
+struct tx_work {
+   struct delayed_work work;
+   struct sock *sk;
+};
+
+struct tls_sw_context_tx {
+   struct crypto_aead *aead_send;
+   struct crypto_wait async_wait;
+   struct tx_work tx_work;
+   struct tls_rec *open_rec;
+   struct list_head tx_ready_list;
+   atomic_t encrypt_pending;
+   int async_notify;
+
+#define BIT_TX_SCHEDULED   0
+   unsigned long tx_bitmask;
 };
 
 struct tls_sw_context_rx {
@@ -197,6 +220,8 @@ struct tls_context {
 
struct scatterlist *partially_sent_record;

Re: [PATCH 1/1] macsec: reflect the master interface state

2018-09-19 Thread Sabrina Dubroca
Hello,

2018-09-18, 20:16:12 -0400, Radu Rendec wrote:
> This patch makes macsec interfaces reflect the state of the underlying
> interface: if the master interface changes state to up/down, the macsec
> interface changes its state accordingly.

I got a separate report of the same issue and I've been looking in
that area too.

> This closes a loophole in the macsec interface state logic: the macsec
> interface cannot be brought up if the master interface is down (the
> operation is rejected with ENETDOWN); however, if the macsec interface
> is brought up while the master interface is up and then the master
> interface goes down, the macsec interface stays up.

Yes, that's a bit unfortunate. I was wondering if we should just allow
setting the device up, and let link state handle the rest.

> Reflecting the master interface state can also be useful if the macsec
> interface is used as a bridge port: if the master (physical) interface
> goes down, the state propagates through the macsec interface to the
> bridge module, which can react to the state change (for example if it
> runs STP).
> 
> The patch does nothing original. The same logic is implemented for vlan
> interfaces in vlan_device_event() in net/8021q/vlan.c. In fact, the code
> was copied and adapted from there.

It's missing small parts of link state handling that I have been
testing, mainly when creating a new device.

> Signed-off-by: Radu Rendec 
> ---
>  drivers/net/macsec.c | 57 +++-
>  1 file changed, 46 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
> index 7de88b33d5b9..cb93a1290f85 100644
> --- a/drivers/net/macsec.c
> +++ b/drivers/net/macsec.c
> @@ -3486,20 +3486,21 @@ static int macsec_notify(struct notifier_block *this, 
> unsigned long event,
>void *ptr)
>  {
>   struct net_device *real_dev = netdev_notifier_info_to_dev(ptr);
> - LIST_HEAD(head);
> + struct macsec_dev *m;
> + struct macsec_rxh_data *rxd;
>  
>   if (!is_macsec_master(real_dev))
>   return NOTIFY_DONE;
>  
> + rxd = macsec_data_rtnl(real_dev);
> +
>   switch (event) {
>   case NETDEV_UNREGISTER: {
> - struct macsec_dev *m, *n;
> - struct macsec_rxh_data *rxd;
> + struct macsec_dev *n;
> + LIST_HEAD(head);
>  
> - rxd = macsec_data_rtnl(real_dev);
> - list_for_each_entry_safe(m, n, &rxd->secys, secys) {
> + list_for_each_entry_safe(m, n, &rxd->secys, secys)
>   macsec_common_dellink(m->secy.netdev, &head);
> - }

Please don't mix coding style changes with bug fixes.

[...]
> + case NETDEV_DOWN: {
> + struct net_device *dev, *tmp;
> + LIST_HEAD(close_list);
> +
> + list_for_each_entry(m, &rxd->secys, secys) {
> + dev = m->secy.netdev;
> +
> + if (dev->flags & IFF_UP)
> + list_add(&dev->close_list, &close_list);
> + }
> +
> + dev_close_many(&close_list, false);
> +
> + list_for_each_entry_safe(dev, tmp, &close_list, close_list) {
> + netif_stacked_transfer_operstate(real_dev, dev);
> + list_del_init(&dev->close_list);
> + }
> + list_del(&close_list);
> + break;

My version of the patch only does netif_stacked_transfer_operstate(),
and skips setting the device administratively down (ie, same as
NETDEV_CHANGE).

>   }
> + case NETDEV_UP:
> + list_for_each_entry(m, &rxd->secys, secys) {
> + struct net_device *dev = m->secy.netdev;
> + int flags = dev_get_flags(dev);
> +
> + if (!(flags & IFF_UP)) {
> + dev_change_flags(dev, flags | IFF_UP);
> + netif_stacked_transfer_operstate(real_dev, dev);
> + }
> + }
> + break;

I don't like that this completely ignores whether the device was brought
down independently of the lower link. Maybe the administrator actually
wants the macsec device down, regardless of state changes on the lower
device.

I know it's consistent with what vlan is doing, but I'm not convinced
macsec should adopt this behavior.

>   }
>  
>   return NOTIFY_OK;
> -- 
> 2.17.1
> 

-- 
Sabrina


Re: [RFC bpf-next 3/4] tools/bpf: bpftool, split the function do_dump()

2018-09-19 Thread Jakub Kicinski
On Wed, 19 Sep 2018 16:51:42 +0900, Prashant Bhole wrote:
> +static int dump_map_elem(int fd, void *key, void *value,
> +  struct bpf_map_info *map_info, struct btf *btf,
> +  json_writer_t *btf_wtr)
> +{
> + int num_elems = 0;
> +
> + if (!bpf_map_lookup_elem(fd, key, value)) {
> + if (json_output) {
> + print_entry_json(map_info, key, value, btf);
> + } else {
> + if (btf) {
> + struct btf_dumper d = {
> + .btf = btf,
> + .jw = btf_wtr,
> + .is_plain_text = true,
> + };
> +
> + do_dump_btf(&d, map_info, key, value);
> + } else {
> + print_entry_plain(map_info, key, value);
> + }
> + num_elems++;
> + }
> + goto out;
> + }
> +
> + /* lookup error handling */
> + if (map_is_map_of_maps(map_info->type) ||
> + map_is_map_of_progs(map_info->type))
> + goto out;
> +

nit: why not just return?  the goto seems to only do a return anyway,
 is this suggested by some coding style?  Is it to help the
 compiler?  I see people do this from time to time..

[...]

> +out:
> + return num_elems;


Re: [PATCH] ixgbevf: off by one in ixgbevf_ipsec_tx()

2018-09-19 Thread Shannon Nelson

On 9/19/2018 3:35 AM, Dan Carpenter wrote:

The ipsec->tx_tbl[] array has IXGBE_IPSEC_MAX_SA_COUNT elements so the >
should be a >=.

Fixes: 0062e7cc955e ("ixgbevf: add VF IPsec offload code")
Signed-off-by: Dan Carpenter 


Signed-off-by: Shannon Nelson 



diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c 
b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
index 997cea675a37..4fcbeffce52b 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c
@@ -470,7 +470,7 @@ int ixgbevf_ipsec_tx(struct ixgbevf_ring *tx_ring,
}
  
  	sa_idx = xs->xso.offload_handle - IXGBE_IPSEC_BASE_TX_INDEX;

-   if (unlikely(sa_idx > IXGBE_IPSEC_MAX_SA_COUNT)) {
+   if (unlikely(sa_idx >= IXGBE_IPSEC_MAX_SA_COUNT)) {
netdev_err(tx_ring->netdev, "%s: bad sa_idx=%d handle=%lu\n",
   __func__, sa_idx, xs->xso.offload_handle);
return 0;



Re: [RFC bpf-next 4/4] tools/bpf: handle EOPNOTSUPP when map lookup is failed

2018-09-19 Thread Jakub Kicinski
On Wed, 19 Sep 2018 16:51:43 +0900, Prashant Bhole wrote:
> Let's add a check for the EOPNOTSUPP error when a map lookup fails.
> Also in case map doesn't support lookup, the output of map dump is
> changed from "can't lookup element" to "lookup not supported for
> this map".
> 
> Patch adds a print_entry_error() function to print the error
> value.
> 
> Following example dumps a map which does not support lookup.
> 
> Output before:
> root# bpftool map -jp dump id 40
> [
> "key": ["0x0a","0x00","0x00","0x00"
> ],
> "value": {
> "error": "can\'t lookup element"
> },
> "key": ["0x0b","0x00","0x00","0x00"
> ],
> "value": {
> "error": "can\'t lookup element"
> }
> ]
> 
> root# bpftool map dump id 40
> can't lookup element with key:
> 0a 00 00 00
> can't lookup element with key:
> 0b 00 00 00
> Found 0 elements
> 
> Output after changes:
> root# bpftool map dump -jp  id 45
> [
> "key": ["0x0a","0x00","0x00","0x00"
> ],
> "value": {
> "error": "lookup not supported for this map"
> },
> "key": ["0x0b","0x00","0x00","0x00"
> ],
> "value": {
> "error": "lookup not supported for this map"
> }
> ]
> 
> root# bpftool map dump id 45
> key:
> 0a 00 00 00
> value:
> lookup not supported for this map
> key:
> 0b 00 00 00
> value:
> lookup not supported for this map
> Found 0 elements

Nice improvement, thanks for the changes!  I wonder what your thoughts
would be on just printing some form of "lookup not supported for this
map" only once?  It seems slightly like repeated information - if
lookup is not supported for one key it likely won't be for other keys
too, so we could shorten the output.  Would that make sense?

> Signed-off-by: Prashant Bhole 
> ---
>  tools/bpf/bpftool/main.h |  5 +
>  tools/bpf/bpftool/map.c  | 35 ++-
>  2 files changed, 35 insertions(+), 5 deletions(-)
> 
> diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
> index 40492cdc4e53..1a8c683f949b 100644
> --- a/tools/bpf/bpftool/main.h
> +++ b/tools/bpf/bpftool/main.h
> @@ -46,6 +46,11 @@
>  
>  #include "json_writer.h"
>  
> +#define ERR_CANNOT_LOOKUP \
> + "can't lookup element"
> +#define ERR_LOOKUP_NOT_SUPPORTED \
> + "lookup not supported for this map"

Do we need these?  Are we going to reused them in more parts of the
code?

>  #define ptr_to_u64(ptr)  ((__u64)(unsigned long)(ptr))
>  
>  #define NEXT_ARG()   ({ argc--; argv++; if (argc < 0) usage(); })
> diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
> index 284e12a289c0..2faccd2098c9 100644
> --- a/tools/bpf/bpftool/map.c
> +++ b/tools/bpf/bpftool/map.c
> @@ -333,6 +333,25 @@ static void print_entry_json(struct bpf_map_info *info, 
> unsigned char *key,
>   jsonw_end_object(json_wtr);
>  }
>  
> +static void print_entry_error(struct bpf_map_info *info, unsigned char *key,
> +   char *value)
> +{
> + bool single_line, break_names;
> + int value_size = strlen(value);

nit: order variables declaration lines to shortest, please.

> +
> + break_names = info->key_size > 16 || value_size > 16;
> + single_line = info->key_size + value_size <= 24 && !break_names;
> +
> + printf("key:%c", break_names ? '\n' : ' ');
> + fprint_hex(stdout, key, info->key_size, " ");
> +
> + printf(single_line ? "  " : "\n");
> +
> + printf("value:%c%s", break_names ? '\n' : ' ', value);
> +
> + printf("\n");
> +}
> +
>  static void print_entry_plain(struct bpf_map_info *info, unsigned char *key,
> unsigned char *value)
>  {
> @@ -660,6 +679,8 @@ static int dump_map_elem(int fd, void *key, void *value,
>json_writer_t *btf_wtr)
>  {
>   int num_elems = 0;
> + int lookup_errno;
> + char *errstr;

nit: const char?

>  
>   if (!bpf_map_lookup_elem(fd, key, value)) {
>   if (json_output) {



Re: [PATCH ethtool] ethtool: support combinations of FEC modes

2018-09-19 Thread Andrew Lunn
> One loosely related question: how sure can we be that we are never going
> to need more than 32 bits for FEC encodings? Is it something completely
> hypothetical or is it something that could happen in the future?
> 
Hi Michal

Hopefully we have moved to a netlink socket by that time :-)

I recently found that EEE still uses a u32 for advertise link modes.
We should fix that in the netlink API.

Andrew


Re: [PATCH ethtool] ethtool: support combinations of FEC modes

2018-09-19 Thread Edward Cree
On 19/09/18 15:41, Michal Kubecek wrote:
> I'm sorry I didn't notice this before the patch was accepted but as it's
> not in a release yet, maybe it's still not too late.
>
> Could I suggest to make the syntax consistent with other options?
I didn't realise ethtool had any patterns to be consistent with ;)
> I mean rather than a comma separated list to use either
>
>   ethtool --set-fec  encoding enc1 enc2 ...
but yes this looks fine to me, as long as we're reasonably confident that
 we won't want to add new parameters (that might require determining
 whether enc2 is an encoding or a parameter name) in the future, because
 while the parsing wouldn't be impossible it might get ugly.

I'll rustle up an RFC patch.

-Ed


Re: [PATCH ethtool] ethtool: support combinations of FEC modes

2018-09-19 Thread Michal Kubecek
On Wed, Sep 19, 2018 at 05:33:38PM +0200, Andrew Lunn wrote:
> > One loosely related question: how sure can we be that we are never going
> > to need more than 32 bits for FEC encodings? Is it something completely
> > hypothetical or is it something that could happen in the future?
> > 
> Hi Michal
> 
> Hopefully we have moved to a netlink socket by that time :-)

Actually, that's why I'm asking. :-)

> I recently found that EEE still uses a u32 for advertise link modes.
> We should fix that in the netlink API.

For EEE it felt obvious that arbitrary length bitmap is the way to go so
I used it (it's still u32 in ethtool_ops but that's easier to change
when needed).

For FEC I wasn't sure if it wouldn't be an overkill.

Michal Kubecek


Re: [PATCH ethtool] ethtool: support combinations of FEC modes

2018-09-19 Thread Michal Kubecek
On Wed, Sep 19, 2018 at 04:38:27PM +0100, Edward Cree wrote:
> On 19/09/18 15:41, Michal Kubecek wrote:
> > I'm sorry I didn't notice this before the patch was accepted but as it's
> > not in a release yet, maybe it's still not too late.
> >
> > Could I suggest to make the syntax consistent with other options?
> I didn't realise ethtool had any patterns to be consistent with ;)

Way too many, I must say. :-) That is why I wasn't happy about adding
another.

> > I mean rather than a comma separated list to use either
> >
> >   ethtool --set-fec  encoding enc1 enc2 ...
> but yes this looks fine to me, as long as we're reasonably confident that
>  we won't want to add new parameters (that might require determining
>  whether enc2 is an encoding or a parameter name) in the future, because
>  while the parsing wouldn't be impossible it might get ugly.

This problem already exists for "-s ... msglvl". In the parser for the
netlink series I introduced an "end of list" marker (tentatively "--")
for this purpose, perhaps that could be a way.

> I'll rustle up an RFC patch.

Thank you.

Michal Kubecek


[RFC PATCH ethtool] ethtool: better syntax for combinations of FEC modes

2018-09-19 Thread Edward Cree
Instead of commas, just have them as separate argvs.

The parsing state machine might look heavyweight, but it makes it easy to add
 more parameters later and distinguish parameter names from encoding names.

Suggested-by: Michal Kubecek 
Signed-off-by: Edward Cree 
---
 ethtool.8.in   |  6 +++---
 ethtool.c  | 63 --
 test-cmdline.c | 10 +-
 3 files changed, 25 insertions(+), 54 deletions(-)

diff --git a/ethtool.8.in b/ethtool.8.in
index 414eaa1..7ea2cc0 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -390,7 +390,7 @@ ethtool \- query or control network driver and hardware 
settings
 .B ethtool \-\-set\-fec
 .I devname
 .B encoding
-.BR auto | off | rs | baser [ , ...]
+.BR auto | off | rs | baser \ [...]
 .
 .\" Adjust lines (i.e. full justification) and hyphenate.
 .ad
@@ -1120,11 +1120,11 @@ current FEC mode, the driver or firmware must take the 
link down
 administratively and report the problem in the system logs for users to 
correct.
 .RS 4
 .TP
-.BR encoding\ auto | off | rs | baser [ , ...]
+.BR encoding\ auto | off | rs | baser \ [...]
 
 Sets the FEC encoding for the device.  Combinations of options are specified as
 e.g.
-.B auto,rs
+.B encoding auto rs
 ; the semantics of such combinations vary between drivers.
 .TS
 nokeep;
diff --git a/ethtool.c b/ethtool.c
index 9997930..2f7e96b 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -4979,39 +4979,6 @@ static int fecmode_str_to_type(const char *str)
return 0;
 }
 
-/* Takes a comma-separated list of FEC modes, returns the bitwise OR of their
- * corresponding ETHTOOL_FEC_* constants.
- * Accepts repetitions (e.g. 'auto,auto') and trailing comma (e.g. 'off,').
- */
-static int parse_fecmode(const char *str)
-{
-   int fecmode = 0;
-   char buf[6];
-
-   if (!str)
-   return 0;
-   while (*str) {
-   size_t next;
-   int mode;
-
-   next = strcspn(str, ",");
-   if (next >= 6) /* Bad mode, longest name is 5 chars */
-   return 0;
-   /* Copy into temp buffer and nul-terminate */
-   memcpy(buf, str, next);
-   buf[next] = 0;
-   mode = fecmode_str_to_type(buf);
-   if (!mode) /* Bad mode encountered */
-   return 0;
-   fecmode |= mode;
-   str += next;
-   /* Skip over ',' (but not nul) */
-   if (*str)
-   str++;
-   }
-   return fecmode;
-}
-
 static int do_gfec(struct cmd_context *ctx)
 {
struct ethtool_fecparam feccmd = { 0 };
@@ -5041,22 +5008,26 @@ static int do_gfec(struct cmd_context *ctx)
 
 static int do_sfec(struct cmd_context *ctx)
 {
-   char *fecmode_str = NULL;
+   enum { ARG_NONE, ARG_ENCODING } state = ARG_NONE;
struct ethtool_fecparam feccmd;
-   struct cmdline_info cmdline_fec[] = {
-   { "encoding", CMDL_STR,  &fecmode_str,  &feccmd.fec},
-   };
-   int changed;
-   int fecmode;
-   int rv;
+   int fecmode = 0, newmode;
+   int rv, i;
 
-   parse_generic_cmdline(ctx, &changed, cmdline_fec,
- ARRAY_SIZE(cmdline_fec));
-
-   if (!fecmode_str)
+   for (i = 0; i < ctx->argc; i++) {
+   if (!strcmp(ctx->argp[i], "encoding")) {
+   state = ARG_ENCODING;
+   continue;
+   }
+   if (state == ARG_ENCODING) {
+   newmode = fecmode_str_to_type(ctx->argp[i]);
+   if (!newmode)
+   exit_bad_args();
+   fecmode |= newmode;
+   continue;
+   }
exit_bad_args();
+   }
 
-   fecmode = parse_fecmode(fecmode_str);
if (!fecmode)
exit_bad_args();
 
@@ -5265,7 +5236,7 @@ static const struct option {
  " [ all ]\n"},
{ "--show-fec", 1, do_gfec, "Show FEC settings"},
{ "--set-fec", 1, do_sfec, "Set FEC settings",
- " [ encoding auto|off|rs|baser ]\n"},
+ " [ encoding auto|off|rs|baser [...]]\n"},
{ "-h|--help", 0, show_usage, "Show this help" },
{ "--version", 0, do_version, "Show version number" },
{}
diff --git a/test-cmdline.c b/test-cmdline.c
index 9c51cca..84630a5 100644
--- a/test-cmdline.c
+++ b/test-cmdline.c
@@ -268,12 +268,12 @@ static struct test_case {
{ 1, "--set-eee devname advertise foo" },
{ 1, "--set-fec devname" },
{ 0, "--set-fec devname encoding auto" },
-   { 0, "--set-fec devname encoding off," },
-   { 0, "--set-fec devname encoding baser,rs" },
-   { 0, "--set-fec devname encoding auto,auto," },
+   { 0, "--set-fec devname encoding off" },
+   { 0, "--set-fec devname encoding baser rs" },
{ 0, "--set-fec devname encoding auto auto" },

Re: [PATCH 1/1] macsec: reflect the master interface state

2018-09-19 Thread Radu Rendec
Hello,

On Wed, Sep 19, 2018 at 11:24 AM Sabrina Dubroca  wrote:
> 2018-09-18, 20:16:12 -0400, Radu Rendec wrote:
> > This patch makes macsec interfaces reflect the state of the underlying
> > interface: if the master interface changes state to up/down, the macsec
> > interface changes its state accordingly.
>
> Yes, that's a bit unfortunate. I was wondering if we should just allow
> setting the device up, and let link state handle the rest.

Yes, that could work. It would also be consistent with the idea of
propagating only the link state.

> It's missing small parts of link state handling that I have been
> testing, mainly when creating a new device.

Can you please be more specific? I've just looked at macsec_newlink()
and I didn't notice anything related to link state handling.

> > - list_for_each_entry_safe(m, n, &rxd->secys, secys) {
> > + list_for_each_entry_safe(m, n, &rxd->secys, secys)
> >   macsec_common_dellink(m->secy.netdev, &head);
> > - }
>
> Please don't mix coding style changes with bug fixes.

Noted.

> > + case NETDEV_DOWN: {
> > + struct net_device *dev, *tmp;
> > + LIST_HEAD(close_list);
> > +
> > + list_for_each_entry(m, &rxd->secys, secys) {
> > + dev = m->secy.netdev;
> > +
> > + if (dev->flags & IFF_UP)
> > + list_add(&dev->close_list, &close_list);
> > + }
> > +
> > + dev_close_many(&close_list, false);
> > +
> > + list_for_each_entry_safe(dev, tmp, &close_list, close_list) {
> > + netif_stacked_transfer_operstate(real_dev, dev);
> > + list_del_init(&dev->close_list);
> > + }
> > + list_del(&close_list);
> > + break;
>
> My version of the patch only does netif_stacked_transfer_operstate(),
> and skips setting the device administratively down (ie, same as
> NETDEV_CHANGE).

That makes sense. But, since you mentioned you had your own patch, does
it make sense for me to continue working on mine?

> >   }
> > + case NETDEV_UP:
> > + list_for_each_entry(m, &rxd->secys, secys) {
> > + struct net_device *dev = m->secy.netdev;
> > + int flags = dev_get_flags(dev);
> > +
> > + if (!(flags & IFF_UP)) {
> > + dev_change_flags(dev, flags | IFF_UP);
> > + netif_stacked_transfer_operstate(real_dev, 
> > dev);
> > + }
> > + }
> > + break;
>
> I don't like that this completely ignores whether the device was brought
> down independently of the lower link. Maybe the administrator actually
> wants the macsec device down, regardless of state changes on the lower
> device.

I thought about that too and I don't like it either. Perhaps the vlan
approach of having a "loose binding" flag is worth considering. Then the
admin can decice for themselves.

However, I guess the problem disappears if only the link state is
propagated. The only caveat to that is to not propagate an "up" link
state while the macsec interface is administratively down, but
"remember" to propagate it later if the macsec interface is
administratively set to "up".

Thanks for the feedback!

Radu Rendec


Re: Bridge connectivity interruptions while devices join or leave the bridge

2018-09-19 Thread Ido Schimmel
On Wed, Sep 19, 2018 at 01:00:15PM +0200, Johannes Wienke wrote:
> This behavior of inheriting the mac address is really unexpected to us.
> Is it documented somewhere?

Not that I'm aware, but it's a well established behavior.


Re: array bounds warning in xfrm_output_resume

2018-09-19 Thread David Ahern
On 6/18/18 11:10 AM, David Ahern wrote:
> Florian:
> 
> I am seeing this warning:
> 
> $ make O=kbuild/perf -j 24 -s
> In file included from /home/dsa/kernel-3.git/include/linux/kernel.h:10:0,
>  from /home/dsa/kernel-3.git/include/linux/list.h:9,
>  from /home/dsa/kernel-3.git/include/linux/module.h:9,
>  from /home/dsa/kernel-3.git/net/xfrm/xfrm_output.c:13:
> /home/dsa/kernel-3.git/net/xfrm/xfrm_output.c: In function
> ‘xfrm_output_resume’:
> /home/dsa/kernel-3.git/include/linux/compiler.h:252:20: warning: array
> subscript is above array bounds [-Warray-bounds]
>__read_once_size(&(x), __u.__c, sizeof(x));  \
> ^~~~
> /home/dsa/kernel-3.git/include/linux/compiler.h:258:22: note: in
> expansion of macro ‘__READ_ONCE’
>  #define READ_ONCE(x) __READ_ONCE(x, 1)
>   ^~~
> /home/dsa/kernel-3.git/include/linux/rcupdate.h:350:48: note: in
> expansion of macro ‘READ_ONCE’
>   typeof(*p) *p1 = (typeof(*p) *__force)READ_ONCE(p); \
> ^
> /home/dsa/kernel-3.git/include/linux/rcupdate.h:487:2: note: in
> expansion of macro ‘__rcu_dereference_check’
>   __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)
>   ^~~
> /home/dsa/kernel-3.git/include/linux/rcupdate.h:545:28: note: in
> expansion of macro ‘rcu_dereference_check’
>  #define rcu_dereference(p) rcu_dereference_check(p, 0)
> ^
> /home/dsa/kernel-3.git/include/linux/netfilter.h:218:15: note: in
> expansion of macro ‘rcu_dereference’
>hook_head = rcu_dereference(net->nf.hooks_arp[hook]);
>^~~
> 
> Line in question is the nf_hook in xfrm_output_resume.
> NF_INET_POST_ROUTING = 4 which is greater than NF_ARP_NUMHOOKS = 3
> 
> I believe ef57170bbfdd6 is the commit that introduced the warning
> 

Hi Florian:

I am still seeing this. I realize the compiler is not understanding the
conditions properly, but it is a distraction every time I do a kernel
build on Debian Stretch having to filter the above noise from whether I
introduce another warning.


[PATCH net] bnxt_en: don't try to offload VLAN 'modify' action

2018-09-19 Thread Davide Caratti
bnxt offload code currently supports only 'push' and 'pop' operation: let
.ndo_setup_tc() return -EOPNOTSUPP if VLAN 'modify' action is configured.

Fixes: 2ae7408fedfe ("bnxt_en: bnxt: add TC flower filter offload support")
Signed-off-by: Davide Caratti 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
index 092c817f8f11..e1594c9df4c6 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c
@@ -75,17 +75,23 @@ static int bnxt_tc_parse_redir(struct bnxt *bp,
return 0;
 }
 
-static void bnxt_tc_parse_vlan(struct bnxt *bp,
-  struct bnxt_tc_actions *actions,
-  const struct tc_action *tc_act)
+static int bnxt_tc_parse_vlan(struct bnxt *bp,
+ struct bnxt_tc_actions *actions,
+ const struct tc_action *tc_act)
 {
-   if (tcf_vlan_action(tc_act) == TCA_VLAN_ACT_POP) {
+   switch (tcf_vlan_action(tc_act)) {
+   case TCA_VLAN_ACT_POP:
actions->flags |= BNXT_TC_ACTION_FLAG_POP_VLAN;
-   } else if (tcf_vlan_action(tc_act) == TCA_VLAN_ACT_PUSH) {
+   break;
+   case TCA_VLAN_ACT_PUSH:
actions->flags |= BNXT_TC_ACTION_FLAG_PUSH_VLAN;
actions->push_vlan_tci = htons(tcf_vlan_push_vid(tc_act));
actions->push_vlan_tpid = tcf_vlan_push_proto(tc_act);
+   break;
+   default:
+   return -EOPNOTSUPP;
}
+   return 0;
 }
 
 static int bnxt_tc_parse_tunnel_set(struct bnxt *bp,
@@ -134,7 +140,9 @@ static int bnxt_tc_parse_actions(struct bnxt *bp,
 
/* Push/pop VLAN */
if (is_tcf_vlan(tc_act)) {
-   bnxt_tc_parse_vlan(bp, actions, tc_act);
+   rc = bnxt_tc_parse_vlan(bp, actions, tc_act);
+   if (rc)
+   return rc;
continue;
}
 
-- 
2.17.1



Re: Bridge connectivity interruptions while devices join or leave the bridge

2018-09-19 Thread Stephen Hemminger
On Wed, 19 Sep 2018 19:45:08 +0300
Ido Schimmel  wrote:

> On Wed, Sep 19, 2018 at 01:00:15PM +0200, Johannes Wienke wrote:
> > This behavior of inheriting the mac address is really unexpected to us.
> > Is it documented somewhere?  
> 
> Not that I'm aware, but it's a well established behavior.

Not documented, has always been that way. It seems to be part of 802 standard 
maybe?
Anyway, if you set a MAC address of the bridge device it makes it sticky;
i.e it won't change if ports of bridge change.


Re: [PATCH net-next] ipv6: Allow the l3mdev to be a loopback

2018-09-19 Thread David Ahern
On 9/19/18 5:56 AM, Mike Manning wrote:
> From: Robert Shearman 
> 
> There is no way currently for an IPv6 client connect using a loopback
> address in a VRF, whereas for IPv4 the loopback address can be added:
> 
> $ sudo ip addr add dev vrfred 127.0.0.1/8
> $ sudo ip -6 addr add ::1/128 dev vrfred
> RTNETLINK answers: Cannot assign requested address
> 
> So allow ::1 to be configured on an L3 master device. In order for
> this to be usable ip_route_output_flags needs to not consider ::1 to
> be a link scope address (since oif == l3mdev and so it would be
> dropped), and ipv6_rcv needs to consider the l3mdev to be a loopback
> device so that it doesn't drop the packets.
> 
> Signed-off-by: Robert Shearman 
> Signed-off-by: Mike Manning 
> ---
>  net/ipv6/addrconf.c  | 1 +
>  net/ipv6/ip6_input.c | 3 ++-
>  net/ipv6/route.c | 3 ++-
>  3 files changed, 5 insertions(+), 2 deletions(-)
> 

Reviewed-by: David Ahern 

Been on my to-do list for a while. Thanks for the patch. This resolves,
for example, a harmless error message from the 'host' command from
bind9-host-9.10.3 which probes for dscp support via the loopback
address. e.g.,

$ host www.google.com
../../../../lib/isc/unix/net.c:581: sendmsg() failed: Network is unreachable


Re: [PATCH mlx5-next 02/25] net/mlx5: Set uid as part of QP commands

2018-09-19 Thread Jason Gunthorpe
On Mon, Sep 17, 2018 at 02:03:55PM +0300, Leon Romanovsky wrote:
> From: Yishai Hadas 
> 
> Set uid as part of QP commands so that the firmware can manage the
> QP object in a secured way.
> 
> That will enable using a QP that was created by verbs application to
> be used by the DEVX flow in case the uid is equal.
> 
> Signed-off-by: Yishai Hadas 
> Signed-off-by: Leon Romanovsky 
>  drivers/net/ethernet/mellanox/mlx5/core/qp.c | 45 
> +---
>  include/linux/mlx5/mlx5_ifc.h| 22 +++---
>  include/linux/mlx5/qp.h  |  1 +
>  3 files changed, 39 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
> index 4ca07bfb6b14..04f72a1cdbcc 100644
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
> @@ -240,6 +240,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
>   if (err)
>   return err;
>  
> + qp->uid = MLX5_GET(create_qp_in, in, uid);
>   qp->qpn = MLX5_GET(create_qp_out, out, qpn);
>   mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn);
>  
> @@ -261,6 +262,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev,
>   memset(dout, 0, sizeof(dout));
>   MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
>   MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
> + MLX5_SET(destroy_qp_in, din, uid, qp->uid);
>   mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
>   return err;
>  }
> @@ -320,6 +322,7 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
>  
>   MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
>   MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
> + MLX5_SET(destroy_qp_in, in, uid, qp->uid);
>   err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
>   if (err)
>   return err;
> @@ -373,7 +376,7 @@ static void mbox_free(struct mbox_info *mbox)
>  
>  static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int 
> qpn,
>   u32 opt_param_mask, void *qpc,
> - struct mbox_info *mbox)
> + struct mbox_info *mbox, u16 uid)
>  {
>   mbox->out = NULL;
>   mbox->in = NULL;
> @@ -381,26 +384,32 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev 
> *dev, u16 opcode, int qpn,
>  #define MBOX_ALLOC(mbox, typ)  \
>   mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), 
> MLX5_ST_SZ_BYTES(typ##_out))
>  
> -#define MOD_QP_IN_SET(typ, in, _opcode, _qpn) \
> - MLX5_SET(typ##_in, in, opcode, _opcode); \
> - MLX5_SET(typ##_in, in, qpn, _qpn)
> -
> -#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc) \
> - MOD_QP_IN_SET(typ, in, _opcode, _qpn); \
> - MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \
> - memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc, MLX5_ST_SZ_BYTES(qpc))
> +#define MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid) \
> + do { \
> + MLX5_SET(typ##_in, in, opcode, _opcode); \
> + MLX5_SET(typ##_in, in, qpn, _qpn); \
> + MLX5_SET(typ##_in, in, uid, _uid); \
> + } while (0)
> +
> +#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc, _uid) \
> + do { \
> + MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid); \
> + MLX5_SET(typ##_in, in, opt_param_mask, _opt_p); \
> + memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), \
> +_qpc, MLX5_ST_SZ_BYTES(qpc)); \
> + } while (0)

These should get touched by clang-format to follow the usual
formatting for macros..

Jason


Re: [PATCH mlx5-next 03/25] net/mlx5: Set uid as part of RQ commands

2018-09-19 Thread Jason Gunthorpe
On Mon, Sep 17, 2018 at 02:03:56PM +0300, Leon Romanovsky wrote:
> From: Yishai Hadas 
> 
> Set uid as part of RQ commands so that the firmware can manage the
> RQ object in a secured way.
> 
> That will enable using an RQ that was created by verbs application
> to be used by the DEVX flow in case the uid is equal.
> 
> Signed-off-by: Yishai Hadas 
> Signed-off-by: Leon Romanovsky 
>  drivers/net/ethernet/mellanox/mlx5/core/qp.c | 16 ++--
>  include/linux/mlx5/mlx5_ifc.h|  6 +++---
>  2 files changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
> index 04f72a1cdbcc..0ca68ef54d93 100644
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
> @@ -540,6 +540,17 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, 
> u32 xrcdn)
>  }
>  EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
>  
> +static void destroy_rq_tracked(struct mlx5_core_dev *dev, u32 rqn, u16 uid)
> +{
> + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)]   = {0};
> + u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {0};

= {} is the preferred version of this, right? 

{0} explicitly initializes the first element to zero and only works if
the first element happens to be something integral..

Jason


Re: [PATCH mlx5-next 07/25] net/mlx5: Update mlx5_ifc with DEVX UID bits

2018-09-19 Thread Jason Gunthorpe
On Mon, Sep 17, 2018 at 02:04:00PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky 
> 
> Add DEVX information to WQ, SRQ, CQ, TIR, TIS, QP,
> RQ, XRCD, PD, MKEY and MCG.
> 
> Signed-off-by: Leon Romanovsky 
> ---
>  include/linux/mlx5/mlx5_ifc.h | 67 
> +++
>  1 file changed, 43 insertions(+), 24 deletions(-)

It is weird that we sometimes have these IFC bundle updates and
sometimes the IFC is inlined in the patch.. 

Why not just do one big IFC only patch with everything the series
needs?

Jason


[PATCH net-next 11/12] net: hns3: Fix client initialize state issue when roce client initialize failed

2018-09-19 Thread Salil Mehta
From: Jian Shen 

When roce is loaded before nic, the roce client will not be initialized
until nic client is initialized, but roce init flag is set before it.
Furthermore, in the case where nic initialization succeeds and roce
initialization fails, the nic init flag is not set, and the roce init
flag is not cleared.

This patch fixes it by set init flag only after the client is initialized
successfully.

Fixes: e2cb1dec9779 ("net: hns3: Add HNS3 VF HCL(Hardware Compatibility Layer) 
Support")
Fixes: 46a3df9f9718 ("net: hns3: Add HNS3 Acceleration Engine & Compatibility 
Layer Support")
Signed-off-by: Jian Shen 
Signed-off-by: Peng Li 
Signed-off-by: Salil Mehta 
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.c   | 12 +---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h   |  3 +++
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   |  9 +
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c |  9 +
 4 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
index fff5be8..781e5de 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c
@@ -29,8 +29,8 @@ static bool hnae3_client_match(enum hnae3_client_type 
client_type,
return false;
 }
 
-static void hnae3_set_client_init_flag(struct hnae3_client *client,
-  struct hnae3_ae_dev *ae_dev, int inited)
+void hnae3_set_client_init_flag(struct hnae3_client *client,
+   struct hnae3_ae_dev *ae_dev, int inited)
 {
switch (client->type) {
case HNAE3_CLIENT_KNIC:
@@ -46,6 +46,7 @@ static void hnae3_set_client_init_flag(struct hnae3_client 
*client,
break;
}
 }
+EXPORT_SYMBOL(hnae3_set_client_init_flag);
 
 static int hnae3_get_client_init_flag(struct hnae3_client *client,
   struct hnae3_ae_dev *ae_dev)
@@ -86,14 +87,11 @@ static int hnae3_match_n_instantiate(struct hnae3_client 
*client,
/* now, (un-)instantiate client by calling lower layer */
if (is_reg) {
ret = ae_dev->ops->init_client_instance(client, ae_dev);
-   if (ret) {
+   if (ret)
dev_err(&ae_dev->pdev->dev,
"fail to instantiate client, ret = %d\n", ret);
-   return ret;
-   }
 
-   hnae3_set_client_init_flag(client, ae_dev, 1);
-   return 0;
+   return ret;
}
 
if (hnae3_get_client_init_flag(client, ae_dev)) {
diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index c48c187..17db631 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -523,4 +523,7 @@ void hnae3_register_ae_algo(struct hnae3_ae_algo *ae_algo);
 
 void hnae3_unregister_client(struct hnae3_client *client);
 int hnae3_register_client(struct hnae3_client *client);
+
+void hnae3_set_client_init_flag(struct hnae3_client *client,
+   struct hnae3_ae_dev *ae_dev, int inited);
 #endif
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 141da18..cf365d4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5171,6 +5171,8 @@ static int hclge_init_client_instance(struct hnae3_client 
*client,
goto clear_nic;
}
 
+   hnae3_set_client_init_flag(client, ae_dev, 1);
+
if (hdev->roce_client &&
hnae3_dev_roce_supported(hdev)) {
struct hnae3_client *rc = hdev->roce_client;
@@ -5182,6 +5184,9 @@ static int hclge_init_client_instance(struct hnae3_client 
*client,
ret = rc->ops->init_instance(&vport->roce);
if (ret)
goto clear_roce;
+
+   hnae3_set_client_init_flag(hdev->roce_client,
+  ae_dev, 1);
}
 
break;
@@ -5193,6 +5198,8 @@ static int hclge_init_client_instance(struct hnae3_client 
*client,
if (ret)
goto clear_nic;
 
+   hnae3_set_client_init_flag(client, ae_dev, 1);
+
break;
case HNAE3_CLIENT_ROCE:
if (hnae3_dev_roce_supported(hdev)) {
@@ -5208,6 +5215,8 @@ static int hclge_init_client_instance(struct hnae3_client 
*client,
ret = client->ops->init_instance(&vport->roce);
 

Re: array bounds warning in xfrm_output_resume

2018-09-19 Thread Florian Westphal
David Ahern  wrote:
> > I believe ef57170bbfdd6 is the commit that introduced the warning
> > 
> 
> Hi Florian:
> 
> I am still seeing this. I realize the compiler is not understanding the
> conditions properly, but it is a distraction every time I do a kernel
> build on Debian Stretch having to filter the above noise from whether I
> introduce another warning.

Sorry, I forgot about this.  I'll look into it tomorrow.


Re: [PATCH rdma-next 00/24] Extend DEVX functionality

2018-09-19 Thread Jason Gunthorpe
On Mon, Sep 17, 2018 at 02:03:53PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky 
> 
> From Yishai,
> 
> This series comes to enable the DEVX functionality in some wider scope,
> specifically,
> - It enables using kernel objects that were created by the verbs
>   API in the DEVX flow.
> - It enables white list commands without DEVX user context.
> - It enables the IB link layer under CAP_NET_RAW capabilities.
> - It exposes the PRM handles for RAW QP (i.e. TIRN, TISN, RQN, SQN)
>   to be used later on directly by the DEVX interface.
> 
> In General,
> Each object that is created/destroyed/modified via verbs will be stamped
> with a UID based on its user context. This is already done for DEVX objects
> commands.
> 
> This will enable the firmware to enforce the usage of kernel objects
> from the DEVX flow by validating that the same UID is used and the resources 
> are
> really related to the same user.
> 
> For example in case a CQ was created with verbs it will be stamped with
> UID and once will be pointed by a DEVX create QP command the firmware will
> validate that the input CQN really belongs to the UID which issues the create 
> QP
> command.
> 
> As of the above, all the PRM objects (except of the public ones which
> are managed by the kernel e.g. FLOW, etc.) will have a UID upon their
> create/modify/destroy commands. The detection of UMEM / physical
> addressed in the relevant commands will be done by firmware according to a 
> 'umem
> valid bit' as the UID may be used in both cases.
> 
> The series also enables white list commands which don't require a
> specific DEVX context, instead of this a device UID is used so that
> the firmware will mask un-privileged functionality. The IB link layer
> is also enabled once CAP_NET_RAW permission exists.
> 
> To enable using the RAW QP underlay objects (e.g. TIRN, RQN, etc.) later
> on by DEVX commands the UHW output for this case was extended to return this
> data when a DEVX context is used.
> 
> Thanks
>
> Leon Romanovsky (1):
>   net/mlx5: Update mlx5_ifc with DEVX UID bits
> 
> Yishai Hadas (24):
>   net/mlx5: Set uid as part of CQ commands
>   net/mlx5: Set uid as part of QP commands
>   net/mlx5: Set uid as part of RQ commands
>   net/mlx5: Set uid as part of SQ commands
>   net/mlx5: Set uid as part of SRQ commands
>   net/mlx5: Set uid as part of DCT commands
>   IB/mlx5: Set uid as part of CQ creation
>   IB/mlx5: Set uid as part of QP creation
>   IB/mlx5: Set uid as part of RQ commands
>   IB/mlx5: Set uid as part of SQ commands
>   IB/mlx5: Set uid as part of TIR commands
>   IB/mlx5: Set uid as part of TIS commands
>   IB/mlx5: Set uid as part of RQT commands
>   IB/mlx5: Set uid as part of PD commands
>   IB/mlx5: Set uid as part of TD commands
>   IB/mlx5: Set uid as part of SRQ commands
>   IB/mlx5: Set uid as part of DCT commands
>   IB/mlx5: Set uid as part of XRCD commands
>   IB/mlx5: Set uid as part of MCG commands

This is really too many patches.. They are small and not too hard to
review, but it is well beyond the guideline.

And I'm not totally happy with the extensive use of ucontext in the IB
portions, it is problematic looking into the future, and uboject is
really not supposed to be used in the drivers.

The driver needs to store the uid in the PD (copied from the ucontext
that created it) and use that in all the dependent places, not use
pd->uobject->ucontext->devx_uid or some other convoluted way to get
to it. 

The ucontext variable should only be used when creating the PD, CQ and
devx objects.

This detail becomes quite important, for instance, if we get to the
'shared pd' that has been talked about at conference. In this case
when the 'receiver' of the 'shared pd' creates a child object, like a
MR, the MR must be stamped with the devx_uid of the PD (ie the
originating context's devx_uid), not the dev_uid of its local ufile!

If we do that, then the series can be split, so long as pd->devx_uid ==
0 until the entire series is applied. uid tagging is an all-or-nothing
thing, as partial tagging will break verbs. So breaking it up also
makes it more bi-section safe.

Something like these patches:

>   net/mlx5: Update mlx5_ifc with DEVX UID bits
>   net/mlx5: Set uid as part of CQ commands
>   net/mlx5: Set uid as part of QP commands
>   net/mlx5: Set uid as part of RQ commands
>   net/mlx5: Set uid as part of SQ commands
>   net/mlx5: Set uid as part of SRQ commands
>   net/mlx5: Set uid as part of DCT commands
>   IB/mlx5: Set uid as part of PD commands
>   IB/mlx5: Set uid as part of QP creation
>   IB/mlx5: Set uid as part of RQ commands
>   IB/mlx5: Set uid as part of SQ commands
>   IB/mlx5: Set uid as part of SRQ commands
>   IB/mlx5: Set uid as part of DCT commands

(13 patches)

Followed by the rest of the IB uid patches

Followed by a patch to make pd->uid != 0 along with these:

>   IB/mlx5: Set uid as part of CQ creation
>   IB/mlx5: Set valid umem bit on DEVX
>   IB/mlx5: Expose RAW QP device handles to us

Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()

2018-09-19 Thread Tobias Hommel
> After running for about 24 hours, I now encountered another panic. This time 
> it
> is caused by an out of memory situation. Although the trace shows action in 
> the
> filesystem code I'm posting it here because I cannot isolate the error and
> maybe it is caused by our NULL pointer bug or by the new fix.
> I do not have a serial console attached, so I could only attach a screenshot 
> of
> the panic to this mail.
> 
> I am running v4.19-rc3 from git with the above mentioned patch applied.
> After 19 hours everything still looked fine, XfrmFwdHdrError value was at 
> ~950.
> Overall memory usage shown by htop was at 1.2G/15.6G.
> I had htop running via ssh so I was able to see at least some status post
> mortem. Uptime: 23:50:57
> Overall memory usage was at 10.2G/15.6G and user processes were just
> using the usual amount of memory, so it looks like the kernel was eating up at
> least 9G of RAM.
> 
> Maybe this information is not very helpful for debugging, but it is at least a
> warning that something might still be wrong.
> 
> I'll try to gather some more information and keep you updated.

Running stable under load for more than 5 days now, I was not able to reproduce
that OOM situation. I leave it at that, the fix for the initial bug is fine for
me.


Re: [RFC bpf-next 2/4] bpf: return EOPNOTSUPP when map lookup isn't supported

2018-09-19 Thread Mauricio Vasquez




On 09/19/2018 10:14 AM, Alexei Starovoitov wrote:

On Wed, Sep 19, 2018 at 04:51:41PM +0900, Prashant Bhole wrote:

Return ERR_PTR(-EOPNOTSUPP) from map_lookup_elem() methods of below
map types:
- BPF_MAP_TYPE_PROG_ARRAY
- BPF_MAP_TYPE_STACK_TRACE
- BPF_MAP_TYPE_XSKMAP
- BPF_MAP_TYPE_SOCKMAP/BPF_MAP_TYPE_SOCKHASH

Signed-off-by: Prashant Bhole 
---
  kernel/bpf/arraymap.c | 2 +-
  kernel/bpf/sockmap.c  | 2 +-
  kernel/bpf/stackmap.c | 2 +-
  kernel/bpf/xskmap.c   | 2 +-
  4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index dded84cbe814..24583da9ffd1 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -449,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map)
  
  static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)

  {
-   return NULL;
+   return ERR_PTR(-EOPNOTSUPP);
  }

conceptually the set looks good to me.
Please add a test to test_verifier.c to make sure
that these lookup helpers cannot be called from BPF program.
Otherwise this diff may cause crashes.


I think we can remove all these stub functions that return EOPNOTSUPP or 
EINVAL and return the error in syscall.c if ops->map_[update, delete, 
lookup, ...] is null.
This will require to extend (and test) the verifier to guarantee that 
those helpers are not called in wrong maps, for example 
map_delete_element in array like maps.


Would it make sense?


Re: [PATCH mlx5-next 03/25] net/mlx5: Set uid as part of RQ commands

2018-09-19 Thread Saeed Mahameed
On Wed, Sep 19, 2018 at 10:28 AM Jason Gunthorpe  wrote:
>
> On Mon, Sep 17, 2018 at 02:03:56PM +0300, Leon Romanovsky wrote:
> > From: Yishai Hadas 
> >
> > Set uid as part of RQ commands so that the firmware can manage the
> > RQ object in a secured way.
> >
> > That will enable using an RQ that was created by verbs application
> > to be used by the DEVX flow in case the uid is equal.
> >
> > Signed-off-by: Yishai Hadas 
> > Signed-off-by: Leon Romanovsky 
> >  drivers/net/ethernet/mellanox/mlx5/core/qp.c | 16 ++--
> >  include/linux/mlx5/mlx5_ifc.h|  6 +++---
> >  2 files changed, 17 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c 
> > b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
> > index 04f72a1cdbcc..0ca68ef54d93 100644
> > +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
> > @@ -540,6 +540,17 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, 
> > u32 xrcdn)
> >  }
> >  EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
> >
> > +static void destroy_rq_tracked(struct mlx5_core_dev *dev, u32 rqn, u16 uid)
> > +{
> > + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)]   = {0};
> > + u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {0};
>
> = {} is the preferred version of this, right?
>
> {0} explicitly initializes the first element to zero and only works if
> the first element happens to be something integral..
>

Both are perfectly ok in our scenarios.
I remember one of the syntaxes yielded a static checker warning; I
don't remember which syntax and which static checker :) ..

> Jason


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Marcelo Ricardo Leitner
On Wed, Sep 19, 2018 at 11:25:17AM +0200, Johannes Berg wrote:
> On Wed, 2018-09-19 at 00:37 -0300, Marcelo Ricardo Leitner wrote:
> 
> > Did you consider indicating the message level, and only overwrite the
> > message that is already in there if the new message level is higher
> > than the current one?
> 
> Hmm, no, I guess I didn't - I'm not even sure I understand what you're
> saying.
> 
> This code in itself generates no "warning" messages; that was just a
> construct we discussed in the NLA_REJECT thread, e.g. if you say (like I
> just also wrote in my reply to Jiri):
> 
>   NL_SET_ERR_MSG(extack, "warning: deprecated command");
>   err = nla_parse(..., extack);
>   if (err)
>   return err;
>   /* do something */
>   return 0;
> 
> Here you could consider the message there a warning that's transported
> out even if we return 0, but if we return with a failure from
> nla_parse() (or nla_validate instead if you wish), then that failure
> message "wins".

Agree. This is the core issue here, IMHO. Once out of the context that
set the message, we have no way of knowing if we can nor should
overwrite the message that is already in there.

> 
> > This way the first to set an Error message will have it, and Warning
> > messages would be overwritten by such if needed. But it also would
> > cause the first warning to be held, and not the last one, as it does
> > today. We want the first error, but the last warning otherwise.
> > 
> > It would not be possible to overwrite if new_msglvl >= cur_msglvl
> > because then it would trigger the initial issue again, so some extra
> > logic would be needed to solve this.
> 
> That sounds way more complex than what I'm doing now?

Yes, but hopefully it would be a clearer way of showing how it is/should be handled.

> 
> Note, like I said above, this isn't *generic* in any way. This code here
> will only ever set error messages that should "win".
> 
> I suppose we could - technically - make that generic, in that we could
> have both
> 
>   NLA_SET_WARN_MSG(extack, "...");
>   NLA_SET_ERR_MSG(extack, "...");

I like this.

> 
> and keep track of warning vs. error; however, just like my first version
> of the NLA_REJECT patch, that would break existing code.

Hm, I may have missed something but I think the discussion in there
was for a different context. For an extack msg to be set by
when validate_nla() call returns on nla_parse(), the previous message
had to be a "warning" because otherwise the parsing wouldn't be even
attempted. So in that case, we are safe to simply overwrite it.

While for the situation you are describing here, it will set a generic
error message in case the inner code didn't do it.

Using the semantics of NLA_SET_WARN_MSG and ERR, then WARN would
replace other WARNs but not ERRs, and ERR would replace other WARNs
too but not other ERRs. All we need to do handle this is a bit in
extack saying if the message is considered a warning or not, or an
error/fatal message or not.

> 
> I also don't think that we actually *need* this complexity in general.
> It should almost always be possible (and realistically, pretty easy) to
> structure your code in a way that warning messages only go out if no
> error message overwrites them. The only reason we were ever even
> discussing this was that in NLA_REJECT I missed the fact that somebody
> could've set a message before and thus would keep it rather than
> overwrite it, which was a change in behaviour.

Okay but we have split parsing of netlink messages and this could be
useful in there too:
In cls_api.c, tc_new_tfilter() calls nlmsg_parse() and does some
processing, and then hands it to a classifier. cls_flower, for
example, will then do another parsing. If, for whatever reason, flower
failed and did not set an extack msg, tc_new_tfilter() could set a
default error message, but at this moment we can't tell if the msg
already set was just a warning from the first parsing (or any other
code before engaging flower) (which we can overwrite) or if it a
message that flower set (which we should not overwrite).

Hopefully my description is clear.. 8-)

I think this is the same situation as with the nested parsing you're
proposing.

Currently it (tc_new_tfilter) doesn't set any default error message,
so this issue is/was not noticed.

> 
> Now, with this patch, all I'm doing is changing the internal behaviour
> of nla_parse/nla_validate - externally, it still overwrites any existing
> message if an error occurs, but internally it keeps the inner-most
> error.
> 
> Why is this? Consider this:
> 
> static const struct nla_policy inner_policy[] = {
>   [INNER_FLAG] = { .type = NLA_REJECT,
>  .validation_data = "must not set this flag" }
> };
> 
> static const struct nla_policy outer_policy[] = {
>   [OUTER_NESTING] = { .type = NLA_NESTED, .len = INNER_MAX,
> .validation_data = inner_policy,
> };
> 
> Now if you invoke nla_parse/nla_validate with a mes

[PATCH] mpls: allow routes on ip6gre devices

2018-09-19 Thread Saif Hasan
Summary:

This appears to be necessary and sufficient change to enable `MPLS` on
`ip6gre` tunnels (RFC4023).

This diff allows IP6GRE devices to be recognized by MPLS kernel module
and hence user can configure interface to accept packets with mpls
headers as well setup mpls routes on them.

Test Plan:

Test plan consists of multiple containers connected via GRE-V6 tunnel.
Then carrying out testing steps as below.

- Carry out necessary sysctl settings on all containers

```
sysctl -w net.mpls.platform_labels=65536
sysctl -w net.mpls.ip_ttl_propagate=1
sysctl -w net.mpls.conf.lo.input=1
```

- Establish IP6GRE tunnels

```
ip -6 tunnel add name if_1_2_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::2 key 1
ip link set dev if_1_2_1 up
sysctl -w net.mpls.conf.if_1_2_1.input=1
ip -4 addr add 169.254.0.2/31 dev if_1_2_1 scope link

ip -6 tunnel add name if_1_3_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::3 key 1
ip link set dev if_1_3_1 up
sysctl -w net.mpls.conf.if_1_3_1.input=1
ip -4 addr add 169.254.0.4/31 dev if_1_3_1 scope link
```

- Install MPLS encap rules on node-1 towards node-2

```
ip route add 192.168.0.11/32 nexthop encap mpls 32/64 \
  via inet 169.254.0.3 dev if_1_2_1
```

- Install MPLS forwarding rules on node-2 and node-3
```
// node2
ip -f mpls route add 32 via inet 169.254.0.7 dev if_2_4_1

// node3
ip -f mpls route add 64 via inet 169.254.0.12 dev if_4_3_1
```

- Ping 192.168.0.11 (node4) from 192.168.0.1 (node1) (where routing
  towards 192.168.0.1 is via IP route directly towards node1 from node4)
```
ping 192.168.0.11
```

- tcpdump on interface to capture ping packets wrapped within an MPLS
  header which in turn is wrapped within an IP6GRE header

```
16:43:41.121073 IP6
  2401:db00:21:6048:feed::1 > 2401:db00:21:6048:feed::2:
  DSTOPT GREv0, key=0x1, length 100:
  MPLS (label 32, exp 0, ttl 255) (label 64, exp 0, [S], ttl 255)
  IP 192.168.0.1 > 192.168.0.11:
  ICMP echo request, id 1208, seq 45, length 64

0x:  6000 2cdb 006c 3c3f 2401 db00 0021 6048  `.,..l
---
 net/mpls/af_mpls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..aeb5bf2f7595 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1533,10 +1533,11 @@ static int mpls_dev_notify(struct notifier_block *this, 
unsigned long event,
unsigned int flags;

if (event == NETDEV_REGISTER) {
-   /* For now just support Ethernet, IPGRE, SIT and IPIP devices */
+/* For now just support Ethernet, IPGRE, IP6GRE, SIT and IPIP devices */
if (dev->type == ARPHRD_ETHER ||
dev->type == ARPHRD_LOOPBACK ||
dev->type == ARPHRD_IPGRE ||
+   dev->type == ARPHRD_IP6GRE ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_TUNNEL) {
mdev = mpls_add_dev(dev);
--
2.13.5



[PATCH] mpls: allow routes on ip6gre devices

2018-09-19 Thread Saif Hasan
From: Saif Hasan 

Summary:

This appears to be necessary and sufficient change to enable `MPLS` on
`ip6gre` tunnels (RFC4023).

This diff allows IP6GRE devices to be recognized by MPLS kernel module
and hence user can configure interface to accept packets with mpls
headers as well setup mpls routes on them.

Test Plan:

Test plan consists of multiple containers connected via GRE-V6 tunnel.
Then carrying out testing steps as below.

- Carry out necessary sysctl settings on all containers

```
sysctl -w net.mpls.platform_labels=65536
sysctl -w net.mpls.ip_ttl_propagate=1
sysctl -w net.mpls.conf.lo.input=1
```

- Establish IP6GRE tunnels

```
ip -6 tunnel add name if_1_2_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::2 key 1
ip link set dev if_1_2_1 up
sysctl -w net.mpls.conf.if_1_2_1.input=1
ip -4 addr add 169.254.0.2/31 dev if_1_2_1 scope link

ip -6 tunnel add name if_1_3_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::3 key 1
ip link set dev if_1_3_1 up
sysctl -w net.mpls.conf.if_1_3_1.input=1
ip -4 addr add 169.254.0.4/31 dev if_1_3_1 scope link
```

- Install MPLS encap rules on node-1 towards node-2

```
ip route add 192.168.0.11/32 nexthop encap mpls 32/64 \
  via inet 169.254.0.3 dev if_1_2_1
```

- Install MPLS forwarding rules on node-2 and node-3
```
// node2
ip -f mpls route add 32 via inet 169.254.0.7 dev if_2_4_1

// node3
ip -f mpls route add 64 via inet 169.254.0.12 dev if_4_3_1
```

- Ping 192.168.0.11 (node4) from 192.168.0.1 (node1) (where routing
  towards 192.168.0.1 is via IP route directly towards node1 from node4)
```
ping 192.168.0.11
```

- tcpdump on interface to capture ping packets wrapped within an MPLS
  header which in turn is wrapped within an IP6GRE header

```
16:43:41.121073 IP6
  2401:db00:21:6048:feed::1 > 2401:db00:21:6048:feed::2:
  DSTOPT GREv0, key=0x1, length 100:
  MPLS (label 32, exp 0, ttl 255) (label 64, exp 0, [S], ttl 255)
  IP 192.168.0.1 > 192.168.0.11:
  ICMP echo request, id 1208, seq 45, length 64

0x:  6000 2cdb 006c 3c3f 2401 db00 0021 6048  `.,..l
---
 net/mpls/af_mpls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..aeb5bf2f7595 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1533,10 +1533,11 @@ static int mpls_dev_notify(struct notifier_block *this, 
unsigned long event,
unsigned int flags;

if (event == NETDEV_REGISTER) {
-   /* For now just support Ethernet, IPGRE, SIT and IPIP devices */
+/* For now just support Ethernet, IPGRE, IP6GRE, SIT and IPIP devices */
if (dev->type == ARPHRD_ETHER ||
dev->type == ARPHRD_LOOPBACK ||
dev->type == ARPHRD_IPGRE ||
+   dev->type == ARPHRD_IP6GRE ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_TUNNEL) {
mdev = mpls_add_dev(dev);
--
2.13.5



[PATCH] mpls: allow routes on ip6gre devices

2018-09-19 Thread Saif Hasan
Summary:

This appears to be necessary and sufficient change to enable `MPLS` on
`ip6gre` tunnels (RFC4023).

This diff allows IP6GRE devices to be recognized by MPLS kernel module
and hence user can configure interface to accept packets with mpls
headers as well setup mpls routes on them.

Test Plan:

Test plan consists of multiple containers connected via GRE-V6 tunnel.
Then carrying out testing steps as below.

- Carry out necessary sysctl settings on all containers

```
sysctl -w net.mpls.platform_labels=65536
sysctl -w net.mpls.ip_ttl_propagate=1
sysctl -w net.mpls.conf.lo.input=1
```

- Establish IP6GRE tunnels

```
ip -6 tunnel add name if_1_2_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::2 key 1
ip link set dev if_1_2_1 up
sysctl -w net.mpls.conf.if_1_2_1.input=1
ip -4 addr add 169.254.0.2/31 dev if_1_2_1 scope link

ip -6 tunnel add name if_1_3_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::3 key 1
ip link set dev if_1_3_1 up
sysctl -w net.mpls.conf.if_1_3_1.input=1
ip -4 addr add 169.254.0.4/31 dev if_1_3_1 scope link
```

- Install MPLS encap rules on node-1 towards node-2

```
ip route add 192.168.0.11/32 nexthop encap mpls 32/64 \
  via inet 169.254.0.3 dev if_1_2_1
```

- Install MPLS forwarding rules on node-2 and node-3
```
// node2
ip -f mpls route add 32 via inet 169.254.0.7 dev if_2_4_1

// node3
ip -f mpls route add 64 via inet 169.254.0.12 dev if_4_3_1
```

- Ping 192.168.0.11 (node4) from 192.168.0.1 (node1) (where routing
  towards 192.168.0.1 is via IP route directly towards node1 from node4)
```
ping 192.168.0.11
```

- tcpdump on interface to capture ping packets wrapped within an MPLS
  header which in turn is wrapped within an IP6GRE header

```
16:43:41.121073 IP6
  2401:db00:21:6048:feed::1 > 2401:db00:21:6048:feed::2:
  DSTOPT GREv0, key=0x1, length 100:
  MPLS (label 32, exp 0, ttl 255) (label 64, exp 0, [S], ttl 255)
  IP 192.168.0.1 > 192.168.0.11:
  ICMP echo request, id 1208, seq 45, length 64

0x:  6000 2cdb 006c 3c3f 2401 db00 0021 6048  `.,..l
---
 net/mpls/af_mpls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..aeb5bf2f7595 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1533,10 +1533,11 @@ static int mpls_dev_notify(struct notifier_block *this, 
unsigned long event,
unsigned int flags;

if (event == NETDEV_REGISTER) {
-   /* For now just support Ethernet, IPGRE, SIT and IPIP devices */
+/* For now just support Ethernet, IPGRE, IP6GRE, SIT and IPIP devices */
if (dev->type == ARPHRD_ETHER ||
dev->type == ARPHRD_LOOPBACK ||
dev->type == ARPHRD_IPGRE ||
+   dev->type == ARPHRD_IP6GRE ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_TUNNEL) {
mdev = mpls_add_dev(dev);
--
2.13.5



Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Johannes Berg
On Wed, 2018-09-19 at 15:46 -0300, Marcelo Ricardo Leitner wrote:

> > NL_SET_ERR_MSG(extack, "warning: deprecated command");
> > err = nla_parse(..., extack);
> > if (err)
> > return err;
> > /* do something */
> > return 0;
> > 
> > Here you could consider the message there a warning that's transported
> > out even if we return 0, but if we return with a failure from
> > nla_parse() (or nla_validate instead if you wish), then that failure
> > message "wins".
> 
> Agree. This is the core issue here, IMHO. Once out of the context that
> set the message, we have no way of knowing if we can nor should
> overwrite the message that is already in there.

True.

I'm not really sure I'd go as far as calling it an issue though.

IMHO, we really get into this situation if the code is badly structured
to start with. Taking the above code, if you write it as

err = nla_parse(...);
if (err)
return err;
/* do something */
NLA_SET_ERR_MSG(extack, "warning: deprecated command");
return 0;

instead, then you have no such problems.

Well, perhaps you do, if "/* do something */" might *also* set the
message, but basically you have to statically decide which one wins by
ordering the code correctly.

I'm still not convinced, btw, that we actually have any instance in the
kernel today of the issue we've been discussing - namely that some code
does something like the original quoted code at the top of the email.

> > I suppose we could - technically - make that generic, in that we could
> > have both
> > 
> >   NLA_SET_WARN_MSG(extack, "...");
> >   NLA_SET_ERR_MSG(extack, "...");
> 
> I like this.

I'm not really sure what for though :-)

FWIW, if you do think that there's a need for distinguishing this, then
I'd argue that perhaps the right way to address this would be to extend
this all the way to userspace and have two separate attributes for
errors and warnings in the extended ACK message?

> > and keep track of warning vs. error; however, just like my first version
> > of the NLA_REJECT patch, that would break existing code.
> 
> Hm, I may have missed something but I think the discussion in there
> was for a different context. For an extack msg to be set by
> when validate_nla() call returns on nla_parse(), the previous message
> had to be a "warning" because otherwise the parsing wouldn't be even
> attempted. So in that case, we are safe to simply overwrite it.

Yes, arguably, if you really had an "error" then you'd probably have
never gotten to the parsing. It's possible - although sort of stupid -
to write this code too though:

NL_SET_ERR_MSG(extack, "error: unsupported command");
err = nla_parse(...);
return err ? : -EOPNOTSUPP;

Which one should win in that case?

Again, in my opinion we've got enough flexibility if we let
nla_parse/nla_validate behave as today (mostly, nla_validate doesn't set
a message and I'd like to change that, it does set the error attribute
pointer/offset) and let the calling code sort it out.

In many cases nla_parse() will be called by generic code (e.g.
genetlink) and you'd never get into this situation. Once you're past
that, you  can do whatever you like.

> While for the situation you are describing here, it will set a generic
> error message in case the inner code didn't do it.

Yes, but that's not a change in semantics as far as the caller of
nla_parse/nla_validate is concerned - it'll continue to unconditionally
set a message if an error occurred, only the internal behaviour as to
which message seems more relevant is at issue, and the whole recursion
thing and avoiding an outer one overwriting an inner one is IMHO more
useful because that's the more specific error.

> Using the semantics of NLA_SET_WARN_MSG and ERR, then WARN would
> replace other WARNs but not ERRs, and ERR would replace other WARNs
> too but not other ERRs. All we need to do handle this is a bit in
> extack saying if the message is considered a warning or not, or an
> error/fatal message or not.

I'm still not really sure what the use case for a warning is, so not
sure I can really comment on this.

> Okay but we have split parsing of netlink messages and this could be
> useful in there too:
> In cls_api.c, tc_new_tfilter() calls nmmsg_parse() and do some
> processing, and then handle it to a classifier. cls_flower, for
> example, will then do another parsing. If, for whatever reason, flower
> failed and did not set an extack msg, tc_new_tfilter() could set a
> default error message, but at this moment we can't tell if the msg
> already set was just a warning from the first parsing (or any other
> code before engaging flower) (which we can overwrite) or if it a
> message that flower set (which we should not overwrite).
> 
> Hopefully my description clear.. 8-)
> 
> I think this is the same situation as with the nested parsing you're
> proposing.

Yes, I admit that's the same, just not part of pure policy checking, but
open-coded

[no subject]

2018-09-19 Thread Saif Hasan
>From e4f144286efe0f298c11efe58e17b1ab91c7ee3f Mon Sep 17 00:00:00 2001
From: Saif Hasan 
Date: Mon, 17 Sep 2018 16:28:54 -0700
Subject: [PATCH] mpls: allow routes on ip6gre devices

Summary:

This appears to be necessary and sufficient change to enable `MPLS` on
`ip6gre` tunnels (RFC4023).

This diff allows IP6GRE devices to be recognized by MPLS kernel module
and hence user can configure interface to accept packets with mpls
headers as well setup mpls routes on them.

Test Plan:

Test plan consists of multiple containers connected via GRE-V6 tunnel.
Then carrying out testing steps as below.

- Carry out necessary sysctl settings on all containers

```
sysctl -w net.mpls.platform_labels=65536
sysctl -w net.mpls.ip_ttl_propagate=1
sysctl -w net.mpls.conf.lo.input=1
```

- Establish IP6GRE tunnels

```
ip -6 tunnel add name if_1_2_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::2 key 1
ip link set dev if_1_2_1 up
sysctl -w net.mpls.conf.if_1_2_1.input=1
ip -4 addr add 169.254.0.2/31 dev if_1_2_1 scope link

ip -6 tunnel add name if_1_3_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::3 key 1
ip link set dev if_1_3_1 up
sysctl -w net.mpls.conf.if_1_3_1.input=1
ip -4 addr add 169.254.0.4/31 dev if_1_3_1 scope link
```

- Install MPLS encap rules on node-1 towards node-2

```
ip route add 192.168.0.11/32 nexthop encap mpls 32/64 \
  via inet 169.254.0.3 dev if_1_2_1
```

- Install MPLS forwarding rules on node-2 and node-3
```
// node2
ip -f mpls route add 32 via inet 169.254.0.7 dev if_2_4_1

// node3
ip -f mpls route add 64 via inet 169.254.0.12 dev if_4_3_1
```

- Ping 192.168.0.11 (node4) from 192.168.0.1 (node1) (where routing
  towards 192.168.0.1 is via IP route directly towards node1 from node4)
```
ping 192.168.0.11
```

- tcpdump on interface to capture ping packets wrapped within MPLS
  header, which in turn is wrapped within an IP6GRE header

```
16:43:41.121073 IP6
  2401:db00:21:6048:feed::1 > 2401:db00:21:6048:feed::2:
  DSTOPT GREv0, key=0x1, length 100:
  MPLS (label 32, exp 0, ttl 255) (label 64, exp 0, [S], ttl 255)
  IP 192.168.0.1 > 192.168.0.11:
  ICMP echo request, id 1208, seq 45, length 64

0x:  6000 2cdb 006c 3c3f 2401 db00 0021 6048  `.,..l
---
 net/mpls/af_mpls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..aeb5bf2f7595 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1533,10 +1533,11 @@ static int mpls_dev_notify(struct
notifier_block *this, unsigned long event,
unsigned int flags;

if (event == NETDEV_REGISTER) {
-   /* For now just support Ethernet, IPGRE, SIT and IPIP devices */
+/* For now just support Ethernet, IPGRE, IP6GRE, SIT and IPIP devices */
if (dev->type == ARPHRD_ETHER ||
dev->type == ARPHRD_LOOPBACK ||
dev->type == ARPHRD_IPGRE ||
+   dev->type == ARPHRD_IP6GRE ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_TUNNEL) {
mdev = mpls_add_dev(dev);
--
2.13.5


[PATCH net] mpls: allow routes on ip6gre devices

2018-09-19 Thread Saif Hasan
>From e4f144286efe0f298c11efe58e17b1ab91c7ee3f Mon Sep 17 00:00:00 2001
From: Saif Hasan 
Date: Mon, 17 Sep 2018 16:28:54 -0700
Subject: [PATCH] mpls: allow routes on ip6gre devices

Summary:

This appears to be necessary and sufficient change to enable `MPLS` on
`ip6gre` tunnels (RFC4023).

This diff allows IP6GRE devices to be recognized by MPLS kernel module
and hence user can configure interface to accept packets with mpls
headers as well setup mpls routes on them.

Test Plan:

Test plan consists of multiple containers connected via GRE-V6 tunnel.
Then carrying out testing steps as below.

- Carry out necessary sysctl settings on all containers

```
sysctl -w net.mpls.platform_labels=65536
sysctl -w net.mpls.ip_ttl_propagate=1
sysctl -w net.mpls.conf.lo.input=1
```

- Establish IP6GRE tunnels

```
ip -6 tunnel add name if_1_2_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::2 key 1
ip link set dev if_1_2_1 up
sysctl -w net.mpls.conf.if_1_2_1.input=1
ip -4 addr add 169.254.0.2/31 dev if_1_2_1 scope link

ip -6 tunnel add name if_1_3_1 mode ip6gre \
  local 2401:db00:21:6048:feed:0::1 \
  remote 2401:db00:21:6048:feed:0::3 key 1
ip link set dev if_1_3_1 up
sysctl -w net.mpls.conf.if_1_3_1.input=1
ip -4 addr add 169.254.0.4/31 dev if_1_3_1 scope link
```

- Install MPLS encap rules on node-1 towards node-2

```
ip route add 192.168.0.11/32 nexthop encap mpls 32/64 \
  via inet 169.254.0.3 dev if_1_2_1
```

- Install MPLS forwarding rules on node-2 and node-3
```
// node2
ip -f mpls route add 32 via inet 169.254.0.7 dev if_2_4_1

// node3
ip -f mpls route add 64 via inet 169.254.0.12 dev if_4_3_1
```

- Ping 192.168.0.11 (node4) from 192.168.0.1 (node1) (where routing
  towards 192.168.0.1 is via IP route directly towards node1 from node4)
```
ping 192.168.0.11
```

- tcpdump on interface to capture ping packets wrapped within MPLS
  header, which in turn is wrapped within an IP6GRE header

```
16:43:41.121073 IP6
  2401:db00:21:6048:feed::1 > 2401:db00:21:6048:feed::2:
  DSTOPT GREv0, key=0x1, length 100:
  MPLS (label 32, exp 0, ttl 255) (label 64, exp 0, [S], ttl 255)
  IP 192.168.0.1 > 192.168.0.11:
  ICMP echo request, id 1208, seq 45, length 64

0x:  6000 2cdb 006c 3c3f 2401 db00 0021 6048  `.,..l
---
 net/mpls/af_mpls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 7a4de6d618b1..aeb5bf2f7595 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1533,10 +1533,11 @@ static int mpls_dev_notify(struct
notifier_block *this, unsigned long event,
unsigned int flags;

if (event == NETDEV_REGISTER) {
-   /* For now just support Ethernet, IPGRE, SIT and IPIP devices */
+/* For now just support Ethernet, IPGRE, IP6GRE, SIT and IPIP devices */
if (dev->type == ARPHRD_ETHER ||
dev->type == ARPHRD_LOOPBACK ||
dev->type == ARPHRD_IPGRE ||
+   dev->type == ARPHRD_IP6GRE ||
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_TUNNEL) {
mdev = mpls_add_dev(dev);
--
2.13.5


[PATCH net-next 1/2] r8169: simplify RTL8169 PHY initialization

2018-09-19 Thread Heiner Kallweit
PCI_LATENCY_TIMER is ignored on PCIe, therefore we have to do this
for the PCI chips (version <= 06) only. Also we can move setting
PCI_CACHE_LINE_SIZE.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index 7f4647c58..a27bf5807 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4046,16 +4046,13 @@ static void rtl8169_init_phy(struct net_device *dev, 
struct rtl8169_private *tp)
rtl_hw_phy_config(dev);
 
if (tp->mac_version <= RTL_GIGA_MAC_VER_06) {
+   pci_write_config_byte(tp->pci_dev, PCI_LATENCY_TIMER, 0x40);
+   pci_write_config_byte(tp->pci_dev, PCI_CACHE_LINE_SIZE, 0x08);
netif_dbg(tp, drv, dev,
  "Set MAC Reg C+CR Offset 0x82h = 0x01h\n");
RTL_W8(tp, 0x82, 0x01);
}
 
-   pci_write_config_byte(tp->pci_dev, PCI_LATENCY_TIMER, 0x40);
-
-   if (tp->mac_version <= RTL_GIGA_MAC_VER_06)
-   pci_write_config_byte(tp->pci_dev, PCI_CACHE_LINE_SIZE, 0x08);
-
if (tp->mac_version == RTL_GIGA_MAC_VER_02) {
netif_dbg(tp, drv, dev,
  "Set MAC Reg C+CR Offset 0x82h = 0x01h\n");
-- 
2.19.0



[PATCH net-next 2/2] r8169: remove duplicated RTL8169s PHY initialization steps

2018-09-19 Thread Heiner Kallweit
Setting register 0x82 to value 01 is done a few lines before for all
chip versions <= 06 anyway. And setting PHY register 0x0b to value 00
is done at the end of rtl8169s_hw_phy_config() already. So we can
remove this.

Signed-off-by: Heiner Kallweit 
---
 drivers/net/ethernet/realtek/r8169.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c 
b/drivers/net/ethernet/realtek/r8169.c
index a27bf5807..e2713867c 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -4053,15 +4053,6 @@ static void rtl8169_init_phy(struct net_device *dev, 
struct rtl8169_private *tp)
RTL_W8(tp, 0x82, 0x01);
}
 
-   if (tp->mac_version == RTL_GIGA_MAC_VER_02) {
-   netif_dbg(tp, drv, dev,
- "Set MAC Reg C+CR Offset 0x82h = 0x01h\n");
-   RTL_W8(tp, 0x82, 0x01);
-   netif_dbg(tp, drv, dev,
- "Set PHY Reg 0x0bh = 0x00h\n");
-   rtl_writephy(tp, 0x0b, 0x); //w 0x0b 15 0 0
-   }
-
/* We may have called phy_speed_down before */
phy_speed_up(dev->phydev);
 
-- 
2.19.0




[PATCH net] xfrm: validate template mode

2018-09-19 Thread Sean Tranchetti
XFRM mode parameters passed as part of the user templates
in the IP_XFRM_POLICY are never properly validated. Passing
values other than valid XFRM modes can cause stack-out-of-bounds
reads to occur later in the XFRM processing:

[  140.535608] 
[  140.543058] BUG: KASAN: stack-out-of-bounds in xfrm_state_find+0x17e4/0x1cc4
[  140.550306] Read of size 4 at addr ffc0238a7a58 by task repro/5148
[  140.557369]
[  140.558927] Call trace:
[  140.558936] dump_backtrace+0x0/0x388
[  140.558940] show_stack+0x24/0x30
[  140.558946] __dump_stack+0x24/0x2c
[  140.558949] dump_stack+0x8c/0xd0
[  140.558956] print_address_description+0x74/0x234
[  140.558960] kasan_report+0x240/0x264
[  140.558963] __asan_report_load4_noabort+0x2c/0x38
[  140.558967] xfrm_state_find+0x17e4/0x1cc4
[  140.558971] xfrm_resolve_and_create_bundle+0x40c/0x1fb8
[  140.558975] xfrm_lookup+0x238/0x1444
[  140.558977] xfrm_lookup_route+0x48/0x11c
[  140.558984] ip_route_output_flow+0x88/0xc4
[  140.558991] raw_sendmsg+0xa74/0x266c
[  140.558996] inet_sendmsg+0x258/0x3b0
[  140.559002] sock_sendmsg+0xbc/0xec
[  140.559005] SyS_sendto+0x3a8/0x5a8
[  140.559008] el0_svc_naked+0x34/0x38
[  140.559009]
[  140.592245] page dumped because: kasan: bad access detected
[  140.597981] page_owner info is not active (free page?)
[  140.603267]
[  140.653503] 

Signed-off-by: Sean Tranchetti 
---
 net/xfrm/xfrm_user.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 4791aa8..ab2f280 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1480,6 +1480,9 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl 
*ut, u16 family)
(ut[i].family != prev_family))
return -EINVAL;
 
+   if (ut[i].mode >= XFRM_MODE_MAX)
+   return -EINVAL;
+
prev_family = ut[i].family;
 
switch (ut[i].family) {
-- 
1.9.1



Re: Bridge connectivity interruptions while devices join or leave the bridge

2018-09-19 Thread Johannes Wienke
On 19.09.18 19:03, Stephen Hemminger wrote:
> On Wed, 19 Sep 2018 19:45:08 +0300
> Ido Schimmel  wrote:
> 
>> On Wed, Sep 19, 2018 at 01:00:15PM +0200, Johannes Wienke wrote:
>>> This behavior of inheriting the mac address is really unexpected to us.
>>> Is it documented somewhere?  
>>
>> Not that I'm aware, but it's a well established behavior.
> 
> Not documented, has always been that way. It seems to be part of 802 standard 
> maybe?
> Anyway, if you set a MAC address of the bridge device it makes it sticky;
> i.e it won't change if ports of bridge change.

Yes, sure. It was just unexpected to encounter this, and we had no clue
that this could be a reasonable default. I just wonder what the motivation
for that is; of course, some more obvious documentation would have
helped a lot in debugging/avoiding this.

Cheers,
Johannes
-- 
Johannes Wienke, Researcher at CoR-Lab / CITEC, Bielefeld University
Address: Inspiration 1, D-33619 Bielefeld, Germany (Room 1.307)
Phone: +49 521 106-67277


Re: [PATCH mlx5-next 03/25] net/mlx5: Set uid as part of RQ commands

2018-09-19 Thread Jason Gunthorpe
On Wed, Sep 19, 2018 at 11:40:45AM -0700, Saeed Mahameed wrote:
> On Wed, Sep 19, 2018 at 10:28 AM Jason Gunthorpe  wrote:
> >
> > On Mon, Sep 17, 2018 at 02:03:56PM +0300, Leon Romanovsky wrote:
> > > From: Yishai Hadas 
> > >
> > > Set uid as part of RQ commands so that the firmware can manage the
> > > RQ object in a secured way.
> > >
> > > That will enable using an RQ that was created by verbs application
> > > to be used by the DEVX flow in case the uid is equal.
> > >
> > > Signed-off-by: Yishai Hadas 
> > > Signed-off-by: Leon Romanovsky 
> > >  drivers/net/ethernet/mellanox/mlx5/core/qp.c | 16 ++--
> > >  include/linux/mlx5/mlx5_ifc.h|  6 +++---
> > >  2 files changed, 17 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c 
> > > b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
> > > index 04f72a1cdbcc..0ca68ef54d93 100644
> > > +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
> > > @@ -540,6 +540,17 @@ int mlx5_core_xrcd_dealloc(struct mlx5_core_dev 
> > > *dev, u32 xrcdn)
> > >  }
> > >  EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
> > >
> > > +static void destroy_rq_tracked(struct mlx5_core_dev *dev, u32 rqn, u16 
> > > uid)
> > > +{
> > > + u32 in[MLX5_ST_SZ_DW(destroy_rq_in)]   = {0};
> > > + u32 out[MLX5_ST_SZ_DW(destroy_rq_out)] = {0};
> >
> > = {} is the preferred version of this, right?
> >
> > {0} explicitly initializes the first element to zero and only works if
> > the first element happens to be something integral..
> >
> 
> Both are perfectly ok in our scenarios.
> I remember one of the syntaxes yielded a static checker warning, I
> don't remember which syntax and which static checker :) ..

{0} will throw a 'missing-field-initializers' compiler warning,
however it recognizes '= {}' as an idom meaning 'zero everything' and
does not throw a warning.

Jason


Re: [RFC 4/5] netlink: prepare validate extack setting for recursion

2018-09-19 Thread Marcelo Ricardo Leitner
On Wed, Sep 19, 2018 at 09:19:31PM +0200, Johannes Berg wrote:
> On Wed, 2018-09-19 at 15:46 -0300, Marcelo Ricardo Leitner wrote:
> 
> > >   NL_SET_ERR_MSG(extack, "warning: deprecated command");
> > >   err = nla_parse(..., extack);
> > >   if (err)
> > >   return err;
> > >   /* do something */
> > >   return 0;
> > > 
> > > Here you could consider the message there a warning that's transported
> > > out even if we return 0, but if we return with a failure from
> > > nla_parse() (or nla_validate instead if you wish), then that failure
> > > message "wins".
> > 
> > Agree. This is the core issue here, IMHO. Once out of the context that
> > set the message, we have no way of knowing if we can nor should
> > overwrite the message that is already in there.
> 
> True.
> 
> I'm not really sure I'd go as far as calling it an issue though.

Ok, s/issue/matter in question/.

...
> > > I suppose we could - technically - make that generic, in that we could
> > > have both
> > > 
> > >   NLA_SET_WARN_MSG(extack, "...");
> > >   NLA_SET_ERR_MSG(extack, "...");
> > 
> > I like this.
> 
> I'm not really sure what for though :-)

Hehe :-)

> 
> FWIW, if you do think that there's a need for distinguishing this, then
> I'd argue that perhaps the right way to address this would be to extend
> this all the way to userspace and have two separate attributes for
> errors and warnings in the extended ACK message?

Likely, yes. And perhaps even support multiple messages. That way we
could, for example, parse all attributes and return a list of all
the offending ones, instead of returning one at a time. Net benefit?
Not clear.. over engineering? Possibly.

I agree with you that in general we should not need this.

> > While for the situation you are describing here, it will set a generic
> > error message in case the inner code didn't do it.
> 
> Yes, but that's not a change in semantics as far as the caller of
> nla_parse/nla_validate is concerned - it'll continue to unconditionally
> set a message if an error occurred, only the internal behaviour as to
> which message seems more relevant is at issue, and the whole recursion
> thing and avoiding an outer one overwriting an inner one is IMHO more
> useful because that's the more specific error.

That's fine. My point here was only to clarify the use case, which
would be used in other places too (like in the example below).

> 
> > Using the semantics of NLA_SET_WARN_MSG and ERR, then WARN would
> > replace other WARNs but not ERRs, and ERR would replace other WARNs
> > too but not other ERRs. All we need to do handle this is a bit in
> > extack saying if the message is considered a warning or not, or an
> > error/fatal message or not.
> 
> I'm still not really sure what the use case for a warning is, so not
> sure I can really comment on this.

Good point. From iproute POV, a warning is a non-fatal message. From
a user's perspective, that probably translates to nothing, because in
the end the command actually worked. :-)

Seriously, I do think it's more of a hint for developers than anyone
else.

> 
> > Okay but we have split parsing of netlink messages and this could be
> > useful in there too:
> > In cls_api.c, tc_new_tfilter() calls nmmsg_parse() and do some
> > processing, and then handle it to a classifier. cls_flower, for
> > example, will then do another parsing. If, for whatever reason, flower
> > failed and did not set an extack msg, tc_new_tfilter() could set a
> > default error message, but at this moment we can't tell if the msg
> > already set was just a warning from the first parsing (or any other
> > code before engaging flower) (which we can overwrite) or if it is a
> > message that flower set (which we should not overwrite).
> > 
> > Hopefully my description is clear.. 8-)
> > 
> > I think this is the same situation as with the nested parsing you're
> > proposing.
> 
> Yes, I admit that's the same, just not part of pure policy checking, but
> open-coded (in a way).
> 
> > Currently it (tc_new_tfilter) doesn't set any default error message,
> > so this issue is/was not noticed.
> 
> True.
> 
> Except that I'd still say that tc_new_tfilter() can very well assume
> that nothing has set a message before it ran, it's invoked directly by
> the core netlink code after all.
> 
> So IMHO we don't have an issue here because there aren't arbitrary
> callers of this and it can't know what the state is; it does in fact
> know very well what the state is when it's called.

Good point.

> 
> With nla_validate/nla_parse that have far more callers we can't be as
> certain, although again - I doubt we actually have places today that
> would run into this situation (given how all this stuff is still fairly
> new).
> 
> > > From an external API POV though, nla_validate/nla_parse will continue to
> > > unconditionally overwrite any existing "warning" messages with errors,
> > > if such occurred. They just won't overwrite their own messages when
> > > returning from a nested pol

Re: [Patch net-next] ipv4: initialize ra_mutex in inet_init_net()

2018-09-19 Thread Cong Wang
On Wed, Sep 19, 2018 at 1:25 AM Kirill Tkhai  wrote:
>
> On 18.09.2018 23:17, Cong Wang wrote:
> > On Mon, Sep 17, 2018 at 12:25 AM Kirill Tkhai  wrote:
> >> In inet_init() the order of registration is:
> >>
> >> ip_mr_init();
> >> init_inet_pernet_ops();
> >>
> >> This means, ipmr_net_ops pernet operations are before af_inet_ops
> >> in pernet_list. So, there is a theoretical probability, sometimes
> >> in the future, we will have a problem during a fail of net initialization.
> >>
> >> Say,
> >>
> >> setup_net():
> >> ipmr_net_ops->init() returns 0
> >> xxx->init()  returns error
> >> and then we do:
> >> ipmr_net_ops->exit(),
> >>
> >> which could touch ra_mutex (theoretically).
> >
> > How could ra_mutex be touched in this scenario?
> >
> > ra_mutex is only used in ip_ra_control() which is called
> > only by {get,set}sockopt(). I don't see anything related
> > to netns exit() path here.
>
> Currently, it is not touched. But it's an ordinary practice,
> someone closes sockets in pernet ->exit methods. For example,
> we close percpu icmp sockets in icmp_sk_exit(), which are
> also of RAW type, and there is also called ip_ra_control()
> for them. Yes, they differ by their protocol; icmp sockets
> are of IPPROTO_ICMP protocol, while ip_ra_control() acts
> on IPPROTO_RAW sockets, but it's not good anyway. This does
> not look reliable for the future. In case someone changes
> something here, we may not notice it for a long time,
> while some users will hit bugs in their setups.

First of all, we only consider current code base. Even if you
really planned to changed this in the future, it would be still your
responsibility to take care of it. Why? Simple, FIFO. My patch
comes ahead of any future changes here, obviously.

Secondly, it is certainly not hard to notice. I am pretty sure
you would get a warning/crash if this bug triggered.



>
> Problems on error paths is not easy to detect on testing,
> while user may meet them. We had issue of same type with
> uninitialized xfrm_policy_lock. It was introduced in 2013,
> while the problem was found only in 2017:

Been there, done that, I've fixed multiple IPv6 init code
error path. This is not where we disagree, by the way.


>
> introduced by 283bc9f35bbb
> fixed  by c28a45cb
>
> (Last week I met it on RH7 kernel, which still has no a fix.
>  But this talk is not about distribution kernels, just about
>  the way).
>
> I just want to say that if someone builds something creative on top
> of this code, it would be more friendly of us towards him/her
> to not force this person to think about such non-obvious details,
> but just to implement a nice architecture right now.

You keep saying nice architecture, how did you architect
ipv4.ra_mutex into net/core/net_namespace.c? It is apparently
not nice.

Good luck with your future.

I am tired, let's just drop it. I have no interest to fix your mess.

Thanks!


[PATCH net-next] nfp: provide a better warning when ring allocation fails

2018-09-19 Thread Jakub Kicinski
NFP supports fairly enormous ring sizes (up to 256k descriptors).
In commit 466271703867 ("nfp: use kvcalloc() to allocate SW buffer
descriptor arrays") we have started using kvcalloc() functions to
make sure the allocation of software state arrays doesn't hit
the MAX_ORDER limit.  Unfortunately, we can't use virtual mappings
for the DMA region holding HW descriptors.  In case this allocation
fails instead of the generic (and fairly scary) warning/splat in
the logs print a helpful message explaining what happened and
suggesting how to fix it.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Dirk van der Merwe 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c  | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1aac55d66404..6dab45c1f21e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2180,9 +2180,13 @@ nfp_net_tx_ring_alloc(struct nfp_net_dp *dp, struct 
nfp_net_tx_ring *tx_ring)
 
tx_ring->size = array_size(tx_ring->cnt, sizeof(*tx_ring->txds));
tx_ring->txds = dma_zalloc_coherent(dp->dev, tx_ring->size,
-   &tx_ring->dma, GFP_KERNEL);
-   if (!tx_ring->txds)
+   &tx_ring->dma,
+   GFP_KERNEL | __GFP_NOWARN);
+   if (!tx_ring->txds) {
+   netdev_warn(dp->netdev, "failed to allocate TX descriptor ring 
memory, requested descriptor count: %d, consider lowering descriptor count\n",
+   tx_ring->cnt);
goto err_alloc;
+   }
 
tx_ring->txbufs = kvcalloc(tx_ring->cnt, sizeof(*tx_ring->txbufs),
   GFP_KERNEL);
@@ -2334,9 +2338,13 @@ nfp_net_rx_ring_alloc(struct nfp_net_dp *dp, struct 
nfp_net_rx_ring *rx_ring)
rx_ring->cnt = dp->rxd_cnt;
rx_ring->size = array_size(rx_ring->cnt, sizeof(*rx_ring->rxds));
rx_ring->rxds = dma_zalloc_coherent(dp->dev, rx_ring->size,
-   &rx_ring->dma, GFP_KERNEL);
-   if (!rx_ring->rxds)
+   &rx_ring->dma,
+   GFP_KERNEL | __GFP_NOWARN);
+   if (!rx_ring->rxds) {
+   netdev_warn(dp->netdev, "failed to allocate RX descriptor ring 
memory, requested descriptor count: %d, consider lowering descriptor count\n",
+   rx_ring->cnt);
goto err_alloc;
+   }
 
rx_ring->rxbufs = kvcalloc(rx_ring->cnt, sizeof(*rx_ring->rxbufs),
   GFP_KERNEL);
-- 
2.17.1



Re: [PATCH net-next v2 01/10] net: core: netlink: add helper refcount dec and lock function

2018-09-19 Thread Cong Wang
On Mon, Sep 17, 2018 at 12:19 AM Vlad Buslov  wrote:
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -130,6 +130,12 @@ int rtnl_is_locked(void)
>  }
>  EXPORT_SYMBOL(rtnl_is_locked);
>
> +bool refcount_dec_and_rtnl_lock(refcount_t *r)
> +{
> +   return refcount_dec_and_mutex_lock(r, &rtnl_mutex);
> +}
> +EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);

I think you probably want to #include 
explicitly.


Re: [PATCH bpf-next] flow_dissector: fix build failure without CONFIG_NET

2018-09-19 Thread Daniel Borkmann
On 09/18/2018 10:20 PM, Willem de Bruijn wrote:
> From: Willem de Bruijn 
> 
> If boolean CONFIG_BPF_SYSCALL is enabled, kernel/bpf/syscall.c will
> call flow_dissector functions from net/core/flow_dissector.c.
> 
> This causes this build failure if CONFIG_NET is disabled:
> 
> kernel/bpf/syscall.o: In function `__x64_sys_bpf':
> syscall.c:(.text+0x3278): undefined reference to
> `skb_flow_dissector_bpf_prog_attach'
> syscall.c:(.text+0x3310): undefined reference to
> `skb_flow_dissector_bpf_prog_detach'
> kernel/bpf/syscall.o:(.rodata+0x3f0): undefined reference to
> `flow_dissector_prog_ops'
> kernel/bpf/verifier.o:(.rodata+0x250): undefined reference to
> `flow_dissector_verifier_ops'
> 
> Analogous to other optional BPF program types in syscall.c, add stubs
> if the relevant functions are not compiled and move the BPF_PROG_TYPE
> definition in the #ifdef CONFIG_NET block.
> 
> Fixes: d58e468b1112 ("flow_dissector: implements flow dissector BPF hook")
> Reported-by: Randy Dunlap 
> Signed-off-by: Willem de Bruijn 

Applied to bpf-next, thanks Willem!


Re: [PATCH bpf] tools: bpf: fix license for a compat header file

2018-09-19 Thread Daniel Borkmann
On 09/18/2018 07:13 PM, Jakub Kicinski wrote:
> libc_compat.h is used by libbpf so make sure it's licensed under
> LGPL or BSD license.  The license change should be OK, I'm the only
> author of the file.
> 
> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Quentin Monnet 

Applied to bpf, thanks Jakub!


Re: [PATCH net-next v2 05/10] net: sched: use Qdisc rcu API instead of relying on rtnl lock

2018-09-19 Thread Cong Wang
On Mon, Sep 17, 2018 at 12:19 AM Vlad Buslov  wrote:
> +static void tcf_qdisc_put(struct Qdisc *q, bool rtnl_held)
> +{
> +   if (!q)
> +   return;
> +
> +   if (rtnl_held)
> +   qdisc_put(q);
> +   else
> +   qdisc_put_unlocked(q);
> +}

This is very ugly. You should know whether RTNL is held or
not when calling it.

What's more, all of your code passes true, so why do you
need a parameter for rtnl_held?


Re: [PATCH net-next v2 08/10] net: sched: protect block idr with spinlock

2018-09-19 Thread Cong Wang
On Mon, Sep 17, 2018 at 12:19 AM Vlad Buslov  wrote:
> @@ -482,16 +483,25 @@ static int tcf_block_insert(struct tcf_block *block, 
> struct net *net,
> struct netlink_ext_ack *extack)
>  {
> struct tcf_net *tn = net_generic(net, tcf_net_id);
> +   int err;
> +
> +   idr_preload(GFP_KERNEL);
> +   spin_lock(&tn->idr_lock);
> +   err = idr_alloc_u32(&tn->idr, block, &block->index, block->index,
> +   GFP_NOWAIT);


Why GFP_NOWAIT rather than GFP_ATOMIC here?


[PATCH bpf-next 1/3] perf/core: introduce perf_event_mmap_bpf_prog

2018-09-19 Thread Alexei Starovoitov
introduce perf_event_mmap_bpf_prog() helper to emit RECORD_MMAP events
into perf ring buffer.
It's used by bpf load/unload logic to notify user space of addresses
and names of JITed bpf programs.

Note that event->mmap.pid == -1 is an existing indicator of kernel event.
In addition use event->mmap.tid == BPF_FS_MAGIC to indicate bpf related
RECORD_MMAP event.

Alternatively it's possible to introduce new 'enum perf_event_type' command
specificially for bpf prog load/unload, but existing RECORD_MMAP
is very close, so the choice made by this patch is to extend it.

Signed-off-by: Alexei Starovoitov 
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c   | 44 +-
 2 files changed, 40 insertions(+), 5 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 53c500f0ca79..0e79af83138f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1113,6 +1113,7 @@ static inline void perf_event_task_sched_out(struct 
task_struct *prev,
 }
 
 extern void perf_event_mmap(struct vm_area_struct *vma);
+void perf_event_mmap_bpf_prog(u64 start, u64 len, char *name, int size);
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks 
*callbacks);
 extern int perf_unregister_guest_info_callbacks(struct 
perf_guest_info_callbacks *callbacks);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2a62b96600ad..c48244ddf993 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7152,7 +7152,7 @@ static int perf_event_mmap_match(struct perf_event *event,
 {
struct perf_mmap_event *mmap_event = data;
struct vm_area_struct *vma = mmap_event->vma;
-   int executable = vma->vm_flags & VM_EXEC;
+   int executable = !vma || vma->vm_flags & VM_EXEC;
 
return (!executable && event->attr.mmap_data) ||
   (executable && (event->attr.mmap || event->attr.mmap2));
@@ -7165,12 +7165,13 @@ static void perf_event_mmap_output(struct perf_event 
*event,
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
+   bool bpf_event = !mmap_event->vma;
int ret;
 
if (!perf_event_mmap_match(event, data))
return;
 
-   if (event->attr.mmap2) {
+   if (event->attr.mmap2 && !bpf_event) {
mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
mmap_event->event_id.header.size += sizeof(mmap_event->maj);
mmap_event->event_id.header.size += sizeof(mmap_event->min);
@@ -7186,12 +7187,14 @@ static void perf_event_mmap_output(struct perf_event 
*event,
if (ret)
goto out;
 
-   mmap_event->event_id.pid = perf_event_pid(event, current);
-   mmap_event->event_id.tid = perf_event_tid(event, current);
+   if (!bpf_event) {
+   mmap_event->event_id.pid = perf_event_pid(event, current);
+   mmap_event->event_id.tid = perf_event_tid(event, current);
+   }
 
perf_output_put(&handle, mmap_event->event_id);
 
-   if (event->attr.mmap2) {
+   if (event->attr.mmap2 && !bpf_event) {
perf_output_put(&handle, mmap_event->maj);
perf_output_put(&handle, mmap_event->min);
perf_output_put(&handle, mmap_event->ino);
@@ -7448,6 +7451,37 @@ void perf_event_mmap(struct vm_area_struct *vma)
perf_event_mmap_event(&mmap_event);
 }
 
+void perf_event_mmap_bpf_prog(u64 start, u64 len, char *name, int size)
+{
+   struct perf_mmap_event mmap_event;
+
+   if (!atomic_read(&nr_mmap_events))
+   return;
+
+   if (!IS_ALIGNED(size, sizeof(u64))) {
+   WARN_ONCE(1, "size is not aligned\n");
+   return;
+   }
+
+   mmap_event = (struct perf_mmap_event){
+   .file_name = name,
+   .file_size = size,
+   .event_id  = {
+   .header = {
+   .type = PERF_RECORD_MMAP,
+   .misc = PERF_RECORD_MISC_KERNEL,
+   .size = sizeof(mmap_event.event_id) + size,
+   },
+   .pid = -1, /* indicates kernel */
+   .tid = BPF_FS_MAGIC, /* bpf mmap event */
+   .start  = start,
+   .len= len,
+   .pgoff  = start,
+   },
+   };
+   perf_iterate_sb(perf_event_mmap_output, &mmap_event, NULL);
+}
+
 void perf_event_aux_event(struct perf_event *event, unsigned long head,
  unsigned long size, u64 flags)
 {
-- 
2.17.1



[PATCH perf 3/3] tools/perf: recognize and process RECORD_MMAP events for bpf progs

2018-09-19 Thread Alexei Starovoitov
Recognize JITed bpf prog load/unload events.
Add/remove kernel symbols accordingly.

Signed-off-by: Alexei Starovoitov 
---
 tools/perf/util/machine.c | 27 +++
 tools/perf/util/symbol.c  | 13 +
 tools/perf/util/symbol.h  |  1 +
 3 files changed, 41 insertions(+)

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index c4acd2001db0..ae4f8a0fdc7e 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -25,6 +25,7 @@
 #include "sane_ctype.h"
 #include 
 #include 
+#include 
 
 static void __machine__remove_thread(struct machine *machine, struct thread 
*th, bool lock);
 
@@ -1460,6 +1461,32 @@ static int machine__process_kernel_mmap_event(struct 
machine *machine,
enum dso_kernel_type kernel_type;
bool is_kernel_mmap;
 
+   /* process JITed bpf programs load/unload events */
+   if (event->mmap.pid == ~0u && event->mmap.tid == BPF_FS_MAGIC) {
+   struct symbol *sym;
+   u64 ip;
+
+   map = map_groups__find(&machine->kmaps, event->mmap.start);
+   if (!map) {
+   pr_err("No kernel map for IP %lx\n", event->mmap.start);
+   goto out_problem;
+   }
+   ip = event->mmap.start - map->start + map->pgoff;
+   if (event->mmap.filename[0]) {
+   sym = symbol__new(ip, event->mmap.len, 0, 0,
+ event->mmap.filename);
+   dso__insert_symbol(map->dso, sym);
+   } else {
+   if (symbols__erase(&map->dso->symbols, ip)) {
+   pr_err("No bpf prog at IP %lx/%lx\n",
+  event->mmap.start, ip);
+   goto out_problem;
+   }
+   dso__reset_find_symbol_cache(map->dso);
+   }
+   return 0;
+   }
+
/* If we have maps from kcore then we do not need or want any others */
if (machine__uses_kcore(machine))
return 0;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index d188b7588152..0653f313661d 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -353,6 +353,19 @@ static struct symbol *symbols__find(struct rb_root 
*symbols, u64 ip)
return NULL;
 }
 
+int symbols__erase(struct rb_root *symbols, u64 ip)
+{
+   struct symbol *s;
+
+   s = symbols__find(symbols, ip);
+   if (!s)
+   return -ENOENT;
+
+   rb_erase(&s->rb_node, symbols);
+   symbol__delete(s);
+   return 0;
+}
+
 static struct symbol *symbols__first(struct rb_root *symbols)
 {
struct rb_node *n = rb_first(symbols);
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index f25fae4b5743..92ef31953d9a 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -310,6 +310,7 @@ char *dso__demangle_sym(struct dso *dso, int kmodule, const 
char *elf_name);
 
 void __symbols__insert(struct rb_root *symbols, struct symbol *sym, bool 
kernel);
 void symbols__insert(struct rb_root *symbols, struct symbol *sym);
+int symbols__erase(struct rb_root *symbols, u64 ip);
 void symbols__fixup_duplicate(struct rb_root *symbols);
 void symbols__fixup_end(struct rb_root *symbols);
 void map_groups__fixup_end(struct map_groups *mg);
-- 
2.17.1



[PATCH bpf-next 2/3] bpf: emit RECORD_MMAP events for bpf prog load/unload

2018-09-19 Thread Alexei Starovoitov
use perf_event_mmap_bpf_prog() helper to notify user space
about JITed bpf programs.
Use RECORD_MMAP perf event to tell user space where JITed bpf program was 
loaded.
Use empty program name as unload indication.

Signed-off-by: Alexei Starovoitov 
---
 kernel/bpf/core.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3f5bf1af0826..ddf11fdafd36 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -384,7 +384,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
*symbol_end   = addr + hdr->pages * PAGE_SIZE;
 }
 
-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+static char *bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
const char *end = sym + KSYM_NAME_LEN;
 
@@ -402,9 +402,10 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, 
char *sym)
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
if (prog->aux->name[0])
-   snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+   sym += snprintf(sym, (size_t)(end - sym), "_%s", 
prog->aux->name);
else
*sym = 0;
+   return sym;
 }
 
 static __always_inline unsigned long
@@ -480,23 +481,40 @@ static bool bpf_prog_kallsyms_verify_off(const struct 
bpf_prog *fp)
 
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
+   unsigned long symbol_start, symbol_end;
+   char buf[KSYM_NAME_LEN], *sym;
+
if (!bpf_prog_kallsyms_candidate(fp) ||
!capable(CAP_SYS_ADMIN))
return;
 
+   bpf_get_prog_addr_region(fp, &symbol_start, &symbol_end);
+   sym = bpf_get_prog_name(fp, buf);
+   sym++; /* sym - buf is the length of the name including trailing 0 */
+   while (!IS_ALIGNED(sym - buf, sizeof(u64)))
+   *sym++ = 0;
spin_lock_bh(&bpf_lock);
bpf_prog_ksym_node_add(fp->aux);
spin_unlock_bh(&bpf_lock);
+   perf_event_mmap_bpf_prog(symbol_start, symbol_end - symbol_start,
+buf, sym - buf);
 }
 
 void bpf_prog_kallsyms_del(struct bpf_prog *fp)
 {
+   unsigned long symbol_start, symbol_end;
+   /* mmap_record.filename cannot be NULL and has to be u64 aligned */
+   char buf[sizeof(u64)] = {};
+
if (!bpf_prog_kallsyms_candidate(fp))
return;
 
spin_lock_bh(&bpf_lock);
bpf_prog_ksym_node_del(fp->aux);
spin_unlock_bh(&bpf_lock);
+   bpf_get_prog_addr_region(fp, &symbol_start, &symbol_end);
+   perf_event_mmap_bpf_prog(symbol_start, symbol_end - symbol_start,
+buf, sizeof(buf));
 }
 
 static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
-- 
2.17.1



[PATCH bpf-next 0/3] perf, bpf: reveal invisible bpf programs

2018-09-19 Thread Alexei Starovoitov
Hi All,

this patch set adds kernel and user space support to reveal
short lived bpf programs.
The kernel stores addr+bpf_prog_name information into perf ring buffer.
Later 'perf report' can properly attribute the cpu time to the program.

The following command was run before and after: 'perf record ./test_progs; perf 
report'

Before patch set:

# Overhead  Command Shared Object  Symbol   
  
#   ..  .  
...
#
10.73%  test_progs  [kernel.kallsyms]  [k] __htab_map_lookup_elem
 8.16%  test_progs  [kernel.kallsyms]  [k] bpf_skb_set_tunnel_key
 7.90%  test_progs  [kernel.kallsyms]  [k] memcmp
 3.10%  test_progs  [kernel.kallsyms]  [k] lookup_nulls_elem_raw
 2.57%  test_progs  [kernel.kallsyms]  [k] dst_release
 2.45%  test_progs  [kernel.kallsyms]  [k] do_check
 1.89%  test_progs  [kernel.kallsyms]  [k] bpf_xdp_adjust_head
 1.52%  test_progs  [kernel.kallsyms]  [k] 0x7fffa002cc5a
 1.22%  test_progs  [kernel.kallsyms]  [k] check_helper_mem_access
 0.99%  test_progs  [kernel.kallsyms]  [k] 0x7fffa001a6a8
 0.99%  test_progs  [kernel.kallsyms]  [k] kmem_cache_alloc_trace
 0.97%  test_progs  [kernel.kallsyms]  [k] 0x7fffa001a652
 0.95%  test_progs  [kernel.kallsyms]  [k] 0x7fffa0042d52
 0.95%  test_progs  [kernel.kallsyms]  [k] read_tsc
 0.81%  test_progs  [kernel.kallsyms]  [k] 0x7fffa001a660
 0.77%  test_progs  [kernel.kallsyms]  [k] 0x7fffa0042d69
 0.74%  test_progs  [kernel.kallsyms]  [k] percpu_array_map_lookup_elem
 0.69%  test_progs  [kernel.kallsyms]  [k] 0x7fffa001a64e

After:

# Overhead  Command Shared Object  Symbol   

#   ..  .  
.
#
18.13%  test_progs  [kernel.kallsyms]  [k] 
bpf_prog_1accc788e7f04c38_balancer_ingres
10.73%  test_progs  [kernel.kallsyms]  [k] __htab_map_lookup_elem
 7.94%  test_progs  [kernel.kallsyms]  [k] bpf_prog_20a05d8a586cf0e8_F
 7.80%  test_progs  [kernel.kallsyms]  [k] bpf_skb_set_tunnel_key
 4.64%  test_progs  [kernel.kallsyms]  [k] __sysfs_match_string
 3.61%  test_progs  [kernel.kallsyms]  [k] bpf_prog_9d89afa51f1dc0d7_F
 3.51%  test_progs  [kernel.kallsyms]  [k] bpf_prog_73b45270911a0294_F
 3.41%  test_progs  [kernel.kallsyms]  [k] bpf_prog_79be7b7bad8026bf_F
 3.26%  test_progs  [kernel.kallsyms]  [k] memcmp
 3.10%  test_progs  [kernel.kallsyms]  [k] lookup_nulls_elem_raw
 2.89%  test_progs  [kernel.kallsyms]  [k] bpf_prog_57bf3d413b9b7455_F
 2.57%  test_progs  [kernel.kallsyms]  [k] dst_discard
 2.45%  test_progs  [kernel.kallsyms]  [k] do_check
 2.39%  test_progs  [kernel.kallsyms]  [k] bpf_prog_576cbdaac1a4d2f6_F
 1.82%  test_progs  [kernel.kallsyms]  [k] bpf_prog_53ade2ecbddaa85b_F
 1.70%  test_progs  [kernel.kallsyms]  [k] bpf_xdp_adjust_head
 1.32%  test_progs  [kernel.kallsyms]  [k] bpf_prog_0edc54822404a598_F

Important considerations:

- Before and after the number of cpu samples are the same. No samples are lost.
  But perf cannot find the symbol by IP, so a lot of small 
0x7fffa001a64e-like
  symbols appear towards the end of 'perf report' and none of them look hot.
  In reality these IP addresses belong to a few bpf programs that
  were active at that time.

- newly loaded bpf program can be JITed into address space of unloaded prog.
  7fffa001aXXX address at time X can belong to a program A, but similar
  7fffa001aYYY address at time Y can belong to a different program B.

- event->mmap.pid == -1 is an existing indicator of kernel event.
  event->mmap.tid == BPF_FS_MAGIC is an extension to indicate bpf related event.
  Alternatively it's possible to introduce a new 'enum perf_event_type' command
  specifically for bpf prog load/unload, but the existing RECORD_MMAP
  is very close, so the choice made by this patchset is to extend it.

- steps to land the set:
  Patches 1 and 2 need to go via bpf-next tree,
  since we're adding support for better program names exactly in the same lines.
  Patch 3 can go in parallel into perf tree. It has no effect without kernel
  support and kernel support has no effect on old perf.

Peter, Arnaldo, Daniel, please review.

Alexei Starovoitov (3):
  perf/core: introduce perf_event_mmap_bpf_prog
  bpf: emit RECORD_MMAP events for bpf prog load/unload
  tools/perf: recognize and process RECORD_MMAP events for bpf progs

 include/linux/perf_event.h |  1 +
 kernel/bpf/core.c  | 22 +--
 kernel/events/core.c   | 44 +-
 tools/perf/util/machine.c  | 27 +++
 tools/perf/util/symbol.c   | 13 +++
 tools/perf/util/symbol.h   |  1 +
 6 files changed, 101 insertions(+), 7 deletions(-)

-- 
2.17.1



Re: [PATCH bpf-next 1/3] perf/core: introduce perf_event_mmap_bpf_prog

2018-09-19 Thread Song Liu



> On Sep 19, 2018, at 3:39 PM, Alexei Starovoitov  wrote:
> 
> introduce perf_event_mmap_bpf_prog() helper to emit RECORD_MMAP events
> into perf ring buffer.
> It's used by bpf load/unload logic to notify user space of addresses
> and names of JITed bpf programs.
> 
> Note that event->mmap.pid == -1 is an existing indicator of kernel event.
> In addition use event->mmap.tid == BPF_FS_MAGIC to indicate bpf related
> RECORD_MMAP event.
> 
> Alternatively it's possible to introduce new 'enum perf_event_type' command
> specificially for bpf prog load/unload, but existing RECORD_MMAP
> is very close, so the choice made by this patch is to extend it.
> 
> Signed-off-by: Alexei Starovoitov 

Acked-by: Song Liu 

I guess we should also use this for kernel modules load/unload? 


> ---
> include/linux/perf_event.h |  1 +
> kernel/events/core.c   | 44 +-
> 2 files changed, 40 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 53c500f0ca79..0e79af83138f 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -1113,6 +1113,7 @@ static inline void perf_event_task_sched_out(struct 
> task_struct *prev,
> }
> 
> extern void perf_event_mmap(struct vm_area_struct *vma);
> +void perf_event_mmap_bpf_prog(u64 start, u64 len, char *name, int size);
> extern struct perf_guest_info_callbacks *perf_guest_cbs;
> extern int perf_register_guest_info_callbacks(struct 
> perf_guest_info_callbacks *callbacks);
> extern int perf_unregister_guest_info_callbacks(struct 
> perf_guest_info_callbacks *callbacks);
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 2a62b96600ad..c48244ddf993 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -7152,7 +7152,7 @@ static int perf_event_mmap_match(struct perf_event 
> *event,
> {
>   struct perf_mmap_event *mmap_event = data;
>   struct vm_area_struct *vma = mmap_event->vma;
> - int executable = vma->vm_flags & VM_EXEC;
> + int executable = !vma || vma->vm_flags & VM_EXEC;
> 
>   return (!executable && event->attr.mmap_data) ||
>  (executable && (event->attr.mmap || event->attr.mmap2));
> @@ -7165,12 +7165,13 @@ static void perf_event_mmap_output(struct perf_event 
> *event,
>   struct perf_output_handle handle;
>   struct perf_sample_data sample;
>   int size = mmap_event->event_id.header.size;
> + bool bpf_event = !mmap_event->vma;
>   int ret;
> 
>   if (!perf_event_mmap_match(event, data))
>   return;
> 
> - if (event->attr.mmap2) {
> + if (event->attr.mmap2 && !bpf_event) {
>   mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
>   mmap_event->event_id.header.size += sizeof(mmap_event->maj);
>   mmap_event->event_id.header.size += sizeof(mmap_event->min);
> @@ -7186,12 +7187,14 @@ static void perf_event_mmap_output(struct perf_event 
> *event,
>   if (ret)
>   goto out;
> 
> - mmap_event->event_id.pid = perf_event_pid(event, current);
> - mmap_event->event_id.tid = perf_event_tid(event, current);
> + if (!bpf_event) {
> + mmap_event->event_id.pid = perf_event_pid(event, current);
> + mmap_event->event_id.tid = perf_event_tid(event, current);
> + }
> 
>   perf_output_put(&handle, mmap_event->event_id);
> 
> - if (event->attr.mmap2) {
> + if (event->attr.mmap2 && !bpf_event) {
>   perf_output_put(&handle, mmap_event->maj);
>   perf_output_put(&handle, mmap_event->min);
>   perf_output_put(&handle, mmap_event->ino);
> @@ -7448,6 +7451,37 @@ void perf_event_mmap(struct vm_area_struct *vma)
>   perf_event_mmap_event(&mmap_event);
> }
> 
> +void perf_event_mmap_bpf_prog(u64 start, u64 len, char *name, int size)
> +{
> + struct perf_mmap_event mmap_event;
> +
> + if (!atomic_read(&nr_mmap_events))
> + return;
> +
> + if (!IS_ALIGNED(size, sizeof(u64))) {
> + WARN_ONCE(1, "size is not aligned\n");
> + return;
> + }
> +
> + mmap_event = (struct perf_mmap_event){
> + .file_name = name,
> + .file_size = size,
> + .event_id  = {
> + .header = {
> + .type = PERF_RECORD_MMAP,
> + .misc = PERF_RECORD_MISC_KERNEL,
> + .size = sizeof(mmap_event.event_id) + size,
> + },
> + .pid = -1, /* indicates kernel */
> + .tid = BPF_FS_MAGIC, /* bpf mmap event */
> + .start  = start,
> + .len= len,
> + .pgoff  = start,
> + },
> + };
> + perf_iterate_sb(perf_event_mmap_output, &mmap_event, NULL);
> +}
> +
> void perf_event_aux_event(struct perf_event *event, unsigned long head,
> unsi

[PATCH iproute2 v2 3/3] testsuite: Warn about empty $(IPVERS)

2018-09-19 Thread Petr Vorel
alltests target requires having symlink created by configure target
(default target). Without that there is no test being run.

Signed-off-by: Petr Vorel 
---
 testsuite/Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/testsuite/Makefile b/testsuite/Makefile
index 1c2467f5..b3aebec1 100644
--- a/testsuite/Makefile
+++ b/testsuite/Makefile
@@ -54,6 +54,9 @@ distclean: clean
echo "Entering iproute2" && cd iproute2 && $(MAKE) distclean && cd ..;
 
 $(TESTS): testclean
+ifeq (,$(IPVERS))
+   $(error Please run make first)
+endif
 ifeq (,$(HAVE_UNSHARED_UTIL))
$(error Please install util-linux tools to run tests in separated 
network namespace)
 endif
-- 
2.19.0.rc2



[PATCH iproute2 v2 2/3] testsuite: Generate generate_nlmsg when needed

2018-09-19 Thread Petr Vorel
Commit 886f2c43 added generate_nlmsg.c. Running the alltests
target, which uses the binary, required running 'make -C tools' beforehand.

Fixes: 886f2c43 testsuite: Generate nlmsg blob at runtime

Signed-off-by: Petr Vorel 
---
 testsuite/Makefile | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/testsuite/Makefile b/testsuite/Makefile
index a31d4531..1c2467f5 100644
--- a/testsuite/Makefile
+++ b/testsuite/Makefile
@@ -28,7 +28,7 @@ endif
 configure:
echo "Entering iproute2" && cd iproute2 && $(MAKE) configure && cd ..;
 
-compile: configure
+compile: configure generate_nlmsg
echo "Entering iproute2" && cd iproute2 && $(MAKE) && cd ..;
 
 listtests:
@@ -36,7 +36,10 @@ listtests:
echo "$$t"; \
done
 
-alltests: $(TESTS)
+generate_nlmsg:
+   $(MAKE) -C tools
+
+alltests: generate_nlmsg $(TESTS)
 
 testclean:
@echo "Removing $(RESULTS_DIR) dir ..."
-- 
2.19.0.rc2



[PATCH iproute2 v2 1/3] testsuite: Fix missing generate_nlmsg

2018-09-19 Thread Petr Vorel
Commit ad23e152 caused generate_nlmsg to be always missing:

$ make alltests
make: ./tools/generate_nlmsg: Command not found

Create a testclean target to remove only the results directory.

Fixes: ad23e152 testsuite: remove all temp files and implement make clean

Signed-off-by: Petr Vorel 
---
 testsuite/Makefile | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/testsuite/Makefile b/testsuite/Makefile
index d1ac997d..a31d4531 100644
--- a/testsuite/Makefile
+++ b/testsuite/Makefile
@@ -30,7 +30,6 @@ configure:
 
 compile: configure
echo "Entering iproute2" && cd iproute2 && $(MAKE) && cd ..;
-   $(MAKE) -C tools
 
 listtests:
@for t in $(TESTS); do \
@@ -39,9 +38,11 @@ listtests:
 
 alltests: $(TESTS)
 
-clean:
+testclean:
@echo "Removing $(RESULTS_DIR) dir ..."
@rm -rf $(RESULTS_DIR)
+
+clean: testclean
@rm -f iproute2/iproute2-this
@rm -f tests/ip/link/dev_wo_vf_rate.nl
$(MAKE) -C tools clean
@@ -49,18 +50,18 @@ clean:
 distclean: clean
echo "Entering iproute2" && cd iproute2 && $(MAKE) distclean && cd ..;
 
-$(TESTS): clean
+$(TESTS): testclean
 ifeq (,$(HAVE_UNSHARED_UTIL))
$(error Please install util-linux tools to run tests in separated 
network namespace)
 endif
@./tools/generate_nlmsg
 
@mkdir -p $(RESULTS_DIR)
-   
+
@for d in $(TESTS_DIR); do \
mkdir -p $(RESULTS_DIR)/$$d; \
done
-   
+
@if [ "$(KCPATH)" = "/proc/config.gz" ]; then \
gunzip -c $(KCPATH) >$(KENVFN); \
elif [ "$(KCPATH)" != "" ]; then \
-- 
2.19.0.rc2



[PATCH iproute2 v2 0/3] testsuite: make alltests fixes

2018-09-19 Thread Petr Vorel
Hi,

here are simply fixes to restore 'make alltests'.
Currently it does not run.

Kind regards,
Petr

Petr Vorel (3):
  testsuite: Fix missing generate_nlmsg
  testsuite: Generate generate_nlmsg when needed
  testsuite: Warn about empty $(IPVERS)

 testsuite/Makefile | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

-- 
2.19.0.rc2



[Patch net-next v3] net_sched: change tcf_del_walker() to take idrinfo->lock

2018-09-19 Thread Cong Wang
From: Vlad Buslov 

From: Vlad Buslov 

Action API was changed to work with actions and action_idr in concurrency
safe manner, however tcf_del_walker() still uses actions without taking a
reference or idrinfo->lock first, and deletes them directly, disregarding
possible concurrent delete.

Change tcf_del_walker() to take idrinfo->lock while iterating over actions
and use new tcf_idr_release_unsafe() to release them while holding the
lock.

And the blocking function fl_hw_destroy_tmplt() could be called when we
put a filter chain, so defer it to a work queue.

Signed-off-by: Vlad Buslov 
[xiyou.wangc...@gmail.com: heavily modify the code and changelog]
Signed-off-by: Cong Wang 
---
 net/sched/act_api.c| 20 +++-
 net/sched/cls_flower.c | 13 +++--
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 6f118d62c731..fac8c769454f 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -246,6 +246,20 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, 
struct sk_buff *skb,
goto done;
 }
 
+static int tcf_idr_release_unsafe(struct tc_action *p)
+{
+   if (atomic_read(&p->tcfa_bindcnt) > 0)
+   return -EPERM;
+
+   if (refcount_dec_and_test(&p->tcfa_refcnt)) {
+   idr_remove(&p->idrinfo->action_idr, p->tcfa_index);
+   tcf_action_cleanup(p);
+   return ACT_P_DELETED;
+   }
+
+   return 0;
+}
+
 static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb,
  const struct tc_action_ops *ops)
 {
@@ -262,15 +276,19 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, 
struct sk_buff *skb,
if (nla_put_string(skb, TCA_KIND, ops->kind))
goto nla_put_failure;
 
+   spin_lock(&idrinfo->lock);
idr_for_each_entry_ul(idr, p, id) {
-   ret = __tcf_idr_release(p, false, true);
+   ret = tcf_idr_release_unsafe(p);
if (ret == ACT_P_DELETED) {
module_put(ops->owner);
n_i++;
} else if (ret < 0) {
+   spin_unlock(&idrinfo->lock);
goto nla_put_failure;
}
}
+   spin_unlock(&idrinfo->lock);
+
if (nla_put_u32(skb, TCA_FCNT, n_i))
goto nla_put_failure;
nla_nest_end(skb, nest);
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 4b8dd37dd4f8..0ed6630a5049 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -79,6 +79,7 @@ struct fl_flow_tmplt {
struct fl_flow_key mask;
struct flow_dissector dissector;
struct tcf_chain *chain;
+   struct rcu_work rwork;
 };
 
 struct cls_fl_head {
@@ -1437,14 +1438,22 @@ static void *fl_tmplt_create(struct net *net, struct 
tcf_chain *chain,
return ERR_PTR(err);
 }
 
-static void fl_tmplt_destroy(void *tmplt_priv)
+static void fl_tmplt_destroy_work(struct work_struct *work)
 {
-   struct fl_flow_tmplt *tmplt = tmplt_priv;
+   struct fl_flow_tmplt *tmplt = container_of(to_rcu_work(work),
+struct fl_flow_tmplt, rwork);
 
fl_hw_destroy_tmplt(tmplt->chain, tmplt);
kfree(tmplt);
 }
 
+static void fl_tmplt_destroy(void *tmplt_priv)
+{
+   struct fl_flow_tmplt *tmplt = tmplt_priv;
+
+   tcf_queue_work(&tmplt->rwork, fl_tmplt_destroy_work);
+}
+
 static int fl_dump_key_val(struct sk_buff *skb,
   void *val, int val_type,
   void *mask, int mask_type, int len)
-- 
2.14.4



Re: [PATCH bpf-next 2/3] bpf: emit RECORD_MMAP events for bpf prog load/unload

2018-09-19 Thread Song Liu



> On Sep 19, 2018, at 3:39 PM, Alexei Starovoitov  wrote:
> 
> use perf_event_mmap_bpf_prog() helper to notify user space
> about JITed bpf programs.
> Use RECORD_MMAP perf event to tell user space where JITed bpf program was 
> loaded.
> Use empty program name as unload indication.
> 
> Signed-off-by: Alexei Starovoitov 
> ---
> kernel/bpf/core.c | 22 --
> 1 file changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 3f5bf1af0826..ddf11fdafd36 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -384,7 +384,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
>   *symbol_end   = addr + hdr->pages * PAGE_SIZE;
> }
> 
> -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
> +static char *bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
> {
>   const char *end = sym + KSYM_NAME_LEN;
> 
> @@ -402,9 +402,10 @@ static void bpf_get_prog_name(const struct bpf_prog 
> *prog, char *sym)
>   sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
>   sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
>   if (prog->aux->name[0])
> - snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
> + sym += snprintf(sym, (size_t)(end - sym), "_%s", 
> prog->aux->name);
>   else
>   *sym = 0;
> + return sym;
> }
> 
> static __always_inline unsigned long
> @@ -480,23 +481,40 @@ static bool bpf_prog_kallsyms_verify_off(const struct 
> bpf_prog *fp)
> 
> void bpf_prog_kallsyms_add(struct bpf_prog *fp)
> {
> + unsigned long symbol_start, symbol_end;
> + char buf[KSYM_NAME_LEN], *sym;
> +
>   if (!bpf_prog_kallsyms_candidate(fp) ||
>   !capable(CAP_SYS_ADMIN))
>   return;
> 
> + bpf_get_prog_addr_region(fp, &symbol_start, &symbol_end);
> + sym = bpf_get_prog_name(fp, buf);
> + sym++; /* sym - buf is the length of the name including trailing 0 */
> + while (!IS_ALIGNED(sym - buf, sizeof(u64)))
> + *sym++ = 0;

nit: This logic feels a little weird to me. How about we wrap the extra logic
in a separate function:

size_t bpf_get_prog_name_u64_aligned(const struct bpf_prog fp, char *buf)

where the return value is the u64 aligned size. 

Other than this 

Acked-by: Song Liu 

>   spin_lock_bh(&bpf_lock);
>   bpf_prog_ksym_node_add(fp->aux);
>   spin_unlock_bh(&bpf_lock);
> + perf_event_mmap_bpf_prog(symbol_start, symbol_end - symbol_start,
> +  buf, sym - buf);
> }
> 
> void bpf_prog_kallsyms_del(struct bpf_prog *fp)
> {
> + unsigned long symbol_start, symbol_end;
> + /* mmap_record.filename cannot be NULL and has to be u64 aligned */
> + char buf[sizeof(u64)] = {};
> +
>   if (!bpf_prog_kallsyms_candidate(fp))
>   return;
> 
>   spin_lock_bh(&bpf_lock);
>   bpf_prog_ksym_node_del(fp->aux);
>   spin_unlock_bh(&bpf_lock);
> + bpf_get_prog_addr_region(fp, &symbol_start, &symbol_end);
> + perf_event_mmap_bpf_prog(symbol_start, symbol_end - symbol_start,
> +  buf, sizeof(buf));
> }
> 
> static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
> -- 
> 2.17.1
> 



Re: [PATCH net-next 1/2] net: linkwatch: add check for netdevice being present to linkwatch_do_dev

2018-09-19 Thread Florian Fainelli
On 09/18/2018 12:55 PM, Heiner Kallweit wrote:
> When bringing down the netdevice (incl. detaching it) and calling
> netif_carrier_off directly or indirectly the latter triggers an
> asynchronous linkwatch event.
> This linkwatch event eventually may fail to access chip registers in
> the ndo_get_stats/ndo_get_stats64 callback because the device isn't
> accessible any longer, see call trace in [0].
> 
> To prevent this scenario don't check for IFF_UP only, but also make
> sure that the netdevice is present.
> 
> [0] https://lists.openwall.net/netdev/2018/03/15/62
> 
> Signed-off-by: Heiner Kallweit 

Reviewed-by: Florian Fainelli 

Thanks Heiner!
-- 
Florian


Re: [PATCH net-next 2/2] net: phy: call state machine synchronously in phy_stop

2018-09-19 Thread Florian Fainelli
On 09/18/2018 12:56 PM, Heiner Kallweit wrote:
> phy_stop() may be called e.g. when suspending, therefore all needed
> actions should be performed synchronously. Therefore add a synchronous
> call to the state machine.
> 
> Signed-off-by: Heiner Kallweit 

Reviewed-by: Florian Fainelli 

Yes!
-- 
Florian


Re: [PATCH] net: phy: don't reschedule state machine when PHY is halted

2018-09-19 Thread Florian Fainelli
On 09/18/2018 01:17 PM, Heiner Kallweit wrote:
> On 18.09.2018 22:02, Florian Fainelli wrote:
>> On 09/18/2018 12:12 PM, Heiner Kallweit wrote:
>>> I think I've seen a similar or same patch before, not sure why it
>>> didn't make it yet. When being in state PHY_HALTED we don't have to
>>> reschedule the state machine, phy_start() will start it again.
>>
>> Yes, this is conceptually the same patch as as this one:
>>
>> https://patchwork.ozlabs.org/patch/830415/
>>
> Thanks for the link, this is what I was referring to.
> 
>> I prefer your version, though the comment in the original patch explains
>> why.
>>
> To be sure I understand you correctly:
> You're fine with the patch as is or would you prefer to add a comment
> like in the original patch ?

Correct, unless you think this does not warrant a comment and it is
clear enough as-is?

> 
>>>
>>> Signed-off-by: Heiner Kallweit 
>>> ---
>>>  drivers/net/phy/phy.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
>>> index 1ee25877c..c78203b25 100644
>>> --- a/drivers/net/phy/phy.c
>>> +++ b/drivers/net/phy/phy.c
>>> @@ -1123,7 +1123,7 @@ void phy_state_machine(struct work_struct *work)
>>>  * PHY, if PHY_IGNORE_INTERRUPT is set, then we will be moving
>>>  * between states from phy_mac_interrupt()
>>>  */
>>> -   if (phy_polling_mode(phydev))
>>> +   if (phy_polling_mode(phydev) && old_state != PHY_HALTED)
>>> queue_delayed_work(system_power_efficient_wq, 
>>> &phydev->state_queue,
>>>PHY_STATE_TIME * HZ);
>>>  }
>>>
>>
>>
> 


-- 
Florian


[PATCH net] af_key: free SKBs under RCU protection

2018-09-19 Thread Sean Tranchetti
pfkey_broadcast() can make calls to pfkey_broadcast_one() which
will clone or copy the passed in SKB. This new SKB will be assigned
the sock_rfree() function as its destructor, which requires that
the socket reference the SKB contains is valid when the SKB is freed.

If this SKB is ever passed to pfkey_broadcast() again by some other
function (such as pfkey_dump() or pfkey_promisc) it will then be
freed there. However, since this free occurs outside of RCU protection,
it is possible that userspace could close the socket and trigger
pfkey_release() to free the socket before sock_rfree() can run, creating
the following race condition:

1: An SKB belonging to the pfkey socket is passed to pfkey_broadcast().
   It performs the broadcast to any other sockets, and calls
   rcu_read_unlock(), but does not yet reach kfree_skb().
2: Userspace closes the socket, triggering pfkey_release(). Since no one
   holds the RCU lock, synchronize_rcu() returns and it is allowed to
   continue. It calls sock_put() to free the socket.
3: pfkey_broadcast() now calls kfree_skb() on the original SKB it was
   passed, triggering a call to sock_rfree(). This function now accesses
   the freed struct sock * via skb->sk, and attempts to update invalid
   memory.

By ensuring that the pfkey_broadcast() also frees the SKBs while it holds
the RCU lock, we can ensure that the socket will remain valid when the SKB
is freed, avoiding crashes like the following:

Unable to handle kernel paging request at virtual address 6b6b6b6b6b6c4b
[006b6b6b6b6b6c4b] address between user and kernel address ranges
Internal error: Oops: 9604 [#1] PREEMPT SMP
task: fff78f65b380 task.stack: ff8049a88000
pc : sock_rfree+0x38/0x6c
lr : skb_release_head_state+0x6c/0xcc
Process repro (pid: 7117, stack limit = 0xff8049a88000)
Call trace:
sock_rfree+0x38/0x6c
skb_release_head_state+0x6c/0xcc
skb_release_all+0x1c/0x38
__kfree_skb+0x1c/0x30
kfree_skb+0xd0/0xf4
pfkey_broadcast+0x14c/0x18c
pfkey_sendmsg+0x1d8/0x408
sock_sendmsg+0x44/0x60
___sys_sendmsg+0x1d0/0x2a8
__sys_sendmsg+0x64/0xb4
SyS_sendmsg+0x34/0x4c
el0_svc_naked+0x34/0x38
Kernel panic - not syncing: Fatal exception

Signed-off-by: Sean Tranchetti 
---
 net/key/af_key.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 9d61266..dd257c7 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -275,13 +275,13 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t 
allocation,
if ((broadcast_flags & BROADCAST_REGISTERED) && err)
err = err2;
}
-   rcu_read_unlock();
 
if (one_sk != NULL)
err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
 
kfree_skb(skb2);
kfree_skb(skb);
+   rcu_read_unlock();
return err;
 }
 
-- 
1.9.1



Re: [PATCH bpf-next 1/3] perf/core: introduce perf_event_mmap_bpf_prog

2018-09-19 Thread Alexei Starovoitov
On 9/19/18 4:30 PM, Song Liu wrote:
>
>
>> On Sep 19, 2018, at 3:39 PM, Alexei Starovoitov  wrote:
>>
>> introduce perf_event_mmap_bpf_prog() helper to emit RECORD_MMAP events
>> into perf ring buffer.
>> It's used by bpf load/unload logic to notify user space of addresses
>> and names of JITed bpf programs.
>>
>> Note that event->mmap.pid == -1 is an existing indicator of kernel event.
>> In addition use event->mmap.tid == BPF_FS_MAGIC to indicate bpf related
>> RECORD_MMAP event.
>>
>> Alternatively it's possible to introduce new 'enum perf_event_type' command
>> specificially for bpf prog load/unload, but existing RECORD_MMAP
>> is very close, so the choice made by this patch is to extend it.
>>
>> Signed-off-by: Alexei Starovoitov 
>
> Acked-by: Song Liu 
>
> I guess we should also use this for kernel modules load/unload?

yes. that's possible.
There is similar issue with modules today that get unloaded
before 'perf report'.
Synthetic record_mmap for modules that perf emits into perf.data
has filename==module_name. It solves typical use case though.
We can extend record_mmap further and let kernel emit them
for module load/unload.
Some new magic for 'tid' would be necessary.



Re: [PATCH bpf-next 2/3] bpf: emit RECORD_MMAP events for bpf prog load/unload

2018-09-19 Thread Alexei Starovoitov

On 9/19/18 4:44 PM, Song Liu wrote:




On Sep 19, 2018, at 3:39 PM, Alexei Starovoitov  wrote:

use perf_event_mmap_bpf_prog() helper to notify user space
about JITed bpf programs.
Use RECORD_MMAP perf event to tell user space where JITed bpf program was 
loaded.
Use empty program name as unload indication.

Signed-off-by: Alexei Starovoitov 
---
kernel/bpf/core.c | 22 --
1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3f5bf1af0826..ddf11fdafd36 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -384,7 +384,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
*symbol_end   = addr + hdr->pages * PAGE_SIZE;
}

-static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+static char *bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
const char *end = sym + KSYM_NAME_LEN;

@@ -402,9 +402,10 @@ static void bpf_get_prog_name(const struct bpf_prog *prog, 
char *sym)
sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
if (prog->aux->name[0])
-   snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+   sym += snprintf(sym, (size_t)(end - sym), "_%s", 
prog->aux->name);
else
*sym = 0;
+   return sym;
}

static __always_inline unsigned long
@@ -480,23 +481,40 @@ static bool bpf_prog_kallsyms_verify_off(const struct 
bpf_prog *fp)

void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
+   unsigned long symbol_start, symbol_end;
+   char buf[KSYM_NAME_LEN], *sym;
+
if (!bpf_prog_kallsyms_candidate(fp) ||
!capable(CAP_SYS_ADMIN))
return;

+   bpf_get_prog_addr_region(fp, &symbol_start, &symbol_end);
+   sym = bpf_get_prog_name(fp, buf);
+   sym++; /* sym - buf is the length of the name including trailing 0 */
+   while (!IS_ALIGNED(sym - buf, sizeof(u64)))
+   *sym++ = 0;


nit: This logic feels a little weird to me. How about we wrap the extra logic
in a separate function:

size_t bpf_get_prog_name_u64_aligned(const struct bpf_prog fp, char *buf)


probably bpf_get_prog_name_u64_padded() ?
would be cleaner indeed.

Will send v2 once Peter and Arnaldo provide their feedback.

Thanks for reviewing!



Re: [PATCH v2 2/4] dt-bindings: net: qcom: Add binding for shared mdio bus

2018-09-19 Thread Wang, Dongsheng
On 2018/9/19 22:15, Timur Tabi wrote:
> On 9/19/18 7:25 AM, Andrew Lunn wrote:
>> ACPI is completely separate and should not affect the DT binding.
>> I've not yet looked at the ACPI changes you added.
> Just FYI, there is no device tree platform on which the upstream EMAC 
> driver works.  All of the DT code in the driver is theoretical.  It 
> worked once on a prototype platform, when I originally wrote the code, 
> but since then DT support is mostly a guess.
>
> The focus of any patches for the EMAC should be ACPI, not DT.  If 
> anything, ACPI support should come first.  No one should be writing or 
> reviewing DT code before ACPI code.
>
> The upstream EMAC driver is only known to work on the QDF2400, which is 
> an ACPI-only chip.  I feel like I've been repeating this too often.
>
Ok, I will just focus on ACPI and keep the DT code unchanged.

Cheers,
Dongsheng


Re: [RFC bpf-next 2/4] bpf: return EOPNOTSUPP when map lookup isn't supported

2018-09-19 Thread Prashant Bhole




On 9/20/2018 3:40 AM, Mauricio Vasquez wrote:



On 09/19/2018 10:14 AM, Alexei Starovoitov wrote:

On Wed, Sep 19, 2018 at 04:51:41PM +0900, Prashant Bhole wrote:

Return ERR_PTR(-EOPNOTSUPP) from map_lookup_elem() methods of below
map types:
- BPF_MAP_TYPE_PROG_ARRAY
- BPF_MAP_TYPE_STACK_TRACE
- BPF_MAP_TYPE_XSKMAP
- BPF_MAP_TYPE_SOCKMAP/BPF_MAP_TYPE_SOCKHASH

Signed-off-by: Prashant Bhole 
---
  kernel/bpf/arraymap.c | 2 +-
  kernel/bpf/sockmap.c  | 2 +-
  kernel/bpf/stackmap.c | 2 +-
  kernel/bpf/xskmap.c   | 2 +-
  4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index dded84cbe814..24583da9ffd1 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -449,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map)
  static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
  {
-    return NULL;
+    return ERR_PTR(-EOPNOTSUPP);
  }

conceptually the set looks good to me.
Please add a test to test_verifier.c to make sure
that these lookup helpers cannot be called from BPF program.
Otherwise this diff may cause crashes.


I think we can remove all these stub functions that return EOPNOTSUPP or 
EINVAL and return the error in syscall.c if ops->map_[update, delete, 
lookup, ...] is null.
This will require to extend (and test) the verifier to guarantee that 
those helpers are not called in wrong maps, for example 
map_delete_element in array like maps.


Would it make sense?


Thanks for review and suggestion.

I had thought about doing it this way too (except adding restrictions in the 
verifier). There is no strong reason for choosing the current implementation.


I thought there must be some reason that those methods are implemented 
and just return NULL. Also, there are no NULL checks for the 
map_lookup_elem stub. So I decided to simply change the return value.

If more people agree with the idea of removing the stub functions, I will do it.

-Prashant





Re: [RFC bpf-next 2/4] bpf: return EOPNOTSUPP when map lookup isn't supported

2018-09-19 Thread Prashant Bhole




On 9/20/2018 12:14 AM, Alexei Starovoitov wrote:

On Wed, Sep 19, 2018 at 04:51:41PM +0900, Prashant Bhole wrote:

Return ERR_PTR(-EOPNOTSUPP) from map_lookup_elem() methods of below
map types:
- BPF_MAP_TYPE_PROG_ARRAY
- BPF_MAP_TYPE_STACK_TRACE
- BPF_MAP_TYPE_XSKMAP
- BPF_MAP_TYPE_SOCKMAP/BPF_MAP_TYPE_SOCKHASH

Signed-off-by: Prashant Bhole 
---
  kernel/bpf/arraymap.c | 2 +-
  kernel/bpf/sockmap.c  | 2 +-
  kernel/bpf/stackmap.c | 2 +-
  kernel/bpf/xskmap.c   | 2 +-
  4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index dded84cbe814..24583da9ffd1 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -449,7 +449,7 @@ static void fd_array_map_free(struct bpf_map *map)
  
  static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)

  {
-   return NULL;
+   return ERR_PTR(-EOPNOTSUPP);
  }


conceptually the set looks good to me.
Please add a test to test_verifier.c to make sure
that these lookup helpers cannot be called from BPF program.
Otherwise this diff may cause crashes.


Thanks for reviewing.
Is the verifier change below sufficient?

--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2128,10 +2128,18 @@ static int check_map_func_compatibility(struct 
bpf_verifier_env *env,

if (env->subprog_cnt > 1) {
 			verbose(env, "tail_calls are not allowed in programs with 
bpf-to-bpf calls\n");

return -EINVAL;
}
break;
+   case BPF_FUNC_map_lookup_elem:
+   if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+   map->map_type == BPF_MAP_TYPE_STACK_TRACE ||
+   map->map_type == BPF_MAP_TYPE_XSKMAP ||
+   map->map_type == BPF_MAP_TYPE_SOCKMAP ||
+   map->map_type == BPF_MAP_TYPE_SOCKHASH)
+   goto error;
+   break;
case BPF_FUNC_perf_event_read:
case BPF_FUNC_perf_event_output:
case BPF_FUNC_perf_event_read_value:
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;

-Prashant



Re: [RFC bpf-next 3/4] tools/bpf: bpftool, split the function do_dump()

2018-09-19 Thread Prashant Bhole




On 9/20/2018 12:26 AM, Jakub Kicinski wrote:

On Wed, 19 Sep 2018 16:51:42 +0900, Prashant Bhole wrote:

+static int dump_map_elem(int fd, void *key, void *value,
+struct bpf_map_info *map_info, struct btf *btf,
+json_writer_t *btf_wtr)
+{
+   int num_elems = 0;
+
+   if (!bpf_map_lookup_elem(fd, key, value)) {
+   if (json_output) {
+   print_entry_json(map_info, key, value, btf);
+   } else {
+   if (btf) {
+   struct btf_dumper d = {
+   .btf = btf,
+   .jw = btf_wtr,
+   .is_plain_text = true,
+   };
+
+   do_dump_btf(&d, map_info, key, value);
+   } else {
+   print_entry_plain(map_info, key, value);
+   }
+   num_elems++;
+   }
+   goto out;
+   }
+
+   /* lookup error handling */
+   if (map_is_map_of_maps(map_info->type) ||
+   map_is_map_of_progs(map_info->type))
+   goto out;
+


nit: why not just return?  the goto seems to only do a return anyway,
  is this suggested by some coding style?  Is it to help the
  compiler?  I see people do this from time to time..


Thanks for reviewing. I agree, the goto and the label aren't needed. I will 
fix it.


-Prashant


[...]


+out:
+   return num_elems;







Re: [RFC bpf-next 4/4] tools/bpf: handle EOPNOTSUPP when map lookup is failed

2018-09-19 Thread Prashant Bhole




On 9/20/2018 12:29 AM, Jakub Kicinski wrote:

On Wed, 19 Sep 2018 16:51:43 +0900, Prashant Bhole wrote:

Let's add a check for EOPNOTSUPP error when map lookup is failed.
Also in case map doesn't support lookup, the output of map dump is
changed from "can't lookup element" to "lookup not supported for
this map".

Patch adds function print_entry_error() function to print the error
value.

Following example dumps a map which does not support lookup.

Output before:
root# bpftool map -jp dump id 40
[
 "key": ["0x0a","0x00","0x00","0x00"
 ],
 "value": {
 "error": "can\'t lookup element"
 },
 "key": ["0x0b","0x00","0x00","0x00"
 ],
 "value": {
 "error": "can\'t lookup element"
 }
]

root# bpftool map dump id 40
can't lookup element with key:
0a 00 00 00
can't lookup element with key:
0b 00 00 00
Found 0 elements

Output after changes:
root# bpftool map dump -jp  id 45
[
 "key": ["0x0a","0x00","0x00","0x00"
 ],
 "value": {
 "error": "lookup not supported for this map"
 },
 "key": ["0x0b","0x00","0x00","0x00"
 ],
 "value": {
 "error": "lookup not supported for this map"
 }
]

root# bpftool map dump id 45
key:
0a 00 00 00
value:
lookup not supported for this map
key:
0b 00 00 00
value:
lookup not supported for this map
Found 0 elements


Nice improvement, thanks for the changes!  I wonder what your thoughts
would be on just printing some form of "lookup not supported for this
map" only once?  It seems slightly like repeated information - if
lookup is not supported for one key it likely won't be for other keys
too, so we could shorten the output.  Would that make sense?


I too thought that the message is repeated information. One idea was, as 
you said, to stop iterating once we get EOPNOTSUPP. But the only reason for 
keeping it this way is that the user will at least see a dump of the keys along 
with it. Also, it will be consistent with the JSON output. What is your opinion on it?


I will fix other things that you pointed out. Thanks!

-Prashant




Signed-off-by: Prashant Bhole 
---
  tools/bpf/bpftool/main.h |  5 +
  tools/bpf/bpftool/map.c  | 35 ++-
  2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 40492cdc4e53..1a8c683f949b 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -46,6 +46,11 @@
  
  #include "json_writer.h"
  
+#define ERR_CANNOT_LOOKUP \

+   "can't lookup element"
+#define ERR_LOOKUP_NOT_SUPPORTED \
+   "lookup not supported for this map"


Do we need these?  Are we going to reused them in more parts of the
code?


  #define ptr_to_u64(ptr)   ((__u64)(unsigned long)(ptr))
  
  #define NEXT_ARG()	({ argc--; argv++; if (argc < 0) usage(); })

diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c
index 284e12a289c0..2faccd2098c9 100644
--- a/tools/bpf/bpftool/map.c
+++ b/tools/bpf/bpftool/map.c
@@ -333,6 +333,25 @@ static void print_entry_json(struct bpf_map_info *info, 
unsigned char *key,
jsonw_end_object(json_wtr);
  }
  
+static void print_entry_error(struct bpf_map_info *info, unsigned char *key,

+ char *value)
+{
+   bool single_line, break_names;
+   int value_size = strlen(value);


nit: order variables declaration lines to shortest, please.


+
+   break_names = info->key_size > 16 || value_size > 16;
+   single_line = info->key_size + value_size <= 24 && !break_names;
+
+   printf("key:%c", break_names ? '\n' : ' ');
+   fprint_hex(stdout, key, info->key_size, " ");
+
+   printf(single_line ? "  " : "\n");
+
+   printf("value:%c%s", break_names ? '\n' : ' ', value);
+
+   printf("\n");
+}
+
  static void print_entry_plain(struct bpf_map_info *info, unsigned char *key,
  unsigned char *value)
  {
@@ -660,6 +679,8 @@ static int dump_map_elem(int fd, void *key, void *value,
 json_writer_t *btf_wtr)
  {
int num_elems = 0;
+   int lookup_errno;
+   char *errstr;


nit: const char?

  
  	if (!bpf_map_lookup_elem(fd, key, value)) {

if (json_output) {








  1   2   >