Re: [PATCH bpf-next v2 08/12] tools: libbpf: add extended attributes version of bpf_object__open()

2018-07-09 Thread Andrey Ignatov
Jakub Kicinski  [Mon, 2018-07-09 11:01 -0700]:
> Similarly to bpf_prog_load() users of bpf_object__open() may need
> to specify the expected program type.  Program type is needed at
> open to avoid the kernel version check for program types which don't
> require it.
> 
> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Quentin Monnet 
> ---
>  tools/lib/bpf/libbpf.c | 21 +
>  tools/lib/bpf/libbpf.h |  6 ++
>  2 files changed, 23 insertions(+), 4 deletions(-)
> 
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index edc3b0b3737d..5b0e84fbcf71 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -1520,7 +1520,8 @@ __bpf_object__open(const char *path, void *obj_buf, 
> size_t obj_buf_sz,
>   return ERR_PTR(err);
>  }
>  
> -struct bpf_object *bpf_object__open(const char *path)
> +struct bpf_object *bpf_object__open_xattr(const char *path,
> +   struct bpf_object_open_attr *attr)
>  {
>   /* param validation */
>   if (!path)
> @@ -1528,7 +1529,17 @@ struct bpf_object *bpf_object__open(const char *path)
>  
>   pr_debug("loading %s\n", path);
>  
> - return __bpf_object__open(path, NULL, 0, true);
> + return __bpf_object__open(path, NULL, 0,
> +   bpf_prog_type__needs_kver(attr->prog_type));
> +}
> +
> +struct bpf_object *bpf_object__open(const char *path)
> +{
> + struct bpf_object_open_attr attr = {
> + .prog_type  = BPF_PROG_TYPE_UNSPEC,
> + };
> +
> + return bpf_object__open_xattr(path, );
>  }
>  
>  struct bpf_object *bpf_object__open_buffer(void *obj_buf,
> @@ -2238,6 +2249,9 @@ int bpf_prog_load(const char *file, enum bpf_prog_type 
> type,
>  int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
>   struct bpf_object **pobj, int *prog_fd)
>  {
> + struct bpf_object_open_attr open_attr = {
> + .prog_type  = attr->prog_type,
> + };
>   struct bpf_program *prog, *first_prog = NULL;
>   enum bpf_attach_type expected_attach_type;
>   enum bpf_prog_type prog_type;
> @@ -2250,8 +2264,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr 
> *attr,
>   if (!attr->file)
>   return -EINVAL;
>  
> - obj = __bpf_object__open(attr->file, NULL, 0,
> -  bpf_prog_type__needs_kver(attr->prog_type));
> + obj = bpf_object__open_xattr(attr->file, _attr);
>   if (IS_ERR_OR_NULL(obj))
>   return -ENOENT;
>  
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index 3122d74f2643..60593ac44700 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -66,7 +66,13 @@ void libbpf_set_print(libbpf_print_fn_t warn,
>  /* Hide internal to user */
>  struct bpf_object;
>  
> +struct bpf_object_open_attr {
> + enum bpf_prog_type prog_type;
> +};
> +
>  struct bpf_object *bpf_object__open(const char *path);
> +struct bpf_object *bpf_object__open_xattr(const char *path,
> +   struct bpf_object_open_attr *attr);

Should the new bpf_object__open_xattr() API have _only_ attr argument?
Path, in turn, can become a member of attr.

That way it can be reused e.g. to load object from buffer (like
bpf_object__open_buffer() below), where path is not needed.

Otherwise, if bpf_object__open_buffer() has to be extended in the
future, another _xattr function will be needed (or caller would need to
pass NULL to path, what would make API less convenient).


>  struct bpf_object *bpf_object__open_buffer(void *obj_buf,
>  size_t obj_buf_sz,
>  const char *name);
> -- 
> 2.17.1
> 

-- 
Andrey Ignatov


Re: [PATCH bpf-next v2 05/12] tools: libbpf: expose the prog type guessing from section name logic

2018-07-09 Thread Andrey Ignatov
Jakub Kicinski  [Mon, 2018-07-09 11:01 -0700]:
> libbpf can guess program type based on ELF section names.  As libbpf
> becomes more popular its association between section name strings and
> types becomes more of a standard.  Allow libbpf users to use the same
> logic for matching strings to types, e.g. when the string originates
> from command line.
> 
> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Quentin Monnet 
> ---
>  tools/lib/bpf/libbpf.c | 43 --
>  tools/lib/bpf/libbpf.h |  3 +++
>  2 files changed, 28 insertions(+), 18 deletions(-)
> 
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index 38ed3e92e393..30f3e58bd563 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -2081,25 +2081,33 @@ static const struct {
>  #undef BPF_S_PROG_SEC
>  #undef BPF_SA_PROG_SEC
>  
> -static int bpf_program__identify_section(struct bpf_program *prog)
> +int libbpf_prog_type_by_string(const char *name, enum bpf_prog_type 
> *prog_type,
> +enum bpf_attach_type *expected_attach_type)
>  {
>   int i;
>  
> - if (!prog->section_name)
> - goto err;
> -
> - for (i = 0; i < ARRAY_SIZE(section_names); i++)
> - if (strncmp(prog->section_name, section_names[i].sec,
> - section_names[i].len) == 0)
> - return i;
> -
> -err:
> - pr_warning("failed to guess program type based on section name %s\n",
> -prog->section_name);
> + if (!name)
> + return -1;

Should it return -EINVAL? It can help in bpf_prog_load_xattr below:

err = bpf_program__identify_section(prog, _type,

_attach_type);
if (err < 0) {
...
return err;
}


>  
> + for (i = 0; i < ARRAY_SIZE(section_names); i++) {
> + if (strncmp(name, section_names[i].sec, section_names[i].len))
> + continue;
> + *prog_type = section_names[i].prog_type;
> + *expected_attach_type = section_names[i].expected_attach_type;
> + return 0;
> + }
>   return -1;

Same here.

>  }
>  
> +static int
> +bpf_program__identify_section(struct bpf_program *prog,
> +   enum bpf_prog_type *prog_type,
> +   enum bpf_attach_type *expected_attach_type)
> +{
> + return libbpf_prog_type_by_string(prog->section_name, prog_type,
> +   expected_attach_type);
> +}
> +
>  int bpf_map__fd(struct bpf_map *map)
>  {
>   return map ? map->fd : -EINVAL;
> @@ -2230,7 +2238,6 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr 
> *attr,
>   enum bpf_prog_type prog_type;
>   struct bpf_object *obj;
>   struct bpf_map *map;
> - int section_idx;
>   int err;
>  
>   if (!attr)
> @@ -2252,14 +2259,14 @@ int bpf_prog_load_xattr(const struct 
> bpf_prog_load_attr *attr,
>   prog->prog_ifindex = attr->ifindex;
>   expected_attach_type = attr->expected_attach_type;
>   if (prog_type == BPF_PROG_TYPE_UNSPEC) {
> - section_idx = bpf_program__identify_section(prog);
> - if (section_idx < 0) {
> + err = bpf_program__identify_section(prog, _type,
> + 
> _attach_type);
> + if (err < 0) {
> + pr_warning("failed to guess program type based 
> on section name %s\n",
> +prog->section_name);
>   bpf_object__close(obj);
>   return -EINVAL;
>   }
> - prog_type = section_names[section_idx].prog_type;
> - expected_attach_type =
> - section_names[section_idx].expected_attach_type;
>   }
>  
>   bpf_program__set_type(prog, prog_type);
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index 564f4be9bae0..617dacfc6704 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -92,6 +92,9 @@ int bpf_object__set_priv(struct bpf_object *obj, void *priv,
>bpf_object_clear_priv_t clear_priv);
>  void *bpf_object__priv(struct bpf_object *prog);
>  
> +int libbpf_prog_type_by_string(const char *name, enum bpf_prog_type 
> *prog_type,

Nit:

I think it should be either:
  int libbpf_prog_type_by_title(const char *title, enum bpf_prog_type 
*prog_type,

(to be consistent with bpf_program__title())),

or:
  int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type,

(to have function name consistent with argument name and with
bpf_program->name).

IMO "name" is better 

Re: [net-next,v2] tcp: Improve setsockopt() TCP_USER_TIMEOUT accuracy

2018-07-09 Thread Eric Dumazet



On 07/04/2018 04:34 PM, Jonathan Maxwell wrote:
> Let's wait for Eric to review. Then I'll put together the next version.

Sorry for the delay (I was travelling last week) , please respin a v3, thanks !



Re: [PATCH bpf-next v2 11/12] tools: libbpf: allow map reuse

2018-07-09 Thread Andrey Ignatov
Jakub Kicinski  [Mon, 2018-07-09 19:49 -0700]:
> On Mon, 9 Jul 2018 13:22:54 -0700, Andrey Ignatov wrote:
> > Jakub Kicinski  [Mon, 2018-07-09 11:01 -0700]:
> > > More advanced applications may want to only replace programs without
> > > destroying associated maps.  Allow libbpf users to achieve that.
> > > Instead of always creating all of the maps at load time, expose to
> > > users an API to reconstruct the map object from already existing
> > > map.
> > > 
> > > The map parameters are read from the kernel and replace the parameters
> > > of the ELF map.  libbpf does not restrict the map replacement, i.e.
> > > the reused map does not have to be compatible with the ELF map
> > > definition.  We relay on the verifier for checking the compatibility
> > > between maps and programs.  The ELF map definition is completely
> > > overwritten by the information read from the kernel, to make sure
> > > libbpf's view of map object corresponds to the actual map.  
> > 
> > Thanks for working on this Jakub! I encountered this shortcoming of
> > libbpf as well and was planning to fix it, but you beat me to it :)
> 
> Ah!  I wish I didn't! :)
> 
> > > Signed-off-by: Jakub Kicinski 
> > > Reviewed-by: Quentin Monnet 
> > > ---
> > >  tools/lib/bpf/libbpf.c | 35 +++
> > >  tools/lib/bpf/libbpf.h |  1 +
> > >  2 files changed, 36 insertions(+)
> > > 
> > > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > > index b653dbb266c7..c80033fe66c3 100644
> > > --- a/tools/lib/bpf/libbpf.c
> > > +++ b/tools/lib/bpf/libbpf.c
> > > @@ -215,6 +215,7 @@ struct bpf_map {
> > >   int fd;
> > >   char *name;
> > >   size_t offset;
> > > + bool fd_preset;  
> > 
> > Any reason not to use map->fd itself to identify if fd is present?
> 
> Note: pre-set, not present.

Oh, sorry, I'm blind :)


> > fd of every map is set to -1 in bpf_object__init_maps() that, in turn, is
> > called from __bpf_object__open():
> > 
> > for (i = 0; i < nr_maps; i++)
> > obj->maps[i].fd = -1;
> > 
> > Later it will either contain valid fd that is >= 0, or that same -1, what
> > should be enough to identify fd presence.
> 
> I thought it to be cleaner to indicate the fd has been pre-set, in case
> things get more complicated in the future and fd >= 0 becomes ambiguous.
> 
> But no strong preference, should I change?

My preference (not strong either) is to avoid a new field whenever it's
possible. Though if you have a use-case that can't be covered by
(fd >= 0) keeping the field is fine as well.


> > >   int map_ifindex;
> > >   struct bpf_map_def def;
> > >   uint32_t btf_key_type_id;
> > > @@ -1082,6 +1083,34 @@ static int bpf_map_find_btf_info(struct bpf_map 
> > > *map, const struct btf *btf)
> > >   return 0;
> > >  }
> > >  
> > > +int bpf_map__reuse_fd(struct bpf_map *map, int fd)
> > > +{
> > > + struct bpf_map_info info = {};
> > > + __u32 len = sizeof(info);
> > > + int err;
> > > +
> > > + err = bpf_obj_get_info_by_fd(fd, , );
> > > + if (err)
> > > + return err;
> > > +  
> > 
> > Should there be a check that map->fd doesn't contain any valid fd (>= 0)
> > before rewriting it so that if it does (e.g. because the function is
> > called after bpf_object__load() by mistake), current map->fd won't be
> > leaked?
> 
> Hm.  In my first implementation libbpf just took the passed fd and
> didn't do a dup(), the lifetime of the fd remained with the caller.
> Having a check will prevent changing the descriptor unless we add some
> form of "un-reuse" as well.  Perhaps I should just add a close() in
> case fd >= 0?  Or do you prefer a hard error?

Agree, close() in case fd >= 0 should be fine since caller already made it
explicit that they don't care about current fd and there should not be a
reason to hard-fail.


> > > + map->fd = dup(fd);  
> > 
> > Unfortunately, new descriptor created by dup(2) will not have O_CLOEXEC 
> > set, in
> > contrast to original fd returned by kernel on map creation.
> > 
> > libbpf has other interface shortcomings where it comes up. E.g. struct
> > bpf_object owns all descriptors it contains (progs, maps) and closes them in
> > bpf_object__close(). if one wants to open/load ELF, then close it but
> > keep, say, prog fd to attach it to cgroup some time later, then fd
> > should be duplicated as well to get a new one not owned by bpf_object.
> > 
> > Currently I use this workaround to avoid time when new fd doesn't have
> > O_CLOEXEC:
> > 
> > int new_prog_fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
> > if (new_prog_fd < 0 ||
> > dup3(bpf_program__fd(prog), new_prog_fd, O_CLOEXEC) == -1) {
> > /* .. handle error .. */
> > close(new_prog_fd);
> > }
> > /* .. use new_prog_fd with O_CLOEXEC set */
> > 
> > Not sure how to simplify it. dup2() has same problem with regard to
> > O_CLOEXEC.
> > 
> > Use-case: standalone server application that uses libbpf and does
> > fork()/execve() a lot.
> 
> Good point!  I have no 

Re: [PATCH bpf-next v2 11/12] tools: libbpf: allow map reuse

2018-07-09 Thread Jakub Kicinski
On Mon, 9 Jul 2018 13:22:54 -0700, Andrey Ignatov wrote:
> Jakub Kicinski  [Mon, 2018-07-09 11:01 -0700]:
> > More advanced applications may want to only replace programs without
> > destroying associated maps.  Allow libbpf users to achieve that.
> > Instead of always creating all of the maps at load time, expose to
> > users an API to reconstruct the map object from already existing
> > map.
> > 
> > The map parameters are read from the kernel and replace the parameters
> > of the ELF map.  libbpf does not restrict the map replacement, i.e.
> > the reused map does not have to be compatible with the ELF map
> > definition.  We rely on the verifier for checking the compatibility
> > between maps and programs.  The ELF map definition is completely
> > overwritten by the information read from the kernel, to make sure
> > libbpf's view of map object corresponds to the actual map.  
> 
> Thanks for working on this Jakub! I encountered this shortcoming of
> libbpf as well and was planning to fix it, but you beat me to it :)

Ah!  I wish I didn't! :)

> > Signed-off-by: Jakub Kicinski 
> > Reviewed-by: Quentin Monnet 
> > ---
> >  tools/lib/bpf/libbpf.c | 35 +++
> >  tools/lib/bpf/libbpf.h |  1 +
> >  2 files changed, 36 insertions(+)
> > 
> > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > index b653dbb266c7..c80033fe66c3 100644
> > --- a/tools/lib/bpf/libbpf.c
> > +++ b/tools/lib/bpf/libbpf.c
> > @@ -215,6 +215,7 @@ struct bpf_map {
> > int fd;
> > char *name;
> > size_t offset;
> > +   bool fd_preset;  
> 
> Any reason not to use map->fd itself to identify if fd is present?

Note: pre-set, not present.

> fd of every map is set to -1 in bpf_object__init_maps() that, in turn, is
> called from __bpf_object__open():
> 
>   for (i = 0; i < nr_maps; i++)
>   obj->maps[i].fd = -1;
> 
> Later it will either contain valid fd that is >= 0, or that same -1, what
> should be enough to identify fd presence.

I thought it to be cleaner to indicate the fd has been pre-set, in case
things get more complicated in the future and fd >= 0 becomes ambiguous.

But no strong preference, should I change?

> > int map_ifindex;
> > struct bpf_map_def def;
> > uint32_t btf_key_type_id;
> > @@ -1082,6 +1083,34 @@ static int bpf_map_find_btf_info(struct bpf_map 
> > *map, const struct btf *btf)
> > return 0;
> >  }
> >  
> > +int bpf_map__reuse_fd(struct bpf_map *map, int fd)
> > +{
> > +   struct bpf_map_info info = {};
> > +   __u32 len = sizeof(info);
> > +   int err;
> > +
> > +   err = bpf_obj_get_info_by_fd(fd, , );
> > +   if (err)
> > +   return err;
> > +  
> 
> Should there be a check that map->fd doesn't contain any valid fd (>= 0)
> before rewriting it so that if it does (e.g. because the function is
> called after bpf_object__load() by mistake), current map->fd won't be
> leaked?

Hm.  In my first implementation libbpf just took the passed fd and
didn't do a dup(), the lifetime of the fd remained with the caller.
Having a check will prevent changing the descriptor unless we add some
form of "un-reuse" as well.  Perhaps I should just add a close() in
case fd >= 0?  Or do you prefer a hard error?

> > +   map->fd = dup(fd);  
> 
> Unfortunately, new descriptor created by dup(2) will not have O_CLOEXEC set, 
> in
> contrast to original fd returned by kernel on map creation.
> 
> libbpf has other interface shortcomings where it comes up. E.g. struct
> bpf_object owns all descriptors it contains (progs, maps) and closes them in
> bpf_object__close(). if one wants to open/load ELF, then close it but
> keep, say, prog fd to attach it to cgroup some time later, then fd
> should be duplicated as well to get a new one not owned by bpf_object.
> 
> Currently I use this workaround to avoid time when new fd doesn't have
> O_CLOEXEC:
> 
>   int new_prog_fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
>   if (new_prog_fd < 0 ||
>   dup3(bpf_program__fd(prog), new_prog_fd, O_CLOEXEC) == -1) {
>   /* .. handle error .. */
>   close(new_prog_fd);
>   }
>   /* .. use new_prog_fd with O_CLOEXEC set */
> 
> Not sure how to simplify it. dup2() has same problem with regard to
> O_CLOEXEC.
> 
> Use-case: standalone server application that uses libbpf and does
> fork()/execve() a lot.

Good point!  I have no better ideas.  Although being slightly paranoid
I would perhaps use "/" instead of "/dev/null"?  Shouldn't matter?

> > +   if (map->fd < 0)
> > +   return map->fd;
> > +   map->fd_preset = true;
> > +
> > +   free(map->name);
> > +   map->name = strdup(info.name);
> > +   map->def.type = info.type;
> > +   map->def.key_size = info.key_size;
> > +   map->def.value_size = info.value_size;
> > +   map->def.max_entries = info.max_entries;
> > +   map->def.map_flags = info.map_flags;
> > +   map->btf_key_type_id = info.btf_key_type_id;
> > +   map->btf_value_type_id = info.btf_value_type_id;
> 

Re: [PATCH bpf-next v2 12/12] tools: bpftool: allow reuse of maps with bpftool prog load

2018-07-09 Thread Jakub Kicinski
On Mon, 9 Jul 2018 12:48:20 -0700, Alexei Starovoitov wrote:
> On Mon, Jul 09, 2018 at 10:59:44AM -0700, Jakub Kicinski wrote:
> > Add map parameter to prog load which will allow reuse of existing
> > maps instead of creating new ones.
> > 
> > We need feature detection and compat code for reallocarray, since
> > it's not available in many libc versions.
> > 
> > Signed-off-by: Jakub Kicinski 
> > Reviewed-by: Quentin Monnet   
> 
> cmdline interface feels a bit awkward to use, but it's a nice improvement.
> Acked-by: Alexei Starovoitov 

Thanks, what about the cmdline feels awkward?  The syntax or having to
manipulate map reuse at cmdline level?

> any plans to extend bpf_map_def similar to iproute2 ?
> so things like pinned file name and map reuse can be specified in .c file
> instead of cmdline?

TBH for my purposes (testing, showcasing) being able to modify things
from command line is more convenient than baking such info in ELF files.
No plans to extend bpf_map_def at this point :(


[PATCH net-next] macvlan: Change status when lower device goes down

2018-07-09 Thread Travis Brown
Today macvlan ignores the notification when a lower device goes
administratively down, preventing the lack of connectivity from
bubbling up.

Processing NETDEV_DOWN results in a macvlan state of LOWERLAYERDOWN
with NO-CARRIER which should be easy to interpret in userspace.

2: lower:  mtu 1500 qdisc mq state DOWN mode DEFAULT group 
default qlen 1000
3: macvlan@lower:  mtu 1500 qdisc 
noqueue state LOWERLAYERDOWN mode DEFAULT group default qlen 1000

Signed-off-by: Suresh Krishnan 
Signed-off-by: Travis Brown 
---
 drivers/net/macvlan.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index adde8fc45588..6dcd715a9370 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1647,6 +1647,7 @@ static int macvlan_device_event(struct notifier_block 
*unused,
 
switch (event) {
case NETDEV_UP:
+   case NETDEV_DOWN:
case NETDEV_CHANGE:
list_for_each_entry(vlan, >vlans, list)
netif_stacked_transfer_operstate(vlan->lowerdev,
-- 
2.16.1



[PATCH v4 iproute2-next 2/3] tc: Add support for the ETF Qdisc

2018-07-09 Thread Jesus Sanchez-Palencia
From: Vinicius Costa Gomes 

The "Earliest TxTime First" (ETF) queueing discipline allows precise
control of the transmission time of packets by providing a sorted
time-based scheduling of packets.

The syntax is:

tc qdisc add dev DEV parent NODE etf delta 
 clockid  [offload] [deadline_mode]

Signed-off-by: Vinicius Costa Gomes 
Signed-off-by: Jesus Sanchez-Palencia 
---
 tc/Makefile |   1 +
 tc/q_etf.c  | 181 
 2 files changed, 182 insertions(+)
 create mode 100644 tc/q_etf.c

diff --git a/tc/Makefile b/tc/Makefile
index dfd00267..4525c0fb 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -71,6 +71,7 @@ TCMODULES += q_clsact.o
 TCMODULES += e_bpf.o
 TCMODULES += f_matchall.o
 TCMODULES += q_cbs.o
+TCMODULES += q_etf.o
 
 TCSO :=
 ifeq ($(TC_CONFIG_ATM),y)
diff --git a/tc/q_etf.c b/tc/q_etf.c
new file mode 100644
index ..79a06ba8
--- /dev/null
+++ b/tc/q_etf.c
@@ -0,0 +1,181 @@
+/*
+ * q_etf.c Earliest TxTime First (ETF).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Vinicius Costa Gomes 
+ * Jesus Sanchez-Palencia 
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+#include "tc_util.h"
+
+#define CLOCKID_INVALID (-1)
+static const struct static_clockid {
+   const char *name;
+   clockid_t clockid;
+} clockids_sysv[] = {
+   { "REALTIME", CLOCK_REALTIME },
+   { "TAI", CLOCK_TAI },
+   { "BOOTTIME", CLOCK_BOOTTIME },
+   { "MONOTONIC", CLOCK_MONOTONIC },
+   { NULL }
+};
+
+static void explain(void)
+{
+   fprintf(stderr, "Usage: ... etf delta NANOS clockid CLOCKID [offload] 
[deadline_mode]\n");
+   fprintf(stderr, "CLOCKID must be a valid SYS-V id (i.e. CLOCK_TAI)\n");
+}
+
+static void explain1(const char *arg, const char *val)
+{
+   fprintf(stderr, "etf: illegal value for \"%s\": \"%s\"\n", arg, val);
+}
+
+static void explain_clockid(const char *val)
+{
+   fprintf(stderr, "etf: illegal value for \"clockid\": \"%s\".\n", val);
+   fprintf(stderr, "It must be a valid SYS-V id (i.e. CLOCK_TAI)\n");
+}
+
+static int get_clockid(__s32 *val, const char *arg)
+{
+   const struct static_clockid *c;
+
+   /* Drop the CLOCK_ prefix if that is being used. */
+   if (strcasestr(arg, "CLOCK_") != NULL)
+   arg += sizeof("CLOCK_") - 1;
+
+   for (c = clockids_sysv; c->name; c++) {
+   if (strcasecmp(c->name, arg) == 0) {
+   *val = c->clockid;
+
+   return 0;
+   }
+   }
+
+   return -1;
+}
+
+static const char* get_clock_name(clockid_t clockid)
+{
+   const struct static_clockid *c;
+
+   for (c = clockids_sysv; c->name; c++) {
+   if (clockid == c->clockid)
+   return c->name;
+   }
+
+   return "invalid";
+}
+
+static int etf_parse_opt(struct qdisc_util *qu, int argc,
+char **argv, struct nlmsghdr *n, const char *dev)
+{
+   struct tc_etf_qopt opt = {
+   .clockid = CLOCKID_INVALID,
+   };
+   struct rtattr *tail;
+
+   while (argc > 0) {
+   if (matches(*argv, "offload") == 0) {
+   if (opt.flags & TC_ETF_OFFLOAD_ON) {
+   fprintf(stderr, "etf: duplicate \"offload\" 
specification\n");
+   return -1;
+   }
+
+   opt.flags |= TC_ETF_OFFLOAD_ON;
+   } else if (matches(*argv, "deadline_mode") == 0) {
+   if (opt.flags & TC_ETF_DEADLINE_MODE_ON) {
+   fprintf(stderr, "etf: duplicate 
\"deadline_mode\" specification\n");
+   return -1;
+   }
+
+   opt.flags |= TC_ETF_DEADLINE_MODE_ON;
+   } else if (matches(*argv, "delta") == 0) {
+   NEXT_ARG();
+   if (opt.delta) {
+   fprintf(stderr, "etf: duplicate \"delta\" 
specification\n");
+   return -1;
+   }
+   if (get_s32(, *argv, 0)) {
+   explain1("delta", *argv);
+   return -1;
+   }
+   } else if (matches(*argv, "clockid") == 0) {
+   NEXT_ARG();
+   if (opt.clockid != CLOCKID_INVALID) {
+   fprintf(stderr, "etf: duplicate \"clockid\" 
specification\n");
+   return -1;
+

[PATCH v4 iproute2-next 0/3] Add support for ETF qdisc

2018-07-09 Thread Jesus Sanchez-Palencia
fixes since v3:
 - Add support for clock names with the "CLOCK_" prefix;
 - Print clock name on print_opt();
 - Use strcasecmp() instead of strncasecmp().


The ETF (earliest txtime first) qdisc was recently merged into net-next
[1], so this patchset adds support for it through the tc command line
tool.

An initial man page is also provided.

The first commit in this series is adding an updated version of
include/uapi/linux/pkt_sched.h and is not meant to be merged. It's
provided here just as a convenience for those who want to easily build
this patchset.

[1] https://patchwork.ozlabs.org/cover/938991/

Jesus Sanchez-Palencia (2):
  uapi pkt_sched: Add etf info - DO NOT COMMIT
  man: Add initial manpage for tc-etf(8)

Vinicius Costa Gomes (1):
  tc: Add support for the ETF Qdisc

 include/uapi/linux/pkt_sched.h |  21 
 man/man8/tc-etf.8  | 141 +
 tc/Makefile|   1 +
 tc/q_etf.c | 181 +
 4 files changed, 344 insertions(+)
 create mode 100644 man/man8/tc-etf.8
 create mode 100644 tc/q_etf.c

-- 
2.18.0



[PATCH v4 iproute2-next 3/3] man: Add initial manpage for tc-etf(8)

2018-07-09 Thread Jesus Sanchez-Palencia
Add an initial manpage for tc-etf covering all config options, basic
concepts and operation modes.

Signed-off-by: Jesus Sanchez-Palencia 
---
 man/man8/tc-etf.8 | 141 ++
 1 file changed, 141 insertions(+)
 create mode 100644 man/man8/tc-etf.8

diff --git a/man/man8/tc-etf.8 b/man/man8/tc-etf.8
new file mode 100644
index ..30a12de7
--- /dev/null
+++ b/man/man8/tc-etf.8
@@ -0,0 +1,141 @@
+.TH ETF 8 "05 Jul 2018" "iproute2" "Linux"
+.SH NAME
+ETF \- Earliest TxTime First (ETF) Qdisc
+.SH SYNOPSIS
+.B tc qdisc ... dev
+dev
+.B parent
+classid
+.B [ handle
+major:
+.B ] etf clockid
+clockid
+.B [ delta
+delta_nsecs
+.B ] [ deadline_mode ]
+.B [ offload ]
+
+.SH DESCRIPTION
+The ETF (Earliest TxTime First) qdisc allows applications to control
+the instant when a packet should be dequeued from the traffic control
+layer into the netdevice. If
+.B offload
+is configured and supported by the network interface card, then it will
+also control when packets leave the network controller.
+
+ETF achieves that by buffering packets until a configurable time
+before their transmission time (i.e. txtime, or deadline), which can
+be configured through the
+.B delta
+option.
+
+The qdisc uses a rb-tree internally so packets are always 'ordered' by
+their txtime and will be dequeued following the (next) earliest txtime
+first.
+
+It relies on the SO_TXTIME socket option and the SCM_TXTIME CMSG in
+each packet field to configure the behavior of time dependent sockets:
+the clockid to be used as a reference, if the expected mode of txtime
+for that socket is deadline or strict mode, and if packet drops should
+be reported on the socket's error queue. See
+.BR socket(7)
+for more information.
+
+The etf qdisc will drop any packets with a txtime in the past, or if a
+packet expires while waiting for being dequeued.
+
+This queueing discipline is intended to be used by TSN (Time Sensitive
+Networking) applications, and it exposes a traffic shaping functionality
+that is commonly documented as "Launch Time" or "Time-Based Scheduling"
+by vendors and the documentation of network interface controllers.
+
+ETF is meant to be installed under another qdisc that maps packet flows
+to traffic classes, one example is
+.BR mqprio(8).
+
+.SH PARAMETERS
+.TP
+clockid
+.br
+Specifies the clock to be used by qdisc's internal timer for measuring
+time and scheduling events. The qdisc expects that packets passing
+through it to be using this same
+.B clockid
+as the reference of their txtime timestamps. It will drop packets
+coming from sockets that do not comply with that.
+
+For more information about time and clocks on Linux, please refer
+to
+.BR time(7)
+and
+.BR clock_gettime(3).
+
+.TP
+delta
+.br
+After enqueueing or dequeueing a packet, the qdisc will schedule its
+next wake-up time for the next txtime minus this delta value.
+This means
+.B delta
+can be used as a fudge factor for the scheduler latency of a system.
+This value must be specified in nanoseconds.
+The default value is 0 nanoseconds.
+
+.TP
+deadline_mode
+.br
+When
+.B deadline_mode
+is set, the qdisc will handle txtime with a different semantics,
+changed from a 'strict' transmission time to a deadline.
+In practice, this means during the dequeue flow
+.BR etf(8)
+will set the txtime of the packet being dequeued to 'now'.
+The default is for this option to be disabled.
+
+.TP
+offload
+.br
+When
+.B offload
+is set,
+.BR etf(8)
+will try to configure the network interface so time-based transmission
+arbitration is enabled in the controller. This feature is commonly
+referred to as "Launch Time" or "Time-Based Scheduling" by the
+documentation of network interface controllers.
+The default is for this option to be disabled.
+
+.SH EXAMPLES
+
+ETF is used to enforce a Quality of Service. It controls when each
+packet should be dequeued and transmitted, and can be used for
+limiting the data rate of a traffic class. To separate packets into
+traffic classes the user may choose
+.BR mqprio(8),
+and configure it like this:
+
+.EX
+# tc qdisc add dev eth0 handle 100: parent root mqprio num_tc 3 \\
+   map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \\
+   queues 1@0 1@1 2@2 \\
+   hw 0
+.EE
+.P
+To replace the current queueing discipline by ETF in traffic class
+number 0, issue:
+.P
+.EX
+# tc qdisc replace dev eth0 parent 100:1 etf \\
+   clockid CLOCK_TAI delta 300000 offload
+.EE
+
+With the options above, etf will be configured to use CLOCK_TAI as
+its clockid_t, will schedule packets 300 us before their txtime,
+and will enable the functionality on that in the network interface
+card. Deadline mode will not be configured for this mode.
+
+.SH AUTHORS
+Jesus Sanchez-Palencia 
+.br
+Vinicius Costa Gomes 
-- 
2.18.0



[PATCH v4 iproute2-next 1/3] uapi pkt_sched: Add etf info - DO NOT COMMIT

2018-07-09 Thread Jesus Sanchez-Palencia
This should come from the next uapi headers update.
Sending it now just as a convenience so anyone can build tc with etf
and taprio support.

Signed-off-by: Jesus Sanchez-Palencia 
---
 include/uapi/linux/pkt_sched.h | 21 +
 1 file changed, 21 insertions(+)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 37b5096a..94911846 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -539,6 +539,7 @@ enum {
TCA_NETEM_LATENCY64,
TCA_NETEM_JITTER64,
TCA_NETEM_SLOT,
+   TCA_NETEM_SLOT_DIST,
__TCA_NETEM_MAX,
 };
 
@@ -581,6 +582,8 @@ struct tc_netem_slot {
__s64   max_delay;
__s32   max_packets;
__s32   max_bytes;
+   __s64   dist_delay; /* nsec */
+   __s64   dist_jitter; /* nsec */
 };
 
 enum {
@@ -934,4 +937,22 @@ enum {
 
 #define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
 
+
+/* ETF */
+struct tc_etf_qopt {
+   __s32 delta;
+   __s32 clockid;
+   __u32 flags;
+#define TC_ETF_DEADLINE_MODE_ON	BIT(0)
+#define TC_ETF_OFFLOAD_ON  BIT(1)
+};
+
+enum {
+   TCA_ETF_UNSPEC,
+   TCA_ETF_PARMS,
+   __TCA_ETF_MAX,
+};
+
+#define TCA_ETF_MAX (__TCA_ETF_MAX - 1)
+
 #endif
-- 
2.18.0



Re: [PATCH v2 net-next] net: Use __u32 in uapi net_stamp.h

2018-07-09 Thread David Miller
From: Jesus Sanchez-Palencia 
Date: Mon,  9 Jul 2018 16:20:56 -0700

> We are not supposed to use u32 in uapi, so change the flags member of
> struct sock_txtime from u32 to __u32 instead.
> 
> Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit 
> time")
> Reported-by: Eric Dumazet 
> Signed-off-by: Jesus Sanchez-Palencia 

Applied.


Re: [PATCH v2] rhashtable: add restart routine in rhashtable_free_and_destroy()

2018-07-09 Thread David Miller
From: Taehee Yoo 
Date: Sun,  8 Jul 2018 11:55:51 +0900

> rhashtable_free_and_destroy() cancels re-hash deferred work
> then walks and destroys elements. At this moment, some elements can
> still be in future_tbl; those elements are not destroyed.
> 
> test case:
> nft_rhash_destroy() calls rhashtable_free_and_destroy() to destroy
> all elements of sets before destroying sets and chains.
> But rhashtable_free_and_destroy() doesn't destroy elements of future_tbl.
> so that splat occurred.
> 
> test script:
 ...
> Splat looks like:
 ...
> V2:
>  - free all tables requested by Herbert Xu
> 
> Signed-off-by: Taehee Yoo 

Applied and queued up for -stable.


Re: [PATCH v2 net-next] net: Use __u32 in uapi net_stamp.h

2018-07-09 Thread Eric Dumazet



On 07/09/2018 04:20 PM, Jesus Sanchez-Palencia wrote:
> We are not supposed to use u32 in uapi, so change the flags member of
> struct sock_txtime from u32 to __u32 instead.
> 
> Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit 
> time")
> Reported-by: Eric Dumazet 
> Signed-off-by: Jesus Sanchez-Palencia 
> ---

Reviewed-by: Eric Dumazet 

Thanks !



Re: [PATCH net 0/6] bnxt_en: Bug fixes.

2018-07-09 Thread David Miller
From: Michael Chan 
Date: Mon,  9 Jul 2018 02:24:46 -0400

> These are bug fixes in error code paths, TC Flower VLAN TCI flow
> checking bug fix, proper filtering of Broadcast packets if IFF_BROADCAST
> is not set, and a bug fix in bnxt_get_max_rings() to return 0 ring
> parameters when the return value is -ENOMEM.

Series applied, thank you.


Re: [PATCH net-next 00/12] mlxsw: More Spectrum-2 preparations

2018-07-09 Thread David Miller
From: Ido Schimmel 
Date: Sun,  8 Jul 2018 23:51:15 +0300

> This is the second and last set of preparations towards initial
> Spectrum-2 support in mlxsw. It mainly re-arranges parts of the code
> that need to work with both ASICs, but somewhat differ.
> 
> The first three patches allow different ASICs to register different set
> of operations for KVD linear (KVDL) management. In Spectrum-2 there is
> no linear memory and instead entries that reside there in Spectrum
> (e.g., nexthops) are hashed and inserted to the hash-based KVD memory.
> 
> The fourth patch does a similar restructuring in the low-level multicast
> router code. This is necessary because multicast routing is implemented
> using regular circuit TCAM (C-TCAM) in Spectrum, whereas Spectrum-2 uses
> an algorithmic TCAM (A-TCAM).
> 
> Next six patches prepare the ACL code for the introduction of A-TCAM in
> follow-up patch sets.
> 
> Last two patches allow different ASICs to require different firmware
> versions and add two resources that need to be queried from firmware by
> Spectrum-2 specific code.

Series applied, thank you.


[PATCH v2 net-next] net: Use __u32 in uapi net_stamp.h

2018-07-09 Thread Jesus Sanchez-Palencia
We are not supposed to use u32 in uapi, so change the flags member of
struct sock_txtime from u32 to __u32 instead.

Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time")
Reported-by: Eric Dumazet 
Signed-off-by: Jesus Sanchez-Palencia 
---
 include/uapi/linux/net_tstamp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index f8f4539f1135..97ff3c17ec4d 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -155,8 +155,8 @@ enum txtime_flags {
 };
 
 struct sock_txtime {
-   clockid_t   clockid;/* reference clockid */
-   u32 flags;  /* flags defined by enum txtime_flags */
+   clockid_t   clockid;/* reference clockid */
+   __u32   flags;  /* as defined by enum txtime_flags */
 };
 
 #endif /* _NET_TIMESTAMPING_H */
-- 
2.18.0



Re: [PATCH net-next] net: Use __u32 in uapi net_stamp.h

2018-07-09 Thread Jesus Sanchez-Palencia



On 07/09/2018 04:18 PM, Eric Dumazet wrote:
> 
> 
> On 07/09/2018 04:08 PM, Jesus Sanchez-Palencia wrote:
>> We are not supposed to use u32 in uapi, so change the flags member of
>> struct sock_txtime from u32 to __u32 instead.
>>
>> Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit 
>> time")
>> Signed-off-by: Jesus Sanchez-Palencia 
> 
> Could you use this patch as an opportunity to tab-align the fields names ?
> 
> Also you can credit the reporter, as in :
> 
> Reported-by: Eric Dumazet 
> 

Sure


> Thanks !
> 


Re: [PATCH v2 net-next 00/18] TLS offload rx, netdev & mlx5

2018-07-09 Thread David Miller
From: Boris Pismenny 
Date: Sun,  8 Jul 2018 12:36:14 +0300

> The following series provides TLS TX inline crypto offload.

I think this patch series provides RX inline offload not TX inline
offload.


Re: [PATCH v3 iproute2 2/3] tc: Add support for the ETF Qdisc

2018-07-09 Thread Jesus Sanchez-Palencia



On 07/09/2018 10:32 AM, David Ahern wrote:
> On 7/9/18 9:48 AM, Jesus Sanchez-Palencia wrote:
>> Hi David,
>>
>>
>> On 07/06/2018 08:58 AM, David Ahern wrote:
>>> On 7/5/18 4:42 PM, Jesus Sanchez-Palencia wrote:
>>>
 +static int get_clockid(__s32 *val, const char *arg)
 +{
 +  const struct static_clockid {
 +  const char *name;
 +  clockid_t clockid;
 +  } clockids_sysv[] = {
 +  { "CLOCK_REALTIME", CLOCK_REALTIME },
 +  { "CLOCK_TAI", CLOCK_TAI },
 +  { "CLOCK_BOOTTIME", CLOCK_BOOTTIME },
 +  { "CLOCK_MONOTONIC", CLOCK_MONOTONIC },
 +  { NULL }
 +  };
 +
 +  const struct static_clockid *c;
 +
 +  for (c = clockids_sysv; c->name; c++) {
 +  if (strncasecmp(c->name, arg, 25) == 0) {
>>>
>>> Why 25?
>>
>>
>> That was just an upper bound giving some room beyond the longest
>> clockid name we have today. Should I add a define MAX_CLOCK_NAME ?
> 
> why not just strcasecmp? using the 'n' variant with n > strlen of either
> argument seems pointless.


Ok, will fix.


> 
>>
>>
>>>
>>> be nice to allow shortcuts -- e.g., just REALTIME or realtime.
>>
>>
>> I'd rather just keep it as is and use the names as they are defined for
>> everything else (i.e. CLOCK_REALTIME), unless there are some strong 
>> objections.
> 
> An all caps argument is unnecessary work on the pinky finger and the
> CLOCK_ prefix is redundant to the keyword. Really, just a thought on
> making it easier for users. A CLI argument does not need to maintain a
> 1:1 with code names.


Lower case already works given the strncasecmp() usage but, fair enough, I will
modify the implementation so it accepts both CLOCK_FOO or FOO (lower case
included), and will make it print one of the two strings during print_opt().


Thanks,
Jesus



Re: [PATCH net-next] net: Use __u32 in uapi net_stamp.h

2018-07-09 Thread Eric Dumazet



On 07/09/2018 04:08 PM, Jesus Sanchez-Palencia wrote:
> We are not supposed to use u32 in uapi, so change the flags member of
> struct sock_txtime from u32 to __u32 instead.
> 
> Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit 
> time")
> Signed-off-by: Jesus Sanchez-Palencia 

Could you use this patch as an opportunity to tab-align the fields names ?

Also you can credit the reporter, as in :

Reported-by: Eric Dumazet 

Thanks !



[PATCH net-next] net: Use __u32 in uapi net_stamp.h

2018-07-09 Thread Jesus Sanchez-Palencia
We are not supposed to use u32 in uapi, so change the flags member of
struct sock_txtime from u32 to __u32 instead.

Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time")
Signed-off-by: Jesus Sanchez-Palencia 
---
 include/uapi/linux/net_tstamp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h
index f8f4539f1135..bdae4806fe40 100644
--- a/include/uapi/linux/net_tstamp.h
+++ b/include/uapi/linux/net_tstamp.h
@@ -156,7 +156,7 @@ enum txtime_flags {
 
 struct sock_txtime {
clockid_t   clockid;/* reference clockid */
-   u32 flags;  /* flags defined by enum txtime_flags */
+   __u32 flags;/* flags defined by enum txtime_flags */
 };
 
 #endif /* _NET_TIMESTAMPING_H */
-- 
2.18.0



Re: [PATCH net-next v2] tcp: remove SG-related comment in tcp_sendmsg()

2018-07-09 Thread David Miller
From: Julian Wiedmann 
Date: Mon,  9 Jul 2018 09:45:14 +0200

> Since commit 74d4a8f8d378 ("tcp: remove sk_can_gso() use"), the code
> doesn't care whether the interface supports SG.
> 
> Signed-off-by: Julian Wiedmann 
> ---
> v2: remove the whole comment (Eric)

Applied.


[PATCH bpf] bpf: fix ldx in ld_abs rewrite for large offsets

2018-07-09 Thread Daniel Borkmann
Mark reported that syzkaller triggered a KASAN detected slab-out-of-bounds
bug in ___bpf_prog_run() with a BPF_LD | BPF_ABS word load at offset 0x8001.
After further investigation it became clear that the issue was the
BPF_LDX_MEM() which takes offset as an argument whereas it cannot encode
larger than S16_MAX offsets into it. For this synthetical case we need to
move the full address into tmp register instead and do the LDX without
immediate value.

Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf")
Reported-by: syzbot 
Reported-by: Mark Rutland 
Signed-off-by: Daniel Borkmann 
---
 net/core/filter.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 5fa66a3..a13f5b1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -459,11 +459,21 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, 
struct bpf_insn **insnp)
 (!unaligned_ok && offset >= 0 &&
  offset + ip_align >= 0 &&
  offset + ip_align % size == 0))) {
+   bool ldx_off_ok = offset <= S16_MAX;
+
*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
-   *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian);
-   *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D,
- offset);
+   *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
+ size, 2 + endian + (!ldx_off_ok * 2));
+   if (ldx_off_ok) {
+   *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+ BPF_REG_D, offset);
+   } else {
+   *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
+   *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
+   *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+ BPF_REG_TMP, 0);
+   }
if (endian)
*insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
*insn++ = BPF_JMP_A(8);
-- 
2.9.5



Re: [BUG] mlx5 have problems with ipv4-ipv6 tunnels in linux 4.4

2018-07-09 Thread Saeed Mahameed
On Tue, Jul 3, 2018 at 10:45 PM, Konstantin Khlebnikov
 wrote:
> I'm seeing problems with tunnelled traffic with Mellanox Technologies
> MT27710 Family [ConnectX-4 Lx] using vanilla driver from linux 4.4.y
>
> Packets with payload bigger than 116 bytes are not transmitted.
> Smaller packets and normal ipv6 works fine.
>

Hi Konstantin,

Is this true for all ipv6 traffic or just ipv4-ipv6 tunnels ?

what is the skb_network_offset(skb) for such packet ?

> In linux 4.9, 4.14 and out-of-tree driver everything seems fine for now.
> It's hard to guess or bisect commit: there are a lot of changes and
> something wrong with driver or swiotlb in 4.7..4.8.
> 4.6 is affected too - so this should be something between 4.6 and 4.9
>
> Probably this case was fixed indirectly by adding some kind of offload and
> non-offloaded path is still broken.
> Please give me a hint: which commit could it be.
>

I suspect it works in a newer kernel since we introduced on 4.7/4.8:

commit e3a19b53cbb0e6738b7a547f262179065b72e3fa
Author: Matthew Finlay 
Date:   Thu Jun 30 17:34:47 2016 +0300

net/mlx5e: Copy all L2 headers into inline segment

ConnectX4-Lx uses an inline wqe mode that currently defaults to
requiring the entire L2 header be included in the wqe.
This patch fixes mlx5e_get_inline_hdr_size() to account for
all L2 headers (VLAN, QinQ, etc) using skb_network_offset(skb).

Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files")
Signed-off-by: Matthew Finlay 
Signed-off-by: Saeed Mahameed 
Signed-off-by: David S. Miller 



commit ae76715d153e33c249b6850361e4d8d775388b5a
Author: Hadar Hen Zion 
Date:   Sun Jul 24 16:12:39 2016 +0300

net/mlx5e: Check the minimum inline header mode before xmit

and then some fixes on top of it, such as:

commit f600c6088018d1dbc5777d18daa83660f7ea4a64
Author: Eran Ben Elisha 
Date:   Thu Jan 25 11:18:09 2018 +0200

net/mlx5e: Verify inline header size do not exceed SKB linear size


anyhow, can you try the above patches one by one  on 4.4.y and see if it helps ?


Thanks,
Saeed


Re: [PATCH v2 net-next 02/14] net: Add a new socket option for a future transmit time.

2018-07-09 Thread Jesus Sanchez-Palencia



On 07/07/2018 05:44 PM, Eric Dumazet wrote:
> 
> 
> On 07/03/2018 03:42 PM, Jesus Sanchez-Palencia wrote:
>> From: Richard Cochran 
>>
>> This patch introduces SO_TXTIME. User space enables this option in
>> order to pass a desired future transmit time in a CMSG when calling
>> sendmsg(2). The argument to this socket option is a 8-bytes long struct
>> provided by the uapi header net_tstamp.h defined as:
>>
>> struct sock_txtime {
>>  clockid_t   clockid;
>>  u32 flags;
>> };
>>
>> Note that new fields were added to struct sock by filling a 2-bytes
>> hole found in the struct. For that reason, neither the struct size or
>> number of cachelines were altered.
> 
> 
>> diff --git a/include/uapi/linux/net_tstamp.h 
>> b/include/uapi/linux/net_tstamp.h
>> index 4fe104b2411f..c9a77c353b98 100644
>> --- a/include/uapi/linux/net_tstamp.h
>> +++ b/include/uapi/linux/net_tstamp.h
>> @@ -141,4 +141,19 @@ struct scm_ts_pktinfo {
>>  __u32 reserved[2];
>>  };
>>  
>> +/*
>> + * SO_TXTIME gets a struct sock_txtime with flags being an integer bit
>> + * field comprised of these values.
>> + */
>> +enum txtime_flags {
>> +SOF_TXTIME_DEADLINE_MODE = (1 << 0),
>> +
>> +SOF_TXTIME_FLAGS_MASK = (SOF_TXTIME_DEADLINE_MODE)
>> +};
>> +
>> +struct sock_txtime {
>> +clockid_t   clockid;/* reference clockid */
>> +u32 flags;  /* flags defined by enum txtime_flags */
>> +};
>> +
> 
> I was under the impression that we could not use 'u32' type in 
> include/uapi/linux/* file
> 
> This must be replaced by __u32, right ?


I'm sending a patch fixing that now, thanks.



[net-next 2/7] net: Add support for subordinate device traffic classes

2018-07-09 Thread Jeff Kirsher
From: Alexander Duyck 

This patch is meant to provide the basic tools needed to allow us to create
subordinate device traffic classes. The general idea here is to allow
subdividing the queues of a device into queue groups accessible through an
upper device such as a macvlan.

The idea here is to enforce the idea that an upper device has to be a
single queue device, ideally with IFF_NO_QUEUE set. With that being the
case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
for the upper device are unused. As such we could reuse those in order to
support subdividing the lower device and distributing those queues between
the subordinate devices.

In order to distinguish between a regular set of traffic classes and if a
device is carrying subordinate traffic classes I changed num_tc from a u8
to a s16 value and use the negative values to represent the subordinate
pool values. So starting at -1 and running to -32768 we can encode those as
pool values, and the existing values of 0 to 15 can be maintained.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 include/linux/netdevice.h | 16 ++-
 net/core/dev.c| 89 +++
 net/core/net-sysfs.c  | 21 -
 3 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b683971e500d..b1ff77276bc4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -575,6 +575,9 @@ struct netdev_queue {
 * (/sys/class/net/DEV/Q/trans_timeout)
 */
unsigned long   trans_timeout;
+
+   /* Subordinate device that the queue has been assigned to */
+   struct net_device   *sb_dev;
 /*
  * write-mostly part
  */
@@ -1991,7 +1994,7 @@ struct net_device {
 #ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-   u8  num_tc;
+   s16 num_tc;
struct netdev_tc_txqtc_to_txq[TC_MAX_QUEUE];
u8  prio_tc_map[TC_BITMASK + 1];
 
@@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev)
return dev->num_tc;
 }
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+ struct net_device *sb_dev);
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+struct net_device *sb_dev,
+u8 tc, u16 count, u16 offset);
+int netdev_set_sb_channel(struct net_device *dev, u16 channel);
+static inline int netdev_get_sb_channel(struct net_device *dev)
+{
+   return max_t(int, -dev->num_tc, 0);
+}
+
 static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 unsigned int index)
diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1eccdc..cc1d6bba017a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2067,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
int txq)
struct netdev_tc_txq *tc = >tc_to_txq[0];
int i;
 
+   /* walk through the TCs and see if it falls into any of them */
for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
if ((txq - tc->offset) < tc->count)
return i;
}
 
+   /* didn't find it, just return -1 to indicate no match */
return -1;
}
 
@@ -2260,7 +2262,14 @@ int __netif_set_xps_queue(struct net_device *dev, const 
unsigned long *mask,
unsigned int nr_ids;
 
if (dev->num_tc) {
+   /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
+   if (num_tc < 0)
+   return -EINVAL;
+
+   /* If queue belongs to subordinate dev use its map */
+   dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
@@ -2448,11 +2457,25 @@ int netif_set_xps_queue(struct net_device *dev, const 
struct cpumask *mask,
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+   struct netdev_queue *txq = >_tx[dev->num_tx_queues];
+
+   /* Unbind any subordinate channels */
+   while (txq-- != >_tx[0]) {
+   if (txq->sb_dev)
+   netdev_unbind_sb_channel(dev, txq->sb_dev);
+   }
+}
+
 void netdev_reset_tc(struct net_device *dev)
 {
 #ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
 #endif
+   netdev_unbind_all_sb_channels(dev);
+
+   /* Reset TC configuration of device */
dev->num_tc = 0;
memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2481,11 +2504,77 @@ int 

[net-next 4/7] net: Add support for subordinate traffic classes to netdev_pick_tx

2018-07-09 Thread Jeff Kirsher
From: Alexander Duyck 

This change makes it so that we can support the concept of subordinate
device traffic classes to the core networking code. In doing this we can
start pulling out the driver specific bits needed to support selecting a
queue based on an upper device.

The solution as it currently stands is only partially implemented. I have
the start of some XPS bits in here, but I would still need to allow for
configuration of the XPS maps on the queues reserved for the subordinate
devices. For now I am using the reference to the sb_dev XPS map as just a
way to skip the lookup of the lower device XPS map for now as that would
result in the wrong queue being picked.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 19 ++
 drivers/net/macvlan.c | 10 +---
 include/linux/netdevice.h |  4 +-
 net/core/dev.c| 58 +++
 4 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 80225af2acb1..abb176df2e7f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8208,20 +8208,17 @@ static void ixgbe_atr(struct ixgbe_ring *ring,
  input, common, ring->queue_index);
 }
 
+#ifdef IXGBE_FCOE
 static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
  void *accel_priv, select_queue_fallback_t 
fallback)
 {
-   struct ixgbe_fwd_adapter *fwd_adapter = accel_priv;
-#ifdef IXGBE_FCOE
struct ixgbe_adapter *adapter;
struct ixgbe_ring_feature *f;
-#endif
int txq;
 
-   if (fwd_adapter) {
-   u8 tc = netdev_get_num_tc(dev) ?
-   netdev_get_prio_tc_map(dev, skb->priority) : 0;
-   struct net_device *vdev = fwd_adapter->netdev;
+   if (accel_priv) {
+   u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+   struct net_device *vdev = accel_priv;
 
txq = vdev->tc_to_txq[tc].offset;
txq += reciprocal_scale(skb_get_hash(skb),
@@ -8230,8 +8227,6 @@ static u16 ixgbe_select_queue(struct net_device *dev, 
struct sk_buff *skb,
return txq;
}
 
-#ifdef IXGBE_FCOE
-
/*
 * only execute the code below if protocol is FCoE
 * or FIP and we have FCoE enabled on the adapter
@@ -8257,11 +8252,9 @@ static u16 ixgbe_select_queue(struct net_device *dev, 
struct sk_buff *skb,
txq -= f->indices;
 
return txq + f->offset;
-#else
-   return fallback(dev, skb);
-#endif
 }
 
+#endif
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
   struct xdp_frame *xdpf)
 {
@@ -10058,7 +10051,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_open   = ixgbe_open,
.ndo_stop   = ixgbe_close,
.ndo_start_xmit = ixgbe_xmit_frame,
-   .ndo_select_queue   = ixgbe_select_queue,
.ndo_set_rx_mode= ixgbe_set_rx_mode,
.ndo_validate_addr  = eth_validate_addr,
.ndo_set_mac_address= ixgbe_set_mac,
@@ -10081,6 +10073,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_poll_controller= ixgbe_netpoll,
 #endif
 #ifdef IXGBE_FCOE
+   .ndo_select_queue   = ixgbe_select_queue,
.ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get,
.ndo_fcoe_ddp_target = ixgbe_fcoe_ddp_target,
.ndo_fcoe_ddp_done = ixgbe_fcoe_ddp_put,
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index adde8fc45588..401e1d1ce1ec 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -514,7 +514,6 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct 
net_device *dev)
const struct macvlan_dev *vlan = netdev_priv(dev);
const struct macvlan_port *port = vlan->port;
const struct macvlan_dev *dest;
-   void *accel_priv = NULL;
 
if (vlan->mode == MACVLAN_MODE_BRIDGE) {
const struct ethhdr *eth = (void *)skb->data;
@@ -533,15 +532,10 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct 
net_device *dev)
return NET_XMIT_SUCCESS;
}
}
-
-   /* For packets that are non-multicast and not bridged we will pass
-* the necessary information so that the lowerdev can distinguish
-* the source of the packets via the accel_priv value.
-*/
-   accel_priv = vlan->accel_priv;
 xmit_world:
skb->dev = vlan->lowerdev;
-   return dev_queue_xmit_accel(skb, accel_priv);
+   return dev_queue_xmit_accel(skb,
+   netdev_get_sb_channel(dev) ? dev : NULL);
 }
 
 static inline netdev_tx_t 

[net-next 7/7] net: allow fallback function to pass netdev

2018-07-09 Thread Jeff Kirsher
From: Alexander Duyck 

For most of these calls we can just pass NULL through to the fallback
function as the sb_dev. The only cases where we cannot are the cases where
we might be dealing with either an upper device or a driver that would
have configured things to support an sb_dev itself.

The only driver that has any significant change in this patch set should be
ixgbe as we can drop the redundant functionality that existed in both the
ndo_select_queue function and the fallback function that was passed through
to us.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c|  2 +-
 drivers/net/ethernet/broadcom/bcmsysport.c  |  4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |  3 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  2 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c   |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  4 ++--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c  |  4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |  2 +-
 drivers/net/hyperv/netvsc_drv.c |  2 +-
 drivers/net/net_failover.c  |  2 +-
 drivers/net/xen-netback/interface.c |  2 +-
 include/linux/netdevice.h   |  3 ++-
 net/core/dev.c  | 12 +++-
 net/packet/af_packet.c  |  7 ---
 14 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c 
b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index e3befb1f9204..c673ac2df65b 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2224,7 +2224,7 @@ static u16 ena_select_queue(struct net_device *dev, 
struct sk_buff *skb,
if (skb_rx_queue_recorded(skb))
qid = skb_get_rx_queue(skb);
else
-   qid = fallback(dev, skb);
+   qid = fallback(dev, skb, NULL);
 
return qid;
 }
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index 32f548e6431d..eb890c4b3b2d 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2116,7 +2116,7 @@ static u16 bcm_sysport_select_queue(struct net_device 
*dev, struct sk_buff *skb,
unsigned int q, port;
 
if (!netdev_uses_dsa(dev))
-   return fallback(dev, skb);
+   return fallback(dev, skb, NULL);
 
/* DSA tagging layer will have configured the correct queue */
q = BRCM_TAG_GET_QUEUE(queue);
@@ -2124,7 +2124,7 @@ static u16 bcm_sysport_select_queue(struct net_device 
*dev, struct sk_buff *skb,
tx_ring = priv->ring_map[q + port * priv->per_port_num_tx_queues];
 
if (unlikely(!tx_ring))
-   return fallback(dev, skb);
+   return fallback(dev, skb, NULL);
 
return tx_ring->index;
 }
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index e4e1cf907ac6..5a727d4729da 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1933,7 +1933,8 @@ u16 bnx2x_select_queue(struct net_device *dev, struct 
sk_buff *skb,
}
 
/* select a non-FCoE queue */
-   return fallback(dev, skb) % (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos);
+   return fallback(dev, skb, NULL) %
+  (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos);
 }
 
 void bnx2x_set_num_queues(struct bnx2x *bp)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 5dc5e5604f05..40cf8dc9f163 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -973,7 +973,7 @@ static u16 cxgb_select_queue(struct net_device *dev, struct 
sk_buff *skb,
return txq;
}
 
-   return fallback(dev, skb) % dev->real_num_tx_queues;
+   return fallback(dev, skb, NULL) % dev->real_num_tx_queues;
 }
 
 static int closest_timer(const struct sge *s, int time)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c 
b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index ff7a74ec8f11..948b3e0d18f4 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -2033,7 +2033,7 @@ hns_nic_select_queue(struct net_device *ndev, struct 
sk_buff *skb,
is_multicast_ether_addr(eth_hdr->h_dest))
return 0;
else
-   return fallback(ndev, skb);
+   return fallback(ndev, skb, NULL);
 }
 
 static const struct net_device_ops hns_nic_netdev_ops = {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 8c7a68c57afa..bd6d9ea27b4b 100644
--- 

[net-next 1/7] net-sysfs: Drop support for XPS and traffic_class on single queue device

2018-07-09 Thread Jeff Kirsher
From: Alexander Duyck 

This patch makes it so that we do not report the traffic class or allow XPS
configuration on single queue devices. This is mostly to avoid unnecessary
complexity with changes I have planned that will allow us to reuse
the unused tc_to_txq and XPS configuration on a single queue device to
allow it to make use of a subset of queues on an underlying device.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 net/core/net-sysfs.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index f25ac5ff48a6..dce3ae0fbca2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1047,9 +1047,14 @@ static ssize_t traffic_class_show(struct netdev_queue 
*queue,
  char *buf)
 {
struct net_device *dev = queue->dev;
-   int index = get_netdev_queue_index(queue);
-   int tc = netdev_txq_to_tc(dev, index);
+   int index;
+   int tc;
 
+   if (!netif_is_multiqueue(dev))
+   return -ENOENT;
+
+   index = get_netdev_queue_index(queue);
+   tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
 
@@ -1214,6 +1219,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
cpumask_var_t mask;
unsigned long index;
 
+   if (!netif_is_multiqueue(dev))
+   return -ENOENT;
+
index = get_netdev_queue_index(queue);
 
if (dev->num_tc) {
@@ -1260,6 +1268,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
cpumask_var_t mask;
int err;
 
+   if (!netif_is_multiqueue(dev))
+   return -ENOENT;
+
if (!capable(CAP_NET_ADMIN))
return -EPERM;
 
-- 
2.17.1



[net-next 6/7] net: allow ndo_select_queue to pass netdev

2018-07-09 Thread Jeff Kirsher
From: Alexander Duyck 

This patch makes it so that instead of passing a void pointer as the
accel_priv we instead pass a net_device pointer as sb_dev. Making this
change allows us to pass the subordinate device through to the fallback
function eventually so that we can keep the actual code in the
ndo_select_queue call as focused as possible on the exception cases.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/infiniband/hw/hfi1/vnic_main.c|  2 +-
 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c |  4 ++--
 drivers/net/bonding/bond_main.c   |  3 ++-
 drivers/net/ethernet/amazon/ena/ena_netdev.c  |  3 ++-
 drivers/net/ethernet/broadcom/bcmsysport.c|  2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   |  3 ++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |  3 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   |  3 ++-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c |  3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  7 ---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c|  3 ++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h  |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |  3 ++-
 drivers/net/ethernet/renesas/ravb_main.c  |  3 ++-
 drivers/net/ethernet/sun/ldmvsw.c |  3 ++-
 drivers/net/ethernet/sun/sunvnet.c|  3 ++-
 drivers/net/hyperv/netvsc_drv.c   |  4 ++--
 drivers/net/net_failover.c|  5 +++--
 drivers/net/team/team.c   |  3 ++-
 drivers/net/tun.c |  3 ++-
 drivers/net/wireless/marvell/mwifiex/main.c   |  3 ++-
 drivers/net/xen-netback/interface.c   |  2 +-
 drivers/net/xen-netfront.c|  3 ++-
 drivers/staging/rtl8188eu/os_dep/os_intfs.c   |  3 ++-
 drivers/staging/rtl8723bs/os_dep/os_intfs.c   |  7 +++
 include/linux/netdevice.h | 11 +++
 net/core/dev.c|  6 --
 net/mac80211/iface.c  |  4 ++--
 29 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c 
b/drivers/infiniband/hw/hfi1/vnic_main.c
index 5d65582fe4d9..616fc9b6fad8 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -423,7 +423,7 @@ static netdev_tx_t hfi1_netdev_start_xmit(struct sk_buff 
*skb,
 
 static u16 hfi1_vnic_select_queue(struct net_device *netdev,
  struct sk_buff *skb,
- void *accel_priv,
+ struct net_device *sb_dev,
  select_queue_fallback_t fallback)
 {
struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c 
b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
index 0c8aec62a425..61558788b3fa 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
@@ -95,7 +95,7 @@ static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb,
 }
 
 static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff 
*skb,
-void *accel_priv,
+struct net_device *sb_dev,
 select_queue_fallback_t fallback)
 {
struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev);
@@ -107,7 +107,7 @@ static u16 opa_vnic_select_queue(struct net_device *netdev, 
struct sk_buff *skb,
mdata->entropy = opa_vnic_calc_entropy(skb);
mdata->vl = opa_vnic_get_vl(adapter, skb);
rc = adapter->rn_ops->ndo_select_queue(netdev, skb,
-  accel_priv, fallback);
+  sb_dev, fallback);
skb_pull(skb, sizeof(*mdata));
return rc;
 }
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 63e3844c5bec..9a2ea3c1f949 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4094,7 +4094,8 @@ static inline int bond_slave_override(struct bonding 
*bond,
 
 
 static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
-void *accel_priv, select_queue_fallback_t fallback)
+struct net_device *sb_dev,
+select_queue_fallback_t fallback)
 {
/* This helper function exists to help dev_pick_tx get the correct
 * destination queue.  Using a helper function skips a call to
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c 
b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index f2af87d70594..e3befb1f9204 100644
--- 

[net-next 3/7] ixgbe: Add code to populate and use macvlan TC to Tx queue map

2018-07-09 Thread Jeff Kirsher
From: Alexander Duyck 

This patch makes it so that we use the tc_to_txq mapping in the macvlan
device in order to select the Tx queue for outgoing packets.

The idea here is to try and move away from using ixgbe_select_queue and to
come up with a generic way to make this work for devices going forward. By
encoding this information in the netdev this can become something that can
be used generically as a solution for similar setups going forward.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 44 ---
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a8e21becb619..80225af2acb1 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5275,6 +5275,8 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring 
*rx_ring)
 static int ixgbe_fwd_ring_up(struct ixgbe_adapter *adapter,
 struct ixgbe_fwd_adapter *accel)
 {
+   u16 rss_i = adapter->ring_feature[RING_F_RSS].indices;
+   int num_tc = netdev_get_num_tc(adapter->netdev);
struct net_device *vdev = accel->netdev;
int i, baseq, err;
 
@@ -5286,6 +5288,11 @@ static int ixgbe_fwd_ring_up(struct ixgbe_adapter 
*adapter,
accel->rx_base_queue = baseq;
accel->tx_base_queue = baseq;
 
+   /* record configuration for macvlan interface in vdev */
+   for (i = 0; i < num_tc; i++)
+   netdev_bind_sb_channel_queue(adapter->netdev, vdev,
+i, rss_i, baseq + (rss_i * i));
+
for (i = 0; i < adapter->num_rx_queues_per_pool; i++)
adapter->rx_ring[baseq + i]->netdev = vdev;
 
@@ -5310,6 +5317,10 @@ static int ixgbe_fwd_ring_up(struct ixgbe_adapter 
*adapter,
 
netdev_err(vdev, "L2FW offload disabled due to L2 filter error\n");
 
+   /* unbind the queues and drop the subordinate channel config */
+   netdev_unbind_sb_channel(adapter->netdev, vdev);
+   netdev_set_sb_channel(vdev, 0);
+
clear_bit(accel->pool, adapter->fwd_bitmask);
kfree(accel);
 
@@ -8201,18 +8212,22 @@ static u16 ixgbe_select_queue(struct net_device *dev, 
struct sk_buff *skb,
  void *accel_priv, select_queue_fallback_t 
fallback)
 {
struct ixgbe_fwd_adapter *fwd_adapter = accel_priv;
-   struct ixgbe_adapter *adapter;
-   int txq;
 #ifdef IXGBE_FCOE
+   struct ixgbe_adapter *adapter;
struct ixgbe_ring_feature *f;
 #endif
+   int txq;
 
if (fwd_adapter) {
-   adapter = netdev_priv(dev);
-   txq = reciprocal_scale(skb_get_hash(skb),
-  adapter->num_rx_queues_per_pool);
+   u8 tc = netdev_get_num_tc(dev) ?
+   netdev_get_prio_tc_map(dev, skb->priority) : 0;
+   struct net_device *vdev = fwd_adapter->netdev;
+
+   txq = vdev->tc_to_txq[tc].offset;
+   txq += reciprocal_scale(skb_get_hash(skb),
+   vdev->tc_to_txq[tc].count);
 
-   return txq + fwd_adapter->tx_base_queue;
+   return txq;
}
 
 #ifdef IXGBE_FCOE
@@ -8766,6 +8781,11 @@ static int ixgbe_reassign_macvlan_pool(struct net_device 
*vdev, void *data)
/* if we cannot find a free pool then disable the offload */
netdev_err(vdev, "L2FW offload disabled due to lack of queue 
resources\n");
macvlan_release_l2fw_offload(vdev);
+
+   /* unbind the queues and drop the subordinate channel config */
+   netdev_unbind_sb_channel(adapter->netdev, vdev);
+   netdev_set_sb_channel(vdev, 0);
+
kfree(accel);
 
return 0;
@@ -9769,6 +9789,13 @@ static void *ixgbe_fwd_add(struct net_device *pdev, 
struct net_device *vdev)
if (!macvlan_supports_dest_filter(vdev))
return ERR_PTR(-EMEDIUMTYPE);
 
+   /* We need to lock down the macvlan to be a single queue device so that
+* we can reuse the tc_to_txq field in the macvlan netdev to represent
+* the queue mapping to our netdev.
+*/
+   if (netif_is_multiqueue(vdev))
+   return ERR_PTR(-ERANGE);
+
pool = find_first_zero_bit(adapter->fwd_bitmask, adapter->num_rx_pools);
if (pool == adapter->num_rx_pools) {
u16 used_pools = adapter->num_vfs + adapter->num_rx_pools;
@@ -9825,6 +9852,7 @@ static void *ixgbe_fwd_add(struct net_device *pdev, 
struct net_device *vdev)
return ERR_PTR(-ENOMEM);
 
set_bit(pool, adapter->fwd_bitmask);
+   netdev_set_sb_channel(vdev, pool);
accel->pool = pool;
accel->netdev = vdev;
 
@@ -9866,6 +9894,10 @@ static void ixgbe_fwd_del(struct net_device *pdev, void 
*priv)
ring->netdev = 

[net-next 0/7][pull request] L2 Fwd Offload & 10GbE Intel Driver Updates 2018-07-09

2018-07-09 Thread Jeff Kirsher
This patch series is meant to allow support for the L2 forward offload, aka
MACVLAN offload without the need for using ndo_select_queue.

The existing solution currently requires that we use ndo_select_queue in
the transmit path if we want to associate specific Tx queues with a given
MACVLAN interface. In order to get away from this we need to repurpose the
tc_to_txq array and XPS pointer for the MACVLAN interface and use those as
a means of accessing the queues on the lower device. As a result we cannot
offload a device that is configured as multiqueue, however it doesn't
really make sense to configure a macvlan interface as being multiqueue
anyway since it doesn't really have a qdisc of its own in the first place.

The big changes in this set are:
  Allow lower device to update tc_to_txq and XPS map of offloaded MACVLAN
  Disable XPS for single queue devices
  Replace accel_priv with sb_dev in ndo_select_queue
  Add sb_dev parameter to fallback function for ndo_select_queue
  Consolidated ndo_select_queue functions that appeared to be duplicates

The following are changes since commit c47078d6a33fd78d882200cdaacbcfcd63318234:
  tcp: remove redundant SOCK_DONE checks
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 10GbE

Alexander Duyck (7):
  net-sysfs: Drop support for XPS and traffic_class on single queue
device
  net: Add support for subordinate device traffic classes
  ixgbe: Add code to populate and use macvlan TC to Tx queue map
  net: Add support for subordinate traffic classes to netdev_pick_tx
  net: Add generic ndo_select_queue functions
  net: allow ndo_select_queue to pass netdev
  net: allow fallback function to pass netdev

 drivers/infiniband/hw/hfi1/vnic_main.c|   2 +-
 .../infiniband/ulp/opa_vnic/opa_vnic_netdev.c |   4 +-
 drivers/net/bonding/bond_main.c   |   3 +-
 drivers/net/ethernet/amazon/ena/ena_netdev.c  |   5 +-
 drivers/net/ethernet/broadcom/bcmsysport.c|   6 +-
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   |   6 +-
 .../net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |   3 +-
 .../net/ethernet/chelsio/cxgb4/cxgb4_main.c   |   5 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c |   5 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  62 +--
 drivers/net/ethernet/lantiq_etop.c|  10 +-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c|   7 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h  |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |   3 +-
 .../net/ethernet/mellanox/mlx5/core/en_tx.c   |   5 +-
 drivers/net/ethernet/renesas/ravb_main.c  |   3 +-
 drivers/net/ethernet/sun/ldmvsw.c |   3 +-
 drivers/net/ethernet/sun/sunvnet.c|   3 +-
 drivers/net/ethernet/ti/netcp_core.c  |   9 +-
 drivers/net/hyperv/netvsc_drv.c   |   6 +-
 drivers/net/macvlan.c |  10 +-
 drivers/net/net_failover.c|   7 +-
 drivers/net/team/team.c   |   3 +-
 drivers/net/tun.c |   3 +-
 drivers/net/wireless/marvell/mwifiex/main.c   |   3 +-
 drivers/net/xen-netback/interface.c   |   4 +-
 drivers/net/xen-netfront.c|   3 +-
 drivers/staging/netlogic/xlr_net.c|   9 +-
 drivers/staging/rtl8188eu/os_dep/os_intfs.c   |   3 +-
 drivers/staging/rtl8723bs/os_dep/os_intfs.c   |   7 +-
 include/linux/netdevice.h |  34 +++-
 net/core/dev.c| 157 +++---
 net/core/net-sysfs.c  |  36 +++-
 net/mac80211/iface.c  |   4 +-
 net/packet/af_packet.c|   7 +-
 35 files changed, 312 insertions(+), 131 deletions(-)

-- 
2.17.1



[net-next 5/7] net: Add generic ndo_select_queue functions

2018-07-09 Thread Jeff Kirsher
From: Alexander Duyck 

This patch adds a generic version of the ndo_select_queue functions for
either returning 0 or selecting a queue based on the processor ID. This is
generally meant to just reduce the number of functions we have to change
in the future when we have to deal with ndo_select_queue changes.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/lantiq_etop.c   | 10 +-
 drivers/net/ethernet/ti/netcp_core.c |  9 +
 drivers/staging/netlogic/xlr_net.c   |  9 +
 include/linux/netdevice.h|  4 
 net/core/dev.c   | 14 ++
 net/packet/af_packet.c   |  2 +-
 6 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/lantiq_etop.c 
b/drivers/net/ethernet/lantiq_etop.c
index afc810069440..7a637b51c7d2 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -563,14 +563,6 @@ ltq_etop_set_multicast_list(struct net_device *dev)
spin_unlock_irqrestore(>lock, flags);
 }
 
-static u16
-ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb,
- void *accel_priv, select_queue_fallback_t fallback)
-{
-   /* we are currently only using the first queue */
-   return 0;
-}
-
 static int
 ltq_etop_init(struct net_device *dev)
 {
@@ -641,7 +633,7 @@ static const struct net_device_ops ltq_eth_netdev_ops = {
.ndo_set_mac_address = ltq_etop_set_mac_address,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_rx_mode = ltq_etop_set_multicast_list,
-   .ndo_select_queue = ltq_etop_select_queue,
+   .ndo_select_queue = dev_pick_tx_zero,
.ndo_init = ltq_etop_init,
.ndo_tx_timeout = ltq_etop_tx_timeout,
 };
diff --git a/drivers/net/ethernet/ti/netcp_core.c 
b/drivers/net/ethernet/ti/netcp_core.c
index 6ebf110cd594..a1d335a3c5e4 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1889,13 +1889,6 @@ static int netcp_rx_kill_vid(struct net_device *ndev, 
__be16 proto, u16 vid)
return err;
 }
 
-static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
- void *accel_priv,
- select_queue_fallback_t fallback)
-{
-   return 0;
-}
-
 static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type,
  void *type_data)
 {
@@ -1972,7 +1965,7 @@ static const struct net_device_ops netcp_netdev_ops = {
.ndo_vlan_rx_add_vid= netcp_rx_add_vid,
.ndo_vlan_rx_kill_vid   = netcp_rx_kill_vid,
.ndo_tx_timeout = netcp_ndo_tx_timeout,
-   .ndo_select_queue   = netcp_select_queue,
+   .ndo_select_queue   = dev_pick_tx_zero,
.ndo_setup_tc   = netcp_setup_tc,
 };
 
diff --git a/drivers/staging/netlogic/xlr_net.c 
b/drivers/staging/netlogic/xlr_net.c
index e461168313bf..4e6611e4c59b 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -290,13 +290,6 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
 }
 
-static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb,
-   void *accel_priv,
-   select_queue_fallback_t fallback)
-{
-   return (u16)smp_processor_id();
-}
-
 static void xlr_hw_set_mac_addr(struct net_device *ndev)
 {
struct xlr_net_priv *priv = netdev_priv(ndev);
@@ -403,7 +396,7 @@ static const struct net_device_ops xlr_netdev_ops = {
.ndo_open = xlr_net_open,
.ndo_stop = xlr_net_stop,
.ndo_start_xmit = xlr_net_start_xmit,
-   .ndo_select_queue = xlr_net_select_queue,
+   .ndo_select_queue = dev_pick_tx_cpu_id,
.ndo_set_mac_address = xlr_net_set_mac_addr,
.ndo_set_rx_mode = xlr_set_rx_mode,
.ndo_get_stats64 = xlr_stats,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fda0bcda7a42..46f4c44ce3e4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2567,6 +2567,10 @@ void dev_close(struct net_device *dev);
 void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff 
*newskb);
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+void *accel_priv, select_queue_fallback_t fallback);
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+  void *accel_priv, select_queue_fallback_t fallback);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index 09a7cc2f3c55..b5e538032d5e 100644
--- a/net/core/dev.c
+++ 

Re: [PATCH v3 net-next 0/3] fix use-after-free bugs in skb list processing

2018-07-09 Thread David Miller
From: Edward Cree 
Date: Mon, 9 Jul 2018 18:08:38 +0100

> A couple of bugs in skb list handling were spotted by Dan Carpenter, with
>  the help of Smatch; following up on them I found a couple more similar
>  cases.  This series fixes them by changing the relevant loops to use the
>  dequeue-enqueue model (rather than in-place list modification).
> 
> v3: fixed another similar bug in __netif_receive_skb_list_core().
> 
> v2: dropped patch #3 (new list.h helper), per DaveM's request.

Applied, thanks Edward.


Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-09 Thread Marcelo Ricardo Leitner
On Mon, Jul 09, 2018 at 05:03:31PM -0400, Michel Machado wrote:
> On 07/09/2018 03:53 PM, Marcelo Ricardo Leitner wrote:
> > On Mon, Jul 09, 2018 at 02:18:33PM -0400, Michel Machado wrote:
> > > On 07/09/2018 11:44 AM, Marcelo Ricardo Leitner wrote:
> > > > On Sat, Jul 07, 2018 at 03:43:55PM +0530, Nishanth Devarajan wrote:
> > > > > net/sched: add skbprio scheduler
> > > > > 
> > > > > Skbprio (SKB Priority Queue) is a queueing discipline that 
> > > > > prioritizes packets
> > > > > according to their skb->priority field. Under congestion, 
> > > > > already-enqueued lower
> > > > > priority packets will be dropped to make space available for higher 
> > > > > priority
> > > > > packets. Skbprio was conceived as a solution for denial-of-service 
> > > > > defenses that
> > > > > need to route packets with different priorities as a means to 
> > > > > overcome DoS
> > > > > attacks.
> > > > 
> > > > Why can't we implement this as a new flag for sch_prio.c?
> > > > 
> > > > I don't see why this duplication is needed, especially because it will
> > > > only be "slower" (as in, it will do more work) when qdisc is already
> > > > full and dropping packets anyway.
> > > 
> > > sch_prio.c and skbprio diverge on a number of aspects:
> > > 
> > > 1. sch_prio.c supports up to 16 priorities whereas skbprio 64. This is
> > > not just a matter of changing a constant since sch_prio.c doesn't use
> > > skb->priority.
> > 
> > Yes it does use skb->priority for classifying into a band:
> > 
> > prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
> > {
> >  struct prio_sched_data *q = qdisc_priv(sch);
> >  u32 band = skb->priority;
> > ...
> 
>Changing TC_PRIO_MAX from 15 to 63 risks breaking backward compatibility
> with applications.

If done, it needs to be done carefully, indeed. I don't know if it's
doable, neither I know how hard is your requirement for 64 different
priorities.

You can get 64 different priorities by stacking sch_prio, btw. And if
you implement drop_from_tail() as part of Qdisc, you can even get it
working for this cascading case too.

> 
> > > 3. The queues of sch_prio.c are struct Qdisc, which don't have a 
> > > method
> > > to drop at its tail.
> > 
> > That can be implemented, most likely as prio_tail_drop() as above.
> 
>struct Qdisc represents *all* qdiscs. My knowledge of the other qdiscs is
> limited, but not all qdiscs may have a meaningful method to drop at the
> tail. For example: a qdisc that works over flows may not know with flow is

True, but it doesn't mean you have to implement it for all available qdiscs.

> the tail. Not to mention that this would be a widespread patch to only
> support this new prio qdisc. It would be prudent to wait for the production
> success of the proposed, self-contained qdisc before making this commitment.

On the other hand, by adding another qdisc you're adding more work
that one needs to do when dealing with qdisc infrastructure, such as
updating enqueue() prototype, for example.

Once this new qdisc is in, it won't be easy to deprecate it.

  Marcelo


Re: [PATCH net-next 07/10] r8169: migrate speed_down function to phylib

2018-07-09 Thread Heiner Kallweit
On 03.07.2018 18:48, Florian Fainelli wrote:
> 
> 
> On 07/02/2018 02:31 PM, Heiner Kallweit wrote:
>> On 02.07.2018 23:20, Andrew Lunn wrote:
>>>  On Mon, Jul 02, 2018 at 09:37:08PM +0200, Heiner Kallweit wrote:
 Change rtl_speed_down() to use phylib.

 Signed-off-by: Heiner Kallweit 
 ---
  drivers/net/ethernet/realtek/r8169.c | 33 +---
  1 file changed, 15 insertions(+), 18 deletions(-)

 diff --git a/drivers/net/ethernet/realtek/r8169.c 
 b/drivers/net/ethernet/realtek/r8169.c
 index 311321ee..807fbc75 100644
 --- a/drivers/net/ethernet/realtek/r8169.c
 +++ b/drivers/net/ethernet/realtek/r8169.c
 @@ -4240,6 +4240,10 @@ static void rtl8169_init_phy(struct net_device 
 *dev, struct rtl8169_private *tp)
rtl_writephy(tp, 0x0b, 0x); //w 0x0b 15 0 0
}
  
 +  /* We may have called rtl_speed_down before */
 +  dev->phydev->advertising = dev->phydev->supported;
 +  genphy_config_aneg(dev->phydev);
 +
genphy_soft_reset(dev->phydev);
  
rtl8169_set_speed(dev, AUTONEG_ENABLE, SPEED_1000, DUPLEX_FULL,
 @@ -4323,28 +4327,21 @@ static void rtl_init_mdio_ops(struct 
 rtl8169_private *tp)
}
  }
  
 +#define BASET10   (ADVERTISED_10baseT_Half | 
 ADVERTISED_10baseT_Full)
 +#define BASET100  (ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full)
 +#define BASET1000 (ADVERTISED_1000baseT_Half | ADVERTISED_1000baseT_Full)
 +
  static void rtl_speed_down(struct rtl8169_private *tp)
  {
 -  u32 adv;
 -  int lpa;
 +  struct phy_device *phydev = tp->dev->phydev;
 +  u32 adv = phydev->lp_advertising & phydev->supported;
  
 -  rtl_writephy(tp, 0x1f, 0x);
 -  lpa = rtl_readphy(tp, MII_LPA);
 +  if (adv & BASET10)
 +  phydev->advertising &= ~(BASET100 | BASET1000);
 +  else if (adv & BASET100)
 +  phydev->advertising &= ~BASET1000;
  
 -  if (lpa & (LPA_10HALF | LPA_10FULL))
 -  adv = ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full;
 -  else if (lpa & (LPA_100HALF | LPA_100FULL))
 -  adv = ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full |
 -ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full;
 -  else
 -  adv = ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full |
 -ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full |
 -(tp->mii.supports_gmii ?
 - ADVERTISED_1000baseT_Half |
 - ADVERTISED_1000baseT_Full : 0);
 -
 -  rtl8169_set_speed(tp->dev, AUTONEG_ENABLE, SPEED_1000, DUPLEX_FULL,
 -adv);
 +  genphy_config_aneg(phydev);
  }
>>>
>>> It probably it is me being too tired, but i don't get what this is
>>> doing? Changing the local advertisement based on what the remote is
>>> advertising. Why?
>>>
>> It also took me some time to understand what this speed_down is doing.
>> If we suspend and wait for a WoL packet, then we don't have to burn all
>> the energy for a GBit connection. Therefore we switch to the lowest
>> speed supported by chip and link partner. This is done by removing
>> higher speeds from the advertised modes and restarting an autonego.
> 
> This is something that the tg3 driver also does, we should probably
> consider doing this as part of a generic PHY library helpers since I was
> told by several HW engineers that usually 10Mbits for WoL is much more
> energy efficient.
> 
Yes, I agree this should become part of phylib. I'd prefer to do it
after this r8169 patch series, will spend a few thoughts on how to
do it best, also considering your remark below.

Heiner

> One thing that bothers me a bit is that this should ideally be offered
> as both blocking and non-blocking options, because we might want to make
> sure that at the time we suspend, and we already had a link established,
> we successfully re-negotiate the link with the partner. I agree that
> there could be any sort of link disruption happening at any point though..
> 



Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-09 Thread Michel Machado

On 07/09/2018 03:53 PM, Marcelo Ricardo Leitner wrote:

On Mon, Jul 09, 2018 at 02:18:33PM -0400, Michel Machado wrote:

On 07/09/2018 11:44 AM, Marcelo Ricardo Leitner wrote:

On Sat, Jul 07, 2018 at 03:43:55PM +0530, Nishanth Devarajan wrote:

net/sched: add skbprio scheduler

Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets
according to their skb->priority field. Under congestion, already-enqueued lower
priority packets will be dropped to make space available for higher priority
packets. Skbprio was conceived as a solution for denial-of-service defenses that
need to route packets with different priorities as a means to overcome DoS
attacks.


Why can't we implement this as a new flag for sch_prio.c?

I don't see why this duplication is needed, especially because it will
only be "slower" (as in, it will do more work) when qdisc is already
full and dropping packets anyway.


sch_prio.c and skbprio diverge on a number of aspects:

1. sch_prio.c supports up to 16 priorities whereas skbprio 64. This is
not just a matter of changing a constant since sch_prio.c doesn't use
skb->priority.


Yes it does use skb->priority for classifying into a band:

prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
 struct prio_sched_data *q = qdisc_priv(sch);
 u32 band = skb->priority;
...


   Changing TC_PRIO_MAX from 15 to 63 risks breaking backward 
compatibility with applications.



3. The queues of sch_prio.c are struct Qdisc, which don't have a method
to drop at its tail.


That can be implemented, most likely as prio_tail_drop() as above.


   struct Qdisc represents *all* qdiscs. My knowledge of the other 
qdiscs is limited, but not all qdiscs may have a meaningful method to 
drop at the tail. For example: a qdisc that works over flows may not 
know which flow is the tail. Not to mention that this would be a 
widespread patch to only support this new prio qdisc. It would be 
prudent to wait for the production success of the proposed, 
self-contained qdisc before making this commitment.


[ ]'s
Michel Machado


Re: [PATCH net-next] net: sched: fix unprotected access to rcu cookie pointer

2018-07-09 Thread Eric Dumazet



On 07/09/2018 01:34 PM, Marcelo Ricardo Leitner wrote:

> I am not sure if this is enough to fix the entire issue. Now it will
> fetch the length correctly but, what guarantees that when it tries to
> actually copy the key (tcf_action_dump_1), the same act_cookie pointer
> will be used? As in, can't the new re-fetch be different/smaller than
> the object used here?
> 

Yes, this presumably should use rtnl_dereference()

RTNL should be held between tcf_action_shared_attrs_size() and 
tcf_action_dump_1()

Although it might not be the case anymore, we keep changing this RTNL 
requirement
in dump operations ;)


Re: [PATCH net-next] net: sched: fix unprotected access to rcu cookie pointer

2018-07-09 Thread Vlad Buslov


On Mon 09 Jul 2018 at 20:34, Marcelo Ricardo Leitner 
 wrote:
> On Mon, Jul 09, 2018 at 08:26:47PM +0300, Vlad Buslov wrote:
>> Fix action attribute size calculation function to take rcu read lock and
>> access act_cookie pointer with rcu dereference.
>> 
>> Fixes: eec94fdb0480 ("net: sched: use rcu for action cookie update")
>> Reported-by: Marcelo Ricardo Leitner 
>> Signed-off-by: Vlad Buslov 
>> ---
>>  net/sched/act_api.c | 9 +++--
>>  1 file changed, 7 insertions(+), 2 deletions(-)
>> 
>> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>> index 66dc19746c63..148a89ab789b 100644
>> --- a/net/sched/act_api.c
>> +++ b/net/sched/act_api.c
>> @@ -149,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release);
>>  
>>  static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
>>  {
>> +struct tc_cookie *act_cookie;
>>  u32 cookie_len = 0;
>>  
>> -if (act->act_cookie)
>> -cookie_len = nla_total_size(act->act_cookie->len);
>> +rcu_read_lock();
>> +act_cookie = rcu_dereference(act->act_cookie);
>> +
>> +if (act_cookie)
>> +cookie_len = nla_total_size(act_cookie->len);
>> +rcu_read_unlock();
>
> I am not sure if this is enough to fix the entire issue. Now it will
> fetch the length correctly but, what guarantees that when it tries to
> actually copy the key (tcf_action_dump_1), the same act_cookie pointer
> will be used? As in, can't the new re-fetch be different/smaller than
> the object used here?

I checked the code of nlmsg_put() and similar functions, and they check
that there is enough free space at skb tailroom. If not, they fail
gracefully and return error. Am I missing something?

>
>>  
>>  return  nla_total_size(0) /* action number nested */
>>  + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
>> -- 
>> 2.7.5
>> 



RE: [PATCH v1 net-next 9/9] lan743x: Add PTP support

2018-07-09 Thread Bryan.Whitehead
Thanks Richard,
I'll add it in my next revision.

> -Original Message-
> From: Richard Cochran [mailto:richardcoch...@gmail.com]
> Sent: Friday, July 6, 2018 5:25 PM
> To: Bryan Whitehead - C21958 
> Cc: da...@davemloft.net; netdev@vger.kernel.org; UNGLinuxDriver
> 
> Subject: Re: [PATCH v1 net-next 9/9] lan743x: Add PTP support
> 
> On Thu, Jul 05, 2018 at 12:39:26PM -0400, Bryan Whitehead wrote:
> > Signed-off-by: Bryan Whitehead 
> 
> 1. You forgot the commit message.
> 2. You forgot to add the PTP maintainer on CC.
> 
> Thanks,
> Richard



Re: [PATCH net-next] net: sched: fix unprotected access to rcu cookie pointer

2018-07-09 Thread Marcelo Ricardo Leitner
On Mon, Jul 09, 2018 at 08:26:47PM +0300, Vlad Buslov wrote:
> Fix action attribute size calculation function to take rcu read lock and
> access act_cookie pointer with rcu dereference.
> 
> Fixes: eec94fdb0480 ("net: sched: use rcu for action cookie update")
> Reported-by: Marcelo Ricardo Leitner 
> Signed-off-by: Vlad Buslov 
> ---
>  net/sched/act_api.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sched/act_api.c b/net/sched/act_api.c
> index 66dc19746c63..148a89ab789b 100644
> --- a/net/sched/act_api.c
> +++ b/net/sched/act_api.c
> @@ -149,10 +149,15 @@ EXPORT_SYMBOL(__tcf_idr_release);
>  
>  static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
>  {
> + struct tc_cookie *act_cookie;
>   u32 cookie_len = 0;
>  
> - if (act->act_cookie)
> - cookie_len = nla_total_size(act->act_cookie->len);
> + rcu_read_lock();
> + act_cookie = rcu_dereference(act->act_cookie);
> +
> + if (act_cookie)
> + cookie_len = nla_total_size(act_cookie->len);
> + rcu_read_unlock();

I am not sure if this is enough to fix the entire issue. Now it will
fetch the length correctly but, what guarantees that when it tries to
actually copy the key (tcf_action_dump_1), the same act_cookie pointer
will be used? As in, can't the new re-fetch be different/smaller than
the object used here?

>  
>   return  nla_total_size(0) /* action number nested */
>   + nla_total_size(IFNAMSIZ) /* TCA_ACT_KIND */
> -- 
> 2.7.5
> 


Re: [PATCH bpf-next v2 11/12] tools: libbpf: allow map reuse

2018-07-09 Thread Andrey Ignatov
Jakub Kicinski  [Mon, 2018-07-09 11:01 -0700]:
> More advanced applications may want to only replace programs without
> destroying associated maps.  Allow libbpf users to achieve that.
> Instead of always creating all of the maps at load time, expose to
> users an API to reconstruct the map object from already existing
> map.
> 
> The map parameters are read from the kernel and replace the parameters
> of the ELF map.  libbpf does not restrict the map replacement, i.e.
> the reused map does not have to be compatible with the ELF map
> definition.  We relay on the verifier for checking the compatibility
> between maps and programs.  The ELF map definition is completely
> overwritten by the information read from the kernel, to make sure
> libbpf's view of map object corresponds to the actual map.

Thanks for working on this Jakub! I encountered this shortcoming of
libbpf as well and was planning to fix it, but you beat me to it :)


> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Quentin Monnet 
> ---
>  tools/lib/bpf/libbpf.c | 35 +++
>  tools/lib/bpf/libbpf.h |  1 +
>  2 files changed, 36 insertions(+)
> 
> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index b653dbb266c7..c80033fe66c3 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c
> @@ -215,6 +215,7 @@ struct bpf_map {
>   int fd;
>   char *name;
>   size_t offset;
> + bool fd_preset;

Any reason not to use map->fd itself to identify if fd is present?

fd of every map is set to -1 in bpf_object__init_maps() that, in turn, is
called from __bpf_object__open():

for (i = 0; i < nr_maps; i++)
obj->maps[i].fd = -1;

Later it will either contain valid fd that is >= 0, or that same -1, what
should be enough to identify fd presence.


>   int map_ifindex;
>   struct bpf_map_def def;
>   uint32_t btf_key_type_id;
> @@ -1082,6 +1083,34 @@ static int bpf_map_find_btf_info(struct bpf_map *map, 
> const struct btf *btf)
>   return 0;
>  }
>  
> +int bpf_map__reuse_fd(struct bpf_map *map, int fd)
> +{
> + struct bpf_map_info info = {};
> + __u32 len = sizeof(info);
> + int err;
> +
> + err = bpf_obj_get_info_by_fd(fd, , );
> + if (err)
> + return err;
> +

Should there be a check that map->fd doesn't contain any valid fd (>= 0)
before rewriting it so that if it does (e.g. because the function is
called after bpf_object__load() by mistake), current map->fd won't be
leaked?


> + map->fd = dup(fd);

Unfortunately, new descriptor created by dup(2) will not have O_CLOEXEC set, in
contrast to original fd returned by kernel on map creation.

libbpf has other interface shortcomings where it comes up. E.g. struct
bpf_object owns all descriptors it contains (progs, maps) and closes them in
bpf_object__close(). if one wants to open/load ELF, then close it but
keep, say, prog fd to attach it to cgroup some time later, then fd
should be duplicated as well to get a new one not owned by bpf_object.

Currently I use this workaround to avoid time when new fd doesn't have
O_CLOEXEC:

int new_prog_fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
if (new_prog_fd < 0 ||
dup3(bpf_program__fd(prog), new_prog_fd, O_CLOEXEC) == -1) {
/* .. handle error .. */
close(new_prog_fd);
}
/* .. use new_prog_fd with O_CLOEXEC set */

Not sure how to simplify it. dup2() has same problem with regard to
O_CLOEXEC.

Use-case: standalone server application that uses libbpf and does
fork()/execve() a lot.


> + if (map->fd < 0)
> + return map->fd;
> + map->fd_preset = true;
> +
> + free(map->name);
> + map->name = strdup(info.name);
> + map->def.type = info.type;
> + map->def.key_size = info.key_size;
> + map->def.value_size = info.value_size;
> + map->def.max_entries = info.max_entries;
> + map->def.map_flags = info.map_flags;
> + map->btf_key_type_id = info.btf_key_type_id;
> + map->btf_value_type_id = info.btf_value_type_id;
> +
> + return 0;
> +}
> +
>  static int
>  bpf_object__create_maps(struct bpf_object *obj)
>  {
> @@ -1094,6 +1123,12 @@ bpf_object__create_maps(struct bpf_object *obj)
>   struct bpf_map_def *def = >def;
>   int *pfd = >fd;
>  
> + if (map->fd_preset) {
> + pr_debug("skip map create (preset) %s: fd=%d\n",
> +  map->name, map->fd);
> + continue;
> + }
> +
>   create_attr.name = map->name;
>   create_attr.map_ifindex = map->map_ifindex;
>   create_attr.map_type = def->type;
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index 60593ac44700..8e709a74f47c 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -261,6 +261,7 @@ typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, 
> void *);
>  int 

Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-09 Thread Marcelo Ricardo Leitner
On Mon, Jul 09, 2018 at 02:18:33PM -0400, Michel Machado wrote:
> On 07/09/2018 11:44 AM, Marcelo Ricardo Leitner wrote:
> > On Sat, Jul 07, 2018 at 03:43:55PM +0530, Nishanth Devarajan wrote:
> > > net/sched: add skbprio scheduer
> > > 
> > > Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes 
> > > packets
> > > according to their skb->priority field. Under congestion, 
> > > already-enqueued lower
> > > priority packets will be dropped to make space available for higher 
> > > priority
> > > packets. Skbprio was conceived as a solution for denial-of-service 
> > > defenses that
> > > need to route packets with different priorities as a means to overcome DoS
> > > attacks.
> > 
> > Why can't we implement this as a new flag for sch_prio.c?
> > 
> > I don't see why this duplication is needed, especially because it will
> > only be "slower" (as in, it will do more work) when qdisc is already
> > full and dropping packets anyway.
> 
>sch_prio.c and skbprio diverge on a number of aspects:
> 
>1. sch_prio.c supports up to 16 priorities whereas skbprio 64. This is
> not just a matter of changing a constant since sch_prio.c doesn't use
> skb->priority.

Yes it does use skb->priority for classifying into a band:

prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
struct prio_sched_data *q = qdisc_priv(sch);
u32 band = skb->priority;
...

> 
>2. sch_prio.c does not have a global limit on the number of packets on
> all its queues, only a limit per queue.

It can be useful to sch_prio.c as well, why not?
prio_enqueue()
{
...
+   if (count > sch->global_limit)
+   prio_tail_drop(sch);   /* to be implemented */
ret = qdisc_enqueue(skb, qdisc, to_free);

> 
>3. The queues of sch_prio.c are struct Qdisc, which don't have a method
> to drop at its tail.

That can be implemented, most likely as prio_tail_drop() as above.

> 
>Given the divergences, adding flags to sch_prio.c will essentially keep
> both implementations together instead of being isolated as being proposed.

I don't agree. There aren't that many flags. I see only 2, one which
makes sense to sch_prio as it is already (the global limit) and from
where it should drop, the overflown packet or from tail.

All other code will be reused: stats handling, netlink handling,
enqueue and dequeue at least.

If we add this other qdisc, named as it is, it will be very confusing
to sysadmins: both are named very closely and work essentially in the
same way, but one drops from tail and another drops the incoming
packet.

> 
>On the speed point, there may not be noticeable difference between both
> qdiscs because the enqueueing and dequeueing costs of both qdics are O(1).
> Notice that the "extra work" (i.e. dropping lower priority packets) is a key
> aspect of skbprio since it gives routers a cheap way to choose which packets
> to drop during a DoS.

On that I agree. I was more referring to something like: "lets not make
sch_prio slow and implement a new one instead.", which I don't think is
valid because the extra "cost" is only visible when it's already
dropping packets. Hopefully it's clearer now :)

[]s
Marcelo


[PATCH iproute2-next 3/9] tc: convert stats print to json

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

Convert compatibility statistics to print in JSON.

Signed-off-by: Stephen Hemminger 
---
 tc/tc_util.c | 32 ++--
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index e8e1241d083d..05b6c97563b3 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -887,25 +887,37 @@ void print_tcstats_attr(FILE *fp, struct rtattr *tb[],
/* handle case where kernel returns more/less than we know 
about */
memcpy(, RTA_DATA(tb[TCA_STATS]), 
MIN(RTA_PAYLOAD(tb[TCA_STATS]), sizeof(st)));
 
-   fprintf(fp, "%sSent %llu bytes %u pkts (dropped %u, overlimits 
%u) ",
-   prefix, (unsigned long long)st.bytes, st.packets, 
st.drops,
-   st.overlimits);
+   print_string(PRINT_FP, NULL, "%sSent ", prefix);
+   print_u64(PRINT_ANY, "bytes", "%llu bytes ", (unsigned long 
long)st.bytes);
+   print_uint(PRINT_ANY, "packets", "%u pkts ", st.packets);
+   print_uint(PRINT_ANY, "dropped", "(dropped %u, ", st.drops);
+   print_uint(PRINT_ANY, "overlimits", "overlimits %u) ", 
st.overlimits);
 
if (st.bps || st.pps || st.qlen || st.backlog) {
-   fprintf(fp, "\n%s", prefix);
+   print_string(PRINT_FP, "%s%s", _SL_, prefix);
+
if (st.bps || st.pps) {
-   fprintf(fp, "rate ");
+   print_string(PRINT_FP, NULL, "%s", "rate ");
+   print_uint(PRINT_JSON, "rate", NULL, st.bps);
if (st.bps)
-   fprintf(fp, "%s ", sprint_rate(st.bps, 
b1));
+   print_string(PRINT_FP, NULL, "%s ",
+sprint_rate(st.bps, b1));
+
+   print_uint(PRINT_JSON, "pps", NULL, st.pps);
if (st.pps)
-   fprintf(fp, "%upps ", st.pps);
+   print_uint(PRINT_FP, NULL, "%upps ", 
st.pps);
}
if (st.qlen || st.backlog) {
-   fprintf(fp, "backlog ");
+   print_string(PRINT_FP, NULL, "%s", "backlog ");
+
+   print_uint(PRINT_JSON, "backlog", NULL, 
st.backlog);
+   print_uint(PRINT_JSON, "qlen", NULL, st.qlen);
if (st.backlog)
-   fprintf(fp, "%s ", 
sprint_size(st.backlog, b1));
+   print_string(PRINT_FP, NULL,
+"%s ", 
sprint_size(st.backlog, b1));
if (st.qlen)
-   fprintf(fp, "%up ", st.qlen);
+   print_uint(PRINT_FP, NULL,
+"%up ", st.qlen);
}
}
}
-- 
2.18.0



[PATCH iproute2-next 5/9] tc/util: remove print_rate

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

This function is not used, only sprint_rate is used.

Signed-off-by: Stephen Hemminger 
---
 tc/tc_util.c | 10 +++---
 tc/tc_util.h |  1 -
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index 05b6c97563b3..f5ffe3443892 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -307,7 +307,7 @@ int get_rate64(__u64 *rate, const char *str)
return 0;
 }
 
-void print_rate(char *buf, int len, __u64 rate)
+char *sprint_rate(__u64 rate, char *buf)
 {
extern int use_iec;
unsigned long kilo = use_iec ? 1024 : 1000;
@@ -325,12 +325,8 @@ void print_rate(char *buf, int len, __u64 rate)
rate /= kilo;
}
 
-   snprintf(buf, len, "%.0f%s%sbit", (double)rate, units[i], str);
-}
-
-char *sprint_rate(__u64 rate, char *buf)
-{
-   print_rate(buf, SPRINT_BSIZE-1, rate);
+   snprintf(buf, SPRINT_BSIZE-1,
+"%.0f%s%sbit", (double)rate, units[i], str);
return buf;
 }
 
diff --git a/tc/tc_util.h b/tc/tc_util.h
index 64b309903c69..56e214cbc8de 100644
--- a/tc/tc_util.h
+++ b/tc/tc_util.h
@@ -84,7 +84,6 @@ int get_size_and_cell(unsigned int *size, int *cell_log, char 
*str);
 int get_time(unsigned int *time, const char *str);
 int get_linklayer(unsigned int *val, const char *arg);
 
-void print_rate(char *buf, int len, __u64 rate);
 void print_size(char *buf, int len, __u32 size);
 void print_qdisc_handle(char *buf, int len, __u32 h);
 void print_time(char *buf, int len, __u32 time);
-- 
2.18.0



[PATCH iproute2-next 7/9] tc/util: remove unused print_time

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

Only sprint_time is used.

Signed-off-by: Stephen Hemminger 
---
 tc/tc_util.c | 8 ++--
 tc/tc_util.h | 1 -
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index 01e131b5c5d7..95cb49b98612 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -357,9 +357,9 @@ int get_time(unsigned int *time, const char *str)
return 0;
 }
 
-
-void print_time(char *buf, int len, __u32 time)
+char *sprint_time(__u32 time, char *buf)
 {
+   const size_t len = SPRINT_BSIZE - 1;
double tmp = time;
 
if (tmp >= TIME_UNITS_PER_SEC)
@@ -368,11 +368,7 @@ void print_time(char *buf, int len, __u32 time)
snprintf(buf, len, "%.1fms", tmp/(TIME_UNITS_PER_SEC/1000));
else
snprintf(buf, len, "%uus", time);
-}
 
-char *sprint_time(__u32 time, char *buf)
-{
-   print_time(buf, SPRINT_BSIZE-1, time);
return buf;
 }
 
diff --git a/tc/tc_util.h b/tc/tc_util.h
index 01c6a09a8839..16babd21b473 100644
--- a/tc/tc_util.h
+++ b/tc/tc_util.h
@@ -85,7 +85,6 @@ int get_time(unsigned int *time, const char *str);
 int get_linklayer(unsigned int *val, const char *arg);
 
 void print_qdisc_handle(char *buf, int len, __u32 h);
-void print_time(char *buf, int len, __u32 time);
 void print_linklayer(char *buf, int len, unsigned int linklayer);
 void print_devname(enum output_type type, int ifindex);
 
-- 
2.18.0



[PATCH iproute2-next 1/9] tc: use JSON in error handling

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

If option or qdisc is unknown, still output in JSON.

Signed-off-by: Stephen Hemminger 
---
 tc/tc.c | 19 +++
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tc/tc.c b/tc/tc.c
index 0d223281ba25..b6d2a6151793 100644
--- a/tc/tc.c
+++ b/tc/tc.c
@@ -56,9 +56,11 @@ static struct filter_util *filter_list;
 static int print_noqopt(struct qdisc_util *qu, FILE *f,
struct rtattr *opt)
 {
-   if (opt && RTA_PAYLOAD(opt))
-   fprintf(f, "[Unknown qdisc, optlen=%u] ",
-   (unsigned int) RTA_PAYLOAD(opt));
+   if (opt && RTA_PAYLOAD(opt)) {
+   print_string(PRINT_JSON, "qdisc", NULL, "UNKNOWN");
+   print_uint(PRINT_ANY, "optlen", "[Unknown qdisc, optlen=%u] ",
+  RTA_PAYLOAD(opt));
+   }
return 0;
 }
 
@@ -76,11 +78,12 @@ static int parse_noqopt(struct qdisc_util *qu, int argc, 
char **argv,
 
 static int print_nofopt(struct filter_util *qu, FILE *f, struct rtattr *opt, 
__u32 fhandle)
 {
-   if (opt && RTA_PAYLOAD(opt))
-   fprintf(f, "fh %08x [Unknown filter, optlen=%u] ",
-   fhandle, (unsigned int) RTA_PAYLOAD(opt));
-   else if (fhandle)
-   fprintf(f, "fh %08x ", fhandle);
+   if (opt && RTA_PAYLOAD(opt)) {
+   print_0xhex(PRINT_ANY, "handle", "fh %08x ", fhandle);
+   print_uint(PRINT_ANY, "optlen", "[Unknown filter, optlen=%u] ",
+   RTA_PAYLOAD(opt));
+   } else if (fhandle)
+   print_0xhex(PRINT_ANY, "handle", "fh %08x ", fhandle);
return 0;
 }
 
-- 
2.18.0



[PATCH iproute2-next 6/9] tc/util: remove unused print_size

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

Only sprint_size is used, so fold it in.

Signed-off-by: Stephen Hemminger 
---
 tc/tc_util.c | 7 ++-
 tc/tc_util.h | 1 -
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index f5ffe3443892..01e131b5c5d7 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -451,8 +451,9 @@ void print_devname(enum output_type type, int ifindex)
   "dev", "%s ", ifname);
 }
 
-void print_size(char *buf, int len, __u32 sz)
+char *sprint_size(__u32 sz, char *buf)
 {
+   const size_t len = SPRINT_BSIZE - 1;
double tmp = sz;
 
if (sz >= 1024*1024 && fabs(1024*1024*rint(tmp/(1024*1024)) - sz) < 
1024)
@@ -461,11 +462,7 @@ void print_size(char *buf, int len, __u32 sz)
snprintf(buf, len, "%gKb", rint(tmp/1024));
else
snprintf(buf, len, "%ub", sz);
-}
 
-char *sprint_size(__u32 size, char *buf)
-{
-   print_size(buf, SPRINT_BSIZE-1, size);
return buf;
 }
 
diff --git a/tc/tc_util.h b/tc/tc_util.h
index 56e214cbc8de..01c6a09a8839 100644
--- a/tc/tc_util.h
+++ b/tc/tc_util.h
@@ -84,7 +84,6 @@ int get_size_and_cell(unsigned int *size, int *cell_log, char 
*str);
 int get_time(unsigned int *time, const char *str);
 int get_linklayer(unsigned int *val, const char *arg);
 
-void print_size(char *buf, int len, __u32 size);
 void print_qdisc_handle(char *buf, int len, __u32 h);
 void print_time(char *buf, int len, __u32 time);
 void print_linklayer(char *buf, int len, unsigned int linklayer);
-- 
2.18.0



[PATCH iproute2-next 8/9] tc/util: add print helpers for JSON

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

Add a helper to print rate and size in numeric or pretty format
based on JSON flag.

Signed-off-by: Stephen Hemminger 
---
 tc/tc_util.c | 59 ++--
 tc/tc_util.h |  2 ++
 2 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index 95cb49b98612..787c3fc8a57a 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -330,6 +330,19 @@ char *sprint_rate(__u64 rate, char *buf)
return buf;
 }
 
+/*
+ * print rate as numeric in JSON
+ * or in human format otherwise.
+ */
+void print_rate(const char *key, const char *fmt, __u64 rate)
+{
+   SPRINT_BUF(b1);
+
+   print_u64(PRINT_JSON, key, NULL, rate);
+   print_string(PRINT_FP, NULL, fmt,
+sprint_rate(rate, b1));
+}
+
 int get_time(unsigned int *time, const char *str)
 {
double t;
@@ -462,6 +475,15 @@ char *sprint_size(__u32 sz, char *buf)
return buf;
 }
 
+void print_size(const char *key, const char *fmt, __u32 sz)
+{
+   SPRINT_BUF(b1);
+
+   print_uint(PRINT_JSON, key, NULL, sz);
+   print_string(PRINT_FP, NULL, fmt,
+sprint_size(sz, b1));
+}
+
 void print_qdisc_handle(char *buf, int len, __u32 h)
 {
snprintf(buf, len, "%x:", TC_H_MAJ(h)>>16);
@@ -792,7 +814,6 @@ void print_tm(FILE *f, const struct tcf_t *tm)
 void print_tcstats2_attr(FILE *fp, struct rtattr *rta,
 const char *prefix, struct rtattr **xstats)
 {
-   SPRINT_BUF(b1);
struct rtattr *tbs[TCA_STATS_MAX + 1];
 
parse_rtattr_nested(tbs, TCA_STATS_MAX, rta);
@@ -823,21 +844,15 @@ void print_tcstats2_attr(FILE *fp, struct rtattr *rta,
   MIN(RTA_PAYLOAD(tbs[TCA_STATS_RATE_EST64]),
   sizeof(re)));
print_string(PRINT_FP, NULL, "\n%s", prefix);
-   print_lluint(PRINT_JSON, "rate", NULL, re.bps);
-   print_string(PRINT_FP, NULL, "rate %s",
-sprint_rate(re.bps, b1));
+   print_rate("rate", "rate %s", re.bps);
print_lluint(PRINT_ANY, "pps", " %llupps", re.pps);
} else if (tbs[TCA_STATS_RATE_EST]) {
struct gnet_stats_rate_est re = {0};
 
memcpy(, RTA_DATA(tbs[TCA_STATS_RATE_EST]),
   MIN(RTA_PAYLOAD(tbs[TCA_STATS_RATE_EST]), sizeof(re)));
-   fprintf(fp, "\n%srate %s %upps ",
-   prefix, sprint_rate(re.bps, b1), re.pps);
print_string(PRINT_FP, NULL, "\n%s", prefix);
-   print_uint(PRINT_JSON, "rate", NULL, re.bps);
-   print_string(PRINT_FP, NULL, "rate %s",
-sprint_rate(re.bps, b1));
+   print_rate("rate", "rate %s", re.bps);
print_uint(PRINT_ANY, "pps", " %upps", re.pps);
}
 
@@ -847,9 +862,7 @@ void print_tcstats2_attr(FILE *fp, struct rtattr *rta,
memcpy(, RTA_DATA(tbs[TCA_STATS_QUEUE]), 
MIN(RTA_PAYLOAD(tbs[TCA_STATS_QUEUE]), sizeof(q)));
if (!tbs[TCA_STATS_RATE_EST])
print_string(PRINT_FP, NULL, "\n%s", prefix);
-   print_uint(PRINT_JSON, "backlog", NULL, q.backlog);
-   print_string(PRINT_FP, NULL, "backlog %s",
-sprint_size(q.backlog, b1));
+   print_size("backlog",  "backlog %s", q.backlog);
print_uint(PRINT_ANY, "qlen", " %up", q.qlen);
print_uint(PRINT_FP, NULL, " requeues %u", q.requeues);
}
@@ -861,8 +874,6 @@ void print_tcstats2_attr(FILE *fp, struct rtattr *rta,
 void print_tcstats_attr(FILE *fp, struct rtattr *tb[],
const char *prefix, struct rtattr **xstats)
 {
-   SPRINT_BUF(b1);
-
if (tb[TCA_STATS2]) {
print_tcstats2_attr(fp, tb[TCA_STATS2], prefix, xstats);
if (xstats && NULL == *xstats)
@@ -887,26 +898,14 @@ void print_tcstats_attr(FILE *fp, struct rtattr *tb[],
 
if (st.bps || st.pps) {
print_string(PRINT_FP, NULL, "%s", "rate ");
-   print_uint(PRINT_JSON, "rate", NULL, st.bps);
-   if (st.bps)
-   print_string(PRINT_FP, NULL, "%s ",
-sprint_rate(st.bps, b1));
-
-   print_uint(PRINT_JSON, "pps", NULL, st.pps);
-   if (st.pps)
-   print_uint(PRINT_FP, NULL, "%upps ", 
st.pps);
+   print_rate("rate", "%s ", st.bps);
+   print_uint(PRINT_JSON, "pps", "%upps ", st.pps);
}
if (st.qlen || st.backlog) {
print_string(PRINT_FP, NULL, "%s", "backlog ");
 
-   

[PATCH iproute2-next 2/9] tc: use const char in util

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

For prefix and path, can use const char.

Signed-off-by: Stephen Hemminger 
---
 tc/tc_util.c | 8 +---
 tc/tc_util.h | 6 +++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index e0c96291ade0..e8e1241d083d 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -35,7 +35,7 @@ static struct db_names *cls_names;
 
 #define NAMES_DB "/etc/iproute2/tc_cls"
 
-int cls_names_init(char *path)
+int cls_names_init(const char *path)
 {
int ret;
 
@@ -800,7 +800,8 @@ void print_tm(FILE *f, const struct tcf_t *tm)
}
 }
 
-void print_tcstats2_attr(FILE *fp, struct rtattr *rta, char *prefix, struct 
rtattr **xstats)
+void print_tcstats2_attr(FILE *fp, struct rtattr *rta,
+const char *prefix, struct rtattr **xstats)
 {
SPRINT_BUF(b1);
struct rtattr *tbs[TCA_STATS_MAX + 1];
@@ -868,7 +869,8 @@ void print_tcstats2_attr(FILE *fp, struct rtattr *rta, char 
*prefix, struct rtat
*xstats = tbs[TCA_STATS_APP] ? : NULL;
 }
 
-void print_tcstats_attr(FILE *fp, struct rtattr *tb[], char *prefix, struct 
rtattr **xstats)
+void print_tcstats_attr(FILE *fp, struct rtattr *tb[],
+   const char *prefix, struct rtattr **xstats)
 {
SPRINT_BUF(b1);
 
diff --git a/tc/tc_util.h b/tc/tc_util.h
index 6632c4f9c528..64b309903c69 100644
--- a/tc/tc_util.h
+++ b/tc/tc_util.h
@@ -100,9 +100,9 @@ char *sprint_ticks(__u32 ticks, char *buf);
 char *sprint_linklayer(unsigned int linklayer, char *buf);
 
 void print_tcstats_attr(FILE *fp, struct rtattr *tb[],
-   char *prefix, struct rtattr **xstats);
+   const char *prefix, struct rtattr **xstats);
 void print_tcstats2_attr(FILE *fp, struct rtattr *rta,
-char *prefix, struct rtattr **xstats);
+const char *prefix, struct rtattr **xstats);
 
 int get_tc_classid(__u32 *h, const char *str);
 int print_tc_classid(char *buf, int len, __u32 h);
@@ -130,7 +130,7 @@ int parse_action(int *argc_p, char ***argv_p, int tca_id, 
struct nlmsghdr *n);
 void print_tm(FILE *f, const struct tcf_t *tm);
 int prio_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt);
 
-int cls_names_init(char *path);
+int cls_names_init(const char *path);
 void cls_names_uninit(void);
 
 int action_a2n(char *arg, int *result, bool allow_num);
-- 
2.18.0



Re: [PATCH bpf-next v2 12/12] tools: bpftool: allow reuse of maps with bpftool prog load

2018-07-09 Thread Alexei Starovoitov
On Mon, Jul 09, 2018 at 10:59:44AM -0700, Jakub Kicinski wrote:
> Add map parameter to prog load which will allow reuse of existing
> maps instead of creating new ones.
> 
> We need feature detection and compat code for reallocarray, since
> it's not available in many libc versions.
> 
> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Quentin Monnet 

cmdline interface feels a bit awkward to use, but it's a nice improvement.
Acked-by: Alexei Starovoitov 

any plans to extend bpf_map_def similar to iproute2 ?
so things like pinned file name and map reuse can be specified in .c file
instead of cmdline?



[PATCH iproute2-next 4/9] tc/cbq: use sprint_rate

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

All other places in tc use sprint_rate.

Signed-off-by: Stephen Hemminger 
---
 tc/q_cbq.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/tc/q_cbq.c b/tc/q_cbq.c
index e7f1a3bfaf5d..ad0170c41858 100644
--- a/tc/q_cbq.c
+++ b/tc/q_cbq.c
@@ -495,10 +495,9 @@ static int cbq_print_opt(struct qdisc_util *qu, FILE *f, 
struct rtattr *opt)
}
 
if (r) {
-   char buf[64];
 
-   print_rate(buf, sizeof(buf), r->rate);
-   fprintf(f, "rate %s ", buf);
+   fprintf(f, "rate %s ",
+   sprint_rate(r->rate, b1));
linklayer = (r->linklayer & TC_LINKLAYER_MASK);
if (linklayer > TC_LINKLAYER_ETHERNET || show_details)
fprintf(f, "linklayer %s ", sprint_linklayer(linklayer, 
b2));
@@ -530,14 +529,12 @@ static int cbq_print_opt(struct qdisc_util *qu, FILE *f, 
struct rtattr *opt)
fprintf(f, "prio %u", wrr->priority);
else
fprintf(f, "prio no-transmit");
-   if (show_details) {
-   char buf[64];
 
+   if (show_details) {
fprintf(f, "/%u ", wrr->cpriority);
-   if (wrr->weight != 1) {
-   print_rate(buf, sizeof(buf), wrr->weight);
-   fprintf(f, "weight %s ", buf);
-   }
+   if (wrr->weight != 1)
+   fprintf(f, "weight %s ",
+   sprint_rate(wrr->weight, b1));
if (wrr->allot)
fprintf(f, "allot %ub ", wrr->allot);
}
-- 
2.18.0



[PATCH iproute2-next 0/9] TC more JSON support

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

Update core of TC command and library to do more JSON.
Most of this patch set is about getting tc utility functions
to be more friendly to the json_print infrastructure.

Stephen Hemminger (9):
  tc: use JSON in error handling
  tc: use const char in util
  tc: convert stats print to json
  tc/cbq: use sprint_rate
  tc/util: remove print_rate
  tc/util: remove unused print_size
  tc/util: remove unused print_time
  tc/util: add print helpers for JSON
  tc/sfq: add json support

 tc/q_cbq.c   | 15 -
 tc/q_sfq.c   | 65 +---
 tc/tc.c  | 19 ++-
 tc/tc_util.c | 94 +++-
 tc/tc_util.h | 11 +++---
 5 files changed, 109 insertions(+), 95 deletions(-)

-- 
2.18.0



[PATCH iproute2-next 9/9] tc/sfq: add json support

2018-07-09 Thread Stephen Hemminger
From: Stephen Hemminger 

Convert to use JSON

Signed-off-by: Stephen Hemminger 
---
 tc/q_sfq.c | 65 --
 1 file changed, 39 insertions(+), 26 deletions(-)

diff --git a/tc/q_sfq.c b/tc/q_sfq.c
index 6a1d853b7c93..cc8ce0dddf7e 100644
--- a/tc/q_sfq.c
+++ b/tc/q_sfq.c
@@ -205,9 +205,6 @@ static int sfq_print_opt(struct qdisc_util *qu, FILE *f, 
struct rtattr *opt)
struct tc_sfq_qopt *qopt;
struct tc_sfq_qopt_v1 *qopt_ext = NULL;
 
-   SPRINT_BUF(b1);
-   SPRINT_BUF(b2);
-   SPRINT_BUF(b3);
if (opt == NULL)
return 0;
 
@@ -216,36 +213,53 @@ static int sfq_print_opt(struct qdisc_util *qu, FILE *f, 
struct rtattr *opt)
if (RTA_PAYLOAD(opt) >= sizeof(*qopt_ext))
qopt_ext = RTA_DATA(opt);
qopt = RTA_DATA(opt);
-   fprintf(f, "limit %up ", qopt->limit);
-   fprintf(f, "quantum %s ", sprint_size(qopt->quantum, b1));
+
+   print_uint(PRINT_ANY, "limit", "limit %up ", qopt->limit);
+   print_size("quantum", "quantum %s ", qopt->quantum);
+   
if (qopt_ext && qopt_ext->depth)
-   fprintf(f, "depth %u ", qopt_ext->depth);
+   print_uint(PRINT_ANY, "depth",
+  "depth %u ", qopt_ext->depth);
if (qopt_ext && qopt_ext->headdrop)
-   fprintf(f, "headdrop ");
+   print_null(PRINT_ANY, "headdrop", "headdrop ", NULL);
 
if (show_details) {
-   fprintf(f, "flows %u/%u ", qopt->flows, qopt->divisor);
+   print_uint(PRINT_ANY, "flows", "flows %u", qopt->flows);
+   print_uint(PRINT_ANY, "divisor", "/%u ", qopt->divisor);
+   } else {
+   print_uint(PRINT_ANY, "divisor", "divisor %u ", qopt->divisor);
}
-   fprintf(f, "divisor %u ", qopt->divisor);
+
if (qopt->perturb_period)
-   fprintf(f, "perturb %dsec ", qopt->perturb_period);
+   print_int(PRINT_ANY, "perturb",
+ "perturb %dsec ", qopt->perturb_period);
+
if (qopt_ext && qopt_ext->qth_min) {
-   fprintf(f, "\n ewma %u ", qopt_ext->Wlog);
-   fprintf(f, "min %s max %s probability %g ",
-   sprint_size(qopt_ext->qth_min, b2),
-   sprint_size(qopt_ext->qth_max, b3),
-   qopt_ext->max_P / pow(2, 32));
+   print_string(PRINT_FP, NULL, "%s", _SL_);
+   print_uint(PRINT_ANY, "ewma", " ewma %u ", qopt_ext->Wlog);
+   print_size("qth_min", "min %s" , qopt_ext->qth_min);
+   print_size("qth_max", "max %s ", qopt_ext->qth_max);
+   print_float(PRINT_ANY, "probability", "probability %g ",
+   qopt_ext->max_P / pow(2, 32));
+
if (qopt_ext->flags & TC_RED_ECN)
-   fprintf(f, "ecn ");
+   print_null(PRINT_ANY, "ecn", "ecn ", NULL);
+
if (show_stats) {
-   fprintf(f, "\n prob_mark %u prob_mark_head %u prob_drop 
%u",
-   qopt_ext->stats.prob_mark,
-   qopt_ext->stats.prob_mark_head,
-   qopt_ext->stats.prob_drop);
-   fprintf(f, "\n forced_mark %u forced_mark_head %u 
forced_drop %u",
-   qopt_ext->stats.forced_mark,
-   qopt_ext->stats.forced_mark_head,
-   qopt_ext->stats.forced_drop);
+   print_string(PRINT_FP, NULL, "%s", _SL_);
+   print_uint(PRINT_ANY, "prob_mark", " prob_mark %u",
+  qopt_ext->stats.prob_mark);
+   print_uint(PRINT_ANY, "prob_mark_head", " 
prob_mark_head %u",
+  qopt_ext->stats.prob_mark_head);
+   print_uint(PRINT_ANY, "prob_drop"," prob_drop %u",
+  qopt_ext->stats.prob_drop);
+   print_string(PRINT_FP, NULL, "%s", _SL_);
+   print_uint(PRINT_ANY, "forced_mark", " forced_mark %u",
+  qopt_ext->stats.forced_mark);
+   print_uint(PRINT_ANY, "forced_mark_head", " 
forced_mark_head %u",
+  qopt_ext->stats.forced_mark_head);
+   print_uint(PRINT_ANY, "forced_drop"," forced_drop %u",
+  qopt_ext->stats.forced_drop);
}
}
return 0;
@@ -262,8 +276,7 @@ static int sfq_print_xstats(struct qdisc_util *qu, FILE *f,
return -1;
st = RTA_DATA(xstats);
 
-   fprintf(f, " allot %d ", st->allot);
-   fprintf(f, "\n");
+   print_int(PRINT_ANY, "allot", "allot %d\n", st->allot);
return 0;
 }
 
-- 
2.18.0



Re: [PATCH bpf-next v2 09/12] tools: bpftool: reimplement bpf_prog_load() for prog load

2018-07-09 Thread Alexei Starovoitov
On Mon, Jul 09, 2018 at 10:59:41AM -0700, Jakub Kicinski wrote:
> bpf_prog_load() is a very useful helper but it doesn't give us full
> flexibility of modifying the BPF objects before loading.  Open code
> bpf_prog_load() in bpftool so we can add extra logic in following
> commits.
> 
> Signed-off-by: Jakub Kicinski 
> Reviewed-by: Quentin Monnet 

Acked-by: Alexei Starovoitov 



Re: [PATCH net-next v3 0/2] tcp: fix high tail latencies in DCTCP

2018-07-09 Thread Lawrence Brakmo
On 7/9/18, 12:32 PM, "Yuchung Cheng"  wrote:

On Sat, Jul 7, 2018 at 7:07 AM, Neal Cardwell  wrote:
> On Sat, Jul 7, 2018 at 7:15 AM David Miller  wrote:
>>
>> From: Lawrence Brakmo 
>> Date: Tue, 3 Jul 2018 09:26:13 -0700
>>
>> > When have observed high tail latencies when using DCTCP for RPCs as
>> > compared to using Cubic. For example, in one setup there are 2 hosts
>> > sending to a 3rd one, with each sender having 3 flows (1 stream,
>> > 1 1MB back-to-back RPCs and 1 10KB back-to-back RPCs). The following
>> > table shows the 99% and 99.9% latencies for both Cubic and dctcp:
>> >
>> >Cubic 99%  Cubic 99.9%   dctcp 99%dctcp 99.9%
>> > 1MB RPCs2.6ms   5.5ms 43ms  208ms
>> > 10KB RPCs1.1ms   1.3ms 53ms  212ms
>>  ...
>> > v2: Removed call to tcp_ca_event from tcp_send_ack since I added one in
>> > tcp_event_ack_sent. Based on Neal Cardwell 
>> > feedback.
>> > Modified tcp_ecn_check_ce (and renamed it tcp_ecn_check) instead 
of modifying
>> > tcp_ack_send_check to insure an ACK when cwr is received.
>> > v3: Handling cwr in tcp_ecn_accept_cwr instead of in tcp_ecn_check.
>> >
>> > [PATCH net-next v3 1/2] tcp: notify when a delayed ack is sent
>> > [PATCH net-next v3 2/2] tcp: ack immediately when a cwr packet
>>
>> Neal and co., what are your thoughts right now about this patch series?
>>
>> Thank you.
>
> IMHO these patches are a definite improvement over what we have now.
>
> That said, in chatting with Yuchung before the July 4th break, I think
> Yuchung and I agreed that we would ideally like to see something like
> the following:
>
> (1) refactor the DCTCP code to check for pending delayed ACKs directly
> using existing state (inet_csk(sk)->icsk_ack.pending &
> ICSK_ACK_TIMER), and remove the ca->delayed_ack_reserved DCTCP field
> and the CA_EVENT_DELAYED_ACK and CA_EVENT_NON_DELAYED_ACK callbacks
> added for DCTCP (which Larry determined had at least one bug).

I agree that getting rid of the callbacks would be an improvement, but that is 
more about optimizing the code. This could be done after we fix the current 
bugs. My concern is that it may be more complicated that we think and the 
current bug would continue to exist. Yes, I realize that it has been there for 
a while; but not because no one found it before, but because it was hard to 
pinpoint. 

> (2) fix the bug with the DCTCP call to tcp_send_ack(sk) causing
> delayed ACKs to be incorrectly dropped/forgotten (not yet addressed by
> this patch series)

Good idea, but as I mentioned earlier, I would rather fix the really bad bugs 
first and then deal with this one. As far as I can see from my testing of 
DC-TCP, I have not seen any bad consequences from this bug so far.

> (3) then with fixes (1) and (2) in place, re-run tests and see if we
> still need Larry's heuristic (in patch 2) to fire an ACK immediately
> if a receiver receives a CWR packet (I suspect this is still very
> useful, but I think Yuchung is reluctant to add this complexity unless
> we have verified it's still needed after (1) and (2))

I fail to understand how (1) and (2) have anything to do with ACKing 
immediately when we receive a CWR packet. It has nothing to do with a current 
delayed ACK, it has to do with the cwnd closing to 1 when an TCP ECE marked 
packet is received at the end of an RPC and the current TCP delay ACK logic 
choosing to delay the ACK. The issue happens right after the receiver has sent 
its reply to the RPC, so at that stage there are no active delayed ACKs (the 
first patch fixed the issue where DC-TCP thought there was an active delayed 
ACK). 

>
> Our team may be able to help out with some proposed patches for (1) and 
(2).
>
> In any case, I would love to have Yuchung and Eric weigh in (perhaps
> Monday) before we merge this patch series.
Thanks Neal. Sorry for not reflecting these timely before I took off
for July 4 holidays. I was going to post the same comment - Larry: I
could provide draft patches if that helps.

Yuchung: go ahead and send me the drafts. But as I already mentioned, I would 
like to fix the bad bug first and then make it pretty.

>
> Thanks,
> neal
   



Re: [PATCH v3 net-next] net/sched: add skbprio scheduler

2018-07-09 Thread Michel Machado

On 07/09/2018 11:44 AM, Marcelo Ricardo Leitner wrote:

On Sat, Jul 07, 2018 at 03:43:55PM +0530, Nishanth Devarajan wrote:

net/sched: add skbprio scheduer

Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes packets
according to their skb->priority field. Under congestion, already-enqueued lower
priority packets will be dropped to make space available for higher priority
packets. Skbprio was conceived as a solution for denial-of-service defenses that
need to route packets with different priorities as a means to overcome DoS
attacks.


Why can't we implement this as a new flag for sch_prio.c?

I don't see why this duplication is needed, especially because it will
only be "slower" (as in, it will do more work) when qdisc is already
full and dropping packets anyway.


   sch_prio.c and skbprio diverge on a number of aspects:

   1. sch_prio.c supports up to 16 priorities whereas skbprio 64. This 
is not just a matter of changing a constant since sch_prio.c doesn't use 
skb->priority.


   2. sch_prio.c does not have a global limit on the number of packets 
on all its queues, only a limit per queue.


   3. The queues of sch_prio.c are struct Qdisc, which don't have a 
method to drop at its tail.


   Given the divergences, adding flags to sch_prio.c will essentially 
keep both implementations together instead of being isolated as being 
proposed.


   On the speed point, there may not be noticeable difference between 
both qdiscs because the enqueueing and dequeueing costs of both qdics 
are O(1). Notice that the "extra work" (i.e. dropping lower priority 
packets) is a key aspect of skbprio since it gives routers a cheap way 
to choose which packets to drop during a DoS.


[ ]'s
Michel Machado


Re: [PATCH net-next] tcp: expose both send and receive intervals for rate sample

2018-07-09 Thread Eric Dumazet



On 07/09/2018 10:53 AM, Deepti Raghavan wrote:
> Congestion control algorithms, which access the rate sample
> through the tcp_cong_control function, only have access to the maximum
> of the send and receive interval, for cases where the acknowledgment
> rate may be inaccurate due to ACK compression or decimation. Algorithms
> may want to use send rates and receive rates as separate signals.
> 
> Signed-off-by: Deepti Raghavan 

Signed-off-by: Eric Dumazet 

(Assuming another CC is coming soon, using this...)

Thanks



Re: [PATCH net-next] tcp: expose both send and receive intervals for rate sample

2018-07-09 Thread Neal Cardwell
On Mon, Jul 9, 2018 at 1:58 PM Deepti Raghavan  wrote:
>
> Congestion control algorithms, which access the rate sample
> through the tcp_cong_control function, only have access to the maximum
> of the send and receive interval, for cases where the acknowledgment
> rate may be inaccurate due to ACK compression or decimation. Algorithms
> may want to use send rates and receive rates as separate signals.
>
> Signed-off-by: Deepti Raghavan 
> ---
>  include/net/tcp.h   | 2 ++
>  net/ipv4/tcp_rate.c | 4 
>  2 files changed, 6 insertions(+)

Thanks for re-sending. It does seem to be showing up in patchwork now:
  https://patchwork.ozlabs.org/patch/941532/
And I can confirm I'm able to apply it to net-next.

Acked-by: Neal Cardwell 

thanks,
neal


[PATCH bpf-next v2 07/12] tools: libbpf: recognize offload neutral maps

2018-07-09 Thread Jakub Kicinski
Add helper to libbpf for recognizing maps which should not have
ifindex set when program is loaded.  These maps only contain
host metadata and therefore are not marked for offload, e.g.
the perf event map.

Use this helper in bpf_prog_load_xattr().

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 tools/lib/bpf/libbpf.c | 8 +++-
 tools/lib/bpf/libbpf.h | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 30f3e58bd563..edc3b0b3737d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2154,6 +2154,11 @@ void *bpf_map__priv(struct bpf_map *map)
return map ? map->priv : ERR_PTR(-EINVAL);
 }
 
+bool bpf_map__is_offload_neutral(struct bpf_map *map)
+{
+   return map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY;
+}
+
 void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex)
 {
map->map_ifindex = ifindex;
@@ -2278,7 +2283,8 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr 
*attr,
}
 
bpf_map__for_each(map, obj) {
-   map->map_ifindex = attr->ifindex;
+   if (!bpf_map__is_offload_neutral(map))
+   map->map_ifindex = attr->ifindex;
}
 
if (!first_prog) {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 617dacfc6704..3122d74f2643 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -255,6 +255,7 @@ typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void 
*);
 int bpf_map__set_priv(struct bpf_map *map, void *priv,
  bpf_map_clear_priv_t clear_priv);
 void *bpf_map__priv(struct bpf_map *map);
+bool bpf_map__is_offload_neutral(struct bpf_map *map);
 void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex);
 int bpf_map__pin(struct bpf_map *map, const char *path);
 
-- 
2.17.1



[PATCH bpf-next v2 04/12] tools: bpftool: add support for loading programs for offload

2018-07-09 Thread Jakub Kicinski
Extend the bpftool prog load command to also accept "dev"
parameter, which will allow us to load programs onto devices.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 .../bpftool/Documentation/bpftool-prog.rst|  6 ++--
 tools/bpf/bpftool/bash-completion/bpftool | 23 ++--
 tools/bpf/bpftool/prog.c  | 35 +--
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst 
b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 43d34a5c3ec5..41723c6acaa6 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -24,7 +24,7 @@ MAP COMMANDS
 |  **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** 
| **visual**}]
 |  **bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | 
**opcodes**}]
 |  **bpftool** **prog pin** *PROG* *FILE*
-|  **bpftool** **prog load** *OBJ* *FILE*
+|  **bpftool** **prog load** *OBJ* *FILE* [**dev** *NAME*]
 |  **bpftool** **prog help**
 |
 |  *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
@@ -64,8 +64,10 @@ DESCRIPTION
 
  Note: *FILE* must be located in *bpffs* mount.
 
-   **bpftool prog load** *OBJ* *FILE*
+   **bpftool prog load** *OBJ* *FILE* [**dev** *NAME*]
  Load bpf program from binary *OBJ* and pin as *FILE*.
+ If **dev** *NAME* is specified program will be loaded onto
+ given networking device (offload).
 
  Note: *FILE* must be located in *bpffs* mount.
 
diff --git a/tools/bpf/bpftool/bash-completion/bpftool 
b/tools/bpf/bpftool/bash-completion/bpftool
index ce0bc0cda361..238c2f80092a 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -99,6 +99,12 @@ _bpftool_get_prog_tags()
 command sed -n 's/.*"tag": "\(.*\)",$/\1/p' )" -- "$cur" ) )
 }
 
+_sysfs_get_netdevs()
+{
+COMPREPLY+=( $( compgen -W "$( ls /sys/class/net 2>/dev/null )" -- \
+"$cur" ) )
+}
+
 # For bpftool map update: retrieve type of the map to update.
 _bpftool_map_update_map_type()
 {
@@ -262,8 +268,21 @@ _bpftool()
 return 0
 ;;
 load)
-_filedir
-return 0
+if [[ ${#words[@]} -lt 6 ]]; then
+_filedir
+return 0
+fi
+
+case $prev in
+dev)
+_sysfs_get_netdevs
+return 0
+;;
+*)
+_bpftool_once_attr 'dev'
+return 0
+;;
+esac
 ;;
 *)
 [[ $prev == $object ]] && \
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index a5ef46c59029..21c74de7156f 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -681,6 +682,9 @@ static int do_pin(int argc, char **argv)
 
 static int do_load(int argc, char **argv)
 {
+   struct bpf_prog_load_attr attr = {
+   .prog_type  = BPF_PROG_TYPE_UNSPEC,
+   };
const char *objfile, *pinfile;
struct bpf_object *obj;
int prog_fd;
@@ -690,7 +694,34 @@ static int do_load(int argc, char **argv)
objfile = GET_ARG();
pinfile = GET_ARG();
 
-   if (bpf_prog_load(objfile, BPF_PROG_TYPE_UNSPEC, , _fd)) {
+   while (argc) {
+   if (is_prefix(*argv, "dev")) {
+   NEXT_ARG();
+
+   if (attr.ifindex) {
+   p_err("offload device already specified");
+   return -1;
+   }
+   if (!REQ_ARGS(1))
+   return -1;
+
+   attr.ifindex = if_nametoindex(*argv);
+   if (!attr.ifindex) {
+   p_err("unrecognized netdevice '%s': %s",
+ *argv, strerror(errno));
+   return -1;
+   }
+   NEXT_ARG();
+   } else {
+   p_err("expected no more arguments or 'dev', got: '%s'?",
+ *argv);
+   return -1;
+   }
+   }
+
+   attr.file = objfile;
+
+   if (bpf_prog_load_xattr(, , _fd)) {
p_err("failed to load program");
return -1;
}
@@ -722,7 +753,7 @@ static int do_help(int argc, char **argv)
"   %s %s dump xlated PROG [{ file FILE | opcodes | visual 

[PATCH bpf-next v2 06/12] tools: bpftool: allow users to specify program type for prog load

2018-07-09 Thread Jakub Kicinski
Sometimes program section names don't match with libbpf's expectation.
In particular XDP's default section names differ between libbpf and
iproute2.  Allow users to pass program type on command line.  Name
the types like the libbpf expected section names.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 .../bpftool/Documentation/bpftool-prog.rst| 15 ++-
 tools/bpf/bpftool/bash-completion/bpftool |  6 +++
 tools/bpf/bpftool/prog.c  | 44 +--
 3 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst 
b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 41723c6acaa6..e53e1ad2caf0 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -24,10 +24,19 @@ MAP COMMANDS
 |  **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** 
| **visual**}]
 |  **bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | 
**opcodes**}]
 |  **bpftool** **prog pin** *PROG* *FILE*
-|  **bpftool** **prog load** *OBJ* *FILE* [**dev** *NAME*]
+|  **bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**dev** 
*NAME*]
 |  **bpftool** **prog help**
 |
 |  *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+|  *TYPE* := {
+|  **socket** | **kprobe** | **kretprobe** | **classifier** | 
**action** |
+|  **tracepoint** | **raw_tracepoint** | **xdp** | **perf_event** 
| **cgroup/skb** |
+|  **cgroup/sock** | **cgroup/dev** | **lwt_in** | **lwt_out** | 
**lwt_xmit** |
+|  **lwt_seg6local** | **sockops** | **sk_skb** | **sk_msg** | 
**lirc_mode2** |
+|  **cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | 
**cgroup/post_bind6** |
+|  **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** 
| **cgroup/sendmsg6**
+|  }
+
 
 DESCRIPTION
 ===
@@ -64,8 +73,10 @@ DESCRIPTION
 
  Note: *FILE* must be located in *bpffs* mount.
 
-   **bpftool prog load** *OBJ* *FILE* [**dev** *NAME*]
+   **bpftool prog load** *OBJ* *FILE* [**type** *TYPE*] [**dev** *NAME*]
  Load bpf program from binary *OBJ* and pin as *FILE*.
+ **type** is optional, if not specified program type will be
+ inferred from section names.
  If **dev** *NAME* is specified program will be loaded onto
  given networking device (offload).
 
diff --git a/tools/bpf/bpftool/bash-completion/bpftool 
b/tools/bpf/bpftool/bash-completion/bpftool
index 238c2f80092a..caf8711993be 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -274,11 +274,17 @@ _bpftool()
 fi
 
 case $prev in
+type)
+COMPREPLY=( $( compgen -W "socket kprobe kretprobe 
classifier action tracepoint raw_tracepoint xdp perf_event cgroup/skb 
cgroup/sock cgroup/dev lwt_in lwt_out lwt_xmit lwt_seg6local sockops sk_skb 
sk_msg lirc_mode2 cgroup/bind4 cgroup/bind6 cgroup/connect4 cgroup/connect6 
cgroup/sendmsg4 cgroup/sendmsg6 cgroup/post_bind4 cgroup/post_bind6" -- \
+   "$cur" ) )
+return 0
+;;
 dev)
 _sysfs_get_netdevs
 return 0
 ;;
 *)
+_bpftool_once_attr 'type'
 _bpftool_once_attr 'dev'
 return 0
 ;;
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 21c74de7156f..7a06fd4c5d27 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -688,6 +688,7 @@ static int do_load(int argc, char **argv)
const char *objfile, *pinfile;
struct bpf_object *obj;
int prog_fd;
+   int err;
 
if (!REQ_ARGS(2))
return -1;
@@ -695,7 +696,37 @@ static int do_load(int argc, char **argv)
pinfile = GET_ARG();
 
while (argc) {
-   if (is_prefix(*argv, "dev")) {
+   if (is_prefix(*argv, "type")) {
+   char *type;
+
+   NEXT_ARG();
+
+   if (attr.prog_type != BPF_PROG_TYPE_UNSPEC) {
+   p_err("program type already specified");
+   return -1;
+   }
+   if (!REQ_ARGS(1))
+   return -1;
+
+   /* Put a '/' at the end of type to appease libbpf */
+   type = malloc(strlen(*argv) + 2);
+   if (!type) {
+   p_err("mem alloc failed");
+ 

[PATCH bpf-next v2 05/12] tools: libbpf: expose the prog type guessing from section name logic

2018-07-09 Thread Jakub Kicinski
libbpf can guess program type based on ELF section names.  As libbpf
becomes more popular its association between section name strings and
types becomes more of a standard.  Allow libbpf users to use the same
logic for matching strings to types, e.g. when the string originates
from command line.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 tools/lib/bpf/libbpf.c | 43 --
 tools/lib/bpf/libbpf.h |  3 +++
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 38ed3e92e393..30f3e58bd563 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2081,25 +2081,33 @@ static const struct {
 #undef BPF_S_PROG_SEC
 #undef BPF_SA_PROG_SEC
 
-static int bpf_program__identify_section(struct bpf_program *prog)
+int libbpf_prog_type_by_string(const char *name, enum bpf_prog_type *prog_type,
+  enum bpf_attach_type *expected_attach_type)
 {
int i;
 
-   if (!prog->section_name)
-   goto err;
-
-   for (i = 0; i < ARRAY_SIZE(section_names); i++)
-   if (strncmp(prog->section_name, section_names[i].sec,
-   section_names[i].len) == 0)
-   return i;
-
-err:
-   pr_warning("failed to guess program type based on section name %s\n",
-  prog->section_name);
+   if (!name)
+   return -1;
 
+   for (i = 0; i < ARRAY_SIZE(section_names); i++) {
+   if (strncmp(name, section_names[i].sec, section_names[i].len))
+   continue;
+   *prog_type = section_names[i].prog_type;
+   *expected_attach_type = section_names[i].expected_attach_type;
+   return 0;
+   }
return -1;
 }
 
+static int
+bpf_program__identify_section(struct bpf_program *prog,
+ enum bpf_prog_type *prog_type,
+ enum bpf_attach_type *expected_attach_type)
+{
+   return libbpf_prog_type_by_string(prog->section_name, prog_type,
+ expected_attach_type);
+}
+
 int bpf_map__fd(struct bpf_map *map)
 {
return map ? map->fd : -EINVAL;
@@ -2230,7 +2238,6 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr 
*attr,
enum bpf_prog_type prog_type;
struct bpf_object *obj;
struct bpf_map *map;
-   int section_idx;
int err;
 
if (!attr)
@@ -2252,14 +2259,14 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr 
*attr,
prog->prog_ifindex = attr->ifindex;
expected_attach_type = attr->expected_attach_type;
if (prog_type == BPF_PROG_TYPE_UNSPEC) {
-   section_idx = bpf_program__identify_section(prog);
-   if (section_idx < 0) {
+   err = bpf_program__identify_section(prog, _type,
+   
_attach_type);
+   if (err < 0) {
+   pr_warning("failed to guess program type based 
on section name %s\n",
+  prog->section_name);
bpf_object__close(obj);
return -EINVAL;
}
-   prog_type = section_names[section_idx].prog_type;
-   expected_attach_type =
-   section_names[section_idx].expected_attach_type;
}
 
bpf_program__set_type(prog, prog_type);
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 564f4be9bae0..617dacfc6704 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -92,6 +92,9 @@ int bpf_object__set_priv(struct bpf_object *obj, void *priv,
 bpf_object_clear_priv_t clear_priv);
 void *bpf_object__priv(struct bpf_object *prog);
 
+int libbpf_prog_type_by_string(const char *name, enum bpf_prog_type *prog_type,
+  enum bpf_attach_type *expected_attach_type);
+
 /* Accessors of bpf_program */
 struct bpf_program;
 struct bpf_program *bpf_program__next(struct bpf_program *prog,
-- 
2.17.1



[PATCH bpf-next v2 10/12] tools: bpf: make use of reallocarray

2018-07-09 Thread Jakub Kicinski
reallocarray() is a safer variant of realloc which checks for
multiplication overflow in case of array allocation.  Since it's
not available in Glibc < 2.26 import kernel's overflow.h and
add a static inline implementation when needed.  Use feature
detection to probe for existence of reallocarray.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
Reviewed-by: Jiong Wang 
---
 tools/bpf/bpftool/Makefile  |   6 +-
 tools/bpf/bpftool/main.h|   1 +
 tools/bpf/bpftool/xlated_dumper.c   |   6 +-
 tools/build/feature/Makefile|   4 +
 tools/build/feature/test-reallocarray.c |   8 +
 tools/include/linux/compiler-gcc.h  |   4 +
 tools/include/linux/overflow.h  | 278 
 tools/include/tools/libc_compat.h   |  23 ++
 tools/lib/bpf/Makefile  |   6 +-
 tools/lib/bpf/libbpf.c  |   9 +-
 10 files changed, 336 insertions(+), 9 deletions(-)
 create mode 100644 tools/build/feature/test-reallocarray.c
 create mode 100644 tools/include/linux/overflow.h
 create mode 100644 tools/include/tools/libc_compat.h

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 0911b00b25cc..6c4830e18879 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -52,7 +52,7 @@ INSTALL ?= install
 RM ?= rm -f
 
 FEATURE_USER = .bpftool
-FEATURE_TESTS = libbfd disassembler-four-args
+FEATURE_TESTS = libbfd disassembler-four-args reallocarray
 FEATURE_DISPLAY = libbfd disassembler-four-args
 
 check_feat := 1
@@ -75,6 +75,10 @@ ifeq ($(feature-disassembler-four-args), 1)
 CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE
 endif
 
+ifeq ($(feature-reallocarray), 0)
+CFLAGS += -DCOMPAT_NEED_REALLOCARRAY
+endif
+
 include $(wildcard $(OUTPUT)*.d)
 
 all: $(OUTPUT)bpftool
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 15b6c49ae533..1e02e4031693 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "json_writer.h"
 
diff --git a/tools/bpf/bpftool/xlated_dumper.c 
b/tools/bpf/bpftool/xlated_dumper.c
index b97f1da60dd1..3284759df98a 100644
--- a/tools/bpf/bpftool/xlated_dumper.c
+++ b/tools/bpf/bpftool/xlated_dumper.c
@@ -35,6 +35,7 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
+#define _GNU_SOURCE
 #include 
 #include 
 #include 
@@ -66,9 +67,8 @@ void kernel_syms_load(struct dump_data *dd)
while (!feof(fp)) {
if (!fgets(buff, sizeof(buff), fp))
break;
-   tmp = realloc(dd->sym_mapping,
- (dd->sym_count + 1) *
- sizeof(*dd->sym_mapping));
+   tmp = reallocarray(dd->sym_mapping, dd->sym_count + 1,
+  sizeof(*dd->sym_mapping));
if (!tmp) {
 out:
free(dd->sym_mapping);
diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index dac9563b5470..0516259be70f 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -14,6 +14,7 @@ FILES=  \
  test-libaudit.bin  \
  test-libbfd.bin\
  test-disassembler-four-args.bin\
+ test-reallocarray.bin \
  test-liberty.bin   \
  test-liberty-z.bin \
  test-cplus-demangle.bin\
@@ -204,6 +205,9 @@ FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
 $(OUTPUT)test-disassembler-four-args.bin:
$(BUILD) -DPACKAGE='"perf"' -lbfd -lopcodes
 
+$(OUTPUT)test-reallocarray.bin:
+   $(BUILD)
+
 $(OUTPUT)test-liberty.bin:
$(CC) $(CFLAGS) -Wall -Werror -o $@ test-libbfd.c -DPACKAGE='"perf"' 
$(LDFLAGS) -lbfd -ldl -liberty
 
diff --git a/tools/build/feature/test-reallocarray.c 
b/tools/build/feature/test-reallocarray.c
new file mode 100644
index ..8170de35150d
--- /dev/null
+++ b/tools/build/feature/test-reallocarray.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include 
+
+int main(void)
+{
+   return !!reallocarray(NULL, 1, 1);
+}
diff --git a/tools/include/linux/compiler-gcc.h 
b/tools/include/linux/compiler-gcc.h
index 70fe61295733..0d35f18006a1 100644
--- a/tools/include/linux/compiler-gcc.h
+++ b/tools/include/linux/compiler-gcc.h
@@ -36,3 +36,7 @@
 #endif
 #define __printf(a, b) __attribute__((format(printf, a, b)))
 #define __scanf(a, b)  __attribute__((format(scanf, a, b)))
+
+#if GCC_VERSION >= 50100
+#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
+#endif
diff --git a/tools/include/linux/overflow.h b/tools/include/linux/overflow.h
new file mode 100644
index ..8712ff70995f
--- /dev/null
+++ b/tools/include/linux/overflow.h
@@ -0,0 +1,278 @@
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+#ifndef __LINUX_OVERFLOW_H
+#define 

[PATCH bpf-next v2 11/12] tools: libbpf: allow map reuse

2018-07-09 Thread Jakub Kicinski
More advanced applications may want to only replace programs without
destroying associated maps.  Allow libbpf users to achieve that.
Instead of always creating all of the maps at load time, expose to
users an API to reconstruct the map object from already existing
map.

The map parameters are read from the kernel and replace the parameters
of the ELF map.  libbpf does not restrict the map replacement, i.e.
the reused map does not have to be compatible with the ELF map
definition.  We relay on the verifier for checking the compatibility
between maps and programs.  The ELF map definition is completely
overwritten by the information read from the kernel, to make sure
libbpf's view of map object corresponds to the actual map.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 tools/lib/bpf/libbpf.c | 35 +++
 tools/lib/bpf/libbpf.h |  1 +
 2 files changed, 36 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index b653dbb266c7..c80033fe66c3 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -215,6 +215,7 @@ struct bpf_map {
int fd;
char *name;
size_t offset;
+   bool fd_preset;
int map_ifindex;
struct bpf_map_def def;
uint32_t btf_key_type_id;
@@ -1082,6 +1083,34 @@ static int bpf_map_find_btf_info(struct bpf_map *map, 
const struct btf *btf)
return 0;
 }
 
+int bpf_map__reuse_fd(struct bpf_map *map, int fd)
+{
+   struct bpf_map_info info = {};
+   __u32 len = sizeof(info);
+   int err;
+
+   err = bpf_obj_get_info_by_fd(fd, , );
+   if (err)
+   return err;
+
+   map->fd = dup(fd);
+   if (map->fd < 0)
+   return map->fd;
+   map->fd_preset = true;
+
+   free(map->name);
+   map->name = strdup(info.name);
+   map->def.type = info.type;
+   map->def.key_size = info.key_size;
+   map->def.value_size = info.value_size;
+   map->def.max_entries = info.max_entries;
+   map->def.map_flags = info.map_flags;
+   map->btf_key_type_id = info.btf_key_type_id;
+   map->btf_value_type_id = info.btf_value_type_id;
+
+   return 0;
+}
+
 static int
 bpf_object__create_maps(struct bpf_object *obj)
 {
@@ -1094,6 +1123,12 @@ bpf_object__create_maps(struct bpf_object *obj)
struct bpf_map_def *def = >def;
int *pfd = >fd;
 
+   if (map->fd_preset) {
+   pr_debug("skip map create (preset) %s: fd=%d\n",
+map->name, map->fd);
+   continue;
+   }
+
create_attr.name = map->name;
create_attr.map_ifindex = map->map_ifindex;
create_attr.map_type = def->type;
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 60593ac44700..8e709a74f47c 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -261,6 +261,7 @@ typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void 
*);
 int bpf_map__set_priv(struct bpf_map *map, void *priv,
  bpf_map_clear_priv_t clear_priv);
 void *bpf_map__priv(struct bpf_map *map);
+int bpf_map__reuse_fd(struct bpf_map *map, int fd);
 bool bpf_map__is_offload_neutral(struct bpf_map *map);
 void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex);
 int bpf_map__pin(struct bpf_map *map, const char *path);
-- 
2.17.1



[PATCH bpf-next v2 03/12] tools: bpftool: refactor argument parsing for prog load

2018-07-09 Thread Jakub Kicinski
Add a new macro for printing more informative message than straight
usage() when parameters are missing, and use it for prog do_load().
Save the object and pin path argument to variables for clarity.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 tools/bpf/bpftool/main.h | 15 +++
 tools/bpf/bpftool/prog.c | 11 +++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index d39f7ef01d23..15b6c49ae533 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -50,6 +50,21 @@
 #define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); })
 #define NEXT_ARGP()({ (*argc)--; (*argv)++; if (*argc < 0) usage(); })
 #define BAD_ARG()  ({ p_err("what is '%s'?", *argv); -1; })
+#define GET_ARG()  ({ argc--; *argv++; })
+#define REQ_ARGS(cnt)  \
+   ({  \
+   int _cnt = (cnt);   \
+   bool _res;  \
+   \
+   if (argc < _cnt) {  \
+   p_err("'%s' needs at least %d arguments, %d found", \
+ argv[-1], _cnt, argc);\
+   _res = false;   \
+   } else {\
+   _res = true;\
+   }   \
+   _res;   \
+   })
 
 #define ERR_MAX_LEN1024
 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index a740da99d477..a5ef46c59029 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -681,18 +681,21 @@ static int do_pin(int argc, char **argv)
 
 static int do_load(int argc, char **argv)
 {
+   const char *objfile, *pinfile;
struct bpf_object *obj;
int prog_fd;
 
-   if (argc != 2)
-   usage();
+   if (!REQ_ARGS(2))
+   return -1;
+   objfile = GET_ARG();
+   pinfile = GET_ARG();
 
-   if (bpf_prog_load(argv[0], BPF_PROG_TYPE_UNSPEC, , _fd)) {
+   if (bpf_prog_load(objfile, BPF_PROG_TYPE_UNSPEC, , _fd)) {
p_err("failed to load program");
return -1;
}
 
-   if (do_pin_fd(prog_fd, argv[1]))
+   if (do_pin_fd(prog_fd, pinfile))
goto err_close_obj;
 
if (json_output)
-- 
2.17.1



[PATCH bpf-next v2 08/12] tools: libbpf: add extended attributes version of bpf_object__open()

2018-07-09 Thread Jakub Kicinski
Similarly to bpf_prog_load() users of bpf_object__open() may need
to specify the expected program type.  Program type is needed at
open to avoid the kernel version check for program types which don't
require it.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 tools/lib/bpf/libbpf.c | 21 +
 tools/lib/bpf/libbpf.h |  6 ++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index edc3b0b3737d..5b0e84fbcf71 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1520,7 +1520,8 @@ __bpf_object__open(const char *path, void *obj_buf, 
size_t obj_buf_sz,
return ERR_PTR(err);
 }
 
-struct bpf_object *bpf_object__open(const char *path)
+struct bpf_object *bpf_object__open_xattr(const char *path,
+ struct bpf_object_open_attr *attr)
 {
/* param validation */
if (!path)
@@ -1528,7 +1529,17 @@ struct bpf_object *bpf_object__open(const char *path)
 
pr_debug("loading %s\n", path);
 
-   return __bpf_object__open(path, NULL, 0, true);
+   return __bpf_object__open(path, NULL, 0,
+ bpf_prog_type__needs_kver(attr->prog_type));
+}
+
+struct bpf_object *bpf_object__open(const char *path)
+{
+   struct bpf_object_open_attr attr = {
+   .prog_type  = BPF_PROG_TYPE_UNSPEC,
+   };
+
+   return bpf_object__open_xattr(path, );
 }
 
 struct bpf_object *bpf_object__open_buffer(void *obj_buf,
@@ -2238,6 +2249,9 @@ int bpf_prog_load(const char *file, enum bpf_prog_type 
type,
 int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr,
struct bpf_object **pobj, int *prog_fd)
 {
+   struct bpf_object_open_attr open_attr = {
+   .prog_type  = attr->prog_type,
+   };
struct bpf_program *prog, *first_prog = NULL;
enum bpf_attach_type expected_attach_type;
enum bpf_prog_type prog_type;
@@ -2250,8 +2264,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr 
*attr,
if (!attr->file)
return -EINVAL;
 
-   obj = __bpf_object__open(attr->file, NULL, 0,
-bpf_prog_type__needs_kver(attr->prog_type));
+   obj = bpf_object__open_xattr(attr->file, _attr);
if (IS_ERR_OR_NULL(obj))
return -ENOENT;
 
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 3122d74f2643..60593ac44700 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -66,7 +66,13 @@ void libbpf_set_print(libbpf_print_fn_t warn,
 /* Hide internal to user */
 struct bpf_object;
 
+struct bpf_object_open_attr {
+   enum bpf_prog_type prog_type;
+};
+
 struct bpf_object *bpf_object__open(const char *path);
+struct bpf_object *bpf_object__open_xattr(const char *path,
+ struct bpf_object_open_attr *attr);
 struct bpf_object *bpf_object__open_buffer(void *obj_buf,
   size_t obj_buf_sz,
   const char *name);
-- 
2.17.1



[PATCH bpf-next v2 12/12] tools: bpftool: allow reuse of maps with bpftool prog load

2018-07-09 Thread Jakub Kicinski
Add map parameter to prog load which will allow reuse of existing
maps instead of creating new ones.

We need feature detection and compat code for reallocarray, since
it's not available in many libc versions.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 .../bpftool/Documentation/bpftool-prog.rst|  20 ++-
 tools/bpf/bpftool/bash-completion/bpftool |  67 +++-
 tools/bpf/bpftool/main.h  |   3 +
 tools/bpf/bpftool/map.c   |   4 +-
 tools/bpf/bpftool/prog.c  | 148 --
 5 files changed, 219 insertions(+), 23 deletions(-)

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst 
b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index e53e1ad2caf0..64156a16d530 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -24,9 +24,10 @@ MAP COMMANDS
 |  **bpftool** **prog dump xlated** *PROG* [{**file** *FILE* | **opcodes** 
| **visual**}]
 |  **bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | 
**opcodes**}]
 |  **bpftool** **prog pin** *PROG* *FILE*
-|  **bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**dev** 
*NAME*]
+|  **bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** 
{**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
 |  **bpftool** **prog help**
 |
+|  *MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
 |  *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
 |  *TYPE* := {
 |  **socket** | **kprobe** | **kretprobe** | **classifier** | 
**action** |
@@ -73,10 +74,17 @@ DESCRIPTION
 
  Note: *FILE* must be located in *bpffs* mount.
 
-   **bpftool prog load** *OBJ* *FILE* [**type** *TYPE*] [**dev** *NAME*]
+   **bpftool prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** 
*IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
  Load bpf program from binary *OBJ* and pin as *FILE*.
  **type** is optional, if not specified program type will be
  inferred from section names.
+ By default bpftool will create new maps as declared in the ELF
+ object being loaded.  **map** parameter allows for the reuse
+ of existing maps.  It can be specified multiple times, each
+ time for a different map.  *IDX* refers to index of the map
+ to be replaced in the ELF file counting from 0, while *NAME*
+ allows to replace a map by name.  *MAP* specifies the map to
+ use, referring to it by **id** or through a **pinned** file.
  If **dev** *NAME* is specified program will be loaded onto
  given networking device (offload).
 
@@ -172,6 +180,14 @@ EXAMPLES
 mov%rbx,0x0(%rbp)
 48 89 5d 00
 
+|
+| **# bpftool prog load xdp1_kern.o /sys/fs/bpf/xdp1 type xdp map name rxcnt 
id 7**
+| **# bpftool prog show pinned /sys/fs/bpf/xdp1**
+|   9: xdp  name xdp_prog1  tag 539ec6ce11b52f98  gpl
+|  loaded_at 2018-06-25T16:17:31-0700  uid 0
+|  xlated 488B  jited 336B  memlock 4096B  map_ids 7
+| **# rm /sys/fs/bpf/xdp1**
+|
 
 SEE ALSO
 
diff --git a/tools/bpf/bpftool/bash-completion/bpftool 
b/tools/bpf/bpftool/bash-completion/bpftool
index caf8711993be..598066c40191 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -99,6 +99,29 @@ _bpftool_get_prog_tags()
 command sed -n 's/.*"tag": "\(.*\)",$/\1/p' )" -- "$cur" ) )
 }
 
+_bpftool_get_obj_map_names()
+{
+local obj
+
+obj=$1
+
+maps=$(objdump -j maps -t $obj 2>/dev/null | \
+command awk '/g . maps/ {print $NF}')
+
+COMPREPLY+=( $( compgen -W "$maps" -- "$cur" ) )
+}
+
+_bpftool_get_obj_map_idxs()
+{
+local obj
+
+obj=$1
+
+nmaps=$(objdump -j maps -t $obj 2>/dev/null | grep -c 'g . maps')
+
+COMPREPLY+=( $( compgen -W "$(seq 0 $((nmaps - 1)))" -- "$cur" ) )
+}
+
 _sysfs_get_netdevs()
 {
 COMPREPLY+=( $( compgen -W "$( ls /sys/class/net 2>/dev/null )" -- \
@@ -220,12 +243,14 @@ _bpftool()
 # Completion depends on object and command in use
 case $object in
 prog)
-case $prev in
-id)
-_bpftool_get_prog_ids
-return 0
-;;
-esac
+if [[ $command != "load" ]]; then
+case $prev in
+id)
+_bpftool_get_prog_ids
+return 0
+;;
+esac
+fi
 
 local PROG_TYPE='id pinned tag'
 case $command in
@@ -268,22 +293,52 @@ _bpftool()
 return 0
 ;;
 load)
+local obj
+
 if [[ ${#words[@]} -lt 6 ]]; then
 

[PATCH bpf-next v2 09/12] tools: bpftool: reimplement bpf_prog_load() for prog load

2018-07-09 Thread Jakub Kicinski
bpf_prog_load() is a very useful helper but it doesn't give us full
flexibility of modifying the BPF objects before loading.  Open code
bpf_prog_load() in bpftool so we can add extra logic in following
commits.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 tools/bpf/bpftool/prog.c | 57 
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 7a06fd4c5d27..267d653c93f5 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -43,6 +43,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 #include 
 
@@ -682,12 +684,15 @@ static int do_pin(int argc, char **argv)
 
 static int do_load(int argc, char **argv)
 {
-   struct bpf_prog_load_attr attr = {
+   enum bpf_attach_type expected_attach_type;
+   struct bpf_object_open_attr attr = {
.prog_type  = BPF_PROG_TYPE_UNSPEC,
};
const char *objfile, *pinfile;
+   struct bpf_program *prog;
struct bpf_object *obj;
-   int prog_fd;
+   struct bpf_map *map;
+   __u32 ifindex = 0;
int err;
 
if (!REQ_ARGS(2))
@@ -719,7 +724,7 @@ static int do_load(int argc, char **argv)
strcat(type, "/");
 
err = libbpf_prog_type_by_string(type, _type,
-
_attach_type);
+_attach_type);
free(type);
if (err < 0) {
p_err("unknown program type '%s'", *argv);
@@ -729,15 +734,15 @@ static int do_load(int argc, char **argv)
} else if (is_prefix(*argv, "dev")) {
NEXT_ARG();
 
-   if (attr.ifindex) {
+   if (ifindex) {
p_err("offload device already specified");
return -1;
}
if (!REQ_ARGS(1))
return -1;
 
-   attr.ifindex = if_nametoindex(*argv);
-   if (!attr.ifindex) {
+   ifindex = if_nametoindex(*argv);
+   if (!ifindex) {
p_err("unrecognized netdevice '%s': %s",
  *argv, strerror(errno));
return -1;
@@ -750,14 +755,44 @@ static int do_load(int argc, char **argv)
}
}
 
-   attr.file = objfile;
-
-   if (bpf_prog_load_xattr(&attr, &obj, &prog_fd)) {
-   p_err("failed to load program");
+   obj = bpf_object__open_xattr(objfile, &attr);
+   if (IS_ERR_OR_NULL(obj)) {
+   p_err("failed to open object file");
return -1;
}
 
-   if (do_pin_fd(prog_fd, pinfile))
+   prog = bpf_program__next(NULL, obj);
+   if (!prog) {
+   p_err("object file doesn't contain any bpf program");
+   goto err_close_obj;
+   }
+
+   bpf_program__set_ifindex(prog, ifindex);
+   if (attr.prog_type == BPF_PROG_TYPE_UNSPEC) {
+   const char *sec_name = bpf_program__title(prog, false);
+
+   err = libbpf_prog_type_by_string(sec_name, &attr.prog_type,
+				     &expected_attach_type);
+   if (err < 0) {
+   p_err("failed to guess program type based on section 
name %s\n",
+ sec_name);
+   goto err_close_obj;
+   }
+   }
+   bpf_program__set_type(prog, attr.prog_type);
+   bpf_program__set_expected_attach_type(prog, expected_attach_type);
+
+   bpf_map__for_each(map, obj)
+   if (!bpf_map__is_offload_neutral(map))
+   bpf_map__set_ifindex(map, ifindex);
+
+   err = bpf_object__load(obj);
+   if (err) {
+   p_err("failed to load object file");
+   goto err_close_obj;
+   }
+
+   if (do_pin_fd(bpf_program__fd(prog), pinfile))
goto err_close_obj;
 
if (json_output)
-- 
2.17.1



[PATCH bpf-next v2 01/12] selftests/bpf: remove duplicated word from test offloads

2018-07-09 Thread Jakub Kicinski
Trivial removal of duplicated "mode" in error message.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 tools/testing/selftests/bpf/test_offload.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_offload.py 
b/tools/testing/selftests/bpf/test_offload.py
index be800d0e7a84..a257e4b08392 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -830,7 +830,7 @@ netns = []
 check_extack_nsim(err, "program loaded with different flags.", args)
 ret, _, err = sim.unset_xdp("", force=True,
 fail=False, include_stderr=True)
-fail(ret == 0, "Removed program with a bad mode mode")
+fail(ret == 0, "Removed program with a bad mode")
 check_extack_nsim(err, "program loaded with different flags.", args)
 
 start_test("Test MTU restrictions...")
-- 
2.17.1



[PATCH bpf-next v2 00/12] tools: bpf: extend bpftool prog load

2018-07-09 Thread Jakub Kicinski
Hi!

This series starts with two minor clean ups to test_offload.py
selftest script.

The next 9 patches extend the abilities of bpftool prog load
beyond the simple cgroup use cases.  Three new parameters are
added:

 - type - allows specifying program type, independent of how
   code sections are named;
 - map  - allows reusing existing maps, instead of creating a new
   map on every program load;
 - dev  - offload/binding to a device.

A number of changes to libbpf is required to accomplish the task.
The section - program type logic mapping is exposed.  We should
probably aim to use the libbpf program section naming everywhere.
For reuse of maps we need to allow users to set FD for bpf map
object in libbpf.

Examples

Load program my_xdp.o and pin it as /sys/fs/bpf/my_xdp, for xdp
program type:

$ bpftool prog load my_xdp.o /sys/fs/bpf/my_xdp \
  type xdp

As above but for offload:

$ bpftool prog load my_xdp.o /sys/fs/bpf/my_xdp \
  type xdp \
  dev netdevsim0

Load program my_maps.o, but for the first map reuse map id 17,
and for the map called "other_map" reuse pinned map /sys/fs/bpf/map0:

$ bpftool prog load my_maps.o /sys/fs/bpf/prog \
  map idx 0 id 17 \
  map name other_map pinned /sys/fs/bpf/map0

---
v2:
 - add compat for reallocarray().
 
Jakub Kicinski (12):
  selftests/bpf: remove duplicated word from test offloads
  selftests/bpf: add Error: prefix in check_extack helper
  tools: bpftool: refactor argument parsing for prog load
  tools: bpftool: add support for loading programs for offload
  tools: libbpf: expose the prog type guessing from section name logic
  tools: bpftool: allow users to specify program type for prog load
  tools: libbpf: recognize offload neutral maps
  tools: libbpf: add extended attributes version of bpf_object__open()
  tools: bpftool: reimplement bpf_prog_load() for prog load
  tools: bpf: make use of reallocarray
  tools: libbpf: allow map reuse
  tools: bpftool: allow reuse of maps with bpftool prog load

 .../bpftool/Documentation/bpftool-prog.rst|  33 ++-
 tools/bpf/bpftool/Makefile|   6 +-
 tools/bpf/bpftool/bash-completion/bpftool |  96 +-
 tools/bpf/bpftool/main.h  |  19 ++
 tools/bpf/bpftool/map.c   |   4 +-
 tools/bpf/bpftool/prog.c  | 245 ++-
 tools/bpf/bpftool/xlated_dumper.c |   6 +-
 tools/build/feature/Makefile  |   4 +
 tools/build/feature/test-reallocarray.c   |   8 +
 tools/include/linux/compiler-gcc.h|   4 +
 tools/include/linux/overflow.h| 278 ++
 tools/include/tools/libc_compat.h |  23 ++
 tools/lib/bpf/Makefile|   6 +-
 tools/lib/bpf/libbpf.c| 116 ++--
 tools/lib/bpf/libbpf.h|  11 +
 tools/testing/selftests/bpf/test_offload.py   |  10 +-
 16 files changed, 812 insertions(+), 57 deletions(-)
 create mode 100644 tools/build/feature/test-reallocarray.c
 create mode 100644 tools/include/linux/overflow.h
 create mode 100644 tools/include/tools/libc_compat.h

-- 
2.17.1



[PATCH bpf-next v2 02/12] selftests/bpf: add Error: prefix in check_extack helper

2018-07-09 Thread Jakub Kicinski
Currently the test only checks errors, not warnings, so save typing
and prefix the extack messages with "Error:" inside the check helper.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Quentin Monnet 
---
 tools/testing/selftests/bpf/test_offload.py | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_offload.py 
b/tools/testing/selftests/bpf/test_offload.py
index a257e4b08392..f8d9bd81d9a4 100755
--- a/tools/testing/selftests/bpf/test_offload.py
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -547,11 +547,11 @@ netns = [] # net namespaces to be removed
 if skip_extack:
 return
 lines = output.split("\n")
-comp = len(lines) >= 2 and lines[1] == reference
+comp = len(lines) >= 2 and lines[1] == 'Error: ' + reference
 fail(not comp, "Missing or incorrect netlink extack message")
 
 def check_extack_nsim(output, reference, args):
-check_extack(output, "Error: netdevsim: " + reference, args)
+check_extack(output, "netdevsim: " + reference, args)
 
 def check_no_extack(res, needle):
 fail((res[1] + res[2]).count(needle) or (res[1] + 
res[2]).count("Warning:"),
@@ -654,7 +654,7 @@ netns = []
 ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True,
  fail=False, include_stderr=True)
 fail(ret == 0, "TC filter loaded without enabling TC offloads")
-check_extack(err, "Error: TC offload is disabled on net device.", args)
+check_extack(err, "TC offload is disabled on net device.", args)
 sim.wait_for_flush()
 
 sim.set_ethtool_tc_offloads(True)
@@ -694,7 +694,7 @@ netns = []
  skip_sw=True,
  fail=False, include_stderr=True)
 fail(ret == 0, "Offloaded a filter to chain other than 0")
-check_extack(err, "Error: Driver supports only offload of chain 0.", args)
+check_extack(err, "Driver supports only offload of chain 0.", args)
 sim.tc_flush_filters()
 
 start_test("Test TC replace...")
-- 
2.17.1



[PATCH net-next] tcp: expose both send and receive intervals for rate sample

2018-07-09 Thread Deepti Raghavan
Congestion control algorithms, which access the rate sample
through the tcp_cong_control function, only have access to the maximum
of the send and receive interval, for cases where the acknowledgment
rate may be inaccurate due to ACK compression or decimation. Algorithms
may want to use send rates and receive rates as separate signals.

Signed-off-by: Deepti Raghavan 
---
 include/net/tcp.h   | 2 ++
 net/ipv4/tcp_rate.c | 4 
 2 files changed, 6 insertions(+)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cce3769..f6cb20e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -954,6 +954,8 @@ struct rate_sample {
u32  prior_delivered;   /* tp->delivered at "prior_mstamp" */
s32  delivered; /* number of packets delivered over interval */
long interval_us;   /* time for tp->delivered to incr "delivered" */
+   u32 snd_interval_us;/* snd interval for delivered packets */
+   u32 rcv_interval_us;/* rcv interval for delivered packets */
long rtt_us;/* RTT of last (S)ACKed packet (or -1) */
int  losses;/* number of packets marked lost upon ACK */
u32  acked_sacked;  /* number of packets newly (S)ACKed upon ACK */
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index c61240e..4dff40d 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
rs->prior_mstamp); /* ack phase */
rs->interval_us = max(snd_us, ack_us);
 
+   /* Record both segment send and ack receive intervals */
+   rs->snd_interval_us = snd_us;
+   rs->rcv_interval_us = ack_us;
+
/* Normally we expect interval_us >= min-rtt.
 * Note that rate may still be over-estimated when a spuriously
 * retransmistted skb was first (s)acked because "interval_us"
-- 
2.7.4



Re: [PATCH v3 iproute2 2/3] tc: Add support for the ETF Qdisc

2018-07-09 Thread David Ahern
On 7/9/18 9:48 AM, Jesus Sanchez-Palencia wrote:
> Hi David,
> 
> 
> On 07/06/2018 08:58 AM, David Ahern wrote:
>> On 7/5/18 4:42 PM, Jesus Sanchez-Palencia wrote:
>>
>>> +static int get_clockid(__s32 *val, const char *arg)
>>> +{
>>> +   const struct static_clockid {
>>> +   const char *name;
>>> +   clockid_t clockid;
>>> +   } clockids_sysv[] = {
>>> +   { "CLOCK_REALTIME", CLOCK_REALTIME },
>>> +   { "CLOCK_TAI", CLOCK_TAI },
>>> +   { "CLOCK_BOOTTIME", CLOCK_BOOTTIME },
>>> +   { "CLOCK_MONOTONIC", CLOCK_MONOTONIC },
>>> +   { NULL }
>>> +   };
>>> +
>>> +   const struct static_clockid *c;
>>> +
>>> +   for (c = clockids_sysv; c->name; c++) {
>>> +   if (strncasecmp(c->name, arg, 25) == 0) {
>>
>> Why 25?
> 
> 
> That was just an upper bound giving some room beyond the longest
> clockid name we have today. Should I add a define MAX_CLOCK_NAME ?

why not just strcasecmp? using the 'n' variant with n > strlen of either
argument seems pointless.

> 
> 
>>
>> be nice to allow shortcuts -- e.g., just REALTIME or realtime.
> 
> 
> I'd rather just keep it as is and use the names as they are defined for
> everything else (i.e. CLOCK_REALTIME), unless there are some strong 
> objections.

An all caps argument is unnecessary work on the pinky finger and the
CLOCK_ prefix is redundant to the keyword. Really, just a thought on
making it easier for users. A CLI argument does not need to maintain a
1:1 with code names.


Re: [PATCH] net: sched: Fix warnings from xchg() on RCU'd cookie pointer.

2018-07-09 Thread Vlad Buslov


On Mon 09 Jul 2018 at 15:30, Marcelo Ricardo Leitner 
 wrote:
> On Sun, Jul 08, 2018 at 05:03:58PM +0900, David Miller wrote:
>> 
>> The kbuild test robot reports:
>> 
>> >> net/sched/act_api.c:71:15: sparse: incorrect type in initializer 
>> >> (different address spaces) @@expected struct tc_cookie [noderef] 
>> >> *__ret @@got [noderef] *__ret @@
>>net/sched/act_api.c:71:15:expected struct tc_cookie [noderef] 
>> *__ret
>>net/sched/act_api.c:71:15:got struct tc_cookie *new_cookie
>> >> net/sched/act_api.c:71:13: sparse: incorrect type in assignment 
>> >> (different address spaces) @@expected struct tc_cookie *old @@got 
>> >> struct tc_cookie [noderef] >net/sched/act_api.c:71:13:expected struct tc_cookie *old
>>net/sched/act_api.c:71:13:got struct tc_cookie [noderef] 
>> *[assigned] __ret
>
> This one:
>
>> >> net/sched/act_api.c:132:48: sparse: dereference of noderef expression
>
> Actually belongs to a different issue, that was reported in the same
> email, but which wasn't handled in this patch.
>
>   Marcelo

Thanks,

I've sent the fix.

Vlad


Re: [PATCH net v2 4/5] net/ipv6: propagate net.ipv6.conf.all.addr_gen_mode to devices

2018-07-09 Thread David Ahern
On 7/9/18 4:25 AM, Sabrina Dubroca wrote:
> This aligns the addr_gen_mode sysctl with the expected behavior of the
> "all" variant.
> 
> Fixes: d35a00b8e33d ("net/ipv6: allow sysctl to change link-local address 
> generation mode")
> Suggested-by: David Ahern 
> Signed-off-by: Sabrina Dubroca 
> ---
>  net/ipv6/addrconf.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index e89bca83e0e4..1659a6b3cf42 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -5926,6 +5926,18 @@ static int addrconf_sysctl_addr_gen_mode(struct 
> ctl_table *ctl, int write,
>   idev->cnf.addr_gen_mode = new_val;
>   addrconf_dev_config(idev->dev);
>   }
> + } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) {
> + struct net_device *dev;
> +
> + net->ipv6.devconf_dflt->addr_gen_mode = new_val;
> + for_each_netdev(net, dev) {
> + idev = __in6_dev_get(dev);
> + if (idev &&
> + idev->cnf.addr_gen_mode != new_val) {
> + idev->cnf.addr_gen_mode = new_val;
> + addrconf_dev_config(idev->dev);

This call is adding a new LL address without removing the previous one:

# ip -6 addr sh dev eth2
4: eth2:  mtu 1500 state UP qlen 1000
inet6 2001:db8:2::4/64 scope global
   valid_lft forever preferred_lft forever
inet6 fe80::e0:f9ff:fe45:6480/64 scope link
   valid_lft forever preferred_lft forever

# sysctl -w net.ipv6.conf.eth2.addr_gen_mode=3
net.ipv6.conf.eth2.addr_gen_mode = 3

# ip -6 addr sh dev eth2
4: eth2:  mtu 1500 state UP qlen 1000
inet6 2001:db8:2::4/64 scope global
   valid_lft forever preferred_lft forever
inet6 fe80::bc31:8009:270d:e019/64 scope link stable-privacy
   valid_lft forever preferred_lft forever
inet6 fe80::e0:f9ff:fe45:6480/64 scope link
   valid_lft forever preferred_lft forever


> + }
> + }
>   }
>  
>   *((u32 *)ctl->data) = new_val;
> 



Re: [PATCH net v2 5/5] Documentation: ip-sysctl.txt: document addr_gen_mode

2018-07-09 Thread David Ahern
On 7/9/18 4:25 AM, Sabrina Dubroca wrote:
> addr_gen_mode was introduced in without documentation, add it now.
> 
> Fixes: d35a00b8e33d ("net/ipv6: allow sysctl to change link-local address 
> generation mode")
> Signed-off-by: Sabrina Dubroca 
> ---
>  Documentation/networking/ip-sysctl.txt | 9 +
>  1 file changed, 9 insertions(+)
> 

Reviewed-by: David Ahern 




Re: [PATCH net v2 1/5] net/ipv6: fix addrconf_sysctl_addr_gen_mode

2018-07-09 Thread David Ahern
On 7/9/18 4:25 AM, Sabrina Dubroca wrote:
> addrconf_sysctl_addr_gen_mode() has multiple problems. First, it ignores
> the errors returned by proc_dointvec().
> 
> addrconf_sysctl_addr_gen_mode() calls proc_dointvec() directly, which
> writes the value to memory, and then checks if it's valid and may return
> EINVAL. If a bad value is given, the value displayed when reading
> net.ipv6.conf.foo.addr_gen_mode next time will be invalid. In case the
> value provided by the user was valid, addrconf_dev_config() won't be
> called since idev->cnf.addr_gen_mode has already been updated.
> 
> Fix this in the usual way we deal with values that need to be checked
> after the proc_do*() helper has returned: define a local ctl_table and
> storage, call proc_dointvec() on that temporary area, then check and
> store.
> 
> addrconf_sysctl_addr_gen_mode() also writes the new value to the global
> ipv6_devconf_dflt, when we're writing to some netns's default, so that
> new netns will inherit the value that was set by the change occuring in
> any netns. That doesn't make any sense, so let's drop this assignment.
> 
> Finally, since addr_gen_mode is a __u32, switch to proc_douintvec().
> 
> Fixes: d35a00b8e33d ("net/ipv6: allow sysctl to change link-local address 
> generation mode")
> Signed-off-by: Sabrina Dubroca 
> ---
>  net/ipv6/addrconf.c | 27 ++-
>  1 file changed, 14 insertions(+), 13 deletions(-)
> 

Reviewed-by: David Ahern 




[PATCH v3 net-next 3/3] net: core: fix use-after-free in __netif_receive_skb_list_core

2018-07-09 Thread Edward Cree
__netif_receive_skb_core can free the skb, so we have to use the dequeue-
 enqueue model when calling it from __netif_receive_skb_list_core.

Fixes: 88eb1944e18c ("net: core: propagate SKB lists through packet_type 
lookup")
Signed-off-by: Edward Cree 
---
 net/core/dev.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ce4583564e00..d13cddcac41f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4830,23 +4830,28 @@ static void __netif_receive_skb_list_core(struct 
list_head *head, bool pfmemallo
struct list_head sublist;
struct sk_buff *skb, *next;
 
+   INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
 
+   list_del(&skb->list);
 		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+   if (!pt_prev)
+   continue;
if (pt_curr != pt_prev || od_curr != orig_dev) {
/* dispatch old sublist */
-   list_cut_before(&sublist, head, &skb->list);
 			__netif_receive_skb_list_ptype(&sublist, pt_curr, 
od_curr);
/* start new sublist */
+   INIT_LIST_HEAD(&sublist);
pt_curr = pt_prev;
od_curr = orig_dev;
}
+   list_add_tail(&skb->list, &sublist);
}
 
/* dispatch final sublist */
-   __netif_receive_skb_list_ptype(head, pt_curr, od_curr);
+   __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)


[PATCH v3 net-next 2/3] netfilter: fix use-after-free in NF_HOOK_LIST

2018-07-09 Thread Edward Cree
nf_hook() can free the skb, so we need to remove it from the list before
 calling, and add passed skbs to a sublist afterwards.

Fixes: 17266ee93984 ("net: ipv4: listified version of ip_rcv")
Reported-by: Dan Carpenter 
Signed-off-by: Edward Cree 
---
 include/linux/netfilter.h | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 5a5e0a2ab2a3..23b48de8c2e2 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -294,12 +294,16 @@ NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net 
*net, struct sock *sk,
 int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
struct sk_buff *skb, *next;
+   struct list_head sublist;
 
+   INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
-   int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
-   if (ret != 1)
-   list_del(&skb->list);
+   list_del(&skb->list);
+   if (nf_hook(pf, hook, net, sk, skb, in, out, okfn) == 1)
+   list_add_tail(&skb->list, &sublist);
}
+   /* Put passed packets back on main list */
+   list_splice(&sublist, head);
 }
 
 /* Call setsockopt() */



[PATCH v3 net-next 1/3] net: core: fix uses-after-free in list processing

2018-07-09 Thread Edward Cree
In netif_receive_skb_list_internal(), all of skb_defer_rx_timestamp(),
 do_xdp_generic() and enqueue_to_backlog() can lead to kfree(skb).  Thus,
 we cannot wait until after they return to remove the skb from the list;
 instead, we remove it first and, in the pass case, add it to a sublist
 afterwards.
In the case of enqueue_to_backlog() we have already decided not to pass
 when we call the function, so we do not need a sublist.

Fixes: 7da517a3bc52 ("net: core: Another step of skb receive list processing")
Reported-by: Dan Carpenter 
Signed-off-by: Edward Cree 
---
 net/core/dev.c | 21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1eccdc..ce4583564e00 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4982,25 +4982,30 @@ static void netif_receive_skb_list_internal(struct 
list_head *head)
 {
struct bpf_prog *xdp_prog = NULL;
struct sk_buff *skb, *next;
+   struct list_head sublist;
 
+   INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
net_timestamp_check(netdev_tstamp_prequeue, skb);
-   if (skb_defer_rx_timestamp(skb))
-   /* Handled, remove from list */
-   list_del(&skb->list);
+   list_del(&skb->list);
+   if (!skb_defer_rx_timestamp(skb))
+   list_add_tail(&skb->list, &sublist);
}
+   list_splice_init(&sublist, head);
 
 	if (static_branch_unlikely(&generic_xdp_needed_key)) {
preempt_disable();
rcu_read_lock();
list_for_each_entry_safe(skb, next, head, list) {
xdp_prog = rcu_dereference(skb->dev->xdp_prog);
-   if (do_xdp_generic(xdp_prog, skb) != XDP_PASS)
-   /* Dropped, remove from list */
-   list_del(&skb->list);
+   list_del(&skb->list);
+   if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
+   list_add_tail(&skb->list, &sublist);
}
rcu_read_unlock();
preempt_enable();
+   /* Put passed packets back on main list */
+   list_splice_init(, head);
}
 
rcu_read_lock();
@@ -5011,9 +5016,9 @@ static void netif_receive_skb_list_internal(struct 
list_head *head)
 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
 
if (cpu >= 0) {
-   enqueue_to_backlog(skb, cpu, 
&rflow->last_qtail);
-   /* Handled, remove from list */
+   /* Will be handled, remove from list */
 				list_del(&skb->list);
+   enqueue_to_backlog(skb, cpu, 
&rflow->last_qtail);
}
}
}



[PATCH v3 net-next 0/3] fix use-after-free bugs in skb list processing

2018-07-09 Thread Edward Cree
A couple of bugs in skb list handling were spotted by Dan Carpenter, with
 the help of Smatch; following up on them I found a couple more similar
 cases.  This series fixes them by changing the relevant loops to use the
 dequeue-enqueue model (rather than in-place list modification).

v3: fixed another similar bug in __netif_receive_skb_list_core().

v2: dropped patch #3 (new list.h helper), per DaveM's request.

Edward Cree (3):
  net: core: fix uses-after-free in list processing
  netfilter: fix use-after-free in NF_HOOK_LIST
  net: core: fix use-after-free in __netif_receive_skb_list_core

 include/linux/netfilter.h | 10 +++---
 net/core/dev.c| 30 --
 2 files changed, 27 insertions(+), 13 deletions(-)



Re: [PATCH net-next] tcp: expose both send and receive intervals for rate sample

2018-07-09 Thread Eric Dumazet



On 07/09/2018 09:23 AM, Yuchung Cheng wrote:
> On Mon, Jul 9, 2018 at 9:05 AM, Deepti Raghavan  wrote:
>> Congestion control algorithms, which access the rate sample
>> through the tcp_cong_control function, only have access to the maximum
>> of the send and receive interval, for cases where the acknowledgment
>> rate may be inaccurate due to ACK compression or decimation. Algorithms
>> may want to use send rates and receive rates as separate signals.
>>
>> Signed-off-by: Deepti Raghavan 
> Acked-by: Yuchung Cheng 


Okay, but please send a non HTML mail, otherwise the mail does not reach 
netdev@, 
nor https://patchwork.ozlabs.org/project/netdev/list/


[PATCH iproute2] tc: don't double print rate

2018-07-09 Thread Stephen Hemminger
Conversion to print stats in JSON forgot to remove existing
fprintf.

Fixes: 4fcec7f3665b ("tc: jsonify stats2")
Signed-off-by: Stephen Hemminger 
---
 tc/tc_util.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tc/tc_util.c b/tc/tc_util.c
index e0c96291ade0..d7578528a31b 100644
--- a/tc/tc_util.c
+++ b/tc/tc_util.c
@@ -842,8 +842,6 @@ void print_tcstats2_attr(FILE *fp, struct rtattr *rta, char 
*prefix, struct rtat
 
 	memcpy(&re, RTA_DATA(tbs[TCA_STATS_RATE_EST]),
   MIN(RTA_PAYLOAD(tbs[TCA_STATS_RATE_EST]), sizeof(re)));
-   fprintf(fp, "\n%srate %s %upps ",
-   prefix, sprint_rate(re.bps, b1), re.pps);
print_string(PRINT_FP, NULL, "\n%s", prefix);
print_uint(PRINT_JSON, "rate", NULL, re.bps);
print_string(PRINT_FP, NULL, "rate %s",
-- 
2.18.0



Re: [PATCH net-next] qed: Add srq core support for RoCE and iWARP

2018-07-09 Thread Jason Gunthorpe
On Wed, May 30, 2018 at 04:11:37PM +0300, Yuval Bason wrote:
> This patch adds support for configuring SRQ and provides the necessary
> APIs for rdma upper layer driver (qedr) to enable the SRQ feature.
> 
> Signed-off-by: Michal Kalderon 
> Signed-off-by: Ariel Elior 
> Signed-off-by: Yuval Bason 
> ---
>  drivers/net/ethernet/qlogic/qed/qed_cxt.c   |   5 +-
>  drivers/net/ethernet/qlogic/qed/qed_cxt.h   |   1 +
>  drivers/net/ethernet/qlogic/qed/qed_hsi.h   |   2 +
>  drivers/net/ethernet/qlogic/qed/qed_iwarp.c |  23 
>  drivers/net/ethernet/qlogic/qed/qed_main.c  |   2 +
>  drivers/net/ethernet/qlogic/qed/qed_rdma.c  | 179 
> +++-
>  drivers/net/ethernet/qlogic/qed/qed_rdma.h  |   2 +
>  drivers/net/ethernet/qlogic/qed/qed_roce.c  |  17 ++-
>  include/linux/qed/qed_rdma_if.h |  12 +-
>  9 files changed, 235 insertions(+), 8 deletions(-)

Is this a pre-requisite for your related RDMA patches?

If yes, are you proposing that this patch should go via the RDMA tree?

Jason


Re: [PATCH net-next v3 0/2] tcp: fix high tail latencies in DCTCP

2018-07-09 Thread Yuchung Cheng
On Sat, Jul 7, 2018 at 7:07 AM, Neal Cardwell  wrote:
> On Sat, Jul 7, 2018 at 7:15 AM David Miller  wrote:
>>
>> From: Lawrence Brakmo 
>> Date: Tue, 3 Jul 2018 09:26:13 -0700
>>
>> > We have observed high tail latencies when using DCTCP for RPCs as
>> > compared to using Cubic. For example, in one setup there are 2 hosts
>> > sending to a 3rd one, with each sender having 3 flows (1 stream,
>> > 1 1MB back-to-back RPCs and 1 10KB back-to-back RPCs). The following
>> > table shows the 99% and 99.9% latencies for both Cubic and dctcp:
>> >
>> >Cubic 99%  Cubic 99.9%   dctcp 99%dctcp 99.9%
>> > 1MB RPCs2.6ms   5.5ms 43ms  208ms
>> > 10KB RPCs1.1ms   1.3ms 53ms  212ms
>>  ...
>> > v2: Removed call to tcp_ca_event from tcp_send_ack since I added one in
>> > tcp_event_ack_sent. Based on Neal Cardwell 
>> > feedback.
>> > Modified tcp_ecn_check_ce (and renamed it tcp_ecn_check) instead of 
>> > modifying
>> > tcp_ack_send_check to insure an ACK when cwr is received.
>> > v3: Handling cwr in tcp_ecn_accept_cwr instead of in tcp_ecn_check.
>> >
>> > [PATCH net-next v3 1/2] tcp: notify when a delayed ack is sent
>> > [PATCH net-next v3 2/2] tcp: ack immediately when a cwr packet
>>
>> Neal and co., what are your thoughts right now about this patch series?
>>
>> Thank you.
>
> IMHO these patches are a definite improvement over what we have now.
>
> That said, in chatting with Yuchung before the July 4th break, I think
> Yuchung and I agreed that we would ideally like to see something like
> the following:
>
> (1) refactor the DCTCP code to check for pending delayed ACKs directly
> using existing state (inet_csk(sk)->icsk_ack.pending &
> ICSK_ACK_TIMER), and remove the ca->delayed_ack_reserved DCTCP field
> and the CA_EVENT_DELAYED_ACK and CA_EVENT_NON_DELAYED_ACK callbacks
> added for DCTCP (which Larry determined had at least one bug).
>
> (2) fix the bug with the DCTCP call to tcp_send_ack(sk) causing
> delayed ACKs to be incorrectly dropped/forgotten (not yet addressed by
> this patch series)
>
> (3) then with fixes (1) and (2) in place, re-run tests and see if we
> still need Larry's heuristic (in patch 2) to fire an ACK immediately
> if a receiver receives a CWR packet (I suspect this is still very
> useful, but I think Yuchung is reluctant to add this complexity unless
> we have verified it's still needed after (1) and (2))
>
> Our team may be able to help out with some proposed patches for (1) and (2).
>
> In any case, I would love to have Yuchung and Eric weigh in (perhaps
> Monday) before we merge this patch series.
Thanks Neal. Sorry for not reflecting these timely before I took off
for July 4 holidays. I was going to post the same comment - Larry: I
could provide draft patches if that helps.

>
> Thanks,
> neal


[jkirsher/next-queue PATCH v2 0/7] Add support for L2 Fwd Offload w/o ndo_select_queue

2018-07-09 Thread Alexander Duyck
This patch series is meant to allow support for the L2 forward offload, aka
MACVLAN offload without the need for using ndo_select_queue.

The existing solution currently requires that we use ndo_select_queue in
the transmit path if we want to associate specific Tx queues with a given
MACVLAN interface. In order to get away from this we need to repurpose the
tc_to_txq array and XPS pointer for the MACVLAN interface and use those as
a means of accessing the queues on the lower device. As a result we cannot
offload a device that is configured as multiqueue, however it doesn't
really make sense to configure a macvlan interfaced as being multiqueue
anyway since it doesn't really have a qdisc of its own in the first place.

I am submitting this as an RFC for the netdev mailing list, and officially
submitting it for testing to Jeff Kirsher's next-queue in order to validate
the ixgbe specific bits.

The big changes in this set are:
  Allow lower device to update tc_to_txq and XPS map of offloaded MACVLAN
  Disable XPS for single queue devices
  Replace accel_priv with sb_dev in ndo_select_queue
  Add sb_dev parameter to fallback function for ndo_select_queue
  Consolidated ndo_select_queue functions that appeared to be duplicates

v2:
Updated patch set to rebase the netdev_pick_tx logic off of recent Rx
symmetric queue changes.

---

Alexander Duyck (7):
  net-sysfs: Drop support for XPS and traffic_class on single queue device
  net: Add support for subordinate device traffic classes
  ixgbe: Add code to populate and use macvlan tc to Tx queue map
  net: Add support for subordinate traffic classes to netdev_pick_tx
  net: Add generic ndo_select_queue functions
  net: allow ndo_select_queue to pass netdev
  net: allow fallback function to pass netdev


 drivers/infiniband/hw/hfi1/vnic_main.c|2 
 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c |4 -
 drivers/net/bonding/bond_main.c   |3 
 drivers/net/ethernet/amazon/ena/ena_netdev.c  |5 -
 drivers/net/ethernet/broadcom/bcmsysport.c|6 -
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   |6 +
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |3 
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   |5 -
 drivers/net/ethernet/hisilicon/hns/hns_enet.c |5 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   62 ++--
 drivers/net/ethernet/lantiq_etop.c|   10 -
 drivers/net/ethernet/mellanox/mlx4/en_tx.c|7 +
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h  |3 
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |3 
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |5 -
 drivers/net/ethernet/renesas/ravb_main.c  |3 
 drivers/net/ethernet/sun/ldmvsw.c |3 
 drivers/net/ethernet/sun/sunvnet.c|3 
 drivers/net/ethernet/ti/netcp_core.c  |9 -
 drivers/net/hyperv/netvsc_drv.c   |6 -
 drivers/net/macvlan.c |   10 -
 drivers/net/net_failover.c|7 +
 drivers/net/team/team.c   |3 
 drivers/net/tun.c |3 
 drivers/net/wireless/marvell/mwifiex/main.c   |3 
 drivers/net/xen-netback/interface.c   |4 -
 drivers/net/xen-netfront.c|3 
 drivers/staging/netlogic/xlr_net.c|9 -
 drivers/staging/rtl8188eu/os_dep/os_intfs.c   |3 
 drivers/staging/rtl8723bs/os_dep/os_intfs.c   |7 -
 include/linux/netdevice.h |   34 -
 net/core/dev.c|  157 ++---
 net/core/net-sysfs.c  |   36 -
 net/mac80211/iface.c  |4 -
 net/packet/af_packet.c|7 +
 35 files changed, 312 insertions(+), 131 deletions(-)

--


[jkirsher/next-queue PATCH v2 5/7] net: Add generic ndo_select_queue functions

2018-07-09 Thread Alexander Duyck
This patch adds a generic version of the ndo_select_queue functions for
either returning 0 or selecting a queue based on the processor ID. This is
generally meant to just reduce the number of functions we have to change
in the future when we have to deal with ndo_select_queue changes.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/lantiq_etop.c   |   10 +-
 drivers/net/ethernet/ti/netcp_core.c |9 +
 drivers/staging/netlogic/xlr_net.c   |9 +
 include/linux/netdevice.h|4 
 net/core/dev.c   |   14 ++
 net/packet/af_packet.c   |2 +-
 6 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/lantiq_etop.c 
b/drivers/net/ethernet/lantiq_etop.c
index afc8100..7a637b5 100644
--- a/drivers/net/ethernet/lantiq_etop.c
+++ b/drivers/net/ethernet/lantiq_etop.c
@@ -563,14 +563,6 @@ struct ltq_etop_priv {
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
-static u16
-ltq_etop_select_queue(struct net_device *dev, struct sk_buff *skb,
- void *accel_priv, select_queue_fallback_t fallback)
-{
-   /* we are currently only using the first queue */
-   return 0;
-}
-
 static int
 ltq_etop_init(struct net_device *dev)
 {
@@ -641,7 +633,7 @@ struct ltq_etop_priv {
.ndo_set_mac_address = ltq_etop_set_mac_address,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_rx_mode = ltq_etop_set_multicast_list,
-   .ndo_select_queue = ltq_etop_select_queue,
+   .ndo_select_queue = dev_pick_tx_zero,
.ndo_init = ltq_etop_init,
.ndo_tx_timeout = ltq_etop_tx_timeout,
 };
diff --git a/drivers/net/ethernet/ti/netcp_core.c 
b/drivers/net/ethernet/ti/netcp_core.c
index 6ebf110..a1d335a 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -1889,13 +1889,6 @@ static int netcp_rx_kill_vid(struct net_device *ndev, 
__be16 proto, u16 vid)
return err;
 }
 
-static u16 netcp_select_queue(struct net_device *dev, struct sk_buff *skb,
- void *accel_priv,
- select_queue_fallback_t fallback)
-{
-   return 0;
-}
-
 static int netcp_setup_tc(struct net_device *dev, enum tc_setup_type type,
  void *type_data)
 {
@@ -1972,7 +1965,7 @@ static int netcp_setup_tc(struct net_device *dev, enum 
tc_setup_type type,
.ndo_vlan_rx_add_vid= netcp_rx_add_vid,
.ndo_vlan_rx_kill_vid   = netcp_rx_kill_vid,
.ndo_tx_timeout = netcp_ndo_tx_timeout,
-   .ndo_select_queue   = netcp_select_queue,
+   .ndo_select_queue   = dev_pick_tx_zero,
.ndo_setup_tc   = netcp_setup_tc,
 };
 
diff --git a/drivers/staging/netlogic/xlr_net.c 
b/drivers/staging/netlogic/xlr_net.c
index e461168..4e6611e 100644
--- a/drivers/staging/netlogic/xlr_net.c
+++ b/drivers/staging/netlogic/xlr_net.c
@@ -290,13 +290,6 @@ static netdev_tx_t xlr_net_start_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
 }
 
-static u16 xlr_net_select_queue(struct net_device *ndev, struct sk_buff *skb,
-   void *accel_priv,
-   select_queue_fallback_t fallback)
-{
-   return (u16)smp_processor_id();
-}
-
 static void xlr_hw_set_mac_addr(struct net_device *ndev)
 {
struct xlr_net_priv *priv = netdev_priv(ndev);
@@ -403,7 +396,7 @@ static void xlr_stats(struct net_device *ndev, struct 
rtnl_link_stats64 *stats)
.ndo_open = xlr_net_open,
.ndo_stop = xlr_net_stop,
.ndo_start_xmit = xlr_net_start_xmit,
-   .ndo_select_queue = xlr_net_select_queue,
+   .ndo_select_queue = dev_pick_tx_cpu_id,
.ndo_set_mac_address = xlr_net_set_mac_addr,
.ndo_set_rx_mode = xlr_set_rx_mode,
.ndo_get_stats64 = xlr_stats,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5729bc80..2e056bc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2567,6 +2567,10 @@ struct net_device *__dev_get_by_flags(struct net *net, 
unsigned short flags,
 void dev_close_many(struct list_head *head, bool unlink);
 void dev_disable_lro(struct net_device *dev);
 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff 
*newskb);
+u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
+void *accel_priv, select_queue_fallback_t fallback);
+u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
+  void *accel_priv, select_queue_fallback_t fallback);
 int dev_queue_xmit(struct sk_buff *skb);
 int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev);
 int dev_direct_xmit(struct sk_buff *skb, u16 queue_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index 09a7cc2..b5e5380 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3617,6 +3617,20 @@ static int get_xps_queue(struct net_device *dev, struct 

RE: [EXT] [PATCH net-next] net: mvpp2: explicitly include linux/interrupt.h

2018-07-09 Thread Yan Markman
+10


Yan Markman
Tel. 05-44732819

-Original Message-
From: Antoine Tenart [mailto:antoine.ten...@bootlin.com] 
Sent: Monday, July 09, 2018 6:01 PM
To: da...@davemloft.net
Cc: Antoine Tenart ; netdev@vger.kernel.org; 
linux-ker...@vger.kernel.org; thomas.petazz...@bootlin.com; 
maxime.chevall...@bootlin.com; gregory.clem...@bootlin.com; 
miquel.ray...@bootlin.com; Nadav Haklai ; Stefan Chulski 
; Yan Markman ; m...@semihalf.com
Subject: [EXT] [PATCH net-next] net: mvpp2: explicitly include linux/interrupt.h

External Email

--
The Marvell PPv2 driver uses interrupts and tasklet but does not explicitly 
include linux/interrupt.h, relying on implicit includes. This one particularly 
is included by chance after a long illogical chain of inclusions. Fix this so 
we do not get future build breaks.

Signed-off-by: Antoine Tenart 
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h 
b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index 81a66cce7fa8..18834619bb3a 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -12,6 +12,7 @@
 #ifndef _MVPP2_H_
 #define _MVPP2_H_
 
+#include 
 #include 
 #include 
 #include 
--
2.17.1



[jkirsher/next-queue PATCH v2 6/7] net: allow ndo_select_queue to pass netdev

2018-07-09 Thread Alexander Duyck
This patch makes it so that instead of passing a void pointer as the
accel_priv we instead pass a net_device pointer as sb_dev. Making this
change allows us to pass the subordinate device through to the fallback
function eventually so that we can keep the actual code in the
ndo_select_queue call as focused as possible on the exception cases.

Signed-off-by: Alexander Duyck 
---
 drivers/infiniband/hw/hfi1/vnic_main.c|2 +-
 drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c |4 ++--
 drivers/net/bonding/bond_main.c   |3 ++-
 drivers/net/ethernet/amazon/ena/ena_netdev.c  |3 ++-
 drivers/net/ethernet/broadcom/bcmsysport.c|2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c   |3 ++-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h   |3 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   |3 ++-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c |3 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |7 ---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c|3 ++-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h  |3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   |3 ++-
 drivers/net/ethernet/renesas/ravb_main.c  |3 ++-
 drivers/net/ethernet/sun/ldmvsw.c |3 ++-
 drivers/net/ethernet/sun/sunvnet.c|3 ++-
 drivers/net/hyperv/netvsc_drv.c   |4 ++--
 drivers/net/net_failover.c|5 +++--
 drivers/net/team/team.c   |3 ++-
 drivers/net/tun.c |3 ++-
 drivers/net/wireless/marvell/mwifiex/main.c   |3 ++-
 drivers/net/xen-netback/interface.c   |2 +-
 drivers/net/xen-netfront.c|3 ++-
 drivers/staging/rtl8188eu/os_dep/os_intfs.c   |3 ++-
 drivers/staging/rtl8723bs/os_dep/os_intfs.c   |7 +++
 include/linux/netdevice.h |   11 +++
 net/core/dev.c|6 --
 net/mac80211/iface.c  |4 ++--
 29 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c 
b/drivers/infiniband/hw/hfi1/vnic_main.c
index 5d65582..616fc9b 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -423,7 +423,7 @@ static netdev_tx_t hfi1_netdev_start_xmit(struct sk_buff 
*skb,
 
 static u16 hfi1_vnic_select_queue(struct net_device *netdev,
  struct sk_buff *skb,
- void *accel_priv,
+ struct net_device *sb_dev,
  select_queue_fallback_t fallback)
 {
struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c 
b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
index 0c8aec6..6155878 100644
--- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
+++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_netdev.c
@@ -95,7 +95,7 @@ static netdev_tx_t opa_netdev_start_xmit(struct sk_buff *skb,
 }
 
 static u16 opa_vnic_select_queue(struct net_device *netdev, struct sk_buff 
*skb,
-void *accel_priv,
+struct net_device *sb_dev,
 select_queue_fallback_t fallback)
 {
struct opa_vnic_adapter *adapter = opa_vnic_priv(netdev);
@@ -107,7 +107,7 @@ static u16 opa_vnic_select_queue(struct net_device *netdev, 
struct sk_buff *skb,
mdata->entropy = opa_vnic_calc_entropy(skb);
mdata->vl = opa_vnic_get_vl(adapter, skb);
rc = adapter->rn_ops->ndo_select_queue(netdev, skb,
-  accel_priv, fallback);
+  sb_dev, fallback);
skb_pull(skb, sizeof(*mdata));
return rc;
 }
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 63e3844..9a2ea3c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4094,7 +4094,8 @@ static inline int bond_slave_override(struct bonding 
*bond,
 
 
 static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
-void *accel_priv, select_queue_fallback_t fallback)
+struct net_device *sb_dev,
+select_queue_fallback_t fallback)
 {
/* This helper function exists to help dev_pick_tx get the correct
 * destination queue.  Using a helper function skips a call to
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c 
b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index f2af87d..e3befb1 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c

[jkirsher/next-queue PATCH v2 3/7] ixgbe: Add code to populate and use macvlan tc to Tx queue map

2018-07-09 Thread Alexander Duyck
This patch makes it so that we use the tc_to_txq mapping in the macvlan
device in order to select the Tx queue for outgoing packets.

The idea here is to try and move away from using ixgbe_select_queue and to
come up with a generic way to make this work for devices going forward. By
encoding this information in the netdev this can become something that can
be used generically as a solution for similar setups going forward.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   44 ++---
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index c265963..3ff34ca 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -5275,6 +5275,8 @@ static void ixgbe_clean_rx_ring(struct ixgbe_ring 
*rx_ring)
 static int ixgbe_fwd_ring_up(struct ixgbe_adapter *adapter,
 struct ixgbe_fwd_adapter *accel)
 {
+   u16 rss_i = adapter->ring_feature[RING_F_RSS].indices;
+   int num_tc = netdev_get_num_tc(adapter->netdev);
struct net_device *vdev = accel->netdev;
int i, baseq, err;
 
@@ -5286,6 +5288,11 @@ static int ixgbe_fwd_ring_up(struct ixgbe_adapter 
*adapter,
accel->rx_base_queue = baseq;
accel->tx_base_queue = baseq;
 
+   /* record configuration for macvlan interface in vdev */
+   for (i = 0; i < num_tc; i++)
+   netdev_bind_sb_channel_queue(adapter->netdev, vdev,
+i, rss_i, baseq + (rss_i * i));
+
for (i = 0; i < adapter->num_rx_queues_per_pool; i++)
adapter->rx_ring[baseq + i]->netdev = vdev;
 
@@ -5310,6 +5317,10 @@ static int ixgbe_fwd_ring_up(struct ixgbe_adapter 
*adapter,
 
netdev_err(vdev, "L2FW offload disabled due to L2 filter error\n");
 
+   /* unbind the queues and drop the subordinate channel config */
+   netdev_unbind_sb_channel(adapter->netdev, vdev);
+   netdev_set_sb_channel(vdev, 0);
+
clear_bit(accel->pool, adapter->fwd_bitmask);
kfree(accel);
 
@@ -8206,18 +8217,22 @@ static u16 ixgbe_select_queue(struct net_device *dev, 
struct sk_buff *skb,
  void *accel_priv, select_queue_fallback_t 
fallback)
 {
struct ixgbe_fwd_adapter *fwd_adapter = accel_priv;
-   struct ixgbe_adapter *adapter;
-   int txq;
 #ifdef IXGBE_FCOE
+   struct ixgbe_adapter *adapter;
struct ixgbe_ring_feature *f;
 #endif
+   int txq;
 
if (fwd_adapter) {
-   adapter = netdev_priv(dev);
-   txq = reciprocal_scale(skb_get_hash(skb),
-  adapter->num_rx_queues_per_pool);
+   u8 tc = netdev_get_num_tc(dev) ?
+   netdev_get_prio_tc_map(dev, skb->priority) : 0;
+   struct net_device *vdev = fwd_adapter->netdev;
+
+   txq = vdev->tc_to_txq[tc].offset;
+   txq += reciprocal_scale(skb_get_hash(skb),
+   vdev->tc_to_txq[tc].count);
 
-   return txq + fwd_adapter->tx_base_queue;
+   return txq;
}
 
 #ifdef IXGBE_FCOE
@@ -8771,6 +8786,11 @@ static int ixgbe_reassign_macvlan_pool(struct net_device 
*vdev, void *data)
/* if we cannot find a free pool then disable the offload */
netdev_err(vdev, "L2FW offload disabled due to lack of queue 
resources\n");
macvlan_release_l2fw_offload(vdev);
+
+   /* unbind the queues and drop the subordinate channel config */
+   netdev_unbind_sb_channel(adapter->netdev, vdev);
+   netdev_set_sb_channel(vdev, 0);
+
kfree(accel);
 
return 0;
@@ -9779,6 +9799,13 @@ static void *ixgbe_fwd_add(struct net_device *pdev, 
struct net_device *vdev)
if (!macvlan_supports_dest_filter(vdev))
return ERR_PTR(-EMEDIUMTYPE);
 
+   /* We need to lock down the macvlan to be a single queue device so that
+* we can reuse the tc_to_txq field in the macvlan netdev to represent
+* the queue mapping to our netdev.
+*/
+   if (netif_is_multiqueue(vdev))
+   return ERR_PTR(-ERANGE);
+
pool = find_first_zero_bit(adapter->fwd_bitmask, adapter->num_rx_pools);
if (pool == adapter->num_rx_pools) {
u16 used_pools = adapter->num_vfs + adapter->num_rx_pools;
@@ -9835,6 +9862,7 @@ static void *ixgbe_fwd_add(struct net_device *pdev, 
struct net_device *vdev)
return ERR_PTR(-ENOMEM);
 
set_bit(pool, adapter->fwd_bitmask);
+   netdev_set_sb_channel(vdev, pool);
accel->pool = pool;
accel->netdev = vdev;
 
@@ -9876,6 +9904,10 @@ static void ixgbe_fwd_del(struct net_device *pdev, void 
*priv)
ring->netdev = NULL;
}
 
+   /* unbind the queues and drop the subordinate channel 

[jkirsher/next-queue PATCH v2 7/7] net: allow fallback function to pass netdev

2018-07-09 Thread Alexander Duyck
For most of these calls we can just pass NULL through to the fallback
function as the sb_dev. The only cases where we cannot are the cases where
we might be dealing with either an upper device or a driver that would
have configured things to support an sb_dev itself.

The only driver that has any significant change in this patchset should be
ixgbe as we can drop the redundant functionality that existed in both the
ndo_select_queue function and the fallback function that was passed through
to us.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c|2 +-
 drivers/net/ethernet/broadcom/bcmsysport.c  |4 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |3 ++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |2 +-
 drivers/net/ethernet/hisilicon/hns/hns_enet.c   |2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |4 ++--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c  |4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |2 +-
 drivers/net/hyperv/netvsc_drv.c |2 +-
 drivers/net/net_failover.c  |2 +-
 drivers/net/xen-netback/interface.c |2 +-
 include/linux/netdevice.h   |3 ++-
 net/core/dev.c  |   12 +++-
 net/packet/af_packet.c  |7 ---
 14 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c 
b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index e3befb1..c673ac2 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2224,7 +2224,7 @@ static u16 ena_select_queue(struct net_device *dev, 
struct sk_buff *skb,
if (skb_rx_queue_recorded(skb))
qid = skb_get_rx_queue(skb);
else
-   qid = fallback(dev, skb);
+   qid = fallback(dev, skb, NULL);
 
return qid;
 }
diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c 
b/drivers/net/ethernet/broadcom/bcmsysport.c
index 32f548e..eb890c4 100644
--- a/drivers/net/ethernet/broadcom/bcmsysport.c
+++ b/drivers/net/ethernet/broadcom/bcmsysport.c
@@ -2116,7 +2116,7 @@ static u16 bcm_sysport_select_queue(struct net_device 
*dev, struct sk_buff *skb,
unsigned int q, port;
 
if (!netdev_uses_dsa(dev))
-   return fallback(dev, skb);
+   return fallback(dev, skb, NULL);
 
/* DSA tagging layer will have configured the correct queue */
q = BRCM_TAG_GET_QUEUE(queue);
@@ -2124,7 +2124,7 @@ static u16 bcm_sysport_select_queue(struct net_device 
*dev, struct sk_buff *skb,
tx_ring = priv->ring_map[q + port * priv->per_port_num_tx_queues];
 
if (unlikely(!tx_ring))
-   return fallback(dev, skb);
+   return fallback(dev, skb, NULL);
 
return tx_ring->index;
 }
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index e4e1cf9..5a727d4 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -1933,7 +1933,8 @@ u16 bnx2x_select_queue(struct net_device *dev, struct 
sk_buff *skb,
}
 
/* select a non-FCoE queue */
-   return fallback(dev, skb) % (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos);
+   return fallback(dev, skb, NULL) %
+  (BNX2X_NUM_ETH_QUEUES(bp) * bp->max_cos);
 }
 
 void bnx2x_set_num_queues(struct bnx2x *bp)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 5dc5e56..40cf8dc 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -973,7 +973,7 @@ static u16 cxgb_select_queue(struct net_device *dev, struct 
sk_buff *skb,
return txq;
}
 
-   return fallback(dev, skb) % dev->real_num_tx_queues;
+   return fallback(dev, skb, NULL) % dev->real_num_tx_queues;
 }
 
 static int closest_timer(const struct sge *s, int time)
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c 
b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
index ff7a74e..948b3e0 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
@@ -2033,7 +2033,7 @@ static void hns_nic_get_stats64(struct net_device *ndev,
is_multicast_ether_addr(eth_hdr->h_dest))
return 0;
else
-   return fallback(ndev, skb);
+   return fallback(ndev, skb, NULL);
 }
 
 static const struct net_device_ops hns_nic_netdev_ops = {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a0cf33d..bdaecae 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8242,11 +8242,11 @@ static u16 

[jkirsher/next-queue PATCH v2 4/7] net: Add support for subordinate traffic classes to netdev_pick_tx

2018-07-09 Thread Alexander Duyck
This change makes it so that we can support the concept of subordinate
device traffic classes to the core networking code. In doing this we can
start pulling out the driver specific bits needed to support selecting a
queue based on an upper device.

The solution as it currently stands is only partially implemented. I have
the start of some XPS bits in here, but I would still need to allow for
configuration of the XPS maps on the queues reserved for the subordinate
devices. For now I am using the reference to the sb_dev XPS map as just a
way to skip the lookup of the lower device XPS map for now as that would
result in the wrong queue being picked.

Signed-off-by: Alexander Duyck 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   19 +++-
 drivers/net/macvlan.c |   10 +---
 include/linux/netdevice.h |4 +-
 net/core/dev.c|   58 +++--
 4 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 3ff34ca..41ef58f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8213,20 +8213,17 @@ static void ixgbe_atr(struct ixgbe_ring *ring,
  input, common, ring->queue_index);
 }
 
+#ifdef IXGBE_FCOE
 static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
  void *accel_priv, select_queue_fallback_t 
fallback)
 {
-   struct ixgbe_fwd_adapter *fwd_adapter = accel_priv;
-#ifdef IXGBE_FCOE
struct ixgbe_adapter *adapter;
struct ixgbe_ring_feature *f;
-#endif
int txq;
 
-   if (fwd_adapter) {
-   u8 tc = netdev_get_num_tc(dev) ?
-   netdev_get_prio_tc_map(dev, skb->priority) : 0;
-   struct net_device *vdev = fwd_adapter->netdev;
+   if (accel_priv) {
+   u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+   struct net_device *vdev = accel_priv;
 
txq = vdev->tc_to_txq[tc].offset;
txq += reciprocal_scale(skb_get_hash(skb),
@@ -8235,8 +8232,6 @@ static u16 ixgbe_select_queue(struct net_device *dev, 
struct sk_buff *skb,
return txq;
}
 
-#ifdef IXGBE_FCOE
-
/*
 * only execute the code below if protocol is FCoE
 * or FIP and we have FCoE enabled on the adapter
@@ -8262,11 +8257,9 @@ static u16 ixgbe_select_queue(struct net_device *dev, 
struct sk_buff *skb,
txq -= f->indices;
 
return txq + f->offset;
-#else
-   return fallback(dev, skb);
-#endif
 }
 
+#endif
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
   struct xdp_frame *xdpf)
 {
@@ -10068,7 +10061,6 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
.ndo_open   = ixgbe_open,
.ndo_stop   = ixgbe_close,
.ndo_start_xmit = ixgbe_xmit_frame,
-   .ndo_select_queue   = ixgbe_select_queue,
.ndo_set_rx_mode= ixgbe_set_rx_mode,
.ndo_validate_addr  = eth_validate_addr,
.ndo_set_mac_address= ixgbe_set_mac,
@@ -10091,6 +10083,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n,
.ndo_poll_controller= ixgbe_netpoll,
 #endif
 #ifdef IXGBE_FCOE
+   .ndo_select_queue   = ixgbe_select_queue,
.ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get,
.ndo_fcoe_ddp_target = ixgbe_fcoe_ddp_target,
.ndo_fcoe_ddp_done = ixgbe_fcoe_ddp_put,
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index adde8fc..401e1d1 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -514,7 +514,6 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct 
net_device *dev)
const struct macvlan_dev *vlan = netdev_priv(dev);
const struct macvlan_port *port = vlan->port;
const struct macvlan_dev *dest;
-   void *accel_priv = NULL;
 
if (vlan->mode == MACVLAN_MODE_BRIDGE) {
const struct ethhdr *eth = (void *)skb->data;
@@ -533,15 +532,10 @@ static int macvlan_queue_xmit(struct sk_buff *skb, struct 
net_device *dev)
return NET_XMIT_SUCCESS;
}
}
-
-   /* For packets that are non-multicast and not bridged we will pass
-* the necessary information so that the lowerdev can distinguish
-* the source of the packets via the accel_priv value.
-*/
-   accel_priv = vlan->accel_priv;
 xmit_world:
skb->dev = vlan->lowerdev;
-   return dev_queue_xmit_accel(skb, accel_priv);
+   return dev_queue_xmit_accel(skb,
+   netdev_get_sb_channel(dev) ? dev : NULL);
 }
 
 static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, 
struct sk_buff *skb)
diff --git 

[jkirsher/next-queue PATCH v2 1/7] net-sysfs: Drop support for XPS and traffic_class on single queue device

2018-07-09 Thread Alexander Duyck
This patch makes it so that we do not report the traffic class or allow XPS
configuration on single queue devices. This is mostly to avoid unnecessary
complexity with changes I have planned that will allow us to reuse
the unused tc_to_txq and XPS configuration on a single queue device to
allow it to make use of a subset of queues on an underlying device.

Signed-off-by: Alexander Duyck 
---
 net/core/net-sysfs.c |   15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index f25ac5f..dce3ae0 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1047,9 +1047,14 @@ static ssize_t traffic_class_show(struct netdev_queue 
*queue,
  char *buf)
 {
struct net_device *dev = queue->dev;
-   int index = get_netdev_queue_index(queue);
-   int tc = netdev_txq_to_tc(dev, index);
+   int index;
+   int tc;
 
+   if (!netif_is_multiqueue(dev))
+   return -ENOENT;
+
+   index = get_netdev_queue_index(queue);
+   tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
 
@@ -1214,6 +1219,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
cpumask_var_t mask;
unsigned long index;
 
+   if (!netif_is_multiqueue(dev))
+   return -ENOENT;
+
index = get_netdev_queue_index(queue);
 
if (dev->num_tc) {
@@ -1260,6 +1268,9 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
cpumask_var_t mask;
int err;
 
+   if (!netif_is_multiqueue(dev))
+   return -ENOENT;
+
if (!capable(CAP_NET_ADMIN))
return -EPERM;
 



[jkirsher/next-queue PATCH v2 2/7] net: Add support for subordinate device traffic classes

2018-07-09 Thread Alexander Duyck
This patch is meant to provide the basic tools needed to allow us to create
subordinate device traffic classes. The general idea here is to allow
subdividing the queues of a device into queue groups accessible through an
upper device such as a macvlan.

The idea here is to enforce the idea that an upper device has to be a
single queue device, ideally with IFF_NO_QUEUE set. With that being the
case we can pretty much guarantee that the tc_to_txq mappings and XPS maps
for the upper device are unused. As such we could reuse those in order to
support subdividing the lower device and distributing those queues between
the subordinate devices.

In order to distinguish between a regular set of traffic classes and if a
device is carrying subordinate traffic classes I changed num_tc from a u8
to an s16 value and use the negative values to represent the subordinate
pool values. So starting at -1 and running to -32768 we can encode those as
pool values, and the existing values of 0 to 15 can be maintained.

Signed-off-by: Alexander Duyck 
---
 include/linux/netdevice.h |   16 
 net/core/dev.c|   89 +
 net/core/net-sysfs.c  |   21 ++-
 3 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b683971..4648a9a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -575,6 +575,9 @@ struct netdev_queue {
 * (/sys/class/net/DEV/Q/trans_timeout)
 */
unsigned long   trans_timeout;
+
+   /* Suboordinate device that the queue has been assigned to */
+   struct net_device   *sb_dev;
 /*
  * write-mostly part
  */
@@ -1991,7 +1994,7 @@ struct net_device {
 #ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
-   u8  num_tc;
+   s16 num_tc;
struct netdev_tc_txqtc_to_txq[TC_MAX_QUEUE];
u8  prio_tc_map[TC_BITMASK + 1];
 
@@ -2045,6 +2048,17 @@ int netdev_get_num_tc(struct net_device *dev)
return dev->num_tc;
 }
 
+void netdev_unbind_sb_channel(struct net_device *dev,
+ struct net_device *sb_dev);
+int netdev_bind_sb_channel_queue(struct net_device *dev,
+struct net_device *sb_dev,
+u8 tc, u16 count, u16 offset);
+int netdev_set_sb_channel(struct net_device *dev, u16 channel);
+static inline int netdev_get_sb_channel(struct net_device *dev)
+{
+   return max_t(int, -dev->num_tc, 0);
+}
+
 static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 unsigned int index)
diff --git a/net/core/dev.c b/net/core/dev.c
index 89825c1..cc1d6bb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2067,11 +2067,13 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned 
int txq)
struct netdev_tc_txq *tc = >tc_to_txq[0];
int i;
 
+   /* walk through the TCs and see if it falls into any of them */
for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
if ((txq - tc->offset) < tc->count)
return i;
}
 
+   /* didn't find it, just return -1 to indicate no match */
return -1;
}
 
@@ -2260,7 +2262,14 @@ int __netif_set_xps_queue(struct net_device *dev, const 
unsigned long *mask,
unsigned int nr_ids;
 
if (dev->num_tc) {
+   /* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
+   if (num_tc < 0)
+   return -EINVAL;
+
+   /* If queue belongs to subordinate dev use its map */
+   dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
@@ -2448,11 +2457,25 @@ int netif_set_xps_queue(struct net_device *dev, const 
struct cpumask *mask,
 EXPORT_SYMBOL(netif_set_xps_queue);
 
 #endif
+static void netdev_unbind_all_sb_channels(struct net_device *dev)
+{
+   struct netdev_queue *txq = >_tx[dev->num_tx_queues];
+
+   /* Unbind any subordinate channels */
+   while (txq-- != >_tx[0]) {
+   if (txq->sb_dev)
+   netdev_unbind_sb_channel(dev, txq->sb_dev);
+   }
+}
+
 void netdev_reset_tc(struct net_device *dev)
 {
 #ifdef CONFIG_XPS
netif_reset_xps_queues_gt(dev, 0);
 #endif
+   netdev_unbind_all_sb_channels(dev);
+
+   /* Reset TC configuration of device */
dev->num_tc = 0;
memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
@@ -2481,11 +2504,77 @@ int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
 #ifdef CONFIG_XPS

Hello Dear

2018-07-09 Thread Smadar Barber-Tsadik
My name is Smadar Barber-Tsadik, I'm the Chief Executive Officer (C.P.A) of the 
First International Bank of Israel (FIBI). I'm getting in touch with you in 
regards to a very important and urgent matter. Kindly respond back at your 
earliest convinience so I can provide you the details.

Faithfully,
Smadar Barber-Tsadik


Re: [PATCH net-next] tcp: expose both send and receive intervals for rate sample

2018-07-09 Thread Yuchung Cheng
On Mon, Jul 9, 2018 at 9:05 AM, Deepti Raghavan  wrote:
> Congestion control algorithms, which access the rate sample
> through the tcp_cong_control function, only have access to the maximum
> of the send and receive interval, for cases where the acknowledgment
> rate may be inaccurate due to ACK compression or decimation. Algorithms
> may want to use send rates and receive rates as separate signals.
>
> Signed-off-by: Deepti Raghavan 
Acked-by: Yuchung Cheng 
> ---
>  include/net/tcp.h   | 2 ++
>  net/ipv4/tcp_rate.c | 4 
>  2 files changed, 6 insertions(+)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index cce3769..f6cb20e 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -954,6 +954,8 @@ struct rate_sample {
>   u32  prior_delivered; /* tp->delivered at "prior_mstamp" */
>   s32  delivered; /* number of packets delivered over interval */
>   long interval_us; /* time for tp->delivered to incr "delivered" */
> + u32 snd_interval_us; /* snd interval for delivered packets */
> + u32 rcv_interval_us; /* rcv interval for delivered packets */
>   long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
>   int  losses; /* number of packets marked lost upon ACK */
>   u32  acked_sacked; /* number of packets newly (S)ACKed upon ACK */
> diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
> index c61240e..4dff40d 100644
> --- a/net/ipv4/tcp_rate.c
> +++ b/net/ipv4/tcp_rate.c
> @@ -146,6 +146,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32
> lost,
>   rs->prior_mstamp); /* ack phase */
>   rs->interval_us = max(snd_us, ack_us);
>
> + /* Record both segment send and ack receive intervals */
> + rs->snd_interval_us = snd_us;
> + rs->rcv_interval_us = ack_us;
> +
>   /* Normally we expect interval_us >= min-rtt.
>* Note that rate may still be over-estimated when a spuriously
>* retransmistted skb was first (s)acked because "interval_us"
> --
> 2.7.4
>


Re: [PATCH net] net: diag: Don't double-free TCP_NEW_SYN_RECV sockets in tcp_abort

2018-07-09 Thread Eric Dumazet



On 07/09/2018 09:14 AM, David Ahern wrote:

> Perhaps it is something with my config, settings, ss version or test
> program, but I do not see it on 4.11:
>

Maybe some vrf issue, I dunno.



[PATCH net-next 1/2] cxgb4: remove stats fetched from firmware

2018-07-09 Thread Rahul Lakkireddy
When running ethtool -S, some stats are requested from firmware.
Since getting these stats via firmware mailbox is slow, some packets
get dropped under heavy load while running ethtool -S.

So, remove these stats from ethtool -S.

Signed-off-by: Rahul Lakkireddy 
Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 133 -
 1 file changed, 133 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index ddb8b9eba6bf..a14a290a56ee 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -115,42 +115,10 @@ static char adapter_stats_strings[][ETH_GSTRING_LEN] = {
"db_drop",
"db_full",
"db_empty   ",
-   "tcp_ipv4_out_rsts  ",
-   "tcp_ipv4_in_segs   ",
-   "tcp_ipv4_out_segs  ",
-   "tcp_ipv4_retrans_segs  ",
-   "tcp_ipv6_out_rsts  ",
-   "tcp_ipv6_in_segs   ",
-   "tcp_ipv6_out_segs  ",
-   "tcp_ipv6_retrans_segs  ",
-   "usm_ddp_frames ",
-   "usm_ddp_octets ",
-   "usm_ddp_drops  ",
-   "rdma_no_rqe_mod_defer  ",
-   "rdma_no_rqe_pkt_defer  ",
-   "tp_err_ofld_no_neigh   ",
-   "tp_err_ofld_cong_defer ",
"write_coal_success ",
"write_coal_fail",
 };
 
-static char channel_stats_strings[][ETH_GSTRING_LEN] = {
-   "Channel- ",
-   "tp_cpl_requests",
-   "tp_cpl_responses   ",
-   "tp_mac_in_errs ",
-   "tp_hdr_in_errs ",
-   "tp_tcp_in_errs ",
-   "tp_tcp6_in_errs",
-   "tp_tnl_cong_drops  ",
-   "tp_tnl_tx_drops",
-   "tp_ofld_vlan_drops ",
-   "tp_ofld_chan_drops ",
-   "fcoe_octets_ddp",
-   "fcoe_frames_ddp",
-   "fcoe_frames_drop   ",
-};
-
 static char loopback_stats_strings[][ETH_GSTRING_LEN] = {
"---Loopback--- ",
"octets_ok  ",
@@ -187,7 +155,6 @@ static int get_sset_count(struct net_device *dev, int sset)
case ETH_SS_STATS:
return ARRAY_SIZE(stats_strings) +
   ARRAY_SIZE(adapter_stats_strings) +
-  ARRAY_SIZE(channel_stats_strings) +
   ARRAY_SIZE(loopback_stats_strings);
case ETH_SS_PRIV_FLAGS:
return ARRAY_SIZE(cxgb4_priv_flags_strings);
@@ -252,9 +219,6 @@ static void get_strings(struct net_device *dev, u32 
stringset, u8 *data)
memcpy(data, adapter_stats_strings,
   sizeof(adapter_stats_strings));
data += sizeof(adapter_stats_strings);
-   memcpy(data, channel_stats_strings,
-  sizeof(channel_stats_strings));
-   data += sizeof(channel_stats_strings);
memcpy(data, loopback_stats_strings,
   sizeof(loopback_stats_strings));
} else if (stringset == ETH_SS_PRIV_FLAGS) {
@@ -280,41 +244,10 @@ struct adapter_stats {
u64 db_drop;
u64 db_full;
u64 db_empty;
-   u64 tcp_v4_out_rsts;
-   u64 tcp_v4_in_segs;
-   u64 tcp_v4_out_segs;
-   u64 tcp_v4_retrans_segs;
-   u64 tcp_v6_out_rsts;
-   u64 tcp_v6_in_segs;
-   u64 tcp_v6_out_segs;
-   u64 tcp_v6_retrans_segs;
-   u64 frames;
-   u64 octets;
-   u64 drops;
-   u64 rqe_dfr_mod;
-   u64 rqe_dfr_pkt;
-   u64 ofld_no_neigh;
-   u64 ofld_cong_defer;
u64 wc_success;
u64 wc_fail;
 };
 
-struct channel_stats {
-   u64 cpl_req;
-   u64 cpl_rsp;
-   u64 mac_in_errs;
-   u64 hdr_in_errs;
-   u64 tcp_in_errs;
-   u64 tcp6_in_errs;
-   u64 tnl_cong_drops;
-   u64 tnl_tx_drops;
-   u64 ofld_vlan_drops;
-   u64 ofld_chan_drops;
-   u64 octets_ddp;
-   u64 frames_ddp;
-   u64 frames_drop;
-};
-
 static void collect_sge_port_stats(const struct adapter *adap,
   const struct port_info *p,
   struct queue_port_stats *s)
@@ -337,45 +270,14 @@ static void collect_sge_port_stats(const struct adapter 
*adap,
 
 static void collect_adapter_stats(struct adapter *adap, struct adapter_stats 
*s)
 {
-   struct tp_tcp_stats v4, v6;
-   struct tp_rdma_stats rdma_stats;
-   struct tp_err_stats err_stats;
-   struct tp_usm_stats usm_stats;
u64 val1, val2;
 
memset(s, 0, sizeof(*s));
 
-   spin_lock(&adap->stats_lock);
-   t4_tp_get_tcp_stats(adap, &v4, &v6, false);
-   t4_tp_get_rdma_stats(adap, &rdma_stats, false);
-   t4_get_usm_stats(adap, &usm_stats, false);
-   t4_tp_get_err_stats(adap, &err_stats, false);
-   spin_unlock(&adap->stats_lock);
-
s->db_drop = adap->db_stats.db_drop;
s->db_full = 

  1   2   >