Re: [patch net-next RFC 3/3] switchdev: introduce deferred variants of obj_add/del helpers

2015-10-07 Thread Or Gerlitz
On Wed, Oct 7, 2015 at 9:30 PM, Jiri Pirko  wrote:
> From: Jiri Pirko 
>
> Similar to the attr usecase, the caller knows if he is holding RTNL and is
> in atomic section. So let the caller decide the correct call variant.
>
> This allows drivers to sleep inside their ops and wait for hw to get the
> operation status. Then the status is propagated into switchdev core.
> This avoids silent errors in drivers.
>
> Signed-off-by: Jiri Pirko 

> diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
> index 7f7d551..2086767 100644
> --- a/net/bridge/br_fdb.c
> +++ b/net/bridge/br_fdb.c
> @@ -139,7 +139,7 @@ static void fdb_del_external_learn(struct 
> net_bridge_fdb_entry *f)
> .vid = f->vlan_id,
> };
>
> -   switchdev_port_obj_del(f->dst->dev, &fdb.obj);
> +   switchdev_port_obj_del_deferred(f->dst->dev, &fdb.obj);
>  }


>  static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
> diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
> index c29f4ee..49e6e6f 100644
> --- a/net/switchdev/switchdev.c
> +++ b/net/switchdev/switchdev.c
> @@ -362,6 +362,75 @@ int switchdev_port_obj_add(struct net_device *dev,
>  }
>  EXPORT_SYMBOL_GPL(switchdev_port_obj_add);
>
> +struct switchdev_obj_work {
> +   struct work_struct work;
> +   struct net_device *dev;
> +   struct switchdev_obj obj;
> +   bool add; /* add of del */
> +};
> +
> +static void switchdev_port_obj_work(struct work_struct *work)
> +{
> +   struct switchdev_obj_work *ow =
> +   container_of(work, struct switchdev_obj_work, work);
> +   int err;
> +
> +   rtnl_lock();
> +   if (ow->add)
> +   err = switchdev_port_obj_add(ow->dev, &ow->obj);
> +   else
> +   err = switchdev_port_obj_del(ow->dev, &ow->obj);
> +   if (err && err != -EOPNOTSUPP)
> +   netdev_err(ow->dev, "failed (err=%d) to %s object (id=%d)\n",
> +  err, ow->add ? "add" : "del", ow->obj.id);

This introduces a regression in the 2-phase commit scheme, since the
prepare commit can fail
and that would go unnoticed by the upper layer, agree?

Or.

> +   rtnl_unlock();
> +
> +   dev_put(ow->dev);
> +   kfree(ow);
> +}
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] net namespace: dynamically configure new net namespace inherit net config

2015-10-07 Thread yzhu1

Hi, Miller

Would you like to check this patch?

Thanks a lot.
Zhu Yanjun

On 06/26/2015 05:37 PM, Zhu Yanjun wrote:

The new net namespace can inherit from the original net config, or
the current net config. As such, a config option is needed to decide where
the new namespace inherits from.

Signed-off-by: Zhu Yanjun 
---
  init/Kconfig   |  9 +
  net/ipv4/devinet.c | 13 +
  2 files changed, 22 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index dc24dec..fab8c41 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1212,6 +1212,15 @@ config NET_NS
  Allow user space to create what appear to be multiple instances
  of the network stack.
  
+config NET_NS_INHERIT_ORIGINAL

+   bool "New network namespace inherits from original net config"
+   depends on NET_NS
+   default n
+   help
+ Allow new network namespace inherit from original net config.
+ If no, the new network namespace inherits from the current net
+ config including the modified net config.
+
  endif # NAMESPACES
  
  config SCHED_AUTOGROUP

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 419d23c..cf635e4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2271,6 +2271,7 @@ static __net_init int devinet_init_net(struct net *net)
  #endif
  
  	err = -ENOMEM;

+#ifndef CONFIG_NET_NS_INHERIT_ORIGINAL
all = &ipv4_devconf;
dflt = &ipv4_devconf_dflt;
  
@@ -2282,6 +2283,15 @@ static __net_init int devinet_init_net(struct net *net)

dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
if (!dflt)
goto err_alloc_dflt;
+#else
+   all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL);
+   if (!all)
+   goto err_alloc_all;
+
+   dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), 
GFP_KERNEL);
+   if (!dflt)
+   goto err_alloc_dflt;
+#endif
  
  #ifdef CONFIG_SYSCTL

tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
@@ -2292,7 +2302,10 @@ static __net_init int devinet_init_net(struct net *net)
tbl[0].extra1 = all;
tbl[0].extra2 = net;
  #endif
+
+#ifndef CONFIG_NET_NS_INHERIT_ORIGINAL
}
+#endif
  
  #ifdef CONFIG_SYSCTL

err = __devinet_sysctl_register(net, "all", all);


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2 1/5] net: move net_get_random_once to lib

2015-10-07 Thread Daniel Borkmann

On 10/08/2015 07:12 AM, kbuild test robot wrote:

Hi Hannes,

[auto build test WARNING on net-next/master -- if it's inappropriate base, 
please ignore]

config: mips-ip27_defconfig (attached as .config)
reproduce:
 wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
 chmod +x ~/bin/make.cross
 # save the attached .config to linux build tree
 make.cross ARCH=mips

All warnings (new ones prefixed by >>):

mips-linux-gnu-ld: lib/lockref.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
mips-linux-gnu-ld: lib/lockref.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags

[...]

mips-linux-gnu-ld: lib/reciprocal_div.o: warning: Inconsistent ISA between 
e_flags and .MIPS.abiflags
mips-linux-gnu-ld: lib/reciprocal_div.o: warning: Inconsistent ISA 
extensions between e_flags and .MIPS.abiflags

mips-linux-gnu-ld: lib/once.o: warning: Inconsistent ISA between e_flags and 
.MIPS.abiflags
mips-linux-gnu-ld: lib/once.o: warning: Inconsistent ISA extensions between 
e_flags and .MIPS.abiflags


Looks like a known issue on MIPS, given that the warning is not new and also
appears on other object files. Here is what was discussed recently on this
subject on a refactoring patch from David Ahern:

  https://patchwork.ozlabs.org/patch/525102/

Thanks!


mips-linux-gnu-ld: lib/string_helpers.o: warning: Inconsistent ISA between 
e_flags and .MIPS.abiflags
mips-linux-gnu-ld: lib/string_helpers.o: warning: Inconsistent ISA 
extensions between e_flags and .MIPS.abiflags

[...]
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/2] bpf: enable non-root eBPF programs

2015-10-07 Thread Alexei Starovoitov

On 10/7/15 11:21 PM, Ingo Molnar wrote:

so I see no reason why unprivileged eBPF couldn't have a sysctl too - with the
default value set to permissive.


agreed. sent out v2 that follows the 'modules_disabled' style.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/2] bpf: enable non-root eBPF programs

2015-10-07 Thread Ingo Molnar

* Alexei Starovoitov  wrote:

> As far as sysctl we can look at two with similar purpose:
> sysctl_perf_event_paranoid and modules_disabled.
> First one is indeed multi level, but not because of the fear of bugs,
> but because of real security implications.

It serves both purposes flexibly, and note that most people and distros will 
use 
the default value.

> [...] Like raw events on hyperthreaded cpu or uncore events can extract data 
> from other user processes. So it controls these extra privileges.

It also controls the generally increased risk caused by a larger attack 
surface, 
which some users may not want to carry and which they can thus shrink.

With a static keys approach there would be no runtime overhead worth speaking 
of, 
so I see no reason why unprivileged eBPF couldn't have a sysctl too - with the 
default value set to permissive.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 5/6] net: dsa: push prepare phase in port_fdb_add

2015-10-07 Thread Scott Feldman
On Wed, Oct 7, 2015 at 4:48 PM, Vivien Didelot
 wrote:
> Now that the prepare phase is pushed down to the DSA drivers, propagate
> it to the port_fdb_add function.
>
> Signed-off-by: Vivien Didelot 

Reviewed-by: Scott Feldman 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 6/6] net: dsa: use switchdev obj in port_fdb_del

2015-10-07 Thread Scott Feldman
On Wed, Oct 7, 2015 at 4:48 PM, Vivien Didelot
 wrote:
> For consistency with the FDB add operation, propagate the
> switchdev_obj_port_fdb structure in the DSA drivers.
>
> Signed-off-by: Vivien Didelot 

Reviewed-by: Scott Feldman 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 4/6] net: dsa: add port_fdb_prepare

2015-10-07 Thread Scott Feldman
On Wed, Oct 7, 2015 at 4:48 PM, Vivien Didelot
 wrote:
> Push the prepare phase for FDB operations down to the DSA drivers, with
> a new port_fdb_prepare function. Currently only mv88e6xxx is affected.
>
> Signed-off-by: Vivien Didelot 

Reviewed-by: Scott Feldman 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch net-next RFC 2/3] switchdev: allow caller to explicitly use deferred attr_set version

2015-10-07 Thread Scott Feldman
On Wed, Oct 7, 2015 at 10:39 PM, Jiri Pirko  wrote:
> Thu, Oct 08, 2015 at 06:27:07AM CEST, sfel...@gmail.com wrote:
>>On Wed, Oct 7, 2015 at 11:30 AM, Jiri Pirko  wrote:
>>> From: Jiri Pirko 
>>>
>>> Caller should know if he can call attr_set directly (when holding RTNL)
>>> or if he has to use deferred version of this function.
>>>
>>> This also allows drivers to sleep inside attr_set and report operation
>>> status back to switchdev core. Switchdev core then warns if status is
>>> not ok, instead of silent errors happening in drivers.
>>>
>>> Signed-off-by: Jiri Pirko 
>>> ---
>>>  include/net/switchdev.h   |   2 +
>>>  net/bridge/br_stp.c   |   4 +-
>>>  net/switchdev/switchdev.c | 113 
>>> +-
>>>  3 files changed, 65 insertions(+), 54 deletions(-)
>>>
>>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>>> index 89266a3..320be44 100644
>>> --- a/include/net/switchdev.h
>>> +++ b/include/net/switchdev.h
>>> @@ -168,6 +168,8 @@ int switchdev_port_attr_get(struct net_device *dev,
>>> struct switchdev_attr *attr);
>>>  int switchdev_port_attr_set(struct net_device *dev,
>>> struct switchdev_attr *attr);
>>> +int switchdev_port_attr_set_deferred(struct net_device *dev,
>>> +struct switchdev_attr *attr);
>>
>>Rather than adding another op, use attr->flags and define:
>>
>>#define SWITCHDEV_F_DEFERRED  BIT(x)
>>
>>So we get:
>>
>>void br_set_state(struct net_bridge_port *p, unsigned int state)
>>{
>>struct switchdev_attr attr = {
>>.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>>+  .flags = SWITCHDEV_F_DEFERRED,
>>.u.stp_state = state,
>>};
>>int err;
>>
>>p->state = state;
>>err = switchdev_port_attr_set(p->dev, &attr);
>>if (err && err != -EOPNOTSUPP)
>>br_warn(p->br, "error setting offload STP state on
>>port %u(%s)\n",
>>(unsigned int) p->port_no,
>>p->dev->name);
>>}
>>
>>(And add obj->flags to do the same).
>
> That's what I wanted to avoid. Also because the obj is const and for
> call from work, this flag would have to be removed.

What did you want to avoid?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 1/2] net: Fix Hisilicon Network Subsystem compilation warning

2015-10-07 Thread huangdaode
This patch fixes a compilation warning on the arm 32-bit platform

Signed-off-by: huangdaode 
Signed-off-by: yankejian 
---
 drivers/net/ethernet/hisilicon/hns/hnae.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.c 
b/drivers/net/ethernet/hisilicon/hns/hnae.c
index 0a0a9e8..ccbb666 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.c
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.c
@@ -448,12 +448,12 @@ static ssize_t handles_show(struct device *dev,
s += sprintf(buf + s, "handle %d (eport_id=%u from %s):\n",
i++, h->eport_id, h->dev->name);
for (j = 0; j < h->q_num; j++) {
-   s += sprintf(buf + s, "\tqueue[%d] on 0x%llx\n",
-j, (u64)h->qs[i]->io_base);
-#define HANDEL_TX_MSG "\t\ttx_ring on 0x%llx:%u,%u,%u,%u,%u,%llu,%llu\n"
+   s += sprintf(buf + s, "\tqueue[%d] on 0x%p\n",
+j, h->qs[i]->io_base);
+#define HANDEL_TX_MSG "\t\ttx_ring on 0x%p:%u,%u,%u,%u,%u,%llu,%llu\n"
s += sprintf(buf + s,
 HANDEL_TX_MSG,
-(u64)h->qs[i]->tx_ring.io_base,
+h->qs[i]->tx_ring.io_base,
 h->qs[i]->tx_ring.buf_size,
 h->qs[i]->tx_ring.desc_num,
 h->qs[i]->tx_ring.max_desc_num_per_pkt,
@@ -462,8 +462,8 @@ static ssize_t handles_show(struct device *dev,
 h->qs[i]->tx_ring.stats.sw_err_cnt,
 h->qs[i]->tx_ring.stats.io_err_cnt);
s += sprintf(buf + s,
-   "\t\trx_ring on 0x%llx:%u,%u,%llu,%llu,%llu\n",
-   (u64)h->qs[i]->rx_ring.io_base,
+   "\t\trx_ring on 0x%p:%u,%u,%llu,%llu,%llu\n",
+   h->qs[i]->rx_ring.io_base,
h->qs[i]->rx_ring.buf_size,
h->qs[i]->rx_ring.desc_num,
h->qs[i]->rx_ring.stats.sw_err_cnt,
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 0/2] net: fix some bugs in HNS drivers

2015-10-07 Thread huangdaode
This patchset fixes two bugs in the HNS driver: one is a compilation
warning on the arm 32-bit platform, the other is a wrong mac port
judgement bug which was found during internal tests.

huangdaode (2):
  net: Fix Hisilicon Network Subsystem compilation warning
  net: fix a bug on Hisilicon Network Subsystem

 drivers/net/ethernet/hisilicon/hns/hnae.c | 12 ++--
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 2/2] net: fix a bug on Hisilicon Network Subsystem

2015-10-07 Thread huangdaode
This patch fixes a bug in the hns driver found during internal testing of the
driver.

Signed-off-by: huangdaode 
Signed-off-by: yankejian 
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c 
b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
index 95bf42a..e162e4a 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_mac.c
@@ -179,7 +179,7 @@ static int hns_mac_get_inner_port_num(struct hns_mac_cb 
*mac_cb,
return -EINVAL;
}
} else if (mac_cb->dsaf_dev->dsaf_mode < DSAF_MODE_MAX) {
-   if (mac_cb->mac_id <= DSAF_MAX_PORT_NUM_PER_CHIP) {
+   if (mac_cb->mac_id >= DSAF_MAX_PORT_NUM_PER_CHIP) {
dev_err(mac_cb->dev,
"input invalid,%s mac%d vmid%d!\n",
mac_cb->dsaf_dev->ae_dev.name,
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 4/4] rocker: handle setting bridge ageing_time

2015-10-07 Thread sfeldma
From: Scott Feldman 

The FDB cleanup timer will get rescheduled to re-evaluate FDB entries
based on new ageing_time.

Signed-off-by: Scott Feldman 
---
 drivers/net/ethernet/rocker/rocker.c |   22 ++
 1 file changed, 22 insertions(+)

diff --git a/drivers/net/ethernet/rocker/rocker.c 
b/drivers/net/ethernet/rocker/rocker.c
index cf91ffc..3c7f9ae 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4361,6 +4361,24 @@ static int rocker_port_brport_flags_set(struct 
rocker_port *rocker_port,
return err;
 }
 
+static int rocker_port_bridge_set(struct rocker_port *rocker_port,
+ struct switchdev_trans *trans,
+ struct switchdev_attr_bridge *bridge)
+{
+   switch (bridge->attr) {
+   case IFLA_BR_AGEING_TIME:
+   if (switchdev_trans_ph_prepare(trans))
+   return 0;
+   rocker_port->ageing_time = clock_t_to_jiffies(bridge->val);
+   mod_timer(&rocker_port->rocker->fdb_cleanup_timer, jiffies);
+   break;
+   default:
+   return -EOPNOTSUPP;
+   }
+
+   return 0;
+}
+
 static int rocker_port_attr_set(struct net_device *dev,
struct switchdev_attr *attr,
struct switchdev_trans *trans)
@@ -4378,6 +4396,10 @@ static int rocker_port_attr_set(struct net_device *dev,
err = rocker_port_brport_flags_set(rocker_port, trans,
   attr->u.brport_flags);
break;
+   case SWITCHDEV_ATTR_ID_BRIDGE:
+   err = rocker_port_bridge_set(rocker_port, trans,
+&attr->u.bridge);
+   break;
default:
err = -EOPNOTSUPP;
break;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 3/4] bridge: push bridge setting ageing_time down to switchdev

2015-10-07 Thread sfeldma
From: Scott Feldman 

Use SWITCHDEV_F_SKIP_EOPNOTSUPP to skip over ports in bridge that don't
support setting ageing_time (or setting bridge attrs in general).

If push fails, don't update ageing_time in bridge and return err to user.

If push succeeds, update ageing_time in bridge and run gc_timer now to
recalibrate when to run gc_timer next, based on new ageing_time.

Signed-off-by: Scott Feldman 
Signed-off-by: Jiri Pirko 
---
 net/bridge/br_ioctl.c|3 +--
 net/bridge/br_netlink.c  |6 +++---
 net/bridge/br_private.h  |1 +
 net/bridge/br_stp.c  |   24 
 net/bridge/br_sysfs_br.c |3 +--
 5 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 8d423bc..263b4de 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -200,8 +200,7 @@ static int old_dev_ioctl(struct net_device *dev, struct 
ifreq *rq, int cmd)
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
 
-   br->ageing_time = clock_t_to_jiffies(args[1]);
-   return 0;
+   return br_set_ageing_time(br, args[1]);
 
case BRCTL_GET_PORT_INFO:
{
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index d78b442..544ab96 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -870,9 +870,9 @@ static int br_changelink(struct net_device *brdev, struct 
nlattr *tb[],
}
 
if (data[IFLA_BR_AGEING_TIME]) {
-   u32 ageing_time = nla_get_u32(data[IFLA_BR_AGEING_TIME]);
-
-   br->ageing_time = clock_t_to_jiffies(ageing_time);
+   err = br_set_ageing_time(br, 
nla_get_u32(data[IFLA_BR_AGEING_TIME]));
+   if (err)
+   return err;
}
 
if (data[IFLA_BR_STP_STATE]) {
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 09d3ecb..ba0c67b 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -882,6 +882,7 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned 
long t);
 int br_set_forward_delay(struct net_bridge *br, unsigned long x);
 int br_set_hello_time(struct net_bridge *br, unsigned long x);
 int br_set_max_age(struct net_bridge *br, unsigned long x);
+int br_set_ageing_time(struct net_bridge *br, u32 ageing_time);
 
 
 /* br_stp_if.c */
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 3a982c0..ae3286b 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -566,6 +566,30 @@ int br_set_max_age(struct net_bridge *br, unsigned long 
val)
 
 }
 
+int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
+{
+   struct switchdev_attr attr = {
+   .id = SWITCHDEV_ATTR_ID_BRIDGE,
+   .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+   .u.bridge.attr = IFLA_BR_AGEING_TIME,
+   .u.bridge.val = ageing_time,
+   };
+   unsigned long t = clock_t_to_jiffies(ageing_time);
+   int err;
+
+   if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME)
+   return -ERANGE;
+
+   err = switchdev_port_attr_set(br->dev, &attr);
+   if (err)
+   return err;
+
+   br->ageing_time = t;
+   mod_timer(&br->gc_timer, jiffies);
+
+   return 0;
+}
+
 void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
 {
br->bridge_forward_delay = t;
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 4c97fc5..04ef192 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -102,8 +102,7 @@ static ssize_t ageing_time_show(struct device *d,
 
 static int set_ageing_time(struct net_bridge *br, unsigned long val)
 {
-   br->ageing_time = clock_t_to_jiffies(val);
-   return 0;
+   return br_set_ageing_time(br, val);
 }
 
 static ssize_t ageing_time_store(struct device *d,
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 1/4] switchdev: add bridge attributes

2015-10-07 Thread sfeldma
From: Scott Feldman 

Setting the stage to push bridge-level attributes down to port driver so
hardware can be programmed accordingly.  Bridge-level attribute example is
ageing_time.  This is a per-bridge attribute, not a per-bridge-port attr.

Signed-off-by: Scott Feldman 
---
 include/net/switchdev.h  |5 +
 include/uapi/linux/if_link.h |2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 89266a3..8d92cd0 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -43,6 +43,7 @@ enum switchdev_attr_id {
SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
SWITCHDEV_ATTR_ID_PORT_STP_STATE,
SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
+   SWITCHDEV_ATTR_ID_BRIDGE,
 };
 
 struct switchdev_attr {
@@ -52,6 +53,10 @@ struct switchdev_attr {
struct netdev_phys_item_id ppid;/* PORT_PARENT_ID */
u8 stp_state;   /* PORT_STP_STATE */
unsigned long brport_flags; /* PORT_BRIDGE_FLAGS */
+   struct switchdev_attr_bridge {  /* BRIDGE */
+   enum ifla_br attr;
+   u32 val;
+   } bridge;
} u;
 };
 
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index e3b6217..30177b3 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -222,7 +222,7 @@ enum in6_addr_gen_mode {
 
 /* Bridge section */
 
-enum {
+enum ifla_br {
IFLA_BR_UNSPEC,
IFLA_BR_FORWARD_DELAY,
IFLA_BR_HELLO_TIME,
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 0/4] switchdev: push bridge attributes down

2015-10-07 Thread sfeldma
From: Scott Feldman 

Push bridge-level attributes down to switchdev drivers.  This patchset
adds the infrastructure and then pushes, as an example, ageing_time attribute
down from bridge to switchdev (rocker) driver.  Add some range-checking
for ageing_time.

# ip link set dev br0 type bridge ageing_time 1000

# ip link set dev br0 type bridge ageing_time 999
RTNETLINK answers: Numerical result out of range

Up until now, switchdev attrs were port-level attrs, so the netdev used in
switchdev_attr_set() would be a switch port or bond of switch ports.  With
bridge-level attrs, the netdev passed to switchdev_attr_set() is the bridge
netdev.  The same recursive algo is used to visit the leaves of the stacked
drivers to set the attr, it's just in this case we start one layer higher in
the stack.  One note is not all ports in the bridge may support setting a
bridge-level attribute, so rather than failing the entire set, we'll skip over
those ports returning -EOPNOTSUPP.

v1->v2: rebase w/ net-next

Scott Feldman (4):
  switchdev: add bridge attributes
  switchdev: skip over ports returning -EOPNOTSUPP when recursing ports
  bridge: push bridge setting ageing_time down to switchdev
  rocker: handle setting bridge ageing_time

 drivers/net/ethernet/rocker/rocker.c |   22 ++
 include/net/switchdev.h  |6 ++
 include/uapi/linux/if_link.h |2 +-
 net/bridge/br_ioctl.c|3 +--
 net/bridge/br_netlink.c  |6 +++---
 net/bridge/br_private.h  |1 +
 net/bridge/br_stp.c  |   24 
 net/bridge/br_sysfs_br.c |3 +--
 net/switchdev/switchdev.c|9 -
 9 files changed, 67 insertions(+), 9 deletions(-)

-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 2/4] switchdev: skip over ports returning -EOPNOTSUPP when recursing ports

2015-10-07 Thread sfeldma
From: Scott Feldman 

This allows us to recurse over all the ports, skipping over unsupporting
ports.  Without the change, the recursion would stop at first unsupported
port.

Signed-off-by: Scott Feldman 
---
 include/net/switchdev.h   |1 +
 net/switchdev/switchdev.c |9 -
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 8d92cd0..f3de6f4 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -16,6 +16,7 @@
 #include 
 
 #define SWITCHDEV_F_NO_RECURSE BIT(0)
+#define SWITCHDEV_F_SKIP_EOPNOTSUPPBIT(1)
 
 struct switchdev_trans_item {
struct list_head list;
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 6e4a4f9..7a9ab90 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -147,7 +147,7 @@ static int __switchdev_port_attr_set(struct net_device *dev,
return ops->switchdev_port_attr_set(dev, attr, trans);
 
if (attr->flags & SWITCHDEV_F_NO_RECURSE)
-   return err;
+   goto done;
 
/* Switch device port(s) may be stacked under
 * bond/team/vlan dev, so recurse down to set attr on
@@ -156,10 +156,17 @@ static int __switchdev_port_attr_set(struct net_device 
*dev,
 
netdev_for_each_lower_dev(dev, lower_dev, iter) {
err = __switchdev_port_attr_set(lower_dev, attr, trans);
+   if (err == -EOPNOTSUPP &&
+   attr->flags & SWITCHDEV_F_SKIP_EOPNOTSUPP)
+   continue;
if (err)
break;
}
 
+done:
+   if (err == -EOPNOTSUPP && attr->flags & SWITCHDEV_F_SKIP_EOPNOTSUPP)
+   err = 0;
+
return err;
 }
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH iproute2] bridge: add batch command support

2015-10-07 Thread Roopa Prabhu
From: Wilson Kok 

This patch adds support to batch bridge commands.
Follows ip batch code.

Signed-off-by: Wilson Kok 
Signed-off-by: Roopa Prabhu 
---
 bridge/bridge.c   | 59 +++
 man/man8/bridge.8 | 11 +++
 2 files changed, 70 insertions(+)

diff --git a/bridge/bridge.c b/bridge/bridge.c
index eaf09c8..c028f6c 100644
--- a/bridge/bridge.c
+++ b/bridge/bridge.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "SNAPSHOT.h"
 #include "utils.h"
@@ -23,6 +24,8 @@ int show_stats;
 int show_details;
 int compress_vlans;
 int timestamp;
+char *batch_file;
+int force;
 const char *_SL_;
 
 static void usage(void) __attribute__((noreturn));
@@ -31,6 +34,7 @@ static void usage(void)
 {
fprintf(stderr,
 "Usage: bridge [ OPTIONS ] OBJECT { COMMAND | help }\n"
+"  bridge [ -force ] -batch filename\n"
 "where OBJECT := { link | fdb | mdb | vlan | monitor }\n"
 "  OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] |\n"
 "   -o[neline] | -t[imestamp] | -n[etns] name |\n"
@@ -71,6 +75,50 @@ static int do_cmd(const char *argv0, int argc, char **argv)
return -1;
 }
 
+static int batch(const char *name)
+{
+   char *line = NULL;
+   size_t len = 0;
+   int ret = EXIT_SUCCESS;
+
+   if (name && strcmp(name, "-") != 0) {
+   if (freopen(name, "r", stdin) == NULL) {
+   fprintf(stderr,
+   "Cannot open file \"%s\" for reading: %s\n",
+   name, strerror(errno));
+   return EXIT_FAILURE;
+   }
+   }
+
+   if (rtnl_open(&rth, 0) < 0) {
+   fprintf(stderr, "Cannot open rtnetlink\n");
+   return EXIT_FAILURE;
+   }
+
+   cmdlineno = 0;
+   while (getcmdline(&line, &len, stdin) != -1) {
+   char *largv[100];
+   int largc;
+
+   largc = makeargs(line, largv, 100);
+   if (largc == 0)
+   continue;   /* blank line */
+
+   if (do_cmd(largv[0], largc, largv)) {
+   fprintf(stderr, "Command failed %s:%d\n",
+   name, cmdlineno);
+   ret = EXIT_FAILURE;
+   if (!force)
+   break;
+   }
+   }
+   if (line)
+   free(line);
+
+   rtnl_close(&rth);
+   return ret;
+}
+
 int
 main(int argc, char **argv)
 {
@@ -123,6 +171,14 @@ main(int argc, char **argv)
exit(-1);
} else if (matches(opt, "-compressvlans") == 0) {
++compress_vlans;
+   } else if (matches(opt, "-force") == 0) {
+   ++force;
+   } else if (matches(opt, "-batch") == 0) {
+   argc--;
+   argv++;
+   if (argc <= 1)
+   usage();
+   batch_file = argv[1];
} else {
fprintf(stderr,
"Option \"%s\" is unknown, try \"bridge 
help\".\n",
@@ -134,6 +190,9 @@ main(int argc, char **argv)
 
_SL_ = oneline ? "\\" : "\n";
 
+   if (batch_file)
+   return batch(batch_file);
+
if (rtnl_open(&rth, 0) < 0)
exit(1);
 
diff --git a/man/man8/bridge.8 b/man/man8/bridge.8
index 5347a56..d45c728 100644
--- a/man/man8/bridge.8
+++ b/man/man8/bridge.8
@@ -21,6 +21,7 @@ bridge \- show / manipulate bridge addresses and devices
 \fB\-V\fR[\fIersion\fR] |
 \fB\-s\fR[\fItatistics\fR] |
 \fB\-n\fR[\fIetns\fR] name }
+\fB\-b\fR[\fIatch\fR] filename }
 
 .ti -8
 .BR "bridge link set"
@@ -137,6 +138,16 @@ to
 .RI "-n[etns] " NETNS " [ " OPTIONS " ] " OBJECT " { " COMMAND " | "
 .BR help " }"
 
+.TP
+.BR "\-b", " \-batch " 
+Read commands from provided file or standard input and invoke them.
+First failure will cause termination of bridge command.
+
+.TP
+.BR "\-force"
+Don't terminate bridge command on errors in batch mode.
+If there were any errors during execution of the commands, the application
+return code will be non zero.
 
 .SH BRIDGE - COMMAND SYNTAX
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch net-next RFC 2/3] switchdev: allow caller to explicitly use deferred attr_set version

2015-10-07 Thread Jiri Pirko
Thu, Oct 08, 2015 at 06:27:07AM CEST, sfel...@gmail.com wrote:
>On Wed, Oct 7, 2015 at 11:30 AM, Jiri Pirko  wrote:
>> From: Jiri Pirko 
>>
>> Caller should know if he can call attr_set directly (when holding RTNL)
>> or if he has to use deferred version of this function.
>>
>> This also allows drivers to sleep inside attr_set and report operation
>> status back to switchdev core. Switchdev core then warns if status is
>> not ok, instead of silent errors happening in drivers.
>>
>> Signed-off-by: Jiri Pirko 
>> ---
>>  include/net/switchdev.h   |   2 +
>>  net/bridge/br_stp.c   |   4 +-
>>  net/switchdev/switchdev.c | 113 
>> +-
>>  3 files changed, 65 insertions(+), 54 deletions(-)
>>
>> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
>> index 89266a3..320be44 100644
>> --- a/include/net/switchdev.h
>> +++ b/include/net/switchdev.h
>> @@ -168,6 +168,8 @@ int switchdev_port_attr_get(struct net_device *dev,
>> struct switchdev_attr *attr);
>>  int switchdev_port_attr_set(struct net_device *dev,
>> struct switchdev_attr *attr);
>> +int switchdev_port_attr_set_deferred(struct net_device *dev,
>> +struct switchdev_attr *attr);
>
>Rather than adding another op, use attr->flags and define:
>
>#define SWITCHDEV_F_DEFERRED  BIT(x)
>
>So we get:
>
>void br_set_state(struct net_bridge_port *p, unsigned int state)
>{
>struct switchdev_attr attr = {
>.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
>+  .flags = SWITCHDEV_F_DEFERRED,
>.u.stp_state = state,
>};
>int err;
>
>p->state = state;
>err = switchdev_port_attr_set(p->dev, &attr);
>if (err && err != -EOPNOTSUPP)
>br_warn(p->br, "error setting offload STP state on
>port %u(%s)\n",
>(unsigned int) p->port_no,
>p->dev->name);
>}
>
>(And add obj->flags to do the same).

That's what I wanted to avoid. Also because the obj is const and for
call from work, this flag would have to be removed.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 0/3] bpf: unprivileged

2015-10-07 Thread Alexei Starovoitov
v1-v2:
- this set logically depends on cb patch
  "bpf: fix cb access in socket filter programs":
  http://patchwork.ozlabs.org/patch/527391/
  which is a must-have to allow unprivileged programs.
  Thanks Daniel for finding that issue.
- refactored sysctl to be similar to 'modules_disabled'
- dropped bpf_trace_printk
- split tests into separate patch and added more tests
  based on discussion

v1 cover letter:
I think it is time to liberate eBPF from CAP_SYS_ADMIN.
As was discussed when eBPF was first introduced two years ago
the only piece missing in eBPF verifier is 'pointer leak detection'
to make it available to non-root users.
Patch 1 adds this pointer analysis.
The eBPF programs, obviously, need to see and operate on kernel addresses,
but with these extra checks they won't be able to pass these addresses
to user space.
Patch 2 adds accounting of kernel memory used by programs and maps.
It changes behavior for existing root users, but I think it needs
to be done consistently for both root and non-root, since today
programs and maps are only limited by number of open FDs (RLIMIT_NOFILE).
Patch 2 accounts program's and map's kernel memory as RLIMIT_MEMLOCK.

Unprivileged eBPF is only meaningful for 'socket filter'-like programs.
eBPF programs for tracing and TC classifiers/actions will stay root only.

In parallel the bpf fuzzing effort is ongoing and so far
we've found only one verifier bug and that was already fixed.
The 'constant blinding' pass is also being worked on.
It will obfuscate constant-like values that are part of eBPF ISA
to make jit spraying attacks even harder.

Alexei Starovoitov (3):
  bpf: enable non-root eBPF programs
  bpf: charge user for creation of BPF maps and programs
  bpf: add unprivileged bpf tests

 include/linux/bpf.h |5 +
 include/linux/sched.h   |2 +-
 kernel/bpf/arraymap.c   |2 +-
 kernel/bpf/hashtab.c|4 +
 kernel/bpf/syscall.c|   74 -
 kernel/bpf/verifier.c   |  106 +++--
 kernel/sysctl.c |   13 ++
 net/core/filter.c   |3 +-
 samples/bpf/libbpf.h|8 +
 samples/bpf/test_verifier.c |  357 +--
 10 files changed, 547 insertions(+), 27 deletions(-)

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 3/3] bpf: add unprivileged bpf tests

2015-10-07 Thread Alexei Starovoitov
Add new tests samples/bpf/test_verifier:

unpriv: return pointer
  checks that pointer cannot be returned from the eBPF program

unpriv: add const to pointer
unpriv: add pointer to pointer
unpriv: neg pointer
  checks that pointer arithmetic is disallowed

unpriv: cmp pointer with const
unpriv: cmp pointer with pointer
  checks that comparison of pointers is disallowed
  Only one case allowed 'void *value = bpf_map_lookup_elem(..); if (value == 0) 
...'

unpriv: check that printk is disallowed
  since bpf_trace_printk is not available to unprivileged

unpriv: pass pointer to helper function
  checks that pointers cannot be passed to functions that expect integers
  If function expects a pointer the verifier allows only that type of pointer.
  Like 1st argument of bpf_map_lookup_elem() must be pointer to map.
  (applies to non-root as well)

unpriv: indirectly pass pointer on stack to helper function
  checks that pointer stored into stack cannot be used as part of key
  passed into bpf_map_lookup_elem()

unpriv: mangle pointer on stack 1
unpriv: mangle pointer on stack 2
  checks that writing into stack slot that already contains a pointer
  is disallowed

unpriv: read pointer from stack in small chunks
  checks that < 8 byte read from stack slot that contains a pointer is
  disallowed

unpriv: write pointer into ctx
  checks that storing pointers into skb->fields is disallowed

unpriv: write pointer into map elem value
  checks that storing pointers into element values is disallowed
  For example:
  int bpf_prog(struct __sk_buff *skb)
  {
u32 key = 0;
u64 *value = bpf_map_lookup_elem(&map, &key);
if (value)
   *value = (u64) skb;
  }
  will be rejected.

unpriv: partial copy of pointer
  checks that doing 32-bit register mov from register containing
  a pointer is disallowed

unpriv: pass pointer to tail_call
  checks that passing pointer as an index into bpf_tail_call
  is disallowed

unpriv: cmp map pointer with zero
  checks that comparing map pointer with constant is disallowed

unpriv: write into frame pointer
  checks that frame pointer is read-only (applies to root too)

unpriv: cmp of frame pointer
  checks that R10 cannot be used in comparison

unpriv: cmp of stack pointer
  checks that Rx = R10 - imm is ok, but comparing Rx is not

unpriv: obfuscate stack pointer
  checks that Rx = R10 - imm is ok, but Rx -= imm is not

Signed-off-by: Alexei Starovoitov 
---
v1-v2:
- split tests into separate patch
- add tail_call and other tests and explain tests in commit log
---
 samples/bpf/libbpf.h|8 +
 samples/bpf/test_verifier.c |  357 +--
 2 files changed, 355 insertions(+), 10 deletions(-)

diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 7235e292a03b..b7f63c70b4a2 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -64,6 +64,14 @@ extern char bpf_log_buf[LOG_BUF_SIZE];
.off   = 0, \
.imm   = 0 })
 
+#define BPF_MOV32_REG(DST, SRC)\
+   ((struct bpf_insn) {\
+   .code  = BPF_ALU | BPF_MOV | BPF_X, \
+   .dst_reg = DST, \
+   .src_reg = SRC, \
+   .off   = 0, \
+   .imm   = 0 })
+
 /* Short form of mov, dst_reg = imm32 */
 
 #define BPF_MOV64_IMM(DST, IMM)\
diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c
index ee0f110c9c54..563c507c0a09 100644
--- a/samples/bpf/test_verifier.c
+++ b/samples/bpf/test_verifier.c
@@ -15,20 +15,27 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "libbpf.h"
 
 #define MAX_INSNS 512
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
 
+#define MAX_FIXUPS 8
+
 struct bpf_test {
const char *descr;
struct bpf_insn insns[MAX_INSNS];
-   int fixup[32];
+   int fixup[MAX_FIXUPS];
+   int prog_array_fixup[MAX_FIXUPS];
const char *errstr;
+   const char *errstr_unpriv;
enum {
+   UNDEF,
ACCEPT,
REJECT
-   } result;
+   } result, result_unpriv;
enum bpf_prog_type prog_type;
 };
 
@@ -96,6 +103,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.errstr = "invalid BPF_LD_IMM insn",
+   .errstr_unpriv = "R1 pointer comparison",
.result = REJECT,
},
{
@@ -109,6 +117,7 @@ static struct bpf_test tests[] = {
BPF_EXIT_INSN(),
},
.errstr = "invalid BPF_LD_IMM insn",
+   .errstr_unpriv = "R1 pointer comparison",
.result = REJECT,
},
{
@@ -219,6 +228,7 @@ static struct bpf_test tests[] = {
 

[PATCH v2 net-next 1/3] bpf: enable non-root eBPF programs

2015-10-07 Thread Alexei Starovoitov
In order to let unprivileged users load and execute eBPF programs
teach verifier to prevent pointer leaks.
Verifier will prevent
- any arithmetic on pointers
  (except R10+Imm which is used to compute stack addresses)
- comparison of pointers
  (except if (map_value_ptr == 0) ... )
- passing pointers to helper functions
- indirectly passing pointers in stack to helper functions
- returning pointer from bpf program
- storing pointers into ctx or maps

Spill/fill of pointers into stack is allowed, but mangling
of pointers stored in the stack or reading them byte by byte is not.

Within bpf programs the pointers do exist, since programs need to
be able to access maps, pass skb pointer to LD_ABS insns, etc
but programs cannot pass such pointer values to the outside
or obfuscate them.

Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs,
so that socket filters (tcpdump), af_packet (quic acceleration)
and future kcm can use it.
tracing and tc cls/act program types still require root permissions,
since tracing actually needs to be able to see all kernel pointers
and tc is for root only.

For example, the following unprivileged socket filter program is allowed:
int bpf_prog1(struct __sk_buff *skb)
{
  u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
  u64 *value = bpf_map_lookup_elem(&my_map, &index);

  if (value)
*value += skb->len;
  return 0;
}

but the following program is not:
int bpf_prog1(struct __sk_buff *skb)
{
  u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
  u64 *value = bpf_map_lookup_elem(&my_map, &index);

  if (value)
*value += (u64) skb;
  return 0;
}
since it would leak the kernel address into the map.

Unprivileged socket filter bpf programs have access to the
following helper functions:
- map lookup/update/delete (but they cannot store kernel pointers into them)
- get_random (it's already exposed to unprivileged user space)
- get_smp_processor_id
- tail_call into another socket filter program
- ktime_get_ns

The feature is controlled by sysctl kernel.unprivileged_bpf_disabled.
This toggle defaults to off (0), but can be set true (1).  Once true,
bpf programs and maps cannot be accessed from unprivileged process,
and the toggle cannot be set back to false.

Signed-off-by: Alexei Starovoitov 
---
v1->v2:
- sysctl_unprivileged_bpf_disabled
- drop bpf_trace_printk
- split tests into separate patch to ease review
---
 include/linux/bpf.h   |2 +
 kernel/bpf/syscall.c  |   11 ++---
 kernel/bpf/verifier.c |  106 -
 kernel/sysctl.c   |   13 ++
 net/core/filter.c |3 +-
 5 files changed, 120 insertions(+), 15 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 19b8a2081f88..e472b06df138 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -167,6 +167,8 @@ void bpf_prog_put_rcu(struct bpf_prog *prog);
 struct bpf_map *bpf_map_get(struct fd f);
 void bpf_map_put(struct bpf_map *map);
 
+extern int sysctl_unprivileged_bpf_disabled;
+
 /* verify correctness of eBPF program */
 int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
 #else
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5f35f420c12f..9f824b0f0f5f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
 #include 
 #include 
 
+int sysctl_unprivileged_bpf_disabled __read_mostly;
+
 static LIST_HEAD(bpf_map_types);
 
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -542,6 +544,9 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
 
+   if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
@@ -597,11 +602,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, 
uattr, unsigned int, siz
union bpf_attr attr = {};
int err;
 
-   /* the syscall is limited to root temporarily. This restriction will be
-* lifted when security audit is clean. Note that eBPF+tracing must have
-* this restriction, since it may pass kernel data to user space
-*/
-   if (!capable(CAP_SYS_ADMIN))
+   if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
return -EPERM;
 
if (!access_ok(VERIFY_READ, uattr, 1))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f8da034c2258..1d6b97be79e1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
struct verifier_state_list **explored_states; /* search pruning 
optimization */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by 
eBPF program */
u32 used_map_cnt;   /* number of used maps */
+   bool allow_ptr_leaks;
 };
 
 /* verbose v

[PATCH v2 net-next 2/3] bpf: charge user for creation of BPF maps and programs

2015-10-07 Thread Alexei Starovoitov
since eBPF programs and maps use kernel memory consider it 'locked' memory
from user accounting point of view and charge it against RLIMIT_MEMLOCK limit.
This limit is typically set to 64Kbytes by distros, so almost all
bpf+tracing programs would need to increase it, since they use maps,
but kernel charges maximum map size upfront.
For example the hash map of 1024 elements will be charged as 64Kbyte.
It's inconvenient for current users and changes current behavior for root,
but probably worth doing to be consistent root vs non-root.

Similar accounting logic is done by mmap of perf_event.

Signed-off-by: Alexei Starovoitov 
---
The charging is aggressive and even basic test_maps, test_verifier are
hitting memlock limit, so alternatively we can drop charging
for cap_sys_admin.
---
 include/linux/bpf.h   |3 +++
 include/linux/sched.h |2 +-
 kernel/bpf/arraymap.c |2 +-
 kernel/bpf/hashtab.c  |4 
 kernel/bpf/syscall.c  |   63 +
 5 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e472b06df138..e1c869f8e156 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -36,6 +36,8 @@ struct bpf_map {
u32 key_size;
u32 value_size;
u32 max_entries;
+   u32 pages;
+   struct user_struct *user;
const struct bpf_map_ops *ops;
struct work_struct work;
 };
@@ -128,6 +130,7 @@ struct bpf_prog_aux {
const struct bpf_verifier_ops *ops;
struct bpf_map **used_maps;
struct bpf_prog *prog;
+   struct user_struct *user;
union {
struct work_struct work;
struct rcu_head rcu;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7b9501b41af..4817df5fffae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -840,7 +840,7 @@ struct user_struct {
struct hlist_node uidhash_node;
kuid_t uid;
 
-#ifdef CONFIG_PERF_EVENTS
+#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
atomic_long_t locked_vm;
 #endif
 };
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2fecc4aed119..f2d9e698c753 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
-
+   array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
array->elem_size = elem_size;
 
return &array->map;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..28592d79502b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -88,6 +88,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->elem_size = sizeof(struct htab_elem) +
  round_up(htab->map.key_size, 8) +
  htab->map.value_size;
+
+   htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+  htab->elem_size * htab->map.max_entries,
+  PAGE_SIZE) >> PAGE_SHIFT;
return &htab->map;
 
 free_htab:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9f824b0f0f5f..43e8afaee329 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -46,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
 }
 
+static int bpf_map_charge_memlock(struct bpf_map *map)
+{
+   struct user_struct *user = get_current_user();
+   unsigned long memlock_limit;
+
+   memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+   atomic_long_add(map->pages, &user->locked_vm);
+
+   if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+   atomic_long_sub(map->pages, &user->locked_vm);
+   free_uid(user);
+   return -EPERM;
+   }
+   map->user = user;
+   return 0;
+}
+
+static void bpf_map_uncharge_memlock(struct bpf_map *map)
+{
+   struct user_struct *user = map->user;
+
+   atomic_long_sub(map->pages, &user->locked_vm);
+   free_uid(user);
+}
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
struct bpf_map *map = container_of(work, struct bpf_map, work);
 
+   bpf_map_uncharge_memlock(map);
/* implementation dependent freeing */
map->ops->map_free(map);
 }
@@ -110,6 +137,10 @@ static int map_create(union bpf_attr *attr)
 
atomic_set(&map->refcnt, 1);
 
+   err = bpf_map_charge_memlock(map);
+   if (err)
+   goto free_map;
+
err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | 
O_CLOEXEC);
 
if (err < 0)
@@ -440,11 +471,37 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
 }
 

Re: [PATCH net-next v2 1/5] net: move net_get_random_once to lib

2015-10-07 Thread kbuild test robot
Hi Hannes,

[auto build test WARNING on net-next/master -- if it's inappropriate base, 
please ignore]

config: mips-ip27_defconfig (attached as .config)
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=mips 

All warnings (new ones prefixed by >>):

   mips-linux-gnu-ld: lib/lockref.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/lockref.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/bcd.o: warning: Inconsistent ISA between e_flags and 
.MIPS.abiflags
   mips-linux-gnu-ld: lib/bcd.o: warning: Inconsistent ISA extensions between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/div64.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/div64.o: warning: Inconsistent ISA extensions between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/sort.o: warning: Inconsistent ISA between e_flags and 
.MIPS.abiflags
   mips-linux-gnu-ld: lib/sort.o: warning: Inconsistent ISA extensions between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/parser.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/parser.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/halfmd4.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/halfmd4.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/debug_locks.o: warning: Inconsistent ISA between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/debug_locks.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/random32.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/random32.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/bust_spinlocks.o: warning: Inconsistent ISA between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/bust_spinlocks.o: warning: Inconsistent ISA 
extensions between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/kasprintf.o: warning: Inconsistent ISA between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/kasprintf.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/bitmap.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/bitmap.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/scatterlist.o: warning: Inconsistent ISA between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/scatterlist.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/gcd.o: warning: Inconsistent ISA between e_flags and 
.MIPS.abiflags
   mips-linux-gnu-ld: lib/gcd.o: warning: Inconsistent ISA extensions between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/lcm.o: warning: Inconsistent ISA between e_flags and 
.MIPS.abiflags
   mips-linux-gnu-ld: lib/lcm.o: warning: Inconsistent ISA extensions between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/list_sort.o: warning: Inconsistent ISA between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/list_sort.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/uuid.o: warning: Inconsistent ISA between e_flags and 
.MIPS.abiflags
   mips-linux-gnu-ld: lib/uuid.o: warning: Inconsistent ISA extensions between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/flex_array.o: warning: Inconsistent ISA between 
e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/flex_array.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/iov_iter.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/iov_iter.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/clz_ctz.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/clz_ctz.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/bsearch.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/bsearch.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/find_bit.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/find_bit.o: warning: Inconsistent ISA extensions 
between e_flags and .MIPS.abiflags
   mips-linux-gnu-ld: lib/llist.o: warning: Inconsistent ISA between e_flags 
and .MIPS.abiflags
   mips-linux-gnu-ld: lib/llist.o: warning: Inconsistent ISA

[PATCHv2 net-next] cxgb4: Enhance driver to update FW, when FW is too old

2015-10-07 Thread Hariprasad Shenai
t4_check_fw_version() can return several error codes (-EINVAL, -EBUSY,
-EAGAIN). The present code sets the adapter state to UNINIT only if it's
an EFAULT. In all the error cases set the adapter to uninitialized state.

In t4_check_fw_version() if call to t4_get_fw_version() fails, repeat the
operation a few times before returning failure.

Signed-off-by: Hariprasad Shenai 
---
V2: Retry t4_get_fw_version() only if ret value is EBUSY or EAGAIN
Based on review comment by Neil Horman 

 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +-
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c  | 6 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 9f1f5b2..c29227e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3698,7 +3698,7 @@ static int adap_init0(struct adapter *adap)
t4_get_tp_version(adap, &adap->params.tp_vers);
ret = t4_check_fw_version(adap);
/* If firmware is too old (not supported by driver) force an update. */
-   if (ret == -EFAULT)
+   if (ret)
state = DEV_STATE_UNINIT;
if ((adap->flags & MASTER_PF) && state != DEV_STATE_INIT) {
struct fw_info *fw_info;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c 
b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index dc6ce31..cf61a58 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -2981,11 +2981,15 @@ int t4_get_exprom_version(struct adapter *adap, u32 
*vers)
  */
 int t4_check_fw_version(struct adapter *adap)
 {
-   int ret, major, minor, micro;
+   int i, ret, major, minor, micro;
int exp_major, exp_minor, exp_micro;
unsigned int chip_version = CHELSIO_CHIP_VERSION(adap->params.chip);
 
ret = t4_get_fw_version(adap, &adap->params.fw_vers);
+   /* Try multiple times before returning error */
+   for (i = 0; (ret == -EBUSY || ret == -EAGAIN) && i < 3; i++)
+   ret = t4_get_fw_version(adap, &adap->params.fw_vers);
+
if (ret)
return ret;
 
-- 
2.3.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch net-next RFC 2/3] switchdev: allow caller to explicitly use deferred attr_set version

2015-10-07 Thread Scott Feldman
On Wed, Oct 7, 2015 at 11:30 AM, Jiri Pirko  wrote:
> From: Jiri Pirko 
>
> Caller should know if he can call attr_set directly (when holding RTNL)
> or if he has to use deferred version of this function.
>
> This also allows drivers to sleep inside attr_set and report operation
> status back to switchdev core. Switchdev core then warns if status is
> not ok, instead of silent errors happening in drivers.
>
> Signed-off-by: Jiri Pirko 
> ---
>  include/net/switchdev.h   |   2 +
>  net/bridge/br_stp.c   |   4 +-
>  net/switchdev/switchdev.c | 113 
> +-
>  3 files changed, 65 insertions(+), 54 deletions(-)
>
> diff --git a/include/net/switchdev.h b/include/net/switchdev.h
> index 89266a3..320be44 100644
> --- a/include/net/switchdev.h
> +++ b/include/net/switchdev.h
> @@ -168,6 +168,8 @@ int switchdev_port_attr_get(struct net_device *dev,
> struct switchdev_attr *attr);
>  int switchdev_port_attr_set(struct net_device *dev,
> struct switchdev_attr *attr);
> +int switchdev_port_attr_set_deferred(struct net_device *dev,
> +struct switchdev_attr *attr);

Rather than adding another op, use attr->flags and define:

#define SWITCHDEV_F_DEFERRED  BIT(x)

So we get:

void br_set_state(struct net_bridge_port *p, unsigned int state)
{
struct switchdev_attr attr = {
.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+  .flags = SWITCHDEV_F_DEFERRED,
.u.stp_state = state,
};
int err;

p->state = state;
err = switchdev_port_attr_set(p->dev, &attr);
if (err && err != -EOPNOTSUPP)
br_warn(p->br, "error setting offload STP state on
port %u(%s)\n",
(unsigned int) p->port_no,
p->dev->name);
}

(And add obj->flags to do the same).
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next] cxgb4: Enhance driver to update FW, when FW is too old

2015-10-07 Thread Hariprasad S
On Wed, Oct 07, 2015 at 15:09:57 -0400, Neil Horman wrote:
> On Wed, Oct 07, 2015 at 09:21:40AM +0530, Hariprasad Shenai wrote:
> > t4_check_fw_version() can return several error codes (-EINVAL, -EBUSY,
> > -EAGAIN). The present code sets the adapter state to UNINIT only if its
> > an EFAULT. In all the error cases set the adapter to uninitialized state.
> > 
> > In t4_check_fw_version() if call to t4_read_flash() fails, repeat the
> > operation a few times before returning failure.
> > 
> > Signed-off-by: Hariprasad Shenai 
> > ---
> >  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 2 +-
> >  drivers/net/ethernet/chelsio/cxgb4/t4_hw.c  | 6 +-
> >  2 files changed, 6 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
> > b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> > index 9f1f5b2..c29227e 100644
> > --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> > +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> > +   ret = t4_get_fw_version(adap, &adap->params.fw_vers);
> > +
> Shouldn't you only retry if you get an error that indicates the error is
> transient (i.e. EBUSY or EAGAIN)?
> Neil
> 
Yes you are right. Will send a V2 for the same.

Thanks
Hari

> 
> > if (ret)
> > return ret;
> >  
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] ip: find correct route for socket which is not bound (v2)

2015-10-07 Thread Wengang Wang

Hi,

Any comment on this patch?

thanks,
wengang

在 2015年09月25日 09:52, Wengang Wang 写道:

This is v2. Compared to v1, the change is:
  * for a loopback outbound device, it continues skipping the cached route;
for others, it goes through the cached route.

For multicast, we should find a valid route (and thus get a meaningful pmtu) for
the packet on the socket which is not bound to a device(sk_bound_dev_if
being 0) too.

 From man page of socket(7)

SO_BINDTODEVICE
Bind this socket to a particular device like “eth0”, as
specified in the passed interface name.  If the name is an
empty string or the option length is zero, the socket
device binding is removed. The  passed  option is  a
variable-length null-terminated interface name string with
the maximum size of IFNAMSIZ.  If a socket is bound to an
interface, only packets received from that particular
interface are processed by the socket. Note that this works
only for some socket types, particularly AF_INET sockets.
It is not supported for packet sockets (use normal bind(2)
there).

The man page doesn't say that, when the socket is not bound, packets won't be routed.

The problem I hit is that all multicast packets are dropped by the kernel (on
the sender host). The lower layer is IPoIB with an MTU of 7000, and I was
sending multicast packets of length 4096. Inside IPoIB the first send is
dropped because it exceeds the internal packet size limit mcast_mtu, which is
2044. So IPoIB calls ip_rt_update_pmtu (indirectly) to try to set the path MTU.
A correct route is configured for the multicast, so setting the pmtu
succeeded and the next multicast packet (to the same target) is expected
to succeed (it would be properly fragmented according to the pmtu just set).
But actually the second and later multicast packets got dropped too. The
reason is that the route lookup (fib_lookup) is skipped because
the socket is not bound to a device (sk_bound_dev_if being 0). After applying
the patch proposed here, it works fine.

Signed-off-by: Wengang Wang 
---
  net/ipv4/route.c | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5f4a556..c0534c2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2097,7 +2097,10 @@ struct rtable *__ip_route_output_key(struct net *net, 
struct flowi4 *fl4)
 */
  
  			fl4->flowi4_oif = dev_out->ifindex;

-   goto make_route;
+   if (dev_out->flags & IFF_LOOPBACK)
+   goto make_route;
+   else
+   goto lookup;
}
  
  		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {

@@ -2153,6 +2156,7 @@ struct rtable *__ip_route_output_key(struct net *net, 
struct flowi4 *fl4)
goto make_route;
}
  
+lookup:

if (fib_lookup(net, fl4, &res, 0)) {
res.fi = NULL;
res.table = NULL;


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 0/3] net: unix: fix use-after-free

2015-10-07 Thread Jason Baron
Hi,

These patches are against mainline, I can re-base to net-next, just
let me know.

They have been tested against: https://lkml.org/lkml/2015/9/13/195,
which causes the use-after-free quite quickly and here:
https://lkml.org/lkml/2015/10/2/693.

Thanks,

-Jason

v3:
-beef up memory barrier comments in 3/3 (Peter Zijlstra)
-clean up unix_dgram_writable() function in 3/3 (Joe Perches)

Jason Baron (3):
  net: unix: fix use-after-free in unix_dgram_poll()
  net: unix: Convert gc_flags to flags
  net: unix: optimize wakeups in unix_dgram_recvmsg()

 include/net/af_unix.h |   4 +-
 net/unix/af_unix.c| 117 +++---
 net/unix/garbage.c|  12 +++---
 3 files changed, 101 insertions(+), 32 deletions(-)

-- 
2.6.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 1/3] net: unix: fix use-after-free in unix_dgram_poll()

2015-10-07 Thread Jason Baron
The unix_dgram_poll() routine calls sock_poll_wait() not only for the wait
queue associated with the socket s that we are poll'ing against, but also calls
sock_poll_wait() for a remote peer socket p, if it is connected. Thus,
if we call poll()/select()/epoll() for the socket s, there are then
a couple of code paths in which the remote peer socket p and its associated
peer_wait queue can be freed before poll()/select()/epoll() have a chance
to remove themselves from the remote peer socket.

The ways that the remote peer socket can be freed are:

1. If s calls connect() to connect to a new socket other than p, it will
drop its reference on p, and thus a close() on p will free it.

2. If we call close() on p, then a subsequent sendmsg() from s will drop
the final reference to p, allowing it to be freed.

Address this issue, by reverting unix_dgram_poll() to only register with
the wait queue associated with s and register a callback with the remote peer
socket on connect() that will wake up the wait queue associated with s. If
scenarios 1 or 2 occur above we then simply remove the callback from the
remote peer. This then presents the expected semantics to poll()/select()/
epoll().

I've implemented this for sock-type, SOCK_RAW, SOCK_DGRAM, and SOCK_SEQPACKET
but not for SOCK_STREAM, since SOCK_STREAM does not use unix_dgram_poll().

Introduced in commit ec0d215f9420 ("af_unix: fix 'poll for write'/connected
DGRAM sockets").

Tested-by: Mathias Krause 
Signed-off-by: Jason Baron 
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c| 32 +++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 4a167b3..9698aff 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -62,6 +62,7 @@ struct unix_sock {
 #define UNIX_GC_CANDIDATE  0
 #define UNIX_GC_MAYBE_CYCLE 1
struct socket_wq peer_wq;
+   wait_queue_t wait;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 03ee4d3..f789423 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -420,6 +420,9 @@ static void unix_release_sock(struct sock *sk, int embrion)
skpair = unix_peer(sk);
 
if (skpair != NULL) {
+   if (sk->sk_type != SOCK_STREAM)
+   remove_wait_queue(&unix_sk(skpair)->peer_wait,
+ &u->wait);
if (sk->sk_type == SOCK_STREAM || sk->sk_type == 
SOCK_SEQPACKET) {
unix_state_lock(skpair);
/* No more writes */
@@ -636,6 +639,16 @@ static struct proto unix_proto = {
  */
 static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 
+static int peer_wake(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+   struct unix_sock *u;
+
+   u = container_of(wait, struct unix_sock, wait);
+   wake_up_interruptible_sync_poll(sk_sleep(&u->sk), key);
+
+   return 0;
+}
+
 static struct sock *unix_create1(struct net *net, struct socket *sock, int 
kern)
 {
struct sock *sk = NULL;
@@ -664,6 +677,7 @@ static struct sock *unix_create1(struct net *net, struct 
socket *sock, int kern)
INIT_LIST_HEAD(&u->link);
mutex_init(&u->readlock); /* single task reading lock */
init_waitqueue_head(&u->peer_wait);
+   init_waitqueue_func_entry(&u->wait, peer_wake);
unix_insert_socket(unix_sockets_unbound(sk), sk);
 out:
if (sk == NULL)
@@ -1030,7 +1044,11 @@ restart:
 */
if (unix_peer(sk)) {
struct sock *old_peer = unix_peer(sk);
+
+   remove_wait_queue(&unix_sk(old_peer)->peer_wait,
+ &unix_sk(sk)->wait);
unix_peer(sk) = other;
+   add_wait_queue(&unix_sk(other)->peer_wait, &unix_sk(sk)->wait);
unix_state_double_unlock(sk, other);
 
if (other != old_peer)
@@ -1038,8 +1056,12 @@ restart:
sock_put(old_peer);
} else {
unix_peer(sk) = other;
+   add_wait_queue(&unix_sk(other)->peer_wait, &unix_sk(sk)->wait);
unix_state_double_unlock(sk, other);
}
+   /* New remote may have created write space for us */
+   wake_up_interruptible_sync_poll(sk_sleep(sk),
+   POLLOUT | POLLWRNORM | POLLWRBAND);
return 0;
 
 out_unlock:
@@ -1194,6 +1216,8 @@ restart:
 
sock_hold(sk);
unix_peer(newsk)= sk;
+   if (sk->sk_type == SOCK_SEQPACKET)
+   add_wait_queue(&unix_sk(sk)->peer_wait, &unix_sk(newsk)->wait);
newsk->sk_state = TCP_ESTABLISHED;
newsk->sk_type  = sk->sk_type;
init_peercred(newsk);
@@ -1220,6 +1244,8 @@ restart:
 
smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
unix_peer(sk)   = newsk;
+   if (sk-

[PATCH v3 3/3] net: unix: optimize wakeups in unix_dgram_recvmsg()

2015-10-07 Thread Jason Baron
Now that connect() permanently registers a callback routine, we can induce
extra overhead in unix_dgram_recvmsg(), which unconditionally wakes up
its peer_wait queue on every receive. This patch makes the wakeup there
conditional on there being waiters.

Signed-off-by: Jason Baron 
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c| 85 ---
 2 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 6a4a345..cf21ffd 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -61,6 +61,7 @@ struct unix_sock {
unsigned long   flags;
 #define UNIX_GC_CANDIDATE  0
 #define UNIX_GC_MAYBE_CYCLE 1
+#define UNIX_NOSPACE   2
struct socket_wq peer_wq;
wait_queue_t wait;
 };
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index f789423..66979d4 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -326,7 +326,7 @@ found:
return s;
 }
 
-static inline int unix_writable(struct sock *sk)
+static inline bool unix_writable(struct sock *sk)
 {
return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 }
@@ -1079,6 +1079,12 @@ static long unix_wait_for_peer(struct sock *other, long 
timeo)
 
prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
 
+   set_bit(UNIX_NOSPACE, &u->flags);
+   /* Ensure that we either see space in the peer sk_receive_queue via the
+* unix_recvq_full() check below, or we receive a wakeup when it
+* empties. Pairs with the mb in unix_dgram_recvmsg().
+*/
+   smp_mb__after_atomic();
sched = !sock_flag(other, SOCK_DEAD) &&
!(other->sk_shutdown & RCV_SHUTDOWN) &&
unix_recvq_full(other);
@@ -1623,17 +1629,27 @@ restart:
 
if (unix_peer(other) != sk && unix_recvq_full(other)) {
if (!timeo) {
-   err = -EAGAIN;
-   goto out_unlock;
-   }
+   set_bit(UNIX_NOSPACE, &unix_sk(other)->flags);
+   /* Ensure that we either see space in the peer
+* sk_receive_queue via the unix_recvq_full() check
+* below, or we receive a wakeup when it empties. This
+* makes sure that epoll ET triggers correctly. Pairs
+* with the mb in unix_dgram_recvmsg().
+*/
+   smp_mb__after_atomic();
+   if (unix_recvq_full(other)) {
+   err = -EAGAIN;
+   goto out_unlock;
+   }
+   } else {
+   timeo = unix_wait_for_peer(other, timeo);
 
-   timeo = unix_wait_for_peer(other, timeo);
+   err = sock_intr_errno(timeo);
+   if (signal_pending(current))
+   goto out_free;
 
-   err = sock_intr_errno(timeo);
-   if (signal_pending(current))
-   goto out_free;
-
-   goto restart;
+   goto restart;
+   }
}
 
if (sock_flag(other, SOCK_RCVTSTAMP))
@@ -1939,8 +1955,19 @@ static int unix_dgram_recvmsg(struct socket *sock, 
struct msghdr *msg,
goto out_unlock;
}
 
-   wake_up_interruptible_sync_poll(&u->peer_wait,
-   POLLOUT | POLLWRNORM | POLLWRBAND);
+   /* Ensure that waiters on our sk->sk_receive_queue draining that check
+* via unix_recvq_full() either see space in the queue or get a wakeup
+* below. sk->sk_receive_queue is reduced by the __skb_recv_datagram()
+* call above. Pairs with the mb in unix_dgram_sendmsg(),
+*unix_dgram_poll(), and unix_wait_for_peer().
+*/
+   smp_mb();
+   if (test_bit(UNIX_NOSPACE, &u->flags)) {
+   clear_bit(UNIX_NOSPACE, &u->flags);
+   wake_up_interruptible_sync_poll(&u->peer_wait,
+   POLLOUT | POLLWRNORM |
+   POLLWRBAND);
+   }
 
if (msg->msg_name)
unix_copy_addr(msg, skb->sk);
@@ -2432,11 +2459,19 @@ static unsigned int unix_poll(struct file *file, struct 
socket *sock, poll_table
return mask;
 }
 
+static bool unix_dgram_writable(struct sock *sk, struct sock *other)
+{
+   if (other && unix_peer(other) != sk && unix_recvq_full(other))
+   return false;
+
+   return unix_writable(sk);
+}
+
 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
poll_table *wait)
 {
struct sock *sk = sock->sk, *other;
-   unsigned int mask, writable;
+   unsigned int mask;
 
sock_poll_wait(file, sk_sle

[PATCH v3 2/3] net: unix: Convert gc_flags to flags

2015-10-07 Thread Jason Baron
Convert gc_flags to flags in preparation for the subsequent patch, which will
make use of a flag bit for a non-gc purpose.

Signed-off-by: Jason Baron 
---
 include/net/af_unix.h |  2 +-
 net/unix/garbage.c| 12 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 9698aff..6a4a345 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -58,7 +58,7 @@ struct unix_sock {
atomic_long_t   inflight;
spinlock_t  lock;
unsigned char   recursion_level;
-   unsigned long   gc_flags;
+   unsigned long   flags;
 #define UNIX_GC_CANDIDATE  0
 #define UNIX_GC_MAYBE_CYCLE 1
struct socket_wq peer_wq;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a73a226..39794d9 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -179,7 +179,7 @@ static void scan_inflight(struct sock *x, void 
(*func)(struct unix_sock *),
 * have been added to the queues after
 * starting the garbage collection
 */
-   if (test_bit(UNIX_GC_CANDIDATE, 
&u->gc_flags)) {
+   if (test_bit(UNIX_GC_CANDIDATE, 
&u->flags)) {
hit = true;
 
func(u);
@@ -246,7 +246,7 @@ static void inc_inflight_move_tail(struct unix_sock *u)
 * of the list, so that it's checked even if it was already
 * passed over
 */
-   if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
+   if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->flags))
list_move_tail(&u->link, &gc_candidates);
 }
 
@@ -305,8 +305,8 @@ void unix_gc(void)
BUG_ON(total_refs < inflight_refs);
if (total_refs == inflight_refs) {
list_move_tail(&u->link, &gc_candidates);
-   __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
-   __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+   __set_bit(UNIX_GC_CANDIDATE, &u->flags);
+   __set_bit(UNIX_GC_MAYBE_CYCLE, &u->flags);
}
}
 
@@ -332,7 +332,7 @@ void unix_gc(void)
 
if (atomic_long_read(&u->inflight) > 0) {
list_move_tail(&u->link, &not_cycle_list);
-   __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+   __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->flags);
scan_children(&u->sk, inc_inflight_move_tail, NULL);
}
}
@@ -343,7 +343,7 @@ void unix_gc(void)
 */
while (!list_empty(&not_cycle_list)) {
u = list_entry(not_cycle_list.next, struct unix_sock, link);
-   __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+   __clear_bit(UNIX_GC_CANDIDATE, &u->flags);
list_move_tail(&u->link, &gc_inflight_list);
}
 
-- 
2.6.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/2] bpf: enable non-root eBPF programs

2015-10-07 Thread Alexei Starovoitov

On 10/5/15 1:48 PM, Alexei Starovoitov wrote:

Unprivileged socket filter bpf programs have access to the
following helper functions:
- map lookup/update/delete (but they cannot store kernel pointers into them)
- get_random (it's already exposed to unprivileged user space)
- get_smp_processor_id
- tail_call into another socket filter program
- ktime_get_ns
- bpf_trace_printk (for debugging)


while reviewing everything for Nth time realized that
bpf_trace_printk is useless for unprivileged users, since
trace_pipe is root only.
So going to drop it in V2.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 4/4] sh_eth: Remove obsolete r8a777x-ether platform_device_id entry

2015-10-07 Thread Simon Horman
On Wed, Oct 07, 2015 at 10:14:17AM +0200, Geert Uytterhoeven wrote:
> Since commit 3d7608e4c169af03 ("ARM: shmobile: bockw: remove legacy
> board file and config"), R-Car Gen1 SoCs are only supported in generic
> DT-only ARM multi-platform builds.  The driver doesn't need to match
> platform devices by name anymore, hence remove the corresponding
> platform_device_id entry.
> 
> Protect sh_eth_set_rate_r8a777x() and r8a777x_data by #ifdef CONFIG_OF,
> as they're now referenced on DT platforms only.
> 
> Signed-off-by: Geert Uytterhoeven 

Acked-by: Simon Horman 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 3/4] sh_eth: Remove obsolete r8a7740-gether platform_device_id entry

2015-10-07 Thread Simon Horman
On Wed, Oct 07, 2015 at 10:14:16AM +0200, Geert Uytterhoeven wrote:
> Since commit 1fa59bda21c7fa36 ("ARM: shmobile: Remove legacy board code
> for Armadillo-800 EVA"), r8a7740 is only supported in generic DT-only
> ARM multi-platform builds.  The driver doesn't need to match platform
> devices by name anymore, hence remove the corresponding
> platform_device_id entry.
> 
> Protect r8a7740_data by #ifdef CONFIG_OF as it's now referenced on DT
> platforms only. Move it to a more logical position, in front of the
> r8a777x support, so we can have a single #ifdef covering all r7s* and
> r8a* support soon. This requires moving a few helper functions, too.
> 
> Signed-off-by: Geert Uytterhoeven 

Acked-by: Simon Horman 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 4/6] net: dsa: add port_fdb_prepare

2015-10-07 Thread Andrew Lunn
On Wed, Oct 07, 2015 at 07:48:29PM -0400, Vivien Didelot wrote:
> Push the prepare phase for FDB operations down to the DSA drivers, with
> a new port_fdb_prepare function. Currently only mv88e6xxx is affected.
> 
> Signed-off-by: Vivien Didelot 
> ---
>  drivers/net/dsa/mv88e6171.c |  1 +
>  drivers/net/dsa/mv88e6352.c |  1 +
>  drivers/net/dsa/mv88e6xxx.c | 10 ++
>  drivers/net/dsa/mv88e6xxx.h |  3 +++
>  include/net/dsa.h   |  4 
>  net/dsa/slave.c |  7 +--
>  6 files changed, 24 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
> index c95cfab..ca3330a 100644
> --- a/drivers/net/dsa/mv88e6171.c
> +++ b/drivers/net/dsa/mv88e6171.c
> @@ -121,6 +121,7 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
>   .port_vlan_add  = mv88e6xxx_port_vlan_add,
>   .port_vlan_del  = mv88e6xxx_port_vlan_del,
>   .vlan_getnext   = mv88e6xxx_vlan_getnext,
> + .port_fdb_prepare   = mv88e6xxx_port_fdb_prepare,

Hi Vivien

Bike shedding a bit, but i would call this
mv88e6xxx_port_fdb_prepare_add.

>   .port_fdb_add   = mv88e6xxx_port_fdb_add,
>   .port_fdb_del   = mv88e6xxx_port_fdb_del,
>   .port_fdb_getnext   = mv88e6xxx_port_fdb_getnext,

Taking a theoretical example, say mv88e6xxx_port_fdb_getnext needed a
prepare call to allocate memory to put the returned ATU into. What
would you call that?

mv88e6xxx_port_fdb_prepare_add and mv88e6xxx_port_fdb_prepare_getnext
just seems unambiguous and future proof.

 Andrew
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] Convert smsc911x to use ACPI as well as DT

2015-10-07 Thread Rafael J. Wysocki

On 10/6/2015 1:08 PM, David Woodhouse wrote:

On Mon, 2015-10-05 at 17:20 -0700, Charles Garcia-Tobin wrote:

it in ACPI circles
unless we had wider agreement among OSs to use it. AFAIK PRP1 has not
actually been approved yet in the specification forum, and that it in
itself is more of a concern for me,as the code has been pushed upstream.

Why would that be a concern? In that context it's just one device ID.
Individual devices don't *need* to be approved. OK, the 'PRP' vendor
prefix is not officially assigned but that's really a trivial piece of
bureaucracy.


I guess it's up to Catalin, but disabling for ARM seems like a good idea
right now, another option is to add tests to FWTS.

I understand the motivation to avoid embracing a whole bunch of crappy
bindings. But I think that eschewing PRP0001 is the wrong technical
approach to achieving that.

It has false negatives — as soon as you have a *single* existing DT
binding, perhaps something as simple as the serial port bindings from
the CHRP days, you'll be in a situation where you can't use that.
I've *got* hardware where I need to advertise a serial port with a
clock-frequency property because it *isn't* compatible with PNP0501.

And it has false positives — there's nothing to prevent people from
doing ACPI-style bindings with crappy device bindings which also aren't
approved.

I think it's utterly naïve to believe that simply avoiding the use of
PRP0001 + compatible for matching is going to have *any* significant
beneficial effect whatsoever. It only makes life harder for all
concerned.

Perhaps a better approach would be to introduce something like
CONFIG_UNAPPROVED_BINDINGS (which can't be set on ARM64), and those
drivers which use bindings that *aren't* approved by Catalin's crack
team of reviewers need to depend on !UNAPPROVED_BINDINGS. To be honest,
I still think even *that* is somewhat naïve, but it's still a better
way of implementing what you're actually trying to achieve, however
optimistic you have to be to think it'll ever work in practice.



Also, let me mention one case PRP0001 is intended for that seems to be real.

Say there is a piece of hardware that becomes so popular that the 
majority of vendors include it in their SoCs.  Now all of them have to 
allocate ACPI/PNP device IDs for that and (without PRP0001) all of those 
device IDs need to go into the driver for that thing to make it work on 
all of the SoCs in question.  As a result, the mainline kernel doesn't 
work on new SoCs with ACPI without modifications, although it works on 
all of them with DT in principle.  Moreover, unmodified binary distro 
kernels don't work on new SoCs with ACPI, although they may work just 
fine on them with DT (that sort of seems to be a case that Red Hat may 
be interested in for one example, but I may be wrong here).


In theory, that may be addressed by allocating a "master" ACPI/PNP 
device ID for that device and listing it in the device's _CID on all of 
the SoCs in question, but then it isn't particularly clear who's going 
to be responsible for allocating that device ID and "propagating" the 
knowledge of it to everybody interested to make things consistent.  
[Side note: Using a different vendor's device ID in a _CID is quite 
questionable, so the "master" one would need to be something "vendor 
agnostic" if that makes sense at all ...]


Let alone the fact that PRP0001 is actually quite useful at the 
prototyping stage when it is premature to allocate a new device ID just 
yet.  Taking that to the extreme, if someone whittles a board in his or 
her garage and wants to use it to drive their homemade grass watering 
system, having to invent a new device ID and put it into the existing 
driver that otherwise doesn't require any modifications is ... you know 
what I mean.


All in all, I'd recommend some caution to ensure that the baby is not 
going away along with the bath water here.


Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2 1/5] net: move net_get_random_once to lib

2015-10-07 Thread Alexei Starovoitov

On 10/7/15 4:20 PM, Daniel Borkmann wrote:

From: Hannes Frederic Sowa

There's no good reason why users outside of networking should not
be using this facility, f.e. for initializing their seeds.

Therefore, make it accessible from there as get_random_once().

Signed-off-by: Hannes Frederic Sowa
Signed-off-by: Daniel Borkmann


Acked-by: Alexei Starovoitov 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2 3/5] random32: add prandom_seed_full_state helper

2015-10-07 Thread Alexei Starovoitov

On 10/7/15 4:20 PM, Daniel Borkmann wrote:

Factor out the full reseed handling code that populates the state
through get_random_bytes() and runs prandom_warmup(). The resulting
prandom_seed_full_state() will be used later on in more than the
current __prandom_reseed() user. Fix also two minor whitespace
issues along the way.

Signed-off-by: Daniel Borkmann
Acked-by: Hannes Frederic Sowa


Acked-by: Alexei Starovoitov 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2 4/5] random32: add prandom_init_once helper for own rngs

2015-10-07 Thread Alexei Starovoitov

On 10/7/15 4:20 PM, Daniel Borkmann wrote:

Add a prandom_init_once() facility that works on the rnd_state, so that
users that are keeping their own state independent from prandom_u32() can
initialize their taus113 per cpu states.

The motivation here is similar to net_get_random_once(): initialize the
state as late as possible in the hope that enough entropy has been
collected for the seeding. prandom_init_once() makes use of the recently
introduced prandom_seed_full_state() helper and is generic enough so that
it could also be used on fast-paths due to the DO_ONCE().

Signed-off-by: Daniel Borkmann
Acked-by: Hannes Frederic Sowa


Acked-by: Alexei Starovoitov 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2 2/5] once: make helper generic for calling functions once

2015-10-07 Thread Alexei Starovoitov

On 10/7/15 4:20 PM, Daniel Borkmann wrote:

From: Hannes Frederic Sowa

Make the get_random_once() helper generic enough, so that functions
in general would only be called once, where one user of this is then
net_get_random_once().

The only implementation specific call is to get_random_bytes(), all
the rest of this *_once() facility would be duplicated among different
subsystems otherwise. The new DO_ONCE() helper will be used by prandom()
later on, but might also be useful for other scenarios/subsystems as
well where a one-time initialization in often-called, possibly fast
path code could occur.

Signed-off-by: Hannes Frederic Sowa
Signed-off-by: Daniel Borkmann


nice! you even added sparse tags.
Acked-by: Alexei Starovoitov 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 2/6] net: dsa: include dsa.h in dsa_priv.h

2015-10-07 Thread Vivien Didelot
dsa_priv.h uses dsa specific structures, as well as the files using it,
so include dsa.h here.

Signed-off-by: Vivien Didelot 
---
 net/dsa/dsa_priv.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 311796c8..4522f47 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct dsa_device_ops {
struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
-- 
2.6.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 0/6] net: dsa: push switchdev prepare phase in FDB ops

2015-10-07 Thread Vivien Didelot
This patchset pushes the switchdev prepare phase for the FDB add and del
operations down to the DSA drivers. Currently only mv88e6xxx is affected.

Since the dump requires a bit of refactoring in the driver, it'll come in a
future patchset.

The first 3 patches remove the dsa.h include from linux/netdevice.h, which
broke the inclusion of switchdev.h in dsa.h.

The last 3 patches add port_fdb_prepare and change port_fdb_add and
port_fdb_del to use the switchdev FDB object structure.

To be more specific about the include dependency issue, here's a snippet of
what happens currently if you include switchdev.h in dsa.h:

[...]
include/net/switchdev.h:52:30: error: field ‘ppid’ has incomplete type
   struct netdev_phys_item_id ppid; /* PORT_PARENT_ID */
  ^
include/net/switchdev.h:185:14: warning: ‘struct nlmsghdr’ declared inside 
parameter list [enabled by default]
   struct nlmsghdr *nlh, u16 flags);
  ^
include/net/switchdev.h:195:7: warning: ‘struct ndmsg’ declared inside 
parameter list [enabled by default]
include/net/switchdev.h:198:7: warning: ‘struct nlattr’ declared inside 
parameter list [enabled by default]
   u16 vid);
   ^
include/net/switchdev.h:201:15: warning: ‘struct netlink_callback’ declared 
inside parameter list [enabled by default]
struct net_device *filter_dev, int idx);
   ^
[...]

Removing the dsa.h include from linux/netdevice.h gets rid of these errors but
then the DSA code complains if you don't include it in dsa_priv.h:

[...]
net/dsa/slave.c: In function ‘dsa_slave_set_mac_address’:
net/dsa/slave.c:178:39: error: dereferencing pointer to incomplete type
  struct net_device *master = p->parent->dst->master_netdev;
   ^
In file included from include/linux/list.h:8:0,
 from net/dsa/slave.c:11:
net/dsa/slave.c: In function ‘dsa_bridge_check_vlan_range’:
net/dsa/slave.c:209:26: error: ‘DSA_MAX_PORTS’ undeclared (first use in this 
function)
  DECLARE_BITMAP(members, DSA_MAX_PORTS);
  ^
net/dsa/slave.c:209:26: note: each undeclared identifier is reported only once 
for each function it appears in
  DECLARE_BITMAP(members, DSA_MAX_PORTS);
  ^
include/linux/kernel.h:67:30: note: in definition of macro ‘DIV_ROUND_UP’
 #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
  ^
include/linux/types.h:10:21: note: in expansion of macro ‘BITS_TO_LONGS’
  unsigned long name[BITS_TO_LONGS(bits)]
 ^
net/dsa/slave.c:209:2: note: in expansion of macro ‘DECLARE_BITMAP’
  DECLARE_BITMAP(members, DSA_MAX_PORTS);
  ^
net/dsa/slave.c:1190:7: error: ‘DSA_TAG_PROTO_EDSA’ undeclared (first use in 
this function)
  case DSA_TAG_PROTO_EDSA:
   ^
net/dsa/slave.c: In function ‘dsa_slave_get_iflink’:
net/dsa/slave.c:64:1: warning: control reaches end of non-void function 
[-Wreturn-type]
 }
 ^
[...]


Thanks,
-v

Vivien Didelot (6):
  net: dsa: add uses_hw_tag
  net: dsa: include dsa.h in dsa_priv.h
  net: remove dsa.h include from linux/netdevice.h
  net: dsa: add port_fdb_prepare
  net: dsa: push prepare phase in port_fdb_add
  net: dsa: use switchdev obj in port_fdb_del

 drivers/net/dsa/mv88e6171.c |  1 +
 drivers/net/dsa/mv88e6352.c |  1 +
 drivers/net/dsa/mv88e6xxx.c | 23 +--
 drivers/net/dsa/mv88e6xxx.h |  8 ++--
 include/linux/netdevice.h   |  9 ++---
 include/net/dsa.h   | 14 +++---
 net/dsa/dsa.c   |  1 +
 net/dsa/dsa_priv.h  |  1 +
 net/dsa/slave.c | 11 +++
 9 files changed, 47 insertions(+), 22 deletions(-)

-- 
2.6.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 3/6] net: remove dsa.h include from linux/netdevice.h

2015-10-07 Thread Vivien Didelot
Forward declare struct dsa_switch_tree in netdevice.h instead of
including the dsa.h header.

Signed-off-by: Vivien Didelot 
---
 include/linux/netdevice.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 73f0510..d0bcabb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -41,7 +41,6 @@
 
 #include 
 #include 
-#include 
 #ifdef CONFIG_DCB
 #include 
 #endif
@@ -60,6 +59,8 @@ struct wireless_dev;
 /* 802.15.4 specific */
 struct wpan_dev;
 struct mpls_dev;
+/* DSA specific */
+struct dsa_switch_tree;
 
 void netdev_set_default_ethtool_ops(struct net_device *dev,
const struct ethtool_ops *ops);
-- 
2.6.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 1/6] net: dsa: add uses_hw_tag

2015-10-07 Thread Vivien Didelot
Instead of checking that the dsa_switch_tree rcv pointer is not NULL,
add a uses_hw_tag boolean to net_device to make explicit whether it uses a
hardware-inserted tag or not.

Signed-off-by: Vivien Didelot 
---
 include/linux/netdevice.h | 6 --
 include/net/dsa.h | 5 -
 net/dsa/dsa.c | 1 +
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b337440..73f0510 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1422,6 +1422,7 @@ enum netdev_priv_flags {
  * @allmulti:  Counter, enables or disables allmulticast mode
  *
  * @vlan_info: VLAN info
+ * @uses_hw_tag:   Whether the device uses hardware inserted tag or not
  * @dsa_ptr:   dsa specific data
  * @tipc_ptr:  TIPC specific data
  * @atalk_ptr: AppleTalk link
@@ -1640,6 +1641,7 @@ struct net_device {
struct vlan_info __rcu  *vlan_info;
 #endif
 #if IS_ENABLED(CONFIG_NET_DSA)
+   bool uses_hw_tag;
struct dsa_switch_tree  *dsa_ptr;
 #endif
 #if IS_ENABLED(CONFIG_TIPC)
@@ -1891,8 +1893,8 @@ void dev_net_set(struct net_device *dev, struct net *net)
 static inline bool netdev_uses_dsa(struct net_device *dev)
 {
 #if IS_ENABLED(CONFIG_NET_DSA)
-   if (dev->dsa_ptr != NULL)
-   return dsa_uses_tagged_protocol(dev->dsa_ptr);
+   if (dev->uses_hw_tag)
+   return true;
 #endif
return false;
 }
diff --git a/include/net/dsa.h b/include/net/dsa.h
index b34d812..3e9eb6c 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -333,9 +333,4 @@ static inline void *ds_to_priv(struct dsa_switch *ds)
 {
return (void *)(ds + 1);
 }
-
-static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst)
-{
-   return dst->rcv != NULL;
-}
 #endif
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index aa398bc..51b3815 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -834,6 +834,7 @@ static void dsa_setup_dst(struct dsa_switch_tree *dst, 
struct net_device *dev,
 */
wmb();
dev->dsa_ptr = (void *)dst;
+   dev->uses_hw_tag = dst->tag_protocol != DSA_TAG_PROTO_NONE;
 
if (dst->link_poll_needed) {
INIT_WORK(&dst->link_poll_work, dsa_link_poll_work);
-- 
2.6.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 5/6] net: dsa: push prepare phase in port_fdb_add

2015-10-07 Thread Vivien Didelot
Now that the prepare phase is pushed down to the DSA drivers, propagate
it to the port_fdb_add function.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx.c | 7 ---
 drivers/net/dsa/mv88e6xxx.h | 3 ++-
 include/net/dsa.h   | 3 ++-
 net/dsa/slave.c | 2 +-
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 9fbb727..916c98e 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1852,16 +1852,17 @@ int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, 
int port,
 }
 
 int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
-  const unsigned char *addr, u16 vid)
+  const struct switchdev_obj_port_fdb *fdb,
+  struct switchdev_trans *trans)
 {
-   int state = is_multicast_ether_addr(addr) ?
+   int state = is_multicast_ether_addr(fdb->addr) ?
GLOBAL_ATU_DATA_STATE_MC_STATIC :
GLOBAL_ATU_DATA_STATE_UC_STATIC;
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
int ret;
 
mutex_lock(&ps->smi_mutex);
-   ret = _mv88e6xxx_port_fdb_load(ds, port, addr, vid, state);
+   ret = _mv88e6xxx_port_fdb_load(ds, port, fdb->addr, fdb->vid, state);
mutex_unlock(&ps->smi_mutex);
 
return ret;
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 4475640..e688bee 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -483,7 +483,8 @@ int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int 
port,
   const struct switchdev_obj_port_fdb *fdb,
   struct switchdev_trans *trans);
 int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
-  const unsigned char *addr, u16 vid);
+  const struct switchdev_obj_port_fdb *fdb,
+  struct switchdev_trans *trans);
 int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
   const unsigned char *addr, u16 vid);
 int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 3aee8a5..2d86d31 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -321,7 +321,8 @@ struct dsa_switch_driver {
const struct switchdev_obj_port_fdb *fdb,
struct switchdev_trans *trans);
int (*port_fdb_add)(struct dsa_switch *ds, int port,
-   const unsigned char *addr, u16 vid);
+   const struct switchdev_obj_port_fdb *fdb,
+   struct switchdev_trans *trans);
int (*port_fdb_del)(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid);
int (*port_fdb_getnext)(struct dsa_switch *ds, int port,
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 48e8c15..6f7f27e 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -354,7 +354,7 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
if (switchdev_trans_ph_prepare(trans))
ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans);
else
-   ret = ds->drv->port_fdb_add(ds, p->port, fdb->addr, fdb->vid);
+   ret = ds->drv->port_fdb_add(ds, p->port, fdb, trans);
 
return ret;
 }
-- 
2.6.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 4/6] net: dsa: add port_fdb_prepare

2015-10-07 Thread Vivien Didelot
Push the prepare phase for FDB operations down to the DSA drivers, with
a new port_fdb_prepare function. Currently only mv88e6xxx is affected.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6171.c |  1 +
 drivers/net/dsa/mv88e6352.c |  1 +
 drivers/net/dsa/mv88e6xxx.c | 10 ++
 drivers/net/dsa/mv88e6xxx.h |  3 +++
 include/net/dsa.h   |  4 
 net/dsa/slave.c |  7 +--
 6 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index c95cfab..ca3330a 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -121,6 +121,7 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
.port_vlan_add  = mv88e6xxx_port_vlan_add,
.port_vlan_del  = mv88e6xxx_port_vlan_del,
.vlan_getnext   = mv88e6xxx_vlan_getnext,
+   .port_fdb_prepare   = mv88e6xxx_port_fdb_prepare,
.port_fdb_add   = mv88e6xxx_port_fdb_add,
.port_fdb_del   = mv88e6xxx_port_fdb_del,
.port_fdb_getnext   = mv88e6xxx_port_fdb_getnext,
diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
index 3736706..078a358 100644
--- a/drivers/net/dsa/mv88e6352.c
+++ b/drivers/net/dsa/mv88e6352.c
@@ -348,6 +348,7 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
.port_vlan_add  = mv88e6xxx_port_vlan_add,
.port_vlan_del  = mv88e6xxx_port_vlan_del,
.vlan_getnext   = mv88e6xxx_vlan_getnext,
+   .port_fdb_prepare   = mv88e6xxx_port_fdb_prepare,
.port_fdb_add   = mv88e6xxx_port_fdb_add,
.port_fdb_del   = mv88e6xxx_port_fdb_del,
.port_fdb_getnext   = mv88e6xxx_port_fdb_getnext,
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 6053d11..9fbb727 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1841,6 +1841,16 @@ static int _mv88e6xxx_port_fdb_load(struct dsa_switch 
*ds, int port,
return _mv88e6xxx_atu_load(ds, &entry);
 }
 
+int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int port,
+  const struct switchdev_obj_port_fdb *fdb,
+  struct switchdev_trans *trans)
+{
+   /* We don't need any dynamic resource from the kernel (yet),
+* so skip the prepare phase.
+*/
+   return 0;
+}
+
 int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
   const unsigned char *addr, u16 vid)
 {
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index 39b261f..4475640 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -479,6 +479,9 @@ int mv88e6xxx_port_vlan_add(struct dsa_switch *ds, int 
port, u16 vid,
 int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, u16 vid);
 int mv88e6xxx_vlan_getnext(struct dsa_switch *ds, u16 *vid,
   unsigned long *ports, unsigned long *untagged);
+int mv88e6xxx_port_fdb_prepare(struct dsa_switch *ds, int port,
+  const struct switchdev_obj_port_fdb *fdb,
+  struct switchdev_trans *trans);
 int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
   const unsigned char *addr, u16 vid);
 int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 3e9eb6c..3aee8a5 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 enum dsa_tag_protocol {
DSA_TAG_PROTO_NONE = 0,
@@ -316,6 +317,9 @@ struct dsa_switch_driver {
/*
 * Forwarding database
 */
+   int (*port_fdb_prepare)(struct dsa_switch *ds, int port,
+   const struct switchdev_obj_port_fdb *fdb,
+   struct switchdev_trans *trans);
int (*port_fdb_add)(struct dsa_switch *ds, int port,
const unsigned char *addr, u16 vid);
int (*port_fdb_del)(struct dsa_switch *ds, int port,
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 4f607bc..48e8c15 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -346,10 +346,13 @@ static int dsa_slave_port_fdb_add(struct net_device *dev,
 {
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
-   int ret = -EOPNOTSUPP;
+   int ret;
+
+   if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add)
+   return -EOPNOTSUPP;
 
if (switchdev_trans_ph_prepare(trans))
-   ret = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP;
+   ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans);
else
ret = ds->drv->port_fdb_add(ds, p->port, fdb->addr, fdb->vid);
 
-- 
2.6.0

--
To unsubscribe from this list: send the line "unsubscribe netde

[PATCH net-next 6/6] net: dsa: use switchdev obj in port_fdb_del

2015-10-07 Thread Vivien Didelot
For consistency with the FDB add operation, propagate the
switchdev_obj_port_fdb structure in the DSA drivers.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx.c | 4 ++--
 drivers/net/dsa/mv88e6xxx.h | 2 +-
 include/net/dsa.h   | 2 +-
 net/dsa/slave.c | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 916c98e..f73c953 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1869,13 +1869,13 @@ int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int 
port,
 }
 
 int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
-  const unsigned char *addr, u16 vid)
+  const struct switchdev_obj_port_fdb *fdb)
 {
struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
int ret;
 
mutex_lock(&ps->smi_mutex);
-   ret = _mv88e6xxx_port_fdb_load(ds, port, addr, vid,
+   ret = _mv88e6xxx_port_fdb_load(ds, port, fdb->addr, fdb->vid,
   GLOBAL_ATU_DATA_STATE_UNUSED);
mutex_unlock(&ps->smi_mutex);
 
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index e688bee..1347a73 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -486,7 +486,7 @@ int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port,
   const struct switchdev_obj_port_fdb *fdb,
   struct switchdev_trans *trans);
 int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port,
-  const unsigned char *addr, u16 vid);
+  const struct switchdev_obj_port_fdb *fdb);
 int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port,
   unsigned char *addr, u16 *vid, bool *is_static);
 int mv88e6xxx_phy_page_read(struct dsa_switch *ds, int port, int page, int 
reg);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 2d86d31..b802dab 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -324,7 +324,7 @@ struct dsa_switch_driver {
const struct switchdev_obj_port_fdb *fdb,
struct switchdev_trans *trans);
int (*port_fdb_del)(struct dsa_switch *ds, int port,
-   const unsigned char *addr, u16 vid);
+   const struct switchdev_obj_port_fdb *fdb);
int (*port_fdb_getnext)(struct dsa_switch *ds, int port,
unsigned char *addr, u16 *vid,
bool *is_static);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 6f7f27e..bb2bd3b 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -367,7 +367,7 @@ static int dsa_slave_port_fdb_del(struct net_device *dev,
int ret = -EOPNOTSUPP;
 
if (ds->drv->port_fdb_del)
-   ret = ds->drv->port_fdb_del(ds, p->port, fdb->addr, fdb->vid);
+   ret = ds->drv->port_fdb_del(ds, p->port, fdb);
 
return ret;
 }
-- 
2.6.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/2] bpf: enable non-root eBPF programs

2015-10-07 Thread Alexei Starovoitov

On 10/7/15 3:22 PM, Kees Cook wrote:

Yes, I agree with you that there would be a CVE regardless. I still
>like the option of configurable access, not a big fan of the sysctl
>either. Thinking out loudly, what about a Kconfig option? We started
>out like this on bpf(2) itself (initially under expert settings, now
>afaik not anymore), and depending on usage scenarios, a requirement
>could be to have immutable cap_sys_admin-only, for other use-cases a
>requirement on the kernel might instead be to have unprivileged users
>as well.

It'd be nice to have it just be a Kconfig, but this shoots
distro-users in the foot if a distro decides to include unpriv bpf and
the user doesn't want it. I think it's probably a good idea to keep
the sysctl.


I don't like introducing Kconfig for no clear reason. It only adds
to the testing matrix and makes it harder to hack around.
Paranoid distros can disable bpf via single config already,
there is no reason to go more fine grained here.
Unpriv checks add a minimal amount of code, so even for tinification
purposes there is no need to chop off a few bytes. Tiny kernels would
disable bpf altogether.

As far as sysctl we can look at two with similar purpose:
sysctl_perf_event_paranoid and modules_disabled.
First one is indeed multi level, but not because of the fear of bugs,
but because of real security implications. Like raw events on
hyperthreaded cpu or uncore events can extract data from other
user processes. So it controls these extra privileges.
For bpf there are no hw implications to deal with.
If we make seccomp+bpf in the future it shouldn't need another knob
or extra bit. There are no extra privileges to grant, so not needed.

modules_disabled is off by default and can be toggled on once.
I think for paranoid distro users that "don't want bpf" that is
the better model.
So I'm thinking to do sysctl_unprivileged_bpf_disabled that will be
0=off by default (meaning that users can load unpriv socket filter
programs and seccomp in the future) and that can be switched
to 1=on once and stay that way until reboot.
I think that's the best balance that avoids adding checks to all
apps that want to use bpf and admins can still act on it.
From the app's point of view it's no different than if the bpf syscall
were not compiled in. So a single feature test for the bpf syscall will
be enough.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Patch net] sch_hhf: fix return value of hhf_drop()

2015-10-07 Thread Cong Wang
Similar to commit c0afd9ce4d6a ("fq_codel: fix return value of fq_codel_drop()"),
->drop() is supposed to return the number of bytes it dropped,
but hhf_drop() returns the id of the bucket from which it dropped
a packet.

Cc: Jamal Hadi Salim 
Cc: Terry Lam 
Signed-off-by: Cong Wang 
Signed-off-by: Cong Wang 
---
 net/sched/sch_hhf.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 9d15cb6..86b04e3 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -368,6 +368,15 @@ static unsigned int hhf_drop(struct Qdisc *sch)
return bucket - q->buckets;
 }
 
+static unsigned int hhf_qdisc_drop(struct Qdisc *sch)
+{
+   unsigned int prev_backlog;
+
+   prev_backlog = sch->qstats.backlog;
+   hhf_drop(sch);
+   return prev_backlog - sch->qstats.backlog;
+}
+
 static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
struct hhf_sched_data *q = qdisc_priv(sch);
@@ -696,7 +705,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {
.enqueue=   hhf_enqueue,
.dequeue=   hhf_dequeue,
.peek   =   qdisc_peek_dequeued,
-   .drop   =   hhf_drop,
+   .drop   =   hhf_qdisc_drop,
.init   =   hhf_init,
.reset  =   hhf_reset,
.destroy=   hhf_destroy,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 3/5] random32: add prandom_seed_full_state helper

2015-10-07 Thread Daniel Borkmann
Factor out the full reseed handling code that populates the state
through get_random_bytes() and runs prandom_warmup(). The resulting
prandom_seed_full_state() will be used later on in more than the
current __prandom_reseed() user. Fix also two minor whitespace
issues along the way.

Signed-off-by: Daniel Borkmann 
Acked-by: Hannes Frederic Sowa 
---
 lib/random32.c | 37 +
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/lib/random32.c b/lib/random32.c
index 0bee183..36c09fb 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -181,7 +181,7 @@ void prandom_seed(u32 entropy)
 * No locking on the CPUs, but then somewhat random results are, well,
 * expected.
 */
-   for_each_possible_cpu (i) {
+   for_each_possible_cpu(i) {
struct rnd_state *state = &per_cpu(net_rand_state, i);
 
state->s1 = __seed(state->s1 ^ entropy, 2U);
@@ -201,7 +201,7 @@ static int __init prandom_init(void)
prandom_state_selftest();
 
for_each_possible_cpu(i) {
-   struct rnd_state *state = &per_cpu(net_rand_state,i);
+   struct rnd_state *state = &per_cpu(net_rand_state, i);
u32 weak_seed = (i + jiffies) ^ random_get_entropy();
 
prandom_seed_early(state, weak_seed, true);
@@ -238,13 +238,30 @@ static void __init __prandom_start_seed_timer(void)
add_timer(&seed_timer);
 }
 
+static void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
+{
+   int i;
+
+   for_each_possible_cpu(i) {
+   struct rnd_state *state = per_cpu_ptr(pcpu_state, i);
+   u32 seeds[4];
+
+   get_random_bytes(&seeds, sizeof(seeds));
+   state->s1 = __seed(seeds[0],   2U);
+   state->s2 = __seed(seeds[1],   8U);
+   state->s3 = __seed(seeds[2],  16U);
+   state->s4 = __seed(seeds[3], 128U);
+
+   prandom_warmup(state);
+   }
+}
+
 /*
  * Generate better values after random number generator
  * is fully initialized.
  */
 static void __prandom_reseed(bool late)
 {
-   int i;
unsigned long flags;
static bool latch = false;
static DEFINE_SPINLOCK(lock);
@@ -266,19 +283,7 @@ static void __prandom_reseed(bool late)
goto out;
 
latch = true;
-
-   for_each_possible_cpu(i) {
-   struct rnd_state *state = &per_cpu(net_rand_state,i);
-   u32 seeds[4];
-
-   get_random_bytes(&seeds, sizeof(seeds));
-   state->s1 = __seed(seeds[0],   2U);
-   state->s2 = __seed(seeds[1],   8U);
-   state->s3 = __seed(seeds[2],  16U);
-   state->s4 = __seed(seeds[3], 128U);
-
-   prandom_warmup(state);
-   }
+   prandom_seed_full_state(&net_rand_state);
 out:
spin_unlock_irqrestore(&lock, flags);
 }
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 2/5] once: make helper generic for calling functions once

2015-10-07 Thread Daniel Borkmann
From: Hannes Frederic Sowa 

Make the get_random_once() helper generic enough, so that functions
in general would only be called once, where one user of this is then
net_get_random_once().

The only implementation specific call is to get_random_bytes(), all
the rest of this *_once() facility would be duplicated among different
subsystems otherwise. The new DO_ONCE() helper will be used by prandom()
later on, but might also be useful for other scenarios/subsystems as
well where a one-time initialization in often-called, possibly fast
path code could occur.

Signed-off-by: Hannes Frederic Sowa 
Signed-off-by: Daniel Borkmann 
---
 include/linux/once.h | 61 
 lib/once.c   | 50 --
 2 files changed, 76 insertions(+), 35 deletions(-)

diff --git a/include/linux/once.h b/include/linux/once.h
index 2a83b53..285f12c 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -4,21 +4,54 @@
 #include 
 #include 
 
-bool __get_random_once(void *buf, int nbytes, bool *done,
-  struct static_key *once_key);
+bool __do_once_start(bool *done, unsigned long *flags);
+void __do_once_done(bool *done, struct static_key *once_key,
+   unsigned long *flags);
 
-#define get_random_once(buf, nbytes)   \
-   ({  \
-   bool ___ret = false;\
-   static bool ___done = false;\
-   static struct static_key ___once_key =  \
-   STATIC_KEY_INIT_TRUE;   \
-   if (static_key_true(&___once_key))  \
-   ___ret = __get_random_once((buf),   \
-  (nbytes),\
-  &___done,\
-  &___once_key);   \
-   ___ret; \
+/* Call a function exactly once. The idea of DO_ONCE() is to perform
+ * a function call such as initialization of random seeds, etc, only
+ * once, where DO_ONCE() can live in the fast-path. After @func has
+ * been called with the passed arguments, the static key will patch
+ * out the condition into a nop. DO_ONCE() guarantees type safety of
+ * arguments!
+ *
+ * Note that the following is not equivalent ...
+ *
+ *   DO_ONCE(func, arg);
+ *   DO_ONCE(func, arg);
+ *
+ * ... to this version:
+ *
+ *   void foo(void)
+ *   {
+ * DO_ONCE(func, arg);
+ *   }
+ *
+ *   foo();
+ *   foo();
+ *
+ * In case the one-time invocation could be triggered from multiple
+ * places, then a common helper function must be defined, so that only
+ * a single static key will be placed there!
+ */
+#define DO_ONCE(func, ...)  \
+   ({   \
+   bool ___ret = false; \
+   static bool ___done = false; \
+   static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \
+   if (static_key_true(&___once_key)) { \
+   unsigned long ___flags;  \
+   ___ret = __do_once_start(&___done, &___flags);   \
+   if (unlikely(___ret)) {  \
+   func(__VA_ARGS__);   \
+   __do_once_done(&___done, &___once_key,   \
+  &___flags);   \
+   }\
+   }\
+   ___ret;  \
})
 
+#define get_random_once(buf, nbytes)\
+   DO_ONCE(get_random_bytes, (buf), (nbytes))
+
 #endif /* _LINUX_ONCE_H */
diff --git a/lib/once.c b/lib/once.c
index 2d5a7de..05c8604 100644
--- a/lib/once.c
+++ b/lib/once.c
@@ -3,52 +3,60 @@
 #include 
 #include 
 
-struct __random_once_work {
+struct once_work {
struct work_struct work;
struct static_key *key;
 };
 
-static void __random_once_deferred(struct work_struct *w)
+static void once_deferred(struct work_struct *w)
 {
-   struct __random_once_work *work;
+   struct once_work *work;
 
-   work = container_of(w, struct __random_once_work, work);
+   work = container_of(w, struct once_work, work);
BUG_ON(!static_key_enabled(work->key));
static_key_slow_dec(work->key);

[PATCH net-next v2 4/5] random32: add prandom_init_once helper for own rngs

2015-10-07 Thread Daniel Borkmann
Add a prandom_init_once() facility that works on the rnd_state, so that
users that are keeping their own state independent from prandom_u32() can
initialize their taus113 per cpu states.

The motivation here is similar to net_get_random_once(): initialize the
state as late as possible in the hope that enough entropy has been
collected for the seeding. prandom_init_once() makes use of the recently
introduced prandom_seed_full_state() helper and is generic enough so that
it could also be used on fast-paths due to the DO_ONCE().

Signed-off-by: Daniel Borkmann 
Acked-by: Hannes Frederic Sowa 
---
 include/linux/random.h | 6 ++
 lib/random32.c | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/random.h b/include/linux/random.h
index e651874..a75840c 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -7,6 +7,8 @@
 #define _LINUX_RANDOM_H
 
 #include 
+#include 
+
 #include 
 
 struct random_ready_callback {
@@ -45,6 +47,10 @@ struct rnd_state {
 
 u32 prandom_u32_state(struct rnd_state *state);
 void prandom_bytes_state(struct rnd_state *state, void *buf, size_t nbytes);
+void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state);
+
+#define prandom_init_once(pcpu_state)  \
+   DO_ONCE(prandom_seed_full_state, (pcpu_state))
 
 /**
  * prandom_u32_max - returns a pseudo-random number in interval [0, ep_ro)
diff --git a/lib/random32.c b/lib/random32.c
index 36c09fb..1211191 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -238,7 +238,7 @@ static void __init __prandom_start_seed_timer(void)
add_timer(&seed_timer);
 }
 
-static void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
+void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
 {
int i;
 
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 1/5] net: move net_get_random_once to lib

2015-10-07 Thread Daniel Borkmann
From: Hannes Frederic Sowa 

There's no good reason why users outside of networking should not
be using this facility, f.e. for initializing their seeds.

Therefore, make it accessible from there as get_random_once().

Signed-off-by: Hannes Frederic Sowa 
Signed-off-by: Daniel Borkmann 
---
 include/linux/net.h  | 21 
 include/linux/once.h | 24 +++
 lib/Makefile |  3 ++-
 lib/once.c   | 54 
 net/core/utils.c | 49 ---
 5 files changed, 84 insertions(+), 67 deletions(-)
 create mode 100644 include/linux/once.h
 create mode 100644 lib/once.c

diff --git a/include/linux/net.h b/include/linux/net.h
index 049d4b0..70ac5e2 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -24,7 +24,8 @@
 #include/* For O_CLOEXEC and O_NONBLOCK */
 #include 
 #include 
-#include 
+#include 
+
 #include 
 
 struct poll_table_struct;
@@ -250,22 +251,8 @@ do {   
\
} while (0)
 #endif
 
-bool __net_get_random_once(void *buf, int nbytes, bool *done,
-  struct static_key *done_key);
-
-#define net_get_random_once(buf, nbytes)   \
-   ({  \
-   bool ___ret = false;\
-   static bool ___done = false;\
-   static struct static_key ___once_key =  \
-   STATIC_KEY_INIT_TRUE;   \
-   if (static_key_true(&___once_key))  \
-   ___ret = __net_get_random_once(buf, \
-  nbytes,  \
-  &___done,\
-  &___once_key);   \
-   ___ret; \
-   })
+#define net_get_random_once(buf, nbytes)   \
+   get_random_once((buf), (nbytes))
 
 int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec,
   size_t num, size_t len);
diff --git a/include/linux/once.h b/include/linux/once.h
new file mode 100644
index 000..2a83b53
--- /dev/null
+++ b/include/linux/once.h
@@ -0,0 +1,24 @@
+#ifndef _LINUX_ONCE_H
+#define _LINUX_ONCE_H
+
+#include 
+#include 
+
+bool __get_random_once(void *buf, int nbytes, bool *done,
+  struct static_key *once_key);
+
+#define get_random_once(buf, nbytes)   \
+   ({  \
+   bool ___ret = false;\
+   static bool ___done = false;\
+   static struct static_key ___once_key =  \
+   STATIC_KEY_INIT_TRUE;   \
+   if (static_key_true(&___once_key))  \
+   ___ret = __get_random_once((buf),   \
+  (nbytes),\
+  &___done,\
+  &___once_key);   \
+   ___ret; \
+   })
+
+#endif /* _LINUX_ONCE_H */
diff --git a/lib/Makefile b/lib/Makefile
index 13a7c6a..8de3b01 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,8 @@ obj-y += bcd.o div64.o sort.o parser.o halfmd4.o 
debug_locks.o random32.o \
 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
 gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \
 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
-percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o
+percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \
+once.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += hexdump.o
diff --git a/lib/once.c b/lib/once.c
new file mode 100644
index 000..2d5a7de
--- /dev/null
+++ b/lib/once.c
@@ -0,0 +1,54 @@
+#include 
+#include 
+#include 
+#include 
+
+struct __random_once_work {
+   struct work_struct work;
+   struct static_key *key;
+};
+
+static void __random_once_deferred(struct work_struct *w)
+{
+   struct __random_once_work *work;
+
+   work = container_of(w, struct __random_once_work, work);
+   BUG_ON(!static_key_enabled(work->key));
+   static_key_slow_dec(work->key);
+   kfree(work);
+}
+
+static void __random_once_disable_jump(struct static_key *key)
+{
+   struct __random_once_work *w;
+
+   w = kmalloc(si

[PATCH net-next v2 0/5] BPF/random32 updates

2015-10-07 Thread Daniel Borkmann
BPF update to split the prandom state apart, and to move the
*once helpers to the core. For details, please see individual
patches. Given the changes and since it's in the tree for
quite some time, net-next is a better choice in our opinion.

v1 -> v2:
 - Make DO_ONCE() type-safe, remove the kvec helper. Credits
   go to Alexei Starovoitov for the __VA_ARGS__ hint, thanks!
 - Add a comment to the DO_ONCE() helper as suggested by Alexei.
 - Rework prandom_init_once() helper to the new API.
 - Keep Alexei's Acked-by on the last patch.

Thanks!

Daniel Borkmann (3):
  random32: add prandom_seed_full_state helper
  random32: add prandom_init_once helper for own rngs
  bpf: split state from prandom_u32() and consolidate {c,e}BPF prngs

Hannes Frederic Sowa (2):
  net: move net_get_random_once to lib
  once: make helper generic for calling functions once

 include/linux/bpf.h|  4 
 include/linux/net.h| 21 -
 include/linux/once.h   | 57 ++
 include/linux/random.h |  6 +
 kernel/bpf/core.c  | 26 +
 kernel/bpf/helpers.c   |  7 +-
 kernel/bpf/syscall.c   |  2 ++
 lib/Makefile   |  3 ++-
 lib/once.c | 62 ++
 lib/random32.c | 37 +-
 net/core/filter.c  |  9 ++--
 net/core/utils.c   | 49 ---
 12 files changed, 187 insertions(+), 96 deletions(-)
 create mode 100644 include/linux/once.h
 create mode 100644 lib/once.c

-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v2 5/5] bpf: split state from prandom_u32() and consolidate {c,e}BPF prngs

2015-10-07 Thread Daniel Borkmann
While recently arguing on a seccomp discussion that raw prandom_u32()
access shouldn't be exposed to unprivileged user space, I forgot the
fact that the SKF_AD_RANDOM extension has actually been doing so for some
time in cBPF via commit 4cd3675ebf74 ("filter: added BPF random opcode").

Since prandom_u32() is being used in a lot of critical networking code,
let's be more conservative and split their states. Furthermore, consolidate
eBPF and cBPF prandom handlers to use the new internal PRNG. For eBPF,
bpf_get_prandom_u32() was only accessible for privileged users, but
should that change one day, we also don't want to leak raw sequences
through things like eBPF maps.

One thought was also to have own per bpf_prog states, but due to ABI
reasons this is not easily possible, i.e. the program code currently
cannot access bpf_prog itself, and copying the rnd_state to/from the
stack scratch space whenever a program uses the prng seems not really
worth the trouble and seems too hacky. If needed, taus113 could in such
cases be implemented within eBPF using a map entry to keep the state
space, or get_random_bytes() could become a second helper in cases where
performance would not be critical.

Both sides can trigger a one-time late init via prandom_init_once() on
the shared state. Performance-wise, there should even be a tiny gain
as bpf_user_rnd_u32() saves one function call. The PRNG needs to live
inside the BPF core since kernels could have a NET-less config as well.

Signed-off-by: Daniel Borkmann 
Acked-by: Hannes Frederic Sowa 
Acked-by: Alexei Starovoitov 
Cc: Chema Gonzalez 
---
 include/linux/bpf.h  |  4 
 kernel/bpf/core.c| 26 ++
 kernel/bpf/helpers.c |  7 +--
 kernel/bpf/syscall.c |  2 ++
 net/core/filter.c|  9 ++---
 5 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c915a6b..3697ad5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -200,4 +200,8 @@ extern const struct bpf_func_proto 
bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 
+/* Shared helpers among cBPF and eBPF. */
+void bpf_user_rnd_init_once(void);
+u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+
 #endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c8855c2..8086471 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -731,6 +731,32 @@ void bpf_prog_free(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
 
+/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
+
+void bpf_user_rnd_init_once(void)
+{
+   prandom_init_once(&bpf_user_rnd_state);
+}
+
+u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   /* Should someone ever have the rather unwise idea to use some
+* of the registers passed into this function, then note that
+* this function is called from native eBPF and classic-to-eBPF
+* transformations. Register assignments from both sides are
+* different, f.e. classic always sets fn(ctx, A, X) here.
+*/
+   struct rnd_state *state;
+   u32 res;
+
+   state = &get_cpu_var(bpf_user_rnd_state);
+   res = prandom_u32_state(state);
+   put_cpu_var(state);
+
+   return res;
+}
+
 /* Weak definitions of helper functions in case we don't have bpf syscall. */
 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
 const struct bpf_func_proto bpf_map_update_elem_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec0..4504ca6 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
.arg2_type  = ARG_PTR_TO_MAP_KEY,
 };
 
-static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-   return prandom_u32();
-}
-
 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
-   .func   = bpf_get_prandom_u32,
+   .func   = bpf_user_rnd_u32,
.gpl_only   = false,
.ret_type   = RET_INTEGER,
 };
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5f35f42..c868caf 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -404,6 +404,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 
if (insn->imm == BPF_FUNC_get_route_realm)
prog->dst_needed = 1;
+   if (insn->imm == BPF_FUNC_get_prandom_u32)
+   bpf_user_rnd_init_once();
if (insn->imm == BPF_FUNC_tail_call) {
/* mark bpf_tail_call as different opcode
 * to avoid conditional branch in
diff --git a/net/core/filter.c b/net/core/filter.c
index 8f4603c..342e6c8 100644

Re: [PATCH RFC] net: mvneta: add ethtool statistics

2015-10-07 Thread Andrew Lunn
On Tue, Oct 06, 2015 at 09:41:08PM +0100, Russell King wrote:
> Add support for the ethtool statistic interface, returning the full set
> of statistics which both Armada 370 and Armada XP can support.
> 
> Signed-off-by: Russell King 
> ---
> Andrew,
> 
> Here's the patch updated to use the example set by mv643xx_eth.c.

Hi Russell

I did some quick tests on an Armada XP based DLINK WRT1900AC. We have
some inconsistencies:

root@wrt1900ac:~# ethtool -S eth0
NIC statistics:
 good_octets_received: 4300691
 good_frames_received: 7655
 bad_octets_received: 0
 bad_frames_received: 0
 broadcast_frames_received: 0
 multicast_frames_received: 0
 unrec_mac_control_received: 0
 good_fc_received: 0
 bad_fc_received: 0
 undersize_received: 0
 fragments_received: 0
 oversize_received: 0
 jabber_received: 0
 mac_receive_error: 0
 bad_crc_event: 0
 collision: 0
 late_collision: 0
 rx_discard: 0
 rx_overrun: 0
 frames_64_octets: 2
 frames_65_to_127_octets: 9
 frames_128_to_255_octets: 1
 frames_256_to_511_octets: 0
 frames_512_to_1023_octets: 7652
 frames_1024_to_max_octets: 0
 good_octets_sent: 383562
 good_frames_sent: 7665
 excessive_collision: 0
 multicast_frames_sent: 8
 broadcast_frames_sent: 3
 fc_sent: 0
 internal_mac_transmit_err: 0
root@wrt1900ac:~# ifconfig eth0
eth0  Link encap:Ethernet  HWaddr 94:10:3e:80:bc:f3  
  inet6 addr: fe80::9610:3eff:fe80:bcf3/64 Scope:Link
  UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
  RX packets:2 errors:0 dropped:0 overruns:0 frame:0
  TX packets:8 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:532 
  RX bytes:497 (497.0 B)  TX bytes:648 (648.0 B)
  Interrupt:27

I'm guessing ethtool is including the uboot TFTP packets, whereas
ifconfig is just Linux? This would suggest the driver is not clearing
the statistics when it loads. Should it?

Apart from that, I pinged with a few different sizes and the right
frame size counter went up. I broadcast pinged and the broadcast
counter went up. So not exhaustive testing, but better than nothing.
Since clearing the statistics is a separate issue:

Tested-by: Andrew Lunn 

Thanks
Andrew

> 
>  drivers/net/ethernet/marvell/mvneta.c | 99 
> +++
>  1 file changed, 99 insertions(+)
> 
> diff --git a/drivers/net/ethernet/marvell/mvneta.c 
> b/drivers/net/ethernet/marvell/mvneta.c
> index 514df76fc70f..9f048ba92d0e 100644
> --- a/drivers/net/ethernet/marvell/mvneta.c
> +++ b/drivers/net/ethernet/marvell/mvneta.c
> @@ -277,6 +277,50 @@
>  
>  #define MVNETA_RX_BUF_SIZE(pkt_size)   ((pkt_size) + NET_SKB_PAD)
>  
> +struct mvneta_statistic {
> + unsigned short offset;
> + unsigned short type;
> + const char name[ETH_GSTRING_LEN];
> +};
> +
> +#define T_REG_32 32
> +#define T_REG_64 64
> +
> +static const struct mvneta_statistic mvneta_statistics[] = {
> + { 0x3000, T_REG_64, "good_octets_received", },
> + { 0x3010, T_REG_32, "good_frames_received", },
> + { 0x3008, T_REG_32, "bad_octets_received", },
> + { 0x3014, T_REG_32, "bad_frames_received", },
> + { 0x3018, T_REG_32, "broadcast_frames_received", },
> + { 0x301c, T_REG_32, "multicast_frames_received", },
> + { 0x3050, T_REG_32, "unrec_mac_control_received", },
> + { 0x3058, T_REG_32, "good_fc_received", },
> + { 0x305c, T_REG_32, "bad_fc_received", },
> + { 0x3060, T_REG_32, "undersize_received", },
> + { 0x3064, T_REG_32, "fragments_received", },
> + { 0x3068, T_REG_32, "oversize_received", },
> + { 0x306c, T_REG_32, "jabber_received", },
> + { 0x3070, T_REG_32, "mac_receive_error", },
> + { 0x3074, T_REG_32, "bad_crc_event", },
> + { 0x3078, T_REG_32, "collision", },
> + { 0x307c, T_REG_32, "late_collision", },
> + { 0x2484, T_REG_32, "rx_discard", },
> + { 0x2488, T_REG_32, "rx_overrun", },
> + { 0x3020, T_REG_32, "frames_64_octets", },
> + { 0x3024, T_REG_32, "frames_65_to_127_octets", },
> + { 0x3028, T_REG_32, "frames_128_to_255_octets", },
> + { 0x302c, T_REG_32, "frames_256_to_511_octets", },
> + { 0x3030, T_REG_32, "frames_512_to_1023_octets", },
> + { 0x3034, T_REG_32, "frames_1024_to_max_octets", },
> + { 0x3038, T_REG_64, "good_octets_sent", },
> + { 0x3040, T_REG_32, "good_frames_sent", },
> + { 0x3044, T_REG_32, "excessive_collision", },
> + { 0x3048, T_REG_32, "multicast_frames_sent", },
> + { 0x304c, T_REG_32, "broadcast_frames_sent", },
> + { 0x3054, T_REG_32, "fc_sent", },
> + { 0x300c, T_REG_32, "internal_mac_transmit_err", },
> +};
> +
>  struct mvneta_pcpu_stats {
>   struct  u64_stats_sync syncp;
>   u64 rx_packets;
> @@ -312,6 +356,8 @@ struct mvneta_port {
>   unsigned int speed;
>   unsigned int tx_csum_limit;
>   int use_inband_status:1;
> +
> + u6

[net-next 18/18] i40e/i40evf: remove unused opcode

2015-10-07 Thread Jeff Kirsher
From: Mitch Williams 

This opcode is not required. VFs that program RSS through the firmware
do it by interacting directly with the firmware, and do not need to use
the virtual channel for this functionality.

Change-ID: Iaf17d2600e28ff1b6be8653f2fe9df1facd23b0e
Signed-off-by: Mitch Williams 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl.h   | 1 -
 drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl.h 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl.h
index 0f8d415..95d0f8c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl.h
@@ -81,7 +81,6 @@ enum i40e_virtchnl_ops {
I40E_VIRTCHNL_OP_GET_STATS = 15,
I40E_VIRTCHNL_OP_FCOE = 16,
I40E_VIRTCHNL_OP_EVENT = 17,
-   I40E_VIRTCHNL_OP_CONFIG_RSS = 18,
 };
 
 /* Virtual channel message descriptor. This overlays the admin queue
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h 
b/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h
index e6db20e..cadda64 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h
@@ -81,7 +81,6 @@ enum i40e_virtchnl_ops {
I40E_VIRTCHNL_OP_GET_STATS = 15,
I40E_VIRTCHNL_OP_FCOE = 16,
I40E_VIRTCHNL_OP_EVENT = 17,
-   I40E_VIRTCHNL_OP_CONFIG_RSS = 18,
 };
 
 /* Virtual channel message descriptor. This overlays the admin queue
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 10/18] i40e: Never let speed get set to 0 in get_settings

2015-10-07 Thread Jeff Kirsher
From: Catherine Sullivan 

In ethtool, there is a possibility of speed getting set to 0
if advertise is set to 0 (which it is when autoneg is disabled).
We never want this to happen as the firmware will actually attempt
to set the speed to 0 sending link down, so add an extra check
to make sure this doesn't happen.

Change-ID: I62e02cbf043d8e6f5c9c9f0b92794e877f01
Signed-off-by: Catherine Sullivan 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 928bd5a..930369c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -666,6 +666,13 @@ static int i40e_set_settings(struct net_device *netdev,
advertise & ADVERTISED_4baseLR4_Full)
config.link_speed |= I40E_LINK_SPEED_40GB;
 
+   /* If speed didn't get set, set it to what it currently is.
+* This is needed because if advertise is 0 (as it is when autoneg
+* is disabled) then speed won't get set.
+*/
+   if (!config.link_speed)
+   config.link_speed = abilities.link_speed;
+
if (change || (abilities.link_speed != config.link_speed)) {
/* copy over the rest of the abilities */
config.phy_type = abilities.phy_type;
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 00/18][pull request] Intel Wired LAN Driver Updates 2015-10-07

2015-10-07 Thread Jeff Kirsher
This series contains updates to i40e and i40evf only.

Paul updates i40e to simply increase the amount of time we wait for a
reset to complete since we have seen in some rare occasions the reset
can take longer to complete.

Shannon updates the driver to turn on Wake-on-LAN by default if it is
enabled in the hardware config to begin with, rather than always disabling
it and waiting for the user to expressly turn it on.  Added new device IDs
and support for future devices.  Fixed a possible type compare problem
between a size and a possibly negative number.  Also fixed a shift value
that was wrong, which ended up with a bad bitmask.  Did general house
cleaning of the driver to clean up several pieces of low-hanging fruit in
the driver.  Fixed an issue where new unicast addresses would be added to
the VSI list and then immediately removed and would never actually
make it down to the hardware.  Resolved the issue by removing the
separation between unicast and multicast in the search for filters to be
deleted.

Mitch fixes an issue where the hardware would continue to access the
memory formerly used by the rings of VFs which have been removed,
causing memory corruption or DMAR errors.  To relieve this condition,
explicitly stop all rings associated with each VF before releasing its
resources.  Also fixed a panic if the driver is unable to enable MSI-X
or is unable to acquire enough vectors, by propagating interrupt
allocation failure information to the calling function.  Cleaned up
an opcode that is not required.

Carolyn extends the size of the text available for the interrupt names
so that all the descriptive data available for the Flow Director
interrupts is not truncated.

Catherine fixes an issue where there was a possibility of speed getting
set to 0 if advertised is set to 0 (which is the case when autoneg is
disabled).

Jesse fixes the checksum on big endian machines, so added code to swap
it correctly.  Also fixed a bug in the return from get_link_status()
where only true or false was being returned, but false could mean
multiple things.  So allow the caller to get all the return values
in the call chain bubbled back to the source so that the reason for
the failure does not get lost.

Anjali adds statistics to keep track of how many times we ask the stack
to linearize the SKB because the hardware cannot handle SKBs with more
than 8 frags per segment/single packet.

The following are changes since commit acb4a6bfc80ddeea4c44074dd630f916259e909e:
  tcp: ensure prior synack rtx behavior with small backlogs
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue master

Anjali Singhai Jain (1):
  i40e/i40evf: Add a stat to keep track of linearization count

Carolyn Wyborny (1):
  i40e: Fix for truncated interrupt name

Catherine Sullivan (1):
  i40e: Never let speed get set to 0 in get_settings

Jesse Brandeburg (2):
  i40e: add little endian conversion for checksum
  i40e: fix bug in return from get_link_status and avoid spurious link
messages

Mitch Williams (3):
  i40e: stop VF rings
  i40evf: propagate interrupt allocation failure
  i40e/i40evf: remove unused opcode

Neerav Parikh (1):
  i40e: Additional checks for CEE APP priority validity

Paul M Stillwell Jr (1):
  i40e: Increase the amount of time we wait for reset to be done

Shannon Nelson (8):
  i40e: enable WoL operation if config bit show WoL capable
  i40e/i40evf: add driver support for new device ids
  i40e/i40evf: fix a potential type compare issue
  i40e: fix bad CEE status shift value
  i40e: make i40e_init_pf_fcoe to void
  i40e/i40evf: assure clean asq status report
  i40e/i40evf: give up the __func__
  i40e/i40evf: fix unicast mac address add

 drivers/net/ethernet/intel/i40e/i40e.h |   5 +-
 drivers/net/ethernet/intel/i40e/i40e_adminq.c  |   2 +
 drivers/net/ethernet/intel/i40e/i40e_adminq.h  |   3 +-
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |   3 +-
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  18 ++--
 drivers/net/ethernet/intel/i40e/i40e_dcb.c |  48 ++---
 drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c  |   5 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  20 +++-
 drivers/net/ethernet/intel/i40e/i40e_fcoe.c|  10 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c| 116 ++---
 drivers/net/ethernet/intel/i40e/i40e_nvm.c |   4 +-
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c|   5 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h|   1 +
 drivers/net/ethernet/intel/i40e/i40e_type.h|   1 +
 drivers/net/ethernet/intel/i40e/i40e_virtchnl.h|   1 -
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |   5 +
 drivers/net/ethernet/intel/i40evf/i40e_adminq.c|   2 +
 drivers/net/ethernet/intel/i40evf/i40e_adminq.h|   3 +-
 drivers/net/ethernet/intel/i40evf/i40e_common.c|   1 +
 drivers/net/ethernet/intel/i40evf/i4

[net-next 06/18] i40e: fix bad CEE status shift value

2015-10-07 Thread Jeff Kirsher
From: Shannon Nelson 

Fix a shift value that was wrong, ending up with a bad bitmask.  Also add
a blank line between two sets of #defines for better readability.

Change-ID: I3e41fa2a2ab904d3a4e6cbf13972ab0036a10601
Signed-off-by: Shannon Nelson 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index b840fab..785b3db 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -2062,6 +2062,7 @@ I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_start);
 #define I40E_AQC_CEE_APP_ISCSI_MASK(0x7 << I40E_AQC_CEE_APP_ISCSI_SHIFT)
 #define I40E_AQC_CEE_APP_FIP_SHIFT 0x8
 #define I40E_AQC_CEE_APP_FIP_MASK  (0x7 << I40E_AQC_CEE_APP_FIP_SHIFT)
+
 #define I40E_AQC_CEE_PG_STATUS_SHIFT   0x0
 #define I40E_AQC_CEE_PG_STATUS_MASK(0x7 << I40E_AQC_CEE_PG_STATUS_SHIFT)
 #define I40E_AQC_CEE_PFC_STATUS_SHIFT  0x3
@@ -2070,7 +2071,7 @@ I40E_CHECK_CMD_LENGTH(i40e_aqc_lldp_start);
 #define I40E_AQC_CEE_APP_STATUS_MASK   (0x7 << I40E_AQC_CEE_APP_STATUS_SHIFT)
 #define I40E_AQC_CEE_FCOE_STATUS_SHIFT 0x8
 #define I40E_AQC_CEE_FCOE_STATUS_MASK  (0x7 << I40E_AQC_CEE_FCOE_STATUS_SHIFT)
-#define I40E_AQC_CEE_ISCSI_STATUS_SHIFT0xA
+#define I40E_AQC_CEE_ISCSI_STATUS_SHIFT0xB
 #define I40E_AQC_CEE_ISCSI_STATUS_MASK (0x7 << I40E_AQC_CEE_ISCSI_STATUS_SHIFT)
 #define I40E_AQC_CEE_FIP_STATUS_SHIFT  0x10
 #define I40E_AQC_CEE_FIP_STATUS_MASK   (0x7 << I40E_AQC_CEE_FIP_STATUS_SHIFT)
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 15/18] i40e/i40evf: Add a stat to keep track of linearization count

2015-10-07 Thread Jeff Kirsher
From: Anjali Singhai Jain 

Keep track of how many times we ask the stack to linearize the
skb because the HW cannot handle skbs with more than 8 frags per
segment/single packet.

Change-ID: If455452060963a769bbe6112cba952e79e944b52
Signed-off-by: Anjali Singhai Jain 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h | 1 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c| 5 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 5 +++--
 drivers/net/ethernet/intel/i40e/i40e_txrx.h| 1 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  | 5 +++--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h  | 1 +
 7 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index a54577a..681bd5d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -475,6 +475,7 @@ struct i40e_vsi {
 #endif
u32 tx_restart;
u32 tx_busy;
+   u64 tx_linearize;
u32 rx_buf_failed;
u32 rx_page_failed;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 1fa38f6..ffa9431 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -87,6 +87,7 @@ static const struct i40e_stats i40e_gstrings_misc_stats[] = {
I40E_VSI_STAT("rx_broadcast", eth_stats.rx_broadcast),
I40E_VSI_STAT("tx_broadcast", eth_stats.tx_broadcast),
I40E_VSI_STAT("rx_unknown_protocol", eth_stats.rx_unknown_protocol),
+   I40E_VSI_STAT("tx_linearize", tx_linearize),
 };
 
 static int i40e_add_fdir_ethtool(struct i40e_vsi *vsi,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 84f9dd9..fb4b34d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -879,6 +879,7 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
u32 rx_page, rx_buf;
u64 bytes, packets;
unsigned int start;
+   u64 tx_linearize;
u64 rx_p, rx_b;
u64 tx_p, tx_b;
u16 q;
@@ -897,7 +898,7 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
 */
rx_b = rx_p = 0;
tx_b = tx_p = 0;
-   tx_restart = tx_busy = 0;
+   tx_restart = tx_busy = tx_linearize = 0;
rx_page = 0;
rx_buf = 0;
rcu_read_lock();
@@ -914,6 +915,7 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
tx_p += packets;
tx_restart += p->tx_stats.restart_queue;
tx_busy += p->tx_stats.tx_busy;
+   tx_linearize += p->tx_stats.tx_linearize;
 
/* Rx queue is part of the same block as Tx queue */
p = &p[1];
@@ -930,6 +932,7 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
rcu_read_unlock();
vsi->tx_restart = tx_restart;
vsi->tx_busy = tx_busy;
+   vsi->tx_linearize = tx_linearize;
vsi->rx_page_failed = rx_page;
vsi->rx_buf_failed = rx_buf;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 85e61b0..889ed10 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2771,10 +2771,11 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff 
*skb,
if (tsyn)
tx_flags |= I40E_TX_FLAGS_TSYN;
 
-   if (i40e_chk_linearize(skb, tx_flags))
+   if (i40e_chk_linearize(skb, tx_flags)) {
if (skb_linearize(skb))
goto out_drop;
-
+   tx_ring->tx_stats.tx_linearize++;
+   }
skb_tx_timestamp(skb);
 
/* always enable CRC insertion offload */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 7c9975c..ac3fb3a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -188,6 +188,7 @@ struct i40e_tx_queue_stats {
u64 restart_queue;
u64 tx_busy;
u64 tx_done_old;
+   u64 tx_linearize;
 };
 
 struct i40e_rx_queue_stats {
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 0e1a4d6..3b102f2 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1927,10 +1927,11 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff 
*skb,
else if (tso)
tx_flags |= I40E_TX_FLAGS_TSO;
 
-   if (i40e_chk_linearize(skb, tx_flags))
+   if (i40e_chk_linearize(skb, tx_flags)) {
if (skb_linearize(skb))
goto out_drop;
-
+   tx_ring->tx_stats.tx_linearize++;
+   }
 

[net-next 04/18] i40e/i40evf: add driver support for new device ids

2015-10-07 Thread Jeff Kirsher
From: Shannon Nelson 

Early addition of a new device id.

Change-ID: I61a8c8556fdf4f5714be4e4089689e374f30293c
Signed-off-by: Shannon Nelson 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_common.c   | 1 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c  | 1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c | 1 +
 drivers/net/ethernet/intel/i40e/i40e_type.h | 1 +
 drivers/net/ethernet/intel/i40evf/i40e_common.c | 1 +
 drivers/net/ethernet/intel/i40evf/i40e_type.h   | 1 +
 6 files changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 1e81f4e..7a70abc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -51,6 +51,7 @@ static i40e_status i40e_set_mac_type(struct i40e_hw *hw)
case I40E_DEV_ID_QSFP_B:
case I40E_DEV_ID_QSFP_C:
case I40E_DEV_ID_10G_BASE_T:
+   case I40E_DEV_ID_10G_BASE_T4:
case I40E_DEV_ID_20G_KR2:
case I40E_DEV_ID_20G_KR2_A:
hw->mac.type = I40E_MAC_XL710;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 1345de2..928bd5a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -425,6 +425,7 @@ static void i40e_get_settings_link_down(struct i40e_hw *hw,
ecmd->advertising = ADVERTISED_1baseKR_Full;
break;
case I40E_DEV_ID_10G_BASE_T:
+   case I40E_DEV_ID_10G_BASE_T4:
ecmd->supported = SUPPORTED_1baseT_Full |
  SUPPORTED_1000baseT_Full |
  SUPPORTED_100baseT_Full;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 7fc5e2c..19eb14d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -75,6 +75,7 @@ static const struct pci_device_id i40e_pci_tbl[] = {
{PCI_VDEVICE(INTEL, I40E_DEV_ID_QSFP_B), 0},
{PCI_VDEVICE(INTEL, I40E_DEV_ID_QSFP_C), 0},
{PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T), 0},
+   {PCI_VDEVICE(INTEL, I40E_DEV_ID_10G_BASE_T4), 0},
{PCI_VDEVICE(INTEL, I40E_DEV_ID_20G_KR2), 0},
{PCI_VDEVICE(INTEL, I40E_DEV_ID_SFP_X722), 0},
{PCI_VDEVICE(INTEL, I40E_DEV_ID_1G_BASE_T_X722), 0},
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h 
b/drivers/net/ethernet/intel/i40e/i40e_type.h
index c5b6a65..34720e0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -46,6 +46,7 @@
 #define I40E_DEV_ID_10G_BASE_T 0x1586
 #define I40E_DEV_ID_20G_KR20x1587
 #define I40E_DEV_ID_20G_KR2_A  0x1588
+#define I40E_DEV_ID_10G_BASE_T40x1589
 #define I40E_DEV_ID_VF 0x154C
 #define I40E_DEV_ID_VF_HV  0x1571
 #define I40E_DEV_ID_SFP_X722   0x37D0
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_common.c 
b/drivers/net/ethernet/intel/i40evf/i40e_common.c
index 1950db1..96e48ee 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_common.c
@@ -51,6 +51,7 @@ i40e_status i40e_set_mac_type(struct i40e_hw *hw)
case I40E_DEV_ID_QSFP_B:
case I40E_DEV_ID_QSFP_C:
case I40E_DEV_ID_10G_BASE_T:
+   case I40E_DEV_ID_10G_BASE_T4:
case I40E_DEV_ID_20G_KR2:
case I40E_DEV_ID_20G_KR2_A:
hw->mac.type = I40E_MAC_XL710;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h 
b/drivers/net/ethernet/intel/i40evf/i40e_type.h
index 37bacc3..bbb3886 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h
@@ -46,6 +46,7 @@
 #define I40E_DEV_ID_10G_BASE_T 0x1586
 #define I40E_DEV_ID_20G_KR20x1587
 #define I40E_DEV_ID_20G_KR2_A  0x1588
+#define I40E_DEV_ID_10G_BASE_T40x1589
 #define I40E_DEV_ID_VF 0x154C
 #define I40E_DEV_ID_VF_HV  0x1571
 #define I40E_DEV_ID_SFP_X722   0x37D0
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 16/18] i40e: Additional checks for CEE APP priority validity

2015-10-07 Thread Jeff Kirsher
From: Neerav Parikh 

The firmware has added additional status information to allow software
to determine if the APP priority for FCoE/iSCSI/FIP is valid or not in
CEE DCBX mode.

This patch adds support for those additional checks and will only add
applications to the software table that have oper and sync bits set
without any error.

Change-ID: I0a76c52427dadf97d4dba4538a3068d05e4eb56b
Signed-off-by: Neerav Parikh 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_dcb.c | 48 --
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb.c 
b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
index 90de46a..9aee35d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb.c
@@ -531,37 +531,55 @@ static void i40e_cee_to_dcb_config(
dcbcfg->pfc.pfcenable = cee_cfg->oper_pfc_en;
dcbcfg->pfc.pfccap = I40E_MAX_TRAFFIC_CLASS;
 
-   status = (tlv_status & I40E_AQC_CEE_APP_STATUS_MASK) >>
- I40E_AQC_CEE_APP_STATUS_SHIFT;
+   i = 0;
+   status = (tlv_status & I40E_AQC_CEE_FCOE_STATUS_MASK) >>
+ I40E_AQC_CEE_FCOE_STATUS_SHIFT;
err = (status & I40E_TLV_STATUS_ERR) ? 1 : 0;
sync = (status & I40E_TLV_STATUS_SYNC) ? 1 : 0;
oper = (status & I40E_TLV_STATUS_OPER) ? 1 : 0;
-   /* Add APPs if Error is False and Oper/Sync is True */
+   /* Add FCoE APP if Error is False and Oper/Sync is True */
if (!err && sync && oper) {
-   /* CEE operating configuration supports FCoE/iSCSI/FIP only */
-   dcbcfg->numapps = I40E_CEE_OPER_MAX_APPS;
-
/* FCoE APP */
-   dcbcfg->app[0].priority =
+   dcbcfg->app[i].priority =
(app_prio & I40E_AQC_CEE_APP_FCOE_MASK) >>
 I40E_AQC_CEE_APP_FCOE_SHIFT;
-   dcbcfg->app[0].selector = I40E_APP_SEL_ETHTYPE;
-   dcbcfg->app[0].protocolid = I40E_APP_PROTOID_FCOE;
+   dcbcfg->app[i].selector = I40E_APP_SEL_ETHTYPE;
+   dcbcfg->app[i].protocolid = I40E_APP_PROTOID_FCOE;
+   i++;
+   }
 
+   status = (tlv_status & I40E_AQC_CEE_ISCSI_STATUS_MASK) >>
+ I40E_AQC_CEE_ISCSI_STATUS_SHIFT;
+   err = (status & I40E_TLV_STATUS_ERR) ? 1 : 0;
+   sync = (status & I40E_TLV_STATUS_SYNC) ? 1 : 0;
+   oper = (status & I40E_TLV_STATUS_OPER) ? 1 : 0;
+   /* Add iSCSI APP if Error is False and Oper/Sync is True */
+   if (!err && sync && oper) {
/* iSCSI APP */
-   dcbcfg->app[1].priority =
+   dcbcfg->app[i].priority =
(app_prio & I40E_AQC_CEE_APP_ISCSI_MASK) >>
 I40E_AQC_CEE_APP_ISCSI_SHIFT;
-   dcbcfg->app[1].selector = I40E_APP_SEL_TCPIP;
-   dcbcfg->app[1].protocolid = I40E_APP_PROTOID_ISCSI;
+   dcbcfg->app[i].selector = I40E_APP_SEL_TCPIP;
+   dcbcfg->app[i].protocolid = I40E_APP_PROTOID_ISCSI;
+   i++;
+   }
 
+   status = (tlv_status & I40E_AQC_CEE_FIP_STATUS_MASK) >>
+ I40E_AQC_CEE_FIP_STATUS_SHIFT;
+   err = (status & I40E_TLV_STATUS_ERR) ? 1 : 0;
+   sync = (status & I40E_TLV_STATUS_SYNC) ? 1 : 0;
+   oper = (status & I40E_TLV_STATUS_OPER) ? 1 : 0;
+   /* Add FIP APP if Error is False and Oper/Sync is True */
+   if (!err && sync && oper) {
/* FIP APP */
-   dcbcfg->app[2].priority =
+   dcbcfg->app[i].priority =
(app_prio & I40E_AQC_CEE_APP_FIP_MASK) >>
 I40E_AQC_CEE_APP_FIP_SHIFT;
-   dcbcfg->app[2].selector = I40E_APP_SEL_ETHTYPE;
-   dcbcfg->app[2].protocolid = I40E_APP_PROTOID_FIP;
+   dcbcfg->app[i].selector = I40E_APP_SEL_ETHTYPE;
+   dcbcfg->app[i].protocolid = I40E_APP_PROTOID_FIP;
+   i++;
}
+   dcbcfg->numapps = i;
 }
 
 /**
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 12/18] i40e: add little endian conversion for checksum

2015-10-07 Thread Jeff Kirsher
From: Jesse Brandeburg 

The checksum is not correct on big endian machines so add code to swap it
correctly.

Change-ID: Ic92b886d172a2cbe49f5d7eee1bc78e447023c7b
Signed-off-by: Jesse Brandeburg 
Signed-off-by: Paul M Stillwell Jr 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_nvm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c 
b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
index d0288ad..2142e10 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c
@@ -547,11 +547,13 @@ i40e_status i40e_update_nvm_checksum(struct i40e_hw *hw)
 {
i40e_status ret_code = 0;
u16 checksum;
+   __le16 le_sum;
 
ret_code = i40e_calc_nvm_checksum(hw, &checksum);
+   le_sum = cpu_to_le16(checksum);
if (!ret_code)
ret_code = i40e_write_nvm_aq(hw, 0x00, I40E_SR_SW_CHECKSUM_WORD,
-1, &checksum, true);
+1, &le_sum, true);
 
return ret_code;
 }
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 08/18] i40e/i40evf: assure clean asq status report

2015-10-07 Thread Jeff Kirsher
From: Shannon Nelson 

There was a possibility where the asq_last_status could get through without
update and thus report a previous error.  I don't think we've actually seen
this happen, but this patch will help make sure it doesn't.

Change-ID: I9e33927052a5ee6ea21f80b66d4c4b76c2760b17
Signed-off-by: Shannon Nelson 
Signed-off-by: Christopher Pau 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_adminq.c   | 2 ++
 drivers/net/ethernet/intel/i40evf/i40e_adminq.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.c 
b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
index 34a64e0..287cb8d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.c
@@ -756,6 +756,8 @@ i40e_status i40e_asq_send_command(struct i40e_hw *hw,
goto asq_send_command_error;
}
 
+   hw->aq.asq_last_status = I40E_AQ_RC_OK;
+
val = rd32(hw, hw->aq.asq.head);
if (val >= hw->aq.num_asq_entries) {
i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c 
b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
index c7f59e0..c5cf886 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq.c
@@ -696,6 +696,8 @@ i40e_status i40evf_asq_send_command(struct i40e_hw *hw,
goto asq_send_command_error;
}
 
+   hw->aq.asq_last_status = I40E_AQ_RC_OK;
+
val = rd32(hw, hw->aq.asq.head);
if (val >= hw->aq.num_asq_entries) {
i40e_debug(hw, I40E_DEBUG_AQ_MESSAGE,
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 11/18] i40e/i40evf: give up the __func__

2015-10-07 Thread Jeff Kirsher
From: Shannon Nelson 

During early development, we added the function name to all of the error
strings to make debugging simpler. Now that we've released the driver,
our users should have more comprehensible error messages. So tear the
roof off and give up the __func__. Ow.

Change-ID: I7e1766252c7a032b9af6520da6aff536bdfd533c
Signed-off-by: Shannon Nelson 
Signed-off-by: Mitch Williams 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c  |  5 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c| 40 ++-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c|  9 ++--
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c| 59 ++
 4 files changed, 47 insertions(+), 66 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c 
b/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
index 1c51f73..dbadad7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_dcb_nl.c
@@ -240,9 +240,8 @@ static void i40e_dcbnl_del_app(struct i40e_pf *pf,
if (pf->vsi[v] && pf->vsi[v]->netdev) {
err = i40e_dcbnl_vsi_del_app(pf->vsi[v], app);
if (err)
-   dev_info(&pf->pdev->dev, "%s: Failed deleting 
app for VSI seid=%d err=%d sel=%d proto=0x%x prio=%d\n",
-__func__, pf->vsi[v]->seid,
-err, app->selector,
+   dev_info(&pf->pdev->dev, "Failed deleting app 
for VSI seid=%d err=%d sel=%d proto=0x%x prio=%d\n",
+pf->vsi[v]->seid, err, app->selector,
 app->protocolid, app->priority);
}
}
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0ef6a15..1fa1eba 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3156,8 +3156,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi 
*vsi, char *basename)
  q_vector);
if (err) {
dev_info(&pf->pdev->dev,
-"%s: request_irq failed, error: %d\n",
-__func__, err);
+"MSIX request_irq failed, error: %d\n", err);
goto free_queue_irqs;
}
/* assign the mask for this irq */
@@ -3681,9 +3680,8 @@ static int i40e_vsi_control_tx(struct i40e_vsi *vsi, bool 
enable)
ret = i40e_pf_txq_wait(pf, pf_q, enable);
if (ret) {
dev_info(&pf->pdev->dev,
-"%s: VSI seid %d Tx ring %d %sable timeout\n",
-__func__, vsi->seid, pf_q,
-(enable ? "en" : "dis"));
+"VSI seid %d Tx ring %d %sable timeout\n",
+vsi->seid, pf_q, (enable ? "en" : "dis"));
break;
}
}
@@ -3759,9 +3757,8 @@ static int i40e_vsi_control_rx(struct i40e_vsi *vsi, bool 
enable)
ret = i40e_pf_rxq_wait(pf, pf_q, enable);
if (ret) {
dev_info(&pf->pdev->dev,
-"%s: VSI seid %d Rx ring %d %sable timeout\n",
-__func__, vsi->seid, pf_q,
-(enable ? "en" : "dis"));
+"VSI seid %d Rx ring %d %sable timeout\n",
+vsi->seid, pf_q, (enable ? "en" : "dis"));
break;
}
}
@@ -4056,8 +4053,7 @@ static void i40e_quiesce_vsi(struct i40e_vsi *vsi)
if ((test_bit(__I40E_PORT_TX_SUSPENDED, &vsi->back->state)) &&
vsi->type == I40E_VSI_FCOE) {
dev_dbg(&vsi->back->pdev->dev,
-   "%s: VSI seid %d skipping FCoE VSI disable\n",
-__func__, vsi->seid);
+"VSI seid %d skipping FCoE VSI disable\n", vsi->seid);
return;
}
 
@@ -4131,8 +4127,8 @@ static int i40e_vsi_wait_txq_disabled(struct i40e_vsi 
*vsi)
ret = i40e_pf_txq_wait(pf, pf_q, false);
if (ret) {
dev_info(&pf->pdev->dev,
-"%s: VSI seid %d Tx ring %d disable timeout\n",
-__func__, vsi->seid, pf_q);
+"VSI seid %d Tx ring %d disable timeout\n",
+vsi->seid, pf_q);
return ret;
}
}
@@ -5423,8 +5419,7 @@ bool i40e_dcb_need_reconfig(struct i40e_pf *pf,
dev_dbg(&pf->pdev->dev, "APP Ta

[net-next 09/18] i40e: Fix for truncated interrupt name

2015-10-07 Thread Jeff Kirsher
From: Carolyn Wyborny 

This patch extends the size of the text available for the interrupt names.
Without this patch, all the descriptive data available for the Flow
Director interrupts is truncated.

Change-ID: I2ac458f23ac3b4ea8f1edf73edc283b1d3704c7f
Signed-off-by: Carolyn Wyborny 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 44837ec..a54577a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -97,7 +97,7 @@
 #define I40E_MAX_USER_PRIORITY8
 #define I40E_DEFAULT_MSG_ENABLE   4
 #define I40E_QUEUE_WAIT_RETRY_LIMIT   10
-#define I40E_INT_NAME_STR_LEN(IFNAMSIZ + 9)
+#define I40E_INT_NAME_STR_LEN(IFNAMSIZ + 16)
 
 /* Ethtool Private Flags */
 #define I40E_PRIV_FLAGS_NPAR_FLAG  BIT(0)
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 07/18] i40e: make i40e_init_pf_fcoe to void

2015-10-07 Thread Jeff Kirsher
From: Shannon Nelson 

i40e_init_pf_fcoe() didn't return anything except 0, it prints enough
error info already, and no driver logic depends on the return value,
so this can be void.

Change-ID: Ie6afad849857d87a7064c42c3cce14c74c2f29d8
Signed-off-by: Shannon Nelson 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h  |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_fcoe.c | 10 --
 drivers/net/ethernet/intel/i40e/i40e_main.c |  8 ++--
 3 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index f6d97ad..44837ec 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -740,7 +740,7 @@ int i40e_fcoe_vsi_init(struct i40e_vsi *vsi, struct 
i40e_vsi_context *ctxt);
 u8 i40e_get_fcoe_tc_map(struct i40e_pf *pf);
 void i40e_fcoe_config_netdev(struct net_device *netdev, struct i40e_vsi *vsi);
 void i40e_fcoe_vsi_setup(struct i40e_pf *pf);
-int i40e_init_pf_fcoe(struct i40e_pf *pf);
+void i40e_init_pf_fcoe(struct i40e_pf *pf);
 int i40e_fcoe_setup_ddp_resources(struct i40e_vsi *vsi);
 void i40e_fcoe_free_ddp_resources(struct i40e_vsi *vsi);
 int i40e_fcoe_handle_offload(struct i40e_ring *rx_ring,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c 
b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
index 5ea75dd..2398d9b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_fcoe.c
@@ -272,10 +272,8 @@ out:
 /**
  * i40e_fcoe_sw_init - sets up the HW for FCoE
  * @pf: pointer to PF
- *
- * Returns 0 if FCoE is supported otherwise the error code
  **/
-int i40e_init_pf_fcoe(struct i40e_pf *pf)
+void i40e_init_pf_fcoe(struct i40e_pf *pf)
 {
struct i40e_hw *hw = &pf->hw;
u32 val;
@@ -287,13 +285,13 @@ int i40e_init_pf_fcoe(struct i40e_pf *pf)
 
if (!pf->hw.func_caps.fcoe) {
dev_info(&pf->pdev->dev, "FCoE capability is disabled\n");
-   return 0;
+   return;
}
 
if (!pf->hw.func_caps.dcb) {
dev_warn(&pf->pdev->dev,
 "Hardware is not DCB capable not enabling FCoE.\n");
-   return 0;
+   return;
}
 
/* enable FCoE hash filter */
@@ -326,7 +324,7 @@ int i40e_init_pf_fcoe(struct i40e_pf *pf)
wr32(hw, I40E_GLFCOE_RCTL, val);
 
dev_info(&pf->pdev->dev, "FCoE is supported.\n");
-   return 0;
+   return;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 19eb14d..0ef6a15 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -6573,9 +6573,7 @@ static void i40e_reset_and_rebuild(struct i40e_pf *pf, 
bool reinit)
}
 #endif /* CONFIG_I40E_DCB */
 #ifdef I40E_FCOE
-   ret = i40e_init_pf_fcoe(pf);
-   if (ret)
-   dev_info(&pf->pdev->dev, "init_pf_fcoe failed: %d\n", ret);
+   i40e_init_pf_fcoe(pf);
 
 #endif
/* do basic switch setup */
@@ -7976,9 +7974,7 @@ static int i40e_sw_init(struct i40e_pf *pf)
}
 
 #ifdef I40E_FCOE
-   err = i40e_init_pf_fcoe(pf);
-   if (err)
-   dev_info(&pf->pdev->dev, "init_pf_fcoe failed: %d\n", err);
+   i40e_init_pf_fcoe(pf);
 
 #endif /* I40E_FCOE */
 #ifdef CONFIG_PCI_IOV
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 01/18] i40e: Increase the amount of time we wait for reset to be done

2015-10-07 Thread Jeff Kirsher
From: Paul M Stillwell Jr 

In some rare cases the reset can take longer to complete so increase the
amount of time we wait.

Change-ID: Ib5628ec54b526a811ee33d1214fe763226406671
Signed-off-by: Paul M Stillwell Jr 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 6833717..1e81f4e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1256,7 +1256,7 @@ i40e_status i40e_pf_reset(struct i40e_hw *hw)
grst_del = (rd32(hw, I40E_GLGEN_RSTCTL) &
I40E_GLGEN_RSTCTL_GRSTDEL_MASK) >>
I40E_GLGEN_RSTCTL_GRSTDEL_SHIFT;
-   for (cnt = 0; cnt < grst_del + 2; cnt++) {
+   for (cnt = 0; cnt < grst_del + 10; cnt++) {
reg = rd32(hw, I40E_GLGEN_RSTAT);
if (!(reg & I40E_GLGEN_RSTAT_DEVSTATE_MASK))
break;
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 05/18] i40e/i40evf: fix a potential type compare issue

2015-10-07 Thread Jeff Kirsher
From: Shannon Nelson 

Rework an if expression to assure there is no type compare problem between
a size and a possible negative number.

Change-ID: I4921fcc96abfcf69490efce020a9e4007f251c99
Reported-by: Helin Zhang 
Signed-off-by: Shannon Nelson 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_adminq.h   | 3 +--
 drivers/net/ethernet/intel/i40evf/i40e_adminq.h | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq.h
index ca81b0b..12fbbdd 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq.h
@@ -144,8 +144,7 @@ static inline int i40e_aq_rc_to_posix(int aq_ret, int aq_rc)
if (aq_ret == I40E_ERR_ADMIN_QUEUE_TIMEOUT)
return -EAGAIN;
 
-   if (aq_rc >= (sizeof(aq_to_posix) / sizeof((aq_to_posix)[0])) ||
-   aq_rc < 0)
+   if (!((u32)aq_rc < (sizeof(aq_to_posix) / sizeof((aq_to_posix)[0]))))
return -ERANGE;
 
return aq_to_posix[aq_rc];
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq.h 
b/drivers/net/ethernet/intel/i40evf/i40e_adminq.h
index e62e951..a3eae5d 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq.h
@@ -144,8 +144,7 @@ static inline int i40e_aq_rc_to_posix(int aq_ret, int aq_rc)
if (aq_ret == I40E_ERR_ADMIN_QUEUE_TIMEOUT)
return -EAGAIN;
 
-   if (aq_rc >= (sizeof(aq_to_posix) / sizeof((aq_to_posix)[0])) ||
-   aq_rc < 0)
+   if (!((u32)aq_rc < (sizeof(aq_to_posix) / sizeof((aq_to_posix)[0]))))
return -ERANGE;
 
return aq_to_posix[aq_rc];
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 13/18] i40e: fix bug in return from get_link_status and avoid spurious link messages

2015-10-07 Thread Jeff Kirsher
From: Jesse Brandeburg 

Previously, the driver could call this function and have only true/false
returned, but false could mean multiple things like failure to read
or link was down. This change allows the caller to get all return values
in the call chain bubbled back to the source, which keeps information about
failures from being lost.

Also, in some unlikely scenarios, the firmware can become slow to respond
to admin queue (AQ) queries for link state.  Should the AQ time out,
the driver can detect the state and avoid a link change when there
may have been none.

Change-ID: Ib2ac38407b7880750fb891b392fa77457fe6c21c
Signed-off-by: Jesse Brandeburg 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_common.c| 15 ---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c   | 11 ++-
 drivers/net/ethernet/intel/i40e/i40e_main.c  | 12 ++--
 drivers/net/ethernet/intel/i40e/i40e_prototype.h |  2 +-
 4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 7a70abc..2839ea5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2235,27 +2235,28 @@ i40e_status i40e_aq_send_driver_version(struct i40e_hw 
*hw,
 /**
  * i40e_get_link_status - get status of the HW network link
  * @hw: pointer to the hw struct
+ * @link_up: pointer to bool (true/false = linkup/linkdown)
  *
- * Returns true if link is up, false if link is down.
+ * Variable link_up true if link is up, false if link is down.
+ * The variable link_up is invalid if returned value of status != 0
  *
  * Side effect: LinkStatusEvent reporting becomes enabled
  **/
-bool i40e_get_link_status(struct i40e_hw *hw)
+i40e_status i40e_get_link_status(struct i40e_hw *hw, bool *link_up)
 {
i40e_status status = 0;
-   bool link_status = false;
 
if (hw->phy.get_link_info) {
status = i40e_aq_get_link_info(hw, true, NULL, NULL);
 
if (status)
-   goto i40e_get_link_status_exit;
+   i40e_debug(hw, I40E_DEBUG_LINK, "get link failed: 
status %d\n",
+  status);
}
 
-   link_status = hw->phy.link_info.link_info & I40E_AQ_LINK_UP;
+   *link_up = hw->phy.link_info.link_info & I40E_AQ_LINK_UP;
 
-i40e_get_link_status_exit:
-   return link_status;
+   return status;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 930369c..1fa38f6 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1516,9 +1516,18 @@ static int i40e_link_test(struct net_device *netdev, u64 
*data)
 {
struct i40e_netdev_priv *np = netdev_priv(netdev);
struct i40e_pf *pf = np->vsi->back;
+   i40e_status status;
+   bool link_up = false;
 
netif_info(pf, hw, netdev, "link test\n");
-   if (i40e_get_link_status(&pf->hw))
+   status = i40e_get_link_status(&pf->hw, &link_up);
+   if (status) {
+   netif_err(pf, drv, netdev, "link query timed out, please retry 
test\n");
+   *data = 1;
+   return *data;
+   }
+
+   if (link_up)
*data = 0;
else
*data = 1;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 1fa1eba..f205e18 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5836,15 +5836,23 @@ static void i40e_veb_link_event(struct i40e_veb *veb, 
bool link_up)
  **/
 static void i40e_link_event(struct i40e_pf *pf)
 {
-   bool new_link, old_link;
struct i40e_vsi *vsi = pf->vsi[pf->lan_vsi];
u8 new_link_speed, old_link_speed;
+   i40e_status status;
+   bool new_link, old_link;
 
/* set this to force the get_link_status call to refresh state */
pf->hw.phy.get_link_info = true;
 
old_link = (pf->hw.phy.link_info_old.link_info & I40E_AQ_LINK_UP);
-   new_link = i40e_get_link_status(&pf->hw);
+
+   status = i40e_get_link_status(&pf->hw, &new_link);
+   if (status) {
+   dev_dbg(&pf->pdev->dev, "couldn't get link state, status: %d\n",
+   status);
+   return;
+   }
+
old_link_speed = pf->hw.phy.link_info_old.link_speed;
new_link_speed = pf->hw.phy.link_info.link_speed;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h 
b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index dcb72a8..e51e156 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -258,7 +258,7 @@ i40e_status i40e_init_shared_code(struct i40e_hw *hw);
 i40e_status i40e_pf_reset(struct i40e_h

[net-next 17/18] i40evf: propagate interrupt allocation failure

2015-10-07 Thread Jeff Kirsher
From: Mitch Williams 

Lower level functions are properly reporting errors, and higher-level
functions are correctly responding to errors, but the errors aren't
actually getting through. Typically, the middle-manager function seems
to want to shield its boss from any bad news.

This change fixes a panic if the driver is unable to enable MSI-X or is
unable to acquire enough vectors.

Change-ID: Ifd5787ce92519a5d97e4b465902db930d97b71a1
Signed-off-by: Mitch Williams 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index cc78fdf..0d18446 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -1163,7 +1163,7 @@ static int i40evf_set_interrupt_capability(struct 
i40evf_adapter *adapter)
for (vector = 0; vector < v_budget; vector++)
adapter->msix_entries[vector].entry = vector;
 
-   i40evf_acquire_msix_vectors(adapter, v_budget);
+   err = i40evf_acquire_msix_vectors(adapter, v_budget);
 
 out:
adapter->netdev->real_num_tx_queues = pairs;
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 14/18] i40e/i40evf: fix unicast mac address add

2015-10-07 Thread Jeff Kirsher
From: Shannon Nelson 

When using something like "ip maddr add ..." to add another unicast mac
address to the netdev, the mac address comes into the set_rx_mode handler
in the multicast list whether it is a unicast or multicast address.
This was confusing the code when it was trying to search for addresses
that needed to be deleted from the VSI, because it was looking for the
VSI unicast address in the netdev unicast list.  The result was that a
new unicast address would get added to the VSI list and then immediately
removed, and would never actually make it down into the hardware.

This patch removes the separation from unicast and multicast in the search
for filters to be deleted.  It also simplifies the logic a little with a
jump to the bottom of the loop when an address is found.  Now it doesn't
matter which netdev list the address is hiding in, we'll check them all.

Change-ID: Ie3685a92427ae7d2212bf948919ce295bc7a874c
Signed-off-by: Shannon Nelson 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 41 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 45 -
 2 files changed, 38 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f205e18..84f9dd9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1726,36 +1726,27 @@ static void i40e_set_rx_mode(struct net_device *netdev)
 
/* remove filter if not in netdev list */
list_for_each_entry_safe(f, ftmp, &vsi->mac_filter_list, list) {
-   bool found = false;
 
if (!f->is_netdev)
continue;
 
-   if (is_multicast_ether_addr(f->macaddr)) {
-   netdev_for_each_mc_addr(mca, netdev) {
-   if (ether_addr_equal(mca->addr, f->macaddr)) {
-   found = true;
-   break;
-   }
-   }
-   } else {
-   netdev_for_each_uc_addr(uca, netdev) {
-   if (ether_addr_equal(uca->addr, f->macaddr)) {
-   found = true;
-   break;
-   }
-   }
+   netdev_for_each_mc_addr(mca, netdev)
+   if (ether_addr_equal(mca->addr, f->macaddr))
+   goto bottom_of_search_loop;
 
-   for_each_dev_addr(netdev, ha) {
-   if (ether_addr_equal(ha->addr, f->macaddr)) {
-   found = true;
-   break;
-   }
-   }
-   }
-   if (!found)
-   i40e_del_filter(
-  vsi, f->macaddr, I40E_VLAN_ANY, false, true);
+   netdev_for_each_uc_addr(uca, netdev)
+   if (ether_addr_equal(uca->addr, f->macaddr))
+   goto bottom_of_search_loop;
+
+   for_each_dev_addr(netdev, ha)
+   if (ether_addr_equal(ha->addr, f->macaddr))
+   goto bottom_of_search_loop;
+
+   /* f->macaddr wasn't found in uc, mc, or ha list so delete it */
+   i40e_del_filter(vsi, f->macaddr, I40E_VLAN_ANY, false, true);
+
+bottom_of_search_loop:
+   continue;
}
 
/* check for other flag changes */
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index f3a61c4..cc78fdf 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -855,6 +855,7 @@ static void i40evf_set_rx_mode(struct net_device *netdev)
struct i40evf_mac_filter *f, *ftmp;
struct netdev_hw_addr *uca;
struct netdev_hw_addr *mca;
+   struct netdev_hw_addr *ha;
int count = 50;
 
/* add addr if not already in the filter list */
@@ -876,29 +877,27 @@ static void i40evf_set_rx_mode(struct net_device *netdev)
}
/* remove filter if not in netdev list */
list_for_each_entry_safe(f, ftmp, &adapter->mac_filter_list, list) {
-   bool found = false;
-
-   if (is_multicast_ether_addr(f->macaddr)) {
-   netdev_for_each_mc_addr(mca, netdev) {
-   if (ether_addr_equal(mca->addr, f->macaddr)) {
-   found = true;
-   break;
-   }
-   }
-   } else {
-   netdev_for_each_uc_addr(uca, n

[net-next 02/18] i40e: enable WoL operation if config bit show WoL capable

2015-10-07 Thread Jeff Kirsher
From: Shannon Nelson 

The driver was disabling Wake-on-LAN by default and waiting for the user
to expressly turn it on.  This patch has the driver turning on WoL from
the start if enabled in the hardware config, which matches the behavior
of our other drivers.

Change-ID: I43faedb907f8ba4d1a61b72a7c86072b97af12b1
Signed-off-by: Shannon Nelson 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5646ee8..7fc5e2c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -9947,6 +9947,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
struct i40e_pf *pf;
struct i40e_hw *hw;
static u16 pfs_found;
+   u16 wol_nvm_bits;
u16 link_status;
int err = 0;
u32 len;
@@ -10163,8 +10164,12 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
clear_bit(__I40E_SERVICE_SCHED, &pf->state);
pf->flags |= I40E_FLAG_NEED_LINK_UPDATE;
 
-   /* WoL defaults to disabled */
-   pf->wol_en = false;
+   /* NVM bit on means WoL disabled for the port */
+   i40e_read_nvm_word(hw, I40E_SR_NVM_WAKE_ON_LAN, &wol_nvm_bits);
+   if ((1 << hw->port) & wol_nvm_bits || hw->partition_id != 1)
+   pf->wol_en = false;
+   else
+   pf->wol_en = true;
device_set_wakeup_enable(&pf->pdev->dev, pf->wol_en);
 
/* set up the main switch operations */
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 03/18] i40e: stop VF rings

2015-10-07 Thread Jeff Kirsher
From: Mitch Williams 

Explicitly stop the rings belonging to each VF when disabling SR-IOV.
Even though the VFs were gone, and the associated VSIs were removed, the
rings were not stopped, and in some circumstances the hardware would
continue to access the memory formerly used by the rings, causing memory
corruption or DMAR errors, both of which would lead to general malaise
of the kernel.

To relieve this condition, explicitly stop all the rings associated with
each VF before releasing its resources.

Change-ID: I78c05d562c66e7b594b7e48d67860f49b3e5b6ec
Signed-off-by: Mitch Williams 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index b148694..0545e3f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -872,6 +872,11 @@ void i40e_free_vfs(struct i40e_pf *pf)
i40e_vsi_control_rings(pf->vsi[pf->vf[i].lan_vsi_idx],
   false);
 
+   for (i = 0; i < pf->num_alloc_vfs; i++)
+   if (test_bit(I40E_VF_STAT_INIT, &pf->vf[i].vf_states))
+   i40e_vsi_control_rings(pf->vsi[pf->vf[i].lan_vsi_idx],
+  false);
+
/* Disable IOV before freeing resources. This lets any VF drivers
 * running in the host get themselves cleaned up before we yank
 * the carpet out from underneath their feet.
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/2] bpf: enable non-root eBPF programs

2015-10-07 Thread Kees Cook
On Wed, Oct 7, 2015 at 3:07 PM, Daniel Borkmann  wrote:
> On 10/07/2015 11:20 PM, Alexei Starovoitov wrote:
>>
>> On 10/6/15 5:45 AM, Daniel Borkmann wrote:
>>>
>>> Should instead something similar be adapted on bpf(2) as well? Or, would
>>> that even be more painful for application developers shipping their stuff
>>> through distros in the end (where they might then decide to just setup
>>> everything BPF-related and then drop privs)?
>>
>>
>> I think loading as root and then dropping privs won't work in many
>> cases, since apps still need to access maps even after dropping privs
>> and today it's not possible, since cap_sys_admin is tested for every
>> bpf syscall.
>
>
> Yep, maps-only would then need to be made accessible in some way.
>
>>> I'm also wondering with regards to seccomp, which could adapt to eBPF at
>>> some point and be used by unprivileged programs. Perhaps then, a single
>>> paranoia alike setting might not suit to all eBPF subsystem users. Any
>>> ideas?
>>
>>
>> There is no such paranoid sysctl for cBPF, so there is no reason to
>> add one for eBPF other than fear.
>> Adding multiple sysctl knobs for seccomp, socket, tracing is only
>> reflection of even higher fear.
>> What sysadmins suppose to do with such sysctl when kernel is kinda
>> saying 'may be something unsafe here you're on your own' ?
>> Also the presence of this sysctl_bpf_enable_unprivileged or any other
>> one doesn't help with CVEs. Any bug with security implications will
>> be a CVE regardless, so I think the better course of action is to
>> avoid introducing this sysctl.
>
>
> Yes, I agree with you that there would be a CVE regardless. I still
> like the option of configurable access, not a big fan of the sysctl
> either. Thinking out loudly, what about a Kconfig option? We started
> out like this on bpf(2) itself (initially under expert settings, now
> afaik not anymore), and depending on usage scenarios, a requirement
> could be to have immutable cap_sys_admin-only, for other use-cases a
> requirement on the kernel might instead be to have unprivileged users
> as well.

It'd be nice to have it just be a Kconfig, but this shoots
distro-users in the foot if a distro decides to include unpriv bpf and
the user doesn't want it. I think it's probably a good idea to keep
the sysctl.

-Kees

>
>> We've discussed adding something like CAP_BPF to control it,
>> but then again, do we want this because of fear of bugs or because
>> it's actually needed. I think the design of all CAP_* is to give
>> unprivileged users permissions to do something beyond normal that
>> can potentially be harmful for other users or the whole system.
>> In this case it's not the case. One user can load eBPF programs
>> and maps up to its MEMLOCK limit and they cannot interfere with
>> other users or affect the host, so CAP_BPF is not necessary either.
>
>
> Thanks,
> Daniel



-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/2] bpf: enable non-root eBPF programs

2015-10-07 Thread Daniel Borkmann

On 10/07/2015 11:20 PM, Alexei Starovoitov wrote:

On 10/6/15 5:45 AM, Daniel Borkmann wrote:

Should instead something similar be adapted on bpf(2) as well? Or, would
that even be more painful for application developers shipping their stuff
through distros in the end (where they might then decide to just setup
everything BPF-related and then drop privs)?


I think loading as root and then dropping privs won't work in many
cases, since apps still need to access maps even after dropping privs
and today it's not possible, since cap_sys_admin is tested for every
bpf syscall.


Yep, maps-only would then need to be made accessible in some way.


I'm also wondering with regards to seccomp, which could adapt to eBPF at
some point and be used by unprivileged programs. Perhaps then, a single
paranoia alike setting might not suit to all eBPF subsystem users. Any
ideas?


There is no such paranoid sysctl for cBPF, so there is no reason to
add one for eBPF other than fear.
Adding multiple sysctl knobs for seccomp, socket, tracing is only
reflection of even higher fear.
What sysadmins suppose to do with such sysctl when kernel is kinda
saying 'may be something unsafe here you're on your own' ?
Also the presence of this sysctl_bpf_enable_unprivileged or any other
one doesn't help with CVEs. Any bug with security implications will
be a CVE regardless, so I think the better course of action is to
avoid introducing this sysctl.


Yes, I agree with you that there would be a CVE regardless. I still
like the option of configurable access, not a big fan of the sysctl
either. Thinking out loudly, what about a Kconfig option? We started
out like this on bpf(2) itself (initially under expert settings, now
afaik not anymore), and depending on usage scenarios, a requirement
could be to have immutable cap_sys_admin-only, for other use-cases a
requirement on the kernel might instead be to have unprivileged users
as well.


We've discussed adding something like CAP_BPF to control it,
but then again, do we want this because of fear of bugs or because
it's actually needed. I think the design of all CAP_* is to give
unprivileged users permissions to do something beyond normal that
can potentially be harmful for other users or the whole system.
In this case it's not the case. One user can load eBPF programs
and maps up to its MEMLOCK limit and they cannot interfere with
other users or affect the host, so CAP_BPF is not necessary either.


Thanks,
Daniel
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 3/4] net: Add IPv6 support to VRF device

2015-10-07 Thread David Ahern
Add support for IPv6 to VRF device driver. Implementation parallels what
has been done for IPv4.

Signed-off-by: David Ahern 
---
 drivers/net/Kconfig |   4 +-
 drivers/net/vrf.c   | 279 +++-
 2 files changed, 281 insertions(+), 2 deletions(-)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index b9ebd0d18a52..f184fb5bd110 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -298,8 +298,10 @@ config NLMON
 
 config NET_VRF
tristate "Virtual Routing and Forwarding (Lite)"
-   depends on IP_MULTIPLE_TABLES && IPV6_MULTIPLE_TABLES
+   depends on IP_MULTIPLE_TABLES
depends on NET_L3_MASTER_DEV
+   depends on IPV6 || IPV6=n
+   depends on IPV6_MULTIPLE_TABLES || IPV6=n
---help---
  This option enables the support for mapping interfaces into VRF's. The
  support enables VRF devices.
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 64499766e00f..c6f9266c4032 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -57,6 +58,7 @@ struct slave_queue {
 struct net_vrf {
struct slave_queue  queue;
struct rtable   *rth;
+   struct rt6_info *rt6;
u32 tb_id;
 };
 
@@ -104,12 +106,56 @@ static struct dst_ops vrf_dst_ops = {
.default_advmss = vrf_default_advmss,
 };
 
+/* neighbor handling is done with actual device; do not want
+ * to flip skb->dev for those ndisc packets. This really fails
+ * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
+ * a start.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static bool check_ipv6_frame(const struct sk_buff *skb)
+{
+   const struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->data;
+   size_t hlen = sizeof(*ipv6h);
+   bool rc = true;
+
+   if (skb->len < hlen)
+   goto out;
+
+   if (ipv6h->nexthdr == NEXTHDR_ICMP) {
+   const struct icmp6hdr *icmph;
+
+   if (skb->len < hlen + sizeof(*icmph))
+   goto out;
+
+   icmph = (struct icmp6hdr *)(skb->data + sizeof(*ipv6h));
+   switch (icmph->icmp6_type) {
+   case NDISC_ROUTER_SOLICITATION:
+   case NDISC_ROUTER_ADVERTISEMENT:
+   case NDISC_NEIGHBOUR_SOLICITATION:
+   case NDISC_NEIGHBOUR_ADVERTISEMENT:
+   case NDISC_REDIRECT:
+   rc = false;
+   break;
+   }
+   }
+
+out:
+   return rc;
+}
+#else
+static bool check_ipv6_frame(const struct sk_buff *skb)
+{
+   return false;
+}
+#endif
+
 static bool is_ip_rx_frame(struct sk_buff *skb)
 {
switch (skb->protocol) {
case htons(ETH_P_IP):
-   case htons(ETH_P_IPV6):
return true;
+   case htons(ETH_P_IPV6):
+   return check_ipv6_frame(skb);
}
return false;
 }
@@ -169,12 +215,52 @@ static struct rtnl_link_stats64 *vrf_get_stats64(struct 
net_device *dev,
return stats;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
+  struct net_device *dev)
+{
+   const struct ipv6hdr *iph = ipv6_hdr(skb);
+   struct net *net = dev_net(skb->dev);
+   struct flowi6 fl6 = {
+   /* needed to match OIF rule */
+   .flowi6_oif = dev->ifindex,
+   .flowi6_iif = LOOPBACK_IFINDEX,
+   .daddr = iph->daddr,
+   .saddr = iph->saddr,
+   .flowlabel = ip6_flowinfo(iph),
+   .flowi6_mark = skb->mark,
+   .flowi6_proto = iph->nexthdr,
+   .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF,
+   };
+   int ret = NET_XMIT_DROP;
+   struct dst_entry *dst;
+
+   dst = ip6_route_output(net, NULL, &fl6);
+   if (dst == (struct dst_entry *)net->ipv6.ip6_null_entry)
+   goto err;
+
+   skb_dst_drop(skb);
+   skb_dst_set(skb, dst);
+
+   ret = ip6_local_out(skb);
+   if (unlikely(net_xmit_eval(ret)))
+   dev->stats.tx_errors++;
+   else
+   ret = NET_XMIT_SUCCESS;
+
+   return ret;
+err:
+   vrf_tx_error(dev, skb);
+   return NET_XMIT_DROP;
+}
+#else
 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
   struct net_device *dev)
 {
vrf_tx_error(dev, skb);
return NET_XMIT_DROP;
 }
+#endif
 
 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4,
struct net_device *vrf_dev)
@@ -269,6 +355,164 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct 
net_device *dev)
return ret;
 }
 
+#if IS_ENABLED(CONFIG_IPV6)
+static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie)
+{
+   return dst;
+}
+
+static in

[PATCH net-next 0/4 v2] net: VRF support in IPv6 stack

2015-10-07 Thread David Ahern
Initial support for VRF in IPv6 stack. Makes IPv6 functionality
on par with IPv4 -- ping, tcp client/server and udp client/server
all work fine. tcpdump on vrf device and external tap (e.g., host
side tap device) shows all packets with proper addresses. IPv6
does not need the source address operation like IPv4. Verified
vti6 works properly in my setup as does use of an IPv6 address
on the VRF device.

v2
- fixed CONFIG_IPV6 dependency as questioned by Cong
  - if IPV6 is a module, kbuild ensures VRF is a module
  - if IPV6 is disabled IPV6 functionality is compiled out of VRF module

- addressed comments from Nik over IRC
  - removed duplicate call to netif_is_l3_master in l3mdev_rt6_dst_by_oif
  - changed allocation flag from GFP_ATOMIC to GFP_KERNEL since it is init time
  - added free of rt6i_pcpu
  - check_ipv6_frame returns false only if packet is NDISC type

David Ahern (4):
  net: Add IPv6 support to l3mdev
  net: Export fib6_get_table and nd_tbl
  net: Add IPv6 support to VRF device
  net: Add VRF support to IPv6 stack

 drivers/net/Kconfig   |   4 +-
 drivers/net/vrf.c | 279 +-
 include/net/l3mdev.h  |  46 +
 net/ipv6/addrconf.c   |  12 ++-
 net/ipv6/icmp.c   |   6 +-
 net/ipv6/ip6_fib.c|   1 +
 net/ipv6/ip6_output.c |   6 +-
 net/ipv6/ndisc.c  |  27 -
 net/ipv6/route.c  |  22 +++-
 9 files changed, 387 insertions(+), 16 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 1/4] net: Add IPv6 support to l3mdev

2015-10-07 Thread David Ahern
Add operations to retrieve cached IPv6 dst entry from l3mdev device
and lookup IPv6 source address.

Signed-off-by: David Ahern 
---
 include/net/l3mdev.h | 46 ++
 1 file changed, 46 insertions(+)

diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index 44a19a171104..774d85b2d5d9 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -19,14 +19,22 @@
  * @l3mdev_get_rtable: Get cached IPv4 rtable (dst_entry) for device
  *
  * @l3mdev_get_saddr: Get source address for a flow
+ *
+ * @l3mdev_get_rt6_dst: Get cached IPv6 rt6_info (dst_entry) for device
  */
 
 struct l3mdev_ops {
u32 (*l3mdev_fib_table)(const struct net_device *dev);
+
+   /* IPv4 ops */
struct rtable * (*l3mdev_get_rtable)(const struct net_device *dev,
 const struct flowi4 *fl4);
void(*l3mdev_get_saddr)(struct net_device *dev,
struct flowi4 *fl4);
+
+   /* IPv6 ops */
+   struct dst_entry * (*l3mdev_get_rt6_dst)(const struct net_device *dev,
+const struct flowi6 *fl6);
 };
 
 #ifdef CONFIG_NET_L3_MASTER_DEV
@@ -123,6 +131,31 @@ static inline void l3mdev_get_saddr(struct net *net, int 
ifindex,
}
 }
 
+static inline struct dst_entry *l3mdev_get_rt6_dst(const struct net_device 
*dev,
+  const struct flowi6 *fl6)
+{
+   if (netif_is_l3_master(dev) && dev->l3mdev_ops->l3mdev_get_rt6_dst)
+   return dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6);
+
+   return NULL;
+}
+
+static inline
+struct dst_entry *l3mdev_rt6_dst_by_oif(struct net *net,
+   const struct flowi6 *fl6)
+{
+   struct dst_entry *dst = NULL;
+   struct net_device *dev;
+
+   dev = dev_get_by_index(net, fl6->flowi6_oif);
+   if (dev) {
+   dst = l3mdev_get_rt6_dst(dev, fl6);
+   dev_put(dev);
+   }
+
+   return dst;
+}
+
 #else
 
 static inline int l3mdev_master_ifindex_rcu(struct net_device *dev)
@@ -171,6 +204,19 @@ static inline void l3mdev_get_saddr(struct net *net, int 
ifindex,
struct flowi4 *fl4)
 {
 }
+
+static inline
+struct dst_entry *l3mdev_get_rt6_dst(const struct net_device *dev,
+const struct flowi6 *fl6)
+{
+   return NULL;
+}
+static inline
+struct dst_entry *l3mdev_rt6_dst_by_oif(struct net *net,
+   const struct flowi6 *fl6)
+{
+   return NULL;
+}
 #endif
 
 #endif /* _NET_L3MDEV_H_ */
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 2/4] net: Export fib6_get_table and nd_tbl

2015-10-07 Thread David Ahern
Signed-off-by: David Ahern 
---
 net/ipv6/ip6_fib.c | 1 +
 net/ipv6/ndisc.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 7d2e0023c72d..09fddf70cca4 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -264,6 +264,7 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id)
 
return NULL;
 }
+EXPORT_SYMBOL_GPL(fib6_get_table);
 
 static void __net_init fib6_tables_init(struct net *net)
 {
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7089c305245c..b3292db05198 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -147,6 +147,7 @@ struct neigh_table nd_tbl = {
.gc_thresh2 =512,
.gc_thresh3 =   1024,
 };
+EXPORT_SYMBOL_GPL(nd_tbl);
 
 static void ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data)
 {
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 13/16] ipvlan: Cache net in ipvlan_process_v4_outbound and ipvlan_process_v6_outbound

2015-10-07 Thread Eric W. Biederman
Compute net once in ipvlan_process_v4_outbound and
ipvlan_process_v6_outbound and store it in a variable so that net does
not need to be recomputed next time it is used.

Signed-off-by: "Eric W. Biederman" 
---
 drivers/net/ipvlan/ipvlan_core.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 75dcf36c0366..976f30b291f6 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -344,6 +344,7 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb)
 {
const struct iphdr *ip4h = ip_hdr(skb);
struct net_device *dev = skb->dev;
+   struct net *net = dev_net(dev);
struct rtable *rt;
int err, ret = NET_XMIT_DROP;
struct flowi4 fl4 = {
@@ -354,7 +355,7 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb)
.saddr = ip4h->saddr,
};
 
-   rt = ip_route_output_flow(dev_net(dev), &fl4, NULL);
+   rt = ip_route_output_flow(net, &fl4, NULL);
if (IS_ERR(rt))
goto err;
 
@@ -381,6 +382,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
 {
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct net_device *dev = skb->dev;
+   struct net *net = dev_net(dev);
struct dst_entry *dst;
int err, ret = NET_XMIT_DROP;
struct flowi6 fl6 = {
@@ -393,7 +395,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
.flowi6_proto = ip6h->nexthdr,
};
 
-   dst = ip6_route_output(dev_net(dev), NULL, &fl6);
+   dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
ret = dst->error;
dst_release(dst);
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 4/4] net: Add VRF support to IPv6 stack

2015-10-07 Thread David Ahern
As with IPv4, support for VRFs is added to the IPv6 stack by replacing hardcoded
table ids with possibly device specific ones and manipulating the oif in
the flowi6. The flow flags are used to skip oif compare in nexthop lookups
if the device is enslaved to a VRF via the L3 master device.

Signed-off-by: David Ahern 
---
 net/ipv6/addrconf.c   | 12 +---
 net/ipv6/icmp.c   |  6 +-
 net/ipv6/ip6_output.c |  6 --
 net/ipv6/ndisc.c  | 26 +++---
 net/ipv6/route.c  | 22 +-
 5 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b2d6f2fc0fbd..e07b1fb52131 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -81,6 +81,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2149,7 +2150,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, 
struct net_device *dev,
  unsigned long expires, u32 flags)
 {
struct fib6_config cfg = {
-   .fc_table = RT6_TABLE_PREFIX,
+   .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
.fc_metric = IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
@@ -2182,8 +2183,9 @@ static struct rt6_info *addrconf_get_prefix_route(const 
struct in6_addr *pfx,
struct fib6_node *fn;
struct rt6_info *rt = NULL;
struct fib6_table *table;
+   u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
 
-   table = fib6_get_table(dev_net(dev), RT6_TABLE_PREFIX);
+   table = fib6_get_table(dev_net(dev), tb_id);
if (!table)
return NULL;
 
@@ -2214,7 +2216,7 @@ static struct rt6_info *addrconf_get_prefix_route(const 
struct in6_addr *pfx,
 static void addrconf_add_mroute(struct net_device *dev)
 {
struct fib6_config cfg = {
-   .fc_table = RT6_TABLE_LOCAL,
+   .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_LOCAL,
.fc_metric = IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_dst_len = 8,
@@ -3035,6 +3037,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, 
bool prefix_route)
 {
struct in6_addr addr;
 
+   /* no link local addresses on L3 master devices */
+   if (netif_is_l3_master(idev->dev))
+   return;
+
ipv6_addr_set(&addr, htonl(0xFE80), 0, 0, 0);
 
if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) {
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 6c2b2132c8d3..efb1c00f2270 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -496,6 +497,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 
code, __u32 info)
else if (!fl6.flowi6_oif)
fl6.flowi6_oif = np->ucast_oif;
 
+   if (!fl6.flowi6_oif)
+   fl6.flowi6_oif = l3mdev_master_ifindex(skb->dev);
+
dst = icmpv6_route_lookup(net, skb, sk, &fl6);
if (IS_ERR(dst))
goto out;
@@ -575,7 +579,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
fl6.daddr = ipv6_hdr(skb)->saddr;
if (saddr)
fl6.saddr = *saddr;
-   fl6.flowi6_oif = skb->dev->ifindex;
+   fl6.flowi6_oif = l3mdev_fib_oif(skb->dev);
fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
fl6.flowi6_mark = mark;
security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index caf7d14a1bdd..527870ccf5c1 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff 
*skb)
 {
@@ -886,7 +887,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 #ifdef CONFIG_IPV6_SUBTREES
ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 #endif
-   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
+  (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
+ (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
dst_release(dst);
dst = NULL;
}
@@ -1038,7 +1040,7 @@ struct dst_entry *ip6_dst_lookup_flow(const struct sock 
*sk, struct flowi6 *fl6,
if (final_dst)
fl6->daddr = *final_dst;
if (!fl6->flowi6_oif)
-   fl6->flowi6_oif = dst->dev->ifindex;
+   fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
 
return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 
0);
 }
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b3292db05198..0b85242d8469 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -67,6 +67,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -442,8 +443,11 @@ static void ndisc_send_skb(struct sk_buff 

[PATCH net-next 11/16] ipv4: Cache net in ip_build_and_send_pkt and ip_queue_xmit

2015-10-07 Thread Eric W. Biederman
Compute net and store it in a variable in the functions
ip_build_and_send_pkt and ip_queue_xmit so that it does not need to be
recomputed next time it is needed.

Signed-off-by: "Eric W. Biederman" 
---
 net/ipv4/ip_output.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 10366ee03bec..a7012f2fa68a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -139,6 +139,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct 
sock *sk,
 {
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
+   struct net *net = sock_net(sk);
struct iphdr *iph;
 
/* Build the IP header. */
@@ -157,7 +158,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct 
sock *sk,
iph->id = 0;
} else {
iph->frag_off = 0;
-   __ip_select_ident(sock_net(sk), iph, 1);
+   __ip_select_ident(net, iph, 1);
}
 
if (opt && opt->opt.optlen) {
@@ -382,6 +383,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct 
flowi4 *fl4)
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
 {
struct inet_sock *inet = inet_sk(sk);
+   struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
@@ -412,7 +414,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, 
struct flowi *fl)
 * keep trying until route appears or the connection times
 * itself out.
 */
-   rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+   rt = ip_route_output_ports(net, fl4, sk,
   daddr, inet->inet_saddr,
   inet->inet_dport,
   inet->inet_sport,
@@ -449,7 +451,7 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
 
-   ip_select_ident_segs(sock_net(sk), skb, sk,
+   ip_select_ident_segs(net, skb, sk,
 skb_shinfo(skb)->gso_segs ?: 1);
 
/* TODO : should we use skb->sk here instead of sk ? */
@@ -462,7 +464,7 @@ packet_routed:
 
 no_route:
rcu_read_unlock();
-   IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+   IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
 }
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 15/16] ipv4,ipv6: Pass net into ip_local_out and ip6_local_out

2015-10-07 Thread Eric W. Biederman
Signed-off-by: "Eric W. Biederman" 
---
 drivers/net/ipvlan/ipvlan_core.c| 4 ++--
 drivers/net/ppp/pptp.c  | 2 +-
 drivers/net/vrf.c   | 4 ++--
 include/net/ip.h| 2 +-
 include/net/ip6_tunnel.h| 2 +-
 include/net/ipv6.h  | 2 +-
 net/ipv4/igmp.c | 4 ++--
 net/ipv4/ip_output.c| 9 -
 net/ipv4/ip_tunnel_core.c   | 2 +-
 net/ipv4/netfilter/ipt_SYNPROXY.c   | 2 +-
 net/ipv4/netfilter/nf_dup_ipv4.c| 2 +-
 net/ipv4/netfilter/nf_reject_ipv4.c | 2 +-
 net/ipv6/ip6_output.c   | 2 +-
 net/ipv6/netfilter/ip6t_SYNPROXY.c  | 2 +-
 net/ipv6/netfilter/nf_dup_ipv6.c| 2 +-
 net/ipv6/netfilter/nf_reject_ipv6.c | 2 +-
 net/ipv6/output_core.c  | 3 +--
 net/netfilter/ipvs/ip_vs_xmit.c | 4 ++--
 18 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 976f30b291f6..24f8dbcf854f 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -365,7 +365,7 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb)
}
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
-   err = ip_local_out(skb->sk, skb);
+   err = ip_local_out(net, skb->sk, skb);
if (unlikely(net_xmit_eval(err)))
dev->stats.tx_errors++;
else
@@ -403,7 +403,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
}
skb_dst_drop(skb);
skb_dst_set(skb, dst);
-   err = ip6_local_out(skb->sk, skb);
+   err = ip6_local_out(net, skb->sk, skb);
if (unlikely(net_xmit_eval(err)))
dev->stats.tx_errors++;
else
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 5243ab6ed4d4..fc69e41d0950 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -283,7 +283,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct 
sk_buff *skb)
ip_select_ident(net, skb, NULL);
ip_send_check(iph);
 
-   ip_local_out(skb->sk, skb);
+   ip_local_out(net, skb->sk, skb);
return 1;
 
 tx_error:
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index b27dc11cd3f2..21bb7deb6d58 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -76,7 +76,7 @@ static struct dst_entry *vrf_ip_check(struct dst_entry *dst, 
u32 cookie)
 
 static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff 
*skb)
 {
-   return ip_local_out(sk, skb);
+   return ip_local_out(net, sk, skb);
 }
 
 static unsigned int vrf_v4_mtu(const struct dst_entry *dst)
@@ -222,7 +222,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff 
*skb,
   RT_SCOPE_LINK);
}
 
-   ret = ip_local_out(skb->sk, skb);
+   ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
if (unlikely(net_xmit_eval(ret)))
vrf_dev->stats.tx_errors++;
else
diff --git a/include/net/ip.h b/include/net/ip.h
index 34b40381fb9b..7febbab784cd 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -113,7 +113,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct 
sk_buff *skb,
   int (*output)(struct net *, struct sock *, struct sk_buff 
*));
 void ip_send_check(struct iphdr *ip);
 int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
-int ip_local_out(struct sock *sk, struct sk_buff *skb);
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
 void ip_init(void);
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 8f18a8b126e9..aaee6fa02cf1 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -87,7 +87,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct 
sk_buff *skb,
int pkt_len, err;
 
pkt_len = skb->len - skb_inner_network_offset(skb);
-   err = ip6_local_out(sk, skb);
+   err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb);
 
if (net_xmit_eval(err) == 0) {
struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 42834039cf20..fce8120c2be3 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -866,7 +866,7 @@ int ip6_input(struct sk_buff *skb);
 int ip6_mc_input(struct sk_buff *skb);
 
 int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
-int ip6_local_out(struct sock *sk, struct sk_buff *skb);
+int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 
 /*
  * Extension header (options) processing
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 43375d9e02ab..64aaf3522a59 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
 
pig->csum = ip_compute_cs

[PATCH net-next 12/16] ppp: Cache net in pptp_xmit

2015-10-07 Thread Eric W. Biederman
Compute net and store it in a variable in pptp_xmit, so that the value
can be reused the next time it is needed.

Signed-off-by: "Eric W. Biederman" 
---
 drivers/net/ppp/pptp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 6bef7be10671..5243ab6ed4d4 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -169,6 +169,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct 
sk_buff *skb)
 {
struct sock *sk = (struct sock *) chan->private;
struct pppox_sock *po = pppox_sk(sk);
+   struct net *net = sock_net(sk);
struct pptp_opt *opt = &po->proto.pptp;
struct pptp_gre_header *hdr;
unsigned int header_len = sizeof(*hdr);
@@ -187,7 +188,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct 
sk_buff *skb)
if (sk_pppox(po)->sk_state & PPPOX_DEAD)
goto tx_error;
 
-   rt = ip_route_output_ports(sock_net(sk), &fl4, NULL,
+   rt = ip_route_output_ports(net, &fl4, NULL,
   opt->dst_addr.sin_addr.s_addr,
   opt->src_addr.sin_addr.s_addr,
   0, 0, IPPROTO_GRE,
@@ -279,7 +280,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct 
sk_buff *skb)
nf_reset(skb);
 
skb->ip_summed = CHECKSUM_NONE;
-   ip_select_ident(sock_net(sk), skb, NULL);
+   ip_select_ident(net, skb, NULL);
ip_send_check(iph);
 
ip_local_out(skb->sk, skb);
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 14/16] ipv4,ipv6: Pass net into __ip_local_out and __ip6_local_out

2015-10-07 Thread Eric W. Biederman
Signed-off-by: "Eric W. Biederman" 
---
 drivers/net/vrf.c  | 2 +-
 include/net/dst_ops.h  | 3 ++-
 include/net/ip.h   | 2 +-
 include/net/ipv6.h | 2 +-
 net/ipv4/ip_output.c   | 5 ++---
 net/ipv6/output_core.c | 5 ++---
 net/xfrm/xfrm_output.c | 2 +-
 7 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 231f9d85d4eb..b27dc11cd3f2 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -74,7 +74,7 @@ static struct dst_entry *vrf_ip_check(struct dst_entry *dst, 
u32 cookie)
return dst;
 }
 
-static int vrf_ip_local_out(struct sock *sk, struct sk_buff *skb)
+static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff 
*skb)
 {
return ip_local_out(sk, skb);
 }
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 3f26a6af444e..a0d443ca16fc 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -9,6 +9,7 @@ struct kmem_cachep;
 struct net_device;
 struct sk_buff;
 struct sock;
+struct net;
 
 struct dst_ops {
unsigned short  family;
@@ -28,7 +29,7 @@ struct dst_ops {
   struct sk_buff *skb, u32 mtu);
void(*redirect)(struct dst_entry *dst, struct sock 
*sk,
struct sk_buff *skb);
-   int (*local_out)(struct sock *sk, struct sk_buff 
*skb);
+   int (*local_out)(struct net *net, struct sock *sk, 
struct sk_buff *skb);
struct neighbour *  (*neigh_lookup)(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
diff --git a/include/net/ip.h b/include/net/ip.h
index 03e80f936847..34b40381fb9b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -112,7 +112,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
   int (*output)(struct net *, struct sock *, struct sk_buff 
*));
 void ip_send_check(struct iphdr *ip);
-int __ip_local_out(struct sock *sk, struct sk_buff *skb);
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_local_out(struct sock *sk, struct sk_buff *skb);
 
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 30eb1821c184..42834039cf20 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -865,7 +865,7 @@ int ip6_forward(struct sk_buff *skb);
 int ip6_input(struct sk_buff *skb);
 int ip6_mc_input(struct sk_buff *skb);
 
-int __ip6_local_out(struct sock *sk, struct sk_buff *skb);
+int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip6_local_out(struct sock *sk, struct sk_buff *skb);
 
 /*
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index a7012f2fa68a..39d3fbe66c68 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -96,9 +96,8 @@ void ip_send_check(struct iphdr *iph)
 }
 EXPORT_SYMBOL(ip_send_check);
 
-int __ip_local_out(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-   struct net *net = dev_net(skb_dst(skb)->dev);
struct iphdr *iph = ip_hdr(skb);
 
iph->tot_len = htons(skb->len);
@@ -113,7 +112,7 @@ int ip_local_out(struct sock *sk, struct sk_buff *skb)
struct net *net = dev_net(skb_dst(skb)->dev);
int err;
 
-   err = __ip_local_out(sk, skb);
+   err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
err = dst_output(net, sk, skb);
 
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 12855811c6a0..7f64d67b637d 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -138,9 +138,8 @@ int ip6_dst_hoplimit(struct dst_entry *dst)
 EXPORT_SYMBOL(ip6_dst_hoplimit);
 #endif
 
-int __ip6_local_out(struct sock *sk, struct sk_buff *skb)
+int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-   struct net *net = dev_net(skb_dst(skb)->dev);
int len;
 
len = skb->len - sizeof(struct ipv6hdr);
@@ -160,7 +159,7 @@ int ip6_local_out(struct sock *sk, struct sk_buff *skb)
struct net *net = dev_net(skb_dst(skb)->dev);
int err;
 
-   err = __ip6_local_out(sk, skb);
+   err = __ip6_local_out(net, sk, skb);
if (likely(err == 1))
err = dst_output(net, sk, skb);
 
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index a7a254fe7985..cc3676eb6239 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -136,7 +136,7 @@ int xfrm_output_resume(struct sk_buff *skb, int err)
while (likely((err = xfrm_output_one(skb, err)) == 0)) {
nf_reset(skb);
 
-   err = skb_dst(skb)->ops->local_out(skb->sk, skb);
+   err = skb_d

[PATCH net-next 16/16] dst: Pass net into dst->output

2015-10-07 Thread Eric W. Biederman
The network namespace is already passed into dst_output; pass it into
dst->output, lwt->output and friends.

Signed-off-by: "Eric W. Biederman" 
---
 drivers/net/vrf.c|  3 +--
 include/net/dst.h|  8 
 include/net/ip.h |  4 ++--
 include/net/ipv6.h   |  2 +-
 include/net/lwtunnel.h   |  8 
 include/net/xfrm.h   |  6 +++---
 net/core/dst.c   | 14 +++---
 net/core/lwtunnel.c  |  4 ++--
 net/decnet/dn_route.c|  6 +++---
 net/ipv4/ip_output.c |  6 ++
 net/ipv4/route.c |  4 ++--
 net/ipv4/xfrm4_output.c  |  4 +---
 net/ipv6/ila.c   |  4 ++--
 net/ipv6/ip6_output.c|  3 +--
 net/ipv6/route.c | 14 +++---
 net/ipv6/xfrm6_output.c  |  4 +---
 net/mpls/mpls_iptunnel.c |  2 +-
 net/xfrm/xfrm_policy.c   |  2 +-
 18 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 21bb7deb6d58..191579aeab16 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -312,10 +312,9 @@ err:
return ret;
 }
 
-static int vrf_output(struct sock *sk, struct sk_buff *skb)
+static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
struct net_device *dev = skb_dst(skb)->dev;
-   struct net *net = dev_net(dev);
 
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
 
diff --git a/include/net/dst.h b/include/net/dst.h
index fdd01fed1a7b..1279f9b09791 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -45,7 +45,7 @@ struct dst_entry {
void*__pad1;
 #endif
int (*input)(struct sk_buff *);
-   int (*output)(struct sock *sk, struct sk_buff *skb);
+   int (*output)(struct net *net, struct sock *sk, 
struct sk_buff *skb);
 
unsigned short  flags;
 #define DST_HOST   0x0001
@@ -365,10 +365,10 @@ static inline void skb_tunnel_rx(struct sk_buff *skb, 
struct net_device *dev,
__skb_tunnel_rx(skb, dev, net);
 }
 
-int dst_discard_sk(struct sock *sk, struct sk_buff *skb);
+int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
 static inline int dst_discard(struct sk_buff *skb)
 {
-   return dst_discard_sk(skb->sk, skb);
+   return dst_discard_out(&init_net, skb->sk, skb);
 }
 void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref,
int initial_obsolete, unsigned short flags);
@@ -456,7 +456,7 @@ static inline void dst_set_expires(struct dst_entry *dst, 
int timeout)
 /* Output packet to network from transport.  */
 static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff 
*skb)
 {
-   return skb_dst(skb)->output(sk, skb);
+   return skb_dst(skb)->output(net, sk, skb);
 }
 
 /* Input packet from network to transport.  */
diff --git a/include/net/ip.h b/include/net/ip.h
index 7febbab784cd..3c904a28d5e5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -107,8 +107,8 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, 
struct packet_type *pt,
   struct net_device *orig_dev);
 int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
-int ip_output(struct sock *sk, struct sk_buff *skb);
-int ip_mc_output(struct sock *sk, struct sk_buff *skb);
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
   int (*output)(struct net *, struct sock *, struct sk_buff 
*));
 void ip_send_check(struct iphdr *ip);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index fce8120c2be3..e1a10b0ac0b0 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -860,7 +860,7 @@ struct dst_entry *ip6_blackhole_route(struct net *net,
  * skb processing functions
  */
 
-int ip6_output(struct sock *sk, struct sk_buff *skb);
+int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int ip6_forward(struct sk_buff *skb);
 int ip6_input(struct sk_buff *skb);
 int ip6_mc_input(struct sk_buff *skb);
diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index fce0e35e74d0..66350ce3e955 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -18,7 +18,7 @@ struct lwtunnel_state {
__u16   type;
__u16   flags;
atomic_trefcnt;
-   int (*orig_output)(struct sock *sk, struct sk_buff *skb);
+   int (*orig_output)(struct net *net, struct sock *sk, struct 
sk_buff *skb);
int (*orig_input)(struct sk_buff *);
int len;
__u8data[0];
@@ -28,7 +28,7 @@ struct lwtunnel_encap_ops {
int (*build_state)(struct net_device *dev, struct nlattr *encap,
   unsigned int family, const void *cfg,
   struct lwtunnel_stat

[PATCH net-next 10/16] ipv4: Cache net in iptunnel_xmit

2015-10-07 Thread Eric W. Biederman
Store net in a variable in iptunnel_xmit so it does not need
to be recomputed when it is used again.

Signed-off-by: "Eric W. Biederman" 
---
 net/ipv4/ip_tunnel_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 8d85ecd1ced5..caef8e2c281d 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -53,6 +53,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct 
sk_buff *skb,
  __u8 tos, __u8 ttl, __be16 df, bool xnet)
 {
int pkt_len = skb->len - skb_inner_network_offset(skb);
+   struct net *net = dev_net(rt->dst.dev);
struct iphdr *iph;
int err;
 
@@ -76,8 +77,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct 
sk_buff *skb,
iph->daddr  =   dst;
iph->saddr  =   src;
iph->ttl=   ttl;
-   __ip_select_ident(dev_net(rt->dst.dev), iph,
- skb_shinfo(skb)->gso_segs ?: 1);
+   __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
 
err = ip_local_out(sk, skb);
if (unlikely(net_xmit_eval(err)))
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 08/16] ipv6: Merge __ip6_local_out and __ip6_local_out_sk

2015-10-07 Thread Eric W. Biederman
Only __ip6_local_out_sk has callers so rename __ip6_local_out_sk
to __ip6_local_out and remove the previous __ip6_local_out.

Signed-off-by: "Eric W. Biederman" 
---
 include/net/ipv6.h  | 3 +--
 net/ipv6/output_core.c  | 9 ++---
 net/ipv6/route.c| 2 +-
 net/ipv6/xfrm6_policy.c | 2 +-
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 56920262dbe9..be7e7689514b 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -865,8 +865,7 @@ int ip6_forward(struct sk_buff *skb);
 int ip6_input(struct sk_buff *skb);
 int ip6_mc_input(struct sk_buff *skb);
 
-int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb);
-int __ip6_local_out(struct sk_buff *skb);
+int __ip6_local_out(struct sock *sk, struct sk_buff *skb);
 int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb);
 int ip6_local_out(struct sk_buff *skb);
 
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index e5affb5fe095..f93ae1515387 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -138,7 +138,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst)
 EXPORT_SYMBOL(ip6_dst_hoplimit);
 #endif
 
-int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip6_local_out(struct sock *sk, struct sk_buff *skb)
 {
struct net *net = dev_net(skb_dst(skb)->dev);
int len;
@@ -153,11 +153,6 @@ int __ip6_local_out_sk(struct sock *sk, struct sk_buff 
*skb)
   net, sk, skb, NULL, skb_dst(skb)->dev,
   dst_output);
 }
-
-int __ip6_local_out(struct sk_buff *skb)
-{
-   return __ip6_local_out_sk(skb->sk, skb);
-}
 EXPORT_SYMBOL_GPL(__ip6_local_out);
 
 int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
@@ -165,7 +160,7 @@ int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
struct net *net = dev_net(skb_dst(skb)->dev);
int err;
 
-   err = __ip6_local_out_sk(sk, skb);
+   err = __ip6_local_out(sk, skb);
if (likely(err == 1))
err = dst_output(net, sk, skb);
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b62a507cc1a5..d3d946773a3e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -226,7 +226,7 @@ static struct dst_ops ip6_dst_ops_template = {
.link_failure   =   ip6_link_failure,
.update_pmtu=   ip6_rt_update_pmtu,
.redirect   =   rt6_do_redirect,
-   .local_out  =   __ip6_local_out_sk,
+   .local_out  =   __ip6_local_out,
.neigh_lookup   =   ip6_neigh_lookup,
 };
 
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index f7876830f263..08c9c93f3527 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -285,7 +285,7 @@ static struct dst_ops xfrm6_dst_ops = {
.cow_metrics =  dst_cow_metrics_generic,
.destroy =  xfrm6_dst_destroy,
.ifdown =   xfrm6_dst_ifdown,
-   .local_out =__ip6_local_out_sk,
+   .local_out =__ip6_local_out,
.gc_thresh =32768,
 };
 
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 09/16] ipv6: Merge ip6_local_out and ip6_local_out_sk

2015-10-07 Thread Eric W. Biederman
Stop hiding the sk parameter with an inline helper function and make
all of the callers pass it, so that it is clear what the function is
doing.

Signed-off-by: "Eric W. Biederman" 
---
 drivers/net/ipvlan/ipvlan_core.c| 2 +-
 include/net/ip6_tunnel.h| 2 +-
 include/net/ipv6.h  | 3 +--
 net/ipv6/ip6_output.c   | 2 +-
 net/ipv6/netfilter/ip6t_SYNPROXY.c  | 2 +-
 net/ipv6/netfilter/nf_dup_ipv6.c| 2 +-
 net/ipv6/netfilter/nf_reject_ipv6.c | 2 +-
 net/ipv6/output_core.c  | 8 +---
 net/netfilter/ipvs/ip_vs_xmit.c | 2 +-
 9 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index c75ad39c752f..75dcf36c0366 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -401,7 +401,7 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb)
}
skb_dst_drop(skb);
skb_dst_set(skb, dst);
-   err = ip6_local_out(skb);
+   err = ip6_local_out(skb->sk, skb);
if (unlikely(net_xmit_eval(err)))
dev->stats.tx_errors++;
else
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index fa915fa0f703..8f18a8b126e9 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -87,7 +87,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct 
sk_buff *skb,
int pkt_len, err;
 
pkt_len = skb->len - skb_inner_network_offset(skb);
-   err = ip6_local_out_sk(sk, skb);
+   err = ip6_local_out(sk, skb);
 
if (net_xmit_eval(err) == 0) {
struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index be7e7689514b..30eb1821c184 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -866,8 +866,7 @@ int ip6_input(struct sk_buff *skb);
 int ip6_mc_input(struct sk_buff *skb);
 
 int __ip6_local_out(struct sock *sk, struct sk_buff *skb);
-int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb);
-int ip6_local_out(struct sk_buff *skb);
+int ip6_local_out(struct sock *sk, struct sk_buff *skb);
 
 /*
  * Extension header (options) processing
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0171e762e03c..31c686b7fcc0 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1692,7 +1692,7 @@ int ip6_send_skb(struct sk_buff *skb)
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
int err;
 
-   err = ip6_local_out(skb);
+   err = ip6_local_out(skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c 
b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index c2356602158a..c38c3411150b 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -76,7 +76,7 @@ synproxy_send_tcp(const struct synproxy_net *snet,
nf_conntrack_get(nfct);
}
 
-   ip6_local_out(nskb);
+   ip6_local_out(nskb->sk, nskb);
return;
 
 free_nskb:
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index ee0d9a5b16c3..64f3fe5e2719 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -68,7 +68,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, 
unsigned int hooknum,
}
if (nf_dup_ipv6_route(net, skb, gw, oif)) {
__this_cpu_write(nf_skb_duplicated, true);
-   ip6_local_out(skb);
+   ip6_local_out(skb->sk, skb);
__this_cpu_write(nf_skb_duplicated, false);
} else {
kfree_skb(skb);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c 
b/net/ipv6/netfilter/nf_reject_ipv6.c
index 94b4c6dfb400..a4f73e235ca5 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -206,7 +206,7 @@ void nf_send_reset6(struct net *net, struct sk_buff 
*oldskb, int hook)
dev_queue_xmit(nskb);
} else
 #endif
-   ip6_local_out(nskb);
+   ip6_local_out(nskb->sk, nskb);
 }
 EXPORT_SYMBOL_GPL(nf_send_reset6);
 
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index f93ae1515387..12855811c6a0 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -155,7 +155,7 @@ int __ip6_local_out(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(__ip6_local_out);
 
-int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip6_local_out(struct sock *sk, struct sk_buff *skb)
 {
struct net *net = dev_net(skb_dst(skb)->dev);
int err;
@@ -166,10 +166,4 @@ int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
 
return err;
 }
-EXPORT_SYMBOL_GPL(ip6_local_out_sk);
-
-int ip6_local_out(struct sk_buff *skb)
-{
-   return ip6_local_out_sk(skb->sk, skb);
-}
 EXPORT_SYMBOL_GPL(ip6_local_out);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c

[PATCH net-next 06/16] ipv4: Merge __ip_local_out and __ip_local_out_sk

2015-10-07 Thread Eric W. Biederman
Signed-off-by: "Eric W. Biederman" 
---
 include/net/ip.h| 3 +--
 net/ipv4/ip_output.c| 9 ++---
 net/ipv4/route.c| 2 +-
 net/ipv4/xfrm4_policy.c | 2 +-
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index ea1f721f7224..46272e04f3b6 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -112,8 +112,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
   int (*output)(struct net *, struct sock *, struct sk_buff 
*));
 void ip_send_check(struct iphdr *ip);
-int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb);
-int __ip_local_out(struct sk_buff *skb);
+int __ip_local_out(struct sock *sk, struct sk_buff *skb);
 int ip_local_out_sk(struct sock *sk, struct sk_buff *skb);
 static inline int ip_local_out(struct sk_buff *skb)
 {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c38dfd7404fb..66c627b85a91 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -96,7 +96,7 @@ void ip_send_check(struct iphdr *iph)
 }
 EXPORT_SYMBOL(ip_send_check);
 
-int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out(struct sock *sk, struct sk_buff *skb)
 {
struct net *net = dev_net(skb_dst(skb)->dev);
struct iphdr *iph = ip_hdr(skb);
@@ -108,17 +108,12 @@ int __ip_local_out_sk(struct sock *sk, struct sk_buff 
*skb)
   dst_output);
 }
 
-int __ip_local_out(struct sk_buff *skb)
-{
-   return __ip_local_out_sk(skb->sk, skb);
-}
-
 int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
 {
struct net *net = dev_net(skb_dst(skb)->dev);
int err;
 
-   err = __ip_local_out_sk(sk, skb);
+   err = __ip_local_out(sk, skb);
if (likely(err == 1))
err = dst_output(net, sk, skb);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 638b976008b7..bf1486bd7e81 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -165,7 +165,7 @@ static struct dst_ops ipv4_dst_ops = {
.link_failure = ipv4_link_failure,
.update_pmtu =  ip_rt_update_pmtu,
.redirect = ip_do_redirect,
-   .local_out =__ip_local_out_sk,
+   .local_out =__ip_local_out,
.neigh_lookup = ipv4_neigh_lookup,
 };
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index d46d99f9cabd..f2606b9056bb 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -243,7 +243,7 @@ static struct dst_ops xfrm4_dst_ops = {
.cow_metrics =  dst_cow_metrics_generic,
.destroy =  xfrm4_dst_destroy,
.ifdown =   xfrm4_dst_ifdown,
-   .local_out =__ip_local_out_sk,
+   .local_out =__ip_local_out,
.gc_thresh =32768,
 };
 
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 07/16] ipv4: Merge ip_local_out and ip_local_out_sk

2015-10-07 Thread Eric W. Biederman
It is confusing and silly hiding a parameter so modify all of
the callers to pass in the appropriate socket or skb->sk if
no socket is known.

Signed-off-by: "Eric W. Biederman" 
---
 drivers/net/ipvlan/ipvlan_core.c|  2 +-
 drivers/net/ppp/pptp.c  |  2 +-
 drivers/net/vrf.c   |  4 ++--
 include/net/ip.h|  6 +-
 net/ipv4/igmp.c |  4 ++--
 net/ipv4/ip_output.c| 10 +-
 net/ipv4/ip_tunnel_core.c   |  2 +-
 net/ipv4/netfilter/ipt_SYNPROXY.c   |  2 +-
 net/ipv4/netfilter/nf_dup_ipv4.c|  2 +-
 net/ipv4/netfilter/nf_reject_ipv4.c |  2 +-
 net/netfilter/ipvs/ip_vs_xmit.c |  2 +-
 11 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 207f62e8de9a..c75ad39c752f 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -364,7 +364,7 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb)
}
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
-   err = ip_local_out(skb);
+   err = ip_local_out(skb->sk, skb);
if (unlikely(net_xmit_eval(err)))
dev->stats.tx_errors++;
else
diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
index 686f37daa262..6bef7be10671 100644
--- a/drivers/net/ppp/pptp.c
+++ b/drivers/net/ppp/pptp.c
@@ -282,7 +282,7 @@ static int pptp_xmit(struct ppp_channel *chan, struct 
sk_buff *skb)
ip_select_ident(sock_net(sk), skb, NULL);
ip_send_check(iph);
 
-   ip_local_out(skb);
+   ip_local_out(skb->sk, skb);
return 1;
 
 tx_error:
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 1039eb5f6c2a..231f9d85d4eb 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -76,7 +76,7 @@ static struct dst_entry *vrf_ip_check(struct dst_entry *dst, 
u32 cookie)
 
 static int vrf_ip_local_out(struct sock *sk, struct sk_buff *skb)
 {
-   return ip_local_out_sk(sk, skb);
+   return ip_local_out(sk, skb);
 }
 
 static unsigned int vrf_v4_mtu(const struct dst_entry *dst)
@@ -222,7 +222,7 @@ static netdev_tx_t vrf_process_v4_outbound(struct sk_buff 
*skb,
   RT_SCOPE_LINK);
}
 
-   ret = ip_local_out(skb);
+   ret = ip_local_out(skb->sk, skb);
if (unlikely(net_xmit_eval(ret)))
vrf_dev->stats.tx_errors++;
else
diff --git a/include/net/ip.h b/include/net/ip.h
index 46272e04f3b6..03e80f936847 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -113,11 +113,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, 
struct sk_buff *skb,
   int (*output)(struct net *, struct sock *, struct sk_buff 
*));
 void ip_send_check(struct iphdr *ip);
 int __ip_local_out(struct sock *sk, struct sk_buff *skb);
-int ip_local_out_sk(struct sock *sk, struct sk_buff *skb);
-static inline int ip_local_out(struct sk_buff *skb)
-{
-   return ip_local_out_sk(skb->sk, skb);
-}
+int ip_local_out(struct sock *sk, struct sk_buff *skb);
 
 int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
 void ip_init(void);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index de6d4c8ba600..43375d9e02ab 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
 
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
 
-   return ip_local_out(skb);
+   return ip_local_out(skb->sk, skb);
 }
 
 static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -739,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, 
struct ip_mc_list *pmc,
ih->group = group;
ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
 
-   return ip_local_out(skb);
+   return ip_local_out(skb->sk, skb);
 }
 
 static void igmp_gq_timer_expire(unsigned long data)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 66c627b85a91..10366ee03bec 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -108,7 +108,7 @@ int __ip_local_out(struct sock *sk, struct sk_buff *skb)
   dst_output);
 }
 
-int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip_local_out(struct sock *sk, struct sk_buff *skb)
 {
struct net *net = dev_net(skb_dst(skb)->dev);
int err;
@@ -119,7 +119,7 @@ int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
 
return err;
 }
-EXPORT_SYMBOL_GPL(ip_local_out_sk);
+EXPORT_SYMBOL_GPL(ip_local_out);
 
 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
 {
@@ -169,7 +169,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct 
sock *sk,
skb->mark = sk->sk_mark;
 
/* Send it out. */
-   return ip_local_out(skb);
+   return ip_local_out(skb->sk, skb);
 }
 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 
@@ -456,7 +456,7 @@ packet_routed:
 

[PATCH net-next 02/16] ipv4: Fix ip_queue_xmit to pass sk into ip_local_out_sk

2015-10-07 Thread Eric W. Biederman
After a packet has been encapsulated by a tunnel we should use the
tunnel socket's local multicast loopback flag to control if the
encapsulated packet should be locally looped back.

Pass sk into ip_local_out_sk so that in the rare case we are dealing
with a tunneled packet whose tunnel destination address is a multicast
address the kernel properly decides to loopback this packet.

In practice I don't think this matters as ip_queue_xmit is used by
tcp, l2tp and sctp none of which I am aware of uses ip level
multicasting as they are all point to point communications protocols.
Let's fix this before someone uses ip_queue_xmit for a tunnel protocol
that does use multicast.

Fixes: aad88724c9d5 ("ipv4: add a sock pointer to dst->output() path.")
Fixes: b0270e91014d ("ipv4: add a sock pointer to ip_queue_xmit()")
Signed-off-by: "Eric W. Biederman" 
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6cb585a05dd1..1030f48d66e1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -460,7 +460,7 @@ packet_routed:
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
 
-   res = ip_local_out(skb);
+   res = ip_local_out_sk(sk, skb);
rcu_read_unlock();
return res;
 
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 03/16] xfrm: Only compute net once in xfrm_policy_queue_process

2015-10-07 Thread Eric W. Biederman
Signed-off-by: "Eric W. Biederman" 
---
 net/xfrm/xfrm_policy.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 418daa038edf..be1776bc5673 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1887,6 +1887,7 @@ static void xfrm_policy_queue_process(unsigned long arg)
struct sock *sk;
struct dst_entry *dst;
struct xfrm_policy *pol = (struct xfrm_policy *)arg;
+   struct net *net = xp_net(pol);
struct xfrm_policy_queue *pq = &pol->polq;
struct flowi fl;
struct sk_buff_head list;
@@ -1903,8 +1904,7 @@ static void xfrm_policy_queue_process(unsigned long arg)
spin_unlock(&pq->hold_queue.lock);
 
dst_hold(dst->path);
-   dst = xfrm_lookup(xp_net(pol), dst->path, &fl,
- sk, 0);
+   dst = xfrm_lookup(net, dst->path, &fl, sk, 0);
if (IS_ERR(dst))
goto purge_queue;
 
@@ -1934,8 +1934,7 @@ static void xfrm_policy_queue_process(unsigned long arg)
 
xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family);
dst_hold(skb_dst(skb)->path);
-   dst = xfrm_lookup(xp_net(pol), skb_dst(skb)->path,
- &fl, skb->sk, 0);
+   dst = xfrm_lookup(net, skb_dst(skb)->path, &fl, skb->sk, 0);
if (IS_ERR(dst)) {
kfree_skb(skb);
continue;
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 04/16] net: Pass net into dst_output and remove dst_output_okfn

2015-10-07 Thread Eric W. Biederman
Replace dst_output_okfn with dst_output

Signed-off-by: "Eric W. Biederman" 
---
 include/net/dst.h   | 6 +-
 net/decnet/dn_nsp_out.c | 4 ++--
 net/ipv4/ip_forward.c   | 2 +-
 net/ipv4/ip_output.c| 7 ---
 net/ipv4/ip_vti.c   | 2 +-
 net/ipv4/ipmr.c | 2 +-
 net/ipv4/raw.c  | 2 +-
 net/ipv4/xfrm4_output.c | 2 +-
 net/ipv6/ip6_output.c   | 4 ++--
 net/ipv6/ip6_vti.c  | 2 +-
 net/ipv6/ip6mr.c| 2 +-
 net/ipv6/mcast.c| 4 ++--
 net/ipv6/ndisc.c| 2 +-
 net/ipv6/output_core.c  | 5 +++--
 net/ipv6/raw.c  | 2 +-
 net/ipv6/xfrm6_output.c | 2 +-
 net/netfilter/ipvs/ip_vs_xmit.c | 4 ++--
 net/xfrm/xfrm_output.c  | 2 +-
 net/xfrm/xfrm_policy.c  | 2 +-
 19 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/include/net/dst.h b/include/net/dst.h
index 779206c15f8b..fdd01fed1a7b 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -454,14 +454,10 @@ static inline void dst_set_expires(struct dst_entry *dst, 
int timeout)
 }
 
 /* Output packet to network from transport.  */
-static inline int dst_output(struct sock *sk, struct sk_buff *skb)
+static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff 
*skb)
 {
return skb_dst(skb)->output(sk, skb);
 }
-static inline int dst_output_okfn(struct net *net, struct sock *sk, struct 
sk_buff *skb)
-{
-   return dst_output(sk, skb);
-}
 
 /* Input packet from network to transport.  */
 static inline int dst_input(struct sk_buff *skb)
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 4b02dd300f50..849805e7af52 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -85,7 +85,7 @@ static void dn_nsp_send(struct sk_buff *skb)
if (dst) {
 try_again:
skb_dst_set(skb, dst);
-   dst_output(skb->sk, skb);
+   dst_output(&init_net, skb->sk, skb);
return;
}
 
@@ -582,7 +582,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, 
unsigned char msgflg,
 * associations.
 */
skb_dst_set(skb, dst_clone(dst));
-   dst_output(skb->sk, skb);
+   dst_output(&init_net, skb->sk, skb);
 }
 
 
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index d66cfb35ba74..da0d7ce85844 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -72,7 +72,7 @@ static int ip_forward_finish(struct net *net, struct sock 
*sk, struct sk_buff *s
ip_forward_options(skb);
 
skb_sender_cpu_clear(skb);
-   return dst_output(sk, skb);
+   return dst_output(net, sk, skb);
 }
 
 int ip_forward(struct sk_buff *skb)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1030f48d66e1..c94efb22f380 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -105,7 +105,7 @@ static int __ip_local_out_sk(struct sock *sk, struct 
sk_buff *skb)
ip_send_check(iph);
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
   net, sk, skb, NULL, skb_dst(skb)->dev,
-  dst_output_okfn);
+  dst_output);
 }
 
 int __ip_local_out(struct sk_buff *skb)
@@ -115,11 +115,12 @@ int __ip_local_out(struct sk_buff *skb)
 
 int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
 {
+   struct net *net = dev_net(skb_dst(skb)->dev);
int err;
 
err = __ip_local_out_sk(sk, skb);
if (likely(err == 1))
-   err = dst_output(sk, skb);
+   err = dst_output(net, sk, skb);
 
return err;
 }
@@ -276,7 +277,7 @@ static int ip_finish_output(struct net *net, struct sock 
*sk, struct sk_buff *sk
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
-   return dst_output(sk, skb);
+   return dst_output(net, sk, skb);
}
 #endif
mtu = ip_skb_dst_mtu(skb);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3b87ec5178f9..4d8f0b698777 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct 
net_device *dev,
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
 
-   err = dst_output(skb->sk, skb);
+   err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
err = skb->len;
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index cfcb996ec51b..fc42525d8694 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1689,7 +1689,7 @@ static inline int ipmr_forward_finish(struct net *net, 
struct sock *sk,
if (unlikely(opt->optlen))
ip_forward_options(skb);
 
-   return dst_output(sk, skb);
+   return dst_output(net, sk, skb);
 }
 
 /*
diff --git 

[PATCH net-next 05/16] dst: Pass a sk into .local_out

2015-10-07 Thread Eric W. Biederman
For consistency with the other similar methods in the kernel pass a
struct sock into the dst_ops .local_out method.

Simplifying the socket passing case is needed as a prequel to passing a
struct net reference into .local_out.

Signed-off-by: "Eric W. Biederman" 
---
 drivers/net/vrf.c   | 4 ++--
 include/net/dst_ops.h   | 2 +-
 include/net/ip.h| 1 +
 include/net/ipv6.h  | 1 +
 net/ipv4/ip_output.c| 2 +-
 net/ipv4/route.c| 2 +-
 net/ipv4/xfrm4_policy.c | 2 +-
 net/ipv6/output_core.c  | 2 +-
 net/ipv6/route.c| 2 +-
 net/ipv6/xfrm6_policy.c | 2 +-
 net/xfrm/xfrm_output.c  | 2 +-
 11 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 64499766e00f..1039eb5f6c2a 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -74,9 +74,9 @@ static struct dst_entry *vrf_ip_check(struct dst_entry *dst, 
u32 cookie)
return dst;
 }
 
-static int vrf_ip_local_out(struct sk_buff *skb)
+static int vrf_ip_local_out(struct sock *sk, struct sk_buff *skb)
 {
-   return ip_local_out(skb);
+   return ip_local_out_sk(sk, skb);
 }
 
 static unsigned int vrf_v4_mtu(const struct dst_entry *dst)
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index d64253914a6a..3f26a6af444e 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -28,7 +28,7 @@ struct dst_ops {
   struct sk_buff *skb, u32 mtu);
void(*redirect)(struct dst_entry *dst, struct sock 
*sk,
struct sk_buff *skb);
-   int (*local_out)(struct sk_buff *skb);
+   int (*local_out)(struct sock *sk, struct sk_buff 
*skb);
struct neighbour *  (*neigh_lookup)(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr);
diff --git a/include/net/ip.h b/include/net/ip.h
index dd06ab3669f9..ea1f721f7224 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -112,6 +112,7 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb);
 int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
   int (*output)(struct net *, struct sock *, struct sk_buff 
*));
 void ip_send_check(struct iphdr *ip);
+int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb);
 int __ip_local_out(struct sk_buff *skb);
 int ip_local_out_sk(struct sock *sk, struct sk_buff *skb);
 static inline int ip_local_out(struct sk_buff *skb)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 3dde042bcd3f..56920262dbe9 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -865,6 +865,7 @@ int ip6_forward(struct sk_buff *skb);
 int ip6_input(struct sk_buff *skb);
 int ip6_mc_input(struct sk_buff *skb);
 
+int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb);
 int __ip6_local_out(struct sk_buff *skb);
 int ip6_local_out_sk(struct sock *sk, struct sk_buff *skb);
 int ip6_local_out(struct sk_buff *skb);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c94efb22f380..c38dfd7404fb 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -96,7 +96,7 @@ void ip_send_check(struct iphdr *iph)
 }
 EXPORT_SYMBOL(ip_send_check);
 
-static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
 {
struct net *net = dev_net(skb_dst(skb)->dev);
struct iphdr *iph = ip_hdr(skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bf1486bd7e81..638b976008b7 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -165,7 +165,7 @@ static struct dst_ops ipv4_dst_ops = {
.link_failure = ipv4_link_failure,
.update_pmtu =  ip_rt_update_pmtu,
.redirect = ip_do_redirect,
-   .local_out =__ip_local_out,
+   .local_out =__ip_local_out_sk,
.neigh_lookup = ipv4_neigh_lookup,
 };
 
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index f2606b9056bb..d46d99f9cabd 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -243,7 +243,7 @@ static struct dst_ops xfrm4_dst_ops = {
.cow_metrics =  dst_cow_metrics_generic,
.destroy =  xfrm4_dst_destroy,
.ifdown =   xfrm4_dst_ifdown,
-   .local_out =__ip_local_out,
+   .local_out =__ip_local_out_sk,
.gc_thresh =32768,
 };
 
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 4337147ee23d..e5affb5fe095 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -138,7 +138,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst)
 EXPORT_SYMBOL(ip6_dst_hoplimit);
 #endif
 
-static int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip6_local_out_sk(struct sock *sk, struct sk_buff *skb)
 {
struct ne

[PATCH net-next 01/16] ipv4: Fix ip_local_out_sk by passing the sk into __ip_local_out_sk

2015-10-07 Thread Eric W. Biederman
In the rare case where sk != skb->sk ip_local_out_sk arranges
to call dst->output differently if the skb is queued or not.
This is a bug.

Fix this bug by passing the sk parameter of ip_local_out_sk through
from ip_local_out_sk to __ip_local_out_sk (skipping __ip_local_out).

Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().")
Signed-off-by: "Eric W. Biederman" 
---
 net/ipv4/ip_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 911ea739049a..6cb585a05dd1 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -117,7 +117,7 @@ int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
 {
int err;
 
-   err = __ip_local_out(skb);
+   err = __ip_local_out_sk(sk, skb);
if (likely(err == 1))
err = dst_output(sk, skb);
 
-- 
2.2.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 00/16] net: Pass net through the output path v2

2015-10-07 Thread Eric W. Biederman

This is the next installment of my work to pass struct net through the
output path so the code does not need to guess how to figure out which
network namespace it is in, and ultimately routes can have output
devices in another network namespace.

The first patch in this series is a fix for a bug that came in when sk
was passed through the functions in the output path, and as such is
probably a candidate for net.  At the same time my later patches depend
on it so sending the fix separately would be confusing.

The second patch in this series is another fix for an issue that
came in when sk was passed through the output path.  I don't think it
needs a backport as I don't think anyone uses the path where the code
was incorrect.

The rest of the patchset focuses on the path from xxx_local_out to
dst_output and in the end succeeds in passing sock_net(sk) from the
socket a packet locally originates on to the dst->output function.

Given the size reduction in the code I think this counts as a cleanup as
much as feature work.

There remain a number of helper functions (like IP option processing) to
take care of before the network stack can support destination devices in
other network namespaces, but with this set of changes the backbone of
the work is done.

The changes are also available against net-next at:
git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/net-next.git master

Eric

Eric W. Biederman (16):
  ipv4: Fix ip_local_out_sk by passing the sk into __ip_local_out_sk
  ipv4: Fix ip_queue_xmit to pass sk into ip_local_out_sk
  xfrm: Only compute net once in xfrm_policy_queue_process
  net: Pass net into dst_output and remove dst_output_okfn
  dst: Pass a sk into .local_out
  ipv4: Merge __ip_local_out and __ip_local_out_sk
  ipv4: Merge ip_local_out and ip_local_out_sk
  ipv6: Merge __ip6_local_out and __ip6_local_out_sk
  ipv6: Merge ip6_local_out and ip6_local_out_sk
  ipv4: Cache net in iptunnel_xmit
  ipv4: Cache net in ip_build_and_send_pkt and ip_queue_xmit
  ppp: Cache net in pptp_xmit
  ipvlan: Cache net in ipvlan_process_v4_outbound and
    ipvlan_process_v6_outbound
  ipv4,ipv6: Pass net into __ip_local_out and __ip6_local_out
  ipv4,ipv6: Pass net into ip_local_out and ip6_local_out
  dst: Pass net into dst->output

 drivers/net/ipvlan/ipvlan_core.c| 10 +
 drivers/net/ppp/pptp.c  |  7 ---
 drivers/net/vrf.c   |  9 
 include/net/dst.h   | 14 +
 include/net/dst_ops.h   |  3 ++-
 include/net/ip.h| 12 ---
 include/net/ip6_tunnel.h|  2 +-
 include/net/ipv6.h  |  7 +++
 include/net/lwtunnel.h  |  8 +++
 include/net/xfrm.h  |  6 +++---
 net/core/dst.c  | 14 ++---
 net/core/lwtunnel.c |  4 ++--
 net/decnet/dn_nsp_out.c |  4 ++--
 net/decnet/dn_route.c   |  6 +++---
 net/ipv4/igmp.c |  4 ++--
 net/ipv4/ip_forward.c   |  2 +-
 net/ipv4/ip_output.c| 42 -
 net/ipv4/ip_tunnel_core.c   |  6 +++---
 net/ipv4/ip_vti.c   |  2 +-
 net/ipv4/ipmr.c |  2 +-
 net/ipv4/netfilter/ipt_SYNPROXY.c   |  2 +-
 net/ipv4/netfilter/nf_dup_ipv4.c|  2 +-
 net/ipv4/netfilter/nf_reject_ipv4.c |  2 +-
 net/ipv4/raw.c  |  2 +-
 net/ipv4/route.c|  4 ++--
 net/ipv4/xfrm4_output.c |  6 ++
 net/ipv6/ila.c  |  4 ++--
 net/ipv6/ip6_output.c   |  9 
 net/ipv6/ip6_vti.c  |  2 +-
 net/ipv6/ip6mr.c|  2 +-
 net/ipv6/mcast.c|  4 ++--
 net/ipv6/ndisc.c|  2 +-
 net/ipv6/netfilter/ip6t_SYNPROXY.c  |  2 +-
 net/ipv6/netfilter/nf_dup_ipv6.c|  2 +-
 net/ipv6/netfilter/nf_reject_ipv6.c |  2 +-
 net/ipv6/output_core.c  | 22 +--
 net/ipv6/raw.c  |  2 +-
 net/ipv6/route.c| 14 ++---
 net/ipv6/xfrm6_output.c |  6 ++
 net/mpls/mpls_iptunnel.c|  2 +-
 net/netfilter/ipvs/ip_vs_xmit.c |  8 +++
 net/xfrm/xfrm_output.c  |  4 ++--
 net/xfrm/xfrm_policy.c  | 11 +-
 43 files changed, 125 insertions(+), 155 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   3   >