[PATCH net-next] cxgb4: Add PTP Hardware Clock (PHC) support

2017-06-28 Thread Atul Gupta
Add PTP IEEE-1588 support and make it accessible via PHC subsystem.
The functionality is enabled for T5/T6 adapters. Driver interfaces with
Firmware to program and adjust the clock offset.

Signed-off-by: Atul Gupta 
Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/Makefile|   2 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |   9 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c |  25 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c|  87 +++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c | 476 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.h |  74 
 drivers/net/ethernet/chelsio/cxgb4/sge.c   | 166 ++-
 drivers/net/ethernet/chelsio/cxgb4/t4_msg.h|  28 ++
 drivers/net/ethernet/chelsio/cxgb4/t4_regs.h   |   2 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h  |  50 +++
 10 files changed, 906 insertions(+), 13 deletions(-)
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.h

diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile 
b/drivers/net/ethernet/chelsio/cxgb4/Makefile
index c6b71f6..8172127 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/Makefile
+++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_CHELSIO_T4) += cxgb4.o
 
-cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o 
cxgb4_uld.o sched.o cxgb4_filter.o cxgb4_tc_u32.o
+cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o 
cxgb4_uld.o sched.o cxgb4_filter.o cxgb4_tc_u32.o cxgb4_ptp.o
 cxgb4-$(CONFIG_CHELSIO_T4_DCB) +=  cxgb4_dcb.o
 cxgb4-$(CONFIG_CHELSIO_T4_FCOE) +=  cxgb4_fcoe.o
 cxgb4-$(CONFIG_DEBUG_FS) += cxgb4_debugfs.o
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 451c138..dd6e5a3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -48,6 +48,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include "t4_chip_type.h"
 #include "cxgb4_uld.h"
@@ -510,6 +512,7 @@ struct port_info {
 #endif /* CONFIG_CHELSIO_T4_FCOE */
bool rxtstamp;  /* Enable TS */
struct hwtstamp_config tstamp_config;
+   bool ptp_enable;
struct sched_table *sched_tbl;
 };
 
@@ -705,6 +708,7 @@ struct sge_uld_txq_info {
 
 struct sge {
struct sge_eth_txq ethtxq[MAX_ETH_QSETS];
+   struct sge_eth_txq ptptxq;
struct sge_ctrl_txq ctrlq[MAX_CTRL_QUEUES];
 
struct sge_eth_rxq ethrxq[MAX_ETH_QSETS];
@@ -869,6 +873,11 @@ struct adapter {
 * used for all 4 filters.
 */
 
+   struct ptp_clock *ptp_clock;
+   struct ptp_clock_info ptp_clock_info;
+   struct sk_buff *ptp_tx_skb;
+   /* ptp lock */
+   spinlock_t ptp_lock;
spinlock_t stats_lock;
spinlock_t win0_lock cacheline_aligned_in_smp;
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index e9bab72..7e4c04d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -1113,14 +1113,37 @@ static int set_flash(struct net_device *netdev, struct 
ethtool_flash *ef)
 
 static int get_ts_info(struct net_device *dev, struct ethtool_ts_info *ts_info)
 {
+   struct port_info *pi = netdev_priv(dev);
+   struct  adapter *adapter = pi->adapter;
+
ts_info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE |
   SOF_TIMESTAMPING_RX_SOFTWARE |
   SOF_TIMESTAMPING_SOFTWARE;
 
ts_info->so_timestamping |= SOF_TIMESTAMPING_RX_HARDWARE |
+   SOF_TIMESTAMPING_TX_HARDWARE |
SOF_TIMESTAMPING_RAW_HARDWARE;
 
-   ts_info->phc_index = -1;
+   ts_info->tx_types = (1 << HWTSTAMP_TX_OFF) |
+   (1 << HWTSTAMP_TX_ON);
+
+   ts_info->rx_filters = (1 << HWTSTAMP_FILTER_NONE) |
+ (1 << HWTSTAMP_FILTER_PTP_V1_L4_SYNC) |
+ (1 << HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_L2_EVENT) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_L4_EVENT) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_L2_SYNC) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_L4_SYNC) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ) |
+ (1 << HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ);
+
+   if (adapter->ptp_clock)
+   ts_info->phc_index = ptp_clock_index(adapter->ptp_clock);

kernel (master) build failure w. !CONFIG_NET_RX_BUSY_POLL

2017-06-28 Thread Mike Galbraith
Greetings network wizards,

The latest RT explicitly disables CONFIG_NET_RX_BUSY_POLL, thus
uncovering $subject.  Below is what I did about it.

-Mike

net: Move napi_hash_add/del() inside CONFIG_NET_RX_BUSY_POLL

Since 545cd5e5ec54 ("net: Busy polling should ignore sender CPUs"),
kernel build fails when CONFIG_NET_RX_BUSY_POLL is disabled.  Move
napi_hash_add/del() accordingly.

Banged-upon-by: Mike Galbraith 
---
 include/linux/netdevice.h |8 
 net/core/dev.c|   12 
 2 files changed, 16 insertions(+), 4 deletions(-)

--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -479,6 +479,8 @@ static inline bool napi_complete(struct
return napi_complete_done(n, 0);
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+
 /**
  * napi_hash_del - remove a NAPI from global table
  * @napi: NAPI context
@@ -493,6 +495,12 @@ static inline bool napi_complete(struct
  */
 bool napi_hash_del(struct napi_struct *napi);
 
+#else /* !CONFIG_NET_RX_BUSY_POLL */
+
+static inline bool napi_hash_del(struct napi_struct *napi) { return false; }
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
 /**
  * napi_disable - prevent NAPI from scheduling
  * @n: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -184,11 +184,13 @@ static int call_netdevice_notifiers_info
 DEFINE_RWLOCK(dev_base_lock);
 EXPORT_SYMBOL(dev_base_lock);
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
 /* protects napi_hash addition/deletion and napi_gen_id */
 static DEFINE_SPINLOCK(napi_hash_lock);
 
 static unsigned int napi_gen_id = NR_CPUS;
 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
+#endif
 
 static seqcount_t devnet_rename_seq;
 static DEFINE_MUTEX(devnet_rename_mutex);
@@ -5185,6 +5187,8 @@ bool napi_complete_done(struct napi_stru
 }
 EXPORT_SYMBOL(napi_complete_done);
 
+#if defined(CONFIG_NET_RX_BUSY_POLL)
+
 /* must be called under rcu_read_lock(), as we dont take a reference */
 static struct napi_struct *napi_by_id(unsigned int napi_id)
 {
@@ -5198,8 +5202,6 @@ static struct napi_struct *napi_by_id(un
return NULL;
 }
 
-#if defined(CONFIG_NET_RX_BUSY_POLL)
-
 #define BUSY_POLL_BUDGET 8
 
 static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
@@ -5300,8 +5302,6 @@ void napi_busy_loop(unsigned int napi_id
 }
 EXPORT_SYMBOL(napi_busy_loop);
 
-#endif /* CONFIG_NET_RX_BUSY_POLL */
-
 static void napi_hash_add(struct napi_struct *napi)
 {
if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
@@ -5341,6 +5341,8 @@ bool napi_hash_del(struct napi_struct *n
 }
 EXPORT_SYMBOL_GPL(napi_hash_del);
 
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
 {
struct napi_struct *napi;
@@ -5377,7 +5379,9 @@ void netif_napi_add(struct net_device *d
napi->poll_owner = -1;
 #endif
set_bit(NAPI_STATE_SCHED, &napi->state);
+#ifdef CONFIG_NET_RX_BUSY_POLL
napi_hash_add(napi);
+#endif
 }
 EXPORT_SYMBOL(netif_napi_add);
 


Re: [PATCH/RFC net-next 3/9] nfp: provide infrastructure for offloading flower based TC filters

2017-06-28 Thread Simon Horman
On Tue, Jun 27, 2017 at 11:13:06PM -0700, Jakub Kicinski wrote:
> On Wed, 28 Jun 2017 01:21:43 +0200, Simon Horman wrote:
> > From: Pieter Jansen van Vuuren 
> > 
> > Adds a flower based TC offload handler for representor devices, this
> > is in addition to the bpf based offload handler. The changes in this
> > patch will be used in a follow-up patch to add tc flower offload to
> > the NFP.
> > 
> > The flower app enables tc offloads on representors by default.
> > 
> > Signed-off-by: Pieter Jansen van Vuuren 
> > 
> > Signed-off-by: Simon Horman 
> 
> > diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c 
> > b/drivers/net/ethernet/netronome/nfp/flower/main.c
> > index ab68a8f58862..7b27871f489c 100644
> > --- a/drivers/net/ethernet/netronome/nfp/flower/main.c
> > +++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
> > @@ -37,6 +37,7 @@
> >  #include 
> >  #include 
> >  
> > +#include "main.h"
> >  #include "../nfpcore/nfp_cpp.h"
> >  #include "../nfpcore/nfp_nsp.h"
> >  #include "../nfp_app.h"
> > @@ -303,8 +304,14 @@ static int nfp_flower_vnic_init(struct nfp_app *app, 
> > struct nfp_net *nn,
> > eth_hw_addr_random(nn->dp.netdev);
> > netif_keep_dst(nn->dp.netdev);
> >  
> > +   if (nfp_flower_repr_init(app))
> > +   goto err_free_priv;
> 
> Please make the contents of nfp_flower_repr_init() part of app's .init
> callback.

Thanks, I will fix this up and other comments relating to
nfp_flower_repr_init()

> > return 0;
> >  
> > +err_free_priv:
> > +   kfree(app->priv);
> > +   app->priv = NULL;
> 
> This doesn't belong here after my recent series.  priv init was moved
> to app's init callback.
> 
> >  err_invalid_port:
> > nn->port = nfp_port_alloc(app, NFP_PORT_INVALID, nn->dp.netdev);
> > return PTR_ERR_OR_ZERO(nn->port);
> 
> > +int nfp_flower_repr_init(struct nfp_app *app)
> > +{
> > +   u64 version;
> > +   int err;
> > +
> > +   version = nfp_rtsym_read_le(app->pf->rtbl, "hw_flower_version", &err);
> > +   if (err)
> > +   return -EINVAL;
> 
> Nit: this could return err directly.  Also I think it's worth printing
> an error message.
> 
> > +   /* We need to ensure hardware has enough flower capabilities. */
> > +   if (version != NFP_FLOWER_ALLOWED_VER)
> > +   return -EINVAL;
> 
> Here we should definitely tell the user what went wrong.
> 
> > +   return 0;
> > +}
> 
> > diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c 
> > b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
> > index bc9108071e5b..a73b311c1f75 100644
> > --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
> > +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
> > @@ -250,6 +250,18 @@ static int nfp_repr_open(struct net_device *netdev)
> > return nfp_app_repr_open(repr->app, repr);
> >  }
> >  
> > +static int
> > +nfp_repr_setup_tc(struct net_device *netdev, u32 handle, u32 chain_index,
> > + __be16 proto, struct tc_to_netdev *tc)
> > +{
> > +   struct nfp_repr *repr = netdev_priv(netdev);
> > +
> > +   if (chain_index)
> > +   return -EOPNOTSUPP;
> > +
> > +   return nfp_app_setup_tc(repr->app, netdev, handle, proto, tc);
> > +}
> 
> Just a reminder that this could be a nfp_port function.

Sure, will do.


Re: [oss-drivers] Re: [PATCH/RFC net-next 4/9] nfp: extend flower add flow offload

2017-06-28 Thread Simon Horman
On Tue, Jun 27, 2017 at 11:13:30PM -0700, Jakub Kicinski wrote:
> On Wed, 28 Jun 2017 01:21:44 +0200, Simon Horman wrote:
> > diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c 
> > b/drivers/net/ethernet/netronome/nfp/flower/offload.c
> > index 9127c28ea9c3..8ad72f57493d 100644
> > --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c
> > +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c
> > @@ -45,6 +45,145 @@
> >  #include "../nfp_net.h"
> >  #include "../nfp_port.h"
> >  
> > +static bool nfp_flower_check_lower_than_mac(struct tc_cls_flower_offload 
> > *f)
> > +{
> > +   return dissector_uses_key(f->dissector,
> > + FLOW_DISSECTOR_KEY_IPV4_ADDRS) ||
> > +   dissector_uses_key(f->dissector,
> > +  FLOW_DISSECTOR_KEY_IPV6_ADDRS) ||
> > +   dissector_uses_key(f->dissector,
> > +  FLOW_DISSECTOR_KEY_PORTS) ||
> > +   dissector_uses_key(f->dissector, FLOW_DISSECTOR_KEY_ICMP);
> > +}
> 
> Nit: should this be named higher than mac?

Yes, I think so now you mention it.
I'll fix this in v2.


[PATCH NET V7 0/2] Add loopback support in phy_driver and hns ethtool fix

2017-06-28 Thread Lin Yun Sheng
This Patch Set add set_loopback in phy_driver and use it to setup loopback
when doing ethtool phy self_test.

Patch V7:
1. Add comment why resume the phy in hns_nic_config_phy_loopback.
2. Fix a typo error in patch description.

Patch V6:
Fix Or'ing error code in __lb_setup.

Patch V5:
Removing non loopback related code change.

Patch V4:
1. Remove c45 checking
2. Add -ENOTSUPP when function pointer is null,
   take mutex in phy_loopback.

Patch V3:
Calling phy_loopback enable and disable in pair in hns mac driver.

Patch V2:
1. Add phy_loopback in phy_device.c.
2. Do error checking and do the read and write once in
   genphy_loopback.
3. Remove gen10g_loopback in phy_device.c.

Patch V1:
Initial Submit

Lin Yun Sheng (2):
  net: phy: Add phy loopback support in net phy framework
  net: hns: Use phy_driver to setup Phy loopback

 drivers/net/ethernet/hisilicon/hns/hnae.h|   1 +
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 105 ---
 drivers/net/phy/marvell.c|   1 +
 drivers/net/phy/phy_device.c |  51 +++
 include/linux/phy.h  |   5 ++
 5 files changed, 92 insertions(+), 71 deletions(-)

-- 
1.9.1



[PATCH NET V7 2/2] net: hns: Use phy_driver to setup Phy loopback

2017-06-28 Thread Lin Yun Sheng
Use function set_loopback in phy_driver to setup phy loopback
when doing ethtool self test.

Signed-off-by: Lin Yun Sheng 
---
 drivers/net/ethernet/hisilicon/hns/hnae.h|   1 +
 drivers/net/ethernet/hisilicon/hns/hns_ethtool.c | 105 ---
 2 files changed, 35 insertions(+), 71 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.h 
b/drivers/net/ethernet/hisilicon/hns/hnae.h
index 04211ac..7ba653a 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.h
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.h
@@ -360,6 +360,7 @@ enum hnae_loop {
MAC_INTERNALLOOP_MAC = 0,
MAC_INTERNALLOOP_SERDES,
MAC_INTERNALLOOP_PHY,
+   MAC_LOOP_PHY_NONE,
MAC_LOOP_NONE,
 };
 
diff --git a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
index e95795b..92865cf 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_ethtool.c
@@ -259,67 +259,27 @@ static int hns_nic_set_link_ksettings(struct net_device 
*net_dev,
 
 static int hns_nic_config_phy_loopback(struct phy_device *phy_dev, u8 en)
 {
-#define COPPER_CONTROL_REG 0
-#define PHY_POWER_DOWN BIT(11)
-#define PHY_LOOP_BACK BIT(14)
-   u16 val = 0;
-
-   if (phy_dev->is_c45) /* c45 branch adding for XGE PHY */
-   return -ENOTSUPP;
+   int err;
 
if (en) {
-   /* speed : 1000M */
-   phy_write(phy_dev, HNS_PHY_PAGE_REG, 2);
-   phy_write(phy_dev, 21, 0x1046);
-
-   phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
-   /* Force Master */
-   phy_write(phy_dev, 9, 0x1F00);
-
-   /* Soft-reset */
-   phy_write(phy_dev, 0, 0x9140);
-   /* If autoneg disabled,two soft-reset operations */
-   phy_write(phy_dev, 0, 0x9140);
-
-   phy_write(phy_dev, HNS_PHY_PAGE_REG, 0xFA);
-
-   /* Default is 0x0400 */
-   phy_write(phy_dev, 1, 0x418);
-
-   /* Force 1000M Link, Default is 0x0200 */
-   phy_write(phy_dev, 7, 0x20C);
-
-   /* Powerup Fiber */
-   phy_write(phy_dev, HNS_PHY_PAGE_REG, 1);
-   val = phy_read(phy_dev, COPPER_CONTROL_REG);
-   val &= ~PHY_POWER_DOWN;
-   phy_write(phy_dev, COPPER_CONTROL_REG, val);
-
-   /* Enable Phy Loopback */
-   phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
-   val = phy_read(phy_dev, COPPER_CONTROL_REG);
-   val |= PHY_LOOP_BACK;
-   val &= ~PHY_POWER_DOWN;
-   phy_write(phy_dev, COPPER_CONTROL_REG, val);
+   /* Doing phy loopback in offline state, phy resuming is
+* needed to power up the device.
+*/
+   err = phy_resume(phy_dev);
+   if (err)
+   goto out;
+
+   err = phy_loopback(phy_dev, true);
} else {
-   phy_write(phy_dev, HNS_PHY_PAGE_REG, 0xFA);
-   phy_write(phy_dev, 1, 0x400);
-   phy_write(phy_dev, 7, 0x200);
-
-   phy_write(phy_dev, HNS_PHY_PAGE_REG, 1);
-   val = phy_read(phy_dev, COPPER_CONTROL_REG);
-   val |= PHY_POWER_DOWN;
-   phy_write(phy_dev, COPPER_CONTROL_REG, val);
-
-   phy_write(phy_dev, HNS_PHY_PAGE_REG, 0);
-   phy_write(phy_dev, 9, 0xF00);
-
-   val = phy_read(phy_dev, COPPER_CONTROL_REG);
-   val &= ~PHY_LOOP_BACK;
-   val |= PHY_POWER_DOWN;
-   phy_write(phy_dev, COPPER_CONTROL_REG, val);
+   err = phy_loopback(phy_dev, false);
+   if (err)
+   goto out;
+
+   err = phy_suspend(phy_dev);
}
-   return 0;
+
+out:
+   return err;
 }
 
 static int __lb_setup(struct net_device *ndev,
@@ -332,10 +292,9 @@ static int __lb_setup(struct net_device *ndev,
 
switch (loop) {
case MAC_INTERNALLOOP_PHY:
-   if ((phy_dev) && (!phy_dev->is_c45)) {
-   ret = hns_nic_config_phy_loopback(phy_dev, 0x1);
-   ret |= h->dev->ops->set_loopback(h, loop, 0x1);
-   }
+   ret = hns_nic_config_phy_loopback(phy_dev, 0x1);
+   if (!ret)
+   ret = h->dev->ops->set_loopback(h, loop, 0x1);
break;
case MAC_INTERNALLOOP_MAC:
if ((h->dev->ops->set_loopback) &&
@@ -346,17 +305,17 @@ static int __lb_setup(struct net_device *ndev,
if (h->dev->ops->set_loopback)
ret = h->dev->ops->set_loopback(h, loop, 0x1);
break;
+   case MAC_LOOP_PHY_NONE:
+   ret = hns_nic_config_phy_loopback(phy_dev, 0x0);
case MAC_LOOP_NONE:
-   if ((phy_dev) && (!phy_dev->is_c45))
-   ret |= h->dev->ops->set_loopback(h, loop, 0x0);

[PATCH NET V7 1/2] net: phy: Add phy loopback support in net phy framework

2017-06-28 Thread Lin Yun Sheng
This patch add set_loopback in phy_driver, which is used by MAC
driver to enable or disable phy loopback. it also add a generic
genphy_loopback function, which use BMCR loopback bit to enable
or disable loopback.

Signed-off-by: Lin Yun Sheng 
---
 drivers/net/phy/marvell.c|  1 +
 drivers/net/phy/phy_device.c | 51 
 include/linux/phy.h  |  5 +
 3 files changed, 57 insertions(+)

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 57297ba..01a1586 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2094,6 +2094,7 @@ static int m88e1510_probe(struct phy_device *phydev)
.get_sset_count = marvell_get_sset_count,
.get_strings = marvell_get_strings,
.get_stats = marvell_get_stats,
+   .set_loopback = genphy_loopback,
},
{
.phy_id = MARVELL_PHY_ID_88E1540,
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 1219eea..1e08d62 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1123,6 +1123,39 @@ int phy_resume(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_resume);
 
+int phy_loopback(struct phy_device *phydev, bool enable)
+{
+   struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver);
+   int ret = 0;
+
+   mutex_lock(&phydev->lock);
+
+   if (enable && phydev->loopback_enabled) {
+   ret = -EBUSY;
+   goto out;
+   }
+
+   if (!enable && !phydev->loopback_enabled) {
+   ret = -EINVAL;
+   goto out;
+   }
+
+   if (phydev->drv && phydrv->set_loopback)
+   ret = phydrv->set_loopback(phydev, enable);
+   else
+   ret = -EOPNOTSUPP;
+
+   if (ret)
+   goto out;
+
+   phydev->loopback_enabled = enable;
+
+out:
+   mutex_unlock(&phydev->lock);
+   return ret;
+}
+EXPORT_SYMBOL(phy_loopback);
+
 /* Generic PHY support and helper functions */
 
 /**
@@ -1628,6 +1661,23 @@ static int gen10g_resume(struct phy_device *phydev)
return 0;
 }
 
+int genphy_loopback(struct phy_device *phydev, bool enable)
+{
+   int value;
+
+   value = phy_read(phydev, MII_BMCR);
+   if (value < 0)
+   return value;
+
+   if (enable)
+   value |= BMCR_LOOPBACK;
+   else
+   value &= ~BMCR_LOOPBACK;
+
+   return phy_write(phydev, MII_BMCR, value);
+}
+EXPORT_SYMBOL(genphy_loopback);
+
 static int __set_phy_supported(struct phy_device *phydev, u32 max_speed)
 {
/* The default values for phydev->supported are provided by the PHY
@@ -1874,6 +1924,7 @@ void phy_drivers_unregister(struct phy_driver *drv, int n)
.read_status= genphy_read_status,
.suspend= genphy_suspend,
.resume = genphy_resume,
+   .set_loopback   = genphy_loopback,
 }, {
.phy_id = 0x,
.phy_id_mask= 0x,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e76e4ad..49c903dc 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -364,6 +364,7 @@ struct phy_c45_device_ids {
  * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc.
  * has_fixups: Set to true if this phy has fixups/quirks.
  * suspended: Set to true if this phy has been suspended successfully.
+ * loopback_enabled: Set true if this phy has been loopbacked successfully.
  * state: state of the PHY for management purposes
  * dev_flags: Device-specific flags used by the PHY driver.
  * link_timeout: The number of timer firings to wait before the
@@ -400,6 +401,7 @@ struct phy_device {
bool is_pseudo_fixed_link;
bool has_fixups;
bool suspended;
+   bool loopback_enabled;
 
enum phy_state state;
 
@@ -639,6 +641,7 @@ struct phy_driver {
int (*set_tunable)(struct phy_device *dev,
struct ethtool_tunable *tuna,
const void *data);
+   int (*set_loopback)(struct phy_device *dev, bool enable);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),
\
  struct phy_driver, mdiodrv)
@@ -774,6 +777,7 @@ static inline void phy_device_free(struct phy_device 
*phydev) { }
 int phy_init_hw(struct phy_device *phydev);
 int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
+int phy_loopback(struct phy_device *phydev, bool enable);
 struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
  phy_interface_t interface);
 struct phy_device *phy_find_first(struct mii_bus *bus);
@@ -825,6 +829,7 @@ void phy_attached_print(struct phy_device *phydev, const 
char *fmt, ...)
 int genphy_read_status(struct phy_device *phydev);
 int genphy_suspend(struct phy_device *phydev);
int genphy_resume(struct phy_device *phydev);

Re: [PATCH] net: stmmac: Add additional registers for dwmac1000_dma ethtool

2017-06-28 Thread Giuseppe CAVALLARO

On 6/28/2017 12:16 AM, thor.tha...@linux.intel.com wrote:

From: Thor Thayer 

Version 3.70a of the Designware has additional DMA registers so
add those to the ethtool DMA Register dump.
Offset 9  - Receive Interrupt Watchdog Timer Register
Offset 10 - AXI Bus Mode Register
Offset 11 - AHB or AXI Status Register
Offset 22 - HW Feature Register

Signed-off-by: Thor Thayer 


Acked-by: Giuseppe Cavallaro 


---
  drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c  | 4 ++--
  drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c | 2 +-
  2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index 471a9aa..22cf635 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -205,8 +205,8 @@ static void dwmac1000_dump_dma_regs(void __iomem *ioaddr, 
u32 *reg_space)
  {
int i;
  
-	for (i = 0; i < 22; i++)

-   if ((i < 9) || (i > 17))
+   for (i = 0; i < 23; i++)
+   if ((i < 12) || (i > 17))
reg_space[DMA_BUS_MODE / 4 + i] =
readl(ioaddr + DMA_BUS_MODE + i * 4);
  }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index 743170d..babb39c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -29,7 +29,7 @@
  #include "stmmac.h"
  #include "dwmac_dma.h"
  
-#define REG_SPACE_SIZE	0x1054

+#define REG_SPACE_SIZE 0x1060
  #define MAC100_ETHTOOL_NAME   "st_mac100"
  #define GMAC_ETHTOOL_NAME "st_gmac"
  





Re: [PATCH 2/2] ethtool: stmmac: Add DMA HW Feature Register

2017-06-28 Thread Giuseppe CAVALLARO

On 6/27/2017 11:51 PM, thor.tha...@linux.intel.com wrote:

From: Thor Thayer 

This patch adds the DMA HW Feature Register which is at the end
of the DMA registers and is documented in Version 3.70a.

Signed-off-by: Thor Thayer 
---
  stmmac.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stmmac.c b/stmmac.c
index e1bb291..7d7bebd 100644
--- a/stmmac.c
+++ b/stmmac.c
@@ -64,7 +64,7 @@ int st_gmac_dump_regs(struct ethtool_drvinfo *info, struct 
ethtool_regs *regs)
fprintf(stdout, "\n");
fprintf(stdout, "DMA Registers\n");
stmmac_reg = (unsigned int *)regs->data + DMA_REG_OFFSET;
-   for (i = 0; i < 22; i++)
+   for (i = 0; i < 23; i++)


thx Thor for these changes, I wonder if you could add a macro instead 23 
while doing this kind of changes


Sorry if I didn't it in the past.

the, you can send the series with my Acked-by: Giuseppe Cavallaro 



Regards

peppe



fprintf(stdout, "Reg%d  0x%08X\n", i, *stmmac_reg++);
  
  	return 0;





Re: [oss-drivers] Re: [PATCH/RFC net-next 7/9] nfp: add metadata to each flow offload

2017-06-28 Thread Simon Horman
On Tue, Jun 27, 2017 at 11:15:20PM -0700, Jakub Kicinski wrote:
> On Wed, 28 Jun 2017 01:21:47 +0200, Simon Horman wrote:
> > From: Pieter Jansen van Vuuren 
> > 
> > Adds metadata describing the mask id of each flow and keeps track of
> > flows installed in hardware. Previously a flow could not be removed
> > from hardware as there was no way of knowing if that a specific flow
> > was installed. This is solved by storing the offloaded flows in a
> > hash table.
> > 
> > Signed-off-by: Pieter Jansen van Vuuren 
> > 
> > Signed-off-by: Simon Horman 
> 
> > diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h 
> > b/drivers/net/ethernet/netronome/nfp/flower/main.h
> > index 52db2acb250e..cc184618306c 100644
> > --- a/drivers/net/ethernet/netronome/nfp/flower/main.h
> > +++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
> > @@ -34,6 +34,7 @@
> >  #ifndef __NFP_FLOWER_H__
> >  #define __NFP_FLOWER_H__ 1
> >  
> > +#include 
> >  #include 
> >  
> >  #include "cmsg.h"
> > @@ -45,6 +46,42 @@ struct tc_to_netdev;
> >  struct net_device;
> >  struct nfp_app;
> >  
> > +#define NFP_FLOWER_HASH_BITS   10
> > +#define NFP_FLOWER_HASH_SEED   129004
> > +
> > +#define NFP_FLOWER_MASK_ENTRY_RS   256
> > +#define NFP_FLOWER_MASK_ELEMENT_RS 1
> > +#define NFP_FLOWER_MASK_HASH_BITS  10
> > +#define NFP_FLOWER_MASK_HASH_SEED  9198806
> > +
> > +#define NFP_FL_META_FLAG_NEW_MASK  128
> > +#define NFP_FL_META_FLAG_LAST_MASK 1
> > +
> > +#define NFP_FL_MASK_REUSE_TIME 40
> > +#define NFP_FL_MASK_ID_LOCATION1
> > +
> > +struct nfp_fl_mask_id {
> > +   struct circ_buf mask_id_free_list;
> > +   struct timeval *last_used;
> > +   u8 init_unallocated;
> > +};
> > +
> > +/**
> > + * struct nfp_flower_priv - Flower APP per-vNIC priv data
> > + * @nn:Pointer to vNIC
> > + * @flower_version:HW version of flower
> > + * @mask_ids:  List of free mask ids
> > + * @mask_table:Hash table used to store masks
> > + * @flow_table:Hash table used to store flower rules
> > + */
> > +struct nfp_flower_priv {
> > +   struct nfp_net *nn;
> > +   u64 flower_version;
> > +   struct nfp_fl_mask_id mask_ids;
> > +   DECLARE_HASHTABLE(mask_table, NFP_FLOWER_MASK_HASH_BITS);
> > +   DECLARE_HASHTABLE(flow_table, NFP_FLOWER_HASH_BITS);
> 
> Include for hashtable seems missing.

Thanks, I'll include linux/hashtable.h

> > +};
> > +
> >  struct nfp_fl_key_ls {
> > u32 key_layer_two;
> > u8 key_layer;
> > @@ -69,6 +106,10 @@ struct nfp_fl_payload {
> > char *action_data;
> >  };
> >  
> > +int nfp_flower_metadata_init(struct nfp_app *app);
> > +void nfp_flower_metadata_cleanup(struct nfp_app *app);
> > +
> > +int nfp_repr_get_port_id(struct net_device *netdev);
> 
> Isn't this a static inline in repr.h?

Sorry, I think that crept in during some patch shuffling.
I will remove it.

> >  int nfp_flower_repr_init(struct nfp_app *app);
> >  int nfp_flower_setup_tc(struct nfp_app *app, struct net_device *netdev,
> > u32 handle, __be16 proto, struct tc_to_netdev *tc);
> 
> > diff --git a/drivers/net/ethernet/netronome/nfp/flower/metadata.c 
> > b/drivers/net/ethernet/netronome/nfp/flower/metadata.c
> > new file mode 100644
> > index ..acbf4c757988
> 
> > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> > + * SOFTWARE.
> > + */
> > +
> > +#include 
> 
> I think this is unnecessary.

Thanks, removed.

> 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> 
> > +static int nfp_mask_alloc(struct nfp_app *app, u8 *mask_id)
> > +{
> > +   struct nfp_flower_priv *priv = app->priv;
> > +   struct timeval reuse, curr;
> > +   struct circ_buf *ring;
> > +   u8 temp_id, freed_id;
> > +
> > +   ring = &priv->mask_ids.mask_id_free_list;
> > +   freed_id = NFP_FLOWER_MASK_ENTRY_RS - 1;
> > +   /* Checking for unallocated entries first. */
> > +   if (priv->mask_ids.init_unallocated > 0) {
> > +   *mask_id = priv->mask_ids.init_unallocated;
> > +   priv->mask_ids.init_unallocated--;
> > +   goto exit_check_timestamp;
> 
> Do you really need to check the timestamp here?  Isn't this if() for the
> case where we have some masks which were never used by the driver?

I think not. I will drop this.

> > +   }
> > +
> > +   /* Checking if buffer is empty. */
> > +   if (ring->head == ring->tail) {
> > +   *mask_id = freed_id;
> > +   return -ENOENT;
> > +   }
> > +
> > +   memcpy(&temp_id, &ring->buf[ring->tail], NFP_FLOWER_MASK_ELEMENT_RS);
> > +   *mask_id = temp_id;
> > +   memcpy(&ring->buf[ring->tail], &freed_id, NFP_FLOWER_MASK_ELEMENT_RS);
> > +
> > +   ring->tail = (ring->tail + NFP_FLOWER_MASK_ELEMENT_RS) %
> > +(NFP_FLOWER_MASK_ENTRY_RS * NFP_FLOWER_MASK_ELEMENT_RS);
> > +
> > +exit_check_timestamp:
> > +   do_gettimeofday(&curr);
> > +   reuse.tv_sec = curr.tv_sec -
> > +  priv->mask_ids

[PATCH net] rocker: move dereference before free

2017-06-28 Thread Dan Carpenter
My static checker complains that ofdpa_neigh_del() can sometimes free
"found".   It just makes sense to use it first before deleting it.

Fixes: ecf244f753e0 ("rocker: fix maybe-uninitialized warning")
Signed-off-by: Dan Carpenter 
---
Most ref counting function use atomic_dec() but ofdpa_neigh_del() just
uses normal decrement.  Are you sure this isn't racy?

This applies to net.  In net-next the code has shifted around and it
doesn't apply anymore.  Should I send a different patch for net-next?

diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c 
b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 2ae852454780..a9ce82d3e9cf 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -1505,8 +1505,8 @@ static int ofdpa_port_ipv4_nh(struct ofdpa_port 
*ofdpa_port,
*index = entry->index;
resolved = false;
} else if (removing) {
-   ofdpa_neigh_del(trans, found);
*index = found->index;
+   ofdpa_neigh_del(trans, found);
} else if (updating) {
ofdpa_neigh_update(found, trans, NULL, false);
resolved = !is_zero_ether_addr(found->eth_dst);


Re: [PATCH] iwlwifi: mvm: add const to thermal_cooling_device_ops structure

2017-06-28 Thread Luca Coelho
On Wed, 2017-06-21 at 14:10 +0530, Bhumika Goyal wrote:
> Declare thermal_cooling_device_ops structure as const as it is only passed
> as an argument to the function thermal_cooling_device_register and this
> argument is of type const. So, declare the structure as const.
> 
> Signed-off-by: Bhumika Goyal 
> ---

Thanks, we're reviewing this internally.  It looks fine, but I need to
assess whether this will have any impacts in our backports project
before we can apply it.

--
Cheers,
Luca.


[PATCH 05/17] net: convert sk_buff.users from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 drivers/infiniband/hw/nes/nes_cm.c |  4 ++--
 drivers/isdn/mISDN/socket.c|  2 +-
 drivers/net/rionet.c   |  2 +-
 drivers/s390/net/ctcm_main.c   | 26 +-
 drivers/s390/net/netiucv.c | 10 +-
 drivers/s390/net/qeth_core_main.c  |  4 ++--
 include/linux/skbuff.h |  6 +++---
 net/core/datagram.c|  8 
 net/core/dev.c | 10 +-
 net/core/netpoll.c |  4 ++--
 net/core/pktgen.c  | 16 
 net/core/rtnetlink.c   |  2 +-
 net/core/skbuff.c  | 20 ++--
 net/dccp/ipv6.c|  2 +-
 net/ipv6/syncookies.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  2 +-
 net/key/af_key.c   |  4 ++--
 net/netlink/af_netlink.c   |  6 +++---
 net/rxrpc/skbuff.c | 12 ++--
 net/sctp/outqueue.c|  2 +-
 net/sctp/socket.c  |  2 +-
 21 files changed, 73 insertions(+), 73 deletions(-)

diff --git a/drivers/infiniband/hw/nes/nes_cm.c 
b/drivers/infiniband/hw/nes/nes_cm.c
index 30b256a..de4025d 100644
--- a/drivers/infiniband/hw/nes/nes_cm.c
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -742,7 +742,7 @@ int schedule_nes_timer(struct nes_cm_node *cm_node, struct 
sk_buff *skb,
 
if (type == NES_TIMER_TYPE_SEND) {
new_send->seq_num = ntohl(tcp_hdr(skb)->seq);
-   atomic_inc(&new_send->skb->users);
+   refcount_inc(&new_send->skb->users);
spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
cm_node->send_entry = new_send;
add_ref_cm_node(cm_node);
@@ -924,7 +924,7 @@ static void nes_cm_timer_tick(unsigned long pass)
  flags);
break;
}
-   atomic_inc(&send_entry->skb->users);
+   refcount_inc(&send_entry->skb->users);
cm_packets_retrans++;
nes_debug(NES_DBG_CM, "Retransmitting send_entry %p "
  "for node %p, jiffies = %lu, time to send = "
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 99e5f97..c5603d1 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -155,7 +155,7 @@ mISDN_sock_recvmsg(struct socket *sock, struct msghdr *msg, 
size_t len,
copied = skb->len + MISDN_HEADER_LEN;
if (len < copied) {
if (flags & MSG_PEEK)
-   atomic_dec(&skb->users);
+   refcount_dec(&skb->users);
else
skb_queue_head(&sk->sk_receive_queue, skb);
return -ENOSPC;
diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c
index 300bb14..e9f101c 100644
--- a/drivers/net/rionet.c
+++ b/drivers/net/rionet.c
@@ -201,7 +201,7 @@ static int rionet_start_xmit(struct sk_buff *skb, struct 
net_device *ndev)
rionet_queue_tx_msg(skb, ndev,
nets[rnet->mport->id].active[i]);
if (count)
-   atomic_inc(&skb->users);
+   refcount_inc(&skb->users);
count++;
}
} else if (RIONET_MAC_MATCH(eth->h_dest)) {
diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c
index 198842c..912b877 100644
--- a/drivers/s390/net/ctcm_main.c
+++ b/drivers/s390/net/ctcm_main.c
@@ -483,7 +483,7 @@ static int ctcm_transmit_skb(struct channel *ch, struct 
sk_buff *skb)
spin_unlock_irqrestore(&ch->collect_lock, saveflags);
return -EBUSY;
} else {
-   atomic_inc(&skb->users);
+   refcount_inc(&skb->users);
header.length = l;
header.type = be16_to_cpu(skb->protocol);
header.unused = 0;
@@ -500,7 +500,7 @@ static int ctcm_transmit_skb(struct channel *ch, struct 
sk_buff *skb)
 * Protect skb against beeing free'd by upper
 * layers.
 */
-   atomic_inc(&skb->users);
+   refcount_inc(&skb->users);
ch->prof.txlen += skb->len;
header.length = skb->len + LL_HEADER_LENGTH;
header.type = be16_to_cpu(skb->protocol);
@@ -517,14 +517,14 @@ static int ctcm_transmit_skb(struct channel *ch, struct 
s

[PATCH 15/17] net: convert net.passive from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/net/net_namespace.h | 3 ++-
 net/core/net-sysfs.c| 2 +-
 net/core/net_namespace.c| 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index fe80bb4..bffe0a3 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -5,6 +5,7 @@
 #define __NET_NET_NAMESPACE_H
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -46,7 +47,7 @@ struct netns_ipvs;
 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
 
 struct net {
-   atomic_tpassive;/* To decided when the network
+   refcount_t  passive;/* To decided when the network
 * namespace should be freed.
 */
atomic_tcount;  /* To decided when the network
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 65ea0ff..bdcf5dd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1444,7 +1444,7 @@ static void *net_grab_current_ns(void)
struct net *ns = current->nsproxy->net_ns;
 #ifdef CONFIG_NET_NS
if (ns)
-   atomic_inc(&ns->passive);
+   refcount_inc(&ns->passive);
 #endif
return ns;
 }
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 26bbfab..50935eb 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -284,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct 
user_namespace *user_ns)
LIST_HEAD(net_exit_list);
 
atomic_set(&net->count, 1);
-   atomic_set(&net->passive, 1);
+   refcount_set(&net->passive, 1);
net->dev_base_seq = 1;
net->user_ns = user_ns;
idr_init(&net->netns_ids);
@@ -380,7 +380,7 @@ static void net_free(struct net *net)
 void net_drop_ns(void *p)
 {
struct net *ns = p;
-   if (ns && atomic_dec_and_test(&ns->passive))
+   if (ns && refcount_dec_and_test(&ns->passive))
net_free(ns);
 }
 
-- 
2.7.4



[PATCH 16/17] net: convert netlbl_lsm_cache.refcount from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/net/netlabel.h | 8 
 net/ipv4/cipso_ipv4.c  | 4 ++--
 net/ipv6/calipso.c | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index efe9806..72d6435 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -37,7 +37,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 struct cipso_v4_doi;
 struct calipso_doi;
@@ -136,7 +136,7 @@ struct netlbl_audit {
  *
  */
 struct netlbl_lsm_cache {
-   atomic_t refcount;
+   refcount_t refcount;
void (*free) (const void *data);
void *data;
 };
@@ -295,7 +295,7 @@ static inline struct netlbl_lsm_cache 
*netlbl_secattr_cache_alloc(gfp_t flags)
 
cache = kzalloc(sizeof(*cache), flags);
if (cache)
-   atomic_set(&cache->refcount, 1);
+   refcount_set(&cache->refcount, 1);
return cache;
 }
 
@@ -309,7 +309,7 @@ static inline struct netlbl_lsm_cache 
*netlbl_secattr_cache_alloc(gfp_t flags)
  */
 static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache)
 {
-   if (!atomic_dec_and_test(&cache->refcount))
+   if (!refcount_dec_and_test(&cache->refcount))
return;
 
if (cache->free)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index ae20616..c204477 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -265,7 +265,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
entry->key_len == key_len &&
memcmp(entry->key, key, key_len) == 0) {
entry->activity += 1;
-   atomic_inc(&entry->lsm_data->refcount);
+   refcount_inc(&entry->lsm_data->refcount);
secattr->cache = entry->lsm_data;
secattr->flags |= NETLBL_SECATTR_CACHE;
secattr->type = NETLBL_NLTYPE_CIPSOV4;
@@ -332,7 +332,7 @@ int cipso_v4_cache_add(const unsigned char *cipso_ptr,
}
entry->key_len = cipso_ptr_len;
entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len);
-   atomic_inc(&secattr->cache->refcount);
+   refcount_inc(&secattr->cache->refcount);
entry->lsm_data = secattr->cache;
 
bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 8d772fe..4406752 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -227,7 +227,7 @@ static int calipso_cache_check(const unsigned char *key,
entry->key_len == key_len &&
memcmp(entry->key, key, key_len) == 0) {
entry->activity += 1;
-   atomic_inc(&entry->lsm_data->refcount);
+   refcount_inc(&entry->lsm_data->refcount);
secattr->cache = entry->lsm_data;
secattr->flags |= NETLBL_SECATTR_CACHE;
secattr->type = NETLBL_NLTYPE_CALIPSO;
@@ -296,7 +296,7 @@ static int calipso_cache_add(const unsigned char 
*calipso_ptr,
}
entry->key_len = calipso_ptr_len;
entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len);
-   atomic_inc(&secattr->cache->refcount);
+   refcount_inc(&secattr->cache->refcount);
entry->lsm_data = secattr->cache;
 
bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1);
-- 
2.7.4



[PATCH 13/17] net: convert fib_rule.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/net/fib_rules.h | 7 ---
 net/core/fib_rules.c| 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 76c7300..c487bfa 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -29,7 +30,7 @@ struct fib_rule {
struct fib_rule __rcu   *ctarget;
struct net  *fr_net;
 
-   atomic_trefcnt;
+   refcount_t  refcnt;
u32 pref;
int suppress_ifgroup;
int suppress_prefixlen;
@@ -103,12 +104,12 @@ struct fib_rules_ops {
 
 static inline void fib_rule_get(struct fib_rule *rule)
 {
-   atomic_inc(&rule->refcnt);
+   refcount_inc(&rule->refcnt);
 }
 
 static inline void fib_rule_put(struct fib_rule *rule)
 {
-   if (atomic_dec_and_test(&rule->refcnt))
+   if (refcount_dec_and_test(&rule->refcnt))
kfree_rcu(rule, rcu);
 }
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 3bba291..c4ecd9f 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -46,7 +46,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
if (r == NULL)
return -ENOMEM;
 
-   atomic_set(&r->refcnt, 1);
+   refcount_set(&r->refcnt, 1);
r->action = FR_ACT_TO_TBL;
r->pref = pref;
r->table = table;
@@ -283,7 +283,7 @@ int fib_rules_lookup(struct fib_rules_ops *ops, struct 
flowi *fl,
 
if (err != -EAGAIN) {
if ((arg->flags & FIB_LOOKUP_NOREF) ||
-   likely(atomic_inc_not_zero(&rule->refcnt))) {
+   likely(refcount_inc_not_zero(&rule->refcnt))) {
arg->rule = rule;
goto out;
}
-- 
2.7.4



[PATCH 17/17] net: convert packet_fanout.sk_ref from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 net/packet/af_packet.c | 8 
 net/packet/internal.h  | 4 +++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 87d7867..5c78d6a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1742,7 +1742,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 
type_flags)
match->flags = flags;
INIT_LIST_HEAD(&match->list);
spin_lock_init(&match->lock);
-   atomic_set(&match->sk_ref, 0);
+   refcount_set(&match->sk_ref, 0);
fanout_init_data(match);
match->prot_hook.type = po->prot_hook.type;
match->prot_hook.dev = po->prot_hook.dev;
@@ -1756,10 +1756,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 
type_flags)
match->prot_hook.type == po->prot_hook.type &&
match->prot_hook.dev == po->prot_hook.dev) {
err = -ENOSPC;
-   if (atomic_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
+   if (refcount_read(&match->sk_ref) < PACKET_FANOUT_MAX) {
__dev_remove_pack(&po->prot_hook);
po->fanout = match;
-   atomic_inc(&match->sk_ref);
+   refcount_set(&match->sk_ref, 
refcount_read(&match->sk_ref) + 1);
__fanout_link(sk, po);
err = 0;
}
@@ -1788,7 +1788,7 @@ static struct packet_fanout *fanout_release(struct sock 
*sk)
if (f) {
po->fanout = NULL;
 
-   if (atomic_dec_and_test(&f->sk_ref))
+   if (refcount_dec_and_test(&f->sk_ref))
list_del(&f->list);
else
f = NULL;
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 9ee4631..94d1d40 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -1,6 +1,8 @@
 #ifndef __PACKET_INTERNAL_H__
 #define __PACKET_INTERNAL_H__
 
+#include 
+
 struct packet_mclist {
struct packet_mclist*next;
int ifindex;
@@ -86,7 +88,7 @@ struct packet_fanout {
struct list_headlist;
struct sock *arr[PACKET_FANOUT_MAX];
spinlock_t  lock;
-   atomic_tsk_ref;
+   refcount_t  sk_ref;
struct packet_type  prot_hook cacheline_aligned_in_smp;
 };
 
-- 
2.7.4



[PATCH 14/17] net: convert inet_frag_queue.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/net/inet_frag.h  |  4 ++--
 net/ipv4/inet_fragment.c | 14 +++---
 net/ipv4/ip_fragment.c   |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 5894730..5a334bf 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -50,7 +50,7 @@ struct inet_frag_queue {
spinlock_t  lock;
struct timer_list   timer;
struct hlist_node   list;
-   atomic_trefcnt;
+   refcount_t  refcnt;
struct sk_buff  *fragments;
struct sk_buff  *fragments_tail;
ktime_t stamp;
@@ -129,7 +129,7 @@ void inet_frag_maybe_warn_overflow(struct inet_frag_queue 
*q,
 
 static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags 
*f)
 {
-   if (atomic_dec_and_test(&q->refcnt))
+   if (refcount_dec_and_test(&q->refcnt))
inet_frag_destroy(q, f);
 }
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index b5e9317..96e95e8 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -276,11 +276,11 @@ static inline void fq_unlink(struct inet_frag_queue *fq, 
struct inet_frags *f)
 void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
 {
if (del_timer(&fq->timer))
-   atomic_dec(&fq->refcnt);
+   refcount_dec(&fq->refcnt);
 
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq_unlink(fq, f);
-   atomic_dec(&fq->refcnt);
+   refcount_dec(&fq->refcnt);
}
 }
 EXPORT_SYMBOL(inet_frag_kill);
@@ -329,7 +329,7 @@ static struct inet_frag_queue *inet_frag_intern(struct 
netns_frags *nf,
 */
hlist_for_each_entry(qp, &hb->chain, list) {
if (qp->net == nf && f->match(qp, arg)) {
-   atomic_inc(&qp->refcnt);
+   refcount_inc(&qp->refcnt);
spin_unlock(&hb->chain_lock);
qp_in->flags |= INET_FRAG_COMPLETE;
inet_frag_put(qp_in, f);
@@ -339,9 +339,9 @@ static struct inet_frag_queue *inet_frag_intern(struct 
netns_frags *nf,
 #endif
qp = qp_in;
if (!mod_timer(&qp->timer, jiffies + nf->timeout))
-   atomic_inc(&qp->refcnt);
+   refcount_inc(&qp->refcnt);
 
-   atomic_inc(&qp->refcnt);
+   refcount_inc(&qp->refcnt);
hlist_add_head(&qp->list, &hb->chain);
 
spin_unlock(&hb->chain_lock);
@@ -370,7 +370,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct 
netns_frags *nf,
 
setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
spin_lock_init(&q->lock);
-   atomic_set(&q->refcnt, 1);
+   refcount_set(&q->refcnt, 1);
 
return q;
 }
@@ -405,7 +405,7 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags 
*nf,
spin_lock(&hb->chain_lock);
hlist_for_each_entry(q, &hb->chain, list) {
if (q->net == nf && f->match(q, key)) {
-   atomic_inc(&q->refcnt);
+   refcount_inc(&q->refcnt);
spin_unlock(&hb->chain_lock);
return q;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b3cdeec..9a8cfac 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -312,7 +312,7 @@ static int ip_frag_reinit(struct ipq *qp)
unsigned int sum_truesize = 0;
 
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
-   atomic_inc(&qp->q.refcnt);
+   refcount_inc(&qp->q.refcnt);
return -ETIMEDOUT;
}
 
-- 
2.7.4



[PATCH 08/17] net: convert sock.sk_refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

This patch uses refcount_inc_not_zero() instead of
atomic_inc_not_zero_hint() due to absence of a _hint()
version of refcount API. If the hint() version must
be used, we might need to revisit API.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 crypto/algif_aead.c |  2 +-
 include/net/inet_hashtables.h   |  4 ++--
 include/net/request_sock.h  |  9 +
 include/net/sock.h  | 17 +
 net/atm/proc.c  |  2 +-
 net/bluetooth/af_bluetooth.c|  2 +-
 net/bluetooth/rfcomm/sock.c |  2 +-
 net/core/skbuff.c   |  6 +++---
 net/core/sock.c |  6 +++---
 net/ipv4/inet_connection_sock.c |  2 +-
 net/ipv4/inet_hashtables.c  |  4 ++--
 net/ipv4/inet_timewait_sock.c   |  8 
 net/ipv4/ping.c |  4 ++--
 net/ipv4/raw.c  |  2 +-
 net/ipv4/syncookies.c   |  2 +-
 net/ipv4/tcp_fastopen.c |  2 +-
 net/ipv4/tcp_ipv4.c |  4 ++--
 net/ipv4/udp.c  |  6 +++---
 net/ipv4/udp_diag.c |  4 ++--
 net/ipv6/datagram.c |  2 +-
 net/ipv6/inet6_hashtables.c |  4 ++--
 net/ipv6/tcp_ipv6.c |  4 ++--
 net/ipv6/udp.c  |  4 ++--
 net/key/af_key.c|  2 +-
 net/l2tp/l2tp_debugfs.c |  3 +--
 net/llc/llc_conn.c  |  8 
 net/llc/llc_sap.c   |  2 +-
 net/netfilter/xt_TPROXY.c   |  4 ++--
 net/netlink/af_netlink.c|  6 +++---
 net/packet/af_packet.c  |  2 +-
 net/phonet/socket.c |  2 +-
 net/rxrpc/af_rxrpc.c|  2 +-
 net/sched/em_meta.c |  2 +-
 net/tipc/socket.c   |  2 +-
 net/unix/af_unix.c  |  2 +-
 35 files changed, 70 insertions(+), 69 deletions(-)

diff --git a/crypto/algif_aead.c b/crypto/algif_aead.c
index 8af664f..be11749 100644
--- a/crypto/algif_aead.c
+++ b/crypto/algif_aead.c
@@ -877,7 +877,7 @@ static void aead_sock_destruct(struct sock *sk)
unsigned int ivlen = crypto_aead_ivsize(
crypto_aead_reqtfm(&ctx->aead_req));
 
-   WARN_ON(atomic_read(&sk->sk_refcnt) != 0);
+   WARN_ON(refcount_read(&sk->sk_refcnt) != 0);
aead_put_sgl(sk);
sock_kzfree_s(sk, ctx->iv, ivlen);
sock_kfree_s(sk, ctx, ctx->len);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 1178931..b9e6e0e 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -32,7 +32,7 @@
 #include 
 #include 
 
-#include 
+#include 
 #include 
 
 /* This is for all connections with a full identity, no wildcards.
@@ -334,7 +334,7 @@ static inline struct sock *inet_lookup(struct net *net,
sk = __inet_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
   dport, dif, &refcounted);
 
-   if (sk && !refcounted && !atomic_inc_not_zero(&sk->sk_refcnt))
+   if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
 }
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index a12a5d2..e76e8c2 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -89,7 +90,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock 
*sk_listener,
return NULL;
req->rsk_listener = NULL;
if (attach_listener) {
-   if (unlikely(!atomic_inc_not_zero(&sk_listener->sk_refcnt))) {
+   if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) {
kmem_cache_free(ops->slab, req);
return NULL;
}
@@ -100,7 +101,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock 
*sk_listener,
sk_node_init(&req_to_sk(req)->sk_node);
sk_tx_queue_clear(req_to_sk(req));
req->saved_syn = NULL;
-   atomic_set(&req->rsk_refcnt, 0);
+   refcount_set(&req->rsk_refcnt, 0);
 
return req;
 }
@@ -108,7 +109,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock 
*sk_listener,
 static inline void reqsk_free(struct request_sock *req)
 {
/* temporary debugging */
-   WARN_ON_ONCE(atomic_read(&req->rsk_refcnt) != 0);
+   WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0);
 
req->rsk_ops->destructor(req);
if (req->rsk_listener)
@@ -119,7 +120,7 @@ static inline void reqsk_free(struct request_sock *req)
 
 static inline void reqsk_put(struct request_sock *req)
 {
-   if (atomic_dec_and_test(&req->rsk_refcnt))
+   if (refcount_dec_and_test(&req->rs

[PATCH 12/17] net: convert unix_address.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/net/af_unix.h | 3 ++-
 net/unix/af_unix.c| 8 
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index fd60ecc..3a385e4 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 void unix_inflight(struct user_struct *user, struct file *fp);
@@ -21,7 +22,7 @@ extern spinlock_t unix_table_lock;
 extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
 
 struct unix_address {
-   atomic_trefcnt;
+   refcount_t  refcnt;
int len;
unsigned inthash;
struct sockaddr_un name[0];
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index c885254..b9ee766 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -212,7 +212,7 @@ EXPORT_SYMBOL_GPL(unix_peer_get);
 
 static inline void unix_release_addr(struct unix_address *addr)
 {
-   if (atomic_dec_and_test(&addr->refcnt))
+   if (refcount_dec_and_test(&addr->refcnt))
kfree(addr);
 }
 
@@ -864,7 +864,7 @@ static int unix_autobind(struct socket *sock)
goto out;
 
addr->name->sun_family = AF_UNIX;
-   atomic_set(&addr->refcnt, 1);
+   refcount_set(&addr->refcnt, 1);
 
 retry:
addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + 
sizeof(short);
@@ -1040,7 +1040,7 @@ static int unix_bind(struct socket *sock, struct sockaddr 
*uaddr, int addr_len)
memcpy(addr->name, sunaddr, addr_len);
addr->len = addr_len;
addr->hash = hash ^ sk->sk_type;
-   atomic_set(&addr->refcnt, 1);
+   refcount_set(&addr->refcnt, 1);
 
if (sun_path[0]) {
addr->hash = UNIX_HASH_SIZE;
@@ -1335,7 +1335,7 @@ static int unix_stream_connect(struct socket *sock, 
struct sockaddr *uaddr,
 
/* copy address information from listening to new sock*/
if (otheru->addr) {
-   atomic_inc(&otheru->addr->refcnt);
+   refcount_inc(&otheru->addr->refcnt);
newu->addr = otheru->addr;
}
if (otheru->path.dentry) {
-- 
2.7.4



[PATCH 10/17] net: convert in_device.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/linux/inetdevice.h | 11 ++-
 net/ipv4/devinet.c |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index a2e9d6e..4b9d135 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct ipv4_devconf {
void*sysctl;
@@ -22,7 +23,7 @@ struct ipv4_devconf {
 
 struct in_device {
struct net_device   *dev;
-   atomic_trefcnt;
+   refcount_t  refcnt;
int dead;
struct in_ifaddr*ifa_list;  /* IP ifaddr chain  
*/
 
@@ -212,7 +213,7 @@ static inline struct in_device *in_dev_get(const struct 
net_device *dev)
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev)
-   atomic_inc(&in_dev->refcnt);
+   refcount_inc(&in_dev->refcnt);
rcu_read_unlock();
return in_dev;
 }
@@ -233,12 +234,12 @@ void in_dev_finish_destroy(struct in_device *idev);
 
 static inline void in_dev_put(struct in_device *idev)
 {
-   if (atomic_dec_and_test(&idev->refcnt))
+   if (refcount_dec_and_test(&idev->refcnt))
in_dev_finish_destroy(idev);
 }
 
-#define __in_dev_put(idev)  atomic_dec(&(idev)->refcnt)
-#define in_dev_hold(idev)   atomic_inc(&(idev)->refcnt)
+#define __in_dev_put(idev)  refcount_dec(&(idev)->refcnt)
+#define in_dev_hold(idev)   refcount_inc(&(idev)->refcnt)
 
 #endif /* __KERNEL__ */
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index df14815..16c5e22 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -251,7 +251,7 @@ static struct in_device *inetdev_init(struct net_device 
*dev)
/* Reference in_dev->dev */
dev_hold(dev);
/* Account for reference dev->ip_ptr (below) */
-   in_dev_hold(in_dev);
+   refcount_set(&in_dev->refcnt, 1);
 
err = devinet_sysctl_register(in_dev);
if (err) {
-- 
2.7.4



[PATCH 09/17] net: convert ip_mc_list.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/linux/igmp.h |  3 ++-
 net/ipv4/igmp.c  | 10 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 12f6fba..97caf18 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 static inline struct igmphdr *igmp_hdr(const struct sk_buff *skb)
@@ -84,7 +85,7 @@ struct ip_mc_list {
struct ip_mc_list __rcu *next_hash;
struct timer_list   timer;
int users;
-   atomic_trefcnt;
+   refcount_t  refcnt;
spinlock_t  lock;
chartm_running;
charreporter;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index ec9a396..f7685f7 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -173,7 +173,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 
*pmca, int sfmode,
 
 static void ip_ma_put(struct ip_mc_list *im)
 {
-   if (atomic_dec_and_test(&im->refcnt)) {
+   if (refcount_dec_and_test(&im->refcnt)) {
in_dev_put(im->interface);
kfree_rcu(im, rcu);
}
@@ -199,7 +199,7 @@ static void igmp_stop_timer(struct ip_mc_list *im)
 {
spin_lock_bh(&im->lock);
if (del_timer(&im->timer))
-   atomic_dec(&im->refcnt);
+   refcount_dec(&im->refcnt);
im->tm_running = 0;
im->reporter = 0;
im->unsolicit_count = 0;
@@ -213,7 +213,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int 
max_delay)
 
im->tm_running = 1;
if (!mod_timer(&im->timer, jiffies+tv+2))
-   atomic_inc(&im->refcnt);
+   refcount_inc(&im->refcnt);
 }
 
 static void igmp_gq_start_timer(struct in_device *in_dev)
@@ -249,7 +249,7 @@ static void igmp_mod_timer(struct ip_mc_list *im, int 
max_delay)
spin_unlock_bh(&im->lock);
return;
}
-   atomic_dec(&im->refcnt);
+   refcount_dec(&im->refcnt);
}
igmp_start_timer(im, max_delay);
spin_unlock_bh(&im->lock);
@@ -1374,7 +1374,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 
addr)
/* initial mode is (EX, empty) */
im->sfmode = MCAST_EXCLUDE;
im->sfcount[MCAST_EXCLUDE] = 1;
-   atomic_set(&im->refcnt, 1);
+   refcount_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
-- 
2.7.4



[PATCH 07/17] net: convert sock.sk_wmem_alloc from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 drivers/atm/fore200e.c   | 12 +---
 drivers/atm/he.c |  2 +-
 drivers/atm/idt77252.c   |  4 ++--
 include/linux/atmdev.h   |  2 +-
 include/net/sock.h   |  8 
 net/atm/br2684.c |  2 +-
 net/atm/clip.c   |  2 +-
 net/atm/common.c | 10 +-
 net/atm/lec.c|  4 ++--
 net/atm/mpc.c|  4 ++--
 net/atm/pppoatm.c|  2 +-
 net/atm/raw.c|  2 +-
 net/atm/signaling.c  |  2 +-
 net/caif/caif_socket.c   |  2 +-
 net/core/datagram.c  |  2 +-
 net/core/skbuff.c|  2 +-
 net/core/sock.c  | 26 +-
 net/ipv4/af_inet.c   |  2 +-
 net/ipv4/esp4.c  |  2 +-
 net/ipv4/ip_output.c |  6 +++---
 net/ipv4/tcp.c   |  4 ++--
 net/ipv4/tcp_offload.c   |  2 +-
 net/ipv4/tcp_output.c| 13 ++---
 net/ipv6/esp6.c  |  2 +-
 net/ipv6/ip6_output.c|  4 ++--
 net/kcm/kcmproc.c|  2 +-
 net/key/af_key.c |  2 +-
 net/netlink/af_netlink.c |  2 +-
 net/packet/af_packet.c   |  4 ++--
 net/phonet/socket.c  |  2 +-
 net/rds/tcp_send.c   |  2 +-
 net/rxrpc/af_rxrpc.c |  4 ++--
 net/sched/sch_atm.c  |  2 +-
 net/sctp/output.c|  2 +-
 net/sctp/proc.c  |  2 +-
 net/sctp/socket.c|  4 ++--
 net/unix/af_unix.c   |  6 +++---
 37 files changed, 73 insertions(+), 84 deletions(-)

diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 637c3e6..b770d18 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -924,12 +924,7 @@ fore200e_tx_irq(struct fore200e* fore200e)
else {
dev_kfree_skb_any(entry->skb);
}
-#if 1
-   /* race fixed by the above incarnation mechanism, but... */
-   if (atomic_read(&sk_atm(vcc)->sk_wmem_alloc) < 0) {
-   atomic_set(&sk_atm(vcc)->sk_wmem_alloc, 0);
-   }
-#endif
+
/* check error condition */
if (*entry->status & STATUS_ERROR)
atomic_inc(&vcc->stats->tx_err);
@@ -1130,13 +1125,9 @@ fore200e_push_rpd(struct fore200e* fore200e, struct 
atm_vcc* vcc, struct rpd* rp
return -ENOMEM;
 }
 
-ASSERT(atomic_read(&sk_atm(vcc)->sk_wmem_alloc) >= 0);
-
 vcc->push(vcc, skb);
 atomic_inc(&vcc->stats->rx);
 
-ASSERT(atomic_read(&sk_atm(vcc)->sk_wmem_alloc) >= 0);
-
 return 0;
 }
 
@@ -1572,7 +1563,6 @@ fore200e_send(struct atm_vcc *vcc, struct sk_buff *skb)
 unsigned long   flags;
 
 ASSERT(vcc);
-ASSERT(atomic_read(&sk_atm(vcc)->sk_wmem_alloc) >= 0);
 ASSERT(fore200e);
 ASSERT(fore200e_vcc);
 
diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index 3617659..fc1bbdb 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -2395,7 +2395,7 @@ he_close(struct atm_vcc *vcc)
 * TBRQ, the host issues the close command to the adapter.
 */
 
-   while (((tx_inuse = atomic_read(&sk_atm(vcc)->sk_wmem_alloc)) > 
1) &&
+   while (((tx_inuse = refcount_read(&sk_atm(vcc)->sk_wmem_alloc)) 
> 1) &&
   (retry < MAX_RETRY)) {
msleep(sleep);
if (sleep < 250)
diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c
index 5ec1095..20eda87 100644
--- a/drivers/atm/idt77252.c
+++ b/drivers/atm/idt77252.c
@@ -724,7 +724,7 @@ push_on_scq(struct idt77252_dev *card, struct vc_map *vc, 
struct sk_buff *skb)
struct sock *sk = sk_atm(vcc);
 
vc->estimator->cells += (skb->len + 47) / 48;
-   if (atomic_read(&sk->sk_wmem_alloc) >
+   if (refcount_read(&sk->sk_wmem_alloc) >
(sk->sk_sndbuf >> 1)) {
u32 cps = vc->estimator->maxcps;
 
@@ -2012,7 +2012,7 @@ idt77252_send_oam(struct atm_vcc *vcc, void *cell, int 
flags)
atomic_inc(&vcc->stats->tx_err);
return -ENOMEM;
}
-   atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
+   refcount_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc);
 
memcpy(skb_put(skb, 52), cell, 52);
 
diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index c1da539..4d97a89 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -254,7 +254,7 @@ static inline void atm_return(struct atm_vcc *vcc,int 
truesize)
 
 static inline int atm_may_send(struct atm_vcc *vcc,unsigned int size)
 {
-   return (size + atomic_read(&sk_atm(vcc)->sk_wmem_alloc)) <
+   return (size + refcount_read(&sk_atm(vcc)->sk_wmem_alloc)) <
   sk_atm

[PATCH 11/17] net: convert netpoll_info.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/linux/netpoll.h | 3 ++-
 net/core/netpoll.c  | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 1828900..27c0aaa 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 union inet_addr {
__u32   all[4];
@@ -34,7 +35,7 @@ struct netpoll {
 };
 
 struct netpoll_info {
-   atomic_t refcnt;
+   refcount_t refcnt;
 
struct semaphore dev_lock;
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index e256cd8..59a08e5 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -632,7 +632,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device 
*ndev)
skb_queue_head_init(&npinfo->txq);
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
 
-   atomic_set(&npinfo->refcnt, 1);
+   refcount_set(&npinfo->refcnt, 1);
 
ops = np->dev->netdev_ops;
if (ops->ndo_netpoll_setup) {
@@ -642,7 +642,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device 
*ndev)
}
} else {
npinfo = rtnl_dereference(ndev->npinfo);
-   atomic_inc(&npinfo->refcnt);
+   refcount_inc(&npinfo->refcnt);
}
 
npinfo->netpoll = np;
@@ -821,7 +821,7 @@ void __netpoll_cleanup(struct netpoll *np)
 
synchronize_srcu(&netpoll_srcu);
 
-   if (atomic_dec_and_test(&npinfo->refcnt)) {
+   if (refcount_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
 
ops = np->dev->netdev_ops;
-- 
2.7.4



Re: [PATCH] iwlwifi: mvm: fix iwl_mvm_sar_find_wifi_pkg corner case

2017-06-28 Thread Luca Coelho
On Tue, 2017-06-27 at 17:24 +0200, Arnd Bergmann wrote:
> gcc warns about what it thinks is an uninitialized variable
> access:
> 
> drivers/net/wireless/intel/iwlwifi/mvm/fw.c: In function 
> 'iwl_mvm_sar_find_wifi_pkg.isra.14':
> drivers/net/wireless/intel/iwlwifi/mvm/fw.c:1102:5: error: 'wifi_pkg' may be 
> used uninitialized in this function [-Werror=maybe-uninitialized]
> 
> That problem cannot really happen, as we check data->package.count
> to ensure that the loop is entered at least once.
> However, something that can indeed happen is returning an incorrect
> wifi_pkg pointer in case none of the elements are what we are looking
> for.
> 
> This modifies the loop again, to only return a correct object, and
> to shut up that warning.
> 
> Fixes: c386dacb4ed6 ("iwlwifi: mvm: refactor SAR init to prepare for dynamic 
> SAR")
> Signed-off-by: Arnd Bergmann 
> ---

Thanks, Arnd!

I've pushed this to our internal tree and it will eventually reach the
mainline, via our normal upstreaming process.


--
Cheers,
Luca.


[PATCH 06/17] net: convert sk_buff_fclones.fclone_ref from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/linux/skbuff.h |  4 ++--
 net/core/skbuff.c  | 10 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7498e64..5e26b4c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -948,7 +948,7 @@ struct sk_buff_fclones {
 
struct sk_buff  skb2;
 
-   atomic_tfclone_ref;
+   refcount_t  fclone_ref;
 };
 
 /**
@@ -968,7 +968,7 @@ static inline bool skb_fclone_busy(const struct sock *sk,
fclones = container_of(skb, struct sk_buff_fclones, skb1);
 
return skb->fclone == SKB_FCLONE_ORIG &&
-  atomic_read(&fclones->fclone_ref) > 1 &&
+  refcount_read(&fclones->fclone_ref) > 1 &&
   fclones->skb2.sk == sk;
 }
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f04c1f8..1bd9352 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -268,7 +268,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
 
kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
skb->fclone = SKB_FCLONE_ORIG;
-   atomic_set(&fclones->fclone_ref, 1);
+   refcount_set(&fclones->fclone_ref, 1);
 
fclones->skb2.fclone = SKB_FCLONE_CLONE;
}
@@ -629,7 +629,7 @@ static void kfree_skbmem(struct sk_buff *skb)
 * This test would have no chance to be true for the clone,
 * while here, branch prediction will be good.
 */
-   if (atomic_read(&fclones->fclone_ref) == 1)
+   if (refcount_read(&fclones->fclone_ref) == 1)
goto fastpath;
break;
 
@@ -637,7 +637,7 @@ static void kfree_skbmem(struct sk_buff *skb)
fclones = container_of(skb, struct sk_buff_fclones, skb2);
break;
}
-   if (!atomic_dec_and_test(&fclones->fclone_ref))
+   if (!refcount_dec_and_test(&fclones->fclone_ref))
return;
 fastpath:
kmem_cache_free(skbuff_fclone_cache, fclones);
@@ -1018,9 +1018,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t 
gfp_mask)
return NULL;
 
if (skb->fclone == SKB_FCLONE_ORIG &&
-   atomic_read(&fclones->fclone_ref) == 1) {
+   refcount_read(&fclones->fclone_ref) == 1) {
n = &fclones->skb2;
-   atomic_set(&fclones->fclone_ref, 2);
+   refcount_set(&fclones->fclone_ref, 2);
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
-- 
2.7.4



[PATCH 00/17] v2 net generic subsystem refcount conversions

2017-06-28 Thread Elena Reshetova
Changes in v2:
No changes in patches apart from rebases, but now by
default refcount_t = atomic_t (*) and uses all atomic standard operations
unless CONFIG_REFCOUNT_FULL is enabled. This is a compromise for the
systems that are critical on performance (such as net) and cannot accept even
slight delay on the refcounter operations.

This series, for core network subsystem components, replaces atomic_t reference
counters with the new refcount_t type and API (see include/linux/refcount.h).
By doing this we prevent intentional or accidental
underflows or overflows that can lead to use-after-free vulnerabilities.
These patches contain only generic net pieces. Other changes will be sent 
separately.

The patches are fully independent and can be cherry-picked separately.
The big patches, such as conversions for sock structure, need a very detailed
look from maintainers: refcount managing is quite complex in them and while
it seems that they would benefit from the change, extra checking is needed.
The biggest corner issue is the fact that refcount_inc() does not increment
from zero.

If there are no objections to the patches, please merge them via respective 
trees.

* The respective change is currently merged into -next as
  "locking/refcount: Create unchecked atomic_t implementation".

Elena Reshetova (17):
  net: convert inet_peer.refcnt from atomic_t to refcount_t
  net: convert neighbour.refcnt from atomic_t to refcount_t
  net: convert neigh_params.refcnt from atomic_t to refcount_t
  net: convert nf_bridge_info.use from atomic_t to refcount_t
  net: convert sk_buff.users from atomic_t to refcount_t
  net: convert sk_buff_fclones.fclone_ref from atomic_t to refcount_t
  net: convert sock.sk_wmem_alloc from atomic_t to refcount_t
  net: convert sock.sk_refcnt from atomic_t to refcount_t
  net: convert ip_mc_list.refcnt from atomic_t to refcount_t
  net: convert in_device.refcnt from atomic_t to refcount_t
  net: convert netpoll_info.refcnt from atomic_t to refcount_t
  net: convert unix_address.refcnt from atomic_t to refcount_t
  net: convert fib_rule.refcnt from atomic_t to refcount_t
  net: convert inet_frag_queue.refcnt from atomic_t to refcount_t
  net: convert net.passive from atomic_t to refcount_t
  net: convert netlbl_lsm_cache.refcount from atomic_t to refcount_t
  net: convert packet_fanout.sk_ref from atomic_t to refcount_t

 crypto/algif_aead.c  |  2 +-
 drivers/atm/fore200e.c   | 12 +---
 drivers/atm/he.c |  2 +-
 drivers/atm/idt77252.c   |  4 ++--
 drivers/infiniband/hw/nes/nes_cm.c   |  4 ++--
 drivers/isdn/mISDN/socket.c  |  2 +-
 drivers/net/rionet.c |  2 +-
 drivers/s390/net/ctcm_main.c | 26 
 drivers/s390/net/netiucv.c   | 10 +-
 drivers/s390/net/qeth_core_main.c|  4 ++--
 include/linux/atmdev.h   |  2 +-
 include/linux/igmp.h |  3 ++-
 include/linux/inetdevice.h   | 11 ++-
 include/linux/netpoll.h  |  3 ++-
 include/linux/skbuff.h   | 16 +++
 include/net/af_unix.h|  3 ++-
 include/net/arp.h|  2 +-
 include/net/fib_rules.h  |  7 ---
 include/net/inet_frag.h  |  4 ++--
 include/net/inet_hashtables.h|  4 ++--
 include/net/inetpeer.h   |  4 ++--
 include/net/ndisc.h  |  2 +-
 include/net/neighbour.h  | 15 +++---
 include/net/net_namespace.h  |  3 ++-
 include/net/netfilter/br_netfilter.h |  2 +-
 include/net/netlabel.h   |  8 
 include/net/request_sock.h   |  9 +
 include/net/sock.h   | 25 
 net/atm/br2684.c |  2 +-
 net/atm/clip.c   |  8 
 net/atm/common.c | 10 +-
 net/atm/lec.c|  4 ++--
 net/atm/mpc.c|  4 ++--
 net/atm/pppoatm.c|  2 +-
 net/atm/proc.c   |  2 +-
 net/atm/raw.c|  2 +-
 net/atm/signaling.c  |  2 +-
 net/bluetooth/af_bluetooth.c |  2 +-
 net/bluetooth/rfcomm/sock.c  |  2 +-
 net/bridge/br_netfilter_hooks.c  |  4 ++--
 net/caif/caif_socket.c   |  2 +-
 net/core/datagram.c  | 10 +-
 net/core/dev.c   | 10 +-
 net/core/fib_rules.c |  4 ++--
 net/core/neighbour.c | 22 ++---
 net/core/net-sysfs.c |  2 +-
 net/core/net_namespace.c |  4 ++--
 net/core/netpoll.c   | 10 +-
 net/core/pktgen.c| 16 +++
 net/core/rtnetlink.c |  2 +-
 net/core/skbuff.c| 38 ++--
 net/core/sock.c   

[PATCH 03/17] net: convert neigh_params.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/net/neighbour.h | 6 +++---
 net/core/neighbour.c| 8 
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index c5f6d51..83d1e9f 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -77,7 +77,7 @@ struct neigh_parms {
void*sysctl_table;
 
int dead;
-   atomic_t refcnt;
+   refcount_t refcnt;
struct rcu_head rcu_head;
 
int reachable_time;
@@ -395,12 +395,12 @@ void neigh_sysctl_unregister(struct neigh_parms *p);
 
 static inline void __neigh_parms_put(struct neigh_parms *parms)
 {
-   atomic_dec(&parms->refcnt);
+   refcount_dec(&parms->refcnt);
 }
 
 static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms)
 {
-   atomic_inc(&parms->refcnt);
+   refcount_inc(&parms->refcnt);
return parms;
 }
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index bc52190..24afede 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -674,7 +674,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms);
 
 static inline void neigh_parms_put(struct neigh_parms *parms)
 {
-   if (atomic_dec_and_test(&parms->refcnt))
+   if (refcount_dec_and_test(&parms->refcnt))
neigh_parms_destroy(parms);
 }
 
@@ -1444,7 +1444,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device 
*dev,
p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
if (p) {
p->tbl= tbl;
-   atomic_set(&p->refcnt, 1);
+   refcount_set(&p->refcnt, 1);
p->reachable_time =
neigh_rand_reach_time(NEIGH_VAR(p, 
BASE_REACHABLE_TIME));
dev_hold(dev);
@@ -1507,7 +1507,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
INIT_LIST_HEAD(&tbl->parms_list);
list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
-   atomic_set(&tbl->parms.refcnt, 1);
+   refcount_set(&tbl->parms.refcnt, 1);
tbl->parms.reachable_time =
  neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, 
BASE_REACHABLE_TIME));
 
@@ -1758,7 +1758,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, 
struct neigh_parms *parms)
 
if ((parms->dev &&
 nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
-   nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) ||
+   nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
/* approximative value for deprecated QUEUE_LEN (in packets) */
-- 
2.7.4



[PATCH 02/17] net: convert neighbour.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/net/arp.h   |  2 +-
 include/net/ndisc.h |  2 +-
 include/net/neighbour.h |  9 +
 net/atm/clip.c  |  6 +++---
 net/core/neighbour.c| 14 +++---
 net/decnet/dn_neigh.c   |  2 +-
 6 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/include/net/arp.h b/include/net/arp.h
index 65619a2..17d90e4 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -28,7 +28,7 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct 
net_device *dev, u32
 
rcu_read_lock_bh();
n = __ipv4_neigh_lookup_noref(dev, key);
-   if (n && !atomic_inc_not_zero(&n->refcnt))
+   if (n && !refcount_inc_not_zero(&n->refcnt))
n = NULL;
rcu_read_unlock_bh();
 
diff --git a/include/net/ndisc.h b/include/net/ndisc.h
index 1036c90..31b1bb1 100644
--- a/include/net/ndisc.h
+++ b/include/net/ndisc.h
@@ -384,7 +384,7 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct 
net_device *dev, cons
 
rcu_read_lock_bh();
n = __ipv6_neigh_lookup_noref(dev, pkey);
-   if (n && !atomic_inc_not_zero(&n->refcnt))
+   if (n && !refcount_inc_not_zero(&n->refcnt))
n = NULL;
rcu_read_unlock_bh();
 
diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index e4dd3a2..c5f6d51 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -17,6 +17,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -137,7 +138,7 @@ struct neighbour {
unsigned long   confirmed;
unsigned long   updated;
rwlock_tlock;
-   atomic_trefcnt;
+   refcount_t  refcnt;
struct sk_buff_head arp_queue;
unsigned intarp_queue_len_bytes;
struct timer_list   timer;
@@ -409,18 +410,18 @@ static inline struct neigh_parms 
*neigh_parms_clone(struct neigh_parms *parms)
 
 static inline void neigh_release(struct neighbour *neigh)
 {
-   if (atomic_dec_and_test(&neigh->refcnt))
+   if (refcount_dec_and_test(&neigh->refcnt))
neigh_destroy(neigh);
 }
 
 static inline struct neighbour * neigh_clone(struct neighbour *neigh)
 {
if (neigh)
-   atomic_inc(&neigh->refcnt);
+   refcount_inc(&neigh->refcnt);
return neigh;
 }
 
-#define neigh_hold(n)  atomic_inc(&(n)->refcnt)
+#define neigh_hold(n)  refcount_inc(&(n)->refcnt)
 
 static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff 
*skb)
 {
diff --git a/net/atm/clip.c b/net/atm/clip.c
index ec527b6..68435f6 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -137,11 +137,11 @@ static int neigh_check_cb(struct neighbour *n)
if (entry->vccs || time_before(jiffies, entry->expires))
return 0;
 
-   if (atomic_read(&n->refcnt) > 1) {
+   if (refcount_read(&n->refcnt) > 1) {
struct sk_buff *skb;
 
pr_debug("destruction postponed with ref %d\n",
-atomic_read(&n->refcnt));
+refcount_read(&n->refcnt));
 
while ((skb = skb_dequeue(&n->arp_queue)) != NULL)
dev_kfree_skb(skb);
@@ -767,7 +767,7 @@ static void atmarp_info(struct seq_file *seq, struct 
neighbour *n,
seq_printf(seq, "(resolving)\n");
else
seq_printf(seq, "(expired, ref %d)\n",
-  atomic_read(&entry->neigh->refcnt));
+  refcount_read(&entry->neigh->refcnt));
} else if (!svc) {
seq_printf(seq, "%d.%d.%d\n",
   clip_vcc->vcc->dev->number,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d274f81..bc52190 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -141,7 +141,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 * - it is not permanent
 */
write_lock(&n->lock);
-   if (atomic_read(&n->refcnt) == 1 &&
+   if (refcount_read(&n->refcnt) == 1 &&
!(n->nud_state & NUD_PERMANENT)) {
rcu_assign_pointer(*np,
rcu_dereference_protected(n->next,
@@ -219,7 +219,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct 
net_device *dev)
neigh_del_timer(n);
n->dead = 1;
 
-   if (atomic_read(&n->refcnt) != 1) {
+   if 

[PATCH 01/17] net: convert inet_peer.refcnt from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.
This conversion requires overall +1 on the whole
refcounting scheme.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/net/inetpeer.h |  4 ++--
 net/ipv4/inetpeer.c| 18 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
index 235c781..f2a215f 100644
--- a/include/net/inetpeer.h
+++ b/include/net/inetpeer.h
@@ -46,7 +46,7 @@ struct inet_peer {
struct rcu_head gc_rcu;
};
/*
-* Once inet_peer is queued for deletion (refcnt == -1), following field
+* Once inet_peer is queued for deletion (refcnt == 0), following field
 * is not available: rid
 * We can share memory with rcu_head to help keep inet_peer small.
 */
@@ -60,7 +60,7 @@ struct inet_peer {
 
/* following fields might be frequently dirtied */
__u32   dtime;  /* the time of last use of not 
referenced entries */
-   atomic_trefcnt;
+   refcount_t  refcnt;
 };
 
 struct inet_peer_base {
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 86fa458..c5a117c 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -115,7 +115,7 @@ static void inetpeer_gc_worker(struct work_struct *work)
 
n = list_entry(p->gc_list.next, struct inet_peer, gc_list);
 
-   if (!atomic_read(&p->refcnt)) {
+   if (refcount_read(&p->refcnt) == 1) {
list_del(&p->gc_list);
kmem_cache_free(peer_cachep, p);
}
@@ -202,10 +202,11 @@ static struct inet_peer *lookup_rcu(const struct 
inetpeer_addr *daddr,
int cmp = inetpeer_addr_cmp(daddr, &u->daddr);
if (cmp == 0) {
/* Before taking a reference, check if this entry was
-* deleted (refcnt=-1)
+* deleted (refcnt=0)
 */
-   if (!atomic_add_unless(&u->refcnt, 1, -1))
+   if (!refcount_inc_not_zero(&u->refcnt)) {
u = NULL;
+   }
return u;
}
if (cmp == -1)
@@ -382,11 +383,10 @@ static int inet_peer_gc(struct inet_peer_base *base,
while (stackptr > stack) {
stackptr--;
p = rcu_deref_locked(**stackptr, base);
-   if (atomic_read(&p->refcnt) == 0) {
+   if (refcount_read(&p->refcnt) == 1) {
smp_rmb();
delta = (__u32)jiffies - p->dtime;
-   if (delta >= ttl &&
-   atomic_cmpxchg(&p->refcnt, 0, -1) == 0) {
+   if (delta >= ttl && refcount_dec_if_one(&p->refcnt)) {
p->gc_next = gchead;
gchead = p;
}
@@ -432,7 +432,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
 relookup:
p = lookup(daddr, stack, base);
if (p != peer_avl_empty) {
-   atomic_inc(&p->refcnt);
+   refcount_inc(&p->refcnt);
write_sequnlock_bh(&base->lock);
return p;
}
@@ -444,7 +444,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
p = create ? kmem_cache_alloc(peer_cachep, GFP_ATOMIC) : NULL;
if (p) {
p->daddr = *daddr;
-   atomic_set(&p->refcnt, 1);
+   refcount_set(&p->refcnt, 2);
atomic_set(&p->rid, 0);
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
@@ -468,7 +468,7 @@ void inet_putpeer(struct inet_peer *p)
 {
p->dtime = (__u32)jiffies;
smp_mb__before_atomic();
-   atomic_dec(&p->refcnt);
+   refcount_dec(&p->refcnt);
 }
 EXPORT_SYMBOL_GPL(inet_putpeer);
 
-- 
2.7.4



[PATCH 04/17] net: convert nf_bridge_info.use from atomic_t to refcount_t

2017-06-28 Thread Elena Reshetova
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova 
Signed-off-by: Hans Liljestrand 
Signed-off-by: Kees Cook 
Signed-off-by: David Windsor 
---
 include/linux/skbuff.h   | 6 +++---
 include/net/netfilter/br_netfilter.h | 2 +-
 net/bridge/br_netfilter_hooks.c  | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a098d95..8a26c02 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -250,7 +250,7 @@ struct nf_conntrack {
 
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 struct nf_bridge_info {
-   atomic_tuse;
+   refcount_t  use;
enum {
BRNF_PROTO_UNCHANGED,
BRNF_PROTO_8021Q,
@@ -3593,13 +3593,13 @@ static inline void nf_conntrack_get(struct nf_conntrack 
*nfct)
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
 {
-   if (nf_bridge && atomic_dec_and_test(&nf_bridge->use))
+   if (nf_bridge && refcount_dec_and_test(&nf_bridge->use))
kfree(nf_bridge);
 }
 static inline void nf_bridge_get(struct nf_bridge_info *nf_bridge)
 {
if (nf_bridge)
-   atomic_inc(&nf_bridge->use);
+   refcount_inc(&nf_bridge->use);
 }
 #endif /* CONFIG_BRIDGE_NETFILTER */
 static inline void nf_reset(struct sk_buff *skb)
diff --git a/include/net/netfilter/br_netfilter.h 
b/include/net/netfilter/br_netfilter.h
index 0b0c35c..925524e 100644
--- a/include/net/netfilter/br_netfilter.h
+++ b/include/net/netfilter/br_netfilter.h
@@ -8,7 +8,7 @@ static inline struct nf_bridge_info *nf_bridge_alloc(struct 
sk_buff *skb)
skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
 
if (likely(skb->nf_bridge))
-   atomic_set(&(skb->nf_bridge->use), 1);
+   refcount_set(&(skb->nf_bridge->use), 1);
 
return skb->nf_bridge;
 }
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 067cf03..2261e51 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -149,12 +149,12 @@ static inline struct nf_bridge_info 
*nf_bridge_unshare(struct sk_buff *skb)
 {
struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 
-   if (atomic_read(&nf_bridge->use) > 1) {
+   if (refcount_read(&nf_bridge->use) > 1) {
struct nf_bridge_info *tmp = nf_bridge_alloc(skb);
 
if (tmp) {
memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
-   atomic_set(&tmp->use, 1);
+   refcount_set(&tmp->use, 1);
}
nf_bridge_put(nf_bridge);
nf_bridge = tmp;
-- 
2.7.4



Re: [PATCH net] virtio-net: unbreak csumed packet for small buffer XDP

2017-06-28 Thread Jason Wang



On 2017年06月28日 12:01, Michael S. Tsirkin wrote:

On Wed, Jun 28, 2017 at 11:40:30AM +0800, Jason Wang wrote:


On 2017年06月28日 11:31, Michael S. Tsirkin wrote:

On Wed, Jun 28, 2017 at 10:45:18AM +0800, Jason Wang wrote:

On 2017年06月28日 10:17, Michael S. Tsirkin wrote:

On Wed, Jun 28, 2017 at 10:14:34AM +0800, Jason Wang wrote:

On 2017年06月28日 10:02, Michael S. Tsirkin wrote:

On Wed, Jun 28, 2017 at 09:54:03AM +0800, Jason Wang wrote:

We should allow csumed packet for small buffer, otherwise XDP_PASS
won't work correctly.

Fixes commit bb91accf2733 ("virtio-net: XDP support for small buffers")
Signed-off-by: Jason Wang

The issue would be VIRTIO_NET_HDR_F_DATA_VALID might be set.
What do you think?

I think it's safe. For XDP_PASS, it work like in the past.

That's the part I don't get. With DATA_VALID csum in packet is wrong, XDP
tools assume its value.

DATA_VALID is CHECKSUM_UNNECESSARY on the host, and according to the comment
in skbuff.h


"
   *   The hardware you're dealing with doesn't calculate the full checksum
   *   (as in CHECKSUM_COMPLETE), but it does parse headers and verify
checksums
   *   for specific protocols. For such packets it will set
CHECKSUM_UNNECESSARY
   *   if their checksums are okay. skb->csum is still undefined in this case
   *   though. A driver or device must never modify the checksum field in the
   *   packet even if checksum is verified.
"

The csum is correct I believe?

Thanks

That's on input. But I think for tun it's output, where that is equivalent
to CHECKSUM_NONE



Yes, but the comment said:

"
CHECKSUM_NONE:
  *
  *   The skb was already checksummed by the protocol, or a checksum is not
  *   required.
  *
  * CHECKSUM_UNNECESSARY:
  *
  *   This has the same meaning as CHECKSUM_NONE for checksum offload on
  *   output.
  *
"

So still correct I think?

Thanks

Hmm maybe I mean NEEDS_CHECKSUM actually.

I'll need to re-read the spec.



Not sure this is an issue. But if it is, we can probably checksum the 
packet before passing it to XDP. But it would be a little slow.


Thanks


Re: [PATCH 2/2] ethtool: stmmac: Add DMA HW Feature Register

2017-06-28 Thread Thor Thayer

On 06/28/2017 04:03 AM, Giuseppe CAVALLARO wrote:

On 6/27/2017 11:51 PM, thor.tha...@linux.intel.com wrote:

From: Thor Thayer 

This patch adds the DMA HW Feature Register which is at the end
of the DMA registers and is documented in Version 3.70a.

Signed-off-by: Thor Thayer 
---
  stmmac.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stmmac.c b/stmmac.c
index e1bb291..7d7bebd 100644
--- a/stmmac.c
+++ b/stmmac.c
@@ -64,7 +64,7 @@ int st_gmac_dump_regs(struct ethtool_drvinfo *info, 
struct ethtool_regs *regs)

  fprintf(stdout, "\n");
  fprintf(stdout, "DMA Registers\n");
  stmmac_reg = (unsigned int *)regs->data + DMA_REG_OFFSET;
-for (i = 0; i < 22; i++)
+for (i = 0; i < 23; i++)


thx Thor for these changes, I wonder if you could add a macro instead of 23 
while doing this kind of changes


Sorry if I didn't do it in the past.

then, you can send the series with my Acked-by: Giuseppe Cavallaro 



Regards

peppe



Sure. I'll also add a macro for the # of main registers too (55). Some 
maintainers prefer the macros while others prefer the number to reduce 
the space.


Thanks for the quick review!

Thor



  fprintf(stdout, "Reg%d  0x%08X\n", i, *stmmac_reg++);
  return 0;








[RFC net-next 2/9] net: xfrm: revert to lower xfrm dst gc limit

2017-06-28 Thread Florian Westphal
revert c386578f1cdb4dac230395 ("xfrm: Let the flowcache handle its size by 
default.").

Once we remove flow cache, we don't have a flow cache limit anymore.
We must not allow (virtually) unlimited allocations of xfrm dst entries.
Revert back to the old xfrm dst gc limits.

Signed-off-by: Florian Westphal 
---
 Documentation/networking/ip-sysctl.txt | 6 ++
 net/ipv4/xfrm4_policy.c| 2 +-
 net/ipv6/xfrm6_policy.c| 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 974ab47ae53a..f485d553e65c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1291,8 +1291,7 @@ tag - INTEGER
 xfrm4_gc_thresh - INTEGER
The threshold at which we will start garbage collecting for IPv4
destination cache entries.  At twice this value the system will
-   refuse new allocations. The value must be set below the flowcache
-   limit (4096 * number of online cpus) to take effect.
+   refuse new allocations.
 
 igmp_link_local_mcast_reports - BOOLEAN
Enable IGMP reports for link local multicast groups in the
@@ -1778,8 +1777,7 @@ ratelimit - INTEGER
 xfrm6_gc_thresh - INTEGER
The threshold at which we will start garbage collecting for IPv6
destination cache entries.  At twice this value the system will
-   refuse new allocations. The value must be set below the flowcache
-   limit (4096 * number of online cpus) to take effect.
+   refuse new allocations.
 
 
 IPv6 Update by:
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 71b4ecc195c7..19455a5fc328 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -266,7 +266,7 @@ static struct dst_ops xfrm4_dst_ops_template = {
.destroy =  xfrm4_dst_destroy,
.ifdown =   xfrm4_dst_ifdown,
.local_out =__ip_local_out,
-   .gc_thresh =INT_MAX,
+   .gc_thresh =32768,
 };
 
 static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 79651bc71bf0..ae30dc4973e8 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -286,7 +286,7 @@ static struct dst_ops xfrm6_dst_ops_template = {
.destroy =  xfrm6_dst_destroy,
.ifdown =   xfrm6_dst_ifdown,
.local_out =__ip6_local_out,
-   .gc_thresh =INT_MAX,
+   .gc_thresh =32768,
 };
 
 static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
-- 
2.13.0



[RFC ipsec-next] flow cache removal

2017-06-28 Thread Florian Westphal
Here is an updated version of the flow cache removal
set.

Changes since last iteration:
 - rebase
 - split removal into multiple gradual chunks to ease review
 - add a small pcpu xdst cache to reduce alloc/free overhead
   when subsequent packet can re-use previous xdst

I did some sanity testing and ran a few netperf tests.
The most severe hit is with pure UDP_RR workload.
TCP_STREAM is ok-ish, UDP_STREAM is marginally faster with
the simpler pcpu cache (we only instantiate one xfrm_dst and then
reuse it).

We can discuss fine print and possible further work (avoid this_cpu_xchg,
xfrm_genid removal, etc) at NFWS if needed.

Thanks,
Florian

 Documentation/networking/ip-sysctl.txt |6 
 /include/net/flow.h |   34 --
 /include/net/netns/xfrm.h   |   11 
 /include/net/xfrm.h |9 
 /net/core/Makefile  |1 
 /net/ipv4/ip_vti.c  |   31 -
 /net/ipv4/xfrm4_policy.c|   11 
 /net/ipv6/ip6_vti.c |   31 -
 /net/ipv6/xfrm6_policy.c|   11 
 /net/key/af_key.c   |4 
 /net/xfrm/xfrm_device.c |3 
 /net/xfrm/xfrm_policy.c |  334 
 /net/xfrm/xfrm_user.c   |2 
 /security/selinux/include/xfrm.h|4 
 include/net/flowcache.h  |   25 -
 net/core/flow.c  |  516 ---
 16 files changed, 82 insertions(+), 951 deletions(-)



[RFC net-next 1/9] vti: revert flush x-netns xfrm cache when vti interface is removed

2017-06-28 Thread Florian Westphal
flow cache is removed in next commit.

Signed-off-by: Florian Westphal 
---
 net/ipv4/ip_vti.c  | 31 ---
 net/ipv6/ip6_vti.c | 31 ---
 2 files changed, 62 deletions(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 0192c255e508..5ed63d250950 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -584,33 +584,6 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
.get_link_net   = ip_tunnel_get_link_net,
 };
 
-static bool is_vti_tunnel(const struct net_device *dev)
-{
-   return dev->netdev_ops == &vti_netdev_ops;
-}
-
-static int vti_device_event(struct notifier_block *unused,
-   unsigned long event, void *ptr)
-{
-   struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-   struct ip_tunnel *tunnel = netdev_priv(dev);
-
-   if (!is_vti_tunnel(dev))
-   return NOTIFY_DONE;
-
-   switch (event) {
-   case NETDEV_DOWN:
-   if (!net_eq(tunnel->net, dev_net(dev)))
-   xfrm_garbage_collect(tunnel->net);
-   break;
-   }
-   return NOTIFY_DONE;
-}
-
-static struct notifier_block vti_notifier_block __read_mostly = {
-   .notifier_call = vti_device_event,
-};
-
 static int __init vti_init(void)
 {
const char *msg;
@@ -618,8 +591,6 @@ static int __init vti_init(void)
 
pr_info("IPv4 over IPsec tunneling driver\n");
 
-   register_netdevice_notifier(&vti_notifier_block);
-
msg = "tunnel device";
err = register_pernet_device(&vti_net_ops);
if (err < 0)
@@ -652,7 +623,6 @@ static int __init vti_init(void)
 xfrm_proto_esp_failed:
unregister_pernet_device(&vti_net_ops);
 pernet_dev_failed:
-   unregister_netdevice_notifier(&vti_notifier_block);
pr_err("vti init: failed to register %s\n", msg);
return err;
 }
@@ -664,7 +634,6 @@ static void __exit vti_fini(void)
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti_net_ops);
-   unregister_netdevice_notifier(&vti_notifier_block);
 }
 
 module_init(vti_init);
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 486c2305f53c..79444a4bfd6d 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1145,33 +1145,6 @@ static struct xfrm6_protocol vti_ipcomp6_protocol 
__read_mostly = {
.priority   =   100,
 };
 
-static bool is_vti6_tunnel(const struct net_device *dev)
-{
-   return dev->netdev_ops == &vti6_netdev_ops;
-}
-
-static int vti6_device_event(struct notifier_block *unused,
-unsigned long event, void *ptr)
-{
-   struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-   struct ip6_tnl *t = netdev_priv(dev);
-
-   if (!is_vti6_tunnel(dev))
-   return NOTIFY_DONE;
-
-   switch (event) {
-   case NETDEV_DOWN:
-   if (!net_eq(t->net, dev_net(dev)))
-   xfrm_garbage_collect(t->net);
-   break;
-   }
-   return NOTIFY_DONE;
-}
-
-static struct notifier_block vti6_notifier_block __read_mostly = {
-   .notifier_call = vti6_device_event,
-};
-
 /**
  * vti6_tunnel_init - register protocol and reserve needed resources
  *
@@ -1182,8 +1155,6 @@ static int __init vti6_tunnel_init(void)
const char *msg;
int err;
 
-   register_netdevice_notifier(&vti6_notifier_block);
-
msg = "tunnel device";
err = register_pernet_device(&vti6_net_ops);
if (err < 0)
@@ -1216,7 +1187,6 @@ static int __init vti6_tunnel_init(void)
 xfrm_proto_esp_failed:
unregister_pernet_device(&vti6_net_ops);
 pernet_dev_failed:
-   unregister_netdevice_notifier(&vti6_notifier_block);
pr_err("vti6 init: failed to register %s\n", msg);
return err;
 }
@@ -1231,7 +1201,6 @@ static void __exit vti6_tunnel_cleanup(void)
xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
unregister_pernet_device(&vti6_net_ops);
-   unregister_netdevice_notifier(&vti6_notifier_block);
 }
 
 module_init(vti6_tunnel_init);
-- 
2.13.0



[RFC net-next 5/9] xfrm_policy: kill flow to policy dir conversion

2017-06-28 Thread Florian Westphal
XFRM_POLICY_IN/OUT/FWD are identical to FLOW_DIR_*, so gcc already
removed this function as it just returns the argument.  Again, no
code change.

Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_policy.c | 46 --
 1 file changed, 4 insertions(+), 42 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 64cef0e601b8..626351915a97 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1191,24 +1191,6 @@ __xfrm_policy_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir
return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, 
family, dir);
 }
 
-static int flow_to_policy_dir(int dir)
-{
-   if (XFRM_POLICY_IN == FLOW_DIR_IN &&
-   XFRM_POLICY_OUT == FLOW_DIR_OUT &&
-   XFRM_POLICY_FWD == FLOW_DIR_FWD)
-   return dir;
-
-   switch (dir) {
-   default:
-   case FLOW_DIR_IN:
-   return XFRM_POLICY_IN;
-   case FLOW_DIR_OUT:
-   return XFRM_POLICY_OUT;
-   case FLOW_DIR_FWD:
-   return XFRM_POLICY_FWD;
-   }
-}
-
 static struct flow_cache_object *
 xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
   u8 dir, struct flow_cache_object *old_obj, void *ctx)
@@ -1218,7 +1200,7 @@ xfrm_policy_lookup(struct net *net, const struct flowi 
*fl, u16 family,
if (old_obj)
xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
 
-   pol = __xfrm_policy_lookup(net, fl, family, flow_to_policy_dir(dir));
+   pol = __xfrm_policy_lookup(net, fl, family, dir);
if (IS_ERR_OR_NULL(pol))
return ERR_CAST(pol);
 
@@ -1229,23 +1211,6 @@ xfrm_policy_lookup(struct net *net, const struct flowi 
*fl, u16 family,
return &pol->flo;
 }
 
-static inline int policy_to_flow_dir(int dir)
-{
-   if (XFRM_POLICY_IN == FLOW_DIR_IN &&
-   XFRM_POLICY_OUT == FLOW_DIR_OUT &&
-   XFRM_POLICY_FWD == FLOW_DIR_FWD)
-   return dir;
-   switch (dir) {
-   default:
-   case XFRM_POLICY_IN:
-   return FLOW_DIR_IN;
-   case XFRM_POLICY_OUT:
-   return FLOW_DIR_OUT;
-   case XFRM_POLICY_FWD:
-   return FLOW_DIR_FWD;
-   }
-}
-
 static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int 
dir,
 const struct flowi *fl, u16 
family)
 {
@@ -1265,7 +1230,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const 
struct sock *sk, int dir,
}
err = security_xfrm_policy_lookup(pol->security,
  fl->flowi_secid,
- policy_to_flow_dir(dir));
+ dir);
if (!err) {
if (!xfrm_pol_hold_rcu(pol))
goto again;
@@ -2067,8 +2032,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir,
/* Resolve policies to use if we couldn't get them from
 * previous cache entry */
num_pols = 1;
-   pols[0] = __xfrm_policy_lookup(net, fl, family,
-  flow_to_policy_dir(dir));
+   pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
err = xfrm_expand_policies(fl, family, pols,
   &num_pols, &num_xfrms);
if (err < 0)
@@ -2146,7 +2110,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct 
dst_entry *dst_orig,
struct xfrm_dst *xdst;
struct dst_entry *dst, *route;
u16 family = dst_orig->ops->family;
-   u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
+   u8 dir = XFRM_POLICY_OUT;
int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
 
dst = NULL;
@@ -2403,12 +2367,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, 
struct sk_buff *skb,
int pi;
int reverse;
struct flowi fl;
-   u8 fl_dir;
int xerr_idx = -1;
 
reverse = dir & ~XFRM_POLICY_MASK;
dir &= XFRM_POLICY_MASK;
-   fl_dir = policy_to_flow_dir(dir);
 
if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
-- 
2.13.0



[RFC net-next 4/9] xfrm_policy: remove always true/false branches

2017-06-28 Thread Florian Westphal
after previous change oldflo and xdst are always NULL.
These branches were already removed by gcc, this doesn't change code.

Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_policy.c | 74 ++
 1 file changed, 14 insertions(+), 60 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 084736ff2681..64cef0e601b8 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2060,48 +2060,23 @@ xfrm_bundle_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir,
 {
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct xfrm_dst *xdst, *new_xdst;
-   int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
-   struct flow_cache_object *oldflo = NULL;
+   int num_pols = 0, num_xfrms = 0, err;
 
/* Check if the policies from old bundle are usable */
xdst = NULL;
-   if (oldflo) {
-   xdst = container_of(oldflo, struct xfrm_dst, flo);
-   num_pols = xdst->num_pols;
-   num_xfrms = xdst->num_xfrms;
-   pol_dead = 0;
-   for (i = 0; i < num_pols; i++) {
-   pols[i] = xdst->pols[i];
-   pol_dead |= pols[i]->walk.dead;
-   }
-   if (pol_dead) {
-   /* Mark DST_OBSOLETE_DEAD to fail the next
-* xfrm_dst_check()
-*/
-   xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
-   dst_release_immediate(&xdst->u.dst);
-   xdst = NULL;
-   num_pols = 0;
-   num_xfrms = 0;
-   oldflo = NULL;
-   }
-   }
-
/* Resolve policies to use if we couldn't get them from
 * previous cache entry */
-   if (xdst == NULL) {
-   num_pols = 1;
-   pols[0] = __xfrm_policy_lookup(net, fl, family,
-  flow_to_policy_dir(dir));
-   err = xfrm_expand_policies(fl, family, pols,
+   num_pols = 1;
+   pols[0] = __xfrm_policy_lookup(net, fl, family,
+  flow_to_policy_dir(dir));
+   err = xfrm_expand_policies(fl, family, pols,
   &num_pols, &num_xfrms);
-   if (err < 0)
-   goto inc_error;
-   if (num_pols == 0)
-   return NULL;
-   if (num_xfrms <= 0)
-   goto make_dummy_bundle;
-   }
+   if (err < 0)
+   goto inc_error;
+   if (num_pols == 0)
+   return NULL;
+   if (num_xfrms <= 0)
+   goto make_dummy_bundle;
 
new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
  xflo->dst_orig);
@@ -2109,26 +2084,10 @@ xfrm_bundle_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir,
err = PTR_ERR(new_xdst);
if (err != -EAGAIN)
goto error;
-   if (oldflo == NULL)
-   goto make_dummy_bundle;
-   dst_hold(&xdst->u.dst);
-   return oldflo;
+   goto make_dummy_bundle;
} else if (new_xdst == NULL) {
num_xfrms = 0;
-   if (oldflo == NULL)
-   goto make_dummy_bundle;
-   xdst->num_xfrms = 0;
-   dst_hold(&xdst->u.dst);
-   return oldflo;
-   }
-
-   /* Kill the previous bundle */
-   if (xdst) {
-   /* The policies were stolen for newly generated bundle */
-   xdst->num_pols = 0;
-   /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
-   xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
-   dst_release_immediate(&xdst->u.dst);
+   goto make_dummy_bundle;
}
 
return &new_xdst->flo;
@@ -2152,12 +2111,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir,
 inc_error:
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
 error:
-   if (xdst != NULL) {
-   /* Mark DST_OBSOLETE_DEAD to fail the next xfrm_dst_check() */
-   xdst->u.dst.obsolete = DST_OBSOLETE_DEAD;
-   dst_release_immediate(&xdst->u.dst);
-   } else
-   xfrm_pols_put(pols, num_pols);
+   xfrm_pols_put(pols, num_pols);
return ERR_PTR(err);
 }
 
-- 
2.13.0



[RFC net-next 3/9] xfrm_policy: bypass flow_cache_lookup

2017-06-28 Thread Florian Westphal
Instead of consulting flow cache, call the xfrm bundle/policy lookup
functions directly.  This pretends the flow cache had no entry.

This helps to gradually remove flow cache integration,
followup commit will remove the dead code that this change adds.

Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_policy.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index a3dc7ab0b7ed..084736ff2681 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2056,13 +2056,12 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct 
net *net,
 }
 
 static struct flow_cache_object *
-xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
-  struct flow_cache_object *oldflo, void *ctx)
+xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 
dir, struct xfrm_flo *xflo)
 {
-   struct xfrm_flo *xflo = (struct xfrm_flo *)ctx;
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct xfrm_dst *xdst, *new_xdst;
int num_pols = 0, num_xfrms = 0, i, err, pol_dead;
+   struct flow_cache_object *oldflo = NULL;
 
/* Check if the policies from old bundle are usable */
xdst = NULL;
@@ -2132,8 +2131,6 @@ xfrm_bundle_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir,
dst_release_immediate(&xdst->u.dst);
}
 
-   /* We do need to return one reference for original caller */
-   dst_hold(&new_xdst->u.dst);
return &new_xdst->flo;
 
 make_dummy_bundle:
@@ -2246,8 +2243,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct 
dst_entry *dst_orig,
!net->xfrm.policy_count[XFRM_POLICY_OUT])
goto nopol;
 
-   flo = flow_cache_lookup(net, fl, family, dir,
-   xfrm_bundle_lookup, &xflo);
+   flo = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
if (flo == NULL)
goto nopol;
if (IS_ERR(flo)) {
@@ -2493,8 +2489,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct 
sk_buff *skb,
if (!pol) {
struct flow_cache_object *flo;
 
-   flo = flow_cache_lookup(net, &fl, family, fl_dir,
-   xfrm_policy_lookup, NULL);
+   flo = xfrm_policy_lookup(net, &fl, family, dir, NULL, NULL);
+
if (IS_ERR_OR_NULL(flo))
pol = ERR_CAST(flo);
else
-- 
2.13.0



[RFC net-next 6/9] xfrm_policy: remove xfrm_policy_lookup

2017-06-28 Thread Florian Westphal
This removes the wrapper and renames the __xfrm_policy_lookup variant
to get rid of another place that used flow cache objects.

Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_policy.c | 36 
 1 file changed, 4 insertions(+), 32 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 626351915a97..86907731f161 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1179,7 +1179,7 @@ static struct xfrm_policy 
*xfrm_policy_lookup_bytype(struct net *net, u8 type,
 }
 
 static struct xfrm_policy *
-__xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 
dir)
+xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
 {
 #ifdef CONFIG_XFRM_SUB_POLICY
struct xfrm_policy *pol;
@@ -1191,26 +1191,6 @@ __xfrm_policy_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir
return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, 
family, dir);
 }
 
-static struct flow_cache_object *
-xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family,
-  u8 dir, struct flow_cache_object *old_obj, void *ctx)
-{
-   struct xfrm_policy *pol;
-
-   if (old_obj)
-   xfrm_pol_put(container_of(old_obj, struct xfrm_policy, flo));
-
-   pol = __xfrm_policy_lookup(net, fl, family, dir);
-   if (IS_ERR_OR_NULL(pol))
-   return ERR_CAST(pol);
-
-   /* Resolver returns two references:
-* one for cache and one for caller of flow_cache_lookup() */
-   xfrm_pol_hold(pol);
-
-   return &pol->flo;
-}
-
 static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int 
dir,
 const struct flowi *fl, u16 
family)
 {
@@ -2032,7 +2012,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir,
/* Resolve policies to use if we couldn't get them from
 * previous cache entry */
num_pols = 1;
-   pols[0] = __xfrm_policy_lookup(net, fl, family, dir);
+   pols[0] = xfrm_policy_lookup(net, fl, family, dir);
err = xfrm_expand_policies(fl, family, pols,
   &num_pols, &num_xfrms);
if (err < 0)
@@ -2402,16 +2382,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct 
sk_buff *skb,
}
}
 
-   if (!pol) {
-   struct flow_cache_object *flo;
-
-   flo = xfrm_policy_lookup(net, &fl, family, dir, NULL, NULL);
-
-   if (IS_ERR_OR_NULL(flo))
-   pol = ERR_CAST(flo);
-   else
-   pol = container_of(flo, struct xfrm_policy, flo);
-   }
+   if (!pol)
+   pol = xfrm_policy_lookup(net, &fl, family, dir);
 
if (IS_ERR(pol)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
-- 
2.13.0



[RFC net-next 7/9] xfrm_policy: make xfrm_bundle_lookup return xfrm dst object

2017-06-28 Thread Florian Westphal
This allows to remove flow cache object embedded in struct xfrm_dst.

Signed-off-by: Florian Westphal 
---
 net/xfrm/xfrm_policy.c | 28 
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 86907731f161..5bb049d8e8d5 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2000,15 +2000,13 @@ static struct xfrm_dst *xfrm_create_dummy_bundle(struct 
net *net,
goto out;
 }
 
-static struct flow_cache_object *
+static struct xfrm_dst *
 xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 
dir, struct xfrm_flo *xflo)
 {
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
-   struct xfrm_dst *xdst, *new_xdst;
int num_pols = 0, num_xfrms = 0, err;
+   struct xfrm_dst *xdst;
 
-   /* Check if the policies from old bundle are usable */
-   xdst = NULL;
/* Resolve policies to use if we couldn't get them from
 * previous cache entry */
num_pols = 1;
@@ -2022,19 +2020,19 @@ xfrm_bundle_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir,
if (num_xfrms <= 0)
goto make_dummy_bundle;
 
-   new_xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
+   xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
  xflo->dst_orig);
-   if (IS_ERR(new_xdst)) {
-   err = PTR_ERR(new_xdst);
+   if (IS_ERR(xdst)) {
+   err = PTR_ERR(xdst);
if (err != -EAGAIN)
goto error;
goto make_dummy_bundle;
-   } else if (new_xdst == NULL) {
+   } else if (xdst == NULL) {
num_xfrms = 0;
goto make_dummy_bundle;
}
 
-   return &new_xdst->flo;
+   return xdst;
 
 make_dummy_bundle:
/* We found policies, but there's no bundles to instantiate:
@@ -2050,7 +2048,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi 
*fl, u16 family, u8 dir,
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
 
dst_hold(&xdst->u.dst);
-   return &xdst->flo;
+   return xdst;
 
 inc_error:
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
@@ -2086,7 +2084,6 @@ struct dst_entry *xfrm_lookup(struct net *net, struct 
dst_entry *dst_orig,
  const struct sock *sk, int flags)
 {
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
-   struct flow_cache_object *flo;
struct xfrm_dst *xdst;
struct dst_entry *dst, *route;
u16 family = dst_orig->ops->family;
@@ -2141,14 +2138,13 @@ struct dst_entry *xfrm_lookup(struct net *net, struct 
dst_entry *dst_orig,
!net->xfrm.policy_count[XFRM_POLICY_OUT])
goto nopol;
 
-   flo = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
-   if (flo == NULL)
+   xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
+   if (xdst == NULL)
goto nopol;
-   if (IS_ERR(flo)) {
-   err = PTR_ERR(flo);
+   if (IS_ERR(xdst)) {
+   err = PTR_ERR(xdst);
goto dropdst;
}
-   xdst = container_of(flo, struct xfrm_dst, flo);
 
num_pols = xdst->num_pols;
num_xfrms = xdst->num_xfrms;
-- 
2.13.0



[RFC net-next 9/9] xfrm: add a small xdst pcpu cache

2017-06-28 Thread Florian Westphal
retain last used xfrm_dst in a pcpu cache.
On next request, reuse this dst if the policies are the same.

The cache doesn't help at all with strictly-RR workloads as
we never have a hit.

Also, the cache adds cost of this_cpu_xchg() in packet path.
It would be better to use plain this_cpu_read/write, however,
a netdev notifier can run in parallel on other cpu and write same
pcpu value so the xchg is needed to avoid race.

The notifier is needed so we do not add long hangs when a device
is dismantled but some pcpu xdst still holds a reference.

Test results using 4 network namespaces and null encryption:

ns1   ns2  -> ns3   -> ns4
netperf -> xfrm/null enc   -> xfrm/null dec -> netserver

whatTCP_STREAM  UDP_STREAM  UDP_RR
Flow cache: 14804.4 279.738 326213.0
No flow cache:  14158.3 257.458 228486.8
Pcpu cache: 14766.4 286.958 239433.5

UDP tests used 64byte packets, tests ran for one minute each,
value is average over ten iterations.

'Flow cache' is 'net-next', 'No flow cache' is net-next plus this
series but without this one.

Signed-off-by: Florian Westphal 
---
 include/net/xfrm.h |  1 +
 net/xfrm/xfrm_device.c |  1 +
 net/xfrm/xfrm_policy.c | 44 
 3 files changed, 46 insertions(+)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 9b85367529a4..8bde1d569790 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -316,6 +316,7 @@ int xfrm_policy_register_afinfo(const struct 
xfrm_policy_afinfo *afinfo, int fam
 void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo);
 void km_policy_notify(struct xfrm_policy *xp, int dir,
  const struct km_event *c);
+void xfrm_policy_dev_unreg(void);
 void km_state_notify(struct xfrm_state *x, const struct km_event *c);
 
 struct xfrm_tmpl;
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index d01cb256e89c..8221d05d43d1 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -151,6 +151,7 @@ static int xfrm_dev_register(struct net_device *dev)
 
 static int xfrm_dev_unregister(struct net_device *dev)
 {
+   xfrm_policy_dev_unreg();
return NOTIFY_DONE;
 }
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f4419d1b9f38..ac83b39850ce 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -44,6 +44,7 @@ struct xfrm_flo {
u8 flags;
 };
 
+static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
 static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
 static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
__read_mostly;
@@ -1700,6 +1701,34 @@ static int xfrm_expand_policies(const struct flowi *fl, 
u16 family,
 
 }
 
+void xfrm_policy_dev_unreg(void)
+{
+   int cpu;
+
+   local_bh_disable();
+   rcu_read_lock();
+   for_each_possible_cpu(cpu) {
+   struct xfrm_dst *tmp, *old;
+
+   old = per_cpu(xfrm_last_dst, cpu);
+   if (!old || xfrm_bundle_ok(old))
+   continue;
+
+   tmp = cmpxchg(&(per_cpu(xfrm_last_dst, cpu)), old, NULL);
+   if (tmp == old)
+   dst_release(&old->u.dst);
+   }
+   rcu_read_unlock();
+   local_bh_enable();
+}
+
+static void xfrm_last_dst_update(struct xfrm_dst *xdst)
+{
+   struct xfrm_dst *old = this_cpu_xchg(xfrm_last_dst, xdst);
+   if (old)
+   dst_release(&old->u.dst);
+}
+
 static struct xfrm_dst *
 xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
   const struct flowi *fl, u16 family,
@@ -1711,17 +1740,29 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy 
**pols, int num_pols,
struct xfrm_dst *xdst;
int err;
 
+   xdst = this_cpu_read(xfrm_last_dst);
+   if (xdst &&
+   xdst->u.dst.dev == dst_orig->dev &&
+   xdst->num_pols == num_pols &&
+   memcmp(xdst->pols, pols,
+  sizeof(struct xfrm_policy *) * num_pols) == 0 &&
+   xfrm_bundle_ok(xdst) &&
+   dst_hold_safe(&xdst->u.dst))
+   return xdst;
+
/* Try to instantiate a bundle */
err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
if (err <= 0) {
if (err != 0 && err != -EAGAIN)
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
+   xfrm_last_dst_update(NULL);
return ERR_PTR(err);
}
 
dst = xfrm_bundle_create(pols[0], xfrm, err, fl, dst_orig);
if (IS_ERR(dst)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
+   xfrm_last_dst_update(NULL);
return ERR_CAST(dst);
}
 
@@ -1731,6 +1772,9 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, 
int num_pols,
 

[RFC net-next 8/9] xfrm: remove flow cache

2017-06-28 Thread Florian Westphal
After rcu conversions performance degradation in forward tests isn't that
noticeable anymore.

See next patch for some numbers.

Signed-off-by: Florian Westphal 
---
 include/net/flow.h  |  34 ---
 include/net/flowcache.h |  25 --
 include/net/netns/xfrm.h|  11 -
 include/net/xfrm.h  |   8 -
 net/core/Makefile   |   1 -
 net/core/flow.c | 516 
 net/ipv4/xfrm4_policy.c |   9 -
 net/ipv6/xfrm6_policy.c |   9 -
 net/key/af_key.c|   4 -
 net/xfrm/xfrm_device.c  |   2 -
 net/xfrm/xfrm_policy.c  | 108 -
 net/xfrm/xfrm_user.c|   2 -
 security/selinux/include/xfrm.h |   4 +-
 13 files changed, 1 insertion(+), 732 deletions(-)
 delete mode 100644 include/net/flowcache.h
 delete mode 100644 net/core/flow.c

diff --git a/include/net/flow.h b/include/net/flow.h
index bae198b3039e..f3dc61b29bb5 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -218,40 +218,6 @@ static inline unsigned int flow_key_size(u16 family)
return 0;
 }
 
-#define FLOW_DIR_IN0
-#define FLOW_DIR_OUT   1
-#define FLOW_DIR_FWD   2
-
-struct net;
-struct sock;
-struct flow_cache_ops;
-
-struct flow_cache_object {
-   const struct flow_cache_ops *ops;
-};
-
-struct flow_cache_ops {
-   struct flow_cache_object *(*get)(struct flow_cache_object *);
-   int (*check)(struct flow_cache_object *);
-   void (*delete)(struct flow_cache_object *);
-};
-
-typedef struct flow_cache_object *(*flow_resolve_t)(
-   struct net *net, const struct flowi *key, u16 family,
-   u8 dir, struct flow_cache_object *oldobj, void *ctx);
-
-struct flow_cache_object *flow_cache_lookup(struct net *net,
-   const struct flowi *key, u16 family,
-   u8 dir, flow_resolve_t resolver,
-   void *ctx);
-int flow_cache_init(struct net *net);
-void flow_cache_fini(struct net *net);
-void flow_cache_hp_init(void);
-
-void flow_cache_flush(struct net *net);
-void flow_cache_flush_deferred(struct net *net);
-extern atomic_t flow_cache_genid;
-
 __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys);
 
 static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6)
diff --git a/include/net/flowcache.h b/include/net/flowcache.h
deleted file mode 100644
index 51eb971e8973..
--- a/include/net/flowcache.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _NET_FLOWCACHE_H
-#define _NET_FLOWCACHE_H
-
-#include 
-#include 
-#include 
-#include 
-
-struct flow_cache_percpu {
-   struct hlist_head   *hash_table;
-   unsigned inthash_count;
-   u32 hash_rnd;
-   int hash_rnd_recalc;
-   struct tasklet_struct   flush_tasklet;
-};
-
-struct flow_cache {
-   u32 hash_shift;
-   struct flow_cache_percpu __percpu *percpu;
-   struct hlist_node   node;
-   unsigned intlow_watermark;
-   unsigned inthigh_watermark;
-   struct timer_list   rnd_timer;
-};
-#endif /* _NET_FLOWCACHE_H */
diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
index 27bb9633c69d..611521646dd4 100644
--- a/include/net/netns/xfrm.h
+++ b/include/net/netns/xfrm.h
@@ -6,7 +6,6 @@
 #include 
 #include 
 #include 
-#include 
 
 struct ctl_table_header;
 
@@ -73,16 +72,6 @@ struct netns_xfrm {
spinlock_t xfrm_state_lock;
spinlock_t xfrm_policy_lock;
struct mutex xfrm_cfg_mutex;
-
-   /* flow cache part */
-   struct flow_cache   flow_cache_global;
-   atomic_tflow_cache_genid;
-   struct list_headflow_cache_gc_list;
-   atomic_tflow_cache_gc_count;
-   spinlock_t  flow_cache_gc_lock;
-   struct work_struct  flow_cache_gc_work;
-   struct work_struct  flow_cache_flush_work;
-   struct mutexflow_flush_sem;
 };
 
 #endif
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 01f5bc144ee5..9b85367529a4 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -562,7 +562,6 @@ struct xfrm_policy {
atomic_trefcnt;
struct timer_list   timer;
 
-   struct flow_cache_object flo;
atomic_tgenid;
u32 priority;
u32 index;
@@ -977,7 +976,6 @@ struct xfrm_dst {
struct rt6_info rt6;
} u;
struct dst_entry *route;
-   struct flow_cache_object flo;
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
int num_pols, num_xfrms;
u32 xfrm_genid;
@@ -1225,9 +1223,6 @@ static inline void xfrm_sk_free_policy(struct sock *sk)

[net-next:master 66/373] warning: __mcount_loc already exists: drivers/net/wireless/intel/iwlwifi/dvm/rx.o

2017-06-28 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   2ee87db3a287f81bf1bbc10ef52e7cb6d034ef92
commit: 31efcc250a1dea96edca6595a9639d898cf99ae5 [66/373] net/sched: properly 
assign RCU pointer in tcf_chain_tp_insert/remove
config: i386-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
git checkout 31efcc250a1dea96edca6595a9639d898cf99ae5
# save the attached .config to linux build tree
make ARCH=i386 

All warnings (new ones prefixed by >>):

>> warning: __mcount_loc already exists: 
>> drivers/net/wireless/intel/iwlwifi/dvm/rx.o
--
>> warning: __mcount_loc already exists: 
>> drivers/net/wireless/intel/iwlwifi/dvm/rxon.o

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH NET V7 1/2] net: phy: Add phy loopback support in net phy framework

2017-06-28 Thread Andrew Lunn
On Wed, Jun 28, 2017 at 05:13:10PM +0800, Lin Yun Sheng wrote:
> This patch add set_loopback in phy_driver, which is used by MAC
> driver to enable or disable phy loopback. it also add a generic
> genphy_loopback function, which use BMCR loopback bit to enable
> or disable loopback.
> 
> Signed-off-by: Lin Yun Sheng 

Hi Lin

It is normal to include my

Reviewed-by: Andrew Lunn 

when resubmitting a patch. The only time you drop such tags is when
you make a big change.

Andrew


Re: [PATCH net-next 1/3] net: ethtool: add support for forward error correction modes

2017-06-28 Thread Andrew Lunn
On Tue, Jun 27, 2017 at 03:22:39AM -0700, Jakub Kicinski wrote:
> On Sat, 24 Jun 2017 12:19:43 -0700, Roopa Prabhu wrote:
> > Encoding: Types of encoding
> > Off:  Turning off any encoding
> > RS :  enforcing RS-FEC encoding on supported speeds
> > BaseR  :  enforcing Base R encoding on supported speeds
> > Auto   :  IEEE defaults for the speed/medium combination
> 
> Just to be sure - does auto mean autonegotiate as defined by IEEE or
> some presets?

I don't know this field very well. Is this confusion likely to happen
a lot? Is there a better name for Auto which is less likely to be
confused?

Andrew


Re: [PATCH v3 net-next 01/12] selftests/bpf: add test for mixed signed and unsigned bounds checks

2017-06-28 Thread Daniel Borkmann

On 06/27/2017 02:56 PM, Edward Cree wrote:

Currently fails due to bug in verifier bounds handling.

Signed-off-by: Edward Cree 


Acked-by: Daniel Borkmann 


Re: [PATCH v3 net-next 00/12] bpf: rewrite value tracking in verifier

2017-06-28 Thread Daniel Borkmann

Hi Edward,

On 06/27/2017 02:53 PM, Edward Cree wrote:

This series simplifies alignment tracking, generalises bounds tracking and
  fixes some bounds-tracking bugs in the BPF verifier.  Pointer arithmetic on
  packet pointers, stack pointers, map value pointers and context pointers has
  been unified, and bounds on these pointers are only checked when the pointer
  is dereferenced.
Operations on pointers which destroy all relation to the original pointer
  (such as multiplies and shifts) are disallowed if !env->allow_ptr_leaks,
  otherwise they convert the pointer to an unknown scalar and feed it to the
  normal scalar arithmetic handling.
Pointer types have been unified with the corresponding adjusted-pointer types
  where those existed (e.g. PTR_TO_MAP_VALUE[_ADJ] or FRAME_PTR vs
  PTR_TO_STACK); similarly, CONST_IMM and UNKNOWN_VALUE have been unified into
  SCALAR_VALUE.
Pointer types (except CONST_PTR_TO_MAP, PTR_TO_MAP_VALUE_OR_NULL and
  PTR_TO_PACKET_END, which do not allow arithmetic) have a 'fixed offset' and
  a 'variable offset'; the former is used when e.g. adding an immediate or a
  known-constant register, as long as it does not overflow.  Otherwise the
  latter is used, and any operation creating a new variable offset creates a
  new 'id' (and, for PTR_TO_PACKET, clears the 'range').
SCALAR_VALUEs use the 'variable offset' fields to track the range of possible
  values; the 'fixed offset' should never be set on a scalar.

As of patch 12/12, all tests of tools/testing/selftests/bpf/test_verifier
  and tools/testing/selftests/bpf/test_align pass.

v3: added a few more tests; removed RFC tags.


Did you also have a chance in the meantime to look at reducing complexity
along with your unification? I did run the cilium test suite with your
latest set from here and current # worst case processed insns that
verifier has to go through for cilium progs increases from ~53k we have
right now to ~76k. I'm a bit worried that this quickly gets us close to
the upper ~98k max limit starting to reject programs again. Alternative
is to bump the complexity limit again in near future once run into it,
but preferably there's a way to optimize it along with the rewrite? Do
you see any possibilities worth exploring?


v2: fixed nfp build, made test_align pass again and extended it with a few
  new tests (though still need to add more).

Edward Cree (12):
   selftests/bpf: add test for mixed signed and unsigned bounds checks
   bpf/verifier: rework value tracking
   nfp: change bpf verifier hooks to match new verifier data structures
   bpf/verifier: track signed and unsigned min/max values
   bpf/verifier: more concise register state logs for constant var_off
   selftests/bpf: change test_verifier expectations
   selftests/bpf: rewrite test_align
   selftests/bpf: add a test to test_align
   selftests/bpf: add test for bogus operations on pointers
   selftests/bpf: don't try to access past MAX_PACKET_OFF in
 test_verifier
   selftests/bpf: add tests for subtraction & negative numbers
   selftests/bpf: variable offset negative tests

  drivers/net/ethernet/netronome/nfp/bpf/verifier.c |   24 +-
  include/linux/bpf.h   |   34 +-
  include/linux/bpf_verifier.h  |   56 +-
  include/linux/tnum.h  |   81 +
  kernel/bpf/Makefile   |2 +-
  kernel/bpf/tnum.c |  180 ++
  kernel/bpf/verifier.c | 1943 -
  tools/testing/selftests/bpf/test_align.c  |  462 -
  tools/testing/selftests/bpf/test_verifier.c   |  293 ++--
  9 files changed, 2034 insertions(+), 1041 deletions(-)
  create mode 100644 include/linux/tnum.h
  create mode 100644 kernel/bpf/tnum.c



Thanks,
Daniel


Re: [PATCH v3 net-next 00/12] bpf: rewrite value tracking in verifier

2017-06-28 Thread Edward Cree
On 28/06/17 14:50, Daniel Borkmann wrote:
> Hi Edward,
>
> Did you also have a chance in the meantime to look at reducing complexity
> along with your unification? I did run the cilium test suite with your
> latest set from here and current # worst case processed insns that
> verifier has to go through for cilium progs increases from ~53k we have
> right now to ~76k. I'm a bit worried that this quickly gets us close to
> the upper ~98k max limit starting to reject programs again. Alternative
> is to bump the complexity limit again in near future once run into it,
> but preferably there's a way to optimize it along with the rewrite? Do
> you see any possibilities worth exploring? 
The trouble, I think, is that as we're now tracking more information about
 each register value, we're less able to prune branches.  But often that
 information is not actually being used in reaching the exit state.  So it
 seems like the way to tackle this would be to track what information is
 used — or at least, which registers are read from (including e.g. writing
 through them or passing them to helper calls) — in reaching a safe state.
 Then only registers which are used are required to match for pruning.
But that tracking would presumably have to propagate backwards through the
 verifier stack, and I'm not sure how easily that could be done.  Someone
 (was it you?) was talking about replacing the current DAG walking and
 pruning with some kind of basic-block thing, which would help with this.
Summary: I think it could be done, but I haven't looked into the details
 of implementation yet; if it's not actually breaking your programs (yet),
 maybe leave it for a followup patch series?

-Ed


Re: [net-next v2 6/6] ixgbe: Add malicious driver detection support

2017-06-28 Thread Or Gerlitz
On Wed, Jun 28, 2017 at 1:14 AM, Tantilov, Emil S
 wrote:

> Mainly because I am not sure that other (non-Intel) drivers will benefit from
> such an option. In normal operation this functionality should not cause issues
> and if it doesn't we may be able to deprecate the private flag in the future.

If you think this functionality makes sense, any driver running over HW
implementing
it would like to be able to expose that, and hence you had better not use a
private flag.

Are we sure the trust UAPI can't be extended for that matter?


Re: [PATCH net-next] cxgb4: Add PTP Hardware Clock (PHC) support

2017-06-28 Thread Richard Cochran
On Wed, Jun 28, 2017 at 01:29:03PM +0530, Atul Gupta wrote:
> Add PTP IEEE-1588 support and make it accessible via PHC subsystem.
> The functionality is enabled for T5/T6 adapters. Driver interfaces with
> Firmware to program and adjust the clock offset.
> 
> Signed-off-by: Atul Gupta 
> Signed-off-by: Ganesh Goudar 

Please put the PTP maintainer onto CC for PTP patches.

>  drivers/net/ethernet/chelsio/cxgb4/Makefile|   2 +-
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |   9 +
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c |  25 +-
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c|  87 +++-
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.c | 476 
> +
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_ptp.h |  74 
>  drivers/net/ethernet/chelsio/cxgb4/sge.c   | 166 ++-
>  drivers/net/ethernet/chelsio/cxgb4/t4_msg.h|  28 ++
>  drivers/net/ethernet/chelsio/cxgb4/t4_regs.h   |   2 +
>  drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h  |  50 +++
>  10 files changed, 906 insertions(+), 13 deletions(-)

This should be broken down into a series of at least 2 patches:

1. Transmit hardware SO_TIMESTAMPING

2. PHC support

Thanks,
Richard


Re: [PATCH] net: ethernet: stmmac: properly set PS bit in MII configurations during reset

2017-06-28 Thread Giuseppe CAVALLARO

Hello Thomas

I do not want to change a critical reset function shared among different 
platforms where
this problem has never been encountered, but you are right that we have to 
find a way to proceed in order
to finalize your work. Let me elaborate on your initial patch and try to 
give you a proposal ASAP.
In my mind, we should have a dedicated spear_dma_reset for your case 
that should be used on

SPEAr platform driver (or by using st,spear600-gmac compatibility).
Also your patch did not consider the RMII and (R)GMII cases.

Regards
Peppe

On 6/25/2017 2:32 PM, Thomas Petazzoni wrote:

Hello Giuseppe,

On Mon, 15 May 2017 16:27:34 +0200, Thomas Petazzoni wrote:


On Wed, 10 May 2017 09:18:17 +0200, Thomas Petazzoni wrote:


On Wed, 10 May 2017 09:03:12 +0200, Giuseppe CAVALLARO wrote:
   

Please, read again my patch and the description of the problem that I
have sent. But basically, any solution that does not allow to set the
PS bit between asserting the DMA reset bit and polling for it to clear
will not work for MII PHYs.

yes your point was clear to me, I was just wondering if we could find an
easier way
to solve it w/o changing the API, adding  the set_ps and propagating the
"interface"
inside the DMA reset.

Maybe this could be fixed in the glue-logic in some way. Let me know
what do you think.

Well, it's more up to you to tell me how you would like this be solved.
We figured out what the problem was, but I don't know well enough the
architecture of the driver to decide how the solution to this problem
should be designed. I made an initial simple proposal to show what is
needed, but I'm definitely open to suggestions.

Do you have any suggestion on how to move forward with this?

Another kind ping on this topic. I really would like to have the
SPEAr600 network support work out of the box in mainline, which
currently isn't the case with an MII PHY.

I posted a patch that fixes the problem, see
https://patchwork.ozlabs.org/patch/755926/, but the feedback I got so
far does not give any direction on how to rework the patch to make it
acceptable. Would it be possible to get some more feedback?





[PATCHv2 2/3] ethtool: stmmac: Add macros for number of registers

2017-06-28 Thread thor . thayer
From: Thor Thayer 

This patch adds macros for the number of registers to
loop through to make the code easier to read.

Signed-off-by: Thor Thayer 
---
v2  New commit. Add macros for number of registers.
---
 stmmac.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/stmmac.c b/stmmac.c
index e1bb291..ab83779 100644
--- a/stmmac.c
+++ b/stmmac.c
@@ -14,6 +14,10 @@
 #include 
 #include "internal.h"
 
+#define MAC100_DMA_REG_NUM 9
+#define GMAC_REG_NUM   55
+#define GMAC_DMA_REG_NUM   22
+
 /* The DMA Registers start at offset 0x1000 in the DW IP */
 #define DMA_REG_OFFSET (0x1000 / 4)
 
@@ -40,7 +44,7 @@ int st_mac100_dump_regs(struct ethtool_drvinfo *info,
fprintf(stdout, "\n");
fprintf(stdout, "DMA Registers\n");
stmmac_reg = (unsigned int *)regs->data + DMA_REG_OFFSET;
-   for (i = 0; i < 9; i++)
+   for (i = 0; i < MAC100_DMA_REG_NUM; i++)
fprintf(stdout, "CSR%d  0x%08X\n", i, *stmmac_reg++);
 
fprintf(stdout, "DMA cur tx buf addr 0x%08X\n", *stmmac_reg++);
@@ -58,13 +62,13 @@ int st_gmac_dump_regs(struct ethtool_drvinfo *info, struct 
ethtool_regs *regs)
 
fprintf(stdout, "ST GMAC Registers\n");
fprintf(stdout, "GMAC Registers\n");
-   for (i = 0; i < 55; i++)
+   for (i = 0; i < GMAC_REG_NUM; i++)
fprintf(stdout, "Reg%d  0x%08X\n", i, *stmmac_reg++);
 
fprintf(stdout, "\n");
fprintf(stdout, "DMA Registers\n");
stmmac_reg = (unsigned int *)regs->data + DMA_REG_OFFSET;
-   for (i = 0; i < 22; i++)
+   for (i = 0; i < GMAC_DMA_REG_NUM; i++)
fprintf(stdout, "Reg%d  0x%08X\n", i, *stmmac_reg++);
 
return 0;
-- 
2.7.4



[PATCHv2 1/3] ethtool: stmmac: Fix Designware ethtool register dump

2017-06-28 Thread thor . thayer
From: Thor Thayer 

The commit fbf68229ffe7 ("net: stmmac: unify registers dumps methods")

in the Linux kernel modified the register dump to store the DMA registers
at the DMA register offset (0x1000) but ethtool (stmmac.c) looks for the
DMA registers after the MAC registers which is offset 12.
This patch adds the DMA register offset so that indexing is correct.

Signed-off-by: Thor Thayer 
Acked-by: Giuseppe Cavallaro 
---
v2  Modify the commit message to specify commit from Linux kernel.
Add Acked-by.
---
 stmmac.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/stmmac.c b/stmmac.c
index fb69bfe..e1bb291 100644
--- a/stmmac.c
+++ b/stmmac.c
@@ -14,6 +14,9 @@
 #include 
 #include "internal.h"
 
+/* The DMA Registers start at offset 0x1000 in the DW IP */
+#define DMA_REG_OFFSET (0x1000 / 4)
+
 int st_mac100_dump_regs(struct ethtool_drvinfo *info,
struct ethtool_regs *regs)
 {
@@ -36,6 +39,7 @@ int st_mac100_dump_regs(struct ethtool_drvinfo *info,
 
fprintf(stdout, "\n");
fprintf(stdout, "DMA Registers\n");
+   stmmac_reg = (unsigned int *)regs->data + DMA_REG_OFFSET;
for (i = 0; i < 9; i++)
fprintf(stdout, "CSR%d  0x%08X\n", i, *stmmac_reg++);
 
@@ -59,6 +63,7 @@ int st_gmac_dump_regs(struct ethtool_drvinfo *info, struct 
ethtool_regs *regs)
 
fprintf(stdout, "\n");
fprintf(stdout, "DMA Registers\n");
+   stmmac_reg = (unsigned int *)regs->data + DMA_REG_OFFSET;
for (i = 0; i < 22; i++)
fprintf(stdout, "Reg%d  0x%08X\n", i, *stmmac_reg++);
 
-- 
2.7.4



[PATCHv2 0/3] ethtool: stmmac: Fix DMA register dump

2017-06-28 Thread thor . thayer
From: Thor Thayer 

1. The DMA register dump structure changed which requires this
change to the indexing of the DMA registers.
2. Also dump the DMA HW Feature Register.
3. V2 also adds macros for the number of registers.

Thor Thayer (3):
  ethtool: stmmac: Fix Designware ethtool register dump
  ethtool: stmmac: Add macros for number of registers
  ethtool: stmmac: Add DMA HW Feature Register

 stmmac.c | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

-- 
2.7.4



[PATCHv2 3/3] ethtool: stmmac: Add DMA HW Feature Register

2017-06-28 Thread thor . thayer
From: Thor Thayer 

This patch adds the DMA HW Feature Register which is at the end
of the DMA registers and is documented in Version 3.70a.

Signed-off-by: Thor Thayer 
Acked-by: Giuseppe Cavallaro 
---
v2  Modify for MACRO changes and add Acked-by
---
 stmmac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stmmac.c b/stmmac.c
index ab83779..e5d8c7b 100644
--- a/stmmac.c
+++ b/stmmac.c
@@ -16,7 +16,7 @@
 
 #define MAC100_DMA_REG_NUM 9
 #define GMAC_REG_NUM   55
-#define GMAC_DMA_REG_NUM   22
+#define GMAC_DMA_REG_NUM   23
 
 /* The DMA Registers start at offset 0x1000 in the DW IP */
 #define DMA_REG_OFFSET (0x1000 / 4)
-- 
2.7.4



Re: [PATCH v3 net-next 02/12] bpf/verifier: rework value tracking

2017-06-28 Thread Daniel Borkmann

On 06/27/2017 02:56 PM, Edward Cree wrote:

Tracks value alignment by means of tracking known & unknown bits.
Tightens some min/max value checks and fixes a couple of bugs therein.


You mean the one in relation to patch 1/12? Would be good to elaborate
here since otherwise this gets forgotten few weeks later.

Could you also document all the changes that verifier will then start
allowing for after the patch?


If pointer leaks are allowed, and adjust_ptr_min_max_vals returns -EACCES,
  treat the pointer as an unknown scalar and try again, because we might be
  able to conclude something about the result (e.g. pointer & 0x40 is either
  0 or 0x40).

Signed-off-by: Edward Cree 

[...]

  /* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -899,52 +965,79 @@ static int check_mem_access(struct bpf_verifier_env *env, 
int insn_idx, u32 regn
struct bpf_reg_state *reg = &state->regs[regno];
int size, err = 0;

-   if (reg->type == PTR_TO_STACK)
-   off += reg->imm;
-
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;


[...]

-   if (reg->type == PTR_TO_MAP_VALUE ||
-   reg->type == PTR_TO_MAP_VALUE_ADJ) {
+   /* for access checks, reg->off is just part of off */
+   off += reg->off;


Could you elaborate on why removing the reg->type == PTR_TO_STACK?
Also in context of below PTR_TO_CTX.

[...]

} else if (reg->type == PTR_TO_CTX) {
-   enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+   enum bpf_reg_type reg_type = SCALAR_VALUE;

if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
+   /* ctx accesses must be at a fixed offset, so that we can
+* determine what type of data were returned.
+*/
+   if (!tnum_is_const(reg->var_off)) {
+   char tn_buf[48];
+
+   tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+   verbose("variable ctx access var_off=%s off=%d size=%d",
+   tn_buf, off, size);
+   return -EACCES;
+   }
+   off += reg->var_off.value;


... f.e. in PTR_TO_CTX case the only access that is currently
allowed is LDX/STX with fixed offset from insn->off, which is
passed as off param to check_mem_access(). Can you elaborate on
off += reg->var_off.value? Meaning we make this more dynamic
as long as access is known const?


err = check_ctx_access(env, insn_idx, off, size, t, ®_type);
if (!err && t == BPF_READ && value_regno >= 0) {
-   mark_reg_unknown_value_and_range(state->regs,
-value_regno);
-   /* note that reg.[id|off|range] == 0 */
+   /* ctx access returns either a scalar, or a
+* PTR_TO_PACKET[_END].  In the latter case, we know
+* the offset is zero.
+*/
+   if (reg_type == SCALAR_VALUE)
+   mark_reg_unknown(state->regs, value_regno);
+   else
+   mark_reg_known_zero(state->regs, value_regno);
+   state->regs[value_regno].id = 0;
+   state->regs[value_regno].off = 0;
+   state->regs[value_regno].range = 0;
state->regs[value_regno].type = reg_type;
-   state->regs[value_regno].aux_off = 0;
-   state->regs[value_regno].aux_off_align = 0;
}

-   } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
+   } else if (reg->type == PTR_TO_STACK) {

[...]


[PATCH] mwifiex: fix spelling mistake: "secuirty" -> "security"

2017-06-28 Thread Colin King
From: Colin Ian King 

Trivial fix to spelling mistake in mwifiex_dbg message

Signed-off-by: Colin Ian King 
---
 drivers/net/wireless/marvell/mwifiex/cfg80211.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c 
b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
index a850ec0054e2..d8ff823c0818 100644
--- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c
+++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c
@@ -1981,7 +1981,7 @@ static int mwifiex_cfg80211_start_ap(struct wiphy *wiphy,
 
if (mwifiex_set_secure_params(priv, bss_cfg, params)) {
mwifiex_dbg(priv->adapter, ERROR,
-   "Failed to parse secuirty parameters!\n");
+   "Failed to parse security parameters!\n");
goto out;
}
 
-- 
2.11.0



Re: [GIT PULL] arcnet: fixes and features

2017-06-28 Thread Michael Grzeschik
Hi,

On Fri, Jun 23, 2017 at 01:54:03PM -0400, David Miller wrote:
> From: Michael Grzeschik 
> Date: Thu, 22 Jun 2017 17:31:02 +0200
> 
> > are available in the git repository at:
> > 
> >   pub...@git.pengutronix.de:/mgr/linux.git tags/arcnet-for-mainline
> 
> I'm not pulling from that address, either set up a proper kernel.org
> GIT account or we work with just plain patches.

Since helpd...@kernel.org does not seem to respond to
my account request I will resend the series as plain patches.

> Next, you need to provide a proper commit message in a
> "[PATCH 0/N] " posting explaining at a high level what
> the patch series is doing as a unit, how it is doing it,
> and why it is doing it that way.

Will do.

> Finally, in patch #1 the assignment of "flags" to "0" in the
> declaration is unnecessary please remove it.

Fixed that.

Thanks,
Michael

-- 
Pengutronix e.K.   | |
Industrial Linux Solutions | http://www.pengutronix.de/  |
Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0|
Amtsgericht Hildesheim, HRA 2686   | Fax:   +49-5121-206917- |


signature.asc
Description: PGP signature


Re: [PATCH v3 net-next 02/12] bpf/verifier: rework value tracking

2017-06-28 Thread Edward Cree
On 28/06/17 16:15, Daniel Borkmann wrote:
> On 06/27/2017 02:56 PM, Edward Cree wrote:
>> Tracks value alignment by means of tracking known & unknown bits.
>> Tightens some min/max value checks and fixes a couple of bugs therein.
>
> You mean the one in relation to patch 1/12? Would be good to elaborate
> here since otherwise this gets forgotten few weeks later.
That wasn't the only one; there were also some in the new min/max value
 calculation for ALU ops.  For instance, in subtraction we were taking
 the new bounds as [min-min, max-max] instead of [min-max, max-min].
I can't remember what else there was and there might also have been some
 that I missed but that got incidentally fixed by the rewrite.  But I
 guess I should change "checks" to "checks and updates" in the above?
> Could you also document all the changes that verifier will then start
> allowing for after the patch?
Maybe not the changes, because the old verifier had a lot of special
 cases, but I could, and probably should, document the new behaviour
 (maybe in Documentation/networking/filter.txt, that already has a bit
 of description of the verifier).
> [...]
>>   /* check whether memory at (regno + off) is accessible for t = (read | 
>> write)
>> @@ -899,52 +965,79 @@ static int check_mem_access(struct bpf_verifier_env 
>> *env, int insn_idx, u32 regn
>>   struct bpf_reg_state *reg = &state->regs[regno];
>>   int size, err = 0;
>>
>> -if (reg->type == PTR_TO_STACK)
>> -off += reg->imm;
>> -
>>   size = bpf_size_to_bytes(bpf_size);
>>   if (size < 0)
>>   return size;
>>
> [...]
>> -if (reg->type == PTR_TO_MAP_VALUE ||
>> -reg->type == PTR_TO_MAP_VALUE_ADJ) {
>> +/* for access checks, reg->off is just part of off */
>> +off += reg->off;
>
> Could you elaborate on why removing the reg->type == PTR_TO_STACK?
Previously bpf_reg_state had a member 'imm' which, for PTR_TO_STACK, was
 a fixed offset, so we had to add it in to the offset.  Now we instead
 have reg->off and it's generic to all pointerish types, so we don't need
 special handling of PTR_TO_STACK here.
> Also in context of below PTR_TO_CTX.
>
> [...]
>>   } else if (reg->type == PTR_TO_CTX) {
>> -enum bpf_reg_type reg_type = UNKNOWN_VALUE;
>> +enum bpf_reg_type reg_type = SCALAR_VALUE;
>>
>>   if (t == BPF_WRITE && value_regno >= 0 &&
>>   is_pointer_value(env, value_regno)) {
>>   verbose("R%d leaks addr into ctx\n", value_regno);
>>   return -EACCES;
>>   }
>> +/* ctx accesses must be at a fixed offset, so that we can
>> + * determine what type of data were returned.
>> + */
>> +if (!tnum_is_const(reg->var_off)) {
>> +char tn_buf[48];
>> +
>> +tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
>> +verbose("variable ctx access var_off=%s off=%d size=%d",
>> +tn_buf, off, size);
>> +return -EACCES;
>> +}
>> +off += reg->var_off.value;
>
> ... f.e. in PTR_TO_CTX case the only access that is currently
> allowed is LDX/STX with fixed offset from insn->off, which is
> passed as off param to check_mem_access(). Can you elaborate on
> off += reg->var_off.value? Meaning we make this more dynamic
> as long as access is known const?
So, I can't actually figure out how to construct a pointer with a known
 variable offset, but future changes to the verifier (like learning from
 comparing two pointers with the same base) could make it possible.  The
 situation we're handling here is where our register holds ctx + x,
 where x is also known to be some constant value k, and currently I don't
 know if that's possible except for the trivial case of k==0, and the edge
 case where k is too big to fit in the s32 reg->off (in which case the
 check_ctx_access will presumably reject it).
Stepping back a bit, each register holding a pointer type has two offsets,
 reg->off and reg->var_off, and the latter is a tnum representing
 knowledge about a value that's not necessarily exactly known.  But
 tnum_is_const checks that it _is_ exactly known.
There is another case that we allow now through the reg->off handling:
 adding a constant to a pointer and then dereferencing it.
So, with r1=ctx, instead of r2 = *(r1 + off), you can write
r3 = r1 + off
r2 = *(r1 + 0)
 if for some reason that suits you better.  But in that case, because off
 is a known value (either an immediate, or a register whose value is
 exactly known), that value gets added to r3->off rather than r3->var_off;
 see adjust_ptr_min_max_vals().

Hope that's clear,
-Ed


Re: [PATCH iproute2 3/5] rdma: Add device capability parsing

2017-06-28 Thread Jason Gunthorpe
On Tue, Jun 27, 2017 at 03:18:59PM -0700, Stephen Hemminger wrote:
> On Tue, 27 Jun 2017 20:46:15 +0300
> Leon Romanovsky  wrote:
> 
> > On Tue, Jun 27, 2017 at 11:37:35AM -0600, Jason Gunthorpe wrote:
> > > On Tue, Jun 27, 2017 at 08:33:01PM +0300, Leon Romanovsky wrote:
> > >  
> > > > My initial plan was to put all parsers under their respective names, in
> > > > the similar way as I did for caps: $ rdma dev show mlx5_4 caps  
> > >
> > > I think you should have a useful summary display similar to 'ip a' and
> > > other commands.
> > >
> > > guid(s), subnet prefix or default gid for IB, lid/lmc, link state,
> > > speed, mtu, pkeys protocol(s)  
> > 
> > It will, but before I would like to see this tool be a part of
> > iproute2, so other people will be able to extend it in addition
> > to me.
> > 
> > Are you fine with the proposed code?
> > 
> 
> Output formats need to be nailed down. The output of iproute2 commands is 
> almost
> like an ABI. Users build scripts to parse it (whether that is a great idea or 
> not
> is debatable, it mostly shows the weakness in programmatic API's). Therefore 
> fully
> changing output formats in later revisions is likely to get users upset.

It would be nice to see an example of what the completed command
should output to make judgements on the format.. Going bit by bit
doesn't really give a full picture, IMHO.

Jason


Re: [PATCH] rsi: add in missing RSI_FSM_STATES into array fsm_state

2017-06-28 Thread Kalle Valo
Colin King  writes:

> From: Colin Ian King 
>
> Two recent commits added new RSI_FSM_STATES (namely FSM_FW_NOT_LOADED
> and FSM_COMMON_DEV_PARAMS_SENT) and the corresponding table fsm_state
> was not updated to match. This can lead to an array overrun when
> accessing the latter two states in fsm_state. Fix this by adding in
> the missing states.
>
> Detected by CoverityScan, CID#1398379 ("Illegal address computation")
>
> Fixes: 9920322ccd8e04 ("rsi: add tx frame for common device configuration")
> Fixes: 015e367494c1d5 ("rsi: Register interrupt handler before firmware load")

This is very much nitpicking, no need to resend because of this, but the
preferred format is to to use 12 chars for the commit id:

Fixes: 9920322ccd8e ("rsi: add tx frame for common device configuration")
Fixes: 015e367494c1 ("rsi: Register interrupt handler before firmware load")

Documentation/process/submitting-patches.rst contains a nice tip how you
can get that automatically with git:

[core]
abbrev = 12
[pretty]
fixes = Fixes: %h (\"%s\")

-- 
Kalle Valo


[PATCH 4/5] arcnet: com20020-pci: fix dev_id calculation

2017-06-28 Thread Michael Grzeschik
The dev_id was miscalculated. Only the two bits 4-5 are relevant for the
MA1 card. PCIARC1 and PCIFB2 use the four bits 4-7 for id selection.

Signed-off-by: Michael Grzeschik 
---
 drivers/net/arcnet/com20020-pci.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/arcnet/com20020-pci.c 
b/drivers/net/arcnet/com20020-pci.c
index 239de38fbd6a5..fec2df2c869fe 100644
--- a/drivers/net/arcnet/com20020-pci.c
+++ b/drivers/net/arcnet/com20020-pci.c
@@ -135,6 +135,7 @@ static int com20020pci_probe(struct pci_dev *pdev,
for (i = 0; i < ci->devcount; i++) {
struct com20020_pci_channel_map *cm = &ci->chan_map_tbl[i];
struct com20020_dev *card;
+   int dev_id_mask = 0xf;
 
dev = alloc_arcdev(device);
if (!dev) {
@@ -179,8 +180,8 @@ static int com20020pci_probe(struct pci_dev *pdev,
 
/* Get the dev_id from the PLX rotary coder */
if (!strncmp(ci->name, "EAE PLX-PCI MA1", 15))
-   dev->dev_id = 0xc;
-   dev->dev_id ^= inb(priv->misc + ci->rotary) >> 4;
+   dev_id_mask = 0x3;
+   dev->dev_id = (inb(priv->misc + ci->rotary) >> 4) & dev_id_mask;
 
snprintf(dev->name, sizeof(dev->name), "arc%d-%d", dev->dev_id, 
i);
 
-- 
2.11.0



[PATCH 5/5] arcnet: com20020-pci: add missing pdev setup in netdev structure

2017-06-28 Thread Michael Grzeschik
We add the pdev data to the PCI device's netdev structure. This way
the interfaces get consistent device names in userspace (udev).

Signed-off-by: Michael Grzeschik 
---
 drivers/net/arcnet/com20020-pci.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/arcnet/com20020-pci.c 
b/drivers/net/arcnet/com20020-pci.c
index fec2df2c869fe..47f80b83dcf42 100644
--- a/drivers/net/arcnet/com20020-pci.c
+++ b/drivers/net/arcnet/com20020-pci.c
@@ -167,6 +167,7 @@ static int com20020pci_probe(struct pci_dev *pdev,
arcnet_outb(0x00, ioaddr, COM20020_REG_W_COMMAND);
arcnet_inb(ioaddr, COM20020_REG_R_DIAGSTAT);
 
+   SET_NETDEV_DEV(dev, &pdev->dev);
dev->base_addr = ioaddr;
dev->dev_addr[0] = node;
dev->irq = pdev->irq;
-- 
2.11.0



[PATCH 2/5] Trivial fix to spelling mistake in arc_printk message

2017-06-28 Thread Michael Grzeschik
From: Colin Ian King 

Signed-off-by: Colin Ian King 
Signed-off-by: Michael Grzeschik 
---
 drivers/net/arcnet/capmode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/arcnet/capmode.c b/drivers/net/arcnet/capmode.c
index a80f4eb9262d5..b780be6f41ff9 100644
--- a/drivers/net/arcnet/capmode.c
+++ b/drivers/net/arcnet/capmode.c
@@ -212,7 +212,7 @@ static int ack_tx(struct net_device *dev, int acked)
ackpkt->soft.cap.proto = 0; /* using protocol 0 for acknowledge */
ackpkt->soft.cap.mes.ack = acked;
 
-   arc_printk(D_PROTO, dev, "Ackknowledge for cap packet %x.\n",
+   arc_printk(D_PROTO, dev, "Acknowledge for cap packet %x.\n",
   *((int *)&ackpkt->soft.cap.cookie[0]));
 
ackskb->protocol = cpu_to_be16(ETH_P_ARCNET);
-- 
2.11.0



[PATCH 3/5] arcnet: com20020: remove needless base_addr assignment

2017-06-28 Thread Michael Grzeschik
The assignment is superfluous.

Signed-off-by: Michael Grzeschik 
---
 drivers/net/arcnet/com20020.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/arcnet/com20020.c b/drivers/net/arcnet/com20020.c
index 13d9ad4b3f5c9..78043a9c5981e 100644
--- a/drivers/net/arcnet/com20020.c
+++ b/drivers/net/arcnet/com20020.c
@@ -246,8 +246,6 @@ int com20020_found(struct net_device *dev, int shared)
return -ENODEV;
}
 
-   dev->base_addr = ioaddr;
-
arc_printk(D_NORMAL, dev, "%s: station %02Xh found at %03lXh, IRQ 
%d.\n",
   lp->card_name, dev->dev_addr[0], dev->base_addr, dev->irq);
 
-- 
2.11.0



[PATCH 1/5] arcnet: change irq handler to lock irqsave

2017-06-28 Thread Michael Grzeschik
This patch prevents the arcnet driver from the following deadlock.

[   41.273910] ==
[   41.280397] [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ]
[   41.287433] 4.4.0-00034-gc0ae784 #536 Not tainted
[   41.292366] --
[   41.298863] arcecho/233 [HC0[0]:SC0[2]:HE0:SE0] is trying to acquire:
[   41.305628]  (&(&lp->lock)->rlock){+.+...}, at: [] 
arcnet_send_packet+0x60/0x1c0 [arcnet]
[   41.315199]
[   41.315199] and this task is already holding:
[   41.321324]  (_xmit_ARCNET#2){+.-...}, at: [] 
packet_direct_xmit+0xfc/0x1c8
[   41.329593] which would create a new lock dependency:
[   41.334893]  (_xmit_ARCNET#2){+.-...} -> (&(&lp->lock)->rlock){+.+...}
[   41.341801]
[   41.341801] but this new dependency connects a SOFTIRQ-irq-safe lock:
[   41.350108]  (_xmit_ARCNET#2){+.-...}
... which became SOFTIRQ-irq-safe at:
[   41.357539]   [] _raw_spin_lock+0x30/0x40
[   41.362677]   [] dev_watchdog+0x5c/0x264
[   41.367723]   [] call_timer_fn+0x6c/0xf4
[   41.372759]   [] run_timer_softirq+0x154/0x210
[   41.378340]   [] __do_softirq+0x144/0x298
[   41.383469]   [] irq_exit+0xcc/0x130
[   41.388138]   [] __handle_domain_irq+0x60/0xb4
[   41.393728]   [] __irq_svc+0x58/0x78
[   41.398402]   [] arch_cpu_idle+0x24/0x3c
[   41.403443]   [] cpu_startup_entry+0x1f8/0x25c
[   41.409029]   [] start_kernel+0x3c0/0x3cc
[   41.414170]
[   41.414170] to a SOFTIRQ-irq-unsafe lock:
[   41.419931]  (&(&lp->lock)->rlock){+.+...}
... which became SOFTIRQ-irq-unsafe at:
[   41.427996] ...  [] _raw_spin_lock+0x30/0x40
[   41.433409]   [] arcnet_interrupt+0x2c/0x800 [arcnet]
[   41.439646]   [] handle_nested_irq+0x8c/0xec
[   41.445063]   [] regmap_irq_thread+0x190/0x314
[   41.450661]   [] irq_thread_fn+0x1c/0x34
[   41.455700]   [] irq_thread+0x13c/0x1dc
[   41.460649]   [] kthread+0xe4/0xf8
[   41.465158]   [] ret_from_fork+0x14/0x24
[   41.470207]
[   41.470207] other info that might help us debug this:
[   41.470207]
[   41.478627]  Possible interrupt unsafe locking scenario:
[   41.478627]
[   41.485763]CPU0CPU1
[   41.490521]
[   41.495279]   lock(&(&lp->lock)->rlock);
[   41.499414]local_irq_disable();
[   41.505636]lock(_xmit_ARCNET#2);
[   41.511967]lock(&(&lp->lock)->rlock);
[   41.518741]   
[   41.521490] lock(_xmit_ARCNET#2);
[   41.525356]
[   41.525356]  *** DEADLOCK ***
[   41.525356]
[   41.531587] 1 lock held by arcecho/233:
[   41.535617]  #0:  (_xmit_ARCNET#2){+.-...}, at: [] 
packet_direct_xmit+0xfc/0x1c8
[   41.544355]
the dependencies between SOFTIRQ-irq-safe lock and the holding lock:
[   41.552362] -> (_xmit_ARCNET#2){+.-...} ops: 27 {
[   41.557357]HARDIRQ-ON-W at:
[   41.560664] [] _raw_spin_lock+0x30/0x40
[   41.567445] [] dev_deactivate_many+0x114/0x304
[   41.574866] [] dev_deactivate+0x24/0x38
[   41.581646] [] linkwatch_do_dev+0x40/0x74
[   41.588613] [] __linkwatch_run_queue+0xec/0x140
[   41.596120] [] linkwatch_event+0x2c/0x34
[   41.602991] [] process_one_work+0x188/0x40c
[   41.610131] [] worker_thread+0x4c/0x480
[   41.616912] [] kthread+0xe4/0xf8
[   41.623048] [] ret_from_fork+0x14/0x24
[   41.629735]IN-SOFTIRQ-W at:
[   41.633039] [] _raw_spin_lock+0x30/0x40
[   41.639820] [] dev_watchdog+0x5c/0x264
[   41.646508] [] call_timer_fn+0x6c/0xf4
[   41.653190] [] run_timer_softirq+0x154/0x210
[   41.660425] [] __do_softirq+0x144/0x298
[   41.667201] [] irq_exit+0xcc/0x130
[   41.673518] [] __handle_domain_irq+0x60/0xb4
[   41.680754] [] __irq_svc+0x58/0x78
[   41.687077] [] arch_cpu_idle+0x24/0x3c
[   41.693769] [] cpu_startup_entry+0x1f8/0x25c
[   41.701006] [] start_kernel+0x3c0/0x3cc
[   41.707791]INITIAL USE at:
[   41.711003][] _raw_spin_lock+0x30/0x40
[   41.717696][] dev_deactivate_many+0x114/0x304
[   41.725026][] dev_deactivate+0x24/0x38
[   41.731718][] linkwatch_do_dev+0x40/0x74
[   41.738593][] __linkwatch_run_queue+0xec/0x140
[   41.746011][] linkwatch_event+0x2c/0x34
[   41.752789][] process_one_work+0x188/0x40c
[   41.759847][] worker_thread+0x4c/0x480
[   41.766541][] kthread+0xe4/0xf8
[   41.772596][] ret_from_fork+0x14/0x24
[   41.779198]  }
[   41.780945]  ... key  at: [] netd

[PATCH 0/5] arcnet: Collection of latest fixes

2017-06-28 Thread Michael Grzeschik
Here we sum up the recent fixes I collected on the way to use and
stabilise the framework. Part of it is a possible deadlock that we
prevent, as well as a fix for the calculation of the dev_id that can be
set up by a rotary encoder. Besides that, we added a trivial spelling
patch and fixed some wrong and missing assignments, which improves the
code footprint.

Colin Ian King (1):
  Trivial fix to spelling mistake in arc_printk message

Michael Grzeschik (4):
  arcnet: change irq handler to lock irqsave
  arcnet: com20020: remove needless base_addr assignment
  arcnet: com20020-pci: fix dev_id calculation
  arcnet: com20020-pci: add missing pdev setup in netdev structure

 drivers/net/arcnet/arcnet.c   | 7 ---
 drivers/net/arcnet/capmode.c  | 2 +-
 drivers/net/arcnet/com20020-pci.c | 6 --
 drivers/net/arcnet/com20020.c | 2 --
 4 files changed, 9 insertions(+), 8 deletions(-)

-- 
2.11.0



[PATCH 4/4] arcnet: com20020-pci: add support for PCIFB2 card

2017-06-28 Thread Michael Grzeschik
We add support for the PCIFB2 card from EAE.

Beside other cards, this card has the backplane mode enabled by default.

Signed-off-by: Michael Grzeschik 
---
 drivers/net/arcnet/com20020-pci.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/drivers/net/arcnet/com20020-pci.c 
b/drivers/net/arcnet/com20020-pci.c
index f5854ab7dc326..24deb88a37f06 100644
--- a/drivers/net/arcnet/com20020-pci.c
+++ b/drivers/net/arcnet/com20020-pci.c
@@ -201,6 +201,9 @@ static int com20020pci_probe(struct pci_dev *pdev,
 
lp->backplane = (inb(priv->misc) >> (2 + i)) & 0x1;
 
+   if (!strncmp(ci->name, "EAE PLX-PCI FB2", 15))
+   lp->backplane = 1;
+
/* Get the dev_id from the PLX rotary coder */
if (!strncmp(ci->name, "EAE PLX-PCI MA1", 15))
dev->dev_id = 0xc;
@@ -385,6 +388,31 @@ static struct com20020_pci_card_info card_info_eae_ma1 = {
.flags = ARC_CAN_10MBIT,
 };
 
+static struct com20020_pci_card_info card_info_eae_fb2 = {
+   .name = "EAE PLX-PCI FB2",
+   .devcount = 1,
+   .chan_map_tbl = {
+   {
+   .bar = 2,
+   .offset = 0x00,
+   .size = 0x08,
+   },
+   },
+   .misc_map = {
+   .bar = 2,
+   .offset = 0x10,
+   .size = 0x04,
+   },
+   .leds = {
+   {
+   .green = 0x0,
+   .red = 0x1,
+   },
+   },
+   .rotary = 0x0,
+   .flags = ARC_CAN_10MBIT,
+};
+
 static const struct pci_device_id com20020pci_id_table[] = {
{
0x1571, 0xa001,
@@ -531,6 +559,12 @@ static const struct pci_device_id com20020pci_id_table[] = 
{
(kernel_ulong_t)&card_info_eae_ma1
},
{
+   0x10B5, 0x9050,
+   0x10B5, 0x3294,
+   0, 0,
+   (kernel_ulong_t)&card_info_eae_fb2
+   },
+   {
0x14BA, 0x6000,
PCI_ANY_ID, PCI_ANY_ID,
0, 0,
-- 
2.11.0



[PATCH 0/4] arcnet: Collection of latest features

2017-06-28 Thread Michael Grzeschik
Here we sum up the latest features to improve the arcnet framework. One
patch is used to get feedback from the transfer queue about failed xfers
by adding the err_skb message queue. Besides that, we improve the
backplane status that can be read by the PCI-based cards and offer that
status via an extra sysfs attribute. In the last patch we add another
card type PCIFB2.

Michael Grzeschik (4):
  arcnet: add err_skb package for package status feedback
  arcnet: com20020-pci: add attribute to readback backplane status
  arcnet: com20020-pci: handle backplane mode depending on card type
  arcnet: com20020-pci: add support for PCIFB2 card

 drivers/net/arcnet/arcdevice.h|  4 +++
 drivers/net/arcnet/arcnet.c   | 74 +--
 drivers/net/arcnet/com20020-pci.c | 58 ++
 3 files changed, 126 insertions(+), 10 deletions(-)

-- 
2.11.0



[PATCH 2/4] arcnet: com20020-pci: add attribute to readback backplane status

2017-06-28 Thread Michael Grzeschik
We add a sysfs interface to read back the backplane
status of the interface.

Signed-off-by: Michael Grzeschik 
---
 drivers/net/arcnet/com20020-pci.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/drivers/net/arcnet/com20020-pci.c 
b/drivers/net/arcnet/com20020-pci.c
index 239de38fbd6a5..dec300cac55f9 100644
--- a/drivers/net/arcnet/com20020-pci.c
+++ b/drivers/net/arcnet/com20020-pci.c
@@ -93,6 +93,27 @@ static void led_recon_set(struct led_classdev *led_cdev,
outb(!!value, priv->misc + ci->leds[card->index].red);
 }
 
+static ssize_t backplane_mode_show(struct device *dev,
+  struct device_attribute *attr,
+  char *buf)
+{
+   struct net_device *net_dev = to_net_dev(dev);
+   struct arcnet_local *lp = netdev_priv(net_dev);
+
+   return sprintf(buf, "%s\n", lp->backplane ? "true" : "false");
+}
+static DEVICE_ATTR_RO(backplane_mode);
+
+static struct attribute *com20020_state_attrs[] = {
+   &dev_attr_backplane_mode.attr,
+   NULL,
+};
+
+static struct attribute_group com20020_state_group = {
+   .name = NULL,
+   .attrs = com20020_state_attrs,
+};
+
 static void com20020pci_remove(struct pci_dev *pdev);
 
 static int com20020pci_probe(struct pci_dev *pdev,
@@ -168,6 +189,7 @@ static int com20020pci_probe(struct pci_dev *pdev,
 
dev->base_addr = ioaddr;
dev->dev_addr[0] = node;
+   dev->sysfs_groups[0] = &com20020_state_group;
dev->irq = pdev->irq;
lp->card_name = "PCI COM20020";
lp->card_flags = ci->flags;
-- 
2.11.0



[PATCH 3/4] arcnet: com20020-pci: handle backplane mode depending on card type

2017-06-28 Thread Michael Grzeschik
We read the backplane mode of each subcard from bits 2 and 3 of the misc
register.

Signed-off-by: Michael Grzeschik 
---
 drivers/net/arcnet/com20020-pci.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/arcnet/com20020-pci.c 
b/drivers/net/arcnet/com20020-pci.c
index dec300cac55f9..f5854ab7dc326 100644
--- a/drivers/net/arcnet/com20020-pci.c
+++ b/drivers/net/arcnet/com20020-pci.c
@@ -199,6 +199,8 @@ static int com20020pci_probe(struct pci_dev *pdev,
lp->timeout = timeout;
lp->hw.owner = THIS_MODULE;
 
+   lp->backplane = (inb(priv->misc) >> (2 + i)) & 0x1;
+
/* Get the dev_id from the PLX rotary coder */
if (!strncmp(ci->name, "EAE PLX-PCI MA1", 15))
dev->dev_id = 0xc;
-- 
2.11.0



[PATCH 1/4] arcnet: add err_skb package for package status feedback

2017-06-28 Thread Michael Grzeschik
We need to track the status of our queued packages. This way the driving
process knows if failed packages need to be retransmitted. For this
purpose we queue the transferred/failed packages back into the err_skb
message queue added with some status information.

Signed-off-by: Michael Grzeschik 
---
 drivers/net/arcnet/arcdevice.h |  4 +++
 drivers/net/arcnet/arcnet.c| 74 --
 2 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/drivers/net/arcnet/arcdevice.h b/drivers/net/arcnet/arcdevice.h
index 20bfb9ba83ea2..cbb4f8566bbe5 100644
--- a/drivers/net/arcnet/arcdevice.h
+++ b/drivers/net/arcnet/arcdevice.h
@@ -269,6 +269,10 @@ struct arcnet_local {
 
struct timer_list   timer;
 
+   struct net_device *dev;
+   int reply_status;
+   struct tasklet_struct reply_tasklet;
+
/*
 * Buffer management: an ARCnet card has 4 x 512-byte buffers, each of
 * which can be used for either sending or receiving.  The new dynamic
diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c
index 62ee439d58829..d87f4da29f113 100644
--- a/drivers/net/arcnet/arcnet.c
+++ b/drivers/net/arcnet/arcnet.c
@@ -51,6 +51,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -391,6 +392,52 @@ static void arcnet_timer(unsigned long data)
}
 }
 
+static void arcnet_reply_tasklet(unsigned long data)
+{
+   struct arcnet_local *lp = (struct arcnet_local *)data;
+
+   struct sk_buff *ackskb, *skb;
+   struct sock_exterr_skb *serr;
+   struct sock *sk;
+   int ret;
+
+   local_irq_disable();
+   skb = lp->outgoing.skb;
+   if (!skb || !skb->sk) {
+   local_irq_enable();
+   return;
+   }
+
+   sock_hold(skb->sk);
+   sk = skb->sk;
+   ackskb = skb_clone_sk(skb);
+   sock_put(skb->sk);
+
+   if (!ackskb) {
+   local_irq_enable();
+   return;
+   }
+
+   serr = SKB_EXT_ERR(ackskb);
+   memset(serr, 0, sizeof(*serr));
+   serr->ee.ee_errno = ENOMSG;
+   serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+   serr->ee.ee_data = skb_shinfo(skb)->tskey;
+   serr->ee.ee_info = lp->reply_status;
+
+   /* finally erasing outgoing skb */
+   dev_kfree_skb(lp->outgoing.skb);
+   lp->outgoing.skb = NULL;
+
+   ackskb->dev = lp->dev;
+
+   ret = sock_queue_err_skb(sk, ackskb);
+   if (ret)
+   kfree_skb(ackskb);
+
+   local_irq_enable();
+};
+
 struct net_device *alloc_arcdev(const char *name)
 {
struct net_device *dev;
@@ -401,6 +448,7 @@ struct net_device *alloc_arcdev(const char *name)
if (dev) {
struct arcnet_local *lp = netdev_priv(dev);
 
+   lp->dev = dev;
spin_lock_init(&lp->lock);
init_timer(&lp->timer);
lp->timer.data = (unsigned long) dev;
@@ -436,6 +484,9 @@ int arcnet_open(struct net_device *dev)
arc_cont(D_PROTO, "\n");
}
 
+   tasklet_init(&lp->reply_tasklet, arcnet_reply_tasklet,
+(unsigned long)lp);
+
arc_printk(D_INIT, dev, "arcnet_open: resetting card.\n");
 
/* try to put the card in a defined state - if it fails the first
@@ -527,6 +578,8 @@ int arcnet_close(struct net_device *dev)
netif_stop_queue(dev);
netif_carrier_off(dev);
 
+   tasklet_kill(&lp->reply_tasklet);
+
/* flush TX and disable RX */
lp->hw.intmask(dev, 0);
lp->hw.command(dev, NOTXcmd);   /* stop transmit */
@@ -635,13 +688,13 @@ netdev_tx_t arcnet_send_packet(struct sk_buff *skb,
txbuf = -1;
 
if (txbuf != -1) {
+   lp->outgoing.skb = skb;
if (proto->prepare_tx(dev, pkt, skb->len, txbuf) &&
!proto->ack_tx) {
/* done right away and we don't want to acknowledge
 *  the package later - forget about it now
 */
dev->stats.tx_bytes += skb->len;
-   dev_kfree_skb(skb);
} else {
/* do it the 'split' way */
lp->outgoing.proto = proto;
@@ -842,8 +895,16 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id)
 
/* a transmit finished, and we're interested in it. */
if ((status & lp->intmask & TXFREEflag) || lp->timed_out) {
+   int ackstatus;
lp->intmask &= ~(TXFREEflag | EXCNAKflag);
 
+   if (status & TXACKflag)
+   ackstatus = 2;
+   else if (lp->excnak_pending)
+   ackstatus = 1;
+   else
+   ackstatus = 0;
+
arc_printk(D_DURING, dev, "TX IRQ (stat=%Xh)\n",
   status);
 

Re: ti: wl18xx: add checks on wl18xx_top_reg_write() return value

2017-06-28 Thread Kalle Valo
"Gustavo A. R. Silva"  wrote:

> Check return value from call to wl18xx_top_reg_write(),
> so in case of error jump to goto label out and return.
> 
> Also, remove unnecessary value check before goto label out.
> 
> Addresses-Coverity-ID: 1226938
> Signed-off-by: Gustavo A. R. Silva 

The prefix should be "wl18xx:", I'll fix that.

-- 
https://patchwork.kernel.org/patch/9810591/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches



[PATCH] amd-xgbe: fix spelling mistake: "avialable" -> "available"

2017-06-28 Thread Colin King
From: Colin Ian King 

Trivial fix to spelling mistake in netdev_err message

Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
index 920566a3a599..67a2e52ad25d 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c
@@ -247,7 +247,7 @@ static int xgbe_set_pauseparam(struct net_device *netdev,
 
if (pause->autoneg && (pdata->phy.autoneg != AUTONEG_ENABLE)) {
netdev_err(netdev,
-  "autoneg disabled, pause autoneg not avialable\n");
+  "autoneg disabled, pause autoneg not available\n");
return -EINVAL;
}
 
-- 
2.11.0



Re: ath10k: add const to thermal_cooling_device_ops structure

2017-06-28 Thread Kalle Valo
Bhumika Goyal  wrote:

> Declare thermal_cooling_device_ops structure as const as it is only passed
> as an argument to the function thermal_cooling_device_register and this
> argument is of type const. So, declare the structure as const.
> 
> Signed-off-by: Bhumika Goyal 
> Signed-off-by: Kalle Valo 

Patch applied to ath-next branch of ath.git, thanks.

1cdb6c9fd433 ath10k: add const to thermal_cooling_device_ops structure

-- 
https://patchwork.kernel.org/patch/9801291/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches



Re: [-next] ath10k: fix a bunch of spelling mistakes in messages

2017-06-28 Thread Kalle Valo
Colin Ian King  wrote:

> Fix the following spelling mistakes in messages:
>   syncronise -> synchronize
>   unusally -> unusually
>   addrress -> address
>   inverval -> interval
> 
> Signed-off-by: Colin Ian King 
> Signed-off-by: Kalle Valo 

Patch applied to ath-next branch of ath.git, thanks.

23de57975f14 ath10k: fix a bunch of spelling mistakes in messages

-- 
https://patchwork.kernel.org/patch/9808405/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches



Re: ath9k: remove useless variable assignment in ath_mci_intr()

2017-06-28 Thread Kalle Valo
"Gustavo A. R. Silva"  wrote:

> Value assigned to variable offset at line 551 is overwritten at line 562,
> before it can be used. This makes such variable assignment useless.
> 
> Addresses-Coverity-ID: 1226941
> Signed-off-by: Gustavo A. R. Silva 
> Signed-off-by: Kalle Valo 

Patch applied to ath-next branch of ath.git, thanks.

6788a3832c70 ath9k: remove useless variable assignment in ath_mci_intr()

-- 
https://patchwork.kernel.org/patch/9810609/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches



[PATCH iproute2 1/1] tc: updated ife man page.

2017-06-28 Thread Roman Mashak
Explain when skbmark encoding may fail.

Signed-off-by: Roman Mashak 
---
 man/man8/tc-ife.8 | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/man/man8/tc-ife.8 b/man/man8/tc-ife.8
index a8f1f28..1499a3f 100644
--- a/man/man8/tc-ife.8
+++ b/man/man8/tc-ife.8
@@ -59,7 +59,10 @@ Encode direction only. Enforce static encoding of specified 
metadata.
 .BR mark " [ "
 .IR u32_value " ]"
 The value to set for the skb mark. The u32 value is required only when
-.BR use " is specified."
+.BR use " is specified. If
+.BR mark " value is zero, it will not be encoded, instead
+"overlimits" statistics increment and
+.BR CONTROL " action is taken.
 .TP
 .BR prio " [ "
 .IR u32_value " ]"
-- 
1.9.1



Re: [PATCH v3 net-next 02/12] bpf/verifier: rework value tracking

2017-06-28 Thread Daniel Borkmann

On 06/27/2017 02:56 PM, Edward Cree wrote:

Tracks value alignment by means of tracking known & unknown bits.
Tightens some min/max value checks and fixes a couple of bugs therein.
If pointer leaks are allowed, and adjust_ptr_min_max_vals returns -EACCES,
  treat the pointer as an unknown scalar and try again, because we might be
  able to conclude something about the result (e.g. pointer & 0x40 is either
  0 or 0x40).

Signed-off-by: Edward Cree 

[...]

+static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+  struct bpf_insn *insn)
+{
+   struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+   struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+   u8 opcode = BPF_OP(insn->code);
+   int rc;
+
+   dst_reg = ®s[insn->dst_reg];
+   check_reg_overflow(dst_reg);
+   src_reg = NULL;
+   if (dst_reg->type != SCALAR_VALUE)
+   ptr_reg = dst_reg;
+   if (BPF_SRC(insn->code) == BPF_X) {
+   src_reg = ®s[insn->src_reg];
+   check_reg_overflow(src_reg);
+
+   if (src_reg->type != SCALAR_VALUE) {
+   if (dst_reg->type != SCALAR_VALUE) {
+   /* Combining two pointers by any ALU op yields
+* an arbitrary scalar.
+*/
+   if (!env->allow_ptr_leaks) {
+   verbose("R%d pointer %s pointer 
prohibited\n",
+   insn->dst_reg,
+   bpf_alu_string[opcode >> 4]);
+   return -EACCES;
+   }
+   mark_reg_unknown(regs, insn->dst_reg);
+   return 0;
+   } else {
+   /* scalar += pointer
+* This is legal, but we have to reverse our
+* src/dest handling in computing the range
+*/
+   rc = adjust_ptr_min_max_vals(env, insn,
+src_reg, dst_reg);
+   if (rc == -EACCES && env->allow_ptr_leaks) {
+   /* scalar += unknown scalar */
+   __mark_reg_unknown(&off_reg);
+   return adjust_scalar_min_max_vals(
+   env, insn,
+   dst_reg, &off_reg);


Could you elaborate on this one? If I understand it correctly, then
the scalar += pointer case would mean the following: given I have one
of the allowed pointer types in adjust_ptr_min_max_vals() then the
prior scalar type inherits the ptr type/id. I would then 'destroy' the
pointer value so we get a -EACCES on it. We mark the tmp off_reg as
scalar type, but shouldn't also actual dst_reg be marked as such
like in below pointer += scalar case, such that we undo the prior
ptr_type inheritance?


+   }
+   return rc;
+   }
+   } else if (ptr_reg) {
+   /* pointer += scalar */
+   rc = adjust_ptr_min_max_vals(env, insn,
+dst_reg, src_reg);
+   if (rc == -EACCES && env->allow_ptr_leaks) {
+   /* unknown scalar += scalar */
+   __mark_reg_unknown(dst_reg);
+   return adjust_scalar_min_max_vals(
+   env, insn, dst_reg, src_reg);
+   }
+   return rc;
+   }
+   } else {

[...]


Re: [PATCH net-next 1/2] vxlan: change vxlan_validate() to use netlink_ext_ack for error reporting

2017-06-28 Thread Jiri Benc
On Tue, 27 Jun 2017 22:47:57 +0200, Matthias Schiffer wrote:
>   if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
> - pr_debug("invalid all zero ethernet address\n");
> + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
> + "invalid ethernet address");

Could we be more specific here? This is better than nothing but still
not as helpful to the user as it could be. What about something like
"the provided ethernet address is not unicast"?

> - if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU)
> + if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
> + NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
> + "invalid MTU");

"MTU must be between 68 and 65535"

> - if (id >= VXLAN_N_VID)
> + if (id >= VXLAN_N_VID) {
> + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID],
> + "invalid VXLAN ID");

"VXLAN ID must be lower than 16777216"

>   if (ntohs(p->high) < ntohs(p->low)) {
> - pr_debug("port range %u .. %u not valid\n",
> -  ntohs(p->low), ntohs(p->high));
> + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE],
> + "port range not valid");

Since you're getting rid of the values output, I'd rather suggest more
explicit "the first value of the port range must not be higher than the
second value" or so. Shorter wording is welcome :-)

Thanks,

 Jiri


Re: [PATCH net-next 2/2] vxlan: add back error messages to vxlan_config_validate() as extended netlink acks

2017-06-28 Thread Jiri Benc
On Tue, 27 Jun 2017 22:47:58 +0200, Matthias Schiffer wrote:
>   if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
>   !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
> + NL_SET_ERR_MSG(extack,
> +"unsupported combination of extensions");

Since we're redesigning this, let's be more helpful to the user.
There's probably not going to be tremendous improvements here but let's
try at least a bit.

"VXLAN GPE does not support this combination of extensions"

>   if (local_type & IPV6_ADDR_LINKLOCAL) {
>   if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
> - (remote_type != IPV6_ADDR_ANY))
> + (remote_type != IPV6_ADDR_ANY)) {
> + NL_SET_ERR_MSG(extack,
> +"invalid combination of 
> address scopes");

"invalid combination of local and remote address scopes"

>   return -EINVAL;
> + }
>  
>   conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
>   } else {
>   if (remote_type ==
> - (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL))
> + (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
> + NL_SET_ERR_MSG(extack,
> +"invalid combination of 
> address scopes");

ditto

The rest looks good to me. Thanks a lot for doing the work, Matthias!

 Jiri


[PATCH net-next v4 01/16] bpf: BPF support for sock_ops

2017-06-28 Thread Lawrence Brakmo
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.). It uses the
existing bpf cgroups infrastructure so the programs can be attached per
cgroup with full inheritance support. The program will be called at
appropriate times to set relevant connections parameters such as buffer
sizes, SYN and SYN-ACK RTOs, etc., based on connection information such
as IP addresses, port numbers, etc.

Although there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it does not require
application changes and it can be updated easily at any time.

Although the bpf cgroup framework already contains a sock related
program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type
(BPF_PROG_TYPE_SOCK_OPS) because the existing type expects to be called
only once during the connection's lifetime. In contrast, the new
program type will be called multiple times from different places in the
network stack code.  For example, before sending SYN and SYN-ACKs to set
an appropriate timeout, when the connection is established to set
congestion control, etc. As a result it has "op" field to specify the
type of operation requested.

The purpose of this new program type is to simplify setting connection
parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is
easy to use facebook's internal IPv6 addresses to determine if both hosts
of a connection are in the same datacenter. Therefore, it is easy to
write a BPF program to choose a small SYN RTO value when both hosts are
in the same datacenter.

This patch only contains the framework to support the new BPF program
type, following patches add the functionality to set various connection
parameters.

This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS
and a new bpf syscall command to load a new program of this type:
BPF_PROG_LOAD_SOCKET_OPS.

Two new corresponding structs (one for the kernel one for the user/BPF
program):

/* kernel version */
struct bpf_sock_ops_kern {
struct sock *sk;
bool   is_req_sock:1;
__u32  op;
union {
__u32 reply;
__u32 replylong[4];
};
};

/* user version */
struct bpf_sock_ops {
__u32 op;
union {
__u32 reply;
__u32 replylong[4];
};
__u32 family;
__u32 remote_ip4;
__u32 local_ip4;
__u32 remote_ip6[4];
__u32 local_ip6[4];
__u32 remote_port;
__u32 local_port;
};

Currently there are two types of ops. The first type expects the BPF
program to return a value which is then used by the caller (or a
negative value to indicate the operation is not supported). The second
type expects state changes to be done by the BPF program, for example
through a setsockopt BPF helper function, and they ignore the return
value.

The reply fields of the bpf_sock_ops struct are there in case a bpf
program needs to return a value larger than an integer.

Signed-off-by: Lawrence Brakmo 
---
 include/linux/bpf-cgroup.h |  18 +
 include/linux/bpf_types.h  |   1 +
 include/linux/filter.h |  10 +++
 include/net/tcp.h  |  37 ++
 include/uapi/linux/bpf.h   |  28 
 kernel/bpf/cgroup.c|  37 ++
 kernel/bpf/syscall.c   |   5 ++
 net/core/filter.c  | 170 +
 samples/bpf/bpf_load.c |  13 +++-
 9 files changed, 316 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c970a25..26449c7 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -7,6 +7,7 @@
 struct sock;
 struct cgroup;
 struct sk_buff;
+struct bpf_sock_ops_kern;
 
 #ifdef CONFIG_CGROUP_BPF
 
@@ -42,6 +43,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
   enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+struct bpf_sock_ops_kern *sock_ops,
+enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_IN

[PATCH net-next v4 06/16] bpf: Sample bpf program to set initial window

2017-06-28 Thread Lawrence Brakmo
The sample bpf program, tcp_rwnd_kern.c, sets the initial
advertized window to 40 packets in an environment where
distinct IPv6 prefixes indicate that both hosts are not
in the same data center.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile|  1 +
 samples/bpf/tcp_rwnd_kern.c | 61 +
 2 files changed, 62 insertions(+)
 create mode 100644 samples/bpf/tcp_rwnd_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index e29370a..ca95528 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -114,6 +114,7 @@ always += xdp_tx_iptunnel_kern.o
 always += test_map_in_map_kern.o
 always += cookie_uid_helper_example.o
 always += tcp_synrto_kern.o
+always += tcp_rwnd_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c
new file mode 100644
index 000..5daa649
--- /dev/null
+++ b/samples/bpf/tcp_rwnd_kern.c
@@ -0,0 +1,61 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets when using IPv6
+ * and the first 5.5 bytes of the IPv6 addresses are not the same (in this
+ * example that means both hosts are not the same datacenter.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_rwnd(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int rv = -1;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Check for RWND_INIT operation and IPv6 addresses */
+   if (op == BPF_SOCK_OPS_RWND_INIT &&
+   skops->family == AF_INET6) {
+
+   /* If the first 5.5 bytes of the IPv6 address are not the same
+* then both hosts are not in the same datacenter
+* so use a larger initial advertized window (40 packets)
+*/
+   if (skops->local_ip6[0] != skops->remote_ip6[0] ||
+   (skops->local_ip6[1] & 0xf000) !=
+   (skops->remote_ip6[1] & 0xf000))
+   bpf_trace_printk(fmt2, sizeof(fmt2), -1);
+   rv = 40;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   skops->reply = rv;
+   return 1;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3



[PATCH net-next v4 00/16] bpf: BPF cgroup support for sock_ops

2017-06-28 Thread Lawrence Brakmo
Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.) and setting
connection parameters such as buffer sizes, initial window, SYN/SYN-ACK
RTOs, etc.

Unlike current BPF program types that expect to be called at a particular
place in the network stack code, SOCK_OPS program can be called at
different places and use an "op" field to indicate the context. There
are currently two types of operations, those whose effect is through
their return value and those whose effect is through the new
bpf_setsockopt BPF helper function.

Example operands of the first type are:
  BPF_SOCK_OPS_TIMEOUT_INIT
  BPF_SOCK_OPS_RWND_INIT
  BPF_SOCK_OPS_NEEDS_ECN

Example operands of the second type are:
  BPF_SOCK_OPS_TCP_CONNECT_CB
  BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB
  BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB

Current operands are only called during connection establishment so
there should not be any BPF overheads after connection establishment. The
main idea is to use connection information from both hosts, such as IP
addresses and ports to allow setting of per connection parameters to
optimize the connection's performance.

Although there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it does not require
application changes and it can be updated easily at any time.

It uses the existing bpf cgroups infrastructure so the programs can be
attached per cgroup with full inheritance support. Although the bpf cgroup
framework already contains a sock related program type 
(BPF_PROG_TYPE_CGROUP_SOCK),
I created the new type (BPF_PROG_TYPE_SOCK_OPS) because the existing type
expects to be called only once during the connection's lifetime. In contrast,
the new program type will be called multiple times from different places in the
network stack code.  For example, before sending SYN and SYN-ACKs to set
an appropriate timeout, when the connection is established to set congestion
control, etc. As a result it has "op" field to specify the type of operation
requested.

This patch set also includes sample BPF programs to demonstrate the different
features.

v2: Formatting changes, rebased to latest net-next

v3: Fixed build issues, changed socket_ops to sock_ops throughout,
fixed formatting issues, removed the syscall to load sock_ops
program and added functionality to use existing bpf attach and
bpf detach system calls, removed reader/writer locks in
sock_bpfops.c (used when saving sock_ops global program)
and fixed missing module refcount increment.

v4: Removed global sock_ops program and instead used existing cgroup bpf
infrastructure to support a new BPF_CGROUP_ATTACH type.

Consists of the following patches:


 include/linux/bpf-cgroup.h |  18 
 include/linux/bpf_types.h  |   1 +
 include/linux/filter.h |  10 ++
 include/net/tcp.h  |  67 +++-
 include/uapi/linux/bpf.h   |  66 +++-
 kernel/bpf/cgroup.c|  37 +++
 kernel/bpf/syscall.c   |   5 +
 net/core/filter.c  | 271 
+++
 net/ipv4/tcp.c |   2 +-
 net/ipv4/tcp_cong.c|  32 --
 net/ipv4/tcp_fastopen.c|   1 +
 net/ipv4/tcp_input.c   |  10 +-
 net/ipv4/tcp_minisocks.c   |   9 +-
 net/ipv4/tcp_output.c  |  18 +++-
 samples/bpf/Makefile   |   9 ++
 samples/bpf/bpf_helpers.h  |   3 +
 samples/bpf/bpf_load.c |  13 ++-
 samples/bpf/load_sock_ops.c|  97 +
 samples/bpf/tcp_bufs_kern.c|  77 ++
 samples/bpf/tcp_clamp_kern.c   |  94 
 samples/bpf/tcp_cong_kern.c|  74 +
 samples/bpf/tcp_iw_kern.c  |  79 ++
 samples/bpf/tcp_rwnd_kern.c|  61 +++
 samples/bpf/tcp_synrto_kern.c  |  60 +++
 tools/include/uapi/linux/bpf.h |  66 +++-
 25 files changed, 1154 insertions(+), 26 deletions(-)



[PATCH net-next v4 03/16] bpf: Support for per connection SYN/SYN-ACK RTOs

2017-06-28 Thread Lawrence Brakmo
This patch adds support for setting a per connection SYN and
SYN_ACK RTOs from within a BPF_SOCK_OPS program. For example,
to set small RTOs when it is known both hosts are within a
datacenter.

Signed-off-by: Lawrence Brakmo 
---
 include/net/tcp.h| 11 +++
 include/uapi/linux/bpf.h |  3 +++
 net/ipv4/tcp_input.c |  3 ++-
 net/ipv4/tcp_output.c|  2 +-
 4 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 804c27a..cd9ef63 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2058,4 +2058,15 @@ static inline int tcp_call_bpf(struct sock *sk, bool 
is_req_sock, int op)
 }
 #endif
 
+static inline u32 tcp_timeout_init(struct sock *sk, bool is_req_sock)
+{
+   int timeout;
+
+   timeout = tcp_call_bpf(sk, is_req_sock, BPF_SOCK_OPS_TIMEOUT_INIT);
+
+   if (timeout <= 0)
+   timeout = TCP_TIMEOUT_INIT;
+   return timeout;
+}
+
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 617fb66..4174668 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -746,6 +746,9 @@ struct bpf_sock_ops {
  */
 enum {
BPF_SOCK_OPS_VOID,
+   BPF_SOCK_OPS_TIMEOUT_INIT,  /* Should return SYN-RTO value to use or
+* -1 if default value should be used
+*/
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2ab7e2f..0867b05 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6406,7 +6406,8 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
} else {
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
-   inet_csk_reqsk_queue_hash_add(sk, req, 
TCP_TIMEOUT_INIT);
+   inet_csk_reqsk_queue_hash_add(sk, req,
+   tcp_timeout_init((struct sock *)req, true));
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
   TCP_SYNACK_COOKIE);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9a9c395..5e478a1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3327,7 +3327,7 @@ static void tcp_connect_init(struct sock *sk)
tp->rcv_wup = tp->rcv_nxt;
tp->copied_seq = tp->rcv_nxt;
 
-   inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+   inet_csk(sk)->icsk_rto = tcp_timeout_init(sk, false);
inet_csk(sk)->icsk_retransmits = 0;
tcp_clear_retrans(tp);
 }
-- 
2.9.3



[PATCH net-next v4 10/16] bpf: Add support for changing congestion control

2017-06-28 Thread Lawrence Brakmo
Added support for changing congestion control for SOCK_OPS bpf
programs through the setsockopt bpf helper function. It also adds
a new SOCK_OPS op, BPF_SOCK_OPS_NEEDS_ECN, that is needed for
congestion controls, like dctcp, that need to enable ECN in the
SYN packets.

Signed-off-by: Lawrence Brakmo 
---
 include/net/tcp.h|  9 -
 include/uapi/linux/bpf.h |  3 +++
 net/core/filter.c| 11 +--
 net/ipv4/tcp.c   |  2 +-
 net/ipv4/tcp_cong.c  | 32 ++--
 net/ipv4/tcp_input.c |  3 ++-
 net/ipv4/tcp_output.c|  8 +---
 7 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index af404aa..4faa8d1 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1004,7 +1004,9 @@ void tcp_get_default_congestion_control(char *name);
 void tcp_get_available_congestion_control(char *buf, size_t len);
 void tcp_get_allowed_congestion_control(char *buf, size_t len);
 int tcp_set_allowed_congestion_control(char *allowed);
-int tcp_set_congestion_control(struct sock *sk, const char *name);
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
+void tcp_reinit_congestion_control(struct sock *sk,
+  const struct tcp_congestion_ops *ca);
 u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
 void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
 
@@ -2079,4 +2081,9 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool 
is_req_sock)
rwnd = 0;
return rwnd;
 }
+
+static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
+{
+   return (tcp_call_bpf(sk, true, BPF_SOCK_OPS_NEEDS_ECN) == 1);
+}
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 5b7207d..77d05ff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -776,6 +776,9 @@ enum {
 * passive connection is
 * established
 */
+   BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
+* needs ECN
+*/
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 167eca0..b36ec83 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2717,8 +2717,15 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
}
} else if (level == SOL_TCP &&
   sk->sk_prot->setsockopt == tcp_setsockopt) {
-   /* Place holder */
-   ret = -EINVAL;
+   if (optname == TCP_CONGESTION) {
+   ret = tcp_set_congestion_control(sk, optval, false);
+   if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
+   /* replacing an existing ca */
+   tcp_reinit_congestion_control(sk,
+   inet_csk(sk)->icsk_ca_ops);
+   } else {
+   ret = -EINVAL;
+   }
} else {
ret = -EINVAL;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4c88d20..5199952 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2479,7 +2479,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
name[val] = 0;
 
lock_sock(sk);
-   err = tcp_set_congestion_control(sk, name);
+   err = tcp_set_congestion_control(sk, name, true);
release_sock(sk);
return err;
}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 324c9bc..fde983f 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk)
INET_ECN_dontxmit(sk);
 }
 
-static void tcp_reinit_congestion_control(struct sock *sk,
- const struct tcp_congestion_ops *ca)
+void tcp_reinit_congestion_control(struct sock *sk,
+  const struct tcp_congestion_ops *ca)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
 
@@ -333,8 +333,12 @@ int tcp_set_allowed_congestion_control(char *val)
return ret;
 }
 
-/* Change congestion control for socket */
-int tcp_set_congestion_control(struct sock *sk, const char *name)
+/* Change congestion control for socket. If load is false, then it is the
+ * responsibility of the caller to call tcp_init_congestion_control or
+ * tcp_reinit_congestion_control (if the current congestion control was
+ * already initialized.
+ */
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
 {
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca;
@@ -344,21 +348,29 @@ int tcp_set_congestion_control(str

[PATCH net-next v4 11/16] bpf: Sample BPF program to set congestion control

2017-06-28 Thread Lawrence Brakmo
Sample BPF program that sets congestion control to dctcp when both hosts
are within the same datacenter. In this example that is assumed to be
when the first 5.5 bytes of their IPv6 addresses are the same.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile|  1 +
 samples/bpf/tcp_cong_kern.c | 74 +
 2 files changed, 75 insertions(+)
 create mode 100644 samples/bpf/tcp_cong_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 3b300db..6fdf32d 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -116,6 +116,7 @@ always += cookie_uid_helper_example.o
 always += tcp_synrto_kern.o
 always += tcp_rwnd_kern.o
 always += tcp_bufs_kern.o
+always += tcp_cong_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c
new file mode 100644
index 000..fdced0f
--- /dev/null
+++ b/samples/bpf/tcp_cong_kern.c
@@ -0,0 +1,74 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set congestion control to dctcp when both hosts are
+ * in the same datacenter (as deteremined by IPv6 prefix).
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_cong(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   char cong[] = "dctcp";
+   int rv = 0;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Check if both hosts are in the same datacenter. For this
+* example they are if the 1st 5.5 bytes in the IPv6 address
+* are the same.
+*/
+   if (skops->family == AF_INET6 &&
+   skops->local_ip6[0] == skops->remote_ip6[0] &&
+   (skops->local_ip6[1] & 0xfff0) ==
+   (skops->remote_ip6[1] & 0xfff0)) {
+   switch (op) {
+   case BPF_SOCK_OPS_NEEDS_ECN:
+   rv = 1;
+   break;
+   case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+   rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+   cong, sizeof(cong));
+   break;
+   case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+   rv = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
+   cong, sizeof(cong));
+   break;
+   default:
+   rv = -1;
+   }
+   } else {
+   rv = -1;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   skops->reply = rv;
+   return 1;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3



[PATCH net-next v4 02/16] bpf: program to load and attach sock_ops BPF progs

2017-06-28 Thread Lawrence Brakmo
The program load_sock_ops can be used to load sock_ops bpf programs and
to attach it to an existing (v2) cgroup. It can also be used to detach
sock_ops programs.

Examples:
load_sock_ops [-l]  
Load and attaches a sock_ops program at the specified cgroup.
If "-l" is used, the program will continue to run to output the
BPF log buffer.
If the specified filename does not end in ".o", it appends
"_kern.o" to the name.

load_sock_ops -r 
Detaches the currently attached sock_ops program from the
specified cgroup.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile|  3 ++
 samples/bpf/load_sock_ops.c | 97 +
 2 files changed, 100 insertions(+)
 create mode 100644 samples/bpf/load_sock_ops.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index e7ec9b8..015589b 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -36,6 +36,7 @@ hostprogs-y += lwt_len_hist
 hostprogs-y += xdp_tx_iptunnel
 hostprogs-y += test_map_in_map
 hostprogs-y += per_socket_stats_example
+hostprogs-y += load_sock_ops
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o
@@ -52,6 +53,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
 lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
@@ -130,6 +132,7 @@ HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
+HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c
new file mode 100644
index 000..91aa00d
--- /dev/null
+++ b/samples/bpf/load_sock_ops.c
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+#include 
+#include 
+#include 
+#include 
+
+static void usage(char *pname)
+{
+   printf("USAGE:\n  %s [-l]  \n", pname);
+   printf("\tLoad and attach a sock_ops program to the specified "
+  "cgroup\n");
+   printf("\tIf \"-l\" is used, the program will continue to run\n");
+   printf("\tprinting the BPF log buffer\n");
+   printf("\tIf the specified filename does not end in \".o\", it\n");
+   printf("\tappends \"_kern.o\" to the name\n");
+   printf("\n");
+   printf("  %s -r \n", pname);
+   printf("\tDetaches the currently attached sock_ops program\n");
+   printf("\tfrom the specified cgroup\n");
+   printf("\n");
+   exit(0);
+}
+
+int main(int argc, char **argv)
+{
+   int logFlag = 0;
+   int error = 0;
+   char *cg_path;
+   char fn[500];
+   char *prog;
+   int cg_fd;
+
+   if (argc < 3)
+   usage(argv[0]);
+
+   if (!strcmp(argv[1], "-r")) {
+   cg_path = argv[2];
+   cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
+   error = bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+   if (error) {
+   printf("ERROR: bpf_prog_detach: %d (%s)\n",
+  error, strerror(errno));
+   return 1;
+   }
+   return 0;
+   } else if (!strcmp(argv[1], "-h")) {
+   usage(argv[0]);
+   } else if (!strcmp(argv[1], "-l")) {
+   logFlag = 1;
+   if (argc < 4)
+   usage(argv[0]);
+   }
+
+   prog = argv[argc - 1];
+   cg_path = argv[argc - 2];
+   if (strlen(prog) > 480) {
+   fprintf(stderr, "ERROR: program name too long (> 480 chars)\n");
+   exit(2);
+   }
+   cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
+
+   if (!strcmp(prog + strlen(prog)-2, ".o"))
+   strcpy(fn, prog);
+   else
+   sprintf(fn, "%s_kern.o", prog);
+   if (logFlag)
+   printf("loading bpf file:%s\n", fn);
+   if (load_bpf_file(fn)) {
+   printf("ERROR: load_bpf_file failed for: %s\n", fn);
+   printf("%s", bpf_log_buf);
+   return 1;
+   }
+   if (logFlag)
+   printf("TCP BPF Loaded %s\n", fn);
+
+   error = bpf_prog_attach(prog_fd[0], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+   if (error) {
+   printf("ERROR: bpf_prog_attach

[PATCH net-next v4 05/16] bpf: Support for setting initial receive window

2017-06-28 Thread Lawrence Brakmo
This patch adds support for setting the initial advertised window from
within a BPF_SOCK_OPS program. This can be used to support larger
initial cwnd values in environments where it is known to be safe.

Signed-off-by: Lawrence Brakmo 
---
 include/net/tcp.h| 10 ++
 include/uapi/linux/bpf.h |  4 
 net/ipv4/tcp_minisocks.c |  9 -
 net/ipv4/tcp_output.c|  7 ++-
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index cd9ef63..af404aa 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2069,4 +2069,14 @@ static inline u32 tcp_timeout_init(struct sock *sk, bool 
is_req_sock)
return timeout;
 }
 
+static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool is_req_sock)
+{
+   int rwnd;
+
+   rwnd = tcp_call_bpf(sk, is_req_sock, BPF_SOCK_OPS_RWND_INIT);
+
+   if (rwnd < 0)
+   rwnd = 0;
+   return rwnd;
+}
 #endif /* _TCP_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4174668..cdec348 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -749,6 +749,10 @@ enum {
BPF_SOCK_OPS_TIMEOUT_INIT,  /* Should return SYN-RTO value to use or
 * -1 if default value should be used
 */
+   BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized
+* window (in packets) or -1 if default
+* value should be used
+*/
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d30ee31..bbaf3c6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -351,6 +351,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
int full_space = tcp_full_space(sk_listener);
u32 window_clamp;
__u8 rcv_wscale;
+   u32 rcv_wnd;
int mss;
 
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -363,6 +364,12 @@ void tcp_openreq_init_rwin(struct request_sock *req,
(req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
req->rsk_window_clamp = full_space;
 
+   rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req, true);
+   if (rcv_wnd == 0)
+   rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+   else if (full_space < rcv_wnd * mss)
+   full_space = rcv_wnd * mss;
+
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -370,7 +377,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
&req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
-   dst_metric(dst, RTAX_INITRWND));
+   rcv_wnd);
ireq->rcv_wscale = rcv_wscale;
 }
 EXPORT_SYMBOL(tcp_openreq_init_rwin);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 5e478a1..e5f623f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3267,6 +3267,7 @@ static void tcp_connect_init(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+   u32 rcv_wnd;
 
/* We'll fix this up when we get a response from the other end.
 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3300,13 +3301,17 @@ static void tcp_connect_init(struct sock *sk)
(tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
tp->window_clamp = tcp_full_space(sk);
 
+   rcv_wnd = tcp_rwnd_init_bpf(sk, false);
+   if (rcv_wnd == 0)
+   rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+
tcp_select_initial_window(tcp_full_space(sk),
  tp->advmss - (tp->rx_opt.ts_recent_stamp ? 
tp->tcp_header_len - sizeof(struct tcphdr) : 0),
  &tp->rcv_wnd,
  &tp->window_clamp,
  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
  &rcv_wscale,
- dst_metric(dst, RTAX_INITRWND));
+ rcv_wnd);
 
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
-- 
2.9.3



[PATCH net-next v4 04/16] bpf: Sample bpf program to set SYN/SYN-ACK RTOs

2017-06-28 Thread Lawrence Brakmo
The sample BPF program, tcp_synrto_kern.c, sets the SYN and SYN-ACK
RTOs to 10ms when both hosts are within the same datacenter (i.e.
small RTTs) in an environment where common IPv6 prefixes indicate
both hosts are in the same data center.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile  |  1 +
 samples/bpf/tcp_synrto_kern.c | 60 +++
 2 files changed, 61 insertions(+)
 create mode 100644 samples/bpf/tcp_synrto_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 015589b..e29370a 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -113,6 +113,7 @@ always += lwt_len_hist_kern.o
 always += xdp_tx_iptunnel_kern.o
 always += test_map_in_map_kern.o
 always += cookie_uid_helper_example.o
+always += tcp_synrto_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c
new file mode 100644
index 000..b16ac39
--- /dev/null
+++ b/samples/bpf/tcp_synrto_kern.c
@@ -0,0 +1,60 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set SYN and SYN-ACK RTOs to 10ms when using IPv6 addresses
+ * and the first 5.5 bytes of the IPv6 addresses are the same (in this example
+ * that means both hosts are in the same datacenter.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_synrto(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int rv = -1;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Check for TIMEOUT_INIT operation and IPv6 addresses */
+   if (op == BPF_SOCK_OPS_TIMEOUT_INIT &&
+   skops->family == AF_INET6) {
+
+   /* If the first 5.5 bytes of the IPv6 address are the same
+* then both hosts are in the same datacenter
+* so use an RTO of 10ms
+*/
+   if (skops->local_ip6[0] == skops->remote_ip6[0] &&
+   (skops->local_ip6[1] & 0xfff0) ==
+   (skops->remote_ip6[1] & 0xfff0))
+   rv = 10;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   skops->reply = rv;
+   return 1;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3



[PATCH net-next v4 09/16] bpf: Sample BPF program to set buffer sizes

2017-06-28 Thread Lawrence Brakmo
This patch contains a BPF program to set initial receive window to
40 packets and send and receive buffers to 1.5MB. This would usually
be done after doing appropriate checks that indicate the hosts are
far enough away (i.e. large RTT).

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile|  1 +
 samples/bpf/tcp_bufs_kern.c | 77 +
 2 files changed, 78 insertions(+)
 create mode 100644 samples/bpf/tcp_bufs_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ca95528..3b300db 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -115,6 +115,7 @@ always += test_map_in_map_kern.o
 always += cookie_uid_helper_example.o
 always += tcp_synrto_kern.o
 always += tcp_rwnd_kern.o
+always += tcp_bufs_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c
new file mode 100644
index 000..ccd3bbe
--- /dev/null
+++ b/samples/bpf/tcp_bufs_kern.c
@@ -0,0 +1,77 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial receive window to 40 packets and send
+ * and receive buffers to 1.5MB. This would usually be done after
+ * doing appropriate checks that indicate the hosts are far enough
+ * away (i.e. large RTT).
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_bufs(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int bufsize = 150;
+   int rwnd_init = 40;
+   int rv = 0;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Usually there would be a check to insure the hosts are far
+* from each other so it makes sense to increase buffer sizes
+*/
+   switch (op) {
+   case BPF_SOCK_OPS_RWND_INIT:
+   rv = rwnd_init;
+   break;
+   case BPF_SOCK_OPS_TCP_CONNECT_CB:
+   /* Set sndbuf and rcvbuf of active connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+   sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+&bufsize, sizeof(bufsize));
+   break;
+   case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+   /* Nothing to do */
+   break;
+   case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+   /* Set sndbuf and rcvbuf of passive connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+   sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+&bufsize, sizeof(bufsize));
+   break;
+   default:
+   rv = -1;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   skops->reply = rv;
+   return 1;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3



[PATCH net-next v4 08/16] bpf: Add TCP connection BPF callbacks

2017-06-28 Thread Lawrence Brakmo
Added callbacks to BPF SOCK_OPS type program before an active
connection is initialized and after a passive or active connection is
established.

The following patch demonstrates how they can be used to set send and
receive buffer sizes.

Signed-off-by: Lawrence Brakmo 
---
 include/uapi/linux/bpf.h | 11 +++
 net/ipv4/tcp_fastopen.c  |  1 +
 net/ipv4/tcp_input.c |  4 +++-
 net/ipv4/tcp_output.c|  1 +
 4 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2dbae9e..5b7207d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -765,6 +765,17 @@ enum {
 * window (in packets) or -1 if default
 * value should be used
 */
+   BPF_SOCK_OPS_TCP_CONNECT_CB,/* Calls BPF program right before an
+* active connection is initialized
+*/
+   BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an
+* active connection is
+* established
+*/
+   BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,/* Calls BPF program when a
+* passive connection is
+* established
+*/
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 4af82b9..ed6b549 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -221,6 +221,7 @@ static struct sock *tcp_fastopen_create_child(struct sock 
*sk,
tcp_init_congestion_control(child);
tcp_mtup_init(child);
tcp_init_metrics(child);
+   tcp_call_bpf(child, false, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tcp_init_buffer_space(child);
 
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0867b05..1b868ae 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5571,7 +5571,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff 
*skb)
icsk->icsk_af_ops->rebuild_header(sk);
 
tcp_init_metrics(sk);
-
+   tcp_call_bpf(sk, false, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
tcp_init_congestion_control(sk);
 
/* Prevent spurious tcp_cwnd_restart() on first data
@@ -5977,6 +5977,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff 
*skb)
} else {
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
+   tcp_call_bpf(sk, false,
+BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
tcp_init_congestion_control(sk);
 
tcp_mtup_init(sk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e5f623f..958edc8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3445,6 +3445,7 @@ int tcp_connect(struct sock *sk)
struct sk_buff *buff;
int err;
 
+   tcp_call_bpf(sk, false, BPF_SOCK_OPS_TCP_CONNECT_CB);
tcp_connect_init(sk);
 
if (unlikely(tp->repair)) {
-- 
2.9.3



[PATCH net-next v4 07/16] bpf: Add setsockopt helper function to bpf

2017-06-28 Thread Lawrence Brakmo
Added support for calling a subset of socket setsockopts from
BPF_PROG_TYPE_SOCK_OPS programs. The code was duplicated rather
than making the changes to call the socket setsockopt function because
the changes required would have been larger.

The ops supported are:
  SO_RCVBUF
  SO_SNDBUF
  SO_MAX_PACING_RATE
  SO_PRIORITY
  SO_RCVLOWAT
  SO_MARK

Signed-off-by: Lawrence Brakmo 
---
 include/uapi/linux/bpf.h  | 14 -
 net/core/filter.c | 77 ++-
 samples/bpf/bpf_helpers.h |  3 ++
 3 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cdec348..2dbae9e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -520,6 +520,17 @@ union bpf_attr {
  * Set full skb->hash.
  * @skb: pointer to skb
  * @hash: hash to set
+ *
+ * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
+ * Calls setsockopt. Not all opts are available, only those with
+ * integer optvals plus TCP_CONGESTION.
+ * Supported levels: SOL_SOCKET and IPROTO_TCP
+ * @bpf_socket: pointer to bpf_socket
+ * @level: SOL_SOCKET or IPROTO_TCP
+ * @optname: option name
+ * @optval: pointer to option value
+ * @optlen: length of optval in byes
+ * Return: 0 or negative error
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -570,7 +581,8 @@ union bpf_attr {
FN(probe_read_str), \
FN(get_socket_cookie),  \
FN(get_socket_uid), \
-   FN(set_hash),
+   FN(set_hash),   \
+   FN(setsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index bb54832..167eca0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -54,6 +54,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * sk_filter_trim_cap - run a packet through a socket filter
@@ -2672,6 +2673,69 @@ static const struct bpf_func_proto 
bpf_get_socket_uid_proto = {
.arg1_type  = ARG_PTR_TO_CTX,
 };
 
+BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+  int, level, int, optname, char *, optval, int, optlen)
+{
+   struct sock *sk = bpf_sock->sk;
+   int ret = 0;
+   int val;
+
+   if (bpf_sock->is_req_sock)
+   return -EINVAL;
+
+   if (level == SOL_SOCKET) {
+   /* Only some socketops are supported */
+   val = *((int *)optval);
+
+   switch (optname) {
+   case SO_RCVBUF:
+   sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+   sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+   break;
+   case SO_SNDBUF:
+   sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+   sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+   break;
+   case SO_MAX_PACING_RATE:
+   sk->sk_max_pacing_rate = val;
+   sk->sk_pacing_rate = min(sk->sk_pacing_rate,
+sk->sk_max_pacing_rate);
+   break;
+   case SO_PRIORITY:
+   sk->sk_priority = val;
+   break;
+   case SO_RCVLOWAT:
+   if (val < 0)
+   val = INT_MAX;
+   sk->sk_rcvlowat = val ? : 1;
+   break;
+   case SO_MARK:
+   sk->sk_mark = val;
+   break;
+   default:
+   ret = -EINVAL;
+   }
+   } else if (level == SOL_TCP &&
+  sk->sk_prot->setsockopt == tcp_setsockopt) {
+   /* Place holder */
+   ret = -EINVAL;
+   } else {
+   ret = -EINVAL;
+   }
+   return ret;
+}
+
+static const struct bpf_func_proto bpf_setsockopt_proto = {
+   .func   = bpf_setsockopt,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+   .arg2_type  = ARG_ANYTHING,
+   .arg3_type  = ARG_ANYTHING,
+   .arg4_type  = ARG_PTR_TO_MEM,
+   .arg5_type  = ARG_CONST_SIZE_OR_ZERO,
+};
+
 static const struct bpf_func_proto *
 bpf_base_func_proto(enum bpf_func_id func_id)
 {
@@ -2823,6 +2887,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
+   sock_ops_func_proto(enum bpf_func_id func_id)
+{
+   switch (func_id) {
+   case BPF_FUNC_setsockopt:
+   return &bpf_setsockopt_proto;
+   default:
+   return bpf_base_func_proto(func_id);
+   }
+}
+
+static const struct bpf_func_proto *
 lwt_xmit_func_proto(enum bpf_f

[PATCH net-next v4 12/16] bpf: Adds support for setting initial cwnd

2017-06-28 Thread Lawrence Brakmo
Adds a new bpf_setsockopt for TCP sockets, TCP_BPF_IW, which sets the
initial congestion window. This can be used when the hosts are far
apart (large RTTs) and it is safe to start with a large initial cwnd.

Signed-off-by: Lawrence Brakmo 
---
 include/uapi/linux/bpf.h |  2 ++
 net/core/filter.c| 14 +-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77d05ff..0d9ff6d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -781,4 +781,6 @@ enum {
 */
 };
 
+#define TCP_BPF_IW 1001/* Set TCP initial congestion window */
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index b36ec83..147b637 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2724,7 +2724,19 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
tcp_reinit_congestion_control(sk,
inet_csk(sk)->icsk_ca_ops);
} else {
-   ret = -EINVAL;
+   struct tcp_sock *tp = tcp_sk(sk);
+
+   val = *((int *)optval);
+   switch (optname) {
+   case TCP_BPF_IW:
+   if (val <= 0 || tp->data_segs_out > 0)
+   ret = -EINVAL;
+   else
+   tp->snd_cwnd = val;
+   break;
+   default:
+   ret = -EINVAL;
+   }
}
} else {
ret = -EINVAL;
-- 
2.9.3



[PATCH net-next v4 14/16] bpf: Adds support for setting sndcwnd clamp

2017-06-28 Thread Lawrence Brakmo
Adds a new bpf_setsockopt for TCP sockets, TCP_BPF_SNDCWND_CLAMP, which
sets the maximum congestion window (snd_cwnd_clamp). It is useful to
limit the sndcwnd when the hosts are close to each other (small RTT).

Signed-off-by: Lawrence Brakmo 
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c| 7 +++
 2 files changed, 8 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0d9ff6d..284b366 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -782,5 +782,6 @@ enum {
 };
 
 #define TCP_BPF_IW 1001/* Set TCP initial congestion window */
+#define TCP_BPF_SNDCWND_CLAMP  1002/* Set sndcwnd_clamp */
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index 147b637..516353e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2734,6 +2734,13 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, 
bpf_sock,
else
tp->snd_cwnd = val;
break;
+   case TCP_BPF_SNDCWND_CLAMP:
+   if (val <= 0) {
+   ret = -EINVAL;
+   } else {
+   tp->snd_cwnd_clamp = val;
+   tp->snd_ssthresh = val;
+   }
default:
ret = -EINVAL;
}
-- 
2.9.3



[PATCH net-next v4 15/16] bpf: Sample bpf program to set sndcwnd clamp

2017-06-28 Thread Lawrence Brakmo
Sample BPF program, tcp_clamp_kern.c, to demonstrate the use
of setting the sndcwnd clamp. This program assumes that if the
first 5.5 bytes of the host's IPv6 addresses are the same, then
the hosts are in the same datacenter and sets sndcwnd clamp to
100 packets, SYN and SYN-ACK RTOs to 10ms and send/receive buffer
sizes to 150KB.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile |  1 +
 samples/bpf/tcp_clamp_kern.c | 94 
 2 files changed, 95 insertions(+)
 create mode 100644 samples/bpf/tcp_clamp_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 242d76e..9c65058 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -118,6 +118,7 @@ always += tcp_rwnd_kern.o
 always += tcp_bufs_kern.o
 always += tcp_cong_kern.o
 always += tcp_iw_kern.o
+always += tcp_clamp_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c
new file mode 100644
index 000..07e334e
--- /dev/null
+++ b/samples/bpf/tcp_clamp_kern.c
@@ -0,0 +1,94 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample BPF program to set send and receive buffers to 150KB, sndcwnd clamp
+ * to 100 packets and SYN and SYN_ACK RTOs to 10ms when both hosts are within
+ * the same datacenter. For his example, we assume they are within the same
+ * datacenter when the first 5.5 bytes of their IPv6 addresses are the same.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_clamp(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int bufsize = 15;
+   int to_init = 10;
+   int clamp = 100;
+   int rv = 0;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port numberis 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Check that both hosts are within same datacenter. For this example
+* it is the case when the first 5.5 bytes of their IPv6 addresses are
+* the same.
+*/
+   if (skops->family == AF_INET6 &&
+   skops->local_ip6[0] == skops->remote_ip6[0] &&
+   (skops->local_ip6[1] & 0xfff0) ==
+   (skops->remote_ip6[1] & 0xfff0)) {
+   switch (op) {
+   case BPF_SOCK_OPS_TIMEOUT_INIT:
+   rv = to_init;
+   break;
+   case BPF_SOCK_OPS_TCP_CONNECT_CB:
+   /* Set sndbuf and rcvbuf of active connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF,
+   &bufsize, sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, &bufsize,
+ sizeof(bufsize));
+   break;
+   case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+   rv = bpf_setsockopt(skops, SOL_TCP,
+   TCP_BPF_SNDCWND_CLAMP,
+   &clamp, sizeof(clamp));
+   break;
+   case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+   /* Set sndbuf and rcvbuf of passive connections */
+   rv = bpf_setsockopt(skops, SOL_TCP,
+   TCP_BPF_SNDCWND_CLAMP,
+   &clamp, sizeof(clamp));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, &bufsize,
+ sizeof(bufsize));
+   break;
+   default:
+   rv = -1;
+   }
+   } else {
+   rv = -1;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   skops->reply = rv;
+   return 1;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3



[PATCH net-next v4 13/16] bpf: Sample BPF program to set initial cwnd

2017-06-28 Thread Lawrence Brakmo
Sample BPF program that assumes hosts are far away (i.e. large RTTs)
and sets initial cwnd and initial receive window to 40 packets,
send and receive buffers to 1.5MB.

In practice there would be a test to insure the hosts are actually
far enough away.

Signed-off-by: Lawrence Brakmo 
---
 samples/bpf/Makefile  |  1 +
 samples/bpf/tcp_iw_kern.c | 79 +++
 2 files changed, 80 insertions(+)
 create mode 100644 samples/bpf/tcp_iw_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 6fdf32d..242d76e 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -117,6 +117,7 @@ always += tcp_synrto_kern.o
 always += tcp_rwnd_kern.o
 always += tcp_bufs_kern.o
 always += tcp_cong_kern.o
+always += tcp_iw_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c
new file mode 100644
index 000..28626f9
--- /dev/null
+++ b/samples/bpf/tcp_iw_kern.c
@@ -0,0 +1,79 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set initial congestion window and initial receive
+ * window to 40 packets and send and receive buffers to 1.5MB. This
+ * would usually be done after doing appropriate checks that indicate
+ * the hosts are far enough away (i.e. large RTT).
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+#define DEBUG 1
+
+SEC("sockops")
+int bpf_iw(struct bpf_sock_ops *skops)
+{
+   char fmt1[] = "BPF command: %d\n";
+   char fmt2[] = "  Returning %d\n";
+   int bufsize = 1500000;
+   int rwnd_init = 40;
+   int iw = 40;
+   int rv = 0;
+   int op;
+
+   /* For testing purposes, only execute rest of BPF program
+* if neither port number is 55601
+*/
+   if (skops->remote_port != 55601 && skops->local_port != 55601)
+   return -1;
+
+   op = (int) skops->op;
+
+#ifdef DEBUG
+   bpf_trace_printk(fmt1, sizeof(fmt1), op);
+#endif
+
+   /* Usually there would be a check to ensure the hosts are far
+* from each other so it makes sense to increase buffer sizes
+*/
+   switch (op) {
+   case BPF_SOCK_OPS_RWND_INIT:
+   rv = rwnd_init;
+   break;
+   case BPF_SOCK_OPS_TCP_CONNECT_CB:
+   /* Set sndbuf and rcvbuf of active connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+   sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+&bufsize, sizeof(bufsize));
+   break;
+   case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+   rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw,
+   sizeof(iw));
+   break;
+   case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+   /* Set sndbuf and rcvbuf of passive connections */
+   rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
+   sizeof(bufsize));
+   rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+&bufsize, sizeof(bufsize));
+   break;
+   default:
+   rv = -1;
+   }
+#ifdef DEBUG
+   bpf_trace_printk(fmt2, sizeof(fmt2), rv);
+#endif
+   skops->reply = rv;
+   return 1;
+}
+char _license[] SEC("license") = "GPL";
-- 
2.9.3



[PATCH net-next v4 16/16] bpf: update tools/include/uapi/linux/bpf.h

2017-06-28 Thread Lawrence Brakmo
Update tools/include/uapi/linux/bpf.h to include changes related to new
bpf sock_ops program type.

Signed-off-by: Lawrence Brakmo 
---
 tools/include/uapi/linux/bpf.h | 66 +-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f94b48b..284b366 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -120,12 +120,14 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LWT_IN,
BPF_PROG_TYPE_LWT_OUT,
BPF_PROG_TYPE_LWT_XMIT,
+   BPF_PROG_TYPE_SOCK_OPS,
 };
 
 enum bpf_attach_type {
BPF_CGROUP_INET_INGRESS,
BPF_CGROUP_INET_EGRESS,
BPF_CGROUP_INET_SOCK_CREATE,
+   BPF_CGROUP_SOCK_OPS,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -518,6 +520,17 @@ union bpf_attr {
  * Set full skb->hash.
  * @skb: pointer to skb
  * @hash: hash to set
+ *
+ * int bpf_setsockopt(bpf_socket, level, optname, optval, optlen)
+ * Calls setsockopt. Not all opts are available, only those with
+ * integer optvals plus TCP_CONGESTION.
+ * Supported levels: SOL_SOCKET and IPPROTO_TCP
+ * @bpf_socket: pointer to bpf_socket
+ * @level: SOL_SOCKET or IPPROTO_TCP
+ * @optname: option name
+ * @optval: pointer to option value
+ * @optlen: length of optval in bytes
+ * Return: 0 or negative error
+ */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec), \
@@ -568,7 +581,8 @@ union bpf_attr {
FN(probe_read_str), \
FN(get_socket_cookie),  \
FN(get_socket_uid), \
-   FN(set_hash),
+   FN(set_hash),   \
+   FN(setsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -720,4 +734,54 @@ struct bpf_map_info {
__u32 map_flags;
 } __attribute__((aligned(8)));
 
+/* User bpf_sock_ops struct to access socket values and specify request ops
+ * and their replies.
+ * New fields can only be added at the end of this structure
+ */
+struct bpf_sock_ops {
+   __u32 op;
+   union {
+   __u32 reply;
+   __u32 replylong[4];
+   };
+   __u32 family;
+   __u32 remote_ip4;
+   __u32 local_ip4;
+   __u32 remote_ip6[4];
+   __u32 local_ip6[4];
+   __u32 remote_port;
+   __u32 local_port;
+};
+
+/* List of known BPF sock_ops operators.
+ * New entries can only be added at the end
+ */
+enum {
+   BPF_SOCK_OPS_VOID,
+   BPF_SOCK_OPS_TIMEOUT_INIT,  /* Should return SYN-RTO value to use or
+* -1 if default value should be used
+*/
+   BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertised
+* window (in packets) or -1 if default
+* value should be used
+*/
+   BPF_SOCK_OPS_TCP_CONNECT_CB,/* Calls BPF program right before an
+* active connection is initialized
+*/
+   BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an
+* active connection is
+* established
+*/
+   BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,/* Calls BPF program when a
+* passive connection is
+* established
+*/
+   BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control
+* needs ECN
+*/
+};
+
+#define TCP_BPF_IW 1001/* Set TCP initial congestion window */
+#define TCP_BPF_SNDCWND_CLAMP  1002/* Set sndcwnd_clamp */
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
2.9.3



[PATCH net-next] bpf: Fix out-of-bound access on interpreters[]

2017-06-28 Thread Martin KaFai Lau
The index is off-by-one when fp->aux->stack_depth
has already been rounded up to 32.  In particular,
if stack_depth is 512, the index will be 16.

The fix is to round_up and then takes -1 instead of round_down.

[   22.318680] 
==
[   22.319745] BUG: KASAN: global-out-of-bounds in 
bpf_prog_select_runtime+0x48a/0x670
[   22.320737] Read of size 8 at addr 82aadae0 by task sockex3/1946
[   22.321646]
[   22.321858] CPU: 1 PID: 1946 Comm: sockex3 Tainted: GW   
4.12.0-rc6-01680-g2ee87db3a287 #22
[   22.323061] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.9.3-1.el7.centos 04/01/2014
[   22.324260] Call Trace:
[   22.324612]  dump_stack+0x67/0x99
[   22.325081]  print_address_description+0x1e8/0x290
[   22.325734]  ? bpf_prog_select_runtime+0x48a/0x670
[   22.326360]  kasan_report+0x265/0x350
[   22.326860]  __asan_report_load8_noabort+0x19/0x20
[   22.327484]  bpf_prog_select_runtime+0x48a/0x670
[   22.328109]  bpf_prog_load+0x626/0xd40
[   22.328637]  ? __bpf_prog_charge+0xc0/0xc0
[   22.329222]  ? check_nnp_nosuid.isra.61+0x100/0x100
[   22.329890]  ? __might_fault+0xf6/0x1b0
[   22.330446]  ? lock_acquire+0x360/0x360
[   22.331013]  SyS_bpf+0x67c/0x24d0
[   22.331491]  ? trace_hardirqs_on+0xd/0x10
[   22.332049]  ? __getnstimeofday64+0xaf/0x1c0
[   22.332635]  ? bpf_prog_get+0x20/0x20
[   22.333135]  ? __audit_syscall_entry+0x300/0x600
[   22.333770]  ? syscall_trace_enter+0x540/0xdd0
[   22.334339]  ? exit_to_usermode_loop+0xe0/0xe0
[   22.334950]  ? do_syscall_64+0x48/0x410
[   22.335446]  ? bpf_prog_get+0x20/0x20
[   22.335954]  do_syscall_64+0x181/0x410
[   22.336454]  entry_SYSCALL64_slow_path+0x25/0x25
[   22.337121] RIP: 0033:0x7f263fe81f19
[   22.337618] RSP: 002b:7ffd9a3440c8 EFLAGS: 0202 ORIG_RAX: 
0141
[   22.338619] RAX: ffda RBX: 00aac5fb RCX: 7f263fe81f19
[   22.339600] RDX: 0030 RSI: 7ffd9a3440d0 RDI: 0005
[   22.340470] RBP: 00a9a1e0 R08: 00a9a1e0 R09: 009d0001
[   22.341430] R10:  R11: 0202 R12: 0001
[   22.342411] R13: 00a9a023 R14: 0001 R15: 0003
[   22.343369]
[   22.343593] The buggy address belongs to the variable:
[   22.344241]  interpreters+0x80/0x980
[   22.344708]
[   22.344908] Memory state around the buggy address:
[   22.345556]  82aad980: 00 00 00 04 fa fa fa fa 04 fa fa fa fa fa fa 
fa
[   22.346449]  82aada00: 00 00 00 00 00 fa fa fa fa fa fa fa 00 00 00 
00
[   22.347361] >82aada80: 00 00 00 00 00 00 00 00 00 00 00 00 fa fa fa 
fa
[   22.348301]^
[   22.349142]  82aadb00: 00 01 fa fa fa fa fa fa 00 00 00 00 00 00 00 
00
[   22.350058]  82aadb80: 00 00 07 fa fa fa fa fa 00 00 05 fa fa fa fa 
fa
[   22.350984] 
==

Fixes: b870aa901f4b ("bpf: use different interpreter depending on required 
stack size")
Signed-off-by: Martin KaFai Lau 
Acked-by: Alexei Starovoitov 
Acked-by: Daniel Borkmann 
---
 kernel/bpf/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 774069ca18a7..ad5f55922a13 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1297,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
-   fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32];
+   u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+
+   fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
 
/* eBPF JITs can rewrite the program in case constant
 * blinding is active. However, in case of error during
-- 
2.9.3



  1   2   >