[PATCH net-next v1 03/10] amd-xgbe: Perform priority-based hardware FIFO allocation

2016-11-03 Thread Tom Lendacky
Allocate the FIFO across the hardware Rx queues based on the priority
of the queues, giving more FIFO resources to queues with a higher
priority.  If PFC is active but not enabled for a queue, then fewer
resources can be allocated to the queue.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c |  547 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |   26 +
 2 files changed, 434 insertions(+), 139 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index 18f8001..f8fffea 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -123,6 +123,11 @@
 #include "xgbe.h"
 #include "xgbe-common.h"
 
+static inline unsigned int xgbe_get_max_frame(struct xgbe_prv_data *pdata)
+{
+   return pdata->netdev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN;
+}
+
 static unsigned int xgbe_usec_to_riwt(struct xgbe_prv_data *pdata,
  unsigned int usec)
 {
@@ -491,6 +496,27 @@ static void xgbe_config_rss(struct xgbe_prv_data *pdata)
   "error configuring RSS, RSS disabled\n");
 }
 
+static bool xgbe_is_pfc_queue(struct xgbe_prv_data *pdata,
+ unsigned int queue)
+{
+   unsigned int prio, tc;
+
+   for (prio = 0; prio < IEEE_8021QAZ_MAX_TCS; prio++) {
+   /* Does this queue handle the priority? */
+   if (pdata->prio2q_map[prio] != queue)
+   continue;
+
+   /* Get the Traffic Class for this priority */
+   tc = pdata->ets->prio_tc[prio];
+
+   /* Check if PFC is enabled for this traffic class */
+   if (pdata->pfc->pfc_en & (1 << tc))
+   return true;
+   }
+
+   return false;
+}
+
 static int xgbe_disable_tx_flow_control(struct xgbe_prv_data *pdata)
 {
unsigned int max_q_count, q_count;
@@ -528,27 +554,14 @@ static int xgbe_enable_tx_flow_control(struct 
xgbe_prv_data *pdata)
for (i = 0; i < pdata->rx_q_count; i++) {
unsigned int ehfc = 0;
 
-   if (pfc && ets) {
-   unsigned int prio;
-
-   for (prio = 0; prio < IEEE_8021QAZ_MAX_TCS; prio++) {
-   unsigned int tc;
-
-   /* Does this queue handle the priority? */
-   if (pdata->prio2q_map[prio] != i)
-   continue;
-
-   /* Get the Traffic Class for this priority */
-   tc = ets->prio_tc[prio];
-
-   /* Check if flow control should be enabled */
-   if (pfc->pfc_en & (1 << tc)) {
+   if (pdata->rx_rfd[i]) {
+   /* Flow control thresholds are established */
+   if (pfc && ets) {
+   if (xgbe_is_pfc_queue(pdata, i))
ehfc = 1;
-   break;
-   }
+   } else {
+   ehfc = 1;
}
-   } else {
-   ehfc = 1;
}
 
XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_RQOMR, EHFC, ehfc);
@@ -1327,106 +1340,6 @@ static int xgbe_config_tstamp(struct xgbe_prv_data 
*pdata,
return 0;
 }
 
-static void xgbe_config_tc(struct xgbe_prv_data *pdata)
-{
-   unsigned int offset, queue, prio;
-   u8 i;
-
-   netdev_reset_tc(pdata->netdev);
-   if (!pdata->num_tcs)
-   return;
-
-   netdev_set_num_tc(pdata->netdev, pdata->num_tcs);
-
-   for (i = 0, queue = 0, offset = 0; i < pdata->num_tcs; i++) {
-   while ((queue < pdata->tx_q_count) &&
-  (pdata->q2tc_map[queue] == i))
-   queue++;
-
-   netif_dbg(pdata, drv, pdata->netdev, "TC%u using TXq%u-%u\n",
- i, offset, queue - 1);
-   netdev_set_tc_queue(pdata->netdev, i, queue - offset, offset);
-   offset = queue;
-   }
-
-   if (!pdata->ets)
-   return;
-
-   for (prio = 0; prio < IEEE_8021QAZ_MAX_TCS; prio++)
-   netdev_set_prio_tc_map(pdata->netdev, prio,
-  pdata->ets->prio_tc[prio]);
-}
-
-static void xgbe_config_dcb_tc(struct xgbe_prv_data *pdata)
-{
-   struct ieee_ets *ets = pdata->ets;
-   unsigned int total_weight, min_weight, weight;
-   unsigned int mask, reg, reg_val;
-   unsigned int i, prio;
-
-   if (!ets)
-   return;
-
-   /* Set Tx to deficit weighted round robin scheduling algorithm (when
-* traffic class is using ETS algorithm)
-*/
-   XGMAC_IOWRITE_BITS(pdata, 

[PATCH net-next v1 06/10] amd-xgbe: Add support for clause 37 auto-negotiation

2016-11-03 Thread Tom Lendacky
Add support to be able to use clause 37 auto-negotiation.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h |   41 +
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |  242 ++-
 drivers/net/ethernet/amd/xgbe/xgbe.h|   11 +
 3 files changed, 286 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h 
b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index 695e982..8bcf4ef 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -1027,6 +1027,10 @@
 #define MDIO_PMA_10GBR_FECCTRL 0x00ab
 #endif
 
+#ifndef MDIO_PCS_DIG_CTRL
+#define MDIO_PCS_DIG_CTRL  0x8000
+#endif
+
 #ifndef MDIO_AN_XNP
 #define MDIO_AN_XNP0x0016
 #endif
@@ -1047,10 +1051,34 @@
 #define MDIO_AN_INT0x8002
 #endif
 
+#ifndef MDIO_VEND2_AN_ADVERTISE
+#define MDIO_VEND2_AN_ADVERTISE0x0004
+#endif
+
+#ifndef MDIO_VEND2_AN_LP_ABILITY
+#define MDIO_VEND2_AN_LP_ABILITY   0x0005
+#endif
+
+#ifndef MDIO_VEND2_AN_CTRL
+#define MDIO_VEND2_AN_CTRL 0x8001
+#endif
+
+#ifndef MDIO_VEND2_AN_STAT
+#define MDIO_VEND2_AN_STAT 0x8002
+#endif
+
 #ifndef MDIO_CTRL1_SPEED1G
 #define MDIO_CTRL1_SPEED1G (MDIO_CTRL1_SPEED10G & ~BMCR_SPEED100)
 #endif
 
+#ifndef MDIO_VEND2_CTRL1_AN_ENABLE
+#define MDIO_VEND2_CTRL1_AN_ENABLE BIT(12)
+#endif
+
+#ifndef MDIO_VEND2_CTRL1_AN_RESTART
+#define MDIO_VEND2_CTRL1_AN_RESTARTBIT(9)
+#endif
+
 /* MDIO mask values */
 #define XGBE_AN_CL73_INT_CMPLT BIT(0)
 #define XGBE_AN_CL73_INC_LINK  BIT(1)
@@ -1065,6 +1093,19 @@
 #define XGBE_KR_TRAINING_START BIT(0)
 #define XGBE_KR_TRAINING_ENABLEBIT(1)
 
+#define XGBE_PCS_CL37_BP   BIT(12)
+
+#define XGBE_AN_CL37_INT_CMPLT BIT(0)
+#define XGBE_AN_CL37_INT_MASK  0x01
+
+#define XGBE_AN_CL37_HD_MASK   0x40
+#define XGBE_AN_CL37_FD_MASK   0x20
+
+#define XGBE_AN_CL37_PCS_MODE_MASK 0x06
+#define XGBE_AN_CL37_PCS_MODE_BASEX0x00
+#define XGBE_AN_CL37_PCS_MODE_SGMII0x04
+#define XGBE_AN_CL37_TX_CONFIG_MASK0x08
+
 /* Bit setting and getting macros
  *  The get macro will extract the current bit field value from within
  *  the variable
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index d5bfbe4..723eb90 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -125,6 +125,41 @@
 #include "xgbe.h"
 #include "xgbe-common.h"
 
+static void xgbe_an37_clear_interrupts(struct xgbe_prv_data *pdata)
+{
+   int reg;
+
+   reg = XMDIO_READ(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_STAT);
+   reg &= ~XGBE_AN_CL37_INT_MASK;
+   XMDIO_WRITE(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_STAT, reg);
+}
+
+static void xgbe_an37_disable_interrupts(struct xgbe_prv_data *pdata)
+{
+   int reg;
+
+   reg = XMDIO_READ(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_CTRL);
+   reg &= ~XGBE_AN_CL37_INT_MASK;
+   XMDIO_WRITE(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_CTRL, reg);
+
+   reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_PCS_DIG_CTRL);
+   reg &= ~XGBE_PCS_CL37_BP;
+   XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_PCS_DIG_CTRL, reg);
+}
+
+static void xgbe_an37_enable_interrupts(struct xgbe_prv_data *pdata)
+{
+   int reg;
+
+   reg = XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_PCS_DIG_CTRL);
+   reg |= XGBE_PCS_CL37_BP;
+   XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_PCS_DIG_CTRL, reg);
+
+   reg = XMDIO_READ(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_CTRL);
+   reg |= XGBE_AN_CL37_INT_MASK;
+   XMDIO_WRITE(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_CTRL, reg);
+}
+
 static void xgbe_an73_clear_interrupts(struct xgbe_prv_data *pdata)
 {
XMDIO_WRITE(pdata, MDIO_MMD_AN, MDIO_AN_INT, 0);
@@ -146,11 +181,21 @@ static void xgbe_an_enable_interrupts(struct 
xgbe_prv_data *pdata)
case XGBE_AN_MODE_CL73:
xgbe_an73_enable_interrupts(pdata);
break;
+   case XGBE_AN_MODE_CL37:
+   case XGBE_AN_MODE_CL37_SGMII:
+   xgbe_an37_enable_interrupts(pdata);
+   break;
default:
break;
}
 }
 
+static void xgbe_an_clear_interrupts_all(struct xgbe_prv_data *pdata)
+{
+   xgbe_an73_clear_interrupts(pdata);
+   xgbe_an37_clear_interrupts(pdata);
+}
+
 static void xgbe_an73_enable_kr_training(struct xgbe_prv_data *pdata)
 {
unsigned int reg;
@@ -258,6 +303,39 @@ static bool xgbe_use_mode(struct xgbe_prv_data *pdata,
return pdata->phy_if.phy_impl.use_mode(pdata, mode);
 }
 
+static void xgbe_an37_set(struct xgbe_prv_data *pdata, bool enable,
+ bool restart)
+{
+   unsigned int reg;
+
+   reg = XMDIO_READ(pdata, MDIO_MMD_VEND2, MDIO_CTRL1);
+   reg &= ~MDIO_VEND2_CTRL1_AN_ENABLE;
+

[PATCH net-next v1 04/10] amd-xgbe: Prepare for working with more than one type of phy

2016-11-03 Thread Tom Lendacky
Prepare the code to be able to work with more than one type of phy by
adding additional callable functions into the phy interface and removing
phy specific settings/functions from non-phy related files.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/Makefile   |3 
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c |   58 +-
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c |6 
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c |   19 -
 drivers/net/ethernet/amd/xgbe/xgbe-main.c|  214 ++-
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c|  549 +
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c  |  821 ++
 drivers/net/ethernet/amd/xgbe/xgbe.h |  130 ++--
 8 files changed, 1140 insertions(+), 660 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c

diff --git a/drivers/net/ethernet/amd/xgbe/Makefile 
b/drivers/net/ethernet/amd/xgbe/Makefile
index 171a7e6..60b4ae2 100644
--- a/drivers/net/ethernet/amd/xgbe/Makefile
+++ b/drivers/net/ethernet/amd/xgbe/Makefile
@@ -2,7 +2,8 @@ obj-$(CONFIG_AMD_XGBE) += amd-xgbe.o
 
 amd-xgbe-objs := xgbe-main.o xgbe-drv.o xgbe-dev.o \
 xgbe-desc.o xgbe-ethtool.o xgbe-mdio.o \
-xgbe-ptp.o
+xgbe-ptp.o \
+xgbe-phy-v1.o
 
 amd-xgbe-$(CONFIG_AMD_XGBE_DCB) += xgbe-dcb.o
 amd-xgbe-$(CONFIG_DEBUG_FS) += xgbe-debugfs.o
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index f8fffea..75c3df1 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -717,32 +717,26 @@ static void xgbe_enable_mac_interrupts(struct 
xgbe_prv_data *pdata)
XGMAC_IOWRITE_BITS(pdata, MMC_TIER, ALL_INTERRUPTS, 0xffffffff);
 }
 
-static int xgbe_set_gmii_speed(struct xgbe_prv_data *pdata)
+static int xgbe_set_speed(struct xgbe_prv_data *pdata, int speed)
 {
-   if (XGMAC_IOREAD_BITS(pdata, MAC_TCR, SS) == 0x3)
-   return 0;
-
-   XGMAC_IOWRITE_BITS(pdata, MAC_TCR, SS, 0x3);
-
-   return 0;
-}
-
-static int xgbe_set_gmii_2500_speed(struct xgbe_prv_data *pdata)
-{
-   if (XGMAC_IOREAD_BITS(pdata, MAC_TCR, SS) == 0x2)
-   return 0;
+   unsigned int ss;
 
-   XGMAC_IOWRITE_BITS(pdata, MAC_TCR, SS, 0x2);
-
-   return 0;
-}
-
-static int xgbe_set_xgmii_speed(struct xgbe_prv_data *pdata)
-{
-   if (XGMAC_IOREAD_BITS(pdata, MAC_TCR, SS) == 0)
-   return 0;
+   switch (speed) {
+   case SPEED_1000:
+   ss = 0x03;
+   break;
+   case SPEED_2500:
+   ss = 0x02;
+   break;
+   case SPEED_10000:
+   ss = 0x00;
+   break;
+   default:
+   return -EINVAL;
+   }
 
-   XGMAC_IOWRITE_BITS(pdata, MAC_TCR, SS, 0);
+   if (XGMAC_IOREAD_BITS(pdata, MAC_TCR, SS) != ss)
+   XGMAC_IOWRITE_BITS(pdata, MAC_TCR, SS, ss);
 
return 0;
 }
@@ -2469,19 +2463,7 @@ static void xgbe_config_jumbo_enable(struct 
xgbe_prv_data *pdata)
 
 static void xgbe_config_mac_speed(struct xgbe_prv_data *pdata)
 {
-   switch (pdata->phy_speed) {
-   case SPEED_10000:
-   xgbe_set_xgmii_speed(pdata);
-   break;
-
-   case SPEED_2500:
-   xgbe_set_gmii_2500_speed(pdata);
-   break;
-
-   case SPEED_1000:
-   xgbe_set_gmii_speed(pdata);
-   break;
-   }
+   xgbe_set_speed(pdata, pdata->phy_speed);
 }
 
 static void xgbe_config_checksum_offload(struct xgbe_prv_data *pdata)
@@ -3195,9 +3177,7 @@ void xgbe_init_function_ptrs_dev(struct xgbe_hw_if *hw_if)
hw_if->read_mmd_regs = xgbe_read_mmd_regs;
hw_if->write_mmd_regs = xgbe_write_mmd_regs;
 
-   hw_if->set_gmii_speed = xgbe_set_gmii_speed;
-   hw_if->set_gmii_2500_speed = xgbe_set_gmii_2500_speed;
-   hw_if->set_xgmii_speed = xgbe_set_xgmii_speed;
+   hw_if->set_speed = xgbe_set_speed;
 
hw_if->enable_tx = xgbe_enable_tx;
hw_if->disable_tx = xgbe_disable_tx;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
index c4e6682..dd166a0 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
@@ -778,7 +778,7 @@ static void xgbe_free_rx_data(struct xgbe_prv_data *pdata)
DBGPR("<--xgbe_free_rx_data\n");
 }
 
-static int xgbe_phy_init(struct xgbe_prv_data *pdata)
+static int xgbe_phy_reset(struct xgbe_prv_data *pdata)
 {
pdata->phy_link = -1;
pdata->phy_speed = SPEED_UNKNOWN;
@@ -1292,8 +1292,8 @@ static int xgbe_open(struct net_device *netdev)
 
DBGPR("-->xgbe_open\n");
 
-   /* Initialize the phy */
-   ret = xgbe_phy_init(pdata);
+   /* Reset the phy settings */
+   ret = xgbe_phy_reset(pdata);
if (ret)
return ret;
 
diff --git 

[PATCH net-next v1 09/10] amd-xgbe: Update how to determine DMA channel status

2016-11-03 Thread Tom Lendacky
Tx and Rx DMA channel status determination is different depending on the
version of the hardware. Update the channel status processing code to
account for the change.  Also, reduce the timeout value used when stopping
the channels.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h |4 ++
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c|   58 ++-
 drivers/net/ethernet/amd/xgbe/xgbe.h|2 -
 3 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h 
b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index 6c40915..8036ee5 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -790,6 +790,10 @@
 #define MTL_Q_RQOMR_RSF_WIDTH  1
 #define MTL_Q_RQOMR_RTC_INDEX  0
 #define MTL_Q_RQOMR_RTC_WIDTH  2
+#define MTL_Q_TQDR_TRCSTS_INDEX1
+#define MTL_Q_TQDR_TRCSTS_WIDTH2
+#define MTL_Q_TQDR_TXQSTS_INDEX4
+#define MTL_Q_TQDR_TXQSTS_WIDTH1
 #define MTL_Q_TQOMR_FTQ_INDEX  0
 #define MTL_Q_TQOMR_FTQ_WIDTH  1
 #define MTL_Q_TQOMR_Q2TCMAP_INDEX  8
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index fbd60ee..0a7ab63 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -2944,20 +2944,48 @@ static void xgbe_config_mmc(struct xgbe_prv_data *pdata)
XGMAC_IOWRITE_BITS(pdata, MMC_CR, CR, 1);
 }
 
+static void xgbe_txq_prepare_tx_stop(struct xgbe_prv_data *pdata,
+unsigned int queue)
+{
+   unsigned int tx_status;
+   unsigned long tx_timeout;
+
+   /* The Tx engine cannot be stopped if it is actively processing
+* packets. Wait for the Tx queue to empty the Tx fifo.  Don't
+* wait forever though...
+*/
+   tx_timeout = jiffies + (XGBE_DMA_STOP_TIMEOUT * HZ);
+   while (time_before(jiffies, tx_timeout)) {
+   tx_status = XGMAC_MTL_IOREAD(pdata, queue, MTL_Q_TQDR);
+   if ((XGMAC_GET_BITS(tx_status, MTL_Q_TQDR, TRCSTS) != 1) &&
+   (XGMAC_GET_BITS(tx_status, MTL_Q_TQDR, TXQSTS) == 0))
+   break;
+
+   usleep_range(500, 1000);
+   }
+
+   if (!time_before(jiffies, tx_timeout))
+   netdev_info(pdata->netdev,
+   "timed out waiting for Tx queue %u to empty\n",
+   queue);
+}
+
 static void xgbe_prepare_tx_stop(struct xgbe_prv_data *pdata,
-struct xgbe_channel *channel)
+unsigned int queue)
 {
unsigned int tx_dsr, tx_pos, tx_qidx;
unsigned int tx_status;
unsigned long tx_timeout;
 
+   if (XGMAC_GET_BITS(pdata->hw_feat.version, MAC_VR, SNPSVER) > 0x20)
+   return xgbe_txq_prepare_tx_stop(pdata, queue);
+
/* Calculate the status register to read and the position within */
-   if (channel->queue_index < DMA_DSRX_FIRST_QUEUE) {
+   if (queue < DMA_DSRX_FIRST_QUEUE) {
tx_dsr = DMA_DSR0;
-   tx_pos = (channel->queue_index * DMA_DSR_Q_WIDTH) +
-DMA_DSR0_TPS_START;
+   tx_pos = (queue * DMA_DSR_Q_WIDTH) + DMA_DSR0_TPS_START;
} else {
-   tx_qidx = channel->queue_index - DMA_DSRX_FIRST_QUEUE;
+   tx_qidx = queue - DMA_DSRX_FIRST_QUEUE;
 
tx_dsr = DMA_DSR1 + ((tx_qidx / DMA_DSRX_QPR) * DMA_DSRX_INC);
tx_pos = ((tx_qidx % DMA_DSRX_QPR) * DMA_DSR_Q_WIDTH) +
@@ -2982,7 +3010,7 @@ static void xgbe_prepare_tx_stop(struct xgbe_prv_data 
*pdata,
if (!time_before(jiffies, tx_timeout))
netdev_info(pdata->netdev,
"timed out waiting for Tx DMA channel %u to stop\n",
-   channel->queue_index);
+   queue);
 }
 
 static void xgbe_enable_tx(struct xgbe_prv_data *pdata)
@@ -3014,13 +3042,8 @@ static void xgbe_disable_tx(struct xgbe_prv_data *pdata)
unsigned int i;
 
/* Prepare for Tx DMA channel stop */
-   channel = pdata->channel;
-   for (i = 0; i < pdata->channel_count; i++, channel++) {
-   if (!channel->tx_ring)
-   break;
-
-   xgbe_prepare_tx_stop(pdata, channel);
-   }
+   for (i = 0; i < pdata->tx_q_count; i++)
+   xgbe_prepare_tx_stop(pdata, i);
 
/* Disable MAC Tx */
XGMAC_IOWRITE_BITS(pdata, MAC_TCR, TE, 0);
@@ -3144,13 +3167,8 @@ static void xgbe_powerdown_tx(struct xgbe_prv_data 
*pdata)
unsigned int i;
 
/* Prepare for Tx DMA channel stop */
-   channel = pdata->channel;
-   for (i = 0; i < pdata->channel_count; i++, channel++) {
-   if 

[PATCH net-next v1 07/10] amd-xgbe: Prepare for a new PCS register access method

2016-11-03 Thread Tom Lendacky
Prepare the code to be able to support accessing of the PCS registers
in a new way, while maintaining the current access method. Provide a
version specific field that indicates the method to use.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h |   21 +++--
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c|  107 +--
 drivers/net/ethernet/amd/xgbe/xgbe-main.c   |1 
 drivers/net/ethernet/amd/xgbe/xgbe.h|9 ++
 4 files changed, 119 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h 
b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index 8bcf4ef..6c40915 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -852,14 +852,9 @@
 #define MTL_TSA_SP 0x00
 #define MTL_TSA_ETS0x02
 
-/* PCS MMD select register offset
- *  The MMD select register is used for accessing PCS registers
- *  when the underlying APB3 interface is using indirect addressing.
- *  Indirect addressing requires accessing registers in two phases,
- *  an address phase and a data phase.  The address phases requires
- *  writing an address selection value to the MMD select regiesters.
- */
-#define PCS_MMD_SELECT 0xff
+/* PCS register offsets */
+#define PCS_V1_WINDOW_SELECT   0x03fc
+#define PCS_V2_WINDOW_SELECT   0x9064
 
 /* SerDes integration register offsets */
 #define SIR0_KR_RT_1   0x002c
@@ -1241,12 +1236,18 @@
 /* Macros for building, reading or writing register values or bits
  * within the register values of XPCS registers.
  */
-#define XPCS_IOWRITE(_pdata, _off, _val)   \
+#define XPCS32_IOWRITE(_pdata, _off, _val) \
iowrite32(_val, (_pdata)->xpcs_regs + (_off))
 
-#define XPCS_IOREAD(_pdata, _off)  \
+#define XPCS32_IOREAD(_pdata, _off)\
ioread32((_pdata)->xpcs_regs + (_off))
 
+#define XPCS16_IOWRITE(_pdata, _off, _val) \
+   iowrite16(_val, (_pdata)->xpcs_regs + (_off))
+
+#define XPCS16_IOREAD(_pdata, _off)\
+   ioread16((_pdata)->xpcs_regs + (_off))
+
 /* Macros for building, reading or writing register values or bits
  * within the register values of SerDes integration registers.
  */
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index 75c3df1..b8a04e7 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -1026,8 +1026,71 @@ static int xgbe_config_rx_mode(struct xgbe_prv_data 
*pdata)
return 0;
 }
 
-static int xgbe_read_mmd_regs(struct xgbe_prv_data *pdata, int prtad,
- int mmd_reg)
+static int xgbe_read_mmd_regs_v2(struct xgbe_prv_data *pdata, int prtad,
+int mmd_reg)
+{
+   unsigned long flags;
+   unsigned int mmd_address, index, offset;
+   int mmd_data;
+
+   if (mmd_reg & MII_ADDR_C45)
+   mmd_address = mmd_reg & ~MII_ADDR_C45;
+   else
+   mmd_address = (pdata->mdio_mmd << 16) | (mmd_reg & 0xffff);
+
+   /* The PCS registers are accessed using mmio. The underlying
+* management interface uses indirect addressing to access the MMD
+* register sets. This requires accessing of the PCS register in two
+* phases, an address phase and a data phase.
+*
+* The mmio interface is based on 16-bit offsets and values. All
+* register offsets must therefore be adjusted by left shifting the
+* offset 1 bit and reading 16 bits of data.
+*/
+   mmd_address <<= 1;
+   index = mmd_address & ~pdata->xpcs_window_mask;
+   offset = pdata->xpcs_window + (mmd_address & pdata->xpcs_window_mask);
+
+   spin_lock_irqsave(&pdata->xpcs_lock, flags);
+   XPCS32_IOWRITE(pdata, PCS_V2_WINDOW_SELECT, index);
+   mmd_data = XPCS16_IOREAD(pdata, offset);
+   spin_unlock_irqrestore(&pdata->xpcs_lock, flags);
+
+   return mmd_data;
+}
+
+static void xgbe_write_mmd_regs_v2(struct xgbe_prv_data *pdata, int prtad,
+  int mmd_reg, int mmd_data)
+{
+   unsigned long flags;
+   unsigned int mmd_address, index, offset;
+
+   if (mmd_reg & MII_ADDR_C45)
+   mmd_address = mmd_reg & ~MII_ADDR_C45;
+   else
+   mmd_address = (pdata->mdio_mmd << 16) | (mmd_reg & 0xffff);
+
+   /* The PCS registers are accessed using mmio. The underlying
+* management interface uses indirect addressing to access the MMD
+* register sets. This requires accessing of the PCS register in two
+* phases, an address phase and a data phase.
+*
+* The mmio interface is based on 16-bit offsets and values. All
+  

[PATCH net-next v1 02/10] amd-xgbe: Prepare for priority-based FIFO allocation

2016-11-03 Thread Tom Lendacky
Currently, the Rx and Tx fifos are evenly allocated between the hardware
queues of the device.  As more queues are instantiated, the fifo memory
needs to be able to be allocated based on queue priority. This allows for
higher priority queues to have more fifo memory than lower priority
queues. Prepare for this by modifying the current fifo calculation to
assign the fifo queue allocation in an array that is then used to program
the hardware.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c |   55 +-
 drivers/net/ethernet/amd/xgbe/xgbe.h |3 +-
 2 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
index 1babcc1..18f8001 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-dev.c
@@ -2000,19 +2000,37 @@ static void xgbe_config_mtl_mode(struct xgbe_prv_data 
*pdata)
XGMAC_IOWRITE_BITS(pdata, MTL_OMR, RAA, MTL_RAA_SP);
 }
 
-static unsigned int xgbe_calculate_per_queue_fifo(unsigned int fifo_size,
- unsigned int queue_count)
+static unsigned int xgbe_get_tx_fifo_size(struct xgbe_prv_data *pdata)
 {
-   unsigned int q_fifo_size;
-   unsigned int p_fifo;
+   unsigned int fifo_size;
 
/* Calculate the configured fifo size */
-   q_fifo_size = 1 << (fifo_size + 7);
+   fifo_size = 1 << (pdata->hw_feat.tx_fifo_size + 7);
 
/* The configured value may not be the actual amount of fifo RAM */
-   q_fifo_size = min_t(unsigned int, XGBE_FIFO_MAX, q_fifo_size);
+   return min_t(unsigned int, XGMAC_FIFO_TX_MAX, fifo_size);
+}
+
+static unsigned int xgbe_get_rx_fifo_size(struct xgbe_prv_data *pdata)
+{
+   unsigned int fifo_size;
 
-   q_fifo_size = q_fifo_size / queue_count;
+   /* Calculate the configured fifo size */
+   fifo_size = 1 << (pdata->hw_feat.rx_fifo_size + 7);
+
+   /* The configured value may not be the actual amount of fifo RAM */
+   return min_t(unsigned int, XGMAC_FIFO_RX_MAX, fifo_size);
+}
+
+static void xgbe_calculate_equal_fifo(unsigned int fifo_size,
+ unsigned int queue_count,
+ unsigned int *fifo)
+{
+   unsigned int q_fifo_size;
+   unsigned int p_fifo;
+   unsigned int i;
+
+   q_fifo_size = fifo_size / queue_count;
 
/* Each increment in the queue fifo size represents 256 bytes of
 * fifo, with 0 representing 256 bytes. Distribute the fifo equally
@@ -2022,39 +2040,44 @@ static unsigned int 
xgbe_calculate_per_queue_fifo(unsigned int fifo_size,
if (p_fifo)
p_fifo--;
 
-   return p_fifo;
+   for (i = 0; i < queue_count; i++)
+   fifo[i] = p_fifo;
 }
 
 static void xgbe_config_tx_fifo_size(struct xgbe_prv_data *pdata)
 {
unsigned int fifo_size;
+   unsigned int fifo[XGBE_MAX_QUEUES];
unsigned int i;
 
-   fifo_size = xgbe_calculate_per_queue_fifo(pdata->hw_feat.tx_fifo_size,
- pdata->tx_q_count);
+   fifo_size = xgbe_get_tx_fifo_size(pdata);
+
+   xgbe_calculate_equal_fifo(fifo_size, pdata->tx_q_count, fifo);
 
for (i = 0; i < pdata->tx_q_count; i++)
-   XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_TQOMR, TQS, fifo_size);
+   XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_TQOMR, TQS, fifo[i]);
 
netif_info(pdata, drv, pdata->netdev,
   "%d Tx hardware queues, %d byte fifo per queue\n",
-  pdata->tx_q_count, ((fifo_size + 1) * 256));
+  pdata->tx_q_count, ((fifo[0] + 1) * 256));
 }
 
 static void xgbe_config_rx_fifo_size(struct xgbe_prv_data *pdata)
 {
unsigned int fifo_size;
+   unsigned int fifo[XGBE_MAX_QUEUES];
unsigned int i;
 
-   fifo_size = xgbe_calculate_per_queue_fifo(pdata->hw_feat.rx_fifo_size,
- pdata->rx_q_count);
+   fifo_size = xgbe_get_rx_fifo_size(pdata);
+
+   xgbe_calculate_equal_fifo(fifo_size, pdata->rx_q_count, fifo);
 
for (i = 0; i < pdata->rx_q_count; i++)
-   XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_RQOMR, RQS, fifo_size);
+   XGMAC_MTL_IOWRITE_BITS(pdata, i, MTL_Q_RQOMR, RQS, fifo[i]);
 
netif_info(pdata, drv, pdata->netdev,
   "%d Rx hardware queues, %d byte fifo per queue\n",
-  pdata->rx_q_count, ((fifo_size + 1) * 256));
+  pdata->rx_q_count, ((fifo[0] + 1) * 256));
 }
 
 static void xgbe_config_queue_mapping(struct xgbe_prv_data *pdata)
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h 
b/drivers/net/ethernet/amd/xgbe/xgbe.h
index 5dd17dc..d838b44 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -208,7 +208,8 @@
 

[PATCH net-next v1 01/10] amd-xgbe: Fix formatting of PCS register dump

2016-11-03 Thread Tom Lendacky
Fix the length value used for the PCS register dump so that the full
value can be displayed.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |   24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 84c5d29..e9b01fc 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -1257,33 +1257,33 @@ static void xgbe_dump_phy_registers(struct 
xgbe_prv_data *pdata)
 
dev_dbg(dev, "\n* PHY Reg dump **\n");
 
-   dev_dbg(dev, "PCS Control Reg (%#04x) = %#04x\n", MDIO_CTRL1,
+   dev_dbg(dev, "PCS Control Reg (%#06x) = %#06x\n", MDIO_CTRL1,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1));
-   dev_dbg(dev, "PCS Status Reg (%#04x) = %#04x\n", MDIO_STAT1,
+   dev_dbg(dev, "PCS Status Reg (%#06x) = %#06x\n", MDIO_STAT1,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_STAT1));
-   dev_dbg(dev, "Phy Id (PHYS ID 1 %#04x)= %#04x\n", MDIO_DEVID1,
+   dev_dbg(dev, "Phy Id (PHYS ID 1 %#06x)= %#06x\n", MDIO_DEVID1,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_DEVID1));
-   dev_dbg(dev, "Phy Id (PHYS ID 2 %#04x)= %#04x\n", MDIO_DEVID2,
+   dev_dbg(dev, "Phy Id (PHYS ID 2 %#06x)= %#06x\n", MDIO_DEVID2,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_DEVID2));
-   dev_dbg(dev, "Devices in Package (%#04x)= %#04x\n", MDIO_DEVS1,
+   dev_dbg(dev, "Devices in Package (%#06x)= %#06x\n", MDIO_DEVS1,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_DEVS1));
-   dev_dbg(dev, "Devices in Package (%#04x)= %#04x\n", MDIO_DEVS2,
+   dev_dbg(dev, "Devices in Package (%#06x)= %#06x\n", MDIO_DEVS2,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_DEVS2));
 
-   dev_dbg(dev, "Auto-Neg Control Reg (%#04x) = %#04x\n", MDIO_CTRL1,
+   dev_dbg(dev, "Auto-Neg Control Reg (%#06x) = %#06x\n", MDIO_CTRL1,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1));
-   dev_dbg(dev, "Auto-Neg Status Reg (%#04x) = %#04x\n", MDIO_STAT1,
+   dev_dbg(dev, "Auto-Neg Status Reg (%#06x) = %#06x\n", MDIO_STAT1,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_STAT1));
-   dev_dbg(dev, "Auto-Neg Ad Reg 1 (%#04x) = %#04x\n",
+   dev_dbg(dev, "Auto-Neg Ad Reg 1 (%#06x) = %#06x\n",
MDIO_AN_ADVERTISE,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE));
-   dev_dbg(dev, "Auto-Neg Ad Reg 2 (%#04x) = %#04x\n",
+   dev_dbg(dev, "Auto-Neg Ad Reg 2 (%#06x) = %#06x\n",
MDIO_AN_ADVERTISE + 1,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 1));
-   dev_dbg(dev, "Auto-Neg Ad Reg 3 (%#04x) = %#04x\n",
+   dev_dbg(dev, "Auto-Neg Ad Reg 3 (%#06x) = %#06x\n",
MDIO_AN_ADVERTISE + 2,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2));
-   dev_dbg(dev, "Auto-Neg Completion Reg (%#04x) = %#04x\n",
+   dev_dbg(dev, "Auto-Neg Completion Reg (%#06x) = %#06x\n",
MDIO_AN_COMP_STAT,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_COMP_STAT));
 



[PATCH net-next v1 00/10] amd-xgbe: AMD XGBE driver updates 2016-11-03

2016-11-03 Thread Tom Lendacky
This patch series is targeted at preparing the driver for a new PCI version
of the hardware.  After this series is applied, a follow-on series will
introduce the support for the PCI version of the hardware.

The following updates and fixes are included in this driver update series:

- Fix formatting of PCS debug register dump
- Prepare for priority-based FIFO allocation
- Implement priority-based FIFO allocation
- Prepare for working with more than one type of PCS/PHY
- Prepare for the introduction of clause 37 auto-negotiation
- Add support for clause 37 auto-negotiation
- Prepare for supporting a new PCS register access method
- Add support for 64-bit management counter registers
- Update DMA channel status determination
- Prepare for supporting PCI devices in addition to platform devices

This patch series is based on net-next.

---

Tom Lendacky (10):
  amd-xgbe: Fix formatting of PCS register dump
  amd-xgbe: Prepare for priority-based FIFO allocation
  amd-xgbe: Perform priority-based hardware FIFO allocation
  amd-xgbe: Prepare for working with more than one type of phy
  amd-xgbe: Prepare for introduction of clause 37 autoneg
  amd-xgbe: Add support for clause 37 auto-negotiation
  amd-xgbe: Prepare for a new PCS register access method
  amd-xgbe: Support for 64-bit management counter registers
  amd-xgbe: Update how to determine DMA channel status
  amd-xgbe: Prepare for supporting PCI devices


 drivers/net/ethernet/amd/xgbe/Makefile|4 
 drivers/net/ethernet/amd/xgbe/xgbe-common.h   |   71 ++
 drivers/net/ethernet/amd/xgbe/xgbe-dev.c  |  847 -
 drivers/net/ethernet/amd/xgbe/xgbe-drv.c  |   28 -
 drivers/net/ethernet/amd/xgbe/xgbe-ethtool.c  |   19 
 drivers/net/ethernet/amd/xgbe/xgbe-main.c |  662 ++---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |  990 ++---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c   |  828 +
 drivers/net/ethernet/amd/xgbe/xgbe-platform.c |  632 
 drivers/net/ethernet/amd/xgbe/xgbe.h  |  213 -
 10 files changed, 2923 insertions(+), 1371 deletions(-)
 create mode 100644 drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c
 create mode 100644 drivers/net/ethernet/amd/xgbe/xgbe-platform.c

-- 
Tom Lendacky


Re: net/sctp: use-after-free in __sctp_connect

2016-11-03 Thread Andrey Konovalov
On Thu, Nov 3, 2016 at 6:52 PM, Marcelo Ricardo Leitner
 wrote:
> On Thu, Nov 03, 2016 at 06:11:01PM +0100, Andrey Konovalov wrote:
>> On Wed, Nov 2, 2016 at 11:42 PM, Andrey Konovalov  
>> wrote:
>> > On Wed, Oct 19, 2016 at 6:57 PM, Marcelo Ricardo Leitner
>> >  wrote:
>> >> On Wed, Oct 19, 2016 at 02:25:24PM +0200, Andrey Konovalov wrote:
>> >>> Hi,
>> >>>
>> >>> I've got the following error report while running the syzkaller fuzzer:
>> >>>
>> >>> ==
>> >>> BUG: KASAN: use-after-free in __sctp_connect+0xabe/0xbf0 at addr
>> >>> 88006b1dc610
>> >>
>> >> Seems this is the same that Dmitry Vyukov had reported back in Jan 13th.
>> >> So far I couldn't identify the reason.
>> >> "Good" to know it's still there, thanks for reporting it.
>>
>> Hi Marcelo,
>>
>
> Hi
>
>> So I've looked at the code.
>> As far as I understand, the problem is a race condition between
>> setsockopt(SCTP_SOCKOPT_CONNECTX) and shutdown on an sctp socket.
>> setsockopt() calls sctp_wait_for_connect(), which exits the for loop
>> on the sk->sk_shutdown & RCV_SHUTDOWN if clause, and then frees asoc
>> with sctp_association_put() and returns err = 0.
>> Then __sctp_connect() checks that err == 0 and reads asoc->assoc_id
>> from the freed asoc.
>
> Suddenly this seems familiar. Your description makes sense, thanks for
> looking deeper into this, Andrey.
>
> This fix should do it, can you please try it? I'll post it properly
> if it works.

Yes, it fixes the issue.

Tested-by: Andrey Konovalov 

Thanks for the fix!

>
> wait_for_connect is only used in two places, we can move the ref to a
> broader scope and cover that read too, instead of holding another ref.
>
> sendmsg path won't read anything from the asoc after waiting, so this
> should be enough for it too.
>
> ---8<---
>
> commit 7f7ba9b4fb834a61ab097dfd7c1f267e6a6d70a8
> Author: Marcelo Ricardo Leitner 
> Date:   Thu Nov 3 15:47:45 2016 -0200
>
> sctp: hold the asoc longer when associating
>
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 9fbb6feb8c27..aac271571930 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -1214,9 +1214,11 @@ static int __sctp_connect(struct sock *sk,
>
> timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
>
> +   sctp_association_hold(asoc);
> err = sctp_wait_for_connect(asoc, &timeo);
> if ((err == 0 || err == -EINPROGRESS) && assoc_id)
> *assoc_id = asoc->assoc_id;
> +   sctp_association_put(asoc);
>
> /* Don't free association on exit. */
> asoc = NULL;
> @@ -1985,7 +1987,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr 
> *msg, size_t msg_len)
>
> if (unlikely(wait_connect)) {
> timeo = sock_sndtimeo(sk, msg_flags & MSG_DONTWAIT);
> +   sctp_association_hold(asoc);
> sctp_wait_for_connect(asoc, &timeo);
> +   sctp_association_put(asoc);
> }
>
> /* If we are already past ASSOCIATE, the lower
> @@ -7501,6 +7505,7 @@ static int sctp_writeable(struct sock *sk)
>
>  /* Wait for an association to go into ESTABLISHED state. If timeout is 0,
>   * returns immediately with EINPROGRESS.
> + * Note: caller must hold a ref on asoc before calling this function.
>   */
>  static int sctp_wait_for_connect(struct sctp_association *asoc, long 
> *timeo_p)
>  {
> @@ -7511,9 +7516,6 @@ static int sctp_wait_for_connect(struct 
> sctp_association *asoc, long *timeo_p)
>
> pr_debug("%s: asoc:%p, timeo:%ld\n", __func__, asoc, *timeo_p);
>
> -   /* Increment the association's refcnt.  */
> -   sctp_association_hold(asoc);
> -
> for (;;) {
> prepare_to_wait_exclusive(>wait, ,
>   TASK_INTERRUPTIBLE);
> @@ -7543,9 +7545,6 @@ static int sctp_wait_for_connect(struct 
> sctp_association *asoc, long *timeo_p)
>  out:
> finish_wait(>wait, );
>
> -   /* Release the association's refcnt.  */
> -   sctp_association_put(asoc);
> -
> return err;
>
>  do_error:


did you receive my previous email ?

2016-11-03 Thread Friedrich Mayrhofer



This is the second time i am sending you this mail.I, Friedrich Mayrhofer 
Donate $ 1,000,000.00 to You, Email  Me personally for more details.

Regards.
Friedrich Mayrhofer


Re: net/sctp: use-after-free in __sctp_connect

2016-11-03 Thread Marcelo Ricardo Leitner
On Thu, Nov 03, 2016 at 06:11:01PM +0100, Andrey Konovalov wrote:
> On Wed, Nov 2, 2016 at 11:42 PM, Andrey Konovalov  
> wrote:
> > On Wed, Oct 19, 2016 at 6:57 PM, Marcelo Ricardo Leitner
> >  wrote:
> >> On Wed, Oct 19, 2016 at 02:25:24PM +0200, Andrey Konovalov wrote:
> >>> Hi,
> >>>
> >>> I've got the following error report while running the syzkaller fuzzer:
> >>>
> >>> ==
> >>> BUG: KASAN: use-after-free in __sctp_connect+0xabe/0xbf0 at addr
> >>> 88006b1dc610
> >>
> >> Seems this is the same that Dmitry Vyukov had reported back in Jan 13th.
> >> So far I couldn't identify the reason.
> >> "Good" to know it's still there, thanks for reporting it.
> 
> Hi Marcelo,
> 

Hi

> So I've looked at the code.
> As far as I understand, the problem is a race condition between
> setsockopt(SCTP_SOCKOPT_CONNECTX) and shutdown on an sctp socket.
> setsockopt() calls sctp_wait_for_connect(), which exits the for loop
> on the sk->sk_shutdown & RCV_SHUTDOWN if clause, and then frees asoc
> with sctp_association_put() and returns err = 0.
> Then __sctp_connect() checks that err == 0 and reads asoc->assoc_id
> from the freed asoc.

Suddenly this seems familiar. Your description makes sense, thanks for
looking deeper into this, Andrey.

This fix should do it, can you please try it? I'll post it properly
if it works.

wait_for_connect is only used in two places, we can move the ref to a
broader scope and cover that read too, instead of holding another ref.

sendmsg path won't read anything from the asoc after waiting, so this
should be enough for it too.

---8<---

commit 7f7ba9b4fb834a61ab097dfd7c1f267e6a6d70a8
Author: Marcelo Ricardo Leitner 
Date:   Thu Nov 3 15:47:45 2016 -0200

sctp: hold the asoc longer when associating

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 9fbb6feb8c27..aac271571930 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1214,9 +1214,11 @@ static int __sctp_connect(struct sock *sk,
 
timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
 
+   sctp_association_hold(asoc);
err = sctp_wait_for_connect(asoc, &timeo);
if ((err == 0 || err == -EINPROGRESS) && assoc_id)
*assoc_id = asoc->assoc_id;
+   sctp_association_put(asoc);
 
/* Don't free association on exit. */
asoc = NULL;
@@ -1985,7 +1987,9 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t msg_len)
 
if (unlikely(wait_connect)) {
timeo = sock_sndtimeo(sk, msg_flags & MSG_DONTWAIT);
+   sctp_association_hold(asoc);
sctp_wait_for_connect(asoc, &timeo);
+   sctp_association_put(asoc);
}
 
/* If we are already past ASSOCIATE, the lower
@@ -7501,6 +7505,7 @@ static int sctp_writeable(struct sock *sk)
 
 /* Wait for an association to go into ESTABLISHED state. If timeout is 0,
  * returns immediately with EINPROGRESS.
+ * Note: caller must hold a ref on asoc before calling this function.
  */
 static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p)
 {
@@ -7511,9 +7516,6 @@ static int sctp_wait_for_connect(struct sctp_association 
*asoc, long *timeo_p)
 
pr_debug("%s: asoc:%p, timeo:%ld\n", __func__, asoc, *timeo_p);
 
-   /* Increment the association's refcnt.  */
-   sctp_association_hold(asoc);
-
for (;;) {
prepare_to_wait_exclusive(>wait, ,
  TASK_INTERRUPTIBLE);
@@ -7543,9 +7545,6 @@ static int sctp_wait_for_connect(struct sctp_association 
*asoc, long *timeo_p)
 out:
finish_wait(>wait, );
 
-   /* Release the association's refcnt.  */
-   sctp_association_put(asoc);
-
return err;
 
 do_error:


[PATCH net-next] netfilter: Update ip_route_me_harder to consider L3 domain

2016-11-03 Thread David Ahern
ip_route_me_harder is not considering the L3 domain and sending lookups
to the wrong table. For example consider the following output rule:

iptables -I OUTPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset

using perf to analyze lookups via the fib_table_lookup tracepoint shows:

vrf-test  1187 [001] 46887.295927: fib:fib_table_lookup: table 255 oif 0 iif 0 
src 0.0.0.0 dst 10.100.1.254 tos 0 scope 0 flags 0
8143922c perf_trace_fib_table_lookup ([kernel.kallsyms])
81493aac fib_table_lookup ([kernel.kallsyms])
8148dda3 __inet_dev_addr_type ([kernel.kallsyms])
8148ddf6 inet_addr_type ([kernel.kallsyms])
8149e344 ip_route_me_harder ([kernel.kallsyms])

and

vrf-test  1187 [001] 46887.295933: fib:fib_table_lookup: table 255 oif 0 iif 1 
src 10.100.1.254 dst 10.100.1.2 tos 0 scope 0 flags
8143922c perf_trace_fib_table_lookup ([kernel.kallsyms])
81493aac fib_table_lookup ([kernel.kallsyms])
814998ff fib4_rule_action ([kernel.kallsyms])
81437f35 fib_rules_lookup ([kernel.kallsyms])
81499758 __fib_lookup ([kernel.kallsyms])
8144f010 fib_lookup.constprop.34 ([kernel.kallsyms])
8144f759 __ip_route_output_key_hash ([kernel.kallsyms])
8144fc6a ip_route_output_flow ([kernel.kallsyms])
8149e39b ip_route_me_harder ([kernel.kallsyms])

Updating both lookups to pull the L3 domain from the dst currently
attached to the skb directs both lookups to the correct table.

Signed-off-by: David Ahern 
---
Pablo: from a code review it seems ip_route_me_harder is only called in
   the output path and after skb_dst is set.

 net/ipv4/netfilter.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index c3776ff6749f..b3cc1335adbc 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -24,10 +24,11 @@ int ip_route_me_harder(struct net *net, struct sk_buff 
*skb, unsigned int addr_t
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
__u8 flags = skb->sk ? inet_sk_flowi_flags(skb->sk) : 0;
+   struct net_device *dev = skb_dst(skb)->dev;
unsigned int hh_len;
 
if (addr_type == RTN_UNSPEC)
-   addr_type = inet_addr_type(net, saddr);
+   addr_type = inet_addr_type_dev_table(net, dev, saddr);
if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
flags |= FLOWI_FLAG_ANYSRC;
else
@@ -40,6 +41,8 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, 
unsigned int addr_t
fl4.saddr = saddr;
fl4.flowi4_tos = RT_TOS(iph->tos);
fl4.flowi4_oif = skb->sk ? skb->sk->sk_bound_dev_if : 0;
+   if (!fl4.flowi4_oif)
+   fl4.flowi4_oif = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_flags = flags;
rt = ip_route_output_key(net, &fl4);
-- 
2.1.4



[PATCH net v2 2/4] net: ethernet: ti: cpsw: fix device and of_node leaks

2016-11-03 Thread Johan Hovold
Make sure to drop the references taken by of_get_child_by_name() and
bus_find_device() before returning from cpsw_phy_sel().

Note that holding a reference to the cpsw-phy-sel device does not
prevent the devres-managed private data from going away.

Fixes: 5892cd135e16 ("drivers: net: cpsw-phy-sel: Add new driver...")
Cc: Mugunthan V N 
Cc: Grygorii Strashko 
Cc: linux-o...@vger.kernel.org
Signed-off-by: Johan Hovold 
---
 drivers/net/ethernet/ti/cpsw-phy-sel.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw-phy-sel.c 
b/drivers/net/ethernet/ti/cpsw-phy-sel.c
index 054a8dd23dae..ba1e45ff6aae 100644
--- a/drivers/net/ethernet/ti/cpsw-phy-sel.c
+++ b/drivers/net/ethernet/ti/cpsw-phy-sel.c
@@ -176,9 +176,12 @@ void cpsw_phy_sel(struct device *dev, phy_interface_t 
phy_mode, int slave)
}
 
dev = bus_find_device(_bus_type, NULL, node, match);
+   of_node_put(node);
priv = dev_get_drvdata(dev);
 
priv->cpsw_phy_sel(priv, phy_mode, slave);
+
+   put_device(dev);
 }
 EXPORT_SYMBOL_GPL(cpsw_phy_sel);
 
-- 
2.7.3



[PATCH net v2 4/4] net: hns: fix device reference leaks

2016-11-03 Thread Johan Hovold
Make sure to drop the reference taken by class_find_device() in
hnae_get_handle() on errors and when later releasing the handle.

Fixes: 6fe6611ff275 ("net: add Hisilicon Network Subsystem...")
Cc: Yisen Zhuang 
Cc: Salil Mehta 
Signed-off-by: Johan Hovold 
---
 drivers/net/ethernet/hisilicon/hns/hnae.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.c 
b/drivers/net/ethernet/hisilicon/hns/hnae.c
index c54c6fac0d1d..b6ed818f78ff 100644
--- a/drivers/net/ethernet/hisilicon/hns/hnae.c
+++ b/drivers/net/ethernet/hisilicon/hns/hnae.c
@@ -332,8 +332,10 @@ struct hnae_handle *hnae_get_handle(struct device 
*owner_dev,
return ERR_PTR(-ENODEV);
 
handle = dev->ops->get_handle(dev, port_id);
-   if (IS_ERR(handle))
+   if (IS_ERR(handle)) {
+   put_device(>cls_dev);
return handle;
+   }
 
handle->dev = dev;
handle->owner_dev = owner_dev;
@@ -356,6 +358,8 @@ struct hnae_handle *hnae_get_handle(struct device 
*owner_dev,
for (j = i - 1; j >= 0; j--)
hnae_fini_queue(handle->qs[j]);
 
+   put_device(>cls_dev);
+
return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(hnae_get_handle);
@@ -377,6 +381,8 @@ void hnae_put_handle(struct hnae_handle *h)
dev->ops->put_handle(h);
 
module_put(dev->owner);
+
+   put_device(>cls_dev);
 }
 EXPORT_SYMBOL(hnae_put_handle);
 
-- 
2.7.3



[PATCH net v2 1/4] phy: fix device reference leaks

2016-11-03 Thread Johan Hovold
Make sure to drop the reference taken by bus_find_device_by_name()
before returning from phy_connect() and phy_attach().

Note that both functions still take a reference to the phy device
through phy_attach_direct().

Fixes: e13934563db0 ("[PATCH] PHY Layer fixup")
Cc: Florian Fainelli 
Signed-off-by: Johan Hovold 
---
 drivers/net/phy/phy_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index e977ba931878..1a4bf8acad78 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -723,6 +723,7 @@ struct phy_device *phy_connect(struct net_device *dev, 
const char *bus_id,
phydev = to_phy_device(d);
 
rc = phy_connect_direct(dev, phydev, handler, interface);
+   put_device(d);
if (rc)
return ERR_PTR(rc);
 
@@ -953,6 +954,7 @@ struct phy_device *phy_attach(struct net_device *dev, const 
char *bus_id,
phydev = to_phy_device(d);
 
rc = phy_attach_direct(dev, phydev, phydev->dev_flags, interface);
+   put_device(d);
if (rc)
return ERR_PTR(rc);
 
-- 
2.7.3



[PATCH net v2 0/4] net: fix device reference leaks

2016-11-03 Thread Johan Hovold
This series fixes a number of device reference leaks (and one of_node
leak) due to failure to drop the references taken by bus_find_device()
and friends.

Note that the final two patches have been compile tested only.

Thanks,
Johan


v2
 - hold reference to cpsw-phy-sel device while accessing private data as
   requested by David. Also update the commit message. (patch 1/4)
 - add linux-omap on CC where appropriate


Johan Hovold (4):
  phy: fix device reference leaks
  net: ethernet: ti: cpsw: fix device and of_node leaks
  net: ethernet: ti: davinci_emac: fix device reference leak
  net: hns: fix device reference leaks

 drivers/net/ethernet/hisilicon/hns/hnae.c |  8 +++-
 drivers/net/ethernet/ti/cpsw-phy-sel.c|  3 +++
 drivers/net/ethernet/ti/davinci_emac.c| 10 ++
 drivers/net/phy/phy_device.c  |  2 ++
 4 files changed, 18 insertions(+), 5 deletions(-)

-- 
2.7.3



[PATCH net v2 3/4] net: ethernet: ti: davinci_emac: fix device reference leak

2016-11-03 Thread Johan Hovold
Make sure to drop the references taken by bus_find_device() before
returning from emac_dev_open().

Note that phy_connect still takes a reference to the phy device.

Fixes: 5d69e0076a72 ("net: davinci_emac: switch to new mdio")
Cc: Mugunthan V N 
Cc: Grygorii Strashko 
Cc: linux-o...@vger.kernel.org
Signed-off-by: Johan Hovold 
---
 drivers/net/ethernet/ti/davinci_emac.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_emac.c 
b/drivers/net/ethernet/ti/davinci_emac.c
index 2fd94a5bc1f3..84fbe5714f8b 100644
--- a/drivers/net/ethernet/ti/davinci_emac.c
+++ b/drivers/net/ethernet/ti/davinci_emac.c
@@ -1410,6 +1410,7 @@ static int emac_dev_open(struct net_device *ndev)
int i = 0;
struct emac_priv *priv = netdev_priv(ndev);
struct phy_device *phydev = NULL;
+   struct device *phy = NULL;
 
ret = pm_runtime_get_sync(>pdev->dev);
if (ret < 0) {
@@ -1488,19 +1489,20 @@ static int emac_dev_open(struct net_device *ndev)
 
/* use the first phy on the bus if pdata did not give us a phy id */
if (!phydev && !priv->phy_id) {
-   struct device *phy;
-
phy = bus_find_device(_bus_type, NULL, NULL,
  match_first_device);
-   if (phy)
+   if (phy) {
priv->phy_id = dev_name(phy);
+   if (!priv->phy_id || !*priv->phy_id)
+   put_device(phy);
+   }
}
 
if (!phydev && priv->phy_id && *priv->phy_id) {
phydev = phy_connect(ndev, priv->phy_id,
 _adjust_link,
 PHY_INTERFACE_MODE_MII);
-
+   put_device(phy);/* reference taken by bus_find_device */
if (IS_ERR(phydev)) {
dev_err(emac_dev, "could not connect to phy %s\n",
priv->phy_id);
-- 
2.7.3



Re: [PATCH net] net: Check for fullsock in sock_i_uid()

2016-11-03 Thread Lorenzo Colitti
On Thu, Nov 3, 2016 at 2:18 AM, Eric Dumazet  wrote:
> Lorenzo, haven't you already fixed all these bugs ?

Not yet. There's still a fair bit of out-of-tree code left. Other than
per-UID routing, xt_qtaguid is the big one, but there's also xt_quota2
and xt_idletimer to fix.

> if (skb && skb->sk)
> timer->uid = from_kuid_munged(current_user_ns(),
>  sock_i_uid(skb_to_full_sk(skb)));

Thanks! I didn't know about skb_to_full_sk.


[PATCH net-next v2 3/3] net: inet: Support UID-based routing in IP protocols.

2016-11-03 Thread Lorenzo Colitti
- Use the UID in routing lookups made by protocol connect() and
  sendmsg() functions.
- Make sure that routing lookups triggered by incoming packets
  (e.g., Path MTU discovery) take the UID of the socket into
  account.
- For packets not associated with a userspace socket, (e.g., ping
  replies) use UID 0 inside the user namespace corresponding to
  the network namespace the socket belongs to. This allows
  all namespaces to apply routing and iptables rules to
  kernel-originated traffic in that namespace by matching UID 0.
  This is better than using the UID of the kernel socket that is
  sending the traffic, because the UID of kernel sockets created
  at namespace creation time (e.g., the per-processor ICMP and
  TCP sockets) is the UID of the user that created the socket,
  which might not be mapped in the namespace.

Tested: compiles allnoconfig, allyesconfig, allmodconfig
Tested: https://android-review.googlesource.com/253302
Signed-off-by: Lorenzo Colitti 
---
 include/net/flow.h   |  4 +++-
 include/net/ip.h |  1 +
 include/net/ip6_route.h  |  5 +++--
 include/net/route.h  |  5 +++--
 net/ipv4/icmp.c  |  2 ++
 net/ipv4/inet_connection_sock.c  |  4 ++--
 net/ipv4/ip_output.c |  3 ++-
 net/ipv4/ping.c  |  3 ++-
 net/ipv4/raw.c   |  2 +-
 net/ipv4/route.c | 26 +++---
 net/ipv4/syncookies.c|  2 +-
 net/ipv4/tcp_ipv4.c  |  9 ++---
 net/ipv4/udp.c   |  3 ++-
 net/ipv6/af_inet6.c  |  1 +
 net/ipv6/ah6.c   |  5 +++--
 net/ipv6/datagram.c  |  1 +
 net/ipv6/esp6.c  |  5 +++--
 net/ipv6/icmp.c  |  7 +--
 net/ipv6/inet6_connection_sock.c |  2 ++
 net/ipv6/ip6_gre.c   |  4 
 net/ipv6/ip6_tunnel.c|  4 
 net/ipv6/ip6_vti.c   |  5 +++--
 net/ipv6/ipcomp6.c   |  5 +++--
 net/ipv6/netfilter.c |  1 +
 net/ipv6/ping.c  |  1 +
 net/ipv6/raw.c   |  1 +
 net/ipv6/route.c | 13 +
 net/ipv6/syncookies.c|  1 +
 net/ipv6/tcp_ipv6.c  |  2 ++
 net/ipv6/udp.c   |  1 +
 net/l2tp/l2tp_ip6.c  |  1 +
 31 files changed, 89 insertions(+), 40 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 51373f3..6bbbca8 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -96,7 +96,8 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int 
oif,
  __u32 mark, __u8 tos, __u8 scope,
  __u8 proto, __u8 flags,
  __be32 daddr, __be32 saddr,
- __be16 dport, __be16 sport)
+ __be16 dport, __be16 sport,
+ kuid_t uid)
 {
fl4->flowi4_oif = oif;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
@@ -107,6 +108,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, 
int oif,
fl4->flowi4_flags = flags;
fl4->flowi4_secid = 0;
fl4->flowi4_tun_key.tun_id = 0;
+   fl4->flowi4_uid = uid;
fl4->daddr = daddr;
fl4->saddr = saddr;
fl4->fl4_dport = dport;
diff --git a/include/net/ip.h b/include/net/ip.h
index 5413883..55cdaac 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -179,6 +179,7 @@ struct ip_reply_arg {
/* -1 if not needed */ 
int bound_dev_if;
u8  tos;
+   kuid_t  uid;
 }; 
 
 #define IP_REPLY_ARG_NOSRCCHECK 1
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index f83e78d..9dc2c18 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -140,9 +140,10 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
  const struct in6_addr *gwaddr);
 
 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
-u32 mark);
+u32 mark, kuid_t uid);
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
-void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark);
+void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
+ kuid_t uid);
 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
u32 mark);
 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);
diff --git a/include/net/route.h b/include/net/route.h
index 0429d47..c0874c8 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -153,7 +153,7 @@ static inline struct rtable *ip_route_output_ports(struct 
net *net, struct flowi
flowi4_init_output(fl4, oif, sk ? sk->sk_mark : 0, tos,
   RT_SCOPE_UNIVERSE, 

[PATCH net-next v2 1/3] net: core: Add a UID field to struct sock.

2016-11-03 Thread Lorenzo Colitti
Protocol sockets (struct sock) don't have UIDs, but most of the
time, they map 1:1 to userspace sockets (struct socket) which do.

Various operations such as the iptables xt_owner match need
access to the "UID of a socket", and do so by following the
backpointer to the struct socket. This involves taking
sk_callback_lock and doesn't work when there is no socket
because userspace has already called close().

Simplify this by adding a sk_uid field to struct sock whose value
matches the UID of the corresponding struct socket. The semantics
are as follows:

1. Whenever sk_socket is non-null: sk_uid is the same as the UID
   in sk_socket, i.e., matches the return value of sock_i_uid.
   Specifically, the UID is set when userspace calls socket(),
   fchown(), or accept().
2. When sk_socket is NULL, sk_uid is defined as follows:
   - For a socket that no longer has a sk_socket because
 userspace has called close(): the previous UID.
   - For a cloned socket (e.g., an incoming connection that is
 established but on which userspace has not yet called
 accept): the UID of the socket it was cloned from.
   - For a socket that has never had an sk_socket: UID 0 inside
 the user namespace corresponding to the network namespace
 the socket belongs to.

Kernel sockets created by sock_create_kern are a special case
of #1 and sk_uid is the user that created them. For kernel
sockets created at network namespace creation time, such as the
per-processor ICMP and TCP sockets, this is the user that created
the network namespace.

Signed-off-by: Lorenzo Colitti 
---
 include/net/sock.h |  7 +++
 net/core/sock.c|  5 -
 net/socket.c   | 14 ++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 93331a1..cf617ee 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -419,6 +419,7 @@ struct sock {
u32 sk_max_ack_backlog;
__u32   sk_priority;
__u32   sk_mark;
+   kuid_t  sk_uid;
struct pid  *sk_peer_pid;
const struct cred   *sk_peer_cred;
longsk_rcvtimeo;
@@ -1664,6 +1665,7 @@ static inline void sock_graft(struct sock *sk, struct 
socket *parent)
sk->sk_wq = parent->wq;
parent->sk = sk;
sk_set_socket(sk, parent);
+   sk->sk_uid = SOCK_INODE(parent)->i_uid;
security_sock_graft(sk, parent);
write_unlock_bh(>sk_callback_lock);
 }
@@ -1671,6 +1673,11 @@ static inline void sock_graft(struct sock *sk, struct 
socket *parent)
 kuid_t sock_i_uid(struct sock *sk);
 unsigned long sock_i_ino(struct sock *sk);
 
+static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
+{
+   return sk ? sk->sk_uid : make_kuid(net->user_ns, 0);
+}
+
 static inline u32 net_tx_rndhash(void)
 {
u32 v = prandom_u32();
diff --git a/net/core/sock.c b/net/core/sock.c
index d8e4532e..40dbc13 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2460,8 +2460,11 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type =   sock->type;
sk->sk_wq   =   sock->wq;
sock->sk=   sk;
-   } else
+   sk->sk_uid  =   SOCK_INODE(sock)->i_uid;
+   } else {
sk->sk_wq   =   NULL;
+   sk->sk_uid  =   make_kuid(sock_net(sk)->user_ns, 0);
+   }
 
rwlock_init(>sk_callback_lock);
lockdep_set_class_and_name(>sk_callback_lock,
diff --git a/net/socket.c b/net/socket.c
index 970a7ea..4ce33c3 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -518,8 +518,22 @@ static ssize_t sockfs_listxattr(struct dentry *dentry, 
char *buffer,
return used;
 }
 
+int sockfs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+   int err = simple_setattr(dentry, iattr);
+
+   if (!err) {
+   struct socket *sock = SOCKET_I(d_inode(dentry));
+
+   sock->sk->sk_uid = iattr->ia_uid;
+   }
+
+   return err;
+}
+
 static const struct inode_operations sockfs_inode_ops = {
.listxattr = sockfs_listxattr,
+   .setattr = sockfs_setattr,
 };
 
 /**
-- 
2.8.0.rc3.226.g39d4020



[PATCH net-next v2] net: inet: Support UID-based routing

2016-11-03 Thread Lorenzo Colitti
This patchset adds support for per-UID routing. It allows the
administrator to configure rules such as:

  ip rule add uidrange 100-200 lookup 123

This functionality has been in use by all Android devices since
5.0. It is primarily used to impose per-app routing policies (on
Android, every app has its own UID) without having to resort to
rerouting packets in iptables, which breaks getsockname() and
MTU/MSS calculation, and generally disrupts end-to-end
connectivity.

This patch series is similar to the code currently used on
Android, but has better correctness and performance because
it stores the UID in the socket instead of calling sock_i_uid.
This avoids contention on sk->sk_callback_lock, and makes it
possible to correctly route a socket on which userspace has
called close(), for which sock_i_uid will return 0.

Changes from v1:
- Don't set the UID in sk_clone_lock, it's already set by
  sock_copy.
- For packets originated by kernel sockets, don't use the socket
  UID. This is the UID that created the namespace, but it might
  not be mapped in the namespace at all. Instead, use UID 0 in
  the namespace, which is less surprising and consistent with
  what happens in the root namespace.
- Fix UID routing of IPv4 and IPv6 SYN_RECV sockets.
- Fix UID routing of received IPv6 redirects.



Re: [PATCH net-next v1 00/21] amd-xgbe: AMD XGBE driver updates 2016-11-01

2016-11-03 Thread Tom Lendacky
On 11/03/2016 12:14 PM, David Miller wrote:
> 
> 20+ patches is too many to submit at one time.
> 
> Making huge patch series submissions puts an unreasonable burden on
> those developers who might decide to review your work, including me.
> 
> Please keep your series down to a small, reasonable, size.  Perhaps
> 10-15 patches maximum.

Ok, I'll break this up into smaller submissions - new device prep
work and then the new device support.

Thanks,
Tom

> 
> I will not be looking at nor applying this series.
> 


[PATCH net-next v2 2/3] net: core: add UID to flows, rules, and routes

2016-11-03 Thread Lorenzo Colitti
- Define a new FIB rule attribute, FRA_UID_RANGE, to describe a
  range of UIDs.
- Define a RTA_UID attribute for per-UID route lookups and dumps.
- Support passing these attributes to and from userspace via
  rtnetlink. The value INVALID_UID indicates no UID was
  specified.
- Add a UID field to the flow structures.

Signed-off-by: Lorenzo Colitti 
---
 include/net/fib_rules.h|  9 -
 include/net/flow.h |  5 +++
 include/uapi/linux/fib_rules.h |  6 
 include/uapi/linux/rtnetlink.h |  1 +
 net/core/fib_rules.c   | 74 --
 net/ipv4/fib_frontend.c|  1 +
 net/ipv4/route.c   | 11 +++
 net/ipv6/route.c   |  7 
 8 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 456e4a6..8dbfdf7 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -8,6 +8,11 @@
 #include 
 #include 
 
+struct fib_kuid_range {
+   kuid_t start;
+   kuid_t end;
+};
+
 struct fib_rule {
struct list_headlist;
int iifindex;
@@ -30,6 +35,7 @@ struct fib_rule {
int suppress_prefixlen;
chariifname[IFNAMSIZ];
charoifname[IFNAMSIZ];
+   struct fib_kuid_range   uid_range;
struct rcu_head rcu;
 };
 
@@ -92,7 +98,8 @@ struct fib_rules_ops {
[FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \
[FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \
[FRA_GOTO]  = { .type = NLA_U32 }, \
-   [FRA_L3MDEV]= { .type = NLA_U8 }
+   [FRA_L3MDEV]= { .type = NLA_U8 }, \
+   [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }
 
 static inline void fib_rule_get(struct fib_rule *rule)
 {
diff --git a/include/net/flow.h b/include/net/flow.h
index 035aa77..51373f3 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * ifindex generation is per-net namespace, and loopback is
@@ -37,6 +38,7 @@ struct flowi_common {
 #define FLOWI_FLAG_SKIP_NH_OIF 0x04
__u32   flowic_secid;
struct flowi_tunnel flowic_tun_key;
+   kuid_t  flowic_uid;
 };
 
 union flowi_uli {
@@ -74,6 +76,7 @@ struct flowi4 {
 #define flowi4_flags   __fl_common.flowic_flags
 #define flowi4_secid   __fl_common.flowic_secid
 #define flowi4_tun_key __fl_common.flowic_tun_key
+#define flowi4_uid __fl_common.flowic_uid
 
/* (saddr,daddr) must be grouped, same order as in IP header */
__be32  saddr;
@@ -131,6 +134,7 @@ struct flowi6 {
 #define flowi6_flags   __fl_common.flowic_flags
 #define flowi6_secid   __fl_common.flowic_secid
 #define flowi6_tun_key __fl_common.flowic_tun_key
+#define flowi6_uid __fl_common.flowic_uid
struct in6_addr daddr;
struct in6_addr saddr;
/* Note: flowi6_tos is encoded in flowlabel, too. */
@@ -176,6 +180,7 @@ struct flowi {
 #define flowi_flagsu.__fl_common.flowic_flags
 #define flowi_secidu.__fl_common.flowic_secid
 #define flowi_tun_key  u.__fl_common.flowic_tun_key
+#define flowi_uid  u.__fl_common.flowic_uid
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4)
diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h
index 14404b3..bbf02a6 100644
--- a/include/uapi/linux/fib_rules.h
+++ b/include/uapi/linux/fib_rules.h
@@ -29,6 +29,11 @@ struct fib_rule_hdr {
__u32   flags;
 };
 
+struct fib_rule_uid_range {
+   __u32   start;
+   __u32   end;
+};
+
 enum {
FRA_UNSPEC,
FRA_DST,/* destination address */
@@ -51,6 +56,7 @@ enum {
FRA_OIFNAME,
FRA_PAD,
FRA_L3MDEV, /* iif or oif is l3mdev goto its table */
+   FRA_UID_RANGE,  /* UID range */
__FRA_MAX
 };
 
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 5a78be5..e14377f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -318,6 +318,7 @@ enum rtattr_type_t {
RTA_ENCAP,
RTA_EXPIRES,
RTA_PAD,
+   RTA_UID,
__RTA_MAX
 };
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index be4629c..5de436a 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,11 @@
 #include 
 #include 
 
+static const struct fib_kuid_range fib_kuid_range_unset = {
+   KUIDT_INIT(0),
+   KUIDT_INIT(~0),
+};
+
 int fib_default_rule_add(struct fib_rules_ops *ops,
 u32 pref, u32 table, u32 flags)
 {
@@ -33,6 +38,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->table = table;
r->flags = flags;
r->fr_net = ops->fro_net;
+  

Re: net/netlink: null-ptr-deref in netlink_dump/lock_acquire

2016-11-03 Thread Andrey Konovalov
Hi,

Another report that looks related:

[ INFO: possible circular locking dependency detected ]
4.9.0-rc3+ #344 Not tainted
---
syz-executor/25526 is trying to acquire lock:
 ([  950.351060] [i].mutex
[] nfnl_lock+0x28/0x30 net/netfilter/nfnetlink.c:61

but task is already holding lock:
 ([  950.351060] rtnl_mutex
[< inline >] rtnl_lock net/core/rtnetlink.c:70
[] rtnetlink_rcv+0x1b/0x40 net/core/rtnetlink.c:4032

which lock already depends on the new lock.


the existing dependency chain (in reverse order) is:

:
   [  950.351060] [] lock_acquire+0x17e/0x340
kernel/locking/lockdep.c:3746
   [  950.351060] [< inline >] __mutex_lock_common
kernel/locking/mutex.c:521
   [  950.351060] []
mutex_lock_nested+0xb1/0x890 kernel/locking/mutex.c:621
   [  950.351060] [] rtnl_lock+0x17/0x20
net/core/rtnetlink.c:70
   [  950.351060] []
nl80211_get_reg_dump+0x5f/0x400 net/wireless/nl80211.c:6171
   [  950.351060] [] genl_lock_dumpit+0x68/0x90
net/netlink/genetlink.c:517
   [  950.351060] [] netlink_dump+0x397/0xac0
net/netlink/af_netlink.c:2110
   [  950.351060] []
__netlink_dump_start+0x501/0x770 net/netlink/af_netlink.c:2200
   [  950.351060] []
genl_family_rcv_msg+0xad8/0xc80 net/netlink/genetlink.c:584
   [  950.351060] [] genl_rcv_msg+0x1b6/0x270
net/netlink/genetlink.c:658
   [  950.351060] [] netlink_rcv_skb+0x2c0/0x3b0
net/netlink/af_netlink.c:2281
   [  950.351060] [] genl_rcv+0x28/0x40
net/netlink/genetlink.c:669
   [  950.351060] [< inline >] netlink_unicast_kernel
net/netlink/af_netlink.c:1214
   [  950.351060] [] netlink_unicast+0x5a9/0x880
net/netlink/af_netlink.c:1240
   [  950.351060] [] netlink_sendmsg+0x9b7/0xce0
net/netlink/af_netlink.c:1786
   [  950.351060] [< inline >] sock_sendmsg_nosec net/socket.c:606
   [  950.351060] [] sock_sendmsg+0xcc/0x110
net/socket.c:616
   [  950.351060] [] sock_write_iter+0x221/0x3b0
net/socket.c:814
   [  950.351060] [< inline >] new_sync_write fs/read_write.c:499
   [  950.351060] [] __vfs_write+0x334/0x570
fs/read_write.c:512
   [  950.351060] [] vfs_write+0x17b/0x500
fs/read_write.c:560
   [  950.351060] [< inline >] SYSC_write fs/read_write.c:607
   [  950.351060] [] SyS_write+0xd4/0x1a0
fs/read_write.c:599
   [  950.351060] [] entry_SYSCALL_64_fastpath+0x1f/0xc2

:
   [  950.351060] [] lock_acquire+0x17e/0x340
kernel/locking/lockdep.c:3746
   [  950.351060] [< inline >] __mutex_lock_common
kernel/locking/mutex.c:521
   [  950.351060] []
mutex_lock_nested+0xb1/0x890 kernel/locking/mutex.c:621
   [  950.351060] [< inline >] genl_lock net/netlink/genetlink.c:31
   [  950.351060] [] genl_lock_dumpit+0x41/0x90
net/netlink/genetlink.c:516
   [  950.351060] [] netlink_dump+0x397/0xac0
net/netlink/af_netlink.c:2110
   [  950.351060] []
__netlink_dump_start+0x501/0x770 net/netlink/af_netlink.c:2200
   [  950.351060] []
genl_family_rcv_msg+0xad8/0xc80 net/netlink/genetlink.c:584
   [  950.351060] [] genl_rcv_msg+0x1b6/0x270
net/netlink/genetlink.c:658
   [  950.351060] [] netlink_rcv_skb+0x2c0/0x3b0
net/netlink/af_netlink.c:2281
   [  950.351060] [] genl_rcv+0x28/0x40
net/netlink/genetlink.c:669
   [  950.351060] [< inline >] netlink_unicast_kernel
net/netlink/af_netlink.c:1214
   [  950.351060] [] netlink_unicast+0x5a9/0x880
net/netlink/af_netlink.c:1240
   [  950.351060] [] netlink_sendmsg+0x9b7/0xce0
net/netlink/af_netlink.c:1786
   [  950.351060] [< inline >] sock_sendmsg_nosec net/socket.c:606
   [  950.351060] [] sock_sendmsg+0xcc/0x110
net/socket.c:616
   [  950.351060] [] sock_write_iter+0x221/0x3b0
net/socket.c:814
   [  950.351060] [< inline >] new_sync_write fs/read_write.c:499
   [  950.351060] [] __vfs_write+0x334/0x570
fs/read_write.c:512
   [  950.351060] [] vfs_write+0x17b/0x500
fs/read_write.c:560
   [  950.351060] [< inline >] SYSC_write fs/read_write.c:607
   [  950.351060] [] SyS_write+0xd4/0x1a0
fs/read_write.c:599
   [  950.351060] [] entry_SYSCALL_64_fastpath+0x1f/0xc2

:
   [  950.351060] [] lock_acquire+0x17e/0x340
kernel/locking/lockdep.c:3746
   [  950.351060] [< inline >] __mutex_lock_common
kernel/locking/mutex.c:521
   [  950.351060] []
mutex_lock_nested+0xb1/0x890 kernel/locking/mutex.c:621
   [  950.351060] []
__netlink_dump_start+0xfa/0x770 net/netlink/af_netlink.c:2170
   [  950.351060] [< inline >] netlink_dump_start
include/linux/netlink.h:165
   [  950.351060] []
ctnetlink_stat_ct_cpu+0xd9/0x130
net/netfilter/nf_conntrack_netlink.c:2045
   [  950.351060] []
nfnetlink_rcv_msg+0xa10/0xc10 net/netfilter/nfnetlink.c:212
   [  950.351060] [] netlink_rcv_skb+0x2c0/0x3b0
net/netlink/af_netlink.c:2281
   [  950.351060] [] nfnetlink_rcv+0x848/0x1170

[PATCH net-next resend 04/13] nfp: reuse ring helpers on .ndo_open() path

2016-11-03 Thread Jakub Kicinski
Ring allocation helpers encapsulate all ring allocation and
initialization steps nicely.  Reuse them on .ndo_open() path.

Signed-off-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 60 --
 1 file changed, 20 insertions(+), 40 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index b7b2851ebb6b..50aeaea9e318 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2051,6 +2051,13 @@ static void nfp_net_open_stack(struct nfp_net *nn)
 static int nfp_net_netdev_open(struct net_device *netdev)
 {
struct nfp_net *nn = netdev_priv(netdev);
+   struct nfp_net_ring_set rx = {
+   .mtu = nn->netdev->mtu,
+   .dcnt = nn->rxd_cnt,
+   };
+   struct nfp_net_ring_set tx = {
+   .dcnt = nn->txd_cnt,
+   };
int err, r;
 
if (nn->ctrl & NFP_NET_CFG_CTRL_ENABLE) {
@@ -2075,38 +2082,22 @@ static int nfp_net_netdev_open(struct net_device 
*netdev)
goto err_free_exn;
disable_irq(nn->irq_entries[NFP_NET_IRQ_LSC_IDX].vector);
 
-   nn->rx_rings = kcalloc(nn->num_rx_rings, sizeof(*nn->rx_rings),
-  GFP_KERNEL);
-   if (!nn->rx_rings) {
-   err = -ENOMEM;
-   goto err_free_lsc;
-   }
-   nn->tx_rings = kcalloc(nn->num_tx_rings, sizeof(*nn->tx_rings),
-  GFP_KERNEL);
-   if (!nn->tx_rings) {
-   err = -ENOMEM;
-   goto err_free_rx_rings;
-   }
-
for (r = 0; r < nn->num_r_vecs; r++) {
err = nfp_net_prepare_vector(nn, >r_vecs[r], r);
if (err)
goto err_cleanup_vec_p;
}
-   for (r = 0; r < nn->num_tx_rings; r++) {
-   err = nfp_net_tx_ring_alloc(nn->r_vecs[r].tx_ring, nn->txd_cnt);
-   if (err)
-   goto err_free_tx_ring_p;
+
+   nn->rx_rings = nfp_net_rx_ring_set_prepare(nn, &rx);
+   if (!nn->rx_rings) {
+   err = -ENOMEM;
+   goto err_cleanup_vec;
}
-   for (r = 0; r < nn->num_rx_rings; r++) {
-   err = nfp_net_rx_ring_alloc(nn->r_vecs[r].rx_ring,
-   nn->fl_bufsz, nn->rxd_cnt);
-   if (err)
-   goto err_flush_free_rx_ring_p;
 
-   err = nfp_net_rx_ring_bufs_alloc(nn, nn->r_vecs[r].rx_ring);
-   if (err)
-   goto err_free_rx_ring_p;
+   nn->tx_rings = nfp_net_tx_ring_set_prepare(nn, &tx);
+   if (!nn->tx_rings) {
+   err = -ENOMEM;
+   goto err_free_rx_rings;
}
 
err = netif_set_real_num_tx_queues(netdev, nn->num_tx_rings);
@@ -2139,25 +2130,14 @@ static int nfp_net_netdev_open(struct net_device 
*netdev)
return 0;
 
 err_free_rings:
-   r = nn->num_rx_rings;
-err_flush_free_rx_ring_p:
-   while (r--) {
-   nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
-err_free_rx_ring_p:
-   nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
-   }
-   r = nn->num_tx_rings;
-err_free_tx_ring_p:
-   while (r--)
-   nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
+   nfp_net_tx_ring_set_free(nn, &tx);
+err_free_rx_rings:
+   nfp_net_rx_ring_set_free(nn, &rx);
+err_cleanup_vec:
r = nn->num_r_vecs;
 err_cleanup_vec_p:
while (r--)
nfp_net_cleanup_vector(nn, >r_vecs[r]);
-   kfree(nn->tx_rings);
-err_free_rx_rings:
-   kfree(nn->rx_rings);
-err_free_lsc:
nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 err_free_exn:
nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
-- 
1.9.1



Re: [PATCH net-next v2 1/1] driver: veth: Refine the statistics codes of veth driver

2016-11-03 Thread David Miller
From: Gao Feng 
Date: Thu, 3 Nov 2016 22:38:28 +0800

> On Thu, Nov 3, 2016 at 10:31 PM, Eric Dumazet  wrote:
>> On Thu, 2016-11-03 at 21:39 +0800, Gao Feng wrote:
>>> Hi Eric,
>>>
>>> On Thu, Nov 3, 2016 at 9:30 PM, Eric Dumazet  wrote:
>>> > On Thu, 2016-11-03 at 21:03 +0800, f...@ikuai8.com wrote:
>>> >> From: Gao Feng 
>>> >>
>>> >> The dropped count of veth is located in struct veth_priv, but other
>>> >> statistics like packets and bytes are in another struct pcpu_vstats.
>>> >> Now keep these three counters in the same struct.
>>> >>
>>> >> Signed-off-by: Gao Feng 
>>> >> ---
>>> >>  v2: Use right "peer" instead of "dev";
>>> >>  v1: Initial version
>>> >
>>> > May I ask : Why ?
>>>
>>> Just because I think statistics should be in the same struct.
>>
>> That is not a good reason then.
> 
> Because other net devices put the statistics together.

Organizational "prettiness" is not an argument for this change, when the
downsides are fundamentally clear:

1) It is not a fast-path accessed statistic, so the per-cpu'ness is
   not important.

2) We aim to minimize the amount of per-cpu data in the kernel because
   it is expensive.  So when not necessary, as is the case here, we
   do not use per-cpu data.

There are no good reasons to make this change, so I am dropping your
patch.


[PATCH net-next resend 03/13] nfp: rename ring allocation helpers

2016-11-03 Thread Jakub Kicinski
"Shadow" in ring helpers used to mean that the helper will allocate
rings without touching existing configuration, this was used for
reconfiguration while the device was running.  We will soon use
the same helpers for .ndo_open() path, so replace "shadow" with
"ring_set".

No functional changes.

Signed-off-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 26 +++---
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index e58532d27c5b..b7b2851ebb6b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1573,7 +1573,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 }
 
 static struct nfp_net_tx_ring *
-nfp_net_shadow_tx_rings_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_tx_ring_set_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_tx_ring *rings;
unsigned int r;
@@ -1599,7 +1599,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 }
 
 static void
-nfp_net_shadow_tx_rings_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_tx_ring_set_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_tx_ring *rings = s->rings;
struct nfp_net_ring_set new = *s;
@@ -1616,7 +1616,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 }
 
 static void
-nfp_net_shadow_tx_rings_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_tx_ring_set_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_tx_ring *rings = s->rings;
unsigned int r;
@@ -1693,7 +1693,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 }
 
 static struct nfp_net_rx_ring *
-nfp_net_shadow_rx_rings_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_rx_ring_set_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
unsigned int fl_bufsz = nfp_net_calc_fl_bufsz(nn, s->mtu);
struct nfp_net_rx_ring *rings;
@@ -1726,7 +1726,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 }
 
 static void
-nfp_net_shadow_rx_rings_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_rx_ring_set_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_rx_ring *rings = s->rings;
struct nfp_net_ring_set new = *s;
@@ -1746,7 +1746,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 }
 
 static void
-nfp_net_shadow_rx_rings_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_rx_ring_set_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_rx_ring *rings = s->rings;
unsigned int r;
@@ -2268,9 +2268,9 @@ static void nfp_net_set_rx_mode(struct net_device *netdev)
 struct nfp_net_ring_set *tx)
 {
if (rx)
-   nfp_net_shadow_rx_rings_swap(nn, rx);
+   nfp_net_rx_ring_set_swap(nn, rx);
if (tx)
-   nfp_net_shadow_tx_rings_swap(nn, tx);
+   nfp_net_tx_ring_set_swap(nn, tx);
 
return __nfp_net_set_config_and_enable(nn);
 }
@@ -2299,11 +2299,11 @@ static void nfp_net_set_rx_mode(struct net_device 
*netdev)
 
/* Prepare new rings */
if (rx) {
-   if (!nfp_net_shadow_rx_rings_prepare(nn, rx))
+   if (!nfp_net_rx_ring_set_prepare(nn, rx))
return -ENOMEM;
}
if (tx) {
-   if (!nfp_net_shadow_tx_rings_prepare(nn, tx)) {
+   if (!nfp_net_tx_ring_set_prepare(nn, tx)) {
err = -ENOMEM;
goto err_free_rx;
}
@@ -2327,9 +2327,9 @@ static void nfp_net_set_rx_mode(struct net_device *netdev)
}
 
if (rx)
-   nfp_net_shadow_rx_rings_free(nn, rx);
+   nfp_net_rx_ring_set_free(nn, rx);
if (tx)
-   nfp_net_shadow_tx_rings_free(nn, tx);
+   nfp_net_tx_ring_set_free(nn, tx);
 
nfp_net_open_stack(nn);
 
@@ -2337,7 +2337,7 @@ static void nfp_net_set_rx_mode(struct net_device *netdev)
 
 err_free_rx:
if (rx)
-   nfp_net_shadow_rx_rings_free(nn, rx);
+   nfp_net_rx_ring_set_free(nn, rx);
return err;
 }
 
-- 
1.9.1



[PATCH net-next resend 05/13] nfp: loosen relation between rings and IRQs vectors

2016-11-03 Thread Jakub Kicinski
Upcoming XDP support will break the assumption that one can iterate
over IRQ vectors to get to all the rings easily.  Use nn->.x_ring
arrays directly.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 50aeaea9e318..97cc21eae466 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1920,9 +1920,9 @@ static void nfp_net_clear_config_and_disable(struct 
nfp_net *nn)
nn_err(nn, "Could not disable device: %d\n", err);
 
for (r = 0; r < nn->num_rx_rings; r++)
-   nfp_net_rx_ring_reset(nn->r_vecs[r].rx_ring);
+   nfp_net_rx_ring_reset(>rx_rings[r]);
for (r = 0; r < nn->num_tx_rings; r++)
-   nfp_net_tx_ring_reset(nn, nn->r_vecs[r].tx_ring);
+   nfp_net_tx_ring_reset(nn, >tx_rings[r]);
for (r = 0; r < nn->num_r_vecs; r++)
nfp_net_vec_clear_ring_data(nn, r);
 
@@ -2000,7 +2000,7 @@ static int __nfp_net_set_config_and_enable(struct nfp_net 
*nn)
nn->ctrl = new_ctrl;
 
for (r = 0; r < nn->num_rx_rings; r++)
-   nfp_net_rx_ring_fill_freelist(nn->r_vecs[r].rx_ring);
+   nfp_net_rx_ring_fill_freelist(>rx_rings[r]);
 
/* Since reconfiguration requests while NFP is down are ignored we
 * have to wipe the entire VXLAN configuration and reinitialize it.
@@ -2173,11 +2173,11 @@ static void nfp_net_close_free_all(struct nfp_net *nn)
unsigned int r;
 
for (r = 0; r < nn->num_rx_rings; r++) {
-   nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
-   nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
+   nfp_net_rx_ring_bufs_free(nn, >rx_rings[r]);
+   nfp_net_rx_ring_free(>rx_rings[r]);
}
for (r = 0; r < nn->num_tx_rings; r++)
-   nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
+   nfp_net_tx_ring_free(>tx_rings[r]);
for (r = 0; r < nn->num_r_vecs; r++)
nfp_net_cleanup_vector(nn, >r_vecs[r]);
 
-- 
1.9.1



Re: [PATCH net-next v1 00/21] amd-xgbe: AMD XGBE driver updates 2016-11-01

2016-11-03 Thread David Miller

20+ patches is too many to submit at one time.

Making huge patch series submissions puts an unreasonable burden on
those developers who might decide to review your work, including me.

Please keep your series down to a small, reasonable, size.  Perhaps
10-15 patches maximum.

I will not be looking at nor applying this series.


[PATCH net-next resend 09/13] nfp: reorganize nfp_net_rx() to get packet offsets early

2016-11-03 Thread Jakub Kicinski
Calculate packet offsets early in nfp_net_rx() so that we will be
able to use them in upcoming XDP handler.  While at it move relevant
variables into the loop scope.

Signed-off-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 56 --
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 506362729607..2ab63661a6fd 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1383,16 +1383,17 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, 
int budget)
 {
struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
struct nfp_net *nn = r_vec->nfp_net;
-   unsigned int data_len, meta_len;
-   struct nfp_net_rx_buf *rxbuf;
-   struct nfp_net_rx_desc *rxd;
-   dma_addr_t new_dma_addr;
struct sk_buff *skb;
int pkts_polled = 0;
-   void *new_frag;
int idx;
 
while (pkts_polled < budget) {
+   unsigned int meta_len, data_len, data_off, pkt_len, pkt_off;
+   struct nfp_net_rx_buf *rxbuf;
+   struct nfp_net_rx_desc *rxd;
+   dma_addr_t new_dma_addr;
+   void *new_frag;
+
idx = rx_ring->rd_p & (rx_ring->cnt - 1);
 
rxd = _ring->rxds[idx];
@@ -1408,22 +1409,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, 
int budget)
pkts_polled++;
 
rxbuf = _ring->rxbufs[idx];
-   skb = build_skb(rxbuf->frag, nn->fl_bufsz);
-   if (unlikely(!skb)) {
-   nfp_net_rx_drop(r_vec, rx_ring, rxbuf, NULL);
-   continue;
-   }
-   new_frag = nfp_net_napi_alloc_one(nn, _dma_addr);
-   if (unlikely(!new_frag)) {
-   nfp_net_rx_drop(r_vec, rx_ring, rxbuf, skb);
-   continue;
-   }
-
-   nfp_net_dma_unmap_rx(nn, rx_ring->rxbufs[idx].dma_addr,
-nn->fl_bufsz, DMA_FROM_DEVICE);
-
-   nfp_net_rx_give_one(rx_ring, new_frag, new_dma_addr);
-
/* < meta_len >
 *  <-- [rx_offset] -->
 *  -
@@ -1438,20 +1423,39 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, 
int budget)
 */
meta_len = rxd->rxd.meta_len_dd & PCIE_DESC_RX_META_LEN_MASK;
data_len = le16_to_cpu(rxd->rxd.data_len);
+   pkt_len = data_len - meta_len;
 
if (nn->rx_offset == NFP_NET_CFG_RX_OFFSET_DYNAMIC)
-   skb_reserve(skb, NFP_NET_RX_BUF_HEADROOM + meta_len);
+   pkt_off = meta_len;
else
-   skb_reserve(skb,
-   NFP_NET_RX_BUF_HEADROOM + nn->rx_offset);
-   skb_put(skb, data_len - meta_len);
+   pkt_off = nn->rx_offset;
+   data_off = NFP_NET_RX_BUF_HEADROOM + pkt_off;
 
/* Stats update */
u64_stats_update_begin(_vec->rx_sync);
r_vec->rx_pkts++;
-   r_vec->rx_bytes += skb->len;
+   r_vec->rx_bytes += pkt_len;
u64_stats_update_end(_vec->rx_sync);
 
+   skb = build_skb(rxbuf->frag, nn->fl_bufsz);
+   if (unlikely(!skb)) {
+   nfp_net_rx_drop(r_vec, rx_ring, rxbuf, NULL);
+   continue;
+   }
+   new_frag = nfp_net_napi_alloc_one(nn, _dma_addr);
+   if (unlikely(!new_frag)) {
+   nfp_net_rx_drop(r_vec, rx_ring, rxbuf, skb);
+   continue;
+   }
+
+   nfp_net_dma_unmap_rx(nn, rx_ring->rxbufs[idx].dma_addr,
+nn->fl_bufsz, DMA_FROM_DEVICE);
+
+   nfp_net_rx_give_one(rx_ring, new_frag, new_dma_addr);
+
+   skb_reserve(skb, data_off);
+   skb_put(skb, pkt_len);
+
if (nn->fw_ver.major <= 3) {
nfp_net_set_hash_desc(nn->netdev, skb, rxd);
} else if (meta_len) {
-- 
1.9.1



[PATCH] net: icmp_route_lookup should use rt dev to determine L3 domain

2016-11-03 Thread David Ahern
icmp_send is called in response to some event. The skb may not have
the device set (skb->dev is NULL), but it is expected to have an rt.
Update icmp_route_lookup to use the rt on the skb to determine L3
domain.

Fixes: 613d09b30f8b ("net: Use VRF device index for lookups on TX")
Signed-off-by: David Ahern 
---
 net/ipv4/icmp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 38abe70e595f..774a15e9f041 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -477,7 +477,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
-   fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
+   fl4->flowi4_oif = l3mdev_master_ifindex(skb_rtable(skb_in)->dst.dev);
 
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
rt = __ip_route_output_key_hash(net, fl4,
@@ -502,7 +502,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
if (err)
goto relookup_failed;
 
-   if (inet_addr_type_dev_table(net, skb_in->dev,
+   if (inet_addr_type_dev_table(net, skb_rtable(skb_in)->dst.dev,
 fl4_dec.saddr) == RTN_LOCAL) {
rt2 = __ip_route_output_key(net, _dec);
if (IS_ERR(rt2))
-- 
2.1.4



[PATCH net-next resend 08/13] nfp: add support for ethtool .set_channels

2016-11-03 Thread Jakub Kicinski
Allow changing the number of rings via ethtool .set_channels API.
Runtime reconfig needs to be extended to handle number of rings.
We need to be able to activate interrupt vectors before rings are
assigned to them.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 93 +-
 .../net/ethernet/netronome/nfp/nfp_net_debugfs.c   |  4 +-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   | 47 +++
 4 files changed, 123 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 14b5e21cabf1..486e7c6453bc 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -584,6 +584,7 @@ struct nfp_net {
 };
 
 struct nfp_net_ring_set {
+   unsigned int n_rings;
unsigned int mtu;
unsigned int dcnt;
void *rings;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 09cec6e2c6cf..506362729607 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -494,7 +494,7 @@ static void nfp_net_irqs_assign(struct net_device *netdev)
nn->lsc_handler = nfp_net_irq_lsc;
nn->exn_handler = nfp_net_irq_exn;
 
-   for (r = 0; r < nn->num_r_vecs; r++) {
+   for (r = 0; r < nn->max_r_vecs; r++) {
r_vec = >r_vecs[r];
r_vec->nfp_net = nn;
r_vec->handler = nfp_net_irq_rxtx;
@@ -1578,12 +1578,12 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
struct nfp_net_tx_ring *rings;
unsigned int r;
 
-   rings = kcalloc(nn->num_tx_rings, sizeof(*rings), GFP_KERNEL);
+   rings = kcalloc(s->n_rings, sizeof(*rings), GFP_KERNEL);
if (!rings)
return NULL;
 
-   for (r = 0; r < nn->num_tx_rings; r++) {
-   nfp_net_tx_ring_init([r], nn->tx_rings[r].r_vec, r);
+   for (r = 0; r < s->n_rings; r++) {
+   nfp_net_tx_ring_init([r], >r_vecs[r], r);
 
if (nfp_net_tx_ring_alloc([r], s->dcnt))
goto err_free_prev;
@@ -1605,9 +1605,11 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 
s->dcnt = nn->txd_cnt;
s->rings = nn->tx_rings;
+   s->n_rings = nn->num_tx_rings;
 
nn->txd_cnt = new.dcnt;
nn->tx_rings = new.rings;
+   nn->num_tx_rings = new.n_rings;
 }
 
 static void
@@ -1616,7 +1618,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
struct nfp_net_tx_ring *rings = s->rings;
unsigned int r;
 
-   for (r = 0; r < nn->num_tx_rings; r++)
+   for (r = 0; r < s->n_rings; r++)
nfp_net_tx_ring_free([r]);
 
kfree(rings);
@@ -1694,12 +1696,12 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
struct nfp_net_rx_ring *rings;
unsigned int r;
 
-   rings = kcalloc(nn->num_rx_rings, sizeof(*rings), GFP_KERNEL);
+   rings = kcalloc(s->n_rings, sizeof(*rings), GFP_KERNEL);
if (!rings)
return NULL;
 
-   for (r = 0; r < nn->num_rx_rings; r++) {
-   nfp_net_rx_ring_init([r], nn->rx_rings[r].r_vec, r);
+   for (r = 0; r < s->n_rings; r++) {
+   nfp_net_rx_ring_init([r], >r_vecs[r], r);
 
if (nfp_net_rx_ring_alloc([r], fl_bufsz, s->dcnt))
goto err_free_prev;
@@ -1728,11 +1730,13 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
s->mtu = nn->netdev->mtu;
s->dcnt = nn->rxd_cnt;
s->rings = nn->rx_rings;
+   s->n_rings = nn->num_rx_rings;
 
nn->netdev->mtu = new.mtu;
nn->fl_bufsz = nfp_net_calc_fl_bufsz(nn, new.mtu);
nn->rxd_cnt = new.dcnt;
nn->rx_rings = new.rings;
+   nn->num_rx_rings = new.n_rings;
 }
 
 static void
@@ -1741,7 +1745,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
struct nfp_net_rx_ring *rings = s->rings;
unsigned int r;
 
-   for (r = 0; r < nn->num_rx_rings; r++) {
+   for (r = 0; r < s->n_rings; r++) {
nfp_net_rx_ring_bufs_free(nn, [r]);
nfp_net_rx_ring_free([r]);
}
@@ -1764,19 +1768,20 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
struct msix_entry *entry = >irq_entries[r_vec->irq_idx];
int err;
 
+   /* Setup NAPI */
+   netif_napi_add(nn->netdev, _vec->napi,
+  nfp_net_poll, NAPI_POLL_WEIGHT);
+
snprintf(r_vec->name, sizeof(r_vec->name),
 "%s-rxtx-%d", nn->netdev->name, idx);
err = request_irq(entry->vector, r_vec->handler, 0, r_vec->name, r_vec);
   

[PATCH net-next resend 07/13] nfp: move RSS indirection table init into a separate function

2016-11-03 Thread Jakub Kicinski
We will need to rerun the initialization of the RSS indirection table
after the number of rings is changed.  Move the code to a separate
function.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 2a4e1f1cb3c9..09cec6e2c6cf 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2229,6 +2229,15 @@ static void nfp_net_set_rx_mode(struct net_device 
*netdev)
nn->ctrl = new_ctrl;
 }
 
+static void nfp_net_rss_init_itbl(struct nfp_net *nn)
+{
+   int i;
+
+   for (i = 0; i < sizeof(nn->rss_itbl); i++)
+   nn->rss_itbl[i] =
+   ethtool_rxfh_indir_default(i, nn->num_rx_rings);
+}
+
 static int
 nfp_net_ring_swap_enable(struct nfp_net *nn,
 struct nfp_net_ring_set *rx,
@@ -2707,13 +2716,9 @@ void nfp_net_netdev_free(struct nfp_net *nn)
  */
 static void nfp_net_rss_init(struct nfp_net *nn)
 {
-   int i;
-
netdev_rss_key_fill(nn->rss_key, NFP_NET_CFG_RSS_KEY_SZ);
 
-   for (i = 0; i < sizeof(nn->rss_itbl); i++)
-   nn->rss_itbl[i] =
-   ethtool_rxfh_indir_default(i, nn->num_rx_rings);
+   nfp_net_rss_init_itbl(nn);
 
/* Enable IPv4/IPv6 TCP by default */
nn->rss_cfg = NFP_NET_CFG_RSS_IPV4_TCP |
-- 
1.9.1



[PATCH net-next resend 01/13] nfp: add support for ethtool .get_channels

2016-11-03 Thread Jakub Kicinski
Report number of rings via ethtool .get_channels API.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 3418f2277e9d..a7386d1b2883 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -614,6 +614,21 @@ static int nfp_net_set_coalesce(struct net_device *netdev,
return nfp_net_reconfig(nn, NFP_NET_CFG_UPDATE_IRQMOD);
 }
 
+static void nfp_net_get_channels(struct net_device *netdev,
+struct ethtool_channels *channel)
+{
+   struct nfp_net *nn = netdev_priv(netdev);
+
+   channel->max_rx = min(nn->max_rx_rings, nn->max_r_vecs);
+   channel->max_tx = min(nn->max_tx_rings, nn->max_r_vecs);
+   channel->max_combined = min(channel->max_rx, channel->max_tx);
+   channel->max_other = NFP_NET_NON_Q_VECTORS;
+   channel->combined_count = min(nn->num_rx_rings, nn->num_tx_rings);
+   channel->rx_count = nn->num_rx_rings - channel->combined_count;
+   channel->tx_count = nn->num_tx_rings - channel->combined_count;
+   channel->other_count = NFP_NET_NON_Q_VECTORS;
+}
+
 static const struct ethtool_ops nfp_net_ethtool_ops = {
.get_drvinfo= nfp_net_get_drvinfo,
.get_link   = ethtool_op_get_link,
@@ -632,6 +647,7 @@ static int nfp_net_set_coalesce(struct net_device *netdev,
.get_regs   = nfp_net_get_regs,
.get_coalesce   = nfp_net_get_coalesce,
.set_coalesce   = nfp_net_set_coalesce,
+   .get_channels   = nfp_net_get_channels,
 };
 
 void nfp_net_set_ethtool_ops(struct net_device *netdev)
-- 
1.9.1



[PATCH net-next resend 13/13] nfp: add support for offload of XDP programs

2016-11-03 Thread Jakub Kicinski
Most infrastructure can be reused, provide separate handling
of context offsets and exit codes.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_bpf.h   |  1 +
 drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c   | 92 +-
 .../net/ethernet/netronome/nfp/nfp_bpf_verifier.c  |  3 +
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  2 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 44 ++-
 .../net/ethernet/netronome/nfp/nfp_net_offload.c   |  3 +
 6 files changed, 139 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_bpf.h 
b/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
index 87aa8a3e9112..76a19f1796af 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
@@ -62,6 +62,7 @@ enum nfp_bpf_action_type {
NN_ACT_TC_DROP,
NN_ACT_TC_REDIR,
NN_ACT_DIRECT,
+   NN_ACT_XDP,
 };
 
 /* Software register representation, hardware encoding in asm.h */
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c 
b/drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c
index f8df5300f49c..335beb8b8b45 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c
@@ -1126,7 +1126,7 @@ static int data_ind_ld4(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
 meta->insn.src_reg * 2, true, 4);
 }
 
-static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int mem_ldx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
if (meta->insn.off == offsetof(struct sk_buff, len))
emit_alu(nfp_prog, reg_both(meta->insn.dst_reg * 2),
@@ -1134,12 +1134,42 @@ static int mem_ldx4(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
else
return -ENOTSUPP;
 
-   wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
+   return 0;
+}
+
+static int mem_ldx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   u32 dst = reg_both(meta->insn.dst_reg * 2);
+
+   if (meta->insn.off != offsetof(struct xdp_md, data) &&
+   meta->insn.off != offsetof(struct xdp_md, data_end))
+   return -ENOTSUPP;
+
+   emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT);
+
+   if (meta->insn.off == offsetof(struct xdp_md, data))
+   return 0;
+
+   emit_alu(nfp_prog, dst, dst, ALU_OP_ADD, NFP_BPF_ABI_LEN);
 
return 0;
 }
 
-static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   int ret;
+
+   if (nfp_prog->act == NN_ACT_XDP)
+   ret = mem_ldx4_xdp(nfp_prog, meta);
+   else
+   ret = mem_ldx4_skb(nfp_prog, meta);
+
+   wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
+
+   return ret;
+}
+
+static int mem_stx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
if (meta->insn.off == offsetof(struct sk_buff, mark))
return wrp_set_mark(nfp_prog, meta->insn.src_reg * 2);
@@ -1147,6 +1177,18 @@ static int mem_stx4(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
return -ENOTSUPP;
 }
 
+static int mem_stx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   return -ENOTSUPP;
+}
+
+static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   if (nfp_prog->act == NN_ACT_XDP)
+   return mem_stx4_xdp(nfp_prog, meta);
+   return mem_stx4_skb(nfp_prog, meta);
+}
+
 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
if (meta->insn.off < 0) /* TODO */
@@ -1530,6 +1572,47 @@ static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
 }
 
+static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
+{
+   /* XDP return codes:
+*   0 aborted  0x82 -> drop,  count as stat3
+*   1drop  0x22 -> drop,  count as stat1
+*   2pass  0x11 -> pass,  count as stat0
+*   3  tx  0x44 -> redir, count as stat2
+*   * unknown  0x82 -> drop,  count as stat3
+*/
+   /* Target for aborts */
+   nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
+
+   emit_br_def(nfp_prog, nfp_prog->tgt_done, 2);
+
+   emit_alu(nfp_prog, reg_a(0),
+reg_none(), ALU_OP_NONE, NFP_BPF_ABI_FLAGS);
+   emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
+
+   /* Target for normal exits */
+   nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
+
+   /* if R0 > 3 jump to abort */
+   emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
+   emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
+
+   wrp_immed(nfp_prog, 

[PATCH net-next resend 11/13] nfp: add XDP support in the driver

2016-11-03 Thread Jakub Kicinski
Add XDP support.  Separate stack's and XDP's TX rings logically.
Add functions for handling XDP_TX and cleanup of XDP's TX rings.
For XDP allocate all RX buffers as separate pages and map them
with DMA_BIDIRECTIONAL.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  17 +-
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 474 +
 .../net/ethernet/netronome/nfp/nfp_net_debugfs.c   |  37 +-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  22 +-
 4 files changed, 449 insertions(+), 101 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 486e7c6453bc..abc9e56e93b8 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -171,7 +171,10 @@ struct nfp_net_tx_desc {
  * on the head's buffer). Equal to skb->len for non-TSO packets.
  */
 struct nfp_net_tx_buf {
-   struct sk_buff *skb;
+   union {
+   struct sk_buff *skb;
+   void *frag;
+   };
dma_addr_t dma_addr;
short int fidx;
u16 pkt_cnt;
@@ -341,6 +344,7 @@ struct nfp_net_rx_ring {
  * @napi:   NAPI structure for this ring vec
  * @tx_ring:Pointer to TX ring
  * @rx_ring:Pointer to RX ring
+ * @xdp_ring:  Pointer to an extra TX ring for XDP
  * @irq_idx:Index into MSI-X table
  * @rx_sync:   Seqlock for atomic updates of RX stats
  * @rx_pkts:Number of received packets
@@ -384,6 +388,8 @@ struct nfp_net_r_vector {
u64 hw_csum_rx_inner_ok;
u64 hw_csum_rx_error;
 
+   struct nfp_net_tx_ring *xdp_ring;
+
struct u64_stats_sync tx_sync;
u64 tx_pkts;
u64 tx_bytes;
@@ -432,6 +438,7 @@ struct nfp_stat_pair {
  * @ctrl:   Local copy of the control register/word.
  * @fl_bufsz:   Currently configured size of the freelist buffers
  * @rx_offset: Offset in the RX buffers where packet data starts
+ * @xdp_prog:  Installed XDP program
  * @cpp:Pointer to the CPP handle
  * @nfp_dev_cpp:Pointer to the NFP Device handle
  * @ctrl_area:  Pointer to the CPP area for the control BAR
@@ -451,6 +458,7 @@ struct nfp_stat_pair {
  * @max_tx_rings:   Maximum number of TX rings supported by the Firmware
  * @max_rx_rings:   Maximum number of RX rings supported by the Firmware
  * @num_tx_rings:   Currently configured number of TX rings
+ * @num_stack_tx_rings:Number of TX rings used by the stack (not XDP)
  * @num_rx_rings:   Currently configured number of RX rings
  * @txd_cnt:Size of the TX ring in number of descriptors
  * @rxd_cnt:Size of the RX ring in number of descriptors
@@ -500,6 +508,8 @@ struct nfp_net {
 
u32 rx_offset;
 
+   struct bpf_prog *xdp_prog;
+
struct nfp_net_tx_ring *tx_rings;
struct nfp_net_rx_ring *rx_rings;
 
@@ -532,6 +542,7 @@ struct nfp_net {
unsigned int max_rx_rings;
 
unsigned int num_tx_rings;
+   unsigned int num_stack_tx_rings;
unsigned int num_rx_rings;
 
int stride_tx;
@@ -779,8 +790,8 @@ struct nfp_net *
 int nfp_net_irqs_alloc(struct nfp_net *nn);
 void nfp_net_irqs_disable(struct nfp_net *nn);
 int
-nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_ring_set *rx,
- struct nfp_net_ring_set *tx);
+nfp_net_ring_reconfig(struct nfp_net *nn, struct bpf_prog **xdp_prog,
+ struct nfp_net_ring_set *rx, struct nfp_net_ring_set *tx);
 
 #ifdef CONFIG_NFP_NET_DEBUG
 void nfp_net_debugfs_create(void);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 2ab63661a6fd..fa43dbcecc4f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -41,6 +41,7 @@
  *  Chris Telfer 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -490,6 +491,7 @@ static void nfp_net_irqs_assign(struct net_device *netdev)
 
nn->num_rx_rings = min(nn->num_r_vecs, nn->num_rx_rings);
nn->num_tx_rings = min(nn->num_r_vecs, nn->num_tx_rings);
+   nn->num_stack_tx_rings = nn->num_tx_rings;
 
nn->lsc_handler = nfp_net_irq_lsc;
nn->exn_handler = nfp_net_irq_exn;
@@ -713,6 +715,13 @@ static void nfp_net_tx_csum(struct nfp_net *nn, struct 
nfp_net_r_vector *r_vec,
u64_stats_update_end(&r_vec->tx_sync);
 }
 
+static void nfp_net_tx_xmit_more_flush(struct nfp_net_tx_ring *tx_ring)
+{
+   wmb();
+   nfp_qcp_wr_ptr_add(tx_ring->qcp_q, tx_ring->wr_ptr_add);
+   tx_ring->wr_ptr_add = 0;
+}
+
 /**
  * nfp_net_tx() - Main transmit entry point
  * @skb:SKB to transmit
@@ -827,12 +836,8 @@ static int nfp_net_tx(struct sk_buff *skb, struct 
net_device *netdev)

[PATCH net-next resend 10/13] debugfs: constify argument to debugfs_real_fops()

2016-11-03 Thread Jakub Kicinski
seq_file users can only access const version of file pointer,
because the ->file member of struct seq_operations is marked
as such.  Make parameter to debugfs_real_fops() const.

CC: Greg Kroah-Hartman 
CC: Nicolai Stange 
CC: Christian Lamparter 
CC: LKML 
Signed-off-by: Jakub Kicinski 
---
 include/linux/debugfs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 4d3f0d1aec73..bf1907d96097 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -52,7 +52,8 @@ struct debugfs_regset32 {
  * Must only be called under the protection established by
  * debugfs_use_file_start().
  */
-static inline const struct file_operations *debugfs_real_fops(struct file 
*filp)
+static inline const struct file_operations *
+debugfs_real_fops(const struct file *filp)
__must_hold(&debugfs_srcu)
 {
/*
-- 
1.9.1



[PATCH net-next resend 12/13] nfp: remove unnecessary parameters from nfp_net_bpf_offload()

2016-11-03 Thread Jakub Kicinski
nfp_net_bpf_offload() takes all .setup_tc() parameters but it
doesn't use them at the moment.  Remove unnecessary ones to make
it possible for XDP to reuse this function.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h | 4 +---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c  | 2 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_offload.c | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index abc9e56e93b8..fd29a6306991 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -817,8 +817,6 @@ static inline void nfp_net_debugfs_adapter_del(struct 
nfp_net *nn)
 #endif /* CONFIG_NFP_NET_DEBUG */
 
 void nfp_net_filter_stats_timer(unsigned long data);
-int
-nfp_net_bpf_offload(struct nfp_net *nn, u32 handle, __be16 proto,
-   struct tc_cls_bpf_offload *cls_bpf);
+int nfp_net_bpf_offload(struct nfp_net *nn, struct tc_cls_bpf_offload 
*cls_bpf);
 
 #endif /* _NFP_NET_H_ */
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index fa43dbcecc4f..1e8e00d25c51 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2694,7 +2694,7 @@ static bool nfp_net_ebpf_capable(struct nfp_net *nn)
return -ENOTSUPP;
 
if (tc->type == TC_SETUP_CLSBPF && nfp_net_ebpf_capable(nn))
-   return nfp_net_bpf_offload(nn, handle, proto, tc->cls_bpf);
+   return nfp_net_bpf_offload(nn, tc->cls_bpf);
 
return -EINVAL;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
index cfed40c0e310..4bb6f16e2a7a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
@@ -233,9 +233,7 @@ static int nfp_net_bpf_stop(struct nfp_net *nn)
return nfp_net_reconfig(nn, NFP_NET_CFG_UPDATE_GEN);
 }
 
-int
-nfp_net_bpf_offload(struct nfp_net *nn, u32 handle, __be16 proto,
-   struct tc_cls_bpf_offload *cls_bpf)
+int nfp_net_bpf_offload(struct nfp_net *nn, struct tc_cls_bpf_offload *cls_bpf)
 {
struct nfp_bpf_result res;
dma_addr_t dma_addr;
-- 
1.9.1



[PATCH net-next resend 06/13] nfp: add helper to reassign rings to IRQ vectors

2016-11-03 Thread Jakub Kicinski
Instead of fixing ring -> vector relations up in ring swap functions
put the reassignment into a helper function which will reinit all
links.

Signed-off-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 40 +-
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 97cc21eae466..2a4e1f1cb3c9 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1601,16 +1601,11 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 static void
 nfp_net_tx_ring_set_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
-   struct nfp_net_tx_ring *rings = s->rings;
struct nfp_net_ring_set new = *s;
-   unsigned int r;
 
s->dcnt = nn->txd_cnt;
s->rings = nn->tx_rings;
 
-   for (r = 0; r < nn->num_tx_rings; r++)
-   nn->tx_rings[r].r_vec->tx_ring = [r];
-
nn->txd_cnt = new.dcnt;
nn->tx_rings = new.rings;
 }
@@ -1728,17 +1723,12 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 static void
 nfp_net_rx_ring_set_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
-   struct nfp_net_rx_ring *rings = s->rings;
struct nfp_net_ring_set new = *s;
-   unsigned int r;
 
s->mtu = nn->netdev->mtu;
s->dcnt = nn->rxd_cnt;
s->rings = nn->rx_rings;
 
-   for (r = 0; r < nn->num_rx_rings; r++)
-   nn->rx_rings[r].r_vec->rx_ring = [r];
-
nn->netdev->mtu = new.mtu;
nn->fl_bufsz = nfp_net_calc_fl_bufsz(nn, new.mtu);
nn->rxd_cnt = new.dcnt;
@@ -1759,6 +1749,14 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
kfree(rings);
 }
 
+static void
+nfp_net_vector_assign_rings(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
+   int idx)
+{
+   r_vec->rx_ring = idx < nn->num_rx_rings ? &nn->rx_rings[idx] : NULL;
+   r_vec->tx_ring = idx < nn->num_tx_rings ? &nn->tx_rings[idx] : NULL;
+}
+
 static int
 nfp_net_prepare_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
   int idx)
@@ -1766,20 +1764,6 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
struct msix_entry *entry = >irq_entries[r_vec->irq_idx];
int err;
 
-   if (idx < nn->num_tx_rings) {
-   r_vec->tx_ring = >tx_rings[idx];
-   nfp_net_tx_ring_init(r_vec->tx_ring, r_vec, idx);
-   } else {
-   r_vec->tx_ring = NULL;
-   }
-
-   if (idx < nn->num_rx_rings) {
-   r_vec->rx_ring = >rx_rings[idx];
-   nfp_net_rx_ring_init(r_vec->rx_ring, r_vec, idx);
-   } else {
-   r_vec->rx_ring = NULL;
-   }
-
snprintf(r_vec->name, sizeof(r_vec->name),
 "%s-rxtx-%d", nn->netdev->name, idx);
err = request_irq(entry->vector, r_vec->handler, 0, r_vec->name, r_vec);
@@ -2100,6 +2084,9 @@ static int nfp_net_netdev_open(struct net_device *netdev)
goto err_free_rx_rings;
}
 
+   for (r = 0; r < nn->max_r_vecs; r++)
+   nfp_net_vector_assign_rings(nn, &nn->r_vecs[r], r);
+
err = netif_set_real_num_tx_queues(netdev, nn->num_tx_rings);
if (err)
goto err_free_rings;
@@ -2247,11 +2234,16 @@ static void nfp_net_set_rx_mode(struct net_device 
*netdev)
 struct nfp_net_ring_set *rx,
 struct nfp_net_ring_set *tx)
 {
+   unsigned int r;
+
if (rx)
nfp_net_rx_ring_set_swap(nn, rx);
if (tx)
nfp_net_tx_ring_set_swap(nn, tx);
 
+   for (r = 0; r < nn->max_r_vecs; r++)
+   nfp_net_vector_assign_rings(nn, >r_vecs[r], r);
+
return __nfp_net_set_config_and_enable(nn);
 }
 
-- 
1.9.1



[PATCH net-next resend 00/13] ring reconfiguration and XDP support

2016-11-03 Thread Jakub Kicinski
Hi!

This set adds support for ethtool channel API and XDP.

I kick off with ethtool get_channels() implementation.  
set_channels() needs some preparations to get right.  I follow
the prepare/commit paradigm and allocate all resources before
stopping the device.  It has already been done for ndo_change_mtu
and ethtool set_ringparam(), it makes sense now to consolidate all
the required logic in one place.

XDP support requires splitting TX rings into two classes - 
for the stack and for XDP.  The ring structures are identical.
The differences are in how they are connected to IRQ vector
structs and how the completion/cleanup works.  When XDP is enabled
I switch from the frag allocator to page-per-packet and map buffers
BIDIRECTIONALly.

Last but not least XDP offload is added (the patch just takes
care of the small formal differences between cls_bpf and XDP).

There is a tiny & trivial DebugFS patch in the mix, I hope it can
be taken via net-next provided we have the right Acks.

Resending with improved commit message and CCing more people on patch 10.

Jakub Kicinski (13):
  nfp: add support for ethtool .get_channels
  nfp: centralize runtime reconfiguration logic
  nfp: rename ring allocation helpers
  nfp: reuse ring helpers on .ndo_open() path
  nfp: loosen relation between rings and IRQs vectors
  nfp: add helper to reassign rings to IRQ vectors
  nfp: move RSS indirection table init into a separate function
  nfp: add support for ethtool .set_channels
  nfp: reorganize nfp_net_rx() to get packet offsets early
  debugfs: constify argument to debugfs_real_fops()
  nfp: add XDP support in the driver
  nfp: remove unnecessary parameters from nfp_net_bpf_offload()
  nfp: add support for offload of XDP programs

 drivers/net/ethernet/netronome/nfp/nfp_bpf.h   |   1 +
 drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c   |  92 ++-
 .../net/ethernet/netronome/nfp/nfp_bpf_verifier.c  |   3 +
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  30 +-
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 892 ++---
 .../net/ethernet/netronome/nfp/nfp_net_debugfs.c   |  41 +-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  94 +++
 .../net/ethernet/netronome/nfp/nfp_net_offload.c   |   7 +-
 include/linux/debugfs.h|   3 +-
 9 files changed, 869 insertions(+), 294 deletions(-)

-- 
1.9.1



[PATCH net-next resend 02/13] nfp: centralize runtime reconfiguration logic

2016-11-03 Thread Jakub Kicinski
All functions which need to reallocate ring resources at runtime
look very similar.  Centralize that logic into a separate function.
Encapsulate configuration parameters in a structure.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  10 +-
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 208 +
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  19 ++
 3 files changed, 118 insertions(+), 119 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index e8713254786b..14b5e21cabf1 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -583,6 +583,12 @@ struct nfp_net {
struct dentry *debugfs_dir;
 };
 
+struct nfp_net_ring_set {
+   unsigned int mtu;
+   unsigned int dcnt;
+   void *rings;
+};
+
 /* Functions to read/write from/to a BAR
  * Performs any endian conversion necessary.
  */
@@ -771,7 +777,9 @@ struct nfp_net *
 void nfp_net_coalesce_write_cfg(struct nfp_net *nn);
 int nfp_net_irqs_alloc(struct nfp_net *nn);
 void nfp_net_irqs_disable(struct nfp_net *nn);
-int nfp_net_set_ring_size(struct nfp_net *nn, u32 rxd_cnt, u32 txd_cnt);
+int
+nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_ring_set *rx,
+ struct nfp_net_ring_set *tx);
 
 #ifdef CONFIG_NFP_NET_DEBUG
 void nfp_net_debugfs_create(void);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 97e0bbef13d1..e58532d27c5b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1573,7 +1573,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 }
 
 static struct nfp_net_tx_ring *
-nfp_net_shadow_tx_rings_prepare(struct nfp_net *nn, u32 buf_cnt)
+nfp_net_shadow_tx_rings_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_tx_ring *rings;
unsigned int r;
@@ -1585,11 +1585,11 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
for (r = 0; r < nn->num_tx_rings; r++) {
nfp_net_tx_ring_init([r], nn->tx_rings[r].r_vec, r);
 
-   if (nfp_net_tx_ring_alloc([r], buf_cnt))
+   if (nfp_net_tx_ring_alloc([r], s->dcnt))
goto err_free_prev;
}
 
-   return rings;
+   return s->rings = rings;
 
 err_free_prev:
while (r--)
@@ -1598,27 +1598,29 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
return NULL;
 }
 
-static struct nfp_net_tx_ring *
-nfp_net_shadow_tx_rings_swap(struct nfp_net *nn, struct nfp_net_tx_ring *rings)
+static void
+nfp_net_shadow_tx_rings_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
-   struct nfp_net_tx_ring *old = nn->tx_rings;
+   struct nfp_net_tx_ring *rings = s->rings;
+   struct nfp_net_ring_set new = *s;
unsigned int r;
 
+   s->dcnt = nn->txd_cnt;
+   s->rings = nn->tx_rings;
+
for (r = 0; r < nn->num_tx_rings; r++)
-   old[r].r_vec->tx_ring = [r];
+   nn->tx_rings[r].r_vec->tx_ring = [r];
 
-   nn->tx_rings = rings;
-   return old;
+   nn->txd_cnt = new.dcnt;
+   nn->tx_rings = new.rings;
 }
 
 static void
-nfp_net_shadow_tx_rings_free(struct nfp_net *nn, struct nfp_net_tx_ring *rings)
+nfp_net_shadow_tx_rings_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
+   struct nfp_net_tx_ring *rings = s->rings;
unsigned int r;
 
-   if (!rings)
-   return;
-
for (r = 0; r < nn->num_tx_rings; r++)
nfp_net_tx_ring_free([r]);
 
@@ -1691,9 +1693,9 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 }
 
 static struct nfp_net_rx_ring *
-nfp_net_shadow_rx_rings_prepare(struct nfp_net *nn, unsigned int fl_bufsz,
-   u32 buf_cnt)
+nfp_net_shadow_rx_rings_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
+   unsigned int fl_bufsz = nfp_net_calc_fl_bufsz(nn, s->mtu);
struct nfp_net_rx_ring *rings;
unsigned int r;
 
@@ -1704,14 +1706,14 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
for (r = 0; r < nn->num_rx_rings; r++) {
nfp_net_rx_ring_init([r], nn->rx_rings[r].r_vec, r);
 
-   if (nfp_net_rx_ring_alloc([r], fl_bufsz, buf_cnt))
+   if (nfp_net_rx_ring_alloc([r], fl_bufsz, s->dcnt))
goto err_free_prev;
 
if (nfp_net_rx_ring_bufs_alloc(nn, [r]))
goto err_free_ring;
}
 
-   return rings;
+   return s->rings = rings;
 
 err_free_prev:
while (r--) {
@@ -1723,27 +1725,32 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
return 

Re: net/sctp: use-after-free in __sctp_connect

2016-11-03 Thread Andrey Konovalov
On Wed, Nov 2, 2016 at 11:42 PM, Andrey Konovalov  wrote:
> On Wed, Oct 19, 2016 at 6:57 PM, Marcelo Ricardo Leitner
>  wrote:
>> On Wed, Oct 19, 2016 at 02:25:24PM +0200, Andrey Konovalov wrote:
>>> Hi,
>>>
>>> I've got the following error report while running the syzkaller fuzzer:
>>>
>>> ==
>>> BUG: KASAN: use-after-free in __sctp_connect+0xabe/0xbf0 at addr
>>> 88006b1dc610
>>
>> Seems this is the same that Dmitry Vyukov had reported back in Jan 13th.
>> So far I couldn't identify the reason.
>> "Good" to know it's still there, thanks for reporting it.

Hi Marcelo,

So I've looked at the code.
As far as I understand, the problem is a race condition between
setsockopt(SCTP_SOCKOPT_CONNECTX) and shutdown on an sctp socket.
setsockopt() calls sctp_wait_for_connect(), which exits the for loop
on the sk->sk_shutdown & RCV_SHUTDOWN if clause, and then frees asoc
with sctp_association_put() and returns err = 0.
Then __sctp_connect() checks that err == 0 and reads asoc->assoc_id
from the freed asoc.


Re: [PATCH RFC 0/2] ethtool: Add actual port speed reporting

2016-11-03 Thread Rick Jones

And besides, one can argue that in the SR-IOV scenario the VF has no business
knowing the physical port speed.



Good point, but there are more use-cases we should consider.
For example, when using Multi-Host/Flex-10/Multi-PF each PF should
be able to query both physical port speed and actual speed.


Despite my email address, I'm not fully versed on VC/Flex, but I have 
always been under the impression that the flexnics created were, 
conceptually, "distinct" NICs considered independently of the physical 
port over which they operated.  Tossing another worm or three into the 
can, while "back in the day" (when some of the first ethtool changes to 
report speeds other than the "normal" ones went in) the speed of a 
flexnic was fixed, today, it can actually operate in a range.  From a 
minimum guarantee to an "if there is bandwidth available" cap.


rick jones



Re: [PATCH RFC 0/2] ethtool: Add actual port speed reporting

2016-11-03 Thread Gal Pressman


On 02/11/2016 17:50, Mintz, Yuval wrote:
>> Sending RFC to get feedback for the following ethtool proposal:
>>
>> In some cases such as virtual machines and multi functions (SR-IOV), the 
>> actual
>> bandwidth exposed for each machine is not accurately shown in ethtool.
>> Currently ethtool shows only physical port link speed.
>> In our case we would like to show the virtual port operational link speed 
>> which
>> in some cases is less than the physical port speed.
>>
>> This will give users better visibility for the actual speed running on their 
>> device.
>>
>> $ ethtool ens6
>> ...
>> Speed: 5Mb/s
>> Actual speed: 25000Mb/s
> 
> Not saying this is a bad thing, but where exactly is it listed that ethtool 
> has
> to show the physical port speed?
> E.g., bnx2x shows the logical speed instead, and has been doing that for 
> years.
> [Perhaps that's a past wrongness, but that's how it goes].
> 
> And besides, one can argue that in the SR-IOV scenario the VF has no business
> knowing the physical port speed.
> 

Good point, but there are more use-cases we should consider.
For example, when using Multi-Host/Flex-10/Multi-PF each PF should
be able to query both physical port speed and actual speed.


Re: stmmac/RTL8211F/Meson GXBB: TX throughput problems

2016-11-03 Thread Jerome Brunet
On Mon, 2016-10-31 at 11:25 +0100, André Roth wrote:
> Hi all,
>  
> > 
> > on my device this results in:
> > [0xc9410018] = 0x200
> > [0xc9410030] = 0x0
> > [0xc941003c] = 0x0
> > [0xc9411000] = 0x1100802
> > [0xc9411018] = 0x2202006
> > [0xc9411028] = 0x0
> > 
> > maybe someone else could try the command from above on his device
> > (running the original Amlogic kernel).
> 
> those registers have the same value on an original image from
> hardkernel: 
> 
> Linux odroid64 3.14.65-65 #1 SMP PREEMPT Sat May 28
> 02:50:51 BRT 2016 aarch64 aarch64 aarch64 GNU/Linux
> 
> > 
> > please also state if ethernet is working properly on the original
> > kernel (and preferably which device/board this is).
> 
> yes, the ethernet works flawless in 100 and 1000 Mbit/s on the 3.14
> kernel.

Andre, the 3.14 kernel you are talking about, is it this one?
https://github.com/hardkernel/linux/tree/odroidc2-3.14.y

Because in drivers/net/phy/realtek.c, they disable EEE, but
also 1000Base-T Full Duplex advertisement ?

+   /* disable 1000m adv*/
+   val = phy_read(phydev, 0x9);
+   phy_write(phydev, 0x9, val&(~(1<<9)));

If this is the kernel you are running, you should not be able to have
ethernet at 1000MB/s ? Or is it in half duplex mode ?

> 
> I can now confirm that both 100 and 1000 Mbit/s do not work properly
> on the 4.8/integ branch. the network connection is interrupted after
> some outbound traffic. it can be recovered by running a ifdown/ifup
> which restarts dhclient, which I think is able to somehow reset the
> interface. 
> 
> Anything I can help to debug the issue further ?
> 
> Regards,
> 
>  André
>  
> 
> 
> ___
> linux-amlogic mailing list
> linux-amlo...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-amlogic


Re: [PATCH net-next 10/13] debugfs: constify argument to debugfs_real_fops()

2016-11-03 Thread Jakub Kicinski
On Thu, 03 Nov 2016 17:55:39 +0100, Nicolai Stange wrote:
> Hi Jakub,
> 
> thanks for this.
> 
> However, the debugfs maintainer, Greg K-H, as well as the lkml is
> missing from the To/Cc. Can you resend please?

Sure thing!

> Jakub Kicinski  writes:
> 
> > seq_file users can only access const version of file pointer,  
> 
> ... because the ->file member of struct seq_operations is marked as such.

Thanks!


Re: [PATCH net-next 10/13] debugfs: constify argument to debugfs_real_fops()

2016-11-03 Thread Nicolai Stange
Hi Jakub,

thanks for this.

However, the debugfs maintainer, Greg K-H, as well as the lkml is
missing from the To/Cc. Can you resend please?

Jakub Kicinski  writes:

> seq_file users can only access const version of file pointer,

... because the ->file member of struct seq_operations is marked as such.

> make parameter to debugfs_real_fops() const.
>
> CC: Nicolai Stange 
> CC: Christian Lamparter 
> Signed-off-by: Jakub Kicinski 
> ---
>  include/linux/debugfs.h | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
> index 4d3f0d1aec73..bf1907d96097 100644
> --- a/include/linux/debugfs.h
> +++ b/include/linux/debugfs.h
> @@ -52,7 +52,8 @@ struct debugfs_regset32 {
>   * Must only be called under the protection established by
>   * debugfs_use_file_start().
>   */
> -static inline const struct file_operations *debugfs_real_fops(struct file 
> *filp)
> +static inline const struct file_operations *
> +debugfs_real_fops(const struct file *filp)
>   __must_hold(_srcu)
>  {
>   /*


Re: [PATCH v5 5/7] net: ethernet: bgmac: device tree phy enablement

2016-11-03 Thread Jon Mason
On Thu, Nov 03, 2016 at 09:31:21AM +0100, Rafal Milecki wrote:
> On 11/02/2016 06:08 PM, Jon Mason wrote:
> >Change the bgmac driver to allow for phy's defined by the device tree
> 
> This is a late review, I know, sorry... :(
> 
> 
> >+static int bcma_phy_direct_connect(struct bgmac *bgmac)
> >+{
> >+struct fixed_phy_status fphy_status = {
> >+.link = 1,
> >+.speed = SPEED_1000,
> >+.duplex = DUPLEX_FULL,
> >+};
> >+struct phy_device *phy_dev;
> >+int err;
> >+
> >+phy_dev = fixed_phy_register(PHY_POLL, &fphy_status, -1, NULL);
> >+if (!phy_dev || IS_ERR(phy_dev)) {
> >+dev_err(bgmac->dev, "Failed to register fixed PHY device\n");
> >+return -ENODEV;
> >+}
> >+
> >+err = phy_connect_direct(bgmac->net_dev, phy_dev, bgmac_adjust_link,
> >+ PHY_INTERFACE_MODE_MII);
> >+if (err) {
> >+dev_err(bgmac->dev, "Connecting PHY failed\n");
> >+return err;
> >+}
> >+
> >+return err;
> >+}
> 
> This bcma specific function looks exactly the same as...
> 
> 
> >+static int platform_phy_direct_connect(struct bgmac *bgmac)
> >+{
> >+struct fixed_phy_status fphy_status = {
> >+.link = 1,
> >+.speed = SPEED_1000,
> >+.duplex = DUPLEX_FULL,
> >+};
> >+struct phy_device *phy_dev;
> >+int err;
> >+
> >+phy_dev = fixed_phy_register(PHY_POLL, _status, -1, NULL);
> >+if (!phy_dev || IS_ERR(phy_dev)) {
> >+dev_err(bgmac->dev, "Failed to register fixed PHY device\n");
> >+return -ENODEV;
> >+}
> >+
> >+err = phy_connect_direct(bgmac->net_dev, phy_dev, bgmac_adjust_link,
> >+ PHY_INTERFACE_MODE_MII);
> >+if (err) {
> >+dev_err(bgmac->dev, "Connecting PHY failed\n");
> >+return err;
> >+}
> >+
> >+return err;
> >+}
> 
> This one.
> 
> Would that make sense to keep bgmac_phy_connect_direct and just use it in
> bcma/platform code?

Yes, I was having the same internal debate.  I hate the duplication of
code, but I really wanted to keep the PHY logic out of the bgmac.c
file.  Do you think it is acceptable to make this an inline function
in bgmac.h?

Thanks,
Jon


[Patch net] taskstats: fix the length of cgroupstats_cmd_get_policy

2016-11-03 Thread Cong Wang
cgroupstats_cmd_get_policy is [CGROUPSTATS_CMD_ATTR_MAX+1],
taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1],
but their family.maxattr is TASKSTATS_CMD_ATTR_MAX.
CGROUPSTATS_CMD_ATTR_MAX is less than TASKSTATS_CMD_ATTR_MAX,
so we could end up accessing cgroupstats_cmd_get_policy out of bounds.

Change cgroupstats_cmd_get_policy to TASKSTATS_CMD_ATTR_MAX+1,
this is safe because the rest are initialized to 0's.

Reported-by: Andrey Konovalov 
Tested-by: Andrey Konovalov 
Signed-off-by: Cong Wang 
---
 kernel/taskstats.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b3f05ee..cbb387a 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -54,7 +54,11 @@ static const struct nla_policy 
taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1
[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
 
-static const struct nla_policy 
cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
+/*
+ * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family.
+ * Make sure they are always aligned.
+ */
+static const struct nla_policy 
cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
 };
 
-- 
2.1.0



[Patch net] genetlink: fix a memory leak on error path

2016-11-03 Thread Cong Wang
In __genl_register_family(), when genl_validate_assign_mc_groups()
fails, we forget to free the memory we possibly allocate for
family->attrbuf.

Note, some callers call genl_unregister_family() to clean up
on error path, it doesn't work because the family is inserted
to the global list in the nearly last step.

Cc: Jakub Kicinski 
Cc: Johannes Berg 
Signed-off-by: Cong Wang 
---
 net/netlink/genetlink.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 23cc126..49c28e8 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -404,7 +404,7 @@ int __genl_register_family(struct genl_family *family)
 
err = genl_validate_assign_mc_groups(family);
if (err)
-   goto errout_locked;
+   goto errout_free;
 
list_add_tail(&family->family_list, genl_family_chain(family->id));
genl_unlock_all();
@@ -417,6 +417,8 @@ int __genl_register_family(struct genl_family *family)
 
return 0;
 
+errout_free:
+   kfree(family->attrbuf);
 errout_locked:
genl_unlock_all();
 errout:
-- 
2.1.0



Re: bpf: kernel BUG in htab_elem_free

2016-11-03 Thread Daniel Borkmann

On 11/03/2016 03:15 PM, Dmitry Vyukov wrote:

On Wed, Nov 2, 2016 at 11:14 PM, Dmitry Vyukov  wrote:

Here we go.

The following program triggers kernel BUG in htab_elem_free.
On commit 0c183d92b20b5c84ca655b45ef57b3318b83eb9e (Oct 31).
Run as "while true; do ./a.out; done".


This one fixes it for me. Could you check it from your side as well?
I'll submit an official fix then.

Thanks a lot for the catch!
Daniel

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 570eeca..ad1bc67 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab)

hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node);
-   htab_elem_free(htab, l);
+   if (l->state != HTAB_EXTRA_ELEM_USED)
+   htab_elem_free(htab, l);
}
}
 }


Re: stmmac/RTL8211F/Meson GXBB: TX throughput problems

2016-11-03 Thread Jerome Brunet
On Sat, 2016-10-01 at 17:58 +0200, Martin Blumenstingl wrote:
> Hello Peppe,
> 
> On Mon, Sep 26, 2016 at 8:17 AM, Giuseppe CAVALLARO
>  wrote:
> > 
> > Hello André
> > 
> > On 9/17/2016 11:23 PM, André Roth wrote:
> > > 
> > > 
> > > 
> > > Hi all,
> > > 
> > > I have an odroid c2 board which shows this issue. No data is
> > > transmitted or received after a moment of intense tx traffic.
> > > Copying a
> > > 1GB file per scp from the board triggers it repeatedly.
> > > 
> > > The board has a stmmac - user ID: 0x11, Synopsys ID: 0x37.
> > > 
> > > When switching the network to 100Mb/s the copying does
> > > > not seem to trigger the issue.
> > > 
> > > I've attached the ethtool statistics before and after the
> > > problem.
> > 
> > 
> > at first glance, it enters in EEE mode often in the ethtool.after.
> > On some platforms we met problems and it was necessary to disable
> > the
> > feature. Maybe, you can start looking at if this is true on yours.
> > We will see to provide a clean subset of patches to switch-on/off
> > it.
> I did some hacking in the stmmac driver to disable the LPI stuff (see
> the attachment)
> 
> Unfortunately this did not fix the problem.
> 
> I did not issue any ethtool commands not shown in the logs.
> Also I did not have time to change the AXI tuning / PBL value yet -
> so
> those are also untouched.
> 
> I will keep testing, but unfortunately my device is starting to fall
> apart (I sometimes have DDR initialization issues and u-boot fails to
> come up, oh dear...).

Hi all,

I did several tests on this issue with amlogic's S905 SoC (Synopsys MAC
- user ID: 0x11, Synopsys ID: 0x37.) 

With the OdroidC2 (PHY Realtek RTL8211F), EEE is on by default.
Just before launching iperf3, here are the ethtool stats regarding LPI:
     irq_tx_path_in_lpi_mode_n: 6
 irq_tx_path_exit_lpi_mode_n: 5
 irq_rx_path_in_lpi_mode_n: 76
 irq_rx_path_exit_lpi_mode_n: 75
 phy_eee_wakeup_error_n: 0

Sending data with iperf usually works for little while (between 0 and
10s)

# iperf3 -c 192.168.1.170 -p12345
Connecting to host 192.168.1.170, port 12345
local 192.168.1.30 port 54450 connected to 192.168.1.170 port 12345
Interval   Transfer Bandwidth   Retr  Cwnd
0.00-1.00   sec   112 MBytes   938 Mbits/sec0409 KBytes   
1.00-2.00   sec   112 MBytes   940 Mbits/sec0426 KBytes       
2.00-3.00   sec   112 MBytes   939 Mbits/sec0426 KBytes   
3.00-4.00   sec   112 MBytes   940 Mbits/sec0426 KBytes   
4.00-5.00   sec   112 MBytes   940 Mbits/sec0426 KBytes   
5.00-6.00   sec   112 MBytes   939 Mbits/sec0426 KBytes   
6.00-7.00   sec  9.26 MBytes  77.6 Mbits/sec2   1.41 KBytes   
7.00-8.00   sec  0.00 Bytes  0.00 bits/sec1   1.41 KBytes   
8.00-9.00   sec  0.00 Bytes  0.00 bits/sec0   1.41 KBytes   
^C10.00-13.58  sec  0.00 Bytes  0.00 bits/sec1   1.41 KBytes   
- - - - - - - - - - - - - - - - - - - - - - - - -
Interval   Transfer Bandwidth   Retr
0.00-13.58  sec   681 MBytes   421 Mbits/sec4 sender
0.00-13.58  sec  0.00 Bytes  0.00 bits/sec  receiver
iperf3: interrupt - the client has terminated

iperf3 does not exit and the link seems completely broken. We cannot
send or receive until the interface is brought down then up again.

Here are the LPI related stats after the test:
     irq_tx_path_in_lpi_mode_n: 48
 irq_tx_path_exit_lpi_mode_n: 48
 irq_rx_path_in_lpi_mode_n: 325
 irq_rx_path_exit_lpi_mode_n: 325
 phy_eee_wakeup_error_n: 0

Like Martin, I tried playing around with eee in stmmac, but I could not
improve the situation. Then I tried disabling EEE advertisement on the
PHY (patch attached). With this patch, iperf3 runs nicely for me.

This is what the folks of FreeBSD have done for the same MAC/PHY
combination [0]

On the P200 Board (PHY Micrel KSZ9031), EEE is off by default. There is
no problem on this board right now. I tried to force the activation of
EEE on this board and ended up in the same situation as the OdroidC2
(link broken). The stats were a bit different though:
     irq_tx_path_in_lpi_mode_n: 28
 irq_tx_path_exit_lpi_mode_n: 28
 irq_rx_path_in_lpi_mode_n: 408
 irq_rx_path_exit_lpi_mode_n: 408
 phy_eee_wakeup_error_n: 5440

To everybody having similar issue with their OdroidC2, could you try
the attached patch and let us know if it changes anything for you ?

Peppe, Alexandre,
What is your view on this ? I'm not sure that removing EEE
advertisement is the right way to address the problem ?
Could it be an issue in stmmac ?
If there is any other information / test which would help understand
the issue, please let me know.

Cheers

Jerome


[0] : https://github.com/freebsd/freebsd-base-graphics/commit/1f49e276c
3801545dc0a337792a5f07e6ad39c84
 

> ___
> linux-amlogic mailing list
> linux-amlo...@lists.infradead.org
> 

Is there a maximum bytes in flight limitation in the tcp stack?

2016-11-03 Thread De Schepper, Koen (Nokia - BE)
Hi,

We experience some limit on the maximum packets in flight which seem not to be 
related with the receive or write buffers. Does somebody know if there is an 
issue with a maximum of around 1MByte (or sometimes 2Mbyte) of data in flight 
per TCP flow?

It seems to be a strict and stable limit independent from the CC (tested with 
Cubic, Reno and DCTCP). On a link of 200Mbps and 200ms RTT our link is only 20% 
(sometimes 40%, see conditions below) utilized for a single TCP flow with no 
drop experienced at all (no bottleneck in the AQM or RTT emulation, as it 
supports more throughput if multiple flows are active).

Some configuration changes we already tried on both client and server (kernel 
3.18.9):

net.ipv4.tcp_no_metrics_save = 1
net.ipv4.tcp_rmem = 4096 87380 6291456
net.ipv4.tcp_wmem = 4096 16384 4194304

SERVER# ss -i
tcpESTAB  0  1049728  10.187.255.211:46642 10.187.16.194:ssh
 dctcp wscale:7,7 rto:408 rtt:204.333/0.741 ato:40 mss:1448 cwnd:1466 
send 83.1Mbps unacked:728 rcv_rtt:212 rcv_space:29200
CLIENT# ss -i
tcpESTAB  0  288  10.187.16.194:ssh  10.187.255.211:46642
 dctcp wscale:7,7 rto:404 rtt:203.389/0.213 ato:40 mss:1448 cwnd:78 
send 4.4Mbps unacked:8 rcv_rtt:204 rcv_space:1074844

When increasing the write and receive mem further (they were already way above 
1 or 2 MB) it steps to double (40%; 2Mbytes in flight):
net.ipv4.tcp_no_metrics_save = 1
net.ipv4.tcp_rmem = 4096 800 16291456
net.ipv4.tcp_wmem = 4096 800 16291456

SERVER # ss -i
tcpESTAB  0  2068976  10.187.255.212:54637 10.187.16.112:ssh
 cubic wscale:8,8 rto:404 rtt:202.622/0.061 ato:40 mss:1448 cwnd:1849 
ssthresh:1140 send 105.7Mbps unacked:1457 rcv_rtt:217.5 rcv_space:29200
CLIENT# ss -i
tcpESTAB  0  648  10.187.16.112:ssh  10.187.255.212:54637
 cubic wscale:8,8 rto:404 rtt:201.956/0.038 ato:40 mss:1448 cwnd:132 
send 7.6Mbps unacked:18 rcv_rtt:204 rcv_space:2093044

Further increasing (x10) does not help anymore...
net.ipv4.tcp_no_metrics_save = 1
net.ipv4.tcp_rmem = 4096 8000 162914560
net.ipv4.tcp_wmem = 4096 8000 162914560

As all these parameters autotune, it is hard to find out which one is 
limiting... In the examples, above unacked does not want to go higher, while 
congestion window in the server is big enough... rcv_space could be limiting, 
but it tunes up if I change the server with the higher buffers (switching to 
2MByte in flight).

We also tried tcp_limit_output_bytes, setting it bigger (x10) and smaller(/10), 
without effect. We've put it in /etc/sysctl.conf and rebooted, to make sure 
that it is effective.

Some more detailed tests that had an effect on the 1 or 2MByte:
- It seems that with TSO off, if we configure a bigger wmem buffer, an ongoing 
flow suddenly is able to immediately double its bytes in flight limit. We 
configured further up to more than 10x the buffer, but no further increase 
helps, and the limits we saw are only 1MByte and 2Mbyte (no intermediate values 
depending on any parameter). When setting tcp_wmem smaller again, the 2MByte 
limit stays on the ongoing flow. We have to restart the flow to make the buffer 
reduction to 1MByte effective.
- With TSO on, only the 2MByte limit is effective, independent from the wmem 
buffer. We have to restart the flow to make a tso change effective.

Koen.



[PATCH net-next] net: Update raw socket bind to consider l3 domain

2016-11-03 Thread David Ahern
Binding a raw socket to a local address fails if the socket is bound
to an L3 domain:

$ vrf-test  -s -l 10.100.1.2 -R -I red
error binding socket: 99: Cannot assign requested address

Update raw_bind to consider if sk_bound_dev_if is bound to an L3
domain and use inet_addr_type_table to lookup the address.

Signed-off-by: David Ahern 
---
 net/ipv4/raw.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 6a0bd68a565b..9ef2a602f052 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -695,12 +695,20 @@ static int raw_bind(struct sock *sk, struct sockaddr 
*uaddr, int addr_len)
 {
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+   u32 tb_id = RT_TABLE_LOCAL;
int ret = -EINVAL;
int chk_addr_ret;
 
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
goto out;
-   chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
+
+   if (sk->sk_bound_dev_if)
+   tb_id = l3mdev_fib_table_by_index(sock_net(sk),
+sk->sk_bound_dev_if) ? : tb_id;
+
+   chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
+   tb_id);
+
ret = -EADDRNOTAVAIL;
if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
-- 
2.1.4



stmmac: GMAC_RGSMIIIS reports bogus values

2016-11-03 Thread Alexey Brodkin
Hello,

I'm seeing pretty strange issue with GMAC reporting a lot of link state changes
based on bits in GMAC_RGSMIIIS. It looks like that:
-->8---
Link is Down
Link is Up - 10/Full
Link is Down
Link is Up - 10/Half
Link is Down
Link is Down
Link is Up - 10/Half
Link is Up -
1000/Half
Link is Down
Link is Down
Link is Down
Link is Down
Link is Up - 10/Half
Link is Down
Link is Down
Link is Up -
1000/Half
Link is Up - 1000/Full
-->8---

What's especially interesting my board with GMAC is connected to 100Mbit device
which means there's no chance for 1Gb mode to be set.

Also this has nothing to do with link state detected and reported by PHY via 
MDIO.
So obviously GMAC_RGSMIIIS bits are wrong. But given the fact that 
GMAC_RGSMIIIS bits
are set based on state of RXD[3:0] lines of RGMII I may only think that it's
PHY (in my case DP83865) who's sending random data on the RXD during 
inter-frame gap.

Note data transferred through that networking connection is perfectly correct 
and
actually I haven't seen those link state prints before kernel v4.8 basically
because the prints in question were implemented with pr_debug() and then with 
[1]
we got pr_info() that made prints visible by default.

Since I don't have any means to capture all required GMII signals to do better
analysis and my data is not corrupted in the end I'm thinking about way how to
mute these pretty senseless messages.

One thing I may think of we may disable checking of GMAC_RGSMIIIS if a 
particular
board uses MDIO for PHY setup. Something like that:
-->8---
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -337,7 +337,7 @@ static int dwmac1000_irq_status(struct mac_device_info *hw,
 
dwmac_pcs_isr(ioaddr, GMAC_PCS_BASE, intr_status, x);
 
-   if (intr_status & PCS_RGSMIIIS_IRQ)
+   if (!priv->use_mdio && (intr_status & PCS_RGSMIIIS_IRQ))
dwmac1000_rgsmii(ioaddr, x);
 
return ret;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 6c85b61aaa0b..34e9de0450ba 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -3356,11 +3356,13 @@ int stmmac_dvr_probe(struct device *device,
 
stmmac_check_pcs_mode(priv);
 
+   priv->use_mdio = 0;
if (priv->hw->pcs != STMMAC_PCS_RGMII  &&
priv->hw->pcs != STMMAC_PCS_TBI &&
priv->hw->pcs != STMMAC_PCS_RTBI) {
/* MDIO bus Registration */
ret = stmmac_mdio_register(ndev);
+   priv->use_mdio = 1;
if (ret < 0) {
pr_debug("%s: MDIO bus (id: %d) registration failed",
 __func__, priv->plat->bus_id);
-->8---

Any thoughts on that are much appreciated!

Regards,
Alexey

[1] 
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=70523e639bf8ca09b3357371c3546cee55c06351

Re: [PATCH net] ipv6: dccp: add missing bind_conflict to dccp_ipv6_mapped

2016-11-03 Thread Arnaldo Carvalho de Melo
Em Thu, Nov 03, 2016 at 08:59:46AM -0700, Eric Dumazet escreveu:
> From: Eric Dumazet 
> 
> While fuzzing kernel with syzkaller, Andrey reported a nasty crash
> in inet6_bind() caused by DCCP lacking a required method.

Ouch, thanks, forgot the mapped case :-)

Acked-by: Arnaldo Carvalho de Melo 

- Arnaldo
 
> Fixes: ab1e0a13d7029 ("[SOCK] proto: Add hashinfo member to struct proto")
> Signed-off-by: Eric Dumazet 
> Reported-by: Andrey Konovalov 
> Tested-by: Andrey Konovalov 
> Cc: Arnaldo Carvalho de Melo 
> ---
>  net/dccp/ipv6.c |1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
> index 3828f94b234c..95353bdbfa7b 100644
> --- a/net/dccp/ipv6.c
> +++ b/net/dccp/ipv6.c
> @@ -956,6 +956,7 @@ static const struct inet_connection_sock_af_ops
> dccp_ipv6_mapped = {
>   .getsockopt= ipv6_getsockopt,
>   .addr2sockaddr = inet6_csk_addr2sockaddr,
>   .sockaddr_len  = sizeof(struct sockaddr_in6),
> + .bind_conflict = inet6_csk_bind_conflict,
>  #ifdef CONFIG_COMPAT
>   .compat_setsockopt = compat_ipv6_setsockopt,
>   .compat_getsockopt = compat_ipv6_getsockopt,
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe dccp" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net] ipv6: dccp: add missing bind_conflict to dccp_ipv6_mapped

2016-11-03 Thread Eric Dumazet
From: Eric Dumazet 

While fuzzing kernel with syzkaller, Andrey reported a nasty crash
in inet6_bind() caused by DCCP lacking a required method.

Fixes: ab1e0a13d7029 ("[SOCK] proto: Add hashinfo member to struct proto")
Signed-off-by: Eric Dumazet 
Reported-by: Andrey Konovalov 
Tested-by: Andrey Konovalov 
Cc: Arnaldo Carvalho de Melo 
---
 net/dccp/ipv6.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 3828f94b234c..95353bdbfa7b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -956,6 +956,7 @@ static const struct inet_connection_sock_af_ops
dccp_ipv6_mapped = {
.getsockopt= ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len  = sizeof(struct sockaddr_in6),
+   .bind_conflict = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,




Re: [Intel-wired-lan] [PATCH] e1000e: free IRQ when the link is up or down

2016-11-03 Thread Baicar, Tyler

On 11/3/2016 2:09 AM, Ruinskiy, Dima wrote:

-Original Message-
From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On
Behalf Of Tyler Baicar
Sent: Wednesday, 02 November, 2016 23:08
To: Kirsher, Jeffrey T; intel-wired-...@lists.osuosl.org;
netdev@vger.kernel.org; linux-ker...@vger.kernel.org;
ok...@codeaurora.org; ti...@codeaurora.org
Cc: Tyler Baicar
Subject: [Intel-wired-lan] [PATCH] e1000e: free IRQ when the link is up or
down

Move IRQ free code so that it will happen regardless of the link state.
Currently the e1000e driver only releases its IRQ if the link is up. This is not
sufficient because it is possible for a link to go down without releasing the 
IRQ.
A secondary bus reset can cause this case to happen.

Signed-off-by: Tyler Baicar 
---
drivers/net/ethernet/intel/e1000e/netdev.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c
b/drivers/net/ethernet/intel/e1000e/netdev.c
index 7017281..36cfcb0 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -4679,12 +4679,13 @@ int e1000e_close(struct net_device *netdev)

if (!test_bit(__E1000_DOWN, &adapter->state)) {
e1000e_down(adapter, true);
-   e1000_free_irq(adapter);

/* Link status message must follow this format */
pr_info("%s NIC Link is Down\n", adapter->netdev->name);
}

+   e1000_free_irq(adapter);
+
napi_disable(&adapter->napi);

e1000e_free_tx_resources(adapter->tx_ring);

This is not correct. __E1000_DOWN has nothing to do with link state. It is an 
internal driver status bit that indicates that device shutdown is in progress.

I would not change this code without checking very carefully the driver state 
machine. This can cause a whole lot of issues. Did you encounter some 
particular problem that is resolved by this change?

Hello Dima,

The issue is that when a secondary bus reset occurs the current code 
will not free the IRQ due to this __E1000_DOWN check. If the IRQ isn't 
freed, then later in e1000_remove we run into a kernel bug:


pcieport 0004:00:00.0: PCIe Bus Error: severity=Corrected, type=Physical 
Layer, id=(Receiver ID)
pcieport 0004:00:00.0:   device [17cb:0400] error 
status/mask=0001/6000

pcieport 0004:00:00.0:[ 0] Receiver Error (First)
pcieport 0004:00:00.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), 
type=Transaction Layer, id=(Requester ID)
pcieport 0004:00:00.0:   device [17cb:0400] error 
status/mask=4000/0040

pcieport 0004:00:00.0:[14] Completion Timeout (First)
ACPI: \_SB_.PCI4: Device has suffered a power fault
kernel BUG at drivers/pci/msi.c:369!

The stack dump is:

free_msi_irqs+0x6c/0x1a8
pci_disable_msi+0xb0/0x148
e1000e_reset_interrupt_capability+0x60/0x78
e1000_remove+0xc8/0x180
pci_device_remove+0x48/0x118
__device_release_driver+0x80/0x108
device_release_driver+0x2c/0x40
pci_stop_bus_device+0xa0/0xb0
pci_stop_bus_device+0x3c/0xb0
pci_stop_root_bus+0x54/0x80
acpi_pci_root_remove+0x28/0x64
acpi_bus_trim+0x6c/0xa4
acpi_device_hotplug+0x19c/0x3f4
acpi_hotplug_work_fn+0x28/0x3c
process_one_work+0x150/0x460
worker_thread+0x50/0x4b8
kthread+0xd4/0xe8
ret_from_fork+0x10/0x50

This bug is hit because the IRQ still has action since it was never 
freed. This patch resolves this issue.


Thanks,
Tyler

--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm 
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.



URGENTLY CONFIRM

2016-11-03 Thread REV.JOHN GAZDA
-- 
 HELLO,
 HAVE YOU RECEIVED THE FUNDS? PLEASE CONFIRM TO ME IF YOU HAVE RECEIVED
 THE FUND OR NOT.
 REV.JOHN GAZDA


Re: net/ipv6: null-ptr-deref in inet6_bind

2016-11-03 Thread Andrey Konovalov
Hi Eric,

It seems that your patch fixes the issue, I'm not seeing the report any more.

Tested-by: Andrey Konovalov 

Thanks!

On Thu, Nov 3, 2016 at 4:39 PM, Eric Dumazet  wrote:
> On Wed, Nov 2, 2016 at 2:14 PM, Andrey Konovalov  
> wrote:
>> Hi,
>>
>> I've got the following error report while running the syzkaller fuzzer:
>>
>> BUG: unable to handle kernel NULL pointer dereference at   (null)
>> IP: [<  (null)>]   (null)
>> PGD 66b6f067 [  102.549865] PUD 66c6e067
>> PMD 0 [  102.549865]
>> Oops: 0010 [#1] SMP KASAN
>> Modules linked in:
>> CPU: 0 PID: 4143 Comm: a.out Not tainted 4.9.0-rc3+ #336
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>> task: 880066b1c200 task.stack: 880065b58000
>> RIP: 0010:[<>]  [<  (null)>]   (null)
>> RSP: 0018:880065b5fbc0  EFLAGS: 00010246
>> RAX: 880066b1c200 RBX: 88006873864a RCX: 
>> RDX: 0001 RSI: 880068738640 RDI: 880063bd3200
>> RBP: 880065b5fd20 R08: 11000c77a713 R09: dc00
>> R10: 844fc800 R11: 11000d0e70c9 R12: 84e7e040
>> R13: 880068738640 R14: 880063bd3200 R15: 86836380
>> FS:  7f40b7acf700() GS:88006cc0() knlGS:
>> CS:  0010 DS:  ES:  CR0: 80050033
>> CR2:  CR3: 6bb28000 CR4: 06f0
>> Stack:
>>  83099988 8479f7e8 81208580 110c
>>  41b58ab3 8479f7e8 81208580 812506ed
>>  0007 880065b5fc18 812506ed 880065b5fcd0
>> Call Trace:
>>  [] inet6_bind+0x8ec/0x1020 net/ipv6/af_inet6.c:384
>>  [] SYSC_bind+0x1ec/0x250 net/socket.c:1367
>>  [] SyS_bind+0x24/0x30 net/socket.c:1353
>>  [] entry_SYSCALL_64_fastpath+0x1f/0xc2
>> arch/x86/entry/entry_64.S:209
>> Code:  Bad RIP value.
>> RIP  [<  (null)>]   (null)
>>  RSP 
>> CR2: 
>> ---[ end trace b5ec698ae4926a97 ]---
>> Kernel panic - not syncing: Fatal exception in interrupt
>> Kernel Offset: disabled
>> ---[ end Kernel panic - not syncing: Fatal exception in interrupt
>>
>> On commit 0c183d92b20b5c84ca655b45ef57b3318b83eb9e (Oct 31).
>>
>> I'm able to reproduce it with the attached program by running it as:
>> $ gcc -lpthread inet6-bind-poc.c
>> $ while true; do ./a.out; done
>>
>> Thanks!
>
> Looks like this patch should fix it ?
>
> Thanks !
>
> diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
> index 3828f94b234c..95353bdbfa7b 100644
> --- a/net/dccp/ipv6.c
> +++ b/net/dccp/ipv6.c
> @@ -956,6 +956,7 @@ static const struct inet_connection_sock_af_ops
> dccp_ipv6_mapped = {
> .getsockopt= ipv6_getsockopt,
> .addr2sockaddr = inet6_csk_addr2sockaddr,
> .sockaddr_len  = sizeof(struct sockaddr_in6),
> +   .bind_conflict = inet6_csk_bind_conflict,
>  #ifdef CONFIG_COMPAT
> .compat_setsockopt = compat_ipv6_setsockopt,
> .compat_getsockopt = compat_ipv6_getsockopt,


Re: [RFC] make kmemleak scan __ro_after_init section (was: Re: [PATCH 0/5] genetlink improvements)

2016-11-03 Thread Johannes Berg
Hi,

Sorry for not chipping in earlier - LPC is taking my time.

> > > > Looks like we are missing a kfree(family->attrbuf); on error
> > > > path, but it is not related to Johannes' recent patches.

Actually, I think it *is* related to my patch - I inserted the code
there in the wrong place or so. I'll find a fix for that when I'm back
home, or you (Cong) can submit yours. It wasn't likely that this was
the problem though, since that's just an error path that should never
happen (we have <30 genl families, and a 16-bit space for their IDs)

> I realized that kmemleak is not scanning the __ro_after_init
> section...
> Following patch solves the false positives but I wonder if it's the
> right/acceptable solution.

Hah, makes sense to me, but I guess we really want Catalin to comment
:)

johannes


Re: [PATCH 2/2] rtl8xxxu: Fix for bogus data used to determine macpower

2016-11-03 Thread Larry Finger

On 11/03/2016 03:41 AM, Joe Perches wrote:

On Sun, 2016-10-30 at 19:02 -0400, Jes Sorensen wrote:

Code is 80 characters wide, and comments are /* */ never the ugly C++
crap.


You might look at the recent Linus Torvalds authored commit
5e467652ffef ("printk: re-organize log_output() to be more legible")
which does both of those: c99 // comments and > 80 columns.

Absolutes are for zealots.


Of course, but who is going to criticize Linus? I have gently chided him when an 
untested patch of his was inserted just before the final release, and broke my 
laptop. At least the bisection was pretty quick. :)


Larry






Re: net/ipv6: null-ptr-deref in inet6_bind

2016-11-03 Thread Eric Dumazet
On Wed, Nov 2, 2016 at 2:14 PM, Andrey Konovalov  wrote:
> Hi,
>
> I've got the following error report while running the syzkaller fuzzer:
>
> BUG: unable to handle kernel NULL pointer dereference at   (null)
> IP: [<  (null)>]   (null)
> PGD 66b6f067 [  102.549865] PUD 66c6e067
> PMD 0 [  102.549865]
> Oops: 0010 [#1] SMP KASAN
> Modules linked in:
> CPU: 0 PID: 4143 Comm: a.out Not tainted 4.9.0-rc3+ #336
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: 880066b1c200 task.stack: 880065b58000
> RIP: 0010:[<>]  [<  (null)>]   (null)
> RSP: 0018:880065b5fbc0  EFLAGS: 00010246
> RAX: 880066b1c200 RBX: 88006873864a RCX: 
> RDX: 0001 RSI: 880068738640 RDI: 880063bd3200
> RBP: 880065b5fd20 R08: 11000c77a713 R09: dc00
> R10: 844fc800 R11: 11000d0e70c9 R12: 84e7e040
> R13: 880068738640 R14: 880063bd3200 R15: 86836380
> FS:  7f40b7acf700() GS:88006cc0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2:  CR3: 6bb28000 CR4: 06f0
> Stack:
>  83099988 8479f7e8 81208580 110c
>  41b58ab3 8479f7e8 81208580 812506ed
>  0007 880065b5fc18 812506ed 880065b5fcd0
> Call Trace:
>  [] inet6_bind+0x8ec/0x1020 net/ipv6/af_inet6.c:384
>  [] SYSC_bind+0x1ec/0x250 net/socket.c:1367
>  [] SyS_bind+0x24/0x30 net/socket.c:1353
>  [] entry_SYSCALL_64_fastpath+0x1f/0xc2
> arch/x86/entry/entry_64.S:209
> Code:  Bad RIP value.
> RIP  [<  (null)>]   (null)
>  RSP 
> CR2: 
> ---[ end trace b5ec698ae4926a97 ]---
> Kernel panic - not syncing: Fatal exception in interrupt
> Kernel Offset: disabled
> ---[ end Kernel panic - not syncing: Fatal exception in interrupt
>
> On commit 0c183d92b20b5c84ca655b45ef57b3318b83eb9e (Oct 31).
>
> I'm able to reproduce it with the attached program by running it as:
> $ gcc -lpthread inet6-bind-poc.c
> $ while true; do ./a.out; done
>
> Thanks!

Looks like this patch should fix it ?

Thanks !

diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 3828f94b234c..95353bdbfa7b 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -956,6 +956,7 @@ static const struct inet_connection_sock_af_ops
dccp_ipv6_mapped = {
.getsockopt= ipv6_getsockopt,
.addr2sockaddr = inet6_csk_addr2sockaddr,
.sockaddr_len  = sizeof(struct sockaddr_in6),
+   .bind_conflict = inet6_csk_bind_conflict,
 #ifdef CONFIG_COMPAT
.compat_setsockopt = compat_ipv6_setsockopt,
.compat_getsockopt = compat_ipv6_getsockopt,


Re: [PATCH 1/2] rtl8xxxu: Fix for authentication failure

2016-11-03 Thread Larry Finger

On 11/03/2016 02:10 AM, John Heenan wrote:

On 3 November 2016 at 11:00, Larry Finger  wrote:

On 10/30/2016 05:20 AM, John Heenan wrote:


This fix enables the same sequence of init behaviour as the alternative
working driver for the wireless rtl8723bu IC at
https://github.com/lwfinger/rtl8723bu

For exampe rtl8xxxu_init_device is now called each time
userspace wpa_supplicant is executed instead of just once when
modprobe is executed.



After all the trouble you have had with your patches, I would expect you to
use more care when composing the commit message. Note the typo in the
paragraph above.



OK, the nasty games continue and the message is not getting through.

An appropriate response by a maintainer would have been to request I
revise the code according to the way it has currently and elegantly
revised in.


I am NOT a maintainer for this driver. I do have an interest in it, but not in 
any official capacity.


If you cannot accept constructive criticism, you are in the wrong activity. 
Please grow up!


Larry




Re: net/netlink: global-out-of-bounds in genl_family_rcv_msg/validate_nla

2016-11-03 Thread Andrey Konovalov
Hi Cong,

Yes, the last patch fixes the issue.

Tested-by: Andrey Konovalov 

Thanks!

On Thu, Nov 3, 2016 at 6:29 AM, Cong Wang  wrote:
> On Wed, Nov 2, 2016 at 10:25 PM, Cong Wang  wrote:
>> On Wed, Nov 2, 2016 at 5:25 PM, Andrey Konovalov  
>> wrote:
>>> Hi,
>>>
>>> I've got the following error report while running the syzkaller fuzzer:
>>>
>>> ==
>>> BUG: KASAN: global-out-of-bounds in validate_nla+0x49b/0x4e0 at addr
>>> 8407e3ac
>>> Read of size 2 by task a.out/3877
>>> Address belongs to variable[]
>>> cgroupstats_cmd_get_policy+0xc/0x40 ??:?
>>
>> Seems taskstats doesn't use genetlink correctly, CGROUPSTATS_CMD_ATTR_FD
>> is not within 0~TASKSTATS_CMD_ATTR_MAX.
>>
>> I guess we need the following patch, but it certainly breaks user-space... 
>> :-/
>
>
> Wait, maybe just this one-line fix is enough:
>
> diff --git a/kernel/taskstats.c b/kernel/taskstats.c
> index b3f05ee..e6b342e 100644
> --- a/kernel/taskstats.c
> +++ b/kernel/taskstats.c
> @@ -54,7 +54,7 @@ static const struct nla_policy
> taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1
> [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
> [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
>
> -static const struct nla_policy
> cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = {
> +static const struct nla_policy
> cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
> [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
>  };


Re: [PATCH net-next v2 0/5] bpf: BPF for lightweight tunnel encapsulation

2016-11-03 Thread Thomas Graf
On 3 November 2016 at 08:52, Hannes Frederic Sowa
 wrote:
> On 02.11.2016 23:54, Thomas Graf wrote:
>> Why would I want to accept the overhead if I simply avoid it? Just
>> parsing the header and doing the hash lookup will add cost, cost for
>> each packet.
>
> That is true, but in case you are outside of the namespace, you still
> have to calculate the cost of doing the FIB lookup for the BPF program
> each time, too.
>
> E.g. given the lookup cost in a hash for a network namespace pointer
> vs. the cost of doing a FIB lookup to get a program that does a specific
> transformation sounds at least in the big O-notation to be in a better
> place. ;)
>
> If you have to do both anyway, probably your patchset will perform
> better, I agree.

Most containers are unprivileged, the route inside the container's
namespace is owned by the host and we can attach the BPF program
directly to the default route inside the container and all packets
egressing from the container will pass through it. That fib lookup is
needed anyway so we can leverage the cost of that lookup. We can drop
hostile packets early without ever going on L2 level.


Re: [PATCH net-next v1 17/21] amd-xgbe: Add I2C support for determining SFP media types

2016-11-03 Thread Andrew Lunn
> There are a couple of things about this. Russell's work isn't part of
> the kernel yet so I can't make use of it.

Well, you could guide it into the kernel. Part of it has already made
its way in. And I know of other platforms which would benefit from it.

> Additionally, the I2C device is integrated into the IP of the
> network device with register addresses being offsets of the network
> device BAR so I'm not sure how I would go about getting it setup in
> order to use the i2c infrastructure.

Have you looked at the core i2c stuff? All you need is an
i2c_algorithm structure:

http://lxr.free-electrons.com/source/include/linux/i2c.h#L407

and an i2c_adapter structure:

http://lxr.free-electrons.com/source/include/linux/i2c.h#L532

and then you can call i2c_add_adapter() to register your i2c bus with
the i2c core. Embedding such an i2c driver inside another driver is not
a problem.

  Andrew



Re: [PATCH net-next v2 1/1] driver: veth: Refine the statistics codes of veth driver

2016-11-03 Thread Gao Feng
Hi Eric,

On Thu, Nov 3, 2016 at 11:07 PM, Eric Dumazet  wrote:
> On Thu, 2016-11-03 at 22:38 +0800, Gao Feng wrote:
>
>> Because other net devices put the statistics together.
>> Take tun/tap as example, it is a virtual device, but its all
>> statistics are percpu including dropped.
>
> Take a look at 2681128f0ced8aa4e66f221197e183cc16d244fe
> ("veth: reduce stat overhead")
>
> Feel free to fix tun/tap, not bloat veth and undo my work,
> without knowing why it was done this way.
>
> Thanks.
>
>

Thanks for your detailed explanation.

Best Regards
Feng




Re: [PATCH net-next v2 1/1] driver: veth: Refine the statistics codes of veth driver

2016-11-03 Thread Eric Dumazet
On Thu, 2016-11-03 at 22:38 +0800, Gao Feng wrote:

> Because other net devices put the statistics together.
> Take tun/tap as example, it is a virtual device, but its all
> statistics are percpu including dropped.

Take a look at 2681128f0ced8aa4e66f221197e183cc16d244fe
("veth: reduce stat overhead")

Feel free to fix tun/tap, not bloat veth and undo my work,
without knowing why it was done this way.

Thanks.




RE: [mm PATCH v2 01/26] swiotlb: Drop unused functions swiotlb_map_sg and swiotlb_unmap_sg

2016-11-03 Thread Duyck, Alexander H
> -Original Message-
> From: Christoph Hellwig [mailto:h...@infradead.org]
> Sent: Thursday, November 3, 2016 7:46 AM
> To: Konrad Rzeszutek Wilk 
> Cc: Christoph Hellwig ; Duyck, Alexander H
> ; linux...@kvack.org; akpm@linux-
> foundation.org; netdev@vger.kernel.org; linux-ker...@vger.kernel.org
> Subject: Re: [mm PATCH v2 01/26] swiotlb: Drop unused functions
> swiotlb_map_sg and swiotlb_unmap_sg
> 
> On Thu, Nov 03, 2016 at 10:29:52AM -0400, Konrad Rzeszutek Wilk wrote:
> > Somehow I thought you wanted to put them through your tree (which is
> > why I acked them).
> >
> > I can take them and also the first couple of Alexander through my
> > tree. Or if it makes it simpler - they can go through the -mm tree?
> 
> I don't have a tree for it, so I kinda expected you to pick it up.
> But I'm also fine with you just Acking the version from Alex and having him
> funnel it through whatever tree he wants to get his patches in through.

For the first 3 patches in my series I am fine with them being pulled into the 
swiotlb tree.  So if you want to pull Christoph's two patches, and then drop my 
duplicate patch and instead pull the next 2 I could submit a v3 of my series 
without the swiotlb patches in it.

At this point I have redone my series so that I technically don't have anything 
with a hard dependency on the DMA_ATTR_SKIP_CPU_SYNC actually doing anything 
yet.  My plan is to get this all into Linus's tree first via whatever tree I 
can get these patches pulled into and once I have all that I will start 
updating drivers in net-next.

Thanks.

- Alex


Re: [PATCH net-next v1 17/21] amd-xgbe: Add I2C support for determining SFP media types

2016-11-03 Thread Tom Lendacky
On 11/03/2016 09:28 AM, Andrew Lunn wrote:
> On Thu, Nov 03, 2016 at 08:30:36AM -0500, Tom Lendacky wrote:
>> Add support to initialize and use the I2C controller within the hardware
>> in order to determine the SFP media type that is installed.
> 
> Hi Tom

Hi Andrew

> 
> Did you see the work Russell King did for phylink?
> 
> https://lwn.net/Articles/667055/
> 
> Rather than doing your own proprietary i2c infrastructure, use the
> standard Linux i2c infrastructure. The SFP modules are then just
> normal i2c devices on a normal i2c bus. The work Russell did can then
> access them, export them to user space, get the PHY modes correctly
> setup, etc.

There are a couple of things about this. Russell's work isn't part of
the kernel yet so I can't make use of it.  Additionally, the I2C device
is integrated into the IP of the network device with register addresses
being offsets of the network device BAR so I'm not sure how I would go
about getting it setup in order to use the i2c infrastructure.

Thanks,
Tom

> 
>Andrew
> 


[PATCH 08/25] net/dev: Convert to hotplug state machine

2016-11-03 Thread Sebastian Andrzej Siewior
Install the callbacks via the state machine.

Cc: "David S. Miller" 
Cc: netdev@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Thomas Gleixner 
---
 include/linux/cpuhotplug.h |  1 +
 net/core/dev.c | 16 ++--
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 31c58f6ec3c6..394eb7ed53be 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -36,6 +36,7 @@ enum cpuhp_state {
CPUHP_PERCPU_CNT_DEAD,
CPUHP_RADIX_DEAD,
CPUHP_PAGE_ALLOC_DEAD,
+   CPUHP_NET_DEV_DEAD,
CPUHP_WORKQUEUE_PREP,
CPUHP_POWER_NUMA_PREPARE,
CPUHP_HRTIMERS_PREPARE,
diff --git a/net/core/dev.c b/net/core/dev.c
index 4bc19a164ba5..71693729bdd5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7947,18 +7947,13 @@ int dev_change_net_namespace(struct net_device *dev, 
struct net *net, const char
 }
 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
 
-static int dev_cpu_callback(struct notifier_block *nfb,
-   unsigned long action,
-   void *ocpu)
+static int dev_cpu_dead(unsigned int oldcpu)
 {
struct sk_buff **list_skb;
struct sk_buff *skb;
-   unsigned int cpu, oldcpu = (unsigned long)ocpu;
+   unsigned int cpu;
struct softnet_data *sd, *oldsd;
 
-   if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
-   return NOTIFY_OK;
-
local_irq_disable();
cpu = smp_processor_id();
sd = &per_cpu(softnet_data, cpu);
@@ -8008,10 +8003,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
input_queue_head_incr(oldsd);
}
 
-   return NOTIFY_OK;
+   return 0;
 }
 
-
 /**
  * netdev_increment_features - increment feature set by one
  * @all: current feature set
@@ -8345,7 +8339,9 @@ static int __init net_dev_init(void)
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
 
-   hotcpu_notifier(dev_cpu_callback, 0);
+   rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
+  NULL, dev_cpu_dead);
+   WARN_ON(rc < 0);
dst_subsys_init();
rc = 0;
 out:
-- 
2.10.2



Re: [PATCH net-next v1 02/21] amd-xgbe: Prepare for priority-based FIFO allocation

2016-11-03 Thread Tom Lendacky
On 11/03/2016 08:51 AM, Mintz, Yuval wrote:
>> +static void xgbe_calculate_equal_fifo(unsigned int fifo_size,
>> +  unsigned int queue_count,
>> +  unsigned int *fifo)
>> +{
> ...
>> +
>> +return;
>>  }
> 
> No need for explicit return.
> 

Ok, if there's feedback that requires a v2 I'll remove the return,
otherwise I plan to submit a checkpatch cleanup series as a follow-up
to fix some of the new warnings issued by the newer version of
checkpatch.

Thanks,
Tom


Re: [PATCH net-next v2 0/5] bpf: BPF for lightweight tunnel encapsulation

2016-11-03 Thread Hannes Frederic Sowa
On 02.11.2016 23:54, Thomas Graf wrote:
> On 1 November 2016 at 16:12, Hannes Frederic Sowa
>  wrote:
>> On 01.11.2016 21:59, Thomas Graf wrote:
>>>> Dumping and verifying which routes get used might actually already be
>>>> quite complex on its own. Thus my fear.
>>>
>>> We even have an API to query which route is used for a tuple. What
>>> else would you like to see?
>>
>> I am not sure here. Some ideas I had were to allow tcpdump (pf_packet)
>> sockets sniff at interfaces and also gather and dump the metadata to
>> user space (this would depend on bpf programs only doing the
>> modifications in metadata and not in the actual packet).
> 
> Not sure I understand. Why does this depend on BPF?

It doesn't. My hope was, if BPF merely tries to modify meta-data, we can
provide better debugging tools as if we mangle the packet directly.

>> Or maybe just tracing support (without depending on the eBPF program
>> developer to have added debugging in the BPF program).
> 
> Absolutely in favour of that.
> 
>>> This will be addressed with signing AFAIK.
>>
>> This sounds a bit unrealistic. Signing lots of small programs can be a
>> huge burden to the entity doing the signing (if it is not on the same
>> computer). And as far as I understood the programs should be generated
>> dynamically?
> 
> Right, for generated programs, a hash is a better fit and still sufficient.
> 
>>> Would it help if we allow to store the original source used for
>>> bytecode generation. What would make it clear which program was used.
>>
>> I would also be fine with just a strong hash of the bytecode, so the
>> program can be identified accurately. Maybe helps with deduplication
>> later on, too. ;)
> 
> OK, I think we all already agreed on doing this.
> 
>> Even though I read through the patchset I am not absolutely sure which
>> problem it really solves. Especially because lots of things can be done
>> already at the ingress vs. egress interface (I looked at patch 4 but I
>> am not sure how realistic they are).
> 
> Filtering at egress requires to attach the BPF program to all
> potential outgoing interface and then pass every single packet through
> the program whereas with LWT BPF, I'm only taking the cost where
> actually needed.

I do certainly see this point as a big plus. I definitely also thought
about this a lot when thinking about how flower can/should be used with
multiple interfaces and how to keep its flow tables synchronized.

>>> I also don't see how this could possibly scale if all packets must go
>>> through a single BPF program. The overhead will be tremendous if you
>>> only want to filter a couple of prefixes.
>>
>> In case of hash table lookup it should be fast. llvm will probably also
>> generate jump table for a few 100 ip addresses, no? Additionally the
>> routing table lookup could be not done at all.
> 
> Why would I want to accept the overhead if I simply avoid it? Just
> parsing the header and doing the hash lookup will add cost, cost for
> each packet.

That is true, but in case you are outside of the namespace, you still
have to calculate the cost of doing the FIB lookup for the BPF program
each time, too.

E.g. given the lookup cost in a hash for a network namespace pointer
vs. the cost of doing a FIB lookup to get a program that does a specific
transformation sounds at least in the big O-notation to be in a better
place. ;)

If you have to do both anyway, probably your patchset will perform
better, I agree.

Bye,
Hannes



[PATCH 09/25] net/flowcache: Convert to hotplug state machine

2016-11-03 Thread Sebastian Andrzej Siewior
Install the callbacks via the state machine. Use multi state support to avoid
custom list handling for the multiple instances.

Cc: "David S. Miller" 
Cc: Steffen Klassert 
Cc: Herbert Xu 
Cc: netdev@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Thomas Gleixner 
---
 include/linux/cpuhotplug.h |  1 +
 include/net/flow.h |  1 +
 include/net/flowcache.h|  2 +-
 net/core/flow.c| 60 --
 net/xfrm/xfrm_policy.c |  1 +
 5 files changed, 30 insertions(+), 35 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 394eb7ed53be..86b940f19df8 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -56,6 +56,7 @@ enum cpuhp_state {
CPUHP_ARM_SHMOBILE_SCU_PREPARE,
CPUHP_SH_SH3X_PREPARE,
CPUHP_BLK_MQ_PREPARE,
+   CPUHP_NET_FLOW_PREPARE,
CPUHP_TIMERS_DEAD,
CPUHP_NOTF_ERR_INJ_PREPARE,
CPUHP_MIPS_SOC_PREPARE,
diff --git a/include/net/flow.h b/include/net/flow.h
index 035aa7716967..2e386bd6ee63 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -239,6 +239,7 @@ struct flow_cache_object *flow_cache_lookup(struct net *net,
void *ctx);
 int flow_cache_init(struct net *net);
 void flow_cache_fini(struct net *net);
+void flow_cache_hp_init(void);
 
 void flow_cache_flush(struct net *net);
 void flow_cache_flush_deferred(struct net *net);
diff --git a/include/net/flowcache.h b/include/net/flowcache.h
index c8f665ec6e0d..9caf3bfc8d2d 100644
--- a/include/net/flowcache.h
+++ b/include/net/flowcache.h
@@ -17,7 +17,7 @@ struct flow_cache_percpu {
 struct flow_cache {
u32 hash_shift;
struct flow_cache_percpu __percpu *percpu;
-   struct notifier_block   hotcpu_notifier;
+   struct hlist_node   node;
int low_watermark;
int high_watermark;
struct timer_list   rnd_timer;
diff --git a/net/core/flow.c b/net/core/flow.c
index 3937b1b68d5b..841fd7f87b30 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -419,28 +419,20 @@ static int flow_cache_cpu_prepare(struct flow_cache *fc, 
int cpu)
return 0;
 }
 
-static int flow_cache_cpu(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int flow_cache_cpu_up_prep(unsigned int cpu, struct hlist_node *node)
 {
-   struct flow_cache *fc = container_of(nfb, struct flow_cache,
-   hotcpu_notifier);
-   int res, cpu = (unsigned long) hcpu;
+   struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
+
+   return flow_cache_cpu_prepare(fc, cpu);
+}
+
+static int flow_cache_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+   struct flow_cache *fc = hlist_entry_safe(node, struct flow_cache, node);
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
 
-   switch (action) {
-   case CPU_UP_PREPARE:
-   case CPU_UP_PREPARE_FROZEN:
-   res = flow_cache_cpu_prepare(fc, cpu);
-   if (res)
-   return notifier_from_errno(res);
-   break;
-   case CPU_DEAD:
-   case CPU_DEAD_FROZEN:
-   __flow_cache_shrink(fc, fcp, 0);
-   break;
-   }
-   return NOTIFY_OK;
+   __flow_cache_shrink(fc, fcp, 0);
+   return 0;
 }
 
 int flow_cache_init(struct net *net)
@@ -467,18 +459,8 @@ int flow_cache_init(struct net *net)
if (!fc->percpu)
return -ENOMEM;
 
-   cpu_notifier_register_begin();
-
-   for_each_online_cpu(i) {
-   if (flow_cache_cpu_prepare(fc, i))
-   goto err;
-   }
-   fc->hotcpu_notifier = (struct notifier_block){
-   .notifier_call = flow_cache_cpu,
-   };
-   __register_hotcpu_notifier(&fc->hotcpu_notifier);
-
-   cpu_notifier_register_done();
+   if (cpuhp_state_add_instance(CPUHP_NET_FLOW_PREPARE, &fc->node))
+   goto err;
 
setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
(unsigned long) fc);
@@ -494,8 +476,6 @@ int flow_cache_init(struct net *net)
fcp->hash_table = NULL;
}
 
-   cpu_notifier_register_done();
-
free_percpu(fc->percpu);
fc->percpu = NULL;
 
@@ -509,7 +489,8 @@ void flow_cache_fini(struct net *net)
struct flow_cache *fc = &net->xfrm.flow_cache_global;
 
del_timer_sync(&fc->rnd_timer);
-   unregister_hotcpu_notifier(&fc->hotcpu_notifier);
+
+   cpuhp_state_remove_instance_nocalls(CPUHP_NET_FLOW_PREPARE, &fc->node);
 
for_each_possible_cpu(i) {
struct 

Re: [mm PATCH v2 01/26] swiotlb: Drop unused functions swiotlb_map_sg and swiotlb_unmap_sg

2016-11-03 Thread Christoph Hellwig
On Thu, Nov 03, 2016 at 10:29:52AM -0400, Konrad Rzeszutek Wilk wrote:
> Somehow I thought you wanted to put them through your tree (which
> is why I acked them).
> 
> I can take them and also the first couple of Alexander through
> my tree. Or if it makes it simpler - they can go through the -mm tree?

I don't have a tree for it, so I kinda expected you to pick it up.
But I'm also fine with you just Acking the version from Alex and having
him funnel it through whatever tree he wants to get his patches in
through.


Re: [PATCH net-next v2 1/1] driver: veth: Refine the statistics codes of veth driver

2016-11-03 Thread Eric Dumazet
On Thu, 2016-11-03 at 21:39 +0800, Gao Feng wrote:
> Hi Eric,
> 
> On Thu, Nov 3, 2016 at 9:30 PM, Eric Dumazet  wrote:
> > On Thu, 2016-11-03 at 21:03 +0800, f...@ikuai8.com wrote:
> >> From: Gao Feng 
> >>
> >> The dropped count of veth is located in struct veth_priv, but other
> >> statistics like packets and bytes are in another struct pcpu_vstats.
> >> Now keep these three counters in the same struct.
> >>
> >> Signed-off-by: Gao Feng 
> >> ---
> >>  v2: Use right "peer" instead of "dev";
> >>  v1: Initial version
> >
> > May I ask : Why ?
> 
> Just because I think statistics should be in the same struct.

That is not a good reason then.

> 
> >
> > We did that because there was no point making per-cpu requirements
> > bigger, for a counter that is hardly ever updated.
> >
> > Do you have a real case where performance dropping packets in a driver
> > is needed ?
> 
> No, I haven't met the performance issue now.

OK then kill this patch.

> 
> >
> > At some point we will have to stop dumb percpu explosion, when we have
> > 128+ cores per host. Folding all these percpu counters is taking a lot
> > of time too.
> >
> >
> >
> Ok, I get it. It is designed specially to put the dropped counter as
> atomic counter, not percpu.
> But I have one question that when put the counters as percpu, and when not?

Because the regular fast path needs to be fast ?

Try to _use_ veth without these percpu stats and be prepared to be
shocked.





Re: [PATCH net-next v2 1/1] driver: veth: Refine the statistics codes of veth driver

2016-11-03 Thread Gao Feng
On Thu, Nov 3, 2016 at 10:31 PM, Eric Dumazet  wrote:
> On Thu, 2016-11-03 at 21:39 +0800, Gao Feng wrote:
>> Hi Eric,
>>
>> On Thu, Nov 3, 2016 at 9:30 PM, Eric Dumazet  wrote:
>> > On Thu, 2016-11-03 at 21:03 +0800, f...@ikuai8.com wrote:
>> >> From: Gao Feng 
>> >>
>> >> The dropped count of veth is located in struct veth_priv, but other
>> >> statistics like packets and bytes are in another struct pcpu_vstats.
>> >> Now keep these three counters in the same struct.
>> >>
>> >> Signed-off-by: Gao Feng 
>> >> ---
>> >>  v2: Use right "peer" instead of "dev";
>> >>  v1: Initial version
>> >
>> > May I ask : Why ?
>>
>> Just because I think statistics should be in the same struct.
>
> That is not a good reason then.

Because other net devices put the statistics together.
Take tun/tap as example, it is a virtual device, but its all
statistics are percpu including dropped.

Regards
Feng

>
>>
>> >
>> > We did that because there was no point making per-cpu requirements
>> > bigger, for a counter that is hardly ever updated.
>> >
>> > Do you have a real case where performance dropping packets in a driver
>> > is needed ?
>>
>> No, I haven't met the performance issue now.
>
> OK then kill this patch.
>
>>
>> >
>> > At some point we will have to stop dumb percpu explosion, when we have
>> > 128+ cores per host. Folding all these percpu counters is taking a lot
>> > of time too.
>> >
>> >
>> >
>> Ok, I get it. It is designed specially to put the dropped counter as
>> atomic counter, not percpu.
>> But I have one question that when put the counters as percpu, and when not?
>
> Because the regular fast path needs to be fast ?
>
> Try to _use_ veth without these percpu stats and be prepared to be
> shocked.
>
>
>




Re: [mm PATCH v2 01/26] swiotlb: Drop unused functions swiotlb_map_sg and swiotlb_unmap_sg

2016-11-03 Thread Konrad Rzeszutek Wilk
On Thu, Nov 03, 2016 at 07:14:46AM -0700, Christoph Hellwig wrote:
> On Wed, Nov 02, 2016 at 07:12:31AM -0400, Alexander Duyck wrote:
> > There are no users for swiotlb_map_sg or swiotlb_unmap_sg so we might as
> > well just drop them.
> 
> FYI, I sent the same patch already on Sep 11 and Konrad already ACKed
> it:
> 
> https://lkml.org/lkml/2016/9/11/112
> https://lkml.org/lkml/2016/9/16/474

Somehow I thought you wanted to put them through your tree (which
is why I acked them).

I can take them and also the first couple of Alexander through
my tree. Or if it makes it simpler - they can go through the -mm tree?



Re: [PATCH net-next v1 17/21] amd-xgbe: Add I2C support for determining SFP media types

2016-11-03 Thread Andrew Lunn
On Thu, Nov 03, 2016 at 08:30:36AM -0500, Tom Lendacky wrote:
> Add support to initialize and use the I2C controller within the hardware
> in order to determine the SFP media type that is installed.

Hi Tom

Did you see the work Russell King did for phylink?

https://lwn.net/Articles/667055/

Rather than doing your own proprietary i2c infrastructure, use the
standard Linux i2c infrastructure. The SFP modules are then just
normal i2c devices on a normal i2c bus. The work Russell did can then
access them, export them to user space, get the PHY modes correctly
setup, etc.

   Andrew


Re: bpf: kernel BUG in htab_elem_free

2016-11-03 Thread Dmitry Vyukov
On Wed, Nov 2, 2016 at 11:14 PM, Dmitry Vyukov  wrote:
> Here we go.
>
> The following program triggers kernel BUG in htab_elem_free.
> On commit 0c183d92b20b5c84ca655b45ef57b3318b83eb9e (Oct 31).
> Run as "while true; do ./a.out; done".
>
> [ cut here ]
> kernel BUG at mm/slub.c:3866!
> invalid opcode:  [#1] SMP KASAN
> Modules linked in:
> CPU: 1 PID: 1542 Comm: kworker/1:2 Not tainted 4.9.0-rc3+ #20
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Workqueue: events bpf_map_free_deferred
> task: 88003b9c0040 task.stack: 88003cb7
> RIP: 0010:[]  [] kfree+0x140/0x1a0
> RSP: 0018:88003cb77c50  EFLAGS: 00010246
> RAX: eafb0aa0 RBX: 88003ec2a1a8 RCX: 
> RDX:  RSI: 110007b50401 RDI: 88003ec2a1a8
> RBP: 88003cb77c70 R08: 00021800 R09: 
> R10:  R11:  R12: eafb0a80
> R13: 81392bcb R14:  R15: 88003ec2a1a8
> FS:  () GS:88003ed0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 205d7000 CR3: 37d29000 CR4: 06e0
> Stack:
>  dc00 88003da82008 88003b75bb88 
>  88003cb77ce0 81392bcb 81acf4f8 88003b75bc04
>  88003b75bbe0 ed00076eb772 88003b75bb90 3cb77ce0
> Call Trace:
>  [< inline >] htab_elem_free kernel/bpf/hashtab.c:388
>  [< inline >] delete_all_elements kernel/bpf/hashtab.c:690
>  [] htab_map_free+0x30b/0x470 kernel/bpf/hashtab.c:711
>  [] bpf_map_free_deferred+0xac/0xd0 kernel/bpf/syscall.c:97
>  [] process_one_work+0x8a7/0x1300 kernel/workqueue.c:2096
>  [] worker_thread+0xed/0x14e0 kernel/workqueue.c:2230
>  [] kthread+0x1ec/0x260 kernel/kthread.c:209
>  [] ret_from_fork+0x25/0x30 arch/x86/entry/entry_64.S:433
> Code: 83 c4 18 48 89 da 4c 89 ee ff d0 49 8b 04 24 48 85 c0 75 e6 e9
> e9 fe ff ff 49 8b 04 24 f6 c4 40 75 0b 49 8b 44 24 20 a8 01 75 02 <0f>
> 0b 48 89 df e8 56 35 00 00 49 8b 04 24 31 f6 f6 c4 40 74 05
> RIP  [< inline >] PageCompound ./include/linux/page-flags.h:157
> RIP  [] kfree+0x140/0x1a0 mm/slub.c:3866
>  RSP 
> ---[ end trace 1dc58d6aeb2596aa ]---
> ==
> BUG: KASAN: stack-out-of-bounds in complete+0x68/0x70 at addr 88003cb77ed8
> Read of size 4 by task kworker/1:2/1542
> page:eaf2ddc0 count:0 mapcount:0 mapping:  (null) index:0x0
> flags: 0x100()
> page dumped because: kasan: bad access detected
> CPU: 1 PID: 1542 Comm: kworker/1:2 Tainted: G  D 4.9.0-rc3+ #20
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>  88003cb77ce0 81acf609 ed000796efdb ed000796efdb
>  0004  88003cb77d60 814cdbfb
>  88003c8d97c8 dc00 811dd038 0097
> Call Trace:
>  [< inline >] __dump_stack lib/dump_stack.c:15
>  [] dump_stack+0x83/0xba lib/dump_stack.c:51
>  [< inline >] kasan_report_error mm/kasan/report.c:204
>  [] kasan_report+0x4cb/0x500 mm/kasan/report.c:303
>  [] __asan_report_load4_noabort+0x14/0x20
> mm/kasan/report.c:328
>  [] complete+0x68/0x70 kernel/sched/completion.c:34
>  [< inline >] complete_vfork_done kernel/fork.c:1030
>  [] mm_release+0x222/0x3f0 kernel/fork.c:1114
>  [< inline >] exit_mm kernel/exit.c:467
>  [] do_exit+0x3a1/0x2960 kernel/exit.c:815
>  [] rewind_stack_do_exit+0x17/0x20
> arch/x86/entry/entry_64.S:1526
> Memory state around the buggy address:
>  88003cb77d80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  88003cb77e00: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 00 f4 f4 f4
>>88003cb77e80: f2 f2 f2 f2 00 f4 f4 f4 f2 f2 f2 f2 00 00 f4 f4
> ^
>  88003cb77f00: f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
>  88003cb77f80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> ==
> BUG: unable to handle kernel
> paging request at ffd8
> IP: [] kthread_data+0x4d/0x70 kernel/kthread.c:137
> PGD 360d067 [   48.581115] PUD 360f067
> PMD 0 [   48.581840]
> Oops:  [#2] SMP KASAN
> Modules linked in:
> CPU: 1 PID: 1542 Comm: kworker/1:2 Tainted: GB D 4.9.0-rc3+ #20
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: 88003b9c0040 task.stack: 88003cb7
> RIP: 0010:[]  [] kthread_data+0x4d/0x70
> RSP: 0018:88003cb77c78  EFLAGS: 00010046
> RAX: dc00 RBX:  RCX: 
> RDX: 1ffb RSI: 88003b9c00c0 RDI: ffd8
> RBP: 88003cb77c80 R08: 88003ed20a48 R09: 88003ed20a40
> R10:  R11:  R12: 

Re: [PATCH net-next v2 0/5] bpf: BPF for lightweight tunnel encapsulation

2016-11-03 Thread Thomas Graf
On 2 November 2016 at 04:48, Hannes Frederic Sowa
 wrote:
> On Wed, Nov 2, 2016, at 00:07, Tom Herbert wrote:
>> On the other hand, I'm not really sure how to implement for this level
>> of performance this in LWT+BPF either. It seems like one way to do
>> that would be to create a program each destination and set it each
>> host. As you point out would create a million different programs which
>> doesn't seem manageable. I don't think the BPF map works either since
>> that implies we need a lookup (?). It seems like what we need is one
>> program but allow it to be parameterized with per destination
>> information saved in the route (LWT structure).
>
> Yes, that is my proposal. Just using the dst entry as meta-data (which
> can actually also be an ID for the network namespace the packet is
> coming from).

I have no objection to doing this on top of this series.

> My concern with using BPF is that the rest of the kernel doesn't really
> see the semantics and can't optimize or cache at specific points,
> because the kernel cannot introspect what the BPF program does (for
> metadata manipulation, one can e.g. specify that the program is "pure",
> and always provides the same output for some specified given input, thus
> things can be cached and memorized, but that framework seems very hard
> to build).

So you want to reintroduce a routing cache? Each packet needs to pass
through the BPF program anyway for accounting purposes. This is not
just about getting the packets out the right nexthop in the fastest
possible way.

> I also fear this becomes a kernel by-pass:
>
> It might be very hard e.g. to apply NFT/netfilter to such packets, if
> e.g. a redirect happens suddenly and packet flow is diverted from the
> one the user sees currently based on the interfaces and routing tables.

The LWT xmit hook is after the POST_ROUTING hook. The input and output
hook cannot redirect and output will become read-only just like input
already is. We are not bypassing anything. Please stop throwing the
word bypass around. This is just a false claim.

> That's why I am in favor of splitting this patchset down and allow the
> policies that should be expressed by BPF programs being applied to the
> specific subsystems (I am not totally against a generic BPF hook in
> input or output of the protocol engines). E.g. can we deal with static
> rewriting of L2 addresses in the neighbor cache? We already provide a
> fast header cache for L2 data which might be used here?

Split what? What policies?

I have two primary use cases for this:
1) Traffic into local containers: Containers are only supposed to do
L3, all L2 traffic is dropped for security reasons. The L2 header for
any packets in and out of the container is fixed and does not require
any sort of resolving. In order to feed packets from the local host
into the containers, a route with the container prefix is set up. It
points to a nexthop address which appears behind a veth pair. A BPF
program is listening at tc ingress on the veth pair and will enforce
policies and do accounting. It requires very ugly hacks because Linux
does not like to do forwarding to an address which is considered
local. It works but it is a hack.

What I want to do instead is to run the BPF program for the route
directly, apply the policies, do accounting, push the fixed dummy L2
header and redirect it to the container. If someone has netfilter
rules installed, they will still apply. Nothing is hidden.

2) For external traffic that is coming in. We have a BPF program
listening on tc ingress which matches on the destination address on
all incoming traffic. If the packet is for a container, we perform
the same actions as above. In this case we are bypassing the routing
table. This is ugly. What I want to do instead is to have the
container prefix invoke the BPF program so all packets have a route
lookup performed and netfilter filtering performed, only after that,
the BPF program is invoked exclusively for the packets destined for
local containers. Yes, it would be possible to redirect into a
temporary veth again and listen on that but it again requires to fake
a L2 segment which is just unnecessary and slow.

This is not hiding anything and it is not bypassing anything.


RE: [PATCH net-next v1 02/21] amd-xgbe: Prepare for priority-based FIFO allocation

2016-11-03 Thread Mintz, Yuval
> +static void xgbe_calculate_equal_fifo(unsigned int fifo_size,
> +   unsigned int queue_count,
> +   unsigned int *fifo)
> +{
...
> +
> + return;
>  }

No need for explicit return.



Re: [mm PATCH v2 01/26] swiotlb: Drop unused functions swiotlb_map_sg and swiotlb_unmap_sg

2016-11-03 Thread Christoph Hellwig
On Wed, Nov 02, 2016 at 07:12:31AM -0400, Alexander Duyck wrote:
> There are no users for swiotlb_map_sg or swiotlb_unmap_sg so we might as
> well just drop them.

FYI, I sent the same patch already on Sep 11 and Konrad already ACKed
it:

https://lkml.org/lkml/2016/9/11/112
https://lkml.org/lkml/2016/9/16/474


[PATCH net-next 09/13] nfp: reorganize nfp_net_rx() to get packet offsets early

2016-11-03 Thread Jakub Kicinski
Calculate packet offsets early in nfp_net_rx() so that we will be
able to use them in upcoming XDP handler.  While at it move relevant
variables into the loop scope.

Signed-off-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 56 --
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 506362729607..2ab63661a6fd 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1383,16 +1383,17 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, 
int budget)
 {
struct nfp_net_r_vector *r_vec = rx_ring->r_vec;
struct nfp_net *nn = r_vec->nfp_net;
-   unsigned int data_len, meta_len;
-   struct nfp_net_rx_buf *rxbuf;
-   struct nfp_net_rx_desc *rxd;
-   dma_addr_t new_dma_addr;
struct sk_buff *skb;
int pkts_polled = 0;
-   void *new_frag;
int idx;
 
while (pkts_polled < budget) {
+   unsigned int meta_len, data_len, data_off, pkt_len, pkt_off;
+   struct nfp_net_rx_buf *rxbuf;
+   struct nfp_net_rx_desc *rxd;
+   dma_addr_t new_dma_addr;
+   void *new_frag;
+
idx = rx_ring->rd_p & (rx_ring->cnt - 1);
 
rxd = &rx_ring->rxds[idx];
@@ -1408,22 +1409,6 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, 
int budget)
pkts_polled++;
 
                rxbuf = &rx_ring->rxbufs[idx];
-   skb = build_skb(rxbuf->frag, nn->fl_bufsz);
-   if (unlikely(!skb)) {
-   nfp_net_rx_drop(r_vec, rx_ring, rxbuf, NULL);
-   continue;
-   }
-               new_frag = nfp_net_napi_alloc_one(nn, &new_dma_addr);
-   if (unlikely(!new_frag)) {
-   nfp_net_rx_drop(r_vec, rx_ring, rxbuf, skb);
-   continue;
-   }
-
-   nfp_net_dma_unmap_rx(nn, rx_ring->rxbufs[idx].dma_addr,
-nn->fl_bufsz, DMA_FROM_DEVICE);
-
-   nfp_net_rx_give_one(rx_ring, new_frag, new_dma_addr);
-
/* < meta_len >
 *  <-- [rx_offset] -->
 *  -
@@ -1438,20 +1423,39 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, 
int budget)
 */
meta_len = rxd->rxd.meta_len_dd & PCIE_DESC_RX_META_LEN_MASK;
data_len = le16_to_cpu(rxd->rxd.data_len);
+   pkt_len = data_len - meta_len;
 
if (nn->rx_offset == NFP_NET_CFG_RX_OFFSET_DYNAMIC)
-   skb_reserve(skb, NFP_NET_RX_BUF_HEADROOM + meta_len);
+   pkt_off = meta_len;
else
-   skb_reserve(skb,
-   NFP_NET_RX_BUF_HEADROOM + nn->rx_offset);
-   skb_put(skb, data_len - meta_len);
+   pkt_off = nn->rx_offset;
+   data_off = NFP_NET_RX_BUF_HEADROOM + pkt_off;
 
/* Stats update */
                u64_stats_update_begin(&r_vec->rx_sync);
                r_vec->rx_pkts++;
-               r_vec->rx_bytes += skb->len;
+               r_vec->rx_bytes += pkt_len;
                u64_stats_update_end(&r_vec->rx_sync);
 
+   skb = build_skb(rxbuf->frag, nn->fl_bufsz);
+   if (unlikely(!skb)) {
+   nfp_net_rx_drop(r_vec, rx_ring, rxbuf, NULL);
+   continue;
+   }
+               new_frag = nfp_net_napi_alloc_one(nn, &new_dma_addr);
+   if (unlikely(!new_frag)) {
+   nfp_net_rx_drop(r_vec, rx_ring, rxbuf, skb);
+   continue;
+   }
+
+   nfp_net_dma_unmap_rx(nn, rx_ring->rxbufs[idx].dma_addr,
+nn->fl_bufsz, DMA_FROM_DEVICE);
+
+   nfp_net_rx_give_one(rx_ring, new_frag, new_dma_addr);
+
+   skb_reserve(skb, data_off);
+   skb_put(skb, pkt_len);
+
if (nn->fw_ver.major <= 3) {
nfp_net_set_hash_desc(nn->netdev, skb, rxd);
} else if (meta_len) {
-- 
1.9.1



[PATCH net-next 03/13] nfp: rename ring allocation helpers

2016-11-03 Thread Jakub Kicinski
"Shadow" in ring helpers used to mean that the helper will allocate
rings without touching existing configuration, this was used for
reconfiguration while the device was running.  We will soon use
the same helpers for .ndo_open() path, so replace "shadow" with
"ring_set".

No functional changes.

Signed-off-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 26 +++---
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index e58532d27c5b..b7b2851ebb6b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1573,7 +1573,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 }
 
 static struct nfp_net_tx_ring *
-nfp_net_shadow_tx_rings_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_tx_ring_set_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_tx_ring *rings;
unsigned int r;
@@ -1599,7 +1599,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 }
 
 static void
-nfp_net_shadow_tx_rings_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_tx_ring_set_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_tx_ring *rings = s->rings;
struct nfp_net_ring_set new = *s;
@@ -1616,7 +1616,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 }
 
 static void
-nfp_net_shadow_tx_rings_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_tx_ring_set_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_tx_ring *rings = s->rings;
unsigned int r;
@@ -1693,7 +1693,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 }
 
 static struct nfp_net_rx_ring *
-nfp_net_shadow_rx_rings_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_rx_ring_set_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
unsigned int fl_bufsz = nfp_net_calc_fl_bufsz(nn, s->mtu);
struct nfp_net_rx_ring *rings;
@@ -1726,7 +1726,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 }
 
 static void
-nfp_net_shadow_rx_rings_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_rx_ring_set_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_rx_ring *rings = s->rings;
struct nfp_net_ring_set new = *s;
@@ -1746,7 +1746,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 }
 
 static void
-nfp_net_shadow_rx_rings_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
+nfp_net_rx_ring_set_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_rx_ring *rings = s->rings;
unsigned int r;
@@ -2268,9 +2268,9 @@ static void nfp_net_set_rx_mode(struct net_device *netdev)
 struct nfp_net_ring_set *tx)
 {
if (rx)
-   nfp_net_shadow_rx_rings_swap(nn, rx);
+   nfp_net_rx_ring_set_swap(nn, rx);
if (tx)
-   nfp_net_shadow_tx_rings_swap(nn, tx);
+   nfp_net_tx_ring_set_swap(nn, tx);
 
return __nfp_net_set_config_and_enable(nn);
 }
@@ -2299,11 +2299,11 @@ static void nfp_net_set_rx_mode(struct net_device 
*netdev)
 
/* Prepare new rings */
if (rx) {
-   if (!nfp_net_shadow_rx_rings_prepare(nn, rx))
+   if (!nfp_net_rx_ring_set_prepare(nn, rx))
return -ENOMEM;
}
if (tx) {
-   if (!nfp_net_shadow_tx_rings_prepare(nn, tx)) {
+   if (!nfp_net_tx_ring_set_prepare(nn, tx)) {
err = -ENOMEM;
goto err_free_rx;
}
@@ -2327,9 +2327,9 @@ static void nfp_net_set_rx_mode(struct net_device *netdev)
}
 
if (rx)
-   nfp_net_shadow_rx_rings_free(nn, rx);
+   nfp_net_rx_ring_set_free(nn, rx);
if (tx)
-   nfp_net_shadow_tx_rings_free(nn, tx);
+   nfp_net_tx_ring_set_free(nn, tx);
 
nfp_net_open_stack(nn);
 
@@ -2337,7 +2337,7 @@ static void nfp_net_set_rx_mode(struct net_device *netdev)
 
 err_free_rx:
if (rx)
-   nfp_net_shadow_rx_rings_free(nn, rx);
+   nfp_net_rx_ring_set_free(nn, rx);
return err;
 }
 
-- 
1.9.1



[PATCH net-next v1 21/21] amd-xgbe: Add support for a KR redriver

2016-11-03 Thread Tom Lendacky
This patch provides support for the presence of a KR redriver chip in
between the device PCS and an external PHY.  When a redriver chip is
present the device must perform clause 73 auto-negotiation in order to
set the redriver chip for the downstream connection.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h |   10 +
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c   |   45 ++-
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v1.c |7 
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |  427 +--
 drivers/net/ethernet/amd/xgbe/xgbe.h|6 
 5 files changed, 458 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h 
b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index ecd4f4d..5b7ba25 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -1062,6 +1062,16 @@
 #define XP_PROP_4_MUX_ADDR_LO_WIDTH3
 #define XP_PROP_4_MUX_CHAN_INDEX   4
 #define XP_PROP_4_MUX_CHAN_WIDTH   3
+#define XP_PROP_4_REDRV_ADDR_INDEX 16
+#define XP_PROP_4_REDRV_ADDR_WIDTH 7
+#define XP_PROP_4_REDRV_IF_INDEX   23
+#define XP_PROP_4_REDRV_IF_WIDTH   1
+#define XP_PROP_4_REDRV_LANE_INDEX 24
+#define XP_PROP_4_REDRV_LANE_WIDTH 3
+#define XP_PROP_4_REDRV_MODEL_INDEX28
+#define XP_PROP_4_REDRV_MODEL_WIDTH3
+#define XP_PROP_4_REDRV_PRESENT_INDEX  31
+#define XP_PROP_4_REDRV_PRESENT_WIDTH  1
 
 /* I2C Control register offsets */
 #define IC_CON 0x0000
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 622675a..0ecae70 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -179,6 +179,7 @@ static void xgbe_an_enable_interrupts(struct xgbe_prv_data 
*pdata)
 {
switch (pdata->an_mode) {
case XGBE_AN_MODE_CL73:
+   case XGBE_AN_MODE_CL73_REDRV:
xgbe_an73_enable_interrupts(pdata);
break;
case XGBE_AN_MODE_CL37:
@@ -254,6 +255,10 @@ static void xgbe_kx_1000_mode(struct xgbe_prv_data *pdata)
 
 static void xgbe_sfi_mode(struct xgbe_prv_data *pdata)
 {
+   /* If a KR re-driver is present, change to KR mode instead */
+   if (pdata->kr_redrv)
+   return xgbe_kr_mode(pdata);
+
/* Disable KR training */
xgbe_an73_disable_kr_training(pdata);
 
@@ -433,6 +438,7 @@ static void xgbe_an_restart(struct xgbe_prv_data *pdata)
 {
switch (pdata->an_mode) {
case XGBE_AN_MODE_CL73:
+   case XGBE_AN_MODE_CL73_REDRV:
xgbe_an73_restart(pdata);
break;
case XGBE_AN_MODE_CL37:
@@ -448,6 +454,7 @@ static void xgbe_an_disable(struct xgbe_prv_data *pdata)
 {
switch (pdata->an_mode) {
case XGBE_AN_MODE_CL73:
+   case XGBE_AN_MODE_CL73_REDRV:
xgbe_an73_disable(pdata);
break;
case XGBE_AN_MODE_CL37:
@@ -687,6 +694,7 @@ static irqreturn_t xgbe_an_isr(int irq, void *data)
 
switch (pdata->an_mode) {
case XGBE_AN_MODE_CL73:
+   case XGBE_AN_MODE_CL73_REDRV:
xgbe_an73_isr(pdata);
break;
case XGBE_AN_MODE_CL37:
@@ -895,6 +903,7 @@ static void xgbe_an_state_machine(struct work_struct *work)
 
switch (pdata->an_mode) {
case XGBE_AN_MODE_CL73:
+   case XGBE_AN_MODE_CL73_REDRV:
xgbe_an73_state_machine(pdata);
break;
case XGBE_AN_MODE_CL37:
@@ -910,16 +919,18 @@ static void xgbe_an_state_machine(struct work_struct 
*work)
 
 static void xgbe_an37_init(struct xgbe_prv_data *pdata)
 {
-   unsigned int reg;
+   unsigned int advertising, reg;
+
+   advertising = pdata->phy_if.phy_impl.an_advertising(pdata);
 
/* Set up Advertisement register */
reg = XMDIO_READ(pdata, MDIO_MMD_VEND2, MDIO_VEND2_AN_ADVERTISE);
-   if (pdata->phy.advertising & ADVERTISED_Pause)
+   if (advertising & ADVERTISED_Pause)
reg |= 0x100;
else
reg &= ~0x100;
 
-   if (pdata->phy.advertising & ADVERTISED_Asym_Pause)
+   if (advertising & ADVERTISED_Asym_Pause)
reg |= 0x80;
else
reg &= ~0x80;
@@ -954,11 +965,13 @@ static void xgbe_an37_init(struct xgbe_prv_data *pdata)
 
 static void xgbe_an73_init(struct xgbe_prv_data *pdata)
 {
-   unsigned int reg;
+   unsigned int advertising, reg;
+
+   advertising = pdata->phy_if.phy_impl.an_advertising(pdata);
 
/* Set up Advertisement register 3 first */
reg = XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2);
-   if (pdata->phy.advertising & ADVERTISED_10000baseR_FEC)
+   if (advertising & ADVERTISED_10000baseR_FEC)
reg |= 0xc000;
  

[PATCH net-next v1 01/21] amd-xgbe: Fix formatting of PCS register dump

2016-11-03 Thread Tom Lendacky
Fix the length value used for the PCS register dump so that the full
value can be displayed.

Signed-off-by: Tom Lendacky 
---
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c |   24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index 84c5d29..e9b01fc 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -1257,33 +1257,33 @@ static void xgbe_dump_phy_registers(struct 
xgbe_prv_data *pdata)
 
dev_dbg(dev, "\n* PHY Reg dump **\n");
 
-   dev_dbg(dev, "PCS Control Reg (%#04x) = %#04x\n", MDIO_CTRL1,
+   dev_dbg(dev, "PCS Control Reg (%#06x) = %#06x\n", MDIO_CTRL1,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_CTRL1));
-   dev_dbg(dev, "PCS Status Reg (%#04x) = %#04x\n", MDIO_STAT1,
+   dev_dbg(dev, "PCS Status Reg (%#06x) = %#06x\n", MDIO_STAT1,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_STAT1));
-   dev_dbg(dev, "Phy Id (PHYS ID 1 %#04x)= %#04x\n", MDIO_DEVID1,
+   dev_dbg(dev, "Phy Id (PHYS ID 1 %#06x)= %#06x\n", MDIO_DEVID1,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_DEVID1));
-   dev_dbg(dev, "Phy Id (PHYS ID 2 %#04x)= %#04x\n", MDIO_DEVID2,
+   dev_dbg(dev, "Phy Id (PHYS ID 2 %#06x)= %#06x\n", MDIO_DEVID2,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_DEVID2));
-   dev_dbg(dev, "Devices in Package (%#04x)= %#04x\n", MDIO_DEVS1,
+   dev_dbg(dev, "Devices in Package (%#06x)= %#06x\n", MDIO_DEVS1,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_DEVS1));
-   dev_dbg(dev, "Devices in Package (%#04x)= %#04x\n", MDIO_DEVS2,
+   dev_dbg(dev, "Devices in Package (%#06x)= %#06x\n", MDIO_DEVS2,
XMDIO_READ(pdata, MDIO_MMD_PCS, MDIO_DEVS2));
 
-   dev_dbg(dev, "Auto-Neg Control Reg (%#04x) = %#04x\n", MDIO_CTRL1,
+   dev_dbg(dev, "Auto-Neg Control Reg (%#06x) = %#06x\n", MDIO_CTRL1,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_CTRL1));
-   dev_dbg(dev, "Auto-Neg Status Reg (%#04x) = %#04x\n", MDIO_STAT1,
+   dev_dbg(dev, "Auto-Neg Status Reg (%#06x) = %#06x\n", MDIO_STAT1,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_STAT1));
-   dev_dbg(dev, "Auto-Neg Ad Reg 1 (%#04x) = %#04x\n",
+   dev_dbg(dev, "Auto-Neg Ad Reg 1 (%#06x) = %#06x\n",
MDIO_AN_ADVERTISE,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE));
-   dev_dbg(dev, "Auto-Neg Ad Reg 2 (%#04x) = %#04x\n",
+   dev_dbg(dev, "Auto-Neg Ad Reg 2 (%#06x) = %#06x\n",
MDIO_AN_ADVERTISE + 1,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 1));
-   dev_dbg(dev, "Auto-Neg Ad Reg 3 (%#04x) = %#04x\n",
+   dev_dbg(dev, "Auto-Neg Ad Reg 3 (%#06x) = %#06x\n",
MDIO_AN_ADVERTISE + 2,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_ADVERTISE + 2));
-   dev_dbg(dev, "Auto-Neg Completion Reg (%#04x) = %#04x\n",
+   dev_dbg(dev, "Auto-Neg Completion Reg (%#06x) = %#06x\n",
MDIO_AN_COMP_STAT,
XMDIO_READ(pdata, MDIO_MMD_AN, MDIO_AN_COMP_STAT));
 



[PATCH net-next 08/13] nfp: add support for ethtool .set_channels

2016-11-03 Thread Jakub Kicinski
Allow changing the number of rings via ethtool .set_channels API.
Runtime reconfig needs to be extended to handle number of rings.
We need to be able to activate interrupt vectors before rings are
assigned to them.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 93 +-
 .../net/ethernet/netronome/nfp/nfp_net_debugfs.c   |  4 +-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   | 47 +++
 4 files changed, 123 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 14b5e21cabf1..486e7c6453bc 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -584,6 +584,7 @@ struct nfp_net {
 };
 
 struct nfp_net_ring_set {
+   unsigned int n_rings;
unsigned int mtu;
unsigned int dcnt;
void *rings;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 09cec6e2c6cf..506362729607 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -494,7 +494,7 @@ static void nfp_net_irqs_assign(struct net_device *netdev)
nn->lsc_handler = nfp_net_irq_lsc;
nn->exn_handler = nfp_net_irq_exn;
 
-   for (r = 0; r < nn->num_r_vecs; r++) {
+   for (r = 0; r < nn->max_r_vecs; r++) {
r_vec = >r_vecs[r];
r_vec->nfp_net = nn;
r_vec->handler = nfp_net_irq_rxtx;
@@ -1578,12 +1578,12 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
struct nfp_net_tx_ring *rings;
unsigned int r;
 
-   rings = kcalloc(nn->num_tx_rings, sizeof(*rings), GFP_KERNEL);
+   rings = kcalloc(s->n_rings, sizeof(*rings), GFP_KERNEL);
if (!rings)
return NULL;
 
-   for (r = 0; r < nn->num_tx_rings; r++) {
-   nfp_net_tx_ring_init([r], nn->tx_rings[r].r_vec, r);
+   for (r = 0; r < s->n_rings; r++) {
+   nfp_net_tx_ring_init([r], >r_vecs[r], r);
 
if (nfp_net_tx_ring_alloc([r], s->dcnt))
goto err_free_prev;
@@ -1605,9 +1605,11 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 
s->dcnt = nn->txd_cnt;
s->rings = nn->tx_rings;
+   s->n_rings = nn->num_tx_rings;
 
nn->txd_cnt = new.dcnt;
nn->tx_rings = new.rings;
+   nn->num_tx_rings = new.n_rings;
 }
 
 static void
@@ -1616,7 +1618,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
struct nfp_net_tx_ring *rings = s->rings;
unsigned int r;
 
-   for (r = 0; r < nn->num_tx_rings; r++)
+   for (r = 0; r < s->n_rings; r++)
nfp_net_tx_ring_free([r]);
 
kfree(rings);
@@ -1694,12 +1696,12 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
struct nfp_net_rx_ring *rings;
unsigned int r;
 
-   rings = kcalloc(nn->num_rx_rings, sizeof(*rings), GFP_KERNEL);
+   rings = kcalloc(s->n_rings, sizeof(*rings), GFP_KERNEL);
if (!rings)
return NULL;
 
-   for (r = 0; r < nn->num_rx_rings; r++) {
-   nfp_net_rx_ring_init([r], nn->rx_rings[r].r_vec, r);
+   for (r = 0; r < s->n_rings; r++) {
+   nfp_net_rx_ring_init([r], >r_vecs[r], r);
 
if (nfp_net_rx_ring_alloc([r], fl_bufsz, s->dcnt))
goto err_free_prev;
@@ -1728,11 +1730,13 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
s->mtu = nn->netdev->mtu;
s->dcnt = nn->rxd_cnt;
s->rings = nn->rx_rings;
+   s->n_rings = nn->num_rx_rings;
 
nn->netdev->mtu = new.mtu;
nn->fl_bufsz = nfp_net_calc_fl_bufsz(nn, new.mtu);
nn->rxd_cnt = new.dcnt;
nn->rx_rings = new.rings;
+   nn->num_rx_rings = new.n_rings;
 }
 
 static void
@@ -1741,7 +1745,7 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
struct nfp_net_rx_ring *rings = s->rings;
unsigned int r;
 
-   for (r = 0; r < nn->num_rx_rings; r++) {
+   for (r = 0; r < s->n_rings; r++) {
nfp_net_rx_ring_bufs_free(nn, [r]);
nfp_net_rx_ring_free([r]);
}
@@ -1764,19 +1768,20 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
struct msix_entry *entry = >irq_entries[r_vec->irq_idx];
int err;
 
+   /* Setup NAPI */
+   netif_napi_add(nn->netdev, _vec->napi,
+  nfp_net_poll, NAPI_POLL_WEIGHT);
+
snprintf(r_vec->name, sizeof(r_vec->name),
 "%s-rxtx-%d", nn->netdev->name, idx);
err = request_irq(entry->vector, r_vec->handler, 0, r_vec->name, r_vec);
   

[PATCH net-next 13/13] nfp: add support for offload of XDP programs

2016-11-03 Thread Jakub Kicinski
Most infrastructure can be reused, provide separate handling
of context offsets and exit codes.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_bpf.h   |  1 +
 drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c   | 92 +-
 .../net/ethernet/netronome/nfp/nfp_bpf_verifier.c  |  3 +
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  2 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 44 ++-
 .../net/ethernet/netronome/nfp/nfp_net_offload.c   |  3 +
 6 files changed, 139 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_bpf.h 
b/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
index 87aa8a3e9112..76a19f1796af 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
@@ -62,6 +62,7 @@ enum nfp_bpf_action_type {
NN_ACT_TC_DROP,
NN_ACT_TC_REDIR,
NN_ACT_DIRECT,
+   NN_ACT_XDP,
 };
 
 /* Software register representation, hardware encoding in asm.h */
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c 
b/drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c
index f8df5300f49c..335beb8b8b45 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_bpf_jit.c
@@ -1126,7 +1126,7 @@ static int data_ind_ld4(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
 meta->insn.src_reg * 2, true, 4);
 }
 
-static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int mem_ldx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
if (meta->insn.off == offsetof(struct sk_buff, len))
emit_alu(nfp_prog, reg_both(meta->insn.dst_reg * 2),
@@ -1134,12 +1134,42 @@ static int mem_ldx4(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
else
return -ENOTSUPP;
 
-   wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
+   return 0;
+}
+
+static int mem_ldx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   u32 dst = reg_both(meta->insn.dst_reg * 2);
+
+   if (meta->insn.off != offsetof(struct xdp_md, data) &&
+   meta->insn.off != offsetof(struct xdp_md, data_end))
+   return -ENOTSUPP;
+
+   emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT);
+
+   if (meta->insn.off == offsetof(struct xdp_md, data))
+   return 0;
+
+   emit_alu(nfp_prog, dst, dst, ALU_OP_ADD, NFP_BPF_ABI_LEN);
 
return 0;
 }
 
-static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   int ret;
+
+   if (nfp_prog->act == NN_ACT_XDP)
+   ret = mem_ldx4_xdp(nfp_prog, meta);
+   else
+   ret = mem_ldx4_skb(nfp_prog, meta);
+
+   wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
+
+   return ret;
+}
+
+static int mem_stx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
if (meta->insn.off == offsetof(struct sk_buff, mark))
return wrp_set_mark(nfp_prog, meta->insn.src_reg * 2);
@@ -1147,6 +1177,18 @@ static int mem_stx4(struct nfp_prog *nfp_prog, struct 
nfp_insn_meta *meta)
return -ENOTSUPP;
 }
 
+static int mem_stx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   return -ENOTSUPP;
+}
+
+static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+   if (nfp_prog->act == NN_ACT_XDP)
+   return mem_stx4_xdp(nfp_prog, meta);
+   return mem_stx4_skb(nfp_prog, meta);
+}
+
 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
if (meta->insn.off < 0) /* TODO */
@@ -1530,6 +1572,47 @@ static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
 }
 
+static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
+{
+   /* XDP return codes:
+*   0 aborted  0x82 -> drop,  count as stat3
+*   1drop  0x22 -> drop,  count as stat1
+*   2pass  0x11 -> pass,  count as stat0
+*   3  tx  0x44 -> redir, count as stat2
+*   * unknown  0x82 -> drop,  count as stat3
+*/
+   /* Target for aborts */
+   nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);
+
+   emit_br_def(nfp_prog, nfp_prog->tgt_done, 2);
+
+   emit_alu(nfp_prog, reg_a(0),
+reg_none(), ALU_OP_NONE, NFP_BPF_ABI_FLAGS);
+   emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);
+
+   /* Target for normal exits */
+   nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);
+
+   /* if R0 > 3 jump to abort */
+   emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
+   emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
+
+   wrp_immed(nfp_prog, 

[PATCH net-next 06/13] nfp: add helper to reassign rings to IRQ vectors

2016-11-03 Thread Jakub Kicinski
Instead of fixing ring -> vector relations up in ring swap functions
put the reassignment into a helper function which will reinit all
links.

Signed-off-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 40 +-
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 97cc21eae466..2a4e1f1cb3c9 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1601,16 +1601,11 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 static void
 nfp_net_tx_ring_set_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
-   struct nfp_net_tx_ring *rings = s->rings;
struct nfp_net_ring_set new = *s;
-   unsigned int r;
 
s->dcnt = nn->txd_cnt;
s->rings = nn->tx_rings;
 
-   for (r = 0; r < nn->num_tx_rings; r++)
-   nn->tx_rings[r].r_vec->tx_ring = [r];
-
nn->txd_cnt = new.dcnt;
nn->tx_rings = new.rings;
 }
@@ -1728,17 +1723,12 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 static void
 nfp_net_rx_ring_set_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
-   struct nfp_net_rx_ring *rings = s->rings;
struct nfp_net_ring_set new = *s;
-   unsigned int r;
 
s->mtu = nn->netdev->mtu;
s->dcnt = nn->rxd_cnt;
s->rings = nn->rx_rings;
 
-   for (r = 0; r < nn->num_rx_rings; r++)
-   nn->rx_rings[r].r_vec->rx_ring = [r];
-
nn->netdev->mtu = new.mtu;
nn->fl_bufsz = nfp_net_calc_fl_bufsz(nn, new.mtu);
nn->rxd_cnt = new.dcnt;
@@ -1759,6 +1749,14 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
kfree(rings);
 }
 
+static void
+nfp_net_vector_assign_rings(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
+   int idx)
+{
+   r_vec->rx_ring = idx < nn->num_rx_rings ? &nn->rx_rings[idx] : NULL;
+   r_vec->tx_ring = idx < nn->num_tx_rings ? &nn->tx_rings[idx] : NULL;
+}
+
 static int
 nfp_net_prepare_vector(struct nfp_net *nn, struct nfp_net_r_vector *r_vec,
   int idx)
@@ -1766,20 +1764,6 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
struct msix_entry *entry = >irq_entries[r_vec->irq_idx];
int err;
 
-   if (idx < nn->num_tx_rings) {
-   r_vec->tx_ring = >tx_rings[idx];
-   nfp_net_tx_ring_init(r_vec->tx_ring, r_vec, idx);
-   } else {
-   r_vec->tx_ring = NULL;
-   }
-
-   if (idx < nn->num_rx_rings) {
-   r_vec->rx_ring = >rx_rings[idx];
-   nfp_net_rx_ring_init(r_vec->rx_ring, r_vec, idx);
-   } else {
-   r_vec->rx_ring = NULL;
-   }
-
snprintf(r_vec->name, sizeof(r_vec->name),
 "%s-rxtx-%d", nn->netdev->name, idx);
err = request_irq(entry->vector, r_vec->handler, 0, r_vec->name, r_vec);
@@ -2100,6 +2084,9 @@ static int nfp_net_netdev_open(struct net_device *netdev)
goto err_free_rx_rings;
}
 
+   for (r = 0; r < nn->max_r_vecs; r++)
+   nfp_net_vector_assign_rings(nn, &nn->r_vecs[r], r);
+
err = netif_set_real_num_tx_queues(netdev, nn->num_tx_rings);
if (err)
goto err_free_rings;
@@ -2247,11 +2234,16 @@ static void nfp_net_set_rx_mode(struct net_device 
*netdev)
 struct nfp_net_ring_set *rx,
 struct nfp_net_ring_set *tx)
 {
+   unsigned int r;
+
if (rx)
nfp_net_rx_ring_set_swap(nn, rx);
if (tx)
nfp_net_tx_ring_set_swap(nn, tx);
 
+   for (r = 0; r < nn->max_r_vecs; r++)
+   nfp_net_vector_assign_rings(nn, &nn->r_vecs[r], r);
+
return __nfp_net_set_config_and_enable(nn);
 }
 
-- 
1.9.1



[net-next PATCH 1/3] net: make default TX queue length a defined constant

2016-11-03 Thread Jesper Dangaard Brouer
The default TX queue length of Ethernet devices has been a magic
constant of 1000, ever since the initial git import.

Looking back in historical trees[1][2] the value used to be 100,
with the same comment "Ethernet wants good queues". The commit[3]
that changed this from 100 to 1000 didn't describe why, but from
conversations with Robert Olsson it seems that it was changed
when Ethernet devices went from 100Mbit/s to 1Gbit/s, because the
link speed increased x10, the queue size was also adjusted.  This
value later caused much heartache for the bufferbloat community.

This patch merely moves the value into a defined constant.

[1] https://git.kernel.org/cgit/linux/kernel/git/davem/netdev-vger-cvs.git/
[2] https://git.kernel.org/cgit/linux/kernel/git/tglx/history.git/
[3] https://git.kernel.org/tglx/history/c/98921832c232

Signed-off-by: Jesper Dangaard Brouer 
---
 include/net/pkt_sched.h |2 ++
 net/ethernet/eth.c  |3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index cd334c9584e9..f1b76b8e6d2d 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -6,6 +6,8 @@
 #include 
 #include 
 
+#define DEFAULT_TX_QUEUE_LEN   1000
+
 struct qdisc_walker {
int stop;
int skip;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index d9e2fe1da724..8c5a479681ca 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -62,6 +62,7 @@
 #include 
 #include 
 #include 
+#include <net/pkt_sched.h>
 
 __setup("ether=", netdev_boot_setup);
 
@@ -359,7 +360,7 @@ void ether_setup(struct net_device *dev)
dev->min_mtu= ETH_MIN_MTU;
dev->max_mtu= ETH_DATA_LEN;
dev->addr_len   = ETH_ALEN;
-   dev->tx_queue_len   = 1000; /* Ethernet wants good queues */
+   dev->tx_queue_len   = DEFAULT_TX_QUEUE_LEN;
dev->flags  = IFF_BROADCAST|IFF_MULTICAST;
dev->priv_flags |= IFF_TX_SKB_SHARING;
 



[PATCH net-next 07/13] nfp: move RSS indirection table init into a separate function

2016-11-03 Thread Jakub Kicinski
We will need to rerun the initialization of the RSS indirection table
after the number of rings is changed.  Move the code to a separate
function.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 2a4e1f1cb3c9..09cec6e2c6cf 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2229,6 +2229,15 @@ static void nfp_net_set_rx_mode(struct net_device 
*netdev)
nn->ctrl = new_ctrl;
 }
 
+static void nfp_net_rss_init_itbl(struct nfp_net *nn)
+{
+   int i;
+
+   for (i = 0; i < sizeof(nn->rss_itbl); i++)
+   nn->rss_itbl[i] =
+   ethtool_rxfh_indir_default(i, nn->num_rx_rings);
+}
+
 static int
 nfp_net_ring_swap_enable(struct nfp_net *nn,
 struct nfp_net_ring_set *rx,
@@ -2707,13 +2716,9 @@ void nfp_net_netdev_free(struct nfp_net *nn)
  */
 static void nfp_net_rss_init(struct nfp_net *nn)
 {
-   int i;
-
netdev_rss_key_fill(nn->rss_key, NFP_NET_CFG_RSS_KEY_SZ);
 
-   for (i = 0; i < sizeof(nn->rss_itbl); i++)
-   nn->rss_itbl[i] =
-   ethtool_rxfh_indir_default(i, nn->num_rx_rings);
+   nfp_net_rss_init_itbl(nn);
 
/* Enable IPv4/IPv6 TCP by default */
nn->rss_cfg = NFP_NET_CFG_RSS_IPV4_TCP |
-- 
1.9.1



[PATCH net-next 02/13] nfp: centralize runtime reconfiguration logic

2016-11-03 Thread Jakub Kicinski
All functions which need to reallocate ring resources at runtime
look very similar.  Centralize that logic into a separate function.
Encapsulate configuration parameters in a structure.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  10 +-
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 208 +
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  19 ++
 3 files changed, 118 insertions(+), 119 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index e8713254786b..14b5e21cabf1 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -583,6 +583,12 @@ struct nfp_net {
struct dentry *debugfs_dir;
 };
 
+struct nfp_net_ring_set {
+   unsigned int mtu;
+   unsigned int dcnt;
+   void *rings;
+};
+
 /* Functions to read/write from/to a BAR
  * Performs any endian conversion necessary.
  */
@@ -771,7 +777,9 @@ struct nfp_net *
 void nfp_net_coalesce_write_cfg(struct nfp_net *nn);
 int nfp_net_irqs_alloc(struct nfp_net *nn);
 void nfp_net_irqs_disable(struct nfp_net *nn);
-int nfp_net_set_ring_size(struct nfp_net *nn, u32 rxd_cnt, u32 txd_cnt);
+int
+nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_ring_set *rx,
+ struct nfp_net_ring_set *tx);
 
 #ifdef CONFIG_NFP_NET_DEBUG
 void nfp_net_debugfs_create(void);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 97e0bbef13d1..e58532d27c5b 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1573,7 +1573,7 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
 }
 
 static struct nfp_net_tx_ring *
-nfp_net_shadow_tx_rings_prepare(struct nfp_net *nn, u32 buf_cnt)
+nfp_net_shadow_tx_rings_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
struct nfp_net_tx_ring *rings;
unsigned int r;
@@ -1585,11 +1585,11 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
for (r = 0; r < nn->num_tx_rings; r++) {
nfp_net_tx_ring_init([r], nn->tx_rings[r].r_vec, r);
 
-   if (nfp_net_tx_ring_alloc([r], buf_cnt))
+   if (nfp_net_tx_ring_alloc([r], s->dcnt))
goto err_free_prev;
}
 
-   return rings;
+   return s->rings = rings;
 
 err_free_prev:
while (r--)
@@ -1598,27 +1598,29 @@ static int nfp_net_tx_ring_alloc(struct nfp_net_tx_ring 
*tx_ring, u32 cnt)
return NULL;
 }
 
-static struct nfp_net_tx_ring *
-nfp_net_shadow_tx_rings_swap(struct nfp_net *nn, struct nfp_net_tx_ring *rings)
+static void
+nfp_net_shadow_tx_rings_swap(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
-   struct nfp_net_tx_ring *old = nn->tx_rings;
+   struct nfp_net_tx_ring *rings = s->rings;
+   struct nfp_net_ring_set new = *s;
unsigned int r;
 
+   s->dcnt = nn->txd_cnt;
+   s->rings = nn->tx_rings;
+
for (r = 0; r < nn->num_tx_rings; r++)
-   old[r].r_vec->tx_ring = &rings[r];
+   nn->tx_rings[r].r_vec->tx_ring = &rings[r];
 
-   nn->tx_rings = rings;
-   return old;
+   nn->txd_cnt = new.dcnt;
+   nn->tx_rings = new.rings;
 }
 
 static void
-nfp_net_shadow_tx_rings_free(struct nfp_net *nn, struct nfp_net_tx_ring *rings)
+nfp_net_shadow_tx_rings_free(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
+   struct nfp_net_tx_ring *rings = s->rings;
unsigned int r;
 
-   if (!rings)
-   return;
-
for (r = 0; r < nn->num_tx_rings; r++)
nfp_net_tx_ring_free(&rings[r]);
 
@@ -1691,9 +1693,9 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
 }
 
 static struct nfp_net_rx_ring *
-nfp_net_shadow_rx_rings_prepare(struct nfp_net *nn, unsigned int fl_bufsz,
-   u32 buf_cnt)
+nfp_net_shadow_rx_rings_prepare(struct nfp_net *nn, struct nfp_net_ring_set *s)
 {
+   unsigned int fl_bufsz = nfp_net_calc_fl_bufsz(nn, s->mtu);
struct nfp_net_rx_ring *rings;
unsigned int r;
 
@@ -1704,14 +1706,14 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
for (r = 0; r < nn->num_rx_rings; r++) {
nfp_net_rx_ring_init(&rings[r], nn->rx_rings[r].r_vec, r);
 
-   if (nfp_net_rx_ring_alloc(&rings[r], fl_bufsz, buf_cnt))
+   if (nfp_net_rx_ring_alloc(&rings[r], fl_bufsz, s->dcnt))
goto err_free_prev;
 
if (nfp_net_rx_ring_bufs_alloc(nn, &rings[r]))
goto err_free_ring;
}
 
-   return rings;
+   return s->rings = rings;
 
 err_free_prev:
while (r--) {
@@ -1723,27 +1725,32 @@ static void nfp_net_rx_ring_free(struct nfp_net_rx_ring 
*rx_ring)
return 

[PATCH net-next 11/13] nfp: add XDP support in the driver

2016-11-03 Thread Jakub Kicinski
Add XDP support.  Separate stack's and XDP's TX rings logically.
Add functions for handling XDP_TX and cleanup of XDP's TX rings.
For XDP allocate all RX buffers as separate pages and map them
with DMA_BIDIRECTIONAL.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h   |  17 +-
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 474 +
 .../net/ethernet/netronome/nfp/nfp_net_debugfs.c   |  37 +-
 .../net/ethernet/netronome/nfp/nfp_net_ethtool.c   |  22 +-
 4 files changed, 449 insertions(+), 101 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index 486e7c6453bc..abc9e56e93b8 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -171,7 +171,10 @@ struct nfp_net_tx_desc {
  * on the head's buffer). Equal to skb->len for non-TSO packets.
  */
 struct nfp_net_tx_buf {
-   struct sk_buff *skb;
+   union {
+   struct sk_buff *skb;
+   void *frag;
+   };
dma_addr_t dma_addr;
short int fidx;
u16 pkt_cnt;
@@ -341,6 +344,7 @@ struct nfp_net_rx_ring {
  * @napi:   NAPI structure for this ring vec
  * @tx_ring:Pointer to TX ring
  * @rx_ring:Pointer to RX ring
+ * @xdp_ring:  Pointer to an extra TX ring for XDP
  * @irq_idx:Index into MSI-X table
  * @rx_sync:   Seqlock for atomic updates of RX stats
  * @rx_pkts:Number of received packets
@@ -384,6 +388,8 @@ struct nfp_net_r_vector {
u64 hw_csum_rx_inner_ok;
u64 hw_csum_rx_error;
 
+   struct nfp_net_tx_ring *xdp_ring;
+
struct u64_stats_sync tx_sync;
u64 tx_pkts;
u64 tx_bytes;
@@ -432,6 +438,7 @@ struct nfp_stat_pair {
  * @ctrl:   Local copy of the control register/word.
  * @fl_bufsz:   Currently configured size of the freelist buffers
  * @rx_offset: Offset in the RX buffers where packet data starts
+ * @xdp_prog:  Installed XDP program
  * @cpp:Pointer to the CPP handle
  * @nfp_dev_cpp:Pointer to the NFP Device handle
  * @ctrl_area:  Pointer to the CPP area for the control BAR
@@ -451,6 +458,7 @@ struct nfp_stat_pair {
  * @max_tx_rings:   Maximum number of TX rings supported by the Firmware
  * @max_rx_rings:   Maximum number of RX rings supported by the Firmware
  * @num_tx_rings:   Currently configured number of TX rings
+ * @num_stack_tx_rings:Number of TX rings used by the stack (not XDP)
  * @num_rx_rings:   Currently configured number of RX rings
  * @txd_cnt:Size of the TX ring in number of descriptors
  * @rxd_cnt:Size of the RX ring in number of descriptors
@@ -500,6 +508,8 @@ struct nfp_net {
 
u32 rx_offset;
 
+   struct bpf_prog *xdp_prog;
+
struct nfp_net_tx_ring *tx_rings;
struct nfp_net_rx_ring *rx_rings;
 
@@ -532,6 +542,7 @@ struct nfp_net {
unsigned int max_rx_rings;
 
unsigned int num_tx_rings;
+   unsigned int num_stack_tx_rings;
unsigned int num_rx_rings;
 
int stride_tx;
@@ -779,8 +790,8 @@ struct nfp_net *
 int nfp_net_irqs_alloc(struct nfp_net *nn);
 void nfp_net_irqs_disable(struct nfp_net *nn);
 int
-nfp_net_ring_reconfig(struct nfp_net *nn, struct nfp_net_ring_set *rx,
- struct nfp_net_ring_set *tx);
+nfp_net_ring_reconfig(struct nfp_net *nn, struct bpf_prog **xdp_prog,
+ struct nfp_net_ring_set *rx, struct nfp_net_ring_set *tx);
 
 #ifdef CONFIG_NFP_NET_DEBUG
 void nfp_net_debugfs_create(void);
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 2ab63661a6fd..fa43dbcecc4f 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -41,6 +41,7 @@
  *  Chris Telfer 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -490,6 +491,7 @@ static void nfp_net_irqs_assign(struct net_device *netdev)
 
nn->num_rx_rings = min(nn->num_r_vecs, nn->num_rx_rings);
nn->num_tx_rings = min(nn->num_r_vecs, nn->num_tx_rings);
+   nn->num_stack_tx_rings = nn->num_tx_rings;
 
nn->lsc_handler = nfp_net_irq_lsc;
nn->exn_handler = nfp_net_irq_exn;
@@ -713,6 +715,13 @@ static void nfp_net_tx_csum(struct nfp_net *nn, struct 
nfp_net_r_vector *r_vec,
u64_stats_update_end(&r_vec->tx_sync);
 }
 
+static void nfp_net_tx_xmit_more_flush(struct nfp_net_tx_ring *tx_ring)
+{
+   wmb();
+   nfp_qcp_wr_ptr_add(tx_ring->qcp_q, tx_ring->wr_ptr_add);
+   tx_ring->wr_ptr_add = 0;
+}
+
 /**
  * nfp_net_tx() - Main transmit entry point
  * @skb:SKB to transmit
@@ -827,12 +836,8 @@ static int nfp_net_tx(struct sk_buff *skb, struct 
net_device *netdev)

[PATCH net-next 04/13] nfp: reuse ring helpers on .ndo_open() path

2016-11-03 Thread Jakub Kicinski
Ring allocation helpers encapsulate all ring allocation and
initialization steps nicely.  Reuse them on .ndo_open() path.

Signed-off-by: Jakub Kicinski 
---
 .../net/ethernet/netronome/nfp/nfp_net_common.c| 60 --
 1 file changed, 20 insertions(+), 40 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index b7b2851ebb6b..50aeaea9e318 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2051,6 +2051,13 @@ static void nfp_net_open_stack(struct nfp_net *nn)
 static int nfp_net_netdev_open(struct net_device *netdev)
 {
struct nfp_net *nn = netdev_priv(netdev);
+   struct nfp_net_ring_set rx = {
+   .mtu = nn->netdev->mtu,
+   .dcnt = nn->rxd_cnt,
+   };
+   struct nfp_net_ring_set tx = {
+   .dcnt = nn->txd_cnt,
+   };
int err, r;
 
if (nn->ctrl & NFP_NET_CFG_CTRL_ENABLE) {
@@ -2075,38 +2082,22 @@ static int nfp_net_netdev_open(struct net_device 
*netdev)
goto err_free_exn;
disable_irq(nn->irq_entries[NFP_NET_IRQ_LSC_IDX].vector);
 
-   nn->rx_rings = kcalloc(nn->num_rx_rings, sizeof(*nn->rx_rings),
-  GFP_KERNEL);
-   if (!nn->rx_rings) {
-   err = -ENOMEM;
-   goto err_free_lsc;
-   }
-   nn->tx_rings = kcalloc(nn->num_tx_rings, sizeof(*nn->tx_rings),
-  GFP_KERNEL);
-   if (!nn->tx_rings) {
-   err = -ENOMEM;
-   goto err_free_rx_rings;
-   }
-
for (r = 0; r < nn->num_r_vecs; r++) {
err = nfp_net_prepare_vector(nn, &nn->r_vecs[r], r);
if (err)
goto err_cleanup_vec_p;
}
-   for (r = 0; r < nn->num_tx_rings; r++) {
-   err = nfp_net_tx_ring_alloc(nn->r_vecs[r].tx_ring, nn->txd_cnt);
-   if (err)
-   goto err_free_tx_ring_p;
+
+   nn->rx_rings = nfp_net_rx_ring_set_prepare(nn, &rx);
+   if (!nn->rx_rings) {
+   err = -ENOMEM;
+   goto err_cleanup_vec;
}
-   for (r = 0; r < nn->num_rx_rings; r++) {
-   err = nfp_net_rx_ring_alloc(nn->r_vecs[r].rx_ring,
-   nn->fl_bufsz, nn->rxd_cnt);
-   if (err)
-   goto err_flush_free_rx_ring_p;
 
-   err = nfp_net_rx_ring_bufs_alloc(nn, nn->r_vecs[r].rx_ring);
-   if (err)
-   goto err_free_rx_ring_p;
+   nn->tx_rings = nfp_net_tx_ring_set_prepare(nn, &tx);
+   if (!nn->tx_rings) {
+   err = -ENOMEM;
+   goto err_free_rx_rings;
}
 
err = netif_set_real_num_tx_queues(netdev, nn->num_tx_rings);
@@ -2139,25 +2130,14 @@ static int nfp_net_netdev_open(struct net_device 
*netdev)
return 0;
 
 err_free_rings:
-   r = nn->num_rx_rings;
-err_flush_free_rx_ring_p:
-   while (r--) {
-   nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
-err_free_rx_ring_p:
-   nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
-   }
-   r = nn->num_tx_rings;
-err_free_tx_ring_p:
-   while (r--)
-   nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
+   nfp_net_tx_ring_set_free(nn, &tx);
+err_free_rx_rings:
+   nfp_net_rx_ring_set_free(nn, &rx);
+err_cleanup_vec:
r = nn->num_r_vecs;
 err_cleanup_vec_p:
while (r--)
nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
-   kfree(nn->tx_rings);
-err_free_rx_rings:
-   kfree(nn->rx_rings);
-err_free_lsc:
nfp_net_aux_irq_free(nn, NFP_NET_CFG_LSC, NFP_NET_IRQ_LSC_IDX);
 err_free_exn:
nfp_net_aux_irq_free(nn, NFP_NET_CFG_EXN, NFP_NET_IRQ_EXN_IDX);
-- 
1.9.1



[PATCH net-next 05/13] nfp: loosen relation between rings and IRQs vectors

2016-11-03 Thread Jakub Kicinski
Upcoming XDP support will break the assumption that one can iterate
over IRQ vectors to get to all the rings easily.  Use the nn->rx_rings
and nn->tx_rings arrays directly.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 50aeaea9e318..97cc21eae466 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1920,9 +1920,9 @@ static void nfp_net_clear_config_and_disable(struct 
nfp_net *nn)
nn_err(nn, "Could not disable device: %d\n", err);
 
for (r = 0; r < nn->num_rx_rings; r++)
-   nfp_net_rx_ring_reset(nn->r_vecs[r].rx_ring);
+   nfp_net_rx_ring_reset(&nn->rx_rings[r]);
for (r = 0; r < nn->num_tx_rings; r++)
-   nfp_net_tx_ring_reset(nn, nn->r_vecs[r].tx_ring);
+   nfp_net_tx_ring_reset(nn, &nn->tx_rings[r]);
for (r = 0; r < nn->num_r_vecs; r++)
nfp_net_vec_clear_ring_data(nn, r);
 
@@ -2000,7 +2000,7 @@ static int __nfp_net_set_config_and_enable(struct nfp_net 
*nn)
nn->ctrl = new_ctrl;
 
for (r = 0; r < nn->num_rx_rings; r++)
-   nfp_net_rx_ring_fill_freelist(nn->r_vecs[r].rx_ring);
+   nfp_net_rx_ring_fill_freelist(&nn->rx_rings[r]);
 
/* Since reconfiguration requests while NFP is down are ignored we
 * have to wipe the entire VXLAN configuration and reinitialize it.
@@ -2173,11 +2173,11 @@ static void nfp_net_close_free_all(struct nfp_net *nn)
unsigned int r;
 
for (r = 0; r < nn->num_rx_rings; r++) {
-   nfp_net_rx_ring_bufs_free(nn, nn->r_vecs[r].rx_ring);
-   nfp_net_rx_ring_free(nn->r_vecs[r].rx_ring);
+   nfp_net_rx_ring_bufs_free(nn, &nn->rx_rings[r]);
+   nfp_net_rx_ring_free(&nn->rx_rings[r]);
}
for (r = 0; r < nn->num_tx_rings; r++)
-   nfp_net_tx_ring_free(nn->r_vecs[r].tx_ring);
+   nfp_net_tx_ring_free(&nn->tx_rings[r]);
for (r = 0; r < nn->num_r_vecs; r++)
nfp_net_cleanup_vector(nn, &nn->r_vecs[r]);
 
-- 
1.9.1



[PATCH net-next 10/13] debugfs: constify argument to debugfs_real_fops()

2016-11-03 Thread Jakub Kicinski
seq_file users can only access the const version of the file pointer,
so make the parameter to debugfs_real_fops() const.

CC: Nicolai Stange 
CC: Christian Lamparter 
Signed-off-by: Jakub Kicinski 
---
 include/linux/debugfs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index 4d3f0d1aec73..bf1907d96097 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -52,7 +52,8 @@ struct debugfs_regset32 {
  * Must only be called under the protection established by
  * debugfs_use_file_start().
  */
-static inline const struct file_operations *debugfs_real_fops(struct file 
*filp)
+static inline const struct file_operations *
+debugfs_real_fops(const struct file *filp)
__must_hold(&debugfs_srcu)
 {
/*
-- 
1.9.1



[PATCH net-next 12/13] nfp: remove unnecessary parameters from nfp_net_bpf_offload()

2016-11-03 Thread Jakub Kicinski
nfp_net_bpf_offload() takes all .setup_tc() parameters but it
doesn't use them at the moment.  Remove unnecessary ones to make
it possible for XDP to reuse this function.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net.h | 4 +---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c  | 2 +-
 drivers/net/ethernet/netronome/nfp/nfp_net_offload.c | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net.h 
b/drivers/net/ethernet/netronome/nfp/nfp_net.h
index abc9e56e93b8..fd29a6306991 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net.h
@@ -817,8 +817,6 @@ static inline void nfp_net_debugfs_adapter_del(struct 
nfp_net *nn)
 #endif /* CONFIG_NFP_NET_DEBUG */
 
 void nfp_net_filter_stats_timer(unsigned long data);
-int
-nfp_net_bpf_offload(struct nfp_net *nn, u32 handle, __be16 proto,
-   struct tc_cls_bpf_offload *cls_bpf);
+int nfp_net_bpf_offload(struct nfp_net *nn, struct tc_cls_bpf_offload 
*cls_bpf);
 
 #endif /* _NFP_NET_H_ */
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index fa43dbcecc4f..1e8e00d25c51 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2694,7 +2694,7 @@ static bool nfp_net_ebpf_capable(struct nfp_net *nn)
return -ENOTSUPP;
 
if (tc->type == TC_SETUP_CLSBPF && nfp_net_ebpf_capable(nn))
-   return nfp_net_bpf_offload(nn, handle, proto, tc->cls_bpf);
+   return nfp_net_bpf_offload(nn, tc->cls_bpf);
 
return -EINVAL;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
index cfed40c0e310..4bb6f16e2a7a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
@@ -233,9 +233,7 @@ static int nfp_net_bpf_stop(struct nfp_net *nn)
return nfp_net_reconfig(nn, NFP_NET_CFG_UPDATE_GEN);
 }
 
-int
-nfp_net_bpf_offload(struct nfp_net *nn, u32 handle, __be16 proto,
-   struct tc_cls_bpf_offload *cls_bpf)
+int nfp_net_bpf_offload(struct nfp_net *nn, struct tc_cls_bpf_offload *cls_bpf)
 {
struct nfp_bpf_result res;
dma_addr_t dma_addr;
-- 
1.9.1



[PATCH net-next 01/13] nfp: add support for ethtool .get_channels

2016-11-03 Thread Jakub Kicinski
Report number of rings via ethtool .get_channels API.

Signed-off-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 3418f2277e9d..a7386d1b2883 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -614,6 +614,21 @@ static int nfp_net_set_coalesce(struct net_device *netdev,
return nfp_net_reconfig(nn, NFP_NET_CFG_UPDATE_IRQMOD);
 }
 
+static void nfp_net_get_channels(struct net_device *netdev,
+struct ethtool_channels *channel)
+{
+   struct nfp_net *nn = netdev_priv(netdev);
+
+   channel->max_rx = min(nn->max_rx_rings, nn->max_r_vecs);
+   channel->max_tx = min(nn->max_tx_rings, nn->max_r_vecs);
+   channel->max_combined = min(channel->max_rx, channel->max_tx);
+   channel->max_other = NFP_NET_NON_Q_VECTORS;
+   channel->combined_count = min(nn->num_rx_rings, nn->num_tx_rings);
+   channel->rx_count = nn->num_rx_rings - channel->combined_count;
+   channel->tx_count = nn->num_tx_rings - channel->combined_count;
+   channel->other_count = NFP_NET_NON_Q_VECTORS;
+}
+
 static const struct ethtool_ops nfp_net_ethtool_ops = {
.get_drvinfo= nfp_net_get_drvinfo,
.get_link   = ethtool_op_get_link,
@@ -632,6 +647,7 @@ static int nfp_net_set_coalesce(struct net_device *netdev,
.get_regs   = nfp_net_get_regs,
.get_coalesce   = nfp_net_get_coalesce,
.set_coalesce   = nfp_net_set_coalesce,
+   .get_channels   = nfp_net_get_channels,
 };
 
 void nfp_net_set_ethtool_ops(struct net_device *netdev)
-- 
1.9.1



<    1   2   3   >