[PATCH] add one parameter wro_enable to enable relaxed ordering for IXGBE

2016-10-29 Thread Mao Wenan
This patch provides a way to enable relaxed ordering, which helps
performance on some architectures.
The default value of wro_enable is 0; to enable relaxed ordering,
set wro_enable=1.

Mao Wenan (1):
  add one parameter wro_enable for IXGBE

 drivers/net/ethernet/intel/ixgbe/ixgbe.h|  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c  | 29 ++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 28 +---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  9 
 4 files changed, 41 insertions(+), 26 deletions(-)

-- 
2.5.0




[PATCH] add one parameter wro_enable for IXGBE

2016-10-29 Thread Mao Wenan
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h|  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c  | 29 ++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 28 +---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  9 
 4 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index b06e32d..9bc0be5 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -1027,4 +1027,5 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
  struct ixgbe_ring *tx_ring);
 u32 ixgbe_rss_indir_tbl_entries(struct ixgbe_adapter *adapter);
 void ixgbe_store_reta(struct ixgbe_adapter *adapter);
+bool ixgbe_wro_enable(void);
 #endif /* _IXGBE_H_ */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
index fb51be7..c312aaa 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
@@ -186,20 +186,23 @@ static s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw)
ret_val = ixgbe_start_hw_generic(hw);
 
 #ifndef CONFIG_SPARC
-   /* Disable relaxed ordering */
-   for (i = 0; ((i < hw->mac.max_tx_queues) &&
-(i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-   regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
-   regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-   IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
-   }
+   if(likely(!ixgbe_wro_enable())) {
+
+   /* Disable relaxed ordering */
+   for (i = 0; ((i < hw->mac.max_tx_queues) &&
+(i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
+   regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
+   regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
+   IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
+   }
 
-   for (i = 0; ((i < hw->mac.max_rx_queues) &&
-(i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-   regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-   regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-   IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
-   IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
+   for (i = 0; ((i < hw->mac.max_rx_queues) &&
+(i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
+   regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
+   regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
+   IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
+   IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
+   }
}
 #endif
if (ret_val)
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 77d3039..7115dc0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -347,22 +347,24 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
IXGBE_WRITE_FLUSH(hw);
 
 #ifndef CONFIG_SPARC
-   /* Disable relaxed ordering */
-   for (i = 0; i < hw->mac.max_tx_queues; i++) {
-   u32 regval;
-
-   regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
-   regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-   IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
-   }
+   if(likely(!ixgbe_wro_enable())) {
+   /* Disable relaxed ordering */
+   for (i = 0; i < hw->mac.max_tx_queues; i++) {
+   u32 regval;
+
+   regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
+   regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
+   IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
+   }
 
-   for (i = 0; i < hw->mac.max_rx_queues; i++) {
-   u32 regval;
+   for (i = 0; i < hw->mac.max_rx_queues; i++) {
+   u32 regval;
 
-   regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-   regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-   IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
+   regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
+   regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
+   IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
+   }
}
 #endif
return 0;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index a244d9a..79ebce3 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -171,11 +171,20 @@ static int debug = -1;
 modu

[net-next 13/17] i40e: Fix for long link down notification time

2016-10-29 Thread Jeff Kirsher
From: Carolyn Wyborny 

This patch fixes a problem where it could take a very
long time (>100 msec) to print the link down notification.
This problem is fixed by changing how often we update link
info from fw, when link is down. Without this patch, it can
take over 100msec to notify user link is down.

Change-ID: Ib876eb30834c7080792becd13ee093b9cbb35d78
Signed-off-by: Carolyn Wyborny 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_common.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index fe8100b..a475946 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -2494,7 +2494,10 @@ i40e_status i40e_update_link_info(struct i40e_hw *hw)
if (status)
return status;
 
-   if (hw->phy.link_info.link_info & I40E_AQ_MEDIA_AVAILABLE) {
+   /* extra checking needed to ensure link info to user is timely */
+   if ((hw->phy.link_info.link_info & I40E_AQ_MEDIA_AVAILABLE) &&
+   ((hw->phy.link_info.link_info & I40E_AQ_LINK_UP) ||
+!(hw->phy.link_info_old.link_info & I40E_AQ_LINK_UP))) {
status = i40e_aq_get_phy_capabilities(hw, false, false,
  &abilities, NULL);
if (status)
-- 
2.7.4



[net-next 06/17] i40e: reopen client after reset

2016-10-29 Thread Jeff Kirsher
From: Mitch Williams 

Allow the client interface to reopen existing clients if they were
closed. This allows clients to recover from reset, which is essential
for supporting VF RDMA. In one instance, the driver was not clearing the
open bit when the client was closed. Add the code to clear this bit so
that the state is accurate and the driver will not attempt to reopen
already-open clients. Remove the ref_cnt variable; it was just getting
in the way and was not being used consistently.

Change-ID: Ic71af4553b096963ac0c56a997f887c9a4ed162d
Signed-off-by: Mitch Williams 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_client.c | 47 ++-
 drivers/net/ethernet/intel/i40e/i40e_client.h |  2 --
 2 files changed, 17 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c 
b/drivers/net/ethernet/intel/i40e/i40e_client.c
index 417ac16..7fe72ab 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -287,6 +287,7 @@ void i40e_notify_client_of_netdev_close(struct i40e_vsi 
*vsi, bool reset)
}
cdev->client->ops->close(&cdev->lan_info, cdev->client,
 reset);
+   clear_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state);
i40e_client_release_qvlist(&cdev->lan_info);
}
}
@@ -544,28 +545,27 @@ void i40e_client_subtask(struct i40e_pf *pf)
continue;
 
if (!existing) {
-   /* Also up the ref_cnt for no. of instances of this
-* client.
-*/
-   atomic_inc(&client->ref_cnt);
dev_info(&pf->pdev->dev, "Added instance of Client %s 
to PF%d bus=0x%02x func=0x%02x\n",
 client->name, pf->hw.pf_id,
 pf->hw.bus.device, pf->hw.bus.func);
-   mutex_lock(&i40e_client_instance_mutex);
-   atomic_inc(&cdev->ref_cnt);
+   }
+
+   mutex_lock(&i40e_client_instance_mutex);
+   if (!test_bit(__I40E_CLIENT_INSTANCE_OPENED,
+ &cdev->state)) {
+   /* Send an Open request to the client */
if (client->ops && client->ops->open)
ret = client->ops->open(&cdev->lan_info,
client);
-   atomic_dec(&cdev->ref_cnt);
-   if (ret < 0) {
-   mutex_unlock(&i40e_client_instance_mutex);
+   if (!ret) {
+   set_bit(__I40E_CLIENT_INSTANCE_OPENED,
+   &cdev->state);
+   } else {
+   /* remove client instance */
i40e_client_del_instance(pf, client);
-   atomic_dec(&client->ref_cnt);
-   continue;
}
-   set_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state);
-   mutex_unlock(&i40e_client_instance_mutex);
}
+   mutex_unlock(&i40e_client_instance_mutex);
}
mutex_unlock(&i40e_client_mutex);
 }
@@ -660,10 +660,6 @@ static int i40e_client_release(struct i40e_client *client)
continue;
pf = (struct i40e_pf *)cdev->lan_info.pf;
if (test_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state)) {
-   if (atomic_read(&cdev->ref_cnt) > 0) {
-   ret = I40E_ERR_NOT_READY;
-   goto out;
-   }
if (client->ops && client->ops->close)
client->ops->close(&cdev->lan_info, client,
   false);
@@ -676,11 +672,9 @@ static int i40e_client_release(struct i40e_client *client)
}
/* delete the client instance from the list */
list_move(&cdev->list, &cdevs_tmp);
-   atomic_dec(&client->ref_cnt);
dev_info(&pf->pdev->dev, "Deleted client instance of Client 
%s\n",
 client->name);
}
-out:
mutex_unlock(&i40e_client_instance_mutex);
 
/* free the client device and release its vsi */
@@ -1006,17 +1000,10 @@ int i40e_unregister_client(struct i40e_client *client)
ret = -ENODEV;
goto out;
}
-   if (atomic_read(&client->ref_cnt) == 0) {
-   clear_bit(__I40E_CLIENT_REGISTERED, &client->state);
-   list_del(&client->list);
-   pr_info("i40e: Un

[net-next 01/17] i40e: Fix client interaction

2016-10-29 Thread Jeff Kirsher
From: Carolyn Wyborny 

This patch fixes a problem in the client interface that
was causing random stack traces in RDMA driver load and
unload tests.  This patch fixes the problem by checking
for an existing client before trying to open it.  Without
this patch, there is a timing related null pointer deref.

Change-ID: Ib73d30671a27f6f9770dd53b3e5292b88d6b62da
Signed-off-by: Carolyn Wyborny 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_client.c | 29 ---
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c 
b/drivers/net/ethernet/intel/i40e/i40e_client.c
index 250db0b..6ffac03 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -565,7 +565,7 @@ void i40e_client_subtask(struct i40e_pf *pf)
if (test_bit(__I40E_DOWN, &pf->vsi[pf->lan_vsi]->state))
continue;
} else {
-   dev_warn(&pf->pdev->dev, "This client %s is being 
instanciated at probe\n",
+   dev_warn(&pf->pdev->dev, "This client %s is being 
instantiated at probe\n",
 client->name);
}
 
@@ -582,24 +582,21 @@ void i40e_client_subtask(struct i40e_pf *pf)
dev_info(&pf->pdev->dev, "Added instance of Client %s 
to PF%d bus=0x%02x func=0x%02x\n",
 client->name, pf->hw.pf_id,
 pf->hw.bus.device, pf->hw.bus.func);
-   }
-
-   mutex_lock(&i40e_client_instance_mutex);
-   /* Send an Open request to the client */
-   atomic_inc(&cdev->ref_cnt);
-   if (client->ops && client->ops->open)
-   ret = client->ops->open(&cdev->lan_info, client);
-   atomic_dec(&cdev->ref_cnt);
-   if (!ret) {
+   mutex_lock(&i40e_client_instance_mutex);
+   atomic_inc(&cdev->ref_cnt);
+   if (client->ops && client->ops->open)
+   ret = client->ops->open(&cdev->lan_info,
+   client);
+   atomic_dec(&cdev->ref_cnt);
+   if (ret < 0) {
+   mutex_unlock(&i40e_client_instance_mutex);
+   i40e_client_del_instance(pf, client);
+   atomic_dec(&client->ref_cnt);
+   continue;
+   }
set_bit(__I40E_CLIENT_INSTANCE_OPENED, &cdev->state);
-   } else {
-   /* remove client instance */
mutex_unlock(&i40e_client_instance_mutex);
-   i40e_client_del_instance(pf, client);
-   atomic_dec(&client->ref_cnt);
-   continue;
}
-   mutex_unlock(&i40e_client_instance_mutex);
}
mutex_unlock(&i40e_client_mutex);
 }
-- 
2.7.4



[net-next 00/17][pull request] 40GbE Intel Wired LAN Driver Updates 2016-10-28

2016-10-29 Thread Jeff Kirsher
This series contains updates to i40e and i40evf only.

Carolyn provides a couple of fixes, first resolving a problem in the
client interface that was causing random stack traces in the RDMA driver
which was due to a timing related NULL pointer dereference.  Fixed a
problem where it could take a very long time to print the link down
notification, by changing how often we update link info from firmware.

Alex provides a number of changes, first is a re-write of the busy wait
loop in the Flow Director transmit function to reduce code size.  Cleans
up unused code in favor of the same functionality which can be inlined.
Dropped the functionality for SCTP since we cannot currently support it.
Cleans up redundant code in the receive clean-up path.  Finally cleaned
up the convoluted configuration for how the driver handled the debug
flags contained in msg_level.

Filip fixes an incorrect bit mask which was being used for testing the
"get link status".  Cleaned up a workaround that is no longer needed
for production NICs and was causing frames to pass while disregarding
the VLAN tagging.

Mitch brings another fix for the client interface supporting the VF RDMA
driver to allow clients to recover from reset by re-opening existing
clients.

Alan fixes a bug in which a "perfect storm" can occur and cause interrupts
to fail to be correctly affinitized.

Lihong fixes a confusing dmesg message reported when users use the
ethtool -L option.

The following are changes since commit b09edbd07f876c9f7046c4aae1831e58919cffea:
  net caif: insert missing spaces in pr_* messages and unbreak multi-line 
strings
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Alan Brady (1):
  i40e/i40evf: fix interrupt affinity bug

Alexander Duyck (5):
  i40e: Rewrite Flow Director busy wait loop
  i40e: Remove unused function i40e_vsi_lookup
  i40e: Drop code for unsupported flow types
  i40e: Drop redundant Rx descriptor processing code
  i40e: Clean up handling of msglevel flags and debug parameter

Bimmy Pujari (2):
  i40e/i40evf: Changed version from 1.6.16 to 1.6.19
  i40e/i40evf: Changed version from 1.6.19 to 1.6.21

Carolyn Wyborny (2):
  i40e: Fix client interaction
  i40e: Fix for long link down notification time

David Ertman (1):
  i40e: Fix bit logic error in failure case

Filip Sadowski (2):
  i40e: Bit test mask correction
  i40e: Removal of workaround for simple MAC address filter deletion

Joe Perches (1):
  i40e: Make struct i40e_stats const

Lihong Yang (1):
  i40e: fix confusing dmesg info for ethtool -L option

Mitch Williams (1):
  i40e: reopen client after reset

Preethi Banala (1):
  i40e: group base mode VF offload flags

 drivers/net/ethernet/intel/i40e/i40e.h|   4 +-
 drivers/net/ethernet/intel/i40e/i40e_client.c |  85 +++---
 drivers/net/ethernet/intel/i40e/i40e_client.h |   2 -
 drivers/net/ethernet/intel/i40e/i40e_common.c |   7 +-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c|  19 ---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c|   9 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c   | 136 ++
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  98 +++-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl.h   |   4 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c |  49 +---
 drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h |   4 +
 drivers/net/ethernet/intel/i40evf/i40evf.h|   3 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c   |  73 
 13 files changed, 232 insertions(+), 261 deletions(-)

-- 
2.7.4



[net-next 05/17] i40e: Drop code for unsupported flow types

2016-10-29 Thread Jeff Kirsher
From: Alexander Duyck 

We cannot currently support SCTP in the hardware, and IPV4_FLOW is not used
anywhere by the software so we can go through and drop the functionality
related to these two flow types.

In addition we cannot support masking based on the protocol value so if the
user is expecting a value other than TCP or UDP we should simply return an
error rather than trying to allocate a filter for a rule that will only
partially match what the user requested.

Change-ID: I10d52bb97d8104d76255fe244551814ff9531a63
Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 31 +
 1 file changed, 5 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index af36c44..7d160c9 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -330,22 +330,6 @@ static int i40e_add_del_fdir_tcpv4(struct i40e_vsi *vsi,
return err ? -EOPNOTSUPP : 0;
 }
 
-/**
- * i40e_add_del_fdir_sctpv4 - Add/Remove SCTPv4 Flow Director filters for
- * a specific flow spec
- * @vsi: pointer to the targeted VSI
- * @fd_data: the flow director data required for the FDir descriptor
- * @add: true adds a filter, false removes it
- *
- * Returns 0 if the filters were successfully added or removed
- **/
-static int i40e_add_del_fdir_sctpv4(struct i40e_vsi *vsi,
-   struct i40e_fdir_filter *fd_data,
-   bool add)
-{
-   return -EOPNOTSUPP;
-}
-
 #define I40E_IP_DUMMY_PACKET_LEN 34
 /**
  * i40e_add_del_fdir_ipv4 - Add/Remove IPv4 Flow Director filters for
@@ -428,12 +412,6 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi,
case UDP_V4_FLOW:
ret = i40e_add_del_fdir_udpv4(vsi, input, add);
break;
-   case SCTP_V4_FLOW:
-   ret = i40e_add_del_fdir_sctpv4(vsi, input, add);
-   break;
-   case IPV4_FLOW:
-   ret = i40e_add_del_fdir_ipv4(vsi, input, add);
-   break;
case IP_USER_FLOW:
switch (input->ip4_proto) {
case IPPROTO_TCP:
@@ -442,15 +420,16 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi,
case IPPROTO_UDP:
ret = i40e_add_del_fdir_udpv4(vsi, input, add);
break;
-   case IPPROTO_SCTP:
-   ret = i40e_add_del_fdir_sctpv4(vsi, input, add);
-   break;
-   default:
+   case IPPROTO_IP:
ret = i40e_add_del_fdir_ipv4(vsi, input, add);
break;
+   default:
+   /* We cannot support masking based on protocol */
+   goto unsupported_flow;
}
break;
default:
+unsupported_flow:
dev_info(&pf->pdev->dev, "Could not specify spec type %d\n",
 input->flow_type);
ret = -EINVAL;
-- 
2.7.4



[net-next 02/17] i40e: Rewrite Flow Director busy wait loop

2016-10-29 Thread Jeff Kirsher
From: Alexander Duyck 

We can reorder the busy wait loop at the start of the Flow Director
transmit function to reduce the overall code size while still retaining the
same functionality.  As such I am taking advantage of the opportunity to do
so.

Change-ID: I34c403ca001953c6ac9816e65d5305e73d869026
Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 6287bf6..af36c44 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -122,7 +122,6 @@ static int i40e_program_fdir_filter(struct i40e_fdir_filter 
*fdir_data,
struct device *dev;
dma_addr_t dma;
u32 td_cmd = 0;
-   u16 delay = 0;
u16 i;
 
/* find existing FDIR VSI */
@@ -137,15 +136,11 @@ static int i40e_program_fdir_filter(struct 
i40e_fdir_filter *fdir_data,
dev = tx_ring->dev;
 
/* we need two descriptors to add/del a filter and we can wait */
-   do {
-   if (I40E_DESC_UNUSED(tx_ring) > 1)
-   break;
+   for (i = I40E_FD_CLEAN_DELAY; I40E_DESC_UNUSED(tx_ring) < 2; i--) {
+   if (!i)
+   return -EAGAIN;
msleep_interruptible(1);
-   delay++;
-   } while (delay < I40E_FD_CLEAN_DELAY);
-
-   if (!(I40E_DESC_UNUSED(tx_ring) > 1))
-   return -EAGAIN;
+   }
 
dma = dma_map_single(dev, raw_packet,
 I40E_FDIR_MAX_RAW_PACKET_SIZE, DMA_TO_DEVICE);
-- 
2.7.4



[net-next 12/17] i40e: Drop redundant Rx descriptor processing code

2016-10-29 Thread Jeff Kirsher
From: Alexander Duyck 

This patch cleans up several pieces of redundant code in the Rx clean-up
paths.

The first bit is that hdr_addr and the status_err_len portions of the Rx
descriptor represent the same value.  As such there is no point in setting
one to 0 and then setting the other to 0.  I'm dropping the second spot where we
are updating the value to 0 so that we only have 1 write for this value
instead of 2.

The second piece is the checking for the DD bit in the packet.  We only
need to check for a non-zero value for the status_err_len because if the
device is done with the descriptor it will have written something back and
the DD is just one piece of it.  In addition I have moved the reading of
the Rx descriptor bits related to rx_ptype down so that they are actually
below the dma_rmb() call so that we are guaranteed that we don't have any
funky 64b on 32b calls causing any ordering issues.

Change-ID: I256e44a025d3c64a7224aaaec37c852bfcb1871b
Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 18 ++
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 18 ++
 2 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 48e6533..daade4fe 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1220,7 +1220,6 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 
cleaned_count)
 * because each write-back erases this info.
 */
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
-   rx_desc->read.hdr_addr = 0;
 
rx_desc++;
bi++;
@@ -1741,7 +1740,6 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, 
int budget)
while (likely(total_rx_packets < budget)) {
union i40e_rx_desc *rx_desc;
struct sk_buff *skb;
-   u32 rx_status;
u16 vlan_tag;
u8 rx_ptype;
u64 qword;
@@ -1755,21 +1753,13 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, 
int budget)
 
rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-   qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
-   rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
-  I40E_RXD_QW1_PTYPE_SHIFT;
-   rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >>
-   I40E_RXD_QW1_STATUS_SHIFT;
-
-   if (!(rx_status & BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
-   break;
-
/* status_error_len will always be zero for unused descriptors
 * because it's cleared in cleanup, and overlaps with hdr_addr
 * which is always zero because packet split isn't used, if the
 * hardware wrote DD then it will be non-zero
 */
-   if (!rx_desc->wb.qword1.status_error_len)
+   if (!i40e_test_staterr(rx_desc,
+  BIT(I40E_RX_DESC_STATUS_DD_SHIFT)))
break;
 
/* This memory barrier is needed to keep us from reading
@@ -1803,6 +1793,10 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, 
int budget)
/* probably a little skewed due to removing CRC */
total_rx_bytes += skb->len;
 
+   qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
+   rx_ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >>
+  I40E_RXD_QW1_PTYPE_SHIFT;
+
/* populate checksum, VLAN, and protocol */
i40e_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype);
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index dd8ad6b..e2d3622 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -705,7 +705,6 @@ bool i40evf_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 
cleaned_count)
 * because each write-back erases this info.
 */
rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
-   rx_desc->read.hdr_addr = 0;
 
rx_desc++;
bi++;
@@ -1209,7 +1208,6 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, 
int budget)
while (likely(total_rx_packets < budget)) {
union i40e_rx_desc *rx_desc;
struct sk_buff *skb;
-   u32 rx_status;
u16 vlan_tag;
u8 rx_ptype;
u64 qword;
@@ -1223,21 +1221,13 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, 
int budget)
 
rx_desc = I40E_RX_DESC(rx_ring, rx_ring->next

[net-next 09/17] i40e/i40evf: Changed version from 1.6.16 to 1.6.19

2016-10-29 Thread Jeff Kirsher
From: Bimmy Pujari 

Signed-off-by: Bimmy Pujari 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 9382ba80..c25247f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -41,7 +41,7 @@ static const char i40e_driver_string[] =
 
 #define DRV_VERSION_MAJOR 1
 #define DRV_VERSION_MINOR 6
-#define DRV_VERSION_BUILD 16
+#define DRV_VERSION_BUILD 19
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 __stringify(DRV_VERSION_MINOR) "." \
 __stringify(DRV_VERSION_BUILD)DRV_KERN
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index e95e873..3e51ff2 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -38,7 +38,7 @@ static const char i40evf_driver_string[] =
 
 #define DRV_VERSION_MAJOR 1
 #define DRV_VERSION_MINOR 6
-#define DRV_VERSION_BUILD 16
+#define DRV_VERSION_BUILD 19
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 __stringify(DRV_VERSION_MINOR) "." \
 __stringify(DRV_VERSION_BUILD) \
-- 
2.7.4



[net-next 16/17] i40e: Fix bit logic error in failure case

2016-10-29 Thread Jeff Kirsher
From: David Ertman 

Patch a036244c0686 "i40e: Fix kernel panic on enable/disable LLDP"
introduced an error in bit logic.

Originally this bit manipulation was meant to clear two bits to indicate
that DCB was not enabled or capable. An "&" was incorrectly used instead
of an "|" bit operator to combine the two bitmasks into one.  This also
created a static checker error since the resultant code was a no-op.

This patch fixes the error by using the correct bit-wise operator.

Signed-off-by: Dave Ertman 
Reported-by: Dan Carpenter 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 00c322d..0c2328d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -10970,7 +10970,7 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
err = i40e_init_pf_dcb(pf);
if (err) {
dev_info(&pdev->dev, "DCB init failed %d, disabled\n", err);
-   pf->flags &= ~(I40E_FLAG_DCB_CAPABLE & I40E_FLAG_DCB_ENABLED);
+   pf->flags &= ~(I40E_FLAG_DCB_CAPABLE | I40E_FLAG_DCB_ENABLED);
/* Continue without DCB enabled */
}
 #endif /* CONFIG_I40E_DCB */
-- 
2.7.4



[net-next 07/17] i40e: group base mode VF offload flags

2016-10-29 Thread Jeff Kirsher
From: Preethi Banala 

Group together the minimum set of offload capabilities that are always
supported by a VF in base mode. This define will be used by the PF to make
sure a VF in base mode gets the minimum set of base capabilities.

Change-ID: Id5e8f22ba169c8f0a38d22fc36b2cb531c02582c
Signed-off-by: Preethi Banala 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl.h   | 4 
 drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h | 4 
 2 files changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl.h 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl.h
index f861d31..974ba2b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl.h
@@ -165,6 +165,10 @@ struct i40e_virtchnl_vsi_resource {
 #define I40E_VIRTCHNL_VF_OFFLOAD_RSS_PF0X0008
 #define I40E_VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM0X0010
 
+#define I40E_VF_BASE_MODE_OFFLOADS (I40E_VIRTCHNL_VF_OFFLOAD_L2 | \
+   I40E_VIRTCHNL_VF_OFFLOAD_VLAN | \
+   I40E_VIRTCHNL_VF_OFFLOAD_RSS_PF)
+
 struct i40e_virtchnl_vf_resource {
u16 num_vsis;
u16 num_queue_pairs;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h 
b/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h
index bd691ad..fc374f8 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_virtchnl.h
@@ -162,6 +162,10 @@ struct i40e_virtchnl_vsi_resource {
 #define I40E_VIRTCHNL_VF_OFFLOAD_RSS_PF0X0008
 #define I40E_VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM0X0010
 
+#define I40E_VF_BASE_MODE_OFFLOADS (I40E_VIRTCHNL_VF_OFFLOAD_L2 | \
+   I40E_VIRTCHNL_VF_OFFLOAD_VLAN | \
+   I40E_VIRTCHNL_VF_OFFLOAD_RSS_PF)
+
 struct i40e_virtchnl_vf_resource {
u16 num_vsis;
u16 num_queue_pairs;
-- 
2.7.4



[net-next 10/17] i40e: Make struct i40e_stats const

2016-10-29 Thread Jeff Kirsher
From: Joe Perches 

Move some data to text

$ size drivers/net/ethernet/intel/i40e/i40e_ethtool.o*
   textdata bss dec hex filename
  25012   0  32   2504461d4 
drivers/net/ethernet/intel/i40e/i40e_ethtool.o.new
  228682120  32   2502061bc 
drivers/net/ethernet/intel/i40e/i40e_ethtool.o.old

Signed-off-by: Joe Perches 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 92bc884..3a1f91e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -104,7 +104,7 @@ static const struct i40e_stats i40e_gstrings_misc_stats[] = 
{
  * The PF_STATs are appended to the netdev stats only when ethtool -S
  * is queried on the base PF netdev, not on the VMDq or FCoE netdev.
  */
-static struct i40e_stats i40e_gstrings_stats[] = {
+static const struct i40e_stats i40e_gstrings_stats[] = {
I40E_PF_STAT("rx_bytes", stats.eth.rx_bytes),
I40E_PF_STAT("tx_bytes", stats.eth.tx_bytes),
I40E_PF_STAT("rx_unicast", stats.eth.rx_unicast),
-- 
2.7.4



[net-next 08/17] i40e/i40evf: fix interrupt affinity bug

2016-10-29 Thread Jeff Kirsher
From: Alan Brady 

There exists a bug in which a 'perfect storm' can occur and cause
interrupts to fail to be correctly affinitized. This causes unexpected
behavior and has a substantial impact on performance when it happens.

The bug occurs if there is heavy traffic, any number of CPUs that have
an i40e interrupt are pegged at 100%, and the interrupt affinity for
those CPUs is changed.  Instead of moving to the new CPU, the interrupt
continues to be polled while there is heavy traffic.

The bug is most readily realized as the driver is first brought up and
all interrupts start on CPU0. If there is heavy traffic and the
interrupt starts polling before the interrupt is affinitized, the
interrupt will be stuck on CPU0 until traffic stops. The bug, however,
can also be reproduced more simply by affinitizing all the interrupts
to a single CPU and then attempting to move any of those interrupts off
while there is heavy traffic.

This patch fixes the bug by registering for update notifications from
the kernel when the interrupt affinity changes. When that fires, we
cache the intended affinity mask. Then, while polling, if the cpu is
pegged at 100% and we failed to clean the rings, we check to make sure
we have the correct affinity and stop polling if we're firing on the
wrong CPU.  When the kernel successfully moves the interrupt, it will
start polling on the correct CPU. The performance impact is minimal
since the only time this section gets executed is when performance is
already compromised by the CPU.

Change-ID: I4410a880159b9dba1f8297aa72bef36dca34e830
Signed-off-by: Alan Brady 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h  |  2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c | 64 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c | 36 ++---
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c   | 31 +--
 drivers/net/ethernet/intel/i40evf/i40evf.h  |  3 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 71 +
 6 files changed, 159 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 34c7a5d..01cce5b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -607,6 +607,8 @@ struct i40e_q_vector {
unsigned long hung_detected; /* Set/Reset for hung_detection logic */
 
cpumask_t affinity_mask;
+   struct irq_affinity_notify affinity_notify;
+
struct rcu_head rcu;/* to avoid race with update stats on free */
char name[I40E_INT_NAME_STR_LEN];
bool arm_wb_state;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 83edbe8..9382ba80 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3317,6 +3317,33 @@ static irqreturn_t i40e_msix_clean_rings(int irq, void 
*data)
 }
 
 /**
+ * i40e_irq_affinity_notify - Callback for affinity changes
+ * @notify: context as to what irq was changed
+ * @mask: the new affinity mask
+ *
+ * This is a callback function used by the irq_set_affinity_notifier function
+ * so that we may register to receive changes to the irq affinity masks.
+ **/
+static void i40e_irq_affinity_notify(struct irq_affinity_notify *notify,
+const cpumask_t *mask)
+{
+   struct i40e_q_vector *q_vector =
+   container_of(notify, struct i40e_q_vector, affinity_notify);
+
+   q_vector->affinity_mask = *mask;
+}
+
+/**
+ * i40e_irq_affinity_release - Callback for affinity notifier release
+ * @ref: internal core kernel usage
+ *
+ * This is a callback function used by the irq_set_affinity_notifier function
+ * to inform the current notification subscriber that they will no longer
+ * receive notifications.
+ **/
+static void i40e_irq_affinity_release(struct kref *ref) {}
+
+/**
  * i40e_vsi_request_irq_msix - Initialize MSI-X interrupts
  * @vsi: the VSI being configured
  * @basename: name for the vector
@@ -3331,10 +3358,13 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi 
*vsi, char *basename)
int rx_int_idx = 0;
int tx_int_idx = 0;
int vector, err;
+   int irq_num;
 
for (vector = 0; vector < q_vectors; vector++) {
struct i40e_q_vector *q_vector = vsi->q_vectors[vector];
 
+   irq_num = pf->msix_entries[base + vector].vector;
+
if (q_vector->tx.ring && q_vector->rx.ring) {
snprintf(q_vector->name, sizeof(q_vector->name) - 1,
 "%s-%s-%d", basename, "TxRx", rx_int_idx++);
@@ -3349,7 +3379,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi 
*vsi, char *basename)
/* skip this unused q_vector */
continue;
}
-   err = request_irq(pf

[net-next 15/17] i40e/i40evf: Changed version from 1.6.19 to 1.6.21

2016-10-29 Thread Jeff Kirsher
From: Bimmy Pujari 

Signed-off-by: Bimmy Pujari 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2e787ff..00c322d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -41,7 +41,7 @@ static const char i40e_driver_string[] =
 
 #define DRV_VERSION_MAJOR 1
 #define DRV_VERSION_MINOR 6
-#define DRV_VERSION_BUILD 19
+#define DRV_VERSION_BUILD 21
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 __stringify(DRV_VERSION_MINOR) "." \
 __stringify(DRV_VERSION_BUILD)DRV_KERN
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 3e51ff2..bcb1caf 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -38,7 +38,7 @@ static const char i40evf_driver_string[] =
 
 #define DRV_VERSION_MAJOR 1
 #define DRV_VERSION_MINOR 6
-#define DRV_VERSION_BUILD 19
+#define DRV_VERSION_BUILD 21
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 __stringify(DRV_VERSION_MINOR) "." \
 __stringify(DRV_VERSION_BUILD) \
-- 
2.7.4



[net-next 03/17] i40e: Bit test mask correction

2016-10-29 Thread Jeff Kirsher
From: Filip Sadowski 

Incorrect bit mask was used for testing "get link status" response.
Instead of I40E_AQ_LSE_ENABLE (which is actually 0x03) it most probably
should be I40E_AQ_LSE_IS_ENABLED (which is defined as 0x01).

Change-ID: Ia199142906720507f847de3a33a25c61a9781b2f
Signed-off-by: Filip Sadowski 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 2154a34..fe8100b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1849,7 +1849,7 @@ i40e_status i40e_aq_get_link_info(struct i40e_hw *hw,
else
hw_link_info->crc_enable = false;
 
-   if (resp->command_flags & cpu_to_le16(I40E_AQ_LSE_ENABLE))
+   if (resp->command_flags & cpu_to_le16(I40E_AQ_LSE_IS_ENABLED))
hw_link_info->lse_enable = true;
else
hw_link_info->lse_enable = false;
-- 
2.7.4



[net-next 04/17] i40e: Remove unused function i40e_vsi_lookup

2016-10-29 Thread Jeff Kirsher
From: Alexander Duyck 

The function is not used so there is no need to carry it forward.  I have
plans to add a slightly different function that can be inlined to handle
the same kind of functionality.

Change-ID: Ie2dfcb189dc75e5fbc156bac23003e3b4210ae0f
Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h|  2 --
 drivers/net/ethernet/intel/i40e/i40e_client.c | 31 ---
 2 files changed, 33 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 2030d7c..34c7a5d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -728,8 +728,6 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi);
 struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type,
u16 uplink, u32 param1);
 int i40e_vsi_release(struct i40e_vsi *vsi);
-struct i40e_vsi *i40e_vsi_lookup(struct i40e_pf *pf, enum i40e_vsi_type type,
-struct i40e_vsi *start_vsi);
 #ifdef I40E_FCOE
 void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
  struct i40e_vsi_context *ctxt,
diff --git a/drivers/net/ethernet/intel/i40e/i40e_client.c 
b/drivers/net/ethernet/intel/i40e/i40e_client.c
index 6ffac03..417ac16 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_client.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_client.c
@@ -406,37 +406,6 @@ int i40e_vf_client_capable(struct i40e_pf *pf, u32 vf_id,
 }
 
 /**
- * i40e_vsi_lookup - finds a matching VSI from the PF list starting at 
start_vsi
- * @pf: board private structure
- * @type: vsi type
- * @start_vsi: a VSI pointer from where to start the search
- *
- * Returns non NULL on success or NULL for failure
- **/
-struct i40e_vsi *i40e_vsi_lookup(struct i40e_pf *pf,
-enum i40e_vsi_type type,
-struct i40e_vsi *start_vsi)
-{
-   struct i40e_vsi *vsi;
-   int i = 0;
-
-   if (start_vsi) {
-   for (i = 0; i < pf->num_alloc_vsi; i++) {
-   vsi = pf->vsi[i];
-   if (vsi == start_vsi)
-   break;
-   }
-   }
-   for (; i < pf->num_alloc_vsi; i++) {
-   vsi = pf->vsi[i];
-   if (vsi && vsi->type == type)
-   return vsi;
-   }
-
-   return NULL;
-}
-
-/**
  * i40e_client_add_instance - add a client instance struct to the instance list
  * @pf: pointer to the board struct
  * @client: pointer to a client struct in the client list.
-- 
2.7.4



[net-next 17/17] i40e: Clean up handling of msglevel flags and debug parameter

2016-10-29 Thread Jeff Kirsher
From: Alexander Duyck 

So the i40e driver had a really convoluted configuration for how to handle
the debug flags contained in msg_level.  Part of the issue is that the
driver has its own 32 bit mask that it was using to track a separate set of
debug features.  From what I can tell it was trying to use the upper 4 bits
to determine if the value was meant to represent a bit-mask or the numeric
value provided by debug level.

What this patch does is clean this up by compressing those 4 bits into bit
31, as a result we just have to perform a check against the value being
negative to determine if we are looking at a debug level (positive), or a
debug mask (negative).  The debug level will populate the msg_level, and
the debug mask will populate the debug_mask in the hardware struct.

I added similar logic for ethtool.  If the value being provided has bit 31
set we assume the value being provided is a debug mask, otherwise we assume
it is a msg_enable mask.  For displaying we only provide the msg_enable,
and if debug_mask is in use we will print it to the dmesg log.

Lastly I removed the debugfs interface.  It is redundant with what we
already have in ethtool and really doesn't belong anyway.

Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 19 ---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  7 ++-
 drivers/net/ethernet/intel/i40e/i40e_main.c| 23 ---
 3 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c 
b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 0c1875b..0354632 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -1210,24 +1210,6 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
dev_info(&pf->pdev->dev,
 "dump debug fwdata   
\n");
}
-
-   } else if (strncmp(cmd_buf, "msg_enable", 10) == 0) {
-   u32 level;
-   cnt = sscanf(&cmd_buf[10], "%i", &level);
-   if (cnt) {
-   if (I40E_DEBUG_USER & level) {
-   pf->hw.debug_mask = level;
-   dev_info(&pf->pdev->dev,
-"set hw.debug_mask = 0x%08x\n",
-pf->hw.debug_mask);
-   }
-   pf->msg_enable = level;
-   dev_info(&pf->pdev->dev, "set msg_enable = 0x%08x\n",
-pf->msg_enable);
-   } else {
-   dev_info(&pf->pdev->dev, "msg_enable = 0x%08x\n",
-pf->msg_enable);
-   }
} else if (strncmp(cmd_buf, "pfr", 3) == 0) {
dev_info(&pf->pdev->dev, "debugfs: forcing PFR\n");
i40e_do_reset_safe(pf, BIT(__I40E_PF_RESET_REQUESTED));
@@ -1644,7 +1626,6 @@ static ssize_t i40e_dbg_command_write(struct file *filp,
dev_info(&pf->pdev->dev, "  dump desc aq\n");
dev_info(&pf->pdev->dev, "  dump reset stats\n");
dev_info(&pf->pdev->dev, "  dump debug fwdata  
 \n");
-   dev_info(&pf->pdev->dev, "  msg_enable [level]\n");
dev_info(&pf->pdev->dev, "  read \n");
dev_info(&pf->pdev->dev, "  write  \n");
dev_info(&pf->pdev->dev, "  clear_stats vsi [seid]\n");
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 3a1f91e..fb4fb52 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -978,6 +978,10 @@ static u32 i40e_get_msglevel(struct net_device *netdev)
 {
struct i40e_netdev_priv *np = netdev_priv(netdev);
struct i40e_pf *pf = np->vsi->back;
+   u32 debug_mask = pf->hw.debug_mask;
+
+   if (debug_mask)
+   netdev_info(netdev, "i40e debug_mask: 0x%08X\n", debug_mask);
 
return pf->msg_enable;
 }
@@ -989,7 +993,8 @@ static void i40e_set_msglevel(struct net_device *netdev, 
u32 data)
 
if (I40E_DEBUG_USER & data)
pf->hw.debug_mask = data;
-   pf->msg_enable = data;
+   else
+   pf->msg_enable = data;
 }
 
 static int i40e_get_regs_len(struct net_device *netdev)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0c2328d..7fa535f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -93,8 +93,8 @@ MODULE_DEVICE_TABLE(pci, i40e_pci_tbl);
 
 #define I40E_MAX_VF_COUNT 128
 static int debug = -1;
-module_param(debug, int, 0);
-MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+module_param(debug, uint, 0);
+MODULE_PA

[net-next 11/17] i40e: fix confusing dmesg info for ethtool -L option

2016-10-29 Thread Jeff Kirsher
From: Lihong Yang 

Ethtool -L option with the combined parameter is for changing the number of
multi-purpose channels of the specified network device. The pre-set maximum
for the combined channels is cpu dependent. Currently, for an i40e device,
when the user sets a value between 64 and the maximum that the cpu can
support for the combined parameter, the i40e driver displays the confusing
info in dmesg to only show 64 as the RSS count regardless of what the
accepted user input is as long as it is larger than 64.

This patch fixes the message in the i40e driver when the user uses
ethtool -L to change the number of the combined channels to consistently
display the user requested value if it is valid and accepted by ethtool.

Change-ID: Ia80a68bc844b779a49e0f76e7d3dcc915032d9af
Signed-off-by: Lihong Yang 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index c25247f..a4bae0a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8400,8 +8400,8 @@ int i40e_reconfig_rss_queues(struct i40e_pf *pf, int 
queue_count)
 
i40e_pf_config_rss(pf);
}
-   dev_info(&pf->pdev->dev, "RSS count/HW max RSS count:  %d/%d\n",
-pf->alloc_rss_size, pf->rss_size_max);
+   dev_info(&pf->pdev->dev, "User requested queue count/HW max RSS count:  
%d/%d\n",
+vsi->req_queue_pairs, pf->rss_size_max);
return pf->alloc_rss_size;
 }
 
-- 
2.7.4



[net-next 14/17] i40e: Removal of workaround for simple MAC address filter deletion

2016-10-29 Thread Jeff Kirsher
From: Filip Sadowski 

This is code refactoring. This patch removes the workaround which deleted
a default MAC filter added by the firmware when the interface was brought
up. This filter caused frames to pass disregarding the VLAN tagging.
It used to be automatically applied after reset in pre-SRA FW versions.
This workaround is not needed in production NICs and hence can be removed.

Change-ID: I129fe1aae1f17b5a224c9b29a996d916aa1be1ec
Signed-off-by: Filip Sadowski 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 41 -
 1 file changed, 41 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index a4bae0a..2e787ff 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1287,39 +1287,6 @@ int i40e_del_mac_all_vlan(struct i40e_vsi *vsi, u8 
*macaddr,
 }
 
 /**
- * i40e_rm_default_mac_filter - Remove the default MAC filter set by NVM
- * @vsi: the PF Main VSI - inappropriate for any other VSI
- * @macaddr: the MAC address
- *
- * Remove whatever filter the firmware set up so the driver can manage
- * its own filtering intelligently.
- **/
-static void i40e_rm_default_mac_filter(struct i40e_vsi *vsi, u8 *macaddr)
-{
-   struct i40e_aqc_remove_macvlan_element_data element;
-   struct i40e_pf *pf = vsi->back;
-
-   /* Only appropriate for the PF main VSI */
-   if (vsi->type != I40E_VSI_MAIN)
-   return;
-
-   memset(&element, 0, sizeof(element));
-   ether_addr_copy(element.mac_addr, macaddr);
-   element.vlan_tag = 0;
-   /* Ignore error returns, some firmware does it this way... */
-   element.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH;
-   i40e_aq_remove_macvlan(&pf->hw, vsi->seid, &element, 1, NULL);
-
-   memset(&element, 0, sizeof(element));
-   ether_addr_copy(element.mac_addr, macaddr);
-   element.vlan_tag = 0;
-   /* ...and some firmware does it this way. */
-   element.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH |
-   I40E_AQC_MACVLAN_DEL_IGNORE_VLAN;
-   i40e_aq_remove_macvlan(&pf->hw, vsi->seid, &element, 1, NULL);
-}
-
-/**
  * i40e_add_filter - Add a mac/vlan filter to the VSI
  * @vsi: the VSI to be searched
  * @macaddr: the MAC address
@@ -9218,12 +9185,6 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
if (vsi->type == I40E_VSI_MAIN) {
SET_NETDEV_DEV(netdev, &pf->pdev->dev);
ether_addr_copy(mac_addr, hw->mac.perm_addr);
-   /* The following steps are necessary to prevent reception
-* of tagged packets - some older NVM configurations load a
-* default a MAC-VLAN filter that accepts any tagged packet
-* which must be replaced by a normal filter.
-*/
-   i40e_rm_default_mac_filter(vsi, mac_addr);
spin_lock_bh(&vsi->mac_filter_list_lock);
i40e_add_filter(vsi, mac_addr, I40E_VLAN_ANY, false, true);
spin_unlock_bh(&vsi->mac_filter_list_lock);
@@ -9741,8 +9702,6 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
i40e_vsi *vsi)
pf->vsi[pf->lan_vsi]->tc_config.enabled_tc = 0;
pf->vsi[pf->lan_vsi]->seid = pf->main_vsi_seid;
i40e_vsi_config_tc(pf->vsi[pf->lan_vsi], enabled_tc);
-   if (vsi->type == I40E_VSI_MAIN)
-   i40e_rm_default_mac_filter(vsi, pf->hw.mac.perm_addr);
 
/* assign it some queues */
ret = i40e_alloc_rings(vsi);
-- 
2.7.4



Re: [PATCH] add one parameter wro_enable to enable relaxed ordering for IXGBE

2016-10-29 Thread Jeff Kirsher
On Sat, 2016-10-29 at 15:08 +0800, Mao Wenan wrote:
> This patch provides a way to enable relaxed ordering, where it helps with
> performance in some architecture.
> The default value of wro_enable is 0, if you want to enable relaxed
> ordering, please set wro_enable=1.
> 
> Mao Wenan (1):
>   add one parameter wro_enable for IXGBE
> 
>  drivers/net/ethernet/intel/ixgbe/ixgbe.h    |  1 +
>  drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c  | 29 ++-
> --
>  drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 28 +
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  9 
>  4 files changed, 41 insertions(+), 26 deletions(-)

Why have a title patch for only one patch?  Better yet, the one patch does
not have a patch description.  Get rid of the title patch and add the above
information into the patches description.

In addition, module parameters are not kindly looked upon, one reason is
that it cannot be standardized and enforced.

I am also confused because you are stating that on some architectures, yet
this code is only compiled in when SPARC is defined and that there are
times when you want relaxed ordering enabled and other times disabled?
 You're going to have to provide more data on why, because the code as is was
resolving serious performance issues on SPARC when relaxed ordering was
enabled.

signature.asc
Description: This is a digitally signed message part


Let's do P4

2016-10-29 Thread Jiri Pirko
Hi all.

The network world is divided into 2 general types of hw:
1) network ASICs - network specific silicon, containing things like TCAM
   These ASICs are suitable to be programmed by P4.
2) network processors - basically a general purpose CPUs
   These processors are suitable to be programmed by eBPF.

I believe that by now, most people have come to the conclusion that it is
very difficult to handle both types by either P4 or eBPF. And since
eBPF is part of the kernel, I would like to introduce P4 into kernel
as well. Here's a plan:

1) Define P4 intermediate representation
   I cannot imagine loading P4 program (c-like syntax text file) into
   kernel as is. That means that as the first step, we need to find some
   intermediate representation. I can imagine something in the form of an AST,
   call it "p4ast". I don't really know how to do this exactly though,
   it's just an idea.

   In the end there would be a userspace precompiler for this:
   $ makep4ast example.p4 example.ast

2) Implement p4ast in-kernel interpreter 
   A kernel module which takes a p4ast and emulates the pipeline.
   This can be implemented from scratch. Or, p4ast could be compiled
   to eBPF. I know there are already couple of p4>eBPF compilers.
   Not sure how feasible it would be to put this compiler in kernel.

3) Expose the p4ast in-kernel interpreter to userspace
   The easiest way I see is to introduce a new TC classifier, cls_p4.

   This can work in a very similar way to cls_bpf:
   $ tc filter add dev eth0 ingress p4 da ast example.ast

   The TC cls_p4 will be also used for runtime table manipulation.

4) Offload p4ast programs into hardware
   The same p4ast program representation will be passed down
   to drivers via existing TC offloading way - ndo_setup_tc.
   Drivers will then parse it and setup the hardware
   accordingly. Driver will also have possibility to error out
   in case it does not support some requested feature.

Thoughts? Ideas?

Thanks,
Jiri


Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue

2016-10-29 Thread Paolo Abeni
On Fri, 2016-10-28 at 10:50 -0700, Eric Dumazet wrote:
> On Fri, 2016-10-28 at 10:16 -0700, Eric Dumazet wrote:
> > Nice !
> > 
> > I was working on this as well and my implementation was somewhat
> > different.
> 
> This is my WIP
> 
> Note this can be split in two parts.
> 
> 1) One adding struct sock *sk param to ip_cmsg_recv_offset()
>  
>This was because I left skb->sk NULL for skbs stored in receive
> queue.
>You chose instead to set skb->sk, which is unusual (check
> skb_orphan() BUG_ON())
> 
> 2) Udp changes.
> 
> Tell me what you think, thanks again !

Thank you for working on this. 

I just gave a very quick look (the WE has started, children are
screaming ;-), overall the implementation seems quite similar to our
one.

I like the additional argument to  ip_cmsg_recv_offset() instead of
keeping skb->sk set.

If I read udp_skb_destructor() correctly, the atomic manipulation of
both sk_rmem_alloc and udp_memory_allocated will happen under the
receive lock. In our experiments this measurably increases the
contention on the lock compared to moving said operations outside
the lock (as done in our patch). Do you foresee any issues with that?
AFAICS every in-kernel UDP user of skb_recv_datagram() needs to be
updated with both implementations.

Cheers,

Paolo



Re: [PATCH v2 2/6] net: phy: broadcom: Add BCM54810 PHY entry

2016-10-29 Thread Andrew Lunn
On Fri, Oct 28, 2016 at 04:56:55PM -0400, Jon Mason wrote:
> The BCM54810 PHY requires some semi-unique configuration, which results
> in some additional configuration in addition to the standard config.
> Also, some users of the BCM54810 require the PHY lanes to be swapped.
> Since there is no way to detect this, add a device tree query to see if
> it is applicable.
> 
> Inspired-by: Vikas Soni 
> Signed-off-by: Jon Mason 
> ---
>  drivers/net/phy/Kconfig|  2 +-
>  drivers/net/phy/broadcom.c | 58 
> +-
>  include/linux/brcmphy.h| 10 

Hi Jon

The binding documentation is missing.

> + if (of_property_read_bool(np, "brcm,enet-phy-lane-swap")) {
> + /* Lane Swap - Undocumented register...magic! */
> + ret = bcm_phy_write_exp(phydev, MII_BCM54XX_EXP_SEL_ER + 0x9,
> + 0x11B);
> + if (ret < 0)
> + return ret;
> + }
> +

I wonder if this property could be made generic? What exactly are you
swapping? Rx and Tx lanes? Maybe we should add it to phy.txt?

  Andrew


Re: Let's do P4

2016-10-29 Thread Thomas Graf
On 10/29/16 at 09:53am, Jiri Pirko wrote:
> Hi all.
> 
> The network world is divided into 2 general types of hw:
> 1) network ASICs - network specific silicon, containing things like TCAM
>These ASICs are suitable to be programmed by P4.
> 2) network processors - basically a general purpose CPUs
>These processors are suitable to be programmed by eBPF.
> 
> I believe that by now, the most people came to a conclusion that it is
> very difficult to handle both types by either P4 or eBPF. And since
> eBPF is part of the kernel, I would like to introduce P4 into kernel
> as well. Here's a plan:

For reference, last time I remember we discussed this in the BPF
offload context:
http://www.spinics.net/lists/netdev/msg356178.html

> 1) Define P4 intermediate representation
>I cannot imagine loading P4 program (c-like syntax text file) into
>kernel as is. That means that as the first step, we need find some
>intermediate representation. I can imagine someting in a form of AST,
>call it "p4ast". I don't really know how to do this exactly though,
>it's just an idea.
> 
>In the end there would be a userspace precompiler for this:
>$ makep4ast example.p4 example.ast
> 
> 2) Implement p4ast in-kernel interpreter 
>A kernel module which takes a p4ast and emulates the pipeline.
>This can be implemented from scratch. Or, p4ast could be compiled
>to eBPF. I know there are already couple of p4>eBPF compilers.
>Not sure how feasible it would be to put this compiler in kernel.

+1 to using eBPF for emulation. Maybe the compiler doesn't need to be
in the kernel and user space can compile and provide the emulated
pipeline in eBPF directly. See next paragraph for an example where
this could be useful.

> 3) Expose the p4ast in-kernel interpreter to userspace
>As the easiest way I see in to introduce a new TC classifier cls_p4.
> 
>This can work in a very similar way cls_bpf is:
>$ tc filter add dev eth0 ingress p4 da ast example.ast
> 
>The TC cls_p4 will be also used for runtime table manipulation.

I think this is a great model for the case where HW can provide all
of the required capabilities. Thinking about the case where HW
provides a subset and SW provides an extended version, i.e. the
reality we live in for hosts with ASIC NICs ;-) The hand off point
requires some understanding between p4ast and eBPF.

Therefore another idea would be to use cls_bpf directly for this. The
p4ast IR could be stored in a separate ELF section in the same object
file with an existing eBPF program. The p4ast IR will match the
eBPF prog if capabilities of HW and SW match. If HW is limited, the
p4ast IR represents what the HW can do plus how to pass it to SW. The
eBPF prog contains whatever logic is required to take over if the HW
either bailed out or handed over deliberately. Then on top, all the
missing pieces of functionality which can only be performed in SW.

tc then loads 1) eBPF maps and prog through bpf() syscall
  2) cls_bpf filter with p4ast IR plus ref to prog and
 maps

> 4) Offload p4ast programs into hardware
>The same p4ast program representation will be passed down
>to drivers via existing TC offloading way - ndo_setup_tc.
>Drivers will then parse it and setup the hardware
>accordingly. Driver will also have possibility to error out
>in case it does not support some requested feature.


Re: Let's do P4

2016-10-29 Thread Jiri Pirko
Sat, Oct 29, 2016 at 11:39:05AM CEST, tg...@suug.ch wrote:
>On 10/29/16 at 09:53am, Jiri Pirko wrote:
>> Hi all.
>> 
>> The network world is divided into 2 general types of hw:
>> 1) network ASICs - network specific silicon, containing things like TCAM
>>These ASICs are suitable to be programmed by P4.
>> 2) network processors - basically a general purpose CPUs
>>These processors are suitable to be programmed by eBPF.
>> 
>> I believe that by now, the most people came to a conclusion that it is
>> very difficult to handle both types by either P4 or eBPF. And since
>> eBPF is part of the kernel, I would like to introduce P4 into kernel
>> as well. Here's a plan:
>
>For reference, last time I remember we discussed this in the BPF
>offload context:
>http://www.spinics.net/lists/netdev/msg356178.html
>
>> 1) Define P4 intermediate representation
>>I cannot imagine loading P4 program (c-like syntax text file) into
>>kernel as is. That means that as the first step, we need find some
>>intermediate representation. I can imagine someting in a form of AST,
>>call it "p4ast". I don't really know how to do this exactly though,
>>it's just an idea.
>> 
>>In the end there would be a userspace precompiler for this:
>>$ makep4ast example.p4 example.ast
>> 
>> 2) Implement p4ast in-kernel interpreter 
>>A kernel module which takes a p4ast and emulates the pipeline.
>>This can be implemented from scratch. Or, p4ast could be compiled
>>to eBPF. I know there are already couple of p4>eBPF compilers.
>>Not sure how feasible it would be to put this compiler in kernel.
>
>+1 to using eBPF for emulation. Maybe the compiler doesn't need to be
>in the kernel and user space can compile and provide the emulated
>pipeline in eBPF directly. See next paragraph for an example where
>this could be useful.

Ditto.


>
>> 3) Expose the p4ast in-kernel interpreter to userspace
>>As the easiest way I see in to introduce a new TC classifier cls_p4.
>> 
>>This can work in a very similar way cls_bpf is:
>>$ tc filter add dev eth0 ingress p4 da ast example.ast
>> 
>>The TC cls_p4 will be also used for runtime table manipulation.
>
>I think this is a great model for the case where HW can provide all
>of the required capabilities. Thinking about the case where HW
>provides a subset and SW provides an extended version, i.e. the
>reality we live in for hosts with ASIC NICs ;-) The hand off point
>requires some understanding between p4ast and eBPF.

It can be the other way around. The p4>ebpf compiler won't be complete
at the beginning so it is possible that HW could provide more features.
I don't think it is a problem. With SKIP_SW and SKIP_HW flags in TC,
the user can set a different program for each. I think in real life, that
would be the most common case anyway.


>
>Therefore another idea would be to use cls_bpf directly for this. The
>p4ast IR could be stored in a separate ELF section in the same object
>file with an existing eBPF program. The p4ast IR will match the

I don't like this idea. The kernel API should be clean and simple.
Bundling p4ast with bpf.o code, so the bpf.o is for kernel and p4ast is
for driver does not look clean at all. The bundle does not really make
sense as the programs may do different things for BPF and p4.

Plus, it's up to the user to set this up like he wants. If he wants SW
processing by BPF and at the same time HW processing by P4, he will use:
cls_bpf instance with SKIP_HW
cls_p4 instance with SKIP_SW.

This is a much more flexible, clean and non-confusing approach, I believe.


>eBPF prog if capabilities of HW and SW match. If HW is limited, the
>p4ast IR represents what the HW can do plus how to pass it to SW. The
>eBPF prog contains whatever logic is required to take over if the HW
>either bailed out or handed over deliberately. Then on top, all the
>missing pieces of functionality which can only be performed in SW.
>
>tc then loads 1) eBPF maps and prog through bpf() syscall
>  2) cls_bpf filter with p4ast IR plus ref to prog and
> maps
>
>> 4) Offload p4ast programs into hardware
>>The same p4ast program representation will be passed down
>>to drivers via existing TC offloading way - ndo_setup_tc.
>>Drivers will then parse it and setup the hardware
>>accordingly. Driver will also have possibility to error out
>>in case it does not support some requested feature.


Re: [PATCH 03/17] batman-adv: Add network_coding and mcast sysfs files to README

2016-10-29 Thread Jiri Pirko
Thu, Oct 27, 2016 at 09:01:36PM CEST, s...@simonwunderlich.de wrote:
>From: Sven Eckelmann 
>
>Signed-off-by: Sven Eckelmann 
>Signed-off-by: Simon Wunderlich 
>---
> Documentation/networking/batman-adv.txt | 9 +
> 1 file changed, 5 insertions(+), 4 deletions(-)
>
>diff --git a/Documentation/networking/batman-adv.txt 
>b/Documentation/networking/batman-adv.txt
>index d414e60..8afa991 100644
>--- a/Documentation/networking/batman-adv.txt
>+++ b/Documentation/networking/batman-adv.txt
>@@ -71,10 +71,11 @@ All  mesh  wide  settings  can be found in batman's own 
>interface
> folder:
> 
> # ls /sys/class/net/bat0/mesh/
>-#aggregated_ogmsdistributed_arp_table  gw_sel_classorig_interval
>-#ap_isolation   fragmentation  hop_penalty routing_algo
>-#bondinggw_bandwidth   isolation_mark  vlan0
>-#bridge_loop_avoidance  gw_modelog_level
>+# aggregated_ogmsfragmentation  isolation_mark  routing_algo
>+# ap_isolation   gw_bandwidth   log_level   vlan0
>+# bondinggw_modemulticast_mode
>+# bridge_loop_avoidance  gw_sel_class   network_coding
>+# distributed_arp_table  hop_penaltyorig_interval

I strongly believe it is a huge mistake to use sysfs for things like
this. This should be done via generic netlink api.
>


Re: [PATCH 03/17] batman-adv: Add network_coding and mcast sysfs files to README

2016-10-29 Thread Sven Eckelmann
On Samstag, 29. Oktober 2016 12:33:01 CEST Jiri Pirko wrote:
[...]
> >--- a/Documentation/networking/batman-adv.txt
> >+++ b/Documentation/networking/batman-adv.txt
> >@@ -71,10 +71,11 @@ All  mesh  wide  settings  can be found in batman's own 
> >interface
> > folder:
> > 
> > # ls /sys/class/net/bat0/mesh/
> >-#aggregated_ogmsdistributed_arp_table  gw_sel_classorig_interval
> >-#ap_isolation   fragmentation  hop_penalty routing_algo
> >-#bondinggw_bandwidth   isolation_mark  vlan0
> >-#bridge_loop_avoidance  gw_modelog_level
> >+# aggregated_ogmsfragmentation  isolation_mark  routing_algo
> >+# ap_isolation   gw_bandwidth   log_level   vlan0
> >+# bondinggw_modemulticast_mode
> >+# bridge_loop_avoidance  gw_sel_class   network_coding
> >+# distributed_arp_table  hop_penaltyorig_interval
> 
> I strongly believe it is a huge mistake to use sysfs for things like
> this. This should be done via generic netlink api.

This doesn't change the problem that it is already that way. This patch
only adds the list of available files to the README.

Kind regards,
Sven


signature.asc
Description: This is a digitally signed message part.


Re: [PATCH 03/17] batman-adv: Add network_coding and mcast sysfs files to README

2016-10-29 Thread Jiri Pirko
Sat, Oct 29, 2016 at 12:37:07PM CEST, s...@narfation.org wrote:
>On Samstag, 29. Oktober 2016 12:33:01 CEST Jiri Pirko wrote:
>[...]
>> >--- a/Documentation/networking/batman-adv.txt
>> >+++ b/Documentation/networking/batman-adv.txt
>> >@@ -71,10 +71,11 @@ All  mesh  wide  settings  can be found in batman's own 
>> >interface
>> > folder:
>> > 
>> > # ls /sys/class/net/bat0/mesh/
>> >-#aggregated_ogmsdistributed_arp_table  gw_sel_class
>> >orig_interval
>> >-#ap_isolation   fragmentation  hop_penalty routing_algo
>> >-#bondinggw_bandwidth   isolation_mark  vlan0
>> >-#bridge_loop_avoidance  gw_modelog_level
>> >+# aggregated_ogmsfragmentation  isolation_mark  routing_algo
>> >+# ap_isolation   gw_bandwidth   log_level   vlan0
>> >+# bondinggw_modemulticast_mode
>> >+# bridge_loop_avoidance  gw_sel_class   network_coding
>> >+# distributed_arp_table  hop_penaltyorig_interval
>> 
>> I strongly believe it is a huge mistake to use sysfs for things like
>> this. This should be done via generic netlink api.
>
>This doesn't change the problem that it is already that way. This patch
>only adds the list of available files to the README.

Sure. I just found out you did it like that, so I commented. I
suggest reworking the api to use genl entirely.

>
>Kind regards,
>   Sven




Re: Let's do P4

2016-10-29 Thread Thomas Graf
On 10/29/16 at 12:10pm, Jiri Pirko wrote:
> Sat, Oct 29, 2016 at 11:39:05AM CEST, tg...@suug.ch wrote:
> >On 10/29/16 at 09:53am, Jiri Pirko wrote:
> >> 3) Expose the p4ast in-kernel interpreter to userspace
> >>As the easiest way I see in to introduce a new TC classifier cls_p4.
> >> 
> >>This can work in a very similar way cls_bpf is:
> >>$ tc filter add dev eth0 ingress p4 da ast example.ast
> >> 
> >>The TC cls_p4 will be also used for runtime table manipulation.
> >
> >I think this is a great model for the case where HW can provide all
> >of the required capabilities. Thinking about the case where HW
> >provides a subset and SW provides an extended version, i.e. the
> >reality we live in for hosts with ASIC NICs ;-) The hand off point
> >requires some understanding between p4ast and eBPF.
> 
> It can be the other way around. The p4>ebpf compiler won't be complete
> at the beginning so it is possible that HW could provide more features.
> I don't think it is a problem. With SKIP_SW and SKIP_HW flags in TC,
> the user can set different program to each. I think in real life, that
> would be the most common case anyway.

So given the SKIP_SW flag, the in-kernel compiler is optional anyway.
Why even risk including a possibly incomplete compiler? Older kernels
must be capable of running along newer hardware as long as eBPF can
represent the software path. Having to upgrade to latest and greatest
kernels is not an option for most people so they would simply have to
fall back to SKIP_SW and do it in user space anyway.

> >Therefore another idea would be to use cls_bpf directly for this. The
> >p4ast IR could be stored in a separate ELF section in the same object
> >file with an existing eBPF program. The p4ast IR will match the
> 
> I don't like this idea. The kernel API should be clean and simple.
> Bundling p4ast with bpf.o code, so the bpf.o is for kernel and p4ast is
> for driver does not look clean at all. The bundle does not make really
> sense as the programs may do different things for BPF and p4.

I don't care strongly for the bundle. Let's forget about it for now.

> Plus, it's up to user to set this up like he wants. If he wants SW
> processing by BPF and at the same time HW processing by P4, he will use:
> cls_bpf instance with SKIP_HW
> cls_p4 instance with SKIP_SW.
> 
> This is much more variable, clean and non-confusing approach, I believe.

Non ASIC hardware will want to do offload based on BPF though so your
model would require the user to be aware of what is the preferred
model for his hardware and then either load a cls_bpf only to work
with a Netronome NIC or a cls_p4 + cls_bpf to work with an ASIC NIC,
correct?

I'm not seeing how either of them is more or less variable. The main
difference is whether to require configuring a single cls with both
p4ast + bpf or two separate cls, one for each. I'd prefer the single
cls approach simply because it is cleaner with regard to offload
directly off bpf vs off p4ast.

My main point is to not include an IR to eBPF compiler in the kernel
and let user space handle this instead.


Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net

2016-10-29 Thread Thomas Graf
On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
> Generally agree, but SRIOV nics with multiple queues can end up in a bad
> spot if each buffer was 4K right ? I see a specific page pool to be used
> by queues which are enabled for XDP as the easiest to swing solution that
> way the memory overhead can be restricted to enabled queues and shared
> access issues can be restricted to skb's using that pool no ?

Isn't this clearly a must anyway? I may be missing something
fundamental here so please enlighten me :-)

If we dedicate a page per packet, that could translate to 14M*4K worth
of memory being mapped per second for just a 10G NIC under DoS attack.
How can one protect such a system? Is the assumption that we can always
drop such packets quickly enough before we start dropping randomly due
to memory pressure? If a handshake is required to determine validity
of a packet then that is going to be difficult.


[PATCH 1/1] ath10k: use the right length of "background"

2016-10-29 Thread Nicolas Iooss
The word "background" contains 10 characters so the third argument of
strncmp() needs to be 10 in order to match this prefix correctly.

Signed-off-by: Nicolas Iooss 
Fixes: 855aed1220d2 ("ath10k: add spectral scan feature")
---
 drivers/net/wireless/ath/ath10k/spectral.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/ath/ath10k/spectral.c 
b/drivers/net/wireless/ath/ath10k/spectral.c
index 7d9b0da1b010..2ffc1fe4923b 100644
--- a/drivers/net/wireless/ath/ath10k/spectral.c
+++ b/drivers/net/wireless/ath/ath10k/spectral.c
@@ -338,7 +338,7 @@ static ssize_t write_file_spec_scan_ctl(struct file *file,
} else {
res = -EINVAL;
}
-   } else if (strncmp("background", buf, 9) == 0) {
+   } else if (strncmp("background", buf, 10) == 0) {
res = ath10k_spectral_scan_config(ar, SPECTRAL_BACKGROUND);
} else if (strncmp("manual", buf, 6) == 0) {
res = ath10k_spectral_scan_config(ar, SPECTRAL_MANUAL);
-- 
2.10.1



Re: Let's do P4

2016-10-29 Thread Jiri Pirko
Sat, Oct 29, 2016 at 01:15:48PM CEST, tg...@suug.ch wrote:
>On 10/29/16 at 12:10pm, Jiri Pirko wrote:
>> Sat, Oct 29, 2016 at 11:39:05AM CEST, tg...@suug.ch wrote:
>> >On 10/29/16 at 09:53am, Jiri Pirko wrote:
>> >> 3) Expose the p4ast in-kernel interpreter to userspace
>> >>As the easiest way I see in to introduce a new TC classifier cls_p4.
>> >> 
>> >>This can work in a very similar way cls_bpf is:
>> >>$ tc filter add dev eth0 ingress p4 da ast example.ast
>> >> 
>> >>The TC cls_p4 will be also used for runtime table manipulation.
>> >
>> >I think this is a great model for the case where HW can provide all
>> >of the required capabilities. Thinking about the case where HW
>> >provides a subset and SW provides an extended version, i.e. the
>> >reality we live in for hosts with ASIC NICs ;-) The hand off point
>> >requires some understanding between p4ast and eBPF.
>> 
>> It can be the other way around. The p4>ebpf compiler won't be complete
>> at the beginning so it is possible that HW could provide more features.
>> I don't think it is a problem. With SKIP_SW and SKIP_HW flags in TC,
>> the user can set different program to each. I think in real life, that
>> would be the most common case anyway.
>
>So given the SKIP_SW flag, the in-kernel compiler is optional anyway.
>Why even risk including a possibly incomplete compiler? Older kernels
>must be capable of running along newer hardware as long as eBPF can
>represent the software path. Having to upgrade to latest and greatest
>kernels is not an option for most people so they would simply have to
>fall back to SKIP_SW and do it in user space anyway.

The thing is, if we need to offload something, it needs to be
implemented in kernel first. Also, I believe that it is good to have
in-kernel p4 engine for testing and development purposes.


>
>> >Therefore another idea would be to use cls_bpf directly for this. The
>> >p4ast IR could be stored in a separate ELF section in the same object
>> >file with an existing eBPF program. The p4ast IR will match the
>> 
>> I don't like this idea. The kernel API should be clean and simple.
>> Bundling p4ast with bpf.o code, so the bpf.o is for kernel and p4ast is
>> for driver does not look clean at all. The bundle does not make really
>> sense as the programs may do different things for BPF and p4.
>
>I don't care strongly for the bundle. Let's forget about it for now.
>
>> Plus, it's up to user to set this up like he wants. If he wants SW
>> processing by BPF and at the same time HW processing by P4, he will use:
>> cls_bpf instance with SKIP_HW
>> cls_p4 instance with SKIP_SW.
>> 
>> This is much more variable, clean and non-confusing approach, I believe.
>
>Non ASIC hardware will want to do offload based on BPF though so your
>model would require the user to be aware of what is the preferred
>model for his hardware and then either load a cls_bpf only to work
>with a Netronome NIC or a cls_p4 + cls_bpf to work with an ASIC NIC,
>correct?

Correct


>
>I'm not seeing how either of them is more or less variable. The main
>difference is whether to require configuring a single cls with both
>p4ast + bpf or two separate cls, one for each. I'd prefer the single
>cls approach simply because it is cleaner wither regard to offload
>directly off bpf vs off p4ast.

That's the bundle that you asked me to forget earlier in this email? :)

>
>My main point is to not include a IR to eBPF compiler in the kernel
>and let user space handle this instead.

If we do it as you describe, we would be using 2 different APIs for
the offloaded and non-offloaded paths. I don't believe that is acceptable, as
the offloaded features have to have a kernel implementation. Therefore, I
believe that p4ast as a kernel API is the only possible option.



Re: [PATCH 03/17] batman-adv: Add network_coding and mcast sysfs files to README

2016-10-29 Thread Sven Eckelmann
On Samstag, 29. Oktober 2016 12:56:28 CEST Jiri Pirko wrote:
[...]
> >> I strongly believe it is a huge mistake to use sysfs for things like
> >> this. This should be done via generic netlink api.
> >
> >This doesn't change the problem that it is already that way. This patch
> >only adds the list of available files to the README.
> 
> Sure. Just found out you did it like that. Therefore I commented. I
> suggest to rework the api to use genl entirely.

Fair enough, I have added it to the issue tracker [1].

It seems there is no easy way to drop support for modifying batman-adv
attributes of the interface or its ports via sysfs in the near
future. But disallowing sysfs for new attributes might be a viable
policy.

Kind regards,
Sven

[1] https://www.open-mesh.org/issues/300

signature.asc
Description: This is a digitally signed message part.


Re: Let's do P4

2016-10-29 Thread Thomas Graf
On 10/29/16 at 01:28pm, Jiri Pirko wrote:
> Sat, Oct 29, 2016 at 01:15:48PM CEST, tg...@suug.ch wrote:
> >So given the SKIP_SW flag, the in-kernel compiler is optional anyway.
> >Why even risk including a possibly incomplete compiler? Older kernels
> >must be capable of running along newer hardware as long as eBPF can
> >represent the software path. Having to upgrade to latest and greatest
> >kernels is not an option for most people so they would simply have to
> >fall back to SKIP_SW and do it in user space anyway.
> 
> The thing is, if we needo to offload something, it needs to be
> implemented in kernel first. Also, I believe that it is good to have
> in-kernel p4 engine for testing and development purposes.

You lost me now :-) In an earlier email you said:

> It can be the other way around. The p4>ebpf compiler won't be complete
> at the beginning so it is possible that HW could provide more features.
> I don't think it is a problem. With SKIP_SW and SKIP_HW flags in TC,
> the user can set different program to each. I think in real life, that
> would be the most common case anyway.

If you allow SKIP_SW and setting a different program for each to address
this, then how is this any different?

I completely agree that kernel must be able to provide the same
functionality as HW with optional additional capabilities on top so
the HW can always bail out and punt to SW.

[...]

> >I'm not seeing how either of them is more or less variable. The main
> >difference is whether to require configuring a single cls with both
> >p4ast + bpf or two separate cls, one for each. I'd prefer the single
> >cls approach simply because it is cleaner wither regard to offload
> >directly off bpf vs off p4ast.
> 
> That's the bundle that you asked me to forget earlier in this email? :)

I thought you referred to the "store in same object file" as bundle.
I don't really care about that. What I care about is a single way to
configure this that works for both ASIC and non-ASIC hardware.

> >My main point is to not include a IR to eBPF compiler in the kernel
> >and let user space handle this instead.
> 
> It we do it as you describe, we would be using 2 different APIs for
> offloaded and non-offloaded path. I don't believe it is acceptable as
> the offloaded features has to have kernel implementation. Therefore, I
> believe that p4ast as a kernel API is the only possible option.

Yes, the kernel has the SW implementation in eBPF. I thought that is
what you propose as well. The only difference is whether to generate
that eBPF in kernel or user space.

Not sure I understand the multiple APIs point for offload vs
non-offload. There is a single API: tc. Both models require the user
to provide additional metadata to allow programming ASIC HW: p4ast
IR or whatever we agree on.


Re: [PATCH net-next] udp: do fwd memory scheduling on dequeue

2016-10-29 Thread Eric Dumazet
On Sat, 2016-10-29 at 10:17 +0200, Paolo Abeni wrote:

> Thank you for working on this. 
> 
> I just gave a very quick look (the WE has started, children are
> screaming ;-), overall the implementation seems quite similar to our
> one.
> 
> I like the additional argument to  ip_cmsg_recv_offset() instead of
> keeping skb->sk set.
> 
> If I read udp_skb_destructor() correctly, the atomic manipulation of
> both sk_rmem_alloc and udp_memory_allocated will happen under the
> receive lock. In our experiments this measurably increases the
> contention on the lock with respect to moving said operations outside
> the lock (as done in our patch). Do you foresee any issues with that?
> AFAICS every in kernel UDP user of skb_recv_datagram() needs to be
> updated with both implementation.

So if you look at tcp, we do not release forward allocation at every
recvmsg(), but rather when we are under tcp memory pressure, or at timer
firing when we know the flow has been idle for a while.

You hit contention on the lock, but the root cause is that right now udp
is very conservative and also hits false sharing on
udp_memory_allocated.

So I believe this is another problem which needs a fix anyway.

No need to make a complicated patch right now, if we know that this
problem will be separately fixed, in another patch ?





Re: net-next still on 4.8.0?

2016-10-29 Thread David Miller
From: Andrew Lunn 
Date: Fri, 28 Oct 2016 20:52:36 +0200

> During the merge window, you didn't close net-next. Trying something
> new.
> 
> We are now at v4.9-rc2, yet net-next is still based on 4.8.0. Is this
> also something new you are trying? Or do you still plan to rebase to
> an -rcX at some point?

I plan to rebase the next time I ask Linus to pull 'net' and he takes
it in, which should be this weekend.


[PATCH 1/4] isdn/eicon: remove unused argument in DBG_ERR call

2016-10-29 Thread Nicolas Iooss
diva_um_idi_read() can call DBG_ERR with 3 format arguments but using a
format string which only uses 2 of them. Remove the last one.

This bug has been found by adding a __printf attribute to
myDbgPrint_...() functions. As this addition leads the compiler to
report a lot of -Wformat warnings (for example the compiler complains
when "%08x" is used to format a pointer, as it is done with all usages
of "E(%08x)" in um_idi.c), this patch does not add any __printf
attribute.

This patch has only been compile-tested.

Signed-off-by: Nicolas Iooss 
---
 drivers/isdn/hardware/eicon/um_idi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/isdn/hardware/eicon/um_idi.c 
b/drivers/isdn/hardware/eicon/um_idi.c
index e1519718ce67..13ef38fa6cb0 100644
--- a/drivers/isdn/hardware/eicon/um_idi.c
+++ b/drivers/isdn/hardware/eicon/um_idi.c
@@ -351,7 +351,7 @@ int diva_um_idi_read(void *entity,
  Not enough space to read message
*/
DBG_ERR(("A: A(%d) E(%08x) read small buffer",
-a->adapter_nr, e, ret));
+a->adapter_nr, e));
diva_os_leave_spin_lock(&adapter_lock, &old_irql,
"read");
return (-2);
-- 
2.10.1



[PATCH 4/4] isdn/eicon: use const strings with format arguments

2016-10-29 Thread Nicolas Iooss
Functions using a printf format argument do not modify the value of this
argument. These functions can therefore use type "const char *" instead
of "char *".

This patch has only been compile-tested.

Signed-off-by: Nicolas Iooss 
---
 drivers/isdn/hardware/eicon/debug.c| 14 +++---
 drivers/isdn/hardware/eicon/debuglib.h |  6 +++---
 drivers/isdn/hardware/eicon/maintidi.c |  2 +-
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/isdn/hardware/eicon/debug.c 
b/drivers/isdn/hardware/eicon/debug.c
index cd8d70e3292d..b8772bbee872 100644
--- a/drivers/isdn/hardware/eicon/debug.c
+++ b/drivers/isdn/hardware/eicon/debug.c
@@ -14,9 +14,9 @@
 
 static void DI_register(void *arg);
 static void DI_deregister(pDbgHandle hDbg);
-static void DI_format(int do_lock, word id, int type, char *format, va_list 
argument_list);
-static void DI_format_locked(word id, int type, char *format, va_list 
argument_list);
-static void DI_format_old(word id, char *format, va_list ap) { }
+static void DI_format(int do_lock, word id, int type, const char *format, 
va_list argument_list);
+static void DI_format_locked(word id, int type, const char *format, va_list 
argument_list);
+static void DI_format_old(word id, const char *format, va_list ap) { }
 static void DiProcessEventLog(unsigned short id, unsigned long msgID, va_list 
ap) { }
 static void single_p(byte *P, word *PLength, byte Id);
 static void diva_maint_xdi_cb(ENTITY *e);
@@ -25,7 +25,7 @@ static int diva_mnt_cmp_nmbr(const char *nmbr);
 static void diva_free_dma_descriptor(IDI_CALL request, int nr);
 static int diva_get_dma_descriptor(IDI_CALL request, dword *dma_magic);
 __printf(3, 4)
-void diva_mnt_internal_dprintf(dword drv_id, dword type, char *p, ...);
+void diva_mnt_internal_dprintf(dword drv_id, dword type, const char *p, ...);
 
 static dword MaxDumpSize = 256;
 static dword MaxXlogSize = 2 + 128;
@@ -561,7 +561,7 @@ static void DI_deregister(pDbgHandle hDbg) {
 
 static void DI_format_locked(unsigned short id,
 int type,
-char *format,
+const char *format,
 va_list argument_list) {
DI_format(1, id, type, format, argument_list);
 }
@@ -569,7 +569,7 @@ static void DI_format_locked(unsigned short id,
 static void DI_format(int do_lock,
  unsigned short id,
  int type,
- char *format,
+ const char *format,
  va_list ap) {
diva_os_spin_lock_magic_t old_irql;
dword sec, usec;
@@ -1904,7 +1904,7 @@ static void 
diva_change_management_debug_mask(diva_maint_client_t *pC, dword old
 }
 
 
-void diva_mnt_internal_dprintf(dword drv_id, dword type, char *fmt, ...) {
+void diva_mnt_internal_dprintf(dword drv_id, dword type, const char *fmt, ...) 
{
va_list ap;
 
va_start(ap, fmt);
diff --git a/drivers/isdn/hardware/eicon/debuglib.h 
b/drivers/isdn/hardware/eicon/debuglib.h
index 6dcbf6afb8f9..2170de140335 100644
--- a/drivers/isdn/hardware/eicon/debuglib.h
+++ b/drivers/isdn/hardware/eicon/debuglib.h
@@ -230,10 +230,10 @@ extern void DbgSetLevel(unsigned long dbgMask);
  */
 typedef struct _DbgHandle_ *pDbgHandle;
 typedef void (*DbgEnd)(pDbgHandle);
-typedef void (*DbgLog)(unsigned short, int, char *, va_list);
-typedef void (*DbgOld)(unsigned short, char *, va_list);
+typedef void (*DbgLog)(unsigned short, int, const char *, va_list);
+typedef void (*DbgOld)(unsigned short, const char *, va_list);
 typedef void (*DbgEv)(unsigned short, unsigned long, va_list);
-typedef void (*DbgIrq)(unsigned short, int, char *, va_list);
+typedef void (*DbgIrq)(unsigned short, int, const char *, va_list);
 typedef struct _DbgHandle_
 { charRegistered; /* driver successfully registered */
 #define DBG_HANDLE_REG_NEW 0x01  /* this (new) structure*/
diff --git a/drivers/isdn/hardware/eicon/maintidi.c 
b/drivers/isdn/hardware/eicon/maintidi.c
index b2ed2939b4fa..a635595e9be3 100644
--- a/drivers/isdn/hardware/eicon/maintidi.c
+++ b/drivers/isdn/hardware/eicon/maintidi.c
@@ -31,7 +31,7 @@
 
 
 extern __printf(3, 4)
-void diva_mnt_internal_dprintf(dword drv_id, dword type, char *p, ...);
+void diva_mnt_internal_dprintf(dword drv_id, dword type, const char *p, ...);
 
 #define MODEM_PARSE_ENTRIES  16 /* amount of variables of interest */
 #define FAX_PARSE_ENTRIES12 /* amount of variables of interest */
-- 
2.10.1



[PATCH 3/4] isdn/eicon: add some __printf attributes

2016-10-29 Thread Nicolas Iooss
Add __printf attributes to some functions. This helps detect errors
related to printf-formats at compile time.

When doing this, gcc reports some issues in debug.c. Fix them.

This patch has only been compile-tested.

Signed-off-by: Nicolas Iooss 
---
 drivers/isdn/hardware/eicon/debug.c| 129 +
 drivers/isdn/hardware/eicon/maintidi.c |   3 +-
 drivers/isdn/hardware/eicon/platform.h |   2 +-
 3 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/drivers/isdn/hardware/eicon/debug.c 
b/drivers/isdn/hardware/eicon/debug.c
index 576b7b4a3278..cd8d70e3292d 100644
--- a/drivers/isdn/hardware/eicon/debug.c
+++ b/drivers/isdn/hardware/eicon/debug.c
@@ -24,6 +24,7 @@ static word SuperTraceCreateReadReq(byte *P, const char 
*path);
 static int diva_mnt_cmp_nmbr(const char *nmbr);
 static void diva_free_dma_descriptor(IDI_CALL request, int nr);
 static int diva_get_dma_descriptor(IDI_CALL request, dword *dma_magic);
+__printf(3, 4)
 void diva_mnt_internal_dprintf(dword drv_id, dword type, char *p, ...);
 
 static dword MaxDumpSize = 256;
@@ -1514,29 +1515,29 @@ static void diva_maint_state_change_notify(void 
*user_context,
}
 
 
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Ch= %lu",
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Ch= %u",
  (int)modem->ChannelNumber);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Event = %lu", modem->Event);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Norm  = %lu", modem->Norm);
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Event = %u", modem->Event);
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Norm  = %u", modem->Norm);
diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Opts. = 0x%08x", modem->Options);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Tx= %lu Bps", modem->TxSpeed);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Rx= %lu Bps", modem->RxSpeed);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
RT= %lu mSec",
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Tx= %u Bps", modem->TxSpeed);
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Rx= %u Bps", modem->RxSpeed);
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
RT= %u mSec",
  modem->RoundtripMsec);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Sr= %lu", modem->SymbolRate);
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Sr= %u", modem->SymbolRate);
diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
Rxl   = %d dBm", modem->RxLeveldBm);
diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
El= %d dBm", modem->EchoLeveldBm);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
SNR   = %lu dB", modem->SNRdb);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
MAE   = %lu", modem->MAE);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
LRet  = %lu",
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
SNR   = %u dB", modem->SNRdb);
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
MAE   = %u", modem->MAE);
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
LRet  = %u",
  modem->LocalRetrains);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
RRet  = %lu",
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
RRet  = %u",
  modem->RemoteRetrains);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
LRes  = %lu", modem->LocalResyncs);
-   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
RRes  = %lu",
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
LRes  = %u", modem->LocalResyncs);
+   diva_mnt_internal_dprintf(pC->hDbg->id, DLI_STAT, "MDM 
RRes  = %u",
  modem->RemoteResyncs);
if (modem->Event == 3) {
-   diva_mnt_internal_dprintf(pC->hDbg->id, 
DLI_STAT, "MDM Disc  =  %lu", modem->DiscReason);
+   diva_mnt_internal_dprintf(pC->hDbg->id, 
DLI_STAT, "MDM Disc  =  %u", modem->DiscReason);
}
}
 

[PATCH 2/4] isdn/eicon: fix some message formatting errors

2016-10-29 Thread Nicolas Iooss
There are some inconsistent debug message formats in message.c. For
example,

dprintf("XDI CAPI: RC cancelled Id:0x02, Ch:%02x", e->Id, ch);

wrongly reports an ID of 2 and prints the entity ID as the channel ID.
There are also object pointers which are used instead of the IDs.

All these inconsistent formats have been found by adding a __printf
attribute to myDbgPrint_...() functions (used by dbug()). As this also makes
the compiler complain about using "%ld" with unsigned int values
(instead of "%u") and some other less-important format issues, this
patch does not add any __printf attribute.

This patch has only been compile-tested.

Signed-off-by: Nicolas Iooss 
---
 drivers/isdn/hardware/eicon/message.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/isdn/hardware/eicon/message.c 
b/drivers/isdn/hardware/eicon/message.c
index 1a1d99704fe6..7cafa34c3464 100644
--- a/drivers/isdn/hardware/eicon/message.c
+++ b/drivers/isdn/hardware/eicon/message.c
@@ -1059,7 +1059,7 @@ static void plci_remove(PLCI *plci)
}
if (plci->Sig.Id == 0xff)
{
-   dbug(1, dprintf("D-channel X.25 plci->NL.Id:%0x", plci->NL.Id));
+   dbug(1, dprintf("D-channel X.25 plci->NL.Id:%02x", 
plci->NL.Id));
if (plci->NL.Id && !plci->nl_remove_id)
{
nl_req_ncci(plci, REMOVE, 0);
@@ -3109,7 +3109,7 @@ static byte data_b3_req(dword Id, word Number, 
DIVA_CAPI_ADAPTER *a,
 
Info = _WRONG_IDENTIFIER;
ncci = (word)(Id >> 16);
-   dbug(1, dprintf("ncci=0x%x, plci=0x%x", ncci, plci));
+   dbug(1, dprintf("ncci=0x%x, plci=0x%x", ncci, plci->Id));
 
if (plci && ncci)
{
@@ -3325,7 +3325,7 @@ static byte select_b_req(dword Id, word Number, 
DIVA_CAPI_ADAPTER *a,
else
{
dbug(1, 
dprintf("select_b_req[%d],PLCI=0x%x,Tel=0x%x,NL=0x%x,appl=0x%x,sstate=0x%x",
-   msg->length, plci->Id, plci->tel, plci->NL.Id, 
plci->appl, plci->SuppState));
+   msg->length, plci->Id, plci->tel, plci->NL.Id, 
appl->Id, plci->SuppState));
dbug(1, dprintf("PlciState=0x%x", plci->State));
for (i = 0; i < 7; i++) bp_parms[i].length = 0;
 
@@ -3910,7 +3910,7 @@ void callback(ENTITY *e)
if (no_cancel_rc && (a->FlowControlIdTable[ch] 
== e->Id) && e->Id) {
a->FlowControlIdTable[ch] = 0;
if ((rc == OK) && 
a->FlowControlSkipTable[ch]) {
-   dbug(3, dprintf("XDI CAPI: RC 
cancelled Id:0x02, Ch:%02x", e->Id, ch));
+   dbug(3, dprintf("XDI CAPI: RC 
cancelled Id:%02x, Ch:%02x", e->Id, ch));
return;
}
}
@@ -9135,7 +9135,7 @@ static word AdvCodecSupport(DIVA_CAPI_ADAPTER *a, PLCI 
*plci, APPL *appl,
{
if (a->AdvSignalAppl != appl || a->AdvSignalPLCI)
{
-   dbug(1, dprintf("AdvSigPlci=0x%x", 
a->AdvSignalPLCI));
+   dbug(1, dprintf("AdvSigPlci=0x%x", 
a->AdvSignalPLCI->Id));
return 0x2001; /* codec in use by another 
application */
}
if (plci != NULL)
-- 
2.10.1



Re: Let's do P4

2016-10-29 Thread Jiri Pirko
Sat, Oct 29, 2016 at 02:09:32PM CEST, tg...@suug.ch wrote:
>On 10/29/16 at 01:28pm, Jiri Pirko wrote:
>> Sat, Oct 29, 2016 at 01:15:48PM CEST, tg...@suug.ch wrote:
>> >So given the SKIP_SW flag, the in-kernel compiler is optional anyway.
>> >Why even risk including a possibly incomplete compiler? Older kernels
>> >must be capable of running along newer hardware as long as eBPF can
>> >represent the software path. Having to upgrade to latest and greatest
>> >kernels is not an option for most people so they would simply have to
>> >fall back to SKIP_SW and do it in user space anyway.
>> 
>> The thing is, if we needo to offload something, it needs to be
>> implemented in kernel first. Also, I believe that it is good to have
>> in-kernel p4 engine for testing and development purposes.
>
>You lost me now :-) In an earlier email you said:
>
>> It can be the other way around. The p4>ebpf compiler won't be complete
>> at the beginning so it is possible that HW could provide more features.
>> I don't think it is a problem. With SKIP_SW and SKIP_HW flags in TC,
>> the user can set different program to each. I think in real life, that
>> would be the most common case anyway.
>
>If you allow to SKIP_SW and set different programs each to address
>this, then how is this any different.
>
>I completely agree that kernel must be able to provide the same
>functionality as HW with optional additional capabilities on top so
>the HW can always bail out and punt to SW.
>
>[...]
>
>> >I'm not seeing how either of them is more or less variable. The main
>> >difference is whether to require configuring a single cls with both
>> >p4ast + bpf or two separate cls, one for each. I'd prefer the single
>> >cls approach simply because it is cleaner wither regard to offload
>> >directly off bpf vs off p4ast.
>> 
>> That's the bundle that you asked me to forget earlier in this email? :)
>
>I thought you referred to the "store in same object file" as bundle.
>I don't really care about that. What I care about is a single way to
>configure this that works for both ASIC and non-ASIC hardware.
>
>> >My main point is to not include a IR to eBPF compiler in the kernel
>> >and let user space handle this instead.
>> 
>> It we do it as you describe, we would be using 2 different APIs for
>> offloaded and non-offloaded path. I don't believe it is acceptable as
>> the offloaded features has to have kernel implementation. Therefore, I
>> believe that p4ast as a kernel API is the only possible option.
>
>Yes, the kernel has the SW implementation in eBPF. I thought that is
>what you propose as well. The only difference is whether to generate
>that eBPF in kernel or user space.
>
>Not sure I understand the multiple APIs point for offload vs
>non-offload. There is a single API: tc. Both models require the user
>to provide additional metadata to allow programming ASIC HW: p4ast
>IR or whatever we agree on.

If you do p4>ebpf in userspace, you have 2 apis:
1) to setup sw (in-kernel) p4 datapath, you push bpf.o to kernel
2) to setup hw p4 datapath, you push program.p4ast to kernel

Those are 2 apis. Both wrapped up by TC, but still 2 apis.

What I believe is correct is to have one api:
1) to setup sw (in-kernel) p4 datapath, you push program.p4ast to kernel
2) to setup hw p4 datapath, you push program.p4ast to kernel

In case of 1), the program.p4ast will be either interpreted by a new p4
interpreter, or translated to bpf and interpreted by that. But this
translation code is part of the kernel.



[PATCH net] qede: Fix out-of-bound fastpath memory access

2016-10-29 Thread Yuval Mintz
Driver allocates a shadow array for transmitted SKBs with X entries;
That means valid indices are {0,...,X - 1}. [X == 8191]
Problem is the driver also uses X as a mask for a
producer/consumer in order to choose the right entry in the
array which allows access to entry X which is out of bounds.

To fix this, simply allocate X + 1 entries in the shadow array.

Signed-off-by: Yuval Mintz 
---
Hi Dave,

Please consider applying this to `net'.

Thanks,
Yuval
---
 drivers/net/ethernet/qlogic/qede/qede_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c 
b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 444b271..7def29a 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -2940,7 +2940,7 @@ static int qede_alloc_mem_txq(struct qede_dev *edev, 
struct qede_tx_queue *txq)
txq->num_tx_buffers = edev->q_num_tx_buffers;
 
/* Allocate the parallel driver ring for Tx buffers */
-   size = sizeof(*txq->sw_tx_ring) * NUM_TX_BDS_MAX;
+   size = sizeof(*txq->sw_tx_ring) * TX_RING_SIZE;
txq->sw_tx_ring = kzalloc(size, GFP_KERNEL);
if (!txq->sw_tx_ring) {
DP_NOTICE(edev, "Tx buffers ring allocation failed\n");
@@ -2951,7 +2951,7 @@ static int qede_alloc_mem_txq(struct qede_dev *edev, 
struct qede_tx_queue *txq)
QED_CHAIN_USE_TO_CONSUME_PRODUCE,
QED_CHAIN_MODE_PBL,
QED_CHAIN_CNT_TYPE_U16,
-   NUM_TX_BDS_MAX,
+   TX_RING_SIZE,
sizeof(*p_virt), &txq->tx_pbl);
if (rc)
goto err;
-- 
1.9.3



Re: [PATCH 03/17] batman-adv: Add network_coding and mcast sysfs files to README

2016-10-29 Thread Jiri Pirko
Sat, Oct 29, 2016 at 01:46:59PM CEST, s...@narfation.org wrote:
>On Samstag, 29. Oktober 2016 12:56:28 CEST Jiri Pirko wrote:
>[...]
>> >> I strongly believe it is a huge mistake to use sysfs for things like
>> >> this. This should be done via generic netlink api.
>> >
>> >This doesn't change the problem that it is already that way. This patch
>> >only adds the list of available files to the README.
>> 
>> Sure. Just found out you did it like that. Therefore I commented. I
>> suggest to rework the api to use genl entirely.
>
>Fair enough, I have added it to the issue tracker [1].
>
>It seems there is no easy way to drop support for modifying batman-adv
>attributes of the interface or its ports via sysfs in the near
>future. But disallowing sysfs for new attributes might be a viable
>policy.

Cool. Thanks!


>
>Kind regards,
>   Sven
>
>[1] https://www.open-mesh.org/issues/300




Re: Let's do P4

2016-10-29 Thread Jakub Kicinski
On Sat, 29 Oct 2016 09:53:28 +0200, Jiri Pirko wrote:
> Hi all.
> 
> The network world is divided into 2 general types of hw:
> 1) network ASICs - network specific silicon, containing things like TCAM
>These ASICs are suitable to be programmed by P4.
> 2) network processors - basically a general purpose CPUs
>These processors are suitable to be programmed by eBPF.
> 
> I believe that by now, the most people came to a conclusion that it is
> very difficult to handle both types by either P4 or eBPF. And since
> eBPF is part of the kernel, I would like to introduce P4 into kernel
> as well. Here's a plan:
> 
> 1) Define P4 intermediate representation
>I cannot imagine loading P4 program (c-like syntax text file) into
>kernel as is. That means that as the first step, we need find some
>intermediate representation. I can imagine something in a form of AST,
>call it "p4ast". I don't really know how to do this exactly though,
>it's just an idea.
> 
>In the end there would be a userspace precompiler for this:
>$ makep4ast example.p4 example.ast

Maybe stating the obvious, but IMHO defining the IR is the hardest part.
eBPF *is* the IR, we can compile C, P4 or even JIT Lua to eBPF.  The
AST/IR for switch pipelines should allow for similar flexibility.
Looser coupling would also protect us from changes in spec of the high
level language.


Re: Let's do P4

2016-10-29 Thread Jakub Kicinski
On Sat, 29 Oct 2016 15:58:55 +0200, Jiri Pirko wrote:
> Sat, Oct 29, 2016 at 02:09:32PM CEST, tg...@suug.ch wrote:
> >On 10/29/16 at 01:28pm, Jiri Pirko wrote:  
> >> Sat, Oct 29, 2016 at 01:15:48PM CEST, tg...@suug.ch wrote:  
> >> >So given the SKIP_SW flag, the in-kernel compiler is optional anyway.
> >> >Why even risk including a possibly incomplete compiler? Older kernels
> >> >must be capable of running along newer hardware as long as eBPF can
> >> >represent the software path. Having to upgrade to latest and greatest
> >> >kernels is not an option for most people so they would simply have to
> >> >fall back to SKIP_SW and do it in user space anyway.  
> >> 
> >> The thing is, if we need to offload something, it needs to be
> >> implemented in kernel first. Also, I believe that it is good to have
> >> in-kernel p4 engine for testing and development purposes.  
> >
> >You lost me now :-) In an earlier email you said:
> >  
> >> It can be the other way around. The p4>ebpf compiler won't be complete
> >> at the beginning so it is possible that HW could provide more features.
> >> I don't think it is a problem. With SKIP_SW and SKIP_HW flags in TC,
> >> the user can set different program to each. I think in real life, that
> >> would be the most common case anyway.  
> >
> >If you allow to SKIP_SW and set different programs each to address
> >this, then how is this any different.
> >
> >I completely agree that kernel must be able to provide the same
> >functionality as HW with optional additional capabilities on top so
> >the HW can always bail out and punt to SW.
> >
> >[...]
> >  
> >> >I'm not seeing how either of them is more or less variable. The main
> >> >difference is whether to require configuring a single cls with both
> >> >p4ast + bpf or two separate cls, one for each. I'd prefer the single
> >> >cls approach simply because it is cleaner with regard to offload
> >> >directly off bpf vs off p4ast.  
> >> 
> >> That's the bundle that you asked me to forget earlier in this email? :)  
> >
> >I thought you referred to the "store in same object file" as bundle.
> >I don't really care about that. What I care about is a single way to
> >configure this that works for both ASIC and non-ASIC hardware.
> >  
> >> >My main point is to not include a IR to eBPF compiler in the kernel
> >> >and let user space handle this instead.  
> >> 
> >> If we do it as you describe, we would be using 2 different APIs for
> >> offloaded and non-offloaded path. I don't believe it is acceptable as
> >> the offloaded features has to have kernel implementation. Therefore, I
> >> believe that p4ast as a kernel API is the only possible option.  
> >
> >Yes, the kernel has the SW implementation in eBPF. I thought that is
> >what you propose as well. The only difference is whether to generate
> >that eBPF in kernel or user space.
> >
> >Not sure I understand the multiple APIs point for offload vs
> >non-offload. There is a single API: tc. Both models require the user
> >to provide additional metadata to allow programming ASIC HW: p4ast
> >IR or whatever we agree on.  
> 
> If you do p4>ebpf in userspace, you have 2 apis:
> 1) to setup sw (in-kernel) p4 datapath, you push bpf.o to kernel
> 2) to setup hw p4 datapath, you push program.p4ast to kernel
> 
> Those are 2 apis. Both wrapped up by TC, but still 2 apis.
> 
> What I believe is correct is to have one api:
> 1) to setup sw (in-kernel) p4 datapath, you push program.p4ast to kernel
> 2) to setup hw p4 datapath, you push program.p4ast to kernel
> 
> In case of 1), the program.p4ast will be either interpreted by new p4
> interpreter, or translated to bpf and interpreted by that. But this
> translation code is part of kernel.

Option 3) use a well structured subset of eBPF as user space ABI ;)

In all seriousness, user space already has to have some knowledge about
the underlying hardware today with different vendors picking different
TC classifiers for offload.  So I humbly agree that 2 APIs may be
acceptable here.


Re: Let's do P4

2016-10-29 Thread Jiri Pirko
Sat, Oct 29, 2016 at 04:49:03PM CEST, kubak...@wp.pl wrote:
>On Sat, 29 Oct 2016 09:53:28 +0200, Jiri Pirko wrote:
>> Hi all.
>> 
>> The network world is divided into 2 general types of hw:
>> 1) network ASICs - network specific silicon, containing things like TCAM
>>These ASICs are suitable to be programmed by P4.
>> 2) network processors - basically a general purpose CPUs
>>These processors are suitable to be programmed by eBPF.
>> 
>> I believe that by now, the most people came to a conclusion that it is
>> very difficult to handle both types by either P4 or eBPF. And since
>> eBPF is part of the kernel, I would like to introduce P4 into kernel
>> as well. Here's a plan:
>> 
>> 1) Define P4 intermediate representation
>>I cannot imagine loading P4 program (c-like syntax text file) into
>>kernel as is. That means that as the first step, we need find some
>>intermediate representation. I can imagine something in a form of AST,
>>call it "p4ast". I don't really know how to do this exactly though,
>>it's just an idea.
>> 
>>In the end there would be a userspace precompiler for this:
>>$ makep4ast example.p4 example.ast
>
>Maybe stating the obvious, but IMHO defining the IR is the hardest part.
>eBPF *is* the IR, we can compile C, P4 or even JIT Lua to eBPF.  The
>AST/IR for switch pipelines should allow for similar flexibility.
>Looser coupling would also protect us from changes in spec of the high
>level language.

Agreed. I agree with your point that it would be nice to have this done in a
generic way. However, I'm not aware of any other language similar to p4.



Re: Let's do P4

2016-10-29 Thread Jiri Pirko
Sat, Oct 29, 2016 at 04:54:21PM CEST, kubak...@wp.pl wrote:
>On Sat, 29 Oct 2016 15:58:55 +0200, Jiri Pirko wrote:
>> Sat, Oct 29, 2016 at 02:09:32PM CEST, tg...@suug.ch wrote:
>> >On 10/29/16 at 01:28pm, Jiri Pirko wrote:  
>> >> Sat, Oct 29, 2016 at 01:15:48PM CEST, tg...@suug.ch wrote:  
>> >> >So given the SKIP_SW flag, the in-kernel compiler is optional anyway.
>> >> >Why even risk including a possibly incomplete compiler? Older kernels
>> >> >must be capable of running along newer hardware as long as eBPF can
>> >> >represent the software path. Having to upgrade to latest and greatest
>> >> >kernels is not an option for most people so they would simply have to
>> >> >fall back to SKIP_SW and do it in user space anyway.  
>> >> 
>> >> The thing is, if we need to offload something, it needs to be
>> >> implemented in kernel first. Also, I believe that it is good to have
>> >> in-kernel p4 engine for testing and development purposes.  
>> >
>> >You lost me now :-) In an earlier email you said:
>> >  
>> >> It can be the other way around. The p4>ebpf compiler won't be complete
>> >> at the beginning so it is possible that HW could provide more features.
>> >> I don't think it is a problem. With SKIP_SW and SKIP_HW flags in TC,
>> >> the user can set different program to each. I think in real life, that
>> >> would be the most common case anyway.  
>> >
>> >If you allow to SKIP_SW and set different programs each to address
>> >this, then how is this any different.
>> >
>> >I completely agree that kernel must be able to provide the same
>> >functionality as HW with optional additional capabilities on top so
>> >the HW can always bail out and punt to SW.
>> >
>> >[...]
>> >  
>> >> >I'm not seeing how either of them is more or less variable. The main
>> >> >difference is whether to require configuring a single cls with both
>> >> >p4ast + bpf or two separate cls, one for each. I'd prefer the single
>> >> >cls approach simply because it is cleaner with regard to offload
>> >> >directly off bpf vs off p4ast.  
>> >> 
>> >> That's the bundle that you asked me to forget earlier in this email? :)  
>> >
>> >I thought you referred to the "store in same object file" as bundle.
>> >I don't really care about that. What I care about is a single way to
>> >configure this that works for both ASIC and non-ASIC hardware.
>> >  
>> >> >My main point is to not include a IR to eBPF compiler in the kernel
>> >> >and let user space handle this instead.  
>> >> 
>> >> If we do it as you describe, we would be using 2 different APIs for
>> >> offloaded and non-offloaded path. I don't believe it is acceptable as
>> >> the offloaded features has to have kernel implementation. Therefore, I
>> >> believe that p4ast as a kernel API is the only possible option.  
>> >
>> >Yes, the kernel has the SW implementation in eBPF. I thought that is
>> >what you propose as well. The only difference is whether to generate
>> >that eBPF in kernel or user space.
>> >
>> >Not sure I understand the multiple APIs point for offload vs
>> >non-offload. There is a single API: tc. Both models require the user
>> >to provide additional metadata to allow programming ASIC HW: p4ast
>> >IR or whatever we agree on.  
>> 
>> If you do p4>ebpf in userspace, you have 2 apis:
>> 1) to setup sw (in-kernel) p4 datapath, you push bpf.o to kernel
>> 2) to setup hw p4 datapath, you push program.p4ast to kernel
>> 
>> Those are 2 apis. Both wrapped up by TC, but still 2 apis.
>> 
>> What I believe is correct is to have one api:
>> 1) to setup sw (in-kernel) p4 datapath, you push program.p4ast to kernel
>> 2) to setup hw p4 datapath, you push program.p4ast to kernel
>> 
>> In case of 1), the program.p4ast will be either interpreted by new p4
>> interpreter, or translated to bpf and interpreted by that. But this
>> translation code is part of kernel.
>
>Option 3) use a well structured subset of eBPF as user space ABI ;)

:( That would not be nice I believe. Also confusing and hard to
maintain. Plus we would have to do 2 translations, in between
incompatible paradigms.


>
>In all seriousness, user space already has to have some knowledge about
>the underlying hardware today with different vendors picking different
>TC classifiers for offload.  So I humbly agree that 2 APIs may be
>acceptable here.


Re: [PATCH v7 0/6] Add eBPF hooks for cgroups

2016-10-29 Thread Lorenzo Colitti
On Sat, Oct 29, 2016 at 3:24 PM, Alexei Starovoitov
 wrote:
> it could be solved by swapping the order of cgroup_bpf_run_filter()
> and NF_INET_POST_ROUTING in patch 5. It was proposed some time back, but
> the current patch, I think, is more symmetrical.
> cgroup+bpf runs after nf hook on rx and runs before it on tx.
> imo it's more consistent.

I guess what I was trying to say was: what does doing this filtering
in ip_output give you over running this from the netfilter hooks?
Doing this filtering in netfilter is much more general because there
can be complex rules both before and after the filtering is applied. I
hadn't thought of the scalability issue you note below though.

For accounting you probably want to run after the hooks, both for
ingress and for egress, because the hooks can do all sorts of stuff
like drop packets, change packet sizes, reroute them to different
interfaces, etc. Do you see use cases where you want to run before the
hooks?

> Regardless of this choice... are you going to backport cgroupv2 to
> android? Because this set is v2 only.

Certainly anything that can't easily be backported to, say,
android-4.4 is not really feasible in the short term. I don't think we
use network cgroups at all, so if v2 network cgroups can coexist with
v1 cgroups of other types (which what little I've read seems to
indicate) then that should be possible.

> yes. that's certainly doable, but sooner or later such approach will hit
> scalability issue when number of cgroups is large. Same issue we saw
> with cls_bpf and bpf_skb_under_cgroup(). Hence this patch set was needed
> that is centered around cgroups instead of hooks. Note, unlike, tc and nf
> there is no way to attach to a hook. The bpf program is attached to a cgroup.
> It's an important distinction vs everything that currently exists in the 
> stack.

Ah, I see. Out of curiosity, what was the first scaling limitation you
hit? eBPF program length? eBPF map size?


Re: [PATCH net 00/12] Mellanox 100G mlx5 fixes 2016-10-25

2016-10-29 Thread David Miller
From: Saeed Mahameed 
Date: Tue, 25 Oct 2016 18:36:23 +0300

> This series contains some bug fixes for the mlx5 core and mlx5e
> driver.

Series applied, thank you.


Re: [PATCH net] sctp: validate chunk len before actually using it

2016-10-29 Thread David Miller
From: Marcelo Ricardo Leitner 
Date: Tue, 25 Oct 2016 14:27:39 -0200

> Andrey Konovalov reported that KASAN detected that SCTP was using a slab
> beyond the boundaries. It was caused because when handling out of the
> blue packets in function sctp_sf_ootb() it was checking the chunk len
> only after already processing the first chunk, validating only for the
> 2nd and subsequent ones.
> 
> The fix is to just move the check upwards so it's also validated for the
> 1st chunk.
> 
> Reported-by: Andrey Konovalov 
> Tested-by: Andrey Konovalov 
> Signed-off-by: Marcelo Ricardo Leitner 
> ---
> 
> Hi. Please consider this to -stable too. Thanks

Applied and queued up for -stable, thanks!


Re: [PATCH v2 net-next] virtio-net: Update the mtu code to match virtio spec

2016-10-29 Thread David Miller
From: Aaron Conole 
Date: Tue, 25 Oct 2016 16:12:12 -0400

> The virtio committee recently ratified a change, VIRTIO-152, which
> defines the mtu field to be 'max' MTU, not simply desired MTU.
> 
> This commit brings the virtio-net device in compliance with VIRTIO-152.
> 
> Additionally, drop the max_mtu branch - it cannot be taken since the u16
> returned by virtio_cread16 will never exceed the initial value of
> max_mtu.
> 
> Signed-off-by: Aaron Conole 
> Acked-by: "Michael S. Tsirkin" 
> Acked-by: Jarod Wilson 
> ---
> Nothing code-wise has changed, but I've included the ACKs and fixed up the
> subject line.

Applied, thanks.


Re: [PATCH net] inet: Fix missing return value in inet6_hash

2016-10-29 Thread David Miller
From: Craig Gallek 
Date: Tue, 25 Oct 2016 18:08:49 -0400

> From: Craig Gallek 
> 
> As part of a series to implement faster SO_REUSEPORT lookups,
> commit 086c653f5862 ("sock: struct proto hash function may error")
> added return values to protocol hash functions and
> commit 496611d7b5ea ("inet: create IPv6-equivalent inet_hash function")
> implemented a new hash function for IPv6.  However, the latter does
> not respect the former's convention.
> 
> This properly propagates the hash errors in the IPv6 case.
> 
> Fixes: 496611d7b5ea ("inet: create IPv6-equivalent inet_hash function")
> Reported-by: Soheil Hassas Yeganeh 
> Signed-off-by: Craig Gallek 

Applied and queued up for -stable, thank you.


Re: Let's do P4

2016-10-29 Thread John Fastabend
On 16-10-29 07:49 AM, Jakub Kicinski wrote:
> On Sat, 29 Oct 2016 09:53:28 +0200, Jiri Pirko wrote:
>> Hi all.
>>
>> The network world is divided into 2 general types of hw:
>> 1) network ASICs - network specific silicon, containing things like TCAM
>>These ASICs are suitable to be programmed by P4.
>> 2) network processors - basically a general purpose CPUs
>>These processors are suitable to be programmed by eBPF.
>>
>> I believe that by now, the most people came to a conclusion that it is
>> very difficult to handle both types by either P4 or eBPF. And since
>> eBPF is part of the kernel, I would like to introduce P4 into kernel
>> as well. Here's a plan:
>>
>> 1) Define P4 intermediate representation
>>I cannot imagine loading P4 program (c-like syntax text file) into
>>kernel as is. That means that as the first step, we need find some
>>intermediate representation. I can imagine something in a form of AST,
>>call it "p4ast". I don't really know how to do this exactly though,
>>it's just an idea.
>>
>>In the end there would be a userspace precompiler for this:
>>$ makep4ast example.p4 example.ast
> 
> Maybe stating the obvious, but IMHO defining the IR is the hardest part.
> eBPF *is* the IR, we can compile C, P4 or even JIT Lua to eBPF.  The
> AST/IR for switch pipelines should allow for similar flexibility.
> Looser coupling would also protect us from changes in spec of the high
> level language.
> 

Jumping in the middle here. You managed to get an entire thread going
before I even woke up :)

The problem with eBPF as an IR is that in the universe of eBPF IR
programs the subset that can be offloaded onto a standard ASIC based
hardware (non NPU/FPGA/etc) is so small to be almost meaningless IMO.

I tried this for awhile and the result is users have to write very
targeted eBPF that they "know" will be pattern matched and pushed into
an ASIC. It can work but it's very fragile. When I did this I ended up
with an eBPF generator for deviceX and an eBPF generator for deviceY
each with a very specific pattern matching engine in the driver to
xlate ebpf-deviceX into its asic. Existing ASICs for example usually
support only one pipeline, only one parser (or require moving mountains
to change the parse via ucode), only one set of tables, and only one
deparser/serializer at the end to build the new packet. Next-gen pieces
may have some flexibility on the parser side.

There is an interesting resource allocation problem we have that could
be solved by p4 or devlink wherein we want to pre-allocate slices of
the TCAM for certain match types. I was planning on writing devlink code
for this because it's primarily done at initialization once.

I will note one nice thing about using eBPF however is that you have an
easy software emulation path via ebpf engine in kernel.

... And merging threads here with Jiri's email ...

> If you do p4>ebpf in userspace, you have 2 apis:
> 1) to setup sw (in-kernel) p4 datapath, you push bpf.o to kernel
> 2) to setup hw p4 datapath, you push program.p4ast to kernel
> 
> Those are 2 apis. Both wrapped up by TC, but still 2 apis.
> 
> What I believe is correct is to have one api:
> 1) to setup sw (in-kernel) p4 datapath, you push program.p4ast to kernel
> 2) to setup hw p4 datapath, you push program.p4ast to kernel
> 

Couple comments around this, first adding yet another IR in the kernel
and another JIT engine to map that IR on to eBPF or hardware vendor X
doesn't get me excited. It's really much easier to write these as backend
objects in LLVM. Not saying it can't be done just saying it is easier
in LLVM. Also we already have the LLVM code for P4 to LLVM-IR to eBPF.
In the end this would be a reasonably complex bit of code in
the kernel only for hardware offload. I have doubts that folks would
ever use it for software only cases. I'm happy to admit I'm wrong here
though.

So yes using llvm backends creates two paths a hardware mgmt and sw
path but in the hardware + software case typical on the edge the
orchestration and management planes have started to manage the hardware
and software as two blocks of logic for performance SLA logic. Even on
the edge it seems in most cases folks are selling SR-IOV ports and
can't fall back to software and charge for the port. But this is just
one use case; I suspect there are others where it does make sense.

> In case of 1), the program.p4ast will be either interpreted by new p4
> interpreter, or translated to bpf and interpreted by that. But this
> translation code is part of kernel.

Finally a couple historic bits. The Flow-API proposed in Ottawa was
mechanically generated from an original P4 draft. At the time I was
working fairly closely with both the hardware and compiler folks. If
there is interest we could use that as a base IR for hardware. It has
a simple mapping to/from the original P4 spec. The newer P4 specs are
significantly more complex by the way.

We also have an emulated path also auto-generated from compiler t

Re: net/dccp: warning in dccp_feat_clone_sp_val/__might_sleep

2016-10-29 Thread Andrey Konovalov
Hi Cong,

Tested with your patch, still getting a warning, though it's a little different:

[ cut here ]
WARNING: CPU: 1 PID: 3876 at kernel/sched/core.c:7724
__might_sleep+0x14c/0x1a0 kernel/sched/core.c:7719
do not call blocking ops when !TASK_RUNNING; state=1 set at
[] prepare_to_wait+0xbc/0x210
kernel/sched/wait.c:178
Modules linked in:
CPU: 1 PID: 3876 Comm: a.out Not tainted 4.9.0-rc2+ #325
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
 88006c2d7770 81b46914 88006c2d77e8 
 84052960  88006c2d77b8 8237
 41b58ab3 1e2c ed000d85aef9 84052960
Call Trace:
 [< inline >] __dump_stack lib/dump_stack.c:15
 [] dump_stack+0xb3/0x10f lib/dump_stack.c:51
 [] __warn+0x1a7/0x1f0 kernel/panic.c:550
 [] warn_slowpath_fmt+0xac/0xd0 kernel/panic.c:565
 [] __might_sleep+0x14c/0x1a0 kernel/sched/core.c:7719
 [< inline >] slab_pre_alloc_hook mm/slab.h:393
 [< inline >] slab_alloc_node mm/slub.c:2634
 [< inline >] slab_alloc mm/slub.c:2716
 [] kmem_cache_alloc_trace+0x1bb/0x270 mm/slub.c:2733
 [< inline >] kmalloc ./include/linux/slab.h:490
 [] dccp_feat_entry_new+0x182/0x2a0 net/dccp/feat.c:468
 [] dccp_feat_push_confirm+0x3a/0x270 net/dccp/feat.c:516
 [< inline >] dccp_feat_change_recv net/dccp/feat.c:1160
 [] dccp_feat_parse_options+0xb37/0x13d0 net/dccp/feat.c:1412
 [] dccp_parse_options+0x721/0x1010 net/dccp/options.c:128
 [] dccp_rcv_state_process+0x200/0x15b0 net/dccp/input.c:644
 [] dccp_v4_do_rcv+0xf4/0x1a0 net/dccp/ipv4.c:681
 [< inline >] sk_backlog_rcv ./include/net/sock.h:872
 [] __release_sock+0x126/0x3a0 net/core/sock.c:2044
 [] release_sock+0x59/0x1c0 net/core/sock.c:2502
 [< inline >] inet_wait_for_connect net/ipv4/af_inet.c:547
 [] __inet_stream_connect+0x5d2/0xbb0 net/ipv4/af_inet.c:617
 [] inet_stream_connect+0x55/0xa0 net/ipv4/af_inet.c:656
 [] SYSC_connect+0x244/0x2f0 net/socket.c:1533
 [] SyS_connect+0x24/0x30 net/socket.c:1514
 [] entry_SYSCALL_64_fastpath+0x1f/0xc2
arch/x86/entry/entry_64.S:209
---[ end trace c7e036cf4dc54077 ]---

Thanks!

On Sat, Oct 29, 2016 at 8:10 AM, Cong Wang  wrote:
> On Fri, Oct 28, 2016 at 5:40 PM, Andrey Konovalov  
> wrote:
>> Hi,
>>
>> I've got the following error report while running the syzkaller fuzzer:
>>
>> [ cut here ]
>> WARNING: CPU: 0 PID: 4608 at kernel/sched/core.c:7724
>> __might_sleep+0x14c/0x1a0 kernel/sched/core.c:7719
>> do not call blocking ops when !TASK_RUNNING; state=1 set at
>> [] prepare_to_wait+0xbc/0x210
>> kernel/sched/wait.c:178
>> Modules linked in:
>> CPU: 0 PID: 4608 Comm: syz-executor Not tainted 4.9.0-rc2+ #320
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>>  88006625f7a0 81b46914 88006625f818 
>>  84052960  88006625f7e8 8237
>>  88006aceac00 1e2c ed000cc4beff 84052960
>> Call Trace:
>>  [< inline >] __dump_stack lib/dump_stack.c:15
>>  [] dump_stack+0xb3/0x10f lib/dump_stack.c:51
>>  [] __warn+0x1a7/0x1f0 kernel/panic.c:550
>>  [] warn_slowpath_fmt+0xac/0xd0 kernel/panic.c:565
>>  [] __might_sleep+0x14c/0x1a0 kernel/sched/core.c:7719
>>  [< inline >] slab_pre_alloc_hook mm/slab.h:393
>>  [< inline >] slab_alloc_node mm/slub.c:2634
>>  [< inline >] slab_alloc mm/slub.c:2716
>>  [] __kmalloc_track_caller+0x150/0x2a0 mm/slub.c:4240
>>  [] kmemdup+0x24/0x50 mm/util.c:113
>>  [] dccp_feat_clone_sp_val.part.5+0x4f/0xe0
>> net/dccp/feat.c:374
>>  [< inline >] dccp_feat_clone_sp_val net/dccp/feat.c:1141
>>  [< inline >] dccp_feat_change_recv net/dccp/feat.c:1141
>>  [] dccp_feat_parse_options+0xaa1/0x13d0 
>> net/dccp/feat.c:1411
>>  [] dccp_parse_options+0x721/0x1010 net/dccp/options.c:128
>>  [] dccp_rcv_state_process+0x200/0x15b0 
>> net/dccp/input.c:644
>>  [] dccp_v4_do_rcv+0xf4/0x1a0 net/dccp/ipv4.c:681
>>  [< inline >] sk_backlog_rcv ./include/net/sock.h:872
>>  [] __release_sock+0x126/0x3a0 net/core/sock.c:2044
>>  [] release_sock+0x59/0x1c0 net/core/sock.c:2502
>>  [< inline >] inet_wait_for_connect net/ipv4/af_inet.c:547
>>  [] __inet_stream_connect+0x5d2/0xbb0 
>> net/ipv4/af_inet.c:617
>>  [] inet_stream_connect+0x55/0xa0 net/ipv4/af_inet.c:656
>>  [] SYSC_connect+0x244/0x2f0 net/socket.c:1533
>>  [] SyS_connect+0x24/0x30 net/socket.c:1514
>>  [] entry_SYSCALL_64_fastpath+0x1f/0xc2
>> arch/x86/entry/entry_64.S:209
>
> Should be fixed by the attached patch. I will verify it with your
> reproducer tomorrow.
>
> Thanks!


Re: [PATCH net 2/3] sctp: return back transport in __sctp_rcv_init_lookup

2016-10-29 Thread Xin Long
On Sat, Oct 29, 2016 at 5:39 AM, Marcelo Ricardo Leitner
 wrote:
> On Fri, Oct 28, 2016 at 05:42:21PM -0200, Marcelo Ricardo Leitner wrote:
>> On Fri, Oct 28, 2016 at 06:10:53PM +0800, Xin Long wrote:
>> > Prior to this patch, it used a local variable to save the transport that is
>> > looked up by __sctp_lookup_association(), and didn't return it back. But in
>> > sctp_rcv, it is used to initialize chunk->transport. So when hitting this
>> > code, it was initializing chunk->transport with some random stack value
>> > instead.
here should be:
So when hitting this, even if it found the transport, it was still initializing
chunk->transport with null instead.

>> >
>> > This patch is to return the transport back through transport pointer
>> > that is from __sctp_rcv_lookup_harder().
>> >
>> > Signed-off-by: Xin Long 
>>
>> Acked-by: Marcelo Ricardo Leitner 
>>
>> transport pointer in sctp_rcv() is initialized to null and there are
>> checks for it after this path, so this shouldn't be exploitable, just
>> malfunction.
>
> This actually sort of contradicts the changelog.
>
> Xin, did I miss something here? Seems we need to update the changelog if
> not.
>
You're right, thanks, will repost.


Re: net/dccp: warning in dccp_feat_clone_sp_val/__might_sleep

2016-10-29 Thread Eric Dumazet
On Sat, 2016-10-29 at 19:06 +0200, Andrey Konovalov wrote:
> Hi Cong,
> 
> Tested with your patch, still getting a warning, though it's a little 
> different:
> 
> [ cut here ]
> WARNING: CPU: 1 PID: 3876 at kernel/sched/core.c:7724
> __might_sleep+0x14c/0x1a0 kernel/sched/core.c:7719
> do not call blocking ops when !TASK_RUNNING; state=1 set at
> [] prepare_to_wait+0xbc/0x210
> kernel/sched/wait.c:178
> Modules linked in:

This looks like the following patch is needed, can you test it ?
Thanks !

diff --git a/net/dccp/output.c b/net/dccp/output.c
index b66c84db0766..74d8583a0d52 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -228,6 +228,7 @@ static int dccp_wait_for_ccid(struct sock *sk, unsigned 
long delay)
 
remaining = schedule_timeout(delay);
 
+   sched_annotate_sleep();
lock_sock(sk);
sk->sk_write_pending--;
finish_wait(sk_sleep(sk), &wait);





Re: net/dccp: warning in dccp_feat_clone_sp_val/__might_sleep

2016-10-29 Thread Andrey Konovalov
Hi Eric,

Tested with both patches applied, still seeing the warning.

Thanks!

On Sat, Oct 29, 2016 at 7:43 PM, Eric Dumazet  wrote:
> On Sat, 2016-10-29 at 19:06 +0200, Andrey Konovalov wrote:
>> Hi Cong,
>>
>> Tested with your patch, still getting a warning, though it's a little 
>> different:
>>
>> [ cut here ]
>> WARNING: CPU: 1 PID: 3876 at kernel/sched/core.c:7724
>> __might_sleep+0x14c/0x1a0 kernel/sched/core.c:7719
>> do not call blocking ops when !TASK_RUNNING; state=1 set at
>> [] prepare_to_wait+0xbc/0x210
>> kernel/sched/wait.c:178
>> Modules linked in:
>
> This looks like the following patch is needed, can you test it ?
> Thanks !
>
> diff --git a/net/dccp/output.c b/net/dccp/output.c
> index b66c84db0766..74d8583a0d52 100644
> --- a/net/dccp/output.c
> +++ b/net/dccp/output.c
> @@ -228,6 +228,7 @@ static int dccp_wait_for_ccid(struct sock *sk, unsigned 
> long delay)
>
> remaining = schedule_timeout(delay);
>
> +   sched_annotate_sleep();
> lock_sock(sk);
> sk->sk_write_pending--;
> finish_wait(sk_sleep(sk), &wait);
>
>
>


[PATCH net] net: mangle zero checksum in skb_checksum_help()

2016-10-29 Thread Eric Dumazet
From: Eric Dumazet 

Sending zero checksum is ok for TCP, but not for UDP.

UDPv6 receiver should by default drop a frame with a 0 checksum,
and UDPv4 would not verify the checksum and might accept a corrupted
packet.

Simply replace such checksum by 0xffff, regardless of transport.

This error was caught on SIT tunnels, but seems generic.

Signed-off-by: Eric Dumazet 
Cc: Maciej Żenczykowski 
Cc: Willem de Bruijn 
---
 net/core/dev.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index dbc871306910..899a0f00e721 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2484,7 +2484,7 @@ int skb_checksum_help(struct sk_buff *skb)
goto out;
}
 
-   *(__sum16 *)(skb->data + offset) = csum_fold(csum);
+   *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
 out_set_summed:
skb->ip_summed = CHECKSUM_NONE;
 out:




Re: net/dccp: warning in dccp_feat_clone_sp_val/__might_sleep

2016-10-29 Thread Eric Dumazet
On Sat, 2016-10-29 at 19:59 +0200, Andrey Konovalov wrote:
> Hi Eric,
> 
> Tested with both patches applied, still seeing the warning.
> 
> Thanks!

Arg, sorry, this was at the wrong place.

Thanks for testing !

diff --git a/net/dccp/output.c b/net/dccp/output.c
index b66c84db0766..2548edff86ff 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -224,6 +224,11 @@ static int dccp_wait_for_ccid(struct sock *sk, unsigned 
long delay)
 
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
sk->sk_write_pending++;
+
+   /* release_sock()/lock_sock() will process socket backlog
+* from process context. Be prepared to sleep !
+*/
+   sched_annotate_sleep();
release_sock(sk);
 
remaining = schedule_timeout(delay);




Re: [PATCH net] net: mangle zero checksum in skb_checksum_help()

2016-10-29 Thread Maciej Żenczykowski
Acked-by: Maciej Żenczykowski 


Re: [PATCH net-next V3 0/9] liquidio CN23XX VF support

2016-10-29 Thread David Miller
From: Raghu Vatsavayi 
Date: Tue, 25 Oct 2016 17:57:01 -0700

> 2) As recommended by you removed custom module parameter max_vfs.

I feel the same way about num_queues_per_{p,v}f.

What's really strange is that there is a reference to num_queues_per_pf
in one of the kernel log messages already.


Re: [PATCH net] bpf: fix samples to add fake KBUILD_MODNAME

2016-10-29 Thread David Miller
From: Daniel Borkmann 
Date: Wed, 26 Oct 2016 00:37:53 +0200

> Some of the sample files are causing issues when they are loaded with tc
> and cls_bpf, meaning tc bails out while trying to parse the resulting ELF
> file as program/map/etc sections are not present, which can be easily
> spotted with readelf(1).
> 
> Currently, BPF samples are including some of the kernel headers and mid
> term we should change them to refrain from this, really. When dynamic
> debugging is enabled, we bail out due to undeclared KBUILD_MODNAME, which
> is easily overlooked in the build as clang spills this along with other
> noisy warnings from various header includes, and llc still generates an
> ELF file with mentioned characteristics. For just playing around with BPF
> examples, this can be a bit of a hurdle to take.
> 
> Just add a fake KBUILD_MODNAME as a band-aid to fix the issue, same is
> done in xdp*_kern samples already.
> 
> Fixes: 65d472fb007d ("samples/bpf: add 'pointer to packet' tests")
> Fixes: 6afb1e28b859 ("samples/bpf: Add tunnel set/get tests.")
> Fixes: a3f74617340b ("cgroup: bpf: Add an example to do cgroup checking in 
> BPF")
> Reported-by: Chandrasekar Kannan 
> Signed-off-by: Daniel Borkmann 

Applied.


Re: [PATCH] ip6_tunnel: Update skb->protocol to ETH_P_IPV6 in ip6_tnl_xmit()

2016-10-29 Thread David Miller
From: Eli Cooper 
Date: Wed, 26 Oct 2016 10:11:09 +0800

> This patch updates skb->protocol to ETH_P_IPV6 in ip6_tnl_xmit() when an
> IPv6 header is installed to a socket buffer.
> 
> This is not a cosmetic change.  Without updating this value, GSO packets
> transmitted through an ipip6 tunnel have the protocol of ETH_P_IP and
> skb_mac_gso_segment() will attempt to call gso_segment() for IPv4,
> which results in the packets being dropped.
> 
> Fixes: b8921ca83eed ("ip4ip6: Support for GSO/GRO")
> Signed-off-by: Eli Cooper 

Applied and queued up for -stable, thank you.


Re: [PATCH] [net-next] net: ip, diag: include net/inet_sock.h

2016-10-29 Thread David Miller
From: Arnd Bergmann 
Date: Tue, 25 Oct 2016 17:53:22 +0200

> The newly added raw_diag.c fails to build in some configurations
> unless we include this header:
> 
> In file included from net/ipv4/raw_diag.c:6:0:
> include/net/raw.h:71:21: error: field 'inet' has incomplete type
> net/ipv4/raw_diag.c: In function 'raw_diag_dump':
> net/ipv4/raw_diag.c:166:29: error: implicit declaration of function 'inet_sk'
> 
> Fixes: 432490f9d455 ("net: ip, diag -- Add diag interface for raw sockets")
> Signed-off-by: Arnd Bergmann 

Applied, thanks Arnd.


Re: [PATCH net] cxgb4: Fix error handling in alloc_uld_rxqs().

2016-10-29 Thread David Miller
From: Ganesh Goudar 
Date: Wed, 26 Oct 2016 13:26:38 +0530

> Fix to release resources properly in error handling path of
> alloc_uld_rxqs(). This patch also removes unwanted arguments
> and avoids calling the same function twice.
> 
> Fixes: 94cdb8bb993a (cxgb4: Add support for dynamic allocation
>of resources for ULD
> Signed-off-by: Ganesh Goudar 

Applied.


Re: [PATCH net-next] netlink: Add nla_memdup() to wrap kmemdup() use on nlattr

2016-10-29 Thread David Miller
From: Thomas Graf 
Date: Wed, 26 Oct 2016 10:53:16 +0200

> Wrap several common instances of:
>   kmemdup(nla_data(attr), nla_len(attr), GFP_KERNEL);
> 
> Signed-off-by: Thomas Graf 

Applied.


Re: [PATCH net-next] switchdev: Remove redundant variable

2016-10-29 Thread David Miller
From: ido...@idosch.org
Date: Wed, 26 Oct 2016 12:03:03 +0300

> From: Ido Schimmel 
> 
> Instead of storing return value in 'err' and returning, just return
> directly.
> 
> Signed-off-by: Ido Schimmel 
> Reviewed-by: Jiri Pirko 

Applied.


Re: [PATCH] net_sched actions: use nla_parse_nested()

2016-10-29 Thread David Miller
From: Johannes Berg 
Date: Wed, 26 Oct 2016 14:44:33 +0200

> From: Johannes Berg 
> 
> Use nla_parse_nested instead of open-coding the call to
> nla_parse() with the attribute data/len.
> 
> Signed-off-by: Johannes Berg 

Applied, thanks.


Re: [PATCH net v2] packet: on direct_xmit, limit tso and csum to supported devices

2016-10-29 Thread David Miller
From: Willem de Bruijn 
Date: Wed, 26 Oct 2016 11:23:07 -0400

> From: Willem de Bruijn 
> 
> When transmitting on a packet socket with PACKET_VNET_HDR and
> PACKET_QDISC_BYPASS, validate device support for features requested
> in vnet_hdr.
> 
> Drop TSO packets sent to devices that do not support TSO or have the
> feature disabled. Note that the latter currently do process those
> packets correctly, regardless of not advertising the feature.
> 
> Because of SKB_GSO_DODGY, it is not sufficient to test device features
> with netif_needs_gso. Full validate_xmit_skb is needed.
> 
> Switch to software checksum for non-TSO packets that request checksum
> offload if that device feature is unsupported or disabled. Note that
> similar to the TSO case, device drivers may perform checksum offload
> correctly even when not advertising it.
> 
> When switching to software checksum, packets hit skb_checksum_help,
> which has two BUG_ON checksum not in linear segment. Packet sockets
> always allocate at least up to csum_start + csum_off + 2 as linear.
> 
> Tested by running github.com/wdebruij/kerneltools/psock_txring_vnet.c
> 
>   ethtool -K eth0 tso off tx on
>   psock_txring_vnet -d $dst -s $src -i eth0 -l 2000 -n 1 -q -v
>   psock_txring_vnet -d $dst -s $src -i eth0 -l 2000 -n 1 -q -v -N
> 
>   ethtool -K eth0 tx off
>   psock_txring_vnet -d $dst -s $src -i eth0 -l 1000 -n 1 -q -v -G
>   psock_txring_vnet -d $dst -s $src -i eth0 -l 1000 -n 1 -q -v -G -N
> 
> v2:
>   - add EXPORT_SYMBOL_GPL(validate_xmit_skb_list)
> 
> Fixes: d346a3fae3ff ("packet: introduce PACKET_QDISC_BYPASS socket option")
> Signed-off-by: Willem de Bruijn 

Applied and queued up for -stable.


[net-next:master 68/93] include/net/raw.h:71:21: error: field 'inet' has incomplete type

2016-10-29 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   c778453b138889587eee23e246e231bb12d1e80d
commit: 432490f9d455fb842d70219f22d9d2c812371676 [68/93] net: ip, diag -- Add 
diag interface for raw sockets
config: i386-randconfig-x012-201644 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
git checkout 432490f9d455fb842d70219f22d9d2c812371676
# save the attached .config to linux build tree
make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   In file included from net/ipv4/raw_diag.c:6:0:
>> include/net/raw.h:71:21: error: field 'inet' has incomplete type
 struct inet_sock   inet;
^~~~
   net/ipv4/raw_diag.c: In function 'raw_diag_dump':
>> net/ipv4/raw_diag.c:166:29: error: implicit declaration of function 
>> 'inet_sk' [-Werror=implicit-function-declaration]
   struct inet_sock *inet = inet_sk(sk);
^~~
>> net/ipv4/raw_diag.c:166:29: warning: initialization makes pointer from 
>> integer without a cast [-Wint-conversion]
>> net/ipv4/raw_diag.c:174:33: error: dereferencing pointer to incomplete type 
>> 'struct inet_sock'
   if (r->id.idiag_sport != inet->inet_sport &&
^~
   cc1: some warnings being treated as errors

vim +/inet +71 include/net/raw.h

20380731 Arnaldo Carvalho de Melo 2005-08-16  65  
086c653f Craig Gallek 2016-02-10  66  int raw_hash_sk(struct sock 
*sk);
fc8717ba Pavel Emelyanov  2008-03-22  67  void raw_unhash_sk(struct 
sock *sk);
65b4c50b Pavel Emelyanov  2007-11-19  68  
f74e49b5 Patrick McHardy  2010-04-13  69  struct raw_sock {
f74e49b5 Patrick McHardy  2010-04-13  70/* inet_sock has to be 
the first member */
f74e49b5 Patrick McHardy  2010-04-13 @71struct inet_sock   inet;
f74e49b5 Patrick McHardy  2010-04-13  72struct icmp_filter 
filter;
f0ad0860 Patrick McHardy  2010-04-13  73u32
ipmr_table;
f74e49b5 Patrick McHardy  2010-04-13  74  };

:: The code at line 71 was first introduced by commit
:: f74e49b5613206fb18468bdc9509a1db746aa01b ipv4: raw: move struct raw_sock 
and raw_sk() to include/net/raw.h

:: TO: Patrick McHardy 
:: CC: David S. Miller 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH net] Revert "hv_netvsc: report vmbus name in ethtool"

2016-10-29 Thread David Miller
From: Stephen Hemminger 
Date: Wed, 26 Oct 2016 09:27:53 -0700

> From: Stephen Hemminger 
> 
> This reverts commit e3f74b841d48
> ("hv_netvsc: report vmbus name in ethtool")'
> because of problem introduced by commit f9a56e5d6a0ba
> ("Drivers: hv: make VMBus bus ids persistent").
> This changed the format of the vmbus name and this new format is too
> long to fit in the bus_info field of ethtool.
> 
> Signed-off-by: Stephen Hemminger 

Applied.


Re: [PATCH 0/3] pull request for net: batman-adv 2016-10-26

2016-10-29 Thread David Miller
From: Simon Wunderlich 
Date: Wed, 26 Oct 2016 17:55:12 +0200

> here are some bugfix patches which we would like to have integrated
> into net.
> 
> Please pull or let me know of any problem!

Pulled, thanks Simon.


Re: [PATCH net-next] net: phy: at803x: Add a definition for PHY ID mask

2016-10-29 Thread David Miller
From: Fabio Estevam 
Date: Wed, 26 Oct 2016 14:03:54 -0200

> Add a definition for PHY ID mask for improving code readability.
> 
> Signed-off-by: Fabio Estevam 

Applied.


Re: [PATCH net-next] tcp/dccp: drop SYN packets if accept queue is full

2016-10-29 Thread David Miller
From: Eric Dumazet 
Date: Wed, 26 Oct 2016 09:27:57 -0700

> From: Eric Dumazet 
> 
> Per listen(fd, backlog) rules, there is really no point accepting a SYN,
> sending a SYNACK, and dropping the following ACK packet if accept queue
> is full, because application is not draining accept queue fast enough.
> 
> This behavior is fooling TCP clients that believe they established a
> flow, while there is nothing at server side. They might then send about
> 10 MSS (if using IW10) that will be dropped anyway while server is under
> stress.
> 
> Signed-off-by: Eric Dumazet 

Applied.


Re: [PATCH next] flow_dissector: __skb_get_hash_symmetric arg can be const

2016-10-29 Thread David Miller
From: Florian Westphal 
Date: Wed, 26 Oct 2016 18:49:46 +0200

> Signed-off-by: Florian Westphal 

Applied, thanks Florian.


Re: [PATCH] net: mv643xx_eth: Fetch the phy connection type from DT

2016-10-29 Thread David Miller
From: Jason Gunthorpe 
Date: Wed, 26 Oct 2016 11:47:02 -0600

> The MAC is capable of RGMII mode and that is probably a more typical
> connection type than GMII today (eg it is used by Marvell Reference
> designs for several SOCs). Let DT users specify the standard
> 
>phy-connection-type = "rgmii-id";
> 
> On a phy node.
> 
> Signed-off-by: Jason Gunthorpe 

Applied.


Re: [PATCH net] ibmvnic: Fix releasing of sub-CRQ IRQs in interrupt context

2016-10-29 Thread David Miller
From: Thomas Falcon 
Date: Wed, 26 Oct 2016 13:57:38 -0500

> Schedule these XPORT event tasks in the shared workqueue
> so that IRQs are not freed in an interrupt context when
> sub-CRQs are released.
> 
> Signed-off-by: Thomas Falcon 

Applied.


[PATCH] unix: escape all null bytes in abstract unix domain socket

2016-10-29 Thread Isaac Boukris
Abstract unix domain socket may embed null characters,
these should be translated to '@' when printed out to
proc the same way the null prefix is currently being
translated.

This helps for tools such as netstat, lsof and the proc
based implementation in ss to show all the significant
bytes of the name (instead of getting cut at the first
null occurrence).

Signed-off-by: Isaac Boukris 
---
 net/unix/af_unix.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 145082e..9250b03 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2805,14 +2805,19 @@ static int unix_seq_show(struct seq_file *seq, void *v)
 
i = 0;
len = u->addr->len - sizeof(short);
-   if (!UNIX_ABSTRACT(s))
+   if (!UNIX_ABSTRACT(s)) {
len--;
-   else {
+   for ( ; i < len; i++)
+   seq_putc(seq,
+u->addr->name->sun_path[i]);
+   } else {
seq_putc(seq, '@');
i++;
+   for ( ; i < len; i++)
+   seq_putc(seq,
+u->addr->name->sun_path[i] ?:
+'@');
}
-   for ( ; i < len; i++)
-   seq_putc(seq, u->addr->name->sun_path[i]);
}
unix_state_unlock(s);
seq_putc(seq, '\n');
-- 
2.7.4



[PATCH] iproute2: ss: escape all null bytes in abstract unix domain socket

2016-10-29 Thread Isaac Boukris
Abstract unix domain socket may embed null characters,
these should be translated to '@' when printed by ss the
same way the null prefix is currently being translated.

Signed-off-by: Isaac Boukris 
---
 misc/ss.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/misc/ss.c b/misc/ss.c
index dd77b81..0e28998 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -2895,7 +2895,9 @@ static int unix_show_sock(const struct sockaddr_nl *addr, 
struct nlmsghdr *nlh,
memcpy(name, RTA_DATA(tb[UNIX_DIAG_NAME]), len);
name[len] = '\0';
if (name[0] == '\0')
-   name[0] = '@';
+   for (int i = 0; i < len; i++)
+   if (name[i] == '\0')
+   name[i] = '@';
stat.name = &name[0];
memcpy(stat.local.data, &stat.name, sizeof(stat.name));
}
-- 
2.7.4



Re: iproute: ss truncates abstract unix domain socket embedding null

2016-10-29 Thread Isaac Boukris
On Thu, Oct 27, 2016 at 1:22 AM, Stephen Hemminger
 wrote:
> Just translating all null characters to @ seems the most consistent and
> logical. Also translating other all non-printing characters to something 
> (maybe '?')
> might be wise. It would be nice if all utilities output the same thing.

I've sent two patches to netdev with which all of ss, netstat and lsof
would translate all null bytes the same way.
I left out the other non-printable characters as I wasn't as confident about it.

Thanks.


Re: pull-request: mac80211 2016-10-27

2016-10-29 Thread David Miller
From: Johannes Berg 
Date: Thu, 27 Oct 2016 10:05:49 +0200

> Before I go off for LPC and the workshop and everything, I have
> two fixes. Neither is very important, but I figured I'd get them
> out since I won't have any others soon.
> 
> Let me know if there's any problem.

Pulled, thanks.


Re: [PATCH net-next] bpf: Print function name in addition to function id

2016-10-29 Thread David Miller
From: Thomas Graf 
Date: Thu, 27 Oct 2016 11:23:51 +0200

> The verifier currently prints raw function ids when printing CALL
> instructions or when complaining:
> 
>   5: (85) call 23
>   unknown func 23
> 
> print a meaningful function name instead:
> 
>   5: (85) call bpf_redirect#23
>   unknown func bpf_redirect#23
> 
> Moves the function documentation to a single comment and renames all
> helpers names in the list to conform to the bpf_ prefix notation so
> they can be grepped in the kernel source.
> 
> Signed-off-by: Thomas Graf 
> Acked-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Applied.


Re: [PATCH RDS v1] rds: debug messages are enabled by default

2016-10-29 Thread David Miller
From: shamir.rabinovi...@oracle.com
Date: Thu, 27 Oct 2016 05:46:38 -0400

> From: shamir rabinovitch 
> 
> rds uses a Kconfig option called "RDS_DEBUG" to enable rds debug messages.
> This option causes the rds Makefile to add -DDEBUG to the rds gcc command
> line.
> 
> When CONFIG_DYNAMIC_DEBUG is enabled, the "DEBUG" macro is used by
> include/linux/dynamic_debug.h to decide if dynamic debug prints should
> be sent by default to the kernel log.
> 
> rds should not enable this macro for production builds. rds dynamic
> debug works as expected following this fix.
> 
> Signed-off-by: Shamir Rabinovitch 

Applied.


[PATCH 15/15] solos-pci: use permission-specific DEVICE_ATTR variants

2016-10-29 Thread Julia Lawall
Use DEVICE_ATTR_RW for read-write attributes.  This simplifies the
source code, improves readability, and reduces the chance of
inconsistencies.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// 
@rw@
declarer name DEVICE_ATTR;
identifier x,x_show,x_store;
@@

DEVICE_ATTR(x, \(0644\|S_IRUGO|S_IWUSR\), x_show, x_store);

@script:ocaml@
x << rw.x;
x_show << rw.x_show;
x_store << rw.x_store;
@@

if not (x^"_show" = x_show && x^"_store" = x_store)
then Coccilib.include_match false

@@
declarer name DEVICE_ATTR_RW;
identifier rw.x,rw.x_show,rw.x_store;
@@

- DEVICE_ATTR(x, \(0644\|S_IRUGO|S_IWUSR\), x_show, x_store);
+ DEVICE_ATTR_RW(x);
// 

Signed-off-by: Julia Lawall 

---
 drivers/atm/solos-pci.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/atm/solos-pci.c b/drivers/atm/solos-pci.c
index 6ac2b2b..5ad037c 100644
--- a/drivers/atm/solos-pci.c
+++ b/drivers/atm/solos-pci.c
@@ -584,7 +584,7 @@ static ssize_t hardware_show(struct device *dev, struct 
device_attribute *attr,
return sprintf(buf, "%d\n", data32);
 }
 
-static DEVICE_ATTR(console, 0644, console_show, console_store);
+static DEVICE_ATTR_RW(console);
 
 
 #define SOLOS_ATTR_RO(x) static DEVICE_ATTR(x, 0444, solos_param_show, NULL);



[PATCH 12/15] ptp: use permission-specific DEVICE_ATTR variants

2016-10-29 Thread Julia Lawall
Use DEVICE_ATTR_RO for read only attributes.  This simplifies the
source code, improves readability, and reduces the chance of
inconsistencies.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// 
@ro@
declarer name DEVICE_ATTR;
identifier x,x_show;
@@

DEVICE_ATTR(x, \(0444\|S_IRUGO\), x_show, NULL);

@script:ocaml@
x << ro.x;
x_show << ro.x_show;
@@

if not (x^"_show" = x_show) then Coccilib.include_match false

@@
declarer name DEVICE_ATTR_RO;
identifier ro.x,ro.x_show;
@@

- DEVICE_ATTR(x, \(0444\|S_IRUGO\), x_show, NULL);
+ DEVICE_ATTR_RO(x);
// 

Signed-off-by: Julia Lawall 

---
 drivers/ptp/ptp_sysfs.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c
index 302e626..53d4395 100644
--- a/drivers/ptp/ptp_sysfs.c
+++ b/drivers/ptp/ptp_sysfs.c
@@ -28,7 +28,7 @@ static ssize_t clock_name_show(struct device *dev,
struct ptp_clock *ptp = dev_get_drvdata(dev);
return snprintf(page, PAGE_SIZE-1, "%s\n", ptp->info->name);
 }
-static DEVICE_ATTR(clock_name, 0444, clock_name_show, NULL);
+static DEVICE_ATTR_RO(clock_name);
 
 #define PTP_SHOW_INT(name, var)
\
 static ssize_t var##_show(struct device *dev,  \



[PATCH 00/15] use permission-specific DEVICE_ATTR variants

2016-10-29 Thread Julia Lawall

Use DEVICE_ATTR_RO etc. for read only attributes etc.  This simplifies the
source code, improves readability, and reduces the chance of
inconsistencies.

The complete semantic patch is as follows:
(http://coccinelle.lip6.fr/)

// 
@ro@
declarer name DEVICE_ATTR;
identifier x,x_show;
@@

DEVICE_ATTR(x, \(0444\|S_IRUGO\), x_show, NULL);

@wo@
declarer name DEVICE_ATTR;
identifier x,x_store;
@@

DEVICE_ATTR(x, \(0200\|S_IWUSR\), NULL, x_store);

@rw@
declarer name DEVICE_ATTR;
identifier x,x_show,x_store;
@@

DEVICE_ATTR(x, \(0644\|S_IRUGO|S_IWUSR\), x_show, x_store);

@script:ocaml@
x << ro.x;
x_show << ro.x_show;
@@

if not (x^"_show" = x_show) then Coccilib.include_match false

@script:ocaml@
x << wo.x;
x_store << wo.x_store;
@@

if not (x^"_store" = x_store) then Coccilib.include_match false

@script:ocaml@
x << rw.x;
x_show << rw.x_show;
x_store << rw.x_store;
@@

if not (x^"_show" = x_show && x^"_store" = x_store)
then Coccilib.include_match false

@@
declarer name DEVICE_ATTR_RO;
identifier ro.x,ro.x_show;
@@

- DEVICE_ATTR(x, \(0444\|S_IRUGO\), x_show, NULL);
+ DEVICE_ATTR_RO(x);

@@
declarer name DEVICE_ATTR_WO;
identifier wo.x,wo.x_store;
@@

- DEVICE_ATTR(x, \(0200\|S_IWUSR\), NULL, x_store);
+ DEVICE_ATTR_WO(x);

@@
declarer name DEVICE_ATTR_RW;
identifier rw.x,rw.x_show,rw.x_store;
@@

- DEVICE_ATTR(x, \(0644\|S_IRUGO|S_IWUSR\), x_show, x_store);
+ DEVICE_ATTR_RW(x);
// 

---

 arch/mips/txx9/generic/7segled.c  |4 ++--
 arch/powerpc/kernel/iommu.c   |3 +--
 arch/tile/kernel/sysfs.c  |   14 +++---
 drivers/atm/solos-pci.c   |2 +-
 drivers/pci/pcie/aspm.c   |4 ++--
 drivers/power/supply/wm8350_power.c   |2 +-
 drivers/ptp/ptp_sysfs.c   |2 +-
 drivers/thermal/int340x_thermal/int3400_thermal.c |2 +-
 drivers/thermal/thermal_hwmon.c   |2 +-
 drivers/tty/nozomi.c  |4 ++--
 drivers/usb/wusbcore/dev-sysfs.c  |6 +++---
 drivers/usb/wusbcore/wusbhc.c |   13 +
 drivers/video/fbdev/wm8505fb.c|2 +-
 sound/soc/omap/mcbsp.c|4 ++--
 sound/soc/soc-dapm.c  |2 +-
 15 files changed, 31 insertions(+), 35 deletions(-)


[PATCH net-next] firewire: net: really fix maximum possible MTU

2016-10-29 Thread Stefan Richter
The maximum unicast datagram size /without/ link fragmentation is
4096 - 4 = 4092 (max IEEE 1394 async payload size at >= S800 bus speed,
minus unfragmented encapsulation header).  Max broadcast datagram size
without fragmentation is 8 bytes less than that (due to GASP header).

The maximum datagram size /with/ link fragmentation is 0xfff = 4095
for unicast and broadcast.  This is because the RFC 2734 fragment
encapsulation header field for datagram size is only 12 bits wide.

Fixes: 5d48f00d836a('firewire: net: fix maximum possible MTU')
Signed-off-by: Stefan Richter 
---
 drivers/firewire/net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c
index 03715e7d9d92..363fc5ec1a4e 100644
--- a/drivers/firewire/net.c
+++ b/drivers/firewire/net.c
@@ -1465,7 +1465,7 @@ static int fwnet_probe(struct fw_unit *unit,
 
net->mtu = 1500U;
net->min_mtu = ETH_MIN_MTU;
-   net->max_mtu = ETH_MAX_MTU;
+   net->max_mtu = 0xfff;
 
/* Set our hardware address while we're at it */
ha = (union fwnet_hwaddr *)net->dev_addr;
-- 
Stefan Richter
-==- =-=- ===-=
http://arcgraph.de/sr/


Re: [net-next 00/17][pull request] 40GbE Intel Wired LAN Driver Updates 2016-10-28

2016-10-29 Thread David Miller
From: Jeff Kirsher 
Date: Sat, 29 Oct 2016 00:30:41 -0700

> This series contains updates to i40e and i40evf only.

Pulled, thanks Jeff.


Re: [PATCH net 00/10] mlx4 misc fixes for 4.9

2016-10-29 Thread David Miller
From: Tariq Toukan 
Date: Thu, 27 Oct 2016 16:27:12 +0300

> This patchset contains several bug fixes from the team to the
> mlx4 Eth and Core drivers.
> 
> Series generated against net commit:
> ecc515d7238f 'sctp: fix the panic caused by route update'

Series applied, thanks Tariq.


Re: [PATCH 00/17] pull request for net-next: batman-adv 2016-10-27

2016-10-29 Thread David Miller
From: Simon Wunderlich 
Date: Thu, 27 Oct 2016 21:01:33 +0200

> this is our first feature pull request for batman-adv (mostly
> containing code cleanup stuff), there are at least two more to come.
> 
> Please pull or let me know of any problem!

Pulled, thanks Simon.


Re: [PATCH v7 0/6] Add eBPF hooks for cgroups

2016-10-29 Thread Daniel Borkmann

On 10/29/2016 05:34 PM, Lorenzo Colitti wrote:

On Sat, Oct 29, 2016 at 3:24 PM, Alexei Starovoitov
 wrote:

it could be solved by swapping the order of cgroup_bpf_run_filter()
and NF_INET_POST_ROUTING in patch 5. It was proposed some time back, but
the current patch, I think, is more symmetrical.
cgroup+bpf runs after nf hook on rx and runs before it on tx.
imo it's more consistent.


I guess what I was trying to say was: what does doing this filtering
in ip_output give you over running this from the netfilter hooks?
Doing this filtering in netfilter is much more general because there
can be complex rules both before and after the filtering is applied. I
hadn't thought of the scalability issue you note below though.

For accounting you probably want to run after the hooks, both for
ingress and for egress, because the hooks can do all sorts of stuff
like drop packets, change packet sizes, reroute them to different
interfaces, etc. Do you see use cases where you want to run before the
hooks?


Fwiw, not sure if swapping brings much, even after netfilter there could
be complex processing that would potentially drop, mangle, redirect, etc
from tc layer (egress or from qdisc itself). But also at even lower layers
(although rather unlikely, but not impossible), for example in drivers or
shortly before passing skb to them during segmentation (GSO), etc.
Eventually, for that you'd need to monitor various things, and the cgroup
one is just at higher layers with different semantics.


Regardless of this choice... are you going to backport cgroupv2 to
android? Because this set is v2 only.


Certainly anything that can't easily be backported to, say,
android-4.4 is not really feasible in the short term. I don't think we
use network cgroups at all, so if v2 network cgroups can coexist with
v1 cgroups of other types (which what little I've read seems to
indicate) then that should be possible.


yes. that's certainly doable, but sooner or later such approach will hit
scalability issue when number of cgroups is large. Same issue we saw
with cls_bpf and bpf_skb_under_cgroup(). Hence this patch set was needed
that is centered around cgroups instead of hooks. Note, unlike, tc and nf
there is no way to attach to a hook. The bpf program is attached to a cgroup.
It's an important distinction vs everything that currently exists in the stack.


Ah, I see. Out of curiosity, what was the first scaling limitation you
hit? eBPF program length? eBPF map size?


The scalability issue is not really program length or map size from eBPF
side in this context. While for v1, you have the bpf_get_cgroup_classid()
helper available on egress (not ingress though) that can scale with larger
number of cgroups since it works on the user-defined net_cls tagging, but
for v2, bpf_skb_under_cgroup() was initially introduced, which can only test
whether the sk's v2 cgroup related to the skb is in the sub-hierarchy of
a specific cgroup that is provided via maps. Effectively, when you have a
larger number of v2 cgroups that boolean test will not scale and you need
to linearly test through various cgroups. It's good enough when need to
special case only few cgroups in the v2 hierarchy on egress. Idea was that
attaching to cgroup itself would resolve this from a different angle for
egress and also ingress in a complementary way, but also seems to open up
for various other use-cases at the same time as seen from various patches
on the list.

Cheers,
Daniel


Re: [PATCH net-next v1 04/16] tipc: rename struct tipc_skb_cb member handle to bytes_read

2016-10-29 Thread David Miller
From: Parthasarathy Bhuvaragan 
Date: Thu, 27 Oct 2016 16:22:25 +0200

> @@ -95,7 +95,7 @@ struct plist;
>  #define TIPC_MEDIA_INFO_OFFSET   5
>  
>  struct tipc_skb_cb {
> - void *handle;
> + u32 bytes_read;
>   struct sk_buff *tail;
>   bool validated;
>   bool wakeup_pending;

If this is now a u32, then:

> - u32 offset = (u32)(unsigned long)(TIPC_SKB_CB(buf)->handle);
> + u32 offset = (u32)(TIPC_SKB_CB(buf)->bytes_read);

This cast is unnecessary as are the parentheses.


Re: [PATCH net-next] tcp_bbr: add a state transition diagram and accompanying comment

2016-10-29 Thread David Miller
From: Neal Cardwell 
Date: Thu, 27 Oct 2016 13:26:37 -0400

> Document the possible state transitions for a BBR flow, and also add a
> prose summary of the state machine, covering the life of a typical BBR
> flow.
> 
> Signed-off-by: Neal Cardwell 
> Signed-off-by: Yuchung Cheng 
> Signed-off-by: Eric Dumazet 
> Signed-off-by: Soheil Hassas Yeganeh 

Applied.


Re: [net 0/4][pull request] Intel Wired LAN Driver Updates 2016-10-27

2016-10-29 Thread David Miller
From: Jeff Kirsher 
Date: Thu, 27 Oct 2016 14:27:53 -0700

> This series contains fixes to ixgbe and i40e.

Pulled, thanks Jeff.


Re: [patch net-next v2] rocker: set physical device for port netdevice

2016-10-29 Thread David Miller
From: Jiri Pirko 
Date: Thu, 27 Oct 2016 22:32:22 +0200

> From: Jiri Pirko 
> 
> Do this so the sysfs has "device" link correctly set.
> 
> Signed-off-by: Jiri Pirko 
> ---
> v1->v2:
>  - make pdev non-const as pointed out by kbuild test robot

Applied, thanks Jiri.


Re: [PATCH net v1 2/2] ibmvnic: Fix missing brackets in init_sub_crq_irqs

2016-10-29 Thread David Miller
From: Thomas Falcon 
Date: Thu, 27 Oct 2016 12:28:52 -0500

> Signed-off-by: Thomas Falcon 
> ---
> v1: caught by kbuild bot with -Wmisleading-indentation after
> submitting previous patch

Applied.


Re: [PATCH net v2 1/2] ibmvnic: Fix releasing of sub-CRQ IRQs in interrupt context

2016-10-29 Thread David Miller
From: Thomas Falcon 
Date: Thu, 27 Oct 2016 12:28:51 -0500

> Schedule these XPORT event tasks in the shared workqueue
> so that IRQs are not freed in an interrupt context when
> sub-CRQs are released.
> 
> Signed-off-by: Thomas Falcon 
> ---
> v2: correct warnings by kbuild bot

Applied.


Re: [PATCH net 1/1] tipc: fix broadcast link synchronization problem

2016-10-29 Thread David Miller
From: Jon Maloy 
Date: Thu, 27 Oct 2016 18:51:55 -0400

> In commit 2d18ac4ba745 ("tipc: extend broadcast link initialization
> criteria") we tried to fix a problem with the initial synchronization
> of broadcast link acknowledge values. Unfortunately that solution is
> not sufficient to solve the issue.
> 
> We have seen it happen that LINK_PROTOCOL/STATE packets with a valid
> non-zero unicast acknowledge number may bypass BCAST_PROTOCOL
> initialization, NAME_DISTRIBUTOR and other STATE packets with invalid
> broadcast acknowledge numbers, leading to premature opening of the
> broadcast link. When the bypassed packets finally arrive, they are
> inadvertently accepted, and the already correctly initialized
> acknowledge number in the broadcast receive link is overwritten by
> the invalid (zero) value of the said packets. After this the broadcast
> link goes stale.
> 
> We now fix this by marking the packets where we know the acknowledge
> value is or may be invalid, and then ignoring the acks from those.
> 
> To this purpose, we claim an unused bit in the header to indicate that
> the value is invalid. We set the bit to 1 in the initial BCAST_PROTOCOL
> synchronization packet and all initial ("bulk") NAME_DISTRIBUTOR
> packets, plus those LINK_PROTOCOL packets sent out before the broadcast
> links are fully synchronized.
> 
> This minor protocol update is fully backwards compatible.
> 
> Reported-by: John Thompson 
> Tested-by: John Thompson 
> Signed-off-by: Jon Maloy 

Applied, thanks Jon.


Re: [PATCH net] enic: fix rq disable

2016-10-29 Thread David Miller
From: Govindarajulu Varadarajan 
Date: Thu, 27 Oct 2016 16:01:03 -0700

> When MTU is changed from 9000 to 1500 while there is burst of inbound 9000
> bytes packets, adaptor sometimes delivers 9000 bytes packets to 1500 bytes
> buffers. This causes memory corruption and sometimes crash.
> 
> This is because of a race condition in adaptor between "RQ disable"
> clearing descriptor mini-cache and mini-cache valid bit being set by
> completion of descriptor fetch. This can result in stale RQ desc being
> cached and used when packets arrive. In this case, the stale descriptor
> has the old MTU value.
> 
> Solution is to write RQ->disable twice. The first write will stop any
> further desc fetches, allowing the second disable to clear the mini-cache
> valid bit without danger of a race.
> 
> Also, the check for rq->running becoming 0 after writing rq->enable to 0
> is not done properly. When incoming packets are flooding the interface,
> rq->running will pulse high for each dropped packet. Since the driver was
> waiting for 10us between each poll, it is possible to see rq->running = 1
> 1000 times in a row, even though it is not actually stuck running.
> This results in false failure of vnic_rq_disable(). Fix is to try more
> than 1000 times without delay between polls to ensure we do not miss when
> running goes low.
> 
> In old adaptors rq->enable needs to be re-written to 0 when posted_index
> is reset in vnic_rq_clean() in order to keep rq->prefetch_index in sync.
> 
> Signed-off-by: Govindarajulu Varadarajan <_gov...@gmx.com>

Applied.


  1   2   >