[PATCH net-next 0/2] nfp: process MTU updates from firmware flower app

2017-08-16 Thread Simon Horman
The first patch of this series moves processing of control messages from a
BH handler to a workqueue. That change makes it safe to process MTU
updates from the firmware which is added by the second patch of this
series.

Simon Horman (2):
  nfp: process control messages in workqueue in flower app
  nfp: process MTU updates from firmware flower app

 drivers/net/ethernet/netronome/nfp/flower/cmsg.c | 35 ++--
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h |  1 +
 drivers/net/ethernet/netronome/nfp/flower/main.c | 14 --
 drivers/net/ethernet/netronome/nfp/flower/main.h |  5 
 4 files changed, 50 insertions(+), 5 deletions(-)

-- 
2.1.4



[PATCH net-next 2/2] nfp: process MTU updates from firmware flower app

2017-08-16 Thread Simon Horman
Now that control message processing occurs in a workqueue rather than a BH
handler MTU updates received from the firmware may be safely processed.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c 
b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index 6c8c22491fe7..806924b82adc 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -150,10 +150,17 @@ nfp_flower_cmsg_portmod_rx(struct nfp_app *app, struct 
sk_buff *skb)
return;
}
 
-   if (link)
+   if (link) {
+   u16 mtu = be16_to_cpu(msg->mtu);
+
netif_carrier_on(netdev);
-   else
+
+   /* An MTU of 0 from the firmware should be ignored */
+   if (mtu)
+   dev_set_mtu(netdev, mtu);
+   } else {
netif_carrier_off(netdev);
+   }
rcu_read_unlock();
 }
 
-- 
2.1.4



[PATCH net-next 1/2] nfp: process control messages in workqueue in flower app

2017-08-16 Thread Simon Horman
Processing of control messages is not time-critical and future processing
of some messages will require taking the RTNL which is not possible
in a BH handler. It seems simplest to move all control message processing
to a workqueue.

Signed-off-by: Simon Horman 
Reviewed-by: Jakub Kicinski 
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.c | 24 +++-
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h |  1 +
 drivers/net/ethernet/netronome/nfp/flower/main.c | 14 --
 drivers/net/ethernet/netronome/nfp/flower/main.h |  5 +
 4 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c 
b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index aa46b23cdfb1..6c8c22491fe7 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -34,10 +34,12 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "main.h"
 #include "../nfpcore/nfp_cpp.h"
+#include "../nfp_net.h"
 #include "../nfp_net_repr.h"
 #include "./cmsg.h"
 
@@ -155,7 +157,8 @@ nfp_flower_cmsg_portmod_rx(struct nfp_app *app, struct 
sk_buff *skb)
rcu_read_unlock();
 }
 
-void nfp_flower_cmsg_rx(struct nfp_app *app, struct sk_buff *skb)
+static void
+nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 {
struct nfp_flower_cmsg_hdr *cmsg_hdr;
enum nfp_flower_cmsg_type_port type;
@@ -184,3 +187,22 @@ void nfp_flower_cmsg_rx(struct nfp_app *app, struct 
sk_buff *skb)
 out:
dev_kfree_skb_any(skb);
 }
+
+void nfp_flower_cmsg_process_rx(struct work_struct *work)
+{
+   struct nfp_flower_priv *priv;
+   struct sk_buff *skb;
+
+   priv = container_of(work, struct nfp_flower_priv, cmsg_work);
+
+   while ((skb = skb_dequeue(&priv->cmsg_skbs)))
+   nfp_flower_cmsg_process_one_rx(priv->nn->app, skb);
+}
+
+void nfp_flower_cmsg_rx(struct nfp_app *app, struct sk_buff *skb)
+{
+   struct nfp_flower_priv *priv = app->priv;
+
+   skb_queue_tail(&priv->cmsg_skbs, skb);
+   schedule_work(&priv->cmsg_work);
+}
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h 
b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index aa92a8711a02..a2ec60344236 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -330,6 +330,7 @@ nfp_flower_cmsg_mac_repr_add(struct sk_buff *skb, unsigned 
int idx,
 unsigned int nbi, unsigned int nbi_port,
 unsigned int phys_port);
 int nfp_flower_cmsg_portmod(struct nfp_repr *repr, bool carrier_ok);
+void nfp_flower_cmsg_process_rx(struct work_struct *work);
 void nfp_flower_cmsg_rx(struct nfp_app *app, struct sk_buff *skb);
 struct sk_buff *
 nfp_flower_cmsg_alloc(struct nfp_app *app, unsigned int size,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c 
b/drivers/net/ethernet/netronome/nfp/flower/main.c
index b905454b30ca..3088e959f2a3 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -332,6 +332,7 @@ static int nfp_flower_vnic_init(struct nfp_app *app, struct 
nfp_net *nn,
 static int nfp_flower_init(struct nfp_app *app)
 {
const struct nfp_pf *pf = app->pf;
+   struct nfp_flower_priv *app_priv;
u64 version;
int err;
 
@@ -362,10 +363,14 @@ static int nfp_flower_init(struct nfp_app *app)
return -EINVAL;
}
 
-   app->priv = vzalloc(sizeof(struct nfp_flower_priv));
-   if (!app->priv)
+   app_priv = vzalloc(sizeof(struct nfp_flower_priv));
+   if (!app_priv)
return -ENOMEM;
 
+   app->priv = app_priv;
+   skb_queue_head_init(&app_priv->cmsg_skbs);
+   INIT_WORK(&app_priv->cmsg_work, nfp_flower_cmsg_process_rx);
+
err = nfp_flower_metadata_init(app);
if (err)
goto err_free_app_priv;
@@ -379,6 +384,11 @@ static int nfp_flower_init(struct nfp_app *app)
 
 static void nfp_flower_clean(struct nfp_app *app)
 {
+   struct nfp_flower_priv *app_priv = app->priv;
+
+   skb_queue_purge(&app_priv->cmsg_skbs);
+   flush_work(&app_priv->cmsg_work);
+
nfp_flower_metadata_cleanup(app);
vfree(app->priv);
app->priv = NULL;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h 
b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 71e4f4f4e9ba..b7043ca9b9fc 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct net_device;
 struct nfp_app;
@@ -78,6 +79,8 @@ struct nfp_fl_stats_id {
  * @mask_ids:  List of free mask ids
  * @mask_table:Hash table used to store masks
  * @flow_table:Hash table used to store flower rules
+ * @cmsg_work: Workqueue for control messages 

Re: 100% CPU load when generating traffic to destination network that nexthop is not reachable

2017-08-16 Thread Julian Anastasov

Hello,

On Tue, 15 Aug 2017, Eric Dumazet wrote:

> It must be possible to add a fast path without locks.
> 
> (say if jiffies has not changed before last state change)

New day - new idea. Something like this? But it
has bug: without checking neigh->dead under lock we don't
have the right to access neigh->parms, it can be destroyed
immediately by neigh_release->neigh_destroy->neigh_parms_put->
neigh_parms_destroy->kfree. Not sure, may be kfree_rcu can help
for this...

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 9816df2..f52763c 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -428,10 +428,10 @@ static inline int neigh_event_send(struct neighbour 
*neigh, struct sk_buff *skb)
 {
unsigned long now = jiffies;

-   if (neigh->used != now)
-   neigh->used = now;
if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))
return __neigh_event_send(neigh, skb);
+   if (neigh->used != now)
+   neigh->used = now;
return 0;
 }
 
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 16a1a4c..52a8718 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -991,8 +991,18 @@ static void neigh_timer_handler(unsigned long arg)
 
 int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 {
-   int rc;
bool immediate_probe = false;
+   unsigned long now = jiffies;
+   int rc;
+
+   if (neigh->used != now) {
+   neigh->used = now;
+   } else if (neigh->nud_state == NUD_INCOMPLETE &&
+  (!skb || neigh->arp_queue_len_bytes + skb->truesize >
+   NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES))) {
+   kfree_skb(skb);
+   return 1;
+   }
 
write_lock_bh(&neigh->lock);
 
@@ -1005,7 +1015,7 @@ int __neigh_event_send(struct neighbour *neigh, struct 
sk_buff *skb)
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
NEIGH_VAR(neigh->parms, APP_PROBES)) {
-   unsigned long next, now = jiffies;
+   unsigned long next;
 
atomic_set(&neigh->probes,
   NEIGH_VAR(neigh->parms, UCAST_PROBES));

Regards

--
Julian Anastasov 


DSA support for Micrel KSZ8895

2017-08-16 Thread Pavel Machek
Hi!

I've got hardware with KSZ8895, and I'd like to use switch ports as
separate ethernet cards. I believe that means DSA support.

And there are even patches available from microchip... unfortunately
they are in a strange form and for v3.18.

http://www.microchip.com/SWLibraryWeb/product.aspx?product=KSZ8895%20Software%20Linux%203.18

Is there newer version of the driver available somewhere? Is the
driver good starting point, or should I start with something else?

Best regards,
Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


Re: [PATCH] net: igmp: Use ingress interface rather than vrf device

2017-08-16 Thread Nikolay Aleksandrov
On 16/08/17 04:38, David Ahern wrote:
> Anuradha reported that statically added groups for interfaces enslaved
> to a VRF device were not persisting. The problem is that igmp queries
> and reports need to use the data in the in_dev for the real ingress
> device rather than the VRF device. Update igmp_rcv accordingly.
> 
> Fixes: e58e41596811 ("net: Enable support for VRF with ipv4 multicast")
> Reported-by: Anuradha Karuppiah 
> Signed-off-by: David Ahern 
> ---
>  net/ipv4/igmp.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
> index 9f86b5133605..ab183af0b5b6 100644
> --- a/net/ipv4/igmp.c
> +++ b/net/ipv4/igmp.c
> @@ -1007,10 +1007,18 @@ int igmp_rcv(struct sk_buff *skb)
>  {
>   /* This basically follows the spec line by line -- see RFC1112 */
>   struct igmphdr *ih;
> - struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
> + struct net_device *dev = skb->dev;
> + struct in_device *in_dev;
>   int len = skb->len;
>   bool dropped = true;
>  
> + if (netif_is_l3_master(dev)) {
> + dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
> + if (!dev)
> + goto drop;
> + }
> +
> + in_dev = __in_dev_get_rcu(dev);
>   if (!in_dev)
>   goto drop;
>  
> 

Reviewed-by: Nikolay Aleksandrov 



[patch net-next repost 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread Chris Mi
Using current TC code, it is very slow to insert a lot of rules.

In order to improve the rules update rate in TC,
we introduced the following two changes:
1) changed cls_flower to use IDR to manage the filters.
2) changed all act_xxx modules to use IDR instead of
   a small hash table

But IDR has a limitation that it uses int. TC handle uses u32.
To make sure there is no regression, we also changed IDR to use
unsigned long. All clients of IDR are changed to use new IDR API.

Chris Mi (3):
  idr: Use unsigned long instead of int
  net/sched: Change cls_flower to use IDR
  net/sched: Change act_api and act_xxx modules to use IDR

 block/bsg.c |   8 +-
 block/genhd.c   |  12 +-
 drivers/atm/nicstar.c   |  11 +-
 drivers/block/drbd/drbd_main.c  |  31 +--
 drivers/block/drbd/drbd_nl.c|  22 ++-
 drivers/block/drbd/drbd_proc.c  |   3 +-
 drivers/block/drbd/drbd_receiver.c  |  15 +-
 drivers/block/drbd/drbd_state.c |  34 ++--
 drivers/block/drbd/drbd_worker.c|   6 +-
 drivers/block/loop.c|  17 +-
 drivers/block/nbd.c |  20 +-
 drivers/block/zram/zram_drv.c   |   9 +-
 drivers/char/tpm/tpm-chip.c |  10 +-
 drivers/char/tpm/tpm.h  |   2 +-
 drivers/dca/dca-sysfs.c |   9 +-
 drivers/firewire/core-cdev.c|  18 +-
 drivers/firewire/core-device.c  |  15 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |   8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |   9 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |   2 +-
 drivers/gpu/drm/drm_auth.c  |   9 +-
 drivers/gpu/drm/drm_connector.c |  10 +-
 drivers/gpu/drm/drm_context.c   |  20 +-
 drivers/gpu/drm/drm_dp_aux_dev.c|  11 +-
 drivers/gpu/drm/drm_drv.c   |   6 +-
 drivers/gpu/drm/drm_gem.c   |  19 +-
 drivers/gpu/drm/drm_info.c  |   2 +-
 drivers/gpu/drm/drm_mode_object.c   |  11 +-
 drivers/gpu/drm/drm_syncobj.c   |  18 +-
 drivers/gpu/drm/exynos/exynos_drm_ipp.c |  25 ++-
 drivers/gpu/drm/i915/gvt/display.c  |   2 +-
 drivers/gpu/drm/i915/gvt/kvmgt.c|   2 +-
 drivers/gpu/drm/i915/gvt/vgpu.c |   9 +-
 drivers/gpu/drm/i915/i915_debugfs.c |   6 +-
 drivers/gpu/drm/i915/i915_gem_context.c |   9 +-
 drivers/gpu/drm/qxl/qxl_cmd.c   |   8 +-
 drivers/gpu/drm/qxl/qxl_release.c   |  14 +-
 drivers/gpu/drm/sis/sis_mm.c|   8 +-
 drivers/gpu/drm/tegra/drm.c |  10 +-
 drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c|   3 +-
 drivers/gpu/drm/vgem/vgem_fence.c   |  12 +-
 drivers/gpu/drm/via/via_mm.c|   8 +-
 drivers/gpu/drm/virtio/virtgpu_kms.c|   5 +-
 drivers/gpu/drm/virtio/virtgpu_vq.c |   5 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_resource.c|   9 +-
 drivers/i2c/i2c-core-base.c |  19 +-
 drivers/infiniband/core/cm.c|   8 +-
 drivers/infiniband/core/cma.c   |  12 +-
 drivers/infiniband/core/rdma_core.c |   9 +-
 drivers/infiniband/core/sa_query.c  |  23 +--
 drivers/infiniband/core/ucm.c   |   7 +-
 drivers/infiniband/core/ucma.c  |  14 +-
 drivers/infiniband/hw/cxgb3/iwch.c  |   4 +-
 drivers/infiniband/hw/cxgb3/iwch.h  |   4 +-
 drivers/infiniband/hw/cxgb4/device.c|  18 +-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h  |   4 +-
 drivers/infiniband/hw/hfi1/init.c   |   9 +-
 drivers/infiniband/hw/hfi1/vnic_main.c  |   6 +-
 drivers/infiniband/hw/mlx4/cm.c |  13 +-
 drivers/infiniband/hw/ocrdma/ocrdma_main.c  |   7 +-
 drivers/infiniband/hw/qib/qib_init.c|   9 +-
 drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c |  10 +-
 drivers/iommu/intel-svm.c   |   9 +-
 drivers/md/dm.c |  13 +-
 drivers/memstick/core/memstick.c|  10 +-
 drivers/memstick/core/ms_block.c|   9 +-
 drivers/memstick/core/mspro_block.c |  12 +-
 drivers/mfd/rtsx_pcr.c  |   9 +-
 drivers/misc/c2port/core.c  |   7 +-
 drivers/misc/cxl/context.c  |   8 +-
 drivers/misc/cxl/main.c |  15 +-
 drivers/misc/mei/main.c |   8 +-
 drivers/misc/mic/scif/scif_api.c|  11 +-
 drivers/misc/mic/scif/scif_ports.c

[patch net-next repost 3/3] net/sched: Change act_api and act_xxx modules to use IDR

2017-08-16 Thread Chris Mi
Typically, each TC filter has its own action. All the actions of the
same type are saved in its hash table. But the hash buckets are too
small that it degrades to a list. And the performance is greatly
affected. For example, it takes about 0m11.914s to insert 64K rules.
If we convert the hash table to IDR, it only takes about 0m1.500s.
The improvement is huge.

But please note that the test result is based on previous patch that
cls_flower uses IDR.

Signed-off-by: Chris Mi 
Signed-off-by: Jiri Pirko 
---
 include/net/act_api.h  |  76 +-
 net/sched/act_api.c| 249 ++---
 net/sched/act_bpf.c|  17 ++--
 net/sched/act_connmark.c   |  16 ++-
 net/sched/act_csum.c   |  16 ++-
 net/sched/act_gact.c   |  16 ++-
 net/sched/act_ife.c|  20 ++--
 net/sched/act_ipt.c|  26 +++--
 net/sched/act_mirred.c |  19 ++--
 net/sched/act_nat.c|  16 ++-
 net/sched/act_pedit.c  |  18 ++--
 net/sched/act_police.c |  18 ++--
 net/sched/act_sample.c |  17 ++--
 net/sched/act_simple.c |  20 ++--
 net/sched/act_skbedit.c|  18 ++--
 net/sched/act_skbmod.c |  18 ++--
 net/sched/act_tunnel_key.c |  20 ++--
 net/sched/act_vlan.c   |  22 ++--
 18 files changed, 277 insertions(+), 345 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 26ffd83..8f3d5d8 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -10,12 +10,9 @@
 #include 
 #include 
 
-
-struct tcf_hashinfo {
-   struct hlist_head   *htab;
-   unsigned inthmask;
-   spinlock_t  lock;
-   u32 index;
+struct tcf_idrinfo {
+   spinlock_t  lock;
+   struct idr  action_idr;
 };
 
 struct tc_action_ops;
@@ -25,9 +22,8 @@ struct tc_action {
__u32   type; /* for backward 
compat(TCA_OLD_COMPAT) */
__u32   order;
struct list_headlist;
-   struct tcf_hashinfo *hinfo;
+   struct tcf_idrinfo  *idrinfo;
 
-   struct hlist_node   tcfa_head;
u32 tcfa_index;
int tcfa_refcnt;
int tcfa_bindcnt;
@@ -44,7 +40,6 @@ struct tc_action {
struct tc_cookie*act_cookie;
struct tcf_chain*goto_chain;
 };
-#define tcf_head   common.tcfa_head
 #define tcf_index  common.tcfa_index
 #define tcf_refcnt common.tcfa_refcnt
 #define tcf_bindcntcommon.tcfa_bindcnt
@@ -57,27 +52,6 @@ struct tc_action {
 #define tcf_lock   common.tcfa_lock
 #define tcf_rcucommon.tcfa_rcu
 
-static inline unsigned int tcf_hash(u32 index, unsigned int hmask)
-{
-   return index & hmask;
-}
-
-static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask)
-{
-   int i;
-
-   spin_lock_init(&hf->lock);
-   hf->index = 0;
-   hf->hmask = mask;
-   hf->htab = kzalloc((mask + 1) * sizeof(struct hlist_head),
-  GFP_KERNEL);
-   if (!hf->htab)
-   return -ENOMEM;
-   for (i = 0; i < mask + 1; i++)
-   INIT_HLIST_HEAD(&hf->htab[i]);
-   return 0;
-}
-
 /* Update lastuse only if needed, to avoid dirtying a cache line.
  * We use a temp variable to avoid fetching jiffies twice.
  */
@@ -126,53 +100,51 @@ struct tc_action_ops {
 };
 
 struct tc_action_net {
-   struct tcf_hashinfo *hinfo;
+   struct tcf_idrinfo *idrinfo;
const struct tc_action_ops *ops;
 };
 
 static inline
 int tc_action_net_init(struct tc_action_net *tn,
-  const struct tc_action_ops *ops, unsigned int mask)
+  const struct tc_action_ops *ops)
 {
int err = 0;
 
-   tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL);
-   if (!tn->hinfo)
+   tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL);
+   if (!tn->idrinfo)
return -ENOMEM;
tn->ops = ops;
-   err = tcf_hashinfo_init(tn->hinfo, mask);
-   if (err)
-   kfree(tn->hinfo);
+   spin_lock_init(&tn->idrinfo->lock);
+   idr_init(&tn->idrinfo->action_idr);
return err;
 }
 
-void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
- struct tcf_hashinfo *hinfo);
+void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
+struct tcf_idrinfo *idrinfo);
 
 static inline void tc_action_net_exit(struct tc_action_net *tn)
 {
-   tcf_hashinfo_destroy(tn->ops, tn->hinfo);
-   kfree(tn->hinfo);
+   tcf_idrinfo_destroy(tn->ops, tn->idrinfo);
+   kfree(tn->idrinfo);
 }
 
 int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
   struct netlink_callback *cb, int type,
   const struct tc_action_ops *ops);
-int tcf_hash_search(struct tc_ac

[patch net-next repost 2/3] net/sched: Change cls_flower to use IDR

2017-08-16 Thread Chris Mi
Currently, all filters with the same priority are linked in a doubly
linked list. Every filter should have a unique handle. To make the
handle unique, we need to iterate the list every time to see if the
handle exists or not when inserting a new filter. It is time-consuming.
For example, it takes about 5m3.169s to insert 64K rules.

This patch changes cls_flower to use IDR. With this patch, it
takes about 0m1.127s to insert 64K rules. The improvement is huge.

But please note that in this testing, all filters share the same action.
If every filter has a unique action, that is another bottleneck.
Follow-up patch in this patchset addresses that.

Signed-off-by: Chris Mi 
Signed-off-by: Jiri Pirko 
---
 net/sched/cls_flower.c | 55 +-
 1 file changed, 23 insertions(+), 32 deletions(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 052e902..071f0ef 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -68,7 +68,6 @@ struct cls_fl_head {
struct rhashtable ht;
struct fl_flow_mask mask;
struct flow_dissector dissector;
-   u32 hgen;
bool mask_assigned;
struct list_head filters;
struct rhashtable_params ht_params;
@@ -76,6 +75,7 @@ struct cls_fl_head {
struct work_struct work;
struct rcu_head rcu;
};
+   struct idr handle_idr;
 };
 
 struct cls_fl_filter {
@@ -210,6 +210,7 @@ static int fl_init(struct tcf_proto *tp)
 
INIT_LIST_HEAD_RCU(&head->filters);
rcu_assign_pointer(tp->root, head);
+   idr_init(&head->handle_idr);
 
return 0;
 }
@@ -295,6 +296,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct 
cls_fl_filter *f)
 
 static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f)
 {
+   struct cls_fl_head *head = rtnl_dereference(tp->root);
+
+   idr_remove(&head->handle_idr, f->handle);
list_del_rcu(&f->list);
if (!tc_skip_hw(f->flags))
fl_hw_destroy_filter(tp, f);
@@ -327,6 +331,7 @@ static void fl_destroy(struct tcf_proto *tp)
 
list_for_each_entry_safe(f, next, &head->filters, list)
__fl_delete(tp, f);
+   idr_destroy(&head->handle_idr);
 
__module_get(THIS_MODULE);
call_rcu(&head->rcu, fl_destroy_rcu);
@@ -335,12 +340,8 @@ static void fl_destroy(struct tcf_proto *tp)
 static void *fl_get(struct tcf_proto *tp, u32 handle)
 {
struct cls_fl_head *head = rtnl_dereference(tp->root);
-   struct cls_fl_filter *f;
 
-   list_for_each_entry(f, &head->filters, list)
-   if (f->handle == handle)
-   return f;
-   return NULL;
+   return idr_find(&head->handle_idr, handle);
 }
 
 static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
@@ -859,27 +860,6 @@ static int fl_set_parms(struct net *net, struct tcf_proto 
*tp,
return 0;
 }
 
-static u32 fl_grab_new_handle(struct tcf_proto *tp,
- struct cls_fl_head *head)
-{
-   unsigned int i = 0x8000;
-   u32 handle;
-
-   do {
-   if (++head->hgen == 0x7FFF)
-   head->hgen = 1;
-   } while (--i > 0 && fl_get(tp, head->hgen));
-
-   if (unlikely(i == 0)) {
-   pr_err("Insufficient number of handles\n");
-   handle = 0;
-   } else {
-   handle = head->hgen;
-   }
-
-   return handle;
-}
-
 static int fl_change(struct net *net, struct sk_buff *in_skb,
 struct tcf_proto *tp, unsigned long base,
 u32 handle, struct nlattr **tca,
@@ -890,6 +870,7 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
struct cls_fl_filter *fnew;
struct nlattr **tb;
struct fl_flow_mask mask = {};
+   unsigned long idr_index;
int err;
 
if (!tca[TCA_OPTIONS])
@@ -920,13 +901,21 @@ static int fl_change(struct net *net, struct sk_buff 
*in_skb,
goto errout;
 
if (!handle) {
-   handle = fl_grab_new_handle(tp, head);
-   if (!handle) {
-   err = -EINVAL;
+   err = idr_alloc(&head->handle_idr, fnew, &idr_index,
+   1, 0x8000, GFP_KERNEL);
+   if (err)
goto errout;
-   }
+   fnew->handle = idr_index;
+   }
+
+   /* user specifies a handle and it doesn't exist */
+   if (handle && !fold) {
+   err = idr_alloc(&head->handle_idr, fnew, &idr_index,
+   handle, handle + 1, GFP_KERNEL);
+   if (err)
+   goto errout;
+   fnew->handle = idr_index;
}
-   fnew->handle = handle;
 
if (tb[TCA_FLOWER_FLAGS]) {
fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
@@ -980,6 +969,8 @@ static int fl_change(struct net *net,

Re: [patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread Jiri Pirko
Wed, Aug 16, 2017 at 09:49:07AM CEST, christian.koe...@amd.com wrote:
>Am 16.08.2017 um 04:12 schrieb Chris Mi:
>> Using current TC code, it is very slow to insert a lot of rules.
>> 
>> In order to improve the rules update rate in TC,
>> we introduced the following two changes:
>>  1) changed cls_flower to use IDR to manage the filters.
>>  2) changed all act_xxx modules to use IDR instead of
>> a small hash table
>> 
>> But IDR has a limitation that it uses int. TC handle uses u32.
>> To make sure there is no regression, we also changed IDR to use
>> unsigned long. All clients of IDR are changed to use new IDR API.
>
>WOW, wait a second. The idr change is touching a lot of drivers and to be
>honest doesn't looks correct at all.
>
>Just look at the first chunk of your modification:
>> @@ -998,8 +999,9 @@ int bsg_register_queue(struct request_queue *q, struct 
>> device *parent,
>>  mutex_lock(&bsg_mutex);
>> -ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL);
>> -if (ret < 0) {
>> +ret = idr_alloc(&bsg_minor_idr, bcd, &idr_index, 0, BSG_MAX_DEVS,
>> +GFP_KERNEL);
>> +if (ret) {
>>  if (ret == -ENOSPC) {
>>  printk(KERN_ERR "bsg: too many bsg devices\n");
>>  ret = -EINVAL;
>The condition "if (ret)" will now always be true after the first allocation
>and so we always run into the error handling after that.

On success, idr_alloc returns 0.


>
>I've never read the bsg code before, but that's certainly not correct. And
>that incorrect pattern repeats over and over again in this code.
>
>Apart from that why the heck do you want to allocate more than 1<<31 handles?

tc action indexes for example. That is part of this patchset.


Re: [PATCH] bpf: Update sysctl documentation to list all supported architectures

2017-08-16 Thread Daniel Borkmann

Hi Michael,

On 08/16/2017 07:15 AM, Michael Ellerman wrote:

The sysctl documentation states that the JIT is only available on
x86_64, which is no longer correct.

Update the list to include all architectures that enable HAVE_CBPF_JIT
or HAVE_EBPF_JIT under some configuration.

Signed-off-by: Michael Ellerman 


Thanks for the patch!


  Documentation/sysctl/net.txt | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 14db18c970b1..f68356024d09 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -36,8 +36,9 @@ bpf_jit_enable
  --

  This enables Berkeley Packet Filter Just in Time compiler.
-Currently supported on x86_64 architecture, bpf_jit provides a framework
-to speed packet filtering, the one used by tcpdump/libpcap for example.
+Currently supported on arm, arm64, mips, powerpc, s390, sparc and x86_64
+architectures, bpf_jit provides a framework to speed packet filtering, the one
+used by tcpdump/libpcap for example.


Good point, could we actually make that as a bullet list and
differentiate between cBPF and eBPF JITs, so that a user doesn't
need to run git grep HAVE_{E,C}BPF_JIT to figure it out what the
switch enables on the arch used? That would be great.

So for eBPF JITs, we have covered:

 * x86_64
 * arm64
 * ppc64
 * sparc64
 * mips64

For old cBPF, there is:

 * arm
 * mips
 * ppc
 * sparc

Thanks,
Daniel


  Values :
0 - disable the JIT (default value)
1 - enable the JIT





Re: [patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread Christian König

Am 16.08.2017 um 10:16 schrieb Jiri Pirko:

Wed, Aug 16, 2017 at 09:49:07AM CEST, christian.koe...@amd.com wrote:

Am 16.08.2017 um 04:12 schrieb Chris Mi:

Using current TC code, it is very slow to insert a lot of rules.

In order to improve the rules update rate in TC,
we introduced the following two changes:
  1) changed cls_flower to use IDR to manage the filters.
  2) changed all act_xxx modules to use IDR instead of
 a small hash table

But IDR has a limitation that it uses int. TC handle uses u32.
To make sure there is no regression, we also changed IDR to use
unsigned long. All clients of IDR are changed to use new IDR API.

WOW, wait a second. The idr change is touching a lot of drivers and to be
honest doesn't looks correct at all.

Just look at the first chunk of your modification:

@@ -998,8 +999,9 @@ int bsg_register_queue(struct request_queue *q, struct 
device *parent,
mutex_lock(&bsg_mutex);
-   ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL);
-   if (ret < 0) {
+   ret = idr_alloc(&bsg_minor_idr, bcd, &idr_index, 0, BSG_MAX_DEVS,
+   GFP_KERNEL);
+   if (ret) {
if (ret == -ENOSPC) {
printk(KERN_ERR "bsg: too many bsg devices\n");
ret = -EINVAL;

The condition "if (ret)" will now always be true after the first allocation
and so we always run into the error handling after that.

On success, idr_alloc returns 0.


Ah, I see. You change the idr_alloc to return the resulting index as 
separate parameter.


You should explicit note that in the commit message, cause that is 
something easily overlooked.


In general I strongly suggest to add a separate interface for allocating 
unsigned long handles, use that for the time being and then move the 
existing drivers over bit by bit.


A single patch which touches so many different driver is practically 
impossible to review consequently.



I've never read the bsg code before, but that's certainly not correct. And
that incorrect pattern repeats over and over again in this code.

Apart from that why the heck do you want to allocate more than 1<<31 handles?

tc action indexes for example. That is part of this patchset.


Well, let me refine the question: Why does tc action indexes need more 
than 31 bits? From an outside view that looks like pure overkill.


Regards,
Christian.


Re: [patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread Jiri Pirko
Wed, Aug 16, 2017 at 10:31:35AM CEST, christian.koe...@amd.com wrote:
>Am 16.08.2017 um 10:16 schrieb Jiri Pirko:
>> Wed, Aug 16, 2017 at 09:49:07AM CEST, christian.koe...@amd.com wrote:
>> > Am 16.08.2017 um 04:12 schrieb Chris Mi:
>> > > Using current TC code, it is very slow to insert a lot of rules.
>> > > 
>> > > In order to improve the rules update rate in TC,
>> > > we introduced the following two changes:
>> > >   1) changed cls_flower to use IDR to manage the filters.
>> > >   2) changed all act_xxx modules to use IDR instead of
>> > >  a small hash table
>> > > 
>> > > But IDR has a limitation that it uses int. TC handle uses u32.
>> > > To make sure there is no regression, we also changed IDR to use
>> > > unsigned long. All clients of IDR are changed to use new IDR API.
>> > WOW, wait a second. The idr change is touching a lot of drivers and to be
>> > honest doesn't looks correct at all.
>> > 
>> > Just look at the first chunk of your modification:
>> > > @@ -998,8 +999,9 @@ int bsg_register_queue(struct request_queue *q, 
>> > > struct device *parent,
>> > >  mutex_lock(&bsg_mutex);
>> > > -ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, 
>> > > GFP_KERNEL);
>> > > -if (ret < 0) {
>> > > +ret = idr_alloc(&bsg_minor_idr, bcd, &idr_index, 0, 
>> > > BSG_MAX_DEVS,
>> > > +GFP_KERNEL);
>> > > +if (ret) {
>> > >  if (ret == -ENOSPC) {
>> > >  printk(KERN_ERR "bsg: too many bsg devices\n");
>> > >  ret = -EINVAL;
>> > The condition "if (ret)" will now always be true after the first allocation
>> > and so we always run into the error handling after that.
>> On success, idr_alloc returns 0.
>
>Ah, I see. You change the idr_alloc to return the resulting index as separate
>parameter.
>
>You should explicit note that in the commit message, cause that is something
>easily overlooked.
>
>In general I strongly suggest to add a separate interface for allocating
>unsigned long handles, use that for the time being and then move the
>existing drivers over bit by bit.
>
>A single patch which touches so many different driver is practically
>impossible to review consequently.

Understood. I think it is good to avoid having some "idr_alloc2". That
is why I suggested to do this in one go, to avoid "idr_alloc2" and then
patch to rename "idr_alloc2" to "idr_alloc" once nobody uses the original
"idr_alloc". In fact, if you do it driver by driver, the review burden
would be the same, probably even bigger, you'll just have 100+ patches.
Why would it help?

I believe that the changes in drivers are trivial enough to have it in
one patch.


>
>> > I've never read the bsg code before, but that's certainly not correct. And
>> > that incorrect pattern repeats over and over again in this code.
>> > 
>> > Apart from that why the heck do you want to allocate more than 1<<31 
>> > handles?
>> tc action indexes for example. That is part of this patchset.
>
>Well, let me refine the question: Why does tc action indexes need more than
>31 bits? From an outside view that looks like pure overkill.

That is current state, uapi. We have to live with it.


Re: [PATCH net] net: sched: fix NULL pointer dereference when action calls some targets

2017-08-16 Thread Xin Long
On Wed, Aug 9, 2017 at 7:33 AM, Cong Wang  wrote:
> On Mon, Aug 7, 2017 at 7:33 PM, Xin Long  wrote:
>> On Tue, Aug 8, 2017 at 9:15 AM, Cong Wang  wrote:
>>> This looks like a completely API burden?
>> netfilter xt targets are not really compatible with netsched action.
>> I've got to say, the patch is just a way to make checkentry return
>> false and avoid panic. like [1] said
>
> I don't doubt you fix a crash, I am thinking if we can
> "fix" the API instead of fixing the caller.
Hi, Cong,

For now, I don't think it's possible to change APIs or  some of their targets
for the panic caused by action xt calling.

The common way should be fixed in net_sched side.

Given that the issue is very easy to trigger,
let's wait for netfilter's replies for another few days,
otherwise I will repost the fix, agree ?

>
> I am not familiar with this API, so just my 2 cents...


Re: [net-next PATCH] bpf: devmap: remove unnecessary value size check

2017-08-16 Thread Daniel Borkmann

On 08/16/2017 08:35 AM, John Fastabend wrote:

In the devmap alloc map logic we check to ensure that the sizeof the
values are not greater than KMALLOC_MAX_SIZE. But, in the dev map case
we ensure the value size is 4bytes earlier in the function because all
values should be netdev ifindex values.

The second check is harmless but is not needed so remove it.

Signed-off-by: John Fastabend 


Acked-by: Daniel Borkmann 


Re: [patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread Christian König

Am 16.08.2017 um 10:39 schrieb Jiri Pirko:

Wed, Aug 16, 2017 at 10:31:35AM CEST, christian.koe...@amd.com wrote:

Am 16.08.2017 um 10:16 schrieb Jiri Pirko:

Wed, Aug 16, 2017 at 09:49:07AM CEST, christian.koe...@amd.com wrote:

Am 16.08.2017 um 04:12 schrieb Chris Mi:

Using current TC code, it is very slow to insert a lot of rules.

In order to improve the rules update rate in TC,
we introduced the following two changes:
   1) changed cls_flower to use IDR to manage the filters.
   2) changed all act_xxx modules to use IDR instead of
  a small hash table

But IDR has a limitation that it uses int. TC handle uses u32.
To make sure there is no regression, we also changed IDR to use
unsigned long. All clients of IDR are changed to use new IDR API.

WOW, wait a second. The idr change is touching a lot of drivers and to be
honest doesn't look correct at all.

Just look at the first chunk of your modification:

@@ -998,8 +999,9 @@ int bsg_register_queue(struct request_queue *q, struct 
device *parent,
mutex_lock(&bsg_mutex);
-   ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL);
-   if (ret < 0) {
+   ret = idr_alloc(&bsg_minor_idr, bcd, &idr_index, 0, BSG_MAX_DEVS,
+   GFP_KERNEL);
+   if (ret) {
if (ret == -ENOSPC) {
printk(KERN_ERR "bsg: too many bsg devices\n");
ret = -EINVAL;

The condition "if (ret)" will now always be true after the first allocation
and so we always run into the error handling after that.

On success, idr_alloc returns 0.

Ah, I see. You change the idr_alloc to return the resulting index as separate
parameter.

You should explicit note that in the commit message, cause that is something
easily overlooked.

In general I strongly suggest to add a separate interface for allocating
unsigned long handles, use that for the while being and then move the
existing drivers over bit by bit.

A single patch which touches so many different driver is practically
impossible to review consequently.

Understood. I think it is good to avoid having some "idr_alloc2". That
is why I suggested to do this in one go, to avoid "idr_alloc2" and then
patch to rename "idr_alloc2" to "idr_alloc" once nobody uses the original
"idr_alloc". In fact, if you do it driver, by driver, the review burden
would be the same, probably even bigger, you'll just have 100+ patches.
Why would it help?


Because it would give each maintainer only the part of the change he is 
interested in.


Current status of this change is that you send a mail with nearly 300 
people on CC.


Do you really expect to get an reviewed-by or acked-by on this single 
patch from all of them?


If yes then it somehow makes sense to send the patch bit by bit, if no 
then it doesn't seem to make to much sense to CC them all individually.



I've never read the bsg code before, but that's certainly not correct. And
that incorrect pattern repeats over and over again in this code.

Apart from that why the heck do you want to allocate more than 1<<31 handles?

tc action indexes for example. That is part of this patchset.

Well, let me refine the question: Why does tc action indexes need more than
31 bits? From an outside view that looks like pure overkill.

That is current state, uapi. We have to live with it.


Is the range to allocate from part of the uapi or what is the issue here?

If the issue is that userspace can specify the handle then I suggest 
that you use the radix tree directly instead of the idr wrapper around it.


Regards,
Christian.


Re: [PATCH net] xfrm: Clear sk_dst_cache when applying per-socket policy.

2017-08-16 Thread Lorenzo Colitti
On Wed, Aug 16, 2017 at 7:25 AM, Jonathan Basseri
 wrote:
> If an IPv6 socket has a valid dst cache

Did you look into why IPv4 does not suffer from this problem?

That said, clearing the dst cache entry does seem prudent in general.


Re: [PATCH net] xfrm: Clear sk_dst_cache when applying per-socket policy.

2017-08-16 Thread Jakub Sitnicki
On Tue, 15 Aug 2017 15:25:10 -0700
Jonathan Basseri  wrote:

> If an IPv6 socket has a valid dst cache, then xfrm_lookup_route will get
> skipped. However, the cache is not invalidated when applying policy to a
> socket (i.e. IPV6_XFRM_POLICY). The result is that new policies are
> sometimes ignored on those sockets.
> 
> This can be demonstrated like so,
> 1. Create UDPv6 socket.
> 2. connect() the socket.
> 3. Apply an outbound XFRM policy to the socket.
> 4. send() data on the socket.
> 
> Packets will continue to be sent in the clear instead of matching an
> xfrm or returning a no-match error (EAGAIN). This affects calls to
> send() and not sendto().
> 
> Note: Creating normal XFRM policies should have a similar effect on
> sk_dst_cache entries that match the policy, but that is not fixed in
> this patch.
> 
> Fixes: 00bc0ef5880d ("ipv6: Skip XFRM lookup if dst_entry in socket cache is 
> valid")
> Tested: https://android-review.googlesource.com/418659
> Signed-off-by: Jonathan Basseri 
> ---

Thank you for the fix.

Acked-by: Jakub Sitnicki 


[PATCH] net/mlx4: fix spelling mistake: "availible" -> "available"

2017-08-16 Thread Colin King
From: Colin Ian King 

Trivial fix to spelling mistakes in the mlx4 driver

Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c| 16 
 drivers/net/ethernet/mellanox/mlx4/fw_qos.c |  6 +++---
 drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 10 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 674773b28b2e..6309389b09a7 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1958,19 +1958,19 @@ static void mlx4_allocate_port_vpps(struct mlx4_dev 
*dev, int port)
int i;
int err;
int num_vfs;
-   u16 availible_vpp;
+   u16 available_vpp;
u8 vpp_param[MLX4_NUM_UP];
struct mlx4_qos_manager *port_qos;
struct mlx4_priv *priv = mlx4_priv(dev);
 
-   err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param);
+   err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
if (err) {
-   mlx4_info(dev, "Failed query availible VPPs\n");
+   mlx4_info(dev, "Failed query available VPPs\n");
return;
}
 
port_qos = &priv->mfunc.master.qos_ctl[port];
-   num_vfs = (availible_vpp /
+   num_vfs = (available_vpp /
   bitmap_weight(port_qos->priority_bm, MLX4_NUM_UP));
 
for (i = 0; i < MLX4_NUM_UP; i++) {
@@ -1985,14 +1985,14 @@ static void mlx4_allocate_port_vpps(struct mlx4_dev 
*dev, int port)
}
 
/* Query actual allocated VPP, just to make sure */
-   err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param);
+   err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
if (err) {
-   mlx4_info(dev, "Failed query availible VPPs\n");
+   mlx4_info(dev, "Failed query available VPPs\n");
return;
}
 
port_qos->num_of_qos_vfs = num_vfs;
-   mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, availible_vpp);
+   mlx4_dbg(dev, "Port %d Available VPPs %d\n", port, available_vpp);
 
for (i = 0; i < MLX4_NUM_UP; i++)
mlx4_dbg(dev, "Port %d UP %d Allocated %d VPPs\n", port, i,
@@ -2891,7 +2891,7 @@ static int mlx4_set_vport_qos(struct mlx4_priv *priv, int 
slave, int port,
memset(vpp_qos, 0, sizeof(struct mlx4_vport_qos_param) * MLX4_NUM_UP);
 
if (slave > port_qos->num_of_qos_vfs) {
-   mlx4_info(dev, "No availible VPP resources for this VF\n");
+   mlx4_info(dev, "No available VPP resources for this VF\n");
return -EINVAL;
}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c 
b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
index 8f2fde0487c4..3a09d7122d3b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
@@ -65,7 +65,7 @@ struct mlx4_set_port_scheduler_context {
 
 /* Granular Qos (per VF) section */
 struct mlx4_alloc_vpp_param {
-   __be32 availible_vpp;
+   __be32 available_vpp;
__be32 vpp_p_up[MLX4_NUM_UP];
 };
 
@@ -157,7 +157,7 @@ int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, 
u8 *tc_tx_bw,
 EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER);
 
 int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
- u16 *availible_vpp, u8 *vpp_p_up)
+ u16 *available_vpp, u8 *vpp_p_up)
 {
int i;
int err;
@@ -179,7 +179,7 @@ int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
goto out;
 
/* Total number of supported VPPs */
-   *availible_vpp = (u16)be32_to_cpu(out_param->availible_vpp);
+   *available_vpp = (u16)be32_to_cpu(out_param->available_vpp);
 
for (i = 0; i < MLX4_NUM_UP; i++)
vpp_p_up[i] = (u8)be32_to_cpu(out_param->vpp_p_up[i]);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h 
b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
index ac1f331878e6..582997577a04 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
@@ -84,23 +84,23 @@ int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 
*prio2tc);
 int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
u8 *pg, u16 *ratelimit);
 /**
- * mlx4_ALLOCATE_VPP_get - Query port VPP availible resources and allocation.
- * Before distribution of VPPs to priorities, only availible_vpp is returned.
+ * mlx4_ALLOCATE_VPP_get - Query port VPP available resources and allocation.
+ * Before distribution of VPPs to priorities, only available_vpp is returned.
  * After initialization it returns the distribution of VPPs among priorities.
  *
  * @dev: mlx4_dev.
  * @port: Physical port number.
- * @availible_vpp: Pointer to variable where number of availible VPPs is stored
+ * @available_vpp: Pointer to variable where

Re: [PATCH 2/3] ARM: sun8i: sunxi-h3-h5: add phy-is-integrated property to internal PHY

2017-08-16 Thread Corentin Labbe
On Fri, Aug 11, 2017 at 08:03:29AM -0700, Florian Fainelli wrote:
> On August 11, 2017 6:25:26 AM PDT, Corentin Labbe  
> wrote:
> >On Fri, Aug 11, 2017 at 04:22:11PM +0800, Chen-Yu Tsai wrote:
> >> On Fri, Aug 11, 2017 at 4:19 PM, Corentin Labbe
> >>  wrote:
> >> > On Fri, Aug 11, 2017 at 04:11:13PM +0800, Chen-Yu Tsai wrote:
> >> >> On Fri, Aug 11, 2017 at 4:05 PM, Corentin Labbe
> >> >>  wrote:
> >> >> > On Fri, Aug 11, 2017 at 10:42:51AM +0800, Chen-Yu Tsai wrote:
> >> >> >> Hi,
> >> >> >>
> >> >> >> On Thu, Aug 10, 2017 at 4:51 PM, Corentin Labbe
> >> >> >>  wrote:
> >> >> >> > This patch add the new phy-is-integrated property to the
> >internal PHY
> >> >> >> > node.
> >> >> >> >
> >> >> >> > Signed-off-by: Corentin Labbe 
> >> >> >> > ---
> >> >> >> >  arch/arm/boot/dts/sunxi-h3-h5.dtsi | 1 +
> >> >> >> >  1 file changed, 1 insertion(+)
> >> >> >> >
> >> >> >> > diff --git a/arch/arm/boot/dts/sunxi-h3-h5.dtsi
> >b/arch/arm/boot/dts/sunxi-h3-h5.dtsi
> >> >> >> > index 4b599b5d26f6..54fc24e4c569 100644
> >> >> >> > --- a/arch/arm/boot/dts/sunxi-h3-h5.dtsi
> >> >> >> > +++ b/arch/arm/boot/dts/sunxi-h3-h5.dtsi
> >> >> >> > @@ -425,6 +425,7 @@
> >> >> >> > reg = <1>;
> >> >> >> > clocks = <&ccu
> >CLK_BUS_EPHY>;
> >> >> >> > resets = <&ccu
> >RST_BUS_EPHY>;
> >> >> >> > +   phy-is-integrated;
> >> >> >>
> >> >> >> You also need to "delete" this property at the board level for
> >> >> >> any board that has the external PHY at address <1>. Otherwise
> >> >> >> they will stop working. This is due to the internal and
> >external
> >> >> >> PHYs having the same path and node name in the device tree, so
> >> >> >> they are effectively the same node.
> >> >> >>
> >> >> >> ChenYu
> >> >> >>
> >> >> >
> >> >> > They have not the same name, ext_rgmii_phy vs int_mii_phy.
> >> >>
> >> >> That is just the label. The label plays no part in device tree
> >merging. The path
> >> >>
> >> >> /soc/ethernet@1c3/mdio/ethernet-phy@1
> >> >>
> >> >> is the same. You can look under
> >> >>
> >> >> /proc/device-tree/soc/ethernet@1c3/mdio
> >> >>
> >> >> on the OrangePI Plus 2E or any other H3 board that uses an
> >> >> external PHY at address 1.
> >> >>
> >> >> ChenYu
> >> >
> >> > Since we get the phy node by phy-handle and not by path, I think
> >all should be good.
> >> 
> >> You are not getting me. The fact that the two seemingly separate
> >> nodes are merged together means, whatever properties you put in
> >> the internal PHY node, also affect the external PHY node. Once
> >> compiled, they are the SAME node.
> >
> >Hello Rob, florian, mark
> >
> >Adding a delete property on all external ethernet-phy@1 is a bit
> >overkill, and I dont like the idea that nodes are merged.
> 
> This is not exactly up to you that's just how DTC works.
> 
> >What do you think about other possible solutions:
> >- Using integrated-phy@1 for the integrated PHY node name
> 
> That might be okay although you are using now a seemingly non-standard unit 
> name.
> 
> >- Using a fake address like 31 (see patch below)
> 
> You could also drop the address part in the unit name although we'd probably 
> get a DTC warning for that.
> 
> I suspect both of your solutions and what I mentioned above will be producing 
> DTC warnings to some extent... Rob what do you think?
> 

I think I found an easier solution, putting phy-is-integrated on board DT nodes 
only.
I will send an updated serie.

Regards


Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet

2017-08-16 Thread Jason Wang



On 2017年08月16日 12:07, Jason Wang wrote:



On 2017年08月16日 11:59, Michael S. Tsirkin wrote:

On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:


On 2017年08月16日 11:55, Michael S. Tsirkin wrote:

On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:

On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
We use tun_alloc_skb() which calls sock_alloc_send_pskb() to 
allocate

skb in the past. This socket based method is not suitable for high
speed userspace like virtualization which usually:

- ignore sk_sndbuf (INT_MAX) and expect to receive the packet as 
fast as

possible
- don't want to be block at sendmsg()

To eliminate the above overheads, this patch tries to use 
build_skb()

for small packet. We will do this only when the following conditions
are all met:

- TAP instead of TUN
- sk_sndbuf is INT_MAX
- caller don't want to be blocked
- zerocopy is not used
- packet size is smaller enough to use build_skb()

Pktgen from guest to host shows ~11% improvement for rx pps of tap:

Before: ~1.70Mpps
After : ~1.88Mpps

What's more important, this makes it possible to implement XDP 
for tap

before creating skbs.

Well well well.

You do realize that tun_build_skb() is not thread safe ?

The issue is alloc frag, isn't it?
I guess for now we can limit this to XDP mode only, and
just allocate full pages in that mode.


Limit this to XDP mode only does not prevent user from sending 
packets to

same queue in parallel I think?

Thanks

Yes but then you can just drop the page frag allocator since
XDP is assumed not to care about truesize for most packets.



Ok, let me do some test to see the numbers between the two methods first.

Thanks


It looks like full page allocation just produce too much stress on the 
page allocator.


I get 1.58Mpps (full page) vs 1.95Mpps (page frag) with the patches 
attached.


Since non-XDP case can also benefit from build_skb(), I tend to use 
spinlock instead of full page in this case.


Thanks
>From 0b9d930e8192466a9c4b85d136193f9c5f01d96a Mon Sep 17 00:00:00 2001
From: Jason Wang 
Date: Wed, 16 Aug 2017 13:48:11 +0800
Subject: [PATCH] tun: thread safe tun_build_skb()

Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 33 ++---
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5892284..c72c2ea 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1247,6 +1247,8 @@ static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
 static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 			  int len, int noblock, bool zerocopy)
 {
+	struct bpf_prog *xdp_prog;
+
 	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
 		return false;
 
@@ -1263,7 +1265,11 @@ static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
 	SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
 		return false;
 
-	return true;
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(tun->xdp_prog);
+	rcu_read_unlock();
+
+	return xdp_prog;
 }
 
 static struct sk_buff *tun_build_skb(struct tun_struct *tun,
@@ -1272,7 +1278,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
  struct virtio_net_hdr *hdr,
  int len, int *generic_xdp)
 {
-	struct page_frag *alloc_frag = &tfile->alloc_frag;
+	struct page *page = alloc_page(GFP_KERNEL);
 	struct sk_buff *skb;
 	struct bpf_prog *xdp_prog;
 	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
@@ -1283,15 +1289,15 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	bool xdp_xmit = false;
 	int err;
 
-	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+	if (unlikely(!page))
 		return ERR_PTR(-ENOMEM);
 
-	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	copied = copy_page_from_iter(alloc_frag->page,
- alloc_frag->offset + TUN_RX_PAD,
- len, from);
-	if (copied != len)
+	buf = (char *)page_address(page);
+	copied = copy_page_from_iter(page, TUN_RX_PAD, len, from);
+	if (copied != len) {
+		put_page(page);
 		return ERR_PTR(-EFAULT);
+	}
 
 	if (hdr->gso_type)
 		*generic_xdp = 1;
@@ -1313,11 +1319,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		switch (act) {
 		case XDP_REDIRECT:
-			get_page(alloc_frag->page);
-			alloc_frag->offset += buflen;
 			err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
 			if (err)
-goto err_redirect;
+goto err_xdp;
 			return NULL;
 		case XDP_TX:
 			xdp_xmit = true;
@@ -1339,13 +1343,13 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	skb = build_skb(buf, buflen);
 	if (!skb) {
 		rcu_read_unlock();
+		put_page(page);
 		return ERR_PTR(-ENOMEM);
 	}
 
 	skb_reserve(skb, TUN_RX_PAD - delta);
 	skb_put(skb, len + delta);
-	get_page(alloc_frag->page);
-	alloc_frag->offset += buflen;
+	get_page(page);
 
 	if (xdp_xmit) {
 		skb->dev = tun->dev;
@@ -1358,9 +1362,8 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 	return sk

Re: [PATCH v3] openvswitch: enable NSH support

2017-08-16 Thread Jiri Benc
Please always CC reviewers of the previous versions, thanks.

On Wed, 16 Aug 2017 13:35:30 +0800, Yi Yang wrote:
> v2->v3
>  - Change OVS_KEY_ATTR_NSH to nested key to handle
>length-fixed attributes and length-variable
>attriubte more flexibly.
>  - Remove struct ovs_action_push_nsh completely
>  - Add code to handle nested attribute for SET_MASKED
>  - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
>to transfer NSH header data.
>  - Fix comments and coding style issues by Jiri and Eric

Thanks! I like this version. I think we're almost there. A few more
comments below.

> +struct push_nsh_para {
> + __be16 ver_flags_len;
> + u8 md_type;
> + u8 next_proto;
> + __be32 path_hdr;
> + u8 metadata[NSH_M_TYPE2_MAX_LEN-8];
> +};

Please get rid of this struct. It's a copy of struct nsh_hdr with some
space added to the bottom.

One of the options (though maybe not the best one, feel free to come up
with something better) is to change struct nsh_md1_ctx to:

struct nsh_md1_ctx {
__be32 context[];
};

and change struct push_nsh_para:

struct push_nsh_para {
struct nsh_hdr hdr;
u8 metadata[NSH_M_TYPE2_MAX_LEN-8];
};

Another option (a better one, though a bit more work) is to get rid of
push_nsh_para completely and just pass a properly allocated nsh_hdr
around. Introduce macros and/or functions to help with the allocation.

> +static inline struct nsh_md1_ctx *nsh_md1_ctx(struct nsh_hdr *nsh)
> +{
> + return &nsh->md1;
> +}
> +
> +static inline struct nsh_md2_tlv *nsh_md2_ctx(struct nsh_hdr *nsh)
> +{
> + return nsh->md2;
> +}

These are unused, please remove them.

> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
[...]
> +#define NSH_MD1_CONTEXT_SIZE 4

Please move this to nsh.h and use it there, too, instead of the open
coded 4.

> +static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
> + const struct push_nsh_para *pnp)
> +{
> + struct nsh_hdr *nsh;
> + size_t length = ((ntohs(pnp->ver_flags_len) & NSH_LEN_MASK)
> +  >> NSH_LEN_SHIFT) << 2;

Once push_nsh_para is removed/changed, this can be changed to a call to
nsh_hdr_len.

> + flags = (ntohs(nsh->ver_flags_len) & NSH_FLAGS_MASK) >>
> + NSH_FLAGS_SHIFT;

nsh_get_flags

> + case OVS_KEY_ATTR_NSH: {
> + struct ovs_key_nsh nsh;
> + struct ovs_key_nsh nsh_mask;
> + size_t size = nla_len(a) / 2;
> + struct nlattr attr[1 + DIV_ROUND_UP(sizeof(struct ovs_key_ipv6)
> + , sizeof(struct nlattr))];
> + struct nlattr mask[1 + DIV_ROUND_UP(sizeof(struct ovs_key_ipv6)
> + , sizeof(struct nlattr))];
> +
> + attr->nla_type = nla_type(a);
> + mask->nla_type = attr->nla_type;
> + attr->nla_len = NLA_HDRLEN + size;
> + mask->nla_len = attr->nla_len;
> + memcpy(attr + 1, (char *)(a + 1), size);
> + memcpy(mask + 1, (char *)(a + 1) + size, size);

This is too hacky. Please find a better way to handle this.

One option is to create a struct with struct nlattr as the first member
followed by a char buffer. Still not nice but at least it's clear
what's the intent.

> +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> + struct nsh_hdr *nsh = (struct nsh_hdr *)skb_network_header(skb);
> + u8 version, length;
> + u32 path_hdr;
> + int i;
> +
> + memset(&key->nsh, 0, sizeof(struct ovs_key_nsh));
> + version = nsh_get_ver(nsh);
> + length = nsh_get_len(nsh);
> +
> + key->nsh.flags = nsh_get_flags(nsh);
> + key->nsh.mdtype = nsh->md_type;
> + key->nsh.np = nsh->next_proto;
> + path_hdr = ntohl(nsh->path_hdr);

The path_hdr variable is unused.

> + key->nsh.path_hdr = nsh->path_hdr;
> + switch (key->nsh.mdtype) {
> + case NSH_M_TYPE1:
> + if ((length << 2) != NSH_M_TYPE1_LEN)

Why length << 2?

> + return -EINVAL;
> +
> + for (i = 0; i < 4; i++)

NSH_MD1_CONTEXT_SIZE

> + key->nsh.context[i] = nsh->md1.context[i];
> +
> + break;

Will go through the rest later. Feel free to send a new version
meanwhile.

Thanks,

 Jiri


Re: [PATCH net] datagram: When peeking datagrams with offset < 0 don't skip empty skbs

2017-08-16 Thread Paolo Abeni
On Tue, 2017-08-15 at 13:00 -0400, Willem de Bruijn wrote:
> > There is another difference between reading sk_peek_offset in the
> > caller or in __skb_try_recv_from_queue. The latter is called repeatedly
> > when it returns NULL. Each call can modify *off. I believe that it needs
> > to restart with _off at sk->sk_peek_off each time, as it restarts from the
> > head of the queue each time.
> 
> I made a mistake here. *off is not updated when returning NULL.
> 
> In that case, it is better to read sk_peek_offset once, than to read
> it each time __skb_try_recv_from_queue is entered.

If I read the above correctly, you are arguining in favor of the
addittional flag version, right?

Regarding the MSG flag exaustion, there are a bunch of flags defined
but apparently unused (MSG_FIN, MSG_SYN, MSG_RST) since long time (if
I'm not too low on coffee).

We can shadow one of them (or ev. drop the above defines, if really
unused).

I think that the MSG_PEEK_OFF should be explicitly cleared in
sk_peek_offset() when the sk_peek_off is negative, to avoid beeing
fooled by stray bits passed by the user, like in the following:
---
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 8b13db5163cc..3b7f53b9cc08 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -286,6 +286,7 @@ struct ucred {
 #define MSG_SENDPAGE_NOTLAST 0x2 /* sendpage() internal : not the last 
page */
 #define MSG_BATCH  0x4 /* sendmmsg(): more messages coming */
 #define MSG_EOF MSG_FIN
+#define MSG_PEEK_OFF   MSG_FIN /* Peeking with offset for datagram sockets */
 
 #define MSG_FASTOPEN   0x2000  /* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x4000/* Set close_on_exec for file
diff --git a/include/net/sock.h b/include/net/sock.h
index 7c0632c7e870..452f9aac2e6a 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -504,12 +504,17 @@ enum sk_pacing {
 
 int sk_set_peek_off(struct sock *sk, int val);
 
-static inline int sk_peek_offset(struct sock *sk, int flags)
+static inline int sk_peek_offset(struct sock *sk, int *flags)
 {
-   if (unlikely(flags & MSG_PEEK)) {
+   if (unlikely(*flags & MSG_PEEK)) {
s32 off = READ_ONCE(sk->sk_peek_off);
-   if (off >= 0)
+   if (off >= 0) {
+   *flags |= MSG_PEEK_OFF;
return off;
+   }
+
+   /* clear any stray bits in the user-provided bitmask */
+   *flags &= ~MSG_PEEK_OFF;
}
 
return 0;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index ee5647bd91b3..91e1d014d64c 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -175,8 +175,8 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
*last = queue->prev;
skb_queue_walk(queue, skb) {
if (flags & MSG_PEEK) {
-   if (_off >= skb->len && (skb->len || _off ||
-skb->peeked)) {
+   if (flags & MSG_PEEK_OFF && _off >= skb->len &&
+   (skb->len || _off || skb->peeked)) {
_off -= skb->len;
continue;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index ac2a404c73eb..0c123540b02f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2408,9 +2408,7 @@ EXPORT_SYMBOL(__sk_mem_reclaim);
 
 int sk_set_peek_off(struct sock *sk, int val)
 {
-   if (val < 0)
-   return -EINVAL;
-
+   /* a negative value will disable peeking with offset */
sk->sk_peek_off = val;
return 0;
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a7c804f73990..7e1bcd3500f4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1574,7 +1574,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, 
size_t len, int noblock,
return ip_recv_error(sk, msg, len, addr_len);
 
 try_again:
-   peeking = off = sk_peek_offset(sk, flags);
+   peeking = off = sk_peek_offset(sk, &flags);
skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 578142b7ca3e..86fb4ff8934c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -362,7 +362,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, 
size_t len,
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
 
 try_again:
-   peeking = off = sk_peek_offset(sk, flags);
+   peeking = off = sk_peek_offset(sk, &flags);
skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
if (!skb)
return err;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 7b52a380d710..06c740939d9d 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -2124,7 +2124,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct 
msghdr *msg,
do {
mutex_lock(&u->iolo

Re: [patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread Jiri Pirko
Wed, Aug 16, 2017 at 10:55:56AM CEST, christian.koe...@amd.com wrote:
>Am 16.08.2017 um 10:39 schrieb Jiri Pirko:
>> Wed, Aug 16, 2017 at 10:31:35AM CEST, christian.koe...@amd.com wrote:
>> > Am 16.08.2017 um 10:16 schrieb Jiri Pirko:
>> > > Wed, Aug 16, 2017 at 09:49:07AM CEST, christian.koe...@amd.com wrote:
>> > > > Am 16.08.2017 um 04:12 schrieb Chris Mi:
>> > > > > Using current TC code, it is very slow to insert a lot of rules.
>> > > > > 
>> > > > > In order to improve the rules update rate in TC,
>> > > > > we introduced the following two changes:
>> > > > >1) changed cls_flower to use IDR to manage the filters.
>> > > > >2) changed all act_xxx modules to use IDR instead of
>> > > > >   a small hash table
>> > > > > 
>> > > > > But IDR has a limitation that it uses int. TC handle uses u32.
>> > > > > To make sure there is no regression, we also changed IDR to use
>> > > > > unsigned long. All clients of IDR are changed to use new IDR API.
>> > > > WOW, wait a second. The idr change is touching a lot of drivers and to 
>> > > > be
>> > > > honest doesn't looks correct at all.
>> > > > 
>> > > > Just look at the first chunk of your modification:
>> > > > > @@ -998,8 +999,9 @@ int bsg_register_queue(struct request_queue *q, 
>> > > > > struct device *parent,
>> > > > >  mutex_lock(&bsg_mutex);
>> > > > > -ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, 
>> > > > > GFP_KERNEL);
>> > > > > -if (ret < 0) {
>> > > > > +ret = idr_alloc(&bsg_minor_idr, bcd, &idr_index, 0, 
>> > > > > BSG_MAX_DEVS,
>> > > > > +GFP_KERNEL);
>> > > > > +if (ret) {
>> > > > >  if (ret == -ENOSPC) {
>> > > > >  printk(KERN_ERR "bsg: too many bsg devices\n");
>> > > > >  ret = -EINVAL;
>> > > > The condition "if (ret)" will now always be true after the first 
>> > > > allocation
>> > > > and so we always run into the error handling after that.
>> > > On success, idr_alloc returns 0.
>> > Ah, I see. You change the idr_alloc to return the resulting index as 
>> > separate
>> > parameter.
>> > 
>> > You should explicit note that in the commit message, cause that is 
>> > something
>> > easily overlooked.
>> > 
>> > In general I strongly suggest to add a separate interface for allocating
>> > unsigned long handles, use that for the while being and then move the
>> > existing drivers over bit by bit.
>> > 
>> > A single patch which touches so many different driver is practically
>> > impossible to review consequently.
>> Understood. I think it is good to avoid having some "idr_alloc2". That
>> is why I suggested to do this in one go, to avoid "idr_alloc2" and then
>> patch to rename "idr_alloc2" to "idr_alloc" once nobody uses the original
>> "idr_alloc". In fact, if you do it driver, by driver, the review burden
>> would be the same, probably even bigger, you'll just have 100+ patches.
>> Why would it help?
>
>Because it would give each maintainer only the part of the change he is
>interested in.
>
>Current status of this change is that you send a mail with nearly 300 people
>on CC.

That was a mistake to cc all.


>
>Do you really expect to get an reviewed-by or acked-by on this single patch
>from all of them?

I don't. It is an API change, maintainers of the individual drivers are
not expected to review the patches like this.


>
>If yes then it somehow makes sense to send the patch bit by bit, if no then
>it doesn't seem to make to much sense to CC them all individually.
>
>> > > > I've never read the bsg code before, but that's certainly not correct. 
>> > > > And
>> > > > that incorrect pattern repeats over and over again in this code.
>> > > > 
>> > > > Apart from that why the heck do you want to allocate more than 1<<31 
>> > > > handles?
>> > > tc action indexes for example. That is part of this patchset.
>> > Well, let me refine the question: Why does tc action indexes need more than
>> > 31 bits? From an outside view that looks like pure overkill.
>> That is current state, uapi. We have to live with it.
>
>Is the range to allocate from part of the uapi or what is the issue here?

Yes.

>
>If the issue is that userspace can specify the handle then I suggest that you
>use the radix tree directly instead of the idr wrapper around it.

But why? idr is exactly the tool we need. Only signed int does not suit
us. In fact, it does not make sense idr is using signed int when it
uses radix tree with unsigned long under the hood.



>
>Regards,
>Christian.


Re: [patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread Christian König

Am 16.08.2017 um 11:31 schrieb Jiri Pirko:

[SNIP]
I don't. It is an API change, maintainers of the individual drivers are
not expected to review the patches like this.


Yeah, completely agree.


If yes then it somehow makes sense to send the patch bit by bit, if no then
it doesn't seem to make to much sense to CC them all individually.


I've never read the bsg code before, but that's certainly not correct. And
that incorrect pattern repeats over and over again in this code.

Apart from that why the heck do you want to allocate more than 1<<31 handles?

tc action indexes for example. That is part of this patchset.

Well, let me refine the question: Why does tc action indexes need more than
31 bits? From an outside view that looks like pure overkill.

That is current state, uapi. We have to live with it.

Is the range to allocate from part of the uapi or what is the issue here?

Yes.


A bit strange uapi design, but ok in this case that change actually 
makes sense.



If the issue is that userspace can specify the handle then I suggest that you
use the radix tree directly instead of the idr wrapper around it.

But why? idr is exactly the tool we need. Only signed int does not suit
us. In fact, it does not make sense that idr uses a signed int when it
uses a radix tree with unsigned long under the hood.


Well it always depends on what you do and how to use it.

In amdgpu for example for have very very short lived objects and only 
few of them are active at the same time.


The solution was not to use and idr, but rather 64bit identifiers and a 
ring buffer with the last 128 entries.


But in your case changing the idr calling convention actually makes 
sense (at least from the ten mile high view), feel free to add an 
Acked-by: Christian König  on it.


Regards,
Christian.


Re: [PATCH] net/mlx4: fix spelling mistake: "availible" -> "available"

2017-08-16 Thread Yuval Shaia
On Wed, Aug 16, 2017 at 10:05:11AM +0100, Colin King wrote:
> From: Colin Ian King 
> 
> Trivial fix to spelling mistakes in the mlx4 driver
> 
> Signed-off-by: Colin Ian King 
> ---
>  drivers/net/ethernet/mellanox/mlx4/cmd.c| 16 
>  drivers/net/ethernet/mellanox/mlx4/fw_qos.c |  6 +++---
>  drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 10 +-
>  3 files changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
> b/drivers/net/ethernet/mellanox/mlx4/cmd.c
> index 674773b28b2e..6309389b09a7 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
> @@ -1958,19 +1958,19 @@ static void mlx4_allocate_port_vpps(struct mlx4_dev 
> *dev, int port)
>   int i;
>   int err;
>   int num_vfs;
> - u16 availible_vpp;
> + u16 available_vpp;
>   u8 vpp_param[MLX4_NUM_UP];
>   struct mlx4_qos_manager *port_qos;
>   struct mlx4_priv *priv = mlx4_priv(dev);
>  
> - err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param);
> + err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
>   if (err) {
> - mlx4_info(dev, "Failed query availible VPPs\n");
> + mlx4_info(dev, "Failed query available VPPs\n");
>   return;
>   }
>  
>   port_qos = &priv->mfunc.master.qos_ctl[port];
> - num_vfs = (availible_vpp /
> + num_vfs = (available_vpp /
>  bitmap_weight(port_qos->priority_bm, MLX4_NUM_UP));
>  
>   for (i = 0; i < MLX4_NUM_UP; i++) {
> @@ -1985,14 +1985,14 @@ static void mlx4_allocate_port_vpps(struct mlx4_dev 
> *dev, int port)
>   }
>  
>   /* Query actual allocated VPP, just to make sure */
> - err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param);
> + err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
>   if (err) {
> - mlx4_info(dev, "Failed query availible VPPs\n");
> + mlx4_info(dev, "Failed query available VPPs\n");
>   return;
>   }
>  
>   port_qos->num_of_qos_vfs = num_vfs;
> - mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, availible_vpp);
> + mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, available_vpp);
>  
>   for (i = 0; i < MLX4_NUM_UP; i++)
>   mlx4_dbg(dev, "Port %d UP %d Allocated %d VPPs\n", port, i,
> @@ -2891,7 +2891,7 @@ static int mlx4_set_vport_qos(struct mlx4_priv *priv, 
> int slave, int port,
>   memset(vpp_qos, 0, sizeof(struct mlx4_vport_qos_param) * MLX4_NUM_UP);
>  
>   if (slave > port_qos->num_of_qos_vfs) {
> - mlx4_info(dev, "No availible VPP resources for this VF\n");
> + mlx4_info(dev, "No available VPP resources for this VF\n");
>   return -EINVAL;
>   }
>  
> diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c 
> b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
> index 8f2fde0487c4..3a09d7122d3b 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
> @@ -65,7 +65,7 @@ struct mlx4_set_port_scheduler_context {
>  
>  /* Granular Qos (per VF) section */
>  struct mlx4_alloc_vpp_param {
> - __be32 availible_vpp;
> + __be32 available_vpp;
>   __be32 vpp_p_up[MLX4_NUM_UP];
>  };
>  
> @@ -157,7 +157,7 @@ int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 
> port, u8 *tc_tx_bw,
>  EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER);
>  
>  int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
> -   u16 *availible_vpp, u8 *vpp_p_up)
> +   u16 *available_vpp, u8 *vpp_p_up)
>  {
>   int i;
>   int err;
> @@ -179,7 +179,7 @@ int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
>   goto out;
>  
>   /* Total number of supported VPPs */
> - *availible_vpp = (u16)be32_to_cpu(out_param->availible_vpp);
> + *available_vpp = (u16)be32_to_cpu(out_param->available_vpp);
>  
>   for (i = 0; i < MLX4_NUM_UP; i++)
>   vpp_p_up[i] = (u8)be32_to_cpu(out_param->vpp_p_up[i]);
> diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h 
> b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
> index ac1f331878e6..582997577a04 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
> +++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
> @@ -84,23 +84,23 @@ int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, 
> u8 *prio2tc);
>  int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
>   u8 *pg, u16 *ratelimit);
>  /**
> - * mlx4_ALLOCATE_VPP_get - Query port VPP availible resources and allocation.
> - * Before distribution of VPPs to priorities, only availible_vpp is returned.
> + * mlx4_ALLOCATE_VPP_get - Query port VPP available resources and allocation.
> + * Before distribution of VPPs to priorities, only available_vpp is returned.
>   * After initialization it returns the distribution of VPPs among priorit

[PATCH net 2/2] net: ixgbe: Use new IXGBE_FLAG2_ROOT_NO_RELAXED_ORDERING flag

2017-08-16 Thread Ding Tianhong
The ixgbe driver use the compile check to determine if it can
send TLPs to Root Port with the Relaxed Ordering Attribute set,
this is too inconvenient, now the new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING
has been added to the kernel and we could check the bit4 in the PCIe
Device Control register to determine whether we should use the Relaxed
Ordering Attributes or not, so we add a new flag which called
IXGBE_FLAG2_ROOT_NO_RELAXED_ORDERING to the ixgbe driver, it will
be set if the Root Port couldn't deal the upstream TLPs with Relaxed
Ordering Attribute, then the driver could know what to do next.

Signed-off-by: Ding Tianhong 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h|  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c  | 37 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 32 +++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 17 
 4 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index dd55787..50e0553 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -621,6 +621,7 @@ struct ixgbe_adapter {
 #define IXGBE_FLAG2_EEE_CAPABLEBIT(14)
 #define IXGBE_FLAG2_EEE_ENABLEDBIT(15)
 #define IXGBE_FLAG2_RX_LEGACY  BIT(16)
+#define IXGBE_FLAG2_ROOT_NO_RELAXED_ORDERING   BIT(17)
 
/* Tx fast path data */
int num_tx_queues;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
index 523f9d0..0727a30 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c
@@ -175,31 +175,30 @@ static s32 ixgbe_init_phy_ops_82598(struct ixgbe_hw *hw)
  **/
 static s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw)
 {
-#ifndef CONFIG_SPARC
-   u32 regval;
-   u32 i;
-#endif
+   u32 regval, i;
s32 ret_val;
+   struct ixgbe_adapter *adapter = hw->back;
 
ret_val = ixgbe_start_hw_generic(hw);
 
-#ifndef CONFIG_SPARC
-   /* Disable relaxed ordering */
-   for (i = 0; ((i < hw->mac.max_tx_queues) &&
-(i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-   regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
-   regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-   IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
-   }
+   if (adapter->flags2 & IXGBE_FLAG2_ROOT_NO_RELAXED_ORDERING) {
+   /* Disable relaxed ordering */
+   for (i = 0; ((i < hw->mac.max_tx_queues) &&
+(i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
+   regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i));
+   regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
+   IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval);
+   }
 
-   for (i = 0; ((i < hw->mac.max_rx_queues) &&
-(i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
-   regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
-   regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
-   IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
-   IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
+   for (i = 0; ((i < hw->mac.max_rx_queues) &&
+(i < IXGBE_DCA_MAX_QUEUES_82598)); i++) {
+   regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
+   regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
+   IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
+   IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
+   }
}
-#endif
+
if (ret_val)
return ret_val;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index d4933d2..2473c0b 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -342,6 +342,7 @@ s32 ixgbe_start_hw_generic(struct ixgbe_hw *hw)
 s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
 {
u32 i;
+   struct ixgbe_adapter *adapter = hw->back;
 
/* Clear the rate limiters */
for (i = 0; i < hw->mac.max_tx_queues; i++) {
@@ -350,25 +351,26 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
}
IXGBE_WRITE_FLUSH(hw);
 
-#ifndef CONFIG_SPARC
-   /* Disable relaxed ordering */
-   for (i = 0; i < hw->mac.max_tx_queues; i++) {
-   u32 regval;
+   if (adapter->flags2 & IXGBE_FLAG2_ROOT_NO_RELAXED_ORDERING) {
+   /* Disable relaxed ordering */
+   for (i = 0; i < hw->mac.max_tx_queues; i++) {
+   u32 regval;
 
-   regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
-   regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
-   IXGBE_WRITE_REG(hw, IXGBE_DCA

[PATCH net 0/2] net: ixgbe: Use new flag to disable Relaxed Ordering

2017-08-16 Thread Ding Tianhong
The new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING has been added
to indicate that Relaxed Ordering Attributes (RO) should not
be used for Transaction Layer Packets (TLP) targeted toward
these affected Root Port, it will clear the bit4 in the PCIe
Device Control register, so the PCIe device drivers could
query PCIe configuration space to determine if it can send
TLPs to Root Port with the Relaxed Ordering Attributes set.

The ixgbe driver could use this flag to determine if it can
send TLPs to Root Port with the Relaxed Ordering Attributes set.

Ding Tianhong (2):
  Revert commit 1a8b6d76dc5b ("net:add one common config...")
  net: ixgbe: Use new IXGBE_FLAG2_ROOT_NO_RELAXED_ORDERING flag

 arch/Kconfig|  3 --
 arch/sparc/Kconfig  |  1 -
 drivers/net/ethernet/intel/ixgbe/ixgbe.h|  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c  | 37 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 32 +++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   | 17 
 6 files changed, 53 insertions(+), 38 deletions(-)

-- 
1.8.3.1




[PATCH net 1/2] Revert commit 1a8b6d76dc5b ("net:add one common config...")

2017-08-16 Thread Ding Tianhong
The new flag PCI_DEV_FLAGS_NO_RELAXED_ORDERING has been added
to indicate that Relaxed Ordering Attributes (RO) should not
be used for Transaction Layer Packets (TLP) targeted toward
these affected Root Port, it will clear the bit4 in the PCIe
Device Control register, so the PCIe device drivers could
query PCIe configuration space to determine if it can send
TLPs to Root Port with the Relaxed Ordering Attributes set.

With this new flag  we don't need the config ARCH_WANT_RELAX_ORDER
to control the Relaxed Ordering Attributes for the ixgbe drivers
just like the commit 1a8b6d76dc5b ("net:add one common config...") did,
so revert this commit.

Signed-off-by: Ding Tianhong 
---
 arch/Kconfig| 3 ---
 arch/sparc/Kconfig  | 1 -
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.c | 2 +-
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 21d0089..00cfc63 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -928,9 +928,6 @@ config STRICT_MODULE_RWX
  and non-text memory will be made non-executable. This provides
  protection against certain security exploits (e.g. writing to text)
 
-config ARCH_WANT_RELAX_ORDER
-   bool
-
 config REFCOUNT_FULL
bool "Perform full reference count validation at the expense of speed"
help
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index a4a6261..987a575 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -44,7 +44,6 @@ config SPARC
select ARCH_HAS_SG_CHAIN
select CPU_NO_EFFICIENT_FFS
select LOCKDEP_SMALL if LOCKDEP
-   select ARCH_WANT_RELAX_ORDER
 
 config SPARC32
def_bool !64BIT
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
index 4e35e70..d4933d2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c
@@ -350,7 +350,7 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw)
}
IXGBE_WRITE_FLUSH(hw);
 
-#ifndef CONFIG_ARCH_WANT_RELAX_ORDER
+#ifndef CONFIG_SPARC
/* Disable relaxed ordering */
for (i = 0; i < hw->mac.max_tx_queues; i++) {
u32 regval;
-- 
1.8.3.1




Re: [PATCH v3] openvswitch: enable NSH support

2017-08-16 Thread Yang, Yi
On Wed, Aug 16, 2017 at 11:19:21AM +0200, Jiri Benc wrote:
> Please always CC reviewers of the previous versions, thanks.

Jiri, thank you for quick review. Sorry, I made a mistake on
sending and missed all the CCs, will indeed do this in next version.

> > +   __be16 ver_flags_len;
> > +   u8 md_type;
> > +   u8 next_proto;
> > +   __be32 path_hdr;
> > +   u8 metadata[NSH_M_TYPE2_MAX_LEN-8];
> > +};
> 
> Please get rid of this struct. It's a copy of struct nsh_hdr with some
> space added to the bottom.
> 
> One of the options (though maybe not the best one, feel free to come up
> with something better) is to change struct nsh_md1_ctx to:
> 
> struct nsh_md1_ctx {
>   __be32 context[];
> };
> 
> and change struct push_nsh_para:
> 
> struct push_nsh_para {
>   struct nsh_hdr hdr;
>   u8 metadata[NSH_M_TYPE2_MAX_LEN-8];
> };
> 
> Another option (a better one, though a bit more work) is to get rid of
> push_nsh_para completely and just pass a properly allocated nsh_hdr
> around. Introduce macros and/or functions to help with the allocation.
>

Yeah, good point, we can use dynamic allocation and struct nsh_hdr * to
handle this.

> > +static inline struct nsh_md1_ctx *nsh_md1_ctx(struct nsh_hdr *nsh)
> > +{
> > +   return &nsh->md1;
> > +}
> > +
> > +static inline struct nsh_md2_tlv *nsh_md2_ctx(struct nsh_hdr *nsh)
> > +{
> > +   return nsh->md2;
> > +}
> 
> These are unused, please remove them.

Will remove them, userspace does use them.

> 
> > --- a/include/uapi/linux/openvswitch.h
> > +++ b/include/uapi/linux/openvswitch.h
> [...]
> > +#define NSH_MD1_CONTEXT_SIZE 4
> 
> Please move this to nsh.h and use it there, too, instead of the open
> coded 4.

ovs code is very ugly, it will convert array[4] in
datapath/linux/compat/include/linux/openvswitch.h to other struct, I
have to change context[4] to such format :-), we can use 4 here for
Linux kernel.

> 
> > +static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key,
> > +   const struct push_nsh_para *pnp)
> > +{
> > +   struct nsh_hdr *nsh;
> > +   size_t length = ((ntohs(pnp->ver_flags_len) & NSH_LEN_MASK)
> > +>> NSH_LEN_SHIFT) << 2;
> 
> Once push_nsh_para is removed/changed, this can be changed to a call to
> nsh_hdr_len.

Yes, will do that way.

> 
> > +   flags = (ntohs(nsh->ver_flags_len) & NSH_FLAGS_MASK) >>
> > +   NSH_FLAGS_SHIFT;
> 
> nsh_get_flags

Missed this one :-)

> 
> > +   case OVS_KEY_ATTR_NSH: {
> > +   struct ovs_key_nsh nsh;
> > +   struct ovs_key_nsh nsh_mask;
> > +   size_t size = nla_len(a) / 2;
> > +   struct nlattr attr[1 + DIV_ROUND_UP(sizeof(struct ovs_key_ipv6)
> > +   , sizeof(struct nlattr))];
> > +   struct nlattr mask[1 + DIV_ROUND_UP(sizeof(struct ovs_key_ipv6)
> > +   , sizeof(struct nlattr))];
> > +
> > +   attr->nla_type = nla_type(a);
> > +   mask->nla_type = attr->nla_type;
> > +   attr->nla_len = NLA_HDRLEN + size;
> > +   mask->nla_len = attr->nla_len;
> > +   memcpy(attr + 1, (char *)(a + 1), size);
> > +   memcpy(mask + 1, (char *)(a + 1) + size, size);
> 
> This is too hacky. Please find a better way to handle this.
> 
> One option is to create a struct with struct nlattr as the first member
> followed by a char buffer. Still not nice but at least it's clear
> what's the intent.

The issue is nested attributes only can use this way, nested attribute
for SET_MASKED is very special, we have to handle it specially.

> 
> > +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key)
> > +{
> > +   struct nsh_hdr *nsh = (struct nsh_hdr *)skb_network_header(skb);
> > +   u8 version, length;
> > +   u32 path_hdr;
> > +   int i;
> > +
> > +   memset(&key->nsh, 0, sizeof(struct ovs_key_nsh));
> > +   version = nsh_get_ver(nsh);
> > +   length = nsh_get_len(nsh);
> > +
> > +   key->nsh.flags = nsh_get_flags(nsh);
> > +   key->nsh.mdtype = nsh->md_type;
> > +   key->nsh.np = nsh->next_proto;
> > +   path_hdr = ntohl(nsh->path_hdr);
> 
> The path_hdr variable is unused.

Will remove it.

> 
> > +   key->nsh.path_hdr = nsh->path_hdr;
> > +   switch (key->nsh.mdtype) {
> > +   case NSH_M_TYPE1:
> > +   if ((length << 2) != NSH_M_TYPE1_LEN)
> 
> Why length << 2?

len in NSH header is number of 4 octets, so need to multiply 4.

> 
> > +   return -EINVAL;
> > +
> > +   for (i = 0; i < 4; i++)
> 
> NSH_MD1_CONTEXT_SIZE

Ok

> 
> > +   key->nsh.context[i] = nsh->md1.context[i];
> > +
> > +   break;
> 
> Will go through the rest later. Feel free to send a new version
> meanwhile.
> 
> Thanks,
> 
>  Jiri

Thank you so much for your comments, will work out new version ASAP.


Re: [PATCH] net/i40e: use cpumask_copy() for assigning cpumask

2017-08-16 Thread Stefano Brivio
Hi Juergen,

On Sat, 12 Aug 2017 18:09:46 +0200
Juergen Gross  wrote:

> Using direct assignment for a cpumask is wrong, cpumask_copy() should
> be used instead.

Perhaps a Fixes: tag might be helpful here, such as:

Fixes: 96db776a3682 ("i40e/i40evf: fix interrupt affinity bug")

as I reported in my other (late) patch:

http://marc.info/?l=linux-netdev&m=150279303222066&w=2

> Cc: sta...@vger.kernel.org

and maybe also an indication about which versions this applies to, such as

Cc: sta...@vger.kernel.org # 4.10+

In general, feel free to copy from my commit message.

These comments also apply to:

[PATCH] net/i40evf: use cpumask_copy() for assigning cpumask

Thanks,


--
Stefano


Re: [PATCH] net/mlx4: fix spelling mistake: "availible" -> "available"

2017-08-16 Thread Leon Romanovsky
On Wed, Aug 16, 2017 at 10:05:11AM +0100, Colin King wrote:
> From: Colin Ian King 
>
> Trivial fix to spelling mistakes in the mlx4 driver
>
> Signed-off-by: Colin Ian King 
> ---
>  drivers/net/ethernet/mellanox/mlx4/cmd.c| 16 
>  drivers/net/ethernet/mellanox/mlx4/fw_qos.c |  6 +++---
>  drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 10 +-
>  3 files changed, 16 insertions(+), 16 deletions(-)
>

Thanks,
Reviewed-by: Leon Romanovsky 


signature.asc
Description: PGP signature


Re: 100% CPU load when generating traffic to destination network that nexthop is not reachable

2017-08-16 Thread Paweł Staszewski

Hi


Patch applied - but no big change - from 0.7Mpps per vlan to 1.2Mpps per 
vlan


previously(without patch) 100% cpu load:

  bwm-ng v0.6.1 (probing every 0.500s), press 'h' for help
  input: /proc/net/dev type: rate
  | iface   Rx TxTotal
== 

 vlan1002:0.00 P/s 1.99 P/s 
1.99 P/s

 vlan1001:0.00 P/s717227.12 P/s 717227.12 P/s
   enp175s0f0:  2713679.25 P/s 0.00 P/s 2713679.25 P/s
 vlan1000:0.00 P/s716145.44 P/s 716145.44 P/s
-- 


total:  2713679.25 P/s   1433374.50 P/s 4147054.00 P/s


With patch (100% cpu load a little better pps performance)

 bwm-ng v0.6.1 (probing every 1.000s), press 'h' for help
  input: /proc/net/dev type: rate
  | iface   Rx TxTotal
==
 vlan1002:0.00 P/s 1.00 P/s 
1.00 P/s

 vlan1001:0.00 P/s   1202161.50 P/s 1202161.50 P/s
   enp175s0f0:  3699864.50 P/s 0.00 P/s 3699864.50 P/s
 vlan1000:0.00 P/s   1196870.38 P/s 1196870.38 P/s
--
total:  3699864.50 P/s   2399033.00 P/s 6098897.50 P/s


perf top attached below:

 1.90% 0.00%  ksoftirqd/39[kernel.vmlinux] [k] run_ksoftirqd
|
 --1.90%--run_ksoftirqd
   |
--1.90%--__softirqentry_text_start
  |
   --1.90%--net_rx_action
 |
--1.90%--mlx5e_napi_poll
|
--1.89%--mlx5e_poll_rx_cq
|
--1.88%--mlx5e_handle_rx_cqe
|
--1.85%--napi_gro_receive
|
--1.85%--netif_receive_skb_internal
|
--1.85%--__netif_receive_skb
|
--1.85%--__netif_receive_skb_core
|
--1.85%--ip_rcv
|
--1.85%--ip_rcv_finish
|
--1.83%--ip_forward
|
--1.82%--ip_forward_finish
|
--1.82%--ip_output
|
--1.82%--ip_finish_output
|
--1.82%--ip_finish_output2
|
--1.79%--neigh_resolve_output
|
--1.77%--neigh_event_send
|
--1.77%--__neigh_event_send
|
--1.74%--_raw_write_lock_bh
|
--1.74%--queued_write_lock
queued_write_lock_slowpath
|
--1.70%--queued_spin_lock_slowpath


1.90% 0.00%  ksoftirqd/34[kernel.vmlinux] [k] 
__softirqentry_text_start

|
---__softirqentry_text_start
   |
--1.90%--net_rx_action
  |
   --1.90%--mlx5e_napi_poll
 |
  --1.89%--mlx5e_poll_rx_cq
|
--1.88%--mlx5e_handle_rx_cqe
   |
--1.86%--napi_gro_receive
  |
--1.85%--netif_receive_skb_internal
|
--1.85%--__netif_receive_skb
|
--1.85%--__netif_receive_skb_core
|
--1.85%--ip_rcv
|
--1.85%--ip_rcv_finish
|
--1.83%--ip_forward
|
--1.82%--ip_forward_finish
|
--1.82%--ip_output
|
--1.82%--ip_finish_output
|
--1.82%--ip_finish_output2
|
--1.79%--neigh_resolve_output
|
--1.77%--neigh_event_send
|
--1.77%--__neigh_event_send
|
--1.74%--_raw_write_lock_bh
queued_write_lock
queued_write_lock_slowpath
|
--1.71%--queued_spin_lock_slowpath

 1.85% 0.00%  ksoftirqd/38[kernel.vmlinux]  [k] 
ip_rcv_finish

|
 --1.85%--ip_rcv_finish
   |
--1.83%--ip_forward
  |
   --1.82%--ip_forward_finish
 |
  --1.82%--ip_output
|
--1.82%--ip_finish_output
|
--1.82%--ip_finish_output2
|
--1.79%--neigh_resolve_output
|
--1.77%--neigh_event_send
|
--1.77%--__neigh_event_send
|
--1.74%--_raw_write_lock_bh
queued_write_lock
queued_write_lock_slowpath
|
--1.71%--queued_spin_lock_slowpath

 1.85% 0.00%  ksoftirqd/22[kernel.vmlinux] [k] ip_rcv
|
 --1.85%--ip_rcv
   |
--1.85%--ip_rcv_finish
  |
   --1.83%--ip_forward
 |
--1.82%--ip_forward_finish
|
--1.82%--ip_output
|
--1.82%--ip_finish_output
|
--1.82%--ip_finish_output2
|
--1.79%--neigh_resolve_output
|
--1.77%--neigh_event_send
|
--1.77%--__ne

Re: [PATCH 1/2] mpls: add handlers

2017-08-16 Thread Amine Kherbouche



On 08/16/2017 07:30 AM, Roopa Prabhu wrote:

On Tue, Aug 15, 2017 at 2:37 AM, David Lamparter  wrote:

[snip]


I think the reverse is the better option, removing the vpls device
information and just going with the route table.  My approach to this
would be to add a new netlink route attribute "RTA_VPLS" which
identifies the vpls device, is stored in the route table, and provides
the device ptr needed here.
(The control word config should also be on the route.)

My reason for thinking this is that the VPLS code needs exactly the same
information as does a normal MPLS route:  it attaches to an incoming
label (decapsulating packets instead of forwarding them), and for TX it
does the same operation of looking up a nexthop (possibly with ECMP
support) and adding a label stack.  The code should, in fact, probably
reuse the TX path.

This also fits both an 1:1 and 1:n model pretty well.  Creating a VPLS
head-end netdevice doesn't even need any config.  It'd just work like:
- ip link add name vpls123 type vpls
- ip -f mpls route add \
1234 \  # incoming label for decap
vpls vpls123 \  # new attr: VPLS device
as 2345 via inet 10.0.0.1 dev eth0  # outgoing label for encap

For a 1:n model, one would simply add multiple routes on the same vpls
device.



this is a nice model too. But, i don't see how vlans and mac based
learning will fit in here.

modeling it same as how vxlan l2 overlay tunnels are done seems like a
great fit.
The vpls driver can learn mac's per pw tunnel labels. And this l2 fdb
table per vpls device can also carry dst information similar to how
vxlan driver does today.



I think this is a good idea too, I'll implement this concept in mpls and 
have a look at the way vxlan is done to be able to support the l2 part 
in vpls driver.


Thanks


Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet

2017-08-16 Thread Eric Dumazet
On Wed, 2017-08-16 at 11:55 +0800, Jason Wang wrote:
> 
> On 2017年08月16日 11:45, Eric Dumazet wrote:
> >
> > You do realize that tun_build_skb() is not thread safe ?
> 
> Ok, I think the issue if skb_page_frag_refill(), need a spinlock 
> probably. Will prepare a patch.

But since tun is used from process context, why don't you use the
per-thread generator (no lock involved)

tcp_sendmsg() uses this for GFP_KERNEL allocations.

Untested patch :

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 
5892284eb8d05b0678d820bad3d0d2c61a879aeb..c38cd840cc0b7fecf182b23976e36f709cacca1f
 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -175,7 +175,6 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
-   struct page_frag alloc_frag;
 };
 
 struct tun_flow_entry {
@@ -578,8 +577,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
}
if (tun)
skb_array_cleanup(&tfile->tx_array);
-   if (tfile->alloc_frag.page)
-   put_page(tfile->alloc_frag.page);
sock_put(&tfile->sk);
}
 }
@@ -1272,7 +1269,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
*tun,
 struct virtio_net_hdr *hdr,
 int len, int *generic_xdp)
 {
-   struct page_frag *alloc_frag = &tfile->alloc_frag;
+   struct page_frag *alloc_frag = &current->task_frag;
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
@@ -2580,8 +2577,6 @@ static int tun_chr_open(struct inode *inode, struct file 
* file)
tfile->sk.sk_write_space = tun_sock_write_space;
tfile->sk.sk_sndbuf = INT_MAX;
 
-   tfile->alloc_frag.page = NULL;
-
file->private_data = tfile;
INIT_LIST_HEAD(&tfile->next);
 






Re: Something hitting my total number of connections to the server

2017-08-16 Thread Eric Dumazet
On Wed, 2017-08-16 at 10:18 +0530, Akshat Kakkar wrote:
> On Mon, Aug 14, 2017 at 2:37 PM, Akshat Kakkar  wrote:
> > I have centos 7.3 (Kernel 3.10) running on a server with 128GB RAM and
> > 2 x 10 Core Xeon Processor.
> > I have hosted a webserver on it and enabled ssh for remote maintenance.
> > Previously it was running on Centos 6.3.
> > After upgrading to CentOS 7.3, occasionally (probably when number of
> > hits are more on the server), I am not able to create new connections
> > (neither on web nor on ssh). Existing connections keeps on running
> > fine.
> >
> > I did packet capturing using tcpdump to understand if its some
> > intermediate network issue.
> > What I found was the server is not replying for new SYN requests.
> >
> > So it's clear that its not at all application issue. Also, there are
> > no logs in applications logs for any connections dropped, if any.
> >
> > I check my firewall rules if there is some rate limiting imposed.
> > There is nothing in there.
> >
> > I check tc, if by mistake some rate limiting is imposed. There is
> > nothing in there too.
> >
> > I have increased noOfFiles to 100 and other sysctl parameters, but
> > the issue is still there.
> >
> > Has anybody experienced the same?
> >
> > How to go about? Anybody ... Please Help!!!
> 
> Its getting lonely out here. Anybody there ???

We won't help you unless you use a recent kernel.

3.10 misses all recent improvements in TCP stack (4 years of hard work)







Re: [patch net-next repost 1/3] idr: Use unsigned long instead of int

2017-08-16 Thread Eric Dumazet
On Wed, 2017-08-16 at 04:14 -0400, Chris Mi wrote:
> IDR uses internally radix tree which uses unsigned long. It doesn't
> makes sense to have index as signed value.
> 
> Signed-off-by: Chris Mi 
> Signed-off-by: Jiri Pirko 
> ---
>  block/bsg.c |  8 ++--
>  block/genhd.c   | 12 +++---
>  drivers/atm/nicstar.c   | 11 ++---
>  drivers/block/drbd/drbd_main.c  | 31 --
>  drivers/block/drbd/drbd_nl.c| 22 +-
>  drivers/block/drbd/drbd_proc.c  |  3 +-
>  drivers/block/drbd/drbd_receiver.c  | 15 ---
>  drivers/block/drbd/drbd_state.c | 34 ---
>  drivers/block/drbd/drbd_worker.c|  6 +--
>  drivers/block/loop.c| 17 +---
>  drivers/block/nbd.c | 20 +
>  drivers/block/zram/zram_drv.c   |  9 ++--
>  drivers/char/tpm/tpm-chip.c | 10 +++--
>  drivers/char/tpm/tpm.h  |  2 +-
>  drivers/dca/dca-sysfs.c |  9 ++--
>  drivers/firewire/core-cdev.c| 18 
>  drivers/firewire/core-device.c  | 15 ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  8 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  9 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c |  6 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  2 +-
>  drivers/gpu/drm/drm_auth.c  |  9 ++--
>  drivers/gpu/drm/drm_connector.c | 10 +++--
>  drivers/gpu/drm/drm_context.c   | 20 +
>  drivers/gpu/drm/drm_dp_aux_dev.c| 11 ++---
>  drivers/gpu/drm/drm_drv.c   |  6 ++-
>  drivers/gpu/drm/drm_gem.c   | 19 +
>  drivers/gpu/drm/drm_info.c  |  2 +-
>  drivers/gpu/drm/drm_mode_object.c   | 11 +++--
>  drivers/gpu/drm/drm_syncobj.c   | 18 +---
>  drivers/gpu/drm/exynos/exynos_drm_ipp.c | 25 ++-
>  drivers/gpu/drm/i915/gvt/display.c  |  2 +-
>  drivers/gpu/drm/i915/gvt/kvmgt.c|  2 +-
>  drivers/gpu/drm/i915/gvt/vgpu.c |  9 ++--
>  drivers/gpu/drm/i915/i915_debugfs.c |  6 +--
>  drivers/gpu/drm/i915/i915_gem_context.c |  9 ++--
>  drivers/gpu/drm/qxl/qxl_cmd.c   |  8 ++--
>  drivers/gpu/drm/qxl/qxl_release.c   | 14 +++---
>  drivers/gpu/drm/sis/sis_mm.c|  8 ++--
>  drivers/gpu/drm/tegra/drm.c | 10 +++--
>  drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c|  3 +-
>  drivers/gpu/drm/vgem/vgem_fence.c   | 12 +++---
>  drivers/gpu/drm/via/via_mm.c|  8 ++--
>  drivers/gpu/drm/virtio/virtgpu_kms.c|  5 ++-
>  drivers/gpu/drm/virtio/virtgpu_vq.c |  5 ++-
>  drivers/gpu/drm/vmwgfx/vmwgfx_resource.c|  9 ++--
>  drivers/i2c/i2c-core-base.c | 19 +
>  drivers/infiniband/core/cm.c|  8 ++--
>  drivers/infiniband/core/cma.c   | 12 +++---
>  drivers/infiniband/core/rdma_core.c |  9 ++--
>  drivers/infiniband/core/sa_query.c  | 23 +-
>  drivers/infiniband/core/ucm.c   |  7 ++-
>  drivers/infiniband/core/ucma.c  | 14 --
>  drivers/infiniband/hw/cxgb3/iwch.c  |  4 +-
>  drivers/infiniband/hw/cxgb3/iwch.h  |  4 +-
>  drivers/infiniband/hw/cxgb4/device.c| 18 
>  drivers/infiniband/hw/cxgb4/iw_cxgb4.h  |  4 +-
>  drivers/infiniband/hw/hfi1/init.c   |  9 ++--
>  drivers/infiniband/hw/hfi1/vnic_main.c  |  6 +--
>  drivers/infiniband/hw/mlx4/cm.c | 13 +++---
>  drivers/infiniband/hw/ocrdma/ocrdma_main.c  |  7 ++-
>  drivers/infiniband/hw/qib/qib_init.c|  9 ++--
>  drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 10 ++---
>  drivers/iommu/intel-svm.c   |  9 ++--
>  drivers/md/dm.c | 13 +++---
>  drivers/memstick/core/memstick.c| 10 +++--
>  drivers/memstick/core/ms_block.c|  9 ++--
>  drivers/memstick/core/mspro_block.c | 12 --
>  drivers/mfd/rtsx_pcr.c  |  9 ++--
>  drivers/misc/c2port/core.c  |  7 +--
>  drivers/misc/cxl/context.c  |  8 ++--
>  drivers/misc/cxl/main.c | 15 ---
>  drivers/misc/mei/main.c |  8 ++--
>  drivers/misc/mic/scif/scif_api.c| 11 ++---
>  drivers/misc/mic/scif/scif_ports.c  | 18 
>  drivers/misc/tifm_core.c|  9 ++--
>  drivers/mtd/mtdcore.c

Re: [PATCH net] xfrm: Clear sk_dst_cache when applying per-socket policy.

2017-08-16 Thread Eric Dumazet
On Wed, 2017-08-16 at 11:03 +0200, Jakub Sitnicki wrote:
> On Tue, 15 Aug 2017 15:25:10 -0700
> Jonathan Basseri  wrote:
> 
> > If an IPv6 socket has a valid dst cache, then xfrm_lookup_route will get
> > skipped. However, the cache is not invalidated when applying policy to a
> > socket (i.e. IPV6_XFRM_POLICY). The result is that new policies are
> > sometimes ignored on those sockets.
> > 
> > This can be demonstrated like so,
> > 1. Create UDPv6 socket.
> > 2. connect() the socket.
> > 3. Apply an outbound XFRM policy to the socket.
> > 4. send() data on the socket.
> > 
> > Packets will continue to be sent in the clear instead of matching an
> > xfrm or returning a no-match error (EAGAIN). This affects calls to
> > send() and not sendto().
> > 
> > Note: Creating normal XFRM policies should have a similar effect on
> > sk_dst_cache entries that match the policy, but that is not fixed in
> > this patch.
> > 
> > Fixes: 00bc0ef5880d ("ipv6: Skip XFRM lookup if dst_entry in socket cache 
> > is valid")
> > Tested: https://android-review.googlesource.com/418659
> > Signed-off-by: Jonathan Basseri 
> > ---
> 
> Thank you for the fix.
> 
> Acked-by: Jakub Sitnicki 

I do not believe this fix is correct.

What happens if the socket is TCP ?

sk_dst_reset(sk) is not safe for them.

This might add use-after-free, and eventually crash.





Re: [patch net-next repost 1/3] idr: Use unsigned long instead of int

2017-08-16 Thread Jiri Pirko
Wed, Aug 16, 2017 at 12:37:09PM CEST, eric.duma...@gmail.com wrote:
>On Wed, 2017-08-16 at 04:14 -0400, Chris Mi wrote:
>> IDR uses internally radix tree which uses unsigned long. It doesn't
>> makes sense to have index as signed value.
>> 
>> Signed-off-by: Chris Mi 
>> Signed-off-by: Jiri Pirko 
>> ---
>>  block/bsg.c |  8 ++--
>>  block/genhd.c   | 12 +++---
>>  drivers/atm/nicstar.c   | 11 ++---
>>  drivers/block/drbd/drbd_main.c  | 31 --
>>  drivers/block/drbd/drbd_nl.c| 22 +-
>>  drivers/block/drbd/drbd_proc.c  |  3 +-
>>  drivers/block/drbd/drbd_receiver.c  | 15 ---
>>  drivers/block/drbd/drbd_state.c | 34 ---
>>  drivers/block/drbd/drbd_worker.c|  6 +--
>>  drivers/block/loop.c| 17 +---
>>  drivers/block/nbd.c | 20 +
>>  drivers/block/zram/zram_drv.c   |  9 ++--
>>  drivers/char/tpm/tpm-chip.c | 10 +++--
>>  drivers/char/tpm/tpm.h  |  2 +-
>>  drivers/dca/dca-sysfs.c |  9 ++--
>>  drivers/firewire/core-cdev.c| 18 
>>  drivers/firewire/core-device.c  | 15 ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c |  8 ++--
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  9 ++--
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c |  6 +--
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  2 +-
>>  drivers/gpu/drm/drm_auth.c  |  9 ++--
>>  drivers/gpu/drm/drm_connector.c | 10 +++--
>>  drivers/gpu/drm/drm_context.c   | 20 +
>>  drivers/gpu/drm/drm_dp_aux_dev.c| 11 ++---
>>  drivers/gpu/drm/drm_drv.c   |  6 ++-
>>  drivers/gpu/drm/drm_gem.c   | 19 +
>>  drivers/gpu/drm/drm_info.c  |  2 +-
>>  drivers/gpu/drm/drm_mode_object.c   | 11 +++--
>>  drivers/gpu/drm/drm_syncobj.c   | 18 +---
>>  drivers/gpu/drm/exynos/exynos_drm_ipp.c | 25 ++-
>>  drivers/gpu/drm/i915/gvt/display.c  |  2 +-
>>  drivers/gpu/drm/i915/gvt/kvmgt.c|  2 +-
>>  drivers/gpu/drm/i915/gvt/vgpu.c |  9 ++--
>>  drivers/gpu/drm/i915/i915_debugfs.c |  6 +--
>>  drivers/gpu/drm/i915/i915_gem_context.c |  9 ++--
>>  drivers/gpu/drm/qxl/qxl_cmd.c   |  8 ++--
>>  drivers/gpu/drm/qxl/qxl_release.c   | 14 +++---
>>  drivers/gpu/drm/sis/sis_mm.c|  8 ++--
>>  drivers/gpu/drm/tegra/drm.c | 10 +++--
>>  drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c|  3 +-
>>  drivers/gpu/drm/vgem/vgem_fence.c   | 12 +++---
>>  drivers/gpu/drm/via/via_mm.c|  8 ++--
>>  drivers/gpu/drm/virtio/virtgpu_kms.c|  5 ++-
>>  drivers/gpu/drm/virtio/virtgpu_vq.c |  5 ++-
>>  drivers/gpu/drm/vmwgfx/vmwgfx_resource.c|  9 ++--
>>  drivers/i2c/i2c-core-base.c | 19 +
>>  drivers/infiniband/core/cm.c|  8 ++--
>>  drivers/infiniband/core/cma.c   | 12 +++---
>>  drivers/infiniband/core/rdma_core.c |  9 ++--
>>  drivers/infiniband/core/sa_query.c  | 23 +-
>>  drivers/infiniband/core/ucm.c   |  7 ++-
>>  drivers/infiniband/core/ucma.c  | 14 --
>>  drivers/infiniband/hw/cxgb3/iwch.c  |  4 +-
>>  drivers/infiniband/hw/cxgb3/iwch.h  |  4 +-
>>  drivers/infiniband/hw/cxgb4/device.c| 18 
>>  drivers/infiniband/hw/cxgb4/iw_cxgb4.h  |  4 +-
>>  drivers/infiniband/hw/hfi1/init.c   |  9 ++--
>>  drivers/infiniband/hw/hfi1/vnic_main.c  |  6 +--
>>  drivers/infiniband/hw/mlx4/cm.c | 13 +++---
>>  drivers/infiniband/hw/ocrdma/ocrdma_main.c  |  7 ++-
>>  drivers/infiniband/hw/qib/qib_init.c|  9 ++--
>>  drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 10 ++---
>>  drivers/iommu/intel-svm.c   |  9 ++--
>>  drivers/md/dm.c | 13 +++---
>>  drivers/memstick/core/memstick.c| 10 +++--
>>  drivers/memstick/core/ms_block.c|  9 ++--
>>  drivers/memstick/core/mspro_block.c | 12 --
>>  drivers/mfd/rtsx_pcr.c  |  9 ++--
>>  drivers/misc/c2port/core.c  |  7 +--
>>  drivers/misc/cxl/context.c  |  8 ++--
>>  drivers/misc/cxl/main.c | 15 ---
>>  drivers/misc/mei/main.c |  8 ++--
>>  drivers/misc/mic/scif/scif_api.c| 11 ++---
>>  drivers/misc/mic/scif

RE: [net-next 11/12] igbvf: convert msleep to mdelay in atomic context

2017-08-16 Thread David Laight
From: Greg Edwards
> Sent: 15 August 2017 20:32
> On Mon, Aug 14, 2017 at 10:17:31AM +, David Laight wrote:
> > From: Jeff Kirsher
> >> Sent: 09 August 2017 22:48
> >> From: Greg Edwards 
> >>
> >> This fixes a "scheduling while atomic" splat seen with
> >> CONFIG_DEBUG_ATOMIC_SLEEP enabled.
> >>
> >> Signed-off-by: Greg Edwards 
> >> Tested-by: Aaron Brown 
> >> Signed-off-by: Jeff Kirsher 
> >> ---
> >>  drivers/net/ethernet/intel/igbvf/vf.c | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/net/ethernet/intel/igbvf/vf.c 
> >> b/drivers/net/ethernet/intel/igbvf/vf.c
> >> index 1d3aa9adcaa8..9577ccf4b26a 100644
> >> --- a/drivers/net/ethernet/intel/igbvf/vf.c
> >> +++ b/drivers/net/ethernet/intel/igbvf/vf.c
> >> @@ -149,7 +149,7 @@ static s32 e1000_reset_hw_vf(struct e1000_hw *hw)
> >>msgbuf[0] = E1000_VF_RESET;
> >>mbx->ops.write_posted(hw, msgbuf, 1);
> >>
> >> -  msleep(10);
> >> +  mdelay(10);
> >
> > Spinning for 10ms seems somewhat sub-optimal
> 
> Jeff,
> 
> Do we even need this delay?  The subsequent read_posted() will poll for
> the PF's mailbox reply for up to 1s.

A 1 second loop?
Who is kidding who that this code is sensible.
If this code is ever executed and has to wait at all other interfaces
are likely to lose packets.

David



Re: [patch net-next repost 1/3] idr: Use unsigned long instead of int

2017-08-16 Thread Eric Dumazet
On Wed, 2017-08-16 at 12:53 +0200, Jiri Pirko wrote:

> rhashtable is an unnecessarily big hammer for this. IDR is a nice fit for
> this purpose.

Obviously IDR does not fit, since you have to change its ABI.

If rhashtable does not fit this, then I wonder why we spent so many days
of work adding it in the kernel.





Re: [patch net-next repost 1/3] idr: Use unsigned long instead of int

2017-08-16 Thread Jiri Pirko
Wed, Aug 16, 2017 at 12:58:53PM CEST, eric.duma...@gmail.com wrote:
>On Wed, 2017-08-16 at 12:53 +0200, Jiri Pirko wrote:
>
>> rhashtable is an unnecessarily big hammer for this. IDR is a nice fit for
>> this purpose.
>
>Obviously IDR does not fit, since you have to change its ABI.

I don't think it is a problem to adjust something to your needs,
especially when its API was misdesigned from the beginning. We are just
putting IDR back on track, cleaning its API. I don't see anything wrong
with that. Everyone would benefit.


>
>If rhashtable does not fit this, then I wonder why we spent so many days
>of work adding it in the kernel.

It fits, sure. But it is not needed (the big hammer I mentioned).
We don't need a custom key for lookup. Just a single pointer as a key
would do. And that is exactly why IDR is here for. Does not make sense
to use anything else when we have the thing to do the work right here.


RE: [PATCH net-next 3/3 v4] drivers: net: ethernet: qualcomm: rmnet: Initial implementation

2017-08-16 Thread David Laight
From: David Miller
> Sent: 16 August 2017 05:24
> From: Subash Abhinov Kasiviswanathan 
> Date: Tue, 15 Aug 2017 22:15:53 -0600
> 
> > +static int rmnet_unregister_real_device(struct net_device *dev)
> > +{
> > +   int config_id = RMNET_LOCAL_LOGICAL_ENDPOINT;
> > +   struct rmnet_logical_ep_conf_s *epconfig_l;
> > +   struct rmnet_phys_ep_conf_s *config;
> > +
> > +   ASSERT_RTNL();
> > +
> > +   netdev_info(dev, "Removing device %s\n", dev->name);
> > +
> > +   if (!rmnet_is_real_dev_registered(dev))
> > +   return -EINVAL;
> > +
> > +   for (; config_id < RMNET_MAX_LOGICAL_EP; config_id++) {
> 
> This loop is so much harder to understand because you initialize
> the loop index several lines above the for() statement.  Just
> initialize it here at the beginning of the for() loop and deal
> with the fact that this will have to therefore be a multi-line
> for() statement the best you can.
...

One way to split the multi-line for() is to put the initialiser
on the immediately preceding line:
config_id = RMNET_LOCAL_LOGICAL_ENDPOINT;
for (; config_id < RMNET_MAX_LOGICAL_EP; config_id++) {

David



Re: [PATCH] bpf: Update sysctl documentation to list all supported architectures

2017-08-16 Thread Michael Ellerman
Daniel Borkmann  writes:

> Hi Michael,
>
> On 08/16/2017 07:15 AM, Michael Ellerman wrote:
>> The sysctl documentation states that the JIT is only available on
>> x86_64, which is no longer correct.
>>
>> Update the list to include all architectures that enable HAVE_CBPF_JIT
>> or HAVE_EBPF_JIT under some configuration.
>>
>> Signed-off-by: Michael Ellerman 
>
> Thanks for the patch!
>
>>   Documentation/sysctl/net.txt | 5 +++--
>>   1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
>> index 14db18c970b1..f68356024d09 100644
>> --- a/Documentation/sysctl/net.txt
>> +++ b/Documentation/sysctl/net.txt
>> @@ -36,8 +36,9 @@ bpf_jit_enable
>>   --
>>
>>   This enables Berkeley Packet Filter Just in Time compiler.
>> -Currently supported on x86_64 architecture, bpf_jit provides a framework
>> -to speed packet filtering, the one used by tcpdump/libpcap for example.
>> +Currently supported on arm, arm64, mips, powerpc, s390, sparc and x86_64
>> +architectures, bpf_jit provides a framework to speed packet filtering, the 
>> one
>> +used by tcpdump/libpcap for example.
>
> Good point, could we actually make that as a bullet list and
> differentiate between cBPF and eBPF JITs, so that a user doesn't
> need to run git grep HAVE_{E,C}BPF_JIT to figure it out what the
> switch enables on the arch used? That would be great.

We could.

Does a user of the sysctl want/need to know the difference though? Or do
they just want to turn on "the JIT"?

cheers


Re: [PATCH net] xfrm: Clear sk_dst_cache when applying per-socket policy.

2017-08-16 Thread Jakub Sitnicki
On Wed, 16 Aug 2017 03:43:54 -0700
Eric Dumazet  wrote:

> On Wed, 2017-08-16 at 11:03 +0200, Jakub Sitnicki wrote:
> > On Tue, 15 Aug 2017 15:25:10 -0700
> > Jonathan Basseri  wrote:
> >   
> > > If an IPv6 socket has a valid dst cache, then xfrm_lookup_route will get
> > > skipped. However, the cache is not invalidated when applying policy to a
> > > socket (i.e. IPV6_XFRM_POLICY). The result is that new policies are
> > > sometimes ignored on those sockets.
> > > 
> > > This can be demonstrated like so,
> > > 1. Create UDPv6 socket.
> > > 2. connect() the socket.
> > > 3. Apply an outbound XFRM policy to the socket.
> > > 4. send() data on the socket.
> > > 
> > > Packets will continue to be sent in the clear instead of matching an
> > > xfrm or returning a no-match error (EAGAIN). This affects calls to
> > > send() and not sendto().
> > > 
> > > Note: Creating normal XFRM policies should have a similar effect on
> > > sk_dst_cache entries that match the policy, but that is not fixed in
> > > this patch.
> > > 
> > > Fixes: 00bc0ef5880d ("ipv6: Skip XFRM lookup if dst_entry in socket cache 
> > > is valid")
> > > Tested: https://android-review.googlesource.com/418659
> > > Signed-off-by: Jonathan Basseri 
> > > ---  
> > 
> > Thank you for the fix.
> > 
> > Acked-by: Jakub Sitnicki   
> 
> I do not believe this fix is correct.
> 
> What happens if the socket is TCP ?
> 
> sk_dst_reset(sk) is not safe for them.
> 
> This might add use-after-free, and eventually crash.

You are right. I see that RCU-variant __sk_dst_reset() is used
throughout TCP stack. Thank you for pointing it out.

Please disregard my earlier ACK.

-Jakub


Re: [PATCH net-next 3/3 v4] drivers: net: ethernet: qualcomm: rmnet: Initial implementation

2017-08-16 Thread Jiri Pirko
Wed, Aug 16, 2017 at 06:15:53AM CEST, subas...@codeaurora.org wrote:
>RmNet driver provides a transport agnostic MAP (multiplexing and
>aggregation protocol) support in embedded module. Module provides
>virtual network devices which can be attached to any IP-mode
>physical device. This will be used to provide all MAP functionality
>on future hardware in a single consistent location.
>
>Signed-off-by: Subash Abhinov Kasiviswanathan 

[...]


>+struct rmnet_phys_ep_conf_s {

The name is cryptic. Why "_s"?


>+  struct net_device *dev;
>+  struct rmnet_logical_ep_conf_s local_ep;
>+  struct rmnet_logical_ep_conf_s muxed_ep[RMNET_MAX_LOGICAL_EP];
>+  u32 ingress_data_format;
>+  u32 egress_data_format;
>+  struct net_device *rmnet_devices[RMNET_MAX_VND];
>+};
>+
>+extern struct rtnl_link_ops rmnet_link_ops;
>+
>+struct rmnet_vnd_private_s {

Again, cryptic.


>+  struct rmnet_logical_ep_conf_s local_ep;
>+  u32 msg_enable;
>+};

[...]


>+rx_handler_result_t rmnet_ingress_handler(struct sk_buff *skb)
>+{
>+  struct rmnet_phys_ep_conf_s *config;

I still fail to understand why the name of this is "config". Please
change to something else across whole code. Including the name of the struct.


>+  struct net_device *dev;
>+  int rc;
>+
>+  if (!skb)
>+  return RX_HANDLER_CONSUMED;
>+
>+  dev = skb->dev;
>+  config = rmnet_get_phys_ep_config(skb->dev);

You have dev. Why not use dev?


>+
>+  /* Sometimes devices operate in ethernet mode even though there is no
>+   * ethernet header. This causes the skb->protocol to contain a bogus
>+   * value and the skb->data pointer to be off by 14 bytes. Fix it if
>+   * configured to do so
>+   */
>+  if (config->ingress_data_format & RMNET_INGRESS_FIX_ETHERNET) {
>+  skb_push(skb, RMNET_ETHERNET_HEADER_LENGTH);
>+  rmnet_set_skb_proto(skb);
>+  }
>+
>+  if (config->ingress_data_format & RMNET_INGRESS_FORMAT_MAP) {
>+  rc = rmnet_map_ingress_handler(skb, config);
>+  } else {
>+  switch (ntohs(skb->protocol)) {
>+  case ETH_P_MAP:
>+  if (config->local_ep.rmnet_mode ==
>+  RMNET_EPMODE_BRIDGE) {
>+  rc = rmnet_ingress_deliver_packet(skb, config);
>+  } else {
>+  kfree_skb(skb);
>+  rc = RX_HANDLER_CONSUMED;
>+  }
>+  break;
>+
>+  case ETH_P_ARP:
>+  case ETH_P_IP:
>+  case ETH_P_IPV6:
>+  rc = rmnet_ingress_deliver_packet(skb, config);
>+  break;
>+
>+  default:
>+  rc = RX_HANDLER_PASS;
>+  }
>+  }
>+
>+  return rc;
>+}
>+
>+rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb)
>+{
>+  return rmnet_ingress_handler(*pskb);

This is just silly. Why you don't have the content of rmnet_ingress_handler
right here?




[PATCH V4 net-next 00/21] Huawei HiNIC Ethernet Driver

2017-08-16 Thread Aviad Krawczyk
The patch-set contains the support of the HiNIC Ethernet driver for
hinic family of PCIE Network Interface Cards.

The Huawei's PCIE HiNIC card is a new Ethernet card and hence there was
a need of a new driver.

The current driver is meant to be used for the Physical Function and there
would soon be a support for Virtual Function and more features once the
basic PF driver has been accepted.

Changes V3 -> V4:
1. Reverse christmas tree order - David Miller comment
https://lkml.org/lkml/2017/8/3/862

Changes V2 -> V3:
1. Replace dev_ functions by netif_ functions - Joe Perches comment
https://lkml.org/lkml/2017/7/19/424
2. Fix the driver directory in MAINTAINERS file - Sergei Shtylyov comment
https://lkml.org/lkml/2017/7/19/615
3. Add a newline at the end of Makefile - David Miller comment
https://lkml.org/lkml/2017/7/19/1345
4. Return a pointer as a val instead of in arg - Francois Romieu comment
https://lkml.org/lkml/2017/7/19/1319
5. Change the error labels to err_xyz - Francois Romieu comment
https://lkml.org/lkml/2017/7/19/1319
6. Remove check of Func Type in free function - Francois Romieu comment
https://lkml.org/lkml/2017/7/19/1319
7. Remove !netdev check in remove function - Francois Romieu comment
https://lkml.org/lkml/2017/7/19/1319
8. Use module_pci_driver - Francois Romieu comment
https://lkml.org/lkml/2017/7/19/1319
9. Move the PCI device ID to the .c file - Francois Romieu comment
https://lkml.org/lkml/2017/7/19/1319
10. Remove void * to avoid passing wrong ptr - Francois Romieu comment
https://lkml.org/lkml/2017/7/19/1319

Changes V1 -> V2:
1. remove driver verstion - Andrew Lunn comment
https://lkml.org/lkml/2017/7/12/372
2. replace kzalloc by devm_kzalloc for short clean - Andrew Lunn comment
https://lkml.org/lkml/2017/7/12/372
3. replace pr_ functions by dev_ functions - Andrew Lunn comment
https://lkml.org/lkml/2017/7/12/375
4. seperate last patch by moving ops to a new patch - Andrew Lunn comment
https://lkml.org/lkml/2017/7/12/377

Aviad Krawczyk (21):
  net-next/hinic: Initialize hw interface
  net-next/hinic: Initialize hw device components
  net-next/hinic: Initialize api cmd resources
  net-next/hinic: Initialize api cmd hw
  net-next/hinic: Add management messages
  net-next/hinic: Add api cmd commands
  net-next/hinic: Add aeqs
  net-next/hinic: Add port management commands
  net-next/hinic: Add Rx mode and link event handler
  net-next/hinic: Add logical Txq and Rxq
  net-next/hinic: Add wq
  net-next/hinic: Add qp resources
  net-next/hinic: Set qp context
  net-next/hinic: Initialize cmdq
  net-next/hinic: Add ceqs
  net-next/hinic: Add cmdq commands
  net-next/hinic: Add cmdq completion handler
  net-next/hinic: Add Rx handler
  net-next/hinic: Add Tx operation
  net-next/hinic: Add ethtool and stats
  net-next/hinic: Add select_queue and netpoll

 Documentation/networking/hinic.txt |  125 +++
 MAINTAINERS|7 +
 drivers/net/ethernet/Kconfig   |1 +
 drivers/net/ethernet/Makefile  |1 +
 drivers/net/ethernet/huawei/Kconfig|   19 +
 drivers/net/ethernet/huawei/Makefile   |5 +
 drivers/net/ethernet/huawei/hinic/Kconfig  |   13 +
 drivers/net/ethernet/huawei/hinic/Makefile |6 +
 drivers/net/ethernet/huawei/hinic/hinic_common.c   |   80 ++
 drivers/net/ethernet/huawei/hinic/hinic_common.h   |   38 +
 drivers/net/ethernet/huawei/hinic/hinic_dev.h  |   64 ++
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.c   |  977 +
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.h   |  208 
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c  |  946 
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h  |  187 
 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h   |  149 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c   | 1045 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h   |  239 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c   |  888 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h   |  265 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.c|  352 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h|  272 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.c|  533 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.h|   97 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c  |  597 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h  |  153 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c|  892 
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h|  180 
 .../net/ethernet/huawei/hinic/hinic_hw_qp_ctxt.h   |  214 
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c|  879 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h|  117 ++
 drivers/net

[PATCH V4 net-next 02/21] net-next/hinic: Initialize hw device components

2017-08-16 Thread Aviad Krawczyk
Initialize hw device by calling the initialization functions of aeqs and
management channel.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/Makefile|   3 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c  | 172 --
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h  |  14 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c  | 149 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h  | 107 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h   |   8 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c |  92 
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h |  45 ++
 8 files changed, 576 insertions(+), 14 deletions(-)
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h

diff --git a/drivers/net/ethernet/huawei/hinic/Makefile 
b/drivers/net/ethernet/huawei/hinic/Makefile
index 353cee0..717ad71 100644
--- a/drivers/net/ethernet/huawei/hinic/Makefile
+++ b/drivers/net/ethernet/huawei/hinic/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_HINIC) += hinic.o
 
-hinic-y := hinic_main.o hinic_hw_dev.o hinic_hw_if.o
+hinic-y := hinic_main.o hinic_hw_dev.o hinic_hw_mgmt.o hinic_hw_eqs.o \
+  hinic_hw_if.o
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
index f681846..d430e60 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
@@ -23,11 +23,132 @@
 #include 
 
 #include "hinic_hw_if.h"
+#include "hinic_hw_eqs.h"
+#include "hinic_hw_mgmt.h"
 #include "hinic_hw_dev.h"
 
 #define MAX_IRQS(max_qps, num_aeqs, num_ceqs)   \
 (2 * (max_qps) + (num_aeqs) + (num_ceqs))
 
+enum intr_type {
+   INTR_MSIX_TYPE,
+};
+
+/* HW struct */
+struct hinic_dev_cap {
+   u8  status;
+   u8  version;
+   u8  rsvd0[6];
+
+   u8  rsvd1[5];
+   u8  intr_type;
+   u8  rsvd2[66];
+   u16 max_sqs;
+   u16 max_rqs;
+   u8  rsvd3[208];
+};
+
+/**
+ * get_capability - convert device capabilities to NIC capabilities
+ * @hwdev: the HW device to set and convert device capabilities for
+ * @dev_cap: device capabilities from FW
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int get_capability(struct hinic_hwdev *hwdev,
+ struct hinic_dev_cap *dev_cap)
+{
+   struct hinic_cap *nic_cap = &hwdev->nic_cap;
+   int num_aeqs, num_ceqs, num_irqs;
+
+   if (!HINIC_IS_PF(hwdev->hwif) && !HINIC_IS_PPF(hwdev->hwif))
+   return -EINVAL;
+
+   if (dev_cap->intr_type != INTR_MSIX_TYPE)
+   return -EFAULT;
+
+   num_aeqs = HINIC_HWIF_NUM_AEQS(hwdev->hwif);
+   num_ceqs = HINIC_HWIF_NUM_CEQS(hwdev->hwif);
+   num_irqs = HINIC_HWIF_NUM_IRQS(hwdev->hwif);
+
+   /* Each QP has its own (SQ + RQ) interrupts */
+   nic_cap->num_qps = (num_irqs - (num_aeqs + num_ceqs)) / 2;
+
+   /* num_qps must be power of 2 */
+   nic_cap->num_qps = BIT(fls(nic_cap->num_qps) - 1);
+
+   nic_cap->max_qps = dev_cap->max_sqs + 1;
+   if (nic_cap->max_qps != (dev_cap->max_rqs + 1))
+   return -EFAULT;
+
+   if (nic_cap->num_qps > nic_cap->max_qps)
+   nic_cap->num_qps = nic_cap->max_qps;
+
+   return 0;
+}
+
+/**
+ * get_cap_from_fw - get device capabilities from FW
+ * @pfhwdev: the PF HW device to get capabilities for
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int get_cap_from_fw(struct hinic_pfhwdev *pfhwdev)
+{
+   struct hinic_hwdev *hwdev = &pfhwdev->hwdev;
+   struct hinic_hwif *hwif = hwdev->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   struct hinic_dev_cap dev_cap;
+   u16 in_len, out_len;
+   int err;
+
+   in_len = 0;
+   out_len = sizeof(dev_cap);
+
+   err = hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_CFGM,
+   HINIC_CFG_NIC_CAP, &dev_cap, in_len, &dev_cap,
+   &out_len, HINIC_MGMT_MSG_SYNC);
+   if (err) {
+   dev_err(&pdev->dev, "Failed to get capability from FW\n");
+   return err;
+   }
+
+   return get_capability(hwdev, &dev_cap);
+}
+
+/**
+ * get_dev_cap - get device capabilities
+ * @hwdev: the NIC HW device to get capabilities for
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int get_dev_cap(struct hinic_hwdev *hwdev)
+{
+   struct hinic_hwif *hwif = hwdev->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   struct hinic_pfhwdev *pfhwdev;
+   int err;
+
+   switch (HINIC_FUNC_TYPE(hwif)) {
+   case HINIC_PPF:
+   case HINIC_PF:
+   pfhwdev = container_of(hwdev, stru

[PATCH V4 net-next 01/21] net-next/hinic: Initialize hw interface

2017-08-16 Thread Aviad Krawczyk
Initialize hw interface as part of the nic initialization for accessing hw.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 Documentation/networking/hinic.txt   | 125 ++
 drivers/net/ethernet/Kconfig |   1 +
 drivers/net/ethernet/Makefile|   1 +
 drivers/net/ethernet/huawei/Kconfig  |  19 +++
 drivers/net/ethernet/huawei/Makefile |   5 +
 drivers/net/ethernet/huawei/hinic/Kconfig|  13 ++
 drivers/net/ethernet/huawei/hinic/Makefile   |   3 +
 drivers/net/ethernet/huawei/hinic/hinic_dev.h|  33 
 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h |  36 
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c | 201 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h |  42 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.c  | 208 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h  | 160 +
 drivers/net/ethernet/huawei/hinic/hinic_main.c   | 195 +
 14 files changed, 1042 insertions(+)
 create mode 100644 Documentation/networking/hinic.txt
 create mode 100644 drivers/net/ethernet/huawei/Kconfig
 create mode 100644 drivers/net/ethernet/huawei/Makefile
 create mode 100644 drivers/net/ethernet/huawei/hinic/Kconfig
 create mode 100644 drivers/net/ethernet/huawei/hinic/Makefile
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_dev.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_if.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_main.c

diff --git a/Documentation/networking/hinic.txt 
b/Documentation/networking/hinic.txt
new file mode 100644
index 000..989366a
--- /dev/null
+++ b/Documentation/networking/hinic.txt
@@ -0,0 +1,125 @@
+Linux Kernel Driver for Huawei Intelligent NIC(HiNIC) family
+
+
+Overview:
+=
+HiNIC is a network interface card for the Data Center Area.
+
+The driver supports a range of link-speed devices (10GbE, 25GbE, 40GbE, etc.).
+The driver also supports a negotiated and extendable feature set.
+
+Some HiNIC devices support SR-IOV. This driver is used for Physical Function
+(PF).
+
+HiNIC devices support MSI-X interrupt vector for each Tx/Rx queue and
+adaptive interrupt moderation.
+
+HiNIC devices also support various offload features such as checksum offload,
+TCP Transmit Segmentation Offload(TSO), Receive-Side Scaling(RSS) and
+LRO(Large Receive Offload).
+
+
+Supported PCI vendor ID/device IDs:
+===
+
+19e5:1822 - HiNIC PF
+
+
+Driver Architecture and Source Code:
+
+
+hinic_dev - Implement a Logical Network device that is independent from
+specific HW details about HW data structure formats.
+
+hinic_hwdev - Implement the HW details of the device and include the components
+for accessing the PCI NIC.
+
+hinic_hwdev contains the following components:
+===
+
+HW Interface:
+=
+
+The interface for accessing the pci device (DMA memory and PCI BARs).
+(hinic_hw_if.c, hinic_hw_if.h)
+
+Configuration Status Registers Area that describes the HW Registers on the
+configuration and status BAR0. (hinic_hw_csr.h)
+
+MGMT components:
+
+
+Asynchronous Event Queues(AEQs) - The event queues for receiving messages from
+the MGMT modules on the cards. (hinic_hw_eqs.c, hinic_hw_eqs.h)
+
+Application Programmable Interface commands(API CMD) - Interface for sending
+MGMT commands to the card. (hinic_hw_api_cmd.c, hinic_hw_api_cmd.h)
+
+Management (MGMT) - the PF to MGMT channel that uses API CMD for sending MGMT
+commands to the card and receives notifications from the MGMT modules on the
+card by AEQs. Also set the addresses of the IO CMDQs in HW.
+(hinic_hw_mgmt.c, hinic_hw_mgmt.h)
+
+IO components:
+==
+
+Completion Event Queues(CEQs) - The completion Event Queues that describe IO
+tasks that are finished. (hinic_hw_eqs.c, hinic_hw_eqs.h)
+
+Work Queues(WQ) - Contain the memory and operations for use by CMD queues and
+the Queue Pairs. The WQ is a Memory Block in a Page. The Block contains
+pointers to Memory Areas that are the Memory for the Work Queue Elements(WQEs).
+(hinic_hw_wq.c, hinic_hw_wq.h)
+
+Command Queues(CMDQ) - The queues for sending commands for IO management and are
+used to set the QP addresses in HW. The command completion events are
+accumulated on the CEQ that is configured to receive the CMDQ completion 
events.
+(hinic_hw_cmdq.c, hinic_hw_cmdq.h)
+
+Queue Pairs(QPs) - The HW Receive and Send queues for Receiving and 
Transmitting
+Data. (hinic_hw_qp.c, hini

[PATCH V4 net-next 03/21] net-next/hinic: Initialize api cmd resources

2017-08-16 Thread Aviad Krawczyk
Initialize api cmd resources as part of management initialization.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/Makefile |   4 +-
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.c   | 446 +
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.h   | 102 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c  |  10 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h  |   3 +
 5 files changed, 563 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.h

diff --git a/drivers/net/ethernet/huawei/hinic/Makefile 
b/drivers/net/ethernet/huawei/hinic/Makefile
index 717ad71..beba90a 100644
--- a/drivers/net/ethernet/huawei/hinic/Makefile
+++ b/drivers/net/ethernet/huawei/hinic/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_HINIC) += hinic.o
 
-hinic-y := hinic_main.o hinic_hw_dev.o hinic_hw_mgmt.o hinic_hw_eqs.o \
-  hinic_hw_if.o
+hinic-y := hinic_main.o hinic_hw_dev.o hinic_hw_mgmt.o hinic_hw_api_cmd.o \
+  hinic_hw_eqs.o hinic_hw_if.o
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
new file mode 100644
index 000..4291f8e
--- /dev/null
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
@@ -0,0 +1,446 @@
+/*
+ * Huawei HiNIC PCI Express Linux driver
+ * Copyright(c) 2017 Huawei Technologies Co., Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "hinic_hw_if.h"
+#include "hinic_hw_api_cmd.h"
+
+#define API_CHAIN_NUM_CELLS 32
+
+#define API_CMD_CELL_SIZE_SHIFT 6
+#define API_CMD_CELL_SIZE_MIN   (BIT(API_CMD_CELL_SIZE_SHIFT))
+
+#define API_CMD_CELL_SIZE(cell_size)\
+   (((cell_size) >= API_CMD_CELL_SIZE_MIN) ? \
+(1 << (fls(cell_size - 1))) : API_CMD_CELL_SIZE_MIN)
+
+#define API_CMD_BUF_SIZE2048
+
+/**
+ * api_cmd_chain_hw_init - initialize the chain in the HW
+ * @chain: the API CMD specific chain to initialize in HW
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int api_cmd_chain_hw_init(struct hinic_api_cmd_chain *chain)
+{
+   /* should be implemented */
+   return 0;
+}
+
+/**
+ * free_cmd_buf - free the dma buffer of API CMD command
+ * @chain: the API CMD specific chain of the cmd
+ * @cell_idx: the cell index of the cmd
+ **/
+static void free_cmd_buf(struct hinic_api_cmd_chain *chain, int cell_idx)
+{
+   struct hinic_api_cmd_cell_ctxt *cell_ctxt;
+   struct hinic_hwif *hwif = chain->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+
+   cell_ctxt = &chain->cell_ctxt[cell_idx];
+
+   dma_free_coherent(&pdev->dev, API_CMD_BUF_SIZE,
+ cell_ctxt->api_cmd_vaddr,
+ cell_ctxt->api_cmd_paddr);
+}
+
+/**
+ * alloc_cmd_buf - allocate a dma buffer for API CMD command
+ * @chain: the API CMD specific chain for the cmd
+ * @cell: the cell in the HW for the cmd
+ * @cell_idx: the index of the cell
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int alloc_cmd_buf(struct hinic_api_cmd_chain *chain,
+struct hinic_api_cmd_cell *cell, int cell_idx)
+{
+   struct hinic_api_cmd_cell_ctxt *cell_ctxt;
+   struct hinic_hwif *hwif = chain->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   dma_addr_t cmd_paddr;
+   u8 *cmd_vaddr;
+   int err = 0;
+
+   cmd_vaddr = dma_zalloc_coherent(&pdev->dev, API_CMD_BUF_SIZE,
+   &cmd_paddr, GFP_KERNEL);
+   if (!cmd_vaddr) {
+   dev_err(&pdev->dev, "Failed to allocate API CMD DMA memory\n");
+   return -ENOMEM;
+   }
+
+   cell_ctxt = &chain->cell_ctxt[cell_idx];
+
+   cell_ctxt->api_cmd_vaddr = cmd_vaddr;
+   cell_ctxt->api_cmd_paddr = cmd_paddr;
+
+   /* set the cmd DMA address in the cell */
+   switch (chain->chain_type) {
+   case HINIC_API_CMD_WRITE_TO_MGMT_CPU:
+   /* The data in the HW should be in Big Endian Format */
+   cell->write.hw_cmd_paddr = cpu_to_be64(cmd_paddr);
+   break;
+
+   default:
+   dev_err(&pdev->dev, "Unsupported API CMD chain type\n");
+   free_cmd_buf(chain, cell_idx);
+   err = -EINVAL;
+   break;
+   }
+
+ 

[PATCH V4 net-next 04/21] net-next/hinic: Initialize api cmd hw

2017-08-16 Thread Aviad Krawczyk
Update the hardware about api cmd resources and initialize it.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.c   | 173 -
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.h   |  38 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h   |  26 
 3 files changed, 236 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
index 4291f8e..234ede9 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
@@ -13,6 +13,7 @@
  *
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -21,8 +22,12 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 
+#include "hinic_hw_csr.h"
 #include "hinic_hw_if.h"
 #include "hinic_hw_api_cmd.h"
 
@@ -35,8 +40,157 @@
(((cell_size) >= API_CMD_CELL_SIZE_MIN) ? \
 (1 << (fls(cell_size - 1))) : API_CMD_CELL_SIZE_MIN)
 
+#define API_CMD_CELL_SIZE_VAL(size) \
+   ilog2((size) >> API_CMD_CELL_SIZE_SHIFT)
+
 #define API_CMD_BUF_SIZE2048
 
+#define API_CMD_TIMEOUT 1000
+
+enum api_cmd_xor_chk_level {
+   XOR_CHK_DIS = 0,
+
+   XOR_CHK_ALL = 3,
+};
+
+/**
+ * api_cmd_hw_restart - restart the chain in the HW
+ * @chain: the API CMD specific chain to restart
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int api_cmd_hw_restart(struct hinic_api_cmd_chain *chain)
+{
+   struct hinic_hwif *hwif = chain->hwif;
+   int err = -ETIMEDOUT;
+   unsigned long end;
+   u32 reg_addr, val;
+
+   /* Read Modify Write */
+   reg_addr = HINIC_CSR_API_CMD_CHAIN_REQ_ADDR(chain->chain_type);
+   val = hinic_hwif_read_reg(hwif, reg_addr);
+
+   val = HINIC_API_CMD_CHAIN_REQ_CLEAR(val, RESTART);
+   val |= HINIC_API_CMD_CHAIN_REQ_SET(1, RESTART);
+
+   hinic_hwif_write_reg(hwif, reg_addr, val);
+
+   end = jiffies + msecs_to_jiffies(API_CMD_TIMEOUT);
+   do {
+   val = hinic_hwif_read_reg(hwif, reg_addr);
+
+   if (!HINIC_API_CMD_CHAIN_REQ_GET(val, RESTART)) {
+   err = 0;
+   break;
+   }
+
+   msleep(20);
+   } while (time_before(jiffies, end));
+
+   return err;
+}
+
+/**
+ * api_cmd_ctrl_init - set the control register of a chain
+ * @chain: the API CMD specific chain to set control register for
+ **/
+static void api_cmd_ctrl_init(struct hinic_api_cmd_chain *chain)
+{
+   struct hinic_hwif *hwif = chain->hwif;
+   u32 reg_addr, ctrl;
+   u16 cell_size;
+
+   /* Read Modify Write */
+   reg_addr = HINIC_CSR_API_CMD_CHAIN_CTRL_ADDR(chain->chain_type);
+
+   cell_size = API_CMD_CELL_SIZE_VAL(chain->cell_size);
+
+   ctrl = hinic_hwif_read_reg(hwif, reg_addr);
+
+   ctrl =  HINIC_API_CMD_CHAIN_CTRL_CLEAR(ctrl, RESTART_WB_STAT) &
+   HINIC_API_CMD_CHAIN_CTRL_CLEAR(ctrl, XOR_ERR) &
+   HINIC_API_CMD_CHAIN_CTRL_CLEAR(ctrl, AEQE_EN) &
+   HINIC_API_CMD_CHAIN_CTRL_CLEAR(ctrl, XOR_CHK_EN)  &
+   HINIC_API_CMD_CHAIN_CTRL_CLEAR(ctrl, CELL_SIZE);
+
+   ctrl |= HINIC_API_CMD_CHAIN_CTRL_SET(1, XOR_ERR)  |
+   HINIC_API_CMD_CHAIN_CTRL_SET(XOR_CHK_ALL, XOR_CHK_EN) |
+   HINIC_API_CMD_CHAIN_CTRL_SET(cell_size, CELL_SIZE);
+
+   hinic_hwif_write_reg(hwif, reg_addr, ctrl);
+}
+
+/**
+ * api_cmd_set_status_addr - set the status address of a chain in the HW
+ * @chain: the API CMD specific chain to set in HW status address for
+ **/
+static void api_cmd_set_status_addr(struct hinic_api_cmd_chain *chain)
+{
+   struct hinic_hwif *hwif = chain->hwif;
+   u32 addr, val;
+
+   addr = HINIC_CSR_API_CMD_STATUS_HI_ADDR(chain->chain_type);
+   val = upper_32_bits(chain->wb_status_paddr);
+   hinic_hwif_write_reg(hwif, addr, val);
+
+   addr = HINIC_CSR_API_CMD_STATUS_LO_ADDR(chain->chain_type);
+   val = lower_32_bits(chain->wb_status_paddr);
+   hinic_hwif_write_reg(hwif, addr, val);
+}
+
+/**
+ * api_cmd_set_num_cells - set the number of cells of a chain in the HW
+ * @chain: the API CMD specific chain to set in HW the number of cells for
+ **/
+static void api_cmd_set_num_cells(struct hinic_api_cmd_chain *chain)
+{
+   struct hinic_hwif *hwif = chain->hwif;
+   u32 addr, val;
+
+   addr = HINIC_CSR_API_CMD_CHAIN_NUM_CELLS_ADDR(chain->chain_type);
+   val  = chain->num_cells;
+   hinic_hwif_write_reg(hwif, addr, val);
+}
+
+/**
+ * api_cmd_head_init - set the head of a chain in the HW
+ * @chain: the API CMD specific chain to set in HW the head for
+ **/
+static void api_cmd_head_init(struct hinic_api_cmd_chain *chain)
+{
+   struct hinic_hwif *hwif = chain->hwif;
+   u32 addr, val;
+
+   addr = HINIC_C

[PATCH V4 net-next 05/21] net-next/hinic: Add management messages

2017-08-16 Thread Aviad Krawczyk
Add the management messages for sending to api cmd and the asynchronous
event handler for the completion of the messages.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.c   |  35 ++
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.h   |   3 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h|   5 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c  | 439 -
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h  |  59 +++
 5 files changed, 538 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
index 234ede9..b44ddf2 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
@@ -54,6 +54,41 @@ enum api_cmd_xor_chk_level {
 };
 
 /**
+ * api_cmd - API CMD command
+ * @chain: chain for the command
+ * @dest: destination node on the card that will receive the command
+ * @cmd: command data
+ * @cmd_size: the command size
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int api_cmd(struct hinic_api_cmd_chain *chain,
+  enum hinic_node_id dest, u8 *cmd, u16 cmd_size)
+{
+   /* should be implemented */
+   return -EINVAL;
+}
+
+/**
+ * hinic_api_cmd_write - Write API CMD command
+ * @chain: chain for write command
+ * @dest: destination node on the card that will receive the command
+ * @cmd: command data
+ * @size: the command size
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+int hinic_api_cmd_write(struct hinic_api_cmd_chain *chain,
+   enum hinic_node_id dest, u8 *cmd, u16 size)
+{
+   /* Verify the chain type */
+   if (chain->chain_type == HINIC_API_CMD_WRITE_TO_MGMT_CPU)
+   return api_cmd(chain, dest, cmd, size);
+
+   return -EINVAL;
+}
+
+/**
  * api_cmd_hw_restart - restart the chain in the HW
  * @chain: the API CMD specific chain to restart
  *
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.h
index 0c83b80..e8865d6 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.h
@@ -132,6 +132,9 @@ struct hinic_api_cmd_chain {
struct hinic_api_cmd_cell   *curr_node;
 };
 
+int hinic_api_cmd_write(struct hinic_api_cmd_chain *chain,
+   enum hinic_node_id dest, u8 *cmd, u16 size);
+
 int hinic_api_cmd_init(struct hinic_api_cmd_chain **chain,
   struct hinic_hwif *hwif);
 
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
index b6d9850..98623d6 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
@@ -93,6 +93,7 @@
 #define HINIC_HWIF_NUM_IRQS(hwif)   ((hwif)->attr.num_irqs)
 #define HINIC_HWIF_FUNC_IDX(hwif)   ((hwif)->attr.func_idx)
 #define HINIC_HWIF_PCI_INTF(hwif)   ((hwif)->attr.pci_intf_idx)
+#define HINIC_HWIF_PF_IDX(hwif) ((hwif)->attr.pf_idx)
 
 #define HINIC_FUNC_TYPE(hwif)   ((hwif)->attr.func_type)
 #define HINIC_IS_PF(hwif)   (HINIC_FUNC_TYPE(hwif) == HINIC_PF)
@@ -127,6 +128,10 @@ enum hinic_mod_type {
HINIC_MOD_MAX   = 15
 };
 
+enum hinic_node_id {
+   HINIC_NODE_ID_MGMT = 21,
+};
+
 struct hinic_func_attr {
u16 func_idx;
u8  pf_idx;
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
index f914bc7..147c404 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c
@@ -16,6 +16,12 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 #include "hinic_hw_if.h"
 #include "hinic_hw_eqs.h"
@@ -23,9 +29,267 @@
 #include "hinic_hw_mgmt.h"
 #include "hinic_hw_dev.h"
 
+#define SYNC_MSG_ID_MASK0x1FF
+
+#define SYNC_MSG_ID(pf_to_mgmt) ((pf_to_mgmt)->sync_msg_id)
+
+#define SYNC_MSG_ID_INC(pf_to_mgmt) (SYNC_MSG_ID(pf_to_mgmt) = \
+   ((SYNC_MSG_ID(pf_to_mgmt) + 1) & \
+SYNC_MSG_ID_MASK))
+
+#define MSG_SZ_IS_VALID(in_size)((in_size) <= MAX_MSG_LEN)
+
+#define MGMT_MSG_LEN_MIN20
+#define MGMT_MSG_LEN_STEP   16
+#define MGMT_MSG_RSVD_FOR_DEV   8
+
+#define SEGMENT_LEN 48
+
+#define MAX_PF_MGMT_BUF_SIZE2048
+
+/* Data should be SEG LEN size aligned */
+#define MAX_MSG_LEN 2016
+
+#define MSG_NOT_RESP0x
+
+#define MGMT_MSG_TIMEOUT1000
+
 #define mgmt_to_pfhwdev(pf_mgmt)\
container_of(pf_mgmt, struct hinic_pfhwdev, pf_to_mgmt)
 
+enum ms

[PATCH V4 net-next 06/21] net-next/hinic: Add api cmd commands

2017-08-16 Thread Aviad Krawczyk
Add the api cmd commands for sending management messages to the nic.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.c   | 329 -
 .../net/ethernet/huawei/hinic/hinic_hw_api_cmd.h   |  65 
 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h   |   6 +
 3 files changed, 398 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
index b44ddf2..d0145c6 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_api_cmd.c
@@ -25,7 +25,9 @@
 #include 
 #include 
 #include 
+#include 
 #include 
+#include 
 
 #include "hinic_hw_csr.h"
 #include "hinic_hw_if.h"
@@ -45,14 +47,313 @@
 
 #define API_CMD_BUF_SIZE2048
 
+/* Sizes of the members in hinic_api_cmd_cell */
+#define API_CMD_CELL_DESC_SIZE  8
+#define API_CMD_CELL_DATA_ADDR_SIZE 8
+
+#define API_CMD_CELL_ALIGNMENT  8
+
 #define API_CMD_TIMEOUT 1000
 
+#define MASKED_IDX(chain, idx)  ((idx) & ((chain)->num_cells - 1))
+
+#define SIZE_8BYTES(size)   (ALIGN((size), 8) >> 3)
+#define SIZE_4BYTES(size)   (ALIGN((size), 4) >> 2)
+
+#define RD_DMA_ATTR_DEFAULT 0
+#define WR_DMA_ATTR_DEFAULT 0
+
+enum api_cmd_data_format {
+   SGE_DATA = 1,   /* cell data is passed by hw address */
+};
+
+enum api_cmd_type {
+   API_CMD_WRITE = 0,
+};
+
+enum api_cmd_bypass {
+   NO_BYPASS   = 0,
+   BYPASS  = 1,
+};
+
 enum api_cmd_xor_chk_level {
XOR_CHK_DIS = 0,
 
XOR_CHK_ALL = 3,
 };
 
+static u8 xor_chksum_set(void *data)
+{
+   int idx;
+   u8 *val, checksum = 0;
+
+   val = data;
+
+   for (idx = 0; idx < 7; idx++)
+   checksum ^= val[idx];
+
+   return checksum;
+}
+
+static void set_prod_idx(struct hinic_api_cmd_chain *chain)
+{
+   enum hinic_api_cmd_chain_type chain_type = chain->chain_type;
+   struct hinic_hwif *hwif = chain->hwif;
+   u32 hw_prod_idx_addr, prod_idx;
+
+   hw_prod_idx_addr = HINIC_CSR_API_CMD_CHAIN_PI_ADDR(chain_type);
+   prod_idx = hinic_hwif_read_reg(hwif, hw_prod_idx_addr);
+
+   prod_idx = HINIC_API_CMD_PI_CLEAR(prod_idx, IDX);
+
+   prod_idx |= HINIC_API_CMD_PI_SET(chain->prod_idx, IDX);
+
+   hinic_hwif_write_reg(hwif, hw_prod_idx_addr, prod_idx);
+}
+
+static u32 get_hw_cons_idx(struct hinic_api_cmd_chain *chain)
+{
+   u32 addr, val;
+
+   addr = HINIC_CSR_API_CMD_STATUS_ADDR(chain->chain_type);
+   val  = hinic_hwif_read_reg(chain->hwif, addr);
+
+   return HINIC_API_CMD_STATUS_GET(val, CONS_IDX);
+}
+
+/**
+ * chain_busy - check if the chain is still processing last requests
+ * @chain: chain to check
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int chain_busy(struct hinic_api_cmd_chain *chain)
+{
+   struct hinic_hwif *hwif = chain->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   u32 prod_idx;
+
+   switch (chain->chain_type) {
+   case HINIC_API_CMD_WRITE_TO_MGMT_CPU:
+   chain->cons_idx = get_hw_cons_idx(chain);
+   prod_idx = chain->prod_idx;
+
+   /* check for a space for a new command */
+   if (chain->cons_idx == MASKED_IDX(chain, prod_idx + 1)) {
+   dev_err(&pdev->dev, "API CMD chain %d is busy\n",
+   chain->chain_type);
+   return -EBUSY;
+   }
+   break;
+
+   default:
+   dev_err(&pdev->dev, "Unknown API CMD Chain type\n");
+   break;
+   }
+
+   return 0;
+}
+
+/**
+ * get_cell_data_size - get the data size of a specific cell type
+ * @type: chain type
+ *
+ * Return the data(Desc + Address) size in the cell
+ **/
+static u8 get_cell_data_size(enum hinic_api_cmd_chain_type type)
+{
+   u8 cell_data_size = 0;
+
+   switch (type) {
+   case HINIC_API_CMD_WRITE_TO_MGMT_CPU:
+   cell_data_size = ALIGN(API_CMD_CELL_DESC_SIZE +
+  API_CMD_CELL_DATA_ADDR_SIZE,
+  API_CMD_CELL_ALIGNMENT);
+   break;
+   default:
+   break;
+   }
+
+   return cell_data_size;
+}
+
+/**
+ * prepare_cell_ctrl - prepare the ctrl of the cell for the command
+ * @cell_ctrl: the control of the cell to set the control value into it
+ * @data_size: the size of the data in the cell
+ **/
+static void prepare_cell_ctrl(u64 *cell_ctrl, u16 data_size)
+{
+   u8 chksum;
+   u64 ctrl;
+
+   ctrl =  HINIC_API_CMD_CELL_CTRL_SET(SIZE_8BYTES(data_size), DATA_SZ)  |
+   HINIC_API_CMD_CELL_CTRL_SET(RD_DMA_ATTR_DEFAULT, RD_DMA_ATTR) |
+   HINIC_API_CMD_CELL_CTRL_SET(WR_DMA_ATTR_DEFAULT, WR_DMA_ATTR);
+
+   chksum = xor_chks

[PATCH V4 net-next 07/21] net-next/hinic: Add aeqs

2017-08-16 Thread Aviad Krawczyk
Handle aeq elements that are accumulated on the aeq by calling the
registered handler for the specific event.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h |  49 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c | 464 ++-
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h |  81 
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.c  |  91 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h  |  46 +++
 5 files changed, 729 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
index ebbf054..52eb89c 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
@@ -65,4 +65,53 @@
 #define HINIC_CSR_API_CMD_STATUS_ADDR(idx)  \
(HINIC_CSR_API_CMD_BASE + 0x30 + (idx) * HINIC_CSR_API_CMD_STRIDE)
 
+/* MSI-X registers */
+#define HINIC_CSR_MSIX_CTRL_BASE0x2000
+#define HINIC_CSR_MSIX_CNT_BASE 0x2004
+
+#define HINIC_CSR_MSIX_STRIDE   0x8
+
+#define HINIC_CSR_MSIX_CTRL_ADDR(idx)   \
+   (HINIC_CSR_MSIX_CTRL_BASE + (idx) * HINIC_CSR_MSIX_STRIDE)
+
+#define HINIC_CSR_MSIX_CNT_ADDR(idx)\
+   (HINIC_CSR_MSIX_CNT_BASE + (idx) * HINIC_CSR_MSIX_STRIDE)
+
+/* EQ registers */
+#define HINIC_AEQ_MTT_OFF_BASE_ADDR 0x200
+
+#define HINIC_EQ_MTT_OFF_STRIDE 0x40
+
+#define HINIC_CSR_AEQ_MTT_OFF(id)   \
+   (HINIC_AEQ_MTT_OFF_BASE_ADDR + (id) * HINIC_EQ_MTT_OFF_STRIDE)
+
+#define HINIC_CSR_EQ_PAGE_OFF_STRIDE8
+
+#define HINIC_CSR_AEQ_HI_PHYS_ADDR_REG(q_id, pg_num)\
+   (HINIC_CSR_AEQ_MTT_OFF(q_id) + \
+(pg_num) * HINIC_CSR_EQ_PAGE_OFF_STRIDE)
+
+#define HINIC_CSR_AEQ_LO_PHYS_ADDR_REG(q_id, pg_num)\
+   (HINIC_CSR_AEQ_MTT_OFF(q_id) + \
+(pg_num) * HINIC_CSR_EQ_PAGE_OFF_STRIDE + 4)
+
+#define HINIC_AEQ_CTRL_0_ADDR_BASE  0xE00
+#define HINIC_AEQ_CTRL_1_ADDR_BASE  0xE04
+#define HINIC_AEQ_CONS_IDX_ADDR_BASE0xE08
+#define HINIC_AEQ_PROD_IDX_ADDR_BASE0xE0C
+
+#define HINIC_EQ_OFF_STRIDE 0x80
+
+#define HINIC_CSR_AEQ_CTRL_0_ADDR(idx)  \
+   (HINIC_AEQ_CTRL_0_ADDR_BASE + (idx) * HINIC_EQ_OFF_STRIDE)
+
+#define HINIC_CSR_AEQ_CTRL_1_ADDR(idx)  \
+   (HINIC_AEQ_CTRL_1_ADDR_BASE + (idx) * HINIC_EQ_OFF_STRIDE)
+
+#define HINIC_CSR_AEQ_CONS_IDX_ADDR(idx)\
+   (HINIC_AEQ_CONS_IDX_ADDR_BASE + (idx) * HINIC_EQ_OFF_STRIDE)
+
+#define HINIC_CSR_AEQ_PROD_IDX_ADDR(idx)\
+   (HINIC_AEQ_PROD_IDX_ADDR_BASE + (idx) * HINIC_EQ_OFF_STRIDE)
+
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c
index a099d20..a53d5b3 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c
@@ -13,17 +13,74 @@
  *
  */
 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
+#include "hinic_hw_csr.h"
 #include "hinic_hw_if.h"
 #include "hinic_hw_eqs.h"
 
 #define HINIC_EQS_WQ_NAME   "hinic_eqs"
 
+#define GET_EQ_NUM_PAGES(eq, pg_size)   \
+   (ALIGN((eq)->q_len * (eq)->elem_size, pg_size) / (pg_size))
+
+#define GET_EQ_NUM_ELEMS_IN_PG(eq, pg_size) ((pg_size) / (eq)->elem_size)
+
+#define EQ_CONS_IDX_REG_ADDR(eq)HINIC_CSR_AEQ_CONS_IDX_ADDR((eq)->q_id)
+#define EQ_PROD_IDX_REG_ADDR(eq)HINIC_CSR_AEQ_PROD_IDX_ADDR((eq)->q_id)
+
+#define EQ_HI_PHYS_ADDR_REG(eq, pg_num) \
+   HINIC_CSR_AEQ_HI_PHYS_ADDR_REG((eq)->q_id, pg_num)
+
+#define EQ_LO_PHYS_ADDR_REG(eq, pg_num) \
+   HINIC_CSR_AEQ_LO_PHYS_ADDR_REG((eq)->q_id, pg_num)
+
+#define GET_EQ_ELEMENT(eq, idx) \
+   ((eq)->virt_addr[(idx) / (eq)->num_elem_in_pg] + \
+(((idx) & ((eq)->num_elem_in_pg - 1)) * (eq)->elem_size))
+
+#define GET_AEQ_ELEM(eq, idx)   ((struct hinic_aeq_elem *) \
+   GET_EQ_ELEMENT(eq, idx))
+
+#define GET_CURR_AEQ_ELEM(eq)   GET_AEQ_ELEM(eq, (eq)->cons_idx)
+
+#define PAGE_IN_4K(page_size)   ((page_size) >> 12)
+#define EQ_SET_HW_PAGE_SIZE_VAL(eq) (ilog2(PAGE_IN_4K((eq)->page_size)))
+
+#define ELEMENT_SIZE_IN_32B(eq) (((eq)->elem_size) >> 5)
+#define EQ_SET_HW_ELEM_SIZE_VAL(eq) (ilog2(ELEMENT_SIZE_IN_32B(eq)))
+
+#define EQ_MAX_PAGES8
+
+#define aeq_to_aeqs(eq) \
+   container_of((eq) - (eq)->q_id, struct hinic_aeqs, aeq[0])
+
+#define work_to_aeq_work(work)  \
+   container_

[PATCH V4 net-next 08/21] net-next/hinic: Add port management commands

2017-08-16 Thread Aviad Krawczyk
Add the port management commands that are sent as management messages.
The port management commands are used for netdev operations.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/Makefile   |   4 +-
 drivers/net/ethernet/huawei/hinic/hinic_dev.h|   4 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c |  30 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h |  29 +++
 drivers/net/ethernet/huawei/hinic/hinic_main.c   | 195 +++-
 drivers/net/ethernet/huawei/hinic/hinic_port.c   | 224 +++
 drivers/net/ethernet/huawei/hinic/hinic_port.h   |  68 +++
 7 files changed, 551 insertions(+), 3 deletions(-)
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_port.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_port.h

diff --git a/drivers/net/ethernet/huawei/hinic/Makefile 
b/drivers/net/ethernet/huawei/hinic/Makefile
index beba90a..dbb1b9d 100644
--- a/drivers/net/ethernet/huawei/hinic/Makefile
+++ b/drivers/net/ethernet/huawei/hinic/Makefile
@@ -1,4 +1,4 @@
 obj-$(CONFIG_HINIC) += hinic.o
 
-hinic-y := hinic_main.o hinic_hw_dev.o hinic_hw_mgmt.o hinic_hw_api_cmd.o \
-  hinic_hw_eqs.o hinic_hw_if.o
+hinic-y := hinic_main.o hinic_port.o hinic_hw_dev.o hinic_hw_mgmt.o \
+  hinic_hw_api_cmd.o hinic_hw_eqs.o hinic_hw_if.o
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_dev.h 
b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
index 6c2c896..e54a45c 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
@@ -18,6 +18,7 @@
 
 #include 
 #include 
+#include 
 
 #include "hinic_hw_dev.h"
 
@@ -28,6 +29,9 @@ struct hinic_dev {
struct hinic_hwdev  *hwdev;
 
u32 msg_enable;
+
+   struct semaphoremgmt_lock;
+   unsigned long   *vlan_bitmap;
 };
 
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
index d430e60..6bb6c33 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
@@ -200,6 +200,36 @@ static void disable_msix(struct hinic_hwdev *hwdev)
 }
 
 /**
+ * hinic_port_msg_cmd - send port msg to mgmt
+ * @hwdev: the NIC HW device
+ * @cmd: the port command
+ * @buf_in: input buffer
+ * @in_size: input size
+ * @buf_out: output buffer
+ * @out_size: returned output size
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+int hinic_port_msg_cmd(struct hinic_hwdev *hwdev, enum hinic_port_cmd cmd,
+  void *buf_in, u16 in_size, void *buf_out, u16 *out_size)
+{
+   struct hinic_hwif *hwif = hwdev->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   struct hinic_pfhwdev *pfhwdev;
+
+   if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) {
+   dev_err(&pdev->dev, "unsupported PCI Function type\n");
+   return -EINVAL;
+   }
+
+   pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev);
+
+   return hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt, HINIC_MOD_L2NIC, cmd,
+buf_in, in_size, buf_out, out_size,
+HINIC_MGMT_MSG_SYNC);
+}
+
+/**
  * init_pfhwdev - Initialize the extended components of PF
  * @pfhwdev: the HW device for PF
  *
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
index feb60138..ee9e76a 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
@@ -30,6 +30,31 @@ struct hinic_cap {
u16 num_qps;
 };
 
+enum hinic_port_cmd {
+   HINIC_PORT_CMD_CHANGE_MTU   = 2,
+
+   HINIC_PORT_CMD_ADD_VLAN = 3,
+   HINIC_PORT_CMD_DEL_VLAN = 4,
+
+   HINIC_PORT_CMD_SET_MAC  = 9,
+   HINIC_PORT_CMD_GET_MAC  = 10,
+   HINIC_PORT_CMD_DEL_MAC  = 11,
+
+   HINIC_PORT_CMD_SET_RX_MODE  = 12,
+
+   HINIC_PORT_CMD_GET_LINK_STATE   = 24,
+
+   HINIC_PORT_CMD_SET_PORT_STATE   = 41,
+
+   HINIC_PORT_CMD_FWCTXT_INIT  = 69,
+
+   HINIC_PORT_CMD_SET_FUNC_STATE   = 93,
+
+   HINIC_PORT_CMD_GET_GLOBAL_QPN   = 102,
+
+   HINIC_PORT_CMD_GET_CAP  = 170,
+};
+
 struct hinic_hwdev {
struct hinic_hwif   *hwif;
struct msix_entry   *msix_entries;
@@ -45,6 +70,10 @@ struct hinic_pfhwdev {
struct hinic_pf_to_mgmt pf_to_mgmt;
 };
 
+int hinic_port_msg_cmd(struct hinic_hwdev *hwdev, enum hinic_port_cmd cmd,
+  void *buf_in, u16 in_size, void *buf_out,
+  u16 *out_size);
+
 struct hinic_hwdev *hinic_init_hwdev(struct pci_dev *pdev);
 
 void hinic_free_hwdev(struct hinic_hwdev *hwdev);
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c 
b/drivers/net/ethernet/huawei/hinic/hinic_main.

[PATCH V4 net-next 09/21] net-next/hinic: Add Rx mode and link event handler

2017-08-16 Thread Aviad Krawczyk
Add port management message for setting Rx mode in the card,
used for rx_mode netdev operation. The link event handler is used for
getting a notification about the link state.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_dev.h |  17 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h  |   2 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c  | 118 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h  |  37 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.c   |  17 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h   |  17 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c |  64 -
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h |  28 +++
 drivers/net/ethernet/huawei/hinic/hinic_main.c| 284 ++
 drivers/net/ethernet/huawei/hinic/hinic_port.c|  92 +++
 drivers/net/ethernet/huawei/hinic/hinic_port.h|  66 +
 11 files changed, 741 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_dev.h 
b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
index e54a45c..5c5b4e9 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
@@ -19,19 +19,36 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "hinic_hw_dev.h"
 
 #define HINIC_DRV_NAME  "hinic"
 
+enum hinic_flags {
+   HINIC_LINK_UP = BIT(0),
+   HINIC_INTF_UP = BIT(1),
+};
+
+struct hinic_rx_mode_work {
+   struct work_struct  work;
+   u32 rx_mode;
+};
+
 struct hinic_dev {
struct net_device   *netdev;
struct hinic_hwdev  *hwdev;
 
u32 msg_enable;
 
+   unsigned intflags;
+
struct semaphoremgmt_lock;
unsigned long   *vlan_bitmap;
+
+   struct hinic_rx_mode_work   rx_mode_work;
+   struct workqueue_struct *workq;
 };
 
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
index 52eb89c..1f57301 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
@@ -20,6 +20,8 @@
 #define HINIC_CSR_FUNC_ATTR0_ADDR   0x0
 #define HINIC_CSR_FUNC_ATTR1_ADDR   0x4
 
+#define HINIC_CSR_FUNC_ATTR5_ADDR   0x14
+
 #define HINIC_DMA_ATTR_BASE 0xC80
 #define HINIC_ELECTION_BASE 0x4200
 
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
index 6bb6c33..75fd6d2 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
@@ -230,6 +230,114 @@ int hinic_port_msg_cmd(struct hinic_hwdev *hwdev, enum 
hinic_port_cmd cmd,
 }
 
 /**
+ * hinic_hwdev_cb_register - register callback handler for MGMT events
+ * @hwdev: the NIC HW device
+ * @cmd: the mgmt event
+ * @handle: private data for the handler
+ * @handler: event handler
+ **/
+void hinic_hwdev_cb_register(struct hinic_hwdev *hwdev,
+enum hinic_mgmt_msg_cmd cmd, void *handle,
+void (*handler)(void *handle, void *buf_in,
+u16 in_size, void *buf_out,
+u16 *out_size))
+{
+   struct hinic_hwif *hwif = hwdev->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   struct hinic_pfhwdev *pfhwdev;
+   struct hinic_nic_cb *nic_cb;
+   u8 cmd_cb;
+
+   if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) {
+   dev_err(&pdev->dev, "unsupported PCI Function type\n");
+   return;
+   }
+
+   pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev);
+
+   cmd_cb = cmd - HINIC_MGMT_MSG_CMD_BASE;
+   nic_cb = &pfhwdev->nic_cb[cmd_cb];
+
+   nic_cb->handler = handler;
+   nic_cb->handle = handle;
+   nic_cb->cb_state = HINIC_CB_ENABLED;
+}
+
+/**
+ * hinic_hwdev_cb_unregister - unregister callback handler for MGMT events
+ * @hwdev: the NIC HW device
+ * @cmd: the mgmt event
+ **/
+void hinic_hwdev_cb_unregister(struct hinic_hwdev *hwdev,
+  enum hinic_mgmt_msg_cmd cmd)
+{
+   struct hinic_hwif *hwif = hwdev->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   struct hinic_pfhwdev *pfhwdev;
+   struct hinic_nic_cb *nic_cb;
+   u8 cmd_cb;
+
+   if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) {
+   dev_err(&pdev->dev, "unsupported PCI Function type\n");
+   return;
+   }
+
+   pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev);
+
+   cmd_cb = cmd - HINIC_MGMT_MSG_CMD_BASE;
+   nic_cb = &pfhwdev->nic_cb[cmd_cb];
+
+   nic_cb->cb_state &= ~HINIC_CB_ENABLED;
+
+   while (

[PATCH V4 net-next 10/21] net-next/hinic: Add logical Txq and Rxq

2017-08-16 Thread Aviad Krawczyk
Create the logical queues of the nic.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/Makefile   |   5 +-
 drivers/net/ethernet/huawei/hinic/hinic_dev.h|   5 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c | 131 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h |  20 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.c  | 144 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.h  |  46 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h  |  32 +
 drivers/net/ethernet/huawei/hinic/hinic_main.c   | 172 ++-
 drivers/net/ethernet/huawei/hinic/hinic_rx.c |  72 ++
 drivers/net/ethernet/huawei/hinic/hinic_rx.h |  46 ++
 drivers/net/ethernet/huawei/hinic/hinic_tx.c |  75 ++
 drivers/net/ethernet/huawei/hinic/hinic_tx.h |  49 +++
 12 files changed, 793 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_io.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_io.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_rx.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_rx.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_tx.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_tx.h

diff --git a/drivers/net/ethernet/huawei/hinic/Makefile 
b/drivers/net/ethernet/huawei/hinic/Makefile
index dbb1b9d..f60c449 100644
--- a/drivers/net/ethernet/huawei/hinic/Makefile
+++ b/drivers/net/ethernet/huawei/hinic/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_HINIC) += hinic.o
 
-hinic-y := hinic_main.o hinic_port.o hinic_hw_dev.o hinic_hw_mgmt.o \
-  hinic_hw_api_cmd.o hinic_hw_eqs.o hinic_hw_if.o
+hinic-y := hinic_main.o hinic_tx.o hinic_rx.o hinic_port.o hinic_hw_dev.o \
+  hinic_hw_io.o hinic_hw_mgmt.o hinic_hw_api_cmd.o hinic_hw_eqs.o \
+  hinic_hw_if.o
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_dev.h 
b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
index 5c5b4e9..5b8231d 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
@@ -23,6 +23,8 @@
 #include 
 
 #include "hinic_hw_dev.h"
+#include "hinic_tx.h"
+#include "hinic_rx.h"
 
 #define HINIC_DRV_NAME  "hinic"
 
@@ -49,6 +51,9 @@ struct hinic_dev {
 
struct hinic_rx_mode_work   rx_mode_work;
struct workqueue_struct *workq;
+
+   struct hinic_txq*txqs;
+   struct hinic_rxq*rxqs;
 };
 
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
index 75fd6d2..d113908 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
@@ -25,6 +25,8 @@
 #include "hinic_hw_if.h"
 #include "hinic_hw_eqs.h"
 #include "hinic_hw_mgmt.h"
+#include "hinic_hw_qp.h"
+#include "hinic_hw_io.h"
 #include "hinic_hw_dev.h"
 
 #define MAX_IRQS(max_qps, num_aeqs, num_ceqs)   \
@@ -230,6 +232,99 @@ int hinic_port_msg_cmd(struct hinic_hwdev *hwdev, enum 
hinic_port_cmd cmd,
 }
 
 /**
+ * get_base_qpn - get the first qp number
+ * @hwdev: the NIC HW device
+ * @base_qpn: returned qp number
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int get_base_qpn(struct hinic_hwdev *hwdev, u16 *base_qpn)
+{
+   struct hinic_cmd_base_qpn cmd_base_qpn;
+   struct hinic_hwif *hwif = hwdev->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   u16 out_size;
+   int err;
+
+   cmd_base_qpn.func_idx = HINIC_HWIF_FUNC_IDX(hwif);
+
+   err = hinic_port_msg_cmd(hwdev, HINIC_PORT_CMD_GET_GLOBAL_QPN,
+&cmd_base_qpn, sizeof(cmd_base_qpn),
+&cmd_base_qpn, &out_size);
+   if (err || (out_size != sizeof(cmd_base_qpn)) || cmd_base_qpn.status) {
+   dev_err(&pdev->dev, "Failed to get base qpn, status = %d\n",
+   cmd_base_qpn.status);
+   return -EFAULT;
+   }
+
+   *base_qpn = cmd_base_qpn.qpn;
+   return 0;
+}
+
+/**
+ * hinic_hwdev_ifup - Preparing the HW for passing IO
+ * @hwdev: the NIC HW device
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+int hinic_hwdev_ifup(struct hinic_hwdev *hwdev)
+{
+   struct hinic_func_to_io *func_to_io = &hwdev->func_to_io;
+   struct hinic_cap *nic_cap = &hwdev->nic_cap;
+   struct hinic_hwif *hwif = hwdev->hwif;
+   int err, num_aeqs, num_ceqs, num_qps;
+   struct msix_entry *sq_msix_entries;
+   struct msix_entry *rq_msix_entries;
+   struct pci_dev *pdev = hwif->pdev;
+   u16 base_qpn;
+
+   err = get_base_qpn(hwdev, &base_qpn);
+   if (err) {
+   dev_err(&pdev->dev, "Failed to get global base qp number\n");
+   return err;
+   }
+
+   num_aeqs = HINIC_HWIF_NUM_AEQS(hw

[PATCH V4 net-next 11/21] net-next/hinic: Add wq

2017-08-16 Thread Aviad Krawczyk
Create work queues for being used by the queue pairs.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/Makefile   |   4 +-
 drivers/net/ethernet/huawei/hinic/hinic_common.h |  25 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.c  |  69 ++-
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.h  |   6 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h  |  14 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c  | 517 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h  |  86 
 drivers/net/ethernet/huawei/hinic/hinic_hw_wqe.h | 253 +++
 8 files changed, 969 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_common.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_wqe.h

diff --git a/drivers/net/ethernet/huawei/hinic/Makefile 
b/drivers/net/ethernet/huawei/hinic/Makefile
index f60c449..0575a34 100644
--- a/drivers/net/ethernet/huawei/hinic/Makefile
+++ b/drivers/net/ethernet/huawei/hinic/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_HINIC) += hinic.o
 
 hinic-y := hinic_main.o hinic_tx.o hinic_rx.o hinic_port.o hinic_hw_dev.o \
-  hinic_hw_io.o hinic_hw_mgmt.o hinic_hw_api_cmd.o hinic_hw_eqs.o \
-  hinic_hw_if.o
+  hinic_hw_io.o hinic_hw_wq.o hinic_hw_mgmt.o hinic_hw_api_cmd.o \
+  hinic_hw_eqs.o hinic_hw_if.o
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_common.h 
b/drivers/net/ethernet/huawei/hinic/hinic_common.h
new file mode 100644
index 000..6a83c15
--- /dev/null
+++ b/drivers/net/ethernet/huawei/hinic/hinic_common.h
@@ -0,0 +1,25 @@
+/*
+ * Huawei HiNIC PCI Express Linux driver
+ * Copyright(c) 2017 Huawei Technologies Co., Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ */
+
+#ifndef HINIC_COMMON_H
+#define HINIC_COMMON_H
+
+struct hinic_sge {
+   u32 hi_addr;
+   u32 lo_addr;
+   u32 len;
+};
+
+#endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c
index ebe28ee..1bf944e 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c
@@ -20,6 +20,8 @@
 #include 
 
 #include "hinic_hw_if.h"
+#include "hinic_hw_wqe.h"
+#include "hinic_hw_wq.h"
 #include "hinic_hw_qp.h"
 #include "hinic_hw_io.h"
 
@@ -38,8 +40,33 @@ static int init_qp(struct hinic_func_to_io *func_to_io,
   struct msix_entry *sq_msix_entry,
   struct msix_entry *rq_msix_entry)
 {
-   /* should be implemented */
+   struct hinic_hwif *hwif = func_to_io->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   int err;
+
+   qp->q_id = q_id;
+
+   err = hinic_wq_allocate(&func_to_io->wqs, &func_to_io->sq_wq[q_id],
+   HINIC_SQ_WQEBB_SIZE, HINIC_SQ_PAGE_SIZE,
+   HINIC_SQ_DEPTH, HINIC_SQ_WQE_MAX_SIZE);
+   if (err) {
+   dev_err(&pdev->dev, "Failed to allocate WQ for SQ\n");
+   return err;
+   }
+
+   err = hinic_wq_allocate(&func_to_io->wqs, &func_to_io->rq_wq[q_id],
+   HINIC_RQ_WQEBB_SIZE, HINIC_RQ_PAGE_SIZE,
+   HINIC_RQ_DEPTH, HINIC_RQ_WQE_SIZE);
+   if (err) {
+   dev_err(&pdev->dev, "Failed to allocate WQ for RQ\n");
+   goto err_rq_alloc;
+   }
+
return 0;
+
+err_rq_alloc:
+   hinic_wq_free(&func_to_io->wqs, &func_to_io->sq_wq[q_id]);
+   return err;
 }
 
 /**
@@ -50,7 +77,10 @@ static int init_qp(struct hinic_func_to_io *func_to_io,
 static void destroy_qp(struct hinic_func_to_io *func_to_io,
   struct hinic_qp *qp)
 {
-   /* should be implemented */
+   int q_id = qp->q_id;
+
+   hinic_wq_free(&func_to_io->wqs, &func_to_io->rq_wq[q_id]);
+   hinic_wq_free(&func_to_io->wqs, &func_to_io->sq_wq[q_id]);
 }
 
 /**
@@ -70,7 +100,7 @@ int hinic_io_create_qps(struct hinic_func_to_io *func_to_io,
 {
struct hinic_hwif *hwif = func_to_io->hwif;
struct pci_dev *pdev = hwif->pdev;
-   size_t qps_size;
+   size_t qps_size, wq_size;
int i, j, err;
 
qps_size = num_qps * sizeof(*func_to_io->qps);
@@ -78,6 +108,20 @@ int hinic_io_create_qps(struct hinic_func_to_io *func_to_io,
if (!func_to_io->qps)
return -ENOMEM;
 
+   wq_size = n

[PATCH V4 net-next 12/21] net-next/hinic: Add qp resources

2017-08-16 Thread Aviad Krawczyk
Create the resources for queue pair operations: doorbell area,
consumer index address and producer index address.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/Makefile  |   4 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h |   1 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.c | 164 ++-
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.h |  27 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c | 266 
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h |  50 -
 6 files changed, 507 insertions(+), 5 deletions(-)
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c

diff --git a/drivers/net/ethernet/huawei/hinic/Makefile 
b/drivers/net/ethernet/huawei/hinic/Makefile
index 0575a34..84815f7 100644
--- a/drivers/net/ethernet/huawei/hinic/Makefile
+++ b/drivers/net/ethernet/huawei/hinic/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_HINIC) += hinic.o
 
 hinic-y := hinic_main.o hinic_tx.o hinic_rx.o hinic_port.o hinic_hw_dev.o \
-  hinic_hw_io.o hinic_hw_wq.o hinic_hw_mgmt.o hinic_hw_api_cmd.o \
-  hinic_hw_eqs.o hinic_hw_if.o
+  hinic_hw_io.o hinic_hw_qp.o hinic_hw_wq.o hinic_hw_mgmt.o \
+  hinic_hw_api_cmd.o hinic_hw_eqs.o hinic_hw_if.o
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
index 2280698..8f59195 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_if.h
@@ -137,6 +137,7 @@
 #define HINIC_IS_PPF(hwif)  (HINIC_FUNC_TYPE(hwif) == HINIC_PPF)
 
 #define HINIC_PCI_CFG_REGS_BAR  0
+#define HINIC_PCI_DB_BAR4
 
 #define HINIC_PCIE_ST_DISABLE   0
 #define HINIC_PCIE_AT_DISABLE   0
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c
index 1bf944e..ad12cc7 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_io.c
@@ -13,11 +13,16 @@
  *
  */
 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 
 #include "hinic_hw_if.h"
 #include "hinic_hw_wqe.h"
@@ -25,6 +30,76 @@
 #include "hinic_hw_qp.h"
 #include "hinic_hw_io.h"
 
+#define CI_Q_ADDR_SIZE  sizeof(u32)
+
+#define CI_ADDR(base_addr, q_id)((base_addr) + \
+(q_id) * CI_Q_ADDR_SIZE)
+
+#define CI_TABLE_SIZE(num_qps)  ((num_qps) * CI_Q_ADDR_SIZE)
+
+#define DB_IDX(db, db_base) \
+   (((unsigned long)(db) - (unsigned long)(db_base)) / HINIC_DB_PAGE_SIZE)
+
+static void init_db_area_idx(struct hinic_free_db_area *free_db_area)
+{
+   int i;
+
+   for (i = 0; i < HINIC_DB_MAX_AREAS; i++)
+   free_db_area->db_idx[i] = i;
+
+   free_db_area->alloc_pos = 0;
+   free_db_area->return_pos = HINIC_DB_MAX_AREAS;
+
+   free_db_area->num_free = HINIC_DB_MAX_AREAS;
+
+   sema_init(&free_db_area->idx_lock, 1);
+}
+
+static void __iomem *get_db_area(struct hinic_func_to_io *func_to_io)
+{
+   struct hinic_free_db_area *free_db_area = &func_to_io->free_db_area;
+   int pos, idx;
+
+   down(&free_db_area->idx_lock);
+
+   free_db_area->num_free--;
+
+   if (free_db_area->num_free < 0) {
+   free_db_area->num_free++;
+   up(&free_db_area->idx_lock);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   pos = free_db_area->alloc_pos++;
+   pos &= HINIC_DB_MAX_AREAS - 1;
+
+   idx = free_db_area->db_idx[pos];
+
+   free_db_area->db_idx[pos] = -1;
+
+   up(&free_db_area->idx_lock);
+
+   return func_to_io->db_base + idx * HINIC_DB_PAGE_SIZE;
+}
+
+static void return_db_area(struct hinic_func_to_io *func_to_io,
+  void __iomem *db_base)
+{
+   struct hinic_free_db_area *free_db_area = &func_to_io->free_db_area;
+   int pos, idx = DB_IDX(db_base, func_to_io->db_base);
+
+   down(&free_db_area->idx_lock);
+
+   pos = free_db_area->return_pos++;
+   pos &= HINIC_DB_MAX_AREAS - 1;
+
+   free_db_area->db_idx[pos] = idx;
+
+   free_db_area->num_free++;
+
+   up(&free_db_area->idx_lock);
+}
+
 /**
  * init_qp - Initialize a Queue Pair
  * @func_to_io: func to io channel that holds the IO components
@@ -42,6 +117,7 @@ static int init_qp(struct hinic_func_to_io *func_to_io,
 {
struct hinic_hwif *hwif = func_to_io->hwif;
struct pci_dev *pdev = hwif->pdev;
+   void __iomem *db_base;
int err;
 
qp->q_id = q_id;
@@ -62,8 +138,42 @@ static int init_qp(struct hinic_func_to_io *func_to_io,
goto err_rq_alloc;
}
 
+   db_base = get_db_area(func_to_io);
+   if (IS_ERR(db_base)) {
+   dev_err(&pdev->dev, "Failed to get DB area for SQ\n");
+   err = PTR_ERR(db_base);
+   goto err_get_

[PATCH V4 net-next 13/21] net-next/hinic: Set qp context

2017-08-16 Thread Aviad Krawczyk
Update the nic about the resources of the queue pairs.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/Makefile |   5 +-
 drivers/net/ethernet/huawei/hinic/hinic_common.c   |  55 ++
 drivers/net/ethernet/huawei/hinic/hinic_common.h   |   4 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c  |  87 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h  |  84 
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c   |   4 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.c| 151 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.h|   5 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c| 159 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h|  11 ++
 .../net/ethernet/huawei/hinic/hinic_hw_qp_ctxt.h   | 214 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h|   9 +
 12 files changed, 786 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_common.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h
 create mode 100644 drivers/net/ethernet/huawei/hinic/hinic_hw_qp_ctxt.h

diff --git a/drivers/net/ethernet/huawei/hinic/Makefile 
b/drivers/net/ethernet/huawei/hinic/Makefile
index 84815f7..289ce88b 100644
--- a/drivers/net/ethernet/huawei/hinic/Makefile
+++ b/drivers/net/ethernet/huawei/hinic/Makefile
@@ -1,5 +1,6 @@
 obj-$(CONFIG_HINIC) += hinic.o
 
 hinic-y := hinic_main.o hinic_tx.o hinic_rx.o hinic_port.o hinic_hw_dev.o \
-  hinic_hw_io.o hinic_hw_qp.o hinic_hw_wq.o hinic_hw_mgmt.o \
-  hinic_hw_api_cmd.o hinic_hw_eqs.o hinic_hw_if.o
+  hinic_hw_io.o hinic_hw_qp.o hinic_hw_cmdq.o hinic_hw_wq.o \
+  hinic_hw_mgmt.o hinic_hw_api_cmd.o hinic_hw_eqs.o hinic_hw_if.o \
+  hinic_common.o
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_common.c 
b/drivers/net/ethernet/huawei/hinic/hinic_common.c
new file mode 100644
index 000..1915ad6
--- /dev/null
+++ b/drivers/net/ethernet/huawei/hinic/hinic_common.c
@@ -0,0 +1,55 @@
+/*
+ * Huawei HiNIC PCI Express Linux driver
+ * Copyright(c) 2017 Huawei Technologies Co., Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ */
+
+#include 
+#include 
+
+#include "hinic_common.h"
+
+/**
+ * hinic_cpu_to_be32 - convert data to big endian 32 bit format
+ * @data: the data to convert
+ * @len: length of data to convert
+ **/
+void hinic_cpu_to_be32(void *data, int len)
+{
+   u32 *mem = data;
+   int i;
+
+   len = len / sizeof(u32);
+
+   for (i = 0; i < len; i++) {
+   *mem = cpu_to_be32(*mem);
+   mem++;
+   }
+}
+
+/**
+ * hinic_be32_to_cpu - convert data from big endian 32 bit format
+ * @data: the data to convert
+ * @len: length of data to convert
+ **/
+void hinic_be32_to_cpu(void *data, int len)
+{
+   u32 *mem = data;
+   int i;
+
+   len = len / sizeof(u32);
+
+   for (i = 0; i < len; i++) {
+   *mem = be32_to_cpu(*mem);
+   mem++;
+   }
+}
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_common.h 
b/drivers/net/ethernet/huawei/hinic/hinic_common.h
index 6a83c15..0f2f4ff 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_common.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_common.h
@@ -22,4 +22,8 @@ struct hinic_sge {
u32 len;
 };
 
+void hinic_cpu_to_be32(void *data, int len);
+
+void hinic_be32_to_cpu(void *data, int len);
+
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
new file mode 100644
index 000..2fd3924
--- /dev/null
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
@@ -0,0 +1,87 @@
+/*
+ * Huawei HiNIC PCI Express Linux driver
+ * Copyright(c) 2017 Huawei Technologies Co., Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ */
+
+#include 
+#include 
+
+#include "hinic_hw_if.h"
+#include "hinic_hw_cmdq.h"
+
+/**
+ * hinic_alloc_cmdq_buf - alloc buffer for sending command
+ * @cmdqs: the cmdqs
+ * @cmdq_buf: the buffer returned in this struct

[PATCH V4 net-next 14/21] net-next/hinic: Initialize cmdq

2017-08-16 Thread Aviad Krawczyk
Create the work queues for cmdq and update the nic about the
work queue contexts. cmdq commands are used for updating the
nic about the qp contexts.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c | 282 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h |  53 
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h  |   2 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h |   5 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c   | 156 
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h   |   8 +
 6 files changed, 500 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
index 2fd3924..0dccbe6 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
@@ -13,11 +13,49 @@
  *
  */
 
+#include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
 
 #include "hinic_hw_if.h"
+#include "hinic_hw_mgmt.h"
+#include "hinic_hw_wq.h"
 #include "hinic_hw_cmdq.h"
+#include "hinic_hw_io.h"
+#include "hinic_hw_dev.h"
+
+#define CMDQ_DB_OFF SZ_2K
+
+#define CMDQ_WQEBB_SIZE 64
+#define CMDQ_DEPTH  SZ_4K
+
+#define CMDQ_WQ_PAGE_SIZE   SZ_4K
+
+#define WQE_LCMD_SIZE   64
+#define WQE_SCMD_SIZE   64
+
+#define CMDQ_PFN(addr, page_size)   ((addr) >> (ilog2(page_size)))
+
+#define cmdq_to_cmdqs(cmdq) container_of((cmdq) - (cmdq)->cmdq_type, \
+struct hinic_cmdqs, cmdq[0])
+
+#define cmdqs_to_func_to_io(cmdqs)  container_of(cmdqs, \
+struct hinic_func_to_io, \
+cmdqs)
+
+enum cmdq_wqe_type {
+   WQE_LCMD_TYPE = 0,
+   WQE_SCMD_TYPE = 1,
+};
 
 /**
  * hinic_alloc_cmdq_buf - alloc buffer for sending command
@@ -29,8 +67,17 @@
 int hinic_alloc_cmdq_buf(struct hinic_cmdqs *cmdqs,
 struct hinic_cmdq_buf *cmdq_buf)
 {
-   /* should be implemented */
-   return -ENOMEM;
+   struct hinic_hwif *hwif = cmdqs->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+
+   cmdq_buf->buf = pci_pool_alloc(cmdqs->cmdq_buf_pool, GFP_KERNEL,
+  &cmdq_buf->dma_addr);
+   if (!cmdq_buf->buf) {
+   dev_err(&pdev->dev, "Failed to allocate cmd from the pool\n");
+   return -ENOMEM;
+   }
+
+   return 0;
 }
 
 /**
@@ -41,7 +88,7 @@ int hinic_alloc_cmdq_buf(struct hinic_cmdqs *cmdqs,
 void hinic_free_cmdq_buf(struct hinic_cmdqs *cmdqs,
 struct hinic_cmdq_buf *cmdq_buf)
 {
-   /* should be implemented */
+   pci_pool_free(cmdqs->cmdq_buf_pool, cmdq_buf->buf, cmdq_buf->dma_addr);
 }
 
 /**
@@ -63,6 +110,169 @@ int hinic_cmdq_direct_resp(struct hinic_cmdqs *cmdqs,
 }
 
 /**
+ * cmdq_init_queue_ctxt - init the queue ctxt of a cmdq
+ * @cmdq_ctxt: cmdq ctxt to initialize
+ * @cmdq: the cmdq
+ * @cmdq_pages: the memory of the queue
+ **/
+static void cmdq_init_queue_ctxt(struct hinic_cmdq_ctxt *cmdq_ctxt,
+struct hinic_cmdq *cmdq,
+struct hinic_cmdq_pages *cmdq_pages)
+{
+   struct hinic_cmdq_ctxt_info *ctxt_info = &cmdq_ctxt->ctxt_info;
+   u64 wq_first_page_paddr, cmdq_first_block_paddr, pfn;
+   struct hinic_cmdqs *cmdqs = cmdq_to_cmdqs(cmdq);
+   struct hinic_wq *wq = cmdq->wq;
+
+   /* The data in the HW is in Big Endian Format */
+   wq_first_page_paddr = be64_to_cpu(*wq->block_vaddr);
+
+   pfn = CMDQ_PFN(wq_first_page_paddr, wq->wq_page_size);
+
+   ctxt_info->curr_wqe_page_pfn =
+   HINIC_CMDQ_CTXT_PAGE_INFO_SET(pfn, CURR_WQE_PAGE_PFN)   |
+   HINIC_CMDQ_CTXT_PAGE_INFO_SET(HINIC_CEQ_ID_CMDQ, EQ_ID) |
+   HINIC_CMDQ_CTXT_PAGE_INFO_SET(1, CEQ_ARM)   |
+   HINIC_CMDQ_CTXT_PAGE_INFO_SET(1, CEQ_EN)|
+   HINIC_CMDQ_CTXT_PAGE_INFO_SET(cmdq->wrapped, WRAPPED);
+
+   /* block PFN - Read Modify Write */
+   cmdq_first_block_paddr = cmdq_pages->page_paddr;
+
+   pfn = CMDQ_PFN(cmdq_first_block_paddr, wq->wq_page_size);
+
+   ctxt_info->wq_block_pfn =
+   HINIC_CMDQ_CTXT_BLOCK_INFO_SET(pfn, WQ_BLOCK_PFN) |
+   HINIC_CMDQ_CTXT_BLOCK_INFO_SET(atomic_read(&wq->cons_idx), CI);
+
+   cmdq_ctxt->func_idx = HINIC_HWIF_FUNC_IDX(cmdqs->hwif);
+   cmdq_ctxt->cmdq_type  = cmdq->cmdq_type;
+}
+
+/**
+ * init_cmdq - initialize cmdq
+ * @cmdq: the cmdq
+ * @wq: the wq attached to the cmdq
+ * @q_type: the cmdq type of the cmdq
+ * @db_area: doorbell area for the cmdq
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int init_

[PATCH V4 net-next 16/21] net-next/hinic: Add cmdq commands

2017-08-16 Thread Aviad Krawczyk
Add cmdq commands for setting queue pair contexts in the nic.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_common.c  |  25 ++
 drivers/net/ethernet/huawei/hinic/hinic_common.h  |   9 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c | 282 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h |  38 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.h   |  10 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c   | 194 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h   |  12 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_wqe.h  | 115 +
 8 files changed, 683 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_common.c 
b/drivers/net/ethernet/huawei/hinic/hinic_common.c
index 1915ad6..02c74fd 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_common.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_common.c
@@ -13,6 +13,7 @@
  *
  */
 
+#include 
 #include 
 #include 
 
@@ -53,3 +54,27 @@ void hinic_be32_to_cpu(void *data, int len)
mem++;
}
 }
+
+/**
+ * hinic_set_sge - set dma area in scatter gather entry
+ * @sge: scatter gather entry
+ * @addr: dma address
+ * @len: length of relevant data in the dma address
+ **/
+void hinic_set_sge(struct hinic_sge *sge, dma_addr_t addr, int len)
+{
+   sge->hi_addr = upper_32_bits(addr);
+   sge->lo_addr = lower_32_bits(addr);
+   sge->len  = len;
+}
+
+/**
+ * hinic_sge_to_dma - get dma address from scatter gather entry
+ * @sge: scatter gather entry
+ *
+ * Return dma address of sg entry
+ **/
+dma_addr_t hinic_sge_to_dma(struct hinic_sge *sge)
+{
+   return (dma_addr_t)((((u64)sge->hi_addr) << 32) | sge->lo_addr);
+}
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_common.h 
b/drivers/net/ethernet/huawei/hinic/hinic_common.h
index 0f2f4ff..2c06b76 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_common.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_common.h
@@ -16,6 +16,11 @@
 #ifndef HINIC_COMMON_H
 #define HINIC_COMMON_H
 
+#include 
+
+#define UPPER_8_BITS(data)  (((data) >> 8) & 0xFF)
+#define LOWER_8_BITS(data)  ((data) & 0xFF)
+
 struct hinic_sge {
u32 hi_addr;
u32 lo_addr;
@@ -26,4 +31,8 @@ struct hinic_sge {
 
 void hinic_be32_to_cpu(void *data, int len);
 
+void hinic_set_sge(struct hinic_sge *sge, dma_addr_t addr, int len);
+
+dma_addr_t hinic_sge_to_dma(struct hinic_sge *sge);
+
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
index ec24b95..d8c0807 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
@@ -24,19 +24,34 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
+#include 
 
+#include "hinic_common.h"
 #include "hinic_hw_if.h"
 #include "hinic_hw_eqs.h"
 #include "hinic_hw_mgmt.h"
+#include "hinic_hw_wqe.h"
 #include "hinic_hw_wq.h"
 #include "hinic_hw_cmdq.h"
 #include "hinic_hw_io.h"
 #include "hinic_hw_dev.h"
 
+#define CMDQ_DB_PI_OFF(pi)  (((u16)LOWER_8_BITS(pi)) << 3)
+
+#define CMDQ_DB_ADDR(db_base, pi)   ((db_base) + CMDQ_DB_PI_OFF(pi))
+
+#define CMDQ_WQE_HEADER(wqe)((struct hinic_cmdq_header *)(wqe))
+
+#define FIRST_DATA_TO_WRITE_LASTsizeof(u64)
+
 #define CMDQ_DB_OFF SZ_2K
 
 #define CMDQ_WQEBB_SIZE 64
+#define CMDQ_WQE_SIZE   64
 #define CMDQ_DEPTH  SZ_4K
 
 #define CMDQ_WQ_PAGE_SIZE   SZ_4K
@@ -44,6 +59,10 @@
 #define WQE_LCMD_SIZE   64
 #define WQE_SCMD_SIZE   64
 
+#define COMPLETE_LEN3
+
+#define CMDQ_TIMEOUT1000
+
 #define CMDQ_PFN(addr, page_size)   ((addr) >> (ilog2(page_size)))
 
 #define cmdq_to_cmdqs(cmdq) container_of((cmdq) - (cmdq)->cmdq_type, \
@@ -58,6 +77,40 @@ enum cmdq_wqe_type {
WQE_SCMD_TYPE = 1,
 };
 
+enum completion_format {
+   COMPLETE_DIRECT = 0,
+   COMPLETE_SGE= 1,
+};
+
+enum data_format {
+   DATA_SGE= 0,
+   DATA_DIRECT = 1,
+};
+
+enum bufdesc_len {
+   BUFDESC_LCMD_LEN = 2,   /* 16 bytes - 2(8 byte unit) */
+   BUFDESC_SCMD_LEN = 3,   /* 24 bytes - 3(8 byte unit) */
+};
+
+enum ctrl_sect_len {
+   CTRL_SECT_LEN= 1, /* 4 bytes (ctrl) - 1(8 byte unit) */
+   CTRL_DIRECT_SECT_LEN = 2, /* 12 bytes (ctrl + rsvd) - 2(8 byte unit) */
+};
+
+enum cmdq_scmd_type {
+   CMDQ_SET_ARM_CMD = 2,
+};
+
+enum cmdq_cmd_type {
+   CMDQ_CMD_SYNC_DIRECT_RESP = 0,
+   CMDQ_CMD_SYNC_SGE_RESP= 1,
+};
+
+enum completion_request {
+   NO_CEQ  = 0,
+   CEQ_SET = 1,
+};
+
 /**
  * hinic_alloc_cmdq_buf - alloc buffer for sending command
  * @cmdqs: the cmdqs
@@ -92,6 +145,221 @@ void hinic_free_cmdq_buf(struct hinic_cmdqs *cmdqs,
pci_pool_free(cmdqs->c

[PATCH V4 net-next 15/21] net-next/hinic: Add ceqs

2017-08-16 Thread Aviad Krawczyk
Initialize the completion event queues and handle ceq events by calling
the registered handlers. Used for cmdq command completion.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c |  16 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h  |  29 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c  |   7 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.c  | 291 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_eqs.h  |  75 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.c   |  15 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_io.h   |   3 +
 7 files changed, 428 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
index 0dccbe6..ec24b95 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
@@ -27,6 +27,7 @@
 #include 
 
 #include "hinic_hw_if.h"
+#include "hinic_hw_eqs.h"
 #include "hinic_hw_mgmt.h"
 #include "hinic_hw_wq.h"
 #include "hinic_hw_cmdq.h"
@@ -110,6 +111,16 @@ int hinic_cmdq_direct_resp(struct hinic_cmdqs *cmdqs,
 }
 
 /**
+ * cmdq_ceq_handler - cmdq completion event handler
+ * @handle: private data for the handler(cmdqs)
+ * @ceqe_data: ceq element data
+ **/
+static void cmdq_ceq_handler(void *handle, u32 ceqe_data)
+{
+   /* should be implemented */
+}
+
+/**
  * cmdq_init_queue_ctxt - init the queue ctxt of a cmdq
  * @cmdq_ctxt: cmdq ctxt to initialize
  * @cmdq: the cmdq
@@ -320,6 +331,8 @@ int hinic_init_cmdqs(struct hinic_cmdqs *cmdqs, struct 
hinic_hwif *hwif,
goto err_cmdq_ctxt;
}
 
+   hinic_ceq_register_cb(&func_to_io->ceqs, HINIC_CEQ_CMDQ, cmdqs,
+ cmdq_ceq_handler);
return 0;
 
 err_cmdq_ctxt:
@@ -340,10 +353,13 @@ int hinic_init_cmdqs(struct hinic_cmdqs *cmdqs, struct 
hinic_hwif *hwif,
  **/
 void hinic_free_cmdqs(struct hinic_cmdqs *cmdqs)
 {
+   struct hinic_func_to_io *func_to_io = cmdqs_to_func_to_io(cmdqs);
struct hinic_hwif *hwif = cmdqs->hwif;
struct pci_dev *pdev = hwif->pdev;
enum hinic_cmdq_type cmdq_type;
 
+   hinic_ceq_unregister_cb(&func_to_io->ceqs, HINIC_CEQ_CMDQ);
+
cmdq_type = HINIC_CMDQ_SYNC;
for (; cmdq_type < HINIC_MAX_CMDQ_TYPES; cmdq_type++)
free_cmdq(&cmdqs->cmdq[cmdq_type]);
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
index 1f57301..10b8c7b 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
@@ -81,27 +81,44 @@
 
 /* EQ registers */
 #define HINIC_AEQ_MTT_OFF_BASE_ADDR 0x200
+#define HINIC_CEQ_MTT_OFF_BASE_ADDR 0x400
 
 #define HINIC_EQ_MTT_OFF_STRIDE 0x40
 
 #define HINIC_CSR_AEQ_MTT_OFF(id)   \
(HINIC_AEQ_MTT_OFF_BASE_ADDR + (id) * HINIC_EQ_MTT_OFF_STRIDE)
 
+#define HINIC_CSR_CEQ_MTT_OFF(id)   \
+   (HINIC_CEQ_MTT_OFF_BASE_ADDR + (id) * HINIC_EQ_MTT_OFF_STRIDE)
+
 #define HINIC_CSR_EQ_PAGE_OFF_STRIDE8
 
 #define HINIC_CSR_AEQ_HI_PHYS_ADDR_REG(q_id, pg_num)\
(HINIC_CSR_AEQ_MTT_OFF(q_id) + \
 (pg_num) * HINIC_CSR_EQ_PAGE_OFF_STRIDE)
 
+#define HINIC_CSR_CEQ_HI_PHYS_ADDR_REG(q_id, pg_num)\
+   (HINIC_CSR_CEQ_MTT_OFF(q_id) +  \
+(pg_num) * HINIC_CSR_EQ_PAGE_OFF_STRIDE)
+
 #define HINIC_CSR_AEQ_LO_PHYS_ADDR_REG(q_id, pg_num)\
(HINIC_CSR_AEQ_MTT_OFF(q_id) + \
 (pg_num) * HINIC_CSR_EQ_PAGE_OFF_STRIDE + 4)
 
+#define HINIC_CSR_CEQ_LO_PHYS_ADDR_REG(q_id, pg_num)\
+   (HINIC_CSR_CEQ_MTT_OFF(q_id) +  \
+(pg_num) * HINIC_CSR_EQ_PAGE_OFF_STRIDE + 4)
+
 #define HINIC_AEQ_CTRL_0_ADDR_BASE  0xE00
 #define HINIC_AEQ_CTRL_1_ADDR_BASE  0xE04
 #define HINIC_AEQ_CONS_IDX_ADDR_BASE0xE08
 #define HINIC_AEQ_PROD_IDX_ADDR_BASE0xE0C
 
+#define HINIC_CEQ_CTRL_0_ADDR_BASE  0x1000
+#define HINIC_CEQ_CTRL_1_ADDR_BASE  0x1004
+#define HINIC_CEQ_CONS_IDX_ADDR_BASE0x1008
+#define HINIC_CEQ_PROD_IDX_ADDR_BASE0x100C
+
 #define HINIC_EQ_OFF_STRIDE 0x80
 
 #define HINIC_CSR_AEQ_CTRL_0_ADDR(idx)  \
@@ -116,4 +133,16 @@
 #define HINIC_CSR_AEQ_PROD_IDX_ADDR(idx)\
(HINIC_AEQ_PROD_IDX_ADDR_BASE + (idx) * HINIC_EQ_OFF_STRIDE)
 
+#define HINIC_CSR_CEQ_CTRL_0_ADDR(idx)  \
+   (HINIC_CEQ_CTRL_0_ADDR_BASE + (idx) * HINIC_EQ_OFF_STRIDE)
+
+#define HINIC_CSR_CEQ_CTRL_1_ADDR(idx)  \
+   (HINIC_CEQ_CTRL_1_ADDR_BASE + (idx) * HINIC_EQ_OFF_STRIDE)
+
+#define HINIC_CSR_CEQ_CONS_IDX_ADDR(idx)\
+   (HINI

[PATCH V4 net-next 17/21] net-next/hinic: Add cmdq completion handler

2017-08-16 Thread Aviad Krawczyk
Add cmdq completion handler for getting a notification about the
completion of cmdq commands.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c | 297 +-
 drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.h |  12 +
 2 files changed, 308 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
index d8c0807..8d72762 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_cmdq.c
@@ -40,12 +40,31 @@
 #include "hinic_hw_io.h"
 #include "hinic_hw_dev.h"
 
+#define CMDQ_CEQE_TYPE_SHIFT0
+
+#define CMDQ_CEQE_TYPE_MASK 0x7
+
+#define CMDQ_CEQE_GET(val, member)  \
+   (((val) >> CMDQ_CEQE_##member##_SHIFT) \
+& CMDQ_CEQE_##member##_MASK)
+
+#define CMDQ_WQE_ERRCODE_VAL_SHIFT  20
+
+#define CMDQ_WQE_ERRCODE_VAL_MASK   0xF
+
+#define CMDQ_WQE_ERRCODE_GET(val, member)   \
+   (((val) >> CMDQ_WQE_ERRCODE_##member##_SHIFT) \
+& CMDQ_WQE_ERRCODE_##member##_MASK)
+
 #define CMDQ_DB_PI_OFF(pi)  (((u16)LOWER_8_BITS(pi)) << 3)
 
 #define CMDQ_DB_ADDR(db_base, pi)   ((db_base) + CMDQ_DB_PI_OFF(pi))
 
 #define CMDQ_WQE_HEADER(wqe)((struct hinic_cmdq_header *)(wqe))
 
+#define CMDQ_WQE_COMPLETED(ctrl_info)   \
+   HINIC_CMDQ_CTRL_GET(ctrl_info, HW_BUSY_BIT)
+
 #define FIRST_DATA_TO_WRITE_LASTsizeof(u64)
 
 #define CMDQ_DB_OFF SZ_2K
@@ -145,6 +164,22 @@ void hinic_free_cmdq_buf(struct hinic_cmdqs *cmdqs,
pci_pool_free(cmdqs->cmdq_buf_pool, cmdq_buf->buf, cmdq_buf->dma_addr);
 }
 
+static unsigned int cmdq_wqe_size_from_bdlen(enum bufdesc_len len)
+{
+   unsigned int wqe_size = 0;
+
+   switch (len) {
+   case BUFDESC_LCMD_LEN:
+   wqe_size = WQE_LCMD_SIZE;
+   break;
+   case BUFDESC_SCMD_LEN:
+   wqe_size = WQE_SCMD_SIZE;
+   break;
+   }
+
+   return wqe_size;
+}
+
 static void cmdq_set_sge_completion(struct hinic_cmdq_completion *completion,
struct hinic_cmdq_buf *buf_out)
 {
@@ -211,6 +246,15 @@ static void cmdq_set_lcmd_bufdesc(struct 
hinic_cmdq_wqe_lcmd *wqe_lcmd,
hinic_set_sge(&wqe_lcmd->buf_desc.sge, buf_in->dma_addr, buf_in->size);
 }
 
+static void cmdq_set_direct_wqe_data(struct hinic_cmdq_direct_wqe *wqe,
+void *buf_in, u32 in_size)
+{
+   struct hinic_cmdq_wqe_scmd *wqe_scmd = &wqe->wqe_scmd;
+
+   wqe_scmd->buf_desc.buf_len = in_size;
+   memcpy(wqe_scmd->buf_desc.data, buf_in, in_size);
+}
+
 static void cmdq_set_lcmd_wqe(struct hinic_cmdq_wqe *wqe,
  enum cmdq_cmd_type cmd_type,
  struct hinic_cmdq_buf *buf_in,
@@ -239,6 +283,36 @@ static void cmdq_set_lcmd_wqe(struct hinic_cmdq_wqe *wqe,
cmdq_set_lcmd_bufdesc(wqe_lcmd, buf_in);
 }
 
+static void cmdq_set_direct_wqe(struct hinic_cmdq_wqe *wqe,
+   enum cmdq_cmd_type cmd_type,
+   void *buf_in, u16 in_size,
+   struct hinic_cmdq_buf *buf_out, int wrapped,
+   enum hinic_cmd_ack_type ack_type,
+   enum hinic_mod_type mod, u8 cmd, u16 prod_idx)
+{
+   struct hinic_cmdq_direct_wqe *direct_wqe = &wqe->direct_wqe;
+   enum completion_format complete_format;
+   struct hinic_cmdq_wqe_scmd *wqe_scmd;
+
+   wqe_scmd = &direct_wqe->wqe_scmd;
+
+   switch (cmd_type) {
+   case CMDQ_CMD_SYNC_SGE_RESP:
+   complete_format = COMPLETE_SGE;
+   cmdq_set_sge_completion(&wqe_scmd->completion, buf_out);
+   break;
+   case CMDQ_CMD_SYNC_DIRECT_RESP:
+   complete_format = COMPLETE_DIRECT;
+   wqe_scmd->completion.direct_resp = 0;
+   break;
+   }
+
+   cmdq_prepare_wqe_ctrl(wqe, wrapped, ack_type, mod, cmd, prod_idx,
+ complete_format, DATA_DIRECT, BUFDESC_SCMD_LEN);
+
+   cmdq_set_direct_wqe_data(direct_wqe, buf_in, in_size);
+}
+
 static void cmdq_wqe_fill(void *dst, void *src)
 {
memcpy(dst + FIRST_DATA_TO_WRITE_LAST, src + FIRST_DATA_TO_WRITE_LAST,
@@ -352,6 +426,52 @@ static int cmdq_sync_cmd_direct_resp(struct hinic_cmdq 
*cmdq,
return 0;
 }
 
+static int cmdq_set_arm_bit(struct hinic_cmdq *cmdq, void *buf_in,
+   u16 in_size)
+{
+   struct hinic_cmdq_wqe *curr_cmdq_wqe, cmdq_wqe;
+   u16 curr_prod_idx, next_prod_idx;
+   struct hinic_wq *wq = cmdq->wq;
+   struct hinic_hw_wqe *hw_wqe;
+   int wrapped, num_wqebbs;
+
+   /* Keep doorbell index corr

[PATCH V4 net-next 18/21] net-next/hinic: Add Rx handler

2017-08-16 Thread Aviad Krawczyk
Set the io resources in the nic and handle rx events by qp operations.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_dev.h |   1 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h  |   1 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c  | 361 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h  |  77 
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.c   |  36 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_if.h   |  35 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.h |  13 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c   | 210 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h   |  29 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.c   |  12 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_wq.h   |   2 +
 drivers/net/ethernet/huawei/hinic/hinic_main.c|  27 ++
 drivers/net/ethernet/huawei/hinic/hinic_port.c|  32 ++
 drivers/net/ethernet/huawei/hinic/hinic_port.h|  19 +
 drivers/net/ethernet/huawei/hinic/hinic_rx.c  | 419 ++
 drivers/net/ethernet/huawei/hinic/hinic_rx.h  |   7 +
 16 files changed, 1281 insertions(+)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_dev.h 
b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
index 5b8231d..3d0f6cf 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
@@ -43,6 +43,7 @@ struct hinic_dev {
struct hinic_hwdev  *hwdev;
 
u32 msg_enable;
+   unsigned intrx_weight;
 
unsigned intflags;
 
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
index 10b8c7b..f39b184 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_csr.h
@@ -20,6 +20,7 @@
 #define HINIC_CSR_FUNC_ATTR0_ADDR   0x0
 #define HINIC_CSR_FUNC_ATTR1_ADDR   0x4
 
+#define HINIC_CSR_FUNC_ATTR4_ADDR   0x10
 #define HINIC_CSR_FUNC_ATTR5_ADDR   0x14
 
 #define HINIC_DMA_ATTR_BASE 0xC80
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
index cb4c472..03149e8 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
@@ -20,6 +20,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 
 #include "hinic_hw_if.h"
@@ -30,6 +33,10 @@
 #include "hinic_hw_io.h"
 #include "hinic_hw_dev.h"
 
+#define IO_STATUS_TIMEOUT   100
+#define OUTBOUND_STATE_TIMEOUT  100
+#define DB_STATE_TIMEOUT100
+
 #define MAX_IRQS(max_qps, num_aeqs, num_ceqs)   \
 (2 * (max_qps) + (num_aeqs) + (num_ceqs))
 
@@ -37,6 +44,15 @@ enum intr_type {
INTR_MSIX_TYPE,
 };
 
+enum io_status {
+   IO_STOPPED = 0,
+   IO_RUNNING = 1,
+};
+
+enum hw_ioctxt_set_cmdq_depth {
+   HW_IOCTXT_SET_CMDQ_DEPTH_DEFAULT,
+};
+
 /* HW struct */
 struct hinic_dev_cap {
u8  status;
@@ -51,6 +67,31 @@ struct hinic_dev_cap {
u8  rsvd3[208];
 };
 
+struct rx_buf_sz {
+   int idx;
+   size_t  sz;
+};
+
+static struct rx_buf_sz rx_buf_sz_table[] = {
+   {0, 32},
+   {1, 64},
+   {2, 96},
+   {3, 128},
+   {4, 192},
+   {5, 256},
+   {6, 384},
+   {7, 512},
+   {8, 768},
+   {9, 1024},
+   {10, 1536},
+   {11, 2048},
+   {12, 3072},
+   {13, 4096},
+   {14, 8192},
+   {15, 16384},
+   {-1, -1},
+};
+
 /**
  * get_capability - convert device capabilities to NIC capabilities
  * @hwdev: the HW device to set and convert device capabilities for
@@ -236,6 +277,252 @@ int hinic_port_msg_cmd(struct hinic_hwdev *hwdev, enum 
hinic_port_cmd cmd,
 }
 
 /**
+ * init_fw_ctxt- Init Firmware tables before network mgmt and io operations
+ * @hwdev: the NIC HW device
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+static int init_fw_ctxt(struct hinic_hwdev *hwdev)
+{
+   struct hinic_hwif *hwif = hwdev->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   struct hinic_cmd_fw_ctxt fw_ctxt;
+   struct hinic_pfhwdev *pfhwdev;
+   u16 out_size;
+   int err;
+
+   if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) {
+   dev_err(&pdev->dev, "Unsupported PCI Function type\n");
+   return -EINVAL;
+   }
+
+   fw_ctxt.func_idx = HINIC_HWIF_FUNC_IDX(hwif);
+   fw_ctxt.rx_buf_sz = HINIC_RX_BUF_SZ;
+
+   pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev);
+
+   err = hinic_port_msg_cmd(hwdev, HINIC_PORT_CMD_FWCTXT_INIT,
+&fw_ctxt, sizeof(fw_ctxt),
+&fw_ctxt, &out_size);
+   if (err || (out_size != sizeof(fw_ctxt)) || fw_c

[PATCH V4 net-next 20/21] net-next/hinic: Add ethtool and stats

2017-08-16 Thread Aviad Krawczyk
Add ethtool operations and statistics operations.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_dev.h  |   3 +
 drivers/net/ethernet/huawei/hinic/hinic_main.c | 218 -
 drivers/net/ethernet/huawei/hinic/hinic_port.c |  31 
 drivers/net/ethernet/huawei/hinic/hinic_port.h |  45 +
 drivers/net/ethernet/huawei/hinic/hinic_rx.c   |  19 +++
 drivers/net/ethernet/huawei/hinic/hinic_rx.h   |   2 +
 drivers/net/ethernet/huawei/hinic/hinic_tx.c   |  22 +++
 drivers/net/ethernet/huawei/hinic/hinic_tx.h   |   2 +
 8 files changed, 341 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_dev.h 
b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
index 15d0c2e..5186cc9 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
@@ -56,6 +56,9 @@ struct hinic_dev {
 
struct hinic_txq*txqs;
struct hinic_rxq*rxqs;
+
+   struct hinic_txq_stats  tx_stats;
+   struct hinic_rxq_stats  rx_stats;
 };
 
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c 
b/drivers/net/ethernet/huawei/hinic/hinic_main.c
index 4a5f23f..a77a7f8 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_main.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c
@@ -69,6 +69,186 @@
 
 static int change_mac_addr(struct net_device *netdev, const u8 *addr);
 
+static void set_link_speed(struct ethtool_link_ksettings *link_ksettings,
+  enum hinic_speed speed)
+{
+   switch (speed) {
+   case HINIC_SPEED_10MB_LINK:
+   link_ksettings->base.speed = SPEED_10;
+   break;
+
+   case HINIC_SPEED_100MB_LINK:
+   link_ksettings->base.speed = SPEED_100;
+   break;
+
+   case HINIC_SPEED_1000MB_LINK:
+   link_ksettings->base.speed = SPEED_1000;
+   break;
+
+   case HINIC_SPEED_10GB_LINK:
+   link_ksettings->base.speed = SPEED_1;
+   break;
+
+   case HINIC_SPEED_25GB_LINK:
+   link_ksettings->base.speed = SPEED_25000;
+   break;
+
+   case HINIC_SPEED_40GB_LINK:
+   link_ksettings->base.speed = SPEED_4;
+   break;
+
+   case HINIC_SPEED_100GB_LINK:
+   link_ksettings->base.speed = SPEED_10;
+   break;
+
+   default:
+   link_ksettings->base.speed = SPEED_UNKNOWN;
+   break;
+   }
+}
+
+static int hinic_get_link_ksettings(struct net_device *netdev,
+   struct ethtool_link_ksettings
+   *link_ksettings)
+{
+   struct hinic_dev *nic_dev = netdev_priv(netdev);
+   enum hinic_port_link_state link_state;
+   struct hinic_port_cap port_cap;
+   int err;
+
+   ethtool_link_ksettings_zero_link_mode(link_ksettings, advertising);
+   ethtool_link_ksettings_add_link_mode(link_ksettings, supported,
+Autoneg);
+
+   link_ksettings->base.speed   = SPEED_UNKNOWN;
+   link_ksettings->base.autoneg = AUTONEG_DISABLE;
+   link_ksettings->base.duplex  = DUPLEX_UNKNOWN;
+
+   err = hinic_port_get_cap(nic_dev, &port_cap);
+   if (err) {
+   netif_err(nic_dev, drv, netdev,
+ "Failed to get port capabilities\n");
+   return err;
+   }
+
+   err = hinic_port_link_state(nic_dev, &link_state);
+   if (err) {
+   netif_err(nic_dev, drv, netdev,
+ "Failed to get port link state\n");
+   return err;
+   }
+
+   if (link_state != HINIC_LINK_STATE_UP) {
+   netif_info(nic_dev, drv, netdev, "No link\n");
+   return err;
+   }
+
+   set_link_speed(link_ksettings, port_cap.speed);
+
+   if (!!(port_cap.autoneg_cap & HINIC_AUTONEG_SUPPORTED))
+   ethtool_link_ksettings_add_link_mode(link_ksettings,
+advertising, Autoneg);
+
+   if (port_cap.autoneg_state == HINIC_AUTONEG_ACTIVE)
+   link_ksettings->base.autoneg = AUTONEG_ENABLE;
+
+   link_ksettings->base.duplex = (port_cap.duplex == HINIC_DUPLEX_FULL) ?
+  DUPLEX_FULL : DUPLEX_HALF;
+   return 0;
+}
+
+static void hinic_get_drvinfo(struct net_device *netdev,
+ struct ethtool_drvinfo *info)
+{
+   struct hinic_dev *nic_dev = netdev_priv(netdev);
+   struct hinic_hwdev *hwdev = nic_dev->hwdev;
+   struct hinic_hwif *hwif = hwdev->hwif;
+
+   strlcpy(info->driver, HINIC_DRV_NAME, sizeof(info->driver));
+   strlcpy(info->bus_info, pci_name(hwif->pdev), sizeof(info->bus_info));
+}
+
+static void hinic_get_ringparam(struct net_device *netdev,
+   struct

[PATCH V4 net-next 21/21] net-next/hinic: Add select_queue and netpoll

2017-08-16 Thread Aviad Krawczyk
Add more netdev operations.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 MAINTAINERS|  7 ++
 drivers/net/ethernet/huawei/hinic/hinic_main.c | 35 ++
 2 files changed, 42 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 2db0f8c..9ee5902 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6240,6 +6240,13 @@ L:   linux-in...@vger.kernel.org
 S: Maintained
 F: drivers/input/touchscreen/htcpen.c
 
+HUAWEI ETHERNET DRIVER
+M: Aviad Krawczyk 
+L: netdev@vger.kernel.org
+S: Supported
+F: Documentation/networking/hinic.txt
+F: drivers/net/ethernet/huawei/hinic/
+
 HUGETLB FILESYSTEM
 M: Nadia Yvette Chambers 
 S: Maintained
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c 
b/drivers/net/ethernet/huawei/hinic/hinic_main.c
index a77a7f8..5306ec1 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_main.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c
@@ -787,6 +787,37 @@ static void hinic_get_stats64(struct net_device *netdev,
stats->tx_errors  = nic_tx_stats->tx_dropped;
 }
 
+static u16 hinic_select_queue(struct net_device *netdev, struct sk_buff *skb,
+ void *accel_priv,
+ select_queue_fallback_t fallback)
+{
+   u16 qid;
+
+   if (skb_rx_queue_recorded(skb))
+   qid = skb_get_rx_queue(skb);
+   else
+   qid = fallback(netdev, skb);
+
+   return qid;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void hinic_netpoll(struct net_device *netdev)
+{
+   struct hinic_dev *nic_dev = netdev_priv(netdev);
+   int i, num_qps;
+
+   num_qps = hinic_hwdev_num_qps(nic_dev->hwdev);
+   for (i = 0; i < num_qps; i++) {
+   struct hinic_txq *txq = &nic_dev->txqs[i];
+   struct hinic_rxq *rxq = &nic_dev->rxqs[i];
+
+   napi_schedule(&txq->napi);
+   napi_schedule(&rxq->napi);
+   }
+}
+#endif
+
 static const struct net_device_ops hinic_netdev_ops = {
.ndo_open = hinic_open,
.ndo_stop = hinic_close,
@@ -799,6 +830,10 @@ static void hinic_get_stats64(struct net_device *netdev,
.ndo_start_xmit = hinic_xmit_frame,
.ndo_tx_timeout = hinic_tx_timeout,
.ndo_get_stats64 = hinic_get_stats64,
+   .ndo_select_queue = hinic_select_queue,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+   .ndo_poll_controller = hinic_netpoll,
+#endif
 };
 
 static void netdev_features_init(struct net_device *netdev)
-- 
1.9.1



[PATCH V4 net-next 19/21] net-next/hinic: Add Tx operation

2017-08-16 Thread Aviad Krawczyk
Add transmit operation for sending data by qp operations.

Signed-off-by: Aviad Krawczyk 
Signed-off-by: Zhao Chen 
---
 drivers/net/ethernet/huawei/hinic/hinic_dev.h|   1 +
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c |  47 +++
 drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h |  22 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c  | 257 ++
 drivers/net/ethernet/huawei/hinic/hinic_hw_qp.h  |  48 +++
 drivers/net/ethernet/huawei/hinic/hinic_main.c   |  12 +-
 drivers/net/ethernet/huawei/hinic/hinic_tx.c | 406 +++
 drivers/net/ethernet/huawei/hinic/hinic_tx.h |  11 +
 8 files changed, 802 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_dev.h 
b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
index 3d0f6cf..15d0c2e 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_dev.h
@@ -43,6 +43,7 @@ struct hinic_dev {
struct hinic_hwdev  *hwdev;
 
u32 msg_enable;
+   unsigned inttx_weight;
unsigned intrx_weight;
 
unsigned intflags;
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
index 03149e8..6606fba 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.c
@@ -40,6 +40,8 @@
 #define MAX_IRQS(max_qps, num_aeqs, num_ceqs)   \
 (2 * (max_qps) + (num_aeqs) + (num_ceqs))
 
+#define ADDR_IN_4BYTES(addr)((addr) >> 2)
+
 enum intr_type {
INTR_MSIX_TYPE,
 };
@@ -996,3 +998,48 @@ int hinic_hwdev_msix_set(struct hinic_hwdev *hwdev, u16 
msix_index,
   lli_timer_cfg, lli_credit_limit,
   resend_timer);
 }
+
+/**
+ * hinic_hwdev_hw_ci_addr_set - set cons idx addr and attributes in HW for sq
+ * @hwdev: the NIC HW device
+ * @sq: send queue
+ * @pending_limit: the maximum pending update ci events (unit 8)
+ * @coalesc_timer: coalesc period for update ci (unit 8 us)
+ *
+ * Return 0 - Success, negative - Failure
+ **/
+int hinic_hwdev_hw_ci_addr_set(struct hinic_hwdev *hwdev, struct hinic_sq *sq,
+  u8 pending_limit, u8 coalesc_timer)
+{
+   struct hinic_qp *qp = container_of(sq, struct hinic_qp, sq);
+   struct hinic_hwif *hwif = hwdev->hwif;
+   struct pci_dev *pdev = hwif->pdev;
+   struct hinic_pfhwdev *pfhwdev;
+   struct hinic_cmd_hw_ci hw_ci;
+
+   if (!HINIC_IS_PF(hwif) && !HINIC_IS_PPF(hwif)) {
+   dev_err(&pdev->dev, "Unsupported PCI Function type\n");
+   return -EINVAL;
+   }
+
+   hw_ci.dma_attr_off  = 0;
+   hw_ci.pending_limit = pending_limit;
+   hw_ci.coalesc_timer  = coalesc_timer;
+
+   hw_ci.msix_en = 1;
+   hw_ci.msix_entry_idx = sq->msix_entry;
+
+   hw_ci.func_idx = HINIC_HWIF_FUNC_IDX(hwif);
+
+   hw_ci.sq_id = qp->q_id;
+
+   hw_ci.ci_addr = ADDR_IN_4BYTES(sq->hw_ci_dma_addr);
+
+   pfhwdev = container_of(hwdev, struct hinic_pfhwdev, hwdev);
+
+   return hinic_msg_to_mgmt(&pfhwdev->pf_to_mgmt,
+HINIC_MOD_COMM,
+HINIC_COMM_CMD_SQ_HI_CI_SET,
+&hw_ci, sizeof(hw_ci), NULL,
+NULL, HINIC_MGMT_MSG_SYNC);
+}
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
index e7277d1..0f5563f 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_dev.h
@@ -153,6 +153,25 @@ struct hinic_cmd_base_qpn {
u16 qpn;
 };
 
+struct hinic_cmd_hw_ci {
+   u8  status;
+   u8  version;
+   u8  rsvd0[6];
+
+   u16 func_idx;
+
+   u8  dma_attr_off;
+   u8  pending_limit;
+   u8  coalesc_timer;
+
+   u8  msix_en;
+   u16 msix_entry_idx;
+
+   u32 sq_id;
+   u32 rsvd1;
+   u64 ci_addr;
+};
+
 struct hinic_hwdev {
struct hinic_hwif   *hwif;
struct msix_entry   *msix_entries;
@@ -214,4 +233,7 @@ int hinic_hwdev_msix_set(struct hinic_hwdev *hwdev, u16 
msix_index,
 u8 lli_timer_cfg, u8 lli_credit_limit,
 u8 resend_timer);
 
+int hinic_hwdev_hw_ci_addr_set(struct hinic_hwdev *hwdev, struct hinic_sq *sq,
+  u8 pending_limit, u8 coalesc_timer);
+
 #endif
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c 
b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c
index 6e540d2..97ee8eb 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_qp.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #incl

Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet

2017-08-16 Thread Jason Wang



On 2017年08月16日 18:24, Eric Dumazet wrote:

On Wed, 2017-08-16 at 11:55 +0800, Jason Wang wrote:

On 2017年08月16日 11:45, Eric Dumazet wrote:

You do realize that tun_build_skb() is not thread safe ?

Ok, I think the issue if skb_page_frag_refill(), need a spinlock
probably. Will prepare a patch.

But since tun is used from process context, why don't you use the
per-thread generator (no lock involved)


Haven't noticed this before.



tcp_sendmsg() uses this for GFP_KERNEL allocations.

Untested patch :

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 
5892284eb8d05b0678d820bad3d0d2c61a879aeb..c38cd840cc0b7fecf182b23976e36f709cacca1f
 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -175,7 +175,6 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
-   struct page_frag alloc_frag;
  };
  
  struct tun_flow_entry {

@@ -578,8 +577,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
}
if (tun)
skb_array_cleanup(&tfile->tx_array);
-   if (tfile->alloc_frag.page)
-   put_page(tfile->alloc_frag.page);
sock_put(&tfile->sk);
}
  }
@@ -1272,7 +1269,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
*tun,
 struct virtio_net_hdr *hdr,
 int len, int *generic_xdp)
  {
-   struct page_frag *alloc_frag = &tfile->alloc_frag;
+   struct page_frag *alloc_frag = ¤t->task_frag;
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
@@ -2580,8 +2577,6 @@ static int tun_chr_open(struct inode *inode, struct file 
* file)
tfile->sk.sk_write_space = tun_sock_write_space;
tfile->sk.sk_sndbuf = INT_MAX;
  
-	tfile->alloc_frag.page = NULL;

-
file->private_data = tfile;
INIT_LIST_HEAD(&tfile->next);
  







Tested-by: Jason Wang 
Acked-by: Jason Wang 



Re: [PATCH] net/mlx4: fix spelling mistake: "availible" -> "available"

2017-08-16 Thread Boyer, Andrew


On 8/16/17, 5:05 AM, "linux-rdma-ow...@vger.kernel.org on behalf of Colin
King"  wrote:

>From: Colin Ian King 
>
>Trivial fix to spelling mistakes in the mlx4 driver
>
>Signed-off-by: Colin Ian King 
>---
> drivers/net/ethernet/mellanox/mlx4/cmd.c| 16 
> drivers/net/ethernet/mellanox/mlx4/fw_qos.c |  6 +++---
> drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 10 +-
> 3 files changed, 16 insertions(+), 16 deletions(-)
>
>diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c
>b/drivers/net/ethernet/mellanox/mlx4/cmd.c
>index 674773b28b2e..6309389b09a7 100644
>--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
>+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
>@@ -1958,19 +1958,19 @@ static void mlx4_allocate_port_vpps(struct
>mlx4_dev *dev, int port)
>   int i;
>   int err;
>   int num_vfs;
>-  u16 availible_vpp;
>+  u16 available_vpp;
>   u8 vpp_param[MLX4_NUM_UP];
>   struct mlx4_qos_manager *port_qos;
>   struct mlx4_priv *priv = mlx4_priv(dev);
> 
>-  err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param);
>+  err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
>   if (err) {
>-  mlx4_info(dev, "Failed query availible VPPs\n");
>+  mlx4_info(dev, "Failed query available VPPs\n");
>   return;
>   }
> 
>   port_qos = &priv->mfunc.master.qos_ctl[port];
>-  num_vfs = (availible_vpp /
>+  num_vfs = (available_vpp /
>  bitmap_weight(port_qos->priority_bm, MLX4_NUM_UP));
> 
>   for (i = 0; i < MLX4_NUM_UP; i++) {
>@@ -1985,14 +1985,14 @@ static void mlx4_allocate_port_vpps(struct
>mlx4_dev *dev, int port)
>   }
> 
>   /* Query actual allocated VPP, just to make sure */
>-  err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param);
>+  err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
>   if (err) {
>-  mlx4_info(dev, "Failed query availible VPPs\n");
>+  mlx4_info(dev, "Failed query available VPPs\n");
>   return;
>   }
> 
>   port_qos->num_of_qos_vfs = num_vfs;
>-  mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, availible_vpp);
>+  mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, available_vpp);

One more here, in the text.  ^^^

> 
>   for (i = 0; i < MLX4_NUM_UP; i++)
>   mlx4_dbg(dev, "Port %d UP %d Allocated %d VPPs\n", port, i,
>@@ -2891,7 +2891,7 @@ static int mlx4_set_vport_qos(struct mlx4_priv
>*priv, int slave, int port,
>   memset(vpp_qos, 0, sizeof(struct mlx4_vport_qos_param) * MLX4_NUM_UP);
> 
>   if (slave > port_qos->num_of_qos_vfs) {
>-  mlx4_info(dev, "No availible VPP resources for this VF\n");
>+  mlx4_info(dev, "No available VPP resources for this VF\n");
>   return -EINVAL;
>   }
> 
>diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
>b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
>index 8f2fde0487c4..3a09d7122d3b 100644
>--- a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
>+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
>@@ -65,7 +65,7 @@ struct mlx4_set_port_scheduler_context {
> 
> /* Granular Qos (per VF) section */
> struct mlx4_alloc_vpp_param {
>-  __be32 availible_vpp;
>+  __be32 available_vpp;
>   __be32 vpp_p_up[MLX4_NUM_UP];
> };
> 
>@@ -157,7 +157,7 @@ int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8
>port, u8 *tc_tx_bw,
> EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER);
> 
> int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
>-u16 *availible_vpp, u8 *vpp_p_up)
>+u16 *available_vpp, u8 *vpp_p_up)
> {
>   int i;
>   int err;
>@@ -179,7 +179,7 @@ int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8
>port,
>   goto out;
> 
>   /* Total number of supported VPPs */
>-  *availible_vpp = (u16)be32_to_cpu(out_param->availible_vpp);
>+  *available_vpp = (u16)be32_to_cpu(out_param->available_vpp);
> 
>   for (i = 0; i < MLX4_NUM_UP; i++)
>   vpp_p_up[i] = (u8)be32_to_cpu(out_param->vpp_p_up[i]);
>diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
>b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
>index ac1f331878e6..582997577a04 100644
>--- a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
>+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
>@@ -84,23 +84,23 @@ int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8
>port, u8 *prio2tc);
> int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
>   u8 *pg, u16 *ratelimit);
> /**
>- * mlx4_ALLOCATE_VPP_get - Query port VPP availible resources and
>allocation.
>- * Before distribution of VPPs to priorities, only availible_vpp is
>returned.
>+ * mlx4_ALLOCATE_VPP_get - Query port VPP available resources and
>allocation.
>+ * Before distribution of VPPs to priorities, only available_vpp is
>returned.
>  * After initialization it returns the distribution of VPPs among

Re: [PATCH] bpf: Update sysctl documentation to list all supported architectures

2017-08-16 Thread Daniel Borkmann

On 08/16/2017 01:10 PM, Michael Ellerman wrote:

Daniel Borkmann  writes:

On 08/16/2017 07:15 AM, Michael Ellerman wrote:

The sysctl documentation states that the JIT is only available on
x86_64, which is no longer correct.

Update the list to include all architectures that enable HAVE_CBPF_JIT
or HAVE_EBPF_JIT under some configuration.

Signed-off-by: Michael Ellerman 


Thanks for the patch!


   Documentation/sysctl/net.txt | 5 +++--
   1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 14db18c970b1..f68356024d09 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -36,8 +36,9 @@ bpf_jit_enable
   --

   This enables Berkeley Packet Filter Just in Time compiler.
-Currently supported on x86_64 architecture, bpf_jit provides a framework
-to speed packet filtering, the one used by tcpdump/libpcap for example.
+Currently supported on arm, arm64, mips, powerpc, s390, sparc and x86_64
+architectures, bpf_jit provides a framework to speed packet filtering, the one
+used by tcpdump/libpcap for example.


Good point, could we actually make that as a bullet list and
differentiate between cBPF and eBPF JITs, so that a user doesn't
need to run git grep HAVE_{E,C}BPF_JIT to figure it out what the
switch enables on the arch used? That would be great.


We could.

Does a user of the sysctl want/need to know the difference though? Or do
they just want to turn on "the JIT"?


They would just turn it on, but I think it would be nice to inform
them which archs support eBPF (which is a superset of cBPF in term
of what can be jited), so in case they have some native eBPF programs
they would see whether these can also be jited.


[PATCH 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
Because we removed the tcp_tw_recycle support in commit
4396e46187c ('tcp: remove tcp_tw_recycle') and also deleted
the 'af_ops->route_req' code path for sysctl_tw_recycle in tcp_conn_request,
the dst passed to tcp_conn_request is now always
NULL when we call 'af_ops->route_req', so remove the unnecessary check.

Signed-off-by: Tonghao Zhang 
---
 net/ipv4/tcp_input.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d73903fe8c83..7eee2c7ddb7a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6132,11 +6132,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
isn = af_ops->init_seq(skb);
}
-   if (!dst) {
-   dst = af_ops->route_req(sk, &fl, req);
-   if (!dst)
-   goto drop_and_free;
-   }
+
+   dst = af_ops->route_req(sk, &fl, req);
+   if (!dst)
+   goto drop_and_free;
 
tcp_ecn_create_request(req, skb, sk, dst);
 
-- 
2.13.4



[PATCH 2/2] tcp: Remove the unused parameter for tcp_try_fastopen.

2017-08-16 Thread Tonghao Zhang
Signed-off-by: Tonghao Zhang 
---
 include/net/tcp.h   | 3 +--
 net/ipv4/tcp_fastopen.c | 6 ++
 net/ipv4/tcp_input.c| 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index afdab3781425..a995004ae946 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1533,8 +1533,7 @@ int tcp_fastopen_reset_cipher(void *key, unsigned int 
len);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
  struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst);
+ struct tcp_fastopen_cookie *foc);
 void tcp_fastopen_init_key_once(bool publish);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 struct tcp_fastopen_cookie *cookie);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ce9c7fef200f..e3c33220c418 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -171,7 +171,6 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff 
*skb)
 
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
  struct sk_buff *skb,
- struct dst_entry *dst,
  struct request_sock *req)
 {
struct tcp_sock *tp;
@@ -278,8 +277,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
  */
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
  struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst)
+ struct tcp_fastopen_cookie *foc)
 {
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
@@ -312,7 +310,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct 
sk_buff *skb,
 * data in SYN_RECV state.
 */
 fastopen:
-   child = tcp_fastopen_create_child(sk, skb, dst, req);
+   child = tcp_fastopen_create_child(sk, skb, req);
if (child) {
foc->len = -1;
NET_INC_STATS(sock_net(sk),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7eee2c7ddb7a..21df0868f206 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6151,7 +6151,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_openreq_init_rwin(req, sk, dst);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
-   fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+   fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
-- 
2.13.4



[PATCH][V2] net/mlx4: fix spelling mistake: "availible" -> "available"

2017-08-16 Thread Colin King
From: Colin Ian King 

Trivial fix to spelling mistakes in the mlx4 driver.

Signed-off-by: Colin Ian King 
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c| 16 
 drivers/net/ethernet/mellanox/mlx4/fw_qos.c |  6 +++---
 drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 10 +-
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index 674773b28b2e..c6674bdd7da0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -1958,19 +1958,19 @@ static void mlx4_allocate_port_vpps(struct mlx4_dev 
*dev, int port)
int i;
int err;
int num_vfs;
-   u16 availible_vpp;
+   u16 available_vpp;
u8 vpp_param[MLX4_NUM_UP];
struct mlx4_qos_manager *port_qos;
struct mlx4_priv *priv = mlx4_priv(dev);
 
-   err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param);
+   err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
if (err) {
-   mlx4_info(dev, "Failed query availible VPPs\n");
+   mlx4_info(dev, "Failed query available VPPs\n");
return;
}
 
port_qos = &priv->mfunc.master.qos_ctl[port];
-   num_vfs = (availible_vpp /
+   num_vfs = (available_vpp /
   bitmap_weight(port_qos->priority_bm, MLX4_NUM_UP));
 
for (i = 0; i < MLX4_NUM_UP; i++) {
@@ -1985,14 +1985,14 @@ static void mlx4_allocate_port_vpps(struct mlx4_dev 
*dev, int port)
}
 
/* Query actual allocated VPP, just to make sure */
-   err = mlx4_ALLOCATE_VPP_get(dev, port, &availible_vpp, vpp_param);
+   err = mlx4_ALLOCATE_VPP_get(dev, port, &available_vpp, vpp_param);
if (err) {
-   mlx4_info(dev, "Failed query availible VPPs\n");
+   mlx4_info(dev, "Failed query available VPPs\n");
return;
}
 
port_qos->num_of_qos_vfs = num_vfs;
-   mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, availible_vpp);
+   mlx4_dbg(dev, "Port %d Available VPPs %d\n", port, available_vpp);
 
for (i = 0; i < MLX4_NUM_UP; i++)
mlx4_dbg(dev, "Port %d UP %d Allocated %d VPPs\n", port, i,
@@ -2891,7 +2891,7 @@ static int mlx4_set_vport_qos(struct mlx4_priv *priv, int 
slave, int port,
memset(vpp_qos, 0, sizeof(struct mlx4_vport_qos_param) * MLX4_NUM_UP);
 
if (slave > port_qos->num_of_qos_vfs) {
-   mlx4_info(dev, "No availible VPP resources for this VF\n");
+   mlx4_info(dev, "No available VPP resources for this VF\n");
return -EINVAL;
}
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c 
b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
index 8f2fde0487c4..3a09d7122d3b 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.c
@@ -65,7 +65,7 @@ struct mlx4_set_port_scheduler_context {
 
 /* Granular Qos (per VF) section */
 struct mlx4_alloc_vpp_param {
-   __be32 availible_vpp;
+   __be32 available_vpp;
__be32 vpp_p_up[MLX4_NUM_UP];
 };
 
@@ -157,7 +157,7 @@ int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, 
u8 *tc_tx_bw,
 EXPORT_SYMBOL(mlx4_SET_PORT_SCHEDULER);
 
 int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
- u16 *availible_vpp, u8 *vpp_p_up)
+ u16 *available_vpp, u8 *vpp_p_up)
 {
int i;
int err;
@@ -179,7 +179,7 @@ int mlx4_ALLOCATE_VPP_get(struct mlx4_dev *dev, u8 port,
goto out;
 
/* Total number of supported VPPs */
-   *availible_vpp = (u16)be32_to_cpu(out_param->availible_vpp);
+   *available_vpp = (u16)be32_to_cpu(out_param->available_vpp);
 
for (i = 0; i < MLX4_NUM_UP; i++)
vpp_p_up[i] = (u8)be32_to_cpu(out_param->vpp_p_up[i]);
diff --git a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h 
b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
index ac1f331878e6..582997577a04 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
+++ b/drivers/net/ethernet/mellanox/mlx4/fw_qos.h
@@ -84,23 +84,23 @@ int mlx4_SET_PORT_PRIO2TC(struct mlx4_dev *dev, u8 port, u8 
*prio2tc);
 int mlx4_SET_PORT_SCHEDULER(struct mlx4_dev *dev, u8 port, u8 *tc_tx_bw,
u8 *pg, u16 *ratelimit);
 /**
- * mlx4_ALLOCATE_VPP_get - Query port VPP availible resources and allocation.
- * Before distribution of VPPs to priorities, only availible_vpp is returned.
+ * mlx4_ALLOCATE_VPP_get - Query port VPP available resources and allocation.
+ * Before distribution of VPPs to priorities, only available_vpp is returned.
  * After initialization it returns the distribution of VPPs among priorities.
  *
  * @dev: mlx4_dev.
  * @port: Physical port number.
- * @availible_vpp: Pointer to variable where number of availible VPPs is stored
+ * @available_vpp: Pointer to variable where number of available VPPs is stored

Re: [PATCH][V2] net/mlx4: fix spelling mistake: "availible" -> "available"

2017-08-16 Thread Leon Romanovsky
On Wed, Aug 16, 2017 at 02:42:50PM +0100, Colin King wrote:
> From: Colin Ian King 
>
> Trivial fix to spelling mistakes in the mlx4 driver.
>
> Signed-off-by: Colin Ian King 
> ---
>  drivers/net/ethernet/mellanox/mlx4/cmd.c| 16 
>  drivers/net/ethernet/mellanox/mlx4/fw_qos.c |  6 +++---
>  drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 10 +-
>  3 files changed, 16 insertions(+), 16 deletions(-)
>

What are the changes between this version and previous one?

Thanks


signature.asc
Description: PGP signature


Re: [PATCH][V2] net/mlx4: fix spelling mistake: "availible" -> "available"

2017-08-16 Thread Colin Ian King
On 16/08/17 14:58, Leon Romanovsky wrote:
> On Wed, Aug 16, 2017 at 02:42:50PM +0100, Colin King wrote:
>> From: Colin Ian King 
>>
>> Trivial fix to spelling mistakes in the mlx4 driver.
>>
>> Signed-off-by: Colin Ian King 
>> ---
>>  drivers/net/ethernet/mellanox/mlx4/cmd.c| 16 
>>  drivers/net/ethernet/mellanox/mlx4/fw_qos.c |  6 +++---
>>  drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 10 +-
>>  3 files changed, 16 insertions(+), 16 deletions(-)
>>
> 
> What are the changes between this version and previous one?
> 
> Thanks
> 
A fix on "Availible"

-   mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, availible_vpp);
+   mlx4_dbg(dev, "Port %d Available VPPs %d\n", port, available_vpp);

Colin



signature.asc
Description: OpenPGP digital signature


[PATCH net] dccp: defer ccid_hc_tx_delete() at dismantle time

2017-08-16 Thread Eric Dumazet
From: Eric Dumazet 

syszkaller team reported another problem in DCCP [1]

Problem here is that the structure holding RTO timer
(ccid2_hc_tx_rto_expire() handler) is freed too soon.

We cannot use del_timer_sync() to cancel the timer
since this timer wants to grab the socket lock (that would risk a deadlock)

Solution is to defer the freeing of memory when all references to
the socket were released. Socket timers do own a reference, so this
should fix the issue.

[1]
==
BUG: KASAN: use-after-free in ccid2_hc_tx_rto_expire+0x51c/0x5c0 
net/dccp/ccids/ccid2.c:144
Read of size 4 at addr 8801d2660540 by task kworker/u4:7/3365

CPU: 1 PID: 3365 Comm: kworker/u4:7 Not tainted 4.13.0-rc4+ #3
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Workqueue: events_unbound call_usermodehelper_exec_work
Call Trace:
 
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429
 ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144
 call_timer_fn+0x233/0x830 kernel/time/timer.c:1268
 expire_timers kernel/time/timer.c:1307 [inline]
 __run_timers+0x7fd/0xb90 kernel/time/timer.c:1601
 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614
 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284
 invoke_softirq kernel/softirq.c:364 [inline]
 irq_exit+0x1cc/0x200 kernel/softirq.c:405
 exiting_irq arch/x86/include/asm/apic.h:638 [inline]
 smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:1044
 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:702
RIP: 0010:arch_local_irq_enable arch/x86/include/asm/paravirt.h:824 [inline]
RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:267 [inline]
RIP: 0010:_raw_write_unlock_irq+0x56/0x70 kernel/locking/spinlock.c:343
RSP: 0018:8801cd50eaa8 EFLAGS: 0286 ORIG_RAX: ff10
RAX: dc00 RBX: 85a090c0 RCX: 0006
RDX: 10b595f3 RSI: 11003962f989 RDI: 85acaf98
RBP: 8801cd50eab0 R08: 0001 R09: 
R10:  R11:  R12: 8801cc96ea60
R13: dc00 R14: 8801cc96e4c0 R15: 8801cc96e4c0
 
 release_task+0xe9e/0x1a40 kernel/exit.c:220
 wait_task_zombie kernel/exit.c:1162 [inline]
 wait_consider_task+0x29b8/0x33c0 kernel/exit.c:1389
 do_wait_thread kernel/exit.c:1452 [inline]
 do_wait+0x441/0xa90 kernel/exit.c:1523
 kernel_wait4+0x1f5/0x370 kernel/exit.c:1665
 SYSC_wait4+0x134/0x140 kernel/exit.c:1677
 SyS_wait4+0x2c/0x40 kernel/exit.c:1673
 call_usermodehelper_exec_sync kernel/kmod.c:286 [inline]
 call_usermodehelper_exec_work+0x1a0/0x2c0 kernel/kmod.c:323
 process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2097
 worker_thread+0x223/0x1860 kernel/workqueue.c:2231
 kthread+0x35e/0x430 kernel/kthread.c:231
 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:425

Allocated by task 21267:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc+0x127/0x750 mm/slab.c:3561
 ccid_new+0x20e/0x390 net/dccp/ccid.c:151
 dccp_hdlr_ccid+0x27/0x140 net/dccp/feat.c:44
 __dccp_feat_activate+0x142/0x2a0 net/dccp/feat.c:344
 dccp_feat_activate_values+0x34e/0xa90 net/dccp/feat.c:1538
 dccp_rcv_request_sent_state_process net/dccp/input.c:472 [inline]
 dccp_rcv_state_process+0xed1/0x1620 net/dccp/input.c:677
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [inline]
 __release_sock+0x124/0x360 net/core/sock.c:2269
 release_sock+0xa4/0x2a0 net/core/sock.c:2784
 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline]
 __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643
 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682
 SYSC_connect+0x204/0x470 net/socket.c:1642
 SyS_connect+0x24/0x30 net/socket.c:1623
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 3049:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 ccid_hc_tx_delete+0xc5/0x100 net/dccp/ccid.c:190
 dccp_destroy_sock+0x1d1/0x2b0 net/dccp/proto.c:225
 inet_csk_destroy_sock+0x166/0x3f0 net/ipv4/inet_connection_sock.c:833
 dccp_done+0xb7/0xd0 net/dccp/proto.c:145
 dccp_time_wait+0x13d/0x300 net/dccp/minisocks.c:72
 dccp_rcv_reset+0x1d1/0x5b0 net/dccp/input.c:160
 dccp_rcv_state_process+0x8fc/0x1620 net/dccp/input.c:663
 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679
 sk_backlog_rcv include/net/sock.h:911 [in

Re: [PATCH v3] openvswitch: enable NSH support

2017-08-16 Thread Jiri Benc
On Wed, 16 Aug 2017 17:31:30 +0800, Yang, Yi wrote:
> On Wed, Aug 16, 2017 at 11:19:21AM +0200, Jiri Benc wrote:
> > > --- a/include/uapi/linux/openvswitch.h
> > > +++ b/include/uapi/linux/openvswitch.h
> > [...]
> > > +#define NSH_MD1_CONTEXT_SIZE 4
> > 
> > Please move this to nsh.h and use it there, too, instead of the open
> > coded 4.
> 
> ovs code is very ugly, it will convert array[4] in
> datapath/linux/compat/include/linux/openvswitch.h to other struct, I
> have to change context[4] to such format :-), we can use 4 here for
> Linux kernel.

Oh, right, this is uAPI and nsh.h is kernel internal. My suggestion was
nonsense, let's keep it as it was in your patch.

> > > + case OVS_KEY_ATTR_NSH: {
> > > + struct ovs_key_nsh nsh;
> > > + struct ovs_key_nsh nsh_mask;
> > > + size_t size = nla_len(a) / 2;
> > > + struct nlattr attr[1 + DIV_ROUND_UP(sizeof(struct ovs_key_ipv6)
> > > + , sizeof(struct nlattr))];
> > > + struct nlattr mask[1 + DIV_ROUND_UP(sizeof(struct ovs_key_ipv6)
> > > + , sizeof(struct nlattr))];
> > > +
> > > + attr->nla_type = nla_type(a);
> > > + mask->nla_type = attr->nla_type;
> > > + attr->nla_len = NLA_HDRLEN + size;
> > > + mask->nla_len = attr->nla_len;
> > > + memcpy(attr + 1, (char *)(a + 1), size);
> > > + memcpy(mask + 1, (char *)(a + 1) + size, size);
> > 
> > This is too hacky. Please find a better way to handle this.
> > 
> > One option is to create a struct with struct nlattr as the first member
> > followed by a char buffer. Still not nice but at least it's clear
> > what's the intent.
> 
> The issue is nested attributes only can use this way, nested attribute
> for SET_MASKED is very special, we have to handle it specially.

I'm not sure you understood what I meant. Let me explain in code:

struct {
struct nlattr attr;
struct ovs_key_ipv6 data;
} attr, mask;

attr.attr.nla_type = nla_type(a);
attr.attr.nla_len = NLA_HDRLEN + size;
memcpy(&attr.data, a + 1, size);

It's much less hacky and doing the same thing.

I'm not sure we don't need verification of size not overflowing the
available buffer. Is it checked beforehand elsewhere?

I also spotted one more bug: the 'mask' variable is not used anywhere.
The second call of nsh_key_from_nlattr should use mask, not attr.

> > > + key->nsh.path_hdr = nsh->path_hdr;
> > > + switch (key->nsh.mdtype) {
> > > + case NSH_M_TYPE1:
> > > + if ((length << 2) != NSH_M_TYPE1_LEN)
> > 
> > Why length << 2?
> 
len in the NSH header is expressed in units of 4 octets, so we need to multiply by 4.

Should nsh_get_len take care of that, then?

Thanks,

 Jiri


Re: DSA support for Micrel KSZ8895

2017-08-16 Thread Andrew Lunn
On Wed, Aug 16, 2017 at 09:55:24AM +0200, Pavel Machek wrote:
> Hi!
> 
> I've got hardware with KSZ8895, and I'd like to use switch ports as
> separate ethernet cards. I believe that means DSA support.
> 
> And there are even patches available from microchip... unfortunately
> they are in strange form and for v3.18.
> 
> http://www.microchip.com/SWLibraryWeb/product.aspx?product=KSZ8895%20Software%20Linux%203.18
> 
> Is there newer version of the driver available somewhere? Is the
> driver good starting point, or should I start with something else?

Hi Pavel

Woojung is the expert here. His DSA driver for the 9477 is a nice
clean driver.

Have you compared the 8895 to the 9477. Are they similar? Could the
existing 9477 be extended to support the 8895?

 Andrew


[PATCH] tun: make tun_build_skb() thread safe

2017-08-16 Thread Jason Wang
From: Eric Dumazet 

tun_build_skb() is not thread safe since it uses per queue page frag,
this will break things when multiple threads are sending through same
queue. Switch to use per-thread generator (no lock involved).

Fixes: 66ccbc9c87c2 ("tap: use build_skb() for small packet")
Tested-by: Jason Wang 
Signed-off-by: Eric Dumazet 
Signed-off-by: Jason Wang 
---
 drivers/net/tun.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5892284..c38cd84 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -175,7 +175,6 @@ struct tun_file {
struct list_head next;
struct tun_struct *detached;
struct skb_array tx_array;
-   struct page_frag alloc_frag;
 };
 
 struct tun_flow_entry {
@@ -578,8 +577,6 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
}
if (tun)
skb_array_cleanup(&tfile->tx_array);
-   if (tfile->alloc_frag.page)
-   put_page(tfile->alloc_frag.page);
sock_put(&tfile->sk);
}
 }
@@ -1272,7 +1269,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct 
*tun,
 struct virtio_net_hdr *hdr,
 int len, int *generic_xdp)
 {
-   struct page_frag *alloc_frag = &tfile->alloc_frag;
+   struct page_frag *alloc_frag = ¤t->task_frag;
struct sk_buff *skb;
struct bpf_prog *xdp_prog;
int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
@@ -2580,8 +2577,6 @@ static int tun_chr_open(struct inode *inode, struct file 
* file)
tfile->sk.sk_write_space = tun_sock_write_space;
tfile->sk.sk_sndbuf = INT_MAX;
 
-   tfile->alloc_frag.page = NULL;
-
file->private_data = tfile;
INIT_LIST_HEAD(&tfile->next);
 
-- 
2.7.4



Re: [PATCH][V2] net/mlx4: fix spelling mistake: "availible" -> "available"

2017-08-16 Thread Leon Romanovsky
On Wed, Aug 16, 2017 at 03:02:31PM +0100, Colin Ian King wrote:
> On 16/08/17 14:58, Leon Romanovsky wrote:
> > On Wed, Aug 16, 2017 at 02:42:50PM +0100, Colin King wrote:
> >> From: Colin Ian King 
> >>
> >> Trivial fix to spelling mistakes in the mlx4 driver.
> >>
> >> Signed-off-by: Colin Ian King 
> >> ---
> >>  drivers/net/ethernet/mellanox/mlx4/cmd.c| 16 
> >>  drivers/net/ethernet/mellanox/mlx4/fw_qos.c |  6 +++---
> >>  drivers/net/ethernet/mellanox/mlx4/fw_qos.h | 10 +-
> >>  3 files changed, 16 insertions(+), 16 deletions(-)
> >>
> >
> > What are the changes between this version and previous one?
> >
> > Thanks
> >
> A fix on "Availible"
>
> - mlx4_dbg(dev, "Port %d Availible VPPs %d\n", port, availible_vpp);
> + mlx4_dbg(dev, "Port %d Available VPPs %d\n", port, available_vpp);
>
> Colin
>


Thanks,
Reviewed-by: Leon Romanovsky 


signature.asc
Description: PGP signature


RE: DSA support for Micrel KSZ8895

2017-08-16 Thread Woojung.Huh
> > Hi!
> >
> > I've got hardware with KSZ8895, and I'd like to use switch ports as
> > separate ethernet cards. I believe that means DSA support.
> >
> > And there are even patches available from microchip... unfortunately
> > they are in strange form and for v3.18.
> >
> >
> http://www.microchip.com/SWLibraryWeb/product.aspx?product=KSZ8895
> %20Software%20Linux%203.18
> >
> > Is there newer version of the driver available somewhere? Is the
> > driver good starting point, or should I start with something else?
> 
> Hi Pavel
> 
> Woojung is the expert here. His DSA driver for the 9477 is a nice
> clean driver.
> 
> Have you compared the 8895 to the 9477. Are they similar? Could the
> existing 9477 be extended to support the 8895?
> 
>Andrew

Hi Pavel,

I'll forward your email to our support.
AFAIK, the KSZ8895 has a different register mapping from the KSZ9477,
so it will be more than ID changes in the current driver.

Thanks.
Woojung



RE: [patch net-next 0/3] net/sched: Improve getting objects by indexes

2017-08-16 Thread David Laight
From: Christian König
> Sent: 16 August 2017 09:32
> Am 16.08.2017 um 10:16 schrieb Jiri Pirko:
> > Wed, Aug 16, 2017 at 09:49:07AM CEST, christian.koe...@amd.com wrote:
> >> Am 16.08.2017 um 04:12 schrieb Chris Mi:
...
> >>> - ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL);
> >>> - if (ret < 0) {
> >>> + ret = idr_alloc(&bsg_minor_idr, bcd, &idr_index, 0, BSG_MAX_DEVS,
> >>> + GFP_KERNEL);
> >>> + if (ret) {
> >>>   if (ret == -ENOSPC) {
> >>>   printk(KERN_ERR "bsg: too many bsg devices\n");
> >>>   ret = -EINVAL;
> >> The condition "if (ret)" will now always be true after the first allocation
> >> and so we always run into the error handling after that.
> > On success, idr_alloc returns 0.
> 
> Ah, I see. You change the idr_alloc to return the resulting index as
> separate parameter.

Returning values by reference typically generates considerably worse code
than using the function return value.
It isn't just the extra parameter, it can constrain the generated code
in other ways.
That is why ERR_PTR() and friends exist.
IMHO You need a really good reason to make this change.

David




Re: [PATCH] tun: make tun_build_skb() thread safe

2017-08-16 Thread Jason Wang



On 2017年08月16日 22:14, Jason Wang wrote:

From: Eric Dumazet 

tun_build_skb() is not thread safe since it uses per queue page frag,
this will break things when multiple threads are sending through same
queue. Switch to use per-thread generator (no lock involved).

Fixes: 66ccbc9c87c2 ("tap: use build_skb() for small packet")
Tested-by: Jason Wang 
Signed-off-by: Eric Dumazet 
Signed-off-by: Jason Wang 
---
  drivers/net/tun.c | 7 +--


Forgot to mention, this is for net-next.

Thanks


Re: [PATCH 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-16 Thread Eric Dumazet
On Wed, 2017-08-16 at 06:31 -0700, Tonghao Zhang wrote:
> Because we remove the tcp_tw_recycle support in the commit


> 4396e46187c ('tcp: remove tcp_tw_recycle') and also delete
> the code 'af_ops->route_req' for sysctl_tw_recycle in tcp_conn_request.
> Now when we call the 'af_ops->route_req', the dist always is
> NULL, and we remove the unnecessay check.

Thanks for these patches.

You forgot :

1) a cover letter ( [PATCH next-next 0/2] tcp: 

2) clearly state which tree you are targeting 
( read Documentation/networking/netdev-FAQ.txt )

3) Also, I would also have removed tcp_peer_is_proven()
since it is also called with dst=NULL





Re: [patch net-next repost 1/3] idr: Use unsigned long instead of int

2017-08-16 Thread Eric Dumazet
On Wed, 2017-08-16 at 13:06 +0200, Jiri Pirko wrote:
> Wed, Aug 16, 2017 at 12:58:53PM CEST, eric.duma...@gmail.com wrote:
> >On Wed, 2017-08-16 at 12:53 +0200, Jiri Pirko wrote:
> >
> >> rhashtable is unnecesary big hammer for this. IDR is nice fit for
> >> this purpose.
> >
> >Obviously IDR does not fit, since you have to change its ABI.
> 
> I don't think it is a problem to adjust something to your needs.
> Moreover, if it's API is misdesigned from the beginning. We are just
> putting IDR back on track, cleaning it's API. I don't see anything wrong
> on that. Everyone would benefit.

Except that your patch is gigantic, and nobody really can review it.

You could define idr_alloc_ext() maybe.

Then provide a patch series grouped so that each maintainer can review
its part.

Or leave legacy code using the old idr_alloc() in place.




Re: [patch net-next repost 1/3] idr: Use unsigned long instead of int

2017-08-16 Thread Jiri Pirko
Wed, Aug 16, 2017 at 04:57:18PM CEST, eric.duma...@gmail.com wrote:
>On Wed, 2017-08-16 at 13:06 +0200, Jiri Pirko wrote:
>> Wed, Aug 16, 2017 at 12:58:53PM CEST, eric.duma...@gmail.com wrote:
>> >On Wed, 2017-08-16 at 12:53 +0200, Jiri Pirko wrote:
>> >
>> >> rhashtable is unnecesary big hammer for this. IDR is nice fit for
>> >> this purpose.
>> >
>> >Obviously IDR does not fit, since you have to change its ABI.
>> 
>> I don't think it is a problem to adjust something to your needs.
>> Moreover, if it's API is misdesigned from the beginning. We are just
>> putting IDR back on track, cleaning it's API. I don't see anything wrong
>> on that. Everyone would benefit.
>
>Except that your patch is gigantic, and nobody really can review it.
>
>You could define idr_alloc_ext() maybe.
>
>Then provide a patch series grouped so that each maintainer can review
>its part.
>
>Or leave legacy code using the old idr_alloc() in place.

Fair. Thanks.


[patch net-next] net: sched: cls_flower: fix ndo_setup_tc type for stats call

2017-08-16 Thread Jiri Pirko
From: Jiri Pirko 

I made a stupid mistake using TC_CLSFLOWER_STATS instead of
TC_SETUP_CLSFLOWER. Funny thing is that both are defined as "2" so it
actually did not cause any harm. Anyway, fixing it now.

Fixes: 2572ac53c46f ("net: sched: make type an argument for ndo_setup_tc")
Signed-off-by: Jiri Pirko 
---
 net/sched/cls_flower.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 052e902..bd9dab4 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -289,7 +289,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct 
cls_fl_filter *f)
cls_flower.cookie = (unsigned long) f;
cls_flower.exts = &f->exts;
 
-   dev->netdev_ops->ndo_setup_tc(dev, TC_CLSFLOWER_STATS,
+   dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER,
  &cls_flower);
 }
 
-- 
2.9.3



Re: [PATCH v3] openvswitch: enable NSH support

2017-08-16 Thread Eric Garver
On Wed, Aug 16, 2017 at 01:35:30PM +0800, Yi Yang wrote:
> v2->v3
>  - Change OVS_KEY_ATTR_NSH to nested key to handle
>length-fixed attributes and length-variable
>attriubte more flexibly.
>  - Remove struct ovs_action_push_nsh completely
>  - Add code to handle nested attribute for SET_MASKED
>  - Change PUSH_NSH to use the nested OVS_KEY_ATTR_NSH
>to transfer NSH header data.
>  - Fix comments and coding style issues by Jiri and Eric

Thanks for the updates Yi. Some feedback below.

> 
> v1->v2
>  - Change encap_nsh and decap_nsh to push_nsh and pop_nsh
>  - Dynamically allocate struct ovs_action_push_nsh for
>length-variable metadata.
> 
> OVS master and 2.8 branch has merged NSH userspace
> patch series, this patch is to enable NSH support
> in kernel data path in order that OVS can support
> NSH in 2.8 release in compat mode by porting this.
> 
> Signed-off-by: Yi Yang 
> ---
>  drivers/net/vxlan.c  |   7 +
>  include/net/nsh.h| 150 +++
>  include/uapi/linux/openvswitch.h |  30 
>  net/openvswitch/actions.c| 175 ++
>  net/openvswitch/flow.c   |  39 +
>  net/openvswitch/flow.h   |  11 ++
>  net/openvswitch/flow_netlink.c   | 304 
> ++-
>  net/openvswitch/flow_netlink.h   |   4 +
>  8 files changed, 719 insertions(+), 1 deletion(-)
>  create mode 100644 include/net/nsh.h
> 
[..]
> diff --git a/include/net/nsh.h b/include/net/nsh.h
> new file mode 100644
> index 000..54f44f6
> --- /dev/null
> +++ b/include/net/nsh.h
> @@ -0,0 +1,150 @@
[..]
> +#define NSH_VER_MASK   0xc000
> +#define NSH_VER_SHIFT  14
> +#define NSH_FLAGS_MASK 0x3fc0
> +#define NSH_FLAGS_SHIFT6
> +#define NSH_LEN_MASK   0x003f
> +#define NSH_LEN_SHIFT  0
> +
> +#define NSH_SPI_MASK   0xff00
> +#define NSH_SPI_SHIFT  8
> +#define NSH_SI_MASK0x00ff
> +#define NSH_SI_SHIFT   0
> +
> +#define NSH_DST_PORT4790 /* UDP Port for NSH on VXLAN. */
> +#define ETH_P_NSH   0x894F   /* Ethertype for NSH. */

ETH_P_NSH probably belongs in include/uapi/linux/if_ether.h with all the
other ETH_P_* defines.

> +
> +/* NSH Base Header Next Protocol. */
> +#define NSH_P_IPV40x01
> +#define NSH_P_IPV60x02
> +#define NSH_P_ETHERNET0x03
> +#define NSH_P_NSH 0x04
> +#define NSH_P_MPLS0x05
> +
> +/* MD Type Registry. */
> +#define NSH_M_TYPE1 0x01
> +#define NSH_M_TYPE2 0x02
> +#define NSH_M_EXP1  0xFE
> +#define NSH_M_EXP2  0xFF
> +
> +/* NSH Metadata Length. */
> +#define NSH_M_TYPE1_MDLEN 16
> +
> +/* NSH Base Header Length */
> +#define NSH_BASE_HDR_LEN  8
> +
> +/* NSH MD Type 1 header Length. */
> +#define NSH_M_TYPE1_LEN   24
> +
[..]
> diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
> index 1875bba..6c738d0 100644
> --- a/net/openvswitch/flow.h
> +++ b/net/openvswitch/flow.h
> @@ -35,6 +35,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  struct sk_buff;
>  
> @@ -66,6 +67,15 @@ struct vlan_head {
>   (offsetof(struct sw_flow_key, recirc_id) +  \
>   FIELD_SIZEOF(struct sw_flow_key, recirc_id))
>  
> +struct ovs_key_nsh {
> + __u8 flags;
> + __u8 mdtype;
> + __u8 np;
> + __u8 pad;
> + __be32 path_hdr;
> + __be32 context[NSH_MD1_CONTEXT_SIZE];
> +};
> +
>  struct sw_flow_key {
>   u8 tun_opts[IP_TUNNEL_OPTS_MAX];
>   u8 tun_opts_len;
> @@ -144,6 +154,7 @@ struct sw_flow_key {
>   };
>   } ipv6;
>   };
> + struct ovs_key_nsh nsh; /* network service header */

Are you intentionally not reserving space in the flow key for
OVS_NSH_KEY_ATTR_MD2? I know it's not supported yet, but much of the
code is stubbed out for it - just making sure this wasn't an oversight.

>   struct {
>   /* Connection tracking fields not packed above. */
>   struct {
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index f07d10a..79059db 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> @@ -78,9 +78,11 @@ static bool actions_may_change_flow(const struct nlattr 
> *actions)
>   case OVS_ACTION_ATTR_HASH:
>   case OVS_ACTION_ATTR_POP_ETH:
>   case OVS_ACTION_ATTR_POP_MPLS:
> + case OVS_ACTION_ATTR_POP_NSH:
>   case OVS_ACTION_ATTR_POP_VLAN:
>   case OVS_ACTION_ATTR_PUSH_ETH:
>   case OVS_ACTION_ATTR_PUSH_MPLS:
> + case OVS_ACTION_ATTR_PUSH_NSH:
>   case OVS_ACTION_ATTR_PUSH_VLAN:
>   case OVS_ACTION_ATTR_SAMPLE:
>   case OVS_ACTION_ATTR_SET:
> @@ -322,12 +324,22 @@ size_t ovs_tun_key_attr_size(void)
>   + nla_total_size(2);   /* OVS_TUNNEL_KEY_ATTR_TP_DST */
>  }
>  
> +size_t ovs_nsh_key_attr_size(void)
> +{
> + /* Whenever adding new OVS_NSH_KEY_ FIELDS, we should 

Re: [PATCH net] datagram: When peeking datagrams with offset < 0 don't skip empty skbs

2017-08-16 Thread Willem de Bruijn
> If I read the above correctly, you are arguining in favor of the
> addittional flag version, right?

I was. Though if we are going to thread the argument from the caller
to __skb_try_recv_from_queue to avoid rereading sk->sk_peek_off,
on second thought it might be simpler to do it through off:

@@ -511,7 +511,9 @@ static inline int sk_peek_offset(struct sock *sk, int flags)
if (unlikely(flags & MSG_PEEK)) {
s32 off = READ_ONCE(sk->sk_peek_off);
if (off >= 0)
-   return off;
+   return off + 1;
+   else
+   return 0;
}

return 0;

In __skb_try_recv_from_queue we can then disambiguate the two as follows:

@@ -170,13 +170,19 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
  struct sk_buff **last)
 {
struct sk_buff *skb;
-   int _off = *off;
+   bool peek_at_off = false;
+   int _off = 0;
+
+   if (flags & MSG_PEEK && *off) {
+   peek_at_off = true;
+   _off = (*off) - 1;
+   }

*last = queue->prev;
skb_queue_walk(queue, skb) {
if (flags & MSG_PEEK) {
-   if (_off >= skb->len && (skb->len || _off ||
-skb->peeked)) {
+   if (peek_at_off && _off >= skb->len &&
+   (skb->len || _off || skb->peeked)) {


This, of course, requires restricting sk_peek_off to protect against overflow.

If I'm not mistaken, the test in udp_recvmsg currently incorrectly sets
peeking to false when peeking at offset zero:

peeking = off = sk_peek_offset(sk, flags);


> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2408,9 +2408,7 @@ EXPORT_SYMBOL(__sk_mem_reclaim);
>
>  int sk_set_peek_off(struct sock *sk, int val)
>  {
> -   if (val < 0)
> -   return -EINVAL;
> -
> +   /* a negative value will disable peeking with offset */
> sk->sk_peek_off = val;
> return 0;
>  }

Separate patch to net-next?


Re: [net-next PATCH 00/10] BPF: sockmap and sk redirect support

2017-08-16 Thread Daniel Borkmann

On 08/16/2017 07:30 AM, John Fastabend wrote:

This series implements a sockmap and socket redirect helper for BPF
using a model similar to XDP netdev redirect. A sockmap is a BPF map
type that holds references to sock structs. Then with a new sk
redirect bpf helper BPF programs can use the map to redirect skbs
between sockets,

   bpf_sk_redirect_map(map, key, flags)

Finally, we need a call site to attach our BPF logic to do socket
redirects. We added hooks to recv_sock using the existing strparser
infrastructure to do this. The call site is added via the BPF attach
map call. To enable users to use this infrastructure a new BPF program
BPF_PROG_TYPE_SK_SKB is created that allows users to reference sock
details, such as port and ip address fields, to build useful socket
layer program. The sockmap datapath is as follows,

  recv -> strparser -> verdict/action

where this series implements the drop and redirect actions.
Additional, actions can be added as needed.

A sample program is provided to illustrate how a sockmap can
be integrated with cgroups and used to add/delete sockets in
a sockmap. The program is simple but should show many of the
key ideas.

To test this work test_maps in selftests/bpf was leveraged.
We added a set of tests to add sockets and do send/recv ops
on the sockets to ensure correct behavior. Additionally, the
selftests tests a series of negative test cases. We can expand
on this in the future.

I also have a basic test program I use with iperf/netperf
clients that could be sent as an additional sample if folks
want this. It needs a bit of cleanup to send to the list and
wasn't included in this series.

For people who prefer git over pulling patches out of their mail
editor I've posted the code here,

https://github.com/jrfastab/linux-kernel-xdp/tree/sockmap

For some background information on the genesis of this work
it might be helpful to review these slides from netconf 2017
by Thomas Graf,

http://vger.kernel.org/netconf2017.html
https://docs.google.com/a/covalent.io/presentation/d/1dwSKSBGpUHD3WO5xxzZWj8awV_-xL-oYhvqQMOBhhtk/edit?usp=sharing

Thanks to Daniel Borkmann for reviewing and providing initial
feedback.


LGTM, for the set:

Acked-by: Daniel Borkmann 


[PATCH net-next v2 0/2] simplify the tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
These patches are not bugfix. But just simplify the tcp_conn_request
function.

These patches are based on Davem's net-next tree.

Tonghao Zhang (2):
  tcp: Remove unnecessary dst check in tcp_conn_request.
  tcp: Remove the unused parameter for tcp_try_fastopen.

 include/net/tcp.h   |  3 +--
 net/ipv4/tcp_fastopen.c |  6 ++
 net/ipv4/tcp_input.c| 11 +--
 3 files changed, 8 insertions(+), 12 deletions(-)

-- 
1.8.3.1



[PATCH net-next v2 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
Because we remove the tcp_tw_recycle support in the commit
4396e46187c ('tcp: remove tcp_tw_recycle') and also delete
the code 'af_ops->route_req' for sysctl_tw_recycle in tcp_conn_request.
Now when we call the 'af_ops->route_req', the dst is always
NULL, and we remove the unnecessary check.

Signed-off-by: Tonghao Zhang 
---
 net/ipv4/tcp_input.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d73903f..7eee2c7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6132,11 +6132,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
isn = af_ops->init_seq(skb);
}
-   if (!dst) {
-   dst = af_ops->route_req(sk, &fl, req);
-   if (!dst)
-   goto drop_and_free;
-   }
+
+   dst = af_ops->route_req(sk, &fl, req);
+   if (!dst)
+   goto drop_and_free;
 
tcp_ecn_create_request(req, skb, sk, dst);
 
-- 
1.8.3.1



[PATCH net-next 0/3] vmbus sendpacket cleanups

2017-08-16 Thread Stephen Hemminger
These patches remove and consolidate vmbus_sendpacket functions.

They should go through the net-next tree since these API's
were only used by the netvsc driver.

Stephen Hemminger (3):
  vmbus: remove unused vmbus_sendpacket_multipagebuffer
  vmbus: remove unused vmubs_sendpacket_pagebuffer_ctl
  vmbus: remove unused vmbus_sendpacket_ctl

 drivers/hv/channel.c| 129 
 drivers/net/hyperv/netvsc.c |  19 +++
 include/linux/hyperv.h  |  21 
 3 files changed, 31 insertions(+), 138 deletions(-)

-- 
2.11.0



[PATCH net-next 1/3] vmbus: remove unused vmbus_sendpacket_multipagebuffer

2017-08-16 Thread Stephen Hemminger
This function is not used anywhere in current code.

Signed-off-by: Stephen Hemminger 
---
 drivers/hv/channel.c   | 56 --
 include/linux/hyperv.h |  6 --
 2 files changed, 62 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index e57cc40cb768..756a1e841142 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -814,62 +814,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel 
*channel,
 }
 EXPORT_SYMBOL_GPL(vmbus_sendpacket_mpb_desc);
 
-/*
- * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet
- * using a GPADL Direct packet type.
- */
-int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
-   struct hv_multipage_buffer *multi_pagebuffer,
-   void *buffer, u32 bufferlen, u64 requestid)
-{
-   struct vmbus_channel_packet_multipage_buffer desc;
-   u32 descsize;
-   u32 packetlen;
-   u32 packetlen_aligned;
-   struct kvec bufferlist[3];
-   u64 aligned_data = 0;
-   u32 pfncount = NUM_PAGES_SPANNED(multi_pagebuffer->offset,
-multi_pagebuffer->len);
-
-   if (pfncount > MAX_MULTIPAGE_BUFFER_COUNT)
-   return -EINVAL;
-
-   /*
-* Adjust the size down since vmbus_channel_packet_multipage_buffer is
-* the largest size we support
-*/
-   descsize = sizeof(struct vmbus_channel_packet_multipage_buffer) -
- ((MAX_MULTIPAGE_BUFFER_COUNT - pfncount) *
- sizeof(u64));
-   packetlen = descsize + bufferlen;
-   packetlen_aligned = ALIGN(packetlen, sizeof(u64));
-
-
-   /* Setup the descriptor */
-   desc.type = VM_PKT_DATA_USING_GPA_DIRECT;
-   desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
-   desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */
-   desc.length8 = (u16)(packetlen_aligned >> 3);
-   desc.transactionid = requestid;
-   desc.rangecount = 1;
-
-   desc.range.len = multi_pagebuffer->len;
-   desc.range.offset = multi_pagebuffer->offset;
-
-   memcpy(desc.range.pfn_array, multi_pagebuffer->pfn_array,
-  pfncount * sizeof(u64));
-
-   bufferlist[0].iov_base = &desc;
-   bufferlist[0].iov_len = descsize;
-   bufferlist[1].iov_base = buffer;
-   bufferlist[1].iov_len = bufferlen;
-   bufferlist[2].iov_base = &aligned_data;
-   bufferlist[2].iov_len = (packetlen_aligned - packetlen);
-
-   return hv_ringbuffer_write(channel, bufferlist, 3);
-}
-EXPORT_SYMBOL_GPL(vmbus_sendpacket_multipagebuffer);
-
 /**
  * vmbus_recvpacket() - Retrieve the user packet on the specified channel
  * @channel: Pointer to vmbus_channel structure.
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index b7d7bbec74e0..39a080ce17da 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1052,12 +1052,6 @@ extern int vmbus_sendpacket_pagebuffer_ctl(struct 
vmbus_channel *channel,
   u64 requestid,
   u32 flags);
 
-extern int vmbus_sendpacket_multipagebuffer(struct vmbus_channel *channel,
-   struct hv_multipage_buffer *mpb,
-   void *buffer,
-   u32 bufferlen,
-   u64 requestid);
-
 extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel,
 struct vmbus_packet_mpb_array *mpb,
 u32 desc_size,
-- 
2.11.0



[PATCH net-next 3/3] vmbus: remove unused vmbus_sendpacket_ctl

2017-08-16 Thread Stephen Hemminger
The only usage of vmbus_sendpacket_ctl was by vmbus_sendpacket.

Signed-off-by: Stephen Hemminger 
---
 drivers/hv/channel.c| 43 +--
 drivers/net/hyperv/netvsc.c |  9 -
 include/linux/hyperv.h  |  7 ---
 3 files changed, 21 insertions(+), 38 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 9223fe8823e0..d9e9676e2b40 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -647,9 +647,23 @@ void vmbus_close(struct vmbus_channel *channel)
 }
 EXPORT_SYMBOL_GPL(vmbus_close);
 
-int vmbus_sendpacket_ctl(struct vmbus_channel *channel, void *buffer,
-u32 bufferlen, u64 requestid,
-enum vmbus_packet_type type, u32 flags)
+/**
+ * vmbus_sendpacket() - Send the specified buffer on the given channel
+ * @channel: Pointer to vmbus_channel structure.
+ * @buffer: Pointer to the buffer you want to receive the data into.
+ * @bufferlen: Maximum size of what the the buffer will hold
+ * @requestid: Identifier of the request
+ * @type: Type of packet that is being send e.g. negotiate, time
+ * packet etc.
+ *
+ * Sends data in @buffer directly to hyper-v via the vmbus
+ * This will send the data unparsed to hyper-v.
+ *
+ * Mainly used by Hyper-V drivers.
+ */
+int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
+  u32 bufferlen, u64 requestid,
+  enum vmbus_packet_type type, u32 flags)
 {
struct vmpacket_descriptor desc;
u32 packetlen = sizeof(struct vmpacket_descriptor) + bufferlen;
@@ -676,29 +690,6 @@ int vmbus_sendpacket_ctl(struct vmbus_channel *channel, 
void *buffer,
 
return hv_ringbuffer_write(channel, bufferlist, num_vecs);
 }
-EXPORT_SYMBOL(vmbus_sendpacket_ctl);
-
-/**
- * vmbus_sendpacket() - Send the specified buffer on the given channel
- * @channel: Pointer to vmbus_channel structure.
- * @buffer: Pointer to the buffer you want to receive the data into.
- * @bufferlen: Maximum size of what the the buffer will hold
- * @requestid: Identifier of the request
- * @type: Type of packet that is being send e.g. negotiate, time
- * packet etc.
- *
- * Sends data in @buffer directly to hyper-v via the vmbus
- * This will send the data unparsed to hyper-v.
- *
- * Mainly used by Hyper-V drivers.
- */
-int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer,
-  u32 bufferlen, u64 requestid,
-  enum vmbus_packet_type type, u32 flags)
-{
-   return vmbus_sendpacket_ctl(channel, buffer, bufferlen, requestid,
-   type, flags);
-}
 EXPORT_SYMBOL(vmbus_sendpacket);
 
 /*
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 6031102cbba3..0062b802676f 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -780,11 +780,10 @@ static inline int netvsc_send_pkt(
  &nvmsg, sizeof(nvmsg),
  req_id);
} else {
-   ret = vmbus_sendpacket_ctl(out_channel, &nvmsg,
-  sizeof(struct nvsp_message),
-  req_id,
-  VM_PKT_DATA_INBAND,
-  
VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+   ret = vmbus_sendpacket(out_channel,
+  &nvmsg, sizeof(nvmsg),
+  req_id, VM_PKT_DATA_INBAND,
+  
VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
}
 
if (ret == 0) {
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 9692592d43a3..a5f961c4149e 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1030,13 +1030,6 @@ extern int vmbus_sendpacket(struct vmbus_channel 
*channel,
  enum vmbus_packet_type type,
  u32 flags);
 
-extern int vmbus_sendpacket_ctl(struct vmbus_channel *channel,
- void *buffer,
- u32 bufferLen,
- u64 requestid,
- enum vmbus_packet_type type,
- u32 flags);
-
 extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
struct hv_page_buffer pagebuffers[],
u32 pagecount,
-- 
2.11.0



[PATCH net-next 2/3] vmbus: remove unused vmubs_sendpacket_pagebuffer_ctl

2017-08-16 Thread Stephen Hemminger
The function vmbus_sendpacket_pagebuffer_ctl was never used directly.
Just have vmbus_sendpacket_pagebuffer as the only interface.

Signed-off-by: Stephen Hemminger 
---
 drivers/hv/channel.c| 30 ++
 drivers/net/hyperv/netvsc.c | 10 --
 include/linux/hyperv.h  |  8 
 3 files changed, 10 insertions(+), 38 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 756a1e841142..9223fe8823e0 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -702,16 +702,16 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void 
*buffer,
 EXPORT_SYMBOL(vmbus_sendpacket);
 
 /*
- * vmbus_sendpacket_pagebuffer_ctl - Send a range of single-page buffer
+ * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
  * packets using a GPADL Direct packet type. This interface allows you
  * to control notifying the host. This will be useful for sending
  * batched data. Also the sender can control the send flags
  * explicitly.
  */
-int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
-   struct hv_page_buffer pagebuffers[],
-   u32 pagecount, void *buffer, u32 bufferlen,
-   u64 requestid, u32 flags)
+int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
+   struct hv_page_buffer pagebuffers[],
+   u32 pagecount, void *buffer, u32 bufferlen,
+   u64 requestid)
 {
int i;
struct vmbus_channel_packet_page_buffer desc;
@@ -736,7 +736,7 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel 
*channel,
 
/* Setup the descriptor */
desc.type = VM_PKT_DATA_USING_GPA_DIRECT;
-   desc.flags = flags;
+   desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */
desc.length8 = (u16)(packetlen_aligned >> 3);
desc.transactionid = requestid;
@@ -757,24 +757,6 @@ int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel 
*channel,
 
return hv_ringbuffer_write(channel, bufferlist, 3);
 }
-EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer_ctl);
-
-/*
- * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer
- * packets using a GPADL Direct packet type.
- */
-int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel,
-struct hv_page_buffer pagebuffers[],
-u32 pagecount, void *buffer, u32 bufferlen,
-u64 requestid)
-{
-   u32 flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED;
-
-   return vmbus_sendpacket_pagebuffer_ctl(channel, pagebuffers, pagecount,
-  buffer, bufferlen,
-  requestid, flags);
-
-}
 EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer);
 
 /*
diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c
index 0530e7d729e1..6031102cbba3 100644
--- a/drivers/net/hyperv/netvsc.c
+++ b/drivers/net/hyperv/netvsc.c
@@ -775,12 +775,10 @@ static inline int netvsc_send_pkt(
if (packet->cp_partial)
pb += packet->rmsg_pgcnt;
 
-   ret = vmbus_sendpacket_pagebuffer_ctl(out_channel,
- pb, packet->page_buf_cnt,
- &nvmsg,
- sizeof(struct 
nvsp_message),
- req_id,
- 
VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+   ret = vmbus_sendpacket_pagebuffer(out_channel,
+ pb, packet->page_buf_cnt,
+ &nvmsg, sizeof(nvmsg),
+ req_id);
} else {
ret = vmbus_sendpacket_ctl(out_channel, &nvmsg,
   sizeof(struct nvsp_message),
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 39a080ce17da..9692592d43a3 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1044,14 +1044,6 @@ extern int vmbus_sendpacket_pagebuffer(struct 
vmbus_channel *channel,
u32 bufferlen,
u64 requestid);
 
-extern int vmbus_sendpacket_pagebuffer_ctl(struct vmbus_channel *channel,
-  struct hv_page_buffer pagebuffers[],
-  u32 pagecount,
-  void *buffer,
-  u32 bufferlen,
-  u64 requestid,
-  u32 fla

Re: [PATCH 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
On Wed, Aug 16, 2017 at 10:44 PM, Eric Dumazet  wrote:
> On Wed, 2017-08-16 at 06:31 -0700, Tonghao Zhang wrote:
>> Because we remove the tcp_tw_recycle support in the commit
>
>
>> 4396e46187c ('tcp: remove tcp_tw_recycle') and also delete
>> the code 'af_ops->route_req' for sysctl_tw_recycle in tcp_conn_request.
>> Now when we call the 'af_ops->route_req', the dst is always
>> NULL, and we remove the unnecessary check.
>
> Thanks for these patches.
>
> You forgot :
>
> 1) a cover letter ( [PATCH next-next 0/2] tcp: 
>
> 2) clearly state which tree you are targeting
> ( read Documentation/networking/netdev-FAQ.txt )
Thanks, I do this in v2

> 3) Also, I would also have removed tcp_peer_is_proven()
> since it is also called with dst=NULL
Thanks for your work.

>
>


Re: [PATCH net-next V2 1/3] tap: use build_skb() for small packet

2017-08-16 Thread David Miller
From: Jason Wang 
Date: Wed, 16 Aug 2017 17:17:45 +0800

> It looks like full page allocation just produce too much stress on the
> page allocator.
> 
> I get 1.58Mpps (full page) vs 1.95Mpps (page frag) with the patches
> attached.

Yes, this is why drivers doing XDP tend to drift towards implementing
a local cache of pages.


[PATCH net] tipc: fix use-after-free

2017-08-16 Thread Eric Dumazet
From: Eric Dumazet 

syszkaller reported use-after-free in tipc [1]

When msg->rep skb is freed, set the pointer to NULL,
so that caller does not free it again.

[1]

==
BUG: KASAN: use-after-free in skb_push+0xd4/0xe0 net/core/skbuff.c:1466
Read of size 8 at addr 8801c6e71e90 by task syz-executor5/4115

CPU: 1 PID: 4115 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #32
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:16 [inline]
 dump_stack+0x194/0x257 lib/dump_stack.c:52
 print_address_description+0x73/0x250 mm/kasan/report.c:252
 kasan_report_error mm/kasan/report.c:351 [inline]
 kasan_report+0x24e/0x340 mm/kasan/report.c:409
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
 skb_push+0xd4/0xe0 net/core/skbuff.c:1466
 tipc_nl_compat_recv+0x833/0x18f0 net/tipc/netlink_compat.c:1209
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4512e9
RSP: 002b:7f3bc8184c08 EFLAGS: 0216 ORIG_RAX: 0001
RAX: ffda RBX: 00718000 RCX: 004512e9
RDX: 0020 RSI: 20fdb000 RDI: 0006
RBP: 0086 R08:  R09: 
R10:  R11: 0216 R12: 004b5e76
R13: 7f3bc8184b48 R14: 004b5e86 R15: 

Allocated by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489
 kmem_cache_alloc_node+0x13d/0x750 mm/slab.c:3651
 __alloc_skb+0xf1/0x740 net/core/skbuff.c:219
 alloc_skb include/linux/skbuff.h:903 [inline]
 tipc_tlv_alloc+0x26/0xb0 net/tipc/netlink_compat.c:148
 tipc_nl_compat_dumpit+0xf2/0x3c0 net/tipc/netlink_compat.c:248
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe

Freed by task 4115:
 save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
 save_stack+0x43/0xd0 mm/kasan/kasan.c:447
 set_track mm/kasan/kasan.c:459 [inline]
 kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
 __cache_free mm/slab.c:3503 [inline]
 kmem_cache_free+0x77/0x280 mm/slab.c:3763
 kfree_skbmem+0x1a1/0x1d0 net/core/skbuff.c:622
 __kfree_skb net/core/skbuff.c:682 [inline]
 kfree_skb+0x165/0x4c0 net/core/skbuff.c:699
 tipc_nl_compat_dumpit+0x36a/0x3c0 net/tipc/netlink_compat.c:260
 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline]
 tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199
 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598
 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623
 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397
 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634
 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
 netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291
 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854
 sock_sendmsg_nosec net/socket.c:633 [inline]
 sock_sendmsg+0xca/0x110 net/socket.c:643
 sock_write_iter+0x31a/0x5d0 net/socket.c:898
 call_write_iter include/linux/fs.h:1743 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_w

  1   2   3   >